From 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Sat, 16 Apr 2005 15:20:36 -0700
Subject: Linux-2.6.12-rc2

Initial git repository build. I'm not bothering with the full history,
even though we have it. We can create a separate "historical" git
archive of that later if we want to, and in the meantime it's about
3.2GB when imported into git - space that would just make the early
git days unnecessarily complicated, when we don't have a lot of good
infrastructure for it.

Let it rip!
---
 net/802/Makefile | 15 +
 net/802/fc.c | 130 +
 net/802/fddi.c | 210 +
 net/802/hippi.c | 234 +
 net/802/p8022.c | 65 +
 net/802/p8023.c | 61 +
 net/802/psnap.c | 159 +
 net/802/sysctl_net_802.c | 33 +
 net/802/tr.c | 645 +++
 net/8021q/Makefile | 12 +
 net/8021q/vlan.c | 774 ++++
 net/8021q/vlan.h | 72 +
 net/8021q/vlan_dev.c | 890 ++++
 net/8021q/vlanproc.c | 357 ++
 net/8021q/vlanproc.h | 19 +
 net/Kconfig | 646 +++
 net/Makefile | 48 +
 net/TUNABLE | 50 +
 net/appletalk/Makefile | 9 +
 net/appletalk/aarp.c | 1069 +++++
 net/appletalk/atalk_proc.c | 321 ++
 net/appletalk/ddp.c | 1931 +++++++++
 net/appletalk/dev.c | 43 +
 net/appletalk/sysctl_net_atalk.c | 83 +
 net/atm/Makefile | 18 +
 net/atm/addr.c | 134 +
 net/atm/addr.h | 18 +
 net/atm/atm_misc.c | 106 +
 net/atm/br2684.c | 824 ++++
 net/atm/clip.c | 1045 +++++
 net/atm/common.c | 804 ++++
 net/atm/common.h | 50 +
 net/atm/ioctl.c | 139 +
 net/atm/ipcommon.c | 61 +
 net/atm/ipcommon.h | 22 +
 net/atm/lec.c | 2538 +++++++++++
 net/atm/lec.h | 142 +
 net/atm/lec_arpc.h | 92 +
 net/atm/mpc.c | 1514 +++++++
 net/atm/mpc.h | 53 +
 net/atm/mpoa_caches.c | 576 +++
 net/atm/mpoa_caches.h | 96 +
 net/atm/mpoa_proc.c | 305 ++
 net/atm/pppoatm.c | 369 ++
 net/atm/proc.c | 514 +++
 net/atm/protocols.h | 13 +
 net/atm/pvc.c | 155 +
 net/atm/raw.c | 98 +
 net/atm/resources.c | 432 ++
 net/atm/resources.h | 46 +
 net/atm/signaling.c | 280 ++
 net/atm/signaling.h | 30 +
 net/atm/svc.c | 674 +++
 net/ax25/Kconfig | 110 +
 net/ax25/Makefile | 11 +
 net/ax25/TODO | 24 +
 net/ax25/af_ax25.c | 2050 +++++
 net/ax25/ax25_addr.c | 290 ++
 net/ax25/ax25_dev.c | 208 +
 net/ax25/ax25_ds_in.c | 305 ++
 net/ax25/ax25_ds_subr.c | 212 +
 net/ax25/ax25_ds_timer.c | 241 ++
 net/ax25/ax25_iface.c | 266 ++
 net/ax25/ax25_in.c | 470 ++
 net/ax25/ax25_ip.c | 225 +
 net/ax25/ax25_out.c | 383 ++
 net/ax25/ax25_route.c | 534 +++
 net/ax25/ax25_std_in.c | 449 ++
 net/ax25/ax25_std_subr.c | 88 +
 net/ax25/ax25_std_timer.c | 177 +
 net/ax25/ax25_subr.c | 295 ++
 net/ax25/ax25_timer.c | 243 ++
 net/ax25/ax25_uid.c | 228 +
 net/ax25/sysctl_net_ax25.c | 262 ++
 net/bluetooth/Kconfig | 63 +
 net/bluetooth/Makefile | 13 +
 net/bluetooth/af_bluetooth.c | 355 ++
 net/bluetooth/bnep/Kconfig | 24 +
 net/bluetooth/bnep/Makefile | 7 +
 net/bluetooth/bnep/bnep.h | 184 +
 net/bluetooth/bnep/core.c | 713 +++
 net/bluetooth/bnep/netdev.c | 247 ++
 net/bluetooth/bnep/sock.c | 237 +
 net/bluetooth/cmtp/Kconfig | 11 +
 net/bluetooth/cmtp/Makefile | 7 +
 net/bluetooth/cmtp/capi.c | 600 +++
 net/bluetooth/cmtp/cmtp.h | 135 +
 net/bluetooth/cmtp/core.c | 504 +++
 net/bluetooth/cmtp/sock.c | 226 +
 net/bluetooth/hci_conn.c | 471 ++
 net/bluetooth/hci_core.c | 1434 +++++++
 net/bluetooth/hci_event.c | 1044 +++++
 net/bluetooth/hci_sock.c | 707 +++
 net/bluetooth/hci_sysfs.c | 153 +
 net/bluetooth/hidp/Kconfig | 12 +
 net/bluetooth/hidp/Makefile | 7 +
 net/bluetooth/hidp/core.c | 772 ++++
 net/bluetooth/hidp/hidp.h | 167 +
 net/bluetooth/hidp/sock.c | 232 +
 net/bluetooth/l2cap.c | 2329 ++++
 net/bluetooth/lib.c | 178 +
 net/bluetooth/rfcomm/Kconfig | 17 +
net/bluetooth/rfcomm/Makefile | 8 + net/bluetooth/rfcomm/core.c | 2127 +++++++++ net/bluetooth/rfcomm/crc.c | 71 + net/bluetooth/rfcomm/sock.c | 1010 +++++ net/bluetooth/rfcomm/tty.c | 930 ++++ net/bluetooth/sco.c | 1071 +++++ net/bridge/Makefile | 15 + net/bridge/br.c | 69 + net/bridge/br_device.c | 104 + net/bridge/br_fdb.c | 368 ++ net/bridge/br_forward.c | 159 + net/bridge/br_if.c | 388 ++ net/bridge/br_input.c | 144 + net/bridge/br_ioctl.c | 410 ++ net/bridge/br_netfilter.c | 1087 +++++ net/bridge/br_notify.c | 87 + net/bridge/br_private.h | 244 ++ net/bridge/br_private_stp.h | 58 + net/bridge/br_stp.c | 459 ++ net/bridge/br_stp_bpdu.c | 205 + net/bridge/br_stp_if.c | 225 + net/bridge/br_stp_timer.c | 188 + net/bridge/br_sysfs_br.c | 364 ++ net/bridge/br_sysfs_if.c | 269 ++ net/bridge/netfilter/Kconfig | 211 + net/bridge/netfilter/Makefile | 32 + net/bridge/netfilter/ebt_802_3.c | 73 + net/bridge/netfilter/ebt_among.c | 228 + net/bridge/netfilter/ebt_arp.c | 140 + net/bridge/netfilter/ebt_arpreply.c | 97 + net/bridge/netfilter/ebt_dnat.c | 76 + net/bridge/netfilter/ebt_ip.c | 122 + net/bridge/netfilter/ebt_limit.c | 113 + net/bridge/netfilter/ebt_log.c | 171 + net/bridge/netfilter/ebt_mark.c | 68 + net/bridge/netfilter/ebt_mark_m.c | 62 + net/bridge/netfilter/ebt_pkttype.c | 59 + net/bridge/netfilter/ebt_redirect.c | 81 + net/bridge/netfilter/ebt_snat.c | 76 + net/bridge/netfilter/ebt_stp.c | 194 + net/bridge/netfilter/ebt_ulog.c | 295 ++ net/bridge/netfilter/ebt_vlan.c | 195 + net/bridge/netfilter/ebtable_broute.c | 86 + net/bridge/netfilter/ebtable_filter.c | 123 + net/bridge/netfilter/ebtable_nat.c | 130 + net/bridge/netfilter/ebtables.c | 1507 +++++++ net/compat.c | 605 +++ net/core/Makefile | 17 + net/core/datagram.c | 482 +++ net/core/dev.c | 3359 +++++++++++++++ net/core/dev_mcast.c | 299 ++ net/core/dst.c | 276 ++ net/core/dv.c | 548 +++ net/core/ethtool.c | 819 ++++ net/core/filter.c | 432 ++ net/core/flow.c | 371 ++ net/core/gen_estimator.c | 250 ++ net/core/gen_stats.c | 239 ++ net/core/iovec.c | 239 ++ net/core/link_watch.c | 137 + net/core/neighbour.c | 2362 ++++++++++ net/core/net-sysfs.c | 461 ++ net/core/netfilter.c | 799 ++++ net/core/netpoll.c | 735 ++++ net/core/pktgen.c | 3132 ++++++++++++++ net/core/rtnetlink.c | 711 +++ net/core/scm.c | 291 ++ net/core/skbuff.c | 1460 +++++++ net/core/sock.c | 1565 +++++++ net/core/stream.c | 287 ++ net/core/sysctl_net_core.c | 182 + net/core/utils.c | 155 + net/core/wireless.c | 1459 +++++++ net/decnet/Kconfig | 27 + net/decnet/Makefile | 10 + net/decnet/README | 8 + net/decnet/TODO | 41 + net/decnet/af_decnet.c | 2405 +++++++++++ net/decnet/dn_dev.c | 1481 +++++++ net/decnet/dn_fib.c | 802 ++++ net/decnet/dn_neigh.c | 627 +++ net/decnet/dn_nsp_in.c | 934 ++++ net/decnet/dn_nsp_out.c | 782 ++++ net/decnet/dn_route.c | 1840 ++++++++ net/decnet/dn_rules.c | 416 ++ net/decnet/dn_table.c | 825 ++++ net/decnet/dn_timer.c | 109 + net/decnet/netfilter/Kconfig | 15 + net/decnet/netfilter/Makefile | 6 + net/decnet/netfilter/dn_rtmsg.c | 167 + net/decnet/sysctl_net_decnet.c | 480 +++ net/econet/Makefile | 7 + net/econet/af_econet.c | 1129 +++++ net/ethernet/Makefile | 8 + net/ethernet/eth.c | 308 ++ net/ethernet/pe2.c | 40 + net/ethernet/sysctl_net_ether.c | 13 + net/ipv4/Kconfig | 411 ++ net/ipv4/Makefile | 33 + net/ipv4/af_inet.c | 1188 +++++ net/ipv4/ah4.c | 335 ++ net/ipv4/arp.c | 1425 ++++++ net/ipv4/datagram.c | 73 + net/ipv4/devinet.c | 1508 +++++++ net/ipv4/esp4.c | 510 +++ net/ipv4/fib_frontend.c | 611 +++ net/ipv4/fib_hash.c | 
1086 +++++ net/ipv4/fib_lookup.h | 43 + net/ipv4/fib_rules.c | 437 ++ net/ipv4/fib_semantics.c | 1332 ++++++ net/ipv4/icmp.c | 1143 +++++ net/ipv4/igmp.c | 2473 +++++++++++ net/ipv4/inetpeer.c | 460 ++ net/ipv4/ip_forward.c | 127 + net/ipv4/ip_fragment.c | 691 +++ net/ipv4/ip_gre.c | 1290 ++++++ net/ipv4/ip_input.c | 431 ++ net/ipv4/ip_options.c | 625 +++ net/ipv4/ip_output.c | 1359 ++++++ net/ipv4/ip_sockglue.c | 1093 +++++ net/ipv4/ipcomp.c | 524 +++ net/ipv4/ipconfig.c | 1507 +++++++ net/ipv4/ipip.c | 905 ++++ net/ipv4/ipmr.c | 1900 ++++++++ net/ipv4/ipvs/Kconfig | 244 ++ net/ipv4/ipvs/Makefile | 34 + net/ipv4/ipvs/ip_vs_app.c | 658 +++ net/ipv4/ipvs/ip_vs_conn.c | 920 ++++ net/ipv4/ipvs/ip_vs_core.c | 1191 ++++++ net/ipv4/ipvs/ip_vs_ctl.c | 2391 +++++++++++ net/ipv4/ipvs/ip_vs_dh.c | 258 ++ net/ipv4/ipvs/ip_vs_est.c | 200 + net/ipv4/ipvs/ip_vs_ftp.c | 400 ++ net/ipv4/ipvs/ip_vs_lblc.c | 624 +++ net/ipv4/ipvs/ip_vs_lblcr.c | 888 ++++ net/ipv4/ipvs/ip_vs_lc.c | 123 + net/ipv4/ipvs/ip_vs_nq.c | 161 + net/ipv4/ipvs/ip_vs_proto.c | 244 ++ net/ipv4/ipvs/ip_vs_proto_ah.c | 177 + net/ipv4/ipvs/ip_vs_proto_esp.c | 175 + net/ipv4/ipvs/ip_vs_proto_icmp.c | 182 + net/ipv4/ipvs/ip_vs_proto_tcp.c | 640 +++ net/ipv4/ipvs/ip_vs_proto_udp.c | 427 ++ net/ipv4/ipvs/ip_vs_rr.c | 118 + net/ipv4/ipvs/ip_vs_sched.c | 251 ++ net/ipv4/ipvs/ip_vs_sed.c | 163 + net/ipv4/ipvs/ip_vs_sh.c | 255 ++ net/ipv4/ipvs/ip_vs_sync.c | 892 ++++ net/ipv4/ipvs/ip_vs_wlc.c | 151 + net/ipv4/ipvs/ip_vs_wrr.c | 235 + net/ipv4/ipvs/ip_vs_xmit.c | 563 +++ net/ipv4/multipath.c | 55 + net/ipv4/multipath_drr.c | 265 ++ net/ipv4/multipath_random.c | 128 + net/ipv4/multipath_rr.c | 115 + net/ipv4/multipath_wrandom.c | 344 ++ net/ipv4/netfilter/Kconfig | 696 +++ net/ipv4/netfilter/Makefile | 89 + net/ipv4/netfilter/arp_tables.c | 1333 ++++++ net/ipv4/netfilter/arpt_mangle.c | 104 + net/ipv4/netfilter/arptable_filter.c | 214 + net/ipv4/netfilter/ip_conntrack_amanda.c | 167 + net/ipv4/netfilter/ip_conntrack_core.c | 1247 ++++++ net/ipv4/netfilter/ip_conntrack_ftp.c | 501 +++ net/ipv4/netfilter/ip_conntrack_irc.c | 313 ++ net/ipv4/netfilter/ip_conntrack_proto_generic.c | 75 + net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 279 ++ net/ipv4/netfilter/ip_conntrack_proto_sctp.c | 649 +++ net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 1098 +++++ net/ipv4/netfilter/ip_conntrack_proto_udp.c | 146 + net/ipv4/netfilter/ip_conntrack_standalone.c | 961 +++++ net/ipv4/netfilter/ip_conntrack_tftp.c | 159 + net/ipv4/netfilter/ip_nat_amanda.c | 88 + net/ipv4/netfilter/ip_nat_core.c | 556 +++ net/ipv4/netfilter/ip_nat_ftp.c | 183 + net/ipv4/netfilter/ip_nat_helper.c | 430 ++ net/ipv4/netfilter/ip_nat_irc.c | 125 + net/ipv4/netfilter/ip_nat_proto_icmp.c | 115 + net/ipv4/netfilter/ip_nat_proto_tcp.c | 178 + net/ipv4/netfilter/ip_nat_proto_udp.c | 165 + net/ipv4/netfilter/ip_nat_proto_unknown.c | 70 + net/ipv4/netfilter/ip_nat_rule.c | 319 ++ net/ipv4/netfilter/ip_nat_snmp_basic.c | 1347 ++++++ net/ipv4/netfilter/ip_nat_standalone.c | 349 ++ net/ipv4/netfilter/ip_nat_tftp.c | 70 + net/ipv4/netfilter/ip_queue.c | 741 ++++ net/ipv4/netfilter/ip_tables.c | 1964 +++++++++ net/ipv4/netfilter/ipt_CLASSIFY.c | 92 + net/ipv4/netfilter/ipt_CLUSTERIP.c | 761 ++++ net/ipv4/netfilter/ipt_CONNMARK.c | 118 + net/ipv4/netfilter/ipt_DSCP.c | 106 + net/ipv4/netfilter/ipt_ECN.c | 175 + net/ipv4/netfilter/ipt_LOG.c | 485 +++ net/ipv4/netfilter/ipt_MARK.c | 162 + net/ipv4/netfilter/ipt_MASQUERADE.c | 207 + net/ipv4/netfilter/ipt_NETMAP.c | 117 + net/ipv4/netfilter/ipt_NOTRACK.c | 
76 + net/ipv4/netfilter/ipt_REDIRECT.c | 129 + net/ipv4/netfilter/ipt_REJECT.c | 335 ++ net/ipv4/netfilter/ipt_SAME.c | 211 + net/ipv4/netfilter/ipt_TCPMSS.c | 262 ++ net/ipv4/netfilter/ipt_TOS.c | 105 + net/ipv4/netfilter/ipt_ULOG.c | 419 ++ net/ipv4/netfilter/ipt_addrtype.c | 77 + net/ipv4/netfilter/ipt_ah.c | 117 + net/ipv4/netfilter/ipt_comment.c | 59 + net/ipv4/netfilter/ipt_connmark.c | 81 + net/ipv4/netfilter/ipt_conntrack.c | 136 + net/ipv4/netfilter/ipt_dscp.c | 63 + net/ipv4/netfilter/ipt_ecn.c | 131 + net/ipv4/netfilter/ipt_esp.c | 118 + net/ipv4/netfilter/ipt_hashlimit.c | 731 ++++ net/ipv4/netfilter/ipt_helper.c | 113 + net/ipv4/netfilter/ipt_iprange.c | 99 + net/ipv4/netfilter/ipt_length.c | 64 + net/ipv4/netfilter/ipt_limit.c | 157 + net/ipv4/netfilter/ipt_mac.c | 79 + net/ipv4/netfilter/ipt_mark.c | 64 + net/ipv4/netfilter/ipt_multiport.c | 212 + net/ipv4/netfilter/ipt_owner.c | 217 + net/ipv4/netfilter/ipt_physdev.c | 134 + net/ipv4/netfilter/ipt_pkttype.c | 70 + net/ipv4/netfilter/ipt_realm.c | 76 + net/ipv4/netfilter/ipt_recent.c | 1002 +++++ net/ipv4/netfilter/ipt_sctp.c | 203 + net/ipv4/netfilter/ipt_state.c | 74 + net/ipv4/netfilter/ipt_tcpmss.c | 127 + net/ipv4/netfilter/ipt_tos.c | 64 + net/ipv4/netfilter/ipt_ttl.c | 79 + net/ipv4/netfilter/iptable_filter.c | 194 + net/ipv4/netfilter/iptable_mangle.c | 260 ++ net/ipv4/netfilter/iptable_raw.c | 156 + net/ipv4/proc.c | 382 ++ net/ipv4/protocol.c | 101 + net/ipv4/raw.c | 888 ++++ net/ipv4/route.c | 3177 ++++++++++++++ net/ipv4/syncookies.c | 279 ++ net/ipv4/sysctl_net_ipv4.c | 698 +++ net/ipv4/tcp.c | 2386 +++++++++++ net/ipv4/tcp_diag.c | 802 ++++ net/ipv4/tcp_input.c | 4959 +++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 2663 ++++++++++++ net/ipv4/tcp_minisocks.c | 1077 +++++ net/ipv4/tcp_output.c | 1739 ++++++++ net/ipv4/tcp_timer.c | 656 +++ net/ipv4/udp.c | 1575 +++++++ net/ipv4/utils.c | 59 + net/ipv4/xfrm4_input.c | 160 + net/ipv4/xfrm4_output.c | 141 + net/ipv4/xfrm4_policy.c | 281 ++ net/ipv4/xfrm4_state.c | 126 + net/ipv4/xfrm4_tunnel.c | 144 + net/ipv6/Kconfig | 79 + net/ipv6/Makefile | 25 + net/ipv6/addrconf.c | 3615 ++++++++++++++++ net/ipv6/af_inet6.c | 867 ++++ net/ipv6/ah6.c | 478 +++ net/ipv6/anycast.c | 594 +++ net/ipv6/datagram.c | 600 +++ net/ipv6/esp6.c | 424 ++ net/ipv6/exthdrs.c | 575 +++ net/ipv6/exthdrs_core.c | 109 + net/ipv6/icmp.c | 822 ++++ net/ipv6/ip6_fib.c | 1225 ++++++ net/ipv6/ip6_flowlabel.c | 706 +++ net/ipv6/ip6_input.c | 269 ++ net/ipv6/ip6_output.c | 1197 ++++++ net/ipv6/ip6_tunnel.c | 1163 +++++ net/ipv6/ipcomp6.c | 524 +++ net/ipv6/ipv6_sockglue.c | 704 +++ net/ipv6/ipv6_syms.c | 41 + net/ipv6/mcast.c | 2499 +++++++++++ net/ipv6/ndisc.c | 1690 ++++++++ net/ipv6/netfilter/Kconfig | 242 ++ net/ipv6/netfilter/Makefile | 26 + net/ipv6/netfilter/ip6_queue.c | 741 ++++ net/ipv6/netfilter/ip6_tables.c | 1970 +++++++++ net/ipv6/netfilter/ip6t_LOG.c | 509 +++ net/ipv6/netfilter/ip6t_MARK.c | 78 + net/ipv6/netfilter/ip6t_ah.c | 208 + net/ipv6/netfilter/ip6t_dst.c | 298 ++ net/ipv6/netfilter/ip6t_esp.c | 181 + net/ipv6/netfilter/ip6t_eui64.c | 101 + net/ipv6/netfilter/ip6t_frag.c | 229 + net/ipv6/netfilter/ip6t_hbh.c | 298 ++ net/ipv6/netfilter/ip6t_hl.c | 80 + net/ipv6/netfilter/ip6t_ipv6header.c | 167 + net/ipv6/netfilter/ip6t_length.c | 66 + net/ipv6/netfilter/ip6t_limit.c | 147 + net/ipv6/netfilter/ip6t_mac.c | 80 + net/ipv6/netfilter/ip6t_mark.c | 66 + net/ipv6/netfilter/ip6t_multiport.c | 125 + net/ipv6/netfilter/ip6t_owner.c | 174 + net/ipv6/netfilter/ip6t_physdev.c | 135 + 
net/ipv6/netfilter/ip6t_rt.c | 301 ++ net/ipv6/netfilter/ip6table_filter.c | 214 + net/ipv6/netfilter/ip6table_mangle.c | 287 ++ net/ipv6/netfilter/ip6table_raw.c | 182 + net/ipv6/proc.c | 303 ++ net/ipv6/protocol.c | 86 + net/ipv6/raw.c | 1157 +++++ net/ipv6/reassembly.c | 771 ++++ net/ipv6/route.c | 2131 +++++++++ net/ipv6/sit.c | 833 ++++ net/ipv6/sysctl_net_ipv6.c | 125 + net/ipv6/tcp_ipv6.c | 2265 ++++++++++ net/ipv6/udp.c | 1075 +++++ net/ipv6/xfrm6_input.c | 150 + net/ipv6/xfrm6_output.c | 143 + net/ipv6/xfrm6_policy.c | 295 ++ net/ipv6/xfrm6_state.c | 136 + net/ipv6/xfrm6_tunnel.c | 543 +++ net/ipx/ChangeLog | 101 + net/ipx/Kconfig | 31 + net/ipx/Makefile | 8 + net/ipx/af_ipx.c | 2024 +++++++++ net/ipx/ipx_proc.c | 408 ++ net/ipx/ipx_route.c | 293 ++ net/ipx/sysctl_net_ipx.c | 62 + net/irda/Kconfig | 96 + net/irda/Makefile | 15 + net/irda/af_irda.c | 2586 +++++++++++ net/irda/discovery.c | 419 ++ net/irda/ircomm/Kconfig | 12 + net/irda/ircomm/Makefile | 8 + net/irda/ircomm/ircomm_core.c | 587 +++ net/irda/ircomm/ircomm_event.c | 251 ++ net/irda/ircomm/ircomm_lmp.c | 372 ++ net/irda/ircomm/ircomm_param.c | 511 +++ net/irda/ircomm/ircomm_ttp.c | 369 ++ net/irda/ircomm/ircomm_tty.c | 1405 ++++++ net/irda/ircomm/ircomm_tty_attach.c | 1006 +++++ net/irda/ircomm/ircomm_tty_ioctl.c | 428 ++ net/irda/irda_device.c | 489 +++ net/irda/iriap.c | 1089 +++++ net/irda/iriap_event.c | 502 +++ net/irda/irias_object.c | 580 +++ net/irda/irlan/Kconfig | 14 + net/irda/irlan/Makefile | 7 + net/irda/irlan/irlan_client.c | 576 +++ net/irda/irlan/irlan_client_event.c | 533 +++ net/irda/irlan/irlan_common.c | 1200 ++++++ net/irda/irlan/irlan_eth.c | 387 ++ net/irda/irlan/irlan_event.c | 60 + net/irda/irlan/irlan_filter.c | 246 ++ net/irda/irlan/irlan_provider.c | 413 ++ net/irda/irlan/irlan_provider_event.c | 241 ++ net/irda/irlap.c | 1258 ++++++ net/irda/irlap_event.c | 2334 ++++++++++ net/irda/irlap_frame.c | 1437 +++++++ net/irda/irlmp.c | 2041 +++++++++ net/irda/irlmp_event.c | 912 ++++ net/irda/irlmp_frame.c | 491 +++ net/irda/irmod.c | 185 + net/irda/irnet/Kconfig | 13 + net/irda/irnet/Makefile | 7 + net/irda/irnet/irnet.h | 529 +++ net/irda/irnet/irnet_irda.c | 1866 ++++++++ net/irda/irnet/irnet_irda.h | 186 + net/irda/irnet/irnet_ppp.c | 1142 +++++ net/irda/irnet/irnet_ppp.h | 119 + net/irda/irproc.c | 100 + net/irda/irqueue.c | 915 ++++ net/irda/irsysctl.c | 297 ++ net/irda/irttp.c | 1912 +++++++++ net/irda/parameters.c | 589 +++ net/irda/qos.c | 774 ++++ net/irda/timer.c | 233 + net/irda/wrapper.c | 491 +++ net/key/Makefile | 5 + net/key/af_key.c | 2903 +++++++++++++ net/lapb/Makefile | 7 + net/lapb/lapb_iface.c | 449 ++ net/lapb/lapb_in.c | 724 ++++ net/lapb/lapb_out.c | 224 + net/lapb/lapb_subr.c | 313 ++ net/lapb/lapb_timer.c | 189 + net/llc/Kconfig | 10 + net/llc/Makefile | 24 + net/llc/af_llc.c | 1079 +++++ net/llc/llc_c_ac.c | 1514 +++++++ net/llc/llc_c_ev.c | 769 ++++ net/llc/llc_c_st.c | 4946 +++++++++++++++++++++ net/llc/llc_conn.c | 915 ++++ net/llc/llc_core.c | 179 + net/llc/llc_if.c | 157 + net/llc/llc_input.c | 189 + net/llc/llc_output.c | 107 + net/llc/llc_output.h | 20 + net/llc/llc_pdu.c | 372 ++ net/llc/llc_proc.c | 267 ++ net/llc/llc_s_ac.c | 205 + net/llc/llc_s_ev.c | 115 + net/llc/llc_s_st.c | 183 + net/llc/llc_sap.c | 316 ++ net/llc/llc_station.c | 713 +++ net/netlink/Makefile | 5 + net/netlink/af_netlink.c | 1454 +++++++ net/netrom/Makefile | 9 + net/netrom/af_netrom.c | 1485 +++++++ net/netrom/nr_dev.c | 220 + net/netrom/nr_in.c | 290 ++ net/netrom/nr_loopback.c | 76 + 
net/netrom/nr_out.c | 274 ++ net/netrom/nr_route.c | 1041 +++++ net/netrom/nr_subr.c | 283 ++ net/netrom/nr_timer.c | 260 ++ net/netrom/sysctl_net_netrom.c | 189 + net/nonet.c | 30 + net/packet/Makefile | 5 + net/packet/af_packet.c | 1907 +++++++++ net/rose/Makefile | 9 + net/rose/af_rose.c | 1589 +++++++ net/rose/rose_dev.c | 154 + net/rose/rose_in.c | 297 ++ net/rose/rose_link.c | 288 ++ net/rose/rose_loopback.c | 111 + net/rose/rose_out.c | 126 + net/rose/rose_route.c | 1343 ++++++ net/rose/rose_subr.c | 519 +++ net/rose/rose_timer.c | 216 + net/rose/sysctl_net_rose.c | 169 + net/rxrpc/Makefile | 25 + net/rxrpc/call.c | 2278 ++++++++++ net/rxrpc/connection.c | 778 ++++ net/rxrpc/internal.h | 106 + net/rxrpc/krxiod.c | 261 ++ net/rxrpc/krxsecd.c | 270 ++ net/rxrpc/krxtimod.c | 203 + net/rxrpc/main.c | 180 + net/rxrpc/peer.c | 399 ++ net/rxrpc/proc.c | 617 +++ net/rxrpc/rxrpc_syms.c | 35 + net/rxrpc/sysctl.c | 122 + net/rxrpc/transport.c | 854 ++++ net/sched/Kconfig | 508 +++ net/sched/Makefile | 41 + net/sched/act_api.c | 894 ++++ net/sched/cls_api.c | 642 +++ net/sched/cls_basic.c | 303 ++ net/sched/cls_fw.c | 378 ++ net/sched/cls_route.c | 639 +++ net/sched/cls_rsvp.c | 43 + net/sched/cls_rsvp.h | 667 +++ net/sched/cls_rsvp6.c | 44 + net/sched/cls_tcindex.c | 537 +++ net/sched/cls_u32.c | 828 ++++ net/sched/em_cmp.c | 101 + net/sched/em_meta.c | 661 +++ net/sched/em_nbyte.c | 82 + net/sched/em_u32.c | 63 + net/sched/ematch.c | 524 +++ net/sched/estimator.c | 197 + net/sched/gact.c | 231 + net/sched/ipt.c | 326 ++ net/sched/mirred.c | 276 ++ net/sched/pedit.c | 288 ++ net/sched/police.c | 612 +++ net/sched/sch_api.c | 1296 ++++++ net/sched/sch_atm.c | 735 ++++ net/sched/sch_cbq.c | 2124 +++++++++ net/sched/sch_dsmark.c | 479 +++ net/sched/sch_fifo.c | 212 + net/sched/sch_generic.c | 609 +++ net/sched/sch_gred.c | 630 +++ net/sched/sch_hfsc.c | 1822 ++++++++ net/sched/sch_htb.c | 1759 ++++++++ net/sched/sch_ingress.c | 436 ++ net/sched/sch_netem.c | 598 +++ net/sched/sch_prio.c | 444 ++ net/sched/sch_red.c | 459 ++ net/sched/sch_sfq.c | 497 +++ net/sched/sch_tbf.c | 543 +++ net/sched/sch_teql.c | 511 +++ net/sctp/Kconfig | 89 + net/sctp/Makefile | 17 + net/sctp/associola.c | 1205 ++++++ net/sctp/bind_addr.c | 417 ++ net/sctp/chunk.c | 309 ++ net/sctp/command.c | 81 + net/sctp/crc32c.c | 220 + net/sctp/debug.c | 191 + net/sctp/endpointola.c | 389 ++ net/sctp/input.c | 913 ++++ net/sctp/inqueue.c | 204 + net/sctp/ipv6.c | 1013 +++++ net/sctp/objcnt.c | 140 + net/sctp/output.c | 646 +++ net/sctp/outqueue.c | 1734 ++++++++ net/sctp/primitive.c | 219 + net/sctp/proc.c | 288 ++ net/sctp/protocol.c | 1240 ++++++ net/sctp/sm_make_chunk.c | 2766 ++++++++++++ net/sctp/sm_sideeffect.c | 1395 ++++++ net/sctp/sm_statefuns.c | 5238 +++++++++++++++++++++++ net/sctp/sm_statetable.c | 1004 +++++ net/sctp/socket.c | 4797 +++++++++++++++++++++ net/sctp/ssnmap.c | 131 + net/sctp/sysctl.c | 251 ++ net/sctp/transport.c | 514 +++ net/sctp/tsnmap.c | 417 ++ net/sctp/ulpevent.c | 942 ++++ net/sctp/ulpqueue.c | 864 ++++ net/socket.c | 2088 +++++++++ net/sunrpc/Makefile | 15 + net/sunrpc/auth.c | 395 ++ net/sunrpc/auth_gss/Makefile | 18 + net/sunrpc/auth_gss/auth_gss.c | 1152 +++++ net/sunrpc/auth_gss/gss_generic_token.c | 235 + net/sunrpc/auth_gss/gss_krb5_crypto.c | 209 + net/sunrpc/auth_gss/gss_krb5_mech.c | 275 ++ net/sunrpc/auth_gss/gss_krb5_seal.c | 176 + net/sunrpc/auth_gss/gss_krb5_seqnum.c | 88 + net/sunrpc/auth_gss/gss_krb5_unseal.c | 202 + net/sunrpc/auth_gss/gss_mech_switch.c | 301 ++ 
net/sunrpc/auth_gss/gss_spkm3_mech.c | 300 ++ net/sunrpc/auth_gss/gss_spkm3_seal.c | 132 + net/sunrpc/auth_gss/gss_spkm3_token.c | 266 ++ net/sunrpc/auth_gss/gss_spkm3_unseal.c | 128 + net/sunrpc/auth_gss/svcauth_gss.c | 1080 +++++ net/sunrpc/auth_null.c | 143 + net/sunrpc/auth_unix.c | 242 ++ net/sunrpc/cache.c | 1189 +++++ net/sunrpc/clnt.c | 1085 +++++ net/sunrpc/pmap_clnt.c | 298 ++ net/sunrpc/rpc_pipe.c | 838 ++++ net/sunrpc/sched.c | 1119 +++++ net/sunrpc/stats.c | 175 + net/sunrpc/sunrpc_syms.c | 185 + net/sunrpc/svc.c | 490 +++ net/sunrpc/svcauth.c | 216 + net/sunrpc/svcauth_unix.c | 502 +++ net/sunrpc/svcsock.c | 1585 +++++++ net/sunrpc/sysctl.c | 193 + net/sunrpc/timer.c | 107 + net/sunrpc/xdr.c | 917 ++++ net/sunrpc/xprt.c | 1678 ++++++++ net/sysctl_net.c | 65 + net/unix/Makefile | 8 + net/unix/af_unix.c | 2098 +++++++++ net/unix/garbage.c | 312 ++ net/unix/sysctl_net_unix.c | 60 + net/wanrouter/Makefile | 7 + net/wanrouter/af_wanpipe.c | 2611 +++++++++++ net/wanrouter/patchlevel | 1 + net/wanrouter/wanmain.c | 888 ++++ net/wanrouter/wanproc.c | 381 ++ net/x25/Makefile | 10 + net/x25/af_x25.c | 1435 +++++++ net/x25/sysctl_net_x25.c | 107 + net/x25/x25_dev.c | 207 + net/x25/x25_facilities.c | 231 + net/x25/x25_in.c | 361 ++ net/x25/x25_link.c | 401 ++ net/x25/x25_out.c | 226 + net/x25/x25_proc.c | 256 ++ net/x25/x25_route.c | 221 + net/x25/x25_subr.c | 374 ++ net/x25/x25_timer.c | 176 + net/xfrm/Kconfig | 12 + net/xfrm/Makefile | 7 + net/xfrm/xfrm_algo.c | 729 ++++ net/xfrm/xfrm_input.c | 89 + net/xfrm/xfrm_policy.c | 1367 ++++++ net/xfrm/xfrm_state.c | 1037 +++++ net/xfrm/xfrm_user.c | 1253 ++++++ 668 files changed, 356308 insertions(+) create mode 100644 net/802/Makefile create mode 100644 net/802/fc.c create mode 100644 net/802/fddi.c create mode 100644 net/802/hippi.c create mode 100644 net/802/p8022.c create mode 100644 net/802/p8023.c create mode 100644 net/802/psnap.c create mode 100644 net/802/sysctl_net_802.c create mode 100644 net/802/tr.c create mode 100644 net/8021q/Makefile create mode 100644 net/8021q/vlan.c create mode 100644 net/8021q/vlan.h create mode 100644 net/8021q/vlan_dev.c create mode 100644 net/8021q/vlanproc.c create mode 100644 net/8021q/vlanproc.h create mode 100644 net/Kconfig create mode 100644 net/Makefile create mode 100644 net/TUNABLE create mode 100644 net/appletalk/Makefile create mode 100644 net/appletalk/aarp.c create mode 100644 net/appletalk/atalk_proc.c create mode 100644 net/appletalk/ddp.c create mode 100644 net/appletalk/dev.c create mode 100644 net/appletalk/sysctl_net_atalk.c create mode 100644 net/atm/Makefile create mode 100644 net/atm/addr.c create mode 100644 net/atm/addr.h create mode 100644 net/atm/atm_misc.c create mode 100644 net/atm/br2684.c create mode 100644 net/atm/clip.c create mode 100644 net/atm/common.c create mode 100644 net/atm/common.h create mode 100644 net/atm/ioctl.c create mode 100644 net/atm/ipcommon.c create mode 100644 net/atm/ipcommon.h create mode 100644 net/atm/lec.c create mode 100644 net/atm/lec.h create mode 100644 net/atm/lec_arpc.h create mode 100644 net/atm/mpc.c create mode 100644 net/atm/mpc.h create mode 100644 net/atm/mpoa_caches.c create mode 100644 net/atm/mpoa_caches.h create mode 100644 net/atm/mpoa_proc.c create mode 100644 net/atm/pppoatm.c create mode 100644 net/atm/proc.c create mode 100644 net/atm/protocols.h create mode 100644 net/atm/pvc.c create mode 100644 net/atm/raw.c create mode 100644 net/atm/resources.c create mode 100644 net/atm/resources.h create mode 100644 net/atm/signaling.c 
create mode 100644 net/atm/signaling.h create mode 100644 net/atm/svc.c create mode 100644 net/ax25/Kconfig create mode 100644 net/ax25/Makefile create mode 100644 net/ax25/TODO create mode 100644 net/ax25/af_ax25.c create mode 100644 net/ax25/ax25_addr.c create mode 100644 net/ax25/ax25_dev.c create mode 100644 net/ax25/ax25_ds_in.c create mode 100644 net/ax25/ax25_ds_subr.c create mode 100644 net/ax25/ax25_ds_timer.c create mode 100644 net/ax25/ax25_iface.c create mode 100644 net/ax25/ax25_in.c create mode 100644 net/ax25/ax25_ip.c create mode 100644 net/ax25/ax25_out.c create mode 100644 net/ax25/ax25_route.c create mode 100644 net/ax25/ax25_std_in.c create mode 100644 net/ax25/ax25_std_subr.c create mode 100644 net/ax25/ax25_std_timer.c create mode 100644 net/ax25/ax25_subr.c create mode 100644 net/ax25/ax25_timer.c create mode 100644 net/ax25/ax25_uid.c create mode 100644 net/ax25/sysctl_net_ax25.c create mode 100644 net/bluetooth/Kconfig create mode 100644 net/bluetooth/Makefile create mode 100644 net/bluetooth/af_bluetooth.c create mode 100644 net/bluetooth/bnep/Kconfig create mode 100644 net/bluetooth/bnep/Makefile create mode 100644 net/bluetooth/bnep/bnep.h create mode 100644 net/bluetooth/bnep/core.c create mode 100644 net/bluetooth/bnep/netdev.c create mode 100644 net/bluetooth/bnep/sock.c create mode 100644 net/bluetooth/cmtp/Kconfig create mode 100644 net/bluetooth/cmtp/Makefile create mode 100644 net/bluetooth/cmtp/capi.c create mode 100644 net/bluetooth/cmtp/cmtp.h create mode 100644 net/bluetooth/cmtp/core.c create mode 100644 net/bluetooth/cmtp/sock.c create mode 100644 net/bluetooth/hci_conn.c create mode 100644 net/bluetooth/hci_core.c create mode 100644 net/bluetooth/hci_event.c create mode 100644 net/bluetooth/hci_sock.c create mode 100644 net/bluetooth/hci_sysfs.c create mode 100644 net/bluetooth/hidp/Kconfig create mode 100644 net/bluetooth/hidp/Makefile create mode 100644 net/bluetooth/hidp/core.c create mode 100644 net/bluetooth/hidp/hidp.h create mode 100644 net/bluetooth/hidp/sock.c create mode 100644 net/bluetooth/l2cap.c create mode 100644 net/bluetooth/lib.c create mode 100644 net/bluetooth/rfcomm/Kconfig create mode 100644 net/bluetooth/rfcomm/Makefile create mode 100644 net/bluetooth/rfcomm/core.c create mode 100644 net/bluetooth/rfcomm/crc.c create mode 100644 net/bluetooth/rfcomm/sock.c create mode 100644 net/bluetooth/rfcomm/tty.c create mode 100644 net/bluetooth/sco.c create mode 100644 net/bridge/Makefile create mode 100644 net/bridge/br.c create mode 100644 net/bridge/br_device.c create mode 100644 net/bridge/br_fdb.c create mode 100644 net/bridge/br_forward.c create mode 100644 net/bridge/br_if.c create mode 100644 net/bridge/br_input.c create mode 100644 net/bridge/br_ioctl.c create mode 100644 net/bridge/br_netfilter.c create mode 100644 net/bridge/br_notify.c create mode 100644 net/bridge/br_private.h create mode 100644 net/bridge/br_private_stp.h create mode 100644 net/bridge/br_stp.c create mode 100644 net/bridge/br_stp_bpdu.c create mode 100644 net/bridge/br_stp_if.c create mode 100644 net/bridge/br_stp_timer.c create mode 100644 net/bridge/br_sysfs_br.c create mode 100644 net/bridge/br_sysfs_if.c create mode 100644 net/bridge/netfilter/Kconfig create mode 100644 net/bridge/netfilter/Makefile create mode 100644 net/bridge/netfilter/ebt_802_3.c create mode 100644 net/bridge/netfilter/ebt_among.c create mode 100644 net/bridge/netfilter/ebt_arp.c create mode 100644 net/bridge/netfilter/ebt_arpreply.c create mode 100644 
net/bridge/netfilter/ebt_dnat.c create mode 100644 net/bridge/netfilter/ebt_ip.c create mode 100644 net/bridge/netfilter/ebt_limit.c create mode 100644 net/bridge/netfilter/ebt_log.c create mode 100644 net/bridge/netfilter/ebt_mark.c create mode 100644 net/bridge/netfilter/ebt_mark_m.c create mode 100644 net/bridge/netfilter/ebt_pkttype.c create mode 100644 net/bridge/netfilter/ebt_redirect.c create mode 100644 net/bridge/netfilter/ebt_snat.c create mode 100644 net/bridge/netfilter/ebt_stp.c create mode 100644 net/bridge/netfilter/ebt_ulog.c create mode 100644 net/bridge/netfilter/ebt_vlan.c create mode 100644 net/bridge/netfilter/ebtable_broute.c create mode 100644 net/bridge/netfilter/ebtable_filter.c create mode 100644 net/bridge/netfilter/ebtable_nat.c create mode 100644 net/bridge/netfilter/ebtables.c create mode 100644 net/compat.c create mode 100644 net/core/Makefile create mode 100644 net/core/datagram.c create mode 100644 net/core/dev.c create mode 100644 net/core/dev_mcast.c create mode 100644 net/core/dst.c create mode 100644 net/core/dv.c create mode 100644 net/core/ethtool.c create mode 100644 net/core/filter.c create mode 100644 net/core/flow.c create mode 100644 net/core/gen_estimator.c create mode 100644 net/core/gen_stats.c create mode 100644 net/core/iovec.c create mode 100644 net/core/link_watch.c create mode 100644 net/core/neighbour.c create mode 100644 net/core/net-sysfs.c create mode 100644 net/core/netfilter.c create mode 100644 net/core/netpoll.c create mode 100644 net/core/pktgen.c create mode 100644 net/core/rtnetlink.c create mode 100644 net/core/scm.c create mode 100644 net/core/skbuff.c create mode 100644 net/core/sock.c create mode 100644 net/core/stream.c create mode 100644 net/core/sysctl_net_core.c create mode 100644 net/core/utils.c create mode 100644 net/core/wireless.c create mode 100644 net/decnet/Kconfig create mode 100644 net/decnet/Makefile create mode 100644 net/decnet/README create mode 100644 net/decnet/TODO create mode 100644 net/decnet/af_decnet.c create mode 100644 net/decnet/dn_dev.c create mode 100644 net/decnet/dn_fib.c create mode 100644 net/decnet/dn_neigh.c create mode 100644 net/decnet/dn_nsp_in.c create mode 100644 net/decnet/dn_nsp_out.c create mode 100644 net/decnet/dn_route.c create mode 100644 net/decnet/dn_rules.c create mode 100644 net/decnet/dn_table.c create mode 100644 net/decnet/dn_timer.c create mode 100644 net/decnet/netfilter/Kconfig create mode 100644 net/decnet/netfilter/Makefile create mode 100644 net/decnet/netfilter/dn_rtmsg.c create mode 100644 net/decnet/sysctl_net_decnet.c create mode 100644 net/econet/Makefile create mode 100644 net/econet/af_econet.c create mode 100644 net/ethernet/Makefile create mode 100644 net/ethernet/eth.c create mode 100644 net/ethernet/pe2.c create mode 100644 net/ethernet/sysctl_net_ether.c create mode 100644 net/ipv4/Kconfig create mode 100644 net/ipv4/Makefile create mode 100644 net/ipv4/af_inet.c create mode 100644 net/ipv4/ah4.c create mode 100644 net/ipv4/arp.c create mode 100644 net/ipv4/datagram.c create mode 100644 net/ipv4/devinet.c create mode 100644 net/ipv4/esp4.c create mode 100644 net/ipv4/fib_frontend.c create mode 100644 net/ipv4/fib_hash.c create mode 100644 net/ipv4/fib_lookup.h create mode 100644 net/ipv4/fib_rules.c create mode 100644 net/ipv4/fib_semantics.c create mode 100644 net/ipv4/icmp.c create mode 100644 net/ipv4/igmp.c create mode 100644 net/ipv4/inetpeer.c create mode 100644 net/ipv4/ip_forward.c create mode 100644 net/ipv4/ip_fragment.c create mode 100644 
net/ipv4/ip_gre.c create mode 100644 net/ipv4/ip_input.c create mode 100644 net/ipv4/ip_options.c create mode 100644 net/ipv4/ip_output.c create mode 100644 net/ipv4/ip_sockglue.c create mode 100644 net/ipv4/ipcomp.c create mode 100644 net/ipv4/ipconfig.c create mode 100644 net/ipv4/ipip.c create mode 100644 net/ipv4/ipmr.c create mode 100644 net/ipv4/ipvs/Kconfig create mode 100644 net/ipv4/ipvs/Makefile create mode 100644 net/ipv4/ipvs/ip_vs_app.c create mode 100644 net/ipv4/ipvs/ip_vs_conn.c create mode 100644 net/ipv4/ipvs/ip_vs_core.c create mode 100644 net/ipv4/ipvs/ip_vs_ctl.c create mode 100644 net/ipv4/ipvs/ip_vs_dh.c create mode 100644 net/ipv4/ipvs/ip_vs_est.c create mode 100644 net/ipv4/ipvs/ip_vs_ftp.c create mode 100644 net/ipv4/ipvs/ip_vs_lblc.c create mode 100644 net/ipv4/ipvs/ip_vs_lblcr.c create mode 100644 net/ipv4/ipvs/ip_vs_lc.c create mode 100644 net/ipv4/ipvs/ip_vs_nq.c create mode 100644 net/ipv4/ipvs/ip_vs_proto.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_ah.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_esp.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_icmp.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_tcp.c create mode 100644 net/ipv4/ipvs/ip_vs_proto_udp.c create mode 100644 net/ipv4/ipvs/ip_vs_rr.c create mode 100644 net/ipv4/ipvs/ip_vs_sched.c create mode 100644 net/ipv4/ipvs/ip_vs_sed.c create mode 100644 net/ipv4/ipvs/ip_vs_sh.c create mode 100644 net/ipv4/ipvs/ip_vs_sync.c create mode 100644 net/ipv4/ipvs/ip_vs_wlc.c create mode 100644 net/ipv4/ipvs/ip_vs_wrr.c create mode 100644 net/ipv4/ipvs/ip_vs_xmit.c create mode 100644 net/ipv4/multipath.c create mode 100644 net/ipv4/multipath_drr.c create mode 100644 net/ipv4/multipath_random.c create mode 100644 net/ipv4/multipath_rr.c create mode 100644 net/ipv4/multipath_wrandom.c create mode 100644 net/ipv4/netfilter/Kconfig create mode 100644 net/ipv4/netfilter/Makefile create mode 100644 net/ipv4/netfilter/arp_tables.c create mode 100644 net/ipv4/netfilter/arpt_mangle.c create mode 100644 net/ipv4/netfilter/arptable_filter.c create mode 100644 net/ipv4/netfilter/ip_conntrack_amanda.c create mode 100644 net/ipv4/netfilter/ip_conntrack_core.c create mode 100644 net/ipv4/netfilter/ip_conntrack_ftp.c create mode 100644 net/ipv4/netfilter/ip_conntrack_irc.c create mode 100644 net/ipv4/netfilter/ip_conntrack_proto_generic.c create mode 100644 net/ipv4/netfilter/ip_conntrack_proto_icmp.c create mode 100644 net/ipv4/netfilter/ip_conntrack_proto_sctp.c create mode 100644 net/ipv4/netfilter/ip_conntrack_proto_tcp.c create mode 100644 net/ipv4/netfilter/ip_conntrack_proto_udp.c create mode 100644 net/ipv4/netfilter/ip_conntrack_standalone.c create mode 100644 net/ipv4/netfilter/ip_conntrack_tftp.c create mode 100644 net/ipv4/netfilter/ip_nat_amanda.c create mode 100644 net/ipv4/netfilter/ip_nat_core.c create mode 100644 net/ipv4/netfilter/ip_nat_ftp.c create mode 100644 net/ipv4/netfilter/ip_nat_helper.c create mode 100644 net/ipv4/netfilter/ip_nat_irc.c create mode 100644 net/ipv4/netfilter/ip_nat_proto_icmp.c create mode 100644 net/ipv4/netfilter/ip_nat_proto_tcp.c create mode 100644 net/ipv4/netfilter/ip_nat_proto_udp.c create mode 100644 net/ipv4/netfilter/ip_nat_proto_unknown.c create mode 100644 net/ipv4/netfilter/ip_nat_rule.c create mode 100644 net/ipv4/netfilter/ip_nat_snmp_basic.c create mode 100644 net/ipv4/netfilter/ip_nat_standalone.c create mode 100644 net/ipv4/netfilter/ip_nat_tftp.c create mode 100644 net/ipv4/netfilter/ip_queue.c create mode 100644 net/ipv4/netfilter/ip_tables.c create mode 100644 
net/ipv4/netfilter/ipt_CLASSIFY.c create mode 100644 net/ipv4/netfilter/ipt_CLUSTERIP.c create mode 100644 net/ipv4/netfilter/ipt_CONNMARK.c create mode 100644 net/ipv4/netfilter/ipt_DSCP.c create mode 100644 net/ipv4/netfilter/ipt_ECN.c create mode 100644 net/ipv4/netfilter/ipt_LOG.c create mode 100644 net/ipv4/netfilter/ipt_MARK.c create mode 100644 net/ipv4/netfilter/ipt_MASQUERADE.c create mode 100644 net/ipv4/netfilter/ipt_NETMAP.c create mode 100644 net/ipv4/netfilter/ipt_NOTRACK.c create mode 100644 net/ipv4/netfilter/ipt_REDIRECT.c create mode 100644 net/ipv4/netfilter/ipt_REJECT.c create mode 100644 net/ipv4/netfilter/ipt_SAME.c create mode 100644 net/ipv4/netfilter/ipt_TCPMSS.c create mode 100644 net/ipv4/netfilter/ipt_TOS.c create mode 100644 net/ipv4/netfilter/ipt_ULOG.c create mode 100644 net/ipv4/netfilter/ipt_addrtype.c create mode 100644 net/ipv4/netfilter/ipt_ah.c create mode 100644 net/ipv4/netfilter/ipt_comment.c create mode 100644 net/ipv4/netfilter/ipt_connmark.c create mode 100644 net/ipv4/netfilter/ipt_conntrack.c create mode 100644 net/ipv4/netfilter/ipt_dscp.c create mode 100644 net/ipv4/netfilter/ipt_ecn.c create mode 100644 net/ipv4/netfilter/ipt_esp.c create mode 100644 net/ipv4/netfilter/ipt_hashlimit.c create mode 100644 net/ipv4/netfilter/ipt_helper.c create mode 100644 net/ipv4/netfilter/ipt_iprange.c create mode 100644 net/ipv4/netfilter/ipt_length.c create mode 100644 net/ipv4/netfilter/ipt_limit.c create mode 100644 net/ipv4/netfilter/ipt_mac.c create mode 100644 net/ipv4/netfilter/ipt_mark.c create mode 100644 net/ipv4/netfilter/ipt_multiport.c create mode 100644 net/ipv4/netfilter/ipt_owner.c create mode 100644 net/ipv4/netfilter/ipt_physdev.c create mode 100644 net/ipv4/netfilter/ipt_pkttype.c create mode 100644 net/ipv4/netfilter/ipt_realm.c create mode 100644 net/ipv4/netfilter/ipt_recent.c create mode 100644 net/ipv4/netfilter/ipt_sctp.c create mode 100644 net/ipv4/netfilter/ipt_state.c create mode 100644 net/ipv4/netfilter/ipt_tcpmss.c create mode 100644 net/ipv4/netfilter/ipt_tos.c create mode 100644 net/ipv4/netfilter/ipt_ttl.c create mode 100644 net/ipv4/netfilter/iptable_filter.c create mode 100644 net/ipv4/netfilter/iptable_mangle.c create mode 100644 net/ipv4/netfilter/iptable_raw.c create mode 100644 net/ipv4/proc.c create mode 100644 net/ipv4/protocol.c create mode 100644 net/ipv4/raw.c create mode 100644 net/ipv4/route.c create mode 100644 net/ipv4/syncookies.c create mode 100644 net/ipv4/sysctl_net_ipv4.c create mode 100644 net/ipv4/tcp.c create mode 100644 net/ipv4/tcp_diag.c create mode 100644 net/ipv4/tcp_input.c create mode 100644 net/ipv4/tcp_ipv4.c create mode 100644 net/ipv4/tcp_minisocks.c create mode 100644 net/ipv4/tcp_output.c create mode 100644 net/ipv4/tcp_timer.c create mode 100644 net/ipv4/udp.c create mode 100644 net/ipv4/utils.c create mode 100644 net/ipv4/xfrm4_input.c create mode 100644 net/ipv4/xfrm4_output.c create mode 100644 net/ipv4/xfrm4_policy.c create mode 100644 net/ipv4/xfrm4_state.c create mode 100644 net/ipv4/xfrm4_tunnel.c create mode 100644 net/ipv6/Kconfig create mode 100644 net/ipv6/Makefile create mode 100644 net/ipv6/addrconf.c create mode 100644 net/ipv6/af_inet6.c create mode 100644 net/ipv6/ah6.c create mode 100644 net/ipv6/anycast.c create mode 100644 net/ipv6/datagram.c create mode 100644 net/ipv6/esp6.c create mode 100644 net/ipv6/exthdrs.c create mode 100644 net/ipv6/exthdrs_core.c create mode 100644 net/ipv6/icmp.c create mode 100644 net/ipv6/ip6_fib.c create mode 100644 
net/ipv6/ip6_flowlabel.c create mode 100644 net/ipv6/ip6_input.c create mode 100644 net/ipv6/ip6_output.c create mode 100644 net/ipv6/ip6_tunnel.c create mode 100644 net/ipv6/ipcomp6.c create mode 100644 net/ipv6/ipv6_sockglue.c create mode 100644 net/ipv6/ipv6_syms.c create mode 100644 net/ipv6/mcast.c create mode 100644 net/ipv6/ndisc.c create mode 100644 net/ipv6/netfilter/Kconfig create mode 100644 net/ipv6/netfilter/Makefile create mode 100644 net/ipv6/netfilter/ip6_queue.c create mode 100644 net/ipv6/netfilter/ip6_tables.c create mode 100644 net/ipv6/netfilter/ip6t_LOG.c create mode 100644 net/ipv6/netfilter/ip6t_MARK.c create mode 100644 net/ipv6/netfilter/ip6t_ah.c create mode 100644 net/ipv6/netfilter/ip6t_dst.c create mode 100644 net/ipv6/netfilter/ip6t_esp.c create mode 100644 net/ipv6/netfilter/ip6t_eui64.c create mode 100644 net/ipv6/netfilter/ip6t_frag.c create mode 100644 net/ipv6/netfilter/ip6t_hbh.c create mode 100644 net/ipv6/netfilter/ip6t_hl.c create mode 100644 net/ipv6/netfilter/ip6t_ipv6header.c create mode 100644 net/ipv6/netfilter/ip6t_length.c create mode 100644 net/ipv6/netfilter/ip6t_limit.c create mode 100644 net/ipv6/netfilter/ip6t_mac.c create mode 100644 net/ipv6/netfilter/ip6t_mark.c create mode 100644 net/ipv6/netfilter/ip6t_multiport.c create mode 100644 net/ipv6/netfilter/ip6t_owner.c create mode 100644 net/ipv6/netfilter/ip6t_physdev.c create mode 100644 net/ipv6/netfilter/ip6t_rt.c create mode 100644 net/ipv6/netfilter/ip6table_filter.c create mode 100644 net/ipv6/netfilter/ip6table_mangle.c create mode 100644 net/ipv6/netfilter/ip6table_raw.c create mode 100644 net/ipv6/proc.c create mode 100644 net/ipv6/protocol.c create mode 100644 net/ipv6/raw.c create mode 100644 net/ipv6/reassembly.c create mode 100644 net/ipv6/route.c create mode 100644 net/ipv6/sit.c create mode 100644 net/ipv6/sysctl_net_ipv6.c create mode 100644 net/ipv6/tcp_ipv6.c create mode 100644 net/ipv6/udp.c create mode 100644 net/ipv6/xfrm6_input.c create mode 100644 net/ipv6/xfrm6_output.c create mode 100644 net/ipv6/xfrm6_policy.c create mode 100644 net/ipv6/xfrm6_state.c create mode 100644 net/ipv6/xfrm6_tunnel.c create mode 100644 net/ipx/ChangeLog create mode 100644 net/ipx/Kconfig create mode 100644 net/ipx/Makefile create mode 100644 net/ipx/af_ipx.c create mode 100644 net/ipx/ipx_proc.c create mode 100644 net/ipx/ipx_route.c create mode 100644 net/ipx/sysctl_net_ipx.c create mode 100644 net/irda/Kconfig create mode 100644 net/irda/Makefile create mode 100644 net/irda/af_irda.c create mode 100644 net/irda/discovery.c create mode 100644 net/irda/ircomm/Kconfig create mode 100644 net/irda/ircomm/Makefile create mode 100644 net/irda/ircomm/ircomm_core.c create mode 100644 net/irda/ircomm/ircomm_event.c create mode 100644 net/irda/ircomm/ircomm_lmp.c create mode 100644 net/irda/ircomm/ircomm_param.c create mode 100644 net/irda/ircomm/ircomm_ttp.c create mode 100644 net/irda/ircomm/ircomm_tty.c create mode 100644 net/irda/ircomm/ircomm_tty_attach.c create mode 100644 net/irda/ircomm/ircomm_tty_ioctl.c create mode 100644 net/irda/irda_device.c create mode 100644 net/irda/iriap.c create mode 100644 net/irda/iriap_event.c create mode 100644 net/irda/irias_object.c create mode 100644 net/irda/irlan/Kconfig create mode 100644 net/irda/irlan/Makefile create mode 100644 net/irda/irlan/irlan_client.c create mode 100644 net/irda/irlan/irlan_client_event.c create mode 100644 net/irda/irlan/irlan_common.c create mode 100644 net/irda/irlan/irlan_eth.c create mode 100644 
net/irda/irlan/irlan_event.c create mode 100644 net/irda/irlan/irlan_filter.c create mode 100644 net/irda/irlan/irlan_provider.c create mode 100644 net/irda/irlan/irlan_provider_event.c create mode 100644 net/irda/irlap.c create mode 100644 net/irda/irlap_event.c create mode 100644 net/irda/irlap_frame.c create mode 100644 net/irda/irlmp.c create mode 100644 net/irda/irlmp_event.c create mode 100644 net/irda/irlmp_frame.c create mode 100644 net/irda/irmod.c create mode 100644 net/irda/irnet/Kconfig create mode 100644 net/irda/irnet/Makefile create mode 100644 net/irda/irnet/irnet.h create mode 100644 net/irda/irnet/irnet_irda.c create mode 100644 net/irda/irnet/irnet_irda.h create mode 100644 net/irda/irnet/irnet_ppp.c create mode 100644 net/irda/irnet/irnet_ppp.h create mode 100644 net/irda/irproc.c create mode 100644 net/irda/irqueue.c create mode 100644 net/irda/irsysctl.c create mode 100644 net/irda/irttp.c create mode 100644 net/irda/parameters.c create mode 100644 net/irda/qos.c create mode 100644 net/irda/timer.c create mode 100644 net/irda/wrapper.c create mode 100644 net/key/Makefile create mode 100644 net/key/af_key.c create mode 100644 net/lapb/Makefile create mode 100644 net/lapb/lapb_iface.c create mode 100644 net/lapb/lapb_in.c create mode 100644 net/lapb/lapb_out.c create mode 100644 net/lapb/lapb_subr.c create mode 100644 net/lapb/lapb_timer.c create mode 100644 net/llc/Kconfig create mode 100644 net/llc/Makefile create mode 100644 net/llc/af_llc.c create mode 100644 net/llc/llc_c_ac.c create mode 100644 net/llc/llc_c_ev.c create mode 100644 net/llc/llc_c_st.c create mode 100644 net/llc/llc_conn.c create mode 100644 net/llc/llc_core.c create mode 100644 net/llc/llc_if.c create mode 100644 net/llc/llc_input.c create mode 100644 net/llc/llc_output.c create mode 100644 net/llc/llc_output.h create mode 100644 net/llc/llc_pdu.c create mode 100644 net/llc/llc_proc.c create mode 100644 net/llc/llc_s_ac.c create mode 100644 net/llc/llc_s_ev.c create mode 100644 net/llc/llc_s_st.c create mode 100644 net/llc/llc_sap.c create mode 100644 net/llc/llc_station.c create mode 100644 net/netlink/Makefile create mode 100644 net/netlink/af_netlink.c create mode 100644 net/netrom/Makefile create mode 100644 net/netrom/af_netrom.c create mode 100644 net/netrom/nr_dev.c create mode 100644 net/netrom/nr_in.c create mode 100644 net/netrom/nr_loopback.c create mode 100644 net/netrom/nr_out.c create mode 100644 net/netrom/nr_route.c create mode 100644 net/netrom/nr_subr.c create mode 100644 net/netrom/nr_timer.c create mode 100644 net/netrom/sysctl_net_netrom.c create mode 100644 net/nonet.c create mode 100644 net/packet/Makefile create mode 100644 net/packet/af_packet.c create mode 100644 net/rose/Makefile create mode 100644 net/rose/af_rose.c create mode 100644 net/rose/rose_dev.c create mode 100644 net/rose/rose_in.c create mode 100644 net/rose/rose_link.c create mode 100644 net/rose/rose_loopback.c create mode 100644 net/rose/rose_out.c create mode 100644 net/rose/rose_route.c create mode 100644 net/rose/rose_subr.c create mode 100644 net/rose/rose_timer.c create mode 100644 net/rose/sysctl_net_rose.c create mode 100644 net/rxrpc/Makefile create mode 100644 net/rxrpc/call.c create mode 100644 net/rxrpc/connection.c create mode 100644 net/rxrpc/internal.h create mode 100644 net/rxrpc/krxiod.c create mode 100644 net/rxrpc/krxsecd.c create mode 100644 net/rxrpc/krxtimod.c create mode 100644 net/rxrpc/main.c create mode 100644 net/rxrpc/peer.c create mode 100644 net/rxrpc/proc.c create mode 100644 
net/rxrpc/rxrpc_syms.c create mode 100644 net/rxrpc/sysctl.c create mode 100644 net/rxrpc/transport.c create mode 100644 net/sched/Kconfig create mode 100644 net/sched/Makefile create mode 100644 net/sched/act_api.c create mode 100644 net/sched/cls_api.c create mode 100644 net/sched/cls_basic.c create mode 100644 net/sched/cls_fw.c create mode 100644 net/sched/cls_route.c create mode 100644 net/sched/cls_rsvp.c create mode 100644 net/sched/cls_rsvp.h create mode 100644 net/sched/cls_rsvp6.c create mode 100644 net/sched/cls_tcindex.c create mode 100644 net/sched/cls_u32.c create mode 100644 net/sched/em_cmp.c create mode 100644 net/sched/em_meta.c create mode 100644 net/sched/em_nbyte.c create mode 100644 net/sched/em_u32.c create mode 100644 net/sched/ematch.c create mode 100644 net/sched/estimator.c create mode 100644 net/sched/gact.c create mode 100644 net/sched/ipt.c create mode 100644 net/sched/mirred.c create mode 100644 net/sched/pedit.c create mode 100644 net/sched/police.c create mode 100644 net/sched/sch_api.c create mode 100644 net/sched/sch_atm.c create mode 100644 net/sched/sch_cbq.c create mode 100644 net/sched/sch_dsmark.c create mode 100644 net/sched/sch_fifo.c create mode 100644 net/sched/sch_generic.c create mode 100644 net/sched/sch_gred.c create mode 100644 net/sched/sch_hfsc.c create mode 100644 net/sched/sch_htb.c create mode 100644 net/sched/sch_ingress.c create mode 100644 net/sched/sch_netem.c create mode 100644 net/sched/sch_prio.c create mode 100644 net/sched/sch_red.c create mode 100644 net/sched/sch_sfq.c create mode 100644 net/sched/sch_tbf.c create mode 100644 net/sched/sch_teql.c create mode 100644 net/sctp/Kconfig create mode 100644 net/sctp/Makefile create mode 100644 net/sctp/associola.c create mode 100644 net/sctp/bind_addr.c create mode 100644 net/sctp/chunk.c create mode 100644 net/sctp/command.c create mode 100644 net/sctp/crc32c.c create mode 100644 net/sctp/debug.c create mode 100644 net/sctp/endpointola.c create mode 100644 net/sctp/input.c create mode 100644 net/sctp/inqueue.c create mode 100644 net/sctp/ipv6.c create mode 100644 net/sctp/objcnt.c create mode 100644 net/sctp/output.c create mode 100644 net/sctp/outqueue.c create mode 100644 net/sctp/primitive.c create mode 100644 net/sctp/proc.c create mode 100644 net/sctp/protocol.c create mode 100644 net/sctp/sm_make_chunk.c create mode 100644 net/sctp/sm_sideeffect.c create mode 100644 net/sctp/sm_statefuns.c create mode 100644 net/sctp/sm_statetable.c create mode 100644 net/sctp/socket.c create mode 100644 net/sctp/ssnmap.c create mode 100644 net/sctp/sysctl.c create mode 100644 net/sctp/transport.c create mode 100644 net/sctp/tsnmap.c create mode 100644 net/sctp/ulpevent.c create mode 100644 net/sctp/ulpqueue.c create mode 100644 net/socket.c create mode 100644 net/sunrpc/Makefile create mode 100644 net/sunrpc/auth.c create mode 100644 net/sunrpc/auth_gss/Makefile create mode 100644 net/sunrpc/auth_gss/auth_gss.c create mode 100644 net/sunrpc/auth_gss/gss_generic_token.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_crypto.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_mech.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_seal.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_seqnum.c create mode 100644 net/sunrpc/auth_gss/gss_krb5_unseal.c create mode 100644 net/sunrpc/auth_gss/gss_mech_switch.c create mode 100644 net/sunrpc/auth_gss/gss_spkm3_mech.c create mode 100644 net/sunrpc/auth_gss/gss_spkm3_seal.c create mode 100644 net/sunrpc/auth_gss/gss_spkm3_token.c create mode 100644 
net/sunrpc/auth_gss/gss_spkm3_unseal.c
 create mode 100644 net/sunrpc/auth_gss/svcauth_gss.c
 create mode 100644 net/sunrpc/auth_null.c
 create mode 100644 net/sunrpc/auth_unix.c
 create mode 100644 net/sunrpc/cache.c
 create mode 100644 net/sunrpc/clnt.c
 create mode 100644 net/sunrpc/pmap_clnt.c
 create mode 100644 net/sunrpc/rpc_pipe.c
 create mode 100644 net/sunrpc/sched.c
 create mode 100644 net/sunrpc/stats.c
 create mode 100644 net/sunrpc/sunrpc_syms.c
 create mode 100644 net/sunrpc/svc.c
 create mode 100644 net/sunrpc/svcauth.c
 create mode 100644 net/sunrpc/svcauth_unix.c
 create mode 100644 net/sunrpc/svcsock.c
 create mode 100644 net/sunrpc/sysctl.c
 create mode 100644 net/sunrpc/timer.c
 create mode 100644 net/sunrpc/xdr.c
 create mode 100644 net/sunrpc/xprt.c
 create mode 100644 net/sysctl_net.c
 create mode 100644 net/unix/Makefile
 create mode 100644 net/unix/af_unix.c
 create mode 100644 net/unix/garbage.c
 create mode 100644 net/unix/sysctl_net_unix.c
 create mode 100644 net/wanrouter/Makefile
 create mode 100644 net/wanrouter/af_wanpipe.c
 create mode 100644 net/wanrouter/patchlevel
 create mode 100644 net/wanrouter/wanmain.c
 create mode 100644 net/wanrouter/wanproc.c
 create mode 100644 net/x25/Makefile
 create mode 100644 net/x25/af_x25.c
 create mode 100644 net/x25/sysctl_net_x25.c
 create mode 100644 net/x25/x25_dev.c
 create mode 100644 net/x25/x25_facilities.c
 create mode 100644 net/x25/x25_in.c
 create mode 100644 net/x25/x25_link.c
 create mode 100644 net/x25/x25_out.c
 create mode 100644 net/x25/x25_proc.c
 create mode 100644 net/x25/x25_route.c
 create mode 100644 net/x25/x25_subr.c
 create mode 100644 net/x25/x25_timer.c
 create mode 100644 net/xfrm/Kconfig
 create mode 100644 net/xfrm/Makefile
 create mode 100644 net/xfrm/xfrm_algo.c
 create mode 100644 net/xfrm/xfrm_input.c
 create mode 100644 net/xfrm/xfrm_policy.c
 create mode 100644 net/xfrm/xfrm_state.c
 create mode 100644 net/xfrm/xfrm_user.c

diff --git a/net/802/Makefile b/net/802/Makefile
new file mode 100644
index 000000000000..01861929591a
--- /dev/null
+++ b/net/802/Makefile
@@ -0,0 +1,15 @@
+#
+# Makefile for the Linux 802.x protocol layers.
+#
+
+obj-y := p8023.o
+
+# Check the p8022 selections against net/core/Makefile.
+obj-$(CONFIG_SYSCTL) += sysctl_net_802.o
+obj-$(CONFIG_LLC) += p8022.o psnap.o
+obj-$(CONFIG_TR) += p8022.o psnap.o tr.o sysctl_net_802.o
+obj-$(CONFIG_NET_FC) += fc.o
+obj-$(CONFIG_FDDI) += fddi.o
+obj-$(CONFIG_HIPPI) += hippi.o
+obj-$(CONFIG_IPX) += p8022.o psnap.o
+obj-$(CONFIG_ATALK) += p8022.o psnap.o
diff --git a/net/802/fc.c b/net/802/fc.c
new file mode 100644
index 000000000000..640d34e026c2
--- /dev/null
+++ b/net/802/fc.c
@@ -0,0 +1,130 @@
+/*
+ * NET3: Fibre Channel device handling subroutines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Vineet Abraham
+ * v 1.0 03/22/99
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Put the headers on a Fibre Channel packet.
+ */
+
+static int fc_header(struct sk_buff *skb, struct net_device *dev,
+                     unsigned short type,
+                     void *daddr, void *saddr, unsigned len)
+{
+        struct fch_hdr *fch;
+        int hdr_len;
+
+        /*
+         * Add the 802.2 SNAP header if IP as the IPv4 code calls
+         * dev->hard_header directly.
+         */
+        if (type == ETH_P_IP || type == ETH_P_ARP)
+        {
+                struct fcllc *fcllc;
+
+                hdr_len = sizeof(struct fch_hdr) + sizeof(struct fcllc);
+                fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+                fcllc = (struct fcllc *)(fch+1);
+                fcllc->dsap = fcllc->ssap = EXTENDED_SAP;
+                fcllc->llc = UI_CMD;
+                fcllc->protid[0] = fcllc->protid[1] = fcllc->protid[2] = 0x00;
+                fcllc->ethertype = htons(type);
+        }
+        else
+        {
+                hdr_len = sizeof(struct fch_hdr);
+                fch = (struct fch_hdr *)skb_push(skb, hdr_len);
+        }
+
+        if(saddr)
+                memcpy(fch->saddr,saddr,dev->addr_len);
+        else
+                memcpy(fch->saddr,dev->dev_addr,dev->addr_len);
+
+        if(daddr)
+        {
+                memcpy(fch->daddr,daddr,dev->addr_len);
+                return(hdr_len);
+        }
+        return -hdr_len;
+}
+
+/*
+ * A neighbour discovery of some species (eg arp) has completed. We
+ * can now send the packet.
+ */
+
+static int fc_rebuild_header(struct sk_buff *skb)
+{
+        struct fch_hdr *fch=(struct fch_hdr *)skb->data;
+        struct fcllc *fcllc=(struct fcllc *)(skb->data+sizeof(struct fch_hdr));
+        if(fcllc->ethertype != htons(ETH_P_IP)) {
+                printk("fc_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(fcllc->ethertype));
+                return 0;
+        }
+#ifdef CONFIG_INET
+        return arp_find(fch->daddr, skb);
+#else
+        return 0;
+#endif
+}
+
+static void fc_setup(struct net_device *dev)
+{
+        dev->hard_header = fc_header;
+        dev->rebuild_header = fc_rebuild_header;
+
+        dev->type = ARPHRD_IEEE802;
+        dev->hard_header_len = FC_HLEN;
+        dev->mtu = 2024;
+        dev->addr_len = FC_ALEN;
+        dev->tx_queue_len = 100; /* Long queues on fc */
+        dev->flags = IFF_BROADCAST;
+
+        memset(dev->broadcast, 0xFF, FC_ALEN);
+}
+
+/**
+ * alloc_fcdev - Register fibre channel device
+ * @sizeof_priv: Size of additional driver-private structure to be allocated
+ * for this fibre channel device
+ *
+ * Fill in the fields of the device structure with fibre channel-generic values.
+ *
+ * Constructs a new net device, complete with a private data area of
+ * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for
+ * this private data area.
+ */
+struct net_device *alloc_fcdev(int sizeof_priv)
+{
+        return alloc_netdev(sizeof_priv, "fc%d", fc_setup);
+}
+EXPORT_SYMBOL(alloc_fcdev);
diff --git a/net/802/fddi.c b/net/802/fddi.c
new file mode 100644
index 000000000000..f9a31a9f70f1
--- /dev/null
+++ b/net/802/fddi.c
@@ -0,0 +1,210 @@
+/*
+ * INET         An implementation of the TCP/IP protocol suite for the LINUX
+ *              operating system. INET is implemented using the BSD Socket
+ *              interface as the means of communication with the user level.
+ *
+ *              FDDI-type device handling.
+ *
+ * Version:     @(#)fddi.c 1.0.0 08/12/96
+ *
+ * Authors:     Lawrence V. Stefani,
+ *
+ *              fddi.c is based on previous eth.c and tr.c work by
+ *              Ross Biro,
+ *              Fred N. van Kempen,
+ *              Mark Evans,
+ *              Florian La Roche,
+ *              Alan Cox,
+ *
+ *              This program is free software; you can redistribute it and/or
+ *              modify it under the terms of the GNU General Public License
+ *              as published by the Free Software Foundation; either version
+ *              2 of the License, or (at your option) any later version.
+ *
+ * Changes
+ *              Alan Cox : New arp/rebuild header
+ *              Maciej W. Rozycki : IPv6 support
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Create the FDDI MAC header for an arbitrary protocol layer
+ *
+ * saddr=NULL means use device source address
+ * daddr=NULL means leave destination address (eg unresolved arp)
+ */
+
+static int fddi_header(struct sk_buff *skb, struct net_device *dev,
+                       unsigned short type,
+                       void *daddr, void *saddr, unsigned len)
+{
+        int hl = FDDI_K_SNAP_HLEN;
+        struct fddihdr *fddi;
+
+        if(type != ETH_P_IP && type != ETH_P_IPV6 && type != ETH_P_ARP)
+                hl=FDDI_K_8022_HLEN-3;
+        fddi = (struct fddihdr *)skb_push(skb, hl);
+        fddi->fc = FDDI_FC_K_ASYNC_LLC_DEF;
+        if(type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP)
+        {
+                fddi->hdr.llc_snap.dsap = FDDI_EXTENDED_SAP;
+                fddi->hdr.llc_snap.ssap = FDDI_EXTENDED_SAP;
+                fddi->hdr.llc_snap.ctrl = FDDI_UI_CMD;
+                fddi->hdr.llc_snap.oui[0] = 0x00;
+                fddi->hdr.llc_snap.oui[1] = 0x00;
+                fddi->hdr.llc_snap.oui[2] = 0x00;
+                fddi->hdr.llc_snap.ethertype = htons(type);
+        }
+
+        /* Set the source and destination hardware addresses */
+
+        if (saddr != NULL)
+                memcpy(fddi->saddr, saddr, dev->addr_len);
+        else
+                memcpy(fddi->saddr, dev->dev_addr, dev->addr_len);
+
+        if (daddr != NULL)
+        {
+                memcpy(fddi->daddr, daddr, dev->addr_len);
+                return(hl);
+        }
+
+        return(-hl);
+}
+
+
+/*
+ * Rebuild the FDDI MAC header. This is called after an ARP
+ * (or in future other address resolution) has completed on
+ * this sk_buff. We now let ARP fill in the other fields.
+ */
+
+static int fddi_rebuild_header(struct sk_buff *skb)
+{
+        struct fddihdr *fddi = (struct fddihdr *)skb->data;
+
+#ifdef CONFIG_INET
+        if (fddi->hdr.llc_snap.ethertype == __constant_htons(ETH_P_IP))
+                /* Try to get ARP to resolve the header and fill destination address */
+                return arp_find(fddi->daddr, skb);
+        else
+#endif
+        {
+                printk("%s: Don't know how to resolve type %02X addresses.\n",
+                       skb->dev->name, htons(fddi->hdr.llc_snap.ethertype));
+                return(0);
+        }
+}
+
+
+/*
+ * Determine the packet's protocol ID and fill in skb fields.
+ * This routine is called before an incoming packet is passed
+ * up. It's used to fill in specific skb fields and to set
+ * the proper pointer to the start of packet data (skb->data).
+ */
+
+unsigned short fddi_type_trans(struct sk_buff *skb, struct net_device *dev)
+{
+        struct fddihdr *fddi = (struct fddihdr *)skb->data;
+        unsigned short type;
+
+        /*
+         * Set mac.raw field to point to FC byte, set data field to point
+         * to start of packet data. Assume 802.2 SNAP frames for now.
+ */ + + skb->mac.raw = skb->data; /* point to frame control (FC) */ + + if(fddi->hdr.llc_8022_1.dsap==0xe0) + { + skb_pull(skb, FDDI_K_8022_HLEN-3); + type = __constant_htons(ETH_P_802_2); + } + else + { + skb_pull(skb, FDDI_K_SNAP_HLEN); /* adjust for 21 byte header */ + type=fddi->hdr.llc_snap.ethertype; + } + + /* Set packet type based on destination address and flag settings */ + + if (*fddi->daddr & 0x01) + { + if (memcmp(fddi->daddr, dev->broadcast, FDDI_K_ALEN) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } + + else if (dev->flags & IFF_PROMISC) + { + if (memcmp(fddi->daddr, dev->dev_addr, FDDI_K_ALEN)) + skb->pkt_type = PACKET_OTHERHOST; + } + + /* Assume 802.2 SNAP frames, for now */ + + return(type); +} + +EXPORT_SYMBOL(fddi_type_trans); + +static int fddi_change_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN)) + return(-EINVAL); + dev->mtu = new_mtu; + return(0); +} + +static void fddi_setup(struct net_device *dev) +{ + dev->change_mtu = fddi_change_mtu; + dev->hard_header = fddi_header; + dev->rebuild_header = fddi_rebuild_header; + + dev->type = ARPHRD_FDDI; + dev->hard_header_len = FDDI_K_SNAP_HLEN+3; /* Assume 802.2 SNAP hdr len + 3 pad bytes */ + dev->mtu = FDDI_K_SNAP_DLEN; /* Assume max payload of 802.2 SNAP frame */ + dev->addr_len = FDDI_K_ALEN; + dev->tx_queue_len = 100; /* Long queues on FDDI */ + dev->flags = IFF_BROADCAST | IFF_MULTICAST; + + memset(dev->broadcast, 0xFF, FDDI_K_ALEN); +} + +/** + * alloc_fddidev - Register FDDI device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this FDDI device + * + * Fill in the fields of the device structure with FDDI-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ +struct net_device *alloc_fddidev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "fddi%d", fddi_setup); +} +EXPORT_SYMBOL(alloc_fddidev); diff --git a/net/802/hippi.c b/net/802/hippi.c new file mode 100644 index 000000000000..4eb135c0afbb --- /dev/null +++ b/net/802/hippi.c @@ -0,0 +1,234 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * HIPPI-type device handling. + * + * Version: @(#)hippi.c 1.0.0 05/29/97 + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Florian La Roche, + * Alan Cox, + * Jes Sorensen, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Create the HIPPI MAC header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + */ + +static int hippi_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, void *daddr, void *saddr, + unsigned len) +{ + struct hippi_hdr *hip = (struct hippi_hdr *)skb_push(skb, HIPPI_HLEN); + + if (!len){ + len = skb->len - HIPPI_HLEN; + printk("hippi_header(): length not supplied\n"); + } + + /* + * Due to the stupidity of the little endian byte-order we + * have to set the fp field this way. + */ + hip->fp.fixed = __constant_htonl(0x04800018); + hip->fp.d2_size = htonl(len + 8); + hip->le.fc = 0; + hip->le.double_wide = 0; /* only HIPPI 800 for the time being */ + hip->le.message_type = 0; /* Data PDU */ + + hip->le.dest_addr_type = 2; /* 12 bit SC address */ + hip->le.src_addr_type = 2; /* 12 bit SC address */ + + memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3); + memset(&hip->le.reserved, 0, 16); + + hip->snap.dsap = HIPPI_EXTENDED_SAP; + hip->snap.ssap = HIPPI_EXTENDED_SAP; + hip->snap.ctrl = HIPPI_UI_CMD; + hip->snap.oui[0] = 0x00; + hip->snap.oui[1] = 0x00; + hip->snap.oui[2] = 0x00; + hip->snap.ethertype = htons(type); + + if (daddr) + { + memcpy(hip->le.dest_switch_addr, daddr + 3, 3); + memcpy(&skb->private.ifield, daddr + 2, 4); + return HIPPI_HLEN; + } + return -((int)HIPPI_HLEN); +} + + +/* + * Rebuild the HIPPI MAC header. This is called after an ARP has + * completed on this sk_buff. We now let ARP fill in the other fields. + */ + +static int hippi_rebuild_header(struct sk_buff *skb) +{ + struct hippi_hdr *hip = (struct hippi_hdr *)skb->data; + + /* + * Only IP is currently supported + */ + + if(hip->snap.ethertype != __constant_htons(ETH_P_IP)) + { + printk(KERN_DEBUG "%s: unable to resolve type %X addresses.\n",skb->dev->name,ntohs(hip->snap.ethertype)); + return 0; + } + + /* + * We don't support dynamic ARP on HIPPI, but we use the ARP + * static ARP tables to hold the I-FIELDs. + */ + return arp_find(hip->le.daddr, skb); +} + + +/* + * Determine the packet's protocol ID. + */ + +unsigned short hippi_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + struct hippi_hdr *hip; + + hip = (struct hippi_hdr *) skb->data; + + /* + * This is actually wrong ... question is if we really should + * set the raw address here. + */ + skb->mac.raw = skb->data; + skb_pull(skb, HIPPI_HLEN); + + /* + * No fancy promisc stuff here now. + */ + + return hip->snap.ethertype; +} + +EXPORT_SYMBOL(hippi_type_trans); + +static int hippi_change_mtu(struct net_device *dev, int new_mtu) +{ + /* + * HIPPI's got these nice large MTUs. + */ + if ((new_mtu < 68) || (new_mtu > 65280)) + return -EINVAL; + dev->mtu = new_mtu; + return(0); +} + +/* + * For HIPPI we will actually use the lower 4 bytes of the hardware + * address as the I-FIELD rather than the actual hardware address. 
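/*
 * Editor's illustrative sketch (not part of the original patch): the fixed
 * part of the HIPPI framing header as hippi_header() fills it in above -- a
 * constant first word (0x04800018) and a D2 area size of payload length plus
 * 8, both stored big-endian.  The struct below is a simplified stand-in, not
 * the kernel's struct hippi_hdr.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

struct hippi_fp {                     /* simplified: fixed word + D2 size only */
	uint32_t fixed;
	uint32_t d2_size;
};

static void hippi_fp_fill(struct hippi_fp *fp, uint32_t payload_len)
{
	fp->fixed   = htonl(0x04800018);
	fp->d2_size = htonl(payload_len + 8);
}

int main(void)
{
	struct hippi_fp fp;

	hippi_fp_fill(&fp, 1500);
	printf("fixed 0x%08x, d2_size %u\n", ntohl(fp.fixed), ntohl(fp.d2_size));
	return 0;
}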
+ */ +static int hippi_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr = p; + if (netif_running(dev)) + return -EBUSY; + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + return 0; +} + +static int hippi_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p) +{ + /* Never send broadcast/multicast ARP messages */ + p->mcast_probes = 0; + + /* In IPv6 unicast probes are valid even on NBMA, + * because they are encapsulated in normal IPv6 protocol. + * Should be a generic flag. + */ + if (p->tbl->family != AF_INET6) + p->ucast_probes = 0; + return 0; +} + +static void hippi_setup(struct net_device *dev) +{ + dev->set_multicast_list = NULL; + dev->change_mtu = hippi_change_mtu; + dev->hard_header = hippi_header; + dev->rebuild_header = hippi_rebuild_header; + dev->set_mac_address = hippi_mac_addr; + dev->hard_header_parse = NULL; + dev->hard_header_cache = NULL; + dev->header_cache_update = NULL; + dev->neigh_setup = hippi_neigh_setup_dev; + + /* + * We don't support HIPPI `ARP' for the time being, and probably + * never will unless someone else implements it. However we + * still need a fake ARPHRD to make ifconfig and friends play ball. + */ + dev->type = ARPHRD_HIPPI; + dev->hard_header_len = HIPPI_HLEN; + dev->mtu = 65280; + dev->addr_len = HIPPI_ALEN; + dev->tx_queue_len = 25 /* 5 */; + memset(dev->broadcast, 0xFF, HIPPI_ALEN); + + + /* + * HIPPI doesn't support broadcast+multicast and we only use + * static ARP tables. ARP is disabled by hippi_neigh_setup_dev. + */ + dev->flags = 0; +} + +/** + * alloc_hippi_dev - Register HIPPI device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this HIPPI device + * + * Fill in the fields of the device structure with HIPPI-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ + +struct net_device *alloc_hippi_dev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "hip%d", hippi_setup); +} + +EXPORT_SYMBOL(alloc_hippi_dev); diff --git a/net/802/p8022.c b/net/802/p8022.c new file mode 100644 index 000000000000..5ae63416df6d --- /dev/null +++ b/net/802/p8022.c @@ -0,0 +1,65 @@ +/* + * NET3: Support for 802.2 demultiplexing off Ethernet (Token ring + * is kept separate see p8022tr.c) + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Demultiplex 802.2 encoded protocols. We match the entry by the + * SSAP/DSAP pair and then deliver to the registered datalink that + * matches. The control byte is ignored and handling of such items + * is up to the routine passed the frame. + * + * Unlike the 802.3 datalink we have a list of 802.2 entries as + * there are multiple protocols to demux. The list is currently + * short (3 or 4 entries at most). The current demux assumes this. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb, + unsigned char *dest) +{ + llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap); + return 0; +} + +struct datalink_proto *register_8022_client(unsigned char type, + int (*func)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt)) +{ + struct datalink_proto *proto; + + proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + if (proto) { + proto->type[0] = type; + proto->header_length = 3; + proto->request = p8022_request; + proto->sap = llc_sap_open(type, func); + if (!proto->sap) { + kfree(proto); + proto = NULL; + } + } + return proto; +} + +void unregister_8022_client(struct datalink_proto *proto) +{ + llc_sap_close(proto->sap); + kfree(proto); +} + +EXPORT_SYMBOL(register_8022_client); +EXPORT_SYMBOL(unregister_8022_client); + +MODULE_LICENSE("GPL"); diff --git a/net/802/p8023.c b/net/802/p8023.c new file mode 100644 index 000000000000..a0b61b40225f --- /dev/null +++ b/net/802/p8023.c @@ -0,0 +1,61 @@ +/* + * NET3: 802.3 data link hooks used for IPX 802.3 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 802.3 isn't really a protocol data link layer. Some old IPX stuff + * uses it however. Note that there is only one 802.3 protocol layer + * in the system. We don't currently support different protocols + * running raw 802.3 on different devices. Thankfully nobody else + * has done anything like the old IPX. + */ + +#include +#include +#include +#include +#include + +#include + +/* + * Place an 802.3 header on a packet. The driver will do the mac + * addresses, we just need to give it the buffer length. + */ +static int p8023_request(struct datalink_proto *dl, + struct sk_buff *skb, unsigned char *dest_node) +{ + struct net_device *dev = skb->dev; + + dev->hard_header(skb, dev, ETH_P_802_3, dest_node, NULL, skb->len); + return dev_queue_xmit(skb); +} + +/* + * Create an 802.3 client. Note there can be only one 802.3 client + */ +struct datalink_proto *make_8023_client(void) +{ + struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + + if (proto) { + proto->header_length = 0; + proto->request = p8023_request; + } + return proto; +} + +/* + * Destroy the 802.3 client. + */ +void destroy_8023_client(struct datalink_proto *dl) +{ + if (dl) + kfree(dl); +} + +EXPORT_SYMBOL(destroy_8023_client); +EXPORT_SYMBOL(make_8023_client); diff --git a/net/802/psnap.c b/net/802/psnap.c new file mode 100644 index 000000000000..1053821ddf93 --- /dev/null +++ b/net/802/psnap.c @@ -0,0 +1,159 @@ +/* + * SNAP data link layer. Derived from 802.2 + * + * Alan Cox , + * from the 802.2 layer by Greg Page. + * Merged in additions from Greg Page's psnap.c. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(snap_list); +static DEFINE_SPINLOCK(snap_lock); +static struct llc_sap *snap_sap; + +/* + * Find a snap client by matching the 5 bytes. 
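/*
 * Editor's illustrative sketch (not part of the original patch): what the
 * "match the 5 bytes" in find_snap_client() amounts to.  A SNAP client is
 * identified by a 5-byte descriptor -- a 3-byte OUI plus a 2-byte protocol
 * id -- and lookup is a linear memcmp() over the registered clients.  The
 * fixed array below stands in for the kernel's RCU-protected snap_list, and
 * the descriptor values are made up purely for the demonstration.
 */
#include <stdio.h>
#include <string.h>

struct snap_client {
	unsigned char type[5];        /* OUI (3) + protocol id (2), big endian */
	const char   *name;
};

static const struct snap_client clients[] = {
	{ { 0x00, 0x00, 0x00, 0x12, 0x34 }, "example protocol A" },
	{ { 0x00, 0x00, 0x00, 0x56, 0x78 }, "example protocol B" },
};

static const struct snap_client *find_snap_client(const unsigned char *desc)
{
	for (size_t i = 0; i < sizeof(clients) / sizeof(clients[0]); i++)
		if (!memcmp(clients[i].type, desc, 5))
			return &clients[i];
	return NULL;                  /* unknown protocol: the frame is dropped */
}

int main(void)
{
	unsigned char desc[5] = { 0x00, 0x00, 0x00, 0x56, 0x78 };
	const struct snap_client *c = find_snap_client(desc);

	printf("%s\n", c ? c->name : "no client registered");
	return 0;
}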
+ */ +static struct datalink_proto *find_snap_client(unsigned char *desc) +{ + struct list_head *entry; + struct datalink_proto *proto = NULL, *p; + + list_for_each_rcu(entry, &snap_list) { + p = list_entry(entry, struct datalink_proto, node); + if (!memcmp(p->type, desc, 5)) { + proto = p; + break; + } + } + return proto; +} + +/* + * A SNAP packet has arrived + */ +static int snap_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + int rc = 1; + struct datalink_proto *proto; + static struct packet_type snap_packet_type = { + .type = __constant_htons(ETH_P_SNAP), + }; + + rcu_read_lock(); + proto = find_snap_client(skb->h.raw); + if (proto) { + /* Pass the frame on. */ + skb->h.raw += 5; + skb_pull(skb, 5); + rc = proto->rcvfunc(skb, dev, &snap_packet_type); + } else { + skb->sk = NULL; + kfree_skb(skb); + rc = 1; + } + + rcu_read_unlock(); + return rc; +} + +/* + * Put a SNAP header on a frame and pass to 802.2 + */ +static int snap_request(struct datalink_proto *dl, + struct sk_buff *skb, u8 *dest) +{ + memcpy(skb_push(skb, 5), dl->type, 5); + llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap); + return 0; +} + +/* + * Set up the SNAP layer + */ +EXPORT_SYMBOL(register_snap_client); +EXPORT_SYMBOL(unregister_snap_client); + +static char snap_err_msg[] __initdata = + KERN_CRIT "SNAP - unable to register with 802.2\n"; + +static int __init snap_init(void) +{ + snap_sap = llc_sap_open(0xAA, snap_rcv); + + if (!snap_sap) + printk(snap_err_msg); + + return 0; +} + +module_init(snap_init); + +static void __exit snap_exit(void) +{ + llc_sap_close(snap_sap); +} + +module_exit(snap_exit); + + +/* + * Register SNAP clients. We don't yet use this for IP. + */ +struct datalink_proto *register_snap_client(unsigned char *desc, + int (*rcvfunc)(struct sk_buff *, + struct net_device *, + struct packet_type *)) +{ + struct datalink_proto *proto = NULL; + + spin_lock_bh(&snap_lock); + + if (find_snap_client(desc)) + goto out; + + proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + if (proto) { + memcpy(proto->type, desc,5); + proto->rcvfunc = rcvfunc; + proto->header_length = 5 + 3; /* snap + 802.2 */ + proto->request = snap_request; + list_add_rcu(&proto->node, &snap_list); + } +out: + spin_unlock_bh(&snap_lock); + + synchronize_net(); + return proto; +} + +/* + * Unregister SNAP clients. Protocols no longer want to play with us ... + */ +void unregister_snap_client(struct datalink_proto *proto) +{ + spin_lock_bh(&snap_lock); + list_del_rcu(&proto->node); + spin_unlock_bh(&snap_lock); + + synchronize_net(); + + kfree(proto); +} + +MODULE_LICENSE("GPL"); diff --git a/net/802/sysctl_net_802.c b/net/802/sysctl_net_802.c new file mode 100644 index 000000000000..36079630c49f --- /dev/null +++ b/net/802/sysctl_net_802.c @@ -0,0 +1,33 @@ +/* -*- linux-c -*- + * sysctl_net_802.c: sysctl interface to net 802 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/802 directory entry (empty =) ). [MS] + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include + +#ifdef CONFIG_TR +extern int sysctl_tr_rif_timeout; +#endif + +struct ctl_table tr_table[] = { +#ifdef CONFIG_TR + { + .ctl_name = NET_TR_RIF_TIMEOUT, + .procname = "rif_timeout", + .data = &sysctl_tr_rif_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif /* CONFIG_TR */ + { 0 }, +}; diff --git a/net/802/tr.c b/net/802/tr.c new file mode 100644 index 000000000000..85293ccf7efc --- /dev/null +++ b/net/802/tr.c @@ -0,0 +1,645 @@ +/* + * NET3: Token ring device handling subroutines + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: 3 Feb 97 Paul Norton Minor routing fixes. + * Added rif table to /proc/net/tr_rif and rif timeout to + * /proc/sys/net/token-ring/rif_timeout. + * 22 Jun 98 Paul Norton Rearranged + * tr_header and tr_type_trans to handle passing IPX SNAP and + * 802.2 through the correct layers. Eliminated tr_reformat. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev); +static void rif_check_expire(unsigned long dummy); + +#define TR_SR_DEBUG 0 + +/* + * Each RIF entry we learn is kept this way + */ + +struct rif_cache_s { + unsigned char addr[TR_ALEN]; + int iface; + __u16 rcf; + __u16 rseg[8]; + struct rif_cache_s *next; + unsigned long last_used; + unsigned char local_ring; +}; + +#define RIF_TABLE_SIZE 32 + +/* + * We hash the RIF cache 32 ways. We do after all have to look it + * up a lot. + */ + +static struct rif_cache_s *rif_table[RIF_TABLE_SIZE]; + +static DEFINE_SPINLOCK(rif_lock); + + +/* + * Garbage disposal timer. + */ + +static struct timer_list rif_timer; + +int sysctl_tr_rif_timeout = 60*10*HZ; + +static inline unsigned long rif_hash(const unsigned char *addr) +{ + unsigned long x; + + x = addr[0]; + x = (x << 2) ^ addr[1]; + x = (x << 2) ^ addr[2]; + x = (x << 2) ^ addr[3]; + x = (x << 2) ^ addr[4]; + x = (x << 2) ^ addr[5]; + + x ^= x >> 8; + + return x & (RIF_TABLE_SIZE - 1); +} + +/* + * Put the headers on a token ring packet. Token ring source routing + * makes this a little more exciting than on ethernet. + */ + +static int tr_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct trh_hdr *trh; + int hdr_len; + + /* + * Add the 802.2 SNAP header if IP as the IPv4/IPv6 code calls + * dev->hard_header directly. 
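/*
 * Editor's illustrative sketch (not part of the original patch): the RIF
 * cache hash defined above, reproduced as a stand-alone function so the
 * bucket distribution can be checked from user space.  It folds the six MAC
 * bytes together with shifts and XORs, mixes the high byte back in, and
 * masks the result into the 32-entry table.
 */
#include <stdio.h>

#define RIF_TABLE_SIZE 32

static unsigned long rif_hash(const unsigned char *addr)
{
	unsigned long x = addr[0];

	x = (x << 2) ^ addr[1];
	x = (x << 2) ^ addr[2];
	x = (x << 2) ^ addr[3];
	x = (x << 2) ^ addr[4];
	x = (x << 2) ^ addr[5];
	x ^= x >> 8;

	return x & (RIF_TABLE_SIZE - 1);
}

int main(void)
{
	unsigned char mac[6] = { 0x40, 0x00, 0x52, 0x00, 0x10, 0x01 };

	printf("bucket %lu of %d\n", rif_hash(mac), RIF_TABLE_SIZE);
	return 0;
}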
+ */ + if (type == ETH_P_IP || type == ETH_P_IPV6 || type == ETH_P_ARP) + { + struct trllc *trllc; + + hdr_len = sizeof(struct trh_hdr) + sizeof(struct trllc); + trh = (struct trh_hdr *)skb_push(skb, hdr_len); + trllc = (struct trllc *)(trh+1); + trllc->dsap = trllc->ssap = EXTENDED_SAP; + trllc->llc = UI_CMD; + trllc->protid[0] = trllc->protid[1] = trllc->protid[2] = 0x00; + trllc->ethertype = htons(type); + } + else + { + hdr_len = sizeof(struct trh_hdr); + trh = (struct trh_hdr *)skb_push(skb, hdr_len); + } + + trh->ac=AC; + trh->fc=LLC_FRAME; + + if(saddr) + memcpy(trh->saddr,saddr,dev->addr_len); + else + memcpy(trh->saddr,dev->dev_addr,dev->addr_len); + + /* + * Build the destination and then source route the frame + */ + + if(daddr) + { + memcpy(trh->daddr,daddr,dev->addr_len); + tr_source_route(skb,trh,dev); + return(hdr_len); + } + + return -hdr_len; +} + +/* + * A neighbour discovery of some species (eg arp) has completed. We + * can now send the packet. + */ + +static int tr_rebuild_header(struct sk_buff *skb) +{ + struct trh_hdr *trh=(struct trh_hdr *)skb->data; + struct trllc *trllc=(struct trllc *)(skb->data+sizeof(struct trh_hdr)); + struct net_device *dev = skb->dev; + + /* + * FIXME: We don't yet support IPv6 over token rings + */ + + if(trllc->ethertype != htons(ETH_P_IP)) { + printk("tr_rebuild_header: Don't know how to resolve type %04X addresses ?\n",(unsigned int)htons(trllc->ethertype)); + return 0; + } + +#ifdef CONFIG_INET + if(arp_find(trh->daddr, skb)) { + return 1; + } + else +#endif + { + tr_source_route(skb,trh,dev); + return 0; + } +} + +/* + * Some of this is a bit hackish. We intercept RIF information + * used for source routing. We also grab IP directly and don't feed + * it via SNAP. + */ + +unsigned short tr_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + + struct trh_hdr *trh=(struct trh_hdr *)skb->data; + struct trllc *trllc; + unsigned riflen=0; + + skb->mac.raw = skb->data; + + if(trh->saddr[0] & TR_RII) + riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8; + + trllc = (struct trllc *)(skb->data+sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen); + + skb_pull(skb,sizeof(struct trh_hdr)-TR_MAXRIFLEN+riflen); + + if(*trh->daddr & 0x80) + { + if(!memcmp(trh->daddr,dev->broadcast,TR_ALEN)) + skb->pkt_type=PACKET_BROADCAST; + else + skb->pkt_type=PACKET_MULTICAST; + } + else if ( (trh->daddr[0] & 0x01) && (trh->daddr[1] & 0x00) && (trh->daddr[2] & 0x5E)) + { + skb->pkt_type=PACKET_MULTICAST; + } + else if(dev->flags & IFF_PROMISC) + { + if(memcmp(trh->daddr, dev->dev_addr, TR_ALEN)) + skb->pkt_type=PACKET_OTHERHOST; + } + + if ((skb->pkt_type != PACKET_BROADCAST) && + (skb->pkt_type != PACKET_MULTICAST)) + tr_add_rif_info(trh,dev) ; + + /* + * Strip the SNAP header from ARP packets since we don't + * pass them through to the 802.2/SNAP layers. + */ + + if (trllc->dsap == EXTENDED_SAP && + (trllc->ethertype == ntohs(ETH_P_IP) || + trllc->ethertype == ntohs(ETH_P_IPV6) || + trllc->ethertype == ntohs(ETH_P_ARP))) + { + skb_pull(skb, sizeof(struct trllc)); + return trllc->ethertype; + } + + return ntohs(ETH_P_802_2); +} + +/* + * We try to do source routing... 
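/*
 * Editor's illustrative sketch (not part of the original patch): how the
 * routing-information length is read out of the RIF routing-control field
 * and turned into the "slack" that tr_source_route() trims off the maximal
 * header before handing the frame to the driver.  TR_RCF_LEN_MASK is assumed
 * here to be 0x1f00 (the 5-bit length field in the first RC byte); the real
 * definition lives in the kernel headers, not in this patch.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define TR_RCF_LEN_MASK  0x1f00     /* assumption: 5-bit RIF length, in bytes */
#define TR_MAX_HDR_SLACK 18         /* rcf (2) + 8 route segments (16) */

static int rif_slack(uint16_t rcf_net, int rii_present)
{
	if (!rii_present)           /* no routing information present at all */
		return TR_MAX_HDR_SLACK;
	return TR_MAX_HDR_SLACK -
	       ((ntohs(rcf_net) & TR_RCF_LEN_MASK) >> 8);
}

int main(void)
{
	/* A 6-byte RIF (RC field plus two route descriptors) leaves 12 bytes
	 * of slack to squeeze out of the maximal token ring header. */
	uint16_t rcf = htons(6 << 8);

	printf("slack = %d bytes\n", rif_slack(rcf, 1));
	return 0;
}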
+ */ + +void tr_source_route(struct sk_buff *skb,struct trh_hdr *trh,struct net_device *dev) +{ + int slack; + unsigned int hash; + struct rif_cache_s *entry; + unsigned char *olddata; + static const unsigned char mcast_func_addr[] + = {0xC0,0x00,0x00,0x04,0x00,0x00}; + + spin_lock_bh(&rif_lock); + + /* + * Broadcasts are single route as stated in RFC 1042 + */ + if( (!memcmp(&(trh->daddr[0]),&(dev->broadcast[0]),TR_ALEN)) || + (!memcmp(&(trh->daddr[0]),&(mcast_func_addr[0]), TR_ALEN)) ) + { + trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK) + | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST); + trh->saddr[0]|=TR_RII; + } + else + { + hash = rif_hash(trh->daddr); + /* + * Walk the hash table and look for an entry + */ + for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->daddr[0]),TR_ALEN);entry=entry->next); + + /* + * If we found an entry we can route the frame. + */ + if(entry) + { +#if TR_SR_DEBUG +printk("source routing for %02X:%02X:%02X:%02X:%02X:%02X\n",trh->daddr[0], + trh->daddr[1],trh->daddr[2],trh->daddr[3],trh->daddr[4],trh->daddr[5]); +#endif + if(!entry->local_ring && (ntohs(entry->rcf) & TR_RCF_LEN_MASK) >> 8) + { + trh->rcf=entry->rcf; + memcpy(&trh->rseg[0],&entry->rseg[0],8*sizeof(unsigned short)); + trh->rcf^=htons(TR_RCF_DIR_BIT); + trh->rcf&=htons(0x1fff); /* Issam Chehab */ + + trh->saddr[0]|=TR_RII; +#if TR_SR_DEBUG + printk("entry found with rcf %04x\n", entry->rcf); + } + else + { + printk("entry found but without rcf length, local=%02x\n", entry->local_ring); +#endif + } + entry->last_used=jiffies; + } + else + { + /* + * Without the information we simply have to shout + * on the wire. The replies should rapidly clean this + * situation up. + */ + trh->rcf=htons((((sizeof(trh->rcf)) << 8) & TR_RCF_LEN_MASK) + | TR_RCF_FRAME2K | TR_RCF_LIMITED_BROADCAST); + trh->saddr[0]|=TR_RII; +#if TR_SR_DEBUG + printk("no entry in rif table found - broadcasting frame\n"); +#endif + } + } + + /* Compress the RIF here so we don't have to do it in the driver(s) */ + if (!(trh->saddr[0] & 0x80)) + slack = 18; + else + slack = 18 - ((ntohs(trh->rcf) & TR_RCF_LEN_MASK)>>8); + olddata = skb->data; + spin_unlock_bh(&rif_lock); + + skb_pull(skb, slack); + memmove(skb->data, olddata, sizeof(struct trh_hdr) - slack); +} + +/* + * We have learned some new RIF information for our source + * routing. + */ + +static void tr_add_rif_info(struct trh_hdr *trh, struct net_device *dev) +{ + unsigned int hash, rii_p = 0; + struct rif_cache_s *entry; + + + spin_lock_bh(&rif_lock); + + /* + * Firstly see if the entry exists + */ + + if(trh->saddr[0] & TR_RII) + { + trh->saddr[0]&=0x7f; + if (((ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8) > 2) + { + rii_p = 1; + } + } + + hash = rif_hash(trh->saddr); + for(entry=rif_table[hash];entry && memcmp(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN);entry=entry->next); + + if(entry==NULL) + { +#if TR_SR_DEBUG +printk("adding rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n", + trh->saddr[0],trh->saddr[1],trh->saddr[2], + trh->saddr[3],trh->saddr[4],trh->saddr[5], + ntohs(trh->rcf)); +#endif + /* + * Allocate our new entry. A failure to allocate loses + * use the information. This is harmless. + * + * FIXME: We ought to keep some kind of cache size + * limiting and adjust the timers to suit. 
+ */ + entry=kmalloc(sizeof(struct rif_cache_s),GFP_ATOMIC); + + if(!entry) + { + printk(KERN_DEBUG "tr.c: Couldn't malloc rif cache entry !\n"); + spin_unlock_bh(&rif_lock); + return; + } + + memcpy(&(entry->addr[0]),&(trh->saddr[0]),TR_ALEN); + entry->iface = dev->ifindex; + entry->next=rif_table[hash]; + entry->last_used=jiffies; + rif_table[hash]=entry; + + if (rii_p) + { + entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK); + memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short)); + entry->local_ring = 0; + trh->saddr[0]|=TR_RII; /* put the routing indicator back for tcpdump */ + } + else + { + entry->local_ring = 1; + } + } + else /* Y. Tahara added */ + { + /* + * Update existing entries + */ + if (!entry->local_ring) + if (entry->rcf != (trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK)) && + !(trh->rcf & htons(TR_RCF_BROADCAST_MASK))) + { +#if TR_SR_DEBUG +printk("updating rif_entry: addr:%02X:%02X:%02X:%02X:%02X:%02X rcf:%04X\n", + trh->saddr[0],trh->saddr[1],trh->saddr[2], + trh->saddr[3],trh->saddr[4],trh->saddr[5], + ntohs(trh->rcf)); +#endif + entry->rcf = trh->rcf & htons((unsigned short)~TR_RCF_BROADCAST_MASK); + memcpy(&(entry->rseg[0]),&(trh->rseg[0]),8*sizeof(unsigned short)); + } + entry->last_used=jiffies; + } + spin_unlock_bh(&rif_lock); +} + +/* + * Scan the cache with a timer and see what we need to throw out. + */ + +static void rif_check_expire(unsigned long dummy) +{ + int i; + unsigned long next_interval = jiffies + sysctl_tr_rif_timeout/2; + + spin_lock_bh(&rif_lock); + + for(i =0; i < RIF_TABLE_SIZE; i++) { + struct rif_cache_s *entry, **pentry; + + pentry = rif_table+i; + while((entry=*pentry) != NULL) { + unsigned long expires + = entry->last_used + sysctl_tr_rif_timeout; + + if (time_before_eq(expires, jiffies)) { + *pentry = entry->next; + kfree(entry); + } else { + pentry = &entry->next; + + if (time_before(expires, next_interval)) + next_interval = expires; + } + } + } + + spin_unlock_bh(&rif_lock); + + mod_timer(&rif_timer, next_interval); + +} + +/* + * Generate the /proc/net information for the token ring RIF + * routing. + */ + +#ifdef CONFIG_PROC_FS + +static struct rif_cache_s *rif_get_idx(loff_t pos) +{ + int i; + struct rif_cache_s *entry; + loff_t off = 0; + + for(i = 0; i < RIF_TABLE_SIZE; i++) + for(entry = rif_table[i]; entry; entry = entry->next) { + if (off == pos) + return entry; + ++off; + } + + return NULL; +} + +static void *rif_seq_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_bh(&rif_lock); + + return *pos ? 
rif_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *rif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int i; + struct rif_cache_s *ent = v; + + ++*pos; + + if (v == SEQ_START_TOKEN) { + i = -1; + goto scan; + } + + if (ent->next) + return ent->next; + + i = rif_hash(ent->addr); + scan: + while (++i < RIF_TABLE_SIZE) { + if ((ent = rif_table[i]) != NULL) + return ent; + } + return NULL; +} + +static void rif_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&rif_lock); +} + +static int rif_seq_show(struct seq_file *seq, void *v) +{ + int j, rcf_len, segment, brdgnmb; + struct rif_cache_s *entry = v; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "if TR address TTL rcf routing segments\n"); + else { + struct net_device *dev = dev_get_by_index(entry->iface); + long ttl = (long) (entry->last_used + sysctl_tr_rif_timeout) + - (long) jiffies; + + seq_printf(seq, "%s %02X:%02X:%02X:%02X:%02X:%02X %7li ", + dev?dev->name:"?", + entry->addr[0],entry->addr[1],entry->addr[2], + entry->addr[3],entry->addr[4],entry->addr[5], + ttl/HZ); + + if (entry->local_ring) + seq_puts(seq, "local\n"); + else { + + seq_printf(seq, "%04X", ntohs(entry->rcf)); + rcf_len = ((ntohs(entry->rcf) & TR_RCF_LEN_MASK)>>8)-2; + if (rcf_len) + rcf_len >>= 1; + for(j = 1; j < rcf_len; j++) { + if(j==1) { + segment=ntohs(entry->rseg[j-1])>>4; + seq_printf(seq," %03X",segment); + }; + segment=ntohs(entry->rseg[j])>>4; + brdgnmb=ntohs(entry->rseg[j-1])&0x00f; + seq_printf(seq,"-%01X-%03X",brdgnmb,segment); + } + seq_putc(seq, '\n'); + } + } + return 0; +} + + +static struct seq_operations rif_seq_ops = { + .start = rif_seq_start, + .next = rif_seq_next, + .stop = rif_seq_stop, + .show = rif_seq_show, +}; + +static int rif_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rif_seq_ops); +} + +static struct file_operations rif_seq_fops = { + .owner = THIS_MODULE, + .open = rif_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +static void tr_setup(struct net_device *dev) +{ + /* + * Configure and register + */ + + dev->hard_header = tr_header; + dev->rebuild_header = tr_rebuild_header; + + dev->type = ARPHRD_IEEE802_TR; + dev->hard_header_len = TR_HLEN; + dev->mtu = 2000; + dev->addr_len = TR_ALEN; + dev->tx_queue_len = 100; /* Long queues on tr */ + + memset(dev->broadcast,0xFF, TR_ALEN); + + /* New-style flags. */ + dev->flags = IFF_BROADCAST | IFF_MULTICAST ; +} + +/** + * alloc_trdev - Register token ring device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this token ring device + * + * Fill in the fields of the device structure with token ring-generic values. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ +struct net_device *alloc_trdev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "tr%d", tr_setup); +} + +/* + * Called during bootup. We don't actually have to initialise + * too much for this. 
+ */ + +static int __init rif_init(void) +{ + init_timer(&rif_timer); + rif_timer.expires = sysctl_tr_rif_timeout; + rif_timer.data = 0L; + rif_timer.function = rif_check_expire; + add_timer(&rif_timer); + + proc_net_fops_create("tr_rif", S_IRUGO, &rif_seq_fops); + return 0; +} + +module_init(rif_init); + +EXPORT_SYMBOL(tr_source_route); +EXPORT_SYMBOL(tr_type_trans); +EXPORT_SYMBOL(alloc_trdev); diff --git a/net/8021q/Makefile b/net/8021q/Makefile new file mode 100644 index 000000000000..97feb44dbdce --- /dev/null +++ b/net/8021q/Makefile @@ -0,0 +1,12 @@ +# +# Makefile for the Linux VLAN layer. +# + +obj-$(CONFIG_VLAN_8021Q) += 8021q.o + +8021q-objs := vlan.o vlan_dev.o + +ifeq ($(CONFIG_PROC_FS),y) +8021q-objs += vlanproc.o +endif + diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c new file mode 100644 index 000000000000..1f6d31670bc7 --- /dev/null +++ b/net/8021q/vlan.c @@ -0,0 +1,774 @@ +/* + * INET 802.1Q VLAN + * Ethernet-type device handling. + * + * Authors: Ben Greear + * Please send support related email to: vlan@scry.wanfear.com + * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html + * + * Fixes: + * Fix for packet capture - Nick Eggleston ; + * Add HW acceleration hooks - David S. Miller ; + * Correct all the locking - David S. Miller ; + * Use hash table for VLAN groups - David S. Miller + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include /* for copy_from_user */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "vlan.h" +#include "vlanproc.h" + +#define DRV_VERSION "1.8" + +/* Global VLAN variables */ + +/* Our listing of VLAN group(s) */ +static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE]; +#define vlan_grp_hashfn(IDX) ((((IDX) >> VLAN_GRP_HASH_SHIFT) ^ (IDX)) & VLAN_GRP_HASH_MASK) + +static char vlan_fullname[] = "802.1Q VLAN Support"; +static char vlan_version[] = DRV_VERSION; +static char vlan_copyright[] = "Ben Greear "; +static char vlan_buggyright[] = "David S. Miller "; + +static int vlan_device_event(struct notifier_block *, unsigned long, void *); +static int vlan_ioctl_handler(void __user *); +static int unregister_vlan_dev(struct net_device *, unsigned short ); + +static struct notifier_block vlan_notifier_block = { + .notifier_call = vlan_device_event, +}; + +/* These may be changed at run-time through IOCTLs */ + +/* Determines interface naming scheme. */ +unsigned short vlan_name_type = VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD; + +static struct packet_type vlan_packet_type = { + .type = __constant_htons(ETH_P_8021Q), + .func = vlan_skb_recv, /* VLAN receive method */ +}; + +/* Bits of netdev state that are propagated from real device to virtual */ +#define VLAN_LINK_STATE_MASK \ + ((1<<__LINK_STATE_PRESENT)|(1<<__LINK_STATE_NOCARRIER)) + +/* End of global variables definitions. 
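/*
 * Editor's illustrative sketch (not part of the original patch): the VLAN
 * group hash defined above, as a stand-alone function.  A group is keyed by
 * the real device's ifindex; the hash XORs the upper bits onto the lower
 * ones and masks into the 32-bucket table (VLAN_GRP_HASH_SHIFT is 5, per
 * vlan.h later in this patch).
 */
#include <stdio.h>

#define VLAN_GRP_HASH_SHIFT 5
#define VLAN_GRP_HASH_SIZE  (1 << VLAN_GRP_HASH_SHIFT)
#define VLAN_GRP_HASH_MASK  (VLAN_GRP_HASH_SIZE - 1)

static int vlan_grp_hashfn(int ifindex)
{
	return ((ifindex >> VLAN_GRP_HASH_SHIFT) ^ ifindex) & VLAN_GRP_HASH_MASK;
}

int main(void)
{
	for (int ifindex = 1; ifindex <= 4; ifindex++)
		printf("ifindex %d -> bucket %d\n", ifindex, vlan_grp_hashfn(ifindex));
	return 0;
}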
*/ + +/* + * Function vlan_proto_init (pro) + * + * Initialize VLAN protocol layer, + * + */ +static int __init vlan_proto_init(void) +{ + int err; + + printk(VLAN_INF "%s v%s %s\n", + vlan_fullname, vlan_version, vlan_copyright); + printk(VLAN_INF "All bugs added by %s\n", + vlan_buggyright); + + /* proc file system initialization */ + err = vlan_proc_init(); + if (err < 0) { + printk(KERN_ERR + "%s %s: can't create entry in proc filesystem!\n", + __FUNCTION__, VLAN_NAME); + return err; + } + + dev_add_pack(&vlan_packet_type); + + /* Register us to receive netdevice events */ + err = register_netdevice_notifier(&vlan_notifier_block); + if (err < 0) { + dev_remove_pack(&vlan_packet_type); + vlan_proc_cleanup(); + return err; + } + + vlan_ioctl_set(vlan_ioctl_handler); + + return 0; +} + +/* Cleanup all vlan devices + * Note: devices that have been registered that but not + * brought up will exist but have no module ref count. + */ +static void __exit vlan_cleanup_devices(void) +{ + struct net_device *dev, *nxt; + + rtnl_lock(); + for (dev = dev_base; dev; dev = nxt) { + nxt = dev->next; + if (dev->priv_flags & IFF_802_1Q_VLAN) { + unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev, + VLAN_DEV_INFO(dev)->vlan_id); + + unregister_netdevice(dev); + } + } + rtnl_unlock(); +} + +/* + * Module 'remove' entry point. + * o delete /proc/net/router directory and static entries. + */ +static void __exit vlan_cleanup_module(void) +{ + int i; + + vlan_ioctl_set(NULL); + + /* Un-register us from receiving netdevice events */ + unregister_netdevice_notifier(&vlan_notifier_block); + + dev_remove_pack(&vlan_packet_type); + vlan_cleanup_devices(); + + /* This table must be empty if there are no module + * references left. + */ + for (i = 0; i < VLAN_GRP_HASH_SIZE; i++) { + BUG_ON(!hlist_empty(&vlan_group_hash[i])); + } + vlan_proc_cleanup(); + + synchronize_net(); +} + +module_init(vlan_proto_init); +module_exit(vlan_cleanup_module); + +/* Must be invoked with RCU read lock (no preempt) */ +static struct vlan_group *__vlan_find_group(int real_dev_ifindex) +{ + struct vlan_group *grp; + struct hlist_node *n; + int hash = vlan_grp_hashfn(real_dev_ifindex); + + hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) { + if (grp->real_dev_ifindex == real_dev_ifindex) + return grp; + } + + return NULL; +} + +/* Find the protocol handler. Assumes VID < VLAN_VID_MASK. + * + * Must be invoked with RCU read lock (no preempt) + */ +struct net_device *__find_vlan_dev(struct net_device *real_dev, + unsigned short VID) +{ + struct vlan_group *grp = __vlan_find_group(real_dev->ifindex); + + if (grp) + return grp->vlan_devices[VID]; + + return NULL; +} + +static void vlan_rcu_free(struct rcu_head *rcu) +{ + kfree(container_of(rcu, struct vlan_group, rcu)); +} + + +/* This returns 0 if everything went fine. + * It will return 1 if the group was killed as a result. + * A negative return indicates failure. + * + * The RTNL lock must be held. 
+ */ +static int unregister_vlan_dev(struct net_device *real_dev, + unsigned short vlan_id) +{ + struct net_device *dev = NULL; + int real_dev_ifindex = real_dev->ifindex; + struct vlan_group *grp; + int i, ret; + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: VID: %i\n", __FUNCTION__, vlan_id); +#endif + + /* sanity check */ + if (vlan_id >= VLAN_VID_MASK) + return -EINVAL; + + ASSERT_RTNL(); + grp = __vlan_find_group(real_dev_ifindex); + + ret = 0; + + if (grp) { + dev = grp->vlan_devices[vlan_id]; + if (dev) { + /* Remove proc entry */ + vlan_proc_rem_dev(dev); + + /* Take it out of our own structures, but be sure to + * interlock with HW accelerating devices or SW vlan + * input packet processing. + */ + if (real_dev->features & + (NETIF_F_HW_VLAN_RX | NETIF_F_HW_VLAN_FILTER)) { + real_dev->vlan_rx_kill_vid(real_dev, vlan_id); + } + + grp->vlan_devices[vlan_id] = NULL; + synchronize_net(); + + + /* Caller unregisters (and if necessary, puts) + * VLAN device, but we get rid of the reference to + * real_dev here. + */ + dev_put(real_dev); + + /* If the group is now empty, kill off the + * group. + */ + for (i = 0; i < VLAN_VID_MASK; i++) + if (grp->vlan_devices[i]) + break; + + if (i == VLAN_VID_MASK) { + if (real_dev->features & NETIF_F_HW_VLAN_RX) + real_dev->vlan_rx_register(real_dev, NULL); + + hlist_del_rcu(&grp->hlist); + + /* Free the group, after all cpu's are done. */ + call_rcu(&grp->rcu, vlan_rcu_free); + + grp = NULL; + ret = 1; + } + } + } + + return ret; +} + +static int unregister_vlan_device(const char *vlan_IF_name) +{ + struct net_device *dev = NULL; + int ret; + + + dev = dev_get_by_name(vlan_IF_name); + ret = -EINVAL; + if (dev) { + if (dev->priv_flags & IFF_802_1Q_VLAN) { + rtnl_lock(); + + ret = unregister_vlan_dev(VLAN_DEV_INFO(dev)->real_dev, + VLAN_DEV_INFO(dev)->vlan_id); + + dev_put(dev); + unregister_netdevice(dev); + + rtnl_unlock(); + + if (ret == 1) + ret = 0; + } else { + printk(VLAN_ERR + "%s: ERROR: Tried to remove a non-vlan device " + "with VLAN code, name: %s priv_flags: %hX\n", + __FUNCTION__, dev->name, dev->priv_flags); + dev_put(dev); + ret = -EPERM; + } + } else { +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: WARNING: Could not find dev.\n", __FUNCTION__); +#endif + ret = -EINVAL; + } + + return ret; +} + +static void vlan_setup(struct net_device *new_dev) +{ + SET_MODULE_OWNER(new_dev); + + /* new_dev->ifindex = 0; it will be set when added to + * the global list. + * iflink is set as well. + */ + new_dev->get_stats = vlan_dev_get_stats; + + /* Make this thing known as a VLAN device */ + new_dev->priv_flags |= IFF_802_1Q_VLAN; + + /* Set us up to have no queue, as the underlying Hardware device + * can do all the queueing we could want. + */ + new_dev->tx_queue_len = 0; + + /* set up method calls */ + new_dev->change_mtu = vlan_dev_change_mtu; + new_dev->open = vlan_dev_open; + new_dev->stop = vlan_dev_stop; + new_dev->set_mac_address = vlan_dev_set_mac_address; + new_dev->set_multicast_list = vlan_dev_set_multicast_list; + new_dev->destructor = free_netdev; + new_dev->do_ioctl = vlan_dev_ioctl; +} + +/* Attach a VLAN device to a mac address (ie Ethernet Card). + * Returns the device that was created, or NULL if there was + * an error of some kind. 
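/*
 * Editor's illustrative sketch (not part of the original patch): the four
 * interface-naming schemes that register_vlan_device() chooses between,
 * shown as plain snprintf() calls.  IFNAMSIZ is taken to be 16 as in the
 * kernel, and the enum constants below mirror the VLAN_NAME_TYPE_* cases in
 * the switch that follows.
 */
#include <stdio.h>

#define IFNAMSIZ 16

enum { RAW_PLUS_VID, PLUS_VID_NO_PAD, RAW_PLUS_VID_NO_PAD, PLUS_VID };

static void vlan_if_name(char *name, int type, const char *real, int vid)
{
	switch (type) {
	case RAW_PLUS_VID:            /* eth1.0005 */
		snprintf(name, IFNAMSIZ, "%s.%.4i", real, vid);
		break;
	case PLUS_VID_NO_PAD:         /* vlan5 */
		snprintf(name, IFNAMSIZ, "vlan%i", vid);
		break;
	case RAW_PLUS_VID_NO_PAD:     /* eth0.5 (the default name type) */
		snprintf(name, IFNAMSIZ, "%s.%i", real, vid);
		break;
	default:                      /* vlan0005 */
		snprintf(name, IFNAMSIZ, "vlan%.4i", vid);
	}
}

int main(void)
{
	char name[IFNAMSIZ];

	for (int t = RAW_PLUS_VID; t <= PLUS_VID; t++) {
		vlan_if_name(name, t, "eth0", 5);
		printf("%s\n", name);
	}
	return 0;
}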
+ */ +static struct net_device *register_vlan_device(const char *eth_IF_name, + unsigned short VLAN_ID) +{ + struct vlan_group *grp; + struct net_device *new_dev; + struct net_device *real_dev; /* the ethernet device */ + char name[IFNAMSIZ]; + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: if_name -:%s:- vid: %i\n", + __FUNCTION__, eth_IF_name, VLAN_ID); +#endif + + if (VLAN_ID >= VLAN_VID_MASK) + goto out_ret_null; + + /* find the device relating to eth_IF_name. */ + real_dev = dev_get_by_name(eth_IF_name); + if (!real_dev) + goto out_ret_null; + + if (real_dev->features & NETIF_F_VLAN_CHALLENGED) { + printk(VLAN_DBG "%s: VLANs not supported on %s.\n", + __FUNCTION__, real_dev->name); + goto out_put_dev; + } + + if ((real_dev->features & NETIF_F_HW_VLAN_RX) && + (real_dev->vlan_rx_register == NULL || + real_dev->vlan_rx_kill_vid == NULL)) { + printk(VLAN_DBG "%s: Device %s has buggy VLAN hw accel.\n", + __FUNCTION__, real_dev->name); + goto out_put_dev; + } + + if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) && + (real_dev->vlan_rx_add_vid == NULL || + real_dev->vlan_rx_kill_vid == NULL)) { + printk(VLAN_DBG "%s: Device %s has buggy VLAN hw accel.\n", + __FUNCTION__, real_dev->name); + goto out_put_dev; + } + + /* From this point on, all the data structures must remain + * consistent. + */ + rtnl_lock(); + + /* The real device must be up and operating in order to + * assosciate a VLAN device with it. + */ + if (!(real_dev->flags & IFF_UP)) + goto out_unlock; + + if (__find_vlan_dev(real_dev, VLAN_ID) != NULL) { + /* was already registered. */ + printk(VLAN_DBG "%s: ALREADY had VLAN registered\n", __FUNCTION__); + goto out_unlock; + } + + /* Gotta set up the fields for the device. */ +#ifdef VLAN_DEBUG + printk(VLAN_DBG "About to allocate name, vlan_name_type: %i\n", + vlan_name_type); +#endif + switch (vlan_name_type) { + case VLAN_NAME_TYPE_RAW_PLUS_VID: + /* name will look like: eth1.0005 */ + snprintf(name, IFNAMSIZ, "%s.%.4i", real_dev->name, VLAN_ID); + break; + case VLAN_NAME_TYPE_PLUS_VID_NO_PAD: + /* Put our vlan.VID in the name. + * Name will look like: vlan5 + */ + snprintf(name, IFNAMSIZ, "vlan%i", VLAN_ID); + break; + case VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD: + /* Put our vlan.VID in the name. + * Name will look like: eth0.5 + */ + snprintf(name, IFNAMSIZ, "%s.%i", real_dev->name, VLAN_ID); + break; + case VLAN_NAME_TYPE_PLUS_VID: + /* Put our vlan.VID in the name. + * Name will look like: vlan0005 + */ + default: + snprintf(name, IFNAMSIZ, "vlan%.4i", VLAN_ID); + }; + + new_dev = alloc_netdev(sizeof(struct vlan_dev_info), name, + vlan_setup); + if (new_dev == NULL) + goto out_unlock; + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "Allocated new name -:%s:-\n", new_dev->name); +#endif + /* IFF_BROADCAST|IFF_MULTICAST; ??? */ + new_dev->flags = real_dev->flags; + new_dev->flags &= ~IFF_UP; + + new_dev->state = real_dev->state & VLAN_LINK_STATE_MASK; + + /* need 4 bytes for extra VLAN header info, + * hope the underlying device can handle it. + */ + new_dev->mtu = real_dev->mtu; + + /* TODO: maybe just assign it to be ETHERNET? */ + new_dev->type = real_dev->type; + + new_dev->hard_header_len = real_dev->hard_header_len; + if (!(real_dev->features & NETIF_F_HW_VLAN_TX)) { + /* Regular ethernet + 4 bytes (18 total). 
*/ + new_dev->hard_header_len += VLAN_HLEN; + } + + VLAN_MEM_DBG("new_dev->priv malloc, addr: %p size: %i\n", + new_dev->priv, + sizeof(struct vlan_dev_info)); + + memcpy(new_dev->broadcast, real_dev->broadcast, real_dev->addr_len); + memcpy(new_dev->dev_addr, real_dev->dev_addr, real_dev->addr_len); + new_dev->addr_len = real_dev->addr_len; + + if (real_dev->features & NETIF_F_HW_VLAN_TX) { + new_dev->hard_header = real_dev->hard_header; + new_dev->hard_start_xmit = vlan_dev_hwaccel_hard_start_xmit; + new_dev->rebuild_header = real_dev->rebuild_header; + } else { + new_dev->hard_header = vlan_dev_hard_header; + new_dev->hard_start_xmit = vlan_dev_hard_start_xmit; + new_dev->rebuild_header = vlan_dev_rebuild_header; + } + new_dev->hard_header_parse = real_dev->hard_header_parse; + + VLAN_DEV_INFO(new_dev)->vlan_id = VLAN_ID; /* 1 through VLAN_VID_MASK */ + VLAN_DEV_INFO(new_dev)->real_dev = real_dev; + VLAN_DEV_INFO(new_dev)->dent = NULL; + VLAN_DEV_INFO(new_dev)->flags = 1; + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "About to go find the group for idx: %i\n", + real_dev->ifindex); +#endif + + if (register_netdevice(new_dev)) + goto out_free_newdev; + + /* So, got the sucker initialized, now lets place + * it into our local structure. + */ + grp = __vlan_find_group(real_dev->ifindex); + + /* Note, we are running under the RTNL semaphore + * so it cannot "appear" on us. + */ + if (!grp) { /* need to add a new group */ + grp = kmalloc(sizeof(struct vlan_group), GFP_KERNEL); + if (!grp) + goto out_free_unregister; + + /* printk(KERN_ALERT "VLAN REGISTER: Allocated new group.\n"); */ + memset(grp, 0, sizeof(struct vlan_group)); + grp->real_dev_ifindex = real_dev->ifindex; + + hlist_add_head_rcu(&grp->hlist, + &vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]); + + if (real_dev->features & NETIF_F_HW_VLAN_RX) + real_dev->vlan_rx_register(real_dev, grp); + } + + grp->vlan_devices[VLAN_ID] = new_dev; + + if (vlan_proc_add_dev(new_dev)<0)/* create it's proc entry */ + printk(KERN_WARNING "VLAN: failed to add proc entry for %s\n", + new_dev->name); + + if (real_dev->features & NETIF_F_HW_VLAN_FILTER) + real_dev->vlan_rx_add_vid(real_dev, VLAN_ID); + + rtnl_unlock(); + + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "Allocated new device successfully, returning.\n"); +#endif + return new_dev; + +out_free_unregister: + unregister_netdev(new_dev); + goto out_unlock; + +out_free_newdev: + free_netdev(new_dev); + +out_unlock: + rtnl_unlock(); + +out_put_dev: + dev_put(real_dev); + +out_ret_null: + return NULL; +} + +static int vlan_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct vlan_group *grp = __vlan_find_group(dev->ifindex); + int i, flgs; + struct net_device *vlandev; + + if (!grp) + goto out; + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGE: + /* Propagate real device state to vlan devices */ + flgs = dev->state & VLAN_LINK_STATE_MASK; + for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + vlandev = grp->vlan_devices[i]; + if (!vlandev) + continue; + + if ((vlandev->state & VLAN_LINK_STATE_MASK) != flgs) { + vlandev->state = (vlandev->state &~ VLAN_LINK_STATE_MASK) + | flgs; + netdev_state_change(vlandev); + } + } + break; + + case NETDEV_DOWN: + /* Put all VLANs for this dev in the down state too. 
*/ + for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + vlandev = grp->vlan_devices[i]; + if (!vlandev) + continue; + + flgs = vlandev->flags; + if (!(flgs & IFF_UP)) + continue; + + dev_change_flags(vlandev, flgs & ~IFF_UP); + } + break; + + case NETDEV_UP: + /* Put all VLANs for this dev in the up state too. */ + for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + vlandev = grp->vlan_devices[i]; + if (!vlandev) + continue; + + flgs = vlandev->flags; + if (flgs & IFF_UP) + continue; + + dev_change_flags(vlandev, flgs | IFF_UP); + } + break; + + case NETDEV_UNREGISTER: + /* Delete all VLANs for this dev. */ + for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + int ret; + + vlandev = grp->vlan_devices[i]; + if (!vlandev) + continue; + + ret = unregister_vlan_dev(dev, + VLAN_DEV_INFO(vlandev)->vlan_id); + + unregister_netdevice(vlandev); + + /* Group was destroyed? */ + if (ret == 1) + break; + } + break; + }; + +out: + return NOTIFY_DONE; +} + +/* + * VLAN IOCTL handler. + * o execute requested action or pass command to the device driver + * arg is really a struct vlan_ioctl_args __user *. + */ +static int vlan_ioctl_handler(void __user *arg) +{ + int err = 0; + unsigned short vid = 0; + struct vlan_ioctl_args args; + + if (copy_from_user(&args, arg, sizeof(struct vlan_ioctl_args))) + return -EFAULT; + + /* Null terminate this sucker, just in case. */ + args.device1[23] = 0; + args.u.device2[23] = 0; + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: args.cmd: %x\n", __FUNCTION__, args.cmd); +#endif + + switch (args.cmd) { + case SET_VLAN_INGRESS_PRIORITY_CMD: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = vlan_dev_set_ingress_priority(args.device1, + args.u.skb_priority, + args.vlan_qos); + break; + + case SET_VLAN_EGRESS_PRIORITY_CMD: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = vlan_dev_set_egress_priority(args.device1, + args.u.skb_priority, + args.vlan_qos); + break; + + case SET_VLAN_FLAG_CMD: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = vlan_dev_set_vlan_flag(args.device1, + args.u.flag, + args.vlan_qos); + break; + + case SET_VLAN_NAME_TYPE_CMD: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if ((args.u.name_type >= 0) && + (args.u.name_type < VLAN_NAME_TYPE_HIGHEST)) { + vlan_name_type = args.u.name_type; + err = 0; + } else { + err = -EINVAL; + } + break; + + case ADD_VLAN_CMD: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* we have been given the name of the Ethernet Device we want to + * talk to: args.dev1 We also have the + * VLAN ID: args.u.VID + */ + if (register_vlan_device(args.device1, args.u.VID)) { + err = 0; + } else { + err = -EINVAL; + } + break; + + case DEL_VLAN_CMD: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + /* Here, the args.dev1 is the actual VLAN we want + * to get rid of. 
+ */ + err = unregister_vlan_device(args.device1); + break; + + case GET_VLAN_INGRESS_PRIORITY_CMD: + /* TODO: Implement + err = vlan_dev_get_ingress_priority(args); + if (copy_to_user((void*)arg, &args, + sizeof(struct vlan_ioctl_args))) { + err = -EFAULT; + } + */ + err = -EINVAL; + break; + case GET_VLAN_EGRESS_PRIORITY_CMD: + /* TODO: Implement + err = vlan_dev_get_egress_priority(args.device1, &(args.args); + if (copy_to_user((void*)arg, &args, + sizeof(struct vlan_ioctl_args))) { + err = -EFAULT; + } + */ + err = -EINVAL; + break; + case GET_VLAN_REALDEV_NAME_CMD: + err = vlan_dev_get_realdev_name(args.device1, args.u.device2); + if (copy_to_user(arg, &args, + sizeof(struct vlan_ioctl_args))) { + err = -EFAULT; + } + break; + + case GET_VLAN_VID_CMD: + err = vlan_dev_get_vid(args.device1, &vid); + args.u.VID = vid; + if (copy_to_user(arg, &args, + sizeof(struct vlan_ioctl_args))) { + err = -EFAULT; + } + break; + + default: + /* pass on to underlying device instead?? */ + printk(VLAN_DBG "%s: Unknown VLAN CMD: %x \n", + __FUNCTION__, args.cmd); + return -EINVAL; + }; + + return err; +} + +MODULE_LICENSE("GPL"); +MODULE_VERSION(DRV_VERSION); diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h new file mode 100644 index 000000000000..508b1fa14546 --- /dev/null +++ b/net/8021q/vlan.h @@ -0,0 +1,72 @@ +#ifndef __BEN_VLAN_802_1Q_INC__ +#define __BEN_VLAN_802_1Q_INC__ + +#include + +/* Uncomment this if you want debug traces to be shown. */ +/* #define VLAN_DEBUG */ + +#define VLAN_ERR KERN_ERR +#define VLAN_INF KERN_INFO +#define VLAN_DBG KERN_ALERT /* change these... to debug, having a hard time + * changing the log level at run-time..for some reason. + */ + +/* + +These I use for memory debugging. I feared a leak at one time, but +I never found it..and the problem seems to have dissappeared. Still, +I'll bet they might prove useful again... --Ben + + +#define VLAN_MEM_DBG(x, y, z) printk(VLAN_DBG "%s: " x, __FUNCTION__, y, z); +#define VLAN_FMEM_DBG(x, y) printk(VLAN_DBG "%s: " x, __FUNCTION__, y); +*/ + +/* This way they don't do anything! */ +#define VLAN_MEM_DBG(x, y, z) +#define VLAN_FMEM_DBG(x, y) + + +extern unsigned short vlan_name_type; + +#define VLAN_GRP_HASH_SHIFT 5 +#define VLAN_GRP_HASH_SIZE (1 << VLAN_GRP_HASH_SHIFT) +#define VLAN_GRP_HASH_MASK (VLAN_GRP_HASH_SIZE - 1) + +/* Find a VLAN device by the MAC address of its Ethernet device, and + * it's VLAN ID. The default configuration is to have VLAN's scope + * to be box-wide, so the MAC will be ignored. The mac will only be + * looked at if we are configured to have a separate set of VLANs per + * each MAC addressable interface. Note that this latter option does + * NOT follow the spec for VLANs, but may be useful for doing very + * large quantities of VLAN MUX/DEMUX onto FrameRelay or ATM PVCs. + * + * Must be invoked with rcu_read_lock (ie preempt disabled) + * or with RTNL. 
+ */ +struct net_device *__find_vlan_dev(struct net_device* real_dev, + unsigned short VID); /* vlan.c */ + +/* found in vlan_dev.c */ +int vlan_dev_rebuild_header(struct sk_buff *skb); +int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type* ptype); +int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, void *daddr, void *saddr, + unsigned len); +int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev); +int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev); +int vlan_dev_change_mtu(struct net_device *dev, int new_mtu); +int vlan_dev_set_mac_address(struct net_device *dev, void* addr); +int vlan_dev_open(struct net_device* dev); +int vlan_dev_stop(struct net_device* dev); +int vlan_dev_ioctl(struct net_device* dev, struct ifreq *ifr, int cmd); +int vlan_dev_set_ingress_priority(char* dev_name, __u32 skb_prio, short vlan_prio); +int vlan_dev_set_egress_priority(char* dev_name, __u32 skb_prio, short vlan_prio); +int vlan_dev_set_vlan_flag(char* dev_name, __u32 flag, short flag_val); +int vlan_dev_get_realdev_name(const char* dev_name, char* result); +int vlan_dev_get_vid(const char* dev_name, unsigned short* result); +void vlan_dev_set_multicast_list(struct net_device *vlan_dev); + +#endif /* !(__BEN_VLAN_802_1Q_INC__) */ diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c new file mode 100644 index 000000000000..49c487413518 --- /dev/null +++ b/net/8021q/vlan_dev.c @@ -0,0 +1,890 @@ +/* -*- linux-c -*- + * INET 802.1Q VLAN + * Ethernet-type device handling. + * + * Authors: Ben Greear + * Please send support related email to: vlan@scry.wanfear.com + * VLAN Home Page: http://www.candelatech.com/~greear/vlan.html + * + * Fixes: Mar 22 2001: Martin Bokaemper + * - reset skb->pkt_type on incoming packets when MAC was changed + * - see that changed MAC is saddr for outgoing packets + * Oct 20, 2001: Ard van Breeman: + * - Fix MC-list, finally. + * - Flush MC-list on VLAN destroy. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include /* for copy_from_user */ +#include +#include +#include +#include +#include +#include + +#include "vlan.h" +#include "vlanproc.h" +#include +#include + +/* + * Rebuild the Ethernet MAC header. This is called after an ARP + * (or in future other address resolution) has completed on this + * sk_buff. We now let ARP fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! + * Really, it is used only when dst->neigh is wrong. + * + * TODO: This needs a checkup, I'm ignorant here. --BLG + */ +int vlan_dev_rebuild_header(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + + switch (veth->h_vlan_encapsulated_proto) { +#ifdef CONFIG_INET + case __constant_htons(ETH_P_IP): + + /* TODO: Confirm this will work with VLAN headers... 
*/ + return arp_find(veth->h_dest, skb); +#endif + default: + printk(VLAN_DBG + "%s: unable to resolve type %X addresses.\n", + dev->name, (int)veth->h_vlan_encapsulated_proto); + + memcpy(veth->h_source, dev->dev_addr, ETH_ALEN); + break; + }; + + return 0; +} + +static inline struct sk_buff *vlan_check_reorder_header(struct sk_buff *skb) +{ + if (VLAN_DEV_INFO(skb->dev)->flags & 1) { + if (skb_shared(skb) || skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + kfree_skb(skb); + skb = nskb; + } + if (skb) { + /* Lifted from Gleb's VLAN code... */ + memmove(skb->data - ETH_HLEN, + skb->data - VLAN_ETH_HLEN, 12); + skb->mac.raw += VLAN_HLEN; + } + } + + return skb; +} + +/* + * Determine the packet's protocol ID. The rule here is that we + * assume 802.3 if the type field is short enough to be a length. + * This is normal practice and works for any 'now in use' protocol. + * + * Also, at this point we assume that we ARE dealing exclusively with + * VLAN packets, or packets that should be made into VLAN packets based + * on a default VLAN ID. + * + * NOTE: Should be similar to ethernet/eth.c. + * + * SANITY NOTE: This method is called when a packet is moving up the stack + * towards userland. To get here, it would have already passed + * through the ethernet/eth.c eth_type_trans() method. + * SANITY NOTE 2: We are referencing to the VLAN_HDR frields, which MAY be + * stored UNALIGNED in the memory. RISC systems don't like + * such cases very much... + * SANITY NOTE 2a: According to Dave Miller & Alexey, it will always be aligned, + * so there doesn't need to be any of the unaligned stuff. It has + * been commented out now... --Ben + * + */ +int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, + struct packet_type* ptype) +{ + unsigned char *rawp = NULL; + struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data); + unsigned short vid; + struct net_device_stats *stats; + unsigned short vlan_TCI; + unsigned short proto; + + /* vlan_TCI = ntohs(get_unaligned(&vhdr->h_vlan_TCI)); */ + vlan_TCI = ntohs(vhdr->h_vlan_TCI); + + vid = (vlan_TCI & VLAN_VID_MASK); + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: skb: %p vlan_id: %hx\n", + __FUNCTION__, skb, vid); +#endif + + /* Ok, we will find the correct VLAN device, strip the header, + * and then go on as usual. + */ + + /* We have 12 bits of vlan ID. + * + * We must not drop allow preempt until we hold a + * reference to the device (netif_rx does that) or we + * fail. + */ + + rcu_read_lock(); + skb->dev = __find_vlan_dev(dev, vid); + if (!skb->dev) { + rcu_read_unlock(); + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: ERROR: No net_device for VID: %i on dev: %s [%i]\n", + __FUNCTION__, (unsigned int)(vid), dev->name, dev->ifindex); +#endif + kfree_skb(skb); + return -1; + } + + skb->dev->last_rx = jiffies; + + /* Bump the rx counters for the VLAN device. */ + stats = vlan_dev_get_stats(skb->dev); + stats->rx_packets++; + stats->rx_bytes += skb->len; + + skb_pull(skb, VLAN_HLEN); /* take off the VLAN header (4 bytes currently) */ + + /* Ok, lets check to make sure the device (dev) we + * came in on is what this VLAN is attached to. 
+ */ + + if (dev != VLAN_DEV_INFO(skb->dev)->real_dev) { + rcu_read_unlock(); + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: dropping skb: %p because came in on wrong device, dev: %s real_dev: %s, skb_dev: %s\n", + __FUNCTION__, skb, dev->name, + VLAN_DEV_INFO(skb->dev)->real_dev->name, + skb->dev->name); +#endif + kfree_skb(skb); + stats->rx_errors++; + return -1; + } + + /* + * Deal with ingress priority mapping. + */ + skb->priority = vlan_get_ingress_priority(skb->dev, ntohs(vhdr->h_vlan_TCI)); + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: priority: %lu for TCI: %hu (hbo)\n", + __FUNCTION__, (unsigned long)(skb->priority), + ntohs(vhdr->h_vlan_TCI)); +#endif + + /* The ethernet driver already did the pkt_type calculations + * for us... + */ + switch (skb->pkt_type) { + case PACKET_BROADCAST: /* Yeah, stats collect these together.. */ + // stats->broadcast ++; // no such counter :-( + break; + + case PACKET_MULTICAST: + stats->multicast++; + break; + + case PACKET_OTHERHOST: + /* Our lower layer thinks this is not local, let's make sure. + * This allows the VLAN to have a different MAC than the underlying + * device, and still route correctly. + */ + if (memcmp(eth_hdr(skb)->h_dest, skb->dev->dev_addr, ETH_ALEN) == 0) { + /* It is for our (changed) MAC-address! */ + skb->pkt_type = PACKET_HOST; + } + break; + default: + break; + }; + + /* Was a VLAN packet, grab the encapsulated protocol, which the layer + * three protocols care about. + */ + /* proto = get_unaligned(&vhdr->h_vlan_encapsulated_proto); */ + proto = vhdr->h_vlan_encapsulated_proto; + + skb->protocol = proto; + if (ntohs(proto) >= 1536) { + /* place it back on the queue to be handled by + * true layer 3 protocols. + */ + + /* See if we are configured to re-write the VLAN header + * to make it look like ethernet... + */ + skb = vlan_check_reorder_header(skb); + + /* Can be null if skb-clone fails when re-ordering */ + if (skb) { + netif_rx(skb); + } else { + /* TODO: Add a more specific counter here. */ + stats->rx_errors++; + } + rcu_read_unlock(); + return 0; + } + + rawp = skb->data; + + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. + */ + if (*(unsigned short *)rawp == 0xFFFF) { + skb->protocol = __constant_htons(ETH_P_802_3); + /* place it back on the queue to be handled by true layer 3 protocols. + */ + + /* See if we are configured to re-write the VLAN header + * to make it look like ethernet... + */ + skb = vlan_check_reorder_header(skb); + + /* Can be null if skb-clone fails when re-ordering */ + if (skb) { + netif_rx(skb); + } else { + /* TODO: Add a more specific counter here. */ + stats->rx_errors++; + } + rcu_read_unlock(); + return 0; + } + + /* + * Real 802.2 LLC + */ + skb->protocol = __constant_htons(ETH_P_802_2); + /* place it back on the queue to be handled by upper layer protocols. + */ + + /* See if we are configured to re-write the VLAN header + * to make it look like ethernet... + */ + skb = vlan_check_reorder_header(skb); + + /* Can be null if skb-clone fails when re-ordering */ + if (skb) { + netif_rx(skb); + } else { + /* TODO: Add a more specific counter here. 
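+ * (Until such a counter exists, the generic rx_errors bump below also covers frames lost when the reorder-header copy fails.)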
*/ + stats->rx_errors++; + } + rcu_read_unlock(); + return 0; +} + +static inline unsigned short vlan_dev_get_egress_qos_mask(struct net_device* dev, + struct sk_buff* skb) +{ + struct vlan_priority_tci_mapping *mp = + VLAN_DEV_INFO(dev)->egress_priority_map[(skb->priority & 0xF)]; + + while (mp) { + if (mp->priority == skb->priority) { + return mp->vlan_qos; /* This should already be shifted to mask + * correctly with the VLAN's TCI + */ + } + mp = mp->next; + } + return 0; +} + +/* + * Create the VLAN header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + * + * This is called when the SKB is moving down the stack towards the + * physical devices. + */ +int vlan_dev_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, void *daddr, void *saddr, + unsigned len) +{ + struct vlan_hdr *vhdr; + unsigned short veth_TCI = 0; + int rc = 0; + int build_vlan_header = 0; + struct net_device *vdev = dev; /* save this for the bottom of the method */ + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: skb: %p type: %hx len: %x vlan_id: %hx, daddr: %p\n", + __FUNCTION__, skb, type, len, VLAN_DEV_INFO(dev)->vlan_id, daddr); +#endif + + /* build vlan header only if re_order_header flag is NOT set. This + * fixes some programs that get confused when they see a VLAN device + * sending a frame that is VLAN encoded (the consensus is that the VLAN + * device should look completely like an Ethernet device when the + * REORDER_HEADER flag is set) The drawback to this is some extra + * header shuffling in the hard_start_xmit. Users can turn off this + * REORDER behaviour with the vconfig tool. + */ + build_vlan_header = ((VLAN_DEV_INFO(dev)->flags & 1) == 0); + + if (build_vlan_header) { + vhdr = (struct vlan_hdr *) skb_push(skb, VLAN_HLEN); + + /* build the four bytes that make this a VLAN header. */ + + /* Now, construct the second two bytes. This field looks something + * like: + * usr_priority: 3 bits (high bits) + * CFI 1 bit + * VLAN ID 12 bits (low bits) + * + */ + veth_TCI = VLAN_DEV_INFO(dev)->vlan_id; + veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb); + + vhdr->h_vlan_TCI = htons(veth_TCI); + + /* + * Set the protocol type. + * For a packet of type ETH_P_802_3 we put the length in here instead. + * It is up to the 802.2 layer to carry protocol information. + */ + + if (type != ETH_P_802_3) { + vhdr->h_vlan_encapsulated_proto = htons(type); + } else { + vhdr->h_vlan_encapsulated_proto = htons(len); + } + } + + /* Before delegating work to the lower layer, enter our MAC-address */ + if (saddr == NULL) + saddr = dev->dev_addr; + + dev = VLAN_DEV_INFO(dev)->real_dev; + + /* MPLS can send us skbuffs w/out enough space. This check will grow the + * skb if it doesn't have enough headroom. Not a beautiful solution, so + * I'll tick a counter so that users can know it's happening... If they + * care... + */ + + /* NOTE: This may still break if the underlying device is not the final + * device (and thus there are more headers to add...) It should work for + * good-ole-ethernet though. 
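+ * (skb_realloc_headroom() below hands back a copy with at least hard_header_len bytes of headroom; the original skb is always freed, and a failed copy is charged to tx_dropped.)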
+ */ + if (skb_headroom(skb) < dev->hard_header_len) { + struct sk_buff *sk_tmp = skb; + skb = skb_realloc_headroom(sk_tmp, dev->hard_header_len); + kfree_skb(sk_tmp); + if (skb == NULL) { + struct net_device_stats *stats = vlan_dev_get_stats(vdev); + stats->tx_dropped++; + return -ENOMEM; + } + VLAN_DEV_INFO(vdev)->cnt_inc_headroom_on_tx++; +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: %s: had to grow skb.\n", __FUNCTION__, vdev->name); +#endif + } + + if (build_vlan_header) { + /* Now make the underlying real hard header */ + rc = dev->hard_header(skb, dev, ETH_P_8021Q, daddr, saddr, len + VLAN_HLEN); + + if (rc > 0) { + rc += VLAN_HLEN; + } else if (rc < 0) { + rc -= VLAN_HLEN; + } + } else { + /* If here, then we'll just make a normal looking ethernet frame, + * but, the hard_start_xmit method will insert the tag (it has to + * be able to do this for bridged and other skbs that don't come + * down the protocol stack in an orderly manner. + */ + rc = dev->hard_header(skb, dev, type, daddr, saddr, len); + } + + return rc; +} + +int vlan_dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = vlan_dev_get_stats(dev); + struct vlan_ethhdr *veth = (struct vlan_ethhdr *)(skb->data); + + /* Handle non-VLAN frames if they are sent to us, for example by DHCP. + * + * NOTE: THIS ASSUMES DIX ETHERNET, SPECIFICALLY NOT SUPPORTING + * OTHER THINGS LIKE FDDI/TokenRing/802.3 SNAPs... + */ + + if (veth->h_vlan_proto != __constant_htons(ETH_P_8021Q)) { + int orig_headroom = skb_headroom(skb); + unsigned short veth_TCI; + + /* This is not a VLAN frame...but we can fix that! */ + VLAN_DEV_INFO(dev)->cnt_encap_on_xmit++; + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: proto to encap: 0x%hx (hbo)\n", + __FUNCTION__, htons(veth->h_vlan_proto)); +#endif + /* Construct the second two bytes. This field looks something + * like: + * usr_priority: 3 bits (high bits) + * CFI 1 bit + * VLAN ID 12 bits (low bits) + */ + veth_TCI = VLAN_DEV_INFO(dev)->vlan_id; + veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb); + + skb = __vlan_put_tag(skb, veth_TCI); + if (!skb) { + stats->tx_dropped++; + return 0; + } + + if (orig_headroom < VLAN_HLEN) { + VLAN_DEV_INFO(dev)->cnt_inc_headroom_on_tx++; + } + } + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: about to send skb: %p to dev: %s\n", + __FUNCTION__, skb, skb->dev->name); + printk(VLAN_DBG " %2hx.%2hx.%2hx.%2xh.%2hx.%2hx %2hx.%2hx.%2hx.%2hx.%2hx.%2hx %4hx %4hx %4hx\n", + veth->h_dest[0], veth->h_dest[1], veth->h_dest[2], veth->h_dest[3], veth->h_dest[4], veth->h_dest[5], + veth->h_source[0], veth->h_source[1], veth->h_source[2], veth->h_source[3], veth->h_source[4], veth->h_source[5], + veth->h_vlan_proto, veth->h_vlan_TCI, veth->h_vlan_encapsulated_proto); +#endif + + stats->tx_packets++; /* for statics only */ + stats->tx_bytes += skb->len; + + skb->dev = VLAN_DEV_INFO(dev)->real_dev; + dev_queue_xmit(skb); + + return 0; +} + +int vlan_dev_hwaccel_hard_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = vlan_dev_get_stats(dev); + unsigned short veth_TCI; + + /* Construct the second two bytes. 
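+ * (For example, vlan_id 5 with an egress mapping to user priority 3 gives a TCI of (3 << 13) | 5 == 0x6005, assuming the CFI bit is clear.)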
This field looks something + * like: + * usr_priority: 3 bits (high bits) + * CFI 1 bit + * VLAN ID 12 bits (low bits) + */ + veth_TCI = VLAN_DEV_INFO(dev)->vlan_id; + veth_TCI |= vlan_dev_get_egress_qos_mask(dev, skb); + skb = __vlan_hwaccel_put_tag(skb, veth_TCI); + + stats->tx_packets++; + stats->tx_bytes += skb->len; + + skb->dev = VLAN_DEV_INFO(dev)->real_dev; + dev_queue_xmit(skb); + + return 0; +} + +int vlan_dev_change_mtu(struct net_device *dev, int new_mtu) +{ + /* TODO: gotta make sure the underlying layer can handle it, + * maybe an IFF_VLAN_CAPABLE flag for devices? + */ + if (VLAN_DEV_INFO(dev)->real_dev->mtu < new_mtu) + return -ERANGE; + + dev->mtu = new_mtu; + + return 0; +} + +int vlan_dev_set_ingress_priority(char *dev_name, __u32 skb_prio, short vlan_prio) +{ + struct net_device *dev = dev_get_by_name(dev_name); + + if (dev) { + if (dev->priv_flags & IFF_802_1Q_VLAN) { + /* see if a priority mapping exists.. */ + VLAN_DEV_INFO(dev)->ingress_priority_map[vlan_prio & 0x7] = skb_prio; + dev_put(dev); + return 0; + } + + dev_put(dev); + } + return -EINVAL; +} + +int vlan_dev_set_egress_priority(char *dev_name, __u32 skb_prio, short vlan_prio) +{ + struct net_device *dev = dev_get_by_name(dev_name); + struct vlan_priority_tci_mapping *mp = NULL; + struct vlan_priority_tci_mapping *np; + + if (dev) { + if (dev->priv_flags & IFF_802_1Q_VLAN) { + /* See if a priority mapping exists.. */ + mp = VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF]; + while (mp) { + if (mp->priority == skb_prio) { + mp->vlan_qos = ((vlan_prio << 13) & 0xE000); + dev_put(dev); + return 0; + } + mp = mp->next; + } + + /* Create a new mapping then. */ + mp = VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF]; + np = kmalloc(sizeof(struct vlan_priority_tci_mapping), GFP_KERNEL); + if (np) { + np->next = mp; + np->priority = skb_prio; + np->vlan_qos = ((vlan_prio << 13) & 0xE000); + VLAN_DEV_INFO(dev)->egress_priority_map[skb_prio & 0xF] = np; + dev_put(dev); + return 0; + } else { + dev_put(dev); + return -ENOBUFS; + } + } + dev_put(dev); + } + return -EINVAL; +} + +/* Flags are defined in the vlan_dev_info class in include/linux/if_vlan.h file. 
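+ * (Only bit 0 is honoured here: it is the REORDER_HDR behaviour used by vlan_check_reorder_header() and vlan_dev_hard_header() above, and is normally toggled from user space with the vconfig tool.)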
*/ +int vlan_dev_set_vlan_flag(char *dev_name, __u32 flag, short flag_val) +{ + struct net_device *dev = dev_get_by_name(dev_name); + + if (dev) { + if (dev->priv_flags & IFF_802_1Q_VLAN) { + /* verify flag is supported */ + if (flag == 1) { + if (flag_val) { + VLAN_DEV_INFO(dev)->flags |= 1; + } else { + VLAN_DEV_INFO(dev)->flags &= ~1; + } + dev_put(dev); + return 0; + } else { + printk(KERN_ERR "%s: flag %i is not valid.\n", + __FUNCTION__, (int)(flag)); + dev_put(dev); + return -EINVAL; + } + } else { + printk(KERN_ERR + "%s: %s is not a vlan device, priv_flags: %hX.\n", + __FUNCTION__, dev->name, dev->priv_flags); + dev_put(dev); + } + } else { + printk(KERN_ERR "%s: Could not find device: %s\n", + __FUNCTION__, dev_name); + } + + return -EINVAL; +} + + +int vlan_dev_get_realdev_name(const char *dev_name, char* result) +{ + struct net_device *dev = dev_get_by_name(dev_name); + int rv = 0; + if (dev) { + if (dev->priv_flags & IFF_802_1Q_VLAN) { + strncpy(result, VLAN_DEV_INFO(dev)->real_dev->name, 23); + rv = 0; + } else { + rv = -EINVAL; + } + dev_put(dev); + } else { + rv = -ENODEV; + } + return rv; +} + +int vlan_dev_get_vid(const char *dev_name, unsigned short* result) +{ + struct net_device *dev = dev_get_by_name(dev_name); + int rv = 0; + if (dev) { + if (dev->priv_flags & IFF_802_1Q_VLAN) { + *result = VLAN_DEV_INFO(dev)->vlan_id; + rv = 0; + } else { + rv = -EINVAL; + } + dev_put(dev); + } else { + rv = -ENODEV; + } + return rv; +} + + +int vlan_dev_set_mac_address(struct net_device *dev, void *addr_struct_p) +{ + struct sockaddr *addr = (struct sockaddr *)(addr_struct_p); + int i; + + if (netif_running(dev)) + return -EBUSY; + + memcpy(dev->dev_addr, addr->sa_data, dev->addr_len); + + printk("%s: Setting MAC address to ", dev->name); + for (i = 0; i < 6; i++) + printk(" %2.2x", dev->dev_addr[i]); + printk(".\n"); + + if (memcmp(VLAN_DEV_INFO(dev)->real_dev->dev_addr, + dev->dev_addr, + dev->addr_len) != 0) { + if (!(VLAN_DEV_INFO(dev)->real_dev->flags & IFF_PROMISC)) { + int flgs = VLAN_DEV_INFO(dev)->real_dev->flags; + + /* Increment our in-use promiscuity counter */ + dev_set_promiscuity(VLAN_DEV_INFO(dev)->real_dev, 1); + + /* Make PROMISC visible to the user. */ + flgs |= IFF_PROMISC; + printk("VLAN (%s): Setting underlying device (%s) to promiscious mode.\n", + dev->name, VLAN_DEV_INFO(dev)->real_dev->name); + dev_change_flags(VLAN_DEV_INFO(dev)->real_dev, flgs); + } + } else { + printk("VLAN (%s): Underlying device (%s) has same MAC, not checking promiscious mode.\n", + dev->name, VLAN_DEV_INFO(dev)->real_dev->name); + } + + return 0; +} + +static inline int vlan_dmi_equals(struct dev_mc_list *dmi1, + struct dev_mc_list *dmi2) +{ + return ((dmi1->dmi_addrlen == dmi2->dmi_addrlen) && + (memcmp(dmi1->dmi_addr, dmi2->dmi_addr, dmi1->dmi_addrlen) == 0)); +} + +/** dmi is a single entry into a dev_mc_list, a single node. mc_list is + * an entire list, and we'll iterate through it. 
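+ * Returns 1 if dmi is not present in mc_list, or if it has more users than the matching entry there; returns 0 otherwise.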
+ */ +static int vlan_should_add_mc(struct dev_mc_list *dmi, struct dev_mc_list *mc_list) +{ + struct dev_mc_list *idmi; + + for (idmi = mc_list; idmi != NULL; ) { + if (vlan_dmi_equals(dmi, idmi)) { + if (dmi->dmi_users > idmi->dmi_users) + return 1; + else + return 0; + } else { + idmi = idmi->next; + } + } + + return 1; +} + +static inline void vlan_destroy_mc_list(struct dev_mc_list *mc_list) +{ + struct dev_mc_list *dmi = mc_list; + struct dev_mc_list *next; + + while(dmi) { + next = dmi->next; + kfree(dmi); + dmi = next; + } +} + +static void vlan_copy_mc_list(struct dev_mc_list *mc_list, struct vlan_dev_info *vlan_info) +{ + struct dev_mc_list *dmi, *new_dmi; + + vlan_destroy_mc_list(vlan_info->old_mc_list); + vlan_info->old_mc_list = NULL; + + for (dmi = mc_list; dmi != NULL; dmi = dmi->next) { + new_dmi = kmalloc(sizeof(*new_dmi), GFP_ATOMIC); + if (new_dmi == NULL) { + printk(KERN_ERR "vlan: cannot allocate memory. " + "Multicast may not work properly from now.\n"); + return; + } + + /* Copy whole structure, then make new 'next' pointer */ + *new_dmi = *dmi; + new_dmi->next = vlan_info->old_mc_list; + vlan_info->old_mc_list = new_dmi; + } +} + +static void vlan_flush_mc_list(struct net_device *dev) +{ + struct dev_mc_list *dmi = dev->mc_list; + + while (dmi) { + printk(KERN_DEBUG "%s: del %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address from vlan interface\n", + dev->name, + dmi->dmi_addr[0], + dmi->dmi_addr[1], + dmi->dmi_addr[2], + dmi->dmi_addr[3], + dmi->dmi_addr[4], + dmi->dmi_addr[5]); + dev_mc_delete(dev, dmi->dmi_addr, dmi->dmi_addrlen, 0); + dmi = dev->mc_list; + } + + /* dev->mc_list is NULL by the time we get here. */ + vlan_destroy_mc_list(VLAN_DEV_INFO(dev)->old_mc_list); + VLAN_DEV_INFO(dev)->old_mc_list = NULL; +} + +int vlan_dev_open(struct net_device *dev) +{ + if (!(VLAN_DEV_INFO(dev)->real_dev->flags & IFF_UP)) + return -ENETDOWN; + + return 0; +} + +int vlan_dev_stop(struct net_device *dev) +{ + vlan_flush_mc_list(dev); + return 0; +} + +int vlan_dev_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + struct net_device *real_dev = VLAN_DEV_INFO(dev)->real_dev; + struct ifreq ifrr; + int err = -EOPNOTSUPP; + + strncpy(ifrr.ifr_name, real_dev->name, IFNAMSIZ); + ifrr.ifr_ifru = ifr->ifr_ifru; + + switch(cmd) { + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSMIIREG: + if (real_dev->do_ioctl && netif_device_present(real_dev)) + err = real_dev->do_ioctl(real_dev, &ifrr, cmd); + break; + + case SIOCETHTOOL: + err = dev_ethtool(&ifrr); + } + + if (!err) + ifr->ifr_ifru = ifrr.ifr_ifru; + + return err; +} + +/** Taken from Gleb + Lennert's VLAN code, and modified... */ +void vlan_dev_set_multicast_list(struct net_device *vlan_dev) +{ + struct dev_mc_list *dmi; + struct net_device *real_dev; + int inc; + + if (vlan_dev && (vlan_dev->priv_flags & IFF_802_1Q_VLAN)) { + /* Then it's a real vlan device, as far as we can tell.. */ + real_dev = VLAN_DEV_INFO(vlan_dev)->real_dev; + + /* compare the current promiscuity to the last promisc we had.. 
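+ * (Any difference is propagated to the real device below so that its promiscuity and allmulti reference counts track the VLAN device's.)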
*/ + inc = vlan_dev->promiscuity - VLAN_DEV_INFO(vlan_dev)->old_promiscuity; + if (inc) { + printk(KERN_INFO "%s: dev_set_promiscuity(master, %d)\n", + vlan_dev->name, inc); + dev_set_promiscuity(real_dev, inc); /* found in dev.c */ + VLAN_DEV_INFO(vlan_dev)->old_promiscuity = vlan_dev->promiscuity; + } + + inc = vlan_dev->allmulti - VLAN_DEV_INFO(vlan_dev)->old_allmulti; + if (inc) { + printk(KERN_INFO "%s: dev_set_allmulti(master, %d)\n", + vlan_dev->name, inc); + dev_set_allmulti(real_dev, inc); /* dev.c */ + VLAN_DEV_INFO(vlan_dev)->old_allmulti = vlan_dev->allmulti; + } + + /* looking for addresses to add to master's list */ + for (dmi = vlan_dev->mc_list; dmi != NULL; dmi = dmi->next) { + if (vlan_should_add_mc(dmi, VLAN_DEV_INFO(vlan_dev)->old_mc_list)) { + dev_mc_add(real_dev, dmi->dmi_addr, dmi->dmi_addrlen, 0); + printk(KERN_DEBUG "%s: add %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address to master interface\n", + vlan_dev->name, + dmi->dmi_addr[0], + dmi->dmi_addr[1], + dmi->dmi_addr[2], + dmi->dmi_addr[3], + dmi->dmi_addr[4], + dmi->dmi_addr[5]); + } + } + + /* looking for addresses to delete from master's list */ + for (dmi = VLAN_DEV_INFO(vlan_dev)->old_mc_list; dmi != NULL; dmi = dmi->next) { + if (vlan_should_add_mc(dmi, vlan_dev->mc_list)) { + /* if we think we should add it to the new list, then we should really + * delete it from the real list on the underlying device. + */ + dev_mc_delete(real_dev, dmi->dmi_addr, dmi->dmi_addrlen, 0); + printk(KERN_DEBUG "%s: del %.2x:%.2x:%.2x:%.2x:%.2x:%.2x mcast address from master interface\n", + vlan_dev->name, + dmi->dmi_addr[0], + dmi->dmi_addr[1], + dmi->dmi_addr[2], + dmi->dmi_addr[3], + dmi->dmi_addr[4], + dmi->dmi_addr[5]); + } + } + + /* save multicast list */ + vlan_copy_mc_list(vlan_dev->mc_list, VLAN_DEV_INFO(vlan_dev)); + } +} diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c new file mode 100644 index 000000000000..c32d27af0a3f --- /dev/null +++ b/net/8021q/vlanproc.c @@ -0,0 +1,357 @@ +/****************************************************************************** + * vlanproc.c VLAN Module. /proc filesystem interface. + * + * This module is completely hardware-independent and provides + * access to the router using Linux /proc filesystem. + * + * Author: Ben Greear, coppied from wanproc.c + * by: Gene Kozin + * + * Copyright: (c) 1998 Ben Greear + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * ============================================================================ + * Jan 20, 1998 Ben Greear Initial Version + *****************************************************************************/ + +#include +#include +#include /* offsetof(), etc. */ +#include /* return codes */ +#include +#include /* kmalloc(), kfree() */ +#include /* verify_area(), etc. */ +#include /* inline mem*, str* functions */ +#include /* __initfunc et al. */ +#include /* htons(), etc. 
*/ +#include /* copy_to_user */ +#include +#include +#include +#include +#include +#include +#include "vlanproc.h" +#include "vlan.h" + +/****** Function Prototypes *************************************************/ + +/* Methods for preparing data for reading proc entries */ +static int vlan_seq_show(struct seq_file *seq, void *v); +static void *vlan_seq_start(struct seq_file *seq, loff_t *pos); +static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos); +static void vlan_seq_stop(struct seq_file *seq, void *); +static int vlandev_seq_show(struct seq_file *seq, void *v); + +/* + * Global Data + */ + + +/* + * Names of the proc directory entries + */ + +static const char name_root[] = "vlan"; +static const char name_conf[] = "config"; + +/* + * Structures for interfacing with the /proc filesystem. + * VLAN creates its own directory /proc/net/vlan with the folowing + * entries: + * config device status/configuration + * entry for each device + */ + +/* + * Generic /proc/net/vlan/ file and inode operations + */ + +static struct seq_operations vlan_seq_ops = { + .start = vlan_seq_start, + .next = vlan_seq_next, + .stop = vlan_seq_stop, + .show = vlan_seq_show, +}; + +static int vlan_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vlan_seq_ops); +} + +static struct file_operations vlan_fops = { + .owner = THIS_MODULE, + .open = vlan_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +/* + * /proc/net/vlan/ file and inode operations + */ + +static int vlandev_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, vlandev_seq_show, PDE(inode)->data); +} + +static struct file_operations vlandev_fops = { + .owner = THIS_MODULE, + .open = vlandev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Proc filesystem derectory entries. + */ + +/* + * /proc/net/vlan + */ + +static struct proc_dir_entry *proc_vlan_dir; + +/* + * /proc/net/vlan/config + */ + +static struct proc_dir_entry *proc_vlan_conf; + +/* Strings */ +static const char *vlan_name_type_str[VLAN_NAME_TYPE_HIGHEST] = { + [VLAN_NAME_TYPE_RAW_PLUS_VID] = "VLAN_NAME_TYPE_RAW_PLUS_VID", + [VLAN_NAME_TYPE_PLUS_VID_NO_PAD] = "VLAN_NAME_TYPE_PLUS_VID_NO_PAD", + [VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD]= "VLAN_NAME_TYPE_RAW_PLUS_VID_NO_PAD", + [VLAN_NAME_TYPE_PLUS_VID] = "VLAN_NAME_TYPE_PLUS_VID", +}; +/* + * Interface functions + */ + +/* + * Clean up /proc/net/vlan entries + */ + +void vlan_proc_cleanup(void) +{ + if (proc_vlan_conf) + remove_proc_entry(name_conf, proc_vlan_dir); + + if (proc_vlan_dir) + proc_net_remove(name_root); + + /* Dynamically added entries should be cleaned up as their vlan_device + * is removed, so we should not have to take care of it here... + */ +} + +/* + * Create /proc/net/vlan entries + */ + +int __init vlan_proc_init(void) +{ + proc_vlan_dir = proc_mkdir(name_root, proc_net); + if (proc_vlan_dir) { + proc_vlan_conf = create_proc_entry(name_conf, + S_IFREG|S_IRUSR|S_IWUSR, + proc_vlan_dir); + if (proc_vlan_conf) { + proc_vlan_conf->proc_fops = &vlan_fops; + return 0; + } + } + vlan_proc_cleanup(); + return -ENOBUFS; +} + +/* + * Add directory entry for VLAN device. 
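+ * Creates /proc/net/vlan/<ifname> backed by vlandev_fops; returns -EINVAL for a non-VLAN device and -ENOBUFS if the entry cannot be created.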
+ */ + +int vlan_proc_add_dev (struct net_device *vlandev) +{ + struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev); + + if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) { + printk(KERN_ERR + "ERROR: vlan_proc_add, device -:%s:- is NOT a VLAN\n", + vlandev->name); + return -EINVAL; + } + + dev_info->dent = create_proc_entry(vlandev->name, + S_IFREG|S_IRUSR|S_IWUSR, + proc_vlan_dir); + if (!dev_info->dent) + return -ENOBUFS; + + dev_info->dent->proc_fops = &vlandev_fops; + dev_info->dent->data = vlandev; + +#ifdef VLAN_DEBUG + printk(KERN_ERR "vlan_proc_add, device -:%s:- being added.\n", + vlandev->name); +#endif + return 0; +} + +/* + * Delete directory entry for VLAN device. + */ +int vlan_proc_rem_dev(struct net_device *vlandev) +{ + if (!vlandev) { + printk(VLAN_ERR "%s: invalid argument: %p\n", + __FUNCTION__, vlandev); + return -EINVAL; + } + + if (!(vlandev->priv_flags & IFF_802_1Q_VLAN)) { + printk(VLAN_DBG "%s: invalid argument, device: %s is not a VLAN device, priv_flags: 0x%4hX.\n", + __FUNCTION__, vlandev->name, vlandev->priv_flags); + return -EINVAL; + } + +#ifdef VLAN_DEBUG + printk(VLAN_DBG "%s: dev: %p\n", __FUNCTION__, vlandev); +#endif + + /** NOTE: This will consume the memory pointed to by dent, it seems. */ + if (VLAN_DEV_INFO(vlandev)->dent) { + remove_proc_entry(VLAN_DEV_INFO(vlandev)->dent->name, proc_vlan_dir); + VLAN_DEV_INFO(vlandev)->dent = NULL; + } + + return 0; +} + +/****** Proc filesystem entry points ****************************************/ + +/* + * The following few functions build the content of /proc/net/vlan/config + */ + +/* starting at dev, find a VLAN device */ +static struct net_device *vlan_skip(struct net_device *dev) +{ + while (dev && !(dev->priv_flags & IFF_802_1Q_VLAN)) + dev = dev->next; + + return dev; +} + +/* start read of /proc/net/vlan/config */ +static void *vlan_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + loff_t i = 1; + + read_lock(&dev_base_lock); + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (dev = vlan_skip(dev_base); dev && i < *pos; + dev = vlan_skip(dev->next), ++i); + + return (i == *pos) ? dev : NULL; +} + +static void *vlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return vlan_skip((v == SEQ_START_TOKEN) + ? dev_base + : ((struct net_device *)v)->next); +} + +static void vlan_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&dev_base_lock); +} + +static int vlan_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + const char *nmtype = NULL; + + seq_puts(seq, "VLAN Dev name | VLAN ID\n"); + + if (vlan_name_type < ARRAY_SIZE(vlan_name_type_str)) + nmtype = vlan_name_type_str[vlan_name_type]; + + seq_printf(seq, "Name-Type: %s\n", + nmtype ? 
nmtype : "UNKNOWN" ); + } else { + const struct net_device *vlandev = v; + const struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev); + + seq_printf(seq, "%-15s| %d | %s\n", vlandev->name, + dev_info->vlan_id, dev_info->real_dev->name); + } + return 0; +} + +static int vlandev_seq_show(struct seq_file *seq, void *offset) +{ + struct net_device *vlandev = (struct net_device *) seq->private; + const struct vlan_dev_info *dev_info = VLAN_DEV_INFO(vlandev); + struct net_device_stats *stats; + static const char fmt[] = "%30s %12lu\n"; + int i; + + if ((vlandev == NULL) || (!(vlandev->priv_flags & IFF_802_1Q_VLAN))) + return 0; + + seq_printf(seq, "%s VID: %d REORDER_HDR: %i dev->priv_flags: %hx\n", + vlandev->name, dev_info->vlan_id, + (int)(dev_info->flags & 1), vlandev->priv_flags); + + + stats = vlan_dev_get_stats(vlandev); + + seq_printf(seq, fmt, "total frames received", stats->rx_packets); + seq_printf(seq, fmt, "total bytes received", stats->rx_bytes); + seq_printf(seq, fmt, "Broadcast/Multicast Rcvd", stats->multicast); + seq_puts(seq, "\n"); + seq_printf(seq, fmt, "total frames transmitted", stats->tx_packets); + seq_printf(seq, fmt, "total bytes transmitted", stats->tx_bytes); + seq_printf(seq, fmt, "total headroom inc", + dev_info->cnt_inc_headroom_on_tx); + seq_printf(seq, fmt, "total encap on xmit", + dev_info->cnt_encap_on_xmit); + seq_printf(seq, "Device: %s", dev_info->real_dev->name); + /* now show all PRIORITY mappings relating to this VLAN */ + seq_printf(seq, + "\nINGRESS priority mappings: 0:%lu 1:%lu 2:%lu 3:%lu 4:%lu 5:%lu 6:%lu 7:%lu\n", + dev_info->ingress_priority_map[0], + dev_info->ingress_priority_map[1], + dev_info->ingress_priority_map[2], + dev_info->ingress_priority_map[3], + dev_info->ingress_priority_map[4], + dev_info->ingress_priority_map[5], + dev_info->ingress_priority_map[6], + dev_info->ingress_priority_map[7]); + + seq_printf(seq, "EGRESSS priority Mappings: "); + for (i = 0; i < 16; i++) { + const struct vlan_priority_tci_mapping *mp + = dev_info->egress_priority_map[i]; + while (mp) { + seq_printf(seq, "%lu:%hu ", + mp->priority, ((mp->vlan_qos >> 13) & 0x7)); + mp = mp->next; + } + } + seq_puts(seq, "\n"); + + return 0; +} diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h new file mode 100644 index 000000000000..f908ee332fd8 --- /dev/null +++ b/net/8021q/vlanproc.h @@ -0,0 +1,19 @@ +#ifndef __BEN_VLAN_PROC_INC__ +#define __BEN_VLAN_PROC_INC__ + +#ifdef CONFIG_PROC_FS +int vlan_proc_init(void); +int vlan_proc_rem_dev(struct net_device *vlandev); +int vlan_proc_add_dev (struct net_device *vlandev); +void vlan_proc_cleanup (void); + +#else /* No CONFIG_PROC_FS */ + +#define vlan_proc_init() (0) +#define vlan_proc_cleanup() do {} while(0) +#define vlan_proc_add_dev(dev) ({(void)(dev), 0;}) +#define vlan_proc_rem_dev(dev) ({(void)(dev), 0;}) + +#endif + +#endif /* !(__BEN_VLAN_PROC_INC__) */ diff --git a/net/Kconfig b/net/Kconfig new file mode 100644 index 000000000000..9251b28e8d5d --- /dev/null +++ b/net/Kconfig @@ -0,0 +1,646 @@ +# +# Network configuration +# + +menu "Networking support" + +config NET + bool "Networking support" + ---help--- + Unless you really know what you are doing, you should say Y here. + The reason is that some programs need kernel networking support even + when running on a stand-alone machine that isn't connected to any + other computer. If you are upgrading from an older kernel, you + should consider updating your networking tools too because changes + in the kernel and the tools often go hand in hand. 
The tools are + contained in the package net-tools, the location and version number + of which are given in . + + For a general introduction to Linux networking, it is highly + recommended to read the NET-HOWTO, available from + . + +menu "Networking options" + depends on NET + +config PACKET + tristate "Packet socket" + ---help--- + The Packet protocol is used by applications which communicate + directly with network devices without an intermediate network + protocol implemented in the kernel, e.g. tcpdump. If you want them + to work, choose Y. + + To compile this driver as a module, choose M here: the module will + be called af_packet. + + If unsure, say Y. + +config PACKET_MMAP + bool "Packet socket: mmapped IO" + depends on PACKET + help + If you say Y here, the Packet protocol driver will use an IO + mechanism that results in faster communication. + + If unsure, say N. + +config UNIX + tristate "Unix domain sockets" + ---help--- + If you say Y here, you will include support for Unix domain sockets; + sockets are the standard Unix mechanism for establishing and + accessing network connections. Many commonly used programs such as + the X Window system and syslog use these sockets even if your + machine is not connected to any network. Unless you are working on + an embedded system or something similar, you therefore definitely + want to say Y here. + + To compile this driver as a module, choose M here: the module will be + called unix. Note that several important services won't work + correctly if you say M here and then neglect to load the module. + + Say Y unless you know what you are doing. + +config NET_KEY + tristate "PF_KEY sockets" + select XFRM + ---help--- + PF_KEYv2 socket family, compatible to KAME ones. + They are required if you are going to use IPsec tools ported + from KAME. + + Say Y unless you know what you are doing. + +config INET + bool "TCP/IP networking" + ---help--- + These are the protocols used on the Internet and on most local + Ethernets. It is highly recommended to say Y here (this will enlarge + your kernel by about 144 KB), since some programs (e.g. the X window + system) use TCP/IP even if your machine is not connected to any + other computer. You will get the so-called loopback device which + allows you to ping yourself (great fun, that!). + + For an excellent introduction to Linux networking, please read the + Linux Networking HOWTO, available from + . + + If you say Y here and also to "/proc file system support" and + "Sysctl support" below, you can change various aspects of the + behavior of the TCP/IP code by writing to the (virtual) files in + /proc/sys/net/ipv4/*; the options are explained in the file + . + + Short answer: say Y. + +source "net/ipv4/Kconfig" + +# IPv6 as module will cause a CRASH if you try to unload it +config IPV6 + tristate "The IPv6 protocol" + depends on INET + default m + select CRYPTO if IPV6_PRIVACY + select CRYPTO_MD5 if IPV6_PRIVACY + ---help--- + This is complemental support for the IP version 6. + You will still be able to do traditional IPv4 networking as well. + + For general information about IPv6, see + . + For Linux IPv6 development information, see . + For specific information about IPv6 under Linux, read the HOWTO at + . + + To compile this protocol support as a module, choose M here: the + module will be called ipv6. 
+ +source "net/ipv6/Kconfig" + +menuconfig NETFILTER + bool "Network packet filtering (replaces ipchains)" + ---help--- + Netfilter is a framework for filtering and mangling network packets + that pass through your Linux box. + + The most common use of packet filtering is to run your Linux box as + a firewall protecting a local network from the Internet. The type of + firewall provided by this kernel support is called a "packet + filter", which means that it can reject individual network packets + based on type, source, destination etc. The other kind of firewall, + a "proxy-based" one, is more secure but more intrusive and more + bothersome to set up; it inspects the network traffic much more + closely, modifies it and has knowledge about the higher level + protocols, which a packet filter lacks. Moreover, proxy-based + firewalls often require changes to the programs running on the local + clients. Proxy-based firewalls don't need support by the kernel, but + they are often combined with a packet filter, which only works if + you say Y here. + + You should also say Y here if you intend to use your Linux box as + the gateway to the Internet for a local network of machines without + globally valid IP addresses. This is called "masquerading": if one + of the computers on your local network wants to send something to + the outside, your box can "masquerade" as that computer, i.e. it + forwards the traffic to the intended outside destination, but + modifies the packets to make it look like they came from the + firewall box itself. It works both ways: if the outside host + replies, the Linux box will silently forward the traffic to the + correct local computer. This way, the computers on your local net + are completely invisible to the outside world, even though they can + reach the outside and can receive replies. It is even possible to + run globally visible servers from within a masqueraded local network + using a mechanism called portforwarding. Masquerading is also often + called NAT (Network Address Translation). + + Another use of Netfilter is in transparent proxying: if a machine on + the local network tries to connect to an outside host, your Linux + box can transparently forward the traffic to a local server, + typically a caching proxy server. + + Yet another use of Netfilter is building a bridging firewall. Using + a bridge with Network packet filtering enabled makes iptables "see" + the bridged traffic. For filtering on the lower network and Ethernet + protocols over the bridge, use ebtables (under bridge netfilter + configuration). + + Various modules exist for netfilter which replace the previous + masquerading (ipmasqadm), packet filtering (ipchains), transparent + proxying, and portforwarding mechanisms. Please see + under "iptables" for the location of + these packages. + + Make sure to say N to "Fast switching" below if you intend to say Y + here, as Fast switching currently bypasses netfilter. + + Chances are that you should say Y here if you compile a kernel which + will run as a router and N for regular hosts. If unsure, say N. + +if NETFILTER + +config NETFILTER_DEBUG + bool "Network packet filtering debugging" + depends on NETFILTER + help + You can say Y here if you want to get additional messages useful in + debugging the netfilter code. + +config BRIDGE_NETFILTER + bool "Bridged IP/ARP packets filtering" + depends on BRIDGE && NETFILTER && INET + default y + ---help--- + Enabling this option will let arptables resp. iptables see bridged + ARP resp. IP traffic. 
If you want a bridging firewall, you probably + want this option enabled. + Enabling or disabling this option doesn't enable or disable + ebtables. + + If unsure, say N. + +source "net/ipv4/netfilter/Kconfig" +source "net/ipv6/netfilter/Kconfig" +source "net/decnet/netfilter/Kconfig" +source "net/bridge/netfilter/Kconfig" + +endif + +config XFRM + bool + depends on NET + +source "net/xfrm/Kconfig" + +source "net/sctp/Kconfig" + +config ATM + tristate "Asynchronous Transfer Mode (ATM) (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + ATM is a high-speed networking technology for Local Area Networks + and Wide Area Networks. It uses a fixed packet size and is + connection-oriented, allowing for the negotiation of minimum + bandwidth requirements. + + In order to participate in an ATM network, your Linux box needs an + ATM networking card. If you have that, say Y here and to the driver + of your ATM card below. + + Note that you need a set of user-space programs to actually make use + of ATM. See the file for + further details. + +config ATM_CLIP + tristate "Classical IP over ATM (EXPERIMENTAL)" + depends on ATM && INET + help + Classical IP over ATM for PVCs and SVCs, supporting InARP and + ATMARP. If you want to communicate with other IP hosts on your ATM + network, you will typically either say Y here or to "LAN Emulation + (LANE)" below. + +config ATM_CLIP_NO_ICMP + bool "Do NOT send ICMP if no neighbour (EXPERIMENTAL)" + depends on ATM_CLIP + help + Normally, an "ICMP host unreachable" message is sent if a neighbour + cannot be reached because there is no VC to it in the kernel's + ATMARP table. This may cause problems when ATMARP table entries are + briefly removed during revalidation. If you say Y here, packets to + such neighbours are silently discarded instead. + +config ATM_LANE + tristate "LAN Emulation (LANE) support (EXPERIMENTAL)" + depends on ATM + help + LAN Emulation emulates services of existing LANs across an ATM + network. Besides operating as a normal ATM end station client, the Linux + LANE client can also act as a proxy client bridging packets between + ELAN and Ethernet segments. You need LANE if you want to try MPOA. + +config ATM_MPOA + tristate "Multi-Protocol Over ATM (MPOA) support (EXPERIMENTAL)" + depends on ATM && INET && ATM_LANE!=n + help + Multi-Protocol Over ATM allows ATM edge devices such as routers, + bridges and ATM attached hosts to establish direct ATM VCs across + subnetwork boundaries. These shortcut connections bypass routers, + enhancing overall network performance. + +config ATM_BR2684 + tristate "RFC1483/2684 Bridged protocols" + depends on ATM && INET + help + ATM PVCs can carry Ethernet PDUs according to RFC 2684 (formerly RFC 1483). + This device will act like an Ethernet device from the kernel's point of view, + with the traffic being carried by ATM PVCs (currently 1 PVC/device). + This is sometimes used over DSL lines. If in doubt, say N. + +config ATM_BR2684_IPFILTER + bool "Per-VC IP filter kludge" + depends on ATM_BR2684 + help + This is an experimental mechanism for users who need to terminate a + large number of IP-only VCCs. Do not enable this unless you are sure + you know what you are doing. + +config BRIDGE + tristate "802.1d Ethernet Bridging" + ---help--- + If you say Y here, then your Linux box will be able to act as an + Ethernet bridge, which means that the different Ethernet segments it + is connected to will appear as one Ethernet to the participants.
+ Several such bridges can work together to create even larger + networks of Ethernets using the IEEE 802.1 spanning tree algorithm. + As this is a standard, Linux bridges will cooperate properly with + other third party bridge products. + + In order to use the Ethernet bridge, you'll need the bridge + configuration tools; see + for location. Please read the Bridge mini-HOWTO for more + information. + + If you enable iptables support along with the bridge support then you + turn your bridge into a bridging IP firewall. + iptables will then see the IP packets being bridged, so you need to + take this into account when setting up your firewall rules. + Enabling arptables support when bridging will let arptables see + bridged ARP traffic in the arptables FORWARD chain. + + To compile this code as a module, choose M here: the module + will be called bridge. + + If unsure, say N. + +config VLAN_8021Q + tristate "802.1Q VLAN Support" + ---help--- + Select this and you will be able to create 802.1Q VLAN interfaces + on your ethernet interfaces. 802.1Q VLAN supports almost + everything a regular ethernet interface does, including + firewalling, bridging, and of course IP traffic. You will need + the 'vconfig' tool from the VLAN project in order to effectively + use VLANs. See the VLAN web page for more information: + + + To compile this code as a module, choose M here: the module + will be called 8021q. + + If unsure, say N. + +config DECNET + tristate "DECnet Support" + ---help--- + The DECnet networking protocol was used in many products made by + Digital (now Compaq). It provides reliable stream and sequenced + packet communications over which run a variety of services similar + to those which run over TCP/IP. + + To find some tools to use with the kernel layer support, please + look at Patrick Caulfield's web site: + . + + More detailed documentation is available in + . + + Be sure to say Y to "/proc file system support" and "Sysctl support" + below when using DECnet, since you will need sysctl support to aid + in configuration at run time. + + The DECnet code is also available as a module ( = code which can be + inserted in and removed from the running kernel whenever you want). + The module is called decnet. + +source "net/decnet/Kconfig" + +source "net/llc/Kconfig" + +config IPX + tristate "The IPX protocol" + select LLC + ---help--- + This is support for the Novell networking protocol, IPX, commonly + used for local networks of Windows machines. You need it if you + want to access Novell NetWare file or print servers using the Linux + Novell client ncpfs (available from + ) or from + within the Linux DOS emulator DOSEMU (read the DOSEMU-HOWTO, + available from ). In order + to do the former, you'll also have to say Y to "NCP file system + support", below. + + IPX is similar in scope to IP, while SPX, which runs on top of IPX, + is similar to TCP. There is also experimental support for SPX in + Linux (see "SPX networking", below). + + To turn your Linux box into a fully featured NetWare file server and + IPX router, say Y here and fetch either lwared from + or + mars_nwe from . For more + information, read the IPX-HOWTO available from + . + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . + + The IPX driver would enlarge your kernel by about 16 KB. To compile + this driver as a module, choose M here: the module will be called ipx. + Unless you want to integrate your Linux box with a local Novell + network, say N. 
+ +source "net/ipx/Kconfig" + +config ATALK + tristate "Appletalk protocol support" + select LLC + ---help--- + AppleTalk is the protocol that Apple computers can use to communicate + on a network. If your Linux box is connected to such a network and you + wish to connect to it, say Y. You will need to use the netatalk package + so that your Linux box can act as a print and file server for Macs as + well as access AppleTalk printers. Check out + on the WWW for details. + EtherTalk is the name used for AppleTalk over Ethernet and the + cheaper and slower LocalTalk is AppleTalk over a proprietary Apple + network using serial links. EtherTalk and LocalTalk are fully + supported by Linux. + + General information about how to connect Linux, Windows machines and + Macs is on the WWW at . The + NET-3-HOWTO, available from + , contains valuable + information as well. + + To compile this driver as a module, choose M here: the module will be + called appletalk. You almost certainly want to compile it as a + module so you can restart your AppleTalk stack without rebooting + your machine. I hear that the GNU boycott of Apple is over, so + even politically correct people are allowed to say Y here. + +source "drivers/net/appletalk/Kconfig" + +config X25 + tristate "CCITT X.25 Packet Layer (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + X.25 is a set of standardized network protocols, similar in scope to + frame relay; the one physical line from your box to the X.25 network + entry point can carry several logical point-to-point connections + (called "virtual circuits") to other computers connected to the X.25 + network. Governments, banks, and other organizations tend to use it + to connect to each other or to form Wide Area Networks (WANs). Many + countries have public X.25 networks. X.25 consists of two + protocols: the higher level Packet Layer Protocol (PLP) (say Y here + if you want that) and the lower level data link layer protocol LAPB + (say Y to "LAPB Data Link Driver" below if you want that). + + You can read more about X.25 at and + . + Information about X.25 for Linux is contained in the files + and + . + + One connects to an X.25 network either with a dedicated network card + using the X.21 protocol (not yet supported by Linux) or one can do + X.25 over a standard telephone line using an ordinary modem (say Y + to "X.25 async driver" below) or over Ethernet using an ordinary + Ethernet card and the LAPB over Ethernet (say Y to "LAPB Data Link + Driver" and "LAPB over Ethernet driver" below). + + To compile this driver as a module, choose M here: the module + will be called x25. If unsure, say N. + +config LAPB + tristate "LAPB Data Link Driver (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + Link Access Procedure, Balanced (LAPB) is the data link layer (i.e. + the lower) part of the X.25 protocol. It offers a reliable + connection service to exchange data frames with one other host, and + it is used to transport higher level protocols (mostly X.25 Packet + Layer, the higher part of X.25, but others are possible as well). + Usually, LAPB is used with specialized X.21 network cards, but Linux + currently supports LAPB only over Ethernet connections. If you want + to use LAPB connections over Ethernet, say Y here and to "LAPB over + Ethernet driver" below. Read + for technical + details. + + To compile this driver as a module, choose M here: the + module will be called lapb. If unsure, say N. 
+ +config NET_DIVERT + bool "Frame Diverter (EXPERIMENTAL)" + depends on EXPERIMENTAL + ---help--- + The Frame Diverter allows you to divert packets from the + network that are not aimed at the interface receiving them (in + promiscuous mode). Typically, a Linux box set up as an Ethernet bridge + with the Frame Diverter enabled can do some *really* transparent web + caching using a Squid proxy, for example. + + This is very useful when you don't want to change your router's + config (or if you simply don't have access to it). + + The other possible uses of diverting Ethernet frames are + numerous: + - reroute SMTP traffic to another interface + - traffic-shape certain network streams + - transparently proxy SMTP connections + - etc... + + For more information, please refer to: + + + + If unsure, say N. + +config ECONET + tristate "Acorn Econet/AUN protocols (EXPERIMENTAL)" + depends on EXPERIMENTAL && INET + ---help--- + Econet is a fairly old and slow networking protocol mainly used by + Acorn computers to access file and print servers. It uses native + Econet network cards. AUN is an implementation of the higher level + parts of Econet that runs over ordinary Ethernet connections, on + top of the UDP packet protocol, which in turn runs on top of the + Internet protocol IP. + + If you say Y here, you can choose with the next two options whether + to send Econet/AUN traffic over a UDP Ethernet connection or over + a native Econet network card. + + To compile this driver as a module, choose M here: the module + will be called econet. + +config ECONET_AUNUDP + bool "AUN over UDP" + depends on ECONET + help + Say Y here if you want to send Econet/AUN traffic over a UDP + connection (UDP is a packet-based protocol that runs on top of the + Internet protocol IP) using an ordinary Ethernet network card. + +config ECONET_NATIVE + bool "Native Econet" + depends on ECONET + help + Say Y here if you have a native Econet network card installed in + your computer. + +config WAN_ROUTER + tristate "WAN router" + depends on EXPERIMENTAL + ---help--- + Wide Area Networks (WANs), such as X.25, frame relay and leased + lines, are used to interconnect Local Area Networks (LANs) over vast + distances with data transfer rates significantly higher than those + achievable with commonly used asynchronous modem connections. + Usually, a quite expensive external device called a `WAN router' is + needed to connect to a WAN. + + As an alternative, WAN routing can be built into the Linux kernel. + With relatively inexpensive WAN interface cards available on the + market, a perfectly usable router can be built for less than half + the price of an external router. If you have one of those cards and + wish to use your Linux box as a WAN router, say Y here and also to + the WAN driver for your card, below. You will then need the + wan-tools package which is available from . + Read for more + information. + + To compile WAN routing support as a module, choose M here: the + module will be called wanrouter. + + If unsure, say N. + +menu "QoS and/or fair queueing" + +config NET_SCHED + bool "QoS and/or fair queueing" + ---help--- + When the kernel has several packets to send out over a network + device, it has to decide which ones to send first, which ones to + delay, and which ones to drop. This is the job of the packet + scheduler, and several different algorithms for how to do this + "fairly" have been proposed. + + If you say N here, you will get the standard packet scheduler, which + is a FIFO (first come, first served).
If you say Y here, you will be + able to choose from among several alternative algorithms which can + then be attached to different network devices. This is useful for + example if some of your network devices are real time devices that + need a certain minimum data flow rate, or if you need to limit the + maximum data flow rate for traffic which matches specified criteria. + This code is considered to be experimental. + + To administer these schedulers, you'll need the user-level utilities + from the package iproute2+tc at . + That package also contains some documentation; for more, check out + . + + This Quality of Service (QoS) support will enable you to use + Differentiated Services (diffserv) and Resource Reservation Protocol + (RSVP) on your Linux router if you also say Y to "QoS support", + "Packet classifier API" and to some classifiers below. Documentation + and software is at . + + If you say Y here and to "/proc file system" below, you will be able + to read status information about packet schedulers from the file + /proc/net/psched. + + The available schedulers are listed in the following questions; you + can say Y to as many as you like. If unsure, say N now. + +source "net/sched/Kconfig" + +endmenu + +menu "Network testing" + +config NET_PKTGEN + tristate "Packet Generator (USE WITH CAUTION)" + depends on PROC_FS + ---help--- + This module will inject preconfigured packets, at a configurable + rate, out of a given interface. It is used for network interface + stress testing and performance analysis. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use the packet generator can be found + at . + + To compile this code as a module, choose M here: the + module will be called pktgen. + +endmenu + +endmenu + +config NETPOLL + def_bool NETCONSOLE + +config NETPOLL_RX + bool "Netpoll support for trapping incoming packets" + default n + depends on NETPOLL + +config NETPOLL_TRAP + bool "Netpoll traffic trapping" + default n + depends on NETPOLL + +config NET_POLL_CONTROLLER + def_bool NETPOLL + +source "net/ax25/Kconfig" + +source "net/irda/Kconfig" + +source "net/bluetooth/Kconfig" + +source "drivers/net/Kconfig" + +endmenu + diff --git a/net/Makefile b/net/Makefile new file mode 100644 index 000000000000..8e2bdc025ab8 --- /dev/null +++ b/net/Makefile @@ -0,0 +1,48 @@ +# +# Makefile for the linux networking. +# +# 2 Sep 2000, Christoph Hellwig +# Rewritten to use lists instead of if-statements. 
+# + +obj-y := nonet.o + +obj-$(CONFIG_NET) := socket.o core/ + +tmp-$(CONFIG_COMPAT) := compat.o +obj-$(CONFIG_NET) += $(tmp-y) + +# LLC has to be linked before the files in net/802/ +obj-$(CONFIG_LLC) += llc/ +obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ +obj-$(CONFIG_INET) += ipv4/ +obj-$(CONFIG_XFRM) += xfrm/ +obj-$(CONFIG_UNIX) += unix/ +ifneq ($(CONFIG_IPV6),) +obj-y += ipv6/ +endif +obj-$(CONFIG_PACKET) += packet/ +obj-$(CONFIG_NET_KEY) += key/ +obj-$(CONFIG_NET_SCHED) += sched/ +obj-$(CONFIG_BRIDGE) += bridge/ +obj-$(CONFIG_IPX) += ipx/ +obj-$(CONFIG_ATALK) += appletalk/ +obj-$(CONFIG_WAN_ROUTER) += wanrouter/ +obj-$(CONFIG_X25) += x25/ +obj-$(CONFIG_LAPB) += lapb/ +obj-$(CONFIG_NETROM) += netrom/ +obj-$(CONFIG_ROSE) += rose/ +obj-$(CONFIG_AX25) += ax25/ +obj-$(CONFIG_IRDA) += irda/ +obj-$(CONFIG_BT) += bluetooth/ +obj-$(CONFIG_SUNRPC) += sunrpc/ +obj-$(CONFIG_RXRPC) += rxrpc/ +obj-$(CONFIG_ATM) += atm/ +obj-$(CONFIG_DECNET) += decnet/ +obj-$(CONFIG_ECONET) += econet/ +obj-$(CONFIG_VLAN_8021Q) += 8021q/ +obj-$(CONFIG_IP_SCTP) += sctp/ + +ifeq ($(CONFIG_NET),y) +obj-$(CONFIG_SYSCTL) += sysctl_net.o +endif diff --git a/net/TUNABLE b/net/TUNABLE new file mode 100644 index 000000000000..9913211f07a7 --- /dev/null +++ b/net/TUNABLE @@ -0,0 +1,50 @@ +The following parameters should be tunable at compile time. Some of them +exist as sysctls too. + +This is far from complete + +Item Description +---------------------------------------------------------------------------- +MAX_LINKS Maximum number of netlink minor devices. (1-32) +RIF_TABLE_SIZE Token ring RIF cache size (tunable) +AARP_HASH_SIZE Size of Appletalk hash table (tunable) +AX25_DEF_T1 AX.25 parameters. These are all tunable via +AX25_DEF_T2 SIOCAX25SETPARMS +AX25_DEF_T3 T1-T3,N2 have the meanings in the specification +AX25_DEF_N2 +AX25_DEF_AXDEFMODE 8 = normal 128 is PE1CHL extended +AX25_DEF_IPDEFMODE 'D' - datagram 'V' - virtual connection +AX25_DEF_BACKOFF 'E'xponential 'L'inear +AX25_DEF_NETROM Allow netrom 1=Y +AX25_DF_TEXT Allow PID=Text 1=Y +AX25_DEF_WINDOW Window for normal mode +AX25_DEF_EWINDOW Window for PE1CHL mode +AX25_DEF_DIGI 1 for inband 2 for cross band 3 for both +AX25_DEF_CONMODE Allow connected modes 1=Yes +AX25_ROUTE_MAX AX.25 route cache size - no currently tunable +Unnamed (16) Number of protocol hash slots (tunable) +DEV_NUMBUFFS Number of priority levels (not easily tunable) +Unnamed (300) Maximum packet backlog queue (tunable) +MAX_IOVEC Maximum number of iovecs in a message (tunable) +MIN_WINDOW Offered minimum window (tunable) +MAX_WINDOW Offered maximum window (tunable) +MAX_HEADER Largest physical header (tunable) +MAX_ADDR_LEN Largest physical address (tunable) +SOCK_ARRAY_SIZE IP socket array hash size (tunable) +IP_MAX_MEMBERSHIPS Largest number of groups per socket (BSD style) (tunable) +16 Hard coded constant for amount of room allowed for + cache align and faster forwarding (tunable) +IP_FRAG_TIME Time we hold a fragment for. 
(tunable) +PORT_MASQ_BEGIN First port reserved for masquerade (tunable) +PORT_MASQ_END Last port used for masquerade (tunable) +MASQUERADE_EXPIRE_TCP_FIN Time we keep a masquerade for after a FIN +MASQUERADE_EXPIRE_UDP Time we keep a UDP masquerade for (tunable) +MAXVIFS Maximum mrouted vifs (1-32) +MFC_LINES Lines in the multicast router cache (tunable) + +NetROM parameters are tunable via an ioctl passing a struct + +4000 Size a Unix domain socket malloc falls back to + (tunable) should be 8K - a bit for 8K machines like + the ALPHA + diff --git a/net/appletalk/Makefile b/net/appletalk/Makefile new file mode 100644 index 000000000000..5cda56edef57 --- /dev/null +++ b/net/appletalk/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux AppleTalk layer. +# + +obj-$(CONFIG_ATALK) += appletalk.o + +appletalk-y := aarp.o ddp.o dev.o +appletalk-$(CONFIG_PROC_FS) += atalk_proc.o +appletalk-$(CONFIG_SYSCTL) += sysctl_net_atalk.o diff --git a/net/appletalk/aarp.c b/net/appletalk/aarp.c new file mode 100644 index 000000000000..54640c01b50c --- /dev/null +++ b/net/appletalk/aarp.c @@ -0,0 +1,1069 @@ +/* + * AARP: An implementation of the AppleTalk AARP protocol for + * Ethernet 'ELAP'. + * + * Alan Cox + * + * This doesn't fit cleanly with the IP arp. Potentially we can use + * the generic neighbour discovery code to clean this up. + * + * FIXME: + * We ought to handle the retransmits with a single list and a + * separate fast timer for when it is needed. + * Use neighbour discovery code. + * Token Ring Support. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * References: + * Inside AppleTalk (2nd Ed). + * Fixes: + * Jaume Grau - flush caches on AARP_PROBE + * Rob Newberry - Added proxy AARP and AARP proc fs, + * moved probing from DDP module. + * Arnaldo C. Melo - don't mangle rx packets + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_aarp_expiry_time = AARP_EXPIRY_TIME; +int sysctl_aarp_tick_time = AARP_TICK_TIME; +int sysctl_aarp_retransmit_limit = AARP_RETRANSMIT_LIMIT; +int sysctl_aarp_resolve_time = AARP_RESOLVE_TIME; + +/* Lists of aarp entries */ +/** + * struct aarp_entry - AARP entry + * @last_sent - Last time we xmitted the aarp request + * @packet_queue - Queue of frames wait for resolution + * @status - Used for proxy AARP + * expires_at - Entry expiry time + * target_addr - DDP Address + * dev - Device to use + * hwaddr - Physical i/f address of target/router + * xmit_count - When this hits 10 we give up + * next - Next entry in chain + */ +struct aarp_entry { + /* These first two are only used for unresolved entries */ + unsigned long last_sent; + struct sk_buff_head packet_queue; + int status; + unsigned long expires_at; + struct atalk_addr target_addr; + struct net_device *dev; + char hwaddr[6]; + unsigned short xmit_count; + struct aarp_entry *next; +}; + +/* Hashed list of resolved, unresolved and proxy entries */ +static struct aarp_entry *resolved[AARP_HASH_SIZE]; +static struct aarp_entry *unresolved[AARP_HASH_SIZE]; +static struct aarp_entry *proxies[AARP_HASH_SIZE]; +static int unresolved_count; + +/* One lock protects it all. */ +static DEFINE_RWLOCK(aarp_lock); + +/* Used to walk the list and purge/kick entries. 
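+ * (Re-armed from aarp_expire_timeout() below: it ticks every sysctl_aarp_tick_time while unresolved entries are queued and every sysctl_aarp_expiry_time otherwise.)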
*/ +static struct timer_list aarp_timer; + +/* + * Delete an aarp queue + * + * Must run under aarp_lock. + */ +static void __aarp_expire(struct aarp_entry *a) +{ + skb_queue_purge(&a->packet_queue); + kfree(a); +} + +/* + * Send an aarp queue entry request + * + * Must run under aarp_lock. + */ +static void __aarp_send_query(struct aarp_entry *a) +{ + static unsigned char aarp_eth_multicast[ETH_ALEN] = + { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; + struct net_device *dev = a->dev; + struct elapaarp *eah; + int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + struct atalk_addr *sat = atalk_find_dev_addr(dev); + + if (!skb) + return; + + if (!sat) { + kfree_skb(skb); + return; + } + + /* Set up the buffer */ + skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); + skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah)); + skb->protocol = htons(ETH_P_ATALK); + skb->dev = dev; + eah = aarp_hdr(skb); + + /* Set up the ARP */ + eah->hw_type = htons(AARP_HW_TYPE_ETHERNET); + eah->pa_type = htons(ETH_P_ATALK); + eah->hw_len = ETH_ALEN; + eah->pa_len = AARP_PA_ALEN; + eah->function = htons(AARP_REQUEST); + + memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + + eah->pa_src_zero = 0; + eah->pa_src_net = sat->s_net; + eah->pa_src_node = sat->s_node; + + memset(eah->hw_dst, '\0', ETH_ALEN); + + eah->pa_dst_zero = 0; + eah->pa_dst_net = a->target_addr.s_net; + eah->pa_dst_node = a->target_addr.s_node; + + /* Send it */ + aarp_dl->request(aarp_dl, skb, aarp_eth_multicast); + /* Update the sending count */ + a->xmit_count++; + a->last_sent = jiffies; +} + +/* This runs under aarp_lock and in softint context, so only atomic memory + * allocations can be used. */ +static void aarp_send_reply(struct net_device *dev, struct atalk_addr *us, + struct atalk_addr *them, unsigned char *sha) +{ + struct elapaarp *eah; + int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + /* Set up the buffer */ + skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); + skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah)); + skb->protocol = htons(ETH_P_ATALK); + skb->dev = dev; + eah = aarp_hdr(skb); + + /* Set up the ARP */ + eah->hw_type = htons(AARP_HW_TYPE_ETHERNET); + eah->pa_type = htons(ETH_P_ATALK); + eah->hw_len = ETH_ALEN; + eah->pa_len = AARP_PA_ALEN; + eah->function = htons(AARP_REPLY); + + memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + + eah->pa_src_zero = 0; + eah->pa_src_net = us->s_net; + eah->pa_src_node = us->s_node; + + if (!sha) + memset(eah->hw_dst, '\0', ETH_ALEN); + else + memcpy(eah->hw_dst, sha, ETH_ALEN); + + eah->pa_dst_zero = 0; + eah->pa_dst_net = them->s_net; + eah->pa_dst_node = them->s_node; + + /* Send it */ + aarp_dl->request(aarp_dl, skb, sha); +} + +/* + * Send probe frames. Called from aarp_probe_network and + * aarp_proxy_probe_network. 
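+ * A probe carries the tentative address as both source and destination DDP address, so any station already using it will answer and the probing interface gets marked ATIF_PROBE_FAIL.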
+ */ + +static void aarp_send_probe(struct net_device *dev, struct atalk_addr *us) +{ + struct elapaarp *eah; + int len = dev->hard_header_len + sizeof(*eah) + aarp_dl->header_length; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + static unsigned char aarp_eth_multicast[ETH_ALEN] = + { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; + + if (!skb) + return; + + /* Set up the buffer */ + skb_reserve(skb, dev->hard_header_len + aarp_dl->header_length); + skb->nh.raw = skb->h.raw = skb_put(skb, sizeof(*eah)); + skb->protocol = htons(ETH_P_ATALK); + skb->dev = dev; + eah = aarp_hdr(skb); + + /* Set up the ARP */ + eah->hw_type = htons(AARP_HW_TYPE_ETHERNET); + eah->pa_type = htons(ETH_P_ATALK); + eah->hw_len = ETH_ALEN; + eah->pa_len = AARP_PA_ALEN; + eah->function = htons(AARP_PROBE); + + memcpy(eah->hw_src, dev->dev_addr, ETH_ALEN); + + eah->pa_src_zero = 0; + eah->pa_src_net = us->s_net; + eah->pa_src_node = us->s_node; + + memset(eah->hw_dst, '\0', ETH_ALEN); + + eah->pa_dst_zero = 0; + eah->pa_dst_net = us->s_net; + eah->pa_dst_node = us->s_node; + + /* Send it */ + aarp_dl->request(aarp_dl, skb, aarp_eth_multicast); +} + +/* + * Handle an aarp timer expire + * + * Must run under the aarp_lock. + */ + +static void __aarp_expire_timer(struct aarp_entry **n) +{ + struct aarp_entry *t; + + while (*n) + /* Expired ? */ + if (time_after(jiffies, (*n)->expires_at)) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } else + n = &((*n)->next); +} + +/* + * Kick all pending requests 5 times a second. + * + * Must run under the aarp_lock. + */ +static void __aarp_kick(struct aarp_entry **n) +{ + struct aarp_entry *t; + + while (*n) + /* Expired: if this will be the 11th tx, we delete instead. */ + if ((*n)->xmit_count >= sysctl_aarp_retransmit_limit) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } else { + __aarp_send_query(*n); + n = &((*n)->next); + } +} + +/* + * A device has gone down. Take all entries referring to the device + * and remove them. + * + * Must run under the aarp_lock. + */ +static void __aarp_expire_device(struct aarp_entry **n, struct net_device *dev) +{ + struct aarp_entry *t; + + while (*n) + if ((*n)->dev == dev) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } else + n = &((*n)->next); +} + +/* Handle the timer event */ +static void aarp_expire_timeout(unsigned long unused) +{ + int ct; + + write_lock_bh(&aarp_lock); + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_timer(&resolved[ct]); + __aarp_kick(&unresolved[ct]); + __aarp_expire_timer(&unresolved[ct]); + __aarp_expire_timer(&proxies[ct]); + } + + write_unlock_bh(&aarp_lock); + mod_timer(&aarp_timer, jiffies + + (unresolved_count ? sysctl_aarp_tick_time : + sysctl_aarp_expiry_time)); +} + +/* Network device notifier chain handler. 
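+ * On NETDEV_DOWN every resolved, unresolved and proxy entry referring to the device is expired.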
*/ +static int aarp_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + int ct; + + if (event == NETDEV_DOWN) { + write_lock_bh(&aarp_lock); + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_device(&resolved[ct], ptr); + __aarp_expire_device(&unresolved[ct], ptr); + __aarp_expire_device(&proxies[ct], ptr); + } + + write_unlock_bh(&aarp_lock); + } + return NOTIFY_DONE; +} + +/* Expire all entries in a hash chain */ +static void __aarp_expire_all(struct aarp_entry **n) +{ + struct aarp_entry *t; + + while (*n) { + t = *n; + *n = (*n)->next; + __aarp_expire(t); + } +} + +/* Cleanup all hash chains -- module unloading */ +static void aarp_purge(void) +{ + int ct; + + write_lock_bh(&aarp_lock); + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_all(&resolved[ct]); + __aarp_expire_all(&unresolved[ct]); + __aarp_expire_all(&proxies[ct]); + } + write_unlock_bh(&aarp_lock); +} + +/* + * Create a new aarp entry. This must use GFP_ATOMIC because it + * runs while holding spinlocks. + */ +static struct aarp_entry *aarp_alloc(void) +{ + struct aarp_entry *a = kmalloc(sizeof(*a), GFP_ATOMIC); + + if (a) + skb_queue_head_init(&a->packet_queue); + return a; +} + +/* + * Find an entry. We might return an expired but not yet purged entry. We + * don't care as it will do no harm. + * + * This must run under the aarp_lock. + */ +static struct aarp_entry *__aarp_find_entry(struct aarp_entry *list, + struct net_device *dev, + struct atalk_addr *sat) +{ + while (list) { + if (list->target_addr.s_net == sat->s_net && + list->target_addr.s_node == sat->s_node && + list->dev == dev) + break; + list = list->next; + } + + return list; +} + +/* Called from the DDP code, and thus must be exported. */ +void aarp_proxy_remove(struct net_device *dev, struct atalk_addr *sa) +{ + int hash = sa->s_node % (AARP_HASH_SIZE - 1); + struct aarp_entry *a; + + write_lock_bh(&aarp_lock); + + a = __aarp_find_entry(proxies[hash], dev, sa); + if (a) + a->expires_at = jiffies - 1; + + write_unlock_bh(&aarp_lock); +} + +/* This must run under aarp_lock. */ +static struct atalk_addr *__aarp_proxy_find(struct net_device *dev, + struct atalk_addr *sa) +{ + int hash = sa->s_node % (AARP_HASH_SIZE - 1); + struct aarp_entry *a = __aarp_find_entry(proxies[hash], dev, sa); + + return a ? sa : NULL; +} + +/* + * Probe a Phase 1 device or a device that requires its Net:Node to + * be set via an ioctl. + */ +static void aarp_send_probe_phase1(struct atalk_iface *iface) +{ + struct ifreq atreq; + struct sockaddr_at *sa = (struct sockaddr_at *)&atreq.ifr_addr; + + sa->sat_addr.s_node = iface->address.s_node; + sa->sat_addr.s_net = ntohs(iface->address.s_net); + + /* We pass the Net:Node to the drivers/cards by a Device ioctl. 
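+ * If the card hands back a different Net:Node via SIOCGIFADDR the probe is treated as failed.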
*/ + if (!(iface->dev->do_ioctl(iface->dev, &atreq, SIOCSIFADDR))) { + (void)iface->dev->do_ioctl(iface->dev, &atreq, SIOCGIFADDR); + if (iface->address.s_net != htons(sa->sat_addr.s_net) || + iface->address.s_node != sa->sat_addr.s_node) + iface->status |= ATIF_PROBE_FAIL; + + iface->address.s_net = htons(sa->sat_addr.s_net); + iface->address.s_node = sa->sat_addr.s_node; + } +} + + +void aarp_probe_network(struct atalk_iface *atif) +{ + if (atif->dev->type == ARPHRD_LOCALTLK || + atif->dev->type == ARPHRD_PPP) + aarp_send_probe_phase1(atif); + else { + unsigned int count; + + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, &atif->address); + + /* Defer 1/10th */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ / 10); + + if (atif->status & ATIF_PROBE_FAIL) + break; + } + } +} + +int aarp_proxy_probe_network(struct atalk_iface *atif, struct atalk_addr *sa) +{ + int hash, retval = -EPROTONOSUPPORT; + struct aarp_entry *entry; + unsigned int count; + + /* + * we don't currently support LocalTalk or PPP for proxy AARP; + * if someone wants to try and add it, have fun + */ + if (atif->dev->type == ARPHRD_LOCALTLK || + atif->dev->type == ARPHRD_PPP) + goto out; + + /* + * create a new AARP entry with the flags set to be published -- + * we need this one to hang around even if it's in use + */ + entry = aarp_alloc(); + retval = -ENOMEM; + if (!entry) + goto out; + + entry->expires_at = -1; + entry->status = ATIF_PROBE; + entry->target_addr.s_node = sa->s_node; + entry->target_addr.s_net = sa->s_net; + entry->dev = atif->dev; + + write_lock_bh(&aarp_lock); + + hash = sa->s_node % (AARP_HASH_SIZE - 1); + entry->next = proxies[hash]; + proxies[hash] = entry; + + for (count = 0; count < AARP_RETRANSMIT_LIMIT; count++) { + aarp_send_probe(atif->dev, sa); + + /* Defer 1/10th */ + current->state = TASK_INTERRUPTIBLE; + write_unlock_bh(&aarp_lock); + schedule_timeout(HZ / 10); + write_lock_bh(&aarp_lock); + + if (entry->status & ATIF_PROBE_FAIL) + break; + } + + if (entry->status & ATIF_PROBE_FAIL) { + entry->expires_at = jiffies - 1; /* free the entry */ + retval = -EADDRINUSE; /* return network full */ + } else { /* clear the probing flag */ + entry->status &= ~ATIF_PROBE; + retval = 1; + } + + write_unlock_bh(&aarp_lock); +out: + return retval; +} + +/* Send a DDP frame */ +int aarp_send_ddp(struct net_device *dev, struct sk_buff *skb, + struct atalk_addr *sa, void *hwaddr) +{ + static char ddp_eth_multicast[ETH_ALEN] = + { 0x09, 0x00, 0x07, 0xFF, 0xFF, 0xFF }; + int hash; + struct aarp_entry *a; + + skb->nh.raw = skb->data; + + /* Check for LocalTalk first */ + if (dev->type == ARPHRD_LOCALTLK) { + struct atalk_addr *at = atalk_find_dev_addr(dev); + struct ddpehdr *ddp = (struct ddpehdr *)skb->data; + int ft = 2; + + /* + * Compressible ? + * + * IFF: src_net == dest_net == device_net + * (zero matches anything) + */ + + if ((!ddp->deh_snet || at->s_net == ddp->deh_snet) && + (!ddp->deh_dnet || at->s_net == ddp->deh_dnet)) { + skb_pull(skb, sizeof(*ddp) - 4); + + /* + * The upper two remaining bytes are the port + * numbers we just happen to need. Now put the + * length in the lower two. + */ + *((__u16 *)skb->data) = htons(skb->len); + ft = 1; + } + /* + * Nice and easy. 
No AARP type protocols occur here so we can + * just shovel it out with a 3 byte LLAP header + */ + + skb_push(skb, 3); + skb->data[0] = sa->s_node; + skb->data[1] = at->s_node; + skb->data[2] = ft; + skb->dev = dev; + goto sendit; + } + + /* On a PPP link we neither compress nor aarp. */ + if (dev->type == ARPHRD_PPP) { + skb->protocol = htons(ETH_P_PPPTALK); + skb->dev = dev; + goto sendit; + } + + /* Non ELAP we cannot do. */ + if (dev->type != ARPHRD_ETHER) + return -1; + + skb->dev = dev; + skb->protocol = htons(ETH_P_ATALK); + hash = sa->s_node % (AARP_HASH_SIZE - 1); + + /* Do we have a resolved entry? */ + if (sa->s_node == ATADDR_BCAST) { + /* Send it */ + ddp_dl->request(ddp_dl, skb, ddp_eth_multicast); + goto sent; + } + + write_lock_bh(&aarp_lock); + a = __aarp_find_entry(resolved[hash], dev, sa); + + if (a) { /* Return 1 and fill in the address */ + a->expires_at = jiffies + (sysctl_aarp_expiry_time * 10); + ddp_dl->request(ddp_dl, skb, a->hwaddr); + write_unlock_bh(&aarp_lock); + goto sent; + } + + /* Do we have an unresolved entry: This is the less common path */ + a = __aarp_find_entry(unresolved[hash], dev, sa); + if (a) { /* Queue onto the unresolved queue */ + skb_queue_tail(&a->packet_queue, skb); + goto out_unlock; + } + + /* Allocate a new entry */ + a = aarp_alloc(); + if (!a) { + /* Whoops slipped... good job it's an unreliable protocol 8) */ + write_unlock_bh(&aarp_lock); + return -1; + } + + /* Set up the queue */ + skb_queue_tail(&a->packet_queue, skb); + a->expires_at = jiffies + sysctl_aarp_resolve_time; + a->dev = dev; + a->next = unresolved[hash]; + a->target_addr = *sa; + a->xmit_count = 0; + unresolved[hash] = a; + unresolved_count++; + + /* Send an initial request for the address */ + __aarp_send_query(a); + + /* + * Switch to fast timer if needed (That is if this is the first + * unresolved entry to get added) + */ + + if (unresolved_count == 1) + mod_timer(&aarp_timer, jiffies + sysctl_aarp_tick_time); + + /* Now finally, it is safe to drop the lock. */ +out_unlock: + write_unlock_bh(&aarp_lock); + + /* Tell the ddp layer we have taken over for this frame. */ + return 0; + +sendit: + if (skb->sk) + skb->priority = skb->sk->sk_priority; + dev_queue_xmit(skb); +sent: + return 1; +} + +/* + * An entry in the aarp unresolved queue has become resolved. Send + * all the frames queued under it. + * + * Must run under aarp_lock. + */ +static void __aarp_resolved(struct aarp_entry **list, struct aarp_entry *a, + int hash) +{ + struct sk_buff *skb; + + while (*list) + if (*list == a) { + unresolved_count--; + *list = a->next; + + /* Move into the resolved list */ + a->next = resolved[hash]; + resolved[hash] = a; + + /* Kick frames off */ + while ((skb = skb_dequeue(&a->packet_queue)) != NULL) { + a->expires_at = jiffies + + sysctl_aarp_expiry_time * 10; + ddp_dl->request(ddp_dl, skb, a->hwaddr); + } + } else + list = &((*list)->next); +} + +/* + * This is called by the SNAP driver whenever we see an AARP SNAP + * frame. We currently only support Ethernet. + */ +static int aarp_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + struct elapaarp *ea = aarp_hdr(skb); + int hash, ret = 0; + __u16 function; + struct aarp_entry *a; + struct atalk_addr sa, *ma, da; + struct atalk_iface *ifa; + + /* We only do Ethernet SNAP AARP. */ + if (dev->type != ARPHRD_ETHER) + goto out0; + + /* Frame size ok? */ + if (!skb_pull(skb, sizeof(*ea))) + goto out0; + + function = ntohs(ea->function); + + /* Sanity check fields. 
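+ * Only REQUEST, REPLY and PROBE functions with the expected hardware and protocol address lengths and zeroed pad bytes are accepted.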
*/ + if (function < AARP_REQUEST || function > AARP_PROBE || + ea->hw_len != ETH_ALEN || ea->pa_len != AARP_PA_ALEN || + ea->pa_src_zero || ea->pa_dst_zero) + goto out0; + + /* Looks good. */ + hash = ea->pa_src_node % (AARP_HASH_SIZE - 1); + + /* Build an address. */ + sa.s_node = ea->pa_src_node; + sa.s_net = ea->pa_src_net; + + /* Process the packet. Check for replies of me. */ + ifa = atalk_find_dev(dev); + if (!ifa) + goto out1; + + if (ifa->status & ATIF_PROBE && + ifa->address.s_node == ea->pa_dst_node && + ifa->address.s_net == ea->pa_dst_net) { + ifa->status |= ATIF_PROBE_FAIL; /* Fail the probe (in use) */ + goto out1; + } + + /* Check for replies of proxy AARP entries */ + da.s_node = ea->pa_dst_node; + da.s_net = ea->pa_dst_net; + + write_lock_bh(&aarp_lock); + a = __aarp_find_entry(proxies[hash], dev, &da); + + if (a && a->status & ATIF_PROBE) { + a->status |= ATIF_PROBE_FAIL; + /* + * we do not respond to probe or request packets for + * this address while we are probing this address + */ + goto unlock; + } + + switch (function) { + case AARP_REPLY: + if (!unresolved_count) /* Speed up */ + break; + + /* Find the entry. */ + a = __aarp_find_entry(unresolved[hash], dev, &sa); + if (!a || dev != a->dev) + break; + + /* We can fill one in - this is good. */ + memcpy(a->hwaddr, ea->hw_src, ETH_ALEN); + __aarp_resolved(&unresolved[hash], a, hash); + if (!unresolved_count) + mod_timer(&aarp_timer, + jiffies + sysctl_aarp_expiry_time); + break; + + case AARP_REQUEST: + case AARP_PROBE: + + /* + * If it is my address set ma to my address and reply. + * We can treat probe and request the same. Probe + * simply means we shouldn't cache the querying host, + * as in a probe they are proposing an address not + * using one. + * + * Support for proxy-AARP added. We check if the + * address is one of our proxies before we toss the + * packet out. + */ + + sa.s_node = ea->pa_dst_node; + sa.s_net = ea->pa_dst_net; + + /* See if we have a matching proxy. */ + ma = __aarp_proxy_find(dev, &sa); + if (!ma) + ma = &ifa->address; + else { /* We need to make a copy of the entry. */ + da.s_node = sa.s_node; + da.s_net = da.s_net; + ma = &da; + } + + if (function == AARP_PROBE) { + /* + * A probe implies someone trying to get an + * address. So as a precaution flush any + * entries we have for this address. + */ + struct aarp_entry *a; + + a = __aarp_find_entry(resolved[sa.s_node % + (AARP_HASH_SIZE - 1)], + skb->dev, &sa); + + /* + * Make it expire next tick - that avoids us + * getting into a probe/flush/learn/probe/ + * flush/learn cycle during probing of a slow + * to respond host addr. + */ + if (a) { + a->expires_at = jiffies - 1; + mod_timer(&aarp_timer, jiffies + + sysctl_aarp_tick_time); + } + } + + if (sa.s_node != ma->s_node) + break; + + if (sa.s_net && ma->s_net && sa.s_net != ma->s_net) + break; + + sa.s_node = ea->pa_src_node; + sa.s_net = ea->pa_src_net; + + /* aarp_my_address has found the address to use for us. 
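+ * The reply goes straight back to the querying station's hardware address.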
+ */ + aarp_send_reply(dev, ma, &sa, ea->hw_src); + break; + } + +unlock: + write_unlock_bh(&aarp_lock); +out1: + ret = 1; +out0: + kfree_skb(skb); + return ret; +} + +static struct notifier_block aarp_notifier = { + .notifier_call = aarp_device_event, +}; + +static unsigned char aarp_snap_id[] = { 0x00, 0x00, 0x00, 0x80, 0xF3 }; + +void __init aarp_proto_init(void) +{ + aarp_dl = register_snap_client(aarp_snap_id, aarp_rcv); + if (!aarp_dl) + printk(KERN_CRIT "Unable to register AARP with SNAP.\n"); + init_timer(&aarp_timer); + aarp_timer.function = aarp_expire_timeout; + aarp_timer.data = 0; + aarp_timer.expires = jiffies + sysctl_aarp_expiry_time; + add_timer(&aarp_timer); + register_netdevice_notifier(&aarp_notifier); +} + +/* Remove the AARP entries associated with a device. */ +void aarp_device_down(struct net_device *dev) +{ + int ct; + + write_lock_bh(&aarp_lock); + + for (ct = 0; ct < AARP_HASH_SIZE; ct++) { + __aarp_expire_device(&resolved[ct], dev); + __aarp_expire_device(&unresolved[ct], dev); + __aarp_expire_device(&proxies[ct], dev); + } + + write_unlock_bh(&aarp_lock); +} + +#ifdef CONFIG_PROC_FS +struct aarp_iter_state { + int bucket; + struct aarp_entry **table; +}; + +/* + * Get the aarp entry that is in the chain described + * by the iterator. + * If pos is set then skip till that index. + * pos = 1 is the first entry + */ +static struct aarp_entry *iter_next(struct aarp_iter_state *iter, loff_t *pos) +{ + int ct = iter->bucket; + struct aarp_entry **table = iter->table; + loff_t off = 0; + struct aarp_entry *entry; + + rescan: + while(ct < AARP_HASH_SIZE) { + for (entry = table[ct]; entry; entry = entry->next) { + if (!pos || ++off == *pos) { + iter->table = table; + iter->bucket = ct; + return entry; + } + } + ++ct; + } + + if (table == resolved) { + ct = 0; + table = unresolved; + goto rescan; + } + if (table == unresolved) { + ct = 0; + table = proxies; + goto rescan; + } + return NULL; +} + +static void *aarp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct aarp_iter_state *iter = seq->private; + + read_lock_bh(&aarp_lock); + iter->table = resolved; + iter->bucket = 0; + + return *pos ? iter_next(iter, pos) : SEQ_START_TOKEN; +} + +static void *aarp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct aarp_entry *entry = v; + struct aarp_iter_state *iter = seq->private; + + ++*pos; + + /* first line after header */ + if (v == SEQ_START_TOKEN) + entry = iter_next(iter, NULL); + + /* next entry in current bucket */ + else if (entry->next) + entry = entry->next; + + /* next bucket or table */ + else { + ++iter->bucket; + entry = iter_next(iter, NULL); + } + return entry; +} + +static void aarp_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&aarp_lock); +} + +static const char *dt2str(unsigned long ticks) +{ + static char buf[32]; + + sprintf(buf, "%ld.%02ld", ticks / HZ, ((ticks % HZ) * 100 ) / HZ); + + return buf; +} + +static int aarp_seq_show(struct seq_file *seq, void *v) +{ + struct aarp_iter_state *iter = seq->private; + struct aarp_entry *entry = v; + unsigned long now = jiffies; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Address Interface Hardware Address" + " Expires LastSend Retry Status\n"); + else { + seq_printf(seq, "%04X:%02X %-12s", + ntohs(entry->target_addr.s_net), + (unsigned int) entry->target_addr.s_node, + entry->dev ? 
entry->dev->name : "????"); + seq_printf(seq, "%02X:%02X:%02X:%02X:%02X:%02X", + entry->hwaddr[0] & 0xFF, + entry->hwaddr[1] & 0xFF, + entry->hwaddr[2] & 0xFF, + entry->hwaddr[3] & 0xFF, + entry->hwaddr[4] & 0xFF, + entry->hwaddr[5] & 0xFF); + seq_printf(seq, " %8s", + dt2str((long)entry->expires_at - (long)now)); + if (iter->table == unresolved) + seq_printf(seq, " %8s %6hu", + dt2str(now - entry->last_sent), + entry->xmit_count); + else + seq_puts(seq, " "); + seq_printf(seq, " %s\n", + (iter->table == resolved) ? "resolved" + : (iter->table == unresolved) ? "unresolved" + : (iter->table == proxies) ? "proxies" + : "unknown"); + } + return 0; +} + +static struct seq_operations aarp_seq_ops = { + .start = aarp_seq_start, + .next = aarp_seq_next, + .stop = aarp_seq_stop, + .show = aarp_seq_show, +}; + +static int aarp_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct aarp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &aarp_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +struct file_operations atalk_seq_arp_fops = { + .owner = THIS_MODULE, + .open = aarp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +/* General module cleanup. Called from cleanup_module() in ddp.c. */ +void aarp_cleanup_module(void) +{ + del_timer_sync(&aarp_timer); + unregister_netdevice_notifier(&aarp_notifier); + unregister_snap_client(aarp_dl); + aarp_purge(); +} diff --git a/net/appletalk/atalk_proc.c b/net/appletalk/atalk_proc.c new file mode 100644 index 000000000000..dc4048dd98c1 --- /dev/null +++ b/net/appletalk/atalk_proc.c @@ -0,0 +1,321 @@ +/* + * atalk_proc.c - proc support for Appletalk + * + * Copyright(c) Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation, version 2. + */ + +#include +#include +#include +#include +#include +#include + + +static __inline__ struct atalk_iface *atalk_get_interface_idx(loff_t pos) +{ + struct atalk_iface *i; + + for (i = atalk_interfaces; pos && i; i = i->next) + --pos; + + return i; +} + +static void *atalk_seq_interface_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + read_lock_bh(&atalk_interfaces_lock); + return l ? 
atalk_get_interface_idx(--l) : SEQ_START_TOKEN; +} + +static void *atalk_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct atalk_iface *i; + + ++*pos; + if (v == SEQ_START_TOKEN) { + i = NULL; + if (atalk_interfaces) + i = atalk_interfaces; + goto out; + } + i = v; + i = i->next; +out: + return i; +} + +static void atalk_seq_interface_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&atalk_interfaces_lock); +} + +static int atalk_seq_interface_show(struct seq_file *seq, void *v) +{ + struct atalk_iface *iface; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Interface Address Networks " + "Status\n"); + goto out; + } + + iface = v; + seq_printf(seq, "%-16s %04X:%02X %04X-%04X %d\n", + iface->dev->name, ntohs(iface->address.s_net), + iface->address.s_node, ntohs(iface->nets.nr_firstnet), + ntohs(iface->nets.nr_lastnet), iface->status); +out: + return 0; +} + +static __inline__ struct atalk_route *atalk_get_route_idx(loff_t pos) +{ + struct atalk_route *r; + + for (r = atalk_routes; pos && r; r = r->next) + --pos; + + return r; +} + +static void *atalk_seq_route_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + read_lock_bh(&atalk_routes_lock); + return l ? atalk_get_route_idx(--l) : SEQ_START_TOKEN; +} + +static void *atalk_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct atalk_route *r; + + ++*pos; + if (v == SEQ_START_TOKEN) { + r = NULL; + if (atalk_routes) + r = atalk_routes; + goto out; + } + r = v; + r = r->next; +out: + return r; +} + +static void atalk_seq_route_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&atalk_routes_lock); +} + +static int atalk_seq_route_show(struct seq_file *seq, void *v) +{ + struct atalk_route *rt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Target Router Flags Dev\n"); + goto out; + } + + if (atrtr_default.dev) { + rt = &atrtr_default; + seq_printf(seq, "Default %04X:%02X %-4d %s\n", + ntohs(rt->gateway.s_net), rt->gateway.s_node, + rt->flags, rt->dev->name); + } + + rt = v; + seq_printf(seq, "%04X:%02X %04X:%02X %-4d %s\n", + ntohs(rt->target.s_net), rt->target.s_node, + ntohs(rt->gateway.s_net), rt->gateway.s_node, + rt->flags, rt->dev->name); +out: + return 0; +} + +static __inline__ struct sock *atalk_get_socket_idx(loff_t pos) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &atalk_sockets) + if (!pos--) + goto found; + s = NULL; +found: + return s; +} + +static void *atalk_seq_socket_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + read_lock_bh(&atalk_sockets_lock); + return l ? 
atalk_get_socket_idx(--l) : SEQ_START_TOKEN; +} + +static void *atalk_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *i; + + ++*pos; + if (v == SEQ_START_TOKEN) { + i = sk_head(&atalk_sockets); + goto out; + } + i = sk_next(v); +out: + return i; +} + +static void atalk_seq_socket_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&atalk_sockets_lock); +} + +static int atalk_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock *s; + struct atalk_sock *at; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "Type Local_addr Remote_addr Tx_queue " + "Rx_queue St UID\n"); + goto out; + } + + s = v; + at = at_sk(s); + + seq_printf(seq, "%02X %04X:%02X:%02X %04X:%02X:%02X %08X:%08X " + "%02X %d\n", + s->sk_type, ntohs(at->src_net), at->src_node, at->src_port, + ntohs(at->dest_net), at->dest_node, at->dest_port, + atomic_read(&s->sk_wmem_alloc), + atomic_read(&s->sk_rmem_alloc), + s->sk_state, SOCK_INODE(s->sk_socket)->i_uid); +out: + return 0; +} + +static struct seq_operations atalk_seq_interface_ops = { + .start = atalk_seq_interface_start, + .next = atalk_seq_interface_next, + .stop = atalk_seq_interface_stop, + .show = atalk_seq_interface_show, +}; + +static struct seq_operations atalk_seq_route_ops = { + .start = atalk_seq_route_start, + .next = atalk_seq_route_next, + .stop = atalk_seq_route_stop, + .show = atalk_seq_route_show, +}; + +static struct seq_operations atalk_seq_socket_ops = { + .start = atalk_seq_socket_start, + .next = atalk_seq_socket_next, + .stop = atalk_seq_socket_stop, + .show = atalk_seq_socket_show, +}; + +static int atalk_seq_interface_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atalk_seq_interface_ops); +} + +static int atalk_seq_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atalk_seq_route_ops); +} + +static int atalk_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atalk_seq_socket_ops); +} + +static struct file_operations atalk_seq_interface_fops = { + .owner = THIS_MODULE, + .open = atalk_seq_interface_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations atalk_seq_route_fops = { + .owner = THIS_MODULE, + .open = atalk_seq_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations atalk_seq_socket_fops = { + .owner = THIS_MODULE, + .open = atalk_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *atalk_proc_dir; + +int __init atalk_proc_init(void) +{ + struct proc_dir_entry *p; + int rc = -ENOMEM; + + atalk_proc_dir = proc_mkdir("atalk", proc_net); + if (!atalk_proc_dir) + goto out; + atalk_proc_dir->owner = THIS_MODULE; + + p = create_proc_entry("interface", S_IRUGO, atalk_proc_dir); + if (!p) + goto out_interface; + p->proc_fops = &atalk_seq_interface_fops; + + p = create_proc_entry("route", S_IRUGO, atalk_proc_dir); + if (!p) + goto out_route; + p->proc_fops = &atalk_seq_route_fops; + + p = create_proc_entry("socket", S_IRUGO, atalk_proc_dir); + if (!p) + goto out_socket; + p->proc_fops = &atalk_seq_socket_fops; + + p = create_proc_entry("arp", S_IRUGO, atalk_proc_dir); + if (!p) + goto out_arp; + p->proc_fops = &atalk_seq_arp_fops; + + rc = 0; +out: + return rc; +out_arp: + remove_proc_entry("socket", atalk_proc_dir); +out_socket: + remove_proc_entry("route", atalk_proc_dir); +out_route: + remove_proc_entry("interface", atalk_proc_dir); 
+out_interface: + remove_proc_entry("atalk", proc_net); + goto out; +} + +void __exit atalk_proc_exit(void) +{ + remove_proc_entry("interface", atalk_proc_dir); + remove_proc_entry("route", atalk_proc_dir); + remove_proc_entry("socket", atalk_proc_dir); + remove_proc_entry("arp", atalk_proc_dir); + remove_proc_entry("atalk", proc_net); +} diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c new file mode 100644 index 000000000000..d1fea5c3dda1 --- /dev/null +++ b/net/appletalk/ddp.c @@ -0,0 +1,1931 @@ +/* + * DDP: An implementation of the AppleTalk DDP protocol for + * Ethernet 'ELAP'. + * + * Alan Cox + * + * With more than a little assistance from + * + * Wesley Craig + * + * Fixes: + * Neil Horman : Added missing device ioctls + * Michael Callahan : Made routing work + * Wesley Craig : Fix probing to listen to a + * passed node id. + * Alan Cox : Added send/recvmsg support + * Alan Cox : Moved at. to protinfo in + * socket. + * Alan Cox : Added firewall hooks. + * Alan Cox : Supports new ARPHRD_LOOPBACK + * Christer Weinigel : Routing and /proc fixes. + * Bradford Johnson : LocalTalk. + * Tom Dyas : Module support. + * Alan Cox : Hooks for PPP (based on the + * LocalTalk hook). + * Alan Cox : Posix bits + * Alan Cox/Mike Freeman : Possible fix to NBP problems + * Bradford Johnson : IP-over-DDP (experimental) + * Jay Schulist : Moved IP-over-DDP to its own + * driver file. (ipddp.c & ipddp.h) + * Jay Schulist : Made work as module with + * AppleTalk drivers, cleaned it. + * Rob Newberry : Added proxy AARP and AARP + * procfs, moved probing to AARP + * module. + * Adrian Sun/ + * Michael Zuelsdorff : fix for net.0 packets. don't + * allow illegal ether/tokentalk + * port assignment. we lose a + * valid localtalk port as a + * result. + * Arnaldo C. de Melo : Cleanup, in preparation for + * shared skb support 8) + * Arnaldo C. de Melo : Move proc stuff to atalk_proc.c, + * use seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include /* For TIOCOUTQ/INQ */ +#include +#include +#include +#include +#include + +struct datalink_proto *ddp_dl, *aarp_dl; +static struct proto_ops atalk_dgram_ops; + +/**************************************************************************\ +* * +* Handlers for the socket list. 
* +* * +\**************************************************************************/ + +HLIST_HEAD(atalk_sockets); +DEFINE_RWLOCK(atalk_sockets_lock); + +static inline void __atalk_insert_socket(struct sock *sk) +{ + sk_add_node(sk, &atalk_sockets); +} + +static inline void atalk_remove_socket(struct sock *sk) +{ + write_lock_bh(&atalk_sockets_lock); + sk_del_node_init(sk); + write_unlock_bh(&atalk_sockets_lock); +} + +static struct sock *atalk_search_socket(struct sockaddr_at *to, + struct atalk_iface *atif) +{ + struct sock *s; + struct hlist_node *node; + + read_lock_bh(&atalk_sockets_lock); + sk_for_each(s, node, &atalk_sockets) { + struct atalk_sock *at = at_sk(s); + + if (to->sat_port != at->src_port) + continue; + + if (to->sat_addr.s_net == ATADDR_ANYNET && + to->sat_addr.s_node == ATADDR_BCAST && + at->src_net == atif->address.s_net) + goto found; + + if (to->sat_addr.s_net == at->src_net && + (to->sat_addr.s_node == at->src_node || + to->sat_addr.s_node == ATADDR_BCAST || + to->sat_addr.s_node == ATADDR_ANYNODE)) + goto found; + + /* XXXX.0 -- we got a request for this router. make sure + * that the node is appropriately set. */ + if (to->sat_addr.s_node == ATADDR_ANYNODE && + to->sat_addr.s_net != ATADDR_ANYNET && + atif->address.s_node == at->src_node) { + to->sat_addr.s_node = atif->address.s_node; + goto found; + } + } + s = NULL; +found: + read_unlock_bh(&atalk_sockets_lock); + return s; +} + +/** + * atalk_find_or_insert_socket - Try to find a socket matching ADDR + * @sk - socket to insert in the list if it is not there already + * @sat - address to search for + * + * Try to find a socket matching ADDR in the socket list, if found then return + * it. If not, insert SK into the socket list. + * + * This entire operation must execute atomically. + */ +static struct sock *atalk_find_or_insert_socket(struct sock *sk, + struct sockaddr_at *sat) +{ + struct sock *s; + struct hlist_node *node; + struct atalk_sock *at; + + write_lock_bh(&atalk_sockets_lock); + sk_for_each(s, node, &atalk_sockets) { + at = at_sk(s); + + if (at->src_net == sat->sat_addr.s_net && + at->src_node == sat->sat_addr.s_node && + at->src_port == sat->sat_port) + goto found; + } + s = NULL; + __atalk_insert_socket(sk); /* Wheee, it's free, assign and insert. */ +found: + write_unlock_bh(&atalk_sockets_lock); + return s; +} + +static void atalk_destroy_timer(unsigned long data) +{ + struct sock *sk = (struct sock *)data; + + if (atomic_read(&sk->sk_wmem_alloc) || + atomic_read(&sk->sk_rmem_alloc)) { + sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +static inline void atalk_destroy_socket(struct sock *sk) +{ + atalk_remove_socket(sk); + skb_queue_purge(&sk->sk_receive_queue); + + if (atomic_read(&sk->sk_wmem_alloc) || + atomic_read(&sk->sk_rmem_alloc)) { + init_timer(&sk->sk_timer); + sk->sk_timer.expires = jiffies + SOCK_DESTROY_TIME; + sk->sk_timer.function = atalk_destroy_timer; + sk->sk_timer.data = (unsigned long)sk; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +/**************************************************************************\ +* * +* Routing tables for the AppleTalk socket layer. 
* +* * +\**************************************************************************/ + +/* Anti-deadlock ordering is atalk_routes_lock --> iface_lock -DaveM */ +struct atalk_route *atalk_routes; +DEFINE_RWLOCK(atalk_routes_lock); + +struct atalk_iface *atalk_interfaces; +DEFINE_RWLOCK(atalk_interfaces_lock); + +/* For probing devices or in a routerless network */ +struct atalk_route atrtr_default; + +/* AppleTalk interface control */ +/* + * Drop a device. Doesn't drop any of its routes - that is the caller's + * problem. Called when we down the interface or delete the address. + */ +static void atif_drop_device(struct net_device *dev) +{ + struct atalk_iface **iface = &atalk_interfaces; + struct atalk_iface *tmp; + + write_lock_bh(&atalk_interfaces_lock); + while ((tmp = *iface) != NULL) { + if (tmp->dev == dev) { + *iface = tmp->next; + dev_put(dev); + kfree(tmp); + dev->atalk_ptr = NULL; + } else + iface = &tmp->next; + } + write_unlock_bh(&atalk_interfaces_lock); +} + +static struct atalk_iface *atif_add_device(struct net_device *dev, + struct atalk_addr *sa) +{ + struct atalk_iface *iface = kmalloc(sizeof(*iface), GFP_KERNEL); + + if (!iface) + goto out; + + memset(iface, 0, sizeof(*iface)); + dev_hold(dev); + iface->dev = dev; + dev->atalk_ptr = iface; + iface->address = *sa; + iface->status = 0; + + write_lock_bh(&atalk_interfaces_lock); + iface->next = atalk_interfaces; + atalk_interfaces = iface; + write_unlock_bh(&atalk_interfaces_lock); +out: + return iface; +} + +/* Perform phase 2 AARP probing on our tentative address */ +static int atif_probe_device(struct atalk_iface *atif) +{ + int netrange = ntohs(atif->nets.nr_lastnet) - + ntohs(atif->nets.nr_firstnet) + 1; + int probe_net = ntohs(atif->address.s_net); + int probe_node = atif->address.s_node; + int netct, nodect; + + /* Offset the network we start probing with */ + if (probe_net == ATADDR_ANYNET) { + probe_net = ntohs(atif->nets.nr_firstnet); + if (netrange) + probe_net += jiffies % netrange; + } + if (probe_node == ATADDR_ANYNODE) + probe_node = jiffies & 0xFF; + + /* Scan the networks */ + atif->status |= ATIF_PROBE; + for (netct = 0; netct <= netrange; netct++) { + /* Sweep the available nodes from a given start */ + atif->address.s_net = htons(probe_net); + for (nodect = 0; nodect < 256; nodect++) { + atif->address.s_node = (nodect + probe_node) & 0xFF; + if (atif->address.s_node > 0 && + atif->address.s_node < 254) { + /* Probe a proposed address */ + aarp_probe_network(atif); + + if (!(atif->status & ATIF_PROBE_FAIL)) { + atif->status &= ~ATIF_PROBE; + return 0; + } + } + atif->status &= ~ATIF_PROBE_FAIL; + } + probe_net++; + if (probe_net > ntohs(atif->nets.nr_lastnet)) + probe_net = ntohs(atif->nets.nr_firstnet); + } + atif->status &= ~ATIF_PROBE; + + return -EADDRINUSE; /* Network is full... 
*/ +} + + +/* Perform AARP probing for a proxy address */ +static int atif_proxy_probe_device(struct atalk_iface *atif, + struct atalk_addr* proxy_addr) +{ + int netrange = ntohs(atif->nets.nr_lastnet) - + ntohs(atif->nets.nr_firstnet) + 1; + /* we probe the interface's network */ + int probe_net = ntohs(atif->address.s_net); + int probe_node = ATADDR_ANYNODE; /* we'll take anything */ + int netct, nodect; + + /* Offset the network we start probing with */ + if (probe_net == ATADDR_ANYNET) { + probe_net = ntohs(atif->nets.nr_firstnet); + if (netrange) + probe_net += jiffies % netrange; + } + + if (probe_node == ATADDR_ANYNODE) + probe_node = jiffies & 0xFF; + + /* Scan the networks */ + for (netct = 0; netct <= netrange; netct++) { + /* Sweep the available nodes from a given start */ + proxy_addr->s_net = htons(probe_net); + for (nodect = 0; nodect < 256; nodect++) { + proxy_addr->s_node = (nodect + probe_node) & 0xFF; + if (proxy_addr->s_node > 0 && + proxy_addr->s_node < 254) { + /* Tell AARP to probe a proposed address */ + int ret = aarp_proxy_probe_network(atif, + proxy_addr); + + if (ret != -EADDRINUSE) + return ret; + } + } + probe_net++; + if (probe_net > ntohs(atif->nets.nr_lastnet)) + probe_net = ntohs(atif->nets.nr_firstnet); + } + + return -EADDRINUSE; /* Network is full... */ +} + + +struct atalk_addr *atalk_find_dev_addr(struct net_device *dev) +{ + struct atalk_iface *iface = dev->atalk_ptr; + return iface ? &iface->address : NULL; +} + +static struct atalk_addr *atalk_find_primary(void) +{ + struct atalk_iface *fiface = NULL; + struct atalk_addr *retval; + struct atalk_iface *iface; + + /* + * Return a point-to-point interface only if + * there is no non-ptp interface available. + */ + read_lock_bh(&atalk_interfaces_lock); + for (iface = atalk_interfaces; iface; iface = iface->next) { + if (!fiface && !(iface->dev->flags & IFF_LOOPBACK)) + fiface = iface; + if (!(iface->dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) { + retval = &iface->address; + goto out; + } + } + + if (fiface) + retval = &fiface->address; + else if (atalk_interfaces) + retval = &atalk_interfaces->address; + else + retval = NULL; +out: + read_unlock_bh(&atalk_interfaces_lock); + return retval; +} + +/* + * Find a match for 'any network' - ie any of our interfaces with that + * node number will do just nicely. + */ +static struct atalk_iface *atalk_find_anynet(int node, struct net_device *dev) +{ + struct atalk_iface *iface = dev->atalk_ptr; + + if (!iface || iface->status & ATIF_PROBE) + goto out_err; + + if (node != ATADDR_BCAST && + iface->address.s_node != node && + node != ATADDR_ANYNODE) + goto out_err; +out: + return iface; +out_err: + iface = NULL; + goto out; +} + +/* Find a match for a specific network:node pair */ +static struct atalk_iface *atalk_find_interface(int net, int node) +{ + struct atalk_iface *iface; + + read_lock_bh(&atalk_interfaces_lock); + for (iface = atalk_interfaces; iface; iface = iface->next) { + if ((node == ATADDR_BCAST || + node == ATADDR_ANYNODE || + iface->address.s_node == node) && + iface->address.s_net == net && + !(iface->status & ATIF_PROBE)) + break; + + /* XXXX.0 -- net.0 returns the iface associated with net */ + if (node == ATADDR_ANYNODE && net != ATADDR_ANYNET && + ntohs(iface->nets.nr_firstnet) <= ntohs(net) && + ntohs(net) <= ntohs(iface->nets.nr_lastnet)) + break; + } + read_unlock_bh(&atalk_interfaces_lock); + return iface; +} + + +/* + * Find a route for an AppleTalk packet. This ought to get cached in + * the socket (later on...). 
We know about host routes and the fact + * that a route must be direct to broadcast. + */ +static struct atalk_route *atrtr_find(struct atalk_addr *target) +{ + /* + * we must search through all routes unless we find a + * host route, because some host routes might overlap + * network routes + */ + struct atalk_route *net_route = NULL; + struct atalk_route *r; + + read_lock_bh(&atalk_routes_lock); + for (r = atalk_routes; r; r = r->next) { + if (!(r->flags & RTF_UP)) + continue; + + if (r->target.s_net == target->s_net) { + if (r->flags & RTF_HOST) { + /* + * if this host route is for the target, + * the we're done + */ + if (r->target.s_node == target->s_node) + goto out; + } else + /* + * this route will work if there isn't a + * direct host route, so cache it + */ + net_route = r; + } + } + + /* + * if we found a network route but not a direct host + * route, then return it + */ + if (net_route) + r = net_route; + else if (atrtr_default.dev) + r = &atrtr_default; + else /* No route can be found */ + r = NULL; +out: + read_unlock_bh(&atalk_routes_lock); + return r; +} + + +/* + * Given an AppleTalk network, find the device to use. This can be + * a simple lookup. + */ +struct net_device *atrtr_get_dev(struct atalk_addr *sa) +{ + struct atalk_route *atr = atrtr_find(sa); + return atr ? atr->dev : NULL; +} + +/* Set up a default router */ +static void atrtr_set_default(struct net_device *dev) +{ + atrtr_default.dev = dev; + atrtr_default.flags = RTF_UP; + atrtr_default.gateway.s_net = htons(0); + atrtr_default.gateway.s_node = 0; +} + +/* + * Add a router. Basically make sure it looks valid and stuff the + * entry in the list. While it uses netranges we always set them to one + * entry to work like netatalk. + */ +static int atrtr_create(struct rtentry *r, struct net_device *devhint) +{ + struct sockaddr_at *ta = (struct sockaddr_at *)&r->rt_dst; + struct sockaddr_at *ga = (struct sockaddr_at *)&r->rt_gateway; + struct atalk_route *rt; + struct atalk_iface *iface, *riface; + int retval = -EINVAL; + + /* + * Fixme: Raise/Lower a routing change semaphore for these + * operations. 
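+ * Until such a semaphore exists the routing table is protected by atalk_routes_lock alone.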
+ */ + + /* Validate the request */ + if (ta->sat_family != AF_APPLETALK || + (!devhint && ga->sat_family != AF_APPLETALK)) + goto out; + + /* Now walk the routing table and make our decisions */ + write_lock_bh(&atalk_routes_lock); + for (rt = atalk_routes; rt; rt = rt->next) { + if (r->rt_flags != rt->flags) + continue; + + if (ta->sat_addr.s_net == rt->target.s_net) { + if (!(rt->flags & RTF_HOST)) + break; + if (ta->sat_addr.s_node == rt->target.s_node) + break; + } + } + + if (!devhint) { + riface = NULL; + + read_lock_bh(&atalk_interfaces_lock); + for (iface = atalk_interfaces; iface; iface = iface->next) { + if (!riface && + ntohs(ga->sat_addr.s_net) >= + ntohs(iface->nets.nr_firstnet) && + ntohs(ga->sat_addr.s_net) <= + ntohs(iface->nets.nr_lastnet)) + riface = iface; + + if (ga->sat_addr.s_net == iface->address.s_net && + ga->sat_addr.s_node == iface->address.s_node) + riface = iface; + } + read_unlock_bh(&atalk_interfaces_lock); + + retval = -ENETUNREACH; + if (!riface) + goto out_unlock; + + devhint = riface->dev; + } + + if (!rt) { + rt = kmalloc(sizeof(*rt), GFP_ATOMIC); + + retval = -ENOBUFS; + if (!rt) + goto out_unlock; + memset(rt, 0, sizeof(*rt)); + + rt->next = atalk_routes; + atalk_routes = rt; + } + + /* Fill in the routing entry */ + rt->target = ta->sat_addr; + rt->dev = devhint; + rt->flags = r->rt_flags; + rt->gateway = ga->sat_addr; + + retval = 0; +out_unlock: + write_unlock_bh(&atalk_routes_lock); +out: + return retval; +} + +/* Delete a route. Find it and discard it */ +static int atrtr_delete(struct atalk_addr * addr) +{ + struct atalk_route **r = &atalk_routes; + int retval = 0; + struct atalk_route *tmp; + + write_lock_bh(&atalk_routes_lock); + while ((tmp = *r) != NULL) { + if (tmp->target.s_net == addr->s_net && + (!(tmp->flags&RTF_GATEWAY) || + tmp->target.s_node == addr->s_node)) { + *r = tmp->next; + dev_put(tmp->dev); + kfree(tmp); + goto out; + } + r = &tmp->next; + } + retval = -ENOENT; +out: + write_unlock_bh(&atalk_routes_lock); + return retval; +} + +/* + * Called when a device is downed. Just throw away any routes + * via it. + */ +static void atrtr_device_down(struct net_device *dev) +{ + struct atalk_route **r = &atalk_routes; + struct atalk_route *tmp; + + write_lock_bh(&atalk_routes_lock); + while ((tmp = *r) != NULL) { + if (tmp->dev == dev) { + *r = tmp->next; + dev_put(dev); + kfree(tmp); + } else + r = &tmp->next; + } + write_unlock_bh(&atalk_routes_lock); + + if (atrtr_default.dev == dev) + atrtr_set_default(NULL); +} + +/* Actually down the interface */ +static inline void atalk_dev_down(struct net_device *dev) +{ + atrtr_device_down(dev); /* Remove all routes for the device */ + aarp_device_down(dev); /* Remove AARP entries for the device */ + atif_drop_device(dev); /* Remove the device */ +} + +/* + * A device event has occurred. Watch for devices going down and + * delete our use of them (iface and route). + */ +static int ddp_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + if (event == NETDEV_DOWN) + /* Discard any use of this */ + atalk_dev_down(ptr); + + return NOTIFY_DONE; +} + +/* ioctl calls. 
Shouldn't even need touching */ +/* Device configuration ioctl calls */ +static int atif_ioctl(int cmd, void __user *arg) +{ + static char aarp_mcast[6] = { 0x09, 0x00, 0x00, 0xFF, 0xFF, 0xFF }; + struct ifreq atreq; + struct atalk_netrange *nr; + struct sockaddr_at *sa; + struct net_device *dev; + struct atalk_iface *atif; + int ct; + int limit; + struct rtentry rtdef; + int add_route; + + if (copy_from_user(&atreq, arg, sizeof(atreq))) + return -EFAULT; + + dev = __dev_get_by_name(atreq.ifr_name); + if (!dev) + return -ENODEV; + + sa = (struct sockaddr_at *)&atreq.ifr_addr; + atif = atalk_find_dev(dev); + + switch (cmd) { + case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + if (dev->type != ARPHRD_ETHER && + dev->type != ARPHRD_LOOPBACK && + dev->type != ARPHRD_LOCALTLK && + dev->type != ARPHRD_PPP) + return -EPROTONOSUPPORT; + + nr = (struct atalk_netrange *)&sa->sat_zero[0]; + add_route = 1; + + /* + * if this is a point-to-point iface, and we already + * have an iface for this AppleTalk address, then we + * should not add a route + */ + if ((dev->flags & IFF_POINTOPOINT) && + atalk_find_interface(sa->sat_addr.s_net, + sa->sat_addr.s_node)) { + printk(KERN_DEBUG "AppleTalk: point-to-point " + "interface added with " + "existing address\n"); + add_route = 0; + } + + /* + * Phase 1 is fine on LocalTalk but we don't do + * EtherTalk phase 1. Anyone wanting to add it go ahead. + */ + if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) + return -EPROTONOSUPPORT; + if (sa->sat_addr.s_node == ATADDR_BCAST || + sa->sat_addr.s_node == 254) + return -EINVAL; + if (atif) { + /* Already setting address */ + if (atif->status & ATIF_PROBE) + return -EBUSY; + + atif->address.s_net = sa->sat_addr.s_net; + atif->address.s_node = sa->sat_addr.s_node; + atrtr_device_down(dev); /* Flush old routes */ + } else { + atif = atif_add_device(dev, &sa->sat_addr); + if (!atif) + return -ENOMEM; + } + atif->nets = *nr; + + /* + * Check if the chosen address is used. If so we + * error and atalkd will try another. 
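+ * Loopback and point-to-point interfaces skip the probe entirely.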
+ */ + + if (!(dev->flags & IFF_LOOPBACK) && + !(dev->flags & IFF_POINTOPOINT) && + atif_probe_device(atif) < 0) { + atif_drop_device(dev); + return -EADDRINUSE; + } + + /* Hey it worked - add the direct routes */ + sa = (struct sockaddr_at *)&rtdef.rt_gateway; + sa->sat_family = AF_APPLETALK; + sa->sat_addr.s_net = atif->address.s_net; + sa->sat_addr.s_node = atif->address.s_node; + sa = (struct sockaddr_at *)&rtdef.rt_dst; + rtdef.rt_flags = RTF_UP; + sa->sat_family = AF_APPLETALK; + sa->sat_addr.s_node = ATADDR_ANYNODE; + if (dev->flags & IFF_LOOPBACK || + dev->flags & IFF_POINTOPOINT) + rtdef.rt_flags |= RTF_HOST; + + /* Routerless initial state */ + if (nr->nr_firstnet == htons(0) && + nr->nr_lastnet == htons(0xFFFE)) { + sa->sat_addr.s_net = atif->address.s_net; + atrtr_create(&rtdef, dev); + atrtr_set_default(dev); + } else { + limit = ntohs(nr->nr_lastnet); + if (limit - ntohs(nr->nr_firstnet) > 4096) { + printk(KERN_WARNING "Too many routes/" + "iface.\n"); + return -EINVAL; + } + if (add_route) + for (ct = ntohs(nr->nr_firstnet); + ct <= limit; ct++) { + sa->sat_addr.s_net = htons(ct); + atrtr_create(&rtdef, dev); + } + } + dev_mc_add(dev, aarp_mcast, 6, 1); + return 0; + + case SIOCGIFADDR: + if (!atif) + return -EADDRNOTAVAIL; + + sa->sat_family = AF_APPLETALK; + sa->sat_addr = atif->address; + break; + + case SIOCGIFBRDADDR: + if (!atif) + return -EADDRNOTAVAIL; + + sa->sat_family = AF_APPLETALK; + sa->sat_addr.s_net = atif->address.s_net; + sa->sat_addr.s_node = ATADDR_BCAST; + break; + + case SIOCATALKDIFADDR: + case SIOCDIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + atalk_dev_down(dev); + break; + + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + if (!atif) + return -EADDRNOTAVAIL; + + /* + * for now, we only support proxy AARP on ELAP; + * we should be able to do it for LocalTalk, too. + */ + if (dev->type != ARPHRD_ETHER) + return -EPROTONOSUPPORT; + + /* + * atif points to the current interface on this network; + * we aren't concerned about its current status (at + * least for now), but it has all the settings about + * the network we're going to probe. Consequently, it + * must exist. + */ + if (!atif) + return -EADDRNOTAVAIL; + + nr = (struct atalk_netrange *)&(atif->nets); + /* + * Phase 1 is fine on Localtalk but we don't do + * Ethertalk phase 1. Anyone wanting to add it go ahead. + */ + if (dev->type == ARPHRD_ETHER && nr->nr_phase != 2) + return -EPROTONOSUPPORT; + + if (sa->sat_addr.s_node == ATADDR_BCAST || + sa->sat_addr.s_node == 254) + return -EINVAL; + + /* + * Check if the chosen address is used. If so we + * error and ATCP will try another. + */ + if (atif_proxy_probe_device(atif, &(sa->sat_addr)) < 0) + return -EADDRINUSE; + + /* + * We now have an address on the local network, and + * the AARP code will defend it for us until we take it + * down. We don't set up any routes right now, because + * ATCP will install them manually via SIOCADDRT. + */ + break; + + case SIOCDARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sa->sat_family != AF_APPLETALK) + return -EINVAL; + if (!atif) + return -EADDRNOTAVAIL; + + /* give to aarp module to remove proxy entry */ + aarp_proxy_remove(atif->dev, &(sa->sat_addr)); + return 0; + } + + return copy_to_user(arg, &atreq, sizeof(atreq)) ? 
-EFAULT : 0; +} + +/* Routing ioctl() calls */ +static int atrtr_ioctl(unsigned int cmd, void __user *arg) +{ + struct rtentry rt; + + if (copy_from_user(&rt, arg, sizeof(rt))) + return -EFAULT; + + switch (cmd) { + case SIOCDELRT: + if (rt.rt_dst.sa_family != AF_APPLETALK) + return -EINVAL; + return atrtr_delete(&((struct sockaddr_at *) + &rt.rt_dst)->sat_addr); + + case SIOCADDRT: { + struct net_device *dev = NULL; + if (rt.rt_dev) { + char name[IFNAMSIZ]; + if (copy_from_user(name, rt.rt_dev, IFNAMSIZ-1)) + return -EFAULT; + name[IFNAMSIZ-1] = '\0'; + dev = __dev_get_by_name(name); + if (!dev) + return -ENODEV; + } + return atrtr_create(&rt, dev); + } + } + return -EINVAL; +} + +/**************************************************************************\ +* * +* Handling for system calls applied via the various interfaces to an * +* AppleTalk socket object. * +* * +\**************************************************************************/ + +/* + * Checksum: This is 'optional'. It's quite likely also a good + * candidate for assembler hackery 8) + */ +static unsigned long atalk_sum_partial(const unsigned char *data, + int len, unsigned long sum) +{ + /* This ought to be unwrapped neatly. I'll trust gcc for now */ + while (len--) { + sum += *data; + sum <<= 1; + if (sum & 0x10000) { + sum++; + sum &= 0xffff; + } + data++; + } + return sum; +} + +/* Checksum skb data -- similar to skb_checksum */ +static unsigned long atalk_sum_skb(const struct sk_buff *skb, int offset, + int len, unsigned long sum) +{ + int start = skb_headlen(skb); + int i, copy; + + /* checksum stuff in header space */ + if ( (copy = start - offset) > 0) { + if (copy > len) + copy = len; + sum = atalk_sum_partial(skb->data + offset, copy, sum); + if ( (len -= copy) == 0) + return sum; + + offset += copy; + } + + /* checksum stuff in frags */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + sum = atalk_sum_partial(vaddr + frag->page_offset + + offset - start, copy, sum); + kunmap_skb_frag(vaddr); + + if (!(len -= copy)) + return sum; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + sum = atalk_sum_skb(list, offset - start, + copy, sum); + if ((len -= copy) == 0) + return sum; + offset += copy; + } + start = end; + } + } + + BUG_ON(len > 0); + + return sum; +} + +static unsigned short atalk_checksum(const struct sk_buff *skb, int len) +{ + unsigned long sum; + + /* skip header 4 bytes */ + sum = atalk_sum_skb(skb, 4, len-4, 0); + + /* Use 0xFFFF for 0. 0 itself means none */ + return sum ? htons((unsigned short)sum) : 0xFFFF; +} + +static struct proto ddp_proto = { + .name = "DDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct atalk_sock), +}; + +/* + * Create a socket. Initialise the socket, blank the addresses + * set the state. + */ +static int atalk_create(struct socket *sock, int protocol) +{ + struct sock *sk; + int rc = -ESOCKTNOSUPPORT; + + /* + * We permit SOCK_DGRAM and RAW is an extension. It is trivial to do + * and gives you the full ELAP frame. 
Should be handy for CAP 8) + */ + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + goto out; + rc = -ENOMEM; + sk = sk_alloc(PF_APPLETALK, GFP_KERNEL, &ddp_proto, 1); + if (!sk) + goto out; + rc = 0; + sock->ops = &atalk_dgram_ops; + sock_init_data(sock, sk); + + /* Checksums on by default */ + sock_set_flag(sk, SOCK_ZAPPED); +out: + return rc; +} + +/* Free a socket. No work needed */ +static int atalk_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock_orphan(sk); + sock->sk = NULL; + atalk_destroy_socket(sk); + } + return 0; +} + +/** + * atalk_pick_and_bind_port - Pick a source port when one is not given + * @sk - socket to insert into the tables + * @sat - address to search for + * + * Pick a source port when one is not given. If we can find a suitable free + * one, we insert the socket into the tables using it. + * + * This whole operation must be atomic. + */ +static int atalk_pick_and_bind_port(struct sock *sk, struct sockaddr_at *sat) +{ + int retval; + + write_lock_bh(&atalk_sockets_lock); + + for (sat->sat_port = ATPORT_RESERVED; + sat->sat_port < ATPORT_LAST; + sat->sat_port++) { + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &atalk_sockets) { + struct atalk_sock *at = at_sk(s); + + if (at->src_net == sat->sat_addr.s_net && + at->src_node == sat->sat_addr.s_node && + at->src_port == sat->sat_port) + goto try_next_port; + } + + /* Wheee, it's free, assign and insert. */ + __atalk_insert_socket(sk); + at_sk(sk)->src_port = sat->sat_port; + retval = 0; + goto out; + +try_next_port:; + } + + retval = -EBUSY; +out: + write_unlock_bh(&atalk_sockets_lock); + return retval; +} + +static int atalk_autobind(struct sock *sk) +{ + struct atalk_sock *at = at_sk(sk); + struct sockaddr_at sat; + struct atalk_addr *ap = atalk_find_primary(); + int n = -EADDRNOTAVAIL; + + if (!ap || ap->s_net == htons(ATADDR_ANYNET)) + goto out; + + at->src_net = sat.sat_addr.s_net = ap->s_net; + at->src_node = sat.sat_addr.s_node = ap->s_node; + + n = atalk_pick_and_bind_port(sk, &sat); + if (!n) + sock_reset_flag(sk, SOCK_ZAPPED); +out: + return n; +} + +/* Set the address 'our end' of the connection */ +static int atalk_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_at *addr = (struct sockaddr_at *)uaddr; + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + + if (!sock_flag(sk, SOCK_ZAPPED) || + addr_len != sizeof(struct sockaddr_at)) + return -EINVAL; + + if (addr->sat_family != AF_APPLETALK) + return -EAFNOSUPPORT; + + if (addr->sat_addr.s_net == htons(ATADDR_ANYNET)) { + struct atalk_addr *ap = atalk_find_primary(); + + if (!ap) + return -EADDRNOTAVAIL; + + at->src_net = addr->sat_addr.s_net = ap->s_net; + at->src_node = addr->sat_addr.s_node= ap->s_node; + } else { + if (!atalk_find_interface(addr->sat_addr.s_net, + addr->sat_addr.s_node)) + return -EADDRNOTAVAIL; + + at->src_net = addr->sat_addr.s_net; + at->src_node = addr->sat_addr.s_node; + } + + if (addr->sat_port == ATADDR_ANYPORT) { + int n = atalk_pick_and_bind_port(sk, addr); + + if (n < 0) + return n; + } else { + at->src_port = addr->sat_port; + + if (atalk_find_or_insert_socket(sk, addr)) + return -EADDRINUSE; + } + + sock_reset_flag(sk, SOCK_ZAPPED); + return 0; +} + +/* Set the address we talk to */ +static int atalk_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + struct sockaddr_at *addr; + + sk->sk_state = TCP_CLOSE; + 
sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(*addr)) + return -EINVAL; + + addr = (struct sockaddr_at *)uaddr; + + if (addr->sat_family != AF_APPLETALK) + return -EAFNOSUPPORT; + + if (addr->sat_addr.s_node == ATADDR_BCAST && + !sock_flag(sk, SOCK_BROADCAST)) { +#if 1 + printk(KERN_WARNING "%s is broken and did not set " + "SO_BROADCAST. It will break when 2.2 is " + "released.\n", + current->comm); +#else + return -EACCES; +#endif + } + + if (sock_flag(sk, SOCK_ZAPPED)) + if (atalk_autobind(sk) < 0) + return -EBUSY; + + if (!atrtr_get_dev(&addr->sat_addr)) + return -ENETUNREACH; + + at->dest_port = addr->sat_port; + at->dest_net = addr->sat_addr.s_net; + at->dest_node = addr->sat_addr.s_node; + + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + return 0; +} + +/* + * Find the name of an AppleTalk socket. Just copy the right + * fields into the sockaddr. + */ +static int atalk_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_at sat; + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) + if (atalk_autobind(sk) < 0) + return -ENOBUFS; + + *uaddr_len = sizeof(struct sockaddr_at); + + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + sat.sat_addr.s_net = at->dest_net; + sat.sat_addr.s_node = at->dest_node; + sat.sat_port = at->dest_port; + } else { + sat.sat_addr.s_net = at->src_net; + sat.sat_addr.s_node = at->src_node; + sat.sat_port = at->src_port; + } + + sat.sat_family = AF_APPLETALK; + memcpy(uaddr, &sat, sizeof(sat)); + return 0; +} + +#if defined(CONFIG_IPDDP) || defined(CONFIG_IPDDP_MODULE) +static __inline__ int is_ip_over_ddp(struct sk_buff *skb) +{ + return skb->data[12] == 22; +} + +static int handle_ip_over_ddp(struct sk_buff *skb) +{ + struct net_device *dev = __dev_get_by_name("ipddp0"); + struct net_device_stats *stats; + + /* This needs to be able to handle ipddp"N" devices */ + if (!dev) + return -ENODEV; + + skb->protocol = htons(ETH_P_IP); + skb_pull(skb, 13); + skb->dev = dev; + skb->h.raw = skb->data; + + stats = dev->priv; + stats->rx_packets++; + stats->rx_bytes += skb->len + 13; + netif_rx(skb); /* Send the SKB up to a higher place. */ + return 0; +} +#else +/* make it easy for gcc to optimize this test out, i.e. kill the code */ +#define is_ip_over_ddp(skb) 0 +#define handle_ip_over_ddp(skb) 0 +#endif + +static void atalk_route_packet(struct sk_buff *skb, struct net_device *dev, + struct ddpehdr *ddp, struct ddpebits *ddphv, + int origlen) +{ + struct atalk_route *rt; + struct atalk_addr ta; + + /* + * Don't route multicast, etc., packets, or packets sent to "this + * network" + */ + if (skb->pkt_type != PACKET_HOST || !ddp->deh_dnet) { + /* + * FIXME: + * + * Can it ever happen that a packet is from a PPP iface and + * needs to be broadcast onto the default network? + */ + if (dev->type == ARPHRD_PPP) + printk(KERN_DEBUG "AppleTalk: didn't forward broadcast " + "packet received from PPP iface\n"); + goto free_it; + } + + ta.s_net = ddp->deh_dnet; + ta.s_node = ddp->deh_dnode; + + /* Route the packet */ + rt = atrtr_find(&ta); + if (!rt || ddphv->deh_hops == DDP_MAXHOPS) + goto free_it; + /* FIXME: use skb->cb to be able to use shared skbs */ + ddphv->deh_hops++; + + /* + * Route goes through another gateway, so set the target to the + * gateway instead. 
+ */ + + if (rt->flags & RTF_GATEWAY) { + ta.s_net = rt->gateway.s_net; + ta.s_node = rt->gateway.s_node; + } + + /* Fix up skb->len field */ + skb_trim(skb, min_t(unsigned int, origlen, + (rt->dev->hard_header_len + + ddp_dl->header_length + ddphv->deh_len))); + + /* Mend the byte order */ + /* FIXME: use skb->cb to be able to use shared skbs */ + *((__u16 *)ddp) = ntohs(*((__u16 *)ddphv)); + + /* + * Send the buffer onwards + * + * Now we must always be careful. If it's come from LocalTalk to + * EtherTalk it might not fit + * + * Order matters here: If a packet has to be copied to make a new + * headroom (rare hopefully) then it won't need unsharing. + * + * Note. ddp-> becomes invalid at the realloc. + */ + if (skb_headroom(skb) < 22) { + /* 22 bytes - 12 ether, 2 len, 3 802.2 5 snap */ + struct sk_buff *nskb = skb_realloc_headroom(skb, 32); + kfree_skb(skb); + if (!nskb) + goto out; + skb = nskb; + } else + skb = skb_unshare(skb, GFP_ATOMIC); + + /* + * If the buffer didn't vanish into the lack of space bitbucket we can + * send it. + */ + if (skb && aarp_send_ddp(rt->dev, skb, &ta, NULL) == -1) + goto free_it; +out: + return; +free_it: + kfree_skb(skb); +} + +/** + * atalk_rcv - Receive a packet (in skb) from device dev + * @skb - packet received + * @dev - network device where the packet comes from + * @pt - packet type + * + * Receive a packet (in skb) from device dev. This has come from the SNAP + * decoder, and on entry skb->h.raw is the DDP header, skb->len is the DDP + * header, skb->len is the DDP length. The physical headers have been + * extracted. PPP should probably pass frames marked as for this layer. + * [ie ARPHRD_ETHERTALK] + */ +static int atalk_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + struct ddpehdr *ddp; + struct sock *sock; + struct atalk_iface *atif; + struct sockaddr_at tosat; + int origlen; + struct ddpebits ddphv; + + /* Don't mangle buffer if shared */ + if (!(skb = skb_share_check(skb, GFP_ATOMIC))) + goto out; + + /* Size check and make sure header is contiguous */ + if (!pskb_may_pull(skb, sizeof(*ddp))) + goto freeit; + + ddp = ddp_hdr(skb); + + /* + * Fix up the length field [Ok this is horrible but otherwise + * I end up with unions of bit fields and messy bit field order + * compiler/endian dependencies..] + */ + *((__u16 *)&ddphv) = ntohs(*((__u16 *)ddp)); + + /* Trim buffer in case of stray trailing data */ + origlen = skb->len; + skb_trim(skb, min_t(unsigned int, skb->len, ddphv.deh_len)); + + /* + * Size check to see if ddp->deh_len was crap + * (Otherwise we'll detonate most spectacularly + * in the middle of recvmsg()). + */ + if (skb->len < sizeof(*ddp)) + goto freeit; + + /* + * Any checksums. Note we don't do htons() on this == is assumed to be + * valid for net byte orders all over the networking code... 
+ */ + if (ddp->deh_sum && + atalk_checksum(skb, ddphv.deh_len) != ddp->deh_sum) + /* Not a valid AppleTalk frame - dustbin time */ + goto freeit; + + /* Check the packet is aimed at us */ + if (!ddp->deh_dnet) /* Net 0 is 'this network' */ + atif = atalk_find_anynet(ddp->deh_dnode, dev); + else + atif = atalk_find_interface(ddp->deh_dnet, ddp->deh_dnode); + + /* Not ours, so we route the packet via the correct AppleTalk iface */ + if (!atif) { + atalk_route_packet(skb, dev, ddp, &ddphv, origlen); + goto out; + } + + /* if IP over DDP is not selected this code will be optimized out */ + if (is_ip_over_ddp(skb)) + return handle_ip_over_ddp(skb); + /* + * Which socket - atalk_search_socket() looks for a *full match* + * of the tuple. + */ + tosat.sat_addr.s_net = ddp->deh_dnet; + tosat.sat_addr.s_node = ddp->deh_dnode; + tosat.sat_port = ddp->deh_dport; + + sock = atalk_search_socket(&tosat, atif); + if (!sock) /* But not one of our sockets */ + goto freeit; + + /* Queue packet (standard) */ + skb->sk = sock; + + if (sock_queue_rcv_skb(sock, skb) < 0) + goto freeit; +out: + return 0; +freeit: + kfree_skb(skb); + goto out; +} + +/* + * Receive a LocalTalk frame. We make some demands on the caller here. + * Caller must provide enough headroom on the packet to pull the short + * header and append a long one. + */ +static int ltalk_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + /* Expand any short form frames */ + if (skb->mac.raw[2] == 1) { + struct ddpehdr *ddp; + /* Find our address */ + struct atalk_addr *ap = atalk_find_dev_addr(dev); + + if (!ap || skb->len < sizeof(struct ddpshdr)) + goto freeit; + + /* Don't mangle buffer if shared */ + if (!(skb = skb_share_check(skb, GFP_ATOMIC))) + return 0; + + /* + * The push leaves us with a ddephdr not an shdr, and + * handily the port bytes in the right place preset. + */ + ddp = (struct ddpehdr *) skb_push(skb, sizeof(*ddp) - 4); + + /* Now fill in the long header */ + + /* + * These two first. The mac overlays the new source/dest + * network information so we MUST copy these before + * we write the network numbers ! + */ + + ddp->deh_dnode = skb->mac.raw[0]; /* From physical header */ + ddp->deh_snode = skb->mac.raw[1]; /* From physical header */ + + ddp->deh_dnet = ap->s_net; /* Network number */ + ddp->deh_snet = ap->s_net; + ddp->deh_sum = 0; /* No checksum */ + /* + * Not sure about this bit... 
+ */ + ddp->deh_len = skb->len; + ddp->deh_hops = DDP_MAXHOPS; /* Non routable, so force a drop + if we slip up later */ + /* Mend the byte order */ + *((__u16 *)ddp) = htons(*((__u16 *)ddp)); + } + skb->h.raw = skb->data; + + return atalk_rcv(skb, dev, pt); +freeit: + kfree_skb(skb); + return 0; +} + +static int atalk_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t len) +{ + struct sock *sk = sock->sk; + struct atalk_sock *at = at_sk(sk); + struct sockaddr_at *usat = (struct sockaddr_at *)msg->msg_name; + int flags = msg->msg_flags; + int loopback = 0; + struct sockaddr_at local_satalk, gsat; + struct sk_buff *skb; + struct net_device *dev; + struct ddpehdr *ddp; + int size; + struct atalk_route *rt; + int err; + + if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (len > DDP_MAXSZ) + return -EMSGSIZE; + + if (usat) { + if (sock_flag(sk, SOCK_ZAPPED)) + if (atalk_autobind(sk) < 0) + return -EBUSY; + + if (msg->msg_namelen < sizeof(*usat) || + usat->sat_family != AF_APPLETALK) + return -EINVAL; + + /* netatalk doesn't implement this check */ + if (usat->sat_addr.s_node == ATADDR_BCAST && + !sock_flag(sk, SOCK_BROADCAST)) { + printk(KERN_INFO "SO_BROADCAST: Fix your netatalk as " + "it will break before 2.2\n"); +#if 0 + return -EPERM; +#endif + } + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + usat = &local_satalk; + usat->sat_family = AF_APPLETALK; + usat->sat_port = at->dest_port; + usat->sat_addr.s_node = at->dest_node; + usat->sat_addr.s_net = at->dest_net; + } + + /* Build a packet */ + SOCK_DEBUG(sk, "SK %p: Got address.\n", sk); + + /* For headers */ + size = sizeof(struct ddpehdr) + len + ddp_dl->header_length; + + if (usat->sat_addr.s_net || usat->sat_addr.s_node == ATADDR_ANYNODE) { + rt = atrtr_find(&usat->sat_addr); + if (!rt) + return -ENETUNREACH; + + dev = rt->dev; + } else { + struct atalk_addr at_hint; + + at_hint.s_node = 0; + at_hint.s_net = at->src_net; + + rt = atrtr_find(&at_hint); + if (!rt) + return -ENETUNREACH; + + dev = rt->dev; + } + + SOCK_DEBUG(sk, "SK %p: Size needed %d, device %s\n", + sk, size, dev->name); + + size += dev->hard_header_len; + skb = sock_alloc_send_skb(sk, size, (flags & MSG_DONTWAIT), &err); + if (!skb) + return err; + + skb->sk = sk; + skb_reserve(skb, ddp_dl->header_length); + skb_reserve(skb, dev->hard_header_len); + skb->dev = dev; + + SOCK_DEBUG(sk, "SK %p: Begin build.\n", sk); + + ddp = (struct ddpehdr *)skb_put(skb, sizeof(struct ddpehdr)); + ddp->deh_pad = 0; + ddp->deh_hops = 0; + ddp->deh_len = len + sizeof(*ddp); + /* + * Fix up the length field [Ok this is horrible but otherwise + * I end up with unions of bit fields and messy bit field order + * compiler/endian dependencies.. 
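Concretely, the 16-bit word being byte-swapped here carries, from the most significant bit down, two reserved bits, the 4-bit hop count and the 10-bit datagram length. A small stand-alone sketch with explicit shifts and masks (the helper names are invented, not kernel API) shows the same conversion without bitfields:

#include <stdio.h>
#include <arpa/inet.h>          /* htons(), ntohs() */

static unsigned short ddp_pack(unsigned hops, unsigned len)
{
        return htons(((hops & 0xf) << 10) | (len & 0x3ff));
}

static void ddp_unpack(unsigned short word, unsigned *hops, unsigned *len)
{
        unsigned host = ntohs(word);

        *hops = (host >> 10) & 0xf;     /* bits 13..10 */
        *len  = host & 0x3ff;           /* bits  9..0  */
}

int main(void)
{
        unsigned hops, len;
        unsigned short wire = ddp_pack(3, 586);

        ddp_unpack(wire, &hops, &len);
        printf("hops=%u len=%u\n", hops, len);  /* hops=3 len=586 */
        return 0;
}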
+ */ + *((__u16 *)ddp) = ntohs(*((__u16 *)ddp)); + + ddp->deh_dnet = usat->sat_addr.s_net; + ddp->deh_snet = at->src_net; + ddp->deh_dnode = usat->sat_addr.s_node; + ddp->deh_snode = at->src_node; + ddp->deh_dport = usat->sat_port; + ddp->deh_sport = at->src_port; + + SOCK_DEBUG(sk, "SK %p: Copy user data (%Zd bytes).\n", sk, len); + + err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); + if (err) { + kfree_skb(skb); + return -EFAULT; + } + + if (sk->sk_no_check == 1) + ddp->deh_sum = 0; + else + ddp->deh_sum = atalk_checksum(skb, len + sizeof(*ddp)); + + /* + * Loopback broadcast packets to non gateway targets (ie routes + * to group we are in) + */ + if (ddp->deh_dnode == ATADDR_BCAST && + !(rt->flags & RTF_GATEWAY) && !(dev->flags & IFF_LOOPBACK)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_KERNEL); + + if (skb2) { + loopback = 1; + SOCK_DEBUG(sk, "SK %p: send out(copy).\n", sk); + if (aarp_send_ddp(dev, skb2, + &usat->sat_addr, NULL) == -1) + kfree_skb(skb2); + /* else queued/sent above in the aarp queue */ + } + } + + if (dev->flags & IFF_LOOPBACK || loopback) { + SOCK_DEBUG(sk, "SK %p: Loop back.\n", sk); + /* loop back */ + skb_orphan(skb); + ddp_dl->request(ddp_dl, skb, dev->dev_addr); + } else { + SOCK_DEBUG(sk, "SK %p: send out.\n", sk); + if (rt->flags & RTF_GATEWAY) { + gsat.sat_addr = rt->gateway; + usat = &gsat; + } + + if (aarp_send_ddp(dev, skb, &usat->sat_addr, NULL) == -1) + kfree_skb(skb); + /* else queued/sent above in the aarp queue */ + } + SOCK_DEBUG(sk, "SK %p: Done write (%Zd).\n", sk, len); + + return len; +} + +static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_at *sat = (struct sockaddr_at *)msg->msg_name; + struct ddpehdr *ddp; + int copied = 0; + int err = 0; + struct ddpebits ddphv; + struct sk_buff *skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + /* FIXME: use skb->cb to be able to use shared skbs */ + ddp = ddp_hdr(skb); + *((__u16 *)&ddphv) = ntohs(*((__u16 *)ddp)); + + if (sk->sk_type == SOCK_RAW) { + copied = ddphv.deh_len; + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else { + copied = ddphv.deh_len - sizeof(*ddp); + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + err = skb_copy_datagram_iovec(skb, sizeof(*ddp), + msg->msg_iov, copied); + } + + if (!err) { + if (sat) { + sat->sat_family = AF_APPLETALK; + sat->sat_port = ddp->deh_sport; + sat->sat_addr.s_node = ddp->deh_snode; + sat->sat_addr.s_net = ddp->deh_snet; + } + msg->msg_namelen = sizeof(*sat); + } + + skb_free_datagram(sk, skb); /* Free the datagram. */ + return err ? : copied; +} + + +/* + * AppleTalk ioctl calls. 
+ */ +static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int rc = -EINVAL; + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + + switch (cmd) { + /* Protocol layer */ + case TIOCOUTQ: { + long amount = sk->sk_sndbuf - + atomic_read(&sk->sk_wmem_alloc); + + if (amount < 0) + amount = 0; + rc = put_user(amount, (int __user *)argp); + break; + } + case TIOCINQ: { + /* + * These two are safe on a single CPU system as only + * user tasks fiddle here + */ + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + long amount = 0; + + if (skb) + amount = skb->len - sizeof(struct ddpehdr); + rc = put_user(amount, (int __user *)argp); + break; + } + case SIOCGSTAMP: + rc = sock_get_timestamp(sk, argp); + break; + /* Routing */ + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (capable(CAP_NET_ADMIN)) + rc = atrtr_ioctl(cmd, argp); + break; + /* Interface */ + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCATALKDIFADDR: + case SIOCDIFADDR: + case SIOCSARP: /* proxy AARP */ + case SIOCDARP: /* proxy AARP */ + rtnl_lock(); + rc = atif_ioctl(cmd, argp); + rtnl_unlock(); + break; + /* Physical layer ioctl calls */ + case SIOCSIFLINK: + case SIOCGIFHWADDR: + case SIOCSIFHWADDR: + case SIOCGIFFLAGS: + case SIOCSIFFLAGS: + case SIOCGIFTXQLEN: + case SIOCSIFTXQLEN: + case SIOCGIFMTU: + case SIOCGIFCONF: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCGIFCOUNT: + case SIOCGIFINDEX: + case SIOCGIFNAME: + rc = dev_ioctl(cmd, argp); + break; + } + + return rc; +} + +static struct net_proto_family atalk_family_ops = { + .family = PF_APPLETALK, + .create = atalk_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { + .family = PF_APPLETALK, + .owner = THIS_MODULE, + .release = atalk_release, + .bind = atalk_bind, + .connect = atalk_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = atalk_getname, + .poll = datagram_poll, + .ioctl = atalk_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = atalk_sendmsg, + .recvmsg = atalk_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#include +SOCKOPS_WRAP(atalk_dgram, PF_APPLETALK); + +static struct notifier_block ddp_notifier = { + .notifier_call = ddp_device_event, +}; + +static struct packet_type ltalk_packet_type = { + .type = __constant_htons(ETH_P_LOCALTALK), + .func = ltalk_rcv, +}; + +static struct packet_type ppptalk_packet_type = { + .type = __constant_htons(ETH_P_PPPTALK), + .func = atalk_rcv, +}; + +static unsigned char ddp_snap_id[] = { 0x08, 0x00, 0x07, 0x80, 0x9B }; + +/* Export symbols for use by drivers when AppleTalk is a module */ +EXPORT_SYMBOL(aarp_send_ddp); +EXPORT_SYMBOL(atrtr_get_dev); +EXPORT_SYMBOL(atalk_find_dev_addr); + +static char atalk_err_snap[] __initdata = + KERN_CRIT "Unable to register DDP with SNAP.\n"; + +/* Called by proto.c on kernel start up */ +static int __init atalk_init(void) +{ + int rc = proto_register(&ddp_proto, 0); + + if (rc != 0) + goto out; + + (void)sock_register(&atalk_family_ops); + ddp_dl = register_snap_client(ddp_snap_id, atalk_rcv); + if (!ddp_dl) + printk(atalk_err_snap); + + dev_add_pack(<alk_packet_type); + dev_add_pack(&ppptalk_packet_type); + + register_netdevice_notifier(&ddp_notifier); + aarp_proto_init(); + atalk_proc_init(); + atalk_register_sysctl(); +out: + return rc; +} +module_init(atalk_init); + +/* + * No 
explicit module reference count manipulation is needed in the + * protocol. Socket layer sets module reference count for us + * and interfaces reference counting is done + * by the network device layer. + * + * Ergo, before the AppleTalk module can be removed, all AppleTalk + * sockets be closed from user space. + */ +static void __exit atalk_exit(void) +{ +#ifdef CONFIG_SYSCTL + atalk_unregister_sysctl(); +#endif /* CONFIG_SYSCTL */ + atalk_proc_exit(); + aarp_cleanup_module(); /* General aarp clean-up. */ + unregister_netdevice_notifier(&ddp_notifier); + dev_remove_pack(<alk_packet_type); + dev_remove_pack(&ppptalk_packet_type); + unregister_snap_client(ddp_dl); + sock_unregister(PF_APPLETALK); + proto_unregister(&ddp_proto); +} +module_exit(atalk_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alan Cox "); +MODULE_DESCRIPTION("AppleTalk 0.20\n"); +MODULE_ALIAS_NETPROTO(PF_APPLETALK); diff --git a/net/appletalk/dev.c b/net/appletalk/dev.c new file mode 100644 index 000000000000..76598445d84b --- /dev/null +++ b/net/appletalk/dev.c @@ -0,0 +1,43 @@ +/* + * Moved here from drivers/net/net_init.c, which is: + * Written 1993,1994,1995 by Donald Becker. + */ + +#include +#include +#include +#include +#include + +static int ltalk_change_mtu(struct net_device *dev, int mtu) +{ + return -EINVAL; +} + +static int ltalk_mac_addr(struct net_device *dev, void *addr) +{ + return -EINVAL; +} + +void ltalk_setup(struct net_device *dev) +{ + /* Fill in the fields of the device structure with localtalk-generic values. */ + + dev->change_mtu = ltalk_change_mtu; + dev->hard_header = NULL; + dev->rebuild_header = NULL; + dev->set_mac_address = ltalk_mac_addr; + dev->hard_header_cache = NULL; + dev->header_cache_update= NULL; + + dev->type = ARPHRD_LOCALTLK; + dev->hard_header_len = LTALK_HLEN; + dev->mtu = LTALK_MTU; + dev->addr_len = LTALK_ALEN; + dev->tx_queue_len = 10; + + dev->broadcast[0] = 0xFF; + + dev->flags = IFF_BROADCAST|IFF_MULTICAST|IFF_NOARP; +} +EXPORT_SYMBOL(ltalk_setup); diff --git a/net/appletalk/sysctl_net_atalk.c b/net/appletalk/sysctl_net_atalk.c new file mode 100644 index 000000000000..af7f0604395d --- /dev/null +++ b/net/appletalk/sysctl_net_atalk.c @@ -0,0 +1,83 @@ +/* + * sysctl_net_atalk.c: sysctl interface to net AppleTalk subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/atalk directory entry (empty =) ). [MS] + * Dynamic registration, added aarp entries. 
(5/30/97 Chris Horn) + */ + +#include +#include +#include +#include + +static struct ctl_table atalk_table[] = { + { + .ctl_name = NET_ATALK_AARP_EXPIRY_TIME, + .procname = "aarp-expiry-time", + .data = &sysctl_aarp_expiry_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_ATALK_AARP_TICK_TIME, + .procname = "aarp-tick-time", + .data = &sysctl_aarp_tick_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_ATALK_AARP_RETRANSMIT_LIMIT, + .procname = "aarp-retransmit-limit", + .data = &sysctl_aarp_retransmit_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_ATALK_AARP_RESOLVE_TIME, + .procname = "aarp-resolve-time", + .data = &sysctl_aarp_resolve_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { 0 }, +}; + +static struct ctl_table atalk_dir_table[] = { + { + .ctl_name = NET_ATALK, + .procname = "appletalk", + .mode = 0555, + .child = atalk_table, + }, + { 0 }, +}; + +static struct ctl_table atalk_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = atalk_dir_table, + }, + { 0 }, +}; + +static struct ctl_table_header *atalk_table_header; + +void atalk_register_sysctl(void) +{ + atalk_table_header = register_sysctl_table(atalk_root_table, 1); +} + +void atalk_unregister_sysctl(void) +{ + unregister_sysctl_table(atalk_table_header); +} diff --git a/net/atm/Makefile b/net/atm/Makefile new file mode 100644 index 000000000000..d5818751f6ba --- /dev/null +++ b/net/atm/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for the ATM Protocol Families. +# + +atm-y := addr.o pvc.o signaling.o svc.o ioctl.o common.o atm_misc.o raw.o resources.o +mpoa-objs := mpc.o mpoa_caches.o mpoa_proc.o + +obj-$(CONFIG_ATM) += atm.o +obj-$(CONFIG_ATM_CLIP) += clip.o +atm-$(subst m,y,$(CONFIG_ATM_CLIP)) += ipcommon.o +obj-$(CONFIG_ATM_BR2684) += br2684.o +atm-$(subst m,y,$(CONFIG_ATM_BR2684)) += ipcommon.o +atm-$(subst m,y,$(CONFIG_NET_SCH_ATM)) += ipcommon.o +atm-$(CONFIG_PROC_FS) += proc.o + +obj-$(CONFIG_ATM_LANE) += lec.o +obj-$(CONFIG_ATM_MPOA) += mpoa.o +obj-$(CONFIG_PPPOATM) += pppoatm.o diff --git a/net/atm/addr.c b/net/atm/addr.c new file mode 100644 index 000000000000..1c8867f7f54a --- /dev/null +++ b/net/atm/addr.c @@ -0,0 +1,134 @@ +/* net/atm/addr.c - Local ATM address registry */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +#include +#include +#include +#include + +#include "signaling.h" +#include "addr.h" + +static int check_addr(struct sockaddr_atmsvc *addr) +{ + int i; + + if (addr->sas_family != AF_ATMSVC) + return -EAFNOSUPPORT; + if (!*addr->sas_addr.pub) + return *addr->sas_addr.prv ? 
0 : -EINVAL; + for (i = 1; i < ATM_E164_LEN + 1; i++) /* make sure it's \0-terminated */ + if (!addr->sas_addr.pub[i]) + return 0; + return -EINVAL; +} + +static int identical(struct sockaddr_atmsvc *a, struct sockaddr_atmsvc *b) +{ + if (*a->sas_addr.prv) + if (memcmp(a->sas_addr.prv, b->sas_addr.prv, ATM_ESA_LEN)) + return 0; + if (!*a->sas_addr.pub) + return !*b->sas_addr.pub; + if (!*b->sas_addr.pub) + return 0; + return !strcmp(a->sas_addr.pub, b->sas_addr.pub); +} + +static void notify_sigd(struct atm_dev *dev) +{ + struct sockaddr_atmpvc pvc; + + pvc.sap_addr.itf = dev->number; + sigd_enq(NULL, as_itf_notify, NULL, &pvc, NULL); +} + +void atm_reset_addr(struct atm_dev *dev) +{ + unsigned long flags; + struct atm_dev_addr *this, *p; + + spin_lock_irqsave(&dev->lock, flags); + list_for_each_entry_safe(this, p, &dev->local, entry) + kfree(this); + spin_unlock_irqrestore(&dev->lock, flags); + notify_sigd(dev); +} + +int atm_add_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr) +{ + unsigned long flags; + struct atm_dev_addr *this; + int error; + + error = check_addr(addr); + if (error) + return error; + spin_lock_irqsave(&dev->lock, flags); + list_for_each_entry(this, &dev->local, entry) { + if (identical(&this->addr, addr)) { + spin_unlock_irqrestore(&dev->lock, flags); + return -EEXIST; + } + } + this = kmalloc(sizeof(struct atm_dev_addr), GFP_ATOMIC); + if (!this) { + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOMEM; + } + this->addr = *addr; + list_add(&this->entry, &dev->local); + spin_unlock_irqrestore(&dev->lock, flags); + notify_sigd(dev); + return 0; +} + +int atm_del_addr(struct atm_dev *dev, struct sockaddr_atmsvc *addr) +{ + unsigned long flags; + struct atm_dev_addr *this; + int error; + + error = check_addr(addr); + if (error) + return error; + spin_lock_irqsave(&dev->lock, flags); + list_for_each_entry(this, &dev->local, entry) { + if (identical(&this->addr, addr)) { + list_del(&this->entry); + spin_unlock_irqrestore(&dev->lock, flags); + kfree(this); + notify_sigd(dev); + return 0; + } + } + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOENT; +} + +int atm_get_addr(struct atm_dev *dev, struct sockaddr_atmsvc __user * buf, + size_t size) +{ + unsigned long flags; + struct atm_dev_addr *this; + int total = 0, error; + struct sockaddr_atmsvc *tmp_buf, *tmp_bufp; + + spin_lock_irqsave(&dev->lock, flags); + list_for_each_entry(this, &dev->local, entry) + total += sizeof(struct sockaddr_atmsvc); + tmp_buf = tmp_bufp = kmalloc(total, GFP_ATOMIC); + if (!tmp_buf) { + spin_unlock_irqrestore(&dev->lock, flags); + return -ENOMEM; + } + list_for_each_entry(this, &dev->local, entry) + memcpy(tmp_bufp++, &this->addr, sizeof(struct sockaddr_atmsvc)); + spin_unlock_irqrestore(&dev->lock, flags); + error = total > size ? -E2BIG : total; + if (copy_to_user(buf, tmp_buf, total < size ? 
total : size)) + error = -EFAULT; + kfree(tmp_buf); + return error; +} diff --git a/net/atm/addr.h b/net/atm/addr.h new file mode 100644 index 000000000000..3099d21feeaa --- /dev/null +++ b/net/atm/addr.h @@ -0,0 +1,18 @@ +/* net/atm/addr.h - Local ATM address registry */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_ADDR_H +#define NET_ATM_ADDR_H + +#include +#include + + +void atm_reset_addr(struct atm_dev *dev); +int atm_add_addr(struct atm_dev *dev,struct sockaddr_atmsvc *addr); +int atm_del_addr(struct atm_dev *dev,struct sockaddr_atmsvc *addr); +int atm_get_addr(struct atm_dev *dev,struct sockaddr_atmsvc __user *buf,size_t size); + +#endif diff --git a/net/atm/atm_misc.c b/net/atm/atm_misc.c new file mode 100644 index 000000000000..b2113c3454ae --- /dev/null +++ b/net/atm/atm_misc.c @@ -0,0 +1,106 @@ +/* net/atm/atm_misc.c - Various functions for use by ATM drivers */ + +/* Written 1995-2000 by Werner Almesberger, EPFL ICA */ + + +#include +#include +#include +#include +#include +#include +#include +#include + + +int atm_charge(struct atm_vcc *vcc,int truesize) +{ + atm_force_charge(vcc,truesize); + if (atomic_read(&sk_atm(vcc)->sk_rmem_alloc) <= sk_atm(vcc)->sk_rcvbuf) + return 1; + atm_return(vcc,truesize); + atomic_inc(&vcc->stats->rx_drop); + return 0; +} + + +struct sk_buff *atm_alloc_charge(struct atm_vcc *vcc,int pdu_size, + int gfp_flags) +{ + struct sock *sk = sk_atm(vcc); + int guess = atm_guess_pdu2truesize(pdu_size); + + atm_force_charge(vcc,guess); + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + struct sk_buff *skb = alloc_skb(pdu_size,gfp_flags); + + if (skb) { + atomic_add(skb->truesize-guess, + &sk->sk_rmem_alloc); + return skb; + } + } + atm_return(vcc,guess); + atomic_inc(&vcc->stats->rx_drop); + return NULL; +} + + +/* + * atm_pcr_goal returns the positive PCR if it should be rounded up, the + * negative PCR if it should be rounded down, and zero if the maximum available + * bandwidth should be used. 
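Ahead of the full decision table below, a self-contained sketch of how those three outcomes fall out of the min/max/pcr triple; pcr_goal() here mirrors the logic of atm_pcr_goal() shown further down, and ATM_MAX_PCR is used only as a local stand-in for the "maximum available" marker (the "*" entries in the table):

#include <stdio.h>

#define ATM_MAX_PCR (-1)        /* stand-in for the "maximum available" marker */

struct trafprm { int min_pcr, max_pcr, pcr; };

static int pcr_goal(const struct trafprm *tp)
{
        if (tp->pcr && tp->pcr != ATM_MAX_PCR)
                return -tp->pcr;        /* explicit pcr z: round down to z  */
        if (tp->min_pcr && !tp->pcr)
                return tp->min_pcr;     /* only a minimum x: round up to x  */
        if (tp->max_pcr != ATM_MAX_PCR)
                return -tp->max_pcr;    /* a maximum y: round down to y     */
        return 0;                       /* otherwise use all the bandwidth  */
}

int main(void)
{
        struct trafprm ubr    = { 0, ATM_MAX_PCR, 0 };      /* "- * -" row */
        struct trafprm capped = { 0, 150000, 0 };           /* "- y -" row */
        struct trafprm fixed  = { 0, ATM_MAX_PCR, 100000 }; /* "- * z" row */

        printf("%d %d %d\n", pcr_goal(&ubr), pcr_goal(&capped),
               pcr_goal(&fixed));       /* prints: 0 -150000 -100000 */
        return 0;
}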
+ * + * The rules are as follows (* = maximum, - = absent (0), x = value "x", + * (x+ = x or next value above x, x- = x or next value below): + * + * min max pcr result min max pcr result + * - - - * (UBR only) x - - x+ + * - - * * x - * * + * - - z z- x - z z- + * - * - * x * - x+ + * - * * * x * * * + * - * z z- x * z z- + * - y - y- x y - x+ + * - y * y- x y * y- + * - y z z- x y z z- + * + * All non-error cases can be converted with the following simple set of rules: + * + * if pcr == z then z- + * else if min == x && pcr == - then x+ + * else if max == y then y- + * else * + */ + + +int atm_pcr_goal(struct atm_trafprm *tp) +{ + if (tp->pcr && tp->pcr != ATM_MAX_PCR) return -tp->pcr; + if (tp->min_pcr && !tp->pcr) return tp->min_pcr; + if (tp->max_pcr != ATM_MAX_PCR) return -tp->max_pcr; + return 0; +} + + +void sonet_copy_stats(struct k_sonet_stats *from,struct sonet_stats *to) +{ +#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) + __SONET_ITEMS +#undef __HANDLE_ITEM +} + + +void sonet_subtract_stats(struct k_sonet_stats *from,struct sonet_stats *to) +{ +#define __HANDLE_ITEM(i) atomic_sub(to->i,&from->i) + __SONET_ITEMS +#undef __HANDLE_ITEM +} + + +EXPORT_SYMBOL(atm_charge); +EXPORT_SYMBOL(atm_alloc_charge); +EXPORT_SYMBOL(atm_pcr_goal); +EXPORT_SYMBOL(sonet_copy_stats); +EXPORT_SYMBOL(sonet_subtract_stats); diff --git a/net/atm/br2684.c b/net/atm/br2684.c new file mode 100644 index 000000000000..e6954cf1459d --- /dev/null +++ b/net/atm/br2684.c @@ -0,0 +1,824 @@ +/* +Experimental ethernet netdevice using ATM AAL5 as underlying carrier +(RFC1483 obsoleted by RFC2684) for Linux 2.4 +Author: Marcell GAL, 2000, XDSL Ltd, Hungary +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "common.h" +#include "ipcommon.h" + +/* + * Define this to use a version of the code which interacts with the higher + * layers in a more intellegent way, by always reserving enough space for + * our header at the begining of the packet. However, there may still be + * some problems with programs like tcpdump. In 2.5 we'll sort out what + * we need to do to get this perfect. For now we just will copy the packet + * if we need space for the header + */ +/* #define FASTER_VERSION */ + +#ifdef DEBUG +#define DPRINTK(format, args...) printk(KERN_DEBUG "br2684: " format, ##args) +#else +#define DPRINTK(format, args...) 
+#endif + +#ifdef SKB_DEBUG +static void skb_debug(const struct sk_buff *skb) +{ +#define NUM2PRINT 50 + char buf[NUM2PRINT * 3 + 1]; /* 3 chars per byte */ + int i = 0; + for (i = 0; i < skb->len && i < NUM2PRINT; i++) { + sprintf(buf + i * 3, "%2.2x ", 0xff & skb->data[i]); + } + printk(KERN_DEBUG "br2684: skb: %s\n", buf); +} +#else +#define skb_debug(skb) do {} while (0) +#endif + +static unsigned char llc_oui_pid_pad[] = + { 0xAA, 0xAA, 0x03, 0x00, 0x80, 0xC2, 0x00, 0x07, 0x00, 0x00 }; +#define PADLEN (2) + +enum br2684_encaps { + e_vc = BR2684_ENCAPS_VC, + e_llc = BR2684_ENCAPS_LLC, +}; + +struct br2684_vcc { + struct atm_vcc *atmvcc; + struct net_device *device; + /* keep old push,pop functions for chaining */ + void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb); + /* void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); */ + enum br2684_encaps encaps; + struct list_head brvccs; +#ifdef CONFIG_ATM_BR2684_IPFILTER + struct br2684_filter filter; +#endif /* CONFIG_ATM_BR2684_IPFILTER */ +#ifndef FASTER_VERSION + unsigned copies_needed, copies_failed; +#endif /* FASTER_VERSION */ +}; + +struct br2684_dev { + struct net_device *net_dev; + struct list_head br2684_devs; + int number; + struct list_head brvccs; /* one device <=> one vcc (before xmas) */ + struct net_device_stats stats; + int mac_was_set; +}; + +/* + * This lock should be held for writing any time the list of devices or + * their attached vcc's could be altered. It should be held for reading + * any time these are being queried. Note that we sometimes need to + * do read-locking under interrupt context, so write locking must block + * the current CPU's interrupts + */ +static DEFINE_RWLOCK(devs_lock); + +static LIST_HEAD(br2684_devs); + +static inline struct br2684_dev *BRPRIV(const struct net_device *net_dev) +{ + return (struct br2684_dev *) net_dev->priv; +} + +static inline struct net_device *list_entry_brdev(const struct list_head *le) +{ + return list_entry(le, struct br2684_dev, br2684_devs)->net_dev; +} + +static inline struct br2684_vcc *BR2684_VCC(const struct atm_vcc *atmvcc) +{ + return (struct br2684_vcc *) (atmvcc->user_back); +} + +static inline struct br2684_vcc *list_entry_brvcc(const struct list_head *le) +{ + return list_entry(le, struct br2684_vcc, brvccs); +} + +/* Caller should hold read_lock(&devs_lock) */ +static struct net_device *br2684_find_dev(const struct br2684_if_spec *s) +{ + struct list_head *lh; + struct net_device *net_dev; + switch (s->method) { + case BR2684_FIND_BYNUM: + list_for_each(lh, &br2684_devs) { + net_dev = list_entry_brdev(lh); + if (BRPRIV(net_dev)->number == s->spec.devnum) + return net_dev; + } + break; + case BR2684_FIND_BYIFNAME: + list_for_each(lh, &br2684_devs) { + net_dev = list_entry_brdev(lh); + if (!strncmp(net_dev->name, s->spec.ifname, IFNAMSIZ)) + return net_dev; + } + break; + } + return NULL; +} + +/* + * Send a packet out a particular vcc. Not to useful right now, but paves + * the way for multiple vcc's per itf. Returns true if we can send, + * otherwise false + */ +static int br2684_xmit_vcc(struct sk_buff *skb, struct br2684_dev *brdev, + struct br2684_vcc *brvcc) +{ + struct atm_vcc *atmvcc; +#ifdef FASTER_VERSION + if (brvcc->encaps == e_llc) + memcpy(skb_push(skb, 8), llc_oui_pid_pad, 8); + /* last 2 bytes of llc_oui_pid_pad are managed by header routines; + yes, you got it: 8 + 2 = sizeof(llc_oui_pid_pad) + */ +#else + int minheadroom = (brvcc->encaps == e_llc) ? 
10 : 2; + if (skb_headroom(skb) < minheadroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, minheadroom); + brvcc->copies_needed++; + dev_kfree_skb(skb); + if (skb2 == NULL) { + brvcc->copies_failed++; + return 0; + } + skb = skb2; + } + skb_push(skb, minheadroom); + if (brvcc->encaps == e_llc) + memcpy(skb->data, llc_oui_pid_pad, 10); + else + memset(skb->data, 0, 2); +#endif /* FASTER_VERSION */ + skb_debug(skb); + + ATM_SKB(skb)->vcc = atmvcc = brvcc->atmvcc; + DPRINTK("atm_skb(%p)->vcc(%p)->dev(%p)\n", skb, atmvcc, atmvcc->dev); + if (!atm_may_send(atmvcc, skb->truesize)) { + /* we free this here for now, because we cannot know in a higher + layer whether the skb point it supplied wasn't freed yet. + now, it always is. + */ + dev_kfree_skb(skb); + return 0; + } + atomic_add(skb->truesize, &sk_atm(atmvcc)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = atmvcc->atm_options; + brdev->stats.tx_packets++; + brdev->stats.tx_bytes += skb->len; + atmvcc->send(atmvcc, skb); + return 1; +} + +static inline struct br2684_vcc *pick_outgoing_vcc(struct sk_buff *skb, + struct br2684_dev *brdev) +{ + return list_empty(&brdev->brvccs) ? NULL : + list_entry_brvcc(brdev->brvccs.next); /* 1 vcc/dev right now */ +} + +static int br2684_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct br2684_dev *brdev = BRPRIV(dev); + struct br2684_vcc *brvcc; + + DPRINTK("br2684_start_xmit, skb->dst=%p\n", skb->dst); + read_lock(&devs_lock); + brvcc = pick_outgoing_vcc(skb, brdev); + if (brvcc == NULL) { + DPRINTK("no vcc attached to dev %s\n", dev->name); + brdev->stats.tx_errors++; + brdev->stats.tx_carrier_errors++; + /* netif_stop_queue(dev); */ + dev_kfree_skb(skb); + read_unlock(&devs_lock); + return -EUNATCH; + } + if (!br2684_xmit_vcc(skb, brdev, brvcc)) { + /* + * We should probably use netif_*_queue() here, but that + * involves added complication. We need to walk before + * we can run + */ + /* don't free here! this pointer might be no longer valid! + dev_kfree_skb(skb); + */ + brdev->stats.tx_errors++; + brdev->stats.tx_fifo_errors++; + } + read_unlock(&devs_lock); + return 0; +} + +static struct net_device_stats *br2684_get_stats(struct net_device *dev) +{ + DPRINTK("br2684_get_stats\n"); + return &BRPRIV(dev)->stats; +} + +#ifdef FASTER_VERSION +/* + * These mirror eth_header and eth_header_cache. They are not usually + * exported for use in modules, so we grab them from net_device + * after ether_setup() is done with it. Bit of a hack. + */ +static int (*my_eth_header)(struct sk_buff *, struct net_device *, + unsigned short, void *, void *, unsigned); +static int (*my_eth_header_cache)(struct neighbour *, struct hh_cache *); + +static int +br2684_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, void *daddr, void *saddr, unsigned len) +{ + u16 *pad_before_eth; + int t = my_eth_header(skb, dev, type, daddr, saddr, len); + if (t > 0) { + pad_before_eth = (u16 *) skb_push(skb, 2); + *pad_before_eth = 0; + return dev->hard_header_len; /* or return 16; ? */ + } else + return t; +} + +static int +br2684_header_cache(struct neighbour *neigh, struct hh_cache *hh) +{ +/* hh_data is 16 bytes long. 
if encaps is ether-llc we need 24, so +xmit will add the additional header part in that case */ + u16 *pad_before_eth = (u16 *)(hh->hh_data); + int t = my_eth_header_cache(neigh, hh); + DPRINTK("br2684_header_cache, neigh=%p, hh_cache=%p\n", neigh, hh); + if (t < 0) + return t; + else { + *pad_before_eth = 0; + hh->hh_len = PADLEN + ETH_HLEN; + } + return 0; +} + +/* + * This is similar to eth_type_trans, which cannot be used because of + * our dev->hard_header_len + */ +static inline unsigned short br_type_trans(struct sk_buff *skb, + struct net_device *dev) +{ + struct ethhdr *eth; + unsigned char *rawp; + eth = eth_hdr(skb); + + if (*eth->h_dest & 1) { + if (memcmp(eth->h_dest, dev->broadcast, ETH_ALEN) == 0) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } + + else if (memcmp(eth->h_dest, dev->dev_addr, ETH_ALEN)) + skb->pkt_type = PACKET_OTHERHOST; + + if (ntohs(eth->h_proto) >= 1536) + return eth->h_proto; + + rawp = skb->data; + + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. + */ + if (*(unsigned short *) rawp == 0xFFFF) + return htons(ETH_P_802_3); + + /* + * Real 802.2 LLC + */ + return htons(ETH_P_802_2); +} +#endif /* FASTER_VERSION */ + +/* + * We remember when the MAC gets set, so we don't override it later with + * the ESI of the ATM card of the first VC + */ +static int (*my_eth_mac_addr)(struct net_device *, void *); +static int br2684_mac_addr(struct net_device *dev, void *p) +{ + int err = my_eth_mac_addr(dev, p); + if (!err) + BRPRIV(dev)->mac_was_set = 1; + return err; +} + +#ifdef CONFIG_ATM_BR2684_IPFILTER +/* this IOCTL is experimental. */ +static int br2684_setfilt(struct atm_vcc *atmvcc, void __user *arg) +{ + struct br2684_vcc *brvcc; + struct br2684_filter_set fs; + + if (copy_from_user(&fs, arg, sizeof fs)) + return -EFAULT; + if (fs.ifspec.method != BR2684_FIND_BYNOTHING) { + /* + * This is really a per-vcc thing, but we can also search + * by device + */ + struct br2684_dev *brdev; + read_lock(&devs_lock); + brdev = BRPRIV(br2684_find_dev(&fs.ifspec)); + if (brdev == NULL || list_empty(&brdev->brvccs) || + brdev->brvccs.next != brdev->brvccs.prev) /* >1 VCC */ + brvcc = NULL; + else + brvcc = list_entry_brvcc(brdev->brvccs.next); + read_unlock(&devs_lock); + if (brvcc == NULL) + return -ESRCH; + } else + brvcc = BR2684_VCC(atmvcc); + memcpy(&brvcc->filter, &fs.filter, sizeof(brvcc->filter)); + return 0; +} + +/* Returns 1 if packet should be dropped */ +static inline int +packet_fails_filter(u16 type, struct br2684_vcc *brvcc, struct sk_buff *skb) +{ + if (brvcc->filter.netmask == 0) + return 0; /* no filter in place */ + if (type == __constant_htons(ETH_P_IP) && + (((struct iphdr *) (skb->data))->daddr & brvcc->filter. + netmask) == brvcc->filter.prefix) + return 0; + if (type == __constant_htons(ETH_P_ARP)) + return 0; + /* TODO: we should probably filter ARPs too.. don't want to have + * them returning values that don't make sense, or is that ok? + */ + return 1; /* drop */ +} +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + +static void br2684_close_vcc(struct br2684_vcc *brvcc) +{ + DPRINTK("removing VCC %p from dev %p\n", brvcc, brvcc->device); + write_lock_irq(&devs_lock); + list_del(&brvcc->brvccs); + write_unlock_irq(&devs_lock); + brvcc->atmvcc->user_back = NULL; /* what about vcc->recvq ??? 
*/ + brvcc->old_push(brvcc->atmvcc, NULL); /* pass on the bad news */ + kfree(brvcc); + module_put(THIS_MODULE); +} + +/* when AAL5 PDU comes in: */ +static void br2684_push(struct atm_vcc *atmvcc, struct sk_buff *skb) +{ + struct br2684_vcc *brvcc = BR2684_VCC(atmvcc); + struct net_device *net_dev = brvcc->device; + struct br2684_dev *brdev = BRPRIV(net_dev); + int plen = sizeof(llc_oui_pid_pad) + ETH_HLEN; + + DPRINTK("br2684_push\n"); + + if (unlikely(skb == NULL)) { + /* skb==NULL means VCC is being destroyed */ + br2684_close_vcc(brvcc); + if (list_empty(&brdev->brvccs)) { + read_lock(&devs_lock); + list_del(&brdev->br2684_devs); + read_unlock(&devs_lock); + unregister_netdev(net_dev); + free_netdev(net_dev); + } + return; + } + + skb_debug(skb); + atm_return(atmvcc, skb->truesize); + DPRINTK("skb from brdev %p\n", brdev); + if (brvcc->encaps == e_llc) { + /* let us waste some time for checking the encapsulation. + Note, that only 7 char is checked so frames with a valid FCS + are also accepted (but FCS is not checked of course) */ + if (memcmp(skb->data, llc_oui_pid_pad, 7)) { + brdev->stats.rx_errors++; + dev_kfree_skb(skb); + return; + } + + /* Strip FCS if present */ + if (skb->len > 7 && skb->data[7] == 0x01) + __skb_trim(skb, skb->len - 4); + } else { + plen = PADLEN + ETH_HLEN; /* pad, dstmac,srcmac, ethtype */ + /* first 2 chars should be 0 */ + if (*((u16 *) (skb->data)) != 0) { + brdev->stats.rx_errors++; + dev_kfree_skb(skb); + return; + } + } + if (skb->len < plen) { + brdev->stats.rx_errors++; + dev_kfree_skb(skb); /* dev_ not needed? */ + return; + } + +#ifdef FASTER_VERSION + /* FIXME: tcpdump shows that pointer to mac header is 2 bytes earlier, + than should be. What else should I set? */ + skb_pull(skb, plen); + skb->mac.raw = ((char *) (skb->data)) - ETH_HLEN; + skb->pkt_type = PACKET_HOST; +#ifdef CONFIG_BR2684_FAST_TRANS + skb->protocol = ((u16 *) skb->data)[-1]; +#else /* some protocols might require this: */ + skb->protocol = br_type_trans(skb, net_dev); +#endif /* CONFIG_BR2684_FAST_TRANS */ +#else + skb_pull(skb, plen - ETH_HLEN); + skb->protocol = eth_type_trans(skb, net_dev); +#endif /* FASTER_VERSION */ +#ifdef CONFIG_ATM_BR2684_IPFILTER + if (unlikely(packet_fails_filter(skb->protocol, brvcc, skb))) { + brdev->stats.rx_dropped++; + dev_kfree_skb(skb); + return; + } +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + skb->dev = net_dev; + ATM_SKB(skb)->vcc = atmvcc; /* needed ? 
*/ + DPRINTK("received packet's protocol: %x\n", ntohs(skb->protocol)); + skb_debug(skb); + if (unlikely(!(net_dev->flags & IFF_UP))) { + /* sigh, interface is down */ + brdev->stats.rx_dropped++; + dev_kfree_skb(skb); + return; + } + brdev->stats.rx_packets++; + brdev->stats.rx_bytes += skb->len; + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(skb); +} + +static int br2684_regvcc(struct atm_vcc *atmvcc, void __user *arg) +{ +/* assign a vcc to a dev +Note: we do not have explicit unassign, but look at _push() +*/ + int err; + struct br2684_vcc *brvcc; + struct sk_buff_head copy; + struct sk_buff *skb; + struct br2684_dev *brdev; + struct net_device *net_dev; + struct atm_backend_br2684 be; + + if (copy_from_user(&be, arg, sizeof be)) + return -EFAULT; + brvcc = kmalloc(sizeof(struct br2684_vcc), GFP_KERNEL); + if (!brvcc) + return -ENOMEM; + memset(brvcc, 0, sizeof(struct br2684_vcc)); + write_lock_irq(&devs_lock); + net_dev = br2684_find_dev(&be.ifspec); + if (net_dev == NULL) { + printk(KERN_ERR + "br2684: tried to attach to non-existant device\n"); + err = -ENXIO; + goto error; + } + brdev = BRPRIV(net_dev); + if (atmvcc->push == NULL) { + err = -EBADFD; + goto error; + } + if (!list_empty(&brdev->brvccs)) { + /* Only 1 VCC/dev right now */ + err = -EEXIST; + goto error; + } + if (be.fcs_in != BR2684_FCSIN_NO || be.fcs_out != BR2684_FCSOUT_NO || + be.fcs_auto || be.has_vpiid || be.send_padding || (be.encaps != + BR2684_ENCAPS_VC && be.encaps != BR2684_ENCAPS_LLC) || + be.min_size != 0) { + err = -EINVAL; + goto error; + } + DPRINTK("br2684_regvcc vcc=%p, encaps=%d, brvcc=%p\n", atmvcc, be.encaps, + brvcc); + if (list_empty(&brdev->brvccs) && !brdev->mac_was_set) { + unsigned char *esi = atmvcc->dev->esi; + if (esi[0] | esi[1] | esi[2] | esi[3] | esi[4] | esi[5]) + memcpy(net_dev->dev_addr, esi, net_dev->addr_len); + else + net_dev->dev_addr[2] = 1; + } + list_add(&brvcc->brvccs, &brdev->brvccs); + write_unlock_irq(&devs_lock); + brvcc->device = net_dev; + brvcc->atmvcc = atmvcc; + atmvcc->user_back = brvcc; + brvcc->encaps = (enum br2684_encaps) be.encaps; + brvcc->old_push = atmvcc->push; + barrier(); + atmvcc->push = br2684_push; + skb_queue_head_init(©); + skb_migrate(&sk_atm(atmvcc)->sk_receive_queue, ©); + while ((skb = skb_dequeue(©)) != NULL) { + BRPRIV(skb->dev)->stats.rx_bytes -= skb->len; + BRPRIV(skb->dev)->stats.rx_packets--; + br2684_push(atmvcc, skb); + } + __module_get(THIS_MODULE); + return 0; + error: + write_unlock_irq(&devs_lock); + kfree(brvcc); + return err; +} + +static void br2684_setup(struct net_device *netdev) +{ + struct br2684_dev *brdev = BRPRIV(netdev); + + ether_setup(netdev); + brdev->net_dev = netdev; + +#ifdef FASTER_VERSION + my_eth_header = netdev->hard_header; + netdev->hard_header = br2684_header; + my_eth_header_cache = netdev->hard_header_cache; + netdev->hard_header_cache = br2684_header_cache; + netdev->hard_header_len = sizeof(llc_oui_pid_pad) + ETH_HLEN; /* 10 + 14 */ +#endif + my_eth_mac_addr = netdev->set_mac_address; + netdev->set_mac_address = br2684_mac_addr; + netdev->hard_start_xmit = br2684_start_xmit; + netdev->get_stats = br2684_get_stats; + + INIT_LIST_HEAD(&brdev->brvccs); +} + +static int br2684_create(void __user *arg) +{ + int err; + struct net_device *netdev; + struct br2684_dev *brdev; + struct atm_newif_br2684 ni; + + DPRINTK("br2684_create\n"); + + if (copy_from_user(&ni, arg, sizeof ni)) { + return -EFAULT; + } + if (ni.media != BR2684_MEDIA_ETHERNET || ni.mtu != 1500) { + return -EINVAL; + } + + netdev 
= alloc_netdev(sizeof(struct br2684_dev), + ni.ifname[0] ? ni.ifname : "nas%d", + br2684_setup); + if (!netdev) + return -ENOMEM; + + brdev = BRPRIV(netdev); + + DPRINTK("registered netdev %s\n", netdev->name); + /* open, stop, do_ioctl ? */ + err = register_netdev(netdev); + if (err < 0) { + printk(KERN_ERR "br2684_create: register_netdev failed\n"); + free_netdev(netdev); + return err; + } + + write_lock_irq(&devs_lock); + brdev->number = list_empty(&br2684_devs) ? 1 : + BRPRIV(list_entry_brdev(br2684_devs.prev))->number + 1; + list_add_tail(&brdev->br2684_devs, &br2684_devs); + write_unlock_irq(&devs_lock); + return 0; +} + +/* + * This handles ioctls actually performed on our vcc - we must return + * -ENOIOCTLCMD for any unrecognized ioctl + */ +static int br2684_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct atm_vcc *atmvcc = ATM_SD(sock); + void __user *argp = (void __user *)arg; + + int err; + switch(cmd) { + case ATM_SETBACKEND: + case ATM_NEWBACKENDIF: { + atm_backend_t b; + err = get_user(b, (atm_backend_t __user *) argp); + if (err) + return -EFAULT; + if (b != ATM_BACKEND_BR2684) + return -ENOIOCTLCMD; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (cmd == ATM_SETBACKEND) + return br2684_regvcc(atmvcc, argp); + else + return br2684_create(argp); + } +#ifdef CONFIG_ATM_BR2684_IPFILTER + case BR2684_SETFILT: + if (atmvcc->push != br2684_push) + return -ENOIOCTLCMD; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = br2684_setfilt(atmvcc, argp); + return err; +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + } + return -ENOIOCTLCMD; +} + +static struct atm_ioctl br2684_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = br2684_ioctl, +}; + + +#ifdef CONFIG_PROC_FS +static void *br2684_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t offs = 0; + struct br2684_dev *brd; + + read_lock(&devs_lock); + + list_for_each_entry(brd, &br2684_devs, br2684_devs) { + if (offs == *pos) + return brd; + ++offs; + } + return NULL; +} + +static void *br2684_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct br2684_dev *brd = v; + + ++*pos; + + brd = list_entry(brd->br2684_devs.next, + struct br2684_dev, br2684_devs); + return (&brd->br2684_devs != &br2684_devs) ? brd : NULL; +} + +static void br2684_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&devs_lock); +} + +static int br2684_seq_show(struct seq_file *seq, void *v) +{ + const struct br2684_dev *brdev = v; + const struct net_device *net_dev = brdev->net_dev; + const struct br2684_vcc *brvcc; + + seq_printf(seq, "dev %.16s: num=%d, mac=%02X:%02X:" + "%02X:%02X:%02X:%02X (%s)\n", net_dev->name, + brdev->number, + net_dev->dev_addr[0], + net_dev->dev_addr[1], + net_dev->dev_addr[2], + net_dev->dev_addr[3], + net_dev->dev_addr[4], + net_dev->dev_addr[5], + brdev->mac_was_set ? "set" : "auto"); + + list_for_each_entry(brvcc, &brdev->brvccs, brvccs) { + seq_printf(seq, " vcc %d.%d.%d: encaps=%s" +#ifndef FASTER_VERSION + ", failed copies %u/%u" +#endif /* FASTER_VERSION */ + "\n", brvcc->atmvcc->dev->number, + brvcc->atmvcc->vpi, brvcc->atmvcc->vci, + (brvcc->encaps == e_llc) ? 
"LLC" : "VC" +#ifndef FASTER_VERSION + , brvcc->copies_failed + , brvcc->copies_needed +#endif /* FASTER_VERSION */ + ); +#ifdef CONFIG_ATM_BR2684_IPFILTER +#define b1(var, byte) ((u8 *) &brvcc->filter.var)[byte] +#define bs(var) b1(var, 0), b1(var, 1), b1(var, 2), b1(var, 3) + if (brvcc->filter.netmask != 0) + seq_printf(seq, " filter=%d.%d.%d.%d/" + "%d.%d.%d.%d\n", + bs(prefix), bs(netmask)); +#undef bs +#undef b1 +#endif /* CONFIG_ATM_BR2684_IPFILTER */ + } + return 0; +} + +static struct seq_operations br2684_seq_ops = { + .start = br2684_seq_start, + .next = br2684_seq_next, + .stop = br2684_seq_stop, + .show = br2684_seq_show, +}; + +static int br2684_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &br2684_seq_ops); +} + +static struct file_operations br2684_proc_ops = { + .owner = THIS_MODULE, + .open = br2684_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +extern struct proc_dir_entry *atm_proc_root; /* from proc.c */ +#endif + +static int __init br2684_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *p; + if ((p = create_proc_entry("br2684", 0, atm_proc_root)) == NULL) + return -ENOMEM; + p->proc_fops = &br2684_proc_ops; +#endif + register_atm_ioctl(&br2684_ioctl_ops); + return 0; +} + +static void __exit br2684_exit(void) +{ + struct net_device *net_dev; + struct br2684_dev *brdev; + struct br2684_vcc *brvcc; + deregister_atm_ioctl(&br2684_ioctl_ops); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("br2684", atm_proc_root); +#endif + + while (!list_empty(&br2684_devs)) { + net_dev = list_entry_brdev(br2684_devs.next); + brdev = BRPRIV(net_dev); + while (!list_empty(&brdev->brvccs)) { + brvcc = list_entry_brvcc(brdev->brvccs.next); + br2684_close_vcc(brvcc); + } + + list_del(&brdev->br2684_devs); + unregister_netdev(net_dev); + free_netdev(net_dev); + } +} + +module_init(br2684_init); +module_exit(br2684_exit); + +MODULE_AUTHOR("Marcell GAL"); +MODULE_DESCRIPTION("RFC2684 bridged protocols over ATM/AAL5"); +MODULE_LICENSE("GPL"); diff --git a/net/atm/clip.c b/net/atm/clip.c new file mode 100644 index 000000000000..28dab55a4387 --- /dev/null +++ b/net/atm/clip.c @@ -0,0 +1,1045 @@ +/* net/atm/clip.c - RFC1577 Classical IP over ATM */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include +#include +#include +#include /* for UINT_MAX */ +#include +#include +#include +#include +#include +#include +#include /* for some manifest constants */ +#include +#include +#include +#include +#include +#include /* for net/route.h */ +#include /* for struct sockaddr_in */ +#include /* for IFF_UP */ +#include +#include +#include +#include +#include +#include +#include /* for struct rtable and routing */ +#include /* icmp_send */ +#include /* for HZ */ +#include /* for htons etc. */ +#include /* save/restore_flags */ +#include +#include + +#include "common.h" +#include "resources.h" +#include "ipcommon.h" +#include + + +#if 0 +#define DPRINTK(format,args...) printk(format,##args) +#else +#define DPRINTK(format,args...) 
+#endif + + +static struct net_device *clip_devs; +static struct atm_vcc *atmarpd; +static struct neigh_table clip_tbl; +static struct timer_list idle_timer; +static int start_timer = 1; + + +static int to_atmarpd(enum atmarp_ctrl_type type,int itf,unsigned long ip) +{ + struct sock *sk; + struct atmarp_ctrl *ctrl; + struct sk_buff *skb; + + DPRINTK("to_atmarpd(%d)\n",type); + if (!atmarpd) return -EUNATCH; + skb = alloc_skb(sizeof(struct atmarp_ctrl),GFP_ATOMIC); + if (!skb) return -ENOMEM; + ctrl = (struct atmarp_ctrl *) skb_put(skb,sizeof(struct atmarp_ctrl)); + ctrl->type = type; + ctrl->itf_num = itf; + ctrl->ip = ip; + atm_force_charge(atmarpd,skb->truesize); + + sk = sk_atm(atmarpd); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + return 0; +} + + +static void link_vcc(struct clip_vcc *clip_vcc,struct atmarp_entry *entry) +{ + DPRINTK("link_vcc %p to entry %p (neigh %p)\n",clip_vcc,entry, + entry->neigh); + clip_vcc->entry = entry; + clip_vcc->xoff = 0; /* @@@ may overrun buffer by one packet */ + clip_vcc->next = entry->vccs; + entry->vccs = clip_vcc; + entry->neigh->used = jiffies; +} + + +static void unlink_clip_vcc(struct clip_vcc *clip_vcc) +{ + struct atmarp_entry *entry = clip_vcc->entry; + struct clip_vcc **walk; + + if (!entry) { + printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n",clip_vcc); + return; + } + spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */ + entry->neigh->used = jiffies; + for (walk = &entry->vccs; *walk; walk = &(*walk)->next) + if (*walk == clip_vcc) { + int error; + + *walk = clip_vcc->next; /* atomic */ + clip_vcc->entry = NULL; + if (clip_vcc->xoff) + netif_wake_queue(entry->neigh->dev); + if (entry->vccs) + goto out; + entry->expires = jiffies-1; + /* force resolution or expiration */ + error = neigh_update(entry->neigh, NULL, NUD_NONE, + NEIGH_UPDATE_F_ADMIN); + if (error) + printk(KERN_CRIT "unlink_clip_vcc: " + "neigh_update failed with %d\n",error); + goto out; + } + printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc " + "0x%p)\n",entry,clip_vcc); +out: + spin_unlock_bh(&entry->neigh->dev->xmit_lock); +} + +/* The neighbour entry n->lock is held. 
*/ +static int neigh_check_cb(struct neighbour *n) +{ + struct atmarp_entry *entry = NEIGH2ENTRY(n); + struct clip_vcc *cv; + + for (cv = entry->vccs; cv; cv = cv->next) { + unsigned long exp = cv->last_use + cv->idle_timeout; + + if (cv->idle_timeout && time_after(jiffies, exp)) { + DPRINTK("releasing vcc %p->%p of entry %p\n", + cv, cv->vcc, entry); + vcc_release_async(cv->vcc, -ETIMEDOUT); + } + } + + if (entry->vccs || time_before(jiffies, entry->expires)) + return 0; + + if (atomic_read(&n->refcnt) > 1) { + struct sk_buff *skb; + + DPRINTK("destruction postponed with ref %d\n", + atomic_read(&n->refcnt)); + + while ((skb = skb_dequeue(&n->arp_queue)) != NULL) + dev_kfree_skb(skb); + + return 0; + } + + DPRINTK("expired neigh %p\n",n); + return 1; +} + +static void idle_timer_check(unsigned long dummy) +{ + write_lock(&clip_tbl.lock); + __neigh_for_each_release(&clip_tbl, neigh_check_cb); + mod_timer(&idle_timer, jiffies+CLIP_CHECK_INTERVAL*HZ); + write_unlock(&clip_tbl.lock); +} + +static int clip_arp_rcv(struct sk_buff *skb) +{ + struct atm_vcc *vcc; + + DPRINTK("clip_arp_rcv\n"); + vcc = ATM_SKB(skb)->vcc; + if (!vcc || !atm_charge(vcc,skb->truesize)) { + dev_kfree_skb_any(skb); + return 0; + } + DPRINTK("pushing to %p\n",vcc); + DPRINTK("using %p\n",CLIP_VCC(vcc)->old_push); + CLIP_VCC(vcc)->old_push(vcc,skb); + return 0; +} + +static const unsigned char llc_oui[] = { + 0xaa, /* DSAP: non-ISO */ + 0xaa, /* SSAP: non-ISO */ + 0x03, /* Ctrl: Unnumbered Information Command PDU */ + 0x00, /* OUI: EtherType */ + 0x00, + 0x00 }; + +static void clip_push(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct clip_vcc *clip_vcc = CLIP_VCC(vcc); + + DPRINTK("clip push\n"); + if (!skb) { + DPRINTK("removing VCC %p\n",clip_vcc); + if (clip_vcc->entry) unlink_clip_vcc(clip_vcc); + clip_vcc->old_push(vcc,NULL); /* pass on the bad news */ + kfree(clip_vcc); + return; + } + atm_return(vcc,skb->truesize); + skb->dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : clip_devs; + /* clip_vcc->entry == NULL if we don't have an IP address yet */ + if (!skb->dev) { + dev_kfree_skb_any(skb); + return; + } + ATM_SKB(skb)->vcc = vcc; + skb->mac.raw = skb->data; + if (!clip_vcc->encap || skb->len < RFC1483LLC_LEN || memcmp(skb->data, + llc_oui,sizeof(llc_oui))) skb->protocol = htons(ETH_P_IP); + else { + skb->protocol = ((u16 *) skb->data)[3]; + skb_pull(skb,RFC1483LLC_LEN); + if (skb->protocol == htons(ETH_P_ARP)) { + PRIV(skb->dev)->stats.rx_packets++; + PRIV(skb->dev)->stats.rx_bytes += skb->len; + clip_arp_rcv(skb); + return; + } + } + clip_vcc->last_use = jiffies; + PRIV(skb->dev)->stats.rx_packets++; + PRIV(skb->dev)->stats.rx_bytes += skb->len; + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(skb); +} + + +/* + * Note: these spinlocks _must_not_ block on non-SMP. The only goal is that + * clip_pop is atomic with respect to the critical section in clip_start_xmit. 
+ */ + + +static void clip_pop(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct clip_vcc *clip_vcc = CLIP_VCC(vcc); + struct net_device *dev = skb->dev; + int old; + unsigned long flags; + + DPRINTK("clip_pop(vcc %p)\n",vcc); + clip_vcc->old_pop(vcc,skb); + /* skb->dev == NULL in outbound ARP packets */ + if (!dev) return; + spin_lock_irqsave(&PRIV(dev)->xoff_lock,flags); + if (atm_may_send(vcc,0)) { + old = xchg(&clip_vcc->xoff,0); + if (old) netif_wake_queue(dev); + } + spin_unlock_irqrestore(&PRIV(dev)->xoff_lock,flags); +} + + +static void clip_neigh_destroy(struct neighbour *neigh) +{ + DPRINTK("clip_neigh_destroy (neigh %p)\n",neigh); + if (NEIGH2ENTRY(neigh)->vccs) + printk(KERN_CRIT "clip_neigh_destroy: vccs != NULL !!!\n"); + NEIGH2ENTRY(neigh)->vccs = (void *) 0xdeadbeef; +} + + +static void clip_neigh_solicit(struct neighbour *neigh,struct sk_buff *skb) +{ + DPRINTK("clip_neigh_solicit (neigh %p, skb %p)\n",neigh,skb); + to_atmarpd(act_need,PRIV(neigh->dev)->number,NEIGH2ENTRY(neigh)->ip); +} + + +static void clip_neigh_error(struct neighbour *neigh,struct sk_buff *skb) +{ +#ifndef CONFIG_ATM_CLIP_NO_ICMP + icmp_send(skb,ICMP_DEST_UNREACH,ICMP_HOST_UNREACH,0); +#endif + kfree_skb(skb); +} + + +static struct neigh_ops clip_neigh_ops = { + .family = AF_INET, + .destructor = clip_neigh_destroy, + .solicit = clip_neigh_solicit, + .error_report = clip_neigh_error, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + + +static int clip_constructor(struct neighbour *neigh) +{ + struct atmarp_entry *entry = NEIGH2ENTRY(neigh); + struct net_device *dev = neigh->dev; + struct in_device *in_dev; + struct neigh_parms *parms; + + DPRINTK("clip_constructor (neigh %p, entry %p)\n",neigh,entry); + neigh->type = inet_addr_type(entry->ip); + if (neigh->type != RTN_UNICAST) return -EINVAL; + + rcu_read_lock(); + in_dev = rcu_dereference(__in_dev_get(dev)); + if (!in_dev) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in_dev->arp_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + neigh->ops = &clip_neigh_ops; + neigh->output = neigh->nud_state & NUD_VALID ? + neigh->ops->connected_output : neigh->ops->output; + entry->neigh = neigh; + entry->vccs = NULL; + entry->expires = jiffies-1; + return 0; +} + +static u32 clip_hash(const void *pkey, const struct net_device *dev) +{ + return jhash_2words(*(u32 *)pkey, dev->ifindex, clip_tbl.hash_rnd); +} + +static struct neigh_table clip_tbl = { + .family = AF_INET, + .entry_size = sizeof(struct neighbour)+sizeof(struct atmarp_entry), + .key_len = 4, + .hash = clip_hash, + .constructor = clip_constructor, + .id = "clip_arp_cache", + + /* parameters are copied from ARP ... */ + .parms = { + .tbl = &clip_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + + +/* @@@ copy bh locking from arp.c -- need to bh-enable atm code before */ + +/* + * We play with the resolve flag: 0 and 1 have the usual meaning, but -1 means + * to allocate the neighbour entry but not to ask atmarpd for resolution. Also, + * don't increment the usage count. 
This is used to create entries in + * clip_setentry. + */ + + +static int clip_encap(struct atm_vcc *vcc,int mode) +{ + CLIP_VCC(vcc)->encap = mode; + return 0; +} + + +static int clip_start_xmit(struct sk_buff *skb,struct net_device *dev) +{ + struct clip_priv *clip_priv = PRIV(dev); + struct atmarp_entry *entry; + struct atm_vcc *vcc; + int old; + unsigned long flags; + + DPRINTK("clip_start_xmit (skb %p)\n",skb); + if (!skb->dst) { + printk(KERN_ERR "clip_start_xmit: skb->dst == NULL\n"); + dev_kfree_skb(skb); + clip_priv->stats.tx_dropped++; + return 0; + } + if (!skb->dst->neighbour) { +#if 0 + skb->dst->neighbour = clip_find_neighbour(skb->dst,1); + if (!skb->dst->neighbour) { + dev_kfree_skb(skb); /* lost that one */ + clip_priv->stats.tx_dropped++; + return 0; + } +#endif + printk(KERN_ERR "clip_start_xmit: NO NEIGHBOUR !\n"); + dev_kfree_skb(skb); + clip_priv->stats.tx_dropped++; + return 0; + } + entry = NEIGH2ENTRY(skb->dst->neighbour); + if (!entry->vccs) { + if (time_after(jiffies, entry->expires)) { + /* should be resolved */ + entry->expires = jiffies+ATMARP_RETRY_DELAY*HZ; + to_atmarpd(act_need,PRIV(dev)->number,entry->ip); + } + if (entry->neigh->arp_queue.qlen < ATMARP_MAX_UNRES_PACKETS) + skb_queue_tail(&entry->neigh->arp_queue,skb); + else { + dev_kfree_skb(skb); + clip_priv->stats.tx_dropped++; + } + return 0; + } + DPRINTK("neigh %p, vccs %p\n",entry,entry->vccs); + ATM_SKB(skb)->vcc = vcc = entry->vccs->vcc; + DPRINTK("using neighbour %p, vcc %p\n",skb->dst->neighbour,vcc); + if (entry->vccs->encap) { + void *here; + + here = skb_push(skb,RFC1483LLC_LEN); + memcpy(here,llc_oui,sizeof(llc_oui)); + ((u16 *) here)[3] = skb->protocol; + } + atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = vcc->atm_options; + entry->vccs->last_use = jiffies; + DPRINTK("atm_skb(%p)->vcc(%p)->dev(%p)\n",skb,vcc,vcc->dev); + old = xchg(&entry->vccs->xoff,1); /* assume XOFF ... */ + if (old) { + printk(KERN_WARNING "clip_start_xmit: XOFF->XOFF transition\n"); + return 0; + } + clip_priv->stats.tx_packets++; + clip_priv->stats.tx_bytes += skb->len; + (void) vcc->send(vcc,skb); + if (atm_may_send(vcc,0)) { + entry->vccs->xoff = 0; + return 0; + } + spin_lock_irqsave(&clip_priv->xoff_lock,flags); + netif_stop_queue(dev); /* XOFF -> throttle immediately */ + barrier(); + if (!entry->vccs->xoff) + netif_start_queue(dev); + /* Oh, we just raced with clip_pop. netif_start_queue should be + good enough, because nothing should really be asleep because + of the brief netif_stop_queue. If this isn't true or if it + changes, use netif_wake_queue instead. 
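+ The barrier() above is just a compiler barrier: it keeps the xoff
+ test from being reordered before the netif_stop_queue() call.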
*/ + spin_unlock_irqrestore(&clip_priv->xoff_lock,flags); + return 0; +} + + +static struct net_device_stats *clip_get_stats(struct net_device *dev) +{ + return &PRIV(dev)->stats; +} + + +static int clip_mkip(struct atm_vcc *vcc,int timeout) +{ + struct clip_vcc *clip_vcc; + struct sk_buff_head copy; + struct sk_buff *skb; + + if (!vcc->push) return -EBADFD; + clip_vcc = kmalloc(sizeof(struct clip_vcc),GFP_KERNEL); + if (!clip_vcc) return -ENOMEM; + DPRINTK("mkip clip_vcc %p vcc %p\n",clip_vcc,vcc); + clip_vcc->vcc = vcc; + vcc->user_back = clip_vcc; + set_bit(ATM_VF_IS_CLIP, &vcc->flags); + clip_vcc->entry = NULL; + clip_vcc->xoff = 0; + clip_vcc->encap = 1; + clip_vcc->last_use = jiffies; + clip_vcc->idle_timeout = timeout*HZ; + clip_vcc->old_push = vcc->push; + clip_vcc->old_pop = vcc->pop; + vcc->push = clip_push; + vcc->pop = clip_pop; + skb_queue_head_init(©); + skb_migrate(&sk_atm(vcc)->sk_receive_queue, ©); + /* re-process everything received between connection setup and MKIP */ + while ((skb = skb_dequeue(©)) != NULL) + if (!clip_devs) { + atm_return(vcc,skb->truesize); + kfree_skb(skb); + } + else { + unsigned int len = skb->len; + + clip_push(vcc,skb); + PRIV(skb->dev)->stats.rx_packets--; + PRIV(skb->dev)->stats.rx_bytes -= len; + } + return 0; +} + + +static int clip_setentry(struct atm_vcc *vcc,u32 ip) +{ + struct neighbour *neigh; + struct atmarp_entry *entry; + int error; + struct clip_vcc *clip_vcc; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, .tos = 1 } } }; + struct rtable *rt; + + if (vcc->push != clip_push) { + printk(KERN_WARNING "clip_setentry: non-CLIP VCC\n"); + return -EBADF; + } + clip_vcc = CLIP_VCC(vcc); + if (!ip) { + if (!clip_vcc->entry) { + printk(KERN_ERR "hiding hidden ATMARP entry\n"); + return 0; + } + DPRINTK("setentry: remove\n"); + unlink_clip_vcc(clip_vcc); + return 0; + } + error = ip_route_output_key(&rt,&fl); + if (error) return error; + neigh = __neigh_lookup(&clip_tbl,&ip,rt->u.dst.dev,1); + ip_rt_put(rt); + if (!neigh) + return -ENOMEM; + entry = NEIGH2ENTRY(neigh); + if (entry != clip_vcc->entry) { + if (!clip_vcc->entry) DPRINTK("setentry: add\n"); + else { + DPRINTK("setentry: update\n"); + unlink_clip_vcc(clip_vcc); + } + link_vcc(clip_vcc,entry); + } + error = neigh_update(neigh, llc_oui, NUD_PERMANENT, + NEIGH_UPDATE_F_OVERRIDE|NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + return error; +} + + +static void clip_setup(struct net_device *dev) +{ + dev->hard_start_xmit = clip_start_xmit; + /* sg_xmit ... */ + dev->get_stats = clip_get_stats; + dev->type = ARPHRD_ATM; + dev->hard_header_len = RFC1483LLC_LEN; + dev->mtu = RFC1626_MTU; + dev->tx_queue_len = 100; /* "normal" queue (packets) */ + /* When using a "real" qdisc, the qdisc determines the queue */ + /* length. tx_queue_len is only used for the default case, */ + /* without any more elaborate queuing. 100 is a reasonable */ + /* compromise between decent burst-tolerance and protection */ + /* against memory hogs. 
*/ +} + + +static int clip_create(int number) +{ + struct net_device *dev; + struct clip_priv *clip_priv; + int error; + + if (number != -1) { + for (dev = clip_devs; dev; dev = PRIV(dev)->next) + if (PRIV(dev)->number == number) return -EEXIST; + } + else { + number = 0; + for (dev = clip_devs; dev; dev = PRIV(dev)->next) + if (PRIV(dev)->number >= number) + number = PRIV(dev)->number+1; + } + dev = alloc_netdev(sizeof(struct clip_priv), "", clip_setup); + if (!dev) + return -ENOMEM; + clip_priv = PRIV(dev); + sprintf(dev->name,"atm%d",number); + spin_lock_init(&clip_priv->xoff_lock); + clip_priv->number = number; + error = register_netdev(dev); + if (error) { + free_netdev(dev); + return error; + } + clip_priv->next = clip_devs; + clip_devs = dev; + DPRINTK("registered (net:%s)\n",dev->name); + return number; +} + + +static int clip_device_event(struct notifier_block *this,unsigned long event, + void *dev) +{ + /* ignore non-CLIP devices */ + if (((struct net_device *) dev)->type != ARPHRD_ATM || + ((struct net_device *) dev)->hard_start_xmit != clip_start_xmit) + return NOTIFY_DONE; + switch (event) { + case NETDEV_UP: + DPRINTK("clip_device_event NETDEV_UP\n"); + (void) to_atmarpd(act_up,PRIV(dev)->number,0); + break; + case NETDEV_GOING_DOWN: + DPRINTK("clip_device_event NETDEV_DOWN\n"); + (void) to_atmarpd(act_down,PRIV(dev)->number,0); + break; + case NETDEV_CHANGE: + case NETDEV_CHANGEMTU: + DPRINTK("clip_device_event NETDEV_CHANGE*\n"); + (void) to_atmarpd(act_change,PRIV(dev)->number,0); + break; + case NETDEV_REBOOT: + case NETDEV_REGISTER: + case NETDEV_DOWN: + DPRINTK("clip_device_event %ld\n",event); + /* ignore */ + break; + default: + printk(KERN_WARNING "clip_device_event: unknown event " + "%ld\n",event); + break; + } + return NOTIFY_DONE; +} + + +static int clip_inet_event(struct notifier_block *this,unsigned long event, + void *ifa) +{ + struct in_device *in_dev; + + in_dev = ((struct in_ifaddr *) ifa)->ifa_dev; + if (!in_dev || !in_dev->dev) { + printk(KERN_WARNING "clip_inet_event: no device\n"); + return NOTIFY_DONE; + } + /* + * Transitions are of the down-change-up type, so it's sufficient to + * handle the change on up. 
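+ * A change therefore always ends with a NETDEV_UP event, so reacting
+ * to that and passing it on as a NETDEV_CHANGE to clip_device_event()
+ * (and thus to atmarpd) is enough.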
+ */ + if (event != NETDEV_UP) return NOTIFY_DONE; + return clip_device_event(this,NETDEV_CHANGE,in_dev->dev); +} + + +static struct notifier_block clip_dev_notifier = { + clip_device_event, + NULL, + 0 +}; + + + +static struct notifier_block clip_inet_notifier = { + clip_inet_event, + NULL, + 0 +}; + + + +static void atmarpd_close(struct atm_vcc *vcc) +{ + DPRINTK("atmarpd_close\n"); + atmarpd = NULL; /* assumed to be atomic */ + barrier(); + unregister_inetaddr_notifier(&clip_inet_notifier); + unregister_netdevice_notifier(&clip_dev_notifier); + if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) + printk(KERN_ERR "atmarpd_close: closing with requests " + "pending\n"); + skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); + DPRINTK("(done)\n"); + module_put(THIS_MODULE); +} + + +static struct atmdev_ops atmarpd_dev_ops = { + .close = atmarpd_close +}; + + +static struct atm_dev atmarpd_dev = { + .ops = &atmarpd_dev_ops, + .type = "arpd", + .number = 999, + .lock = SPIN_LOCK_UNLOCKED +}; + + +static int atm_init_atmarp(struct atm_vcc *vcc) +{ + if (atmarpd) return -EADDRINUSE; + if (start_timer) { + start_timer = 0; + init_timer(&idle_timer); + idle_timer.expires = jiffies+CLIP_CHECK_INTERVAL*HZ; + idle_timer.function = idle_timer_check; + add_timer(&idle_timer); + } + atmarpd = vcc; + set_bit(ATM_VF_META,&vcc->flags); + set_bit(ATM_VF_READY,&vcc->flags); + /* allow replies and avoid getting closed if signaling dies */ + vcc->dev = &atmarpd_dev; + vcc_insert_socket(sk_atm(vcc)); + vcc->push = NULL; + vcc->pop = NULL; /* crash */ + vcc->push_oam = NULL; /* crash */ + if (register_netdevice_notifier(&clip_dev_notifier)) + printk(KERN_ERR "register_netdevice_notifier failed\n"); + if (register_inetaddr_notifier(&clip_inet_notifier)) + printk(KERN_ERR "register_inetaddr_notifier failed\n"); + return 0; +} + +static int clip_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct atm_vcc *vcc = ATM_SD(sock); + int err = 0; + + switch (cmd) { + case SIOCMKCLIP: + case ATMARPD_CTRL: + case ATMARP_MKIP: + case ATMARP_SETENTRY: + case ATMARP_ENCAP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + return -ENOIOCTLCMD; + } + + switch (cmd) { + case SIOCMKCLIP: + err = clip_create(arg); + break; + case ATMARPD_CTRL: + err = atm_init_atmarp(vcc); + if (!err) { + sock->state = SS_CONNECTED; + __module_get(THIS_MODULE); + } + break; + case ATMARP_MKIP: + err = clip_mkip(vcc ,arg); + break; + case ATMARP_SETENTRY: + err = clip_setentry(vcc, arg); + break; + case ATMARP_ENCAP: + err = clip_encap(vcc, arg); + break; + } + return err; +} + +static struct atm_ioctl clip_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = clip_ioctl, +}; + +#ifdef CONFIG_PROC_FS + +static void svc_addr(struct seq_file *seq, struct sockaddr_atmsvc *addr) +{ + static int code[] = { 1,2,10,6,1,0 }; + static int e164[] = { 1,8,4,6,1,0 }; + + if (*addr->sas_addr.pub) { + seq_printf(seq, "%s", addr->sas_addr.pub); + if (*addr->sas_addr.prv) + seq_putc(seq, '+'); + } else if (!*addr->sas_addr.prv) { + seq_printf(seq, "%s", "(none)"); + return; + } + if (*addr->sas_addr.prv) { + unsigned char *prv = addr->sas_addr.prv; + int *fields; + int i, j; + + fields = *prv == ATM_AFI_E164 ? e164 : code; + for (i = 0; fields[i]; i++) { + for (j = fields[i]; j; j--) + seq_printf(seq, "%02X", *prv++); + if (fields[i+1]) + seq_putc(seq, '.'); + } + } +} + +/* This means the neighbour entry has no attached VCC objects. 
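+ * It is only a placeholder used by the /proc walker below: deliberately
+ * different from NULL, which terminates the per-entry VCC walk, and it
+ * is never dereferenced.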
*/ +#define SEQ_NO_VCC_TOKEN ((void *) 2) + +static void atmarp_info(struct seq_file *seq, struct net_device *dev, + struct atmarp_entry *entry, struct clip_vcc *clip_vcc) +{ + unsigned long exp; + char buf[17]; + int svc, llc, off; + + svc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || + (sk_atm(clip_vcc->vcc)->sk_family == AF_ATMSVC)); + + llc = ((clip_vcc == SEQ_NO_VCC_TOKEN) || + clip_vcc->encap); + + if (clip_vcc == SEQ_NO_VCC_TOKEN) + exp = entry->neigh->used; + else + exp = clip_vcc->last_use; + + exp = (jiffies - exp) / HZ; + + seq_printf(seq, "%-6s%-4s%-4s%5ld ", + dev->name, + svc ? "SVC" : "PVC", + llc ? "LLC" : "NULL", + exp); + + off = scnprintf(buf, sizeof(buf) - 1, "%d.%d.%d.%d", + NIPQUAD(entry->ip)); + while (off < 16) + buf[off++] = ' '; + buf[off] = '\0'; + seq_printf(seq, "%s", buf); + + if (clip_vcc == SEQ_NO_VCC_TOKEN) { + if (time_before(jiffies, entry->expires)) + seq_printf(seq, "(resolving)\n"); + else + seq_printf(seq, "(expired, ref %d)\n", + atomic_read(&entry->neigh->refcnt)); + } else if (!svc) { + seq_printf(seq, "%d.%d.%d\n", + clip_vcc->vcc->dev->number, + clip_vcc->vcc->vpi, + clip_vcc->vcc->vci); + } else { + svc_addr(seq, &clip_vcc->vcc->remote); + seq_putc(seq, '\n'); + } +} + +struct clip_seq_state { + /* This member must be first. */ + struct neigh_seq_state ns; + + /* Local to clip specific iteration. */ + struct clip_vcc *vcc; +}; + +static struct clip_vcc *clip_seq_next_vcc(struct atmarp_entry *e, + struct clip_vcc *curr) +{ + if (!curr) { + curr = e->vccs; + if (!curr) + return SEQ_NO_VCC_TOKEN; + return curr; + } + if (curr == SEQ_NO_VCC_TOKEN) + return NULL; + + curr = curr->next; + + return curr; +} + +static void *clip_seq_vcc_walk(struct clip_seq_state *state, + struct atmarp_entry *e, loff_t *pos) +{ + struct clip_vcc *vcc = state->vcc; + + vcc = clip_seq_next_vcc(e, vcc); + if (vcc && pos != NULL) { + while (*pos) { + vcc = clip_seq_next_vcc(e, vcc); + if (!vcc) + break; + --(*pos); + } + } + state->vcc = vcc; + + return vcc; +} + +static void *clip_seq_sub_iter(struct neigh_seq_state *_state, + struct neighbour *n, loff_t *pos) +{ + struct clip_seq_state *state = (struct clip_seq_state *) _state; + + return clip_seq_vcc_walk(state, NEIGH2ENTRY(n), pos); +} + +static void *clip_seq_start(struct seq_file *seq, loff_t *pos) +{ + return neigh_seq_start(seq, pos, &clip_tbl, NEIGH_SEQ_NEIGH_ONLY); +} + +static int clip_seq_show(struct seq_file *seq, void *v) +{ + static char atm_arp_banner[] = + "IPitf TypeEncp Idle IP address ATM address\n"; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, atm_arp_banner); + } else { + struct clip_seq_state *state = seq->private; + struct neighbour *n = v; + struct clip_vcc *vcc = state->vcc; + + atmarp_info(seq, n->dev, NEIGH2ENTRY(n), vcc); + } + return 0; +} + +static struct seq_operations arp_seq_ops = { + .start = clip_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = clip_seq_show, +}; + +static int arp_seq_open(struct inode *inode, struct file *file) +{ + struct clip_seq_state *state; + struct seq_file *seq; + int rc = -EAGAIN; + + state = kmalloc(sizeof(*state), GFP_KERNEL); + if (!state) { + rc = -ENOMEM; + goto out_kfree; + } + memset(state, 0, sizeof(*state)); + state->ns.neigh_sub_iter = clip_seq_sub_iter; + + rc = seq_open(file, &arp_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = state; +out: + return rc; + +out_kfree: + kfree(state); + goto out; +} + +static struct file_operations arp_seq_fops = { + .open = arp_seq_open, + .read = seq_read, + 
.llseek = seq_lseek, + .release = seq_release_private, + .owner = THIS_MODULE +}; +#endif + +static int __init atm_clip_init(void) +{ + neigh_table_init(&clip_tbl); + + clip_tbl_hook = &clip_tbl; + register_atm_ioctl(&clip_ioctl_ops); + +#ifdef CONFIG_PROC_FS +{ + struct proc_dir_entry *p; + + p = create_proc_entry("arp", S_IRUGO, atm_proc_root); + if (p) + p->proc_fops = &arp_seq_fops; +} +#endif + + return 0; +} + +static void __exit atm_clip_exit(void) +{ + struct net_device *dev, *next; + + remove_proc_entry("arp", atm_proc_root); + + deregister_atm_ioctl(&clip_ioctl_ops); + + /* First, stop the idle timer, so it stops banging + * on the table. + */ + if (start_timer == 0) + del_timer(&idle_timer); + + /* Next, purge the table, so that the device + * unregister loop below does not hang due to + * device references remaining in the table. + */ + neigh_ifdown(&clip_tbl, NULL); + + dev = clip_devs; + while (dev) { + next = PRIV(dev)->next; + unregister_netdev(dev); + free_netdev(dev); + dev = next; + } + + /* Now it is safe to fully shutdown whole table. */ + neigh_table_clear(&clip_tbl); + + clip_tbl_hook = NULL; +} + +module_init(atm_clip_init); +module_exit(atm_clip_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/atm/common.c b/net/atm/common.c new file mode 100644 index 000000000000..6d16be334ea0 --- /dev/null +++ b/net/atm/common.c @@ -0,0 +1,804 @@ +/* net/atm/common.c - ATM sockets (common part for PVC and SVC) */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include +#include +#include +#include /* struct socket, struct proto_ops */ +#include /* ATM stuff */ +#include +#include /* SOL_SOCKET */ +#include /* error codes */ +#include +#include /* verify_area */ +#include +#include /* struct timeval */ +#include +#include +#include +#include /* struct sock */ + +#include +#include +#include + + +#include "resources.h" /* atm_find_dev */ +#include "common.h" /* prototypes */ +#include "protocols.h" /* atm_init_ */ +#include "addr.h" /* address registry */ +#include "signaling.h" /* for WAITING and sigd_attach */ + + +#if 0 +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) 
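+/* DPRINTK() expands to nothing here; change the "#if 0" above to 1 for debug output. */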
+#endif + +struct hlist_head vcc_hash[VCC_HTABLE_SIZE]; +DEFINE_RWLOCK(vcc_sklist_lock); + +static void __vcc_insert_socket(struct sock *sk) +{ + struct atm_vcc *vcc = atm_sk(sk); + struct hlist_head *head = &vcc_hash[vcc->vci & + (VCC_HTABLE_SIZE - 1)]; + sk->sk_hashent = vcc->vci & (VCC_HTABLE_SIZE - 1); + sk_add_node(sk, head); +} + +void vcc_insert_socket(struct sock *sk) +{ + write_lock_irq(&vcc_sklist_lock); + __vcc_insert_socket(sk); + write_unlock_irq(&vcc_sklist_lock); +} + +static void vcc_remove_socket(struct sock *sk) +{ + write_lock_irq(&vcc_sklist_lock); + sk_del_node_init(sk); + write_unlock_irq(&vcc_sklist_lock); +} + + +static struct sk_buff *alloc_tx(struct atm_vcc *vcc,unsigned int size) +{ + struct sk_buff *skb; + struct sock *sk = sk_atm(vcc); + + if (atomic_read(&sk->sk_wmem_alloc) && !atm_may_send(vcc, size)) { + DPRINTK("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n", + atomic_read(&sk->sk_wmem_alloc), size, + sk->sk_sndbuf); + return NULL; + } + while (!(skb = alloc_skb(size,GFP_KERNEL))) schedule(); + DPRINTK("AlTx %d += %d\n", atomic_read(&sk->sk_wmem_alloc), + skb->truesize); + atomic_add(skb->truesize, &sk->sk_wmem_alloc); + return skb; +} + + +EXPORT_SYMBOL(vcc_hash); +EXPORT_SYMBOL(vcc_sklist_lock); +EXPORT_SYMBOL(vcc_insert_socket); + +static void vcc_sock_destruct(struct sock *sk) +{ + if (atomic_read(&sk->sk_rmem_alloc)) + printk(KERN_DEBUG "vcc_sock_destruct: rmem leakage (%d bytes) detected.\n", atomic_read(&sk->sk_rmem_alloc)); + + if (atomic_read(&sk->sk_wmem_alloc)) + printk(KERN_DEBUG "vcc_sock_destruct: wmem leakage (%d bytes) detected.\n", atomic_read(&sk->sk_wmem_alloc)); +} + +static void vcc_def_wakeup(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up(sk->sk_sleep); + read_unlock(&sk->sk_callback_lock); +} + +static inline int vcc_writable(struct sock *sk) +{ + struct atm_vcc *vcc = atm_sk(sk); + + return (vcc->qos.txtp.max_sdu + + atomic_read(&sk->sk_wmem_alloc)) <= sk->sk_sndbuf; +} + +static void vcc_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + if (vcc_writable(sk)) { + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + sk_wake_async(sk, 2, POLL_OUT); + } + + read_unlock(&sk->sk_callback_lock); +} + +static struct proto vcc_proto = { + .name = "VCC", + .owner = THIS_MODULE, + .obj_size = sizeof(struct atm_vcc), +}; + +int vcc_create(struct socket *sock, int protocol, int family) +{ + struct sock *sk; + struct atm_vcc *vcc; + + sock->sk = NULL; + if (sock->type == SOCK_STREAM) + return -EINVAL; + sk = sk_alloc(family, GFP_KERNEL, &vcc_proto, 1); + if (!sk) + return -ENOMEM; + sock_init_data(sock, sk); + sk->sk_state_change = vcc_def_wakeup; + sk->sk_write_space = vcc_write_space; + + vcc = atm_sk(sk); + vcc->dev = NULL; + memset(&vcc->local,0,sizeof(struct sockaddr_atmsvc)); + memset(&vcc->remote,0,sizeof(struct sockaddr_atmsvc)); + vcc->qos.txtp.max_sdu = 1 << 16; /* for meta VCs */ + atomic_set(&sk->sk_wmem_alloc, 0); + atomic_set(&sk->sk_rmem_alloc, 0); + vcc->push = NULL; + vcc->pop = NULL; + vcc->push_oam = NULL; + vcc->vpi = vcc->vci = 0; /* no VCI/VPI yet */ + vcc->atm_options = vcc->aal_options = 0; + sk->sk_destruct = vcc_sock_destruct; + return 0; +} + + +static void vcc_destroy_socket(struct sock *sk) +{ + struct atm_vcc *vcc = atm_sk(sk); + struct sk_buff *skb; + + set_bit(ATM_VF_CLOSE, &vcc->flags); + clear_bit(ATM_VF_READY, &vcc->flags); + if (vcc->dev) { + if (vcc->dev->ops->close) + 
vcc->dev->ops->close(vcc); + if (vcc->push) + vcc->push(vcc, NULL); /* atmarpd has no push */ + + vcc_remove_socket(sk); /* no more receive */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + atm_return(vcc,skb->truesize); + kfree_skb(skb); + } + + module_put(vcc->dev->ops->owner); + atm_dev_put(vcc->dev); + } +} + + +int vcc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + lock_sock(sk); + vcc_destroy_socket(sock->sk); + release_sock(sk); + sock_put(sk); + } + + return 0; +} + + +void vcc_release_async(struct atm_vcc *vcc, int reply) +{ + struct sock *sk = sk_atm(vcc); + + set_bit(ATM_VF_CLOSE, &vcc->flags); + sk->sk_shutdown |= RCV_SHUTDOWN; + sk->sk_err = -reply; + clear_bit(ATM_VF_WAITING, &vcc->flags); + sk->sk_state_change(sk); +} + + +EXPORT_SYMBOL(vcc_release_async); + + +static int adjust_tp(struct atm_trafprm *tp,unsigned char aal) +{ + int max_sdu; + + if (!tp->traffic_class) return 0; + switch (aal) { + case ATM_AAL0: + max_sdu = ATM_CELL_SIZE-1; + break; + case ATM_AAL34: + max_sdu = ATM_MAX_AAL34_PDU; + break; + default: + printk(KERN_WARNING "ATM: AAL problems ... " + "(%d)\n",aal); + /* fall through */ + case ATM_AAL5: + max_sdu = ATM_MAX_AAL5_PDU; + } + if (!tp->max_sdu) tp->max_sdu = max_sdu; + else if (tp->max_sdu > max_sdu) return -EINVAL; + if (!tp->max_cdv) tp->max_cdv = ATM_MAX_CDV; + return 0; +} + + +static int check_ci(struct atm_vcc *vcc, short vpi, int vci) +{ + struct hlist_head *head = &vcc_hash[vci & + (VCC_HTABLE_SIZE - 1)]; + struct hlist_node *node; + struct sock *s; + struct atm_vcc *walk; + + sk_for_each(s, node, head) { + walk = atm_sk(s); + if (walk->dev != vcc->dev) + continue; + if (test_bit(ATM_VF_ADDR, &walk->flags) && walk->vpi == vpi && + walk->vci == vci && ((walk->qos.txtp.traffic_class != + ATM_NONE && vcc->qos.txtp.traffic_class != ATM_NONE) || + (walk->qos.rxtp.traffic_class != ATM_NONE && + vcc->qos.rxtp.traffic_class != ATM_NONE))) + return -EADDRINUSE; + } + + /* allow VCCs with same VPI/VCI iff they don't collide on + TX/RX (but we may refuse such sharing for other reasons, + e.g. 
if protocol requires to have both channels) */ + + return 0; +} + + +static int find_ci(struct atm_vcc *vcc, short *vpi, int *vci) +{ + static short p; /* poor man's per-device cache */ + static int c; + short old_p; + int old_c; + int err; + + if (*vpi != ATM_VPI_ANY && *vci != ATM_VCI_ANY) { + err = check_ci(vcc, *vpi, *vci); + return err; + } + /* last scan may have left values out of bounds for current device */ + if (*vpi != ATM_VPI_ANY) + p = *vpi; + else if (p >= 1 << vcc->dev->ci_range.vpi_bits) + p = 0; + if (*vci != ATM_VCI_ANY) + c = *vci; + else if (c < ATM_NOT_RSV_VCI || c >= 1 << vcc->dev->ci_range.vci_bits) + c = ATM_NOT_RSV_VCI; + old_p = p; + old_c = c; + do { + if (!check_ci(vcc, p, c)) { + *vpi = p; + *vci = c; + return 0; + } + if (*vci == ATM_VCI_ANY) { + c++; + if (c >= 1 << vcc->dev->ci_range.vci_bits) + c = ATM_NOT_RSV_VCI; + } + if ((c == ATM_NOT_RSV_VCI || *vci != ATM_VCI_ANY) && + *vpi == ATM_VPI_ANY) { + p++; + if (p >= 1 << vcc->dev->ci_range.vpi_bits) p = 0; + } + } + while (old_p != p || old_c != c); + return -EADDRINUSE; +} + + +static int __vcc_connect(struct atm_vcc *vcc, struct atm_dev *dev, short vpi, + int vci) +{ + struct sock *sk = sk_atm(vcc); + int error; + + if ((vpi != ATM_VPI_UNSPEC && vpi != ATM_VPI_ANY && + vpi >> dev->ci_range.vpi_bits) || (vci != ATM_VCI_UNSPEC && + vci != ATM_VCI_ANY && vci >> dev->ci_range.vci_bits)) + return -EINVAL; + if (vci > 0 && vci < ATM_NOT_RSV_VCI && !capable(CAP_NET_BIND_SERVICE)) + return -EPERM; + error = 0; + if (!try_module_get(dev->ops->owner)) + return -ENODEV; + vcc->dev = dev; + write_lock_irq(&vcc_sklist_lock); + if ((error = find_ci(vcc, &vpi, &vci))) { + write_unlock_irq(&vcc_sklist_lock); + goto fail_module_put; + } + vcc->vpi = vpi; + vcc->vci = vci; + __vcc_insert_socket(sk); + write_unlock_irq(&vcc_sklist_lock); + switch (vcc->qos.aal) { + case ATM_AAL0: + error = atm_init_aal0(vcc); + vcc->stats = &dev->stats.aal0; + break; + case ATM_AAL34: + error = atm_init_aal34(vcc); + vcc->stats = &dev->stats.aal34; + break; + case ATM_NO_AAL: + /* ATM_AAL5 is also used in the "0 for default" case */ + vcc->qos.aal = ATM_AAL5; + /* fall through */ + case ATM_AAL5: + error = atm_init_aal5(vcc); + vcc->stats = &dev->stats.aal5; + break; + default: + error = -EPROTOTYPE; + } + if (!error) error = adjust_tp(&vcc->qos.txtp,vcc->qos.aal); + if (!error) error = adjust_tp(&vcc->qos.rxtp,vcc->qos.aal); + if (error) + goto fail; + DPRINTK("VCC %d.%d, AAL %d\n",vpi,vci,vcc->qos.aal); + DPRINTK(" TX: %d, PCR %d..%d, SDU %d\n",vcc->qos.txtp.traffic_class, + vcc->qos.txtp.min_pcr,vcc->qos.txtp.max_pcr,vcc->qos.txtp.max_sdu); + DPRINTK(" RX: %d, PCR %d..%d, SDU %d\n",vcc->qos.rxtp.traffic_class, + vcc->qos.rxtp.min_pcr,vcc->qos.rxtp.max_pcr,vcc->qos.rxtp.max_sdu); + + if (dev->ops->open) { + if ((error = dev->ops->open(vcc))) + goto fail; + } + return 0; + +fail: + vcc_remove_socket(sk); +fail_module_put: + module_put(dev->ops->owner); + /* ensure we get dev module ref count correct */ + vcc->dev = NULL; + return error; +} + + +int vcc_connect(struct socket *sock, int itf, short vpi, int vci) +{ + struct atm_dev *dev; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + DPRINTK("vcc_connect (vpi %d, vci %d)\n",vpi,vci); + if (sock->state == SS_CONNECTED) + return -EISCONN; + if (sock->state != SS_UNCONNECTED) + return -EINVAL; + if (!(vpi || vci)) + return -EINVAL; + + if (vpi != ATM_VPI_UNSPEC && vci != ATM_VCI_UNSPEC) + clear_bit(ATM_VF_PARTIAL,&vcc->flags); + else + if (test_bit(ATM_VF_PARTIAL,&vcc->flags)) + return 
-EINVAL; + DPRINTK("vcc_connect (TX: cl %d,bw %d-%d,sdu %d; " + "RX: cl %d,bw %d-%d,sdu %d,AAL %s%d)\n", + vcc->qos.txtp.traffic_class,vcc->qos.txtp.min_pcr, + vcc->qos.txtp.max_pcr,vcc->qos.txtp.max_sdu, + vcc->qos.rxtp.traffic_class,vcc->qos.rxtp.min_pcr, + vcc->qos.rxtp.max_pcr,vcc->qos.rxtp.max_sdu, + vcc->qos.aal == ATM_AAL5 ? "" : vcc->qos.aal == ATM_AAL0 ? "" : + " ??? code ",vcc->qos.aal == ATM_AAL0 ? 0 : vcc->qos.aal); + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) + return -EBADFD; + if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || + vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) + return -EINVAL; + if (itf != ATM_ITF_ANY) { + dev = atm_dev_lookup(itf); + if (!dev) + return -ENODEV; + error = __vcc_connect(vcc, dev, vpi, vci); + if (error) { + atm_dev_put(dev); + return error; + } + } else { + struct list_head *p, *next; + + dev = NULL; + spin_lock(&atm_dev_lock); + list_for_each_safe(p, next, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + atm_dev_hold(dev); + spin_unlock(&atm_dev_lock); + if (!__vcc_connect(vcc, dev, vpi, vci)) + break; + atm_dev_put(dev); + dev = NULL; + spin_lock(&atm_dev_lock); + } + spin_unlock(&atm_dev_lock); + if (!dev) + return -ENODEV; + } + if (vpi == ATM_VPI_UNSPEC || vci == ATM_VCI_UNSPEC) + set_bit(ATM_VF_PARTIAL,&vcc->flags); + if (test_bit(ATM_VF_READY,&ATM_SD(sock)->flags)) + sock->state = SS_CONNECTED; + return 0; +} + + +int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + struct sk_buff *skb; + int copied, error = -EINVAL; + + if (sock->state != SS_CONNECTED) + return -ENOTCONN; + if (flags & ~MSG_DONTWAIT) /* only handle MSG_DONTWAIT */ + return -EOPNOTSUPP; + vcc = ATM_SD(sock); + if (test_bit(ATM_VF_RELEASED,&vcc->flags) || + test_bit(ATM_VF_CLOSE,&vcc->flags) || + !test_bit(ATM_VF_READY, &vcc->flags)) + return 0; + + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &error); + if (!skb) + return error; + + copied = skb->len; + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (error) + return error; + sock_recv_timestamp(msg, sk, skb); + DPRINTK("RcvM %d -= %d\n", atomic_read(&sk->rmem_alloc), skb->truesize); + atm_return(vcc, skb->truesize); + skb_free_datagram(sk, skb); + return copied; +} + + +int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t total_len) +{ + struct sock *sk = sock->sk; + DEFINE_WAIT(wait); + struct atm_vcc *vcc; + struct sk_buff *skb; + int eff,error; + const void __user *buff; + int size; + + lock_sock(sk); + if (sock->state != SS_CONNECTED) { + error = -ENOTCONN; + goto out; + } + if (m->msg_name) { + error = -EISCONN; + goto out; + } + if (m->msg_iovlen != 1) { + error = -ENOSYS; /* fix this later @@@ */ + goto out; + } + buff = m->msg_iov->iov_base; + size = m->msg_iov->iov_len; + vcc = ATM_SD(sock); + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + test_bit(ATM_VF_CLOSE, &vcc->flags) || + !test_bit(ATM_VF_READY, &vcc->flags)) { + error = -EPIPE; + send_sig(SIGPIPE, current, 0); + goto out; + } + if (!size) { + error = 0; + goto out; + } + if (size < 0 || size > vcc->qos.txtp.max_sdu) { + error = -EMSGSIZE; + goto out; + } + /* verify_area is done by net/socket.c */ + eff = (size+3) & ~3; /* align to word boundary */ + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + error = 0; + while (!(skb = alloc_tx(vcc,eff))) { + if (m->msg_flags & MSG_DONTWAIT) { 
+ error = -EAGAIN; + break; + } + schedule(); + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + if (test_bit(ATM_VF_RELEASED,&vcc->flags) || + test_bit(ATM_VF_CLOSE,&vcc->flags) || + !test_bit(ATM_VF_READY,&vcc->flags)) { + error = -EPIPE; + send_sig(SIGPIPE, current, 0); + break; + } + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + if (error) + goto out; + skb->dev = NULL; /* for paths shared with net_device interfaces */ + ATM_SKB(skb)->atm_options = vcc->atm_options; + if (copy_from_user(skb_put(skb,size),buff,size)) { + kfree_skb(skb); + error = -EFAULT; + goto out; + } + if (eff != size) memset(skb->data+size,0,eff-size); + error = vcc->dev->ops->send(vcc,skb); + error = error ? error : size; +out: + release_sock(sk); + return error; +} + + +unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + unsigned int mask; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + vcc = ATM_SD(sock); + + /* exceptional events */ + if (sk->sk_err) + mask = POLLERR; + + if (test_bit(ATM_VF_RELEASED, &vcc->flags) || + test_bit(ATM_VF_CLOSE, &vcc->flags)) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= POLLIN | POLLRDNORM; + + /* writable? */ + if (sock->state == SS_CONNECTING && + test_bit(ATM_VF_WAITING, &vcc->flags)) + return mask; + + if (vcc->qos.txtp.traffic_class != ATM_NONE && + vcc_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + + +static int atm_change_qos(struct atm_vcc *vcc,struct atm_qos *qos) +{ + int error; + + /* + * Don't let the QoS change the already connected AAL type nor the + * traffic class. + */ + if (qos->aal != vcc->qos.aal || + qos->rxtp.traffic_class != vcc->qos.rxtp.traffic_class || + qos->txtp.traffic_class != vcc->qos.txtp.traffic_class) + return -EINVAL; + error = adjust_tp(&qos->txtp,qos->aal); + if (!error) error = adjust_tp(&qos->rxtp,qos->aal); + if (error) return error; + if (!vcc->dev->ops->change_qos) return -EOPNOTSUPP; + if (sk_atm(vcc)->sk_family == AF_ATMPVC) + return vcc->dev->ops->change_qos(vcc,qos,ATM_MF_SET); + return svc_change_qos(vcc,qos); +} + + +static int check_tp(struct atm_trafprm *tp) +{ + /* @@@ Should be merged with adjust_tp */ + if (!tp->traffic_class || tp->traffic_class == ATM_ANYCLASS) return 0; + if (tp->traffic_class != ATM_UBR && !tp->min_pcr && !tp->pcr && + !tp->max_pcr) return -EINVAL; + if (tp->min_pcr == ATM_MAX_PCR) return -EINVAL; + if (tp->min_pcr && tp->max_pcr && tp->max_pcr != ATM_MAX_PCR && + tp->min_pcr > tp->max_pcr) return -EINVAL; + /* + * We allow pcr to be outside [min_pcr,max_pcr], because later + * adjustment may still push it in the valid range. 
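+ * Only hard errors are rejected here: a non-UBR class with no rate
+ * given at all, min_pcr equal to ATM_MAX_PCR, or min_pcr above a
+ * finite max_pcr.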
+ */ + return 0; +} + + +static int check_qos(struct atm_qos *qos) +{ + int error; + + if (!qos->txtp.traffic_class && !qos->rxtp.traffic_class) + return -EINVAL; + if (qos->txtp.traffic_class != qos->rxtp.traffic_class && + qos->txtp.traffic_class && qos->rxtp.traffic_class && + qos->txtp.traffic_class != ATM_ANYCLASS && + qos->rxtp.traffic_class != ATM_ANYCLASS) return -EINVAL; + error = check_tp(&qos->txtp); + if (error) return error; + return check_tp(&qos->rxtp); +} + +int vcc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct atm_vcc *vcc; + unsigned long value; + int error; + + if (__SO_LEVEL_MATCH(optname, level) && optlen != __SO_SIZE(optname)) + return -EINVAL; + + vcc = ATM_SD(sock); + switch (optname) { + case SO_ATMQOS: + { + struct atm_qos qos; + + if (copy_from_user(&qos,optval,sizeof(qos))) + return -EFAULT; + error = check_qos(&qos); + if (error) return error; + if (sock->state == SS_CONNECTED) + return atm_change_qos(vcc,&qos); + if (sock->state != SS_UNCONNECTED) + return -EBADFD; + vcc->qos = qos; + set_bit(ATM_VF_HASQOS,&vcc->flags); + return 0; + } + case SO_SETCLP: + if (get_user(value,(unsigned long __user *)optval)) + return -EFAULT; + if (value) vcc->atm_options |= ATM_ATMOPT_CLP; + else vcc->atm_options &= ~ATM_ATMOPT_CLP; + return 0; + default: + if (level == SOL_SOCKET) return -EINVAL; + break; + } + if (!vcc->dev || !vcc->dev->ops->setsockopt) return -EINVAL; + return vcc->dev->ops->setsockopt(vcc,level,optname,optval,optlen); +} + + +int vcc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct atm_vcc *vcc; + int len; + + if (get_user(len, optlen)) + return -EFAULT; + if (__SO_LEVEL_MATCH(optname, level) && len != __SO_SIZE(optname)) + return -EINVAL; + + vcc = ATM_SD(sock); + switch (optname) { + case SO_ATMQOS: + if (!test_bit(ATM_VF_HASQOS,&vcc->flags)) + return -EINVAL; + return copy_to_user(optval,&vcc->qos,sizeof(vcc->qos)) ? + -EFAULT : 0; + case SO_SETCLP: + return put_user(vcc->atm_options & ATM_ATMOPT_CLP ? 1 : + 0,(unsigned long __user *)optval) ? -EFAULT : 0; + case SO_ATMPVC: + { + struct sockaddr_atmpvc pvc; + + if (!vcc->dev || + !test_bit(ATM_VF_ADDR,&vcc->flags)) + return -ENOTCONN; + pvc.sap_family = AF_ATMPVC; + pvc.sap_addr.itf = vcc->dev->number; + pvc.sap_addr.vpi = vcc->vpi; + pvc.sap_addr.vci = vcc->vci; + return copy_to_user(optval,&pvc,sizeof(pvc)) ? 
+ -EFAULT : 0; + } + default: + if (level == SOL_SOCKET) return -EINVAL; + break; + } + if (!vcc->dev || !vcc->dev->ops->getsockopt) return -EINVAL; + return vcc->dev->ops->getsockopt(vcc, level, optname, optval, len); +} + +static int __init atm_init(void) +{ + int error; + + if ((error = proto_register(&vcc_proto, 0)) < 0) + goto out; + + if ((error = atmpvc_init()) < 0) { + printk(KERN_ERR "atmpvc_init() failed with %d\n", error); + goto out_unregister_vcc_proto; + } + if ((error = atmsvc_init()) < 0) { + printk(KERN_ERR "atmsvc_init() failed with %d\n", error); + goto out_atmpvc_exit; + } + if ((error = atm_proc_init()) < 0) { + printk(KERN_ERR "atm_proc_init() failed with %d\n",error); + goto out_atmsvc_exit; + } +out: + return error; +out_atmsvc_exit: + atmsvc_exit(); +out_atmpvc_exit: + atmsvc_exit(); +out_unregister_vcc_proto: + proto_unregister(&vcc_proto); + goto out; +} + +static void __exit atm_exit(void) +{ + atm_proc_exit(); + atmsvc_exit(); + atmpvc_exit(); + proto_unregister(&vcc_proto); +} + +module_init(atm_init); +module_exit(atm_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_ATMPVC); +MODULE_ALIAS_NETPROTO(PF_ATMSVC); diff --git a/net/atm/common.h b/net/atm/common.h new file mode 100644 index 000000000000..e49ed41c0e33 --- /dev/null +++ b/net/atm/common.h @@ -0,0 +1,50 @@ +/* net/atm/common.h - ATM sockets (common part for PVC and SVC) */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_COMMON_H +#define NET_ATM_COMMON_H + +#include +#include /* for poll_table */ + + +int vcc_create(struct socket *sock, int protocol, int family); +int vcc_release(struct socket *sock); +int vcc_connect(struct socket *sock, int itf, short vpi, int vci); +int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size, int flags); +int vcc_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t total_len); +unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait); +int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +int vcc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen); +int vcc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen); + +int atmpvc_init(void); +void atmpvc_exit(void); +int atmsvc_init(void); +void atmsvc_exit(void); + +#ifdef CONFIG_PROC_FS +int atm_proc_init(void); +void atm_proc_exit(void); +#else +static inline int atm_proc_init(void) +{ + return 0; +} + +static inline void atm_proc_exit(void) +{ + /* nothing */ +} +#endif /* CONFIG_PROC_FS */ + +/* SVC */ +int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos); + +#endif diff --git a/net/atm/ioctl.c b/net/atm/ioctl.c new file mode 100644 index 000000000000..4dbb5af34a5e --- /dev/null +++ b/net/atm/ioctl.c @@ -0,0 +1,139 @@ +/* ATM ioctl handling */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ +/* 2003 John Levon */ + + +#include +#include +#include +#include /* struct socket, struct proto_ops */ +#include /* ATM stuff */ +#include +#include /* CLIP_*ENCAP */ +#include /* manifest constants */ +#include /* for ioctls */ +#include +#include +#include +#include +#include + +#include "resources.h" +#include "signaling.h" /* for WAITING and sigd_attach */ + + +static DECLARE_MUTEX(ioctl_mutex); +static LIST_HEAD(ioctl_list); + + +void register_atm_ioctl(struct atm_ioctl *ioctl) +{ + down(&ioctl_mutex); + list_add_tail(&ioctl->list, &ioctl_list); + up(&ioctl_mutex); +} + +void 
deregister_atm_ioctl(struct atm_ioctl *ioctl) +{ + down(&ioctl_mutex); + list_del(&ioctl->list); + up(&ioctl_mutex); +} + +EXPORT_SYMBOL(register_atm_ioctl); +EXPORT_SYMBOL(deregister_atm_ioctl); + +int vcc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + int error; + struct list_head * pos; + void __user *argp = (void __user *)arg; + + vcc = ATM_SD(sock); + switch (cmd) { + case SIOCOUTQ: + if (sock->state != SS_CONNECTED || + !test_bit(ATM_VF_READY, &vcc->flags)) { + error = -EINVAL; + goto done; + } + error = put_user(sk->sk_sndbuf - + atomic_read(&sk->sk_wmem_alloc), + (int __user *) argp) ? -EFAULT : 0; + goto done; + case SIOCINQ: + { + struct sk_buff *skb; + + if (sock->state != SS_CONNECTED) { + error = -EINVAL; + goto done; + } + skb = skb_peek(&sk->sk_receive_queue); + error = put_user(skb ? skb->len : 0, + (int __user *)argp) ? -EFAULT : 0; + goto done; + } + case SIOCGSTAMP: /* borrowed from IP */ + error = sock_get_timestamp(sk, argp); + goto done; + case ATM_SETSC: + printk(KERN_WARNING "ATM_SETSC is obsolete\n"); + error = 0; + goto done; + case ATMSIGD_CTRL: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + /* + * The user/kernel protocol for exchanging signalling + * info uses kernel pointers as opaque references, + * so the holder of the file descriptor can scribble + * on the kernel... so we should make sure that we + * have the same privledges that /proc/kcore needs + */ + if (!capable(CAP_SYS_RAWIO)) { + error = -EPERM; + goto done; + } + error = sigd_attach(vcc); + if (!error) + sock->state = SS_CONNECTED; + goto done; + default: + break; + } + + if (cmd == ATMMPC_CTRL || cmd == ATMMPC_DATA) + request_module("mpoa"); + if (cmd == ATMARPD_CTRL) + request_module("clip"); + if (cmd == ATMLEC_CTRL) + request_module("lec"); + + error = -ENOIOCTLCMD; + + down(&ioctl_mutex); + list_for_each(pos, &ioctl_list) { + struct atm_ioctl * ic = list_entry(pos, struct atm_ioctl, list); + if (try_module_get(ic->owner)) { + error = ic->ioctl(sock, cmd, arg); + module_put(ic->owner); + if (error != -ENOIOCTLCMD) + break; + } + } + up(&ioctl_mutex); + + if (error != -ENOIOCTLCMD) + goto done; + + error = atm_dev_ioctl(cmd, argp); + +done: + return error; +} diff --git a/net/atm/ipcommon.c b/net/atm/ipcommon.c new file mode 100644 index 000000000000..181a3002d8ad --- /dev/null +++ b/net/atm/ipcommon.c @@ -0,0 +1,61 @@ +/* net/atm/ipcommon.c - Common items for all ways of doing IP over ATM */ + +/* Written 1996-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "ipcommon.h" + + +#if 0 +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + + +/* + * skb_migrate appends the list at "from" to "to", emptying "from" in the + * process. skb_migrate is atomic with respect to all other skb operations on + * "from" and "to". Note that it locks both lists at the same time, so beware + * of potential deadlocks. + * + * This function should live in skbuff.c or skbuff.h. 
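+ *
+ * A typical caller (see clip_mkip() in clip.c) drains a socket's receive
+ * queue into a private list before swapping the VCC's push handler:
+ *
+ *	struct sk_buff_head copy;
+ *
+ *	skb_queue_head_init(&copy);
+ *	skb_migrate(&sk_atm(vcc)->sk_receive_queue, &copy);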
+ */ + + +void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to) +{ + struct sk_buff *skb; + unsigned long flags; + struct sk_buff *skb_from = (struct sk_buff *) from; + struct sk_buff *skb_to = (struct sk_buff *) to; + struct sk_buff *prev; + + spin_lock_irqsave(&from->lock,flags); + spin_lock(&to->lock); + prev = from->prev; + from->next->prev = to->prev; + prev->next = skb_to; + to->prev->next = from->next; + to->prev = from->prev; + for (skb = from->next; skb != skb_to; skb = skb->next) + skb->list = to; + to->qlen += from->qlen; + spin_unlock(&to->lock); + from->prev = skb_from; + from->next = skb_from; + from->qlen = 0; + spin_unlock_irqrestore(&from->lock,flags); +} + + +EXPORT_SYMBOL(skb_migrate); diff --git a/net/atm/ipcommon.h b/net/atm/ipcommon.h new file mode 100644 index 000000000000..d72165f60939 --- /dev/null +++ b/net/atm/ipcommon.h @@ -0,0 +1,22 @@ +/* net/atm/ipcommon.h - Common items for all ways of doing IP over ATM */ + +/* Written 1996-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_IPCOMMON_H +#define NET_ATM_IPCOMMON_H + + +#include +#include +#include +#include + +/* + * Appends all skbs from "from" to "to". The operation is atomic with respect + * to all other skb operations on "from" or "to". + */ + +void skb_migrate(struct sk_buff_head *from,struct sk_buff_head *to); + +#endif diff --git a/net/atm/lec.c b/net/atm/lec.c new file mode 100644 index 000000000000..a0752487026d --- /dev/null +++ b/net/atm/lec.c @@ -0,0 +1,2538 @@ +/* + * lec.c: Lan Emulation driver + * Marko Kiiskila mkiiskila@yahoo.com + * + */ + +#include +#include +#include + +/* We are ethernet device */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* TokenRing if needed */ +#ifdef CONFIG_TR +#include +#endif + +/* And atm device */ +#include +#include + +/* Proxy LEC knows about bridging */ +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +#include +#include "../bridge/br_private.h" + +static unsigned char bridge_ula_lec[] = {0x01, 0x80, 0xc2, 0x00, 0x00}; +#endif + +/* Modular too */ +#include +#include + +#include "lec.h" +#include "lec_arpc.h" +#include "resources.h" + +#if 0 +#define DPRINTK printk +#else +#define DPRINTK(format,args...) 
+#endif + +#define DUMP_PACKETS 0 /* 0 = None, + * 1 = 30 first bytes + * 2 = Whole packet + */ + +#define LEC_UNRES_QUE_LEN 8 /* number of tx packets to queue for a + single destination while waiting for SVC */ + +static int lec_open(struct net_device *dev); +static int lec_start_xmit(struct sk_buff *skb, struct net_device *dev); +static int lec_close(struct net_device *dev); +static struct net_device_stats *lec_get_stats(struct net_device *dev); +static void lec_init(struct net_device *dev); +static struct lec_arp_table* lec_arp_find(struct lec_priv *priv, + unsigned char *mac_addr); +static int lec_arp_remove(struct lec_priv *priv, + struct lec_arp_table *to_remove); +/* LANE2 functions */ +static void lane2_associate_ind (struct net_device *dev, u8 *mac_address, + u8 *tlvs, u32 sizeoftlvs); +static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force, + u8 **tlvs, u32 *sizeoftlvs); +static int lane2_associate_req (struct net_device *dev, u8 *lan_dst, + u8 *tlvs, u32 sizeoftlvs); + +static int lec_addr_delete(struct lec_priv *priv, unsigned char *atm_addr, + unsigned long permanent); +static void lec_arp_check_empties(struct lec_priv *priv, + struct atm_vcc *vcc, struct sk_buff *skb); +static void lec_arp_destroy(struct lec_priv *priv); +static void lec_arp_init(struct lec_priv *priv); +static struct atm_vcc* lec_arp_resolve(struct lec_priv *priv, + unsigned char *mac_to_find, + int is_rdesc, + struct lec_arp_table **ret_entry); +static void lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr, + unsigned char *atm_addr, unsigned long remoteflag, + unsigned int targetless_le_arp); +static void lec_flush_complete(struct lec_priv *priv, unsigned long tran_id); +static int lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc); +static void lec_set_flush_tran_id(struct lec_priv *priv, + unsigned char *atm_addr, + unsigned long tran_id); +static void lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data, + struct atm_vcc *vcc, + void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb)); +static void lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc); + +static struct lane2_ops lane2_ops = { + lane2_resolve, /* resolve, spec 3.1.3 */ + lane2_associate_req, /* associate_req, spec 3.1.4 */ + NULL /* associate indicator, spec 3.1.5 */ +}; + +static unsigned char bus_mac[ETH_ALEN] = {0xff,0xff,0xff,0xff,0xff,0xff}; + +/* Device structures */ +static struct net_device *dev_lec[MAX_LEC_ITF]; + +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +static void lec_handle_bridge(struct sk_buff *skb, struct net_device *dev) +{ + struct ethhdr *eth; + char *buff; + struct lec_priv *priv; + + /* Check if this is a BPDU. 
If so, ask zeppelin to send + * LE_TOPOLOGY_REQUEST with the same value of Topology Change bit + * as the Config BPDU has */ + eth = (struct ethhdr *)skb->data; + buff = skb->data + skb->dev->hard_header_len; + if (*buff++ == 0x42 && *buff++ == 0x42 && *buff++ == 0x03) { + struct sock *sk; + struct sk_buff *skb2; + struct atmlec_msg *mesg; + + skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); + if (skb2 == NULL) return; + skb2->len = sizeof(struct atmlec_msg); + mesg = (struct atmlec_msg *)skb2->data; + mesg->type = l_topology_change; + buff += 4; + mesg->content.normal.flag = *buff & 0x01; /* 0x01 is topology change */ + + priv = (struct lec_priv *)dev->priv; + atm_force_charge(priv->lecd, skb2->truesize); + sk = sk_atm(priv->lecd); + skb_queue_tail(&sk->sk_receive_queue, skb2); + sk->sk_data_ready(sk, skb2->len); + } + + return; +} +#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ + +/* + * Modelled after tr_type_trans + * All multicast and ARE or STE frames go to BUS. + * Non source routed frames go by destination address. + * Last hop source routed frames go by destination address. + * Not last hop source routed frames go by _next_ route descriptor. + * Returns pointer to destination MAC address or fills in rdesc + * and returns NULL. + */ +#ifdef CONFIG_TR +static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc) +{ + struct trh_hdr *trh; + int riflen, num_rdsc; + + trh = (struct trh_hdr *)packet; + if (trh->daddr[0] & (uint8_t)0x80) + return bus_mac; /* multicast */ + + if (trh->saddr[0] & TR_RII) { + riflen = (ntohs(trh->rcf) & TR_RCF_LEN_MASK) >> 8; + if ((ntohs(trh->rcf) >> 13) != 0) + return bus_mac; /* ARE or STE */ + } + else + return trh->daddr; /* not source routed */ + + if (riflen < 6) + return trh->daddr; /* last hop, source routed */ + + /* riflen is 6 or more, packet has more than one route descriptor */ + num_rdsc = (riflen/2) - 1; + memset(rdesc, 0, ETH_ALEN); + /* offset 4 comes from LAN destination field in LE control frames */ + if (trh->rcf & htons((uint16_t)TR_RCF_DIR_BIT)) + memcpy(&rdesc[4], &trh->rseg[num_rdsc-2], sizeof(uint16_t)); + else { + memcpy(&rdesc[4], &trh->rseg[1], sizeof(uint16_t)); + rdesc[5] = ((ntohs(trh->rseg[0]) & 0x000f) | (rdesc[5] & 0xf0)); + } + + return NULL; +} +#endif /* CONFIG_TR */ + +/* + * Open/initialize the netdevice. This is called (in the current kernel) + * sometime after booting when the 'ifconfig' program is run. + * + * This routine should set everything up anew at each open, even + * registers that "should" only need to be set once at boot, so that + * there is non-reboot way to recover if something goes wrong. 
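+ *
+ * For this device that only amounts to starting the transmit queue and
+ * clearing the interface statistics.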
+ */ + +static int +lec_open(struct net_device *dev) +{ + struct lec_priv *priv = (struct lec_priv *)dev->priv; + + netif_start_queue(dev); + memset(&priv->stats,0,sizeof(struct net_device_stats)); + + return 0; +} + +static __inline__ void +lec_send(struct atm_vcc *vcc, struct sk_buff *skb, struct lec_priv *priv) +{ + ATM_SKB(skb)->vcc = vcc; + ATM_SKB(skb)->atm_options = vcc->atm_options; + + atomic_add(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + if (vcc->send(vcc, skb) < 0) { + priv->stats.tx_dropped++; + return; + } + + priv->stats.tx_packets++; + priv->stats.tx_bytes += skb->len; +} + +static void +lec_tx_timeout(struct net_device *dev) +{ + printk(KERN_INFO "%s: tx timeout\n", dev->name); + dev->trans_start = jiffies; + netif_wake_queue(dev); +} + +static int +lec_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct sk_buff *skb2; + struct lec_priv *priv = (struct lec_priv *)dev->priv; + struct lecdatahdr_8023 *lec_h; + struct atm_vcc *vcc; + struct lec_arp_table *entry; + unsigned char *dst; + int min_frame_size; +#ifdef CONFIG_TR + unsigned char rdesc[ETH_ALEN]; /* Token Ring route descriptor */ +#endif + int is_rdesc; +#if DUMP_PACKETS > 0 + char buf[300]; + int i=0; +#endif /* DUMP_PACKETS >0 */ + + DPRINTK("lec_start_xmit called\n"); + if (!priv->lecd) { + printk("%s:No lecd attached\n",dev->name); + priv->stats.tx_errors++; + netif_stop_queue(dev); + return -EUNATCH; + } + + DPRINTK("skbuff head:%lx data:%lx tail:%lx end:%lx\n", + (long)skb->head, (long)skb->data, (long)skb->tail, + (long)skb->end); +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + if (memcmp(skb->data, bridge_ula_lec, sizeof(bridge_ula_lec)) == 0) + lec_handle_bridge(skb, dev); +#endif + + /* Make sure we have room for lec_id */ + if (skb_headroom(skb) < 2) { + + DPRINTK("lec_start_xmit: reallocating skb\n"); + skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN); + kfree_skb(skb); + if (skb2 == NULL) return 0; + skb = skb2; + } + skb_push(skb, 2); + + /* Put le header to place, works for TokenRing too */ + lec_h = (struct lecdatahdr_8023*)skb->data; + lec_h->le_header = htons(priv->lecid); + +#ifdef CONFIG_TR + /* Ugly. Use this to realign Token Ring packets for + * e.g. PCA-200E driver. 
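+ * skb_realloc_headroom() copies the frame into a freshly allocated
+ * skb; that copy is what actually re-aligns the data for such
+ * adapters.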
*/ + if (priv->is_trdev) { + skb2 = skb_realloc_headroom(skb, LEC_HEADER_LEN); + kfree_skb(skb); + if (skb2 == NULL) return 0; + skb = skb2; + } +#endif + +#if DUMP_PACKETS > 0 + printk("%s: send datalen:%ld lecid:%4.4x\n", dev->name, + skb->len, priv->lecid); +#if DUMP_PACKETS >= 2 + for(i=0;ilen && i <99;i++) { + sprintf(buf+i*3,"%2.2x ",0xff&skb->data[i]); + } +#elif DUMP_PACKETS >= 1 + for(i=0;ilen && i < 30;i++) { + sprintf(buf+i*3,"%2.2x ", 0xff&skb->data[i]); + } +#endif /* DUMP_PACKETS >= 1 */ + if (i==skb->len) + printk("%s\n",buf); + else + printk("%s...\n",buf); +#endif /* DUMP_PACKETS > 0 */ + + /* Minimum ethernet-frame size */ +#ifdef CONFIG_TR + if (priv->is_trdev) + min_frame_size = LEC_MINIMUM_8025_SIZE; + else +#endif + min_frame_size = LEC_MINIMUM_8023_SIZE; + if (skb->len < min_frame_size) { + if ((skb->len + skb_tailroom(skb)) < min_frame_size) { + skb2 = skb_copy_expand(skb, 0, + min_frame_size - skb->truesize, GFP_ATOMIC); + dev_kfree_skb(skb); + if (skb2 == NULL) { + priv->stats.tx_dropped++; + return 0; + } + skb = skb2; + } + skb_put(skb, min_frame_size - skb->len); + } + + /* Send to right vcc */ + is_rdesc = 0; + dst = lec_h->h_dest; +#ifdef CONFIG_TR + if (priv->is_trdev) { + dst = get_tr_dst(skb->data+2, rdesc); + if (dst == NULL) { + dst = rdesc; + is_rdesc = 1; + } + } +#endif + entry = NULL; + vcc = lec_arp_resolve(priv, dst, is_rdesc, &entry); + DPRINTK("%s:vcc:%p vcc_flags:%x, entry:%p\n", dev->name, + vcc, vcc?vcc->flags:0, entry); + if (!vcc || !test_bit(ATM_VF_READY,&vcc->flags)) { + if (entry && (entry->tx_wait.qlen < LEC_UNRES_QUE_LEN)) { + DPRINTK("%s:lec_start_xmit: queuing packet, ", dev->name); + DPRINTK("MAC address 0x%02x:%02x:%02x:%02x:%02x:%02x\n", + lec_h->h_dest[0], lec_h->h_dest[1], lec_h->h_dest[2], + lec_h->h_dest[3], lec_h->h_dest[4], lec_h->h_dest[5]); + skb_queue_tail(&entry->tx_wait, skb); + } else { + DPRINTK("%s:lec_start_xmit: tx queue full or no arp entry, dropping, ", dev->name); + DPRINTK("MAC address 0x%02x:%02x:%02x:%02x:%02x:%02x\n", + lec_h->h_dest[0], lec_h->h_dest[1], lec_h->h_dest[2], + lec_h->h_dest[3], lec_h->h_dest[4], lec_h->h_dest[5]); + priv->stats.tx_dropped++; + dev_kfree_skb(skb); + } + return 0; + } + +#if DUMP_PACKETS > 0 + printk("%s:sending to vpi:%d vci:%d\n", dev->name, + vcc->vpi, vcc->vci); +#endif /* DUMP_PACKETS > 0 */ + + while (entry && (skb2 = skb_dequeue(&entry->tx_wait))) { + DPRINTK("lec.c: emptying tx queue, "); + DPRINTK("MAC address 0x%02x:%02x:%02x:%02x:%02x:%02x\n", + lec_h->h_dest[0], lec_h->h_dest[1], lec_h->h_dest[2], + lec_h->h_dest[3], lec_h->h_dest[4], lec_h->h_dest[5]); + lec_send(vcc, skb2, priv); + } + + lec_send(vcc, skb, priv); + + if (!atm_may_send(vcc, 0)) { + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + + vpriv->xoff = 1; + netif_stop_queue(dev); + + /* + * vcc->pop() might have occurred in between, making + * the vcc usuable again. Since xmit is serialized, + * this is the only situation we have to re-test. + */ + + if (atm_may_send(vcc, 0)) + netif_wake_queue(dev); + } + + dev->trans_start = jiffies; + return 0; +} + +/* The inverse routine to net_open(). */ +static int +lec_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +/* + * Get the current statistics. + * This may be called with the card open or closed. 
+ */ +static struct net_device_stats * +lec_get_stats(struct net_device *dev) +{ + return &((struct lec_priv *)dev->priv)->stats; +} + +static int +lec_atm_send(struct atm_vcc *vcc, struct sk_buff *skb) +{ + unsigned long flags; + struct net_device *dev = (struct net_device*)vcc->proto_data; + struct lec_priv *priv = (struct lec_priv*)dev->priv; + struct atmlec_msg *mesg; + struct lec_arp_table *entry; + int i; + char *tmp; /* FIXME */ + + atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + mesg = (struct atmlec_msg *)skb->data; + tmp = skb->data; + tmp += sizeof(struct atmlec_msg); + DPRINTK("%s: msg from zeppelin:%d\n", dev->name, mesg->type); + switch(mesg->type) { + case l_set_mac_addr: + for (i=0;i<6;i++) { + dev->dev_addr[i] = mesg->content.normal.mac_addr[i]; + } + break; + case l_del_mac_addr: + for(i=0;i<6;i++) { + dev->dev_addr[i] = 0; + } + break; + case l_addr_delete: + lec_addr_delete(priv, mesg->content.normal.atm_addr, + mesg->content.normal.flag); + break; + case l_topology_change: + priv->topology_change = mesg->content.normal.flag; + break; + case l_flush_complete: + lec_flush_complete(priv, mesg->content.normal.flag); + break; + case l_narp_req: /* LANE2: see 7.1.35 in the lane2 spec */ + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = lec_arp_find(priv, mesg->content.normal.mac_addr); + lec_arp_remove(priv, entry); + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + if (mesg->content.normal.no_source_le_narp) + break; + /* FALL THROUGH */ + case l_arp_update: + lec_arp_update(priv, mesg->content.normal.mac_addr, + mesg->content.normal.atm_addr, + mesg->content.normal.flag, + mesg->content.normal.targetless_le_arp); + DPRINTK("lec: in l_arp_update\n"); + if (mesg->sizeoftlvs != 0) { /* LANE2 3.1.5 */ + DPRINTK("lec: LANE2 3.1.5, got tlvs, size %d\n", mesg->sizeoftlvs); + lane2_associate_ind(dev, + mesg->content.normal.mac_addr, + tmp, mesg->sizeoftlvs); + } + break; + case l_config: + priv->maximum_unknown_frame_count = + mesg->content.config.maximum_unknown_frame_count; + priv->max_unknown_frame_time = + (mesg->content.config.max_unknown_frame_time*HZ); + priv->max_retry_count = + mesg->content.config.max_retry_count; + priv->aging_time = (mesg->content.config.aging_time*HZ); + priv->forward_delay_time = + (mesg->content.config.forward_delay_time*HZ); + priv->arp_response_time = + (mesg->content.config.arp_response_time*HZ); + priv->flush_timeout = (mesg->content.config.flush_timeout*HZ); + priv->path_switching_delay = + (mesg->content.config.path_switching_delay*HZ); + priv->lane_version = mesg->content.config.lane_version; /* LANE2 */ + priv->lane2_ops = NULL; + if (priv->lane_version > 1) + priv->lane2_ops = &lane2_ops; + if (dev->change_mtu(dev, mesg->content.config.mtu)) + printk("%s: change_mtu to %d failed\n", dev->name, + mesg->content.config.mtu); + priv->is_proxy = mesg->content.config.is_proxy; + break; + case l_flush_tran_id: + lec_set_flush_tran_id(priv, mesg->content.normal.atm_addr, + mesg->content.normal.flag); + break; + case l_set_lecid: + priv->lecid=(unsigned short)(0xffff&mesg->content.normal.flag); + break; + case l_should_bridge: { +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) + struct net_bridge_fdb_entry *f; + + DPRINTK("%s: bridge zeppelin asks about 0x%02x:%02x:%02x:%02x:%02x:%02x\n", + dev->name, + mesg->content.proxy.mac_addr[0], mesg->content.proxy.mac_addr[1], + mesg->content.proxy.mac_addr[2], mesg->content.proxy.mac_addr[3], + mesg->content.proxy.mac_addr[4], mesg->content.proxy.mac_addr[5]); + + if 
(br_fdb_get_hook == NULL || dev->br_port == NULL) + break; + + f = br_fdb_get_hook(dev->br_port->br, mesg->content.proxy.mac_addr); + if (f != NULL && + f->dst->dev != dev && + f->dst->state == BR_STATE_FORWARDING) { + /* hit from bridge table, send LE_ARP_RESPONSE */ + struct sk_buff *skb2; + struct sock *sk; + + DPRINTK("%s: entry found, responding to zeppelin\n", dev->name); + skb2 = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); + if (skb2 == NULL) { + br_fdb_put_hook(f); + break; + } + skb2->len = sizeof(struct atmlec_msg); + memcpy(skb2->data, mesg, sizeof(struct atmlec_msg)); + atm_force_charge(priv->lecd, skb2->truesize); + sk = sk_atm(priv->lecd); + skb_queue_tail(&sk->sk_receive_queue, skb2); + sk->sk_data_ready(sk, skb2->len); + } + if (f != NULL) br_fdb_put_hook(f); +#endif /* defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) */ + } + break; + default: + printk("%s: Unknown message type %d\n", dev->name, mesg->type); + dev_kfree_skb(skb); + return -EINVAL; + } + dev_kfree_skb(skb); + return 0; +} + +static void +lec_atm_close(struct atm_vcc *vcc) +{ + struct sk_buff *skb; + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct lec_priv *priv = (struct lec_priv *)dev->priv; + + priv->lecd = NULL; + /* Do something needful? */ + + netif_stop_queue(dev); + lec_arp_destroy(priv); + + if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) + printk("%s lec_atm_close: closing with messages pending\n", + dev->name); + while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue)) != NULL) { + atm_return(vcc, skb->truesize); + dev_kfree_skb(skb); + } + + printk("%s: Shut down!\n", dev->name); + module_put(THIS_MODULE); +} + +static struct atmdev_ops lecdev_ops = { + .close = lec_atm_close, + .send = lec_atm_send +}; + +static struct atm_dev lecatm_dev = { + .ops = &lecdev_ops, + .type = "lec", + .number = 999, /* dummy device number */ + .lock = SPIN_LOCK_UNLOCKED +}; + +/* + * LANE2: new argument struct sk_buff *data contains + * the LE_ARP based TLVs introduced in the LANE2 spec + */ +static int +send_to_lecd(struct lec_priv *priv, atmlec_msg_type type, + unsigned char *mac_addr, unsigned char *atm_addr, + struct sk_buff *data) +{ + struct sock *sk; + struct sk_buff *skb; + struct atmlec_msg *mesg; + + if (!priv || !priv->lecd) { + return -1; + } + skb = alloc_skb(sizeof(struct atmlec_msg), GFP_ATOMIC); + if (!skb) + return -1; + skb->len = sizeof(struct atmlec_msg); + mesg = (struct atmlec_msg *)skb->data; + memset(mesg, 0, sizeof(struct atmlec_msg)); + mesg->type = type; + if (data != NULL) + mesg->sizeoftlvs = data->len; + if (mac_addr) + memcpy(&mesg->content.normal.mac_addr, mac_addr, ETH_ALEN); + else + mesg->content.normal.targetless_le_arp = 1; + if (atm_addr) + memcpy(&mesg->content.normal.atm_addr, atm_addr, ATM_ESA_LEN); + + atm_force_charge(priv->lecd, skb->truesize); + sk = sk_atm(priv->lecd); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + if (data != NULL) { + DPRINTK("lec: about to send %d bytes of data\n", data->len); + atm_force_charge(priv->lecd, data->truesize); + skb_queue_tail(&sk->sk_receive_queue, data); + sk->sk_data_ready(sk, skb->len); + } + + return 0; +} + +/* shamelessly stolen from drivers/net/net_init.c */ +static int lec_change_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < 68) || (new_mtu > 18190)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void lec_set_multicast_list(struct net_device *dev) +{ + /* by default, all multicast frames arrive over the bus. 
+ * eventually support selective multicast service + */ + return; +} + +static void +lec_init(struct net_device *dev) +{ + dev->change_mtu = lec_change_mtu; + dev->open = lec_open; + dev->stop = lec_close; + dev->hard_start_xmit = lec_start_xmit; + dev->tx_timeout = lec_tx_timeout; + + dev->get_stats = lec_get_stats; + dev->set_multicast_list = lec_set_multicast_list; + dev->do_ioctl = NULL; + printk("%s: Initialized!\n",dev->name); + return; +} + +static unsigned char lec_ctrl_magic[] = { + 0xff, + 0x00, + 0x01, + 0x01 }; + +static void +lec_push(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct lec_priv *priv = (struct lec_priv *)dev->priv; + +#if DUMP_PACKETS >0 + int i=0; + char buf[300]; + + printk("%s: lec_push vcc vpi:%d vci:%d\n", dev->name, + vcc->vpi, vcc->vci); +#endif + if (!skb) { + DPRINTK("%s: null skb\n",dev->name); + lec_vcc_close(priv, vcc); + return; + } +#if DUMP_PACKETS > 0 + printk("%s: rcv datalen:%ld lecid:%4.4x\n", dev->name, + skb->len, priv->lecid); +#if DUMP_PACKETS >= 2 + for(i=0;i<skb->len && i <99;i++) { + sprintf(buf+i*3,"%2.2x ",0xff&skb->data[i]); + } +#elif DUMP_PACKETS >= 1 + for(i=0;i<skb->len && i < 30;i++) { + sprintf(buf+i*3,"%2.2x ", 0xff&skb->data[i]); + } +#endif /* DUMP_PACKETS >= 1 */ + if (i==skb->len) + printk("%s\n",buf); + else + printk("%s...\n",buf); +#endif /* DUMP_PACKETS > 0 */ + if (memcmp(skb->data, lec_ctrl_magic, 4) ==0) { /* Control frame, to daemon*/ + struct sock *sk = sk_atm(vcc); + + DPRINTK("%s: To daemon\n",dev->name); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + } else { /* Data frame, queue to protocol handlers */ + unsigned char *dst; + + atm_return(vcc,skb->truesize); + if (*(uint16_t *)skb->data == htons(priv->lecid) || + !priv->lecd || + !(dev->flags & IFF_UP)) { + /* Probably looping back, or if lecd is missing, + lecd has gone down */ + DPRINTK("Ignoring frame...\n"); + dev_kfree_skb(skb); + return; + } +#ifdef CONFIG_TR + if (priv->is_trdev) dst = ((struct lecdatahdr_8025 *)skb->data)->h_dest; + else +#endif + dst = ((struct lecdatahdr_8023 *)skb->data)->h_dest; + + if (!(dst[0]&0x01) && /* Never filter Multi/Broadcast */ + !priv->is_proxy && /* Proxy wants all the packets */ + memcmp(dst, dev->dev_addr, dev->addr_len)) { + dev_kfree_skb(skb); + return; + } + if (priv->lec_arp_empty_ones) { + lec_arp_check_empties(priv, vcc, skb); + } + skb->dev = dev; + skb_pull(skb, 2); /* skip lec_id */ +#ifdef CONFIG_TR + if (priv->is_trdev) skb->protocol = tr_type_trans(skb, dev); + else +#endif + skb->protocol = eth_type_trans(skb, dev); + priv->stats.rx_packets++; + priv->stats.rx_bytes += skb->len; + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(skb); + } +} + +static void +lec_pop(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + struct net_device *dev = skb->dev; + + if (vpriv == NULL) { + printk("lec_pop(): vpriv = NULL!?!?!?\n"); + return; + } + + vpriv->old_pop(vcc, skb); + + if (vpriv->xoff && atm_may_send(vcc, 0)) { + vpriv->xoff = 0; + if (netif_running(dev) && netif_queue_stopped(dev)) + netif_wake_queue(dev); + } +} + +static int +lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) +{ + struct lec_vcc_priv *vpriv; + int bytes_left; + struct atmlec_ioc ioc_data; + + /* Lecd must be up in this case */ + bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmlec_ioc)); + if (bytes_left != 0) { + printk("lec: lec_vcc_attach, copy from user failed for %d
bytes\n", + bytes_left); + } + if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF || + !dev_lec[ioc_data.dev_num]) + return -EINVAL; + if (!(vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL))) + return -ENOMEM; + vpriv->xoff = 0; + vpriv->old_pop = vcc->pop; + vcc->user_back = vpriv; + vcc->pop = lec_pop; + lec_vcc_added(dev_lec[ioc_data.dev_num]->priv, + &ioc_data, vcc, vcc->push); + vcc->proto_data = dev_lec[ioc_data.dev_num]; + vcc->push = lec_push; + return 0; +} + +static int +lec_mcast_attach(struct atm_vcc *vcc, int arg) +{ + if (arg <0 || arg >= MAX_LEC_ITF || !dev_lec[arg]) + return -EINVAL; + vcc->proto_data = dev_lec[arg]; + return (lec_mcast_make((struct lec_priv*)dev_lec[arg]->priv, vcc)); +} + +/* Initialize device. */ +static int +lecd_attach(struct atm_vcc *vcc, int arg) +{ + int i; + struct lec_priv *priv; + + if (arg<0) + i = 0; + else + i = arg; +#ifdef CONFIG_TR + if (arg >= MAX_LEC_ITF) + return -EINVAL; +#else /* Reserve the top NUM_TR_DEVS for TR */ + if (arg >= (MAX_LEC_ITF-NUM_TR_DEVS)) + return -EINVAL; +#endif + if (!dev_lec[i]) { + int is_trdev, size; + + is_trdev = 0; + if (i >= (MAX_LEC_ITF - NUM_TR_DEVS)) + is_trdev = 1; + + size = sizeof(struct lec_priv); +#ifdef CONFIG_TR + if (is_trdev) + dev_lec[i] = alloc_trdev(size); + else +#endif + dev_lec[i] = alloc_etherdev(size); + if (!dev_lec[i]) + return -ENOMEM; + snprintf(dev_lec[i]->name, IFNAMSIZ, "lec%d", i); + if (register_netdev(dev_lec[i])) { + free_netdev(dev_lec[i]); + return -EINVAL; + } + + priv = dev_lec[i]->priv; + priv->is_trdev = is_trdev; + lec_init(dev_lec[i]); + } else { + priv = dev_lec[i]->priv; + if (priv->lecd) + return -EADDRINUSE; + } + lec_arp_init(priv); + priv->itfnum = i; /* LANE2 addition */ + priv->lecd = vcc; + vcc->dev = &lecatm_dev; + vcc_insert_socket(sk_atm(vcc)); + + vcc->proto_data = dev_lec[i]; + set_bit(ATM_VF_META,&vcc->flags); + set_bit(ATM_VF_READY,&vcc->flags); + + /* Set default values to these variables */ + priv->maximum_unknown_frame_count = 1; + priv->max_unknown_frame_time = (1*HZ); + priv->vcc_timeout_period = (1200*HZ); + priv->max_retry_count = 1; + priv->aging_time = (300*HZ); + priv->forward_delay_time = (15*HZ); + priv->topology_change = 0; + priv->arp_response_time = (1*HZ); + priv->flush_timeout = (4*HZ); + priv->path_switching_delay = (6*HZ); + + if (dev_lec[i]->flags & IFF_UP) { + netif_start_queue(dev_lec[i]); + } + __module_get(THIS_MODULE); + return i; +} + +#ifdef CONFIG_PROC_FS +static char* lec_arp_get_status_string(unsigned char status) +{ + static char *lec_arp_status_string[] = { + "ESI_UNKNOWN ", + "ESI_ARP_PENDING ", + "ESI_VC_PENDING ", + " ", + "ESI_FLUSH_PENDING ", + "ESI_FORWARD_DIRECT" + }; + + if (status > ESI_FORWARD_DIRECT) + status = 3; /* ESI_UNDEFINED */ + return lec_arp_status_string[status]; +} + +static void lec_info(struct seq_file *seq, struct lec_arp_table *entry) +{ + int i; + + for (i = 0; i < ETH_ALEN; i++) + seq_printf(seq, "%2.2x", entry->mac_addr[i] & 0xff); + seq_printf(seq, " "); + for (i = 0; i < ATM_ESA_LEN; i++) + seq_printf(seq, "%2.2x", entry->atm_addr[i] & 0xff); + seq_printf(seq, " %s %4.4x", lec_arp_get_status_string(entry->status), + entry->flags & 0xffff); + if (entry->vcc) + seq_printf(seq, "%3d %3d ", entry->vcc->vpi, entry->vcc->vci); + else + seq_printf(seq, " "); + if (entry->recv_vcc) { + seq_printf(seq, " %3d %3d", entry->recv_vcc->vpi, + entry->recv_vcc->vci); + } + seq_putc(seq, '\n'); +} + + +struct lec_state { + unsigned long flags; + struct lec_priv *locked; + struct lec_arp_table 
*entry; + struct net_device *dev; + int itf; + int arp_table; + int misc_table; +}; + +static void *lec_tbl_walk(struct lec_state *state, struct lec_arp_table *tbl, + loff_t *l) +{ + struct lec_arp_table *e = state->entry; + + if (!e) + e = tbl; + if (e == (void *)1) { + e = tbl; + --*l; + } + for (; e; e = e->next) { + if (--*l < 0) + break; + } + state->entry = e; + return (*l < 0) ? state : NULL; +} + +static void *lec_arp_walk(struct lec_state *state, loff_t *l, + struct lec_priv *priv) +{ + void *v = NULL; + int p; + + for (p = state->arp_table; p < LEC_ARP_TABLE_SIZE; p++) { + v = lec_tbl_walk(state, priv->lec_arp_tables[p], l); + if (v) + break; + } + state->arp_table = p; + return v; +} + +static void *lec_misc_walk(struct lec_state *state, loff_t *l, + struct lec_priv *priv) +{ + struct lec_arp_table *lec_misc_tables[] = { + priv->lec_arp_empty_ones, + priv->lec_no_forward, + priv->mcast_fwds + }; + void *v = NULL; + int q; + + for (q = state->misc_table; q < ARRAY_SIZE(lec_misc_tables); q++) { + v = lec_tbl_walk(state, lec_misc_tables[q], l); + if (v) + break; + } + state->misc_table = q; + return v; +} + +static void *lec_priv_walk(struct lec_state *state, loff_t *l, + struct lec_priv *priv) +{ + if (!state->locked) { + state->locked = priv; + spin_lock_irqsave(&priv->lec_arp_lock, state->flags); + } + if (!lec_arp_walk(state, l, priv) && + !lec_misc_walk(state, l, priv)) { + spin_unlock_irqrestore(&priv->lec_arp_lock, state->flags); + state->locked = NULL; + /* Partial state reset for the next time we get called */ + state->arp_table = state->misc_table = 0; + } + return state->locked; +} + +static void *lec_itf_walk(struct lec_state *state, loff_t *l) +{ + struct net_device *dev; + void *v; + + dev = state->dev ? state->dev : dev_lec[state->itf]; + v = (dev && dev->priv) ? lec_priv_walk(state, l, dev->priv) : NULL; + if (!v && dev) { + dev_put(dev); + /* Partial state reset for the next time we get called */ + dev = NULL; + } + state->dev = dev; + return v; +} + +static void *lec_get_idx(struct lec_state *state, loff_t l) +{ + void *v = NULL; + + for (; state->itf < MAX_LEC_ITF; state->itf++) { + v = lec_itf_walk(state, &l); + if (v) + break; + } + return v; +} + +static void *lec_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct lec_state *state = seq->private; + + state->itf = 0; + state->dev = NULL; + state->locked = NULL; + state->arp_table = 0; + state->misc_table = 0; + state->entry = (void *)1; + + return *pos ? 
lec_get_idx(state, *pos) : (void*)1; +} + +static void lec_seq_stop(struct seq_file *seq, void *v) +{ + struct lec_state *state = seq->private; + + if (state->dev) { + spin_unlock_irqrestore(&state->locked->lec_arp_lock, + state->flags); + dev_put(state->dev); + } +} + +static void *lec_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct lec_state *state = seq->private; + + v = lec_get_idx(state, 1); + *pos += !!PTR_ERR(v); + return v; +} + +static int lec_seq_show(struct seq_file *seq, void *v) +{ + static char lec_banner[] = "Itf MAC ATM destination" + " Status Flags " + "VPI/VCI Recv VPI/VCI\n"; + + if (v == (void *)1) + seq_puts(seq, lec_banner); + else { + struct lec_state *state = seq->private; + struct net_device *dev = state->dev; + + seq_printf(seq, "%s ", dev->name); + lec_info(seq, state->entry); + } + return 0; +} + +static struct seq_operations lec_seq_ops = { + .start = lec_seq_start, + .next = lec_seq_next, + .stop = lec_seq_stop, + .show = lec_seq_show, +}; + +static int lec_seq_open(struct inode *inode, struct file *file) +{ + struct lec_state *state; + struct seq_file *seq; + int rc = -EAGAIN; + + state = kmalloc(sizeof(*state), GFP_KERNEL); + if (!state) { + rc = -ENOMEM; + goto out; + } + + rc = seq_open(file, &lec_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = state; +out: + return rc; + +out_kfree: + kfree(state); + goto out; +} + +static int lec_seq_release(struct inode *inode, struct file *file) +{ + return seq_release_private(inode, file); +} + +static struct file_operations lec_seq_fops = { + .owner = THIS_MODULE, + .open = lec_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = lec_seq_release, +}; +#endif + +static int lane_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct atm_vcc *vcc = ATM_SD(sock); + int err = 0; + + switch (cmd) { + case ATMLEC_CTRL: + case ATMLEC_MCAST: + case ATMLEC_DATA: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + break; + default: + return -ENOIOCTLCMD; + } + + switch (cmd) { + case ATMLEC_CTRL: + err = lecd_attach(vcc, (int) arg); + if (err >= 0) + sock->state = SS_CONNECTED; + break; + case ATMLEC_MCAST: + err = lec_mcast_attach(vcc, (int) arg); + break; + case ATMLEC_DATA: + err = lec_vcc_attach(vcc, (void __user *) arg); + break; + } + + return err; +} + +static struct atm_ioctl lane_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = lane_ioctl, +}; + +static int __init lane_module_init(void) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *p; + + p = create_proc_entry("lec", S_IRUGO, atm_proc_root); + if (p) + p->proc_fops = &lec_seq_fops; +#endif + + register_atm_ioctl(&lane_ioctl_ops); + printk("lec.c: " __DATE__ " " __TIME__ " initialized\n"); + return 0; +} + +static void __exit lane_module_cleanup(void) +{ + int i; + struct lec_priv *priv; + + remove_proc_entry("lec", atm_proc_root); + + deregister_atm_ioctl(&lane_ioctl_ops); + + for (i = 0; i < MAX_LEC_ITF; i++) { + if (dev_lec[i] != NULL) { + priv = (struct lec_priv *)dev_lec[i]->priv; + unregister_netdev(dev_lec[i]); + free_netdev(dev_lec[i]); + dev_lec[i] = NULL; + } + } + + return; +} + +module_init(lane_module_init); +module_exit(lane_module_cleanup); + +/* + * LANE2: 3.1.3, LE_RESOLVE.request + * Non force allocates memory and fills in *tlvs, fills in *sizeoftlvs. + * If sizeoftlvs == NULL the default TLVs associated with with this + * lec will be used. 
+ * If dst_mac == NULL, targetless LE_ARP will be sent + */ +static int lane2_resolve(struct net_device *dev, u8 *dst_mac, int force, + u8 **tlvs, u32 *sizeoftlvs) +{ + unsigned long flags; + struct lec_priv *priv = (struct lec_priv *)dev->priv; + struct lec_arp_table *table; + struct sk_buff *skb; + int retval; + + if (force == 0) { + spin_lock_irqsave(&priv->lec_arp_lock, flags); + table = lec_arp_find(priv, dst_mac); + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + if(table == NULL) + return -1; + + *tlvs = kmalloc(table->sizeoftlvs, GFP_ATOMIC); + if (*tlvs == NULL) + return -1; + + memcpy(*tlvs, table->tlvs, table->sizeoftlvs); + *sizeoftlvs = table->sizeoftlvs; + + return 0; + } + + if (sizeoftlvs == NULL) + retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, NULL); + + else { + skb = alloc_skb(*sizeoftlvs, GFP_ATOMIC); + if (skb == NULL) + return -1; + skb->len = *sizeoftlvs; + memcpy(skb->data, *tlvs, *sizeoftlvs); + retval = send_to_lecd(priv, l_arp_xmt, dst_mac, NULL, skb); + } + return retval; +} + + +/* + * LANE2: 3.1.4, LE_ASSOCIATE.request + * Associate the *tlvs with the *lan_dst address. + * Will overwrite any previous association + * Returns 1 for success, 0 for failure (out of memory) + * + */ +static int lane2_associate_req (struct net_device *dev, u8 *lan_dst, + u8 *tlvs, u32 sizeoftlvs) +{ + int retval; + struct sk_buff *skb; + struct lec_priv *priv = (struct lec_priv*)dev->priv; + + if ( memcmp(lan_dst, dev->dev_addr, ETH_ALEN) != 0 ) + return (0); /* not our mac address */ + + kfree(priv->tlvs); /* NULL if there was no previous association */ + + priv->tlvs = kmalloc(sizeoftlvs, GFP_KERNEL); + if (priv->tlvs == NULL) + return (0); + priv->sizeoftlvs = sizeoftlvs; + memcpy(priv->tlvs, tlvs, sizeoftlvs); + + skb = alloc_skb(sizeoftlvs, GFP_ATOMIC); + if (skb == NULL) + return 0; + skb->len = sizeoftlvs; + memcpy(skb->data, tlvs, sizeoftlvs); + retval = send_to_lecd(priv, l_associate_req, NULL, NULL, skb); + if (retval != 0) + printk("lec.c: lane2_associate_req() failed\n"); + /* If the previous association has changed we must + * somehow notify other LANE entities about the change + */ + return (1); +} + +/* + * LANE2: 3.1.5, LE_ASSOCIATE.indication + * + */ +static void lane2_associate_ind (struct net_device *dev, u8 *mac_addr, + u8 *tlvs, u32 sizeoftlvs) +{ +#if 0 + int i = 0; +#endif + struct lec_priv *priv = (struct lec_priv *)dev->priv; +#if 0 /* Why have the TLVs in LE_ARP entries since we do not use them? When you + uncomment this code, make sure the TLVs get freed when entry is killed */ + struct lec_arp_table *entry = lec_arp_find(priv, mac_addr); + + if (entry == NULL) + return; /* should not happen */ + + kfree(entry->tlvs); + + entry->tlvs = kmalloc(sizeoftlvs, GFP_KERNEL); + if (entry->tlvs == NULL) + return; + + entry->sizeoftlvs = sizeoftlvs; + memcpy(entry->tlvs, tlvs, sizeoftlvs); +#endif +#if 0 + printk("lec.c: lane2_associate_ind()\n"); + printk("dump of tlvs, sizeoftlvs=%d\n", sizeoftlvs); + while (i < sizeoftlvs) + printk("%02x ", tlvs[i++]); + + printk("\n"); +#endif + + /* tell MPOA about the TLVs we saw */ + if (priv->lane2_ops && priv->lane2_ops->associate_indicator) { + priv->lane2_ops->associate_indicator(dev, mac_addr, + tlvs, sizeoftlvs); + } + return; +} + +/* + * Here starts what used to lec_arpc.c + * + * lec_arpc.c was added here when making + * lane client modular. October 1997 + * + */ + +#include +#include +#include +#include +#include +#include +#include + + +#if 0 +#define DPRINTK(format,args...) 
+/* +#define DPRINTK printk +*/ +#endif +#define DEBUG_ARP_TABLE 0 + +#define LEC_ARP_REFRESH_INTERVAL (3*HZ) + +static void lec_arp_check_expire(unsigned long data); +static void lec_arp_expire_arp(unsigned long data); + +/* + * Arp table funcs + */ + +#define HASH(ch) (ch & (LEC_ARP_TABLE_SIZE -1)) + +/* + * Initialization of arp-cache + */ +static void +lec_arp_init(struct lec_priv *priv) +{ + unsigned short i; + + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + priv->lec_arp_tables[i] = NULL; + } + spin_lock_init(&priv->lec_arp_lock); + init_timer(&priv->lec_arp_timer); + priv->lec_arp_timer.expires = jiffies + LEC_ARP_REFRESH_INTERVAL; + priv->lec_arp_timer.data = (unsigned long)priv; + priv->lec_arp_timer.function = lec_arp_check_expire; + add_timer(&priv->lec_arp_timer); +} + +static void +lec_arp_clear_vccs(struct lec_arp_table *entry) +{ + if (entry->vcc) { + struct atm_vcc *vcc = entry->vcc; + struct lec_vcc_priv *vpriv = LEC_VCC_PRIV(vcc); + struct net_device *dev = (struct net_device*) vcc->proto_data; + + vcc->pop = vpriv->old_pop; + if (vpriv->xoff) + netif_wake_queue(dev); + kfree(vpriv); + vcc->user_back = NULL; + vcc->push = entry->old_push; + vcc_release_async(vcc, -EPIPE); + vcc = NULL; + } + if (entry->recv_vcc) { + entry->recv_vcc->push = entry->old_recv_push; + vcc_release_async(entry->recv_vcc, -EPIPE); + entry->recv_vcc = NULL; + } +} + +/* + * Insert entry to lec_arp_table + * LANE2: Add to the end of the list to satisfy 8.1.13 + */ +static inline void +lec_arp_add(struct lec_priv *priv, struct lec_arp_table *to_add) +{ + unsigned short place; + struct lec_arp_table *tmp; + + place = HASH(to_add->mac_addr[ETH_ALEN-1]); + tmp = priv->lec_arp_tables[place]; + to_add->next = NULL; + if (tmp == NULL) + priv->lec_arp_tables[place] = to_add; + + else { /* add to the end */ + while (tmp->next) + tmp = tmp->next; + tmp->next = to_add; + } + + DPRINTK("LEC_ARP: Added entry:%2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", + 0xff&to_add->mac_addr[0], 0xff&to_add->mac_addr[1], + 0xff&to_add->mac_addr[2], 0xff&to_add->mac_addr[3], + 0xff&to_add->mac_addr[4], 0xff&to_add->mac_addr[5]); +} + +/* + * Remove entry from lec_arp_table + */ +static int +lec_arp_remove(struct lec_priv *priv, + struct lec_arp_table *to_remove) +{ + unsigned short place; + struct lec_arp_table *tmp; + int remove_vcc=1; + + if (!to_remove) { + return -1; + } + place = HASH(to_remove->mac_addr[ETH_ALEN-1]); + tmp = priv->lec_arp_tables[place]; + if (tmp == to_remove) { + priv->lec_arp_tables[place] = tmp->next; + } else { + while(tmp && tmp->next != to_remove) { + tmp = tmp->next; + } + if (!tmp) {/* Entry was not found */ + return -1; + } + } + tmp->next = to_remove->next; + del_timer(&to_remove->timer); + + /* If this is the only MAC connected to this VCC, also tear down + the VCC */ + if (to_remove->status >= ESI_FLUSH_PENDING) { + /* + * ESI_FLUSH_PENDING, ESI_FORWARD_DIRECT + */ + for(place = 0; place < LEC_ARP_TABLE_SIZE; place++) { + for(tmp = priv->lec_arp_tables[place]; tmp != NULL; tmp = tmp->next) { + if (memcmp(tmp->atm_addr, to_remove->atm_addr, + ATM_ESA_LEN)==0) { + remove_vcc=0; + break; + } + } + } + if (remove_vcc) + lec_arp_clear_vccs(to_remove); + } + skb_queue_purge(&to_remove->tx_wait); /* FIXME: good place for this? 
*/ + + DPRINTK("LEC_ARP: Removed entry:%2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", + 0xff&to_remove->mac_addr[0], 0xff&to_remove->mac_addr[1], + 0xff&to_remove->mac_addr[2], 0xff&to_remove->mac_addr[3], + 0xff&to_remove->mac_addr[4], 0xff&to_remove->mac_addr[5]); + return 0; +} + +#if DEBUG_ARP_TABLE +static char* +get_status_string(unsigned char st) +{ + switch(st) { + case ESI_UNKNOWN: + return "ESI_UNKNOWN"; + case ESI_ARP_PENDING: + return "ESI_ARP_PENDING"; + case ESI_VC_PENDING: + return "ESI_VC_PENDING"; + case ESI_FLUSH_PENDING: + return "ESI_FLUSH_PENDING"; + case ESI_FORWARD_DIRECT: + return "ESI_FORWARD_DIRECT"; + default: + return ""; + } +} +#endif + +static void +dump_arp_table(struct lec_priv *priv) +{ +#if DEBUG_ARP_TABLE + int i,j, offset; + struct lec_arp_table *rulla; + char buf[1024]; + struct lec_arp_table **lec_arp_tables = + (struct lec_arp_table **)priv->lec_arp_tables; + struct lec_arp_table *lec_arp_empty_ones = + (struct lec_arp_table *)priv->lec_arp_empty_ones; + struct lec_arp_table *lec_no_forward = + (struct lec_arp_table *)priv->lec_no_forward; + struct lec_arp_table *mcast_fwds = priv->mcast_fwds; + + + printk("Dump %p:\n",priv); + for (i=0;imac_addr[j]&0xff); + } + offset +=sprintf(buf+offset,"Atm:"); + for(j=0;jatm_addr[j]&0xff); + } + offset+=sprintf(buf+offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc?rulla->vcc->vpi:0, + rulla->vcc?rulla->vcc->vci:0, + rulla->recv_vcc?rulla->recv_vcc->vpi:0, + rulla->recv_vcc?rulla->recv_vcc->vci:0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset+=sprintf(buf+offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + offset+=sprintf(buf+offset,"->%p\n",rulla->next); + rulla = rulla->next; + } + printk("%s",buf); + } + rulla = lec_no_forward; + if (rulla) + printk("No forward\n"); + while(rulla) { + offset=0; + offset += sprintf(buf+offset,"Mac:"); + for(j=0;jmac_addr[j]&0xff); + } + offset +=sprintf(buf+offset,"Atm:"); + for(j=0;jatm_addr[j]&0xff); + } + offset+=sprintf(buf+offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc?rulla->vcc->vpi:0, + rulla->vcc?rulla->vcc->vci:0, + rulla->recv_vcc?rulla->recv_vcc->vpi:0, + rulla->recv_vcc?rulla->recv_vcc->vci:0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset+=sprintf(buf+offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + offset+=sprintf(buf+offset,"->%lx\n",(long)rulla->next); + rulla = rulla->next; + printk("%s",buf); + } + rulla = lec_arp_empty_ones; + if (rulla) + printk("Empty ones\n"); + while(rulla) { + offset=0; + offset += sprintf(buf+offset,"Mac:"); + for(j=0;jmac_addr[j]&0xff); + } + offset +=sprintf(buf+offset,"Atm:"); + for(j=0;jatm_addr[j]&0xff); + } + offset+=sprintf(buf+offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc?rulla->vcc->vpi:0, + rulla->vcc?rulla->vcc->vci:0, + rulla->recv_vcc?rulla->recv_vcc->vpi:0, + rulla->recv_vcc?rulla->recv_vcc->vci:0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset+=sprintf(buf+offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + offset+=sprintf(buf+offset,"->%lx\n",(long)rulla->next); + rulla = rulla->next; + printk("%s",buf); + } + + rulla = mcast_fwds; + if 
(rulla) + printk("Multicast Forward VCCs\n"); + while(rulla) { + offset=0; + offset += sprintf(buf+offset,"Mac:"); + for(j=0;jmac_addr[j]&0xff); + } + offset +=sprintf(buf+offset,"Atm:"); + for(j=0;jatm_addr[j]&0xff); + } + offset+=sprintf(buf+offset, + "Vcc vpi:%d vci:%d, Recv_vcc vpi:%d vci:%d Last_used:%lx, Timestamp:%lx, No_tries:%d ", + rulla->vcc?rulla->vcc->vpi:0, + rulla->vcc?rulla->vcc->vci:0, + rulla->recv_vcc?rulla->recv_vcc->vpi:0, + rulla->recv_vcc?rulla->recv_vcc->vci:0, + rulla->last_used, + rulla->timestamp, rulla->no_tries); + offset+=sprintf(buf+offset, + "Flags:%x, Packets_flooded:%x, Status: %s ", + rulla->flags, rulla->packets_flooded, + get_status_string(rulla->status)); + offset+=sprintf(buf+offset,"->%lx\n",(long)rulla->next); + rulla = rulla->next; + printk("%s",buf); + } + +#endif +} + +/* + * Destruction of arp-cache + */ +static void +lec_arp_destroy(struct lec_priv *priv) +{ + unsigned long flags; + struct lec_arp_table *entry, *next; + int i; + + del_timer_sync(&priv->lec_arp_timer); + + /* + * Remove all entries + */ + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + for(entry = priv->lec_arp_tables[i]; entry != NULL; entry=next) { + next = entry->next; + lec_arp_remove(priv, entry); + kfree(entry); + } + } + entry = priv->lec_arp_empty_ones; + while(entry) { + next = entry->next; + del_timer_sync(&entry->timer); + lec_arp_clear_vccs(entry); + kfree(entry); + entry = next; + } + priv->lec_arp_empty_ones = NULL; + entry = priv->lec_no_forward; + while(entry) { + next = entry->next; + del_timer_sync(&entry->timer); + lec_arp_clear_vccs(entry); + kfree(entry); + entry = next; + } + priv->lec_no_forward = NULL; + entry = priv->mcast_fwds; + while(entry) { + next = entry->next; + /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ + lec_arp_clear_vccs(entry); + kfree(entry); + entry = next; + } + priv->mcast_fwds = NULL; + priv->mcast_vcc = NULL; + memset(priv->lec_arp_tables, 0, + sizeof(struct lec_arp_table *) * LEC_ARP_TABLE_SIZE); + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + + +/* + * Find entry by mac_address + */ +static struct lec_arp_table* +lec_arp_find(struct lec_priv *priv, + unsigned char *mac_addr) +{ + unsigned short place; + struct lec_arp_table *to_return; + + DPRINTK("LEC_ARP: lec_arp_find :%2.2x %2.2x %2.2x %2.2x %2.2x %2.2x\n", + mac_addr[0]&0xff, mac_addr[1]&0xff, mac_addr[2]&0xff, + mac_addr[3]&0xff, mac_addr[4]&0xff, mac_addr[5]&0xff); + place = HASH(mac_addr[ETH_ALEN-1]); + + to_return = priv->lec_arp_tables[place]; + while(to_return) { + if (memcmp(mac_addr, to_return->mac_addr, ETH_ALEN) == 0) { + return to_return; + } + to_return = to_return->next; + } + return NULL; +} + +static struct lec_arp_table* +make_entry(struct lec_priv *priv, unsigned char *mac_addr) +{ + struct lec_arp_table *to_return; + + to_return = (struct lec_arp_table *) kmalloc(sizeof(struct lec_arp_table), + GFP_ATOMIC); + if (!to_return) { + printk("LEC: Arp entry kmalloc failed\n"); + return NULL; + } + memset(to_return, 0, sizeof(struct lec_arp_table)); + memcpy(to_return->mac_addr, mac_addr, ETH_ALEN); + init_timer(&to_return->timer); + to_return->timer.function = lec_arp_expire_arp; + to_return->timer.data = (unsigned long) to_return; + to_return->last_used = jiffies; + to_return->priv = priv; + skb_queue_head_init(&to_return->tx_wait); + return to_return; +} + +/* + * + * Arp sent timer expired + * + */ +static void +lec_arp_expire_arp(unsigned long data) +{ + struct lec_arp_table *entry; + + entry = (struct 
lec_arp_table *)data; + + DPRINTK("lec_arp_expire_arp\n"); + if (entry->status == ESI_ARP_PENDING) { + if (entry->no_tries <= entry->priv->max_retry_count) { + if (entry->is_rdesc) + send_to_lecd(entry->priv, l_rdesc_arp_xmt, entry->mac_addr, NULL, NULL); + else + send_to_lecd(entry->priv, l_arp_xmt, entry->mac_addr, NULL, NULL); + entry->no_tries++; + } + mod_timer(&entry->timer, jiffies + (1*HZ)); + } +} + +/* + * + * Unknown/unused vcc expire, remove associated entry + * + */ +static void +lec_arp_expire_vcc(unsigned long data) +{ + unsigned long flags; + struct lec_arp_table *to_remove = (struct lec_arp_table*)data; + struct lec_priv *priv = (struct lec_priv *)to_remove->priv; + struct lec_arp_table *entry = NULL; + + del_timer(&to_remove->timer); + + DPRINTK("LEC_ARP %p %p: lec_arp_expire_vcc vpi:%d vci:%d\n", + to_remove, priv, + to_remove->vcc?to_remove->recv_vcc->vpi:0, + to_remove->vcc?to_remove->recv_vcc->vci:0); + DPRINTK("eo:%p nf:%p\n",priv->lec_arp_empty_ones,priv->lec_no_forward); + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + if (to_remove == priv->lec_arp_empty_ones) + priv->lec_arp_empty_ones = to_remove->next; + else { + entry = priv->lec_arp_empty_ones; + while (entry && entry->next != to_remove) + entry = entry->next; + if (entry) + entry->next = to_remove->next; + } + if (!entry) { + if (to_remove == priv->lec_no_forward) { + priv->lec_no_forward = to_remove->next; + } else { + entry = priv->lec_no_forward; + while (entry && entry->next != to_remove) + entry = entry->next; + if (entry) + entry->next = to_remove->next; + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + lec_arp_clear_vccs(to_remove); + kfree(to_remove); +} + +/* + * Expire entries. + * 1. Re-set timer + * 2. For each entry, delete entries that have aged past the age limit. + * 3. For each entry, depending on the status of the entry, perform + * the following maintenance. + * a. If status is ESI_VC_PENDING or ESI_ARP_PENDING then if the + * tick_count is above the max_unknown_frame_time, clear + * the tick_count to zero and clear the packets_flooded counter + * to zero. This supports the packet rate limit per address + * while flooding unknowns. + * b. If the status is ESI_FLUSH_PENDING and the tick_count is greater + * than or equal to the path_switching_delay, change the status + * to ESI_FORWARD_DIRECT. This causes the flush period to end + * regardless of the progress of the flush protocol. 
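 * For example, with the defaults set in lecd_attach() (aging_time = 300*HZ, forward_delay_time = 15*HZ), an idle non-permanent unicast entry is aged out after roughly five minutes, or after roughly 15 seconds for a LEC_REMOTE_FLAG entry while topology_change is set.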
*/ +static void +lec_arp_check_expire(unsigned long data) +{ + unsigned long flags; + struct lec_priv *priv = (struct lec_priv *)data; + struct lec_arp_table *entry, *next; + unsigned long now; + unsigned long time_to_check; + int i; + + DPRINTK("lec_arp_check_expire %p\n",priv); + DPRINTK("expire: eo:%p nf:%p\n",priv->lec_arp_empty_ones, + priv->lec_no_forward); + now = jiffies; + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for(i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + for(entry = priv->lec_arp_tables[i]; entry != NULL; ) { + if ((entry->flags) & LEC_REMOTE_FLAG && + priv->topology_change) + time_to_check = priv->forward_delay_time; + else + time_to_check = priv->aging_time; + + DPRINTK("About to expire: %lx - %lx > %lx\n", + now,entry->last_used, time_to_check); + if( time_after(now, entry->last_used+ + time_to_check) && + !(entry->flags & LEC_PERMANENT_FLAG) && + !(entry->mac_addr[0] & 0x01) ) { /* LANE2: 7.1.20 */ + /* Remove entry */ + DPRINTK("LEC:Entry timed out\n"); + next = entry->next; + lec_arp_remove(priv, entry); + kfree(entry); + entry = next; + } else { + /* Something else */ + if ((entry->status == ESI_VC_PENDING || + entry->status == ESI_ARP_PENDING) + && time_after_eq(now, + entry->timestamp + + priv->max_unknown_frame_time)) { + entry->timestamp = jiffies; + entry->packets_flooded = 0; + if (entry->status == ESI_VC_PENDING) + send_to_lecd(priv, l_svc_setup, entry->mac_addr, entry->atm_addr, NULL); + } + if (entry->status == ESI_FLUSH_PENDING + && + time_after_eq(now, entry->timestamp+ + priv->path_switching_delay)) { + struct sk_buff *skb; + + while ((skb = skb_dequeue(&entry->tx_wait)) != NULL) + lec_send(entry->vcc, skb, entry->priv); + entry->last_used = jiffies; + entry->status = + ESI_FORWARD_DIRECT; + } + entry = entry->next; + } + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + + mod_timer(&priv->lec_arp_timer, jiffies + LEC_ARP_REFRESH_INTERVAL); +} +/* + * Try to find vcc where mac_address is attached. + * + */ +static struct atm_vcc* +lec_arp_resolve(struct lec_priv *priv, unsigned char *mac_to_find, + int is_rdesc, struct lec_arp_table **ret_entry) +{ + unsigned long flags; + struct lec_arp_table *entry; + struct atm_vcc *found; + + if (mac_to_find[0] & 0x01) { + switch (priv->lane_version) { + case 1: + return priv->mcast_vcc; + break; + case 2: /* LANE2 wants arp for multicast addresses */ + if ( memcmp(mac_to_find, bus_mac, ETH_ALEN) == 0) + return priv->mcast_vcc; + break; + default: + break; + } + } + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = lec_arp_find(priv, mac_to_find); + + if (entry) { + if (entry->status == ESI_FORWARD_DIRECT) { + /* Connection Ok */ + entry->last_used = jiffies; + *ret_entry = entry; + found = entry->vcc; + goto out; + } + /* Data direct VC not yet set up, check to see if the unknown + frame count is greater than the limit. If the limit has + not been reached, allow the caller to send packet to + BUS. */ + if (entry->status != ESI_FLUSH_PENDING && + entry->packets_flooded < priv->maximum_unknown_frame_count) { + entry->packets_flooded++; + DPRINTK("LEC_ARP: Flooding..\n"); + found = priv->mcast_vcc; + goto out; + } + /* We got here because entry->status == ESI_FLUSH_PENDING + * or BUS flood limit was reached for an entry which is + * in ESI_ARP_PENDING or ESI_VC_PENDING state.
+ */ + *ret_entry = entry; + DPRINTK("lec: entry->status %d entry->vcc %p\n", entry->status, entry->vcc); + found = NULL; + } else { + /* No matching entry was found */ + entry = make_entry(priv, mac_to_find); + DPRINTK("LEC_ARP: Making entry\n"); + if (!entry) { + found = priv->mcast_vcc; + goto out; + } + lec_arp_add(priv, entry); + /* We want arp-request(s) to be sent */ + entry->packets_flooded =1; + entry->status = ESI_ARP_PENDING; + entry->no_tries = 1; + entry->last_used = entry->timestamp = jiffies; + entry->is_rdesc = is_rdesc; + if (entry->is_rdesc) + send_to_lecd(priv, l_rdesc_arp_xmt, mac_to_find, NULL, NULL); + else + send_to_lecd(priv, l_arp_xmt, mac_to_find, NULL, NULL); + entry->timer.expires = jiffies + (1*HZ); + entry->timer.function = lec_arp_expire_arp; + add_timer(&entry->timer); + found = priv->mcast_vcc; + } + +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return found; +} + +static int +lec_addr_delete(struct lec_priv *priv, unsigned char *atm_addr, + unsigned long permanent) +{ + unsigned long flags; + struct lec_arp_table *entry, *next; + int i; + + DPRINTK("lec_addr_delete\n"); + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for(i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + for(entry = priv->lec_arp_tables[i]; entry != NULL; entry = next) { + next = entry->next; + if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN) + && (permanent || + !(entry->flags & LEC_PERMANENT_FLAG))) { + lec_arp_remove(priv, entry); + kfree(entry); + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return 0; + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return -1; +} + +/* + * Notifies: Response to arp_request (atm_addr != NULL) + */ +static void +lec_arp_update(struct lec_priv *priv, unsigned char *mac_addr, + unsigned char *atm_addr, unsigned long remoteflag, + unsigned int targetless_le_arp) +{ + unsigned long flags; + struct lec_arp_table *entry, *tmp; + int i; + + DPRINTK("lec:%s", (targetless_le_arp) ? "targetless ": " "); + DPRINTK("lec_arp_update mac:%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", + mac_addr[0],mac_addr[1],mac_addr[2],mac_addr[3], + mac_addr[4],mac_addr[5]); + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = lec_arp_find(priv, mac_addr); + if (entry == NULL && targetless_le_arp) + goto out; /* LANE2: ignore targetless LE_ARPs for which + * we have no entry in the cache. 
7.1.30 + */ + if (priv->lec_arp_empty_ones) { + entry = priv->lec_arp_empty_ones; + if (!memcmp(entry->atm_addr, atm_addr, ATM_ESA_LEN)) { + priv->lec_arp_empty_ones = entry->next; + } else { + while(entry->next && memcmp(entry->next->atm_addr, + atm_addr, ATM_ESA_LEN)) + entry = entry->next; + if (entry->next) { + tmp = entry; + entry = entry->next; + tmp->next = entry->next; + } else + entry = NULL; + + } + if (entry) { + del_timer(&entry->timer); + tmp = lec_arp_find(priv, mac_addr); + if (tmp) { + del_timer(&tmp->timer); + tmp->status = ESI_FORWARD_DIRECT; + memcpy(tmp->atm_addr, atm_addr, ATM_ESA_LEN); + tmp->vcc = entry->vcc; + tmp->old_push = entry->old_push; + tmp->last_used = jiffies; + del_timer(&entry->timer); + kfree(entry); + entry=tmp; + } else { + entry->status = ESI_FORWARD_DIRECT; + memcpy(entry->mac_addr, mac_addr, ETH_ALEN); + entry->last_used = jiffies; + lec_arp_add(priv, entry); + } + if (remoteflag) + entry->flags|=LEC_REMOTE_FLAG; + else + entry->flags&=~LEC_REMOTE_FLAG; + DPRINTK("After update\n"); + dump_arp_table(priv); + goto out; + } + } + entry = lec_arp_find(priv, mac_addr); + if (!entry) { + entry = make_entry(priv, mac_addr); + if (!entry) + goto out; + entry->status = ESI_UNKNOWN; + lec_arp_add(priv, entry); + /* Temporary, changes before end of function */ + } + memcpy(entry->atm_addr, atm_addr, ATM_ESA_LEN); + del_timer(&entry->timer); + for(i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + for(tmp = priv->lec_arp_tables[i]; tmp; tmp=tmp->next) { + if (entry != tmp && + !memcmp(tmp->atm_addr, atm_addr, + ATM_ESA_LEN)) { + /* Vcc to this host exists */ + if (tmp->status > ESI_VC_PENDING) { + /* + * ESI_FLUSH_PENDING, + * ESI_FORWARD_DIRECT + */ + entry->vcc = tmp->vcc; + entry->old_push=tmp->old_push; + } + entry->status=tmp->status; + break; + } + } + } + if (remoteflag) + entry->flags|=LEC_REMOTE_FLAG; + else + entry->flags&=~LEC_REMOTE_FLAG; + if (entry->status == ESI_ARP_PENDING || + entry->status == ESI_UNKNOWN) { + entry->status = ESI_VC_PENDING; + send_to_lecd(priv, l_svc_setup, entry->mac_addr, atm_addr, NULL); + } + DPRINTK("After update2\n"); + dump_arp_table(priv); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +/* + * Notifies: Vcc setup ready + */ +static void +lec_vcc_added(struct lec_priv *priv, struct atmlec_ioc *ioc_data, + struct atm_vcc *vcc, + void (*old_push)(struct atm_vcc *vcc, struct sk_buff *skb)) +{ + unsigned long flags; + struct lec_arp_table *entry; + int i, found_entry=0; + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + if (ioc_data->receive == 2) { + /* Vcc for Multicast Forward. No timer, LANEv2 7.1.20 and 2.3.5.3 */ + + DPRINTK("LEC_ARP: Attaching mcast forward\n"); +#if 0 + entry = lec_arp_find(priv, bus_mac); + if (!entry) { + printk("LEC_ARP: Multicast entry not found!\n"); + goto out; + } + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + entry->recv_vcc = vcc; + entry->old_recv_push = old_push; +#endif + entry = make_entry(priv, bus_mac); + if (entry == NULL) + goto out; + del_timer(&entry->timer); + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + entry->recv_vcc = vcc; + entry->old_recv_push = old_push; + entry->next = priv->mcast_fwds; + priv->mcast_fwds = entry; + goto out; + } else if (ioc_data->receive == 1) { + /* Vcc which we don't want to make default vcc, attach it + anyway. 
*/ + DPRINTK("LEC_ARP:Attaching data direct, not default :%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", + ioc_data->atm_addr[0],ioc_data->atm_addr[1], + ioc_data->atm_addr[2],ioc_data->atm_addr[3], + ioc_data->atm_addr[4],ioc_data->atm_addr[5], + ioc_data->atm_addr[6],ioc_data->atm_addr[7], + ioc_data->atm_addr[8],ioc_data->atm_addr[9], + ioc_data->atm_addr[10],ioc_data->atm_addr[11], + ioc_data->atm_addr[12],ioc_data->atm_addr[13], + ioc_data->atm_addr[14],ioc_data->atm_addr[15], + ioc_data->atm_addr[16],ioc_data->atm_addr[17], + ioc_data->atm_addr[18],ioc_data->atm_addr[19]); + entry = make_entry(priv, bus_mac); + if (entry == NULL) + goto out; + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + memset(entry->mac_addr, 0, ETH_ALEN); + entry->recv_vcc = vcc; + entry->old_recv_push = old_push; + entry->status = ESI_UNKNOWN; + entry->timer.expires = jiffies + priv->vcc_timeout_period; + entry->timer.function = lec_arp_expire_vcc; + add_timer(&entry->timer); + entry->next = priv->lec_no_forward; + priv->lec_no_forward = entry; + dump_arp_table(priv); + goto out; + } + DPRINTK("LEC_ARP:Attaching data direct, default:%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x%2.2x\n", + ioc_data->atm_addr[0],ioc_data->atm_addr[1], + ioc_data->atm_addr[2],ioc_data->atm_addr[3], + ioc_data->atm_addr[4],ioc_data->atm_addr[5], + ioc_data->atm_addr[6],ioc_data->atm_addr[7], + ioc_data->atm_addr[8],ioc_data->atm_addr[9], + ioc_data->atm_addr[10],ioc_data->atm_addr[11], + ioc_data->atm_addr[12],ioc_data->atm_addr[13], + ioc_data->atm_addr[14],ioc_data->atm_addr[15], + ioc_data->atm_addr[16],ioc_data->atm_addr[17], + ioc_data->atm_addr[18],ioc_data->atm_addr[19]); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + for (entry = priv->lec_arp_tables[i]; entry; entry=entry->next) { + if (memcmp(ioc_data->atm_addr, entry->atm_addr, + ATM_ESA_LEN)==0) { + DPRINTK("LEC_ARP: Attaching data direct\n"); + DPRINTK("Currently -> Vcc: %d, Rvcc:%d\n", + entry->vcc?entry->vcc->vci:0, + entry->recv_vcc?entry->recv_vcc->vci:0); + found_entry=1; + del_timer(&entry->timer); + entry->vcc = vcc; + entry->old_push = old_push; + if (entry->status == ESI_VC_PENDING) { + if(priv->maximum_unknown_frame_count + ==0) + entry->status = + ESI_FORWARD_DIRECT; + else { + entry->timestamp = jiffies; + entry->status = + ESI_FLUSH_PENDING; +#if 0 + send_to_lecd(priv,l_flush_xmt, + NULL, + entry->atm_addr, + NULL); +#endif + } + } else { + /* They were forming a connection + to us, and we to them. Our + ATM address is numerically lower + than theirs, so we make connection + we formed into default VCC (8.1.11). + Connection they made gets torn + down. This might confuse some + clients. Can be changed if + someone reports trouble... 
*/ + ; + } + } + } + } + if (found_entry) { + DPRINTK("After vcc was added\n"); + dump_arp_table(priv); + goto out; + } + /* Not found, snatch address from first data packet that arrives from + this vcc */ + entry = make_entry(priv, bus_mac); + if (!entry) + goto out; + entry->vcc = vcc; + entry->old_push = old_push; + memcpy(entry->atm_addr, ioc_data->atm_addr, ATM_ESA_LEN); + memset(entry->mac_addr, 0, ETH_ALEN); + entry->status = ESI_UNKNOWN; + entry->next = priv->lec_arp_empty_ones; + priv->lec_arp_empty_ones = entry; + entry->timer.expires = jiffies + priv->vcc_timeout_period; + entry->timer.function = lec_arp_expire_vcc; + add_timer(&entry->timer); + DPRINTK("After vcc was added\n"); + dump_arp_table(priv); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +static void +lec_flush_complete(struct lec_priv *priv, unsigned long tran_id) +{ + unsigned long flags; + struct lec_arp_table *entry; + int i; + + DPRINTK("LEC:lec_flush_complete %lx\n",tran_id); + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) { + for (entry = priv->lec_arp_tables[i]; entry; entry=entry->next) { + if (entry->flush_tran_id == tran_id && + entry->status == ESI_FLUSH_PENDING) { + struct sk_buff *skb; + + while ((skb = skb_dequeue(&entry->tx_wait)) != NULL) + lec_send(entry->vcc, skb, entry->priv); + entry->status = ESI_FORWARD_DIRECT; + DPRINTK("LEC_ARP: Flushed\n"); + } + } + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + dump_arp_table(priv); +} + +static void +lec_set_flush_tran_id(struct lec_priv *priv, + unsigned char *atm_addr, unsigned long tran_id) +{ + unsigned long flags; + struct lec_arp_table *entry; + int i; + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for (i = 0; i < LEC_ARP_TABLE_SIZE; i++) + for(entry = priv->lec_arp_tables[i]; entry; entry=entry->next) + if (!memcmp(atm_addr, entry->atm_addr, ATM_ESA_LEN)) { + entry->flush_tran_id = tran_id; + DPRINTK("Set flush transaction id to %lx for %p\n",tran_id,entry); + } + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} + +static int +lec_mcast_make(struct lec_priv *priv, struct atm_vcc *vcc) +{ + unsigned long flags; + unsigned char mac_addr[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + struct lec_arp_table *to_add; + struct lec_vcc_priv *vpriv; + int err = 0; + + if (!(vpriv = kmalloc(sizeof(struct lec_vcc_priv), GFP_KERNEL))) + return -ENOMEM; + vpriv->xoff = 0; + vpriv->old_pop = vcc->pop; + vcc->user_back = vpriv; + vcc->pop = lec_pop; + spin_lock_irqsave(&priv->lec_arp_lock, flags); + to_add = make_entry(priv, mac_addr); + if (!to_add) { + vcc->pop = vpriv->old_pop; + kfree(vpriv); + err = -ENOMEM; + goto out; + } + memcpy(to_add->atm_addr, vcc->remote.sas_addr.prv, ATM_ESA_LEN); + to_add->status = ESI_FORWARD_DIRECT; + to_add->flags |= LEC_PERMANENT_FLAG; + to_add->vcc = vcc; + to_add->old_push = vcc->push; + vcc->push = lec_push; + priv->mcast_vcc = vcc; + lec_arp_add(priv, to_add); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + return err; +} + +static void +lec_vcc_close(struct lec_priv *priv, struct atm_vcc *vcc) +{ + unsigned long flags; + struct lec_arp_table *entry, *next; + int i; + + DPRINTK("LEC_ARP: lec_vcc_close vpi:%d vci:%d\n",vcc->vpi,vcc->vci); + dump_arp_table(priv); + spin_lock_irqsave(&priv->lec_arp_lock, flags); + for(i=0;i<LEC_ARP_TABLE_SIZE;i++) { + for(entry = priv->lec_arp_tables[i];entry; entry=next) { + next = entry->next; + if (vcc == entry->vcc) { + lec_arp_remove(priv, entry); + kfree(entry); + if (priv->mcast_vcc == vcc) { + priv->mcast_vcc = NULL; + } + } + } + }
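 /* The three loops below walk the auxiliary lists (lec_arp_empty_ones, lec_no_forward and mcast_fwds) and free every entry whose vcc or recv_vcc is the one being closed; all other entries are pushed back onto their list. */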
+ entry = priv->lec_arp_empty_ones; + priv->lec_arp_empty_ones = NULL; + while (entry != NULL) { + next = entry->next; + if (entry->vcc == vcc) { /* leave it out from the list */ + lec_arp_clear_vccs(entry); + del_timer(&entry->timer); + kfree(entry); + } + else { /* put it back to the list */ + entry->next = priv->lec_arp_empty_ones; + priv->lec_arp_empty_ones = entry; + } + entry = next; + } + + entry = priv->lec_no_forward; + priv->lec_no_forward = NULL; + while (entry != NULL) { + next = entry->next; + if (entry->recv_vcc == vcc) { + lec_arp_clear_vccs(entry); + del_timer(&entry->timer); + kfree(entry); + } + else { + entry->next = priv->lec_no_forward; + priv->lec_no_forward = entry; + } + entry = next; + } + + entry = priv->mcast_fwds; + priv->mcast_fwds = NULL; + while (entry != NULL) { + next = entry->next; + if (entry->recv_vcc == vcc) { + lec_arp_clear_vccs(entry); + /* No timer, LANEv2 7.1.20 and 2.3.5.3 */ + kfree(entry); + } + else { + entry->next = priv->mcast_fwds; + priv->mcast_fwds = entry; + } + entry = next; + } + + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); + dump_arp_table(priv); +} + +static void +lec_arp_check_empties(struct lec_priv *priv, + struct atm_vcc *vcc, struct sk_buff *skb) +{ + unsigned long flags; + struct lec_arp_table *entry, *prev; + struct lecdatahdr_8023 *hdr = (struct lecdatahdr_8023 *)skb->data; + unsigned char *src; +#ifdef CONFIG_TR + struct lecdatahdr_8025 *tr_hdr = (struct lecdatahdr_8025 *)skb->data; + + if (priv->is_trdev) src = tr_hdr->h_source; + else +#endif + src = hdr->h_source; + + spin_lock_irqsave(&priv->lec_arp_lock, flags); + entry = priv->lec_arp_empty_ones; + if (vcc == entry->vcc) { + del_timer(&entry->timer); + memcpy(entry->mac_addr, src, ETH_ALEN); + entry->status = ESI_FORWARD_DIRECT; + entry->last_used = jiffies; + priv->lec_arp_empty_ones = entry->next; + /* We might have got an entry */ + if ((prev = lec_arp_find(priv,src))) { + lec_arp_remove(priv, prev); + kfree(prev); + } + lec_arp_add(priv, entry); + goto out; + } + prev = entry; + entry = entry->next; + while (entry && entry->vcc != vcc) { + prev= entry; + entry = entry->next; + } + if (!entry) { + DPRINTK("LEC_ARP: Arp_check_empties: entry not found!\n"); + goto out; + } + del_timer(&entry->timer); + memcpy(entry->mac_addr, src, ETH_ALEN); + entry->status = ESI_FORWARD_DIRECT; + entry->last_used = jiffies; + prev->next = entry->next; + if ((prev = lec_arp_find(priv, src))) { + lec_arp_remove(priv, prev); + kfree(prev); + } + lec_arp_add(priv, entry); +out: + spin_unlock_irqrestore(&priv->lec_arp_lock, flags); +} +MODULE_LICENSE("GPL"); diff --git a/net/atm/lec.h b/net/atm/lec.h new file mode 100644 index 000000000000..6606082b29a8 --- /dev/null +++ b/net/atm/lec.h @@ -0,0 +1,142 @@ +/* + * + * Lan Emulation client header file + * + * Marko Kiiskila mkiiskila@yahoo.com + * + */ + +#ifndef _LEC_H_ +#define _LEC_H_ + +#include +#include +#include +#include + +#define LEC_HEADER_LEN 16 + +struct lecdatahdr_8023 { + unsigned short le_header; + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + unsigned short h_type; +}; + +struct lecdatahdr_8025 { + unsigned short le_header; + unsigned char ac_pad; + unsigned char fc; + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; +}; + +#define LEC_MINIMUM_8023_SIZE 62 +#define LEC_MINIMUM_8025_SIZE 16 + +/* + * Operations that LANE2 capable device can do. Two first functions + * are used to make the device do things. See spec 3.1.3 and 3.1.4. 
+ * + * The third function is intented for the MPOA component sitting on + * top of the LANE device. The MPOA component assigns it's own function + * to (*associate_indicator)() and the LANE device will use that + * function to tell about TLVs it sees floating through. + * + */ +struct lane2_ops { + int (*resolve)(struct net_device *dev, u8 *dst_mac, int force, + u8 **tlvs, u32 *sizeoftlvs); + int (*associate_req)(struct net_device *dev, u8 *lan_dst, + u8 *tlvs, u32 sizeoftlvs); + void (*associate_indicator)(struct net_device *dev, u8 *mac_addr, + u8 *tlvs, u32 sizeoftlvs); +}; + +/* + * ATM LAN Emulation supports both LLC & Dix Ethernet EtherType + * frames. + * 1. Dix Ethernet EtherType frames encoded by placing EtherType + * field in h_type field. Data follows immediatelly after header. + * 2. LLC Data frames whose total length, including LLC field and data, + * but not padding required to meet the minimum data frame length, + * is less than 1536(0x0600) MUST be encoded by placing that length + * in the h_type field. The LLC field follows header immediatelly. + * 3. LLC data frames longer than this maximum MUST be encoded by placing + * the value 0 in the h_type field. + * + */ + +/* Hash table size */ +#define LEC_ARP_TABLE_SIZE 16 + +struct lec_priv { + struct net_device_stats stats; + unsigned short lecid; /* Lecid of this client */ + struct lec_arp_table *lec_arp_empty_ones; + /* Used for storing VCC's that don't have a MAC address attached yet */ + struct lec_arp_table *lec_arp_tables[LEC_ARP_TABLE_SIZE]; + /* Actual LE ARP table */ + struct lec_arp_table *lec_no_forward; + /* Used for storing VCC's (and forward packets from) which are to + age out by not using them to forward packets. + This is because to some LE clients there will be 2 VCCs. Only + one of them gets used. */ + struct lec_arp_table *mcast_fwds; + /* With LANEv2 it is possible that BUS (or a special multicast server) + establishes multiple Multicast Forward VCCs to us. This list + collects all those VCCs. LANEv1 client has only one item in this + list. These entries are not aged out. */ + spinlock_t lec_arp_lock; + struct atm_vcc *mcast_vcc; /* Default Multicast Send VCC */ + struct atm_vcc *lecd; + struct timer_list lec_arp_timer; + /* C10 */ + unsigned int maximum_unknown_frame_count; +/* Within the period of time defined by this variable, the client will send + no more than C10 frames to BUS for a given unicast destination. (C11) */ + unsigned long max_unknown_frame_time; +/* If no traffic has been sent in this vcc for this period of time, + vcc will be torn down (C12)*/ + unsigned long vcc_timeout_period; +/* An LE Client MUST not retry an LE_ARP_REQUEST for a + given frame's LAN Destination more than maximum retry count times, + after the first LEC_ARP_REQUEST (C13)*/ + unsigned short max_retry_count; +/* Max time the client will maintain an entry in its arp cache in + absence of a verification of that relationship (C17)*/ + unsigned long aging_time; +/* Max time the client will maintain an entry in cache when + topology change flag is true (C18) */ + unsigned long forward_delay_time; +/* Topology change flag (C19)*/ + int topology_change; +/* Max time the client expects an LE_ARP_REQUEST/LE_ARP_RESPONSE + cycle to take (C20)*/ + unsigned long arp_response_time; +/* Time limit ot wait to receive an LE_FLUSH_RESPONSE after the + LE_FLUSH_REQUEST has been sent before taking recover action. 
(C21)*/ + unsigned long flush_timeout; +/* The time since sending a frame to the bus after which the + LE Client may assume that the frame has been either discarded or + delivered to the recipient (C22) */ + unsigned long path_switching_delay; + + u8 *tlvs; /* LANE2: TLVs are new */ + u32 sizeoftlvs; /* The size of the tlv array in bytes */ + int lane_version; /* LANE2 */ + int itfnum; /* e.g. 2 for lec2, 5 for lec5 */ + struct lane2_ops *lane2_ops; /* can be NULL for LANE v1 */ + int is_proxy; /* bridge between ATM and Ethernet */ + int is_trdev; /* Device type, 0 = Ethernet, 1 = TokenRing */ +}; + +struct lec_vcc_priv { + void (*old_pop)(struct atm_vcc *vcc, struct sk_buff *skb); + int xoff; +}; + +#define LEC_VCC_PRIV(vcc) ((struct lec_vcc_priv *)((vcc)->user_back)) + +#endif /* _LEC_H_ */ + diff --git a/net/atm/lec_arpc.h b/net/atm/lec_arpc.h new file mode 100644 index 000000000000..397448094648 --- /dev/null +++ b/net/atm/lec_arpc.h @@ -0,0 +1,92 @@ +/* + * Lec arp cache + * Marko Kiiskila mkiiskila@yahoo.com + * + */ +#ifndef _LEC_ARP_H +#define _LEC_ARP_H +#include +#include +#include +#include + +struct lec_arp_table { + struct lec_arp_table *next; /* Linked entry list */ + unsigned char atm_addr[ATM_ESA_LEN]; /* Atm address */ + unsigned char mac_addr[ETH_ALEN]; /* Mac address */ + int is_rdesc; /* Mac address is a route descriptor */ + struct atm_vcc *vcc; /* Vcc this entry is attached */ + struct atm_vcc *recv_vcc; /* Vcc we receive data from */ + void (*old_push)(struct atm_vcc *vcc,struct sk_buff *skb); + /* Push that leads to daemon */ + void (*old_recv_push)(struct atm_vcc *vcc, struct sk_buff *skb); + /* Push that leads to daemon */ + void (*old_close)(struct atm_vcc *vcc); + /* We want to see when this + * vcc gets closed */ + unsigned long last_used; /* For expiry */ + unsigned long timestamp; /* Used for various timestamping + * things: + * 1. FLUSH started + * (status=ESI_FLUSH_PENDING) + * 2. Counting to + * max_unknown_frame_time + * (status=ESI_ARP_PENDING|| + * status=ESI_VC_PENDING) + */ + unsigned char no_tries; /* No of times arp retry has been + tried */ + unsigned char status; /* Status of this entry */ + unsigned short flags; /* Flags for this entry */ + unsigned short packets_flooded; /* Data packets flooded */ + unsigned long flush_tran_id; /* Transaction id in flush protocol */ + struct timer_list timer; /* Arping timer */ + struct lec_priv *priv; /* Pointer back */ + + u8 *tlvs; /* LANE2: Each MAC address can have TLVs */ + u32 sizeoftlvs; /* associated with it. sizeoftlvs tells the */ + /* the length of the tlvs array */ + struct sk_buff_head tx_wait; /* wait queue for outgoing packets */ +}; + +struct tlv { /* LANE2: Template tlv struct for accessing */ + /* the tlvs in the lec_arp_table->tlvs array*/ + u32 type; + u8 length; + u8 value[255]; +}; + +/* Status fields */ +#define ESI_UNKNOWN 0 /* + * Next packet sent to this mac address + * causes ARP-request to be sent + */ +#define ESI_ARP_PENDING 1 /* + * There is no ATM address associated with this + * 48-bit address. The LE-ARP protocol is in + * progress. + */ +#define ESI_VC_PENDING 2 /* + * There is a valid ATM address associated with + * this 48-bit address but there is no VC set + * up to that ATM address. The signaling + * protocol is in process. + */ +#define ESI_FLUSH_PENDING 4 /* + * The LEC has been notified of the FLUSH_START + * status and it is assumed that the flush + * protocol is in process. 
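/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * the ESI_* values above and ESI_FORWARD_DIRECT just below track an
 * LE-ARP entry's life cycle -- roughly unknown -> LE_ARP pending ->
 * VC signalling pending -> flush pending -> forward direct.  A
 * hypothetical helper like this makes the numeric status field readable
 * in debug output:
 */
static const char *lec_status_name(unsigned char status)
{
	switch (status) {
	case ESI_UNKNOWN:	 return "unknown, next packet triggers LE_ARP";
	case ESI_ARP_PENDING:	 return "LE-ARP resolution in progress";
	case ESI_VC_PENDING:	 return "data direct VC being signalled";
	case ESI_FLUSH_PENDING:	 return "flush protocol in progress";
	case ESI_FORWARD_DIRECT: return "safe to forward on the data direct VC";
	default:		 return "invalid status";
	}
}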
+ */ +#define ESI_FORWARD_DIRECT 5 /* + * Either the Path Switching Delay (C22) has + * elapsed or the LEC has notified the Mapping + * that the flush protocol has completed. In + * either case, it is safe to forward packets + * to this address via the data direct VC. + */ + +/* Flag values */ +#define LEC_REMOTE_FLAG 0x0001 +#define LEC_PERMANENT_FLAG 0x0002 + +#endif diff --git a/net/atm/mpc.c b/net/atm/mpc.c new file mode 100644 index 000000000000..17a81ebe7e6e --- /dev/null +++ b/net/atm/mpc.c @@ -0,0 +1,1514 @@ +#include +#include +#include +#include +#include +#include + +/* We are an ethernet device */ +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_fast_csum() */ +#include +#include +#include + +/* And atm device */ +#include +#include +#include +/* Modular too */ +#include +#include + +#include "lec.h" +#include "mpc.h" +#include "resources.h" + +/* + * mpc.c: Implementation of MPOA client kernel part + */ + +#if 0 +#define dprintk printk /* debug */ +#else +#define dprintk(format,args...) +#endif + +#if 0 +#define ddprintk printk /* more debug */ +#else +#define ddprintk(format,args...) +#endif + + + +#define MPOA_TAG_LEN 4 + +/* mpc_daemon -> kernel */ +static void MPOA_trigger_rcvd (struct k_message *msg, struct mpoa_client *mpc); +static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void mps_death(struct k_message *msg, struct mpoa_client *mpc); +static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action); +static void MPOA_cache_impos_rcvd(struct k_message *msg, struct mpoa_client *mpc); +static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc); +static void set_mps_mac_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc); + +static uint8_t *copy_macs(struct mpoa_client *mpc, uint8_t *router_mac, + uint8_t *tlvs, uint8_t mps_macs, uint8_t device_type); +static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry); + +static void send_set_mps_ctrl_addr(char *addr, struct mpoa_client *mpc); +static void mpoad_close(struct atm_vcc *vcc); +static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb); + +static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb); +static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev); +static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev); +static void mpc_timer_refresh(void); +static void mpc_cache_check( unsigned long checking_time ); + +static struct llc_snap_hdr llc_snap_mpoa_ctrl = { + 0xaa, 0xaa, 0x03, + {0x00, 0x00, 0x5e}, + {0x00, 0x03} /* For MPOA control PDUs */ +}; +static struct llc_snap_hdr llc_snap_mpoa_data = { + 0xaa, 0xaa, 0x03, + {0x00, 0x00, 0x00}, + {0x08, 0x00} /* This is for IP PDUs only */ +}; +static struct llc_snap_hdr llc_snap_mpoa_data_tagged = { + 0xaa, 0xaa, 0x03, + {0x00, 0x00, 0x00}, + {0x88, 0x4c} /* This is for tagged data PDUs */ +}; + +static struct notifier_block mpoa_notifier = { + mpoa_event_listener, + NULL, + 0 +}; + +#ifdef CONFIG_PROC_FS +extern int mpc_proc_init(void); +extern void mpc_proc_clean(void); +#endif + +struct mpoa_client *mpcs = NULL; /* FIXME */ +static struct atm_mpoa_qos *qos_head = NULL; +static struct timer_list mpc_timer = TIMER_INITIALIZER(NULL, 0, 0); + + +static struct mpoa_client 
*find_mpc_by_itfnum(int itf) +{ + struct mpoa_client *mpc; + + mpc = mpcs; /* our global linked list */ + while (mpc != NULL) { + if (mpc->dev_num == itf) + return mpc; + mpc = mpc->next; + } + + return NULL; /* not found */ +} + +static struct mpoa_client *find_mpc_by_vcc(struct atm_vcc *vcc) +{ + struct mpoa_client *mpc; + + mpc = mpcs; /* our global linked list */ + while (mpc != NULL) { + if (mpc->mpoad_vcc == vcc) + return mpc; + mpc = mpc->next; + } + + return NULL; /* not found */ +} + +static struct mpoa_client *find_mpc_by_lec(struct net_device *dev) +{ + struct mpoa_client *mpc; + + mpc = mpcs; /* our global linked list */ + while (mpc != NULL) { + if (mpc->dev == dev) + return mpc; + mpc = mpc->next; + } + + return NULL; /* not found */ +} + +/* + * Functions for managing QoS list + */ + +/* + * Overwrites the old entry or makes a new one. + */ +struct atm_mpoa_qos *atm_mpoa_add_qos(uint32_t dst_ip, struct atm_qos *qos) +{ + struct atm_mpoa_qos *entry; + + entry = atm_mpoa_search_qos(dst_ip); + if (entry != NULL) { + entry->qos = *qos; + return entry; + } + + entry = kmalloc(sizeof(struct atm_mpoa_qos), GFP_KERNEL); + if (entry == NULL) { + printk("mpoa: atm_mpoa_add_qos: out of memory\n"); + return entry; + } + + entry->ipaddr = dst_ip; + entry->qos = *qos; + + entry->next = qos_head; + qos_head = entry; + + return entry; +} + +struct atm_mpoa_qos *atm_mpoa_search_qos(uint32_t dst_ip) +{ + struct atm_mpoa_qos *qos; + + qos = qos_head; + while( qos != NULL ){ + if(qos->ipaddr == dst_ip) { + break; + } + qos = qos->next; + } + + return qos; +} + +/* + * Returns 0 for failure + */ +int atm_mpoa_delete_qos(struct atm_mpoa_qos *entry) +{ + + struct atm_mpoa_qos *curr; + + if (entry == NULL) return 0; + if (entry == qos_head) { + qos_head = qos_head->next; + kfree(entry); + return 1; + } + + curr = qos_head; + while (curr != NULL) { + if (curr->next == entry) { + curr->next = entry->next; + kfree(entry); + return 1; + } + curr = curr->next; + } + + return 0; +} + +/* this is buggered - we need locking for qos_head */ +void atm_mpoa_disp_qos(struct seq_file *m) +{ + unsigned char *ip; + char ipaddr[16]; + struct atm_mpoa_qos *qos; + + qos = qos_head; + seq_printf(m, "QoS entries for shortcuts:\n"); + seq_printf(m, "IP address\n TX:max_pcr pcr min_pcr max_cdv max_sdu\n RX:max_pcr pcr min_pcr max_cdv max_sdu\n"); + + ipaddr[sizeof(ipaddr)-1] = '\0'; + while (qos != NULL) { + ip = (unsigned char *)&qos->ipaddr; + sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(ip)); + seq_printf(m, "%u.%u.%u.%u\n %-7d %-7d %-7d %-7d %-7d\n %-7d %-7d %-7d %-7d %-7d\n", + NIPQUAD(ipaddr), + qos->qos.txtp.max_pcr, qos->qos.txtp.pcr, qos->qos.txtp.min_pcr, qos->qos.txtp.max_cdv, qos->qos.txtp.max_sdu, + qos->qos.rxtp.max_pcr, qos->qos.rxtp.pcr, qos->qos.rxtp.min_pcr, qos->qos.rxtp.max_cdv, qos->qos.rxtp.max_sdu); + qos = qos->next; + } +} + +static struct net_device *find_lec_by_itfnum(int itf) +{ + struct net_device *dev; + char name[IFNAMSIZ]; + + sprintf(name, "lec%d", itf); + dev = dev_get_by_name(name); + + return dev; +} + +static struct mpoa_client *alloc_mpc(void) +{ + struct mpoa_client *mpc; + + mpc = kmalloc(sizeof (struct mpoa_client), GFP_KERNEL); + if (mpc == NULL) + return NULL; + memset(mpc, 0, sizeof(struct mpoa_client)); + rwlock_init(&mpc->ingress_lock); + rwlock_init(&mpc->egress_lock); + mpc->next = mpcs; + atm_mpoa_init_cache(mpc); + + mpc->parameters.mpc_p1 = MPC_P1; + mpc->parameters.mpc_p2 = MPC_P2; + memset(mpc->parameters.mpc_p3,0,sizeof(mpc->parameters.mpc_p3)); + mpc->parameters.mpc_p4 = 
MPC_P4; + mpc->parameters.mpc_p5 = MPC_P5; + mpc->parameters.mpc_p6 = MPC_P6; + + mpcs = mpc; + + return mpc; +} + +/* + * + * start_mpc() puts the MPC on line. All the packets destined + * to the lec underneath us are now being monitored and + * shortcuts will be established. + * + */ +static void start_mpc(struct mpoa_client *mpc, struct net_device *dev) +{ + + dprintk("mpoa: (%s) start_mpc:\n", mpc->dev->name); + if (dev->hard_start_xmit == NULL) { + printk("mpoa: (%s) start_mpc: dev->hard_start_xmit == NULL, not starting\n", + dev->name); + return; + } + mpc->old_hard_start_xmit = dev->hard_start_xmit; + dev->hard_start_xmit = mpc_send_packet; + + return; +} + +static void stop_mpc(struct mpoa_client *mpc) +{ + + dprintk("mpoa: (%s) stop_mpc:", mpc->dev->name); + + /* Lets not nullify lec device's dev->hard_start_xmit */ + if (mpc->dev->hard_start_xmit != mpc_send_packet) { + dprintk(" mpc already stopped, not fatal\n"); + return; + } + dprintk("\n"); + mpc->dev->hard_start_xmit = mpc->old_hard_start_xmit; + mpc->old_hard_start_xmit = NULL; + /* close_shortcuts(mpc); ??? FIXME */ + + return; +} + +static const char *mpoa_device_type_string(char type) __attribute__ ((unused)); + +static const char *mpoa_device_type_string(char type) +{ + switch(type) { + case NON_MPOA: + return "non-MPOA device"; + break; + case MPS: + return "MPS"; + break; + case MPC: + return "MPC"; + break; + case MPS_AND_MPC: + return "both MPS and MPC"; + break; + default: + return "unspecified (non-MPOA) device"; + break; + } + + return ""; /* not reached */ +} + +/* + * lec device calls this via its dev->priv->lane2_ops->associate_indicator() + * when it sees a TLV in LE_ARP packet. + * We fill in the pointer above when we see a LANE2 lec initializing + * See LANE2 spec 3.1.5 + * + * Quite a big and ugly function but when you look at it + * all it does is to try to locate and parse MPOA Device + * Type TLV. + * We give our lec a pointer to this function and when the + * lec sees a TLV it uses the pointer to call this function. 
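/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * the TLV buffer that lane2_assoc_ind() below receives is a packed
 * sequence of a 4-byte big-endian type, a 1-byte length and <length>
 * value bytes.  A minimal, hypothetical walker over such a buffer looks
 * like this; the real function additionally decodes the MPOA Device
 * Type TLV.
 */
static void sketch_walk_tlvs(const uint8_t *tlvs, uint32_t sizeoftlvs)
{
	const uint8_t *end = tlvs + sizeoftlvs;

	while (end - tlvs >= 5) {	/* need at least type + length */
		uint32_t type = (tlvs[0] << 24) | (tlvs[1] << 16) |
				(tlvs[2] << 8)  |  tlvs[3];
		uint8_t length = tlvs[4];

		tlvs += 5;
		if (tlvs + length > end)
			return;		/* value would overrun the buffer, stop */
		printk("TLV type 0x%x, %u value bytes\n", type, length);
		/* a real parser would act on known types here */
		tlvs += length;
	}
}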
+ * + */ +static void lane2_assoc_ind(struct net_device *dev, uint8_t *mac_addr, + uint8_t *tlvs, uint32_t sizeoftlvs) +{ + uint32_t type; + uint8_t length, mpoa_device_type, number_of_mps_macs; + uint8_t *end_of_tlvs; + struct mpoa_client *mpc; + + mpoa_device_type = number_of_mps_macs = 0; /* silence gcc */ + dprintk("mpoa: (%s) lane2_assoc_ind: received TLV(s), ", dev->name); + dprintk("total length of all TLVs %d\n", sizeoftlvs); + mpc = find_mpc_by_lec(dev); /* Sampo-Fix: moved here from below */ + if (mpc == NULL) { + printk("mpoa: (%s) lane2_assoc_ind: no mpc\n", dev->name); + return; + } + end_of_tlvs = tlvs + sizeoftlvs; + while (end_of_tlvs - tlvs >= 5) { + type = (tlvs[0] << 24) | (tlvs[1] << 16) | (tlvs[2] << 8) | tlvs[3]; + length = tlvs[4]; + tlvs += 5; + dprintk(" type 0x%x length %02x\n", type, length); + if (tlvs + length > end_of_tlvs) { + printk("TLV value extends past its buffer, aborting parse\n"); + return; + } + + if (type == 0) { + printk("mpoa: (%s) lane2_assoc_ind: TLV type was 0, returning\n", dev->name); + return; + } + + if (type != TLV_MPOA_DEVICE_TYPE) { + tlvs += length; + continue; /* skip other TLVs */ + } + mpoa_device_type = *tlvs++; + number_of_mps_macs = *tlvs++; + dprintk("mpoa: (%s) MPOA device type '%s', ", dev->name, mpoa_device_type_string(mpoa_device_type)); + if (mpoa_device_type == MPS_AND_MPC && + length < (42 + number_of_mps_macs*ETH_ALEN)) { /* :) */ + printk("\nmpoa: (%s) lane2_assoc_ind: short MPOA Device Type TLV\n", + dev->name); + continue; + } + if ((mpoa_device_type == MPS || mpoa_device_type == MPC) + && length < 22 + number_of_mps_macs*ETH_ALEN) { + printk("\nmpoa: (%s) lane2_assoc_ind: short MPOA Device Type TLV\n", + dev->name); + continue; + } + if (mpoa_device_type != MPS && mpoa_device_type != MPS_AND_MPC) { + dprintk("ignoring non-MPS device\n"); + if (mpoa_device_type == MPC) tlvs += 20; + continue; /* we are only interested in MPSs */ + } + if (number_of_mps_macs == 0 && mpoa_device_type == MPS_AND_MPC) { + printk("\nmpoa: (%s) lane2_assoc_ind: MPS_AND_MPC has zero MACs\n", dev->name); + continue; /* someone should read the spec */ + } + dprintk("this MPS has %d MAC addresses\n", number_of_mps_macs); + + /* ok, now we can go and tell our daemon the control address of MPS */ + send_set_mps_ctrl_addr(tlvs, mpc); + + tlvs = copy_macs(mpc, mac_addr, tlvs, number_of_mps_macs, mpoa_device_type); + if (tlvs == NULL) return; + } + if (end_of_tlvs - tlvs != 0) + printk("mpoa: (%s) lane2_assoc_ind: ignoring %Zd bytes of trailing TLV carbage\n", + dev->name, end_of_tlvs - tlvs); + return; +} + +/* + * Store at least advertizing router's MAC address + * plus the possible MAC address(es) to mpc->mps_macs. + * For a freshly allocated MPOA client mpc->mps_macs == 0. + */ +static uint8_t *copy_macs(struct mpoa_client *mpc, uint8_t *router_mac, + uint8_t *tlvs, uint8_t mps_macs, uint8_t device_type) +{ + int num_macs; + num_macs = (mps_macs > 1) ? mps_macs : 1; + + if (mpc->number_of_mps_macs != num_macs) { /* need to reallocate? 
*/ + if (mpc->number_of_mps_macs != 0) kfree(mpc->mps_macs); + mpc->number_of_mps_macs = 0; + mpc->mps_macs = kmalloc(num_macs*ETH_ALEN, GFP_KERNEL); + if (mpc->mps_macs == NULL) { + printk("mpoa: (%s) copy_macs: out of mem\n", mpc->dev->name); + return NULL; + } + } + memcpy(mpc->mps_macs, router_mac, ETH_ALEN); + tlvs += 20; if (device_type == MPS_AND_MPC) tlvs += 20; + if (mps_macs > 0) + memcpy(mpc->mps_macs, tlvs, mps_macs*ETH_ALEN); + tlvs += mps_macs*ETH_ALEN; + mpc->number_of_mps_macs = num_macs; + + return tlvs; +} + +static int send_via_shortcut(struct sk_buff *skb, struct mpoa_client *mpc) +{ + in_cache_entry *entry; + struct iphdr *iph; + char *buff; + uint32_t ipaddr = 0; + + static struct { + struct llc_snap_hdr hdr; + uint32_t tag; + } tagged_llc_snap_hdr = { + {0xaa, 0xaa, 0x03, {0x00, 0x00, 0x00}, {0x88, 0x4c}}, + 0 + }; + + buff = skb->data + mpc->dev->hard_header_len; + iph = (struct iphdr *)buff; + ipaddr = iph->daddr; + + ddprintk("mpoa: (%s) send_via_shortcut: ipaddr 0x%x\n", mpc->dev->name, ipaddr); + + entry = mpc->in_ops->get(ipaddr, mpc); + if (entry == NULL) { + entry = mpc->in_ops->add_entry(ipaddr, mpc); + if (entry != NULL) mpc->in_ops->put(entry); + return 1; + } + if (mpc->in_ops->cache_hit(entry, mpc) != OPEN){ /* threshold not exceeded or VCC not ready */ + ddprintk("mpoa: (%s) send_via_shortcut: cache_hit: returns != OPEN\n", mpc->dev->name); + mpc->in_ops->put(entry); + return 1; + } + + ddprintk("mpoa: (%s) send_via_shortcut: using shortcut\n", mpc->dev->name); + /* MPOA spec A.1.4, MPOA client must decrement IP ttl at least by one */ + if (iph->ttl <= 1) { + ddprintk("mpoa: (%s) send_via_shortcut: IP ttl = %u, using LANE\n", mpc->dev->name, iph->ttl); + mpc->in_ops->put(entry); + return 1; + } + iph->ttl--; + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + if (entry->ctrl_info.tag != 0) { + ddprintk("mpoa: (%s) send_via_shortcut: adding tag 0x%x\n", mpc->dev->name, entry->ctrl_info.tag); + tagged_llc_snap_hdr.tag = entry->ctrl_info.tag; + skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ + skb_push(skb, sizeof(tagged_llc_snap_hdr)); /* add LLC/SNAP header */ + memcpy(skb->data, &tagged_llc_snap_hdr, sizeof(tagged_llc_snap_hdr)); + } else { + skb_pull(skb, ETH_HLEN); /* get rid of Eth header */ + skb_push(skb, sizeof(struct llc_snap_hdr)); /* add LLC/SNAP header + tag */ + memcpy(skb->data, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr)); + } + + atomic_add(skb->truesize, &sk_atm(entry->shortcut)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = entry->shortcut->atm_options; + entry->shortcut->send(entry->shortcut, skb); + entry->packets_fwded++; + mpc->in_ops->put(entry); + + return 0; +} + +/* + * Probably needs some error checks and locking, not sure... + */ +static int mpc_send_packet(struct sk_buff *skb, struct net_device *dev) +{ + int retval; + struct mpoa_client *mpc; + struct ethhdr *eth; + int i = 0; + + mpc = find_mpc_by_lec(dev); /* this should NEVER fail */ + if(mpc == NULL) { + printk("mpoa: (%s) mpc_send_packet: no MPC found\n", dev->name); + goto non_ip; + } + + eth = (struct ethhdr *)skb->data; + if (eth->h_proto != htons(ETH_P_IP)) + goto non_ip; /* Multi-Protocol Over ATM :-) */ + + while (i < mpc->number_of_mps_macs) { + if (memcmp(eth->h_dest, (mpc->mps_macs + i*ETH_ALEN), ETH_ALEN) == 0) + if ( send_via_shortcut(skb, mpc) == 0 ) /* try shortcut */ + return 0; /* success! 
*/ + i++; + } + + non_ip: + retval = mpc->old_hard_start_xmit(skb,dev); + + return retval; +} + +static int atm_mpoa_vcc_attach(struct atm_vcc *vcc, void __user *arg) +{ + int bytes_left; + struct mpoa_client *mpc; + struct atmmpc_ioc ioc_data; + in_cache_entry *in_entry; + uint32_t ipaddr; + unsigned char *ip; + + bytes_left = copy_from_user(&ioc_data, arg, sizeof(struct atmmpc_ioc)); + if (bytes_left != 0) { + printk("mpoa: mpc_vcc_attach: Short read (missed %d bytes) from userland\n", bytes_left); + return -EFAULT; + } + ipaddr = ioc_data.ipaddr; + if (ioc_data.dev_num < 0 || ioc_data.dev_num >= MAX_LEC_ITF) + return -EINVAL; + + mpc = find_mpc_by_itfnum(ioc_data.dev_num); + if (mpc == NULL) + return -EINVAL; + + if (ioc_data.type == MPC_SOCKET_INGRESS) { + in_entry = mpc->in_ops->get(ipaddr, mpc); + if (in_entry == NULL || in_entry->entry_state < INGRESS_RESOLVED) { + printk("mpoa: (%s) mpc_vcc_attach: did not find RESOLVED entry from ingress cache\n", + mpc->dev->name); + if (in_entry != NULL) mpc->in_ops->put(in_entry); + return -EINVAL; + } + ip = (unsigned char*)&in_entry->ctrl_info.in_dst_ip; + printk("mpoa: (%s) mpc_vcc_attach: attaching ingress SVC, entry = %u.%u.%u.%u\n", + mpc->dev->name, ip[0], ip[1], ip[2], ip[3]); + in_entry->shortcut = vcc; + mpc->in_ops->put(in_entry); + } else { + printk("mpoa: (%s) mpc_vcc_attach: attaching egress SVC\n", mpc->dev->name); + } + + vcc->proto_data = mpc->dev; + vcc->push = mpc_push; + + return 0; +} + +/* + * + */ +static void mpc_vcc_close(struct atm_vcc *vcc, struct net_device *dev) +{ + struct mpoa_client *mpc; + in_cache_entry *in_entry; + eg_cache_entry *eg_entry; + + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) { + printk("mpoa: (%s) mpc_vcc_close: close for unknown MPC\n", dev->name); + return; + } + + dprintk("mpoa: (%s) mpc_vcc_close:\n", dev->name); + in_entry = mpc->in_ops->get_by_vcc(vcc, mpc); + if (in_entry) { + unsigned char *ip __attribute__ ((unused)) = + (unsigned char *)&in_entry->ctrl_info.in_dst_ip; + dprintk("mpoa: (%s) mpc_vcc_close: ingress SVC closed ip = %u.%u.%u.%u\n", + mpc->dev->name, ip[0], ip[1], ip[2], ip[3]); + in_entry->shortcut = NULL; + mpc->in_ops->put(in_entry); + } + eg_entry = mpc->eg_ops->get_by_vcc(vcc, mpc); + if (eg_entry) { + dprintk("mpoa: (%s) mpc_vcc_close: egress SVC closed\n", mpc->dev->name); + eg_entry->shortcut = NULL; + mpc->eg_ops->put(eg_entry); + } + + if (in_entry == NULL && eg_entry == NULL) + dprintk("mpoa: (%s) mpc_vcc_close: unused vcc closed\n", dev->name); + + return; +} + +static void mpc_push(struct atm_vcc *vcc, struct sk_buff *skb) +{ + struct net_device *dev = (struct net_device *)vcc->proto_data; + struct sk_buff *new_skb; + eg_cache_entry *eg; + struct mpoa_client *mpc; + uint32_t tag; + char *tmp; + + ddprintk("mpoa: (%s) mpc_push:\n", dev->name); + if (skb == NULL) { + dprintk("mpoa: (%s) mpc_push: null skb, closing VCC\n", dev->name); + mpc_vcc_close(vcc, dev); + return; + } + + skb->dev = dev; + if (memcmp(skb->data, &llc_snap_mpoa_ctrl, sizeof(struct llc_snap_hdr)) == 0) { + struct sock *sk = sk_atm(vcc); + + dprintk("mpoa: (%s) mpc_push: control packet arrived\n", dev->name); + /* Pass control packets to daemon */ + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + return; + } + + /* data coming over the shortcut */ + atm_return(vcc, skb->truesize); + + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) { + printk("mpoa: (%s) mpc_push: unknown MPC\n", dev->name); + return; + } + + if (memcmp(skb->data, &llc_snap_mpoa_data_tagged, 
sizeof(struct llc_snap_hdr)) == 0) { /* MPOA tagged data */ + ddprintk("mpoa: (%s) mpc_push: tagged data packet arrived\n", dev->name); + + } else if (memcmp(skb->data, &llc_snap_mpoa_data, sizeof(struct llc_snap_hdr)) == 0) { /* MPOA data */ + printk("mpoa: (%s) mpc_push: non-tagged data packet arrived\n", dev->name); + printk(" mpc_push: non-tagged data unsupported, purging\n"); + dev_kfree_skb_any(skb); + return; + } else { + printk("mpoa: (%s) mpc_push: garbage arrived, purging\n", dev->name); + dev_kfree_skb_any(skb); + return; + } + + tmp = skb->data + sizeof(struct llc_snap_hdr); + tag = *(uint32_t *)tmp; + + eg = mpc->eg_ops->get_by_tag(tag, mpc); + if (eg == NULL) { + printk("mpoa: (%s) mpc_push: Didn't find egress cache entry, tag = %u\n", + dev->name,tag); + purge_egress_shortcut(vcc, NULL); + dev_kfree_skb_any(skb); + return; + } + + /* + * See if ingress MPC is using shortcut we opened as a return channel. + * This means we have a bi-directional vcc opened by us. + */ + if (eg->shortcut == NULL) { + eg->shortcut = vcc; + printk("mpoa: (%s) mpc_push: egress SVC in use\n", dev->name); + } + + skb_pull(skb, sizeof(struct llc_snap_hdr) + sizeof(tag)); /* get rid of LLC/SNAP header */ + new_skb = skb_realloc_headroom(skb, eg->ctrl_info.DH_length); /* LLC/SNAP is shorter than MAC header :( */ + dev_kfree_skb_any(skb); + if (new_skb == NULL){ + mpc->eg_ops->put(eg); + return; + } + skb_push(new_skb, eg->ctrl_info.DH_length); /* add MAC header */ + memcpy(new_skb->data, eg->ctrl_info.DLL_header, eg->ctrl_info.DH_length); + new_skb->protocol = eth_type_trans(new_skb, dev); + new_skb->nh.raw = new_skb->data; + + eg->latest_ip_addr = new_skb->nh.iph->saddr; + eg->packets_rcvd++; + mpc->eg_ops->put(eg); + + memset(ATM_SKB(skb), 0, sizeof(struct atm_skb_data)); + netif_rx(new_skb); + + return; +} + +static struct atmdev_ops mpc_ops = { /* only send is required */ + .close = mpoad_close, + .send = msg_from_mpoad +}; + +static struct atm_dev mpc_dev = { + .ops = &mpc_ops, + .type = "mpc", + .number = 42, + .lock = SPIN_LOCK_UNLOCKED + /* members not explicitly initialised will be 0 */ +}; + +static int atm_mpoa_mpoad_attach (struct atm_vcc *vcc, int arg) +{ + struct mpoa_client *mpc; + struct lec_priv *priv; + int err; + + if (mpcs == NULL) { + init_timer(&mpc_timer); + mpc_timer_refresh(); + + /* This lets us now how our LECs are doing */ + err = register_netdevice_notifier(&mpoa_notifier); + if (err < 0) { + del_timer(&mpc_timer); + return err; + } + } + + mpc = find_mpc_by_itfnum(arg); + if (mpc == NULL) { + dprintk("mpoa: mpoad_attach: allocating new mpc for itf %d\n", arg); + mpc = alloc_mpc(); + if (mpc == NULL) + return -ENOMEM; + mpc->dev_num = arg; + mpc->dev = find_lec_by_itfnum(arg); /* NULL if there was no lec */ + } + if (mpc->mpoad_vcc) { + printk("mpoa: mpoad_attach: mpoad is already present for itf %d\n", arg); + return -EADDRINUSE; + } + + if (mpc->dev) { /* check if the lec is LANE2 capable */ + priv = (struct lec_priv *)mpc->dev->priv; + if (priv->lane_version < 2) { + dev_put(mpc->dev); + mpc->dev = NULL; + } else + priv->lane2_ops->associate_indicator = lane2_assoc_ind; + } + + mpc->mpoad_vcc = vcc; + vcc->dev = &mpc_dev; + vcc_insert_socket(sk_atm(vcc)); + set_bit(ATM_VF_META,&vcc->flags); + set_bit(ATM_VF_READY,&vcc->flags); + + if (mpc->dev) { + char empty[ATM_ESA_LEN]; + memset(empty, 0, ATM_ESA_LEN); + + start_mpc(mpc, mpc->dev); + /* set address if mpcd e.g. gets killed and restarted. 
+ * If we do not do it now we have to wait for the next LE_ARP + */ + if ( memcmp(mpc->mps_ctrl_addr, empty, ATM_ESA_LEN) != 0 ) + send_set_mps_ctrl_addr(mpc->mps_ctrl_addr, mpc); + } + + __module_get(THIS_MODULE); + return arg; +} + +static void send_set_mps_ctrl_addr(char *addr, struct mpoa_client *mpc) +{ + struct k_message mesg; + + memcpy (mpc->mps_ctrl_addr, addr, ATM_ESA_LEN); + + mesg.type = SET_MPS_CTRL_ADDR; + memcpy(mesg.MPS_ctrl, addr, ATM_ESA_LEN); + msg_to_mpoad(&mesg, mpc); + + return; +} + +static void mpoad_close(struct atm_vcc *vcc) +{ + struct mpoa_client *mpc; + struct sk_buff *skb; + + mpc = find_mpc_by_vcc(vcc); + if (mpc == NULL) { + printk("mpoa: mpoad_close: did not find MPC\n"); + return; + } + if (!mpc->mpoad_vcc) { + printk("mpoa: mpoad_close: close for non-present mpoad\n"); + return; + } + + mpc->mpoad_vcc = NULL; + if (mpc->dev) { + struct lec_priv *priv = (struct lec_priv *)mpc->dev->priv; + priv->lane2_ops->associate_indicator = NULL; + stop_mpc(mpc); + dev_put(mpc->dev); + } + + mpc->in_ops->destroy_cache(mpc); + mpc->eg_ops->destroy_cache(mpc); + + while ((skb = skb_dequeue(&sk_atm(vcc)->sk_receive_queue))) { + atm_return(vcc, skb->truesize); + kfree_skb(skb); + } + + printk("mpoa: (%s) going down\n", + (mpc->dev) ? mpc->dev->name : ""); + module_put(THIS_MODULE); + + return; +} + +/* + * + */ +static int msg_from_mpoad(struct atm_vcc *vcc, struct sk_buff *skb) +{ + + struct mpoa_client *mpc = find_mpc_by_vcc(vcc); + struct k_message *mesg = (struct k_message*)skb->data; + atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + + if (mpc == NULL) { + printk("mpoa: msg_from_mpoad: no mpc found\n"); + return 0; + } + dprintk("mpoa: (%s) msg_from_mpoad:", (mpc->dev) ? mpc->dev->name : ""); + switch(mesg->type) { + case MPOA_RES_REPLY_RCVD: + dprintk(" mpoa_res_reply_rcvd\n"); + MPOA_res_reply_rcvd(mesg, mpc); + break; + case MPOA_TRIGGER_RCVD: + dprintk(" mpoa_trigger_rcvd\n"); + MPOA_trigger_rcvd(mesg, mpc); + break; + case INGRESS_PURGE_RCVD: + dprintk(" nhrp_purge_rcvd\n"); + ingress_purge_rcvd(mesg, mpc); + break; + case EGRESS_PURGE_RCVD: + dprintk(" egress_purge_reply_rcvd\n"); + egress_purge_rcvd(mesg, mpc); + break; + case MPS_DEATH: + dprintk(" mps_death\n"); + mps_death(mesg, mpc); + break; + case CACHE_IMPOS_RCVD: + dprintk(" cache_impos_rcvd\n"); + MPOA_cache_impos_rcvd(mesg, mpc); + break; + case SET_MPC_CTRL_ADDR: + dprintk(" set_mpc_ctrl_addr\n"); + set_mpc_ctrl_addr_rcvd(mesg, mpc); + break; + case SET_MPS_MAC_ADDR: + dprintk(" set_mps_mac_addr\n"); + set_mps_mac_addr_rcvd(mesg, mpc); + break; + case CLEAN_UP_AND_EXIT: + dprintk(" clean_up_and_exit\n"); + clean_up(mesg, mpc, DIE); + break; + case RELOAD: + dprintk(" reload\n"); + clean_up(mesg, mpc, RELOAD); + break; + case SET_MPC_PARAMS: + dprintk(" set_mpc_params\n"); + mpc->parameters = mesg->content.params; + break; + default: + dprintk(" unknown message %d\n", mesg->type); + break; + } + kfree_skb(skb); + + return 0; +} + +/* Remember that this function may not do things that sleep */ +int msg_to_mpoad(struct k_message *mesg, struct mpoa_client *mpc) +{ + struct sk_buff *skb; + struct sock *sk; + + if (mpc == NULL || !mpc->mpoad_vcc) { + printk("mpoa: msg_to_mpoad: mesg %d to a non-existent mpoad\n", mesg->type); + return -ENXIO; + } + + skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + skb_put(skb, sizeof(struct k_message)); + memcpy(skb->data, mesg, sizeof(struct k_message)); + atm_force_charge(mpc->mpoad_vcc, skb->truesize); + + sk = 
sk_atm(mpc->mpoad_vcc); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + return 0; +} + +static int mpoa_event_listener(struct notifier_block *mpoa_notifier, unsigned long event, void *dev_ptr) +{ + struct net_device *dev; + struct mpoa_client *mpc; + struct lec_priv *priv; + + dev = (struct net_device *)dev_ptr; + if (dev->name == NULL || strncmp(dev->name, "lec", 3)) + return NOTIFY_DONE; /* we are only interested in lec:s */ + + switch (event) { + case NETDEV_REGISTER: /* a new lec device was allocated */ + priv = (struct lec_priv *)dev->priv; + if (priv->lane_version < 2) + break; + priv->lane2_ops->associate_indicator = lane2_assoc_ind; + mpc = find_mpc_by_itfnum(priv->itfnum); + if (mpc == NULL) { + dprintk("mpoa: mpoa_event_listener: allocating new mpc for %s\n", + dev->name); + mpc = alloc_mpc(); + if (mpc == NULL) { + printk("mpoa: mpoa_event_listener: no new mpc"); + break; + } + } + mpc->dev_num = priv->itfnum; + mpc->dev = dev; + dev_hold(dev); + dprintk("mpoa: (%s) was initialized\n", dev->name); + break; + case NETDEV_UNREGISTER: + /* the lec device was deallocated */ + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) + break; + dprintk("mpoa: device (%s) was deallocated\n", dev->name); + stop_mpc(mpc); + dev_put(mpc->dev); + mpc->dev = NULL; + break; + case NETDEV_UP: + /* the dev was ifconfig'ed up */ + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) + break; + if (mpc->mpoad_vcc != NULL) { + start_mpc(mpc, dev); + } + break; + case NETDEV_DOWN: + /* the dev was ifconfig'ed down */ + /* this means that the flow of packets from the + * upper layer stops + */ + mpc = find_mpc_by_lec(dev); + if (mpc == NULL) + break; + if (mpc->mpoad_vcc != NULL) { + stop_mpc(mpc); + } + break; + case NETDEV_REBOOT: + case NETDEV_CHANGE: + case NETDEV_CHANGEMTU: + case NETDEV_CHANGEADDR: + case NETDEV_GOING_DOWN: + break; + default: + break; + } + + return NOTIFY_DONE; +} + +/* + * Functions which are called after a message is received from mpcd. + * Msg is reused on purpose. + */ + + +static void MPOA_trigger_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + uint32_t dst_ip = msg->content.in_info.in_dst_ip; + in_cache_entry *entry; + + entry = mpc->in_ops->get(dst_ip, mpc); + if(entry == NULL){ + entry = mpc->in_ops->add_entry(dst_ip, mpc); + entry->entry_state = INGRESS_RESOLVING; + msg->type = SND_MPOA_RES_RQST; + msg->content.in_info = entry->ctrl_info; + msg_to_mpoad(msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + mpc->in_ops->put(entry); + return; + } + + if(entry->entry_state == INGRESS_INVALID){ + entry->entry_state = INGRESS_RESOLVING; + msg->type = SND_MPOA_RES_RQST; + msg->content.in_info = entry->ctrl_info; + msg_to_mpoad(msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + mpc->in_ops->put(entry); + return; + } + + printk("mpoa: (%s) MPOA_trigger_rcvd: entry already in resolving state\n", + (mpc->dev) ? mpc->dev->name : ""); + mpc->in_ops->put(entry); + return; +} + +/* + * Things get complicated because we have to check if there's an egress + * shortcut with suitable traffic parameters we could use. 
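/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * the reuse test in check_qos_and_open_shortcut() below boils down to
 * "do the existing shortcut's, the request's and the locally configured
 * traffic classes intersect, and is the shortcut either UBR or of
 * non-zero peak cell rate?".  The hypothetical predicate below states
 * that check in isolation (the code uses ATM_UBR | ATM_CBR as the
 * default configured class when no QoS entry exists).
 */
static int sketch_shortcut_reusable(int shortcut_class, int shortcut_max_pcr,
				    int requested_class, int configured_class)
{
	if (!(shortcut_class & requested_class & configured_class))
		return 0;	/* traffic classes do not overlap at all */
	return shortcut_class == ATM_UBR || shortcut_max_pcr > 0;
}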
+ */ +static void check_qos_and_open_shortcut(struct k_message *msg, struct mpoa_client *client, in_cache_entry *entry) +{ + uint32_t dst_ip = msg->content.in_info.in_dst_ip; + unsigned char *ip __attribute__ ((unused)) = (unsigned char *)&dst_ip; + struct atm_mpoa_qos *qos = atm_mpoa_search_qos(dst_ip); + eg_cache_entry *eg_entry = client->eg_ops->get_by_src_ip(dst_ip, client); + + if(eg_entry && eg_entry->shortcut){ + if(eg_entry->shortcut->qos.txtp.traffic_class & + msg->qos.txtp.traffic_class & + (qos ? qos->qos.txtp.traffic_class : ATM_UBR | ATM_CBR)){ + if(eg_entry->shortcut->qos.txtp.traffic_class == ATM_UBR) + entry->shortcut = eg_entry->shortcut; + else if(eg_entry->shortcut->qos.txtp.max_pcr > 0) + entry->shortcut = eg_entry->shortcut; + } + if(entry->shortcut){ + dprintk("mpoa: (%s) using egress SVC to reach %u.%u.%u.%u\n",client->dev->name, NIPQUAD(ip)); + client->eg_ops->put(eg_entry); + return; + } + } + if (eg_entry != NULL) + client->eg_ops->put(eg_entry); + + /* No luck in the egress cache we must open an ingress SVC */ + msg->type = OPEN_INGRESS_SVC; + if (qos && (qos->qos.txtp.traffic_class == msg->qos.txtp.traffic_class)) + { + msg->qos = qos->qos; + printk("mpoa: (%s) trying to get a CBR shortcut\n",client->dev->name); + } + else memset(&msg->qos,0,sizeof(struct atm_qos)); + msg_to_mpoad(msg, client); + return; +} + +static void MPOA_res_reply_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + unsigned char *ip; + + uint32_t dst_ip = msg->content.in_info.in_dst_ip; + in_cache_entry *entry = mpc->in_ops->get(dst_ip, mpc); + ip = (unsigned char *)&dst_ip; + dprintk("mpoa: (%s) MPOA_res_reply_rcvd: ip %u.%u.%u.%u\n", mpc->dev->name, NIPQUAD(ip)); + ddprintk("mpoa: (%s) MPOA_res_reply_rcvd() entry = %p", mpc->dev->name, entry); + if(entry == NULL){ + printk("\nmpoa: (%s) ARGH, received res. reply for an entry that doesn't exist.\n", mpc->dev->name); + return; + } + ddprintk(" entry_state = %d ", entry->entry_state); + + if (entry->entry_state == INGRESS_RESOLVED) { + printk("\nmpoa: (%s) MPOA_res_reply_rcvd for RESOLVED entry!\n", mpc->dev->name); + mpc->in_ops->put(entry); + return; + } + + entry->ctrl_info = msg->content.in_info; + do_gettimeofday(&(entry->tv)); + do_gettimeofday(&(entry->reply_wait)); /* Used in refreshing func from now on */ + entry->refresh_time = 0; + ddprintk("entry->shortcut = %p\n", entry->shortcut); + + if(entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL){ + entry->entry_state = INGRESS_RESOLVED; + mpc->in_ops->put(entry); + return; /* Shortcut already open... 
*/ + } + + if (entry->shortcut != NULL) { + printk("mpoa: (%s) MPOA_res_reply_rcvd: entry->shortcut != NULL, impossible!\n", + mpc->dev->name); + mpc->in_ops->put(entry); + return; + } + + check_qos_and_open_shortcut(msg, mpc, entry); + entry->entry_state = INGRESS_RESOLVED; + mpc->in_ops->put(entry); + + return; + +} + +static void ingress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + uint32_t dst_ip = msg->content.in_info.in_dst_ip; + uint32_t mask = msg->ip_mask; + unsigned char *ip = (unsigned char *)&dst_ip; + in_cache_entry *entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask); + + if(entry == NULL){ + printk("mpoa: (%s) ingress_purge_rcvd: purge for a non-existing entry, ", mpc->dev->name); + printk("ip = %u.%u.%u.%u\n", ip[0], ip[1], ip[2], ip[3]); + return; + } + + do { + dprintk("mpoa: (%s) ingress_purge_rcvd: removing an ingress entry, ip = %u.%u.%u.%u\n" , + mpc->dev->name, ip[0], ip[1], ip[2], ip[3]); + write_lock_bh(&mpc->ingress_lock); + mpc->in_ops->remove_entry(entry, mpc); + write_unlock_bh(&mpc->ingress_lock); + mpc->in_ops->put(entry); + entry = mpc->in_ops->get_with_mask(dst_ip, mpc, mask); + } while (entry != NULL); + + return; +} + +static void egress_purge_rcvd(struct k_message *msg, struct mpoa_client *mpc) +{ + uint32_t cache_id = msg->content.eg_info.cache_id; + eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(cache_id, mpc); + + if (entry == NULL) { + dprintk("mpoa: (%s) egress_purge_rcvd: purge for a non-existing entry\n", mpc->dev->name); + return; + } + + write_lock_irq(&mpc->egress_lock); + mpc->eg_ops->remove_entry(entry, mpc); + write_unlock_irq(&mpc->egress_lock); + + mpc->eg_ops->put(entry); + + return; +} + +static void purge_egress_shortcut(struct atm_vcc *vcc, eg_cache_entry *entry) +{ + struct sock *sk; + struct k_message *purge_msg; + struct sk_buff *skb; + + dprintk("mpoa: purge_egress_shortcut: entering\n"); + if (vcc == NULL) { + printk("mpoa: purge_egress_shortcut: vcc == NULL\n"); + return; + } + + skb = alloc_skb(sizeof(struct k_message), GFP_ATOMIC); + if (skb == NULL) { + printk("mpoa: purge_egress_shortcut: out of memory\n"); + return; + } + + skb_put(skb, sizeof(struct k_message)); + memset(skb->data, 0, sizeof(struct k_message)); + purge_msg = (struct k_message *)skb->data; + purge_msg->type = DATA_PLANE_PURGE; + if (entry != NULL) + purge_msg->content.eg_info = entry->ctrl_info; + + atm_force_charge(vcc, skb->truesize); + + sk = sk_atm(vcc); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + dprintk("mpoa: purge_egress_shortcut: exiting:\n"); + + return; +} + +/* + * Our MPS died. Tell our daemon to send NHRP data plane purge to each + * of the egress shortcuts we have. 
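/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * msg_to_mpoad() and purge_egress_shortcut() above notify the user-space
 * daemon the same way -- copy a struct k_message into a fresh skb,
 * charge the control VCC for the buffer and queue the skb on that VCC's
 * socket receive queue.  The hypothetical helper below isolates that
 * pattern.
 */
static int sketch_notify_mpoad(struct atm_vcc *ctrl_vcc,
			       const struct k_message *mesg)
{
	struct sk_buff *skb = alloc_skb(sizeof(*mesg), GFP_ATOMIC);
	struct sock *sk;

	if (skb == NULL)
		return -ENOMEM;
	skb_put(skb, sizeof(*mesg));
	memcpy(skb->data, mesg, sizeof(*mesg));	/* payload is the raw k_message */
	atm_force_charge(ctrl_vcc, skb->truesize);	/* account the buffer to the VCC */

	sk = sk_atm(ctrl_vcc);
	skb_queue_tail(&sk->sk_receive_queue, skb);
	sk->sk_data_ready(sk, skb->len);	/* wake up the daemon's read() */
	return 0;
}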
+ */ +static void mps_death( struct k_message * msg, struct mpoa_client * mpc ) +{ + eg_cache_entry *entry; + + dprintk("mpoa: (%s) mps_death:\n", mpc->dev->name); + + if(memcmp(msg->MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN)){ + printk("mpoa: (%s) mps_death: wrong MPS\n", mpc->dev->name); + return; + } + + /* FIXME: This knows too much of the cache structure */ + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while (entry != NULL) { + purge_egress_shortcut(entry->shortcut, entry); + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + mpc->in_ops->destroy_cache(mpc); + mpc->eg_ops->destroy_cache(mpc); + + return; +} + +static void MPOA_cache_impos_rcvd( struct k_message * msg, struct mpoa_client * mpc) +{ + uint16_t holding_time; + eg_cache_entry *entry = mpc->eg_ops->get_by_cache_id(msg->content.eg_info.cache_id, mpc); + + holding_time = msg->content.eg_info.holding_time; + dprintk("mpoa: (%s) MPOA_cache_impos_rcvd: entry = %p, holding_time = %u\n", + mpc->dev->name, entry, holding_time); + if(entry == NULL && holding_time) { + entry = mpc->eg_ops->add_entry(msg, mpc); + mpc->eg_ops->put(entry); + return; + } + if(holding_time){ + mpc->eg_ops->update(entry, holding_time); + return; + } + + write_lock_irq(&mpc->egress_lock); + mpc->eg_ops->remove_entry(entry, mpc); + write_unlock_irq(&mpc->egress_lock); + + mpc->eg_ops->put(entry); + + return; +} + +static void set_mpc_ctrl_addr_rcvd(struct k_message *mesg, struct mpoa_client *mpc) +{ + struct lec_priv *priv; + int i, retval ; + + uint8_t tlv[4 + 1 + 1 + 1 + ATM_ESA_LEN]; + + tlv[0] = 00; tlv[1] = 0xa0; tlv[2] = 0x3e; tlv[3] = 0x2a; /* type */ + tlv[4] = 1 + 1 + ATM_ESA_LEN; /* length */ + tlv[5] = 0x02; /* MPOA client */ + tlv[6] = 0x00; /* number of MPS MAC addresses */ + + memcpy(&tlv[7], mesg->MPS_ctrl, ATM_ESA_LEN); /* MPC ctrl ATM addr */ + memcpy(mpc->our_ctrl_addr, mesg->MPS_ctrl, ATM_ESA_LEN); + + dprintk("mpoa: (%s) setting MPC ctrl ATM address to ", + (mpc->dev) ? 
mpc->dev->name : ""); + for (i = 7; i < sizeof(tlv); i++) + dprintk("%02x ", tlv[i]); + dprintk("\n"); + + if (mpc->dev) { + priv = (struct lec_priv *)mpc->dev->priv; + retval = priv->lane2_ops->associate_req(mpc->dev, mpc->dev->dev_addr, tlv, sizeof(tlv)); + if (retval == 0) + printk("mpoa: (%s) MPOA device type TLV association failed\n", mpc->dev->name); + retval = priv->lane2_ops->resolve(mpc->dev, NULL, 1, NULL, NULL); + if (retval < 0) + printk("mpoa: (%s) targetless LE_ARP request failed\n", mpc->dev->name); + } + + return; +} + +static void set_mps_mac_addr_rcvd(struct k_message *msg, struct mpoa_client *client) +{ + + if(client->number_of_mps_macs) + kfree(client->mps_macs); + client->number_of_mps_macs = 0; + client->mps_macs = kmalloc(ETH_ALEN,GFP_KERNEL); + if (client->mps_macs == NULL) { + printk("mpoa: set_mps_mac_addr_rcvd: out of memory\n"); + return; + } + client->number_of_mps_macs = 1; + memcpy(client->mps_macs, msg->MPS_ctrl, ETH_ALEN); + + return; +} + +/* + * purge egress cache and tell daemon to 'action' (DIE, RELOAD) + */ +static void clean_up(struct k_message *msg, struct mpoa_client *mpc, int action) +{ + + eg_cache_entry *entry; + msg->type = SND_EGRESS_PURGE; + + + /* FIXME: This knows too much of the cache structure */ + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while (entry != NULL){ + msg->content.eg_info = entry->ctrl_info; + dprintk("mpoa: cache_id %u\n", entry->ctrl_info.cache_id); + msg_to_mpoad(msg, mpc); + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + msg->type = action; + msg_to_mpoad(msg, mpc); + return; +} + +static void mpc_timer_refresh(void) +{ + mpc_timer.expires = jiffies + (MPC_P2 * HZ); + mpc_timer.data = mpc_timer.expires; + mpc_timer.function = mpc_cache_check; + add_timer(&mpc_timer); + + return; +} + +static void mpc_cache_check( unsigned long checking_time ) +{ + struct mpoa_client *mpc = mpcs; + static unsigned long previous_resolving_check_time; + static unsigned long previous_refresh_time; + + while( mpc != NULL ){ + mpc->in_ops->clear_count(mpc); + mpc->eg_ops->clear_expired(mpc); + if(checking_time - previous_resolving_check_time > mpc->parameters.mpc_p4 * HZ ){ + mpc->in_ops->check_resolving(mpc); + previous_resolving_check_time = checking_time; + } + if(checking_time - previous_refresh_time > mpc->parameters.mpc_p5 * HZ ){ + mpc->in_ops->refresh(mpc); + previous_refresh_time = checking_time; + } + mpc = mpc->next; + } + mpc_timer_refresh(); + + return; +} + +static int atm_mpoa_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int err = 0; + struct atm_vcc *vcc = ATM_SD(sock); + + if (cmd != ATMMPC_CTRL && cmd != ATMMPC_DATA) + return -ENOIOCTLCMD; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ATMMPC_CTRL: + err = atm_mpoa_mpoad_attach(vcc, (int)arg); + if (err >= 0) + sock->state = SS_CONNECTED; + break; + case ATMMPC_DATA: + err = atm_mpoa_vcc_attach(vcc, (void __user *)arg); + break; + default: + break; + } + return err; +} + + +static struct atm_ioctl atm_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = atm_mpoa_ioctl, +}; + +static __init int atm_mpoa_init(void) +{ + register_atm_ioctl(&atm_ioctl_ops); + +#ifdef CONFIG_PROC_FS + if (mpc_proc_init() != 0) + printk(KERN_INFO "mpoa: failed to initialize /proc/mpoa\n"); + else + printk(KERN_INFO "mpoa: /proc/mpoa initialized\n"); +#endif + + printk("mpc.c: " __DATE__ " " __TIME__ " initialized\n"); + + return 0; +} + +static void __exit atm_mpoa_cleanup(void) +{ + struct mpoa_client *mpc, *tmp; 
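/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * mpc_timer_refresh() and mpc_cache_check() above use the self-rearming
 * timer idiom of this kernel generation -- the handler does its periodic
 * work and then re-adds the same timer_list with a new expiry.  The bare
 * pattern, with a made-up SKETCH_PERIOD_SECS, looks like this:
 */
#define SKETCH_PERIOD_SECS 10	/* arbitrary illustrative period */

static struct timer_list sketch_timer;

static void sketch_timer_fn(unsigned long data)
{
	/* ... periodic work goes here ... */
	sketch_timer.expires = jiffies + SKETCH_PERIOD_SECS * HZ;
	add_timer(&sketch_timer);	/* re-arm for the next round */
}

static void sketch_timer_start(void)
{
	init_timer(&sketch_timer);
	sketch_timer.function = sketch_timer_fn;
	sketch_timer.data = 0;
	sketch_timer.expires = jiffies + SKETCH_PERIOD_SECS * HZ;
	add_timer(&sketch_timer);
}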
+ struct atm_mpoa_qos *qos, *nextqos; + struct lec_priv *priv; + +#ifdef CONFIG_PROC_FS + mpc_proc_clean(); +#endif + + del_timer(&mpc_timer); + unregister_netdevice_notifier(&mpoa_notifier); + deregister_atm_ioctl(&atm_ioctl_ops); + + mpc = mpcs; + mpcs = NULL; + while (mpc != NULL) { + tmp = mpc->next; + if (mpc->dev != NULL) { + stop_mpc(mpc); + priv = (struct lec_priv *)mpc->dev->priv; + if (priv->lane2_ops != NULL) + priv->lane2_ops->associate_indicator = NULL; + } + ddprintk("mpoa: cleanup_module: about to clear caches\n"); + mpc->in_ops->destroy_cache(mpc); + mpc->eg_ops->destroy_cache(mpc); + ddprintk("mpoa: cleanup_module: caches cleared\n"); + kfree(mpc->mps_macs); + memset(mpc, 0, sizeof(struct mpoa_client)); + ddprintk("mpoa: cleanup_module: about to kfree %p\n", mpc); + kfree(mpc); + ddprintk("mpoa: cleanup_module: next mpc is at %p\n", tmp); + mpc = tmp; + } + + qos = qos_head; + qos_head = NULL; + while (qos != NULL) { + nextqos = qos->next; + dprintk("mpoa: cleanup_module: freeing qos entry %p\n", qos); + kfree(qos); + qos = nextqos; + } + + return; +} + +module_init(atm_mpoa_init); +module_exit(atm_mpoa_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/net/atm/mpc.h b/net/atm/mpc.h new file mode 100644 index 000000000000..863ddf6079e1 --- /dev/null +++ b/net/atm/mpc.h @@ -0,0 +1,53 @@ +#ifndef _MPC_H_ +#define _MPC_H_ + +#include +#include +#include +#include +#include +#include "mpoa_caches.h" + +/* kernel -> mpc-daemon */ +int msg_to_mpoad(struct k_message *msg, struct mpoa_client *mpc); + +struct mpoa_client { + struct mpoa_client *next; + struct net_device *dev; /* lec in question */ + int dev_num; /* e.g. 2 for lec2 */ + int (*old_hard_start_xmit)(struct sk_buff *skb, struct net_device *dev); + struct atm_vcc *mpoad_vcc; /* control channel to mpoad */ + uint8_t mps_ctrl_addr[ATM_ESA_LEN]; /* MPS control ATM address */ + uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */ + + rwlock_t ingress_lock; + struct in_cache_ops *in_ops; /* ingress cache operations */ + in_cache_entry *in_cache; /* the ingress cache of this MPC */ + + rwlock_t egress_lock; + struct eg_cache_ops *eg_ops; /* egress cache operations */ + eg_cache_entry *eg_cache; /* the egress cache of this MPC */ + + uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */ + int number_of_mps_macs; /* number of the above MAC addresses */ + struct mpc_parameters parameters; /* parameters for this client */ +}; + + +struct atm_mpoa_qos { + struct atm_mpoa_qos *next; + uint32_t ipaddr; + struct atm_qos qos; +}; + + +/* MPOA QoS operations */ +struct atm_mpoa_qos *atm_mpoa_add_qos(uint32_t dst_ip, struct atm_qos *qos); +struct atm_mpoa_qos *atm_mpoa_search_qos(uint32_t dst_ip); +int atm_mpoa_delete_qos(struct atm_mpoa_qos *qos); + +/* Display QoS entries. This is for the procfs */ +struct seq_file; +void atm_mpoa_disp_qos(struct seq_file *m); + +#endif /* _MPC_H_ */ diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c new file mode 100644 index 000000000000..64ddebb64060 --- /dev/null +++ b/net/atm/mpoa_caches.c @@ -0,0 +1,576 @@ +#include +#include +#include + +#include "mpoa_caches.h" +#include "mpc.h" + +/* + * mpoa_caches.c: Implementation of ingress and egress cache + * handling functions + */ + +#if 0 +#define dprintk printk /* debug */ +#else +#define dprintk(format,args...) +#endif + +#if 0 +#define ddprintk printk /* more debug */ +#else +#define ddprintk(format,args...) 
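/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * the #if 0 / #else pairs above are the file's compile-time debug
 * switch -- with 0 in place the dprintk()/ddprintk() calls expand to
 * nothing, and flipping the condition turns every call site into a
 * printk().  A hypothetical variant of the same idea, using the
 * do-while(0) idiom so the disabled form still behaves like a single
 * statement after if/else:
 */
#define SKETCH_DEBUG 0

#if SKETCH_DEBUG
#define sketch_dbg(fmt, args...) printk(fmt, ##args)
#else
#define sketch_dbg(fmt, args...) do { } while (0)	/* compiled out */
#endif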
+#endif + +static in_cache_entry *in_cache_get(uint32_t dst_ip, + struct mpoa_client *client) +{ + in_cache_entry *entry; + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while(entry != NULL){ + if( entry->ctrl_info.in_dst_ip == dst_ip ){ + atomic_inc(&entry->use); + read_unlock_bh(&client->ingress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); + + return NULL; +} + +static in_cache_entry *in_cache_get_with_mask(uint32_t dst_ip, + struct mpoa_client *client, + uint32_t mask) +{ + in_cache_entry *entry; + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while(entry != NULL){ + if((entry->ctrl_info.in_dst_ip & mask) == (dst_ip & mask )){ + atomic_inc(&entry->use); + read_unlock_bh(&client->ingress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); + + return NULL; + +} + +static in_cache_entry *in_cache_get_by_vcc(struct atm_vcc *vcc, + struct mpoa_client *client ) +{ + in_cache_entry *entry; + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while(entry != NULL){ + if(entry->shortcut == vcc) { + atomic_inc(&entry->use); + read_unlock_bh(&client->ingress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); + + return NULL; +} + +static in_cache_entry *in_cache_add_entry(uint32_t dst_ip, + struct mpoa_client *client) +{ + unsigned char *ip __attribute__ ((unused)) = (unsigned char *)&dst_ip; + in_cache_entry* entry = kmalloc(sizeof(in_cache_entry), GFP_KERNEL); + + if (entry == NULL) { + printk("mpoa: mpoa_caches.c: new_in_cache_entry: out of memory\n"); + return NULL; + } + + dprintk("mpoa: mpoa_caches.c: adding an ingress entry, ip = %u.%u.%u.%u\n", ip[0], ip[1], ip[2], ip[3]); + memset(entry,0,sizeof(in_cache_entry)); + + atomic_set(&entry->use, 1); + dprintk("mpoa: mpoa_caches.c: new_in_cache_entry: about to lock\n"); + write_lock_bh(&client->ingress_lock); + entry->next = client->in_cache; + entry->prev = NULL; + if (client->in_cache != NULL) + client->in_cache->prev = entry; + client->in_cache = entry; + + memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); + entry->ctrl_info.in_dst_ip = dst_ip; + do_gettimeofday(&(entry->tv)); + entry->retry_time = client->parameters.mpc_p4; + entry->count = 1; + entry->entry_state = INGRESS_INVALID; + entry->ctrl_info.holding_time = HOLDING_TIME_DEFAULT; + atomic_inc(&entry->use); + + write_unlock_bh(&client->ingress_lock); + dprintk("mpoa: mpoa_caches.c: new_in_cache_entry: unlocked\n"); + + return entry; +} + +static int cache_hit(in_cache_entry *entry, struct mpoa_client *mpc) +{ + struct atm_mpoa_qos *qos; + struct k_message msg; + + entry->count++; + if(entry->entry_state == INGRESS_RESOLVED && entry->shortcut != NULL) + return OPEN; + + if(entry->entry_state == INGRESS_REFRESHING){ + if(entry->count > mpc->parameters.mpc_p1){ + msg.type = SND_MPOA_RES_RQST; + msg.content.in_info = entry->ctrl_info; + memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN); + qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); + if (qos != NULL) msg.qos = qos->qos; + msg_to_mpoad(&msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + entry->entry_state = INGRESS_RESOLVING; + } + if(entry->shortcut != NULL) + return OPEN; + return CLOSED; + } + + if(entry->entry_state == INGRESS_RESOLVING && entry->shortcut != NULL) + return OPEN; + + if( entry->count > mpc->parameters.mpc_p1 && + entry->entry_state == INGRESS_INVALID){ + unsigned char *ip __attribute__ 
((unused)) = + (unsigned char *)&entry->ctrl_info.in_dst_ip; + + dprintk("mpoa: (%s) mpoa_caches.c: threshold exceeded for ip %u.%u.%u.%u, sending MPOA res req\n", mpc->dev->name, ip[0], ip[1], ip[2], ip[3]); + entry->entry_state = INGRESS_RESOLVING; + msg.type = SND_MPOA_RES_RQST; + memcpy(msg.MPS_ctrl, mpc->mps_ctrl_addr, ATM_ESA_LEN ); + msg.content.in_info = entry->ctrl_info; + qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); + if (qos != NULL) msg.qos = qos->qos; + msg_to_mpoad( &msg, mpc); + do_gettimeofday(&(entry->reply_wait)); + } + + return CLOSED; +} + +static void in_cache_put(in_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->use)) { + memset(entry, 0, sizeof(in_cache_entry)); + kfree(entry); + } + + return; +} + +/* + * This should be called with write lock on + */ +static void in_cache_remove_entry(in_cache_entry *entry, + struct mpoa_client *client) +{ + struct atm_vcc *vcc; + struct k_message msg; + unsigned char *ip; + + vcc = entry->shortcut; + ip = (unsigned char *)&entry->ctrl_info.in_dst_ip; + dprintk("mpoa: mpoa_caches.c: removing an ingress entry, ip = %u.%u.%u.%u\n",ip[0], ip[1], ip[2], ip[3]); + + if (entry->prev != NULL) + entry->prev->next = entry->next; + else + client->in_cache = entry->next; + if (entry->next != NULL) + entry->next->prev = entry->prev; + client->in_ops->put(entry); + if(client->in_cache == NULL && client->eg_cache == NULL){ + msg.type = STOP_KEEP_ALIVE_SM; + msg_to_mpoad(&msg,client); + } + + /* Check if the egress side still uses this VCC */ + if (vcc != NULL) { + eg_cache_entry *eg_entry = client->eg_ops->get_by_vcc(vcc, client); + if (eg_entry != NULL) { + client->eg_ops->put(eg_entry); + return; + } + vcc_release_async(vcc, -EPIPE); + } + + return; +} + + +/* Call this every MPC-p2 seconds... Not exactly correct solution, + but an easy one... */ +static void clear_count_and_expired(struct mpoa_client *client) +{ + unsigned char *ip; + in_cache_entry *entry, *next_entry; + struct timeval now; + + do_gettimeofday(&now); + + write_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while(entry != NULL){ + entry->count=0; + next_entry = entry->next; + if((now.tv_sec - entry->tv.tv_sec) + > entry->ctrl_info.holding_time){ + ip = (unsigned char*)&entry->ctrl_info.in_dst_ip; + dprintk("mpoa: mpoa_caches.c: holding time expired, ip = %u.%u.%u.%u\n", NIPQUAD(ip)); + client->in_ops->remove_entry(entry, client); + } + entry = next_entry; + } + write_unlock_bh(&client->ingress_lock); + + return; +} + +/* Call this every MPC-p4 seconds. */ +static void check_resolving_entries(struct mpoa_client *client) +{ + + struct atm_mpoa_qos *qos; + in_cache_entry *entry; + struct timeval now; + struct k_message msg; + + do_gettimeofday( &now ); + + read_lock_bh(&client->ingress_lock); + entry = client->in_cache; + while( entry != NULL ){ + if(entry->entry_state == INGRESS_RESOLVING){ + if(now.tv_sec - entry->hold_down.tv_sec < client->parameters.mpc_p6){ + entry = entry->next; /* Entry in hold down */ + continue; + } + if( (now.tv_sec - entry->reply_wait.tv_sec) > + entry->retry_time ){ + entry->retry_time = MPC_C1*( entry->retry_time ); + if(entry->retry_time > client->parameters.mpc_p5){ + /* Retry time maximum exceeded, put entry in hold down. */ + do_gettimeofday(&(entry->hold_down)); + entry->retry_time = client->parameters.mpc_p4; + entry = entry->next; + continue; + } + /* Ask daemon to send a resolution request. 
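/*
 * Illustrative sketch (editor's addition, not from the original patch):
 * the retry handling just above is exponential backoff with a
 * hold-down -- each unanswered resolution request scales retry_time by
 * MPC_C1, and once it exceeds the mpc_p5 ceiling the entry is parked
 * for mpc_p6 seconds before the cycle restarts from mpc_p4.  With
 * made-up plain-integer parameters (the growth factor 2 is only a
 * stand-in for MPC_C1), the policy is:
 */
static unsigned int sketch_next_retry(unsigned int current_interval,
				      unsigned int base, unsigned int ceiling,
				      int *enter_hold_down)
{
	unsigned int next = current_interval * 2;

	if (next > ceiling) {
		*enter_hold_down = 1;	/* stop retrying, park the entry */
		return base;		/* restart from the base interval later */
	}
	*enter_hold_down = 0;
	return next;
}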
*/ + memset(&(entry->hold_down),0,sizeof(struct timeval)); + msg.type = SND_MPOA_RES_RTRY; + memcpy(msg.MPS_ctrl, client->mps_ctrl_addr, ATM_ESA_LEN); + msg.content.in_info = entry->ctrl_info; + qos = atm_mpoa_search_qos(entry->ctrl_info.in_dst_ip); + if (qos != NULL) msg.qos = qos->qos; + msg_to_mpoad(&msg, client); + do_gettimeofday(&(entry->reply_wait)); + } + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); +} + +/* Call this every MPC-p5 seconds. */ +static void refresh_entries(struct mpoa_client *client) +{ + struct timeval now; + struct in_cache_entry *entry = client->in_cache; + + ddprintk("mpoa: mpoa_caches.c: refresh_entries\n"); + do_gettimeofday(&now); + + read_lock_bh(&client->ingress_lock); + while( entry != NULL ){ + if( entry->entry_state == INGRESS_RESOLVED ){ + if(!(entry->refresh_time)) + entry->refresh_time = (2*(entry->ctrl_info.holding_time))/3; + if( (now.tv_sec - entry->reply_wait.tv_sec) > entry->refresh_time ){ + dprintk("mpoa: mpoa_caches.c: refreshing an entry.\n"); + entry->entry_state = INGRESS_REFRESHING; + + } + } + entry = entry->next; + } + read_unlock_bh(&client->ingress_lock); +} + +static void in_destroy_cache(struct mpoa_client *mpc) +{ + write_lock_irq(&mpc->ingress_lock); + while(mpc->in_cache != NULL) + mpc->in_ops->remove_entry(mpc->in_cache, mpc); + write_unlock_irq(&mpc->ingress_lock); + + return; +} + +static eg_cache_entry *eg_cache_get_by_cache_id(uint32_t cache_id, struct mpoa_client *mpc) +{ + eg_cache_entry *entry; + + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while(entry != NULL){ + if(entry->ctrl_info.cache_id == cache_id){ + atomic_inc(&entry->use); + read_unlock_irq(&mpc->egress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + return NULL; +} + +/* This can be called from any context since it saves CPU flags */ +static eg_cache_entry *eg_cache_get_by_tag(uint32_t tag, struct mpoa_client *mpc) +{ + unsigned long flags; + eg_cache_entry *entry; + + read_lock_irqsave(&mpc->egress_lock, flags); + entry = mpc->eg_cache; + while (entry != NULL){ + if (entry->ctrl_info.tag == tag) { + atomic_inc(&entry->use); + read_unlock_irqrestore(&mpc->egress_lock, flags); + return entry; + } + entry = entry->next; + } + read_unlock_irqrestore(&mpc->egress_lock, flags); + + return NULL; +} + +/* This can be called from any context since it saves CPU flags */ +static eg_cache_entry *eg_cache_get_by_vcc(struct atm_vcc *vcc, struct mpoa_client *mpc) +{ + unsigned long flags; + eg_cache_entry *entry; + + read_lock_irqsave(&mpc->egress_lock, flags); + entry = mpc->eg_cache; + while (entry != NULL){ + if (entry->shortcut == vcc) { + atomic_inc(&entry->use); + read_unlock_irqrestore(&mpc->egress_lock, flags); + return entry; + } + entry = entry->next; + } + read_unlock_irqrestore(&mpc->egress_lock, flags); + + return NULL; +} + +static eg_cache_entry *eg_cache_get_by_src_ip(uint32_t ipaddr, struct mpoa_client *mpc) +{ + eg_cache_entry *entry; + + read_lock_irq(&mpc->egress_lock); + entry = mpc->eg_cache; + while(entry != NULL){ + if(entry->latest_ip_addr == ipaddr) { + atomic_inc(&entry->use); + read_unlock_irq(&mpc->egress_lock); + return entry; + } + entry = entry->next; + } + read_unlock_irq(&mpc->egress_lock); + + return NULL; +} + +static void eg_cache_put(eg_cache_entry *entry) +{ + if (atomic_dec_and_test(&entry->use)) { + memset(entry, 0, sizeof(eg_cache_entry)); + kfree(entry); + } + + return; +} + +/* + * This should be called with write lock on + */ +static void 
eg_cache_remove_entry(eg_cache_entry *entry, + struct mpoa_client *client) +{ + struct atm_vcc *vcc; + struct k_message msg; + + vcc = entry->shortcut; + dprintk("mpoa: mpoa_caches.c: removing an egress entry.\n"); + if (entry->prev != NULL) + entry->prev->next = entry->next; + else + client->eg_cache = entry->next; + if (entry->next != NULL) + entry->next->prev = entry->prev; + client->eg_ops->put(entry); + if(client->in_cache == NULL && client->eg_cache == NULL){ + msg.type = STOP_KEEP_ALIVE_SM; + msg_to_mpoad(&msg,client); + } + + /* Check if the ingress side still uses this VCC */ + if (vcc != NULL) { + in_cache_entry *in_entry = client->in_ops->get_by_vcc(vcc, client); + if (in_entry != NULL) { + client->in_ops->put(in_entry); + return; + } + vcc_release_async(vcc, -EPIPE); + } + + return; +} + +static eg_cache_entry *eg_cache_add_entry(struct k_message *msg, struct mpoa_client *client) +{ + unsigned char *ip; + eg_cache_entry *entry = kmalloc(sizeof(eg_cache_entry), GFP_KERNEL); + + if (entry == NULL) { + printk("mpoa: mpoa_caches.c: new_eg_cache_entry: out of memory\n"); + return NULL; + } + + ip = (unsigned char *)&msg->content.eg_info.eg_dst_ip; + dprintk("mpoa: mpoa_caches.c: adding an egress entry, ip = %u.%u.%u.%u, this should be our IP\n", NIPQUAD(ip)); + memset(entry, 0, sizeof(eg_cache_entry)); + + atomic_set(&entry->use, 1); + dprintk("mpoa: mpoa_caches.c: new_eg_cache_entry: about to lock\n"); + write_lock_irq(&client->egress_lock); + entry->next = client->eg_cache; + entry->prev = NULL; + if (client->eg_cache != NULL) + client->eg_cache->prev = entry; + client->eg_cache = entry; + + memcpy(entry->MPS_ctrl_ATM_addr, client->mps_ctrl_addr, ATM_ESA_LEN); + entry->ctrl_info = msg->content.eg_info; + do_gettimeofday(&(entry->tv)); + entry->entry_state = EGRESS_RESOLVED; + dprintk("mpoa: mpoa_caches.c: new_eg_cache_entry cache_id %lu\n", ntohl(entry->ctrl_info.cache_id)); + ip = (unsigned char *)&entry->ctrl_info.mps_ip; + dprintk("mpoa: mpoa_caches.c: mps_ip = %u.%u.%u.%u\n", NIPQUAD(ip)); + atomic_inc(&entry->use); + + write_unlock_irq(&client->egress_lock); + dprintk("mpoa: mpoa_caches.c: new_eg_cache_entry: unlocked\n"); + + return entry; +} + +static void update_eg_cache_entry(eg_cache_entry * entry, uint16_t holding_time) +{ + do_gettimeofday(&(entry->tv)); + entry->entry_state = EGRESS_RESOLVED; + entry->ctrl_info.holding_time = holding_time; + + return; +} + +static void clear_expired(struct mpoa_client *client) +{ + eg_cache_entry *entry, *next_entry; + struct timeval now; + struct k_message msg; + + do_gettimeofday(&now); + + write_lock_irq(&client->egress_lock); + entry = client->eg_cache; + while(entry != NULL){ + next_entry = entry->next; + if((now.tv_sec - entry->tv.tv_sec) + > entry->ctrl_info.holding_time){ + msg.type = SND_EGRESS_PURGE; + msg.content.eg_info = entry->ctrl_info; + dprintk("mpoa: mpoa_caches.c: egress_cache: holding time expired, cache_id = %lu.\n",ntohl(entry->ctrl_info.cache_id)); + msg_to_mpoad(&msg, client); + client->eg_ops->remove_entry(entry, client); + } + entry = next_entry; + } + write_unlock_irq(&client->egress_lock); + + return; +} + +static void eg_destroy_cache(struct mpoa_client *mpc) +{ + write_lock_irq(&mpc->egress_lock); + while(mpc->eg_cache != NULL) + mpc->eg_ops->remove_entry(mpc->eg_cache, mpc); + write_unlock_irq(&mpc->egress_lock); + + return; +} + + + +static struct in_cache_ops ingress_ops = { + in_cache_add_entry, /* add_entry */ + in_cache_get, /* get */ + in_cache_get_with_mask, /* get_with_mask */ + 
in_cache_get_by_vcc, /* get_by_vcc */ + in_cache_put, /* put */ + in_cache_remove_entry, /* remove_entry */ + cache_hit, /* cache_hit */ + clear_count_and_expired, /* clear_count */ + check_resolving_entries, /* check_resolving */ + refresh_entries, /* refresh */ + in_destroy_cache /* destroy_cache */ +}; + +static struct eg_cache_ops egress_ops = { + eg_cache_add_entry, /* add_entry */ + eg_cache_get_by_cache_id, /* get_by_cache_id */ + eg_cache_get_by_tag, /* get_by_tag */ + eg_cache_get_by_vcc, /* get_by_vcc */ + eg_cache_get_by_src_ip, /* get_by_src_ip */ + eg_cache_put, /* put */ + eg_cache_remove_entry, /* remove_entry */ + update_eg_cache_entry, /* update */ + clear_expired, /* clear_expired */ + eg_destroy_cache /* destroy_cache */ +}; + + +void atm_mpoa_init_cache(struct mpoa_client *mpc) +{ + mpc->in_ops = &ingress_ops; + mpc->eg_ops = &egress_ops; + + return; +} diff --git a/net/atm/mpoa_caches.h b/net/atm/mpoa_caches.h new file mode 100644 index 000000000000..6c9886a03d0b --- /dev/null +++ b/net/atm/mpoa_caches.h @@ -0,0 +1,96 @@ +#ifndef MPOA_CACHES_H +#define MPOA_CACHES_H + +#include +#include +#include +#include +#include + +struct mpoa_client; + +void atm_mpoa_init_cache(struct mpoa_client *mpc); + +typedef struct in_cache_entry { + struct in_cache_entry *next; + struct in_cache_entry *prev; + struct timeval tv; + struct timeval reply_wait; + struct timeval hold_down; + uint32_t packets_fwded; + uint16_t entry_state; + uint32_t retry_time; + uint32_t refresh_time; + uint32_t count; + struct atm_vcc *shortcut; + uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN]; + struct in_ctrl_info ctrl_info; + atomic_t use; +} in_cache_entry; + +struct in_cache_ops{ + in_cache_entry *(*add_entry)(uint32_t dst_ip, + struct mpoa_client *client); + in_cache_entry *(*get)(uint32_t dst_ip, struct mpoa_client *client); + in_cache_entry *(*get_with_mask)(uint32_t dst_ip, + struct mpoa_client *client, + uint32_t mask); + in_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, + struct mpoa_client *client); + void (*put)(in_cache_entry *entry); + void (*remove_entry)(in_cache_entry *delEntry, + struct mpoa_client *client ); + int (*cache_hit)(in_cache_entry *entry, + struct mpoa_client *client); + void (*clear_count)(struct mpoa_client *client); + void (*check_resolving)(struct mpoa_client *client); + void (*refresh)(struct mpoa_client *client); + void (*destroy_cache)(struct mpoa_client *mpc); +}; + +typedef struct eg_cache_entry{ + struct eg_cache_entry *next; + struct eg_cache_entry *prev; + struct timeval tv; + uint8_t MPS_ctrl_ATM_addr[ATM_ESA_LEN]; + struct atm_vcc *shortcut; + uint32_t packets_rcvd; + uint16_t entry_state; + uint32_t latest_ip_addr; /* The src IP address of the last packet */ + struct eg_ctrl_info ctrl_info; + atomic_t use; +} eg_cache_entry; + +struct eg_cache_ops{ + eg_cache_entry *(*add_entry)(struct k_message *msg, struct mpoa_client *client); + eg_cache_entry *(*get_by_cache_id)(uint32_t cache_id, struct mpoa_client *client); + eg_cache_entry *(*get_by_tag)(uint32_t cache_id, struct mpoa_client *client); + eg_cache_entry *(*get_by_vcc)(struct atm_vcc *vcc, struct mpoa_client *client); + eg_cache_entry *(*get_by_src_ip)(uint32_t ipaddr, struct mpoa_client *client); + void (*put)(eg_cache_entry *entry); + void (*remove_entry)(eg_cache_entry *entry, struct mpoa_client *client); + void (*update)(eg_cache_entry *entry, uint16_t holding_time); + void (*clear_expired)(struct mpoa_client *client); + void (*destroy_cache)(struct mpoa_client *mpc); +}; + + +/* Ingress cache entry states */ + 
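+/*
+ * Life cycle of an ingress entry, as implemented in mpoa_caches.c (summary
+ * only; see the functions above for the exact rules): entry_state moves
+ * from INVALID to RESOLVING once cache_hit() sees the per-entry packet
+ * count exceed the configured threshold and asks the daemon for a
+ * resolution; check_resolving_entries() then retries with a growing
+ * interval (multiplied by MPC_C1) and parks the entry in hold-down for
+ * mpc_p6 seconds when the interval exceeds mpc_p5.  A RESOLVED entry is
+ * marked REFRESHING by refresh_entries() once roughly two thirds of its
+ * holding time have passed.
+ *
+ * Entries returned by the get*() helpers above carry a reference that the
+ * caller must drop with put(); put() frees the entry when the use count
+ * reaches zero.  A minimal (hypothetical) caller:
+ *
+ *	in_cache_entry *e = mpc->in_ops->get(dst_ip, mpc);
+ *	if (e != NULL) {
+ *		... use e->shortcut, e->ctrl_info ...
+ *		mpc->in_ops->put(e);
+ *	}
+ */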
+#define INGRESS_REFRESHING 3 +#define INGRESS_RESOLVED 2 +#define INGRESS_RESOLVING 1 +#define INGRESS_INVALID 0 + +/* VCC states */ + +#define OPEN 1 +#define CLOSED 0 + +/* Egress cache entry states */ + +#define EGRESS_RESOLVED 2 +#define EGRESS_PURGE 1 +#define EGRESS_INVALID 0 + +#endif diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c new file mode 100644 index 000000000000..60834b5a14d6 --- /dev/null +++ b/net/atm/mpoa_proc.c @@ -0,0 +1,305 @@ +#include + +#ifdef CONFIG_PROC_FS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mpc.h" +#include "mpoa_caches.h" + +/* + * mpoa_proc.c: Implementation MPOA client's proc + * file system statistics + */ + +#if 1 +#define dprintk printk /* debug */ +#else +#define dprintk(format,args...) +#endif + +#define STAT_FILE_NAME "mpc" /* Our statistic file's name */ + +extern struct mpoa_client *mpcs; +extern struct proc_dir_entry *atm_proc_root; /* from proc.c. */ + +static int proc_mpc_open(struct inode *inode, struct file *file); +static ssize_t proc_mpc_write(struct file *file, const char __user *buff, + size_t nbytes, loff_t *ppos); + +static int parse_qos(const char *buff); + +/* + * Define allowed FILE OPERATIONS + */ +static struct file_operations mpc_file_operations = { + .owner = THIS_MODULE, + .open = proc_mpc_open, + .read = seq_read, + .llseek = seq_lseek, + .write = proc_mpc_write, + .release = seq_release, +}; + +/* + * Returns the state of an ingress cache entry as a string + */ +static const char *ingress_state_string(int state){ + switch(state) { + case INGRESS_RESOLVING: + return "resolving "; + break; + case INGRESS_RESOLVED: + return "resolved "; + break; + case INGRESS_INVALID: + return "invalid "; + break; + case INGRESS_REFRESHING: + return "refreshing "; + break; + default: + return ""; + } +} + +/* + * Returns the state of an egress cache entry as a string + */ +static const char *egress_state_string(int state){ + switch(state) { + case EGRESS_RESOLVED: + return "resolved "; + break; + case EGRESS_PURGE: + return "purge "; + break; + case EGRESS_INVALID: + return "invalid "; + break; + default: + return ""; + } +} + +/* + * FIXME: mpcs (and per-mpc lists) have no locking whatsoever. + */ + +static void *mpc_start(struct seq_file *m, loff_t *pos) +{ + loff_t l = *pos; + struct mpoa_client *mpc; + + if (!l--) + return SEQ_START_TOKEN; + for (mpc = mpcs; mpc; mpc = mpc->next) + if (!l--) + return mpc; + return NULL; +} + +static void *mpc_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct mpoa_client *p = v; + (*pos)++; + return v == SEQ_START_TOKEN ? mpcs : p->next; +} + +static void mpc_stop(struct seq_file *m, void *v) +{ +} + +/* + * READING function - called when the /proc/atm/mpoa file is read from. 
+ */ +static int mpc_show(struct seq_file *m, void *v) +{ + struct mpoa_client *mpc = v; + unsigned char *temp; + int i; + in_cache_entry *in_entry; + eg_cache_entry *eg_entry; + struct timeval now; + unsigned char ip_string[16]; + + if (v == SEQ_START_TOKEN) { + atm_mpoa_disp_qos(m); + return 0; + } + + seq_printf(m, "\nInterface %d:\n\n", mpc->dev_num); + seq_printf(m, "Ingress Entries:\nIP address State Holding time Packets fwded VPI VCI\n"); + do_gettimeofday(&now); + + for (in_entry = mpc->in_cache; in_entry; in_entry = in_entry->next) { + temp = (unsigned char *)&in_entry->ctrl_info.in_dst_ip; + sprintf(ip_string,"%d.%d.%d.%d", temp[0], temp[1], temp[2], temp[3]); + seq_printf(m, "%-16s%s%-14lu%-12u", + ip_string, + ingress_state_string(in_entry->entry_state), + in_entry->ctrl_info.holding_time-(now.tv_sec-in_entry->tv.tv_sec), + in_entry->packets_fwded); + if (in_entry->shortcut) + seq_printf(m, " %-3d %-3d",in_entry->shortcut->vpi,in_entry->shortcut->vci); + seq_printf(m, "\n"); + } + + seq_printf(m, "\n"); + seq_printf(m, "Egress Entries:\nIngress MPC ATM addr\nCache-id State Holding time Packets recvd Latest IP addr VPI VCI\n"); + for (eg_entry = mpc->eg_cache; eg_entry; eg_entry = eg_entry->next) { + unsigned char *p = eg_entry->ctrl_info.in_MPC_data_ATM_addr; + for(i = 0; i < ATM_ESA_LEN; i++) + seq_printf(m, "%02x", p[i]); + seq_printf(m, "\n%-16lu%s%-14lu%-15u", + (unsigned long)ntohl(eg_entry->ctrl_info.cache_id), + egress_state_string(eg_entry->entry_state), + (eg_entry->ctrl_info.holding_time-(now.tv_sec-eg_entry->tv.tv_sec)), + eg_entry->packets_rcvd); + + /* latest IP address */ + temp = (unsigned char *)&eg_entry->latest_ip_addr; + sprintf(ip_string, "%d.%d.%d.%d", temp[0], temp[1], temp[2], temp[3]); + seq_printf(m, "%-16s", ip_string); + + if (eg_entry->shortcut) + seq_printf(m, " %-3d %-3d",eg_entry->shortcut->vpi,eg_entry->shortcut->vci); + seq_printf(m, "\n"); + } + seq_printf(m, "\n"); + return 0; +} + +static struct seq_operations mpc_op = { + .start = mpc_start, + .next = mpc_next, + .stop = mpc_stop, + .show = mpc_show +}; + +static int proc_mpc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &mpc_op); +} + +static ssize_t proc_mpc_write(struct file *file, const char __user *buff, + size_t nbytes, loff_t *ppos) +{ + char *page, *p; + unsigned len; + + if (nbytes == 0) + return 0; + + if (nbytes >= PAGE_SIZE) + nbytes = PAGE_SIZE-1; + + page = (char *)__get_free_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + + for (p = page, len = 0; len < nbytes; p++, len++) { + if (get_user(*p, buff++)) { + free_page((unsigned long)page); + return -EFAULT; + } + if (*p == '\0' || *p == '\n') + break; + } + + *p = '\0'; + + if (!parse_qos(page)) + printk("mpoa: proc_mpc_write: could not parse '%s'\n", page); + + free_page((unsigned long)page); + + return len; +} + +static int parse_qos(const char *buff) +{ + /* possible lines look like this + * add 130.230.54.142 tx=max_pcr,max_sdu rx=max_pcr,max_sdu + */ + unsigned char ip[4]; + int tx_pcr, tx_sdu, rx_pcr, rx_sdu; + uint32_t ipaddr; + struct atm_qos qos; + + memset(&qos, 0, sizeof(struct atm_qos)); + + if (sscanf(buff, "del %hhu.%hhu.%hhu.%hhu", + ip, ip+1, ip+2, ip+3) == 4) { + ipaddr = *(uint32_t *)ip; + return atm_mpoa_delete_qos(atm_mpoa_search_qos(ipaddr)); + } + + if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=tx", + ip, ip+1, ip+2, ip+3, &tx_pcr, &tx_sdu) == 6) { + rx_pcr = tx_pcr; + rx_sdu = tx_sdu; + } else if (sscanf(buff, "add %hhu.%hhu.%hhu.%hhu tx=%d,%d rx=%d,%d", + ip, ip+1, 
ip+2, ip+3, &tx_pcr, &tx_sdu, &rx_pcr, &rx_sdu) != 8) + return 0; + + ipaddr = *(uint32_t *)ip; + qos.txtp.traffic_class = ATM_CBR; + qos.txtp.max_pcr = tx_pcr; + qos.txtp.max_sdu = tx_sdu; + qos.rxtp.traffic_class = ATM_CBR; + qos.rxtp.max_pcr = rx_pcr; + qos.rxtp.max_sdu = rx_sdu; + qos.aal = ATM_AAL5; + dprintk("mpoa: mpoa_proc.c: parse_qos(): setting qos paramameters to tx=%d,%d rx=%d,%d\n", + qos.txtp.max_pcr, + qos.txtp.max_sdu, + qos.rxtp.max_pcr, + qos.rxtp.max_sdu + ); + + atm_mpoa_add_qos(ipaddr, &qos); + return 1; +} + +/* + * INITIALIZATION function - called when module is initialized/loaded. + */ +int mpc_proc_init(void) +{ + struct proc_dir_entry *p; + + p = create_proc_entry(STAT_FILE_NAME, 0, atm_proc_root); + if (!p) { + printk(KERN_ERR "Unable to initialize /proc/atm/%s\n", STAT_FILE_NAME); + return -ENOMEM; + } + p->proc_fops = &mpc_file_operations; + p->owner = THIS_MODULE; + return 0; +} + +/* + * DELETING function - called when module is removed. + */ +void mpc_proc_clean(void) +{ + remove_proc_entry(STAT_FILE_NAME,atm_proc_root); +} + + +#endif /* CONFIG_PROC_FS */ + + + + + + diff --git a/net/atm/pppoatm.c b/net/atm/pppoatm.c new file mode 100644 index 000000000000..58f4a2b5aebe --- /dev/null +++ b/net/atm/pppoatm.c @@ -0,0 +1,369 @@ +/* net/atm/pppoatm.c - RFC2364 PPP over ATM/AAL5 */ + +/* Copyright 1999-2000 by Mitchell Blank Jr */ +/* Based on clip.c; 1995-1999 by Werner Almesberger, EPFL LRC/ICA */ +/* And on ppp_async.c; Copyright 1999 Paul Mackerras */ +/* And help from Jens Axboe */ + +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * This driver provides the encapsulation and framing for sending + * and receiving PPP frames in ATM AAL5 PDUs. + */ + +/* + * One shortcoming of this driver is that it does not comply with + * section 8 of RFC2364 - we are supposed to detect a change + * in encapsulation and immediately abort the connection (in order + * to avoid a black-hole being created if our peer loses state + * and changes encapsulation unilaterally. However, since the + * ppp_generic layer actually does the decapsulation, we need + * a way of notifying it when we _think_ there might be a problem) + * There's two cases: + * 1. LLC-encapsulation was missing when it was enabled. In + * this case, we should tell the upper layer "tear down + * this session if this skb looks ok to you" + * 2. LLC-encapsulation was present when it was disabled. Then + * we need to tell the upper layer "this packet may be + * ok, but if its in error tear down the session" + * These hooks are not yet available in ppp_generic + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +#if 0 +#define DPRINTK(format, args...) \ + printk(KERN_DEBUG "pppoatm: " format, ##args) +#else +#define DPRINTK(format, args...) 
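+/*
+ * How encapsulation autodetection works (a summary of pppoatm_push()
+ * below): the first PPP frame received on the VCC is examined; if it
+ * starts with the full 6-byte pppllc pattern (4-byte LLC header plus the
+ * LCP protocol ID 0xC021) the VCC switches to LLC encapsulation, while a
+ * frame starting with the bare LCP protocol ID means VC multiplexing, in
+ * which case chan.mtu is raised by LLC_LEN because it was sized for LLC
+ * framing when the VCC was attached.
+ */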
+#endif + +enum pppoatm_encaps { + e_autodetect = PPPOATM_ENCAPS_AUTODETECT, + e_vc = PPPOATM_ENCAPS_VC, + e_llc = PPPOATM_ENCAPS_LLC, +}; + +struct pppoatm_vcc { + struct atm_vcc *atmvcc; /* VCC descriptor */ + void (*old_push)(struct atm_vcc *, struct sk_buff *); + void (*old_pop)(struct atm_vcc *, struct sk_buff *); + /* keep old push/pop for detaching */ + enum pppoatm_encaps encaps; + int flags; /* SC_COMP_PROT - compress protocol */ + struct ppp_channel chan; /* interface to generic ppp layer */ + struct tasklet_struct wakeup_tasklet; +}; + +/* + * Header used for LLC Encapsulated PPP (4 bytes) followed by the LCP protocol + * ID (0xC021) used in autodetection + */ +static const unsigned char pppllc[6] = { 0xFE, 0xFE, 0x03, 0xCF, 0xC0, 0x21 }; +#define LLC_LEN (4) + +static inline struct pppoatm_vcc *atmvcc_to_pvcc(const struct atm_vcc *atmvcc) +{ + return (struct pppoatm_vcc *) (atmvcc->user_back); +} + +static inline struct pppoatm_vcc *chan_to_pvcc(const struct ppp_channel *chan) +{ + return (struct pppoatm_vcc *) (chan->private); +} + +/* + * We can't do this directly from our _pop handler, since the ppp code + * doesn't want to be called in interrupt context, so we do it from + * a tasklet + */ +static void pppoatm_wakeup_sender(unsigned long arg) +{ + ppp_output_wakeup((struct ppp_channel *) arg); +} + +/* + * This gets called every time the ATM card has finished sending our + * skb. The ->old_pop will take care up normal atm flow control, + * but we also need to wake up the device if we blocked it + */ +static void pppoatm_pop(struct atm_vcc *atmvcc, struct sk_buff *skb) +{ + struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc); + pvcc->old_pop(atmvcc, skb); + /* + * We don't really always want to do this since it's + * really inefficient - it would be much better if we could + * test if we had actually throttled the generic layer. + * Unfortunately then there would be a nasty SMP race where + * we could clear that flag just as we refuse another packet. + * For now we do the safe thing. + */ + tasklet_schedule(&pvcc->wakeup_tasklet); +} + +/* + * Unbind from PPP - currently we only do this when closing the socket, + * but we could put this into an ioctl if need be + */ +static void pppoatm_unassign_vcc(struct atm_vcc *atmvcc) +{ + struct pppoatm_vcc *pvcc; + pvcc = atmvcc_to_pvcc(atmvcc); + atmvcc->push = pvcc->old_push; + atmvcc->pop = pvcc->old_pop; + tasklet_kill(&pvcc->wakeup_tasklet); + ppp_unregister_channel(&pvcc->chan); + atmvcc->user_back = NULL; + kfree(pvcc); + /* Gee, I hope we have the big kernel lock here... */ + module_put(THIS_MODULE); +} + +/* Called when an AAL5 PDU comes in */ +static void pppoatm_push(struct atm_vcc *atmvcc, struct sk_buff *skb) +{ + struct pppoatm_vcc *pvcc = atmvcc_to_pvcc(atmvcc); + DPRINTK("pppoatm push\n"); + if (skb == NULL) { /* VCC was closed */ + DPRINTK("removing ATMPPP VCC %p\n", pvcc); + pppoatm_unassign_vcc(atmvcc); + atmvcc->push(atmvcc, NULL); /* Pass along bad news */ + return; + } + atm_return(atmvcc, skb->truesize); + switch (pvcc->encaps) { + case e_llc: + if (skb->len < LLC_LEN || + memcmp(skb->data, pppllc, LLC_LEN)) + goto error; + skb_pull(skb, LLC_LEN); + break; + case e_autodetect: + if (pvcc->chan.ppp == NULL) { /* Not bound yet! 
*/ + kfree_skb(skb); + return; + } + if (skb->len >= sizeof(pppllc) && + !memcmp(skb->data, pppllc, sizeof(pppllc))) { + pvcc->encaps = e_llc; + skb_pull(skb, LLC_LEN); + break; + } + if (skb->len >= (sizeof(pppllc) - LLC_LEN) && + !memcmp(skb->data, &pppllc[LLC_LEN], + sizeof(pppllc) - LLC_LEN)) { + pvcc->encaps = e_vc; + pvcc->chan.mtu += LLC_LEN; + break; + } + DPRINTK("(unit %d): Couldn't autodetect yet " + "(skb: %02X %02X %02X %02X %02X %02X)\n", + pvcc->chan.unit, + skb->data[0], skb->data[1], skb->data[2], + skb->data[3], skb->data[4], skb->data[5]); + goto error; + case e_vc: + break; + } + ppp_input(&pvcc->chan, skb); + return; + error: + kfree_skb(skb); + ppp_input_error(&pvcc->chan, 0); +} + +/* + * Called by the ppp_generic.c to send a packet - returns true if packet + * was accepted. If we return false, then it's our job to call + * ppp_output_wakeup(chan) when we're feeling more up to it. + * Note that in the ENOMEM case (as opposed to the !atm_may_send case) + * we should really drop the packet, but the generic layer doesn't + * support this yet. We just return 'DROP_PACKET' which we actually define + * as success, just to be clear what we're really doing. + */ +#define DROP_PACKET 1 +static int pppoatm_send(struct ppp_channel *chan, struct sk_buff *skb) +{ + struct pppoatm_vcc *pvcc = chan_to_pvcc(chan); + ATM_SKB(skb)->vcc = pvcc->atmvcc; + DPRINTK("(unit %d): pppoatm_send (skb=0x%p, vcc=0x%p)\n", + pvcc->chan.unit, skb, pvcc->atmvcc); + if (skb->data[0] == '\0' && (pvcc->flags & SC_COMP_PROT)) + (void) skb_pull(skb, 1); + switch (pvcc->encaps) { /* LLC encapsulation needed */ + case e_llc: + if (skb_headroom(skb) < LLC_LEN) { + struct sk_buff *n; + n = skb_realloc_headroom(skb, LLC_LEN); + if (n != NULL && + !atm_may_send(pvcc->atmvcc, n->truesize)) { + kfree_skb(n); + goto nospace; + } + kfree_skb(skb); + if ((skb = n) == NULL) + return DROP_PACKET; + } else if (!atm_may_send(pvcc->atmvcc, skb->truesize)) + goto nospace; + memcpy(skb_push(skb, LLC_LEN), pppllc, LLC_LEN); + break; + case e_vc: + if (!atm_may_send(pvcc->atmvcc, skb->truesize)) + goto nospace; + break; + case e_autodetect: + DPRINTK("(unit %d): Trying to send without setting encaps!\n", + pvcc->chan.unit); + kfree_skb(skb); + return 1; + } + + atomic_add(skb->truesize, &sk_atm(ATM_SKB(skb)->vcc)->sk_wmem_alloc); + ATM_SKB(skb)->atm_options = ATM_SKB(skb)->vcc->atm_options; + DPRINTK("(unit %d): atm_skb(%p)->vcc(%p)->dev(%p)\n", + pvcc->chan.unit, skb, ATM_SKB(skb)->vcc, + ATM_SKB(skb)->vcc->dev); + return ATM_SKB(skb)->vcc->send(ATM_SKB(skb)->vcc, skb) + ? DROP_PACKET : 1; + nospace: + /* + * We don't have space to send this SKB now, but we might have + * already applied SC_COMP_PROT compression, so may need to undo + */ + if ((pvcc->flags & SC_COMP_PROT) && skb_headroom(skb) > 0 && + skb->data[-1] == '\0') + (void) skb_push(skb, 1); + return 0; +} + +/* This handles ioctls sent to the /dev/ppp interface */ +static int pppoatm_devppp_ioctl(struct ppp_channel *chan, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case PPPIOCGFLAGS: + return put_user(chan_to_pvcc(chan)->flags, (int __user *) arg) + ? -EFAULT : 0; + case PPPIOCSFLAGS: + return get_user(chan_to_pvcc(chan)->flags, (int __user *) arg) + ? 
-EFAULT : 0; + } + return -ENOTTY; +} + +static /*const*/ struct ppp_channel_ops pppoatm_ops = { + .start_xmit = pppoatm_send, + .ioctl = pppoatm_devppp_ioctl, +}; + +static int pppoatm_assign_vcc(struct atm_vcc *atmvcc, void __user *arg) +{ + struct atm_backend_ppp be; + struct pppoatm_vcc *pvcc; + int err; + /* + * Each PPPoATM instance has its own tasklet - this is just a + * prototypical one used to initialize them + */ + static const DECLARE_TASKLET(tasklet_proto, pppoatm_wakeup_sender, 0); + if (copy_from_user(&be, arg, sizeof be)) + return -EFAULT; + if (be.encaps != PPPOATM_ENCAPS_AUTODETECT && + be.encaps != PPPOATM_ENCAPS_VC && be.encaps != PPPOATM_ENCAPS_LLC) + return -EINVAL; + pvcc = kmalloc(sizeof(*pvcc), GFP_KERNEL); + if (pvcc == NULL) + return -ENOMEM; + memset(pvcc, 0, sizeof(*pvcc)); + pvcc->atmvcc = atmvcc; + pvcc->old_push = atmvcc->push; + pvcc->old_pop = atmvcc->pop; + pvcc->encaps = (enum pppoatm_encaps) be.encaps; + pvcc->chan.private = pvcc; + pvcc->chan.ops = &pppoatm_ops; + pvcc->chan.mtu = atmvcc->qos.txtp.max_sdu - PPP_HDRLEN - + (be.encaps == e_vc ? 0 : LLC_LEN); + pvcc->wakeup_tasklet = tasklet_proto; + pvcc->wakeup_tasklet.data = (unsigned long) &pvcc->chan; + if ((err = ppp_register_channel(&pvcc->chan)) != 0) { + kfree(pvcc); + return err; + } + atmvcc->user_back = pvcc; + atmvcc->push = pppoatm_push; + atmvcc->pop = pppoatm_pop; + __module_get(THIS_MODULE); + return 0; +} + +/* + * This handles ioctls actually performed on our vcc - we must return + * -ENOIOCTLCMD for any unrecognized ioctl + */ +static int pppoatm_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct atm_vcc *atmvcc = ATM_SD(sock); + void __user *argp = (void __user *)arg; + + if (cmd != ATM_SETBACKEND && atmvcc->push != pppoatm_push) + return -ENOIOCTLCMD; + switch (cmd) { + case ATM_SETBACKEND: { + atm_backend_t b; + if (get_user(b, (atm_backend_t __user *) argp)) + return -EFAULT; + if (b != ATM_BACKEND_PPP) + return -ENOIOCTLCMD; + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + return pppoatm_assign_vcc(atmvcc, argp); + } + case PPPIOCGCHAN: + return put_user(ppp_channel_index(&atmvcc_to_pvcc(atmvcc)-> + chan), (int __user *) argp) ? -EFAULT : 0; + case PPPIOCGUNIT: + return put_user(ppp_unit_number(&atmvcc_to_pvcc(atmvcc)-> + chan), (int __user *) argp) ? -EFAULT : 0; + } + return -ENOIOCTLCMD; +} + +static struct atm_ioctl pppoatm_ioctl_ops = { + .owner = THIS_MODULE, + .ioctl = pppoatm_ioctl, +}; + +static int __init pppoatm_init(void) +{ + register_atm_ioctl(&pppoatm_ioctl_ops); + return 0; +} + +static void __exit pppoatm_exit(void) +{ + deregister_atm_ioctl(&pppoatm_ioctl_ops); +} + +module_init(pppoatm_init); +module_exit(pppoatm_exit); + +MODULE_AUTHOR("Mitchell Blank Jr "); +MODULE_DESCRIPTION("RFC2364 PPP over ATM/AAL5"); +MODULE_LICENSE("GPL"); diff --git a/net/atm/proc.c b/net/atm/proc.c new file mode 100644 index 000000000000..4041054e5282 --- /dev/null +++ b/net/atm/proc.c @@ -0,0 +1,514 @@ +/* net/atm/proc.c - ATM /proc interface + * + * Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA + * + * seq_file api usage by romieu@fr.zoreil.com + * + * Evaluating the efficiency of the whole thing if left as an exercise to + * the reader. 
+ */ + +#include +#include /* for EXPORT_SYMBOL */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for __init */ +#include +#include +#include +#include /* for HZ */ +#include "resources.h" +#include "common.h" /* atm_proc_init prototype */ +#include "signaling.h" /* to get sigd - ugly too */ + +static ssize_t proc_dev_atm_read(struct file *file,char __user *buf,size_t count, + loff_t *pos); + +static struct file_operations proc_atm_dev_ops = { + .owner = THIS_MODULE, + .read = proc_dev_atm_read, +}; + +static void add_stats(struct seq_file *seq, const char *aal, + const struct k_atm_aal_stats *stats) +{ + seq_printf(seq, "%s ( %d %d %d %d %d )", aal, + atomic_read(&stats->tx),atomic_read(&stats->tx_err), + atomic_read(&stats->rx),atomic_read(&stats->rx_err), + atomic_read(&stats->rx_drop)); +} + +static void atm_dev_info(struct seq_file *seq, const struct atm_dev *dev) +{ + int i; + + seq_printf(seq, "%3d %-8s", dev->number, dev->type); + for (i = 0; i < ESI_LEN; i++) + seq_printf(seq, "%02x", dev->esi[i]); + seq_puts(seq, " "); + add_stats(seq, "0", &dev->stats.aal0); + seq_puts(seq, " "); + add_stats(seq, "5", &dev->stats.aal5); + seq_printf(seq, "\t[%d]", atomic_read(&dev->refcnt)); + seq_putc(seq, '\n'); +} + +struct vcc_state { + int bucket; + struct sock *sk; + int family; +}; + +static inline int compare_family(struct sock *sk, int family) +{ + return !family || (sk->sk_family == family); +} + +static int __vcc_walk(struct sock **sock, int family, int *bucket, loff_t l) +{ + struct sock *sk = *sock; + + if (sk == (void *)1) { + for (*bucket = 0; *bucket < VCC_HTABLE_SIZE; ++*bucket) { + struct hlist_head *head = &vcc_hash[*bucket]; + + sk = hlist_empty(head) ? NULL : __sk_head(head); + if (sk) + break; + } + l--; + } +try_again: + for (; sk; sk = sk_next(sk)) { + l -= compare_family(sk, family); + if (l < 0) + goto out; + } + if (!sk && ++*bucket < VCC_HTABLE_SIZE) { + sk = sk_head(&vcc_hash[*bucket]); + goto try_again; + } + sk = (void *)1; +out: + *sock = sk; + return (l < 0); +} + +static inline void *vcc_walk(struct vcc_state *state, loff_t l) +{ + return __vcc_walk(&state->sk, state->family, &state->bucket, l) ? + state : NULL; +} + +static int __vcc_seq_open(struct inode *inode, struct file *file, + int family, struct seq_operations *ops) +{ + struct vcc_state *state; + struct seq_file *seq; + int rc = -ENOMEM; + + state = kmalloc(sizeof(*state), GFP_KERNEL); + if (!state) + goto out; + + rc = seq_open(file, ops); + if (rc) + goto out_kfree; + + state->family = family; + + seq = file->private_data; + seq->private = state; +out: + return rc; +out_kfree: + kfree(state); + goto out; +} + +static int vcc_seq_release(struct inode *inode, struct file *file) +{ + return seq_release_private(inode, file); +} + +static void *vcc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct vcc_state *state = seq->private; + loff_t left = *pos; + + read_lock(&vcc_sklist_lock); + state->sk = (void *)1; + return left ? 
vcc_walk(state, left) : (void *)1; +} + +static void vcc_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&vcc_sklist_lock); +} + +static void *vcc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct vcc_state *state = seq->private; + + v = vcc_walk(state, 1); + *pos += !!PTR_ERR(v); + return v; +} + +static void pvc_info(struct seq_file *seq, struct atm_vcc *vcc) +{ + static const char *class_name[] = { "off","UBR","CBR","VBR","ABR" }; + static const char *aal_name[] = { + "---", "1", "2", "3/4", /* 0- 3 */ + "???", "5", "???", "???", /* 4- 7 */ + "???", "???", "???", "???", /* 8-11 */ + "???", "0", "???", "???"}; /* 12-15 */ + + seq_printf(seq, "%3d %3d %5d %-3s %7d %-5s %7d %-6s", + vcc->dev->number,vcc->vpi,vcc->vci, + vcc->qos.aal >= sizeof(aal_name)/sizeof(aal_name[0]) ? "err" : + aal_name[vcc->qos.aal],vcc->qos.rxtp.min_pcr, + class_name[vcc->qos.rxtp.traffic_class],vcc->qos.txtp.min_pcr, + class_name[vcc->qos.txtp.traffic_class]); + if (test_bit(ATM_VF_IS_CLIP, &vcc->flags)) { + struct clip_vcc *clip_vcc = CLIP_VCC(vcc); + struct net_device *dev; + + dev = clip_vcc->entry ? clip_vcc->entry->neigh->dev : NULL; + seq_printf(seq, "CLIP, Itf:%s, Encap:", + dev ? dev->name : "none?"); + seq_printf(seq, "%s", clip_vcc->encap ? "LLC/SNAP" : "None"); + } + seq_putc(seq, '\n'); +} + +static const char *vcc_state(struct atm_vcc *vcc) +{ + static const char *map[] = { ATM_VS2TXT_MAP }; + + return map[ATM_VF2VS(vcc->flags)]; +} + +static void vcc_info(struct seq_file *seq, struct atm_vcc *vcc) +{ + struct sock *sk = sk_atm(vcc); + + seq_printf(seq, "%p ", vcc); + if (!vcc->dev) + seq_printf(seq, "Unassigned "); + else + seq_printf(seq, "%3d %3d %5d ", vcc->dev->number, vcc->vpi, + vcc->vci); + switch (sk->sk_family) { + case AF_ATMPVC: + seq_printf(seq, "PVC"); + break; + case AF_ATMSVC: + seq_printf(seq, "SVC"); + break; + default: + seq_printf(seq, "%3d", sk->sk_family); + } + seq_printf(seq, " %04lx %5d %7d/%7d %7d/%7d [%d]\n", vcc->flags, sk->sk_err, + atomic_read(&sk->sk_wmem_alloc), sk->sk_sndbuf, + atomic_read(&sk->sk_rmem_alloc), sk->sk_rcvbuf, + atomic_read(&sk->sk_refcnt)); +} + +static void svc_info(struct seq_file *seq, struct atm_vcc *vcc) +{ + if (!vcc->dev) + seq_printf(seq, sizeof(void *) == 4 ? + "N/A@%p%10s" : "N/A@%p%2s", vcc, ""); + else + seq_printf(seq, "%3d %3d %5d ", + vcc->dev->number, vcc->vpi, vcc->vci); + seq_printf(seq, "%-10s ", vcc_state(vcc)); + seq_printf(seq, "%s%s", vcc->remote.sas_addr.pub, + *vcc->remote.sas_addr.pub && *vcc->remote.sas_addr.prv ? "+" : ""); + if (*vcc->remote.sas_addr.prv) { + int i; + + for (i = 0; i < ATM_ESA_LEN; i++) + seq_printf(seq, "%02x", vcc->remote.sas_addr.prv[i]); + } + seq_putc(seq, '\n'); +} + +static int atm_dev_seq_show(struct seq_file *seq, void *v) +{ + static char atm_dev_banner[] = + "Itf Type ESI/\"MAC\"addr " + "AAL(TX,err,RX,err,drop) ... 
[refcnt]\n"; + + if (v == (void *)1) + seq_puts(seq, atm_dev_banner); + else { + struct atm_dev *dev = list_entry(v, struct atm_dev, dev_list); + + atm_dev_info(seq, dev); + } + return 0; +} + +static struct seq_operations atm_dev_seq_ops = { + .start = atm_dev_seq_start, + .next = atm_dev_seq_next, + .stop = atm_dev_seq_stop, + .show = atm_dev_seq_show, +}; + +static int atm_dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &atm_dev_seq_ops); +} + +static struct file_operations devices_seq_fops = { + .open = atm_dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int pvc_seq_show(struct seq_file *seq, void *v) +{ + static char atm_pvc_banner[] = + "Itf VPI VCI AAL RX(PCR,Class) TX(PCR,Class)\n"; + + if (v == (void *)1) + seq_puts(seq, atm_pvc_banner); + else { + struct vcc_state *state = seq->private; + struct atm_vcc *vcc = atm_sk(state->sk); + + pvc_info(seq, vcc); + } + return 0; +} + +static struct seq_operations pvc_seq_ops = { + .start = vcc_seq_start, + .next = vcc_seq_next, + .stop = vcc_seq_stop, + .show = pvc_seq_show, +}; + +static int pvc_seq_open(struct inode *inode, struct file *file) +{ + return __vcc_seq_open(inode, file, PF_ATMPVC, &pvc_seq_ops); +} + +static struct file_operations pvc_seq_fops = { + .open = pvc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vcc_seq_release, +}; + +static int vcc_seq_show(struct seq_file *seq, void *v) +{ + if (v == (void *)1) { + seq_printf(seq, sizeof(void *) == 4 ? "%-8s%s" : "%-16s%s", + "Address ", "Itf VPI VCI Fam Flags Reply " + "Send buffer Recv buffer [refcnt]\n"); + } else { + struct vcc_state *state = seq->private; + struct atm_vcc *vcc = atm_sk(state->sk); + + vcc_info(seq, vcc); + } + return 0; +} + +static struct seq_operations vcc_seq_ops = { + .start = vcc_seq_start, + .next = vcc_seq_next, + .stop = vcc_seq_stop, + .show = vcc_seq_show, +}; + +static int vcc_seq_open(struct inode *inode, struct file *file) +{ + return __vcc_seq_open(inode, file, 0, &vcc_seq_ops); +} + +static struct file_operations vcc_seq_fops = { + .open = vcc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vcc_seq_release, +}; + +static int svc_seq_show(struct seq_file *seq, void *v) +{ + static char atm_svc_banner[] = + "Itf VPI VCI State Remote\n"; + + if (v == (void *)1) + seq_puts(seq, atm_svc_banner); + else { + struct vcc_state *state = seq->private; + struct atm_vcc *vcc = atm_sk(state->sk); + + svc_info(seq, vcc); + } + return 0; +} + +static struct seq_operations svc_seq_ops = { + .start = vcc_seq_start, + .next = vcc_seq_next, + .stop = vcc_seq_stop, + .show = svc_seq_show, +}; + +static int svc_seq_open(struct inode *inode, struct file *file) +{ + return __vcc_seq_open(inode, file, PF_ATMSVC, &svc_seq_ops); +} + +static struct file_operations svc_seq_fops = { + .open = svc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = vcc_seq_release, +}; + +static ssize_t proc_dev_atm_read(struct file *file, char __user *buf, + size_t count, loff_t *pos) +{ + struct atm_dev *dev; + unsigned long page; + int length; + + if (count == 0) return 0; + page = get_zeroed_page(GFP_KERNEL); + if (!page) return -ENOMEM; + dev = PDE(file->f_dentry->d_inode)->data; + if (!dev->ops->proc_read) + length = -EINVAL; + else { + length = dev->ops->proc_read(dev,pos,(char *) page); + if (length > count) length = -EINVAL; + } + if (length >= 0) { + if (copy_to_user(buf,(char *) page,length)) length = -EFAULT; + (*pos)++; + } + free_page(page); + 
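+	/*
+	 * Read protocol used above: each read() passes *pos to the
+	 * driver's proc_read() and then advances it, so one record is
+	 * returned per call; a record larger than the user buffer is
+	 * rejected with -EINVAL instead of being split across reads.
+	 */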
return length; +} + + +struct proc_dir_entry *atm_proc_root; +EXPORT_SYMBOL(atm_proc_root); + + +int atm_proc_dev_register(struct atm_dev *dev) +{ + int digits,num; + int error; + + /* No proc info */ + if (!dev->ops->proc_read) + return 0; + + error = -ENOMEM; + digits = 0; + for (num = dev->number; num; num /= 10) digits++; + if (!digits) digits++; + + dev->proc_name = kmalloc(strlen(dev->type) + digits + 2, GFP_KERNEL); + if (!dev->proc_name) + goto err_out; + sprintf(dev->proc_name,"%s:%d",dev->type, dev->number); + + dev->proc_entry = create_proc_entry(dev->proc_name, 0, atm_proc_root); + if (!dev->proc_entry) + goto err_free_name; + dev->proc_entry->data = dev; + dev->proc_entry->proc_fops = &proc_atm_dev_ops; + dev->proc_entry->owner = THIS_MODULE; + return 0; +err_free_name: + kfree(dev->proc_name); +err_out: + return error; +} + + +void atm_proc_dev_deregister(struct atm_dev *dev) +{ + if (!dev->ops->proc_read) + return; + + remove_proc_entry(dev->proc_name, atm_proc_root); + kfree(dev->proc_name); +} + +static struct atm_proc_entry { + char *name; + struct file_operations *proc_fops; + struct proc_dir_entry *dirent; +} atm_proc_ents[] = { + { .name = "devices", .proc_fops = &devices_seq_fops }, + { .name = "pvc", .proc_fops = &pvc_seq_fops }, + { .name = "svc", .proc_fops = &svc_seq_fops }, + { .name = "vc", .proc_fops = &vcc_seq_fops }, + { .name = NULL, .proc_fops = NULL } +}; + +static void atm_proc_dirs_remove(void) +{ + static struct atm_proc_entry *e; + + for (e = atm_proc_ents; e->name; e++) { + if (e->dirent) + remove_proc_entry(e->name, atm_proc_root); + } + remove_proc_entry("net/atm", NULL); +} + +int __init atm_proc_init(void) +{ + static struct atm_proc_entry *e; + int ret; + + atm_proc_root = proc_mkdir("net/atm",NULL); + if (!atm_proc_root) + goto err_out; + for (e = atm_proc_ents; e->name; e++) { + struct proc_dir_entry *dirent; + + dirent = create_proc_entry(e->name, S_IRUGO, atm_proc_root); + if (!dirent) + goto err_out_remove; + dirent->proc_fops = e->proc_fops; + dirent->owner = THIS_MODULE; + e->dirent = dirent; + } + ret = 0; +out: + return ret; + +err_out_remove: + atm_proc_dirs_remove(); +err_out: + ret = -ENOMEM; + goto out; +} + +void __exit atm_proc_exit(void) +{ + atm_proc_dirs_remove(); +} diff --git a/net/atm/protocols.h b/net/atm/protocols.h new file mode 100644 index 000000000000..acdfc856222d --- /dev/null +++ b/net/atm/protocols.h @@ -0,0 +1,13 @@ +/* net/atm/protocols.h - ATM protocol handler entry points */ + +/* Written 1995-1997 by Werner Almesberger, EPFL LRC */ + + +#ifndef NET_ATM_PROTOCOLS_H +#define NET_ATM_PROTOCOLS_H + +int atm_init_aal0(struct atm_vcc *vcc); /* "raw" AAL0 */ +int atm_init_aal34(struct atm_vcc *vcc);/* "raw" AAL3/4 transport */ +int atm_init_aal5(struct atm_vcc *vcc); /* "raw" AAL5 transport */ + +#endif diff --git a/net/atm/pvc.c b/net/atm/pvc.c new file mode 100644 index 000000000000..2684a92da22b --- /dev/null +++ b/net/atm/pvc.c @@ -0,0 +1,155 @@ +/* net/atm/pvc.c - ATM PVC sockets */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include +#include /* struct socket, struct proto_ops */ +#include /* ATM stuff */ +#include /* ATM devices */ +#include /* error codes */ +#include /* printk */ +#include +#include +#include +#include /* for sock_no_* */ + +#include "resources.h" /* devs and vccs */ +#include "common.h" /* common for PVCs and SVCs */ + + +static int pvc_shutdown(struct socket *sock,int how) +{ + return 0; +} + + +static int pvc_bind(struct socket *sock,struct sockaddr *sockaddr, + 
int sockaddr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_atmpvc *addr; + struct atm_vcc *vcc; + int error; + + if (sockaddr_len != sizeof(struct sockaddr_atmpvc)) return -EINVAL; + addr = (struct sockaddr_atmpvc *) sockaddr; + if (addr->sap_family != AF_ATMPVC) return -EAFNOSUPPORT; + lock_sock(sk); + vcc = ATM_SD(sock); + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { + error = -EBADFD; + goto out; + } + if (test_bit(ATM_VF_PARTIAL,&vcc->flags)) { + if (vcc->vpi != ATM_VPI_UNSPEC) addr->sap_addr.vpi = vcc->vpi; + if (vcc->vci != ATM_VCI_UNSPEC) addr->sap_addr.vci = vcc->vci; + } + error = vcc_connect(sock, addr->sap_addr.itf, addr->sap_addr.vpi, + addr->sap_addr.vci); +out: + release_sock(sk); + return error; +} + + +static int pvc_connect(struct socket *sock,struct sockaddr *sockaddr, + int sockaddr_len,int flags) +{ + return pvc_bind(sock,sockaddr,sockaddr_len); +} + +static int pvc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + int error; + + lock_sock(sk); + error = vcc_setsockopt(sock, level, optname, optval, optlen); + release_sock(sk); + return error; +} + + +static int pvc_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int error; + + lock_sock(sk); + error = vcc_getsockopt(sock, level, optname, optval, optlen); + release_sock(sk); + return error; +} + + +static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr, + int *sockaddr_len,int peer) +{ + struct sockaddr_atmpvc *addr; + struct atm_vcc *vcc = ATM_SD(sock); + + if (!vcc->dev || !test_bit(ATM_VF_ADDR,&vcc->flags)) return -ENOTCONN; + *sockaddr_len = sizeof(struct sockaddr_atmpvc); + addr = (struct sockaddr_atmpvc *) sockaddr; + addr->sap_family = AF_ATMPVC; + addr->sap_addr.itf = vcc->dev->number; + addr->sap_addr.vpi = vcc->vpi; + addr->sap_addr.vci = vcc->vci; + return 0; +} + + +static struct proto_ops pvc_proto_ops = { + .family = PF_ATMPVC, + .owner = THIS_MODULE, + + .release = vcc_release, + .bind = pvc_bind, + .connect = pvc_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = pvc_getname, + .poll = vcc_poll, + .ioctl = vcc_ioctl, + .listen = sock_no_listen, + .shutdown = pvc_shutdown, + .setsockopt = pvc_setsockopt, + .getsockopt = pvc_getsockopt, + .sendmsg = vcc_sendmsg, + .recvmsg = vcc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + + +static int pvc_create(struct socket *sock,int protocol) +{ + sock->ops = &pvc_proto_ops; + return vcc_create(sock, protocol, PF_ATMPVC); +} + + +static struct net_proto_family pvc_family_ops = { + .family = PF_ATMPVC, + .create = pvc_create, + .owner = THIS_MODULE, +}; + + +/* + * Initialize the ATM PVC protocol family + */ + + +int __init atmpvc_init(void) +{ + return sock_register(&pvc_family_ops); +} + +void atmpvc_exit(void) +{ + sock_unregister(PF_ATMPVC); +} diff --git a/net/atm/raw.c b/net/atm/raw.c new file mode 100644 index 000000000000..4a0466e91aa6 --- /dev/null +++ b/net/atm/raw.c @@ -0,0 +1,98 @@ +/* net/atm/raw.c - Raw AAL0 and AAL5 transports */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "protocols.h" + + +#if 0 +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) 
+#endif + + +/* + * SKB == NULL indicates that the link is being closed + */ + +static void atm_push_raw(struct atm_vcc *vcc,struct sk_buff *skb) +{ + if (skb) { + struct sock *sk = sk_atm(vcc); + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + } +} + + +static void atm_pop_raw(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct sock *sk = sk_atm(vcc); + + DPRINTK("APopR (%d) %d -= %d\n", vcc->vci, sk->sk_wmem_alloc, + skb->truesize); + atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + dev_kfree_skb_any(skb); + sk->sk_write_space(sk); +} + + +static int atm_send_aal0(struct atm_vcc *vcc,struct sk_buff *skb) +{ + /* + * Note that if vpi/vci are _ANY or _UNSPEC the below will + * still work + */ + if (!capable(CAP_NET_ADMIN) && + (((u32 *) skb->data)[0] & (ATM_HDR_VPI_MASK | ATM_HDR_VCI_MASK)) != + ((vcc->vpi << ATM_HDR_VPI_SHIFT) | (vcc->vci << ATM_HDR_VCI_SHIFT))) + { + kfree_skb(skb); + return -EADDRNOTAVAIL; + } + return vcc->dev->ops->send(vcc,skb); +} + + +int atm_init_aal0(struct atm_vcc *vcc) +{ + vcc->push = atm_push_raw; + vcc->pop = atm_pop_raw; + vcc->push_oam = NULL; + vcc->send = atm_send_aal0; + return 0; +} + + +int atm_init_aal34(struct atm_vcc *vcc) +{ + vcc->push = atm_push_raw; + vcc->pop = atm_pop_raw; + vcc->push_oam = NULL; + vcc->send = vcc->dev->ops->send; + return 0; +} + + +int atm_init_aal5(struct atm_vcc *vcc) +{ + vcc->push = atm_push_raw; + vcc->pop = atm_pop_raw; + vcc->push_oam = NULL; + vcc->send = vcc->dev->ops->send; + return 0; +} + + +EXPORT_SYMBOL(atm_init_aal5); diff --git a/net/atm/resources.c b/net/atm/resources.c new file mode 100644 index 000000000000..33f1685dbb77 --- /dev/null +++ b/net/atm/resources.c @@ -0,0 +1,432 @@ +/* net/atm/resources.c - Statically allocated resources */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + +/* Fixes + * Arnaldo Carvalho de Melo + * 2002/01 - don't free the whole struct sock on sk->destruct time, + * use the default destruct function initialized by sock_init_data */ + + +#include +#include +#include +#include +#include +#include /* for barrier */ +#include +#include +#include +#include /* for struct sock */ + +#include "common.h" +#include "resources.h" +#include "addr.h" + + +LIST_HEAD(atm_devs); +DEFINE_SPINLOCK(atm_dev_lock); + +static struct atm_dev *__alloc_atm_dev(const char *type) +{ + struct atm_dev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + memset(dev, 0, sizeof(*dev)); + dev->type = type; + dev->signal = ATM_PHY_SIG_UNKNOWN; + dev->link_rate = ATM_OC3_PCR; + spin_lock_init(&dev->lock); + INIT_LIST_HEAD(&dev->local); + + return dev; +} + +static void __free_atm_dev(struct atm_dev *dev) +{ + kfree(dev); +} + +static struct atm_dev *__atm_dev_lookup(int number) +{ + struct atm_dev *dev; + struct list_head *p; + + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + if ((dev->ops) && (dev->number == number)) { + atm_dev_hold(dev); + return dev; + } + } + return NULL; +} + +struct atm_dev *atm_dev_lookup(int number) +{ + struct atm_dev *dev; + + spin_lock(&atm_dev_lock); + dev = __atm_dev_lookup(number); + spin_unlock(&atm_dev_lock); + return dev; +} + +struct atm_dev *atm_dev_register(const char *type, const struct atmdev_ops *ops, + int number, unsigned long *flags) +{ + struct atm_dev *dev, *inuse; + + dev = __alloc_atm_dev(type); + if (!dev) { + printk(KERN_ERR "atm_dev_register: no space for dev %s\n", + type); + return NULL; + } + spin_lock(&atm_dev_lock); + if (number != -1) { + if 
((inuse = __atm_dev_lookup(number))) { + atm_dev_put(inuse); + spin_unlock(&atm_dev_lock); + __free_atm_dev(dev); + return NULL; + } + dev->number = number; + } else { + dev->number = 0; + while ((inuse = __atm_dev_lookup(dev->number))) { + atm_dev_put(inuse); + dev->number++; + } + } + + dev->ops = ops; + if (flags) + dev->flags = *flags; + else + memset(&dev->flags, 0, sizeof(dev->flags)); + memset(&dev->stats, 0, sizeof(dev->stats)); + atomic_set(&dev->refcnt, 1); + list_add_tail(&dev->dev_list, &atm_devs); + spin_unlock(&atm_dev_lock); + + if (atm_proc_dev_register(dev) < 0) { + printk(KERN_ERR "atm_dev_register: " + "atm_proc_dev_register failed for dev %s\n", + type); + spin_lock(&atm_dev_lock); + list_del(&dev->dev_list); + spin_unlock(&atm_dev_lock); + __free_atm_dev(dev); + return NULL; + } + + return dev; +} + + +void atm_dev_deregister(struct atm_dev *dev) +{ + unsigned long warning_time; + + atm_proc_dev_deregister(dev); + + spin_lock(&atm_dev_lock); + list_del(&dev->dev_list); + spin_unlock(&atm_dev_lock); + + warning_time = jiffies; + while (atomic_read(&dev->refcnt) != 1) { + msleep(250); + if ((jiffies - warning_time) > 10 * HZ) { + printk(KERN_EMERG "atm_dev_deregister: waiting for " + "dev %d to become free. Usage count = %d\n", + dev->number, atomic_read(&dev->refcnt)); + warning_time = jiffies; + } + } + + __free_atm_dev(dev); +} + +void shutdown_atm_dev(struct atm_dev *dev) +{ + if (atomic_read(&dev->refcnt) > 1) { + set_bit(ATM_DF_CLOSE, &dev->flags); + return; + } + if (dev->ops->dev_close) + dev->ops->dev_close(dev); + atm_dev_deregister(dev); +} + + +static void copy_aal_stats(struct k_atm_aal_stats *from, + struct atm_aal_stats *to) +{ +#define __HANDLE_ITEM(i) to->i = atomic_read(&from->i) + __AAL_STAT_ITEMS +#undef __HANDLE_ITEM +} + + +static void subtract_aal_stats(struct k_atm_aal_stats *from, + struct atm_aal_stats *to) +{ +#define __HANDLE_ITEM(i) atomic_sub(to->i, &from->i) + __AAL_STAT_ITEMS +#undef __HANDLE_ITEM +} + + +static int fetch_stats(struct atm_dev *dev, struct atm_dev_stats __user *arg, int zero) +{ + struct atm_dev_stats tmp; + int error = 0; + + copy_aal_stats(&dev->stats.aal0, &tmp.aal0); + copy_aal_stats(&dev->stats.aal34, &tmp.aal34); + copy_aal_stats(&dev->stats.aal5, &tmp.aal5); + if (arg) + error = copy_to_user(arg, &tmp, sizeof(tmp)); + if (zero && !error) { + subtract_aal_stats(&dev->stats.aal0, &tmp.aal0); + subtract_aal_stats(&dev->stats.aal34, &tmp.aal34); + subtract_aal_stats(&dev->stats.aal5, &tmp.aal5); + } + return error ? -EFAULT : 0; +} + + +int atm_dev_ioctl(unsigned int cmd, void __user *arg) +{ + void __user *buf; + int error, len, number, size = 0; + struct atm_dev *dev; + struct list_head *p; + int *tmp_buf, *tmp_p; + struct atm_iobuf __user *iobuf = arg; + struct atmif_sioc __user *sioc = arg; + switch (cmd) { + case ATM_GETNAMES: + if (get_user(buf, &iobuf->buffer)) + return -EFAULT; + if (get_user(len, &iobuf->length)) + return -EFAULT; + spin_lock(&atm_dev_lock); + list_for_each(p, &atm_devs) + size += sizeof(int); + if (size > len) { + spin_unlock(&atm_dev_lock); + return -E2BIG; + } + tmp_buf = kmalloc(size, GFP_ATOMIC); + if (!tmp_buf) { + spin_unlock(&atm_dev_lock); + return -ENOMEM; + } + tmp_p = tmp_buf; + list_for_each(p, &atm_devs) { + dev = list_entry(p, struct atm_dev, dev_list); + *tmp_p++ = dev->number; + } + spin_unlock(&atm_dev_lock); + error = ((copy_to_user(buf, tmp_buf, size)) || + put_user(size, &iobuf->length)) + ? 
-EFAULT : 0; + kfree(tmp_buf); + return error; + default: + break; + } + + if (get_user(buf, &sioc->arg)) + return -EFAULT; + if (get_user(len, &sioc->length)) + return -EFAULT; + if (get_user(number, &sioc->number)) + return -EFAULT; + + if (!(dev = atm_dev_lookup(number))) + return -ENODEV; + + switch (cmd) { + case ATM_GETTYPE: + size = strlen(dev->type) + 1; + if (copy_to_user(buf, dev->type, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_GETESI: + size = ESI_LEN; + if (copy_to_user(buf, dev->esi, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_SETESI: + { + int i; + + for (i = 0; i < ESI_LEN; i++) + if (dev->esi[i]) { + error = -EEXIST; + goto done; + } + } + /* fall through */ + case ATM_SETESIF: + { + unsigned char esi[ESI_LEN]; + + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + if (copy_from_user(esi, buf, ESI_LEN)) { + error = -EFAULT; + goto done; + } + memcpy(dev->esi, esi, ESI_LEN); + error = ESI_LEN; + goto done; + } + case ATM_GETSTATZ: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + /* fall through */ + case ATM_GETSTAT: + size = sizeof(struct atm_dev_stats); + error = fetch_stats(dev, buf, cmd == ATM_GETSTATZ); + if (error) + goto done; + break; + case ATM_GETCIRANGE: + size = sizeof(struct atm_cirange); + if (copy_to_user(buf, &dev->ci_range, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_GETLINKRATE: + size = sizeof(int); + if (copy_to_user(buf, &dev->link_rate, size)) { + error = -EFAULT; + goto done; + } + break; + case ATM_RSTADDR: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + atm_reset_addr(dev); + break; + case ATM_ADDADDR: + case ATM_DELADDR: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + { + struct sockaddr_atmsvc addr; + + if (copy_from_user(&addr, buf, sizeof(addr))) { + error = -EFAULT; + goto done; + } + if (cmd == ATM_ADDADDR) + error = atm_add_addr(dev, &addr); + else + error = atm_del_addr(dev, &addr); + goto done; + } + case ATM_GETADDR: + error = atm_get_addr(dev, buf, len); + if (error < 0) + goto done; + size = error; + /* may return 0, but later on size == 0 means "don't + write the length" */ + error = put_user(size, &sioc->length) + ? -EFAULT : 0; + goto done; + case ATM_SETLOOP: + if (__ATM_LM_XTRMT((int) (unsigned long) buf) && + __ATM_LM_XTLOC((int) (unsigned long) buf) > + __ATM_LM_XTRMT((int) (unsigned long) buf)) { + error = -EINVAL; + goto done; + } + /* fall through */ + case ATM_SETCIRANGE: + case SONET_GETSTATZ: + case SONET_SETDIAG: + case SONET_CLRDIAG: + case SONET_SETFRAMING: + if (!capable(CAP_NET_ADMIN)) { + error = -EPERM; + goto done; + } + /* fall through */ + default: + if (!dev->ops->ioctl) { + error = -EINVAL; + goto done; + } + size = dev->ops->ioctl(dev, cmd, buf); + if (size < 0) { + error = (size == -ENOIOCTLCMD ? -EINVAL : size); + goto done; + } + } + + if (size) + error = put_user(size, &sioc->length) + ? -EFAULT : 0; + else + error = 0; +done: + atm_dev_put(dev); + return error; +} + +static __inline__ void *dev_get_idx(loff_t left) +{ + struct list_head *p; + + list_for_each(p, &atm_devs) { + if (!--left) + break; + } + return (p != &atm_devs) ? p : NULL; +} + +void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock(&atm_dev_lock); + return *pos ? 
dev_get_idx(*pos) : (void *) 1; +} + +void atm_dev_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock(&atm_dev_lock); +} + +void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + v = (v == (void *)1) ? atm_devs.next : ((struct list_head *)v)->next; + return (v == &atm_devs) ? NULL : v; +} + + +EXPORT_SYMBOL(atm_dev_register); +EXPORT_SYMBOL(atm_dev_deregister); +EXPORT_SYMBOL(atm_dev_lookup); +EXPORT_SYMBOL(shutdown_atm_dev); diff --git a/net/atm/resources.h b/net/atm/resources.h new file mode 100644 index 000000000000..12910619dbb6 --- /dev/null +++ b/net/atm/resources.h @@ -0,0 +1,46 @@ +/* net/atm/resources.h - ATM-related resources */ + +/* Written 1995-1998 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_RESOURCES_H +#define NET_ATM_RESOURCES_H + +#include +#include + + +extern struct list_head atm_devs; +extern spinlock_t atm_dev_lock; + + +int atm_dev_ioctl(unsigned int cmd, void __user *arg); + + +#ifdef CONFIG_PROC_FS + +#include + +void *atm_dev_seq_start(struct seq_file *seq, loff_t *pos); +void atm_dev_seq_stop(struct seq_file *seq, void *v); +void *atm_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); + + +int atm_proc_dev_register(struct atm_dev *dev); +void atm_proc_dev_deregister(struct atm_dev *dev); + +#else + +static inline int atm_proc_dev_register(struct atm_dev *dev) +{ + return 0; +} + +static inline void atm_proc_dev_deregister(struct atm_dev *dev) +{ + /* nothing */ +} + +#endif /* CONFIG_PROC_FS */ + +#endif diff --git a/net/atm/signaling.c b/net/atm/signaling.c new file mode 100644 index 000000000000..6ff803154c04 --- /dev/null +++ b/net/atm/signaling.c @@ -0,0 +1,280 @@ +/* net/atm/signaling.c - ATM signaling */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include /* error codes */ +#include /* printk */ +#include +#include +#include /* jiffies and HZ */ +#include /* ATM stuff */ +#include +#include +#include +#include + +#include "resources.h" +#include "signaling.h" + + +#undef WAIT_FOR_DEMON /* #define this if system calls on SVC sockets + should block until the demon runs. + Danger: may cause nasty hangs if the demon + crashes. */ + +#if 0 +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) 
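+/*
+ * Message flow with the signaling demon, in short: the kernel builds an
+ * atmsvc_msg in sigd_enq()/sigd_enq2() and sigd_put_skb() queues it on the
+ * demon's receive queue; the demon answers by writing on the same VCC,
+ * which lands in sigd_send() below, where the reply type (as_okay,
+ * as_error, as_indicate, ...) is decoded and the affected socket is
+ * updated and woken.
+ */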
+#endif + + +struct atm_vcc *sigd = NULL; +#ifdef WAIT_FOR_DEMON +static DECLARE_WAIT_QUEUE_HEAD(sigd_sleep); +#endif + + +static void sigd_put_skb(struct sk_buff *skb) +{ +#ifdef WAIT_FOR_DEMON + static unsigned long silence; + DECLARE_WAITQUEUE(wait,current); + + add_wait_queue(&sigd_sleep,&wait); + while (!sigd) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (time_after(jiffies, silence) || silence == 0) { + printk(KERN_INFO "atmsvc: waiting for signaling demon " + "...\n"); + silence = (jiffies+30*HZ)|1; + } + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(&sigd_sleep,&wait); +#else + if (!sigd) { + printk(KERN_WARNING "atmsvc: no signaling demon\n"); + kfree_skb(skb); + return; + } +#endif + atm_force_charge(sigd,skb->truesize); + skb_queue_tail(&sk_atm(sigd)->sk_receive_queue,skb); + sk_atm(sigd)->sk_data_ready(sk_atm(sigd), skb->len); +} + + +static void modify_qos(struct atm_vcc *vcc,struct atmsvc_msg *msg) +{ + struct sk_buff *skb; + + if (test_bit(ATM_VF_RELEASED,&vcc->flags) || + !test_bit(ATM_VF_READY,&vcc->flags)) + return; + msg->type = as_error; + if (!vcc->dev->ops->change_qos) msg->reply = -EOPNOTSUPP; + else { + /* should lock VCC */ + msg->reply = vcc->dev->ops->change_qos(vcc,&msg->qos, + msg->reply); + if (!msg->reply) msg->type = as_okay; + } + /* + * Should probably just turn around the old skb. But the, the buffer + * space accounting needs to follow the change too. Maybe later. + */ + while (!(skb = alloc_skb(sizeof(struct atmsvc_msg),GFP_KERNEL))) + schedule(); + *(struct atmsvc_msg *) skb_put(skb,sizeof(struct atmsvc_msg)) = *msg; + sigd_put_skb(skb); +} + + +static int sigd_send(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct atmsvc_msg *msg; + struct atm_vcc *session_vcc; + struct sock *sk; + + msg = (struct atmsvc_msg *) skb->data; + atomic_sub(skb->truesize, &sk_atm(vcc)->sk_wmem_alloc); + DPRINTK("sigd_send %d (0x%lx)\n",(int) msg->type, + (unsigned long) msg->vcc); + vcc = *(struct atm_vcc **) &msg->vcc; + sk = sk_atm(vcc); + + switch (msg->type) { + case as_okay: + sk->sk_err = -msg->reply; + clear_bit(ATM_VF_WAITING, &vcc->flags); + if (!*vcc->local.sas_addr.prv && + !*vcc->local.sas_addr.pub) { + vcc->local.sas_family = AF_ATMSVC; + memcpy(vcc->local.sas_addr.prv, + msg->local.sas_addr.prv,ATM_ESA_LEN); + memcpy(vcc->local.sas_addr.pub, + msg->local.sas_addr.pub,ATM_E164_LEN+1); + } + session_vcc = vcc->session ? 
vcc->session : vcc; + if (session_vcc->vpi || session_vcc->vci) break; + session_vcc->itf = msg->pvc.sap_addr.itf; + session_vcc->vpi = msg->pvc.sap_addr.vpi; + session_vcc->vci = msg->pvc.sap_addr.vci; + if (session_vcc->vpi || session_vcc->vci) + session_vcc->qos = msg->qos; + break; + case as_error: + clear_bit(ATM_VF_REGIS,&vcc->flags); + clear_bit(ATM_VF_READY,&vcc->flags); + sk->sk_err = -msg->reply; + clear_bit(ATM_VF_WAITING, &vcc->flags); + break; + case as_indicate: + vcc = *(struct atm_vcc **) &msg->listen_vcc; + DPRINTK("as_indicate!!!\n"); + lock_sock(sk); + if (sk_acceptq_is_full(sk)) { + sigd_enq(NULL,as_reject,vcc,NULL,NULL); + dev_kfree_skb(skb); + goto as_indicate_complete; + } + sk->sk_ack_backlog++; + skb_queue_tail(&sk->sk_receive_queue, skb); + DPRINTK("waking sk->sk_sleep 0x%p\n", sk->sk_sleep); + sk->sk_state_change(sk); +as_indicate_complete: + release_sock(sk); + return 0; + case as_close: + set_bit(ATM_VF_RELEASED,&vcc->flags); + vcc_release_async(vcc, msg->reply); + goto out; + case as_modify: + modify_qos(vcc,msg); + break; + case as_addparty: + case as_dropparty: + sk->sk_err_soft = msg->reply; /* < 0 failure, otherwise ep_ref */ + clear_bit(ATM_VF_WAITING, &vcc->flags); + break; + default: + printk(KERN_ALERT "sigd_send: bad message type %d\n", + (int) msg->type); + return -EINVAL; + } + sk->sk_state_change(sk); +out: + dev_kfree_skb(skb); + return 0; +} + + +void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply) +{ + struct sk_buff *skb; + struct atmsvc_msg *msg; + static unsigned session = 0; + + DPRINTK("sigd_enq %d (0x%p)\n",(int) type,vcc); + while (!(skb = alloc_skb(sizeof(struct atmsvc_msg),GFP_KERNEL))) + schedule(); + msg = (struct atmsvc_msg *) skb_put(skb,sizeof(struct atmsvc_msg)); + memset(msg,0,sizeof(*msg)); + msg->type = type; + *(struct atm_vcc **) &msg->vcc = vcc; + *(struct atm_vcc **) &msg->listen_vcc = listen_vcc; + msg->reply = reply; + if (qos) msg->qos = *qos; + if (vcc) msg->sap = vcc->sap; + if (svc) msg->svc = *svc; + if (vcc) msg->local = vcc->local; + if (pvc) msg->pvc = *pvc; + if (vcc) { + if (type == as_connect && test_bit(ATM_VF_SESSION, &vcc->flags)) + msg->session = ++session; + /* every new pmp connect gets the next session number */ + } + sigd_put_skb(skb); + if (vcc) set_bit(ATM_VF_REGIS,&vcc->flags); +} + + +void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc) +{ + sigd_enq2(vcc,type,listen_vcc,pvc,svc,vcc ? 
&vcc->qos : NULL,0); + /* other ISP applications may use "reply" */ +} + + +static void purge_vcc(struct atm_vcc *vcc) +{ + if (sk_atm(vcc)->sk_family == PF_ATMSVC && + !test_bit(ATM_VF_META,&vcc->flags)) { + set_bit(ATM_VF_RELEASED,&vcc->flags); + vcc_release_async(vcc, -EUNATCH); + } +} + + +static void sigd_close(struct atm_vcc *vcc) +{ + struct hlist_node *node; + struct sock *s; + int i; + + DPRINTK("sigd_close\n"); + sigd = NULL; + if (skb_peek(&sk_atm(vcc)->sk_receive_queue)) + printk(KERN_ERR "sigd_close: closing with requests pending\n"); + skb_queue_purge(&sk_atm(vcc)->sk_receive_queue); + + read_lock(&vcc_sklist_lock); + for(i = 0; i < VCC_HTABLE_SIZE; ++i) { + struct hlist_head *head = &vcc_hash[i]; + + sk_for_each(s, node, head) { + struct atm_vcc *vcc = atm_sk(s); + + if (vcc->dev) + purge_vcc(vcc); + } + } + read_unlock(&vcc_sklist_lock); +} + + +static struct atmdev_ops sigd_dev_ops = { + .close = sigd_close, + .send = sigd_send +}; + + +static struct atm_dev sigd_dev = { + .ops = &sigd_dev_ops, + .type = "sig", + .number = 999, + .lock = SPIN_LOCK_UNLOCKED +}; + + +int sigd_attach(struct atm_vcc *vcc) +{ + if (sigd) return -EADDRINUSE; + DPRINTK("sigd_attach\n"); + sigd = vcc; + vcc->dev = &sigd_dev; + vcc_insert_socket(sk_atm(vcc)); + set_bit(ATM_VF_META,&vcc->flags); + set_bit(ATM_VF_READY,&vcc->flags); +#ifdef WAIT_FOR_DEMON + wake_up(&sigd_sleep); +#endif + return 0; +} diff --git a/net/atm/signaling.h b/net/atm/signaling.h new file mode 100644 index 000000000000..434ead455714 --- /dev/null +++ b/net/atm/signaling.h @@ -0,0 +1,30 @@ +/* net/atm/signaling.h - ATM signaling */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#ifndef NET_ATM_SIGNALING_H +#define NET_ATM_SIGNALING_H + +#include +#include +#include + + +extern struct atm_vcc *sigd; /* needed in svc_release */ + + +/* + * sigd_enq is a wrapper for sigd_enq2, covering the more common cases, and + * avoiding huge lists of null values. + */ + +void sigd_enq2(struct atm_vcc *vcc,enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc,const struct atm_qos *qos,int reply); +void sigd_enq(struct atm_vcc *vcc,enum atmsvc_msg_type type, + struct atm_vcc *listen_vcc,const struct sockaddr_atmpvc *pvc, + const struct sockaddr_atmsvc *svc); +int sigd_attach(struct atm_vcc *vcc); + +#endif diff --git a/net/atm/svc.c b/net/atm/svc.c new file mode 100644 index 000000000000..02f5374a51f2 --- /dev/null +++ b/net/atm/svc.c @@ -0,0 +1,674 @@ +/* net/atm/svc.c - ATM SVC sockets */ + +/* Written 1995-2000 by Werner Almesberger, EPFL LRC/ICA */ + + +#include +#include /* struct socket, struct proto_ops */ +#include /* error codes */ +#include /* printk */ +#include +#include +#include /* jiffies and HZ */ +#include /* O_NONBLOCK */ +#include +#include /* ATM stuff */ +#include +#include +#include +#include +#include /* for sock_no_* */ +#include + +#include "resources.h" +#include "common.h" /* common for PVCs and SVCs */ +#include "signaling.h" +#include "addr.h" + + +#if 0 +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + + +static int svc_create(struct socket *sock,int protocol); + + +/* + * Note: since all this is still nicely synchronized with the signaling demon, + * there's no need to protect sleep loops with clis. If signaling is + * moved into the kernel, that would change. 
+ */ + + +static int svc_shutdown(struct socket *sock,int how) +{ + return 0; +} + + +static void svc_disconnect(struct atm_vcc *vcc) +{ + DEFINE_WAIT(wait); + struct sk_buff *skb; + struct sock *sk = sk_atm(vcc); + + DPRINTK("svc_disconnect %p\n",vcc); + if (test_bit(ATM_VF_REGIS,&vcc->flags)) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(vcc,as_close,NULL,NULL,NULL); + while (!test_bit(ATM_VF_RELEASED,&vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + } + /* beware - socket is still in use by atmsigd until the last + as_indicate has been answered */ + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + atm_return(vcc, skb->truesize); + DPRINTK("LISTEN REL\n"); + sigd_enq2(NULL,as_reject,vcc,NULL,NULL,&vcc->qos,0); + dev_kfree_skb(skb); + } + clear_bit(ATM_VF_REGIS, &vcc->flags); + /* ... may retry later */ +} + + +static int svc_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc; + + if (sk) { + vcc = ATM_SD(sock); + DPRINTK("svc_release %p\n", vcc); + clear_bit(ATM_VF_READY, &vcc->flags); + /* VCC pointer is used as a reference, so we must not free it + (thereby subjecting it to re-use) before all pending connections + are closed */ + svc_disconnect(vcc); + vcc_release(sock); + } + return 0; +} + + +static int svc_bind(struct socket *sock,struct sockaddr *sockaddr, + int sockaddr_len) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct sockaddr_atmsvc *addr; + struct atm_vcc *vcc; + int error; + + if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) + return -EINVAL; + lock_sock(sk); + if (sock->state == SS_CONNECTED) { + error = -EISCONN; + goto out; + } + if (sock->state != SS_UNCONNECTED) { + error = -EINVAL; + goto out; + } + vcc = ATM_SD(sock); + if (test_bit(ATM_VF_SESSION, &vcc->flags)) { + error = -EINVAL; + goto out; + } + addr = (struct sockaddr_atmsvc *) sockaddr; + if (addr->sas_family != AF_ATMSVC) { + error = -EAFNOSUPPORT; + goto out; + } + clear_bit(ATM_VF_BOUND,&vcc->flags); + /* failing rebind will kill old binding */ + /* @@@ check memory (de)allocation on rebind */ + if (!test_bit(ATM_VF_HASQOS,&vcc->flags)) { + error = -EBADFD; + goto out; + } + vcc->local = *addr; + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(vcc,as_bind,NULL,NULL,&vcc->local); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + clear_bit(ATM_VF_REGIS,&vcc->flags); /* doesn't count */ + if (!sigd) { + error = -EUNATCH; + goto out; + } + if (!sk->sk_err) + set_bit(ATM_VF_BOUND,&vcc->flags); + error = -sk->sk_err; +out: + release_sock(sk); + return error; +} + + +static int svc_connect(struct socket *sock,struct sockaddr *sockaddr, + int sockaddr_len,int flags) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct sockaddr_atmsvc *addr; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + DPRINTK("svc_connect %p\n",vcc); + lock_sock(sk); + if (sockaddr_len != sizeof(struct sockaddr_atmsvc)) { + error = -EINVAL; + goto out; + } + + switch (sock->state) { + default: + error = -EINVAL; + goto out; + case SS_CONNECTED: + error = -EISCONN; + goto out; + case SS_CONNECTING: + if (test_bit(ATM_VF_WAITING, &vcc->flags)) { + error = -EALREADY; + goto out; + } + sock->state = SS_UNCONNECTED; + if (sk->sk_err) { + error = -sk->sk_err; + goto 
out; + } + break; + case SS_UNCONNECTED: + addr = (struct sockaddr_atmsvc *) sockaddr; + if (addr->sas_family != AF_ATMSVC) { + error = -EAFNOSUPPORT; + goto out; + } + if (!test_bit(ATM_VF_HASQOS, &vcc->flags)) { + error = -EBADFD; + goto out; + } + if (vcc->qos.txtp.traffic_class == ATM_ANYCLASS || + vcc->qos.rxtp.traffic_class == ATM_ANYCLASS) { + error = -EINVAL; + goto out; + } + if (!vcc->qos.txtp.traffic_class && + !vcc->qos.rxtp.traffic_class) { + error = -EINVAL; + goto out; + } + vcc->remote = *addr; + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + sigd_enq(vcc,as_connect,NULL,NULL,&vcc->remote); + if (flags & O_NONBLOCK) { + finish_wait(sk->sk_sleep, &wait); + sock->state = SS_CONNECTING; + error = -EINPROGRESS; + goto out; + } + error = 0; + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + if (!signal_pending(current)) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + continue; + } + DPRINTK("*ABORT*\n"); + /* + * This is tricky: + * Kernel ---close--> Demon + * Kernel <--close--- Demon + * or + * Kernel ---close--> Demon + * Kernel <--error--- Demon + * or + * Kernel ---close--> Demon + * Kernel <--okay---- Demon + * Kernel <--close--- Demon + */ + sigd_enq(vcc,as_close,NULL,NULL,NULL); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + schedule(); + } + if (!sk->sk_err) + while (!test_bit(ATM_VF_RELEASED,&vcc->flags) + && sigd) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + schedule(); + } + clear_bit(ATM_VF_REGIS,&vcc->flags); + clear_bit(ATM_VF_RELEASED,&vcc->flags); + clear_bit(ATM_VF_CLOSE,&vcc->flags); + /* we're gone now but may connect later */ + error = -EINTR; + break; + } + finish_wait(sk->sk_sleep, &wait); + if (error) + goto out; + if (!sigd) { + error = -EUNATCH; + goto out; + } + if (sk->sk_err) { + error = -sk->sk_err; + goto out; + } + } +/* + * Not supported yet + * + * #ifndef CONFIG_SINGLE_SIGITF + */ + vcc->qos.txtp.max_pcr = SELECT_TOP_PCR(vcc->qos.txtp); + vcc->qos.txtp.pcr = 0; + vcc->qos.txtp.min_pcr = 0; +/* + * #endif + */ + if (!(error = vcc_connect(sock, vcc->itf, vcc->vpi, vcc->vci))) + sock->state = SS_CONNECTED; + else + (void) svc_disconnect(vcc); +out: + release_sock(sk); + return error; +} + + +static int svc_listen(struct socket *sock,int backlog) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + DPRINTK("svc_listen %p\n",vcc); + lock_sock(sk); + /* let server handle listen on unbound sockets */ + if (test_bit(ATM_VF_SESSION,&vcc->flags)) { + error = -EINVAL; + goto out; + } + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(vcc,as_listen,NULL,NULL,&vcc->local); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + if (!sigd) { + error = -EUNATCH; + goto out; + } + set_bit(ATM_VF_LISTEN,&vcc->flags); + sk->sk_max_ack_backlog = backlog > 0 ? 
backlog : ATM_BACKLOG_DEFAULT; + error = -sk->sk_err; +out: + release_sock(sk); + return error; +} + + +static int svc_accept(struct socket *sock,struct socket *newsock,int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + struct atmsvc_msg *msg; + struct atm_vcc *old_vcc = ATM_SD(sock); + struct atm_vcc *new_vcc; + int error; + + lock_sock(sk); + + error = svc_create(newsock,0); + if (error) + goto out; + + new_vcc = ATM_SD(newsock); + + DPRINTK("svc_accept %p -> %p\n",old_vcc,new_vcc); + while (1) { + DEFINE_WAIT(wait); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + while (!(skb = skb_dequeue(&sk->sk_receive_queue)) && + sigd) { + if (test_bit(ATM_VF_RELEASED,&old_vcc->flags)) break; + if (test_bit(ATM_VF_CLOSE,&old_vcc->flags)) { + error = -sk->sk_err; + break; + } + if (flags & O_NONBLOCK) { + error = -EAGAIN; + break; + } + release_sock(sk); + schedule(); + lock_sock(sk); + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + if (error) + goto out; + if (!skb) { + error = -EUNATCH; + goto out; + } + msg = (struct atmsvc_msg *) skb->data; + new_vcc->qos = msg->qos; + set_bit(ATM_VF_HASQOS,&new_vcc->flags); + new_vcc->remote = msg->svc; + new_vcc->local = msg->local; + new_vcc->sap = msg->sap; + error = vcc_connect(newsock, msg->pvc.sap_addr.itf, + msg->pvc.sap_addr.vpi, msg->pvc.sap_addr.vci); + dev_kfree_skb(skb); + sk->sk_ack_backlog--; + if (error) { + sigd_enq2(NULL,as_reject,old_vcc,NULL,NULL, + &old_vcc->qos,error); + error = error == -EAGAIN ? -EBUSY : error; + goto out; + } + /* wait should be short, so we ignore the non-blocking flag */ + set_bit(ATM_VF_WAITING, &new_vcc->flags); + prepare_to_wait(sk_atm(new_vcc)->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + sigd_enq(new_vcc,as_accept,old_vcc,NULL,NULL); + while (test_bit(ATM_VF_WAITING, &new_vcc->flags) && sigd) { + release_sock(sk); + schedule(); + lock_sock(sk); + prepare_to_wait(sk_atm(new_vcc)->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk_atm(new_vcc)->sk_sleep, &wait); + if (!sigd) { + error = -EUNATCH; + goto out; + } + if (!sk_atm(new_vcc)->sk_err) + break; + if (sk_atm(new_vcc)->sk_err != ERESTARTSYS) { + error = -sk_atm(new_vcc)->sk_err; + goto out; + } + } + newsock->state = SS_CONNECTED; +out: + release_sock(sk); + return error; +} + + +static int svc_getname(struct socket *sock,struct sockaddr *sockaddr, + int *sockaddr_len,int peer) +{ + struct sockaddr_atmsvc *addr; + + *sockaddr_len = sizeof(struct sockaddr_atmsvc); + addr = (struct sockaddr_atmsvc *) sockaddr; + memcpy(addr,peer ? 
&ATM_SD(sock)->remote : &ATM_SD(sock)->local, + sizeof(struct sockaddr_atmsvc)); + return 0; +} + + +int svc_change_qos(struct atm_vcc *vcc,struct atm_qos *qos) +{ + struct sock *sk = sk_atm(vcc); + DEFINE_WAIT(wait); + + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + sigd_enq2(vcc,as_modify,NULL,NULL,&vcc->local,qos,0); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && + !test_bit(ATM_VF_RELEASED, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk->sk_sleep, &wait, TASK_UNINTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + if (!sigd) return -EUNATCH; + return -sk->sk_err; +} + + +static int svc_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int value, error = 0; + + lock_sock(sk); + switch (optname) { + case SO_ATMSAP: + if (level != SOL_ATM || optlen != sizeof(struct atm_sap)) { + error = -EINVAL; + goto out; + } + if (copy_from_user(&vcc->sap, optval, optlen)) { + error = -EFAULT; + goto out; + } + set_bit(ATM_VF_HASSAP, &vcc->flags); + break; + case SO_MULTIPOINT: + if (level != SOL_ATM || optlen != sizeof(int)) { + error = -EINVAL; + goto out; + } + if (get_user(value, (int __user *) optval)) { + error = -EFAULT; + goto out; + } + if (value == 1) { + set_bit(ATM_VF_SESSION, &vcc->flags); + } else if (value == 0) { + clear_bit(ATM_VF_SESSION, &vcc->flags); + } else { + error = -EINVAL; + } + break; + default: + error = vcc_setsockopt(sock, level, optname, + optval, optlen); + } + +out: + release_sock(sk); + return error; +} + + +static int svc_getsockopt(struct socket *sock,int level,int optname, + char __user *optval,int __user *optlen) +{ + struct sock *sk = sock->sk; + int error = 0, len; + + lock_sock(sk); + if (!__SO_LEVEL_MATCH(optname, level) || optname != SO_ATMSAP) { + error = vcc_getsockopt(sock, level, optname, optval, optlen); + goto out; + } + if (get_user(len, optlen)) { + error = -EFAULT; + goto out; + } + if (len != sizeof(struct atm_sap)) { + error = -EINVAL; + goto out; + } + if (copy_to_user(optval, &ATM_SD(sock)->sap, sizeof(struct atm_sap))) { + error = -EFAULT; + goto out; + } +out: + release_sock(sk); + return error; +} + + +static int svc_addparty(struct socket *sock, struct sockaddr *sockaddr, + int sockaddr_len, int flags) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + lock_sock(sk); + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + sigd_enq(vcc, as_addparty, NULL, NULL, + (struct sockaddr_atmsvc *) sockaddr); + if (flags & O_NONBLOCK) { + finish_wait(sk->sk_sleep, &wait); + error = -EINPROGRESS; + goto out; + } + DPRINTK("svc_addparty added wait queue\n"); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + error = xchg(&sk->sk_err_soft, 0); +out: + release_sock(sk); + return error; +} + + +static int svc_dropparty(struct socket *sock, int ep_ref) +{ + DEFINE_WAIT(wait); + struct sock *sk = sock->sk; + struct atm_vcc *vcc = ATM_SD(sock); + int error; + + lock_sock(sk); + set_bit(ATM_VF_WAITING, &vcc->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + sigd_enq2(vcc, as_dropparty, NULL, NULL, NULL, NULL, ep_ref); + while (test_bit(ATM_VF_WAITING, &vcc->flags) && sigd) { + schedule(); + prepare_to_wait(sk->sk_sleep, &wait, 
TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + if (!sigd) { + error = -EUNATCH; + goto out; + } + error = xchg(&sk->sk_err_soft, 0); +out: + release_sock(sk); + return error; +} + + +static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int error, ep_ref; + struct sockaddr_atmsvc sa; + struct atm_vcc *vcc = ATM_SD(sock); + + switch (cmd) { + case ATM_ADDPARTY: + if (!test_bit(ATM_VF_SESSION, &vcc->flags)) + return -EINVAL; + if (copy_from_user(&sa, (void __user *) arg, sizeof(sa))) + return -EFAULT; + error = svc_addparty(sock, (struct sockaddr *) &sa, sizeof(sa), 0); + break; + case ATM_DROPPARTY: + if (!test_bit(ATM_VF_SESSION, &vcc->flags)) + return -EINVAL; + if (copy_from_user(&ep_ref, (void __user *) arg, sizeof(int))) + return -EFAULT; + error = svc_dropparty(sock, ep_ref); + break; + default: + error = vcc_ioctl(sock, cmd, arg); + } + + return error; +} + +static struct proto_ops svc_proto_ops = { + .family = PF_ATMSVC, + .owner = THIS_MODULE, + + .release = svc_release, + .bind = svc_bind, + .connect = svc_connect, + .socketpair = sock_no_socketpair, + .accept = svc_accept, + .getname = svc_getname, + .poll = vcc_poll, + .ioctl = svc_ioctl, + .listen = svc_listen, + .shutdown = svc_shutdown, + .setsockopt = svc_setsockopt, + .getsockopt = svc_getsockopt, + .sendmsg = vcc_sendmsg, + .recvmsg = vcc_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + + +static int svc_create(struct socket *sock,int protocol) +{ + int error; + + sock->ops = &svc_proto_ops; + error = vcc_create(sock, protocol, AF_ATMSVC); + if (error) return error; + ATM_SD(sock)->local.sas_family = AF_ATMSVC; + ATM_SD(sock)->remote.sas_family = AF_ATMSVC; + return 0; +} + + +static struct net_proto_family svc_family_ops = { + .family = PF_ATMSVC, + .create = svc_create, + .owner = THIS_MODULE, +}; + + +/* + * Initialize the ATM SVC protocol family + */ + +int __init atmsvc_init(void) +{ + return sock_register(&svc_family_ops); +} + +void atmsvc_exit(void) +{ + sock_unregister(PF_ATMSVC); +} diff --git a/net/ax25/Kconfig b/net/ax25/Kconfig new file mode 100644 index 000000000000..a8993a041724 --- /dev/null +++ b/net/ax25/Kconfig @@ -0,0 +1,110 @@ +# +# Amateur Radio protocols and AX.25 device configuration +# +# 19971130 Now in an own category to make correct compilation of the +# AX.25 stuff easier... +# Joerg Reuter DL1BKE +# 19980129 Moved to net/ax25/Config.in, sourcing device drivers. + +menuconfig HAMRADIO + depends on NET + bool "Amateur Radio support" + help + If you want to connect your Linux box to an amateur radio, answer Y + here. You want to read and + the AX25-HOWTO, available from . + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about amateur radio. + +comment "Packet Radio protocols" + depends on HAMRADIO && NET + +config AX25 + tristate "Amateur Radio AX.25 Level 2 protocol" + depends on HAMRADIO && NET + ---help--- + This is the protocol used for computer communication over amateur + radio. It is either used by itself for point-to-point links, or to + carry other protocols such as tcp/ip. To use it, you need a device + that connects your Linux box to your amateur radio. 
You can either + use a low speed TNC (a Terminal Node Controller acts as a kind of + modem connecting your computer's serial port to your radio's + microphone input and speaker output) supporting the KISS protocol or + one of the various SCC cards that are supported by the generic Z8530 + or the DMA SCC driver. Another option are the Baycom modem serial + and parallel port hacks or the sound card modem (supported by their + own drivers). If you say Y here, you also have to say Y to one of + those drivers. + + Information about where to get supporting software for Linux amateur + radio as well as information about how to configure an AX.25 port is + contained in the AX25-HOWTO, available from + . You might also want to + check out the file in the + kernel source. More information about digital amateur radio in + general is on the WWW at + . + + To compile this driver as a module, choose M here: the + module will be called ax25. + +config AX25_DAMA_SLAVE + bool "AX.25 DAMA Slave support" + depends on AX25 + help + DAMA is a mechanism to prevent collisions when doing AX.25 + networking. A DAMA server (called "master") accepts incoming traffic + from clients (called "slaves") and redistributes it to other slaves. + If you say Y here, your Linux box will act as a DAMA slave; this is + transparent in that you don't have to do any special DAMA + configuration. (Linux cannot yet act as a DAMA server.) If unsure, + say N. + +# bool ' AX.25 DAMA Master support' CONFIG_AX25_DAMA_MASTER +config NETROM + tristate "Amateur Radio NET/ROM protocol" + depends on AX25 + ---help--- + NET/ROM is a network layer protocol on top of AX.25 useful for + routing. + + A comprehensive listing of all the software for Linux amateur radio + users as well as information about how to configure an AX.25 port is + contained in the AX25-HOWTO, available from + . You also might want to + check out the file . More + information about digital amateur radio in general is on the WWW at + . + + To compile this driver as a module, choose M here: the + module will be called netrom. + +config ROSE + tristate "Amateur Radio X.25 PLP (Rose)" + depends on AX25 + ---help--- + The Packet Layer Protocol (PLP) is a way to route packets over X.25 + connections in general and amateur radio AX.25 connections in + particular, essentially an alternative to NET/ROM. + + A comprehensive listing of all the software for Linux amateur radio + users as well as information about how to configure an AX.25 port is + contained in the AX25-HOWTO, available from + . You also might want to + check out the file . More + information about digital amateur radio in general is on the WWW at + . + + To compile this driver as a module, choose M here: the + module will be called rose. + + +menu "AX.25 network device drivers" + depends on HAMRADIO && NET && AX25!=n + +source "drivers/net/hamradio/Kconfig" + +endmenu + diff --git a/net/ax25/Makefile b/net/ax25/Makefile new file mode 100644 index 000000000000..43c46d2cafb6 --- /dev/null +++ b/net/ax25/Makefile @@ -0,0 +1,11 @@ +# +# Makefile for the Linux AX.25 layer. 
+# + +obj-$(CONFIG_AX25) += ax25.o + +ax25-y := ax25_addr.o ax25_dev.o ax25_iface.o ax25_in.o ax25_ip.o ax25_out.o \ + ax25_route.o ax25_std_in.o ax25_std_subr.o ax25_std_timer.o \ + ax25_subr.o ax25_timer.o ax25_uid.o af_ax25.o +ax25-$(CONFIG_AX25_DAMA_SLAVE) += ax25_ds_in.o ax25_ds_subr.o ax25_ds_timer.o +ax25-$(CONFIG_SYSCTL) += sysctl_net_ax25.o diff --git a/net/ax25/TODO b/net/ax25/TODO new file mode 100644 index 000000000000..4089c49e45cc --- /dev/null +++ b/net/ax25/TODO @@ -0,0 +1,24 @@ +Do the ax25_list_lock, ax25_dev_lock, linkfail_lockreally, ax25_frag_lock and +listen_lock have to be bh-safe? + +Do the netrom and rose locks have to be bh-safe? + +A device might be deleted after lookup in the SIOCADDRT ioctl but before it's +being used. + +Routes to a device being taken down might be deleted by ax25_rt_device_down +but added by somebody else before the device has been deleted fully. + +Massive amounts of lock_kernel / unlock_kernel are just a temporary solution to +get around the removal of SOCKOPS_WRAP. A serious locking strategy has to be +implemented. + +The ax25_rt_find_route synopsys is pervert but I somehow had to deal with +the race caused by the static variable in it's previous implementation. + +Implement proper socket locking in netrom and rose. + +Check socket locking when ax25_rcv is sending to raw sockets. In particular +ax25_send_to_raw() seems fishy. Heck - ax25_rcv is fishy. + +Handle XID and TEST frames properly. diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c new file mode 100644 index 000000000000..33b1a3763027 --- /dev/null +++ b/net/ax25/af_ax25.c @@ -0,0 +1,2050 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk) + * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + * Copyright (C) Hans Alblas PE1AYX (hans@esrac.ele.tue.nl) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + + +HLIST_HEAD(ax25_list); +DEFINE_SPINLOCK(ax25_list_lock); + +static struct proto_ops ax25_proto_ops; + +static void ax25_free_sock(struct sock *sk) +{ + ax25_cb_put(ax25_sk(sk)); +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void ax25_cb_del(ax25_cb *ax25) +{ + if (!hlist_unhashed(&ax25->ax25_node)) { + spin_lock_bh(&ax25_list_lock); + hlist_del_init(&ax25->ax25_node); + spin_unlock_bh(&ax25_list_lock); + ax25_cb_put(ax25); + } +} + +/* + * Kill all bound sockets on a dropped device. 
+ */ +static void ax25_kill_by_device(struct net_device *dev) +{ + ax25_dev *ax25_dev; + ax25_cb *s; + struct hlist_node *node; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->ax25_dev == ax25_dev) { + s->ax25_dev = NULL; + ax25_disconnect(s, ENETUNREACH); + } + } + spin_unlock_bh(&ax25_list_lock); +} + +/* + * Handle device status changes. + */ +static int ax25_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + /* Reject non AX.25 devices */ + if (dev->type != ARPHRD_AX25) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + ax25_dev_device_up(dev); + break; + case NETDEV_DOWN: + ax25_kill_by_device(dev); + ax25_rt_device_down(dev); + ax25_dev_device_down(dev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +void ax25_cb_add(ax25_cb *ax25) +{ + spin_lock_bh(&ax25_list_lock); + ax25_cb_hold(ax25); + hlist_add_head(&ax25->ax25_node, &ax25_list); + spin_unlock_bh(&ax25_list_lock); +} + +/* + * Find a socket that wants to accept the SABM we have just + * received. + */ +struct sock *ax25_find_listener(ax25_address *addr, int digi, + struct net_device *dev, int type) +{ + ax25_cb *s; + struct hlist_node *node; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if ((s->iamdigi && !digi) || (!s->iamdigi && digi)) + continue; + if (s->sk && !ax25cmp(&s->source_addr, addr) && + s->sk->sk_type == type && s->sk->sk_state == TCP_LISTEN) { + /* If device is null we match any device */ + if (s->ax25_dev == NULL || s->ax25_dev->dev == dev) { + sock_hold(s->sk); + spin_unlock_bh(&ax25_list_lock); + return s->sk; + } + } + } + spin_unlock_bh(&ax25_list_lock); + + return NULL; +} + +/* + * Find an AX.25 socket given both ends. + */ +struct sock *ax25_get_socket(ax25_address *my_addr, ax25_address *dest_addr, + int type) +{ + struct sock *sk = NULL; + ax25_cb *s; + struct hlist_node *node; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->sk && !ax25cmp(&s->source_addr, my_addr) && + !ax25cmp(&s->dest_addr, dest_addr) && + s->sk->sk_type == type) { + sk = s->sk; + sock_hold(sk); + break; + } + } + + spin_unlock_bh(&ax25_list_lock); + + return sk; +} + +/* + * Find an AX.25 control block given both ends. It will only pick up + * floating AX.25 control blocks or non Raw socket bound control blocks. 
+ */ +ax25_cb *ax25_find_cb(ax25_address *src_addr, ax25_address *dest_addr, + ax25_digi *digi, struct net_device *dev) +{ + ax25_cb *s; + struct hlist_node *node; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->sk && s->sk->sk_type != SOCK_SEQPACKET) + continue; + if (s->ax25_dev == NULL) + continue; + if (ax25cmp(&s->source_addr, src_addr) == 0 && ax25cmp(&s->dest_addr, dest_addr) == 0 && s->ax25_dev->dev == dev) { + if (digi != NULL && digi->ndigi != 0) { + if (s->digipeat == NULL) + continue; + if (ax25digicmp(s->digipeat, digi) != 0) + continue; + } else { + if (s->digipeat != NULL && s->digipeat->ndigi != 0) + continue; + } + ax25_cb_hold(s); + spin_unlock_bh(&ax25_list_lock); + + return s; + } + } + spin_unlock_bh(&ax25_list_lock); + + return NULL; +} + +void ax25_send_to_raw(ax25_address *addr, struct sk_buff *skb, int proto) +{ + ax25_cb *s; + struct sk_buff *copy; + struct hlist_node *node; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(s, node, &ax25_list) { + if (s->sk != NULL && ax25cmp(&s->source_addr, addr) == 0 && + s->sk->sk_type == SOCK_RAW && + s->sk->sk_protocol == proto && + s->ax25_dev->dev == skb->dev && + atomic_read(&s->sk->sk_rmem_alloc) <= s->sk->sk_rcvbuf) { + if ((copy = skb_clone(skb, GFP_ATOMIC)) == NULL) + continue; + if (sock_queue_rcv_skb(s->sk, copy) != 0) + kfree_skb(copy); + } + } + spin_unlock_bh(&ax25_list_lock); +} + +/* + * Deferred destroy. + */ +void ax25_destroy_socket(ax25_cb *); + +/* + * Handler for deferred kills. + */ +static void ax25_destroy_timer(unsigned long data) +{ + ax25_cb *ax25=(ax25_cb *)data; + struct sock *sk; + + sk=ax25->sk; + + bh_lock_sock(sk); + sock_hold(sk); + ax25_destroy_socket(ax25); + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. + */ +void ax25_destroy_socket(ax25_cb *ax25) +{ + struct sk_buff *skb; + + ax25_cb_del(ax25); + + ax25_stop_heartbeat(ax25); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + + ax25_clear_queues(ax25); /* Flush the queues */ + + if (ax25->sk != NULL) { + while ((skb = skb_dequeue(&ax25->sk->sk_receive_queue)) != NULL) { + if (skb->sk != ax25->sk) { + /* A pending connection */ + ax25_cb *sax25 = ax25_sk(skb->sk); + + /* Queue the unaccepted socket for death */ + sock_orphan(skb->sk); + + ax25_start_heartbeat(sax25); + sax25->state = AX25_STATE_0; + } + + kfree_skb(skb); + } + skb_queue_purge(&ax25->sk->sk_write_queue); + } + + if (ax25->sk != NULL) { + if (atomic_read(&ax25->sk->sk_wmem_alloc) || + atomic_read(&ax25->sk->sk_rmem_alloc)) { + /* Defer: outstanding buffers */ + init_timer(&ax25->dtimer); + ax25->dtimer.expires = jiffies + 2 * HZ; + ax25->dtimer.function = ax25_destroy_timer; + ax25->dtimer.data = (unsigned long)ax25; + add_timer(&ax25->dtimer); + } else { + struct sock *sk=ax25->sk; + ax25->sk=NULL; + sock_put(sk); + } + } else { + ax25_cb_put(ax25); + } +} + +/* + * dl1bke 960311: set parameters for existing AX.25 connections, + * includes a KILL command to abort any connection. 
+ * VERY useful for debugging ;-) + */ +static int ax25_ctl_ioctl(const unsigned int cmd, void __user *arg) +{ + struct ax25_ctl_struct ax25_ctl; + ax25_digi digi; + ax25_dev *ax25_dev; + ax25_cb *ax25; + unsigned int k; + + if (copy_from_user(&ax25_ctl, arg, sizeof(ax25_ctl))) + return -EFAULT; + + if ((ax25_dev = ax25_addr_ax25dev(&ax25_ctl.port_addr)) == NULL) + return -ENODEV; + + if (ax25_ctl.digi_count > AX25_MAX_DIGIS) + return -EINVAL; + + digi.ndigi = ax25_ctl.digi_count; + for (k = 0; k < digi.ndigi; k++) + digi.calls[k] = ax25_ctl.digi_addr[k]; + + if ((ax25 = ax25_find_cb(&ax25_ctl.source_addr, &ax25_ctl.dest_addr, &digi, ax25_dev->dev)) == NULL) + return -ENOTCONN; + + switch (ax25_ctl.cmd) { + case AX25_KILL: + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); +#ifdef CONFIG_AX25_DAMA_SLAVE + if (ax25_dev->dama.slave && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE) + ax25_dama_off(ax25); +#endif + ax25_disconnect(ax25, ENETRESET); + break; + + case AX25_WINDOW: + if (ax25->modulus == AX25_MODULUS) { + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 7) + return -EINVAL; + } else { + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 63) + return -EINVAL; + } + ax25->window = ax25_ctl.arg; + break; + + case AX25_T1: + if (ax25_ctl.arg < 1) + return -EINVAL; + ax25->rtt = (ax25_ctl.arg * HZ) / 2; + ax25->t1 = ax25_ctl.arg * HZ; + break; + + case AX25_T2: + if (ax25_ctl.arg < 1) + return -EINVAL; + ax25->t2 = ax25_ctl.arg * HZ; + break; + + case AX25_N2: + if (ax25_ctl.arg < 1 || ax25_ctl.arg > 31) + return -EINVAL; + ax25->n2count = 0; + ax25->n2 = ax25_ctl.arg; + break; + + case AX25_T3: + if (ax25_ctl.arg < 0) + return -EINVAL; + ax25->t3 = ax25_ctl.arg * HZ; + break; + + case AX25_IDLE: + if (ax25_ctl.arg < 0) + return -EINVAL; + ax25->idle = ax25_ctl.arg * 60 * HZ; + break; + + case AX25_PACLEN: + if (ax25_ctl.arg < 16 || ax25_ctl.arg > 65535) + return -EINVAL; + ax25->paclen = ax25_ctl.arg; + break; + + default: + return -EINVAL; + } + + return 0; +} + +/* + * Fill in a newly created AX.25 control block with the default + * values for a particular device. + */ +void ax25_fillin_cb(ax25_cb *ax25, ax25_dev *ax25_dev) +{ + ax25->ax25_dev = ax25_dev; + + if (ax25->ax25_dev != NULL) { + ax25->rtt = ax25_dev->values[AX25_VALUES_T1] / 2; + ax25->t1 = ax25_dev->values[AX25_VALUES_T1]; + ax25->t2 = ax25_dev->values[AX25_VALUES_T2]; + ax25->t3 = ax25_dev->values[AX25_VALUES_T3]; + ax25->n2 = ax25_dev->values[AX25_VALUES_N2]; + ax25->paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + ax25->idle = ax25_dev->values[AX25_VALUES_IDLE]; + ax25->backoff = ax25_dev->values[AX25_VALUES_BACKOFF]; + + if (ax25_dev->values[AX25_VALUES_AXDEFMODE]) { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; + } + } else { + ax25->rtt = AX25_DEF_T1 / 2; + ax25->t1 = AX25_DEF_T1; + ax25->t2 = AX25_DEF_T2; + ax25->t3 = AX25_DEF_T3; + ax25->n2 = AX25_DEF_N2; + ax25->paclen = AX25_DEF_PACLEN; + ax25->idle = AX25_DEF_IDLE; + ax25->backoff = AX25_DEF_BACKOFF; + + if (AX25_DEF_AXDEFMODE) { + ax25->modulus = AX25_EMODULUS; + ax25->window = AX25_DEF_EWINDOW; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = AX25_DEF_WINDOW; + } + } +} + +/* + * Create an empty AX.25 control block.
+ */ +ax25_cb *ax25_create_cb(void) +{ + ax25_cb *ax25; + + if ((ax25 = kmalloc(sizeof(*ax25), GFP_ATOMIC)) == NULL) + return NULL; + + memset(ax25, 0x00, sizeof(*ax25)); + atomic_set(&ax25->refcount, 1); + + skb_queue_head_init(&ax25->write_queue); + skb_queue_head_init(&ax25->frag_queue); + skb_queue_head_init(&ax25->ack_queue); + skb_queue_head_init(&ax25->reseq_queue); + + init_timer(&ax25->timer); + init_timer(&ax25->t1timer); + init_timer(&ax25->t2timer); + init_timer(&ax25->t3timer); + init_timer(&ax25->idletimer); + + ax25_fillin_cb(ax25, NULL); + + ax25->state = AX25_STATE_0; + + return ax25; +} + +/* + * Handling for system calls applied via the various interfaces to an + * AX25 socket object + */ + +static int ax25_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25; + struct net_device *dev; + char devname[IFNAMSIZ]; + int opt, res = 0; + + if (level != SOL_AX25) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + ax25 = ax25_sk(sk); + + switch (optname) { + case AX25_WINDOW: + if (ax25->modulus == AX25_MODULUS) { + if (opt < 1 || opt > 7) { + res = -EINVAL; + break; + } + } else { + if (opt < 1 || opt > 63) { + res = -EINVAL; + break; + } + } + ax25->window = opt; + break; + + case AX25_T1: + if (opt < 1) { + res = -EINVAL; + break; + } + ax25->rtt = (opt * HZ) / 2; + ax25->t1 = opt * HZ; + break; + + case AX25_T2: + if (opt < 1) { + res = -EINVAL; + break; + } + ax25->t2 = opt * HZ; + break; + + case AX25_N2: + if (opt < 1 || opt > 31) { + res = -EINVAL; + break; + } + ax25->n2 = opt; + break; + + case AX25_T3: + if (opt < 1) { + res = -EINVAL; + break; + } + ax25->t3 = opt * HZ; + break; + + case AX25_IDLE: + if (opt < 0) { + res = -EINVAL; + break; + } + ax25->idle = opt * 60 * HZ; + break; + + case AX25_BACKOFF: + if (opt < 0 || opt > 2) { + res = -EINVAL; + break; + } + ax25->backoff = opt; + break; + + case AX25_EXTSEQ: + ax25->modulus = opt ? AX25_EMODULUS : AX25_MODULUS; + break; + + case AX25_PIDINCL: + ax25->pidincl = opt ? 1 : 0; + break; + + case AX25_IAMDIGI: + ax25->iamdigi = opt ? 
1 : 0; + break; + + case AX25_PACLEN: + if (opt < 16 || opt > 65535) { + res = -EINVAL; + break; + } + ax25->paclen = opt; + break; + + case SO_BINDTODEVICE: + if (optlen > IFNAMSIZ) + optlen=IFNAMSIZ; + if (copy_from_user(devname, optval, optlen)) { + res = -EFAULT; + break; + } + + dev = dev_get_by_name(devname); + if (dev == NULL) { + res = -ENODEV; + break; + } + + if (sk->sk_type == SOCK_SEQPACKET && + (sock->state != SS_UNCONNECTED || + sk->sk_state == TCP_LISTEN)) { + res = -EADDRNOTAVAIL; + dev_put(dev); + break; + } + + ax25->ax25_dev = ax25_dev_ax25dev(dev); + ax25_fillin_cb(ax25, ax25->ax25_dev); + break; + + default: + res = -ENOPROTOOPT; + } + release_sock(sk); + + return res; +} + +static int ax25_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25; + struct ax25_dev *ax25_dev; + char devname[IFNAMSIZ]; + void *valptr; + int val = 0; + int maxlen, length; + + if (level != SOL_AX25) + return -ENOPROTOOPT; + + if (get_user(maxlen, optlen)) + return -EFAULT; + + if (maxlen < 1) + return -EFAULT; + + valptr = (void *) &val; + length = min_t(unsigned int, maxlen, sizeof(int)); + + lock_sock(sk); + ax25 = ax25_sk(sk); + + switch (optname) { + case AX25_WINDOW: + val = ax25->window; + break; + + case AX25_T1: + val = ax25->t1 / HZ; + break; + + case AX25_T2: + val = ax25->t2 / HZ; + break; + + case AX25_N2: + val = ax25->n2; + break; + + case AX25_T3: + val = ax25->t3 / HZ; + break; + + case AX25_IDLE: + val = ax25->idle / (60 * HZ); + break; + + case AX25_BACKOFF: + val = ax25->backoff; + break; + + case AX25_EXTSEQ: + val = (ax25->modulus == AX25_EMODULUS); + break; + + case AX25_PIDINCL: + val = ax25->pidincl; + break; + + case AX25_IAMDIGI: + val = ax25->iamdigi; + break; + + case AX25_PACLEN: + val = ax25->paclen; + break; + + case SO_BINDTODEVICE: + ax25_dev = ax25->ax25_dev; + + if (ax25_dev != NULL && ax25_dev->dev != NULL) { + strlcpy(devname, ax25_dev->dev->name, sizeof(devname)); + length = strlen(devname) + 1; + } else { + *devname = '\0'; + length = 1; + } + + valptr = (void *) devname; + break; + + default: + release_sock(sk); + return -ENOPROTOOPT; + } + release_sock(sk); + + if (put_user(length, optlen)) + return -EFAULT; + + return copy_to_user(optval, valptr, length) ? -EFAULT : 0; +} + +static int ax25_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int res = 0; + + lock_sock(sk); + if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_LISTEN) { + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + goto out; + } + res = -EOPNOTSUPP; + +out: + release_sock(sk); + + return res; +} + +/* + * XXX: when creating ax25_sock we should update the .obj_size setting + * below. 
+ */ +static struct proto ax25_proto = { + .name = "AX25", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + +static int ax25_create(struct socket *sock, int protocol) +{ + struct sock *sk; + ax25_cb *ax25; + + switch (sock->type) { + case SOCK_DGRAM: + if (protocol == 0 || protocol == PF_AX25) + protocol = AX25_P_TEXT; + break; + + case SOCK_SEQPACKET: + switch (protocol) { + case 0: + case PF_AX25: /* For CLX */ + protocol = AX25_P_TEXT; + break; + case AX25_P_SEGMENT: +#ifdef CONFIG_INET + case AX25_P_ARP: + case AX25_P_IP: +#endif +#ifdef CONFIG_NETROM + case AX25_P_NETROM: +#endif +#ifdef CONFIG_ROSE + case AX25_P_ROSE: +#endif + return -ESOCKTNOSUPPORT; +#ifdef CONFIG_NETROM_MODULE + case AX25_P_NETROM: + if (ax25_protocol_is_registered(AX25_P_NETROM)) + return -ESOCKTNOSUPPORT; +#endif +#ifdef CONFIG_ROSE_MODULE + case AX25_P_ROSE: + if (ax25_protocol_is_registered(AX25_P_ROSE)) + return -ESOCKTNOSUPPORT; +#endif + default: + break; + } + break; + + case SOCK_RAW: + break; + default: + return -ESOCKTNOSUPPORT; + } + + if ((sk = sk_alloc(PF_AX25, GFP_ATOMIC, &ax25_proto, 1)) == NULL) + return -ENOMEM; + + ax25 = sk->sk_protinfo = ax25_create_cb(); + if (!ax25) { + sk_free(sk); + return -ENOMEM; + } + + sock_init_data(sock, sk); + + sk->sk_destruct = ax25_free_sock; + sock->ops = &ax25_proto_ops; + sk->sk_protocol = protocol; + + ax25->sk = sk; + + return 0; +} + +struct sock *ax25_make_new(struct sock *osk, struct ax25_dev *ax25_dev) +{ + struct sock *sk; + ax25_cb *ax25, *oax25; + + if ((sk = sk_alloc(PF_AX25, GFP_ATOMIC, osk->sk_prot, 1)) == NULL) + return NULL; + + if ((ax25 = ax25_create_cb()) == NULL) { + sk_free(sk); + return NULL; + } + + switch (osk->sk_type) { + case SOCK_DGRAM: + break; + case SOCK_SEQPACKET: + break; + default: + sk_free(sk); + ax25_cb_put(ax25); + return NULL; + } + + sock_init_data(NULL, sk); + + sk->sk_destruct = ax25_free_sock; + sk->sk_type = osk->sk_type; + sk->sk_socket = osk->sk_socket; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_sleep = osk->sk_sleep; + + if (sock_flag(osk, SOCK_DBG)) + sock_set_flag(sk, SOCK_DBG); + + if (sock_flag(osk, SOCK_ZAPPED)) + sock_set_flag(sk, SOCK_ZAPPED); + + oax25 = ax25_sk(osk); + + ax25->modulus = oax25->modulus; + ax25->backoff = oax25->backoff; + ax25->pidincl = oax25->pidincl; + ax25->iamdigi = oax25->iamdigi; + ax25->rtt = oax25->rtt; + ax25->t1 = oax25->t1; + ax25->t2 = oax25->t2; + ax25->t3 = oax25->t3; + ax25->n2 = oax25->n2; + ax25->idle = oax25->idle; + ax25->paclen = oax25->paclen; + ax25->window = oax25->window; + + ax25->ax25_dev = ax25_dev; + ax25->source_addr = oax25->source_addr; + + if (oax25->digipeat != NULL) { + if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + sk_free(sk); + ax25_cb_put(ax25); + return NULL; + } + + memcpy(ax25->digipeat, oax25->digipeat, sizeof(ax25_digi)); + } + + sk->sk_protinfo = ax25; + ax25->sk = sk; + + return sk; +} + +static int ax25_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25; + + if (sk == NULL) + return 0; + + sock_hold(sk); + sock_orphan(sk); + lock_sock(sk); + ax25 = ax25_sk(sk); + + if (sk->sk_type == SOCK_SEQPACKET) { + switch (ax25->state) { + case AX25_STATE_0: + release_sock(sk); + ax25_disconnect(ax25, 0); + lock_sock(sk); + ax25_destroy_socket(ax25); + break; + + case AX25_STATE_1: + case AX25_STATE_2: + ax25_send_control(ax25, AX25_DISC, 
AX25_POLLON, AX25_COMMAND); + release_sock(sk); + ax25_disconnect(ax25, 0); + lock_sock(sk); + ax25_destroy_socket(ax25); + break; + + case AX25_STATE_3: + case AX25_STATE_4: + ax25_clear_queues(ax25); + ax25->n2count = 0; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_send_control(ax25, + AX25_DISC, + AX25_POLLON, + AX25_COMMAND); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + break; +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + break; +#endif + } + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25->state = AX25_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DESTROY); + break; + + default: + break; + } + } else { + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + ax25_destroy_socket(ax25); + } + + sock->sk = NULL; + release_sock(sk); + sock_put(sk); + + return 0; +} + +/* + * We support a funny extension here so you can (as root) give any callsign + * digipeated via a local address as source. This hack is obsolete now + * that we've implemented support for SO_BINDTODEVICE. It is however small + * and trivially backward compatible. + */ +static int ax25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; + ax25_dev *ax25_dev = NULL; + ax25_address *call; + ax25_cb *ax25; + int err = 0; + + if (addr_len != sizeof(struct sockaddr_ax25) && + addr_len != sizeof(struct full_sockaddr_ax25)) { + /* support for old structure may go away some time */ + if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || + (addr_len > sizeof(struct full_sockaddr_ax25))) { + return -EINVAL; + } + + printk(KERN_WARNING "ax25_bind(): %s uses old (6 digipeater) socket structure.\n", + current->comm); + } + + if (addr->fsa_ax25.sax25_family != AF_AX25) + return -EINVAL; + + call = ax25_findbyuid(current->euid); + if (call == NULL && ax25_uid_policy && !capable(CAP_NET_ADMIN)) { + return -EACCES; + } + + lock_sock(sk); + + ax25 = ax25_sk(sk); + if (!sock_flag(sk, SOCK_ZAPPED)) { + err = -EINVAL; + goto out; + } + + if (call == NULL) + ax25->source_addr = addr->fsa_ax25.sax25_call; + else + ax25->source_addr = *call; + + /* + * User already set interface with SO_BINDTODEVICE + */ + if (ax25->ax25_dev != NULL) + goto done; + + if (addr_len > sizeof(struct sockaddr_ax25) && addr->fsa_ax25.sax25_ndigis == 1) { + if (ax25cmp(&addr->fsa_digipeater[0], &null_ax25_address) != 0 && + (ax25_dev = ax25_addr_ax25dev(&addr->fsa_digipeater[0])) == NULL) { + err = -EADDRNOTAVAIL; + goto out; + } + } else { + if ((ax25_dev = ax25_addr_ax25dev(&addr->fsa_ax25.sax25_call)) == NULL) { + err = -EADDRNOTAVAIL; + goto out; + } + } + + if (ax25_dev != NULL) + ax25_fillin_cb(ax25, ax25_dev); + +done: + ax25_cb_add(ax25); + sock_reset_flag(sk, SOCK_ZAPPED); + +out: + release_sock(sk); + + return 0; +} + +/* + * FIXME: nonblock behaviour looks like it may have a bug. + */ +static int ax25_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + ax25_cb *ax25 = ax25_sk(sk), *ax25t; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; + ax25_digi *digi = NULL; + int ct = 0, err = 0; + + /* + * some sanity checks. 
code further down depends on this + */ + + if (addr_len == sizeof(struct sockaddr_ax25)) { + /* support for this will go away in early 2.5.x */ + printk(KERN_WARNING "ax25_connect(): %s uses obsolete socket structure\n", + current->comm); + } + else if (addr_len != sizeof(struct full_sockaddr_ax25)) { + /* support for old structure may go away some time */ + if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || + (addr_len > sizeof(struct full_sockaddr_ax25))) { + return -EINVAL; + } + + printk(KERN_WARNING "ax25_connect(): %s uses old (6 digipeater) socket structure.\n", + current->comm); + } + + if (fsa->fsa_ax25.sax25_family != AF_AX25) + return -EINVAL; + + lock_sock(sk); + + /* deal with restarts */ + if (sock->state == SS_CONNECTING) { + switch (sk->sk_state) { + case TCP_SYN_SENT: /* still trying */ + err = -EINPROGRESS; + goto out; + + case TCP_ESTABLISHED: /* connection established */ + sock->state = SS_CONNECTED; + goto out; + + case TCP_CLOSE: /* connection refused */ + sock->state = SS_UNCONNECTED; + err = -ECONNREFUSED; + goto out; + } + } + + if (sk->sk_state == TCP_ESTABLISHED && sk->sk_type == SOCK_SEQPACKET) { + err = -EISCONN; /* No reconnect on a seqpacket socket */ + goto out; + } + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (ax25->digipeat != NULL) { + kfree(ax25->digipeat); + ax25->digipeat = NULL; + } + + /* + * Handle digi-peaters to be used. + */ + if (addr_len > sizeof(struct sockaddr_ax25) && + fsa->fsa_ax25.sax25_ndigis != 0) { + /* Valid number of digipeaters ? */ + if (fsa->fsa_ax25.sax25_ndigis < 1 || fsa->fsa_ax25.sax25_ndigis > AX25_MAX_DIGIS) { + err = -EINVAL; + goto out; + } + + if ((digi = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { + err = -ENOBUFS; + goto out; + } + + digi->ndigi = fsa->fsa_ax25.sax25_ndigis; + digi->lastrepeat = -1; + + while (ct < fsa->fsa_ax25.sax25_ndigis) { + if ((fsa->fsa_digipeater[ct].ax25_call[6] & + AX25_HBIT) && ax25->iamdigi) { + digi->repeated[ct] = 1; + digi->lastrepeat = ct; + } else { + digi->repeated[ct] = 0; + } + digi->calls[ct] = fsa->fsa_digipeater[ct]; + ct++; + } + } + + /* + * Must bind first - autobinding in this may or may not work. If + * the socket is already bound, check to see if the device has + * been filled in, error if it hasn't. + */ + if (sock_flag(sk, SOCK_ZAPPED)) { + /* check if we can remove this feature. It is broken. */ + printk(KERN_WARNING "ax25_connect(): %s uses autobind, please contact jreuter@yaina.de\n", + current->comm); + if ((err = ax25_rt_autobind(ax25, &fsa->fsa_ax25.sax25_call)) < 0) { + kfree(digi); + goto out; + } + + ax25_fillin_cb(ax25, ax25->ax25_dev); + ax25_cb_add(ax25); + } else { + if (ax25->ax25_dev == NULL) { + kfree(digi); + err = -EHOSTUNREACH; + goto out; + } + } + + if (sk->sk_type == SOCK_SEQPACKET && + (ax25t=ax25_find_cb(&ax25->source_addr, &fsa->fsa_ax25.sax25_call, digi, + ax25->ax25_dev->dev))) { + kfree(digi); + err = -EADDRINUSE; /* Already such a connection */ + ax25_cb_put(ax25t); + goto out; + } + + ax25->dest_addr = fsa->fsa_ax25.sax25_call; + ax25->digipeat = digi; + + /* First the easy one */ + if (sk->sk_type != SOCK_SEQPACKET) { + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + goto out; + } + + /* Move to connecting socket, ax.25 lapb WAIT_UA.. 
*/ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_establish_data_link(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + if (ax25->ax25_dev->dama.slave) + ax25_ds_establish_data_link(ax25); + else + ax25_std_establish_data_link(ax25); + break; +#endif + } + + ax25->state = AX25_STATE_1; + + ax25_start_heartbeat(ax25); + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { + err = -EINPROGRESS; + goto out; + } + + if (sk->sk_state == TCP_SYN_SENT) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + if (sk->sk_state != TCP_SYN_SENT) + break; + set_current_state(TASK_INTERRUPTIBLE); + release_sock(sk); + if (!signal_pending(tsk)) { + schedule(); + lock_sock(sk); + continue; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -ERESTARTSYS; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + } + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Not in ABM, not in WAIT_UA -> failed */ + sock->state = SS_UNCONNECTED; + err = sock_error(sk); /* Always set at this point */ + goto out; + } + + sock->state = SS_CONNECTED; + + err=0; +out: + release_sock(sk); + + return err; +} + + +static int ax25_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + struct sk_buff *skb; + struct sock *newsk; + struct sock *sk; + int err = 0; + + if (sock->state != SS_UNCONNECTED) + return -EINVAL; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EOPNOTSUPP; + goto out; + } + + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto out; + } + + /* + * The read queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + release_sock(sk); + current->state = TASK_INTERRUPTIBLE; + if (flags & O_NONBLOCK) { + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -EWOULDBLOCK; + } + if (!signal_pending(tsk)) { + schedule(); + lock_sock(sk); + continue; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -ERESTARTSYS; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + + newsk = skb->sk; + newsk->sk_socket = newsock; + newsk->sk_sleep = &newsock->wait; + + /* Now attach up the new socket */ + kfree_skb(skb); + sk->sk_ack_backlog--; + newsock->sk = newsk; + newsock->state = SS_CONNECTED; + +out: + release_sock(sk); + + return err; +} + +static int ax25_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)uaddr; + struct sock *sk = sock->sk; + unsigned char ndigi, i; + ax25_cb *ax25; + int err = 0; + + lock_sock(sk); + ax25 = ax25_sk(sk); + + if (peer != 0) { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + + fsa->fsa_ax25.sax25_family = AF_AX25; + fsa->fsa_ax25.sax25_call = ax25->dest_addr; + fsa->fsa_ax25.sax25_ndigis = 0; + + if (ax25->digipeat != NULL) { + ndigi = 
ax25->digipeat->ndigi; + fsa->fsa_ax25.sax25_ndigis = ndigi; + for (i = 0; i < ndigi; i++) + fsa->fsa_digipeater[i] = + ax25->digipeat->calls[i]; + } + } else { + fsa->fsa_ax25.sax25_family = AF_AX25; + fsa->fsa_ax25.sax25_call = ax25->source_addr; + fsa->fsa_ax25.sax25_ndigis = 1; + if (ax25->ax25_dev != NULL) { + memcpy(&fsa->fsa_digipeater[0], + ax25->ax25_dev->dev->dev_addr, AX25_ADDR_LEN); + } else { + fsa->fsa_digipeater[0] = null_ax25_address; + } + } + *uaddr_len = sizeof (struct full_sockaddr_ax25); + +out: + release_sock(sk); + + return err; +} + +static int ax25_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; + struct sock *sk = sock->sk; + struct sockaddr_ax25 sax; + struct sk_buff *skb; + ax25_digi dtmp, *dp; + unsigned char *asmptr; + ax25_cb *ax25; + size_t size; + int lv, err, addr_len = msg->msg_namelen; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + lock_sock(sk); + ax25 = ax25_sk(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) { + err = -EADDRNOTAVAIL; + goto out; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + goto out; + } + + if (ax25->ax25_dev == NULL) { + err = -ENETUNREACH; + goto out; + } + + if (len > ax25->ax25_dev->dev->mtu) { + err = -EMSGSIZE; + goto out; + } + + if (usax != NULL) { + if (usax->sax25_family != AF_AX25) { + err = -EINVAL; + goto out; + } + + if (addr_len == sizeof(struct sockaddr_ax25)) { + printk(KERN_WARNING "ax25_sendmsg(): %s uses obsolete socket structure\n", + current->comm); + } + else if (addr_len != sizeof(struct full_sockaddr_ax25)) { + /* support for old structure may go away some time */ + if ((addr_len < sizeof(struct sockaddr_ax25) + sizeof(ax25_address) * 6) || + (addr_len > sizeof(struct full_sockaddr_ax25))) { + err = -EINVAL; + goto out; + } + + printk(KERN_WARNING "ax25_sendmsg(): %s uses old (6 digipeater) socket structure.\n", + current->comm); + } + + if (addr_len > sizeof(struct sockaddr_ax25) && usax->sax25_ndigis != 0) { + int ct = 0; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)usax; + + /* Valid number of digipeaters ? 
*/ + if (usax->sax25_ndigis < 1 || usax->sax25_ndigis > AX25_MAX_DIGIS) { + err = -EINVAL; + goto out; + } + + dtmp.ndigi = usax->sax25_ndigis; + + while (ct < usax->sax25_ndigis) { + dtmp.repeated[ct] = 0; + dtmp.calls[ct] = fsa->fsa_digipeater[ct]; + ct++; + } + + dtmp.lastrepeat = 0; + } + + sax = *usax; + if (sk->sk_type == SOCK_SEQPACKET && + ax25cmp(&ax25->dest_addr, &sax.sax25_call)) { + err = -EISCONN; + goto out; + } + if (usax->sax25_ndigis == 0) + dp = NULL; + else + dp = &dtmp; + } else { + /* + * FIXME: 1003.1g - if the socket is like this because + * it has become closed (not started closed) and is VC + * we ought to SIGPIPE, EPIPE + */ + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + sax.sax25_family = AF_AX25; + sax.sax25_call = ax25->dest_addr; + dp = ax25->digipeat; + } + + SOCK_DEBUG(sk, "AX.25: sendto: Addresses built.\n"); + + /* Build a packet */ + SOCK_DEBUG(sk, "AX.25: sendto: building packet.\n"); + + /* Assume the worst case */ + size = len + ax25->ax25_dev->dev->hard_header_len; + + skb = sock_alloc_send_skb(sk, size, msg->msg_flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + skb_reserve(skb, size - len); + + SOCK_DEBUG(sk, "AX.25: Appending user data\n"); + + /* User data follows immediately after the AX.25 data */ + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + kfree_skb(skb); + goto out; + } + + skb->nh.raw = skb->data; + + /* Add the PID if one is not supplied by the user in the skb */ + if (!ax25->pidincl) { + asmptr = skb_push(skb, 1); + *asmptr = sk->sk_protocol; + } + + SOCK_DEBUG(sk, "AX.25: Transmitting buffer\n"); + + if (sk->sk_type == SOCK_SEQPACKET) { + /* Connected mode sockets go via the LAPB machine */ + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + err = -ENOTCONN; + goto out; + } + + /* Shove it onto the queue and kick */ + ax25_output(ax25, ax25->paclen, skb); + + err = len; + goto out; + } + + asmptr = skb_push(skb, 1 + ax25_addr_size(dp)); + + SOCK_DEBUG(sk, "Building AX.25 Header (dp=%p).\n", dp); + + if (dp != NULL) + SOCK_DEBUG(sk, "Num digipeaters=%d\n", dp->ndigi); + + /* Build an AX.25 header */ + asmptr += (lv = ax25_addr_build(asmptr, &ax25->source_addr, + &sax.sax25_call, dp, + AX25_COMMAND, AX25_MODULUS)); + + SOCK_DEBUG(sk, "Built header (%d bytes)\n",lv); + + skb->h.raw = asmptr; + + SOCK_DEBUG(sk, "base=%p pos=%p\n", skb->data, asmptr); + + *asmptr = AX25_UI; + + /* Datagram frames go straight out of the door as UI */ + skb->dev = ax25->ax25_dev->dev; + + ax25_queue_xmit(skb); + + err = len; + +out: + release_sock(sk); + + return err; +} + +static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied; + int err = 0; + + lock_sock(sk); + /* + * This works for seqpacket too. The receiver has ordered the + * queue for us! 
We do one quick check first though + */ + if (sk->sk_type == SOCK_SEQPACKET && sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + + /* Now we can treat all alike */ + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + if (!ax25_sk(sk)->pidincl) + skb_pull(skb, 1); /* Remove PID */ + + skb->h.raw = skb->data; + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (msg->msg_namelen != 0) { + struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + ax25_digi digi; + ax25_address src; + + ax25_addr_parse(skb->mac.raw+1, skb->data-skb->mac.raw-1, &src, NULL, &digi, NULL, NULL); + + sax->sax25_family = AF_AX25; + /* We set this correctly, even though we may not let the + application know the digi calls further down (because it + did NOT ask to know them). This could get political... **/ + sax->sax25_ndigis = digi.ndigi; + sax->sax25_call = src; + + if (sax->sax25_ndigis != 0) { + int ct; + struct full_sockaddr_ax25 *fsa = (struct full_sockaddr_ax25 *)sax; + + for (ct = 0; ct < digi.ndigi; ct++) + fsa->fsa_digipeater[ct] = digi.calls[ct]; + } + msg->msg_namelen = sizeof(struct full_sockaddr_ax25); + } + + skb_free_datagram(sk, skb); + err = copied; + +out: + release_sock(sk); + + return err; +} + +static int ax25_shutdown(struct socket *sk, int how) +{ + /* FIXME - generate DM and RNR states */ + return -EOPNOTSUPP; +} + +static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + int res = 0; + + lock_sock(sk); + switch (cmd) { + case TIOCOUTQ: { + long amount; + amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + res = put_user(amount, (int __user *)argp); + break; + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + res = put_user(amount, (int __user *)argp); + break; + } + + case SIOCGSTAMP: + if (sk != NULL) { + res = sock_get_timestamp(sk, argp); + break; + } + res = -EINVAL; + break; + + case SIOCAX25ADDUID: /* Add a uid to the uid/call map table */ + case SIOCAX25DELUID: /* Delete a uid from the uid/call map table */ + case SIOCAX25GETUID: { + struct sockaddr_ax25 sax25; + if (copy_from_user(&sax25, argp, sizeof(sax25))) { + res = -EFAULT; + break; + } + res = ax25_uid_ioctl(cmd, &sax25); + break; + } + + case SIOCAX25NOUID: { /* Set the default policy (default/bar) */ + long amount; + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + if (get_user(amount, (long __user *)argp)) { + res = -EFAULT; + break; + } + if (amount > AX25_NOUID_BLOCK) { + res = -EINVAL; + break; + } + ax25_uid_policy = amount; + res = 0; + break; + } + + case SIOCADDRT: + case SIOCDELRT: + case SIOCAX25OPTRT: + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + res = ax25_rt_ioctl(cmd, argp); + break; + + case SIOCAX25CTLCON: + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + res = ax25_ctl_ioctl(cmd, argp); + break; + + case SIOCAX25GETINFO: + case SIOCAX25GETINFOOLD: { + ax25_cb *ax25 = ax25_sk(sk); + struct ax25_info_struct ax25_info; + + ax25_info.t1 = ax25->t1 / HZ; + ax25_info.t2 = ax25->t2 / HZ; + ax25_info.t3 = ax25->t3 / HZ; + ax25_info.idle = ax25->idle / (60 * 
HZ); + ax25_info.n2 = ax25->n2; + ax25_info.t1timer = ax25_display_timer(&ax25->t1timer) / HZ; + ax25_info.t2timer = ax25_display_timer(&ax25->t2timer) / HZ; + ax25_info.t3timer = ax25_display_timer(&ax25->t3timer) / HZ; + ax25_info.idletimer = ax25_display_timer(&ax25->idletimer) / (60 * HZ); + ax25_info.n2count = ax25->n2count; + ax25_info.state = ax25->state; + ax25_info.rcv_q = atomic_read(&sk->sk_rmem_alloc); + ax25_info.snd_q = atomic_read(&sk->sk_wmem_alloc); + ax25_info.vs = ax25->vs; + ax25_info.vr = ax25->vr; + ax25_info.va = ax25->va; + ax25_info.vs_max = ax25->vs; /* reserved */ + ax25_info.paclen = ax25->paclen; + ax25_info.window = ax25->window; + + /* old structure? */ + if (cmd == SIOCAX25GETINFOOLD) { + static int warned = 0; + if (!warned) { + printk(KERN_INFO "%s uses old SIOCAX25GETINFO\n", + current->comm); + warned=1; + } + + if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct_deprecated))) { + res = -EFAULT; + break; + } + } else { + if (copy_to_user(argp, &ax25_info, sizeof(struct ax25_info_struct))) { + res = -EINVAL; + break; + } + } + res = 0; + break; + } + + case SIOCAX25ADDFWD: + case SIOCAX25DELFWD: { + struct ax25_fwd_struct ax25_fwd; + if (!capable(CAP_NET_ADMIN)) { + res = -EPERM; + break; + } + if (copy_from_user(&ax25_fwd, argp, sizeof(ax25_fwd))) { + res = -EFAULT; + break; + } + res = ax25_fwd_ioctl(cmd, &ax25_fwd); + break; + } + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + res = -EINVAL; + break; + + default: + res = dev_ioctl(cmd, argp); + break; + } + release_sock(sk); + + return res; +} + +#ifdef CONFIG_PROC_FS + +static void *ax25_info_start(struct seq_file *seq, loff_t *pos) +{ + struct ax25_cb *ax25; + struct hlist_node *node; + int i = 0; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(ax25, node, &ax25_list) { + if (i == *pos) + return ax25; + ++i; + } + return NULL; +} + +static void *ax25_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return hlist_entry( ((struct ax25_cb *)v)->ax25_node.next, + struct ax25_cb, ax25_node); +} + +static void ax25_info_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&ax25_list_lock); +} + +static int ax25_info_show(struct seq_file *seq, void *v) +{ + ax25_cb *ax25 = v; + int k; + + + /* + * New format: + * magic dev src_addr dest_addr,digi1,digi2,.. st vs vr va t1 t1 t2 t2 t3 t3 idle idle n2 n2 rtt window paclen Snd-Q Rcv-Q inode + */ + + seq_printf(seq, "%8.8lx %s %s%s ", + (long) ax25, + ax25->ax25_dev == NULL? "???" : ax25->ax25_dev->dev->name, + ax2asc(&ax25->source_addr), + ax25->iamdigi? "*":""); + seq_printf(seq, "%s", ax2asc(&ax25->dest_addr)); + + for (k=0; (ax25->digipeat != NULL) && (k < ax25->digipeat->ndigi); k++) { + seq_printf(seq, ",%s%s", + ax2asc(&ax25->digipeat->calls[k]), + ax25->digipeat->repeated[k]? 
"*":""); + } + + seq_printf(seq, " %d %d %d %d %lu %lu %lu %lu %lu %lu %lu %lu %d %d %lu %d %d", + ax25->state, + ax25->vs, ax25->vr, ax25->va, + ax25_display_timer(&ax25->t1timer) / HZ, ax25->t1 / HZ, + ax25_display_timer(&ax25->t2timer) / HZ, ax25->t2 / HZ, + ax25_display_timer(&ax25->t3timer) / HZ, ax25->t3 / HZ, + ax25_display_timer(&ax25->idletimer) / (60 * HZ), + ax25->idle / (60 * HZ), + ax25->n2count, ax25->n2, + ax25->rtt / HZ, + ax25->window, + ax25->paclen); + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + seq_printf(seq," %d %d %ld\n", + atomic_read(&ax25->sk->sk_wmem_alloc), + atomic_read(&ax25->sk->sk_rmem_alloc), + ax25->sk->sk_socket != NULL ? SOCK_INODE(ax25->sk->sk_socket)->i_ino : 0L); + bh_unlock_sock(ax25->sk); + } else { + seq_puts(seq, " * * *\n"); + } + return 0; +} + +static struct seq_operations ax25_info_seqops = { + .start = ax25_info_start, + .next = ax25_info_next, + .stop = ax25_info_stop, + .show = ax25_info_show, +}; + +static int ax25_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ax25_info_seqops); +} + +static struct file_operations ax25_info_fops = { + .owner = THIS_MODULE, + .open = ax25_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +static struct net_proto_family ax25_family_ops = { + .family = PF_AX25, + .create = ax25_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops ax25_proto_ops = { + .family = PF_AX25, + .owner = THIS_MODULE, + .release = ax25_release, + .bind = ax25_bind, + .connect = ax25_connect, + .socketpair = sock_no_socketpair, + .accept = ax25_accept, + .getname = ax25_getname, + .poll = datagram_poll, + .ioctl = ax25_ioctl, + .listen = ax25_listen, + .shutdown = ax25_shutdown, + .setsockopt = ax25_setsockopt, + .getsockopt = ax25_getsockopt, + .sendmsg = ax25_sendmsg, + .recvmsg = ax25_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +/* + * Called by socket.c on kernel start up + */ +static struct packet_type ax25_packet_type = { + .type = __constant_htons(ETH_P_AX25), + .dev = NULL, /* All devices */ + .func = ax25_kiss_rcv, +}; + +static struct notifier_block ax25_dev_notifier = { + .notifier_call =ax25_device_event, +}; + +EXPORT_SYMBOL(ax25_encapsulate); +EXPORT_SYMBOL(ax25_rebuild_header); +EXPORT_SYMBOL(ax25_findbyuid); +EXPORT_SYMBOL(ax25_find_cb); +EXPORT_SYMBOL(ax25_linkfail_register); +EXPORT_SYMBOL(ax25_linkfail_release); +EXPORT_SYMBOL(ax25_listen_register); +EXPORT_SYMBOL(ax25_listen_release); +EXPORT_SYMBOL(ax25_protocol_register); +EXPORT_SYMBOL(ax25_protocol_release); +EXPORT_SYMBOL(ax25_send_frame); +EXPORT_SYMBOL(ax25_uid_policy); +EXPORT_SYMBOL(ax25cmp); +EXPORT_SYMBOL(ax2asc); +EXPORT_SYMBOL(asc2ax); +EXPORT_SYMBOL(null_ax25_address); +EXPORT_SYMBOL(ax25_display_timer); + +static int __init ax25_init(void) +{ + int rc = proto_register(&ax25_proto, 0); + + if (rc != 0) + goto out; + + sock_register(&ax25_family_ops); + dev_add_pack(&ax25_packet_type); + register_netdevice_notifier(&ax25_dev_notifier); + ax25_register_sysctl(); + + proc_net_fops_create("ax25_route", S_IRUGO, &ax25_route_fops); + proc_net_fops_create("ax25", S_IRUGO, &ax25_info_fops); + proc_net_fops_create("ax25_calls", S_IRUGO, &ax25_uid_fops); +out: + return rc; +} +module_init(ax25_init); + + +MODULE_AUTHOR("Jonathan Naylor G4KLX "); +MODULE_DESCRIPTION("The amateur radio AX.25 link layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_AX25); + +static void __exit ax25_exit(void) +{ + 
proc_net_remove("ax25_route"); + proc_net_remove("ax25"); + proc_net_remove("ax25_calls"); + ax25_rt_free(); + ax25_uid_free(); + ax25_dev_free(); + + ax25_unregister_sysctl(); + unregister_netdevice_notifier(&ax25_dev_notifier); + + dev_remove_pack(&ax25_packet_type); + + sock_unregister(PF_AX25); + proto_unregister(&ax25_proto); +} +module_exit(ax25_exit); diff --git a/net/ax25/ax25_addr.c b/net/ax25/ax25_addr.c new file mode 100644 index 000000000000..f4fa6dfb846e --- /dev/null +++ b/net/ax25/ax25_addr.c @@ -0,0 +1,290 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The null address is defined as a callsign of all spaces with an + * SSID of zero. + */ +ax25_address null_ax25_address = {{0x40, 0x40, 0x40, 0x40, 0x40, 0x40, 0x00}}; + +/* + * ax25 -> ascii conversion + */ +char *ax2asc(ax25_address *a) +{ + static char buf[11]; + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') *s++ = c; + } + + *s++ = '-'; + + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; + +} + +/* + * ascii -> ax25 conversion + */ +ax25_address *asc2ax(char *callsign) +{ + static ax25_address addr; + char *s; + int n; + + for (s = callsign, n = 0; n < 6; n++) { + if (*s != '\0' && *s != '-') + addr.ax25_call[n] = *s++; + else + addr.ax25_call[n] = ' '; + addr.ax25_call[n] <<= 1; + addr.ax25_call[n] &= 0xFE; + } + + if (*s++ == '\0') { + addr.ax25_call[6] = 0x00; + return &addr; + } + + addr.ax25_call[6] = *s++ - '0'; + + if (*s != '\0') { + addr.ax25_call[6] *= 10; + addr.ax25_call[6] += *s++ - '0'; + } + + addr.ax25_call[6] <<= 1; + addr.ax25_call[6] &= 0x1E; + + return &addr; +} + +/* + * Compare two ax.25 addresses + */ +int ax25cmp(ax25_address *a, ax25_address *b) +{ + int ct = 0; + + while (ct < 6) { + if ((a->ax25_call[ct] & 0xFE) != (b->ax25_call[ct] & 0xFE)) /* Clean off repeater bits */ + return 1; + ct++; + } + + if ((a->ax25_call[ct] & 0x1E) == (b->ax25_call[ct] & 0x1E)) /* SSID without control bit */ + return 0; + + return 2; /* Partial match */ +} + +/* + * Compare two AX.25 digipeater paths. 
+ */ +int ax25digicmp(ax25_digi *digi1, ax25_digi *digi2) +{ + int i; + + if (digi1->ndigi != digi2->ndigi) + return 1; + + if (digi1->lastrepeat != digi2->lastrepeat) + return 1; + + for (i = 0; i < digi1->ndigi; i++) + if (ax25cmp(&digi1->calls[i], &digi2->calls[i]) != 0) + return 1; + + return 0; +} + +/* + * Given an AX.25 address pull of to, from, digi list, command/response and the start of data + * + */ +unsigned char *ax25_addr_parse(unsigned char *buf, int len, ax25_address *src, ax25_address *dest, ax25_digi *digi, int *flags, int *dama) +{ + int d = 0; + + if (len < 14) return NULL; + + if (flags != NULL) { + *flags = 0; + + if (buf[6] & AX25_CBIT) + *flags = AX25_COMMAND; + if (buf[13] & AX25_CBIT) + *flags = AX25_RESPONSE; + } + + if (dama != NULL) + *dama = ~buf[13] & AX25_DAMA_FLAG; + + /* Copy to, from */ + if (dest != NULL) + memcpy(dest, buf + 0, AX25_ADDR_LEN); + if (src != NULL) + memcpy(src, buf + 7, AX25_ADDR_LEN); + + buf += 2 * AX25_ADDR_LEN; + len -= 2 * AX25_ADDR_LEN; + + digi->lastrepeat = -1; + digi->ndigi = 0; + + while (!(buf[-1] & AX25_EBIT)) { + if (d >= AX25_MAX_DIGIS) return NULL; /* Max of 6 digis */ + if (len < 7) return NULL; /* Short packet */ + + memcpy(&digi->calls[d], buf, AX25_ADDR_LEN); + digi->ndigi = d + 1; + + if (buf[6] & AX25_HBIT) { + digi->repeated[d] = 1; + digi->lastrepeat = d; + } else { + digi->repeated[d] = 0; + } + + buf += AX25_ADDR_LEN; + len -= AX25_ADDR_LEN; + d++; + } + + return buf; +} + +/* + * Assemble an AX.25 header from the bits + */ +int ax25_addr_build(unsigned char *buf, ax25_address *src, ax25_address *dest, ax25_digi *d, int flag, int modulus) +{ + int len = 0; + int ct = 0; + + memcpy(buf, dest, AX25_ADDR_LEN); + buf[6] &= ~(AX25_EBIT | AX25_CBIT); + buf[6] |= AX25_SSSID_SPARE; + + if (flag == AX25_COMMAND) buf[6] |= AX25_CBIT; + + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + + memcpy(buf, src, AX25_ADDR_LEN); + buf[6] &= ~(AX25_EBIT | AX25_CBIT); + buf[6] &= ~AX25_SSSID_SPARE; + + if (modulus == AX25_MODULUS) + buf[6] |= AX25_SSSID_SPARE; + else + buf[6] |= AX25_ESSID_SPARE; + + if (flag == AX25_RESPONSE) buf[6] |= AX25_CBIT; + + /* + * Fast path the normal digiless path + */ + if (d == NULL || d->ndigi == 0) { + buf[6] |= AX25_EBIT; + return 2 * AX25_ADDR_LEN; + } + + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + + while (ct < d->ndigi) { + memcpy(buf, &d->calls[ct], AX25_ADDR_LEN); + + if (d->repeated[ct]) + buf[6] |= AX25_HBIT; + else + buf[6] &= ~AX25_HBIT; + + buf[6] &= ~AX25_EBIT; + buf[6] |= AX25_SSSID_SPARE; + + buf += AX25_ADDR_LEN; + len += AX25_ADDR_LEN; + ct++; + } + + buf[-1] |= AX25_EBIT; + + return len; +} + +int ax25_addr_size(ax25_digi *dp) +{ + if (dp == NULL) + return 2 * AX25_ADDR_LEN; + + return AX25_ADDR_LEN * (2 + dp->ndigi); +} + +/* + * Reverse Digipeat List. 
May not pass both parameters as same struct + */ +void ax25_digi_invert(ax25_digi *in, ax25_digi *out) +{ + int ct; + + out->ndigi = in->ndigi; + out->lastrepeat = in->ndigi - in->lastrepeat - 2; + + /* Invert the digipeaters */ + for (ct = 0; ct < in->ndigi; ct++) { + out->calls[ct] = in->calls[in->ndigi - ct - 1]; + + if (ct <= out->lastrepeat) { + out->calls[ct].ax25_call[6] |= AX25_HBIT; + out->repeated[ct] = 1; + } else { + out->calls[ct].ax25_call[6] &= ~AX25_HBIT; + out->repeated[ct] = 0; + } + } +} + diff --git a/net/ax25/ax25_dev.c b/net/ax25/ax25_dev.c new file mode 100644 index 000000000000..dab77efe34a6 --- /dev/null +++ b/net/ax25/ax25_dev.c @@ -0,0 +1,208 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +ax25_dev *ax25_dev_list; +DEFINE_SPINLOCK(ax25_dev_lock); + +ax25_dev *ax25_addr_ax25dev(ax25_address *addr) +{ + ax25_dev *ax25_dev, *res = NULL; + + spin_lock_bh(&ax25_dev_lock); + for (ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) + if (ax25cmp(addr, (ax25_address *)ax25_dev->dev->dev_addr) == 0) { + res = ax25_dev; + } + spin_unlock_bh(&ax25_dev_lock); + + return res; +} + +/* + * This is called when an interface is brought up. These are + * reasonable defaults. + */ +void ax25_dev_device_up(struct net_device *dev) +{ + ax25_dev *ax25_dev; + + if ((ax25_dev = kmalloc(sizeof(*ax25_dev), GFP_ATOMIC)) == NULL) { + printk(KERN_ERR "AX.25: ax25_dev_device_up - out of memory\n"); + return; + } + + ax25_unregister_sysctl(); + + memset(ax25_dev, 0x00, sizeof(*ax25_dev)); + + dev->ax25_ptr = ax25_dev; + ax25_dev->dev = dev; + dev_hold(dev); + ax25_dev->forward = NULL; + + ax25_dev->values[AX25_VALUES_IPDEFMODE] = AX25_DEF_IPDEFMODE; + ax25_dev->values[AX25_VALUES_AXDEFMODE] = AX25_DEF_AXDEFMODE; + ax25_dev->values[AX25_VALUES_BACKOFF] = AX25_DEF_BACKOFF; + ax25_dev->values[AX25_VALUES_CONMODE] = AX25_DEF_CONMODE; + ax25_dev->values[AX25_VALUES_WINDOW] = AX25_DEF_WINDOW; + ax25_dev->values[AX25_VALUES_EWINDOW] = AX25_DEF_EWINDOW; + ax25_dev->values[AX25_VALUES_T1] = AX25_DEF_T1; + ax25_dev->values[AX25_VALUES_T2] = AX25_DEF_T2; + ax25_dev->values[AX25_VALUES_T3] = AX25_DEF_T3; + ax25_dev->values[AX25_VALUES_IDLE] = AX25_DEF_IDLE; + ax25_dev->values[AX25_VALUES_N2] = AX25_DEF_N2; + ax25_dev->values[AX25_VALUES_PACLEN] = AX25_DEF_PACLEN; + ax25_dev->values[AX25_VALUES_PROTOCOL] = AX25_DEF_PROTOCOL; + ax25_dev->values[AX25_VALUES_DS_TIMEOUT]= AX25_DEF_DS_TIMEOUT; + +#if defined(CONFIG_AX25_DAMA_SLAVE) || defined(CONFIG_AX25_DAMA_MASTER) + init_timer(&ax25_dev->dama.slave_timer); +#endif + + spin_lock_bh(&ax25_dev_lock); + ax25_dev->next = ax25_dev_list; + ax25_dev_list = ax25_dev; + spin_unlock_bh(&ax25_dev_lock); + + ax25_register_sysctl(); +} + +void ax25_dev_device_down(struct net_device *dev) +{ + ax25_dev *s, *ax25_dev; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return; + + ax25_unregister_sysctl(); + + spin_lock_bh(&ax25_dev_lock); + +#ifdef CONFIG_AX25_DAMA_SLAVE + ax25_ds_del_timer(ax25_dev); +#endif + + /* + * Remove 
any packet forwarding that points to this device. + */ + for (s = ax25_dev_list; s != NULL; s = s->next) + if (s->forward == dev) + s->forward = NULL; + + if ((s = ax25_dev_list) == ax25_dev) { + ax25_dev_list = s->next; + spin_unlock_bh(&ax25_dev_lock); + dev_put(dev); + kfree(ax25_dev); + ax25_register_sysctl(); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == ax25_dev) { + s->next = ax25_dev->next; + spin_unlock_bh(&ax25_dev_lock); + dev_put(dev); + kfree(ax25_dev); + ax25_register_sysctl(); + return; + } + + s = s->next; + } + spin_unlock_bh(&ax25_dev_lock); + dev->ax25_ptr = NULL; + + ax25_register_sysctl(); +} + +int ax25_fwd_ioctl(unsigned int cmd, struct ax25_fwd_struct *fwd) +{ + ax25_dev *ax25_dev, *fwd_dev; + + if ((ax25_dev = ax25_addr_ax25dev(&fwd->port_from)) == NULL) + return -EINVAL; + + switch (cmd) { + case SIOCAX25ADDFWD: + if ((fwd_dev = ax25_addr_ax25dev(&fwd->port_to)) == NULL) + return -EINVAL; + if (ax25_dev->forward != NULL) + return -EINVAL; + ax25_dev->forward = fwd_dev->dev; + break; + + case SIOCAX25DELFWD: + if (ax25_dev->forward == NULL) + return -EINVAL; + ax25_dev->forward = NULL; + break; + + default: + return -EINVAL; + } + + return 0; +} + +struct net_device *ax25_fwd_dev(struct net_device *dev) +{ + ax25_dev *ax25_dev; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return dev; + + if (ax25_dev->forward == NULL) + return dev; + + return ax25_dev->forward; +} + +/* + * Free all memory associated with device structures. + */ +void __exit ax25_dev_free(void) +{ + ax25_dev *s, *ax25_dev; + + spin_lock_bh(&ax25_dev_lock); + ax25_dev = ax25_dev_list; + while (ax25_dev != NULL) { + s = ax25_dev; + dev_put(ax25_dev->dev); + ax25_dev = ax25_dev->next; + kfree(s); + } + ax25_dev_list = NULL; + spin_unlock_bh(&ax25_dev_lock); +} diff --git a/net/ax25/ax25_ds_in.c b/net/ax25/ax25_ds_in.c new file mode 100644 index 000000000000..8adc0022cf58 --- /dev/null +++ b/net/ax25/ax25_ds_in.c @@ -0,0 +1,305 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For ip_rcv */ +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file ax25_ds_timer.c. + * Handling of state 0 and connection release is in ax25.c. 
+ */ +static int ax25_ds_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_SABME: + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE); + break; + + case AX25_UA: + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_ESTABLISHED; + /* + * For WAIT_SABM connections we will produce an accept + * ready socket here + */ + if (!sock_flag(ax25->sk, SOCK_DEAD)) + ax25->sk->sk_state_change(ax25->sk); + bh_unlock_sock(ax25->sk); + } + ax25_dama_on(ax25); + + /* according to DK4EG´s spec we are required to + * send a RR RESPONSE FINAL NR=0. + */ + + ax25_std_enquiry_response(ax25); + break; + + case AX25_DM: + if (pf) + ax25_disconnect(ax25, ECONNREFUSED); + break; + + default: + if (pf) + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Release State. + * The handling of the timer(s) is in file ax25_ds_timer.c + * Handling of state 0 and connection release is in ax25.c. + */ +static int ax25_ds_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_dama_off(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_dama_off(ax25); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + case AX25_UA: + if (pf) { + ax25_dama_off(ax25); + ax25_disconnect(ax25, 0); + } + break; + + case AX25_I: + case AX25_REJ: + case AX25_RNR: + case AX25_RR: + if (pf) { + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_dama_off(ax25); + } + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file ax25_timer.c + * Handling of state 0 and connection release is in ax25.c. 
+ */ +static int ax25_ds_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type) +{ + int queued = 0; + + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + if (frametype == AX25_SABM) { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } else { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + } + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->condition = 0x00; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25_requeue_frames(ax25); + ax25_dama_on(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_dama_off(ax25); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + ax25_dama_off(ax25); + ax25_disconnect(ax25, ECONNRESET); + break; + + case AX25_RR: + case AX25_RNR: + if (frametype == AX25_RR) + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + else + ax25->condition |= AX25_COND_PEER_RX_BUSY; + + if (ax25_validate_nr(ax25, nr)) { + if (ax25_check_iframes_acked(ax25, nr)) + ax25->n2count=0; + if (type == AX25_COMMAND && pf) + ax25_ds_enquiry_response(ax25); + } else { + ax25_ds_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_REJ: + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + + if (ax25_validate_nr(ax25, nr)) { + if (ax25->va != nr) + ax25->n2count=0; + + ax25_frames_acked(ax25, nr); + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_requeue_frames(ax25); + + if (type == AX25_COMMAND && pf) + ax25_ds_enquiry_response(ax25); + } else { + ax25_ds_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_I: + if (!ax25_validate_nr(ax25, nr)) { + ax25_ds_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + break; + } + if (ax25->condition & AX25_COND_PEER_RX_BUSY) { + ax25_frames_acked(ax25, nr); + ax25->n2count = 0; + } else { + if (ax25_check_iframes_acked(ax25, nr)) + ax25->n2count = 0; + } + if (ax25->condition & AX25_COND_OWN_RX_BUSY) { + if (pf) ax25_ds_enquiry_response(ax25); + break; + } + if (ns == ax25->vr) { + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25->vr = ns; /* ax25->vr - 1 */ + ax25->condition &= ~AX25_COND_REJECT; + if (pf) { + ax25_ds_enquiry_response(ax25); + } else { + if (!(ax25->condition & AX25_COND_ACK_PENDING)) { + ax25->condition |= AX25_COND_ACK_PENDING; + ax25_start_t2timer(ax25); + } + } + } else { + if (ax25->condition & AX25_COND_REJECT) { + if (pf) ax25_ds_enquiry_response(ax25); + } else { + ax25->condition |= AX25_COND_REJECT; + ax25_ds_enquiry_response(ax25); + ax25->condition &= ~AX25_COND_ACK_PENDING; + } + } + break; + + case AX25_FRMR: + case AX25_ILLEGAL: + ax25_ds_establish_data_link(ax25); + ax25->state = AX25_STATE_1; + break; + + default: + break; + } + + return queued; +} + +/* + * Higher level upcall for a LAPB frame + */ +int ax25_ds_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type) +{ + int queued = 0, frametype, ns, nr, pf; + + frametype = ax25_decode(ax25, skb, &ns, &nr, &pf); + + switch (ax25->state) { + case AX25_STATE_1: + queued = ax25_ds_state1_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_2: + queued = ax25_ds_state2_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_3: + queued = ax25_ds_state3_machine(ax25, skb, 
frametype, ns, nr, pf, type); + break; + } + + return queued; +} + diff --git a/net/ax25/ax25_ds_subr.c b/net/ax25/ax25_ds_subr.c new file mode 100644 index 000000000000..10ffd2beba3f --- /dev/null +++ b/net/ax25/ax25_ds_subr.c @@ -0,0 +1,212 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void ax25_ds_nr_error_recovery(ax25_cb *ax25) +{ + ax25_ds_establish_data_link(ax25); +} + +/* + * dl1bke 960114: transmit I frames on DAMA poll + */ +void ax25_ds_enquiry_response(ax25_cb *ax25) +{ + ax25_cb *ax25o; + struct hlist_node *node; + + /* Please note that neither DK4EG´s nor DG2FEF´s + * DAMA spec mention the following behaviour as seen + * with TheFirmware: + * + * DB0ACH->DL1BKE [DAMA] + * DL1BKE->DB0ACH + * DL1BKE-7->DB0PRA-6 DB0ACH + * DL1BKE->DB0ACH + * + * The Flexnet DAMA Master implementation apparently + * insists on the "proper" AX.25 behaviour: + * + * DB0ACH->DL1BKE [DAMA] + * DL1BKE->DB0ACH + * DL1BKE->DB0ACH + * DL1BKE-7->DB0PRA-6 DB0ACH + * + * Flexnet refuses to send us *any* I frame if we send + * a REJ in case AX25_COND_REJECT is set. It is superfluous in + * this mode anyway (a RR or RNR invokes the retransmission). + * Is this a Flexnet bug? + */ + + ax25_std_enquiry_response(ax25); + + if (!(ax25->condition & AX25_COND_PEER_RX_BUSY)) { + ax25_requeue_frames(ax25); + ax25_kick(ax25); + } + + if (ax25->state == AX25_STATE_1 || ax25->state == AX25_STATE_2 || skb_peek(&ax25->ack_queue) != NULL) + ax25_ds_t1_timeout(ax25); + else + ax25->n2count = 0; + + ax25_start_t3timer(ax25); + ax25_ds_set_timer(ax25->ax25_dev); + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(ax25o, node, &ax25_list) { + if (ax25o == ax25) + continue; + + if (ax25o->ax25_dev != ax25->ax25_dev) + continue; + + if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2) { + ax25_ds_t1_timeout(ax25o); + continue; + } + + if (!(ax25o->condition & AX25_COND_PEER_RX_BUSY) && ax25o->state == AX25_STATE_3) { + ax25_requeue_frames(ax25o); + ax25_kick(ax25o); + } + + if (ax25o->state == AX25_STATE_1 || ax25o->state == AX25_STATE_2 || skb_peek(&ax25o->ack_queue) != NULL) + ax25_ds_t1_timeout(ax25o); + + /* do not start T3 for listening sockets (tnx DD8NE) */ + + if (ax25o->state != AX25_STATE_0) + ax25_start_t3timer(ax25o); + } + spin_unlock_bh(&ax25_list_lock); +} + +void ax25_ds_establish_data_link(ax25_cb *ax25) +{ + ax25->condition &= AX25_COND_DAMA_MODE; + ax25->n2count = 0; + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t3timer(ax25); +} + +/* + * :::FIXME::: + * This is a kludge. Not all drivers recognize kiss commands. + * We need a driver level request to switch duplex mode, that does + * either SCC changing, PI config or KISS as required. Currently + * this request isn't reliable. 
+ */ +static void ax25_kiss_cmd(ax25_dev *ax25_dev, unsigned char cmd, unsigned char param) +{ + struct sk_buff *skb; + unsigned char *p; + + if (ax25_dev->dev == NULL) + return; + + if ((skb = alloc_skb(2, GFP_ATOMIC)) == NULL) + return; + + skb->nh.raw = skb->data; + p = skb_put(skb, 2); + + *p++ = cmd; + *p++ = param; + + skb->dev = ax25_dev->dev; + skb->protocol = htons(ETH_P_AX25); + + dev_queue_xmit(skb); +} + +/* + * A nasty problem arises if we count the number of DAMA connections + * wrong, especially when connections on the device already existed + * and our network node (or the sysop) decides to turn on DAMA Master + * mode. We thus flag the 'real' slave connections with + * ax25->dama_slave=1 and look on every disconnect if still slave + * connections exist. + */ +static int ax25_check_dama_slave(ax25_dev *ax25_dev) +{ + ax25_cb *ax25; + int res = 0; + struct hlist_node *node; + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(ax25, node, &ax25_list) + if (ax25->ax25_dev == ax25_dev && (ax25->condition & AX25_COND_DAMA_MODE) && ax25->state > AX25_STATE_1) { + res = 1; + break; + } + spin_unlock_bh(&ax25_list_lock); + + return res; +} + +static void ax25_dev_dama_on(ax25_dev *ax25_dev) +{ + if (ax25_dev == NULL) + return; + + if (ax25_dev->dama.slave == 0) + ax25_kiss_cmd(ax25_dev, 5, 1); + + ax25_dev->dama.slave = 1; + ax25_ds_set_timer(ax25_dev); +} + +void ax25_dev_dama_off(ax25_dev *ax25_dev) +{ + if (ax25_dev == NULL) + return; + + if (ax25_dev->dama.slave && !ax25_check_dama_slave(ax25_dev)) { + ax25_kiss_cmd(ax25_dev, 5, 0); + ax25_dev->dama.slave = 0; + ax25_ds_del_timer(ax25_dev); + } +} + +void ax25_dama_on(ax25_cb *ax25) +{ + ax25_dev_dama_on(ax25->ax25_dev); + ax25->condition |= AX25_COND_DAMA_MODE; +} + +void ax25_dama_off(ax25_cb *ax25) +{ + ax25->condition &= ~AX25_COND_DAMA_MODE; + ax25_dev_dama_off(ax25->ax25_dev); +} + diff --git a/net/ax25/ax25_ds_timer.c b/net/ax25/ax25_ds_timer.c new file mode 100644 index 000000000000..3a8b67316fc3 --- /dev/null +++ b/net/ax25/ax25_ds_timer.c @@ -0,0 +1,241 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void ax25_ds_timeout(unsigned long); + +/* + * Add DAMA slave timeout timer to timer list. + * Unlike the connection based timers the timeout function gets + * triggered every second. Please note that NET_AX25_DAMA_SLAVE_TIMEOUT + * (aka /proc/sys/net/ax25/{dev}/dama_slave_timeout) is still in + * 1/10th of a second. 
+ */ + +static void ax25_ds_add_timer(ax25_dev *ax25_dev) +{ + struct timer_list *t = &ax25_dev->dama.slave_timer; + t->data = (unsigned long) ax25_dev; + t->function = &ax25_ds_timeout; + t->expires = jiffies + HZ; + add_timer(t); +} + +void ax25_ds_del_timer(ax25_dev *ax25_dev) +{ + if (ax25_dev) + del_timer(&ax25_dev->dama.slave_timer); +} + +void ax25_ds_set_timer(ax25_dev *ax25_dev) +{ + if (ax25_dev == NULL) /* paranoia */ + return; + + del_timer(&ax25_dev->dama.slave_timer); + ax25_dev->dama.slave_timeout = ax25_dev->values[AX25_VALUES_DS_TIMEOUT] / 10; + ax25_ds_add_timer(ax25_dev); +} + +/* + * DAMA Slave Timeout + * Silently discard all (slave) connections in case our master forgot us... + */ + +static void ax25_ds_timeout(unsigned long arg) +{ + ax25_dev *ax25_dev = (struct ax25_dev *) arg; + ax25_cb *ax25; + struct hlist_node *node; + + if (ax25_dev == NULL || !ax25_dev->dama.slave) + return; /* Yikes! */ + + if (!ax25_dev->dama.slave_timeout || --ax25_dev->dama.slave_timeout) { + ax25_ds_set_timer(ax25_dev); + return; + } + + spin_lock_bh(&ax25_list_lock); + ax25_for_each(ax25, node, &ax25_list) { + if (ax25->ax25_dev != ax25_dev || !(ax25->condition & AX25_COND_DAMA_MODE)) + continue; + + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_disconnect(ax25, ETIMEDOUT); + } + spin_unlock_bh(&ax25_list_lock); + + ax25_dev_dama_off(ax25_dev); +} + +void ax25_ds_heartbeat_expiry(ax25_cb *ax25) +{ + struct sock *sk=ax25->sk; + + if (sk) + bh_lock_sock(sk); + + switch (ax25->state) { + + case AX25_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (!sk || sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && + sock_flag(sk, SOCK_DEAD))) { + if (sk) { + sock_hold(sk); + ax25_destroy_socket(ax25); + sock_put(sk); + bh_unlock_sock(sk); + } else + ax25_destroy_socket(ax25); + return; + } + break; + + case AX25_STATE_3: + /* + * Check the state of the receive buffer. + */ + if (sk != NULL) { + if (atomic_read(&sk->sk_rmem_alloc) < + (sk->sk_rcvbuf / 2) && + (ax25->condition & AX25_COND_OWN_RX_BUSY)) { + ax25->condition &= ~AX25_COND_OWN_RX_BUSY; + ax25->condition &= ~AX25_COND_ACK_PENDING; + break; + } + } + break; + } + + if (sk) + bh_unlock_sock(sk); + + ax25_start_heartbeat(ax25); +} + +/* dl1bke 960114: T3 works much like the IDLE timeout, but + * gets reloaded with every frame for this + * connection. + */ +void ax25_ds_t3timer_expiry(ax25_cb *ax25) +{ + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_dama_off(ax25); + ax25_disconnect(ax25, ETIMEDOUT); +} + +/* dl1bke 960228: close the connection when IDLE expires. + * unlike T3 this timer gets reloaded only on + * I frames. + */ +void ax25_ds_idletimer_expiry(ax25_cb *ax25) +{ + ax25_clear_queues(ax25); + + ax25->n2count = 0; + ax25->state = AX25_STATE_2; + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25_stop_t3timer(ax25); + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_CLOSE; + ax25->sk->sk_err = 0; + ax25->sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(ax25->sk, SOCK_DEAD)) { + ax25->sk->sk_state_change(ax25->sk); + sock_set_flag(ax25->sk, SOCK_DEAD); + } + bh_unlock_sock(ax25->sk); + } +} + +/* dl1bke 960114: The DAMA protocol requires to send data and SABM/DISC + * within the poll of any connected channel. Remember + * that we are not allowed to send anything unless we + * get polled by the Master. 
+ * + * Thus we'll have to do parts of our T1 handling in + * ax25_enquiry_response(). + */ +void ax25_ds_t1_timeout(ax25_cb *ax25) +{ + switch (ax25->state) { + case AX25_STATE_1: + if (ax25->n2count == ax25->n2) { + if (ax25->modulus == AX25_MODULUS) { + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25->n2count = 0; + ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); + } + } else { + ax25->n2count++; + if (ax25->modulus == AX25_MODULUS) + ax25_send_control(ax25, AX25_SABM, AX25_POLLOFF, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_SABME, AX25_POLLOFF, AX25_COMMAND); + } + break; + + case AX25_STATE_2: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + } + break; + + case AX25_STATE_3: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + } + break; + } + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); +} diff --git a/net/ax25/ax25_iface.c b/net/ax25/ax25_iface.c new file mode 100644 index 000000000000..d68aff100729 --- /dev/null +++ b/net/ax25/ax25_iface.c @@ -0,0 +1,266 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct protocol_struct { + struct protocol_struct *next; + unsigned int pid; + int (*func)(struct sk_buff *, ax25_cb *); +} *protocol_list = NULL; +static DEFINE_RWLOCK(protocol_list_lock); + +static struct linkfail_struct { + struct linkfail_struct *next; + void (*func)(ax25_cb *, int); +} *linkfail_list = NULL; +static DEFINE_SPINLOCK(linkfail_lock); + +static struct listen_struct { + struct listen_struct *next; + ax25_address callsign; + struct net_device *dev; +} *listen_list = NULL; +static DEFINE_SPINLOCK(listen_lock); + +int ax25_protocol_register(unsigned int pid, + int (*func)(struct sk_buff *, ax25_cb *)) +{ + struct protocol_struct *protocol; + + if (pid == AX25_P_TEXT || pid == AX25_P_SEGMENT) + return 0; +#ifdef CONFIG_INET + if (pid == AX25_P_IP || pid == AX25_P_ARP) + return 0; +#endif + if ((protocol = kmalloc(sizeof(*protocol), GFP_ATOMIC)) == NULL) + return 0; + + protocol->pid = pid; + protocol->func = func; + + write_lock(&protocol_list_lock); + protocol->next = protocol_list; + protocol_list = protocol; + write_unlock(&protocol_list_lock); + + return 1; +} + +void ax25_protocol_release(unsigned int pid) +{ + struct protocol_struct *s, *protocol; + + write_lock(&protocol_list_lock); + protocol = protocol_list; + if (protocol == NULL) { + write_unlock(&protocol_list_lock); + return; + } + + if (protocol->pid == pid) { + protocol_list = protocol->next; + write_unlock(&protocol_list_lock); + kfree(protocol); + return; + } + + while (protocol != NULL && protocol->next != NULL) { + if (protocol->next->pid == pid) { + s = protocol->next; + protocol->next = 
protocol->next->next; + write_unlock(&protocol_list_lock); + kfree(s); + return; + } + + protocol = protocol->next; + } + write_unlock(&protocol_list_lock); +} + +int ax25_linkfail_register(void (*func)(ax25_cb *, int)) +{ + struct linkfail_struct *linkfail; + + if ((linkfail = kmalloc(sizeof(*linkfail), GFP_ATOMIC)) == NULL) + return 0; + + linkfail->func = func; + + spin_lock_bh(&linkfail_lock); + linkfail->next = linkfail_list; + linkfail_list = linkfail; + spin_unlock_bh(&linkfail_lock); + + return 1; +} + +void ax25_linkfail_release(void (*func)(ax25_cb *, int)) +{ + struct linkfail_struct *s, *linkfail; + + spin_lock_bh(&linkfail_lock); + linkfail = linkfail_list; + if (linkfail == NULL) { + spin_unlock_bh(&linkfail_lock); + return; + } + + if (linkfail->func == func) { + linkfail_list = linkfail->next; + spin_unlock_bh(&linkfail_lock); + kfree(linkfail); + return; + } + + while (linkfail != NULL && linkfail->next != NULL) { + if (linkfail->next->func == func) { + s = linkfail->next; + linkfail->next = linkfail->next->next; + spin_unlock_bh(&linkfail_lock); + kfree(s); + return; + } + + linkfail = linkfail->next; + } + spin_unlock_bh(&linkfail_lock); +} + +int ax25_listen_register(ax25_address *callsign, struct net_device *dev) +{ + struct listen_struct *listen; + + if (ax25_listen_mine(callsign, dev)) + return 0; + + if ((listen = kmalloc(sizeof(*listen), GFP_ATOMIC)) == NULL) + return 0; + + listen->callsign = *callsign; + listen->dev = dev; + + spin_lock_bh(&listen_lock); + listen->next = listen_list; + listen_list = listen; + spin_unlock_bh(&listen_lock); + + return 1; +} + +void ax25_listen_release(ax25_address *callsign, struct net_device *dev) +{ + struct listen_struct *s, *listen; + + spin_lock_bh(&listen_lock); + listen = listen_list; + if (listen == NULL) { + spin_unlock_bh(&listen_lock); + return; + } + + if (ax25cmp(&listen->callsign, callsign) == 0 && listen->dev == dev) { + listen_list = listen->next; + spin_unlock_bh(&listen_lock); + kfree(listen); + return; + } + + while (listen != NULL && listen->next != NULL) { + if (ax25cmp(&listen->next->callsign, callsign) == 0 && listen->next->dev == dev) { + s = listen->next; + listen->next = listen->next->next; + spin_unlock_bh(&listen_lock); + kfree(s); + return; + } + + listen = listen->next; + } + spin_unlock_bh(&listen_lock); +} + +int (*ax25_protocol_function(unsigned int pid))(struct sk_buff *, ax25_cb *) +{ + int (*res)(struct sk_buff *, ax25_cb *) = NULL; + struct protocol_struct *protocol; + + read_lock(&protocol_list_lock); + for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) + if (protocol->pid == pid) { + res = protocol->func; + break; + } + read_unlock(&protocol_list_lock); + + return res; +} + +int ax25_listen_mine(ax25_address *callsign, struct net_device *dev) +{ + struct listen_struct *listen; + + spin_lock_bh(&listen_lock); + for (listen = listen_list; listen != NULL; listen = listen->next) + if (ax25cmp(&listen->callsign, callsign) == 0 && (listen->dev == dev || listen->dev == NULL)) { + spin_unlock_bh(&listen_lock); + return 1; + } + spin_unlock_bh(&listen_lock); + + return 0; +} + +void ax25_link_failed(ax25_cb *ax25, int reason) +{ + struct linkfail_struct *linkfail; + + spin_lock_bh(&linkfail_lock); + for (linkfail = linkfail_list; linkfail != NULL; linkfail = linkfail->next) + (linkfail->func)(ax25, reason); + spin_unlock_bh(&linkfail_lock); +} + +int ax25_protocol_is_registered(unsigned int pid) +{ + struct protocol_struct *protocol; + int res = 0; + + 
read_lock(&protocol_list_lock); + for (protocol = protocol_list; protocol != NULL; protocol = protocol->next) + if (protocol->pid == pid) { + res = 1; + break; + } + read_unlock(&protocol_list_lock); + + return res; +} diff --git a/net/ax25/ax25_in.c b/net/ax25/ax25_in.c new file mode 100644 index 000000000000..3dc808fde33f --- /dev/null +++ b/net/ax25/ax25_in.c @@ -0,0 +1,470 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For ip_rcv */ +#include +#include /* For arp_rcv */ +#include +#include +#include +#include +#include + +/* + * Given a fragment, queue it on the fragment queue and if the fragment + * is complete, send it back to ax25_rx_iframe. + */ +static int ax25_rx_fragment(ax25_cb *ax25, struct sk_buff *skb) +{ + struct sk_buff *skbn, *skbo; + + if (ax25->fragno != 0) { + if (!(*skb->data & AX25_SEG_FIRST)) { + if ((ax25->fragno - 1) == (*skb->data & AX25_SEG_REM)) { + /* Enqueue fragment */ + ax25->fragno = *skb->data & AX25_SEG_REM; + skb_pull(skb, 1); /* skip fragno */ + ax25->fraglen += skb->len; + skb_queue_tail(&ax25->frag_queue, skb); + + /* Last fragment received ? */ + if (ax25->fragno == 0) { + skbn = alloc_skb(AX25_MAX_HEADER_LEN + + ax25->fraglen, + GFP_ATOMIC); + if (!skbn) { + skb_queue_purge(&ax25->frag_queue); + return 1; + } + + skb_reserve(skbn, AX25_MAX_HEADER_LEN); + + skbn->dev = ax25->ax25_dev->dev; + skbn->h.raw = skbn->data; + skbn->nh.raw = skbn->data; + + /* Copy data from the fragments */ + while ((skbo = skb_dequeue(&ax25->frag_queue)) != NULL) { + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo); + } + + ax25->fraglen = 0; + + if (ax25_rx_iframe(ax25, skbn) == 0) + kfree_skb(skbn); + } + + return 1; + } + } + } else { + /* First fragment received */ + if (*skb->data & AX25_SEG_FIRST) { + skb_queue_purge(&ax25->frag_queue); + ax25->fragno = *skb->data & AX25_SEG_REM; + skb_pull(skb, 1); /* skip fragno */ + ax25->fraglen = skb->len; + skb_queue_tail(&ax25->frag_queue, skb); + return 1; + } + } + + return 0; +} + +/* + * This is where all valid I frames are sent to, to be dispatched to + * whichever protocol requires them. + */ +int ax25_rx_iframe(ax25_cb *ax25, struct sk_buff *skb) +{ + int (*func)(struct sk_buff *, ax25_cb *); + volatile int queued = 0; + unsigned char pid; + + if (skb == NULL) return 0; + + ax25_start_idletimer(ax25); + + pid = *skb->data; + +#ifdef CONFIG_INET + if (pid == AX25_P_IP) { + /* working around a TCP bug to keep additional listeners + * happy. TCP re-uses the buffer and destroys the original + * content. 
+ */ + struct sk_buff *skbn = skb_copy(skb, GFP_ATOMIC); + if (skbn != NULL) { + kfree_skb(skb); + skb = skbn; + } + + skb_pull(skb, 1); /* Remove PID */ + skb->h.raw = skb->data; + skb->nh.raw = skb->data; + skb->dev = ax25->ax25_dev->dev; + skb->pkt_type = PACKET_HOST; + skb->protocol = htons(ETH_P_IP); + ip_rcv(skb, skb->dev, NULL); /* Wrong ptype */ + return 1; + } +#endif + if (pid == AX25_P_SEGMENT) { + skb_pull(skb, 1); /* Remove PID */ + return ax25_rx_fragment(ax25, skb); + } + + if ((func = ax25_protocol_function(pid)) != NULL) { + skb_pull(skb, 1); /* Remove PID */ + return (*func)(skb, ax25); + } + + if (ax25->sk != NULL && ax25->ax25_dev->values[AX25_VALUES_CONMODE] == 2) { + if ((!ax25->pidincl && ax25->sk->sk_protocol == pid) || + ax25->pidincl) { + if (sock_queue_rcv_skb(ax25->sk, skb) == 0) + queued = 1; + else + ax25->condition |= AX25_COND_OWN_RX_BUSY; + } + } + + return queued; +} + +/* + * Higher level upcall for a LAPB frame + */ +static int ax25_process_rx_frame(ax25_cb *ax25, struct sk_buff *skb, int type, int dama) +{ + int queued = 0; + + if (ax25->state == AX25_STATE_0) + return 0; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + queued = ax25_std_frame_in(ax25, skb, type); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (dama || ax25->ax25_dev->dama.slave) + queued = ax25_ds_frame_in(ax25, skb, type); + else + queued = ax25_std_frame_in(ax25, skb, type); + break; +#endif + } + + return queued; +} + +static int ax25_rcv(struct sk_buff *skb, struct net_device *dev, + ax25_address *dev_addr, struct packet_type *ptype) +{ + ax25_address src, dest, *next_digi = NULL; + int type = 0, mine = 0, dama; + struct sock *make, *sk; + ax25_digi dp, reverse_dp; + ax25_cb *ax25; + ax25_dev *ax25_dev; + + /* + * Process the AX.25/LAPB frame. + */ + + skb->h.raw = skb->data; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { + kfree_skb(skb); + return 0; + } + + /* + * Parse the address header. + */ + + if (ax25_addr_parse(skb->data, skb->len, &src, &dest, &dp, &type, &dama) == NULL) { + kfree_skb(skb); + return 0; + } + + /* + * Ours perhaps ? + */ + if (dp.lastrepeat + 1 < dp.ndigi) /* Not yet digipeated completely */ + next_digi = &dp.calls[dp.lastrepeat + 1]; + + /* + * Pull of the AX.25 headers leaving the CTRL/PID bytes + */ + skb_pull(skb, ax25_addr_size(&dp)); + + /* For our port addresses ? 
*/ + if (ax25cmp(&dest, dev_addr) == 0 && dp.lastrepeat + 1 == dp.ndigi) + mine = 1; + + /* Also match on any registered callsign from L3/4 */ + if (!mine && ax25_listen_mine(&dest, dev) && dp.lastrepeat + 1 == dp.ndigi) + mine = 1; + + /* UI frame - bypass LAPB processing */ + if ((*skb->data & ~0x10) == AX25_UI && dp.lastrepeat + 1 == dp.ndigi) { + skb->h.raw = skb->data + 2; /* skip control and pid */ + + ax25_send_to_raw(&dest, skb, skb->data[1]); + + if (!mine && ax25cmp(&dest, (ax25_address *)dev->broadcast) != 0) { + kfree_skb(skb); + return 0; + } + + /* Now we are pointing at the pid byte */ + switch (skb->data[1]) { +#ifdef CONFIG_INET + case AX25_P_IP: + skb_pull(skb,2); /* drop PID/CTRL */ + skb->h.raw = skb->data; + skb->nh.raw = skb->data; + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + skb->protocol = htons(ETH_P_IP); + ip_rcv(skb, dev, ptype); /* Note ptype here is the wrong one, fix me later */ + break; + + case AX25_P_ARP: + skb_pull(skb,2); + skb->h.raw = skb->data; + skb->nh.raw = skb->data; + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + skb->protocol = htons(ETH_P_ARP); + arp_rcv(skb, dev, ptype); /* Note ptype here is wrong... */ + break; +#endif + case AX25_P_TEXT: + /* Now find a suitable dgram socket */ + sk = ax25_get_socket(&dest, &src, SOCK_DGRAM); + if (sk != NULL) { + bh_lock_sock(sk); + if (atomic_read(&sk->sk_rmem_alloc) >= + sk->sk_rcvbuf) { + kfree_skb(skb); + } else { + /* + * Remove the control and PID. + */ + skb_pull(skb, 2); + if (sock_queue_rcv_skb(sk, skb) != 0) + kfree_skb(skb); + } + bh_unlock_sock(sk); + sock_put(sk); + } else { + kfree_skb(skb); + } + break; + + default: + kfree_skb(skb); /* Will scan SOCK_AX25 RAW sockets */ + break; + } + + return 0; + } + + /* + * Is connected mode supported on this device ? + * If not, should we DM the incoming frame (except DMs) or + * silently ignore them. For now we stay quiet. + */ + if (ax25_dev->values[AX25_VALUES_CONMODE] == 0) { + kfree_skb(skb); + return 0; + } + + /* LAPB */ + + /* AX.25 state 1-4 */ + + ax25_digi_invert(&dp, &reverse_dp); + + if ((ax25 = ax25_find_cb(&dest, &src, &reverse_dp, dev)) != NULL) { + /* + * Process the frame. If it is queued up internally it + * returns one otherwise we free it immediately. This + * routine itself wakes the user context layers so we do + * no further work + */ + if (ax25_process_rx_frame(ax25, skb, type, dama) == 0) + kfree_skb(skb); + + ax25_cb_put(ax25); + return 0; + } + + /* AX.25 state 0 (disconnected) */ + + /* a) received not a SABM(E) */ + + if ((*skb->data & ~AX25_PF) != AX25_SABM && + (*skb->data & ~AX25_PF) != AX25_SABME) { + /* + * Never reply to a DM. Also ignore any connects for + * addresses that are not our interfaces and not a socket. 
+ */ + if ((*skb->data & ~AX25_PF) != AX25_DM && mine) + ax25_return_dm(dev, &src, &dest, &dp); + + kfree_skb(skb); + return 0; + } + + /* b) received SABM(E) */ + + if (dp.lastrepeat + 1 == dp.ndigi) + sk = ax25_find_listener(&dest, 0, dev, SOCK_SEQPACKET); + else + sk = ax25_find_listener(next_digi, 1, dev, SOCK_SEQPACKET); + + if (sk != NULL) { + bh_lock_sock(sk); + if (sk_acceptq_is_full(sk) || + (make = ax25_make_new(sk, ax25_dev)) == NULL) { + if (mine) + ax25_return_dm(dev, &src, &dest, &dp); + kfree_skb(skb); + bh_unlock_sock(sk); + sock_put(sk); + + return 0; + } + + ax25 = ax25_sk(make); + skb_set_owner_r(skb, make); + skb_queue_head(&sk->sk_receive_queue, skb); + + make->sk_state = TCP_ESTABLISHED; + + sk->sk_ack_backlog++; + bh_unlock_sock(sk); + } else { + if (!mine) { + kfree_skb(skb); + return 0; + } + + if ((ax25 = ax25_create_cb()) == NULL) { + ax25_return_dm(dev, &src, &dest, &dp); + kfree_skb(skb); + return 0; + } + + ax25_fillin_cb(ax25, ax25_dev); + } + + ax25->source_addr = dest; + ax25->dest_addr = src; + + /* + * Sort out any digipeated paths. + */ + if (dp.ndigi && !ax25->digipeat && + (ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + ax25_destroy_socket(ax25); + if (sk) + sock_put(sk); + return 0; + } + + if (dp.ndigi == 0) { + if (ax25->digipeat != NULL) { + kfree(ax25->digipeat); + ax25->digipeat = NULL; + } + } else { + /* Reverse the source SABM's path */ + memcpy(ax25->digipeat, &reverse_dp, sizeof(ax25_digi)); + } + + if ((*skb->data & ~AX25_PF) == AX25_SABME) { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_EWINDOW]; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25_dev->values[AX25_VALUES_WINDOW]; + } + + ax25_send_control(ax25, AX25_UA, AX25_POLLON, AX25_RESPONSE); + +#ifdef CONFIG_AX25_DAMA_SLAVE + if (dama && ax25->ax25_dev->values[AX25_VALUES_PROTOCOL] == AX25_PROTO_DAMA_SLAVE) + ax25_dama_on(ax25); +#endif + + ax25->state = AX25_STATE_3; + + ax25_cb_add(ax25); + + ax25_start_heartbeat(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + + if (sk) { + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + sock_put(sk); + } else + kfree_skb(skb); + + return 0; +} + +/* + * Receive an AX.25 frame via a SLIP interface. + */ +int ax25_kiss_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype) +{ + skb->sk = NULL; /* Initially we don't know who it's for */ + skb->destructor = NULL; /* Who initializes this, dammit?! */ + + if ((*skb->data & 0x0F) != 0) { + kfree_skb(skb); /* Not a KISS data frame */ + return 0; + } + + skb_pull(skb, AX25_KISS_HEADER_LEN); /* Remove the KISS byte */ + + return ax25_rcv(skb, dev, (ax25_address *)dev->dev_addr, ptype); +} diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c new file mode 100644 index 000000000000..04d711344d55 --- /dev/null +++ b/net/ax25/ax25_ip.c @@ -0,0 +1,225 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * IP over AX.25 encapsulation. + */ + +/* + * Shove an AX.25 UI header on an IP packet and handle ARP + */ + +#ifdef CONFIG_INET + +int ax25_encapsulate(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len) +{ + unsigned char *buff; + + /* they sometimes come back to us... */ + if (type == ETH_P_AX25) + return 0; + + /* header is an AX.25 UI frame from us to them */ + buff = skb_push(skb, AX25_HEADER_LEN); + *buff++ = 0x00; /* KISS DATA */ + + if (daddr != NULL) + memcpy(buff, daddr, dev->addr_len); /* Address specified */ + + buff[6] &= ~AX25_CBIT; + buff[6] &= ~AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + if (saddr != NULL) + memcpy(buff, saddr, dev->addr_len); + else + memcpy(buff, dev->dev_addr, dev->addr_len); + + buff[6] &= ~AX25_CBIT; + buff[6] |= AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + *buff++ = AX25_UI; /* UI */ + + /* Append a suitable AX.25 PID */ + switch (type) { + case ETH_P_IP: + *buff++ = AX25_P_IP; + break; + case ETH_P_ARP: + *buff++ = AX25_P_ARP; + break; + default: + printk(KERN_ERR "AX.25: ax25_encapsulate - wrong protocol type 0x%2.2x\n", type); + *buff++ = 0; + break; + } + + if (daddr != NULL) + return AX25_HEADER_LEN; + + return -AX25_HEADER_LEN; /* Unfinished header */ +} + +int ax25_rebuild_header(struct sk_buff *skb) +{ + struct sk_buff *ourskb; + unsigned char *bp = skb->data; + struct net_device *dev; + ax25_address *src, *dst; + ax25_dev *ax25_dev; + ax25_route _route, *route = &_route; + ax25_cb *ax25; + + dst = (ax25_address *)(bp + 1); + src = (ax25_address *)(bp + 8); + + if (arp_find(bp + 1, skb)) + return 1; + + route = ax25_rt_find_route(route, dst, NULL); + dev = route->dev; + + if (dev == NULL) + dev = skb->dev; + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) { + goto put; + } + + if (bp[16] == AX25_P_IP) { + if (route->ip_mode == 'V' || (route->ip_mode == ' ' && ax25_dev->values[AX25_VALUES_IPDEFMODE])) { + /* + * We copy the buffer and release the original thereby + * keeping it straight + * + * Note: we report 1 back so the caller will + * not feed the frame direct to the physical device + * We don't want that to happen. (It won't be upset + * as we have pulled the frame from the queue by + * freeing it). + * + * NB: TCP modifies buffers that are still + * on a device queue, thus we use skb_copy() + * instead of using skb_clone() unless this + * gets fixed. + */ + + ax25_address src_c; + ax25_address dst_c; + + if ((ourskb = skb_copy(skb, GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + goto put; + } + + if (skb->sk != NULL) + skb_set_owner_w(ourskb, skb->sk); + + kfree_skb(skb); + /* dl9sau: bugfix + * after kfree_skb(), dst and src which were pointer + * to bp which is part of skb->data would not be valid + * anymore hope that after skb_pull(ourskb, ..) 
our + * dsc_c and src_c will not become invalid + */ + bp = ourskb->data; + dst_c = *(ax25_address *)(bp + 1); + src_c = *(ax25_address *)(bp + 8); + + skb_pull(ourskb, AX25_HEADER_LEN - 1); /* Keep PID */ + ourskb->nh.raw = ourskb->data; + + ax25=ax25_send_frame( + ourskb, + ax25_dev->values[AX25_VALUES_PACLEN], + &src_c, + &dst_c, route->digipeat, dev); + if (ax25) { + ax25_cb_put(ax25); + } + goto put; + } + } + + bp[7] &= ~AX25_CBIT; + bp[7] &= ~AX25_EBIT; + bp[7] |= AX25_SSSID_SPARE; + + bp[14] &= ~AX25_CBIT; + bp[14] |= AX25_EBIT; + bp[14] |= AX25_SSSID_SPARE; + + skb_pull(skb, AX25_KISS_HEADER_LEN); + + if (route->digipeat != NULL) { + if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) { + kfree_skb(skb); + goto put; + } + + skb = ourskb; + } + + skb->dev = dev; + + ax25_queue_xmit(skb); + +put: + ax25_put_route(route); + + return 1; +} + +#else /* INET */ + +int ax25_encapsulate(struct sk_buff *skb, struct net_device *dev, unsigned short type, void *daddr, void *saddr, unsigned len) +{ + return -AX25_HEADER_LEN; +} + +int ax25_rebuild_header(struct sk_buff *skb) +{ + return 1; +} + +#endif + diff --git a/net/ax25/ax25_out.c b/net/ax25/ax25_out.c new file mode 100644 index 000000000000..3475a3ac9343 --- /dev/null +++ b/net/ax25/ax25_out.c @@ -0,0 +1,383 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(ax25_frag_lock); + +ax25_cb *ax25_send_frame(struct sk_buff *skb, int paclen, ax25_address *src, ax25_address *dest, ax25_digi *digi, struct net_device *dev) +{ + ax25_dev *ax25_dev; + ax25_cb *ax25; + + /* + * Take the default packet length for the device if zero is + * specified. + */ + if (paclen == 0) { + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return NULL; + + paclen = ax25_dev->values[AX25_VALUES_PACLEN]; + } + + /* + * Look for an existing connection. 
+ */ + if ((ax25 = ax25_find_cb(src, dest, digi, dev)) != NULL) { + ax25_output(ax25, paclen, skb); + return ax25; /* It already existed */ + } + + if ((ax25_dev = ax25_dev_ax25dev(dev)) == NULL) + return NULL; + + if ((ax25 = ax25_create_cb()) == NULL) + return NULL; + + ax25_fillin_cb(ax25, ax25_dev); + + ax25->source_addr = *src; + ax25->dest_addr = *dest; + + if (digi != NULL) { + if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + ax25_cb_put(ax25); + return NULL; + } + memcpy(ax25->digipeat, digi, sizeof(ax25_digi)); + } + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_establish_data_link(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25_dev->dama.slave) + ax25_ds_establish_data_link(ax25); + else + ax25_std_establish_data_link(ax25); + break; +#endif + } + + ax25_cb_add(ax25); + + ax25->state = AX25_STATE_1; + + ax25_start_heartbeat(ax25); + + ax25_output(ax25, paclen, skb); + + return ax25; /* We had to create it */ +} + +/* + * All outgoing AX.25 I frames pass via this routine. Therefore this is + * where the fragmentation of frames takes place. If fragment is set to + * zero then we are not allowed to do fragmentation, even if the frame + * is too large. + */ +void ax25_output(ax25_cb *ax25, int paclen, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char *p; + int frontlen, len, fragno, ka9qfrag, first = 1; + + if ((skb->len - 1) > paclen) { + if (*skb->data == AX25_P_TEXT) { + skb_pull(skb, 1); /* skip PID */ + ka9qfrag = 0; + } else { + paclen -= 2; /* Allow for fragment control info */ + ka9qfrag = 1; + } + + fragno = skb->len / paclen; + if (skb->len % paclen == 0) fragno--; + + frontlen = skb_headroom(skb); /* Address space + CTRL */ + + while (skb->len > 0) { + spin_lock_bh(&ax25_frag_lock); + if ((skbn = alloc_skb(paclen + 2 + frontlen, GFP_ATOMIC)) == NULL) { + spin_unlock_bh(&ax25_frag_lock); + printk(KERN_CRIT "AX.25: ax25_output - out of memory\n"); + return; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + spin_unlock_bh(&ax25_frag_lock); + + len = (paclen > skb->len) ? skb->len : paclen; + + if (ka9qfrag == 1) { + skb_reserve(skbn, frontlen + 2); + skbn->nh.raw = skbn->data + (skb->nh.raw - skb->data); + memcpy(skb_put(skbn, len), skb->data, len); + p = skb_push(skbn, 2); + + *p++ = AX25_P_SEGMENT; + + *p = fragno--; + if (first) { + *p |= AX25_SEG_FIRST; + first = 0; + } + } else { + skb_reserve(skbn, frontlen + 1); + skbn->nh.raw = skbn->data + (skb->nh.raw - skb->data); + memcpy(skb_put(skbn, len), skb->data, len); + p = skb_push(skbn, 1); + *p = AX25_P_TEXT; + } + + skb_pull(skb, len); + skb_queue_tail(&ax25->write_queue, skbn); /* Throw it on the queue */ + } + + kfree_skb(skb); + } else { + skb_queue_tail(&ax25->write_queue, skb); /* Throw it on the queue */ + } + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_kick(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + /* + * A DAMA slave is _required_ to work as normal AX.25L2V2 + * if no DAMA master is available. + */ + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) ax25_kick(ax25); + break; +#endif + } +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
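+ * For modulo 8 the control byte carries N(R) in bits 5-7, the P bit in
+ * bit 4 and N(S) in bits 1-3, with bit 0 clear marking an I frame; for
+ * example V(R) = 5, V(S) = 2 with the poll bit set gives 0xb4. Modulo
+ * 128 uses two control bytes, N(S) and N(R) each getting seven bits.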
+ */ +static void ax25_send_iframe(ax25_cb *ax25, struct sk_buff *skb, int poll_bit) +{ + unsigned char *frame; + + if (skb == NULL) + return; + + skb->nh.raw = skb->data; + + if (ax25->modulus == AX25_MODULUS) { + frame = skb_push(skb, 1); + + *frame = AX25_I; + *frame |= (poll_bit) ? AX25_PF : 0; + *frame |= (ax25->vr << 5); + *frame |= (ax25->vs << 1); + } else { + frame = skb_push(skb, 2); + + frame[0] = AX25_I; + frame[0] |= (ax25->vs << 1); + frame[1] = (poll_bit) ? AX25_EPF : 0; + frame[1] |= (ax25->vr << 1); + } + + ax25_start_idletimer(ax25); + + ax25_transmit_buffer(ax25, skb, AX25_COMMAND); +} + +void ax25_kick(ax25_cb *ax25) +{ + struct sk_buff *skb, *skbn; + int last = 1; + unsigned short start, end, next; + + if (ax25->state != AX25_STATE_3 && ax25->state != AX25_STATE_4) + return; + + if (ax25->condition & AX25_COND_PEER_RX_BUSY) + return; + + if (skb_peek(&ax25->write_queue) == NULL) + return; + + start = (skb_peek(&ax25->ack_queue) == NULL) ? ax25->va : ax25->vs; + end = (ax25->va + ax25->window) % ax25->modulus; + + if (start == end) + return; + + ax25->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. Send a poll on the final I frame if + * the window is filled. + */ + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&ax25->write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&ax25->write_queue, skb); + break; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + next = (ax25->vs + 1) % ax25->modulus; + last = (next == end); + + /* + * Transmit the frame copy. + * bke 960114: do not set the Poll bit on the last frame + * in DAMA mode. + */ + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_send_iframe(ax25, skbn, (last) ? AX25_POLLON : AX25_POLLOFF); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + ax25_send_iframe(ax25, skbn, AX25_POLLOFF); + break; +#endif + } + + ax25->vs = next; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&ax25->ack_queue, skb); + + } while (!last && (skb = skb_dequeue(&ax25->write_queue)) != NULL); + + ax25->condition &= ~AX25_COND_ACK_PENDING; + + if (!ax25_t1timer_running(ax25)) { + ax25_stop_t3timer(ax25); + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + } +} + +void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type) +{ + struct sk_buff *skbn; + unsigned char *ptr; + int headroom; + + if (ax25->ax25_dev == NULL) { + ax25_disconnect(ax25, ENETUNREACH); + return; + } + + headroom = ax25_addr_size(ax25->digipeat); + + if (skb_headroom(skb) < headroom) { + if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) { + printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n"); + kfree_skb(skb); + return; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + skb = skbn; + } + + ptr = skb_push(skb, headroom); + + ax25_addr_build(ptr, &ax25->source_addr, &ax25->dest_addr, ax25->digipeat, type, ax25->modulus); + + skb->dev = ax25->ax25_dev->dev; + + ax25_queue_xmit(skb); +} + +/* + * A small shim to dev_queue_xmit to add the KISS control byte, and do + * any packet forwarding in operation. 
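+ * The byte pushed below is the KISS command byte: the low nibble is the
+ * command (0 = data frame) and the high nibble the TNC port, so 0x00 is
+ * simply "data on port 0". For an I or UI frame the buffer handed to the
+ * driver is then roughly: 0x00 | dest | src | digipeaters | ctrl | PID | payload.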
+ */ +void ax25_queue_xmit(struct sk_buff *skb) +{ + unsigned char *ptr; + + skb->protocol = htons(ETH_P_AX25); + skb->dev = ax25_fwd_dev(skb->dev); + + ptr = skb_push(skb, 1); + *ptr = 0x00; /* KISS */ + + dev_queue_xmit(skb); +} + +int ax25_check_iframes_acked(ax25_cb *ax25, unsigned short nr) +{ + if (ax25->vs == nr) { + ax25_frames_acked(ax25, nr); + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + return 1; + } else { + if (ax25->va != nr) { + ax25_frames_acked(ax25, nr); + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + return 1; + } + } + return 0; +} + diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c new file mode 100644 index 000000000000..44b99b1ff9f8 --- /dev/null +++ b/net/ax25/ax25_route.c @@ -0,0 +1,534 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Steven Whitehouse GW7RRM (stevew@acm.org) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static ax25_route *ax25_route_list; +static DEFINE_RWLOCK(ax25_route_lock); + +static ax25_route *ax25_get_route(ax25_address *, struct net_device *); + +void ax25_rt_device_down(struct net_device *dev) +{ + ax25_route *s, *t, *ax25_rt; + + write_lock(&ax25_route_lock); + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + + if (s->dev == dev) { + if (ax25_route_list == s) { + ax25_route_list = s->next; + if (s->digipeat != NULL) + kfree(s->digipeat); + kfree(s); + } else { + for (t = ax25_route_list; t != NULL; t = t->next) { + if (t->next == s) { + t->next = s->next; + if (s->digipeat != NULL) + kfree(s->digipeat); + kfree(s); + break; + } + } + } + } + } + write_unlock(&ax25_route_lock); +} + +static int ax25_rt_add(struct ax25_routes_struct *route) +{ + ax25_route *ax25_rt; + ax25_dev *ax25_dev; + int i; + + if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) + return -EINVAL; + if (route->digi_count > AX25_MAX_DIGIS) + return -EINVAL; + + write_lock(&ax25_route_lock); + + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + if (ax25cmp(&ax25_rt->callsign, &route->dest_addr) == 0 && + ax25_rt->dev == ax25_dev->dev) { + if (ax25_rt->digipeat != NULL) { + kfree(ax25_rt->digipeat); + ax25_rt->digipeat = NULL; + } + if (route->digi_count != 0) { + if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + write_unlock(&ax25_route_lock); + return -ENOMEM; + } + ax25_rt->digipeat->lastrepeat = -1; + ax25_rt->digipeat->ndigi = route->digi_count; + for (i = 0; i < route->digi_count; i++) { + ax25_rt->digipeat->repeated[i] = 0; + ax25_rt->digipeat->calls[i] = route->digi_addr[i]; + } + } + write_unlock(&ax25_route_lock); + return 0; + } + ax25_rt = ax25_rt->next; + } + + if ((ax25_rt = kmalloc(sizeof(ax25_route), GFP_ATOMIC)) == NULL) { + write_unlock(&ax25_route_lock); + return 
-ENOMEM; + } + + atomic_set(&ax25_rt->ref, 0); + ax25_rt->callsign = route->dest_addr; + ax25_rt->dev = ax25_dev->dev; + ax25_rt->digipeat = NULL; + ax25_rt->ip_mode = ' '; + if (route->digi_count != 0) { + if ((ax25_rt->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + write_unlock(&ax25_route_lock); + kfree(ax25_rt); + return -ENOMEM; + } + ax25_rt->digipeat->lastrepeat = -1; + ax25_rt->digipeat->ndigi = route->digi_count; + for (i = 0; i < route->digi_count; i++) { + ax25_rt->digipeat->repeated[i] = 0; + ax25_rt->digipeat->calls[i] = route->digi_addr[i]; + } + } + ax25_rt->next = ax25_route_list; + ax25_route_list = ax25_rt; + write_unlock(&ax25_route_lock); + + return 0; +} + +static void ax25_rt_destroy(ax25_route *ax25_rt) +{ + if (atomic_read(&ax25_rt->ref) == 0) { + if (ax25_rt->digipeat != NULL) + kfree(ax25_rt->digipeat); + kfree(ax25_rt); + return; + } + + /* + * Uh... Route is still in use; we can't yet destroy it. Retry later. + */ + init_timer(&ax25_rt->timer); + ax25_rt->timer.data = (unsigned long) ax25_rt; + ax25_rt->timer.function = (void *) ax25_rt_destroy; + ax25_rt->timer.expires = jiffies + 5 * HZ; + + add_timer(&ax25_rt->timer); +} + +static int ax25_rt_del(struct ax25_routes_struct *route) +{ + ax25_route *s, *t, *ax25_rt; + ax25_dev *ax25_dev; + + if ((ax25_dev = ax25_addr_ax25dev(&route->port_addr)) == NULL) + return -EINVAL; + + write_lock(&ax25_route_lock); + + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + if (s->dev == ax25_dev->dev && + ax25cmp(&route->dest_addr, &s->callsign) == 0) { + if (ax25_route_list == s) { + ax25_route_list = s->next; + ax25_rt_destroy(s); + } else { + for (t = ax25_route_list; t != NULL; t = t->next) { + if (t->next == s) { + t->next = s->next; + ax25_rt_destroy(s); + break; + } + } + } + } + } + write_unlock(&ax25_route_lock); + + return 0; +} + +static int ax25_rt_opt(struct ax25_route_opt_struct *rt_option) +{ + ax25_route *ax25_rt; + ax25_dev *ax25_dev; + int err = 0; + + if ((ax25_dev = ax25_addr_ax25dev(&rt_option->port_addr)) == NULL) + return -EINVAL; + + write_lock(&ax25_route_lock); + + ax25_rt = ax25_route_list; + while (ax25_rt != NULL) { + if (ax25_rt->dev == ax25_dev->dev && + ax25cmp(&rt_option->dest_addr, &ax25_rt->callsign) == 0) { + switch (rt_option->cmd) { + case AX25_SET_RT_IPMODE: + switch (rt_option->arg) { + case ' ': + case 'D': + case 'V': + ax25_rt->ip_mode = rt_option->arg; + break; + default: + err = -EINVAL; + goto out; + } + break; + default: + err = -EINVAL; + goto out; + } + } + ax25_rt = ax25_rt->next; + } + +out: + write_unlock(&ax25_route_lock); + return err; +} + +int ax25_rt_ioctl(unsigned int cmd, void __user *arg) +{ + struct ax25_route_opt_struct rt_option; + struct ax25_routes_struct route; + + switch (cmd) { + case SIOCADDRT: + if (copy_from_user(&route, arg, sizeof(route))) + return -EFAULT; + return ax25_rt_add(&route); + + case SIOCDELRT: + if (copy_from_user(&route, arg, sizeof(route))) + return -EFAULT; + return ax25_rt_del(&route); + + case SIOCAX25OPTRT: + if (copy_from_user(&rt_option, arg, sizeof(rt_option))) + return -EFAULT; + return ax25_rt_opt(&rt_option); + + default: + return -EINVAL; + } +} + +#ifdef CONFIG_PROC_FS + +static void *ax25_rt_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ax25_route *ax25_rt; + int i = 1; + + read_lock(&ax25_route_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (ax25_rt = ax25_route_list; ax25_rt != NULL; ax25_rt = ax25_rt->next) { + if (i == *pos) + return 
ax25_rt; + ++i; + } + + return NULL; +} + +static void *ax25_rt_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (v == SEQ_START_TOKEN) ? ax25_route_list : + ((struct ax25_route *) v)->next; +} + +static void ax25_rt_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&ax25_route_lock); +} + +static int ax25_rt_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "callsign dev mode digipeaters\n"); + else { + struct ax25_route *ax25_rt = v; + const char *callsign; + int i; + + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0) + callsign = "default"; + else + callsign = ax2asc(&ax25_rt->callsign); + + seq_printf(seq, "%-9s %-4s", + callsign, + ax25_rt->dev ? ax25_rt->dev->name : "???"); + + switch (ax25_rt->ip_mode) { + case 'V': + seq_puts(seq, " vc"); + break; + case 'D': + seq_puts(seq, " dg"); + break; + default: + seq_puts(seq, " *"); + break; + } + + if (ax25_rt->digipeat != NULL) + for (i = 0; i < ax25_rt->digipeat->ndigi; i++) + seq_printf(seq, " %s", ax2asc(&ax25_rt->digipeat->calls[i])); + + seq_puts(seq, "\n"); + } + return 0; +} + +static struct seq_operations ax25_rt_seqops = { + .start = ax25_rt_seq_start, + .next = ax25_rt_seq_next, + .stop = ax25_rt_seq_stop, + .show = ax25_rt_seq_show, +}; + +static int ax25_rt_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ax25_rt_seqops); +} + +struct file_operations ax25_route_fops = { + .owner = THIS_MODULE, + .open = ax25_rt_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +/* + * Find AX.25 route + * + * Only routes with a refernce rout of zero can be destroyed. + */ +static ax25_route *ax25_get_route(ax25_address *addr, struct net_device *dev) +{ + ax25_route *ax25_spe_rt = NULL; + ax25_route *ax25_def_rt = NULL; + ax25_route *ax25_rt; + + read_lock(&ax25_route_lock); + /* + * Bind to the physical interface we heard them on, or the default + * route if none is found; + */ + for (ax25_rt = ax25_route_list; ax25_rt != NULL; ax25_rt = ax25_rt->next) { + if (dev == NULL) { + if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev != NULL) + ax25_spe_rt = ax25_rt; + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev != NULL) + ax25_def_rt = ax25_rt; + } else { + if (ax25cmp(&ax25_rt->callsign, addr) == 0 && ax25_rt->dev == dev) + ax25_spe_rt = ax25_rt; + if (ax25cmp(&ax25_rt->callsign, &null_ax25_address) == 0 && ax25_rt->dev == dev) + ax25_def_rt = ax25_rt; + } + } + + ax25_rt = ax25_def_rt; + if (ax25_spe_rt != NULL) + ax25_rt = ax25_spe_rt; + + if (ax25_rt != NULL) + atomic_inc(&ax25_rt->ref); + + read_unlock(&ax25_route_lock); + + return ax25_rt; +} + +/* + * Adjust path: If you specify a default route and want to connect + * a target on the digipeater path but w/o having a special route + * set before, the path has to be truncated from your target on. + */ +static inline void ax25_adjust_path(ax25_address *addr, ax25_digi *digipeat) +{ + int k; + + for (k = 0; k < digipeat->ndigi; k++) { + if (ax25cmp(addr, &digipeat->calls[k]) == 0) + break; + } + + digipeat->ndigi = k; +} + + +/* + * Find which interface to use. 
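+ * A route whose callsign matches the destination exactly is preferred
+ * over the default (null) route; the chosen route supplies the device
+ * to bind to and, optionally, a digipeater path which ax25_adjust_path()
+ * truncates if the destination itself appears in it.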
+ */ +int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) +{ + ax25_route *ax25_rt; + ax25_address *call; + int err; + + if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) + return -EHOSTUNREACH; + + if ((ax25->ax25_dev = ax25_dev_ax25dev(ax25_rt->dev)) == NULL) { + err = -EHOSTUNREACH; + goto put; + } + + if ((call = ax25_findbyuid(current->euid)) == NULL) { + if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { + err = -EPERM; + goto put; + } + call = (ax25_address *)ax25->ax25_dev->dev->dev_addr; + } + + ax25->source_addr = *call; + + if (ax25_rt->digipeat != NULL) { + if ((ax25->digipeat = kmalloc(sizeof(ax25_digi), GFP_ATOMIC)) == NULL) { + err = -ENOMEM; + goto put; + } + memcpy(ax25->digipeat, ax25_rt->digipeat, sizeof(ax25_digi)); + ax25_adjust_path(addr, ax25->digipeat); + } + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + sock_reset_flag(ax25->sk, SOCK_ZAPPED); + bh_unlock_sock(ax25->sk); + } + +put: + ax25_put_route(ax25_rt); + + return 0; +} + +ax25_route *ax25_rt_find_route(ax25_route * route, ax25_address *addr, + struct net_device *dev) +{ + ax25_route *ax25_rt; + + if ((ax25_rt = ax25_get_route(addr, dev))) + return ax25_rt; + + route->next = NULL; + atomic_set(&route->ref, 1); + route->callsign = *addr; + route->dev = dev; + route->digipeat = NULL; + route->ip_mode = ' '; + + return route; +} + +struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, + ax25_address *dest, ax25_digi *digi) +{ + struct sk_buff *skbn; + unsigned char *bp; + int len; + + len = digi->ndigi * AX25_ADDR_LEN; + + if (skb_headroom(skb) < len) { + if ((skbn = skb_realloc_headroom(skb, len)) == NULL) { + printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n"); + return NULL; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + + skb = skbn; + } + + bp = skb_push(skb, len); + + ax25_addr_build(bp, src, dest, digi, AX25_COMMAND, AX25_MODULUS); + + return skb; +} + +/* + * Free all memory associated with routing structures. + */ +void __exit ax25_rt_free(void) +{ + ax25_route *s, *ax25_rt = ax25_route_list; + + write_lock(&ax25_route_lock); + while (ax25_rt != NULL) { + s = ax25_rt; + ax25_rt = ax25_rt->next; + + if (s->digipeat != NULL) + kfree(s->digipeat); + + kfree(s); + } + write_unlock(&ax25_route_lock); +} diff --git a/net/ax25/ax25_std_in.c b/net/ax25/ax25_std_in.c new file mode 100644 index 000000000000..7131873322c4 --- /dev/null +++ b/net/ax25/ax25_std_in.c @@ -0,0 +1,449 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Hans-Joachim Hetscher DD8NE (dd8ne@bnv-bamberg.de) + * + * Most of this code is based on the SDL diagrams published in the 7th ARRL + * Computer Networking Conference papers. The diagrams have mistakes in them, + * but are mostly correct. Before you modify the code could you read the SDL + * diagrams as the code is not obvious and probably very easy to break. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For ip_rcv */ +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file ax25_std_timer.c. + * Handling of state 0 and connection release is in ax25.c. + */ +static int ax25_std_state1_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_SABME: + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE); + break; + + case AX25_UA: + if (pf) { + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_ESTABLISHED; + /* For WAIT_SABM connections we will produce an accept ready socket here */ + if (!sock_flag(ax25->sk, SOCK_DEAD)) + ax25->sk->sk_state_change(ax25->sk); + bh_unlock_sock(ax25->sk); + } + } + break; + + case AX25_DM: + if (pf) { + if (ax25->modulus == AX25_MODULUS) { + ax25_disconnect(ax25, ECONNREFUSED); + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } + } + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Release State. + * The handling of the timer(s) is in file ax25_std_timer.c + * Handling of state 0 and connection release is in ax25.c. + */ +static int ax25_std_state2_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int pf, int type) +{ + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + ax25_send_control(ax25, AX25_DM, pf, AX25_RESPONSE); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + case AX25_UA: + if (pf) + ax25_disconnect(ax25, 0); + break; + + case AX25_I: + case AX25_REJ: + case AX25_RNR: + case AX25_RR: + if (pf) ax25_send_control(ax25, AX25_DM, AX25_POLLON, AX25_RESPONSE); + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file ax25_std_timer.c + * Handling of state 0 and connection release is in ax25.c. 
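+ * Here in-sequence I frames (N(S) == V(R)) are handed to ax25_rx_iframe()
+ * and V(R) advances; an out-of-sequence I frame provokes a single REJ
+ * until the gap is filled, and every valid N(R) acknowledges outstanding
+ * I frames.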
+ */ +static int ax25_std_state3_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type) +{ + int queued = 0; + + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + if (frametype == AX25_SABM) { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } else { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + } + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->condition = 0x00; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25_requeue_frames(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + ax25_disconnect(ax25, ECONNRESET); + break; + + case AX25_RR: + case AX25_RNR: + if (frametype == AX25_RR) + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + else + ax25->condition |= AX25_COND_PEER_RX_BUSY; + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_check_iframes_acked(ax25, nr); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_REJ: + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + ax25_calculate_rtt(ax25); + ax25_stop_t1timer(ax25); + ax25_start_t3timer(ax25); + ax25_requeue_frames(ax25); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_I: + if (!ax25_validate_nr(ax25, nr)) { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + break; + } + if (ax25->condition & AX25_COND_PEER_RX_BUSY) { + ax25_frames_acked(ax25, nr); + } else { + ax25_check_iframes_acked(ax25, nr); + } + if (ax25->condition & AX25_COND_OWN_RX_BUSY) { + if (pf) ax25_std_enquiry_response(ax25); + break; + } + if (ns == ax25->vr) { + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25->vr = ns; /* ax25->vr - 1 */ + ax25->condition &= ~AX25_COND_REJECT; + if (pf) { + ax25_std_enquiry_response(ax25); + } else { + if (!(ax25->condition & AX25_COND_ACK_PENDING)) { + ax25->condition |= AX25_COND_ACK_PENDING; + ax25_start_t2timer(ax25); + } + } + } else { + if (ax25->condition & AX25_COND_REJECT) { + if (pf) ax25_std_enquiry_response(ax25); + } else { + ax25->condition |= AX25_COND_REJECT; + ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE); + ax25->condition &= ~AX25_COND_ACK_PENDING; + } + } + break; + + case AX25_FRMR: + case AX25_ILLEGAL: + ax25_std_establish_data_link(ax25); + ax25->state = AX25_STATE_1; + break; + + default: + break; + } + + return queued; +} + +/* + * State machine for state 4, Timer Recovery State. + * The handling of the timer(s) is in file ax25_std_timer.c + * Handling of state 0 and connection release is in ax25.c. 
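+ * State 4 is reached after an enquiry (RR/RNR poll) has been sent
+ * because T1 or T3 expired. An RR, RNR or REJ response with the F bit
+ * set and a valid N(R) resets the retry count and, once everything
+ * outstanding is acknowledged, takes the link back to state 3.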
+ */ +static int ax25_std_state4_machine(ax25_cb *ax25, struct sk_buff *skb, int frametype, int ns, int nr, int pf, int type) +{ + int queued = 0; + + switch (frametype) { + case AX25_SABM: + case AX25_SABME: + if (frametype == AX25_SABM) { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + } else { + ax25->modulus = AX25_EMODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_EWINDOW]; + } + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t3timer(ax25); + ax25_start_idletimer(ax25); + ax25->condition = 0x00; + ax25->vs = 0; + ax25->va = 0; + ax25->vr = 0; + ax25->state = AX25_STATE_3; + ax25->n2count = 0; + ax25_requeue_frames(ax25); + break; + + case AX25_DISC: + ax25_send_control(ax25, AX25_UA, pf, AX25_RESPONSE); + ax25_disconnect(ax25, 0); + break; + + case AX25_DM: + ax25_disconnect(ax25, ECONNRESET); + break; + + case AX25_RR: + case AX25_RNR: + if (frametype == AX25_RR) + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + else + ax25->condition |= AX25_COND_PEER_RX_BUSY; + if (type == AX25_RESPONSE && pf) { + ax25_stop_t1timer(ax25); + ax25->n2count = 0; + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + if (ax25->vs == ax25->va) { + ax25_start_t3timer(ax25); + ax25->state = AX25_STATE_3; + } else { + ax25_requeue_frames(ax25); + } + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + } + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_REJ: + ax25->condition &= ~AX25_COND_PEER_RX_BUSY; + if (pf && type == AX25_RESPONSE) { + ax25_stop_t1timer(ax25); + ax25->n2count = 0; + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + if (ax25->vs == ax25->va) { + ax25_start_t3timer(ax25); + ax25->state = AX25_STATE_3; + } else { + ax25_requeue_frames(ax25); + } + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + } + if (type == AX25_COMMAND && pf) + ax25_std_enquiry_response(ax25); + if (ax25_validate_nr(ax25, nr)) { + ax25_frames_acked(ax25, nr); + ax25_requeue_frames(ax25); + } else { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + } + break; + + case AX25_I: + if (!ax25_validate_nr(ax25, nr)) { + ax25_std_nr_error_recovery(ax25); + ax25->state = AX25_STATE_1; + break; + } + ax25_frames_acked(ax25, nr); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) { + if (pf) + ax25_std_enquiry_response(ax25); + break; + } + if (ns == ax25->vr) { + ax25->vr = (ax25->vr + 1) % ax25->modulus; + queued = ax25_rx_iframe(ax25, skb); + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25->vr = ns; /* ax25->vr - 1 */ + ax25->condition &= ~AX25_COND_REJECT; + if (pf) { + ax25_std_enquiry_response(ax25); + } else { + if (!(ax25->condition & AX25_COND_ACK_PENDING)) { + ax25->condition |= AX25_COND_ACK_PENDING; + ax25_start_t2timer(ax25); + } + } + } else { + if (ax25->condition & AX25_COND_REJECT) { + if (pf) ax25_std_enquiry_response(ax25); + } else { + ax25->condition |= AX25_COND_REJECT; + ax25_send_control(ax25, AX25_REJ, pf, AX25_RESPONSE); + ax25->condition &= ~AX25_COND_ACK_PENDING; + } + } + break; + + case AX25_FRMR: + case AX25_ILLEGAL: + ax25_std_establish_data_link(ax25); + ax25->state = AX25_STATE_1; + break; + + default: + break; + } + + return queued; +} + +/* + * Higher 
level upcall for a LAPB frame + */ +int ax25_std_frame_in(ax25_cb *ax25, struct sk_buff *skb, int type) +{ + int queued = 0, frametype, ns, nr, pf; + + frametype = ax25_decode(ax25, skb, &ns, &nr, &pf); + + switch (ax25->state) { + case AX25_STATE_1: + queued = ax25_std_state1_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_2: + queued = ax25_std_state2_machine(ax25, skb, frametype, pf, type); + break; + case AX25_STATE_3: + queued = ax25_std_state3_machine(ax25, skb, frametype, ns, nr, pf, type); + break; + case AX25_STATE_4: + queued = ax25_std_state4_machine(ax25, skb, frametype, ns, nr, pf, type); + break; + } + + ax25_kick(ax25); + + return queued; +} diff --git a/net/ax25/ax25_std_subr.c b/net/ax25/ax25_std_subr.c new file mode 100644 index 000000000000..2b3c801ae486 --- /dev/null +++ b/net/ax25/ax25_std_subr.c @@ -0,0 +1,88 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void ax25_std_nr_error_recovery(ax25_cb *ax25) +{ + ax25_std_establish_data_link(ax25); +} + +void ax25_std_establish_data_link(ax25_cb *ax25) +{ + ax25->condition = 0x00; + ax25->n2count = 0; + + if (ax25->modulus == AX25_MODULUS) + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND); + + ax25_calculate_t1(ax25); + ax25_stop_idletimer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_t2timer(ax25); + ax25_start_t1timer(ax25); +} + +void ax25_std_transmit_enquiry(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25_send_control(ax25, AX25_RNR, AX25_POLLON, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_RR, AX25_POLLON, AX25_COMMAND); + + ax25->condition &= ~AX25_COND_ACK_PENDING; + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); +} + +void ax25_std_enquiry_response(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25_send_control(ax25, AX25_RNR, AX25_POLLON, AX25_RESPONSE); + else + ax25_send_control(ax25, AX25_RR, AX25_POLLON, AX25_RESPONSE); + + ax25->condition &= ~AX25_COND_ACK_PENDING; +} + +void ax25_std_timeout_response(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_OWN_RX_BUSY) + ax25_send_control(ax25, AX25_RNR, AX25_POLLOFF, AX25_RESPONSE); + else + ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE); + + ax25->condition &= ~AX25_COND_ACK_PENDING; +} diff --git a/net/ax25/ax25_std_timer.c b/net/ax25/ax25_std_timer.c new file mode 100644 index 000000000000..066897bc0749 --- /dev/null +++ b/net/ax25/ax25_std_timer.c @@ -0,0 +1,177 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void ax25_std_heartbeat_expiry(ax25_cb *ax25) +{ + struct sock *sk=ax25->sk; + + if (sk) + bh_lock_sock(sk); + + switch (ax25->state) { + case AX25_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (!sk || sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && + sock_flag(sk, SOCK_DEAD))) { + if (sk) { + sock_hold(sk); + ax25_destroy_socket(ax25); + bh_unlock_sock(sk); + sock_put(sk); + } else + ax25_destroy_socket(ax25); + return; + } + break; + + case AX25_STATE_3: + case AX25_STATE_4: + /* + * Check the state of the receive buffer. + */ + if (sk != NULL) { + if (atomic_read(&sk->sk_rmem_alloc) < + (sk->sk_rcvbuf / 2) && + (ax25->condition & AX25_COND_OWN_RX_BUSY)) { + ax25->condition &= ~AX25_COND_OWN_RX_BUSY; + ax25->condition &= ~AX25_COND_ACK_PENDING; + ax25_send_control(ax25, AX25_RR, AX25_POLLOFF, AX25_RESPONSE); + break; + } + } + } + + if (sk) + bh_unlock_sock(sk); + + ax25_start_heartbeat(ax25); +} + +void ax25_std_t2timer_expiry(ax25_cb *ax25) +{ + if (ax25->condition & AX25_COND_ACK_PENDING) { + ax25->condition &= ~AX25_COND_ACK_PENDING; + ax25_std_timeout_response(ax25); + } +} + +void ax25_std_t3timer_expiry(ax25_cb *ax25) +{ + ax25->n2count = 0; + ax25_std_transmit_enquiry(ax25); + ax25->state = AX25_STATE_4; +} + +void ax25_std_idletimer_expiry(ax25_cb *ax25) +{ + ax25_clear_queues(ax25); + + ax25->n2count = 0; + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25->state = AX25_STATE_2; + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_CLOSE; + ax25->sk->sk_err = 0; + ax25->sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(ax25->sk, SOCK_DEAD)) { + ax25->sk->sk_state_change(ax25->sk); + sock_set_flag(ax25->sk, SOCK_DEAD); + } + bh_unlock_sock(ax25->sk); + } +} + +void ax25_std_t1timer_expiry(ax25_cb *ax25) +{ + switch (ax25->state) { + case AX25_STATE_1: + if (ax25->n2count == ax25->n2) { + if (ax25->modulus == AX25_MODULUS) { + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->modulus = AX25_MODULUS; + ax25->window = ax25->ax25_dev->values[AX25_VALUES_WINDOW]; + ax25->n2count = 0; + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + } + } else { + ax25->n2count++; + if (ax25->modulus == AX25_MODULUS) + ax25_send_control(ax25, AX25_SABM, AX25_POLLON, AX25_COMMAND); + else + ax25_send_control(ax25, AX25_SABME, AX25_POLLON, AX25_COMMAND); + } + break; + + case AX25_STATE_2: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + ax25_send_control(ax25, AX25_DISC, AX25_POLLON, AX25_COMMAND); + } + break; + + case AX25_STATE_3: + ax25->n2count = 1; + ax25_std_transmit_enquiry(ax25); + ax25->state = AX25_STATE_4; + break; + + case AX25_STATE_4: + if (ax25->n2count == ax25->n2) { + ax25_send_control(ax25, AX25_DM, AX25_POLLON, 
AX25_RESPONSE); + ax25_disconnect(ax25, ETIMEDOUT); + return; + } else { + ax25->n2count++; + ax25_std_transmit_enquiry(ax25); + } + break; + } + + ax25_calculate_t1(ax25); + ax25_start_t1timer(ax25); +} diff --git a/net/ax25/ax25_subr.c b/net/ax25/ax25_subr.c new file mode 100644 index 000000000000..8cf72707af8b --- /dev/null +++ b/net/ax25/ax25_subr.c @@ -0,0 +1,295 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all the queues of frames. + */ +void ax25_clear_queues(ax25_cb *ax25) +{ + skb_queue_purge(&ax25->write_queue); + skb_queue_purge(&ax25->ack_queue); + skb_queue_purge(&ax25->reseq_queue); + skb_queue_purge(&ax25->frag_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void ax25_frames_acked(ax25_cb *ax25, unsigned short nr) +{ + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (ax25->va != nr) { + while (skb_peek(&ax25->ack_queue) != NULL && ax25->va != nr) { + skb = skb_dequeue(&ax25->ack_queue); + kfree_skb(skb); + ax25->va = (ax25->va + 1) % ax25->modulus; + } + } +} + +void ax25_requeue_frames(ax25_cb *ax25) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by ax25_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ + while ((skb = skb_dequeue(&ax25->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&ax25->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int ax25_validate_nr(ax25_cb *ax25, unsigned short nr) +{ + unsigned short vc = ax25->va; + + while (vc != ax25->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % ax25->modulus; + } + + if (nr == ax25->vs) return 1; + + return 0; +} + +/* + * This routine is the centralised routine for parsing the control + * information for the different frame formats. 
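+ * The low-order bits of the first control byte select the class:
+ * xxxxxxx0 is an I frame, xxxxxx01 a supervisory frame and xxxxxx11 an
+ * unnumbered frame. Under modulo 8, for instance, 0xb4 decodes as an I
+ * frame with N(S) = 2, N(R) = 5 and P/F set.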
+ */ +int ax25_decode(ax25_cb *ax25, struct sk_buff *skb, int *ns, int *nr, int *pf) +{ + unsigned char *frame; + int frametype = AX25_ILLEGAL; + + frame = skb->data; + *ns = *nr = *pf = 0; + + if (ax25->modulus == AX25_MODULUS) { + if ((frame[0] & AX25_S) == 0) { + frametype = AX25_I; /* I frame - carries NR/NS/PF */ + *ns = (frame[0] >> 1) & 0x07; + *nr = (frame[0] >> 5) & 0x07; + *pf = frame[0] & AX25_PF; + } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */ + frametype = frame[0] & 0x0F; + *nr = (frame[0] >> 5) & 0x07; + *pf = frame[0] & AX25_PF; + } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */ + frametype = frame[0] & ~AX25_PF; + *pf = frame[0] & AX25_PF; + } + skb_pull(skb, 1); + } else { + if ((frame[0] & AX25_S) == 0) { + frametype = AX25_I; /* I frame - carries NR/NS/PF */ + *ns = (frame[0] >> 1) & 0x7F; + *nr = (frame[1] >> 1) & 0x7F; + *pf = frame[1] & AX25_EPF; + skb_pull(skb, 2); + } else if ((frame[0] & AX25_U) == 1) { /* S frame - take out PF/NR */ + frametype = frame[0] & 0x0F; + *nr = (frame[1] >> 1) & 0x7F; + *pf = frame[1] & AX25_EPF; + skb_pull(skb, 2); + } else if ((frame[0] & AX25_U) == 3) { /* U frame - take out PF */ + frametype = frame[0] & ~AX25_PF; + *pf = frame[0] & AX25_PF; + skb_pull(skb, 1); + } + } + + return frametype; +} + +/* + * This routine is called when the HDLC layer internally generates a + * command or response for the remote machine ( eg. RR, UA etc. ). + * Only supervisory or unnumbered frames are processed. + */ +void ax25_send_control(ax25_cb *ax25, int frametype, int poll_bit, int type) +{ + struct sk_buff *skb; + unsigned char *dptr; + + if ((skb = alloc_skb(ax25->ax25_dev->dev->hard_header_len + 2, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, ax25->ax25_dev->dev->hard_header_len); + + skb->nh.raw = skb->data; + + /* Assume a response - address structure for DTE */ + if (ax25->modulus == AX25_MODULUS) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= (poll_bit) ? AX25_PF : 0; + if ((frametype & AX25_U) == AX25_S) /* S frames carry NR */ + *dptr |= (ax25->vr << 5); + } else { + if ((frametype & AX25_U) == AX25_U) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= (poll_bit) ? AX25_PF : 0; + } else { + dptr = skb_put(skb, 2); + dptr[0] = frametype; + dptr[1] = (ax25->vr << 1); + dptr[1] |= (poll_bit) ? AX25_EPF : 0; + } + } + + ax25_transmit_buffer(ax25, skb, type); +} + +/* + * Send a 'DM' to an unknown connection attempt, or an invalid caller. 
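+ * The digipeater path of the offending frame is inverted so that the
+ * DM retraces the route the frame arrived on.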
+ * + * Note: src here is the sender, thus it's the target of the DM + */ +void ax25_return_dm(struct net_device *dev, ax25_address *src, ax25_address *dest, ax25_digi *digi) +{ + struct sk_buff *skb; + char *dptr; + ax25_digi retdigi; + + if (dev == NULL) + return; + + if ((skb = alloc_skb(dev->hard_header_len + 1, GFP_ATOMIC)) == NULL) + return; /* Next SABM will get DM'd */ + + skb_reserve(skb, dev->hard_header_len); + skb->nh.raw = skb->data; + + ax25_digi_invert(digi, &retdigi); + + dptr = skb_put(skb, 1); + + *dptr = AX25_DM | AX25_PF; + + /* + * Do the address ourselves + */ + dptr = skb_push(skb, ax25_addr_size(digi)); + dptr += ax25_addr_build(dptr, dest, src, &retdigi, AX25_RESPONSE, AX25_MODULUS); + + skb->dev = dev; + + ax25_queue_xmit(skb); +} + +/* + * Exponential backoff for AX.25 + */ +void ax25_calculate_t1(ax25_cb *ax25) +{ + int n, t = 2; + + switch (ax25->backoff) { + case 0: + break; + + case 1: + t += 2 * ax25->n2count; + break; + + case 2: + for (n = 0; n < ax25->n2count; n++) + t *= 2; + if (t > 8) t = 8; + break; + } + + ax25->t1 = t * ax25->rtt; +} + +/* + * Calculate the Round Trip Time + */ +void ax25_calculate_rtt(ax25_cb *ax25) +{ + if (ax25->backoff == 0) + return; + + if (ax25_t1timer_running(ax25) && ax25->n2count == 0) + ax25->rtt = (9 * ax25->rtt + ax25->t1 - ax25_display_timer(&ax25->t1timer)) / 10; + + if (ax25->rtt < AX25_T1CLAMPLO) + ax25->rtt = AX25_T1CLAMPLO; + + if (ax25->rtt > AX25_T1CLAMPHI) + ax25->rtt = AX25_T1CLAMPHI; +} + +void ax25_disconnect(ax25_cb *ax25, int reason) +{ + ax25_clear_queues(ax25); + + ax25_stop_t1timer(ax25); + ax25_stop_t2timer(ax25); + ax25_stop_t3timer(ax25); + ax25_stop_idletimer(ax25); + + ax25->state = AX25_STATE_0; + + ax25_link_failed(ax25, reason); + + if (ax25->sk != NULL) { + bh_lock_sock(ax25->sk); + ax25->sk->sk_state = TCP_CLOSE; + ax25->sk->sk_err = reason; + ax25->sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(ax25->sk, SOCK_DEAD)) { + ax25->sk->sk_state_change(ax25->sk); + sock_set_flag(ax25->sk, SOCK_DEAD); + } + bh_unlock_sock(ax25->sk); + } +} diff --git a/net/ax25/ax25_timer.c b/net/ax25/ax25_timer.c new file mode 100644 index 000000000000..7a6b50a14554 --- /dev/null +++ b/net/ax25/ax25_timer.c @@ -0,0 +1,243 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) + * Copyright (C) Darryl Miles G7LED (dlm@g7led.demon.co.uk) + * Copyright (C) Joerg Reuter DL1BKE (jreuter@yaina.de) + * Copyright (C) Frederic Rible F1OAT (frible@teaser.fr) + * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void ax25_heartbeat_expiry(unsigned long); +static void ax25_t1timer_expiry(unsigned long); +static void ax25_t2timer_expiry(unsigned long); +static void ax25_t3timer_expiry(unsigned long); +static void ax25_idletimer_expiry(unsigned long); + +void ax25_start_heartbeat(ax25_cb *ax25) +{ + del_timer(&ax25->timer); + + ax25->timer.data = (unsigned long)ax25; + ax25->timer.function = &ax25_heartbeat_expiry; + ax25->timer.expires = jiffies + 5 * HZ; + + add_timer(&ax25->timer); +} + +void ax25_start_t1timer(ax25_cb *ax25) +{ + del_timer(&ax25->t1timer); + + ax25->t1timer.data = (unsigned long)ax25; + ax25->t1timer.function = &ax25_t1timer_expiry; + ax25->t1timer.expires = jiffies + ax25->t1; + + add_timer(&ax25->t1timer); +} + +void ax25_start_t2timer(ax25_cb *ax25) +{ + del_timer(&ax25->t2timer); + + ax25->t2timer.data = (unsigned long)ax25; + ax25->t2timer.function = &ax25_t2timer_expiry; + ax25->t2timer.expires = jiffies + ax25->t2; + + add_timer(&ax25->t2timer); +} + +void ax25_start_t3timer(ax25_cb *ax25) +{ + del_timer(&ax25->t3timer); + + if (ax25->t3 > 0) { + ax25->t3timer.data = (unsigned long)ax25; + ax25->t3timer.function = &ax25_t3timer_expiry; + ax25->t3timer.expires = jiffies + ax25->t3; + + add_timer(&ax25->t3timer); + } +} + +void ax25_start_idletimer(ax25_cb *ax25) +{ + del_timer(&ax25->idletimer); + + if (ax25->idle > 0) { + ax25->idletimer.data = (unsigned long)ax25; + ax25->idletimer.function = &ax25_idletimer_expiry; + ax25->idletimer.expires = jiffies + ax25->idle; + + add_timer(&ax25->idletimer); + } +} + +void ax25_stop_heartbeat(ax25_cb *ax25) +{ + del_timer(&ax25->timer); +} + +void ax25_stop_t1timer(ax25_cb *ax25) +{ + del_timer(&ax25->t1timer); +} + +void ax25_stop_t2timer(ax25_cb *ax25) +{ + del_timer(&ax25->t2timer); +} + +void ax25_stop_t3timer(ax25_cb *ax25) +{ + del_timer(&ax25->t3timer); +} + +void ax25_stop_idletimer(ax25_cb *ax25) +{ + del_timer(&ax25->idletimer); +} + +int ax25_t1timer_running(ax25_cb *ax25) +{ + return timer_pending(&ax25->t1timer); +} + +unsigned long ax25_display_timer(struct timer_list *timer) +{ + if (!timer_pending(timer)) + return 0; + + return timer->expires - jiffies; +} + +static void ax25_heartbeat_expiry(unsigned long param) +{ + int proto = AX25_PROTO_STD_SIMPLEX; + ax25_cb *ax25 = (ax25_cb *)param; + + if (ax25->ax25_dev) + proto = ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]; + + switch (proto) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_heartbeat_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25->ax25_dev->dama.slave) + ax25_ds_heartbeat_expiry(ax25); + else + ax25_std_heartbeat_expiry(ax25); + break; +#endif + } +} + +static void ax25_t1timer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case 
AX25_PROTO_STD_DUPLEX: + ax25_std_t1timer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) + ax25_std_t1timer_expiry(ax25); + break; +#endif + } +} + +static void ax25_t2timer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_t2timer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (!ax25->ax25_dev->dama.slave) + ax25_std_t2timer_expiry(ax25); + break; +#endif + } +} + +static void ax25_t3timer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_t3timer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25->ax25_dev->dama.slave) + ax25_ds_t3timer_expiry(ax25); + else + ax25_std_t3timer_expiry(ax25); + break; +#endif + } +} + +static void ax25_idletimer_expiry(unsigned long param) +{ + ax25_cb *ax25 = (ax25_cb *)param; + + switch (ax25->ax25_dev->values[AX25_VALUES_PROTOCOL]) { + case AX25_PROTO_STD_SIMPLEX: + case AX25_PROTO_STD_DUPLEX: + ax25_std_idletimer_expiry(ax25); + break; + +#ifdef CONFIG_AX25_DAMA_SLAVE + case AX25_PROTO_DAMA_SLAVE: + if (ax25->ax25_dev->dama.slave) + ax25_ds_idletimer_expiry(ax25); + else + ax25_std_idletimer_expiry(ax25); + break; +#endif + } +} diff --git a/net/ax25/ax25_uid.c b/net/ax25/ax25_uid.c new file mode 100644 index 000000000000..cea6b7d19729 --- /dev/null +++ b/net/ax25/ax25_uid.c @@ -0,0 +1,228 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Callsign/UID mapper. This is in kernel space for security on multi-amateur machines. 
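ax25_uid.c below keeps the uid-to-callsign associations in a simple singly linked list protected by a rwlock: SIOCAX25ADDUID pushes new entries at the head, while SIOCAX25GETUID and SIOCAX25DELUID walk the list. A minimal user-space sketch of the same add/lookup pattern, with hypothetical types and no locking, might look like this:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical user-space stand-in for the uid -> callsign association
 * list kept by ax25_uid.c: entries are pushed at the head and found by
 * a linear walk, as the ioctl handlers do. */
struct uid_assoc {
    unsigned int uid;
    char call[10];
    struct uid_assoc *next;
};

static struct uid_assoc *uid_list;

static int uid_add(unsigned int uid, const char *call)
{
    struct uid_assoc *a = malloc(sizeof(*a));

    if (!a)
        return -1;
    a->uid = uid;
    snprintf(a->call, sizeof(a->call), "%s", call);
    a->next = uid_list;    /* push at head, like ax25_uid_list */
    uid_list = a;
    return 0;
}

static const char *uid_find(unsigned int uid)
{
    struct uid_assoc *a;

    for (a = uid_list; a; a = a->next)
        if (a->uid == uid)
            return a->call;
    return NULL;
}

int main(void)
{
    uid_add(1000, "G4KLX-1");
    uid_add(1001, "GW4PTS-2");
    printf("uid 1001 maps to %s\n", uid_find(1001));
    printf("uid 2000 maps to %s\n",
           uid_find(2000) ? uid_find(2000) : "(no association)");
    return 0;
}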
+ */ + +static ax25_uid_assoc *ax25_uid_list; +static DEFINE_RWLOCK(ax25_uid_lock); + +int ax25_uid_policy = 0; + +ax25_address *ax25_findbyuid(uid_t uid) +{ + ax25_uid_assoc *ax25_uid; + ax25_address *res = NULL; + + read_lock(&ax25_uid_lock); + for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { + if (ax25_uid->uid == uid) { + res = &ax25_uid->call; + break; + } + } + read_unlock(&ax25_uid_lock); + + return NULL; +} + +int ax25_uid_ioctl(int cmd, struct sockaddr_ax25 *sax) +{ + ax25_uid_assoc *s, *ax25_uid; + unsigned long res; + + switch (cmd) { + case SIOCAX25GETUID: + res = -ENOENT; + read_lock(&ax25_uid_lock); + for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { + if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { + res = ax25_uid->uid; + break; + } + } + read_unlock(&ax25_uid_lock); + + return res; + + case SIOCAX25ADDUID: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (ax25_findbyuid(sax->sax25_uid)) + return -EEXIST; + if (sax->sax25_uid == 0) + return -EINVAL; + if ((ax25_uid = kmalloc(sizeof(*ax25_uid), GFP_KERNEL)) == NULL) + return -ENOMEM; + + ax25_uid->uid = sax->sax25_uid; + ax25_uid->call = sax->sax25_call; + + write_lock(&ax25_uid_lock); + ax25_uid->next = ax25_uid_list; + ax25_uid_list = ax25_uid; + write_unlock(&ax25_uid_lock); + + return 0; + + case SIOCAX25DELUID: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + write_lock(&ax25_uid_lock); + for (ax25_uid = ax25_uid_list; ax25_uid != NULL; ax25_uid = ax25_uid->next) { + if (ax25cmp(&sax->sax25_call, &ax25_uid->call) == 0) { + break; + } + } + if (ax25_uid == NULL) { + write_unlock(&ax25_uid_lock); + return -ENOENT; + } + if ((s = ax25_uid_list) == ax25_uid) { + ax25_uid_list = s->next; + write_unlock(&ax25_uid_lock); + kfree(ax25_uid); + return 0; + } + while (s != NULL && s->next != NULL) { + if (s->next == ax25_uid) { + s->next = ax25_uid->next; + write_unlock(&ax25_uid_lock); + kfree(ax25_uid); + return 0; + } + s = s->next; + } + write_unlock(&ax25_uid_lock); + + return -ENOENT; + + default: + return -EINVAL; + } + + return -EINVAL; /*NOTREACHED */ +} + +#ifdef CONFIG_PROC_FS + +static void *ax25_uid_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ax25_uid_assoc *pt; + int i = 1; + + read_lock(&ax25_uid_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (pt = ax25_uid_list; pt != NULL; pt = pt->next) { + if (i == *pos) + return pt; + ++i; + } + return NULL; +} + +static void *ax25_uid_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (v == SEQ_START_TOKEN) ? 
ax25_uid_list : + ((struct ax25_uid_assoc *) v)->next; +} + +static void ax25_uid_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&ax25_uid_lock); +} + +static int ax25_uid_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "Policy: %d\n", ax25_uid_policy); + else { + struct ax25_uid_assoc *pt = v; + + + seq_printf(seq, "%6d %s\n", pt->uid, ax2asc(&pt->call)); + } + return 0; +} + +static struct seq_operations ax25_uid_seqops = { + .start = ax25_uid_seq_start, + .next = ax25_uid_seq_next, + .stop = ax25_uid_seq_stop, + .show = ax25_uid_seq_show, +}; + +static int ax25_uid_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ax25_uid_seqops); +} + +struct file_operations ax25_uid_fops = { + .owner = THIS_MODULE, + .open = ax25_uid_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +/* + * Free all memory associated with UID/Callsign structures. + */ +void __exit ax25_uid_free(void) +{ + ax25_uid_assoc *s, *ax25_uid; + + write_lock(&ax25_uid_lock); + ax25_uid = ax25_uid_list; + while (ax25_uid != NULL) { + s = ax25_uid; + ax25_uid = ax25_uid->next; + + kfree(s); + } + ax25_uid_list = NULL; + write_unlock(&ax25_uid_lock); +} diff --git a/net/ax25/sysctl_net_ax25.c b/net/ax25/sysctl_net_ax25.c new file mode 100644 index 000000000000..f67711f2ee96 --- /dev/null +++ b/net/ax25/sysctl_net_ax25.c @@ -0,0 +1,262 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) + */ +#include +#include +#include +#include +#include + +static int min_ipdefmode[1], max_ipdefmode[] = {1}; +static int min_axdefmode[1], max_axdefmode[] = {1}; +static int min_backoff[1], max_backoff[] = {2}; +static int min_conmode[1], max_conmode[] = {2}; +static int min_window[] = {1}, max_window[] = {7}; +static int min_ewindow[] = {1}, max_ewindow[] = {63}; +static int min_t1[] = {1}, max_t1[] = {30 * HZ}; +static int min_t2[] = {1}, max_t2[] = {20 * HZ}; +static int min_t3[1], max_t3[] = {3600 * HZ}; +static int min_idle[1], max_idle[] = {65535 * HZ}; +static int min_n2[] = {1}, max_n2[] = {31}; +static int min_paclen[] = {1}, max_paclen[] = {512}; +static int min_proto[1], max_proto[] = {3}; +static int min_ds_timeout[1], max_ds_timeout[] = {65535 * HZ}; + +static struct ctl_table_header *ax25_table_header; + +static ctl_table *ax25_table; +static int ax25_table_size; + +static ctl_table ax25_dir_table[] = { + { + .ctl_name = NET_AX25, + .procname = "ax25", + .mode = 0555, + }, + { .ctl_name = 0 } +}; + +static ctl_table ax25_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ax25_dir_table + }, + { .ctl_name = 0 } +}; + +static const ctl_table ax25_param_table[] = { + { + .ctl_name = NET_AX25_IP_DEFAULT_MODE, + .procname = "ip_default_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ipdefmode, + .extra2 = &max_ipdefmode + }, + { + .ctl_name = NET_AX25_DEFAULT_MODE, + .procname = "ax25_default_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_axdefmode, + .extra2 = &max_axdefmode + }, + { + .ctl_name = NET_AX25_BACKOFF_TYPE, + 
.procname = "backoff_type", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_backoff, + .extra2 = &max_backoff + }, + { + .ctl_name = NET_AX25_CONNECT_MODE, + .procname = "connect_mode", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_conmode, + .extra2 = &max_conmode + }, + { + .ctl_name = NET_AX25_STANDARD_WINDOW, + .procname = "standard_window_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_window, + .extra2 = &max_window + }, + { + .ctl_name = NET_AX25_EXTENDED_WINDOW, + .procname = "extended_window_size", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ewindow, + .extra2 = &max_ewindow + }, + { + .ctl_name = NET_AX25_T1_TIMEOUT, + .procname = "t1_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_t1, + .extra2 = &max_t1 + }, + { + .ctl_name = NET_AX25_T2_TIMEOUT, + .procname = "t2_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_t2, + .extra2 = &max_t2 + }, + { + .ctl_name = NET_AX25_T3_TIMEOUT, + .procname = "t3_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_t3, + .extra2 = &max_t3 + }, + { + .ctl_name = NET_AX25_IDLE_TIMEOUT, + .procname = "idle_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_idle, + .extra2 = &max_idle + }, + { + .ctl_name = NET_AX25_N2, + .procname = "maximum_retry_count", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_n2, + .extra2 = &max_n2 + }, + { + .ctl_name = NET_AX25_PACLEN, + .procname = "maximum_packet_length", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_paclen, + .extra2 = &max_paclen + }, + { + .ctl_name = NET_AX25_PROTOCOL, + .procname = "protocol", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_proto, + .extra2 = &max_proto + }, + { + .ctl_name = NET_AX25_DAMA_SLAVE_TIMEOUT, + .procname = "dama_slave_timeout", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ds_timeout, + .extra2 = &max_ds_timeout + }, + { .ctl_name = 0 } /* that's all, folks! 
*/ +}; + +void ax25_register_sysctl(void) +{ + ax25_dev *ax25_dev; + int n, k; + + spin_lock_bh(&ax25_dev_lock); + for (ax25_table_size = sizeof(ctl_table), ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) + ax25_table_size += sizeof(ctl_table); + + if ((ax25_table = kmalloc(ax25_table_size, GFP_ATOMIC)) == NULL) { + spin_unlock_bh(&ax25_dev_lock); + return; + } + + memset(ax25_table, 0x00, ax25_table_size); + + for (n = 0, ax25_dev = ax25_dev_list; ax25_dev != NULL; ax25_dev = ax25_dev->next) { + ctl_table *child = kmalloc(sizeof(ax25_param_table), GFP_ATOMIC); + if (!child) { + while (n--) + kfree(ax25_table[n].child); + kfree(ax25_table); + spin_unlock_bh(&ax25_dev_lock); + return; + } + memcpy(child, ax25_param_table, sizeof(ax25_param_table)); + ax25_table[n].child = ax25_dev->systable = child; + ax25_table[n].ctl_name = n + 1; + ax25_table[n].procname = ax25_dev->dev->name; + ax25_table[n].mode = 0555; + +#ifndef CONFIG_AX25_DAMA_SLAVE + /* + * We do not wish to have a representation of this parameter + * in /proc/sys/ when configured *not* to include the + * AX.25 DAMA slave code, do we? + */ + + child[AX25_VALUES_DS_TIMEOUT].procname = NULL; +#endif + + child[AX25_MAX_VALUES].ctl_name = 0; /* just in case... */ + + for (k = 0; k < AX25_MAX_VALUES; k++) + child[k].data = &ax25_dev->values[k]; + + n++; + } + spin_unlock_bh(&ax25_dev_lock); + + ax25_dir_table[0].child = ax25_table; + + ax25_table_header = register_sysctl_table(ax25_root_table, 1); +} + +void ax25_unregister_sysctl(void) +{ + ctl_table *p; + unregister_sysctl_table(ax25_table_header); + + ax25_dir_table[0].child = NULL; + for (p = ax25_table; p->ctl_name; p++) + kfree(p->child); + kfree(ax25_table); +} diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig new file mode 100644 index 000000000000..6929490d095a --- /dev/null +++ b/net/bluetooth/Kconfig @@ -0,0 +1,63 @@ +# +# Bluetooth subsystem configuration +# + +menuconfig BT + depends on NET + tristate "Bluetooth subsystem support" + help + Bluetooth is low-cost, low-power, short-range wireless technology. + It was designed as a replacement for cables and other short-range + technologies like IrDA. Bluetooth operates in personal area range + that typically extends up to 10 meters. More information about + Bluetooth can be found at . + + Linux Bluetooth subsystem consist of several layers: + Bluetooth Core (HCI device and connection manager, scheduler) + HCI Device drivers (Interface to the hardware) + SCO Module (SCO audio links) + L2CAP Module (Logical Link Control and Adaptation Protocol) + RFCOMM Module (RFCOMM Protocol) + BNEP Module (Bluetooth Network Encapsulation Protocol) + CMTP Module (CAPI Message Transport Protocol) + HIDP Module (Human Interface Device Protocol) + + Say Y here to compile Bluetooth support into the kernel or say M to + compile it as module (bluetooth). + + To use Linux Bluetooth subsystem, you will need several user-space + utilities like hciconfig and hcid. These utilities and updates to + Bluetooth kernel modules are provided in the BlueZ packages. + For more information, see . + +config BT_L2CAP + tristate "L2CAP protocol support" + depends on BT + help + L2CAP (Logical Link Control and Adaptation Protocol) provides + connection oriented and connection-less data transport. L2CAP + support is required for most Bluetooth applications. + + Say Y here to compile L2CAP support into the kernel or say M to + compile it as module (l2cap). 
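The ax25_register_sysctl() routine above builds the per-device /proc/sys/net/ax25/ trees by cloning one template parameter table for each device and repointing every cloned entry's data field at that device's values[] slot. A rough user-space sketch of that clone-and-rewire pattern, with made-up structure names rather than the kernel's ctl_table, is shown here:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Illustrative stand-in for the "clone a template table per device"
 * pattern in ax25_register_sysctl(): the template is memcpy()'d and each
 * copy's data pointers are rewired to per-device storage. */
struct param_slot {
    const char *name;
    int *data;
};

#define N_PARAMS 3

static const struct param_slot template[N_PARAMS] = {
    { "t1_timeout", NULL },
    { "t2_timeout", NULL },
    { "backoff",    NULL },
};

struct device {
    const char *name;
    int values[N_PARAMS];
    struct param_slot *table;    /* private copy of the template */
};

static int device_register_params(struct device *dev)
{
    int i;

    dev->table = malloc(sizeof(template));
    if (!dev->table)
        return -1;
    memcpy(dev->table, template, sizeof(template));

    for (i = 0; i < N_PARAMS; i++)
        dev->table[i].data = &dev->values[i];    /* per-device storage */
    return 0;
}

int main(void)
{
    struct device ax0 = { .name = "ax0", .values = { 10, 3, 1 } };

    if (device_register_params(&ax0))
        return 1;

    printf("%s: %s = %d\n", ax0.name, ax0.table[0].name, *ax0.table[0].data);
    free(ax0.table);
    return 0;
}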
+ +config BT_SCO + tristate "SCO links support" + depends on BT + help + SCO link provides voice transport over Bluetooth. SCO support is + required for voice applications like Headset and Audio. + + Say Y here to compile SCO support into the kernel or say M to + compile it as module (sco). + +source "net/bluetooth/rfcomm/Kconfig" + +source "net/bluetooth/bnep/Kconfig" + +source "net/bluetooth/cmtp/Kconfig" + +source "net/bluetooth/hidp/Kconfig" + +source "drivers/bluetooth/Kconfig" + diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile new file mode 100644 index 000000000000..d1e433f7d673 --- /dev/null +++ b/net/bluetooth/Makefile @@ -0,0 +1,13 @@ +# +# Makefile for the Linux Bluetooth subsystem. +# + +obj-$(CONFIG_BT) += bluetooth.o +obj-$(CONFIG_BT_L2CAP) += l2cap.o +obj-$(CONFIG_BT_SCO) += sco.o +obj-$(CONFIG_BT_RFCOMM) += rfcomm/ +obj-$(CONFIG_BT_BNEP) += bnep/ +obj-$(CONFIG_BT_CMTP) += cmtp/ +obj-$(CONFIG_BT_HIDP) += hidp/ + +bluetooth-objs := af_bluetooth.o hci_core.o hci_conn.o hci_event.o hci_sock.o hci_sysfs.o lib.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c new file mode 100644 index 000000000000..1650c6bf6997 --- /dev/null +++ b/net/bluetooth/af_bluetooth.c @@ -0,0 +1,355 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth address family and sockets. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_KMOD) +#include +#endif + +#include + +#ifndef CONFIG_BT_SOCK_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +#define VERSION "2.7" + +struct proc_dir_entry *proc_bt; +EXPORT_SYMBOL(proc_bt); + +/* Bluetooth sockets */ +#define BT_MAX_PROTO 8 +static struct net_proto_family *bt_proto[BT_MAX_PROTO]; + +int bt_sock_register(int proto, struct net_proto_family *ops) +{ + if (proto < 0 || proto >= BT_MAX_PROTO) + return -EINVAL; + + if (bt_proto[proto]) + return -EEXIST; + + bt_proto[proto] = ops; + return 0; +} +EXPORT_SYMBOL(bt_sock_register); + +int bt_sock_unregister(int proto) +{ + if (proto < 0 || proto >= BT_MAX_PROTO) + return -EINVAL; + + if (!bt_proto[proto]) + return -ENOENT; + + bt_proto[proto] = NULL; + return 0; +} +EXPORT_SYMBOL(bt_sock_unregister); + +static int bt_sock_create(struct socket *sock, int proto) +{ + int err = 0; + + if (proto < 0 || proto >= BT_MAX_PROTO) + return -EINVAL; + +#if defined(CONFIG_KMOD) + if (!bt_proto[proto]) { + request_module("bt-proto-%d", proto); + } +#endif + err = -EPROTONOSUPPORT; + if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { + err = bt_proto[proto]->create(sock, proto); + module_put(bt_proto[proto]->owner); + } + return err; +} + +void bt_sock_link(struct bt_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_add_node(sk, &l->head); + write_unlock_bh(&l->lock); +} +EXPORT_SYMBOL(bt_sock_link); + +void bt_sock_unlink(struct bt_sock_list *l, struct sock *sk) +{ + write_lock_bh(&l->lock); + sk_del_node_init(sk); + write_unlock_bh(&l->lock); +} +EXPORT_SYMBOL(bt_sock_unlink); + +void bt_accept_enqueue(struct sock *parent, struct sock *sk) +{ + BT_DBG("parent %p, sk %p", parent, sk); + + sock_hold(sk); + list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); + bt_sk(sk)->parent = parent; + parent->sk_ack_backlog++; +} +EXPORT_SYMBOL(bt_accept_enqueue); + +void bt_accept_unlink(struct sock *sk) +{ + BT_DBG("sk %p state %d", sk, sk->sk_state); + + list_del_init(&bt_sk(sk)->accept_q); + bt_sk(sk)->parent->sk_ack_backlog--; + bt_sk(sk)->parent = NULL; + sock_put(sk); +} +EXPORT_SYMBOL(bt_accept_unlink); + +struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) +{ + struct list_head *p, *n; + struct sock *sk; + + BT_DBG("parent %p", parent); + + list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { + sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); + + lock_sock(sk); + + /* FIXME: Is this check still needed */ + if (sk->sk_state == BT_CLOSED) { + release_sock(sk); + bt_accept_unlink(sk); + continue; + } + + if (sk->sk_state == BT_CONNECTED || !newsock) { + bt_accept_unlink(sk); + if (newsock) + sock_graft(sk, newsock); + release_sock(sk); + return sk; + } + + release_sock(sk); + } + return NULL; +} +EXPORT_SYMBOL(bt_accept_dequeue); + +int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + int noblock = flags & MSG_DONTWAIT; + struct sock *sk = sock->sk; + struct sk_buff *skb; + size_t copied; + int err; + + BT_DBG("sock %p sk %p len %d", sock, sk, len); + + if (flags & (MSG_OOB)) + return -EOPNOTSUPP; + + if (!(skb = skb_recv_datagram(sk, flags, noblock, &err))) { + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 0; + return err; + } + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb->h.raw = skb->data; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + skb_free_datagram(sk, skb); + + return err ? 
: copied; +} +EXPORT_SYMBOL(bt_sock_recvmsg); + +static inline unsigned int bt_accept_poll(struct sock *parent) +{ + struct list_head *p, *n; + struct sock *sk; + + list_for_each_safe(p, n, &bt_sk(parent)->accept_q) { + sk = (struct sock *) list_entry(p, struct bt_sock, accept_q); + if (sk->sk_state == BT_CONNECTED) + return POLLIN | POLLRDNORM; + } + + return 0; +} + +unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + poll_wait(file, sk->sk_sleep, wait); + + if (sk->sk_state == BT_LISTEN) + return bt_accept_poll(sk); + + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + if (sk->sk_state == BT_CLOSED) + mask |= POLLHUP; + + if (sk->sk_state == BT_CONNECT || + sk->sk_state == BT_CONNECT2 || + sk->sk_state == BT_CONFIG) + return mask; + + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} +EXPORT_SYMBOL(bt_sock_poll); + +int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) +{ + DECLARE_WAITQUEUE(wait, current); + int err = 0; + + BT_DBG("sk %p", sk); + + add_wait_queue(sk->sk_sleep, &wait); + while (sk->sk_state != state) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!timeo) { + err = -EAGAIN; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + + if (sk->sk_err) { + err = sock_error(sk); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return err; +} +EXPORT_SYMBOL(bt_sock_wait_state); + +static struct net_proto_family bt_sock_family_ops = { + .owner = THIS_MODULE, + .family = PF_BLUETOOTH, + .create = bt_sock_create, +}; + +extern int hci_sock_init(void); +extern int hci_sock_cleanup(void); + +extern int bt_sysfs_init(void); +extern int bt_sysfs_cleanup(void); + +static int __init bt_init(void) +{ + BT_INFO("Core ver %s", VERSION); + + proc_bt = proc_mkdir("bluetooth", NULL); + if (proc_bt) + proc_bt->owner = THIS_MODULE; + + sock_register(&bt_sock_family_ops); + + BT_INFO("HCI device and connection manager initialized"); + + bt_sysfs_init(); + + hci_sock_init(); + + return 0; +} + +static void __exit bt_exit(void) +{ + hci_sock_cleanup(); + + bt_sysfs_cleanup(); + + sock_unregister(PF_BLUETOOTH); + + remove_proc_entry("bluetooth", NULL); +} + +subsys_initcall(bt_init); +module_exit(bt_exit); + +MODULE_AUTHOR("Maxim Krasnyansky , Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth Core ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_BLUETOOTH); diff --git a/net/bluetooth/bnep/Kconfig b/net/bluetooth/bnep/Kconfig new file mode 100644 index 000000000000..35158b036d54 --- /dev/null +++ b/net/bluetooth/bnep/Kconfig @@ -0,0 +1,24 @@ +config BT_BNEP + tristate "BNEP protocol support" + depends on BT && BT_L2CAP + select CRC32 + help + BNEP (Bluetooth Network Encapsulation Protocol) is Ethernet + emulation layer on top of Bluetooth. BNEP is required for + Bluetooth PAN (Personal Area Network). + + Say Y here to compile BNEP support into the kernel or say M to + compile it as module (bnep). 
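Socket creation in af_bluetooth.c above is a table dispatch: sub-protocols such as L2CAP register a net_proto_family in a fixed eight-slot array, and bt_sock_create() indexes that array (optionally loading the handler module first) and calls its create hook. The stand-alone sketch below, with simplified stand-in types, shows the same register/dispatch pattern:

#include <stdio.h>

/* Simplified stand-in for the kernel's net_proto_family and the fixed
 * bt_proto[] slot table used by bt_sock_register()/bt_sock_create(). */
#define MAX_PROTO 8

struct proto_ops {
    const char *name;
    int (*create)(void);
};

static struct proto_ops *proto_table[MAX_PROTO];

static int proto_register(int proto, struct proto_ops *ops)
{
    if (proto < 0 || proto >= MAX_PROTO)
        return -1;        /* -EINVAL in the kernel */
    if (proto_table[proto])
        return -2;        /* -EEXIST */
    proto_table[proto] = ops;
    return 0;
}

static int sock_create(int proto)
{
    if (proto < 0 || proto >= MAX_PROTO || !proto_table[proto])
        return -3;        /* -EPROTONOSUPPORT */
    return proto_table[proto]->create();
}

static int l2cap_create(void) { puts("l2cap socket created"); return 0; }

static struct proto_ops l2cap_ops = { "L2CAP", l2cap_create };

int main(void)
{
    proto_register(0, &l2cap_ops);
    sock_create(0);            /* dispatches to l2cap_create() */
    printf("slot 5 -> %d (unsupported)\n", sock_create(5));
    return 0;
}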
+ +config BT_BNEP_MC_FILTER + bool "Multicast filter support" + depends on BT_BNEP + help + This option enables the multicast filter support for BNEP. + +config BT_BNEP_PROTO_FILTER + bool "Protocol filter support" + depends on BT_BNEP + help + This option enables the protocol filter support for BNEP. + diff --git a/net/bluetooth/bnep/Makefile b/net/bluetooth/bnep/Makefile new file mode 100644 index 000000000000..c7821e76ca56 --- /dev/null +++ b/net/bluetooth/bnep/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Bluetooth BNEP layer. +# + +obj-$(CONFIG_BT_BNEP) += bnep.o + +bnep-objs := core.o sock.o netdev.o diff --git a/net/bluetooth/bnep/bnep.h b/net/bluetooth/bnep/bnep.h new file mode 100644 index 000000000000..bbb1ed7097a9 --- /dev/null +++ b/net/bluetooth/bnep/bnep.h @@ -0,0 +1,184 @@ +/* + BNEP protocol definition for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, as + published by the Free Software Foundation. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + +/* + * $Id: bnep.h,v 1.5 2002/08/04 21:23:58 maxk Exp $ + */ + +#ifndef _BNEP_H +#define _BNEP_H + +#include +#include +#include + +// Limits +#define BNEP_MAX_PROTO_FILTERS 5 +#define BNEP_MAX_MULTICAST_FILTERS 20 + +// UUIDs +#define BNEP_BASE_UUID 0x0000000000001000800000805F9B34FB +#define BNEP_UUID16 0x02 +#define BNEP_UUID32 0x04 +#define BNEP_UUID128 0x16 + +#define BNEP_SVC_PANU 0x1115 +#define BNEP_SVC_NAP 0x1116 +#define BNEP_SVC_GN 0x1117 + +// Packet types +#define BNEP_GENERAL 0x00 +#define BNEP_CONTROL 0x01 +#define BNEP_COMPRESSED 0x02 +#define BNEP_COMPRESSED_SRC_ONLY 0x03 +#define BNEP_COMPRESSED_DST_ONLY 0x04 + +// Control types +#define BNEP_CMD_NOT_UNDERSTOOD 0x00 +#define BNEP_SETUP_CONN_REQ 0x01 +#define BNEP_SETUP_CONN_RSP 0x02 +#define BNEP_FILTER_NET_TYPE_SET 0x03 +#define BNEP_FILTER_NET_TYPE_RSP 0x04 +#define BNEP_FILTER_MULTI_ADDR_SET 0x05 +#define BNEP_FILTER_MULTI_ADDR_RSP 0x06 + +// Extension types +#define BNEP_EXT_CONTROL 0x00 + +// Response messages +#define BNEP_SUCCESS 0x00 + +#define BNEP_CONN_INVALID_DST 0x01 +#define BNEP_CONN_INVALID_SRC 0x02 +#define BNEP_CONN_INVALID_SVC 0x03 +#define BNEP_CONN_NOT_ALLOWED 0x04 + +#define BNEP_FILTER_UNSUPPORTED_REQ 0x01 +#define BNEP_FILTER_INVALID_RANGE 0x02 +#define BNEP_FILTER_INVALID_MCADDR 0x02 +#define BNEP_FILTER_LIMIT_REACHED 0x03 +#define BNEP_FILTER_DENIED_SECURITY 0x04 + +// L2CAP settings +#define BNEP_MTU 1691 +#define BNEP_PSM 0x0f +#define BNEP_FLUSH_TO 0xffff +#define BNEP_CONNECT_TO 15 +#define BNEP_FILTER_TO 15 + +// Headers +#define BNEP_TYPE_MASK 0x7f +#define BNEP_EXT_HEADER 0x80 + +struct bnep_setup_conn_req { + __u8 type; + __u8 ctrl; + __u8 uuid_size; + __u8 service[0]; +} __attribute__((packed)); + +struct bnep_set_filter_req { + __u8 type; + __u8 ctrl; + __u16 len; + __u8 list[0]; +} __attribute__((packed)); + +struct bnep_control_rsp { + __u8 type; + __u8 ctrl; + __u16 resp; +} __attribute__((packed)); + +struct bnep_ext_hdr { + __u8 type; + __u8 len; + __u8 
data[0]; +} __attribute__((packed)); + +/* BNEP ioctl defines */ +#define BNEPCONNADD _IOW('B', 200, int) +#define BNEPCONNDEL _IOW('B', 201, int) +#define BNEPGETCONNLIST _IOR('B', 210, int) +#define BNEPGETCONNINFO _IOR('B', 211, int) + +struct bnep_connadd_req { + int sock; // Connected socket + __u32 flags; + __u16 role; + char device[16]; // Name of the Ethernet device +}; + +struct bnep_conndel_req { + __u32 flags; + __u8 dst[ETH_ALEN]; +}; + +struct bnep_conninfo { + __u32 flags; + __u16 role; + __u16 state; + __u8 dst[ETH_ALEN]; + char device[16]; +}; + +struct bnep_connlist_req { + __u32 cnum; + struct bnep_conninfo __user *ci; +}; + +struct bnep_proto_filter { + __u16 start; + __u16 end; +}; + +int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock); +int bnep_del_connection(struct bnep_conndel_req *req); +int bnep_get_connlist(struct bnep_connlist_req *req); +int bnep_get_conninfo(struct bnep_conninfo *ci); + +// BNEP sessions +struct bnep_session { + struct list_head list; + + unsigned int role; + unsigned long state; + unsigned long flags; + atomic_t killed; + + struct ethhdr eh; + struct msghdr msg; + + struct bnep_proto_filter proto_filter[BNEP_MAX_PROTO_FILTERS]; + u64 mc_filter; + + struct socket *sock; + struct net_device *dev; + struct net_device_stats stats; +}; + +void bnep_net_setup(struct net_device *dev); +int bnep_sock_init(void); +int bnep_sock_cleanup(void); + +static inline int bnep_mc_hash(__u8 *addr) +{ + return (crc32_be(~0, addr, ETH_ALEN) >> 26); +} + +#endif diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c new file mode 100644 index 000000000000..682bf20af52d --- /dev/null +++ b/net/bluetooth/bnep/core.c @@ -0,0 +1,713 @@ +/* + BNEP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2001-2002 Inventel Systemes + Written 2001-2002 by + Clément Moreau + David Libault + + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + * $Id: core.c,v 1.20 2002/08/04 21:23:58 maxk Exp $ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +#include "bnep.h" + +#ifndef CONFIG_BT_BNEP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +#define VERSION "1.2" + +static LIST_HEAD(bnep_session_list); +static DECLARE_RWSEM(bnep_session_sem); + +static struct bnep_session *__bnep_get_session(u8 *dst) +{ + struct bnep_session *s; + struct list_head *p; + + BT_DBG(""); + + list_for_each(p, &bnep_session_list) { + s = list_entry(p, struct bnep_session, list); + if (!memcmp(dst, s->eh.h_source, ETH_ALEN)) + return s; + } + return NULL; +} + +static void __bnep_link_session(struct bnep_session *s) +{ + /* It's safe to call __module_get() here because sessions are added + by the socket layer which has to hold the refference to this module. + */ + __module_get(THIS_MODULE); + list_add(&s->list, &bnep_session_list); +} + +static void __bnep_unlink_session(struct bnep_session *s) +{ + list_del(&s->list); + module_put(THIS_MODULE); +} + +static int bnep_send(struct bnep_session *s, void *data, size_t len) +{ + struct socket *sock = s->sock; + struct kvec iv = { data, len }; + + return kernel_sendmsg(sock, &s->msg, &iv, 1, len); +} + +static int bnep_send_rsp(struct bnep_session *s, u8 ctrl, u16 resp) +{ + struct bnep_control_rsp rsp; + rsp.type = BNEP_CONTROL; + rsp.ctrl = ctrl; + rsp.resp = htons(resp); + return bnep_send(s, &rsp, sizeof(rsp)); +} + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER +static inline void bnep_set_default_proto_filter(struct bnep_session *s) +{ + /* (IPv4, ARP) */ + s->proto_filter[0].start = htons(0x0800); + s->proto_filter[0].end = htons(0x0806); + /* (RARP, AppleTalk) */ + s->proto_filter[1].start = htons(0x8035); + s->proto_filter[1].end = htons(0x80F3); + /* (IPX, IPv6) */ + s->proto_filter[2].start = htons(0x8137); + s->proto_filter[2].end = htons(0x86DD); +} +#endif + +static int bnep_ctrl_set_netfilter(struct bnep_session *s, u16 *data, int len) +{ + int n; + + if (len < 2) + return -EILSEQ; + + n = ntohs(get_unaligned(data)); + data++; len -= 2; + + if (len < n) + return -EILSEQ; + + BT_DBG("filter len %d", n); + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + n /= 4; + if (n <= BNEP_MAX_PROTO_FILTERS) { + struct bnep_proto_filter *f = s->proto_filter; + int i; + + for (i = 0; i < n; i++) { + f[i].start = get_unaligned(data++); + f[i].end = get_unaligned(data++); + + BT_DBG("proto filter start %d end %d", + f[i].start, f[i].end); + } + + if (i < BNEP_MAX_PROTO_FILTERS) + memset(f + i, 0, sizeof(*f)); + + if (n == 0) + bnep_set_default_proto_filter(s); + + bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_SUCCESS); + } else { + bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_LIMIT_REACHED); + } +#else + bnep_send_rsp(s, BNEP_FILTER_NET_TYPE_RSP, BNEP_FILTER_UNSUPPORTED_REQ); +#endif + return 0; +} + +static int bnep_ctrl_set_mcfilter(struct bnep_session *s, u8 *data, int len) +{ + int n; + + if (len < 2) + return -EILSEQ; + + n = ntohs(get_unaligned((u16 *) data)); + data += 2; len -= 2; + + if (len < n) + return -EILSEQ; + + BT_DBG("filter len %d", n); + +#ifdef CONFIG_BT_BNEP_MC_FILTER + n /= (ETH_ALEN * 2); + + if (n > 0) { + s->mc_filter = 0; + + /* Always send broadcast */ + set_bit(bnep_mc_hash(s->dev->broadcast), (ulong *) &s->mc_filter); + + /* Add address ranges to the multicast hash */ + for (; n > 0; n--) { + u8 a1[6], *a2; + + memcpy(a1, data, ETH_ALEN); data += ETH_ALEN; + a2 = data; data += ETH_ALEN; + + BT_DBG("mc filter %s -> %s", + batostr((void *) a1), batostr((void *) a2)); + + #define INCA(a) { int i = 5; while (i >=0 && ++a[i--] == 0); } + + /* Iterate from a1 to a2 */ + set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter); + while (memcmp(a1, a2, 6) < 0 && s->mc_filter != ~0LL) { + 
INCA(a1); + set_bit(bnep_mc_hash(a1), (ulong *) &s->mc_filter); + } + } + } + + BT_DBG("mc filter hash 0x%llx", s->mc_filter); + + bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_SUCCESS); +#else + bnep_send_rsp(s, BNEP_FILTER_MULTI_ADDR_RSP, BNEP_FILTER_UNSUPPORTED_REQ); +#endif + return 0; +} + +static int bnep_rx_control(struct bnep_session *s, void *data, int len) +{ + u8 cmd = *(u8 *)data; + int err = 0; + + data++; len--; + + switch (cmd) { + case BNEP_CMD_NOT_UNDERSTOOD: + case BNEP_SETUP_CONN_REQ: + case BNEP_SETUP_CONN_RSP: + case BNEP_FILTER_NET_TYPE_RSP: + case BNEP_FILTER_MULTI_ADDR_RSP: + /* Ignore these for now */ + break; + + case BNEP_FILTER_NET_TYPE_SET: + err = bnep_ctrl_set_netfilter(s, data, len); + break; + + case BNEP_FILTER_MULTI_ADDR_SET: + err = bnep_ctrl_set_mcfilter(s, data, len); + break; + + default: { + u8 pkt[3]; + pkt[0] = BNEP_CONTROL; + pkt[1] = BNEP_CMD_NOT_UNDERSTOOD; + pkt[2] = cmd; + bnep_send(s, pkt, sizeof(pkt)); + } + break; + } + + return err; +} + +static int bnep_rx_extension(struct bnep_session *s, struct sk_buff *skb) +{ + struct bnep_ext_hdr *h; + int err = 0; + + do { + h = (void *) skb->data; + if (!skb_pull(skb, sizeof(*h))) { + err = -EILSEQ; + break; + } + + BT_DBG("type 0x%x len %d", h->type, h->len); + + switch (h->type & BNEP_TYPE_MASK) { + case BNEP_EXT_CONTROL: + bnep_rx_control(s, skb->data, skb->len); + break; + + default: + /* Unknown extension, skip it. */ + break; + } + + if (!skb_pull(skb, h->len)) { + err = -EILSEQ; + break; + } + } while (!err && (h->type & BNEP_EXT_HEADER)); + + return err; +} + +static u8 __bnep_rx_hlen[] = { + ETH_HLEN, /* BNEP_GENERAL */ + 0, /* BNEP_CONTROL */ + 2, /* BNEP_COMPRESSED */ + ETH_ALEN + 2, /* BNEP_COMPRESSED_SRC_ONLY */ + ETH_ALEN + 2 /* BNEP_COMPRESSED_DST_ONLY */ +}; +#define BNEP_RX_TYPES (sizeof(__bnep_rx_hlen) - 1) + +static inline int bnep_rx_frame(struct bnep_session *s, struct sk_buff *skb) +{ + struct net_device *dev = s->dev; + struct sk_buff *nskb; + u8 type; + + dev->last_rx = jiffies; + s->stats.rx_bytes += skb->len; + + type = *(u8 *) skb->data; skb_pull(skb, 1); + + if ((type & BNEP_TYPE_MASK) > BNEP_RX_TYPES) + goto badframe; + + if ((type & BNEP_TYPE_MASK) == BNEP_CONTROL) { + bnep_rx_control(s, skb->data, skb->len); + kfree_skb(skb); + return 0; + } + + skb->mac.raw = skb->data; + + /* Verify and pull out header */ + if (!skb_pull(skb, __bnep_rx_hlen[type & BNEP_TYPE_MASK])) + goto badframe; + + s->eh.h_proto = get_unaligned((u16 *) (skb->data - 2)); + + if (type & BNEP_EXT_HEADER) { + if (bnep_rx_extension(s, skb) < 0) + goto badframe; + } + + /* Strip 802.1p header */ + if (ntohs(s->eh.h_proto) == 0x8100) { + if (!skb_pull(skb, 4)) + goto badframe; + s->eh.h_proto = get_unaligned((u16 *) (skb->data - 2)); + } + + /* We have to alloc new skb and copy data here :(. Because original skb + * may not be modified and because of the alignment requirements. 
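The multicast filter manipulated above is a single 64-bit word: bnep_mc_hash() folds a big-endian CRC-32 of the Ethernet address down to 6 bits, and setting/testing that bit decides whether a destination passes. The sketch below uses a plain MSB-first CRC-32 with the 0x04C11DB7 polynomial, which is assumed (not verified against the kernel) to match crc32_be() for this purpose:

#include <stdio.h>
#include <stdint.h>

/* MSB-first CRC-32, polynomial 0x04C11DB7, no reflection and no final
 * inversion -- assumed equivalent to the kernel's crc32_be() here. */
static uint32_t crc32_be(uint32_t crc, const uint8_t *p, int len)
{
    int i;

    while (len--) {
        crc ^= (uint32_t)*p++ << 24;
        for (i = 0; i < 8; i++)
            crc = (crc & 0x80000000u) ? (crc << 1) ^ 0x04C11DB7u : crc << 1;
    }
    return crc;
}

/* Same idea as bnep_mc_hash(): fold the CRC down to 6 bits, giving an
 * index into a 64-bit multicast filter word. */
static int mc_hash(const uint8_t addr[6])
{
    return crc32_be(0xffffffffu, addr, 6) >> 26;
}

int main(void)
{
    const uint8_t bcast[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
    const uint8_t mcast[6] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };
    uint64_t filter = 0;

    /* "Always send broadcast", as the code above does. */
    filter |= 1ULL << mc_hash(bcast);
    filter |= 1ULL << mc_hash(mcast);

    printf("filter word: 0x%016llx\n", (unsigned long long)filter);
    printf("broadcast passes: %s\n",
           (filter >> mc_hash(bcast)) & 1 ? "yes" : "no");
    return 0;
}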
*/ + nskb = alloc_skb(2 + ETH_HLEN + skb->len, GFP_KERNEL); + if (!nskb) { + s->stats.rx_dropped++; + kfree_skb(skb); + return -ENOMEM; + } + skb_reserve(nskb, 2); + + /* Decompress header and construct ether frame */ + switch (type & BNEP_TYPE_MASK) { + case BNEP_COMPRESSED: + memcpy(__skb_put(nskb, ETH_HLEN), &s->eh, ETH_HLEN); + break; + + case BNEP_COMPRESSED_SRC_ONLY: + memcpy(__skb_put(nskb, ETH_ALEN), s->eh.h_dest, ETH_ALEN); + memcpy(__skb_put(nskb, ETH_ALEN), skb->mac.raw, ETH_ALEN); + put_unaligned(s->eh.h_proto, (u16 *) __skb_put(nskb, 2)); + break; + + case BNEP_COMPRESSED_DST_ONLY: + memcpy(__skb_put(nskb, ETH_ALEN), skb->mac.raw, ETH_ALEN); + memcpy(__skb_put(nskb, ETH_ALEN + 2), s->eh.h_source, ETH_ALEN + 2); + break; + + case BNEP_GENERAL: + memcpy(__skb_put(nskb, ETH_ALEN * 2), skb->mac.raw, ETH_ALEN * 2); + put_unaligned(s->eh.h_proto, (u16 *) __skb_put(nskb, 2)); + break; + } + + memcpy(__skb_put(nskb, skb->len), skb->data, skb->len); + kfree_skb(skb); + + s->stats.rx_packets++; + nskb->dev = dev; + nskb->ip_summed = CHECKSUM_NONE; + nskb->protocol = eth_type_trans(nskb, dev); + netif_rx_ni(nskb); + return 0; + +badframe: + s->stats.rx_errors++; + kfree_skb(skb); + return 0; +} + +static u8 __bnep_tx_types[] = { + BNEP_GENERAL, + BNEP_COMPRESSED_SRC_ONLY, + BNEP_COMPRESSED_DST_ONLY, + BNEP_COMPRESSED +}; + +static inline int bnep_tx_frame(struct bnep_session *s, struct sk_buff *skb) +{ + struct ethhdr *eh = (void *) skb->data; + struct socket *sock = s->sock; + struct kvec iv[3]; + int len = 0, il = 0; + u8 type = 0; + + BT_DBG("skb %p dev %p type %d", skb, skb->dev, skb->pkt_type); + + if (!skb->dev) { + /* Control frame sent by us */ + goto send; + } + + iv[il++] = (struct kvec) { &type, 1 }; + len++; + + if (!memcmp(eh->h_dest, s->eh.h_source, ETH_ALEN)) + type |= 0x01; + + if (!memcmp(eh->h_source, s->eh.h_dest, ETH_ALEN)) + type |= 0x02; + + if (type) + skb_pull(skb, ETH_ALEN * 2); + + type = __bnep_tx_types[type]; + switch (type) { + case BNEP_COMPRESSED_SRC_ONLY: + iv[il++] = (struct kvec) { eh->h_source, ETH_ALEN }; + len += ETH_ALEN; + break; + + case BNEP_COMPRESSED_DST_ONLY: + iv[il++] = (struct kvec) { eh->h_dest, ETH_ALEN }; + len += ETH_ALEN; + break; + } + +send: + iv[il++] = (struct kvec) { skb->data, skb->len }; + len += skb->len; + + /* FIXME: linearize skb */ + { + len = kernel_sendmsg(sock, &s->msg, iv, il, len); + } + kfree_skb(skb); + + if (len > 0) { + s->stats.tx_bytes += len; + s->stats.tx_packets++; + return 0; + } + + return len; +} + +static int bnep_session(void *arg) +{ + struct bnep_session *s = arg; + struct net_device *dev = s->dev; + struct sock *sk = s->sock->sk; + struct sk_buff *skb; + wait_queue_t wait; + + BT_DBG(""); + + daemonize("kbnepd %s", dev->name); + set_user_nice(current, -15); + current->flags |= PF_NOFREEZE; + + init_waitqueue_entry(&wait, current); + add_wait_queue(sk->sk_sleep, &wait); + while (!atomic_read(&s->killed)) { + set_current_state(TASK_INTERRUPTIBLE); + + // RX + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + bnep_rx_frame(s, skb); + } + + if (sk->sk_state != BT_CONNECTED) + break; + + // TX + while ((skb = skb_dequeue(&sk->sk_write_queue))) + if (bnep_tx_frame(s, skb)) + break; + netif_wake_queue(dev); + + schedule(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + + /* Cleanup session */ + down_write(&bnep_session_sem); + + /* Delete network device */ + unregister_netdev(dev); + + /* Release the socket */ + fput(s->sock->file); + + 
__bnep_unlink_session(s); + + up_write(&bnep_session_sem); + free_netdev(dev); + return 0; +} + +int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock) +{ + struct net_device *dev; + struct bnep_session *s, *ss; + u8 dst[ETH_ALEN], src[ETH_ALEN]; + int err; + + BT_DBG(""); + + baswap((void *) dst, &bt_sk(sock->sk)->dst); + baswap((void *) src, &bt_sk(sock->sk)->src); + + /* session struct allocated as private part of net_device */ + dev = alloc_netdev(sizeof(struct bnep_session), + (*req->device) ? req->device : "bnep%d", + bnep_net_setup); + if (!dev) + return ENOMEM; + + + down_write(&bnep_session_sem); + + ss = __bnep_get_session(dst); + if (ss && ss->state == BT_CONNECTED) { + err = -EEXIST; + goto failed; + } + + s = dev->priv; + + /* This is rx header therefore addresses are swapped. + * ie eh.h_dest is our local address. */ + memcpy(s->eh.h_dest, &src, ETH_ALEN); + memcpy(s->eh.h_source, &dst, ETH_ALEN); + memcpy(dev->dev_addr, s->eh.h_dest, ETH_ALEN); + + s->dev = dev; + s->sock = sock; + s->role = req->role; + s->state = BT_CONNECTED; + + s->msg.msg_flags = MSG_NOSIGNAL; + +#ifdef CONFIG_BT_BNEP_MC_FILTER + /* Set default mc filter */ + set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter); +#endif + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + /* Set default protocol filter */ + bnep_set_default_proto_filter(s); +#endif + + err = register_netdev(dev); + if (err) { + goto failed; + } + + __bnep_link_session(s); + + err = kernel_thread(bnep_session, s, CLONE_KERNEL); + if (err < 0) { + /* Session thread start failed, gotta cleanup. */ + unregister_netdev(dev); + __bnep_unlink_session(s); + goto failed; + } + + up_write(&bnep_session_sem); + strcpy(req->device, dev->name); + return 0; + +failed: + up_write(&bnep_session_sem); + free_netdev(dev); + return err; +} + +int bnep_del_connection(struct bnep_conndel_req *req) +{ + struct bnep_session *s; + int err = 0; + + BT_DBG(""); + + down_read(&bnep_session_sem); + + s = __bnep_get_session(req->dst); + if (s) { + /* Wakeup user-space which is polling for socket errors. 
+ * This is temporary hack untill we have shutdown in L2CAP */ + s->sock->sk->sk_err = EUNATCH; + + /* Kill session thread */ + atomic_inc(&s->killed); + wake_up_interruptible(s->sock->sk->sk_sleep); + } else + err = -ENOENT; + + up_read(&bnep_session_sem); + return err; +} + +static void __bnep_copy_ci(struct bnep_conninfo *ci, struct bnep_session *s) +{ + memcpy(ci->dst, s->eh.h_source, ETH_ALEN); + strcpy(ci->device, s->dev->name); + ci->flags = s->flags; + ci->state = s->state; + ci->role = s->role; +} + +int bnep_get_connlist(struct bnep_connlist_req *req) +{ + struct list_head *p; + int err = 0, n = 0; + + down_read(&bnep_session_sem); + + list_for_each(p, &bnep_session_list) { + struct bnep_session *s; + struct bnep_conninfo ci; + + s = list_entry(p, struct bnep_session, list); + + __bnep_copy_ci(&ci, s); + + if (copy_to_user(req->ci, &ci, sizeof(ci))) { + err = -EFAULT; + break; + } + + if (++n >= req->cnum) + break; + + req->ci++; + } + req->cnum = n; + + up_read(&bnep_session_sem); + return err; +} + +int bnep_get_conninfo(struct bnep_conninfo *ci) +{ + struct bnep_session *s; + int err = 0; + + down_read(&bnep_session_sem); + + s = __bnep_get_session(ci->dst); + if (s) + __bnep_copy_ci(ci, s); + else + err = -ENOENT; + + up_read(&bnep_session_sem); + return err; +} + +static int __init bnep_init(void) +{ + char flt[50] = ""; + + l2cap_load(); + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + strcat(flt, "protocol "); +#endif + +#ifdef CONFIG_BT_BNEP_MC_FILTER + strcat(flt, "multicast"); +#endif + + BT_INFO("BNEP (Ethernet Emulation) ver %s", VERSION); + if (flt[0]) + BT_INFO("BNEP filters: %s", flt); + + bnep_sock_init(); + return 0; +} + +static void __exit bnep_exit(void) +{ + bnep_sock_cleanup(); +} + +module_init(bnep_init); +module_exit(bnep_exit); + +MODULE_AUTHOR("David Libault , Maxim Krasnyansky "); +MODULE_DESCRIPTION("Bluetooth BNEP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-4"); diff --git a/net/bluetooth/bnep/netdev.c b/net/bluetooth/bnep/netdev.c new file mode 100644 index 000000000000..921204f95f4a --- /dev/null +++ b/net/bluetooth/bnep/netdev.c @@ -0,0 +1,247 @@ +/* + BNEP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2001-2002 Inventel Systemes + Written 2001-2002 by + Clément Moreau + David Libault + + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +/* + * $Id: netdev.c,v 1.8 2002/08/04 21:23:58 maxk Exp $ + */ + +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "bnep.h" + +#ifndef CONFIG_BT_BNEP_DEBUG +#undef BT_DBG +#define BT_DBG( A... ) +#endif + +#define BNEP_TX_QUEUE_LEN 20 + +static int bnep_net_open(struct net_device *dev) +{ + netif_start_queue(dev); + return 0; +} + +static int bnep_net_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static struct net_device_stats *bnep_net_get_stats(struct net_device *dev) +{ + struct bnep_session *s = dev->priv; + return &s->stats; +} + +static void bnep_net_set_mc_list(struct net_device *dev) +{ +#ifdef CONFIG_BT_BNEP_MC_FILTER + struct bnep_session *s = dev->priv; + struct sock *sk = s->sock->sk; + struct bnep_set_filter_req *r; + struct sk_buff *skb; + int size; + + BT_DBG("%s mc_count %d", dev->name, dev->mc_count); + + size = sizeof(*r) + (BNEP_MAX_MULTICAST_FILTERS + 1) * ETH_ALEN * 2; + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + BT_ERR("%s Multicast list allocation failed", dev->name); + return; + } + + r = (void *) skb->data; + __skb_put(skb, sizeof(*r)); + + r->type = BNEP_CONTROL; + r->ctrl = BNEP_FILTER_MULTI_ADDR_SET; + + if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { + u8 start[ETH_ALEN] = { 0x01 }; + + /* Request all addresses */ + memcpy(__skb_put(skb, ETH_ALEN), start, ETH_ALEN); + memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN); + r->len = htons(ETH_ALEN * 2); + } else { + struct dev_mc_list *dmi = dev->mc_list; + int i, len = skb->len; + + if (dev->flags & IFF_BROADCAST) { + memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN); + memcpy(__skb_put(skb, ETH_ALEN), dev->broadcast, ETH_ALEN); + } + + /* FIXME: We should group addresses here. */ + + for (i = 0; i < dev->mc_count && i < BNEP_MAX_MULTICAST_FILTERS; i++) { + memcpy(__skb_put(skb, ETH_ALEN), dmi->dmi_addr, ETH_ALEN); + memcpy(__skb_put(skb, ETH_ALEN), dmi->dmi_addr, ETH_ALEN); + dmi = dmi->next; + } + r->len = htons(skb->len - len); + } + + skb_queue_tail(&sk->sk_write_queue, skb); + wake_up_interruptible(sk->sk_sleep); +#endif +} + +static int bnep_net_set_mac_addr(struct net_device *dev, void *arg) +{ + BT_DBG("%s", dev->name); + return 0; +} + +static void bnep_net_timeout(struct net_device *dev) +{ + BT_DBG("net_timeout"); + netif_wake_queue(dev); +} + +static int bnep_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + return -EINVAL; +} + +#ifdef CONFIG_BT_BNEP_MC_FILTER +static inline int bnep_net_mc_filter(struct sk_buff *skb, struct bnep_session *s) +{ + struct ethhdr *eh = (void *) skb->data; + + if ((eh->h_dest[0] & 1) && !test_bit(bnep_mc_hash(eh->h_dest), (ulong *) &s->mc_filter)) + return 1; + return 0; +} +#endif + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER +/* Determine ether protocol. Based on eth_type_trans. 
*/ +static inline u16 bnep_net_eth_proto(struct sk_buff *skb) +{ + struct ethhdr *eh = (void *) skb->data; + + if (ntohs(eh->h_proto) >= 1536) + return eh->h_proto; + + if (get_unaligned((u16 *) skb->data) == 0xFFFF) + return htons(ETH_P_802_3); + + return htons(ETH_P_802_2); +} + +static inline int bnep_net_proto_filter(struct sk_buff *skb, struct bnep_session *s) +{ + u16 proto = bnep_net_eth_proto(skb); + struct bnep_proto_filter *f = s->proto_filter; + int i; + + for (i = 0; i < BNEP_MAX_PROTO_FILTERS && f[i].end; i++) { + if (proto >= f[i].start && proto <= f[i].end) + return 0; + } + + BT_DBG("BNEP: filtered skb %p, proto 0x%.4x", skb, proto); + return 1; +} +#endif + +static int bnep_net_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct bnep_session *s = dev->priv; + struct sock *sk = s->sock->sk; + + BT_DBG("skb %p, dev %p", skb, dev); + +#ifdef CONFIG_BT_BNEP_MC_FILTER + if (bnep_net_mc_filter(skb, s)) { + kfree_skb(skb); + return 0; + } +#endif + +#ifdef CONFIG_BT_BNEP_PROTO_FILTER + if (bnep_net_proto_filter(skb, s)) { + kfree_skb(skb); + return 0; + } +#endif + + /* + * We cannot send L2CAP packets from here as we are potentially in a bh. + * So we have to queue them and wake up session thread which is sleeping + * on the sk->sk_sleep. + */ + dev->trans_start = jiffies; + skb_queue_tail(&sk->sk_write_queue, skb); + wake_up_interruptible(sk->sk_sleep); + + if (skb_queue_len(&sk->sk_write_queue) >= BNEP_TX_QUEUE_LEN) { + BT_DBG("tx queue is full"); + + /* Stop queuing. + * Session thread will do netif_wake_queue() */ + netif_stop_queue(dev); + } + + return 0; +} + +void bnep_net_setup(struct net_device *dev) +{ + + memset(dev->broadcast, 0xff, ETH_ALEN); + dev->addr_len = ETH_ALEN; + + ether_setup(dev); + + dev->open = bnep_net_open; + dev->stop = bnep_net_close; + dev->hard_start_xmit = bnep_net_xmit; + dev->get_stats = bnep_net_get_stats; + dev->do_ioctl = bnep_net_ioctl; + dev->set_mac_address = bnep_net_set_mac_addr; + dev->set_multicast_list = bnep_net_set_mc_list; + + dev->watchdog_timeo = HZ * 2; + dev->tx_timeout = bnep_net_timeout; +} diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c new file mode 100644 index 000000000000..9a8d99a39b6d --- /dev/null +++ b/net/bluetooth/bnep/sock.c @@ -0,0 +1,237 @@ +/* + BNEP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2001-2002 Inventel Systemes + Written 2001-2002 by + David Libault + + Copyright (C) 2002 Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
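bnep_net_eth_proto() above classifies an outgoing frame the same way eth_type_trans() does: a type/length field of 1536 or more is a genuine EtherType, a payload starting with 0xFFFF marks a raw 802.3 frame, and anything else is treated as 802.2 LLC. A small user-space version of that decision, using the ETH_P_802_3/ETH_P_802_2 values from Linux's if_ether.h, is given below:

#include <stdio.h>

/* Constants as defined in Linux's if_ether.h; kept in host byte order
 * here for simplicity, unlike the htons() results in the code above. */
#define ETH_P_802_3 0x0001
#define ETH_P_802_2 0x0004

static unsigned short classify(unsigned short type_or_len,
                               unsigned short first_payload_word)
{
    if (type_or_len >= 1536)
        return type_or_len;        /* Ethernet II: real EtherType */
    if (first_payload_word == 0xFFFF)
        return ETH_P_802_3;        /* raw 802.3 frame */
    return ETH_P_802_2;            /* 802.2 LLC */
}

int main(void)
{
    printf("0x0800 -> 0x%04x (IPv4, Ethernet II)\n", classify(0x0800, 0));
    printf("0x0040 -> 0x%04x (802.2)\n", classify(0x0040, 0xAAAA));
    printf("0x0040/0xFFFF -> 0x%04x (802.3)\n", classify(0x0040, 0xFFFF));
    return 0;
}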
+*/ + +/* + * $Id: sock.c,v 1.4 2002/08/04 21:23:58 maxk Exp $ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bnep.h" + +#ifndef CONFIG_BT_BNEP_DEBUG +#undef BT_DBG +#define BT_DBG( A... ) +#endif + +static int bnep_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + sock_orphan(sk); + sock_put(sk); + return 0; +} + +static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct bnep_connlist_req cl; + struct bnep_connadd_req ca; + struct bnep_conndel_req cd; + struct bnep_conninfo ci; + struct socket *nsock; + void __user *argp = (void __user *)arg; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case BNEPCONNADD: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ca, argp, sizeof(ca))) + return -EFAULT; + + nsock = sockfd_lookup(ca.sock, &err); + if (!nsock) + return err; + + if (nsock->sk->sk_state != BT_CONNECTED) { + fput(nsock->file); + return -EBADFD; + } + + err = bnep_add_connection(&ca, nsock); + if (!err) { + if (copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; + } else + fput(nsock->file); + + return err; + + case BNEPCONNDEL: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&cd, argp, sizeof(cd))) + return -EFAULT; + + return bnep_del_connection(&cd); + + case BNEPGETCONNLIST: + if (copy_from_user(&cl, argp, sizeof(cl))) + return -EFAULT; + + if (cl.cnum <= 0) + return -EINVAL; + + err = bnep_get_connlist(&cl); + if (!err && copy_to_user(argp, &cl, sizeof(cl))) + return -EFAULT; + + return err; + + case BNEPGETCONNINFO: + if (copy_from_user(&ci, argp, sizeof(ci))) + return -EFAULT; + + err = bnep_get_conninfo(&ci); + if (!err && copy_to_user(argp, &ci, sizeof(ci))) + return -EFAULT; + + return err; + + default: + return -EINVAL; + } + + return 0; +} + +static struct proto_ops bnep_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = bnep_sock_release, + .ioctl = bnep_sock_ioctl, + .bind = sock_no_bind, + .getname = sock_no_getname, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto bnep_proto = { + .name = "BNEP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bt_sock) +}; + +static int bnep_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(PF_BLUETOOTH, GFP_KERNEL, &bnep_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock->ops = &bnep_sock_ops; + + sock->state = SS_UNCONNECTED; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + sk->sk_state = BT_OPEN; + + return 0; +} + +static struct net_proto_family bnep_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = bnep_sock_create +}; + +int __init bnep_sock_init(void) +{ + int err; + + err = proto_register(&bnep_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_BNEP, &bnep_sock_family_ops); + if (err < 0) + goto error; + + return 0; + 
+error: + BT_ERR("Can't register BNEP socket"); + proto_unregister(&bnep_proto); + return err; +} + +int __exit bnep_sock_cleanup(void) +{ + if (bt_sock_unregister(BTPROTO_BNEP) < 0) + BT_ERR("Can't unregister BNEP socket"); + + proto_unregister(&bnep_proto); + + return 0; +} diff --git a/net/bluetooth/cmtp/Kconfig b/net/bluetooth/cmtp/Kconfig new file mode 100644 index 000000000000..d6b0382f6f3a --- /dev/null +++ b/net/bluetooth/cmtp/Kconfig @@ -0,0 +1,11 @@ +config BT_CMTP + tristate "CMTP protocol support" + depends on BT && BT_L2CAP && ISDN_CAPI + help + CMTP (CAPI Message Transport Protocol) is a transport layer + for CAPI messages. CMTP is required for the Bluetooth Common + ISDN Access Profile. + + Say Y here to compile CMTP support into the kernel or say M to + compile it as module (cmtp). + diff --git a/net/bluetooth/cmtp/Makefile b/net/bluetooth/cmtp/Makefile new file mode 100644 index 000000000000..890a9a5a6861 --- /dev/null +++ b/net/bluetooth/cmtp/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Bluetooth CMTP layer +# + +obj-$(CONFIG_BT_CMTP) += cmtp.o + +cmtp-objs := core.o sock.o capi.o diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c new file mode 100644 index 000000000000..1e5c030b72ad --- /dev/null +++ b/net/bluetooth/cmtp/capi.c @@ -0,0 +1,600 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "cmtp.h" + +#ifndef CONFIG_BT_CMTP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +#define CAPI_INTEROPERABILITY 0x20 + +#define CAPI_INTEROPERABILITY_REQ CAPICMD(CAPI_INTEROPERABILITY, CAPI_REQ) +#define CAPI_INTEROPERABILITY_CONF CAPICMD(CAPI_INTEROPERABILITY, CAPI_CONF) +#define CAPI_INTEROPERABILITY_IND CAPICMD(CAPI_INTEROPERABILITY, CAPI_IND) +#define CAPI_INTEROPERABILITY_RESP CAPICMD(CAPI_INTEROPERABILITY, CAPI_RESP) + +#define CAPI_INTEROPERABILITY_REQ_LEN (CAPI_MSG_BASELEN + 2) +#define CAPI_INTEROPERABILITY_CONF_LEN (CAPI_MSG_BASELEN + 4) +#define CAPI_INTEROPERABILITY_IND_LEN (CAPI_MSG_BASELEN + 2) +#define CAPI_INTEROPERABILITY_RESP_LEN (CAPI_MSG_BASELEN + 2) + +#define CAPI_FUNCTION_REGISTER 0 +#define CAPI_FUNCTION_RELEASE 1 +#define CAPI_FUNCTION_GET_PROFILE 2 +#define CAPI_FUNCTION_GET_MANUFACTURER 3 +#define CAPI_FUNCTION_GET_VERSION 4 +#define CAPI_FUNCTION_GET_SERIAL_NUMBER 5 +#define CAPI_FUNCTION_MANUFACTURER 6 +#define CAPI_FUNCTION_LOOPBACK 7 + + +#define CMTP_MSGNUM 1 +#define CMTP_APPLID 2 +#define CMTP_MAPPING 3 + +static struct cmtp_application *cmtp_application_add(struct cmtp_session *session, __u16 appl) +{ + struct cmtp_application *app = kmalloc(sizeof(*app), GFP_KERNEL); + + BT_DBG("session %p application %p appl %d", session, app, appl); + + if (!app) + return NULL; + + memset(app, 0, sizeof(*app)); + + app->state = BT_OPEN; + app->appl = appl; + + list_add_tail(&app->list, &session->applications); + + return app; +} + +static void cmtp_application_del(struct cmtp_session *session, struct cmtp_application *app) +{ + BT_DBG("session %p application %p", session, app); + + if (app) { + list_del(&app->list); + kfree(app); + } +} + +static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value) +{ + struct cmtp_application *app; + struct list_head *p, *n; + + list_for_each_safe(p, n, &session->applications) { + app = list_entry(p, struct cmtp_application, list); + switch (pattern) { + case CMTP_MSGNUM: + if (app->msgnum == value) + return app; + break; + case CMTP_APPLID: + if (app->appl == value) + return app; + break; + case CMTP_MAPPING: + if (app->mapping == value) + return app; + break; + } + } + + return NULL; +} + +static int cmtp_msgnum_get(struct cmtp_session *session) +{ + session->msgnum++; + + if ((session->msgnum & 0xff) > 200) + session->msgnum = CMTP_INITIAL_MSGNUM + 1; + + return session->msgnum; +} + +static void cmtp_send_capimsg(struct cmtp_session *session, struct sk_buff *skb) +{ + struct cmtp_scb *scb = (void *) skb->cb; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + scb->id = -1; + scb->data = (CAPIMSG_COMMAND(skb->data) == CAPI_DATA_B3); + + skb_queue_tail(&session->transmit, skb); + + cmtp_schedule(session); +} + +static void cmtp_send_interopmsg(struct cmtp_session *session, + __u8 subcmd, __u16 appl, __u16 msgnum, + __u16 function, unsigned char *buf, int len) +{ + struct sk_buff *skb; + unsigned char *s; + + BT_DBG("session %p subcmd 0x%02x appl %d msgnum %d", session, subcmd, appl, msgnum); + + if (!(skb = alloc_skb(CAPI_MSG_BASELEN + 6 + len, GFP_ATOMIC))) { + BT_ERR("Can't allocate memory for interoperability packet"); + return; + } + + s = skb_put(skb, CAPI_MSG_BASELEN + 6 + len); + + capimsg_setu16(s, 0, CAPI_MSG_BASELEN + 6 + len); + capimsg_setu16(s, 2, appl); + capimsg_setu8 (s, 4, CAPI_INTEROPERABILITY); + capimsg_setu8 (s, 5, subcmd); + capimsg_setu16(s, 6, msgnum); + + /* Interoperability selector (Bluetooth Device Management) */ + capimsg_setu16(s, 8, 0x0001); + + capimsg_setu8 (s, 10, 3 + len); + capimsg_setu16(s, 11, 
function); + capimsg_setu8 (s, 13, len); + + if (len > 0) + memcpy(s + 14, buf, len); + + cmtp_send_capimsg(session, skb); +} + +static void cmtp_recv_interopmsg(struct cmtp_session *session, struct sk_buff *skb) +{ + struct capi_ctr *ctrl = &session->ctrl; + struct cmtp_application *application; + __u16 appl, msgnum, func, info; + __u32 controller; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + switch (CAPIMSG_SUBCOMMAND(skb->data)) { + case CAPI_CONF: + func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 5); + info = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 8); + + switch (func) { + case CAPI_FUNCTION_REGISTER: + msgnum = CAPIMSG_MSGID(skb->data); + + application = cmtp_application_get(session, CMTP_MSGNUM, msgnum); + if (application) { + application->state = BT_CONNECTED; + application->msgnum = 0; + application->mapping = CAPIMSG_APPID(skb->data); + wake_up_interruptible(&session->wait); + } + + break; + + case CAPI_FUNCTION_RELEASE: + appl = CAPIMSG_APPID(skb->data); + + application = cmtp_application_get(session, CMTP_MAPPING, appl); + if (application) { + application->state = BT_CLOSED; + application->msgnum = 0; + wake_up_interruptible(&session->wait); + } + + break; + + case CAPI_FUNCTION_GET_PROFILE: + controller = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 11); + msgnum = CAPIMSG_MSGID(skb->data); + + if (!info && (msgnum == CMTP_INITIAL_MSGNUM)) { + session->ncontroller = controller; + wake_up_interruptible(&session->wait); + break; + } + + if (!info && ctrl) { + memcpy(&ctrl->profile, + skb->data + CAPI_MSG_BASELEN + 11, + sizeof(capi_profile)); + session->state = BT_CONNECTED; + capi_ctr_ready(ctrl); + } + + break; + + case CAPI_FUNCTION_GET_MANUFACTURER: + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 10); + + if (!info && ctrl) { + strncpy(ctrl->manu, + skb->data + CAPI_MSG_BASELEN + 15, + skb->data[CAPI_MSG_BASELEN + 14]); + } + + break; + + case CAPI_FUNCTION_GET_VERSION: + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12); + + if (!info && ctrl) { + ctrl->version.majorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 16); + ctrl->version.minorversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 20); + ctrl->version.majormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 24); + ctrl->version.minormanuversion = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 28); + } + + break; + + case CAPI_FUNCTION_GET_SERIAL_NUMBER: + controller = CAPIMSG_U32(skb->data, CAPI_MSG_BASELEN + 12); + + if (!info && ctrl) { + memset(ctrl->serial, 0, CAPI_SERIAL_LEN); + strncpy(ctrl->serial, + skb->data + CAPI_MSG_BASELEN + 17, + skb->data[CAPI_MSG_BASELEN + 16]); + } + + break; + } + + break; + + case CAPI_IND: + func = CAPIMSG_U16(skb->data, CAPI_MSG_BASELEN + 3); + + if (func == CAPI_FUNCTION_LOOPBACK) { + appl = CAPIMSG_APPID(skb->data); + msgnum = CAPIMSG_MSGID(skb->data); + cmtp_send_interopmsg(session, CAPI_RESP, appl, msgnum, func, + skb->data + CAPI_MSG_BASELEN + 6, + skb->data[CAPI_MSG_BASELEN + 5]); + } + + break; + } + + kfree_skb(skb); +} + +void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb) +{ + struct capi_ctr *ctrl = &session->ctrl; + struct cmtp_application *application; + __u16 cmd, appl; + __u32 contr; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + if (CAPIMSG_COMMAND(skb->data) == CAPI_INTEROPERABILITY) { + cmtp_recv_interopmsg(session, skb); + return; + } + + if (session->flags & (1 << CMTP_LOOPBACK)) { + kfree_skb(skb); + return; + } + + cmd = CAPICMD(CAPIMSG_COMMAND(skb->data), 
CAPIMSG_SUBCOMMAND(skb->data)); + appl = CAPIMSG_APPID(skb->data); + contr = CAPIMSG_CONTROL(skb->data); + + application = cmtp_application_get(session, CMTP_MAPPING, appl); + if (application) { + appl = application->appl; + CAPIMSG_SETAPPID(skb->data, appl); + } else { + BT_ERR("Can't find application with id %d", appl); + kfree_skb(skb); + return; + } + + if ((contr & 0x7f) == 0x01) { + contr = (contr & 0xffffff80) | session->num; + CAPIMSG_SETCONTROL(skb->data, contr); + } + + if (!ctrl) { + BT_ERR("Can't find controller %d for message", session->num); + kfree_skb(skb); + return; + } + + capi_ctr_handle_message(ctrl, appl, skb); +} + +static int cmtp_load_firmware(struct capi_ctr *ctrl, capiloaddata *data) +{ + BT_DBG("ctrl %p data %p", ctrl, data); + + return 0; +} + +static void cmtp_reset_ctr(struct capi_ctr *ctrl) +{ + struct cmtp_session *session = ctrl->driverdata; + + BT_DBG("ctrl %p", ctrl); + + capi_ctr_reseted(ctrl); + + atomic_inc(&session->terminate); + cmtp_schedule(session); +} + +static void cmtp_register_appl(struct capi_ctr *ctrl, __u16 appl, capi_register_params *rp) +{ + DECLARE_WAITQUEUE(wait, current); + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *application; + unsigned long timeo = CMTP_INTEROP_TIMEOUT; + unsigned char buf[8]; + int err = 0, nconn, want = rp->level3cnt; + + BT_DBG("ctrl %p appl %d level3cnt %d datablkcnt %d datablklen %d", + ctrl, appl, rp->level3cnt, rp->datablkcnt, rp->datablklen); + + application = cmtp_application_add(session, appl); + if (!application) { + BT_ERR("Can't allocate memory for new application"); + return; + } + + if (want < 0) + nconn = ctrl->profile.nbchannel * -want; + else + nconn = want; + + if (nconn == 0) + nconn = ctrl->profile.nbchannel; + + capimsg_setu16(buf, 0, nconn); + capimsg_setu16(buf, 2, rp->datablkcnt); + capimsg_setu16(buf, 4, rp->datablklen); + + application->state = BT_CONFIG; + application->msgnum = cmtp_msgnum_get(session); + + cmtp_send_interopmsg(session, CAPI_REQ, 0x0000, application->msgnum, + CAPI_FUNCTION_REGISTER, buf, 6); + + add_wait_queue(&session->wait, &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!timeo) { + err = -EAGAIN; + break; + } + + if (application->state == BT_CLOSED) { + err = -application->err; + break; + } + + if (application->state == BT_CONNECTED) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + timeo = schedule_timeout(timeo); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&session->wait, &wait); + + if (err) { + cmtp_application_del(session, application); + return; + } +} + +static void cmtp_release_appl(struct capi_ctr *ctrl, __u16 appl) +{ + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *application; + + BT_DBG("ctrl %p appl %d", ctrl, appl); + + application = cmtp_application_get(session, CMTP_APPLID, appl); + if (!application) { + BT_ERR("Can't find application"); + return; + } + + application->msgnum = cmtp_msgnum_get(session); + + cmtp_send_interopmsg(session, CAPI_REQ, application->mapping, application->msgnum, + CAPI_FUNCTION_RELEASE, NULL, 0); + + wait_event_interruptible_timeout(session->wait, + (application->state == BT_CLOSED), CMTP_INTEROP_TIMEOUT); + + cmtp_application_del(session, application); +} + +static u16 cmtp_send_message(struct capi_ctr *ctrl, struct sk_buff *skb) +{ + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *application; + __u16 appl; + __u32 contr; + + BT_DBG("ctrl %p skb %p", ctrl, skb); + + appl 
= CAPIMSG_APPID(skb->data); + contr = CAPIMSG_CONTROL(skb->data); + + application = cmtp_application_get(session, CMTP_APPLID, appl); + if ((!application) || (application->state != BT_CONNECTED)) { + BT_ERR("Can't find application with id %d", appl); + return CAPI_ILLAPPNR; + } + + CAPIMSG_SETAPPID(skb->data, application->mapping); + + if ((contr & 0x7f) == session->num) { + contr = (contr & 0xffffff80) | 0x01; + CAPIMSG_SETCONTROL(skb->data, contr); + } + + cmtp_send_capimsg(session, skb); + + return CAPI_NOERROR; +} + +static char *cmtp_procinfo(struct capi_ctr *ctrl) +{ + return "CAPI Message Transport Protocol"; +} + +static int cmtp_ctr_read_proc(char *page, char **start, off_t off, int count, int *eof, struct capi_ctr *ctrl) +{ + struct cmtp_session *session = ctrl->driverdata; + struct cmtp_application *app; + struct list_head *p, *n; + int len = 0; + + len += sprintf(page + len, "%s\n\n", cmtp_procinfo(ctrl)); + len += sprintf(page + len, "addr %s\n", session->name); + len += sprintf(page + len, "ctrl %d\n", session->num); + + list_for_each_safe(p, n, &session->applications) { + app = list_entry(p, struct cmtp_application, list); + len += sprintf(page + len, "appl %d -> %d\n", app->appl, app->mapping); + } + + if (off + count >= len) + *eof = 1; + + if (len < off) + return 0; + + *start = page + off; + + return ((count < len - off) ? count : len - off); +} + + +int cmtp_attach_device(struct cmtp_session *session) +{ + unsigned char buf[4]; + long ret; + + BT_DBG("session %p", session); + + capimsg_setu32(buf, 0, 0); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, CMTP_INITIAL_MSGNUM, + CAPI_FUNCTION_GET_PROFILE, buf, 4); + + ret = wait_event_interruptible_timeout(session->wait, + session->ncontroller, CMTP_INTEROP_TIMEOUT); + + BT_INFO("Found %d CAPI controller(s) on device %s", session->ncontroller, session->name); + + if (!ret) + return -ETIMEDOUT; + + if (!session->ncontroller) + return -ENODEV; + + if (session->ncontroller > 1) + BT_INFO("Setting up only CAPI controller 1"); + + session->ctrl.owner = THIS_MODULE; + session->ctrl.driverdata = session; + strcpy(session->ctrl.name, session->name); + + session->ctrl.driver_name = "cmtp"; + session->ctrl.load_firmware = cmtp_load_firmware; + session->ctrl.reset_ctr = cmtp_reset_ctr; + session->ctrl.register_appl = cmtp_register_appl; + session->ctrl.release_appl = cmtp_release_appl; + session->ctrl.send_message = cmtp_send_message; + + session->ctrl.procinfo = cmtp_procinfo; + session->ctrl.ctr_read_proc = cmtp_ctr_read_proc; + + if (attach_capi_ctr(&session->ctrl) < 0) { + BT_ERR("Can't attach new controller"); + return -EBUSY; + } + + session->num = session->ctrl.cnr; + + BT_DBG("session %p num %d", session, session->num); + + capimsg_setu32(buf, 0, 1); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_MANUFACTURER, buf, 4); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_VERSION, buf, 4); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_SERIAL_NUMBER, buf, 4); + + cmtp_send_interopmsg(session, CAPI_REQ, 0xffff, cmtp_msgnum_get(session), + CAPI_FUNCTION_GET_PROFILE, buf, 4); + + return 0; +} + +void cmtp_detach_device(struct cmtp_session *session) +{ + BT_DBG("session %p", session); + + detach_capi_ctr(&session->ctrl); +} diff --git a/net/bluetooth/cmtp/cmtp.h b/net/bluetooth/cmtp/cmtp.h new file mode 100644 index 000000000000..40e3dfec0cc8 --- /dev/null +++ 
b/net/bluetooth/cmtp/cmtp.h @@ -0,0 +1,135 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#ifndef __CMTP_H +#define __CMTP_H + +#include +#include + +#define BTNAMSIZ 18 + +/* CMTP ioctl defines */ +#define CMTPCONNADD _IOW('C', 200, int) +#define CMTPCONNDEL _IOW('C', 201, int) +#define CMTPGETCONNLIST _IOR('C', 210, int) +#define CMTPGETCONNINFO _IOR('C', 211, int) + +#define CMTP_LOOPBACK 0 + +struct cmtp_connadd_req { + int sock; // Connected socket + __u32 flags; +}; + +struct cmtp_conndel_req { + bdaddr_t bdaddr; + __u32 flags; +}; + +struct cmtp_conninfo { + bdaddr_t bdaddr; + __u32 flags; + __u16 state; + int num; +}; + +struct cmtp_connlist_req { + __u32 cnum; + struct cmtp_conninfo __user *ci; +}; + +int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock); +int cmtp_del_connection(struct cmtp_conndel_req *req); +int cmtp_get_connlist(struct cmtp_connlist_req *req); +int cmtp_get_conninfo(struct cmtp_conninfo *ci); + +/* CMTP session defines */ +#define CMTP_INTEROP_TIMEOUT (HZ * 5) +#define CMTP_INITIAL_MSGNUM 0xff00 + +struct cmtp_session { + struct list_head list; + + struct socket *sock; + + bdaddr_t bdaddr; + + unsigned long state; + unsigned long flags; + + uint mtu; + + char name[BTNAMSIZ]; + + atomic_t terminate; + + wait_queue_head_t wait; + + int ncontroller; + int num; + struct capi_ctr ctrl; + + struct list_head applications; + + unsigned long blockids; + int msgnum; + + struct sk_buff_head transmit; + + struct sk_buff *reassembly[16]; +}; + +struct cmtp_application { + struct list_head list; + + unsigned long state; + int err; + + __u16 appl; + __u16 mapping; + + __u16 msgnum; +}; + +struct cmtp_scb { + int id; + int data; +}; + +int cmtp_attach_device(struct cmtp_session *session); +void cmtp_detach_device(struct cmtp_session *session); + +void cmtp_recv_capimsg(struct cmtp_session *session, struct sk_buff *skb); + +static inline void cmtp_schedule(struct cmtp_session *session) +{ + struct sock *sk = session->sock->sk; + + wake_up_interruptible(sk->sk_sleep); +} + +/* CMTP init defines */ +int cmtp_init_sockets(void); +void cmtp_cleanup_sockets(void); + +#endif /* __CMTP_H */ diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c new file mode 100644 index 000000000000..20ce04f2be8b --- /dev/null +++ b/net/bluetooth/cmtp/core.c @@ -0,0 +1,504 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). 
+ Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "cmtp.h" + +#ifndef CONFIG_BT_CMTP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +#define VERSION "1.0" + +static DECLARE_RWSEM(cmtp_session_sem); +static LIST_HEAD(cmtp_session_list); + +static struct cmtp_session *__cmtp_get_session(bdaddr_t *bdaddr) +{ + struct cmtp_session *session; + struct list_head *p; + + BT_DBG(""); + + list_for_each(p, &cmtp_session_list) { + session = list_entry(p, struct cmtp_session, list); + if (!bacmp(bdaddr, &session->bdaddr)) + return session; + } + return NULL; +} + +static void __cmtp_link_session(struct cmtp_session *session) +{ + __module_get(THIS_MODULE); + list_add(&session->list, &cmtp_session_list); +} + +static void __cmtp_unlink_session(struct cmtp_session *session) +{ + list_del(&session->list); + module_put(THIS_MODULE); +} + +static void __cmtp_copy_session(struct cmtp_session *session, struct cmtp_conninfo *ci) +{ + bacpy(&ci->bdaddr, &session->bdaddr); + + ci->flags = session->flags; + ci->state = session->state; + + ci->num = session->num; +} + + +static inline int cmtp_alloc_block_id(struct cmtp_session *session) +{ + int i, id = -1; + + for (i = 0; i < 16; i++) + if (!test_and_set_bit(i, &session->blockids)) { + id = i; + break; + } + + return id; +} + +static inline void cmtp_free_block_id(struct cmtp_session *session, int id) +{ + clear_bit(id, &session->blockids); +} + +static inline void cmtp_add_msgpart(struct cmtp_session *session, int id, const unsigned char *buf, int count) +{ + struct sk_buff *skb = session->reassembly[id], *nskb; + int size; + + BT_DBG("session %p buf %p count %d", session, buf, count); + + size = (skb) ? 
skb->len + count : count; + + if (!(nskb = alloc_skb(size, GFP_ATOMIC))) { + BT_ERR("Can't allocate memory for CAPI message"); + return; + } + + if (skb && (skb->len > 0)) + memcpy(skb_put(nskb, skb->len), skb->data, skb->len); + + memcpy(skb_put(nskb, count), buf, count); + + session->reassembly[id] = nskb; + + if (skb) + kfree_skb(skb); +} + +static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *skb) +{ + __u8 hdr, hdrlen, id; + __u16 len; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + while (skb->len > 0) { + hdr = skb->data[0]; + + switch (hdr & 0xc0) { + case 0x40: + hdrlen = 2; + len = skb->data[1]; + break; + case 0x80: + hdrlen = 3; + len = skb->data[1] | (skb->data[2] << 8); + break; + default: + hdrlen = 1; + len = 0; + break; + } + + id = (hdr & 0x3c) >> 2; + + BT_DBG("hdr 0x%02x hdrlen %d len %d id %d", hdr, hdrlen, len, id); + + if (hdrlen + len > skb->len) { + BT_ERR("Wrong size or header information in CMTP frame"); + break; + } + + if (len == 0) { + skb_pull(skb, hdrlen); + continue; + } + + switch (hdr & 0x03) { + case 0x00: + cmtp_add_msgpart(session, id, skb->data + hdrlen, len); + cmtp_recv_capimsg(session, session->reassembly[id]); + session->reassembly[id] = NULL; + break; + case 0x01: + cmtp_add_msgpart(session, id, skb->data + hdrlen, len); + break; + default: + if (session->reassembly[id] != NULL) + kfree_skb(session->reassembly[id]); + session->reassembly[id] = NULL; + break; + } + + skb_pull(skb, hdrlen + len); + } + + kfree_skb(skb); + return 0; +} + +static int cmtp_send_frame(struct cmtp_session *session, unsigned char *data, int len) +{ + struct socket *sock = session->sock; + struct kvec iv = { data, len }; + struct msghdr msg; + + BT_DBG("session %p data %p len %d", session, data, len); + + if (!len) + return 0; + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, &iv, 1, len); +} + +static int cmtp_process_transmit(struct cmtp_session *session) +{ + struct sk_buff *skb, *nskb; + unsigned char *hdr; + unsigned int size, tail; + + BT_DBG("session %p", session); + + if (!(nskb = alloc_skb(session->mtu, GFP_ATOMIC))) { + BT_ERR("Can't allocate memory for new frame"); + return -ENOMEM; + } + + while ((skb = skb_dequeue(&session->transmit))) { + struct cmtp_scb *scb = (void *) skb->cb; + + if ((tail = (session->mtu - nskb->len)) < 5) { + cmtp_send_frame(session, nskb->data, nskb->len); + skb_trim(nskb, 0); + tail = session->mtu; + } + + size = min_t(uint, ((tail < 258) ? (tail - 2) : (tail - 3)), skb->len); + + if ((scb->id < 0) && ((scb->id = cmtp_alloc_block_id(session)) < 0)) { + skb_queue_head(&session->transmit, skb); + break; + } + + if (size < 256) { + hdr = skb_put(nskb, 2); + hdr[0] = 0x40 + | ((scb->id << 2) & 0x3c) + | ((skb->len == size) ? 0x00 : 0x01); + hdr[1] = size; + } else { + hdr = skb_put(nskb, 3); + hdr[0] = 0x80 + | ((scb->id << 2) & 0x3c) + | ((skb->len == size) ? 
0x00 : 0x01); + hdr[1] = size & 0xff; + hdr[2] = size >> 8; + } + + memcpy(skb_put(nskb, size), skb->data, size); + skb_pull(skb, size); + + if (skb->len > 0) { + skb_queue_head(&session->transmit, skb); + } else { + cmtp_free_block_id(session, scb->id); + if (scb->data) { + cmtp_send_frame(session, nskb->data, nskb->len); + skb_trim(nskb, 0); + } + kfree_skb(skb); + } + } + + cmtp_send_frame(session, nskb->data, nskb->len); + + kfree_skb(nskb); + + return skb_queue_len(&session->transmit); +} + +static int cmtp_session(void *arg) +{ + struct cmtp_session *session = arg; + struct sock *sk = session->sock->sk; + struct sk_buff *skb; + wait_queue_t wait; + + BT_DBG("session %p", session); + + daemonize("kcmtpd_ctr_%d", session->num); + set_user_nice(current, -15); + current->flags |= PF_NOFREEZE; + + init_waitqueue_entry(&wait, current); + add_wait_queue(sk->sk_sleep, &wait); + while (!atomic_read(&session->terminate)) { + set_current_state(TASK_INTERRUPTIBLE); + + if (sk->sk_state != BT_CONNECTED) + break; + + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + cmtp_recv_frame(session, skb); + } + + cmtp_process_transmit(session); + + schedule(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + + down_write(&cmtp_session_sem); + + if (!(session->flags & (1 << CMTP_LOOPBACK))) + cmtp_detach_device(session); + + fput(session->sock->file); + + __cmtp_unlink_session(session); + + up_write(&cmtp_session_sem); + + kfree(session); + return 0; +} + +int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) +{ + struct cmtp_session *session, *s; + bdaddr_t src, dst; + int i, err; + + BT_DBG(""); + + baswap(&src, &bt_sk(sock->sk)->src); + baswap(&dst, &bt_sk(sock->sk)->dst); + + session = kmalloc(sizeof(struct cmtp_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + memset(session, 0, sizeof(struct cmtp_session)); + + down_write(&cmtp_session_sem); + + s = __cmtp_get_session(&bt_sk(sock->sk)->dst); + if (s && s->state == BT_CONNECTED) { + err = -EEXIST; + goto failed; + } + + bacpy(&session->bdaddr, &bt_sk(sock->sk)->dst); + + session->mtu = min_t(uint, l2cap_pi(sock->sk)->omtu, l2cap_pi(sock->sk)->imtu); + + BT_DBG("mtu %d", session->mtu); + + sprintf(session->name, "%s", batostr(&dst)); + + session->sock = sock; + session->state = BT_CONFIG; + + init_waitqueue_head(&session->wait); + + session->msgnum = CMTP_INITIAL_MSGNUM; + + INIT_LIST_HEAD(&session->applications); + + skb_queue_head_init(&session->transmit); + + for (i = 0; i < 16; i++) + session->reassembly[i] = NULL; + + session->flags = req->flags; + + __cmtp_link_session(session); + + err = kernel_thread(cmtp_session, session, CLONE_KERNEL); + if (err < 0) + goto unlink; + + if (!(session->flags & (1 << CMTP_LOOPBACK))) { + err = cmtp_attach_device(session); + if (err < 0) + goto detach; + } + + up_write(&cmtp_session_sem); + return 0; + +detach: + cmtp_detach_device(session); + +unlink: + __cmtp_unlink_session(session); + +failed: + up_write(&cmtp_session_sem); + kfree(session); + return err; +} + +int cmtp_del_connection(struct cmtp_conndel_req *req) +{ + struct cmtp_session *session; + int err = 0; + + BT_DBG(""); + + down_read(&cmtp_session_sem); + + session = __cmtp_get_session(&req->bdaddr); + if (session) { + /* Flush the transmit queue */ + skb_queue_purge(&session->transmit); + + /* Kill session thread */ + atomic_inc(&session->terminate); + cmtp_schedule(session); + } else + err = -ENOENT; + + up_read(&cmtp_session_sem); + return err; +} + +int 
cmtp_get_connlist(struct cmtp_connlist_req *req) +{ + struct list_head *p; + int err = 0, n = 0; + + BT_DBG(""); + + down_read(&cmtp_session_sem); + + list_for_each(p, &cmtp_session_list) { + struct cmtp_session *session; + struct cmtp_conninfo ci; + + session = list_entry(p, struct cmtp_session, list); + + __cmtp_copy_session(session, &ci); + + if (copy_to_user(req->ci, &ci, sizeof(ci))) { + err = -EFAULT; + break; + } + + if (++n >= req->cnum) + break; + + req->ci++; + } + req->cnum = n; + + up_read(&cmtp_session_sem); + return err; +} + +int cmtp_get_conninfo(struct cmtp_conninfo *ci) +{ + struct cmtp_session *session; + int err = 0; + + down_read(&cmtp_session_sem); + + session = __cmtp_get_session(&ci->bdaddr); + if (session) + __cmtp_copy_session(session, ci); + else + err = -ENOENT; + + up_read(&cmtp_session_sem); + return err; +} + + +static int __init cmtp_init(void) +{ + l2cap_load(); + + BT_INFO("CMTP (CAPI Emulation) ver %s", VERSION); + + cmtp_init_sockets(); + + return 0; +} + +static void __exit cmtp_exit(void) +{ + cmtp_cleanup_sockets(); +} + +module_init(cmtp_init); +module_exit(cmtp_exit); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth CMTP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-5"); diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c new file mode 100644 index 000000000000..4c7f9e20dade --- /dev/null +++ b/net/bluetooth/cmtp/sock.c @@ -0,0 +1,226 @@ +/* + CMTP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002-2003 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "cmtp.h" + +#ifndef CONFIG_BT_CMTP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +static int cmtp_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + sock_orphan(sk); + sock_put(sk); + + return 0; +} + +static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct cmtp_connadd_req ca; + struct cmtp_conndel_req cd; + struct cmtp_connlist_req cl; + struct cmtp_conninfo ci; + struct socket *nsock; + void __user *argp = (void __user *)arg; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case CMTPCONNADD: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ca, argp, sizeof(ca))) + return -EFAULT; + + nsock = sockfd_lookup(ca.sock, &err); + if (!nsock) + return err; + + if (nsock->sk->sk_state != BT_CONNECTED) { + fput(nsock->file); + return -EBADFD; + } + + err = cmtp_add_connection(&ca, nsock); + if (!err) { + if (copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; + } else + fput(nsock->file); + + return err; + + case CMTPCONNDEL: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&cd, argp, sizeof(cd))) + return -EFAULT; + + return cmtp_del_connection(&cd); + + case CMTPGETCONNLIST: + if (copy_from_user(&cl, argp, sizeof(cl))) + return -EFAULT; + + if (cl.cnum <= 0) + return -EINVAL; + + err = cmtp_get_connlist(&cl); + if (!err && copy_to_user(argp, &cl, sizeof(cl))) + return -EFAULT; + + return err; + + case CMTPGETCONNINFO: + if (copy_from_user(&ci, argp, sizeof(ci))) + return -EFAULT; + + err = cmtp_get_conninfo(&ci); + if (!err && copy_to_user(argp, &ci, sizeof(ci))) + return -EFAULT; + + return err; + } + + return -EINVAL; +} + +static struct proto_ops cmtp_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = cmtp_sock_release, + .ioctl = cmtp_sock_ioctl, + .bind = sock_no_bind, + .getname = sock_no_getname, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto cmtp_proto = { + .name = "CMTP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bt_sock) +}; + +static int cmtp_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(PF_BLUETOOTH, GFP_KERNEL, &cmtp_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock->ops = &cmtp_sock_ops; + + sock->state = SS_UNCONNECTED; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + sk->sk_state = BT_OPEN; + + return 0; +} + +static struct net_proto_family cmtp_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = cmtp_sock_create +}; + +int cmtp_init_sockets(void) +{ + int err; + + err = proto_register(&cmtp_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_CMTP, &cmtp_sock_family_ops); + if (err < 0) + goto error; + + return 0; + +error: + BT_ERR("Can't register CMTP socket"); + proto_unregister(&cmtp_proto); + return err; +} + +void cmtp_cleanup_sockets(void) +{ + if (bt_sock_unregister(BTPROTO_CMTP) < 0) + BT_ERR("Can't unregister CMTP socket"); + + proto_unregister(&cmtp_proto); +} diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c new file mode 100644 index 
000000000000..71762d7e9970 --- /dev/null +++ b/net/bluetooth/hci_conn.c @@ -0,0 +1,471 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI connection handling. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifndef CONFIG_BT_HCI_CORE_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +static void hci_acl_connect(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + struct inquiry_entry *ie; + struct hci_cp_create_conn cp; + + BT_DBG("%p", conn); + + conn->state = BT_CONNECT; + conn->out = 1; + conn->link_mode = HCI_LM_MASTER; + + memset(&cp, 0, sizeof(cp)); + bacpy(&cp.bdaddr, &conn->dst); + cp.pscan_rep_mode = 0x02; + + if ((ie = hci_inquiry_cache_lookup(hdev, &conn->dst)) && + inquiry_entry_age(ie) <= INQUIRY_ENTRY_AGE_MAX) { + cp.pscan_rep_mode = ie->data.pscan_rep_mode; + cp.pscan_mode = ie->data.pscan_mode; + cp.clock_offset = ie->data.clock_offset | __cpu_to_le16(0x8000); + memcpy(conn->dev_class, ie->data.dev_class, 3); + } + + cp.pkt_type = __cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK); + if (lmp_rswitch_capable(hdev) && !(hdev->link_mode & HCI_LM_MASTER)) + cp.role_switch = 0x01; + else + cp.role_switch = 0x00; + + hci_send_cmd(hdev, OGF_LINK_CTL, OCF_CREATE_CONN, sizeof(cp), &cp); +} + +void hci_acl_disconn(struct hci_conn *conn, __u8 reason) +{ + struct hci_cp_disconnect cp; + + BT_DBG("%p", conn); + + conn->state = BT_DISCONN; + + cp.handle = __cpu_to_le16(conn->handle); + cp.reason = reason; + hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_DISCONNECT, sizeof(cp), &cp); +} + +void hci_add_sco(struct hci_conn *conn, __u16 handle) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_cp_add_sco cp; + + BT_DBG("%p", conn); + + conn->state = BT_CONNECT; + conn->out = 1; + + cp.pkt_type = __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); + cp.handle = __cpu_to_le16(handle); + + hci_send_cmd(hdev, OGF_LINK_CTL, OCF_ADD_SCO, sizeof(cp), &cp); +} + +static void hci_conn_timeout(unsigned long arg) +{ + struct hci_conn *conn = (void *)arg; + struct hci_dev *hdev = conn->hdev; + + BT_DBG("conn %p state %d", conn, conn->state); + + if (atomic_read(&conn->refcnt)) + return; + + hci_dev_lock(hdev); + if (conn->state == BT_CONNECTED) + hci_acl_disconn(conn, 0x13); + else + conn->state = BT_CLOSED; + hci_dev_unlock(hdev); + return; +} + +static void hci_conn_init_timer(struct hci_conn 
*conn) +{ + init_timer(&conn->timer); + conn->timer.function = hci_conn_timeout; + conn->timer.data = (unsigned long)conn; +} + +struct hci_conn *hci_conn_add(struct hci_dev *hdev, int type, bdaddr_t *dst) +{ + struct hci_conn *conn; + + BT_DBG("%s dst %s", hdev->name, batostr(dst)); + + if (!(conn = kmalloc(sizeof(struct hci_conn), GFP_ATOMIC))) + return NULL; + memset(conn, 0, sizeof(struct hci_conn)); + + bacpy(&conn->dst, dst); + conn->type = type; + conn->hdev = hdev; + conn->state = BT_OPEN; + + skb_queue_head_init(&conn->data_q); + hci_conn_init_timer(conn); + + atomic_set(&conn->refcnt, 0); + + hci_dev_hold(hdev); + + tasklet_disable(&hdev->tx_task); + + hci_conn_hash_add(hdev, conn); + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_ADD); + + tasklet_enable(&hdev->tx_task); + + return conn; +} + +int hci_conn_del(struct hci_conn *conn) +{ + struct hci_dev *hdev = conn->hdev; + + BT_DBG("%s conn %p handle %d", hdev->name, conn, conn->handle); + + hci_conn_del_timer(conn); + + if (conn->type == SCO_LINK) { + struct hci_conn *acl = conn->link; + if (acl) { + acl->link = NULL; + hci_conn_put(acl); + } + } else { + struct hci_conn *sco = conn->link; + if (sco) + sco->link = NULL; + + /* Unacked frames */ + hdev->acl_cnt += conn->sent; + } + + tasklet_disable(&hdev->tx_task); + + hci_conn_hash_del(hdev, conn); + if (hdev->notify) + hdev->notify(hdev, HCI_NOTIFY_CONN_DEL); + + tasklet_enable(&hdev->tx_task); + + skb_queue_purge(&conn->data_q); + + hci_dev_put(hdev); + + kfree(conn); + return 0; +} + +struct hci_dev *hci_get_route(bdaddr_t *dst, bdaddr_t *src) +{ + int use_src = bacmp(src, BDADDR_ANY); + struct hci_dev *hdev = NULL; + struct list_head *p; + + BT_DBG("%s -> %s", batostr(src), batostr(dst)); + + read_lock_bh(&hci_dev_list_lock); + + list_for_each(p, &hci_dev_list) { + struct hci_dev *d = list_entry(p, struct hci_dev, list); + + if (!test_bit(HCI_UP, &d->flags) || test_bit(HCI_RAW, &d->flags)) + continue; + + /* Simple routing: + * No source address - find interface with bdaddr != dst + * Source address - find interface with bdaddr == src + */ + + if (use_src) { + if (!bacmp(&d->bdaddr, src)) { + hdev = d; break; + } + } else { + if (bacmp(&d->bdaddr, dst)) { + hdev = d; break; + } + } + } + + if (hdev) + hdev = hci_dev_hold(hdev); + + read_unlock_bh(&hci_dev_list_lock); + return hdev; +} +EXPORT_SYMBOL(hci_get_route); + +/* Create SCO or ACL connection. 
+ * Device _must_ be locked */ +struct hci_conn * hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst) +{ + struct hci_conn *acl; + + BT_DBG("%s dst %s", hdev->name, batostr(dst)); + + if (!(acl = hci_conn_hash_lookup_ba(hdev, ACL_LINK, dst))) { + if (!(acl = hci_conn_add(hdev, ACL_LINK, dst))) + return NULL; + } + + hci_conn_hold(acl); + + if (acl->state == BT_OPEN || acl->state == BT_CLOSED) + hci_acl_connect(acl); + + if (type == SCO_LINK) { + struct hci_conn *sco; + + if (!(sco = hci_conn_hash_lookup_ba(hdev, SCO_LINK, dst))) { + if (!(sco = hci_conn_add(hdev, SCO_LINK, dst))) { + hci_conn_put(acl); + return NULL; + } + } + acl->link = sco; + sco->link = acl; + + hci_conn_hold(sco); + + if (acl->state == BT_CONNECTED && + (sco->state == BT_OPEN || sco->state == BT_CLOSED)) + hci_add_sco(sco, acl->handle); + + return sco; + } else { + return acl; + } +} +EXPORT_SYMBOL(hci_connect); + +/* Authenticate remote device */ +int hci_conn_auth(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (conn->link_mode & HCI_LM_AUTH) + return 1; + + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + struct hci_cp_auth_requested cp; + cp.handle = __cpu_to_le16(conn->handle); + hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_AUTH_REQUESTED, sizeof(cp), &cp); + } + return 0; +} +EXPORT_SYMBOL(hci_conn_auth); + +/* Enable encryption */ +int hci_conn_encrypt(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (conn->link_mode & HCI_LM_ENCRYPT) + return 1; + + if (test_and_set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) + return 0; + + if (hci_conn_auth(conn)) { + struct hci_cp_set_conn_encrypt cp; + cp.handle = __cpu_to_le16(conn->handle); + cp.encrypt = 1; + hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_SET_CONN_ENCRYPT, sizeof(cp), &cp); + } + return 0; +} +EXPORT_SYMBOL(hci_conn_encrypt); + +/* Change link key */ +int hci_conn_change_link_key(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { + struct hci_cp_change_conn_link_key cp; + cp.handle = __cpu_to_le16(conn->handle); + hci_send_cmd(conn->hdev, OGF_LINK_CTL, OCF_CHANGE_CONN_LINK_KEY, sizeof(cp), &cp); + } + return 0; +} +EXPORT_SYMBOL(hci_conn_change_link_key); + +/* Switch role */ +int hci_conn_switch_role(struct hci_conn *conn, uint8_t role) +{ + BT_DBG("conn %p", conn); + + if (!role && conn->link_mode & HCI_LM_MASTER) + return 1; + + if (!test_and_set_bit(HCI_CONN_RSWITCH_PEND, &conn->pend)) { + struct hci_cp_switch_role cp; + bacpy(&cp.bdaddr, &conn->dst); + cp.role = role; + hci_send_cmd(conn->hdev, OGF_LINK_POLICY, OCF_SWITCH_ROLE, sizeof(cp), &cp); + } + return 0; +} +EXPORT_SYMBOL(hci_conn_switch_role); + +/* Drop all connection on the device */ +void hci_conn_hash_flush(struct hci_dev *hdev) +{ + struct hci_conn_hash *h = &hdev->conn_hash; + struct list_head *p; + + BT_DBG("hdev %s", hdev->name); + + p = h->list.next; + while (p != &h->list) { + struct hci_conn *c; + + c = list_entry(p, struct hci_conn, list); + p = p->next; + + c->state = BT_CLOSED; + + hci_proto_disconn_ind(c, 0x16); + hci_conn_del(c); + } +} + +int hci_get_conn_list(void __user *arg) +{ + struct hci_conn_list_req req, *cl; + struct hci_conn_info *ci; + struct hci_dev *hdev; + struct list_head *p; + int n = 0, size, err; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + if (!req.conn_num || req.conn_num > (PAGE_SIZE * 2) / sizeof(*ci)) + return -EINVAL; + + size = sizeof(req) + req.conn_num * sizeof(*ci); + + if (!(cl = (void *) kmalloc(size, GFP_KERNEL))) + return 
-ENOMEM; + + if (!(hdev = hci_dev_get(req.dev_id))) { + kfree(cl); + return -ENODEV; + } + + ci = cl->conn_info; + + hci_dev_lock_bh(hdev); + list_for_each(p, &hdev->conn_hash.list) { + register struct hci_conn *c; + c = list_entry(p, struct hci_conn, list); + + bacpy(&(ci + n)->bdaddr, &c->dst); + (ci + n)->handle = c->handle; + (ci + n)->type = c->type; + (ci + n)->out = c->out; + (ci + n)->state = c->state; + (ci + n)->link_mode = c->link_mode; + if (++n >= req.conn_num) + break; + } + hci_dev_unlock_bh(hdev); + + cl->dev_id = hdev->id; + cl->conn_num = n; + size = sizeof(req) + n * sizeof(*ci); + + hci_dev_put(hdev); + + err = copy_to_user(arg, cl, size); + kfree(cl); + + return err ? -EFAULT : 0; +} + +int hci_get_conn_info(struct hci_dev *hdev, void __user *arg) +{ + struct hci_conn_info_req req; + struct hci_conn_info ci; + struct hci_conn *conn; + char __user *ptr = arg + sizeof(req); + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + hci_dev_lock_bh(hdev); + conn = hci_conn_hash_lookup_ba(hdev, req.type, &req.bdaddr); + if (conn) { + bacpy(&ci.bdaddr, &conn->dst); + ci.handle = conn->handle; + ci.type = conn->type; + ci.out = conn->out; + ci.state = conn->state; + ci.link_mode = conn->link_mode; + } + hci_dev_unlock_bh(hdev); + + if (!conn) + return -ENOENT; + + return copy_to_user(ptr, &ci, sizeof(ci)) ? -EFAULT : 0; +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c new file mode 100644 index 000000000000..860dba7bdd89 --- /dev/null +++ b/net/bluetooth/hci_core.c @@ -0,0 +1,1434 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI core. */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifndef CONFIG_BT_HCI_CORE_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +static void hci_cmd_task(unsigned long arg); +static void hci_rx_task(unsigned long arg); +static void hci_tx_task(unsigned long arg); +static void hci_notify(struct hci_dev *hdev, int event); + +static DEFINE_RWLOCK(hci_task_lock); + +/* HCI device list */ +LIST_HEAD(hci_dev_list); +DEFINE_RWLOCK(hci_dev_list_lock); + +/* HCI callback list */ +LIST_HEAD(hci_cb_list); +DEFINE_RWLOCK(hci_cb_list_lock); + +/* HCI protocols */ +#define HCI_MAX_PROTO 2 +struct hci_proto *hci_proto[HCI_MAX_PROTO]; + +/* HCI notifiers list */ +static struct notifier_block *hci_notifier; + +/* ---- HCI notifications ---- */ + +int hci_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&hci_notifier, nb); +} + +int hci_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&hci_notifier, nb); +} + +void hci_notify(struct hci_dev *hdev, int event) +{ + notifier_call_chain(&hci_notifier, event, hdev); +} + +/* ---- HCI requests ---- */ + +void hci_req_complete(struct hci_dev *hdev, int result) +{ + BT_DBG("%s result 0x%2.2x", hdev->name, result); + + if (hdev->req_status == HCI_REQ_PEND) { + hdev->req_result = result; + hdev->req_status = HCI_REQ_DONE; + wake_up_interruptible(&hdev->req_wait_q); + } +} + +static void hci_req_cancel(struct hci_dev *hdev, int err) +{ + BT_DBG("%s err 0x%2.2x", hdev->name, err); + + if (hdev->req_status == HCI_REQ_PEND) { + hdev->req_result = err; + hdev->req_status = HCI_REQ_CANCELED; + wake_up_interruptible(&hdev->req_wait_q); + } +} + +/* Execute request and wait for completion. */ +static int __hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt), + unsigned long opt, __u32 timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int err = 0; + + BT_DBG("%s start", hdev->name); + + hdev->req_status = HCI_REQ_PEND; + + add_wait_queue(&hdev->req_wait_q, &wait); + set_current_state(TASK_INTERRUPTIBLE); + + req(hdev, opt); + schedule_timeout(timeout); + + remove_wait_queue(&hdev->req_wait_q, &wait); + + if (signal_pending(current)) + return -EINTR; + + switch (hdev->req_status) { + case HCI_REQ_DONE: + err = -bt_err(hdev->req_result); + break; + + case HCI_REQ_CANCELED: + err = -hdev->req_result; + break; + + default: + err = -ETIMEDOUT; + break; + }; + + hdev->req_status = hdev->req_result = 0; + + BT_DBG("%s end: err %d", hdev->name, err); + + return err; +} + +static inline int hci_request(struct hci_dev *hdev, void (*req)(struct hci_dev *hdev, unsigned long opt), + unsigned long opt, __u32 timeout) +{ + int ret; + + /* Serialize all requests */ + hci_req_lock(hdev); + ret = __hci_request(hdev, req, opt, timeout); + hci_req_unlock(hdev); + + return ret; +} + +static void hci_reset_req(struct hci_dev *hdev, unsigned long opt) +{ + BT_DBG("%s %ld", hdev->name, opt); + + /* Reset device */ + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_RESET, 0, NULL); +} + +static void hci_init_req(struct hci_dev *hdev, unsigned long opt) +{ + struct sk_buff *skb; + __u16 param; + + BT_DBG("%s %ld", hdev->name, opt); + + /* Driver initialization */ + + /* Special commands */ + while ((skb = skb_dequeue(&hdev->driver_init))) { + skb->pkt_type = HCI_COMMAND_PKT; + skb->dev = (void *) hdev; + skb_queue_tail(&hdev->cmd_q, skb); + hci_sched_cmd(hdev); + } + skb_queue_purge(&hdev->driver_init); + + /* Mandatory initialization */ + + /* Reset */ + if (test_bit(HCI_QUIRK_RESET_ON_INIT, &hdev->quirks)) + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_RESET, 0, NULL); + + /* Read Local Supported Features */ + hci_send_cmd(hdev, 
OGF_INFO_PARAM, OCF_READ_LOCAL_FEATURES, 0, NULL); + + /* Read Buffer Size (ACL mtu, max pkt, etc.) */ + hci_send_cmd(hdev, OGF_INFO_PARAM, OCF_READ_BUFFER_SIZE, 0, NULL); + +#if 0 + /* Host buffer size */ + { + struct hci_cp_host_buffer_size cp; + cp.acl_mtu = __cpu_to_le16(HCI_MAX_ACL_SIZE); + cp.sco_mtu = HCI_MAX_SCO_SIZE; + cp.acl_max_pkt = __cpu_to_le16(0xffff); + cp.sco_max_pkt = __cpu_to_le16(0xffff); + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_HOST_BUFFER_SIZE, sizeof(cp), &cp); + } +#endif + + /* Read BD Address */ + hci_send_cmd(hdev, OGF_INFO_PARAM, OCF_READ_BD_ADDR, 0, NULL); + + /* Read Voice Setting */ + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_READ_VOICE_SETTING, 0, NULL); + + /* Optional initialization */ + + /* Clear Event Filters */ + { + struct hci_cp_set_event_flt cp; + cp.flt_type = HCI_FLT_CLEAR_ALL; + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_SET_EVENT_FLT, sizeof(cp), &cp); + } + + /* Page timeout ~20 secs */ + param = __cpu_to_le16(0x8000); + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_PG_TIMEOUT, 2, &param); + + /* Connection accept timeout ~20 secs */ + param = __cpu_to_le16(0x7d00); + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_CA_TIMEOUT, 2, &param); +} + +static void hci_scan_req(struct hci_dev *hdev, unsigned long opt) +{ + __u8 scan = opt; + + BT_DBG("%s %x", hdev->name, scan); + + /* Inquiry and Page scans */ + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_SCAN_ENABLE, 1, &scan); +} + +static void hci_auth_req(struct hci_dev *hdev, unsigned long opt) +{ + __u8 auth = opt; + + BT_DBG("%s %x", hdev->name, auth); + + /* Authentication */ + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_AUTH_ENABLE, 1, &auth); +} + +static void hci_encrypt_req(struct hci_dev *hdev, unsigned long opt) +{ + __u8 encrypt = opt; + + BT_DBG("%s %x", hdev->name, encrypt); + + /* Encryption */ + hci_send_cmd(hdev, OGF_HOST_CTL, OCF_WRITE_ENCRYPT_MODE, 1, &encrypt); +} + +/* Get HCI device by index. + * Device is held on return. */ +struct hci_dev *hci_dev_get(int index) +{ + struct hci_dev *hdev = NULL; + struct list_head *p; + + BT_DBG("%d", index); + + if (index < 0) + return NULL; + + read_lock(&hci_dev_list_lock); + list_for_each(p, &hci_dev_list) { + struct hci_dev *d = list_entry(p, struct hci_dev, list); + if (d->id == index) { + hdev = hci_dev_hold(d); + break; + } + } + read_unlock(&hci_dev_list_lock); + return hdev; +} +EXPORT_SYMBOL(hci_dev_get); + +/* ---- Inquiry support ---- */ +static void inquiry_cache_flush(struct hci_dev *hdev) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *next = cache->list, *e; + + BT_DBG("cache %p", cache); + + cache->list = NULL; + while ((e = next)) { + next = e->next; + kfree(e); + } +} + +struct inquiry_entry *hci_inquiry_cache_lookup(struct hci_dev *hdev, bdaddr_t *bdaddr) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *e; + + BT_DBG("cache %p, %s", cache, batostr(bdaddr)); + + for (e = cache->list; e; e = e->next) + if (!bacmp(&e->data.bdaddr, bdaddr)) + break; + return e; +} + +void hci_inquiry_cache_update(struct hci_dev *hdev, struct inquiry_data *data) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *e; + + BT_DBG("cache %p, %s", cache, batostr(&data->bdaddr)); + + if (!(e = hci_inquiry_cache_lookup(hdev, &data->bdaddr))) { + /* Entry not in the cache. Add new one.
*/ + if (!(e = kmalloc(sizeof(struct inquiry_entry), GFP_ATOMIC))) + return; + memset(e, 0, sizeof(struct inquiry_entry)); + e->next = cache->list; + cache->list = e; + } + + memcpy(&e->data, data, sizeof(*data)); + e->timestamp = jiffies; + cache->timestamp = jiffies; +} + +static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf) +{ + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_info *info = (struct inquiry_info *) buf; + struct inquiry_entry *e; + int copied = 0; + + for (e = cache->list; e && copied < num; e = e->next, copied++) { + struct inquiry_data *data = &e->data; + bacpy(&info->bdaddr, &data->bdaddr); + info->pscan_rep_mode = data->pscan_rep_mode; + info->pscan_period_mode = data->pscan_period_mode; + info->pscan_mode = data->pscan_mode; + memcpy(info->dev_class, data->dev_class, 3); + info->clock_offset = data->clock_offset; + info++; + } + + BT_DBG("cache %p, copied %d", cache, copied); + return copied; +} + +static void hci_inq_req(struct hci_dev *hdev, unsigned long opt) +{ + struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt; + struct hci_cp_inquiry cp; + + BT_DBG("%s", hdev->name); + + if (test_bit(HCI_INQUIRY, &hdev->flags)) + return; + + /* Start Inquiry */ + memcpy(&cp.lap, &ir->lap, 3); + cp.length = ir->length; + cp.num_rsp = ir->num_rsp; + hci_send_cmd(hdev, OGF_LINK_CTL, OCF_INQUIRY, sizeof(cp), &cp); +} + +int hci_inquiry(void __user *arg) +{ + __u8 __user *ptr = arg; + struct hci_inquiry_req ir; + struct hci_dev *hdev; + int err = 0, do_inquiry = 0, max_rsp; + long timeo; + __u8 *buf; + + if (copy_from_user(&ir, ptr, sizeof(ir))) + return -EFAULT; + + if (!(hdev = hci_dev_get(ir.dev_id))) + return -ENODEV; + + hci_dev_lock_bh(hdev); + if (inquiry_cache_age(hdev) > INQUIRY_CACHE_AGE_MAX || + inquiry_cache_empty(hdev) || + ir.flags & IREQ_CACHE_FLUSH) { + inquiry_cache_flush(hdev); + do_inquiry = 1; + } + hci_dev_unlock_bh(hdev); + + timeo = ir.length * 2 * HZ; + if (do_inquiry && (err = hci_request(hdev, hci_inq_req, (unsigned long)&ir, timeo)) < 0) + goto done; + + /* for unlimited number of responses we will use buffer with 255 entries */ + max_rsp = (ir.num_rsp == 0) ? 255 : ir.num_rsp; + + /* cache_dump can't sleep. Therefore we allocate temp buffer and then + * copy it to the user space. 
+ */ + if (!(buf = kmalloc(sizeof(struct inquiry_info) * max_rsp, GFP_KERNEL))) { + err = -ENOMEM; + goto done; + } + + hci_dev_lock_bh(hdev); + ir.num_rsp = inquiry_cache_dump(hdev, max_rsp, buf); + hci_dev_unlock_bh(hdev); + + BT_DBG("num_rsp %d", ir.num_rsp); + + if (!copy_to_user(ptr, &ir, sizeof(ir))) { + ptr += sizeof(ir); + if (copy_to_user(ptr, buf, sizeof(struct inquiry_info) * + ir.num_rsp)) + err = -EFAULT; + } else + err = -EFAULT; + + kfree(buf); + +done: + hci_dev_put(hdev); + return err; +} + +/* ---- HCI ioctl helpers ---- */ + +int hci_dev_open(__u16 dev) +{ + struct hci_dev *hdev; + int ret = 0; + + if (!(hdev = hci_dev_get(dev))) + return -ENODEV; + + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_lock(hdev); + + if (test_bit(HCI_UP, &hdev->flags)) { + ret = -EALREADY; + goto done; + } + + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + set_bit(HCI_RAW, &hdev->flags); + + if (hdev->open(hdev)) { + ret = -EIO; + goto done; + } + + if (!test_bit(HCI_RAW, &hdev->flags)) { + atomic_set(&hdev->cmd_cnt, 1); + set_bit(HCI_INIT, &hdev->flags); + + //__hci_request(hdev, hci_reset_req, 0, HZ); + ret = __hci_request(hdev, hci_init_req, 0, HCI_INIT_TIMEOUT); + + clear_bit(HCI_INIT, &hdev->flags); + } + + if (!ret) { + hci_dev_hold(hdev); + set_bit(HCI_UP, &hdev->flags); + hci_notify(hdev, HCI_DEV_UP); + } else { + /* Init failed, cleanup */ + tasklet_kill(&hdev->rx_task); + tasklet_kill(&hdev->tx_task); + tasklet_kill(&hdev->cmd_task); + + skb_queue_purge(&hdev->cmd_q); + skb_queue_purge(&hdev->rx_q); + + if (hdev->flush) + hdev->flush(hdev); + + if (hdev->sent_cmd) { + kfree_skb(hdev->sent_cmd); + hdev->sent_cmd = NULL; + } + + hdev->close(hdev); + hdev->flags = 0; + } + +done: + hci_req_unlock(hdev); + hci_dev_put(hdev); + return ret; +} + +static int hci_dev_do_close(struct hci_dev *hdev) +{ + BT_DBG("%s %p", hdev->name, hdev); + + hci_req_cancel(hdev, ENODEV); + hci_req_lock(hdev); + + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { + hci_req_unlock(hdev); + return 0; + } + + /* Kill RX and TX tasks */ + tasklet_kill(&hdev->rx_task); + tasklet_kill(&hdev->tx_task); + + hci_dev_lock_bh(hdev); + inquiry_cache_flush(hdev); + hci_conn_hash_flush(hdev); + hci_dev_unlock_bh(hdev); + + hci_notify(hdev, HCI_DEV_DOWN); + + if (hdev->flush) + hdev->flush(hdev); + + /* Reset device */ + skb_queue_purge(&hdev->cmd_q); + atomic_set(&hdev->cmd_cnt, 1); + if (!test_bit(HCI_RAW, &hdev->flags)) { + set_bit(HCI_INIT, &hdev->flags); + __hci_request(hdev, hci_reset_req, 0, HZ/4); + clear_bit(HCI_INIT, &hdev->flags); + } + + /* Kill cmd task */ + tasklet_kill(&hdev->cmd_task); + + /* Drop queues */ + skb_queue_purge(&hdev->rx_q); + skb_queue_purge(&hdev->cmd_q); + skb_queue_purge(&hdev->raw_q); + + /* Drop last sent command */ + if (hdev->sent_cmd) { + kfree_skb(hdev->sent_cmd); + hdev->sent_cmd = NULL; + } + + /* After this point our queues are empty + * and no tasks are scheduled. 
*/ + hdev->close(hdev); + + /* Clear flags */ + hdev->flags = 0; + + hci_req_unlock(hdev); + + hci_dev_put(hdev); + return 0; +} + +int hci_dev_close(__u16 dev) +{ + struct hci_dev *hdev; + int err; + + if (!(hdev = hci_dev_get(dev))) + return -ENODEV; + err = hci_dev_do_close(hdev); + hci_dev_put(hdev); + return err; +} + +int hci_dev_reset(__u16 dev) +{ + struct hci_dev *hdev; + int ret = 0; + + if (!(hdev = hci_dev_get(dev))) + return -ENODEV; + + hci_req_lock(hdev); + tasklet_disable(&hdev->tx_task); + + if (!test_bit(HCI_UP, &hdev->flags)) + goto done; + + /* Drop queues */ + skb_queue_purge(&hdev->rx_q); + skb_queue_purge(&hdev->cmd_q); + + hci_dev_lock_bh(hdev); + inquiry_cache_flush(hdev); + hci_conn_hash_flush(hdev); + hci_dev_unlock_bh(hdev); + + if (hdev->flush) + hdev->flush(hdev); + + atomic_set(&hdev->cmd_cnt, 1); + hdev->acl_cnt = 0; hdev->sco_cnt = 0; + + if (!test_bit(HCI_RAW, &hdev->flags)) + ret = __hci_request(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT); + +done: + tasklet_enable(&hdev->tx_task); + hci_req_unlock(hdev); + hci_dev_put(hdev); + return ret; +} + +int hci_dev_reset_stat(__u16 dev) +{ + struct hci_dev *hdev; + int ret = 0; + + if (!(hdev = hci_dev_get(dev))) + return -ENODEV; + + memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); + + hci_dev_put(hdev); + + return ret; +} + +int hci_dev_cmd(unsigned int cmd, void __user *arg) +{ + struct hci_dev *hdev; + struct hci_dev_req dr; + int err = 0; + + if (copy_from_user(&dr, arg, sizeof(dr))) + return -EFAULT; + + if (!(hdev = hci_dev_get(dr.dev_id))) + return -ENODEV; + + switch (cmd) { + case HCISETAUTH: + err = hci_request(hdev, hci_auth_req, dr.dev_opt, HCI_INIT_TIMEOUT); + break; + + case HCISETENCRYPT: + if (!lmp_encrypt_capable(hdev)) { + err = -EOPNOTSUPP; + break; + } + + if (!test_bit(HCI_AUTH, &hdev->flags)) { + /* Auth must be enabled first */ + err = hci_request(hdev, hci_auth_req, + dr.dev_opt, HCI_INIT_TIMEOUT); + if (err) + break; + } + + err = hci_request(hdev, hci_encrypt_req, + dr.dev_opt, HCI_INIT_TIMEOUT); + break; + + case HCISETSCAN: + err = hci_request(hdev, hci_scan_req, dr.dev_opt, HCI_INIT_TIMEOUT); + break; + + case HCISETPTYPE: + hdev->pkt_type = (__u16) dr.dev_opt; + break; + + case HCISETLINKPOL: + hdev->link_policy = (__u16) dr.dev_opt; + break; + + case HCISETLINKMODE: + hdev->link_mode = ((__u16) dr.dev_opt) & (HCI_LM_MASTER | HCI_LM_ACCEPT); + break; + + case HCISETACLMTU: + hdev->acl_mtu = *((__u16 *)&dr.dev_opt + 1); + hdev->acl_pkts = *((__u16 *)&dr.dev_opt + 0); + break; + + case HCISETSCOMTU: + hdev->sco_mtu = *((__u16 *)&dr.dev_opt + 1); + hdev->sco_pkts = *((__u16 *)&dr.dev_opt + 0); + break; + + default: + err = -EINVAL; + break; + } + hci_dev_put(hdev); + return err; +} + +int hci_get_dev_list(void __user *arg) +{ + struct hci_dev_list_req *dl; + struct hci_dev_req *dr; + struct list_head *p; + int n = 0, size, err; + __u16 dev_num; + + if (get_user(dev_num, (__u16 __user *) arg)) + return -EFAULT; + + if (!dev_num || dev_num > (PAGE_SIZE * 2) / sizeof(*dr)) + return -EINVAL; + + size = sizeof(*dl) + dev_num * sizeof(*dr); + + if (!(dl = kmalloc(size, GFP_KERNEL))) + return -ENOMEM; + + dr = dl->dev_req; + + read_lock_bh(&hci_dev_list_lock); + list_for_each(p, &hci_dev_list) { + struct hci_dev *hdev; + hdev = list_entry(p, struct hci_dev, list); + (dr + n)->dev_id = hdev->id; + (dr + n)->dev_opt = hdev->flags; + if (++n >= dev_num) + break; + } + read_unlock_bh(&hci_dev_list_lock); + + dl->dev_num = n; + size = sizeof(*dl) + n * sizeof(*dr); + + err = copy_to_user(arg, 
dl, size); + kfree(dl); + + return err ? -EFAULT : 0; +} + +int hci_get_dev_info(void __user *arg) +{ + struct hci_dev *hdev; + struct hci_dev_info di; + int err = 0; + + if (copy_from_user(&di, arg, sizeof(di))) + return -EFAULT; + + if (!(hdev = hci_dev_get(di.dev_id))) + return -ENODEV; + + strcpy(di.name, hdev->name); + di.bdaddr = hdev->bdaddr; + di.type = hdev->type; + di.flags = hdev->flags; + di.pkt_type = hdev->pkt_type; + di.acl_mtu = hdev->acl_mtu; + di.acl_pkts = hdev->acl_pkts; + di.sco_mtu = hdev->sco_mtu; + di.sco_pkts = hdev->sco_pkts; + di.link_policy = hdev->link_policy; + di.link_mode = hdev->link_mode; + + memcpy(&di.stat, &hdev->stat, sizeof(di.stat)); + memcpy(&di.features, &hdev->features, sizeof(di.features)); + + if (copy_to_user(arg, &di, sizeof(di))) + err = -EFAULT; + + hci_dev_put(hdev); + + return err; +} + +/* ---- Interface to HCI drivers ---- */ + +/* Alloc HCI device */ +struct hci_dev *hci_alloc_dev(void) +{ + struct hci_dev *hdev; + + hdev = kmalloc(sizeof(struct hci_dev), GFP_KERNEL); + if (!hdev) + return NULL; + + memset(hdev, 0, sizeof(struct hci_dev)); + + skb_queue_head_init(&hdev->driver_init); + + return hdev; +} +EXPORT_SYMBOL(hci_alloc_dev); + +/* Free HCI device */ +void hci_free_dev(struct hci_dev *hdev) +{ + skb_queue_purge(&hdev->driver_init); + + /* will free via class release */ + class_device_put(&hdev->class_dev); +} +EXPORT_SYMBOL(hci_free_dev); + +/* Register HCI device */ +int hci_register_dev(struct hci_dev *hdev) +{ + struct list_head *head = &hci_dev_list, *p; + int id = 0; + + BT_DBG("%p name %s type %d owner %p", hdev, hdev->name, hdev->type, hdev->owner); + + if (!hdev->open || !hdev->close || !hdev->destruct) + return -EINVAL; + + write_lock_bh(&hci_dev_list_lock); + + /* Find first available device id */ + list_for_each(p, &hci_dev_list) { + if (list_entry(p, struct hci_dev, list)->id != id) + break; + head = p; id++; + } + + sprintf(hdev->name, "hci%d", id); + hdev->id = id; + list_add(&hdev->list, head); + + atomic_set(&hdev->refcnt, 1); + spin_lock_init(&hdev->lock); + + hdev->flags = 0; + hdev->pkt_type = (HCI_DM1 | HCI_DH1 | HCI_HV1); + hdev->link_mode = (HCI_LM_ACCEPT); + + tasklet_init(&hdev->cmd_task, hci_cmd_task,(unsigned long) hdev); + tasklet_init(&hdev->rx_task, hci_rx_task, (unsigned long) hdev); + tasklet_init(&hdev->tx_task, hci_tx_task, (unsigned long) hdev); + + skb_queue_head_init(&hdev->rx_q); + skb_queue_head_init(&hdev->cmd_q); + skb_queue_head_init(&hdev->raw_q); + + init_waitqueue_head(&hdev->req_wait_q); + init_MUTEX(&hdev->req_lock); + + inquiry_cache_init(hdev); + + hci_conn_hash_init(hdev); + + memset(&hdev->stat, 0, sizeof(struct hci_dev_stats)); + + atomic_set(&hdev->promisc, 0); + + write_unlock_bh(&hci_dev_list_lock); + + hci_register_sysfs(hdev); + + hci_notify(hdev, HCI_DEV_REG); + + return id; +} +EXPORT_SYMBOL(hci_register_dev); + +/* Unregister HCI device */ +int hci_unregister_dev(struct hci_dev *hdev) +{ + BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type); + + hci_unregister_sysfs(hdev); + + write_lock_bh(&hci_dev_list_lock); + list_del(&hdev->list); + write_unlock_bh(&hci_dev_list_lock); + + hci_dev_do_close(hdev); + + hci_notify(hdev, HCI_DEV_UNREG); + + __hci_dev_put(hdev); + return 0; +} +EXPORT_SYMBOL(hci_unregister_dev); + +/* Suspend HCI device */ +int hci_suspend_dev(struct hci_dev *hdev) +{ + hci_notify(hdev, HCI_DEV_SUSPEND); + return 0; +} +EXPORT_SYMBOL(hci_suspend_dev); + +/* Resume HCI device */ +int hci_resume_dev(struct hci_dev *hdev) +{ + hci_notify(hdev, 
HCI_DEV_RESUME); + return 0; +} +EXPORT_SYMBOL(hci_resume_dev); + +/* ---- Interface to upper protocols ---- */ + +/* Register/Unregister protocols. + * hci_task_lock is used to ensure that no tasks are running. */ +int hci_register_proto(struct hci_proto *hp) +{ + int err = 0; + + BT_DBG("%p name %s id %d", hp, hp->name, hp->id); + + if (hp->id >= HCI_MAX_PROTO) + return -EINVAL; + + write_lock_bh(&hci_task_lock); + + if (!hci_proto[hp->id]) + hci_proto[hp->id] = hp; + else + err = -EEXIST; + + write_unlock_bh(&hci_task_lock); + + return err; +} +EXPORT_SYMBOL(hci_register_proto); + +int hci_unregister_proto(struct hci_proto *hp) +{ + int err = 0; + + BT_DBG("%p name %s id %d", hp, hp->name, hp->id); + + if (hp->id >= HCI_MAX_PROTO) + return -EINVAL; + + write_lock_bh(&hci_task_lock); + + if (hci_proto[hp->id]) + hci_proto[hp->id] = NULL; + else + err = -ENOENT; + + write_unlock_bh(&hci_task_lock); + + return err; +} +EXPORT_SYMBOL(hci_unregister_proto); + +int hci_register_cb(struct hci_cb *cb) +{ + BT_DBG("%p name %s", cb, cb->name); + + write_lock_bh(&hci_cb_list_lock); + list_add(&cb->list, &hci_cb_list); + write_unlock_bh(&hci_cb_list_lock); + + return 0; +} +EXPORT_SYMBOL(hci_register_cb); + +int hci_unregister_cb(struct hci_cb *cb) +{ + BT_DBG("%p name %s", cb, cb->name); + + write_lock_bh(&hci_cb_list_lock); + list_del(&cb->list); + write_unlock_bh(&hci_cb_list_lock); + + return 0; +} +EXPORT_SYMBOL(hci_unregister_cb); + +static int hci_send_frame(struct sk_buff *skb) +{ + struct hci_dev *hdev = (struct hci_dev *) skb->dev; + + if (!hdev) { + kfree_skb(skb); + return -ENODEV; + } + + BT_DBG("%s type %d len %d", hdev->name, skb->pkt_type, skb->len); + + if (atomic_read(&hdev->promisc)) { + /* Time stamp */ + do_gettimeofday(&skb->stamp); + + hci_send_to_sock(hdev, skb); + } + + /* Get rid of skb owner, prior to sending to the driver. 
*/ + skb_orphan(skb); + + return hdev->send(skb); +} + +/* Send HCI command */ +int hci_send_cmd(struct hci_dev *hdev, __u16 ogf, __u16 ocf, __u32 plen, void *param) +{ + int len = HCI_COMMAND_HDR_SIZE + plen; + struct hci_command_hdr *hdr; + struct sk_buff *skb; + + BT_DBG("%s ogf 0x%x ocf 0x%x plen %d", hdev->name, ogf, ocf, plen); + + skb = bt_skb_alloc(len, GFP_ATOMIC); + if (!skb) { + BT_ERR("%s Can't allocate memory for HCI command", hdev->name); + return -ENOMEM; + } + + hdr = (struct hci_command_hdr *) skb_put(skb, HCI_COMMAND_HDR_SIZE); + hdr->opcode = __cpu_to_le16(hci_opcode_pack(ogf, ocf)); + hdr->plen = plen; + + if (plen) + memcpy(skb_put(skb, plen), param, plen); + + BT_DBG("skb len %d", skb->len); + + skb->pkt_type = HCI_COMMAND_PKT; + skb->dev = (void *) hdev; + skb_queue_tail(&hdev->cmd_q, skb); + hci_sched_cmd(hdev); + + return 0; +} +EXPORT_SYMBOL(hci_send_cmd); + +/* Get data from the previously sent command */ +void *hci_sent_cmd_data(struct hci_dev *hdev, __u16 ogf, __u16 ocf) +{ + struct hci_command_hdr *hdr; + + if (!hdev->sent_cmd) + return NULL; + + hdr = (void *) hdev->sent_cmd->data; + + if (hdr->opcode != __cpu_to_le16(hci_opcode_pack(ogf, ocf))) + return NULL; + + BT_DBG("%s ogf 0x%x ocf 0x%x", hdev->name, ogf, ocf); + + return hdev->sent_cmd->data + HCI_COMMAND_HDR_SIZE; +} + +/* Send ACL data */ +static void hci_add_acl_hdr(struct sk_buff *skb, __u16 handle, __u16 flags) +{ + struct hci_acl_hdr *hdr; + int len = skb->len; + + hdr = (struct hci_acl_hdr *) skb_push(skb, HCI_ACL_HDR_SIZE); + hdr->handle = __cpu_to_le16(hci_handle_pack(handle, flags)); + hdr->dlen = __cpu_to_le16(len); + + skb->h.raw = (void *) hdr; +} + +int hci_send_acl(struct hci_conn *conn, struct sk_buff *skb, __u16 flags) +{ + struct hci_dev *hdev = conn->hdev; + struct sk_buff *list; + + BT_DBG("%s conn %p flags 0x%x", hdev->name, conn, flags); + + skb->dev = (void *) hdev; + skb->pkt_type = HCI_ACLDATA_PKT; + hci_add_acl_hdr(skb, conn->handle, flags | ACL_START); + + if (!(list = skb_shinfo(skb)->frag_list)) { + /* Non fragmented */ + BT_DBG("%s nonfrag skb %p len %d", hdev->name, skb, skb->len); + + skb_queue_tail(&conn->data_q, skb); + } else { + /* Fragmented */ + BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); + + skb_shinfo(skb)->frag_list = NULL; + + /* Queue all fragments atomically */ + spin_lock_bh(&conn->data_q.lock); + + __skb_queue_tail(&conn->data_q, skb); + do { + skb = list; list = list->next; + + skb->dev = (void *) hdev; + skb->pkt_type = HCI_ACLDATA_PKT; + hci_add_acl_hdr(skb, conn->handle, flags | ACL_CONT); + + BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len); + + __skb_queue_tail(&conn->data_q, skb); + } while (list); + + spin_unlock_bh(&conn->data_q.lock); + } + + hci_sched_tx(hdev); + return 0; +} +EXPORT_SYMBOL(hci_send_acl); + +/* Send SCO data */ +int hci_send_sco(struct hci_conn *conn, struct sk_buff *skb) +{ + struct hci_dev *hdev = conn->hdev; + struct hci_sco_hdr hdr; + + BT_DBG("%s len %d", hdev->name, skb->len); + + if (skb->len > hdev->sco_mtu) { + kfree_skb(skb); + return -EINVAL; + } + + hdr.handle = __cpu_to_le16(conn->handle); + hdr.dlen = skb->len; + + skb->h.raw = skb_push(skb, HCI_SCO_HDR_SIZE); + memcpy(skb->h.raw, &hdr, HCI_SCO_HDR_SIZE); + + skb->dev = (void *) hdev; + skb->pkt_type = HCI_SCODATA_PKT; + skb_queue_tail(&conn->data_q, skb); + hci_sched_tx(hdev); + return 0; +} +EXPORT_SYMBOL(hci_send_sco); + +/* ---- HCI TX task (outgoing data) ---- */ + +/* HCI Connection scheduler */ +static inline struct hci_conn 
*hci_low_sent(struct hci_dev *hdev, __u8 type, int *quote)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct hci_conn *conn = NULL;
+	int num = 0, min = ~0;
+	struct list_head *p;
+
+	/* We don't have to lock device here. Connections are always
+	 * added and removed with TX task disabled. */
+	list_for_each(p, &h->list) {
+		struct hci_conn *c;
+		c = list_entry(p, struct hci_conn, list);
+
+		if (c->type != type || c->state != BT_CONNECTED
+				|| skb_queue_empty(&c->data_q))
+			continue;
+		num++;
+
+		if (c->sent < min) {
+			min = c->sent;
+			conn = c;
+		}
+	}
+
+	if (conn) {
+		int cnt = (type == ACL_LINK ? hdev->acl_cnt : hdev->sco_cnt);
+		int q = cnt / num;
+		*quote = q ? q : 1;
+	} else
+		*quote = 0;
+
+	BT_DBG("conn %p quote %d", conn, *quote);
+	return conn;
+}
+
+static inline void hci_acl_tx_to(struct hci_dev *hdev)
+{
+	struct hci_conn_hash *h = &hdev->conn_hash;
+	struct list_head *p;
+	struct hci_conn *c;
+
+	BT_ERR("%s ACL tx timeout", hdev->name);
+
+	/* Kill stalled connections */
+	list_for_each(p, &h->list) {
+		c = list_entry(p, struct hci_conn, list);
+		if (c->type == ACL_LINK && c->sent) {
+			BT_ERR("%s killing stalled ACL connection %s",
+				hdev->name, batostr(&c->dst));
+			hci_acl_disconn(c, 0x13);
+		}
+	}
+}
+
+static inline void hci_sched_acl(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote;
+
+	BT_DBG("%s", hdev->name);
+
+	if (!test_bit(HCI_RAW, &hdev->flags)) {
+		/* ACL tx timeout must be longer than maximum
+		 * link supervision timeout (40.9 seconds) */
+		if (!hdev->acl_cnt && (jiffies - hdev->acl_last_tx) > (HZ * 45))
+			hci_acl_tx_to(hdev);
+	}
+
+	while (hdev->acl_cnt && (conn = hci_low_sent(hdev, ACL_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+			hci_send_frame(skb);
+			hdev->acl_last_tx = jiffies;
+
+			hdev->acl_cnt--;
+			conn->sent++;
+		}
+	}
+}
+
+/* Schedule SCO */
+static inline void hci_sched_sco(struct hci_dev *hdev)
+{
+	struct hci_conn *conn;
+	struct sk_buff *skb;
+	int quote;
+
+	BT_DBG("%s", hdev->name);
+
+	while (hdev->sco_cnt && (conn = hci_low_sent(hdev, SCO_LINK, &quote))) {
+		while (quote-- && (skb = skb_dequeue(&conn->data_q))) {
+			BT_DBG("skb %p len %d", skb, skb->len);
+			hci_send_frame(skb);
+
+			conn->sent++;
+			if (conn->sent == ~0)
+				conn->sent = 0;
+		}
+	}
+}
+
+static void hci_tx_task(unsigned long arg)
+{
+	struct hci_dev *hdev = (struct hci_dev *) arg;
+	struct sk_buff *skb;
+
+	read_lock(&hci_task_lock);
+
+	BT_DBG("%s acl %d sco %d", hdev->name, hdev->acl_cnt, hdev->sco_cnt);
+
+	/* Schedule queues and send stuff to HCI driver */
+
+	hci_sched_acl(hdev);
+
+	hci_sched_sco(hdev);
+
+	/* Send next queued raw (unknown type) packet */
+	while ((skb = skb_dequeue(&hdev->raw_q)))
+		hci_send_frame(skb);
+
+	read_unlock(&hci_task_lock);
+}
+
+/* ----- HCI RX task (incoming data processing) ----- */
+
+/* ACL data packet */
+static inline void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_acl_hdr *hdr = (void *) skb->data;
+	struct hci_conn *conn;
+	__u16 handle, flags;
+
+	skb_pull(skb, HCI_ACL_HDR_SIZE);
+
+	handle = __le16_to_cpu(hdr->handle);
+	flags = hci_flags(handle);
+	handle = hci_handle(handle);
+
+	BT_DBG("%s len %d handle 0x%x flags 0x%x", hdev->name, skb->len, handle, flags);
+
+	hdev->stat.acl_rx++;
+
+	hci_dev_lock(hdev);
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	hci_dev_unlock(hdev);
+
+	if (conn) {
+		register struct hci_proto *hp;
+
+		/* Send to upper protocol */
+		if ((hp = hci_proto[HCI_PROTO_L2CAP]) && hp->recv_acldata) {
+			hp->recv_acldata(conn, skb, flags);
+			return;
+		}
+	} else {
+		BT_ERR("%s ACL packet for unknown connection handle %d",
+			hdev->name, handle);
+	}
+
+	kfree_skb(skb);
+}
+
+/* SCO data packet */
+static inline void hci_scodata_packet(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_sco_hdr *hdr = (void *) skb->data;
+	struct hci_conn *conn;
+	__u16 handle;
+
+	skb_pull(skb, HCI_SCO_HDR_SIZE);
+
+	handle = __le16_to_cpu(hdr->handle);
+
+	BT_DBG("%s len %d handle 0x%x", hdev->name, skb->len, handle);
+
+	hdev->stat.sco_rx++;
+
+	hci_dev_lock(hdev);
+	conn = hci_conn_hash_lookup_handle(hdev, handle);
+	hci_dev_unlock(hdev);
+
+	if (conn) {
+		register struct hci_proto *hp;
+
+		/* Send to upper protocol */
+		if ((hp = hci_proto[HCI_PROTO_SCO]) && hp->recv_scodata) {
+			hp->recv_scodata(conn, skb);
+			return;
+		}
+	} else {
+		BT_ERR("%s SCO packet for unknown connection handle %d",
+			hdev->name, handle);
+	}
+
+	kfree_skb(skb);
+}
+
+void hci_rx_task(unsigned long arg)
+{
+	struct hci_dev *hdev = (struct hci_dev *) arg;
+	struct sk_buff *skb;
+
+	BT_DBG("%s", hdev->name);
+
+	read_lock(&hci_task_lock);
+
+	while ((skb = skb_dequeue(&hdev->rx_q))) {
+		if (atomic_read(&hdev->promisc)) {
+			/* Send copy to the sockets */
+			hci_send_to_sock(hdev, skb);
+		}
+
+		if (test_bit(HCI_RAW, &hdev->flags)) {
+			kfree_skb(skb);
+			continue;
+		}
+
+		if (test_bit(HCI_INIT, &hdev->flags)) {
+			/* Don't process data packets in this state. */
+			switch (skb->pkt_type) {
+			case HCI_ACLDATA_PKT:
+			case HCI_SCODATA_PKT:
+				kfree_skb(skb);
+				continue;
+			};
+		}
+
+		/* Process frame */
+		switch (skb->pkt_type) {
+		case HCI_EVENT_PKT:
+			hci_event_packet(hdev, skb);
+			break;
+
+		case HCI_ACLDATA_PKT:
+			BT_DBG("%s ACL data packet", hdev->name);
+			hci_acldata_packet(hdev, skb);
+			break;
+
+		case HCI_SCODATA_PKT:
+			BT_DBG("%s SCO data packet", hdev->name);
+			hci_scodata_packet(hdev, skb);
+			break;
+
+		default:
+			kfree_skb(skb);
+			break;
+		}
+	}
+
+	read_unlock(&hci_task_lock);
+}
+
+static void hci_cmd_task(unsigned long arg)
+{
+	struct hci_dev *hdev = (struct hci_dev *) arg;
+	struct sk_buff *skb;
+
+	BT_DBG("%s cmd %d", hdev->name, atomic_read(&hdev->cmd_cnt));
+
+	if (!atomic_read(&hdev->cmd_cnt) && (jiffies - hdev->cmd_last_tx) > HZ) {
+		BT_ERR("%s command tx timeout", hdev->name);
+		atomic_set(&hdev->cmd_cnt, 1);
+	}
+
+	/* Send queued commands */
+	if (atomic_read(&hdev->cmd_cnt) && (skb = skb_dequeue(&hdev->cmd_q))) {
+		if (hdev->sent_cmd)
+			kfree_skb(hdev->sent_cmd);
+
+		if ((hdev->sent_cmd = skb_clone(skb, GFP_ATOMIC))) {
+			atomic_dec(&hdev->cmd_cnt);
+			hci_send_frame(skb);
+			hdev->cmd_last_tx = jiffies;
+		} else {
+			skb_queue_head(&hdev->cmd_q, skb);
+			hci_sched_cmd(hdev);
+		}
+	}
+}
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
new file mode 100644
index 000000000000..8ccba8ee9979
--- /dev/null
+++ b/net/bluetooth/hci_event.c
@@ -0,0 +1,1044 @@
+/*
+ BlueZ - Bluetooth protocol stack for Linux
+ Copyright (C) 2000-2001 Qualcomm Incorporated
+
+ Written 2000,2001 by Maxim Krasnyansky
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License version 2 as
+ published by the Free Software Foundation;
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+ CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+ WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+ ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+ COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+ SOFTWARE IS DISCLAIMED.
+*/
+
+/* Bluetooth HCI event handling. */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+#ifndef CONFIG_BT_HCI_CORE_DEBUG
+#undef BT_DBG
+#define BT_DBG(D...)
+#endif
+
+/* Handle HCI Event packets */
+
+/* Command Complete OGF LINK_CTL */
+static void hci_cc_link_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+{
+	__u8 status;
+
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	case OCF_INQUIRY_CANCEL:
+		status = *((__u8 *) skb->data);
+
+		if (status) {
+			BT_DBG("%s Inquiry cancel error: status 0x%x", hdev->name, status);
+		} else {
+			clear_bit(HCI_INQUIRY, &hdev->flags);
+			hci_req_complete(hdev, status);
+		}
+		break;
+
+	default:
+		BT_DBG("%s Command complete: ogf LINK_CTL ocf %x", hdev->name, ocf);
+		break;
+	}
+}
+
+/* Command Complete OGF LINK_POLICY */
+static void hci_cc_link_policy(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+{
+	struct hci_conn *conn;
+	struct hci_rp_role_discovery *rd;
+
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	case OCF_ROLE_DISCOVERY:
+		rd = (void *) skb->data;
+
+		if (rd->status)
+			break;
+
+		hci_dev_lock(hdev);
+
+		conn = hci_conn_hash_lookup_handle(hdev, __le16_to_cpu(rd->handle));
+		if (conn) {
+			if (rd->role)
+				conn->link_mode &= ~HCI_LM_MASTER;
+			else
+				conn->link_mode |= HCI_LM_MASTER;
+		}
+
+		hci_dev_unlock(hdev);
+		break;
+
+	default:
+		BT_DBG("%s: Command complete: ogf LINK_POLICY ocf %x",
+			hdev->name, ocf);
+		break;
+	}
+}
+
+/* Command Complete OGF HOST_CTL */
+static void hci_cc_host_ctl(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+{
+	__u8 status, param;
+	__u16 setting;
+	struct hci_rp_read_voice_setting *vs;
+	void *sent;
+
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	case OCF_RESET:
+		status = *((__u8 *) skb->data);
+		hci_req_complete(hdev, status);
+		break;
+
+	case OCF_SET_EVENT_FLT:
+		status = *((__u8 *) skb->data);
+		if (status) {
+			BT_DBG("%s SET_EVENT_FLT failed %d", hdev->name, status);
+		} else {
+			BT_DBG("%s SET_EVENT_FLT successful", hdev->name);
+		}
+		break;
+
+	case OCF_WRITE_AUTH_ENABLE:
+		sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_AUTH_ENABLE);
+		if (!sent)
+			break;
+
+		status = *((__u8 *) skb->data);
+		param = *((__u8 *) sent);
+
+		if (!status) {
+			if (param == AUTH_ENABLED)
+				set_bit(HCI_AUTH, &hdev->flags);
+			else
+				clear_bit(HCI_AUTH, &hdev->flags);
+		}
+		hci_req_complete(hdev, status);
+		break;
+
+	case OCF_WRITE_ENCRYPT_MODE:
+		sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_ENCRYPT_MODE);
+		if (!sent)
+			break;
+
+		status = *((__u8 *) skb->data);
+		param = *((__u8 *) sent);
+
+		if (!status) {
+			if (param)
+				set_bit(HCI_ENCRYPT, &hdev->flags);
+			else
+				clear_bit(HCI_ENCRYPT, &hdev->flags);
+		}
+		hci_req_complete(hdev, status);
+		break;
+
+	case OCF_WRITE_CA_TIMEOUT:
+		status = *((__u8 *) skb->data);
+		if (status) {
+			BT_DBG("%s OCF_WRITE_CA_TIMEOUT failed %d", hdev->name, status);
+		} else {
+			BT_DBG("%s OCF_WRITE_CA_TIMEOUT successful", hdev->name);
+		}
+		break;
+
+	case OCF_WRITE_PG_TIMEOUT:
+		status = *((__u8 *) skb->data);
+		if (status) {
+			BT_DBG("%s OCF_WRITE_PG_TIMEOUT failed %d", hdev->name, status);
+		} else {
+			BT_DBG("%s: OCF_WRITE_PG_TIMEOUT successful", hdev->name);
+		}
+		break;
+
+	case OCF_WRITE_SCAN_ENABLE:
+		sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_SCAN_ENABLE);
+		if (!sent)
+			break;
+
+		status = *((__u8 *) skb->data);
+		param = *((__u8 *) sent);
+
+		BT_DBG("param 0x%x", param);
+
+		if (!status) {
+			clear_bit(HCI_PSCAN, &hdev->flags);
+			clear_bit(HCI_ISCAN, &hdev->flags);
+			if (param & SCAN_INQUIRY)
+				set_bit(HCI_ISCAN, &hdev->flags);
+
+			if (param & SCAN_PAGE)
+				set_bit(HCI_PSCAN, &hdev->flags);
+		}
+		hci_req_complete(hdev, status);
+		break;
+
+	case OCF_READ_VOICE_SETTING:
+		vs = (struct hci_rp_read_voice_setting *) skb->data;
+
+		if (vs->status) {
+			BT_DBG("%s READ_VOICE_SETTING failed %d", hdev->name, vs->status);
+			break;
+		}
+
+		setting = __le16_to_cpu(vs->voice_setting);
+
+		if (hdev->voice_setting != setting) {
+			hdev->voice_setting = setting;
+
+			BT_DBG("%s: voice setting 0x%04x", hdev->name, setting);
+
+			if (hdev->notify) {
+				tasklet_disable(&hdev->tx_task);
+				hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+				tasklet_enable(&hdev->tx_task);
+			}
+		}
+		break;
+
+	case OCF_WRITE_VOICE_SETTING:
+		sent = hci_sent_cmd_data(hdev, OGF_HOST_CTL, OCF_WRITE_VOICE_SETTING);
+		if (!sent)
+			break;
+
+		status = *((__u8 *) skb->data);
+		setting = __le16_to_cpu(get_unaligned((__u16 *) sent));
+
+		if (!status && hdev->voice_setting != setting) {
+			hdev->voice_setting = setting;
+
+			BT_DBG("%s: voice setting 0x%04x", hdev->name, setting);
+
+			if (hdev->notify) {
+				tasklet_disable(&hdev->tx_task);
+				hdev->notify(hdev, HCI_NOTIFY_VOICE_SETTING);
+				tasklet_enable(&hdev->tx_task);
+			}
+		}
+		hci_req_complete(hdev, status);
+		break;
+
+	case OCF_HOST_BUFFER_SIZE:
+		status = *((__u8 *) skb->data);
+		if (status) {
+			BT_DBG("%s OCF_BUFFER_SIZE failed %d", hdev->name, status);
+			hci_req_complete(hdev, status);
+		}
+		break;
+
+	default:
+		BT_DBG("%s Command complete: ogf HOST_CTL ocf %x", hdev->name, ocf);
+		break;
+	}
+}
+
+/* Command Complete OGF INFO_PARAM */
+static void hci_cc_info_param(struct hci_dev *hdev, __u16 ocf, struct sk_buff *skb)
+{
+	struct hci_rp_read_loc_features *lf;
+	struct hci_rp_read_buffer_size *bs;
+	struct hci_rp_read_bd_addr *ba;
+
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	case OCF_READ_LOCAL_FEATURES:
+		lf = (struct hci_rp_read_loc_features *) skb->data;
+
+		if (lf->status) {
+			BT_DBG("%s READ_LOCAL_FEATURES failed %d", hdev->name, lf->status);
+			break;
+		}
+
+		memcpy(hdev->features, lf->features, sizeof(hdev->features));
+
+		/* Adjust default settings according to features
+		 * supported by device. */
+		if (hdev->features[0] & LMP_3SLOT)
+			hdev->pkt_type |= (HCI_DM3 | HCI_DH3);
+
+		if (hdev->features[0] & LMP_5SLOT)
+			hdev->pkt_type |= (HCI_DM5 | HCI_DH5);
+
+		if (hdev->features[1] & LMP_HV2)
+			hdev->pkt_type |= (HCI_HV2);
+
+		if (hdev->features[1] & LMP_HV3)
+			hdev->pkt_type |= (HCI_HV3);
+
+		BT_DBG("%s: features 0x%x 0x%x 0x%x", hdev->name, lf->features[0], lf->features[1], lf->features[2]);
+
+		break;
+
+	case OCF_READ_BUFFER_SIZE:
+		bs = (struct hci_rp_read_buffer_size *) skb->data;
+
+		if (bs->status) {
+			BT_DBG("%s READ_BUFFER_SIZE failed %d", hdev->name, bs->status);
+			hci_req_complete(hdev, bs->status);
+			break;
+		}
+
+		hdev->acl_mtu = __le16_to_cpu(bs->acl_mtu);
+		hdev->sco_mtu = bs->sco_mtu ? bs->sco_mtu : 64;
+		hdev->acl_pkts = hdev->acl_cnt = __le16_to_cpu(bs->acl_max_pkt);
+		hdev->sco_pkts = hdev->sco_cnt = __le16_to_cpu(bs->sco_max_pkt);
+
+		BT_DBG("%s mtu: acl %d, sco %d max_pkt: acl %d, sco %d", hdev->name,
+			hdev->acl_mtu, hdev->sco_mtu, hdev->acl_pkts, hdev->sco_pkts);
+		break;
+
+	case OCF_READ_BD_ADDR:
+		ba = (struct hci_rp_read_bd_addr *) skb->data;
+
+		if (!ba->status) {
+			bacpy(&hdev->bdaddr, &ba->bdaddr);
+		} else {
+			BT_DBG("%s: READ_BD_ADDR failed %d", hdev->name, ba->status);
+		}
+
+		hci_req_complete(hdev, ba->status);
+		break;
+
+	default:
+		BT_DBG("%s Command complete: ogf INFO_PARAM ocf %x", hdev->name, ocf);
+		break;
+	}
+}
+
+/* Command Status OGF LINK_CTL */
+static inline void hci_cs_create_conn(struct hci_dev *hdev, __u8 status)
+{
+	struct hci_conn *conn;
+	struct hci_cp_create_conn *cp = hci_sent_cmd_data(hdev, OGF_LINK_CTL, OCF_CREATE_CONN);
+
+	if (!cp)
+		return;
+
+	hci_dev_lock(hdev);
+
+	conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &cp->bdaddr);
+
+	BT_DBG("%s status 0x%x bdaddr %s conn %p", hdev->name,
+		status, batostr(&cp->bdaddr), conn);
+
+	if (status) {
+		if (conn && conn->state == BT_CONNECT) {
+			conn->state = BT_CLOSED;
+			hci_proto_connect_cfm(conn, status);
+			hci_conn_del(conn);
+		}
+	} else {
+		if (!conn) {
+			conn = hci_conn_add(hdev, ACL_LINK, &cp->bdaddr);
+			if (conn) {
+				conn->out = 1;
+				conn->link_mode |= HCI_LM_MASTER;
+			} else
+				BT_ERR("No memory for new connection");
+		}
+	}
+
+	hci_dev_unlock(hdev);
+}
+
+static void hci_cs_link_ctl(struct hci_dev *hdev, __u16 ocf, __u8 status)
+{
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	case OCF_CREATE_CONN:
+		hci_cs_create_conn(hdev, status);
+		break;
+
+	case OCF_ADD_SCO:
+		if (status) {
+			struct hci_conn *acl, *sco;
+			struct hci_cp_add_sco *cp = hci_sent_cmd_data(hdev, OGF_LINK_CTL, OCF_ADD_SCO);
+			__u16 handle;
+
+			if (!cp)
+				break;
+
+			handle = __le16_to_cpu(cp->handle);
+
+			BT_DBG("%s Add SCO error: handle %d status 0x%x", hdev->name, handle, status);
+
+			hci_dev_lock(hdev);
+
+			acl = hci_conn_hash_lookup_handle(hdev, handle);
+			if (acl && (sco = acl->link)) {
+				sco->state = BT_CLOSED;
+
+				hci_proto_connect_cfm(sco, status);
+				hci_conn_del(sco);
+			}
+
+			hci_dev_unlock(hdev);
+		}
+		break;
+
+	case OCF_INQUIRY:
+		if (status) {
+			BT_DBG("%s Inquiry error: status 0x%x", hdev->name, status);
+			hci_req_complete(hdev, status);
+		} else {
+			set_bit(HCI_INQUIRY, &hdev->flags);
+		}
+		break;
+
+	default:
+		BT_DBG("%s Command status: ogf LINK_CTL ocf %x status %d",
+			hdev->name, ocf, status);
+		break;
+	}
+}
+
+/* Command Status OGF LINK_POLICY */
+static void hci_cs_link_policy(struct hci_dev *hdev, __u16 ocf, __u8 status)
+{
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	default:
+		BT_DBG("%s Command status: ogf LINK_POLICY ocf %x", hdev->name, ocf);
+		break;
+	}
+}
+
+/* Command Status OGF HOST_CTL */
+static void hci_cs_host_ctl(struct hci_dev *hdev, __u16 ocf, __u8 status)
+{
+	BT_DBG("%s ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	default:
+		BT_DBG("%s Command status: ogf HOST_CTL ocf %x", hdev->name, ocf);
+		break;
+	}
+}
+
+/* Command Status OGF INFO_PARAM */
+static void hci_cs_info_param(struct hci_dev *hdev, __u16 ocf, __u8 status)
+{
+	BT_DBG("%s: hci_cs_info_param: ocf 0x%x", hdev->name, ocf);
+
+	switch (ocf) {
+	default:
+		BT_DBG("%s Command status: ogf INFO_PARAM ocf %x", hdev->name, ocf);
+		break;
+	}
+}
+
+/* Inquiry Complete */
+static inline void hci_inquiry_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	__u8 status = *((__u8 *) skb->data);
+
+	BT_DBG("%s status %d", hdev->name, status);
+
+	clear_bit(HCI_INQUIRY, &hdev->flags);
+	hci_req_complete(hdev, status);
+}
+
+/* Inquiry Result */
+static inline void hci_inquiry_result_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct inquiry_info *info = (struct inquiry_info *) (skb->data + 1);
+	int num_rsp = *((__u8 *) skb->data);
+
+	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+	hci_dev_lock(hdev);
+	for (; num_rsp; num_rsp--) {
+		struct inquiry_data data;
+		bacpy(&data.bdaddr, &info->bdaddr);
+		data.pscan_rep_mode = info->pscan_rep_mode;
+		data.pscan_period_mode = info->pscan_period_mode;
+		data.pscan_mode = info->pscan_mode;
+		memcpy(data.dev_class, info->dev_class, 3);
+		data.clock_offset = info->clock_offset;
+		data.rssi = 0x00;
+		info++;
+		hci_inquiry_cache_update(hdev, &data);
+	}
+	hci_dev_unlock(hdev);
+}
+
+/* Inquiry Result With RSSI */
+static inline void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct inquiry_info_with_rssi *info = (struct inquiry_info_with_rssi *) (skb->data + 1);
+	int num_rsp = *((__u8 *) skb->data);
+
+	BT_DBG("%s num_rsp %d", hdev->name, num_rsp);
+
+	hci_dev_lock(hdev);
+	for (; num_rsp; num_rsp--) {
+		struct inquiry_data data;
+		bacpy(&data.bdaddr, &info->bdaddr);
+		data.pscan_rep_mode = info->pscan_rep_mode;
+		data.pscan_period_mode = info->pscan_period_mode;
+		data.pscan_mode = 0x00;
+		memcpy(data.dev_class, info->dev_class, 3);
+		data.clock_offset = info->clock_offset;
+		data.rssi = info->rssi;
+		info++;
+		hci_inquiry_cache_update(hdev, &data);
+	}
+	hci_dev_unlock(hdev);
+}
+
+/* Connect Request */
+static inline void hci_conn_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+{
+	struct hci_ev_conn_request *ev = (struct hci_ev_conn_request *) skb->data;
+	int mask = hdev->link_mode;
+
+	BT_DBG("%s Connection request: %s type 0x%x", hdev->name,
+		batostr(&ev->bdaddr), ev->link_type);
+
+	mask |= hci_proto_connect_ind(hdev, &ev->bdaddr, ev->link_type);
+
+	if (mask & HCI_LM_ACCEPT) {
+		/* Connection accepted */
+		struct hci_conn *conn;
+		struct hci_cp_accept_conn_req cp;
+
+		hci_dev_lock(hdev);
+		conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr);
+		if (!conn) {
+			if (!(conn = hci_conn_add(hdev, ev->link_type, &ev->bdaddr))) {
+				BT_ERR("No memory for new connection");
+				hci_dev_unlock(hdev);
+				return;
+			}
+		}
+		memcpy(conn->dev_class, ev->dev_class, 3);
+		conn->state = BT_CONNECT;
+		hci_dev_unlock(hdev);
+
+		bacpy(&cp.bdaddr, &ev->bdaddr);
+
+		if (lmp_rswitch_capable(hdev) && (mask & HCI_LM_MASTER))
+			cp.role = 0x00; /* Become master */
+		else
+			cp.role = 0x01; /* Remain slave */
+
+		hci_send_cmd(hdev, OGF_LINK_CTL, OCF_ACCEPT_CONN_REQ, sizeof(cp), &cp);
+	} else {
+		/* Connection rejected */
+		struct hci_cp_reject_conn_req cp;
+
bacpy(&cp.bdaddr, &ev->bdaddr); + cp.reason = 0x0f; + hci_send_cmd(hdev, OGF_LINK_CTL, OCF_REJECT_CONN_REQ, sizeof(cp), &cp); + } +} + +/* Connect Complete */ +static inline void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_conn_complete *ev = (struct hci_ev_conn_complete *) skb->data; + struct hci_conn *conn = NULL; + + BT_DBG("%s", hdev->name); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ev->link_type, &ev->bdaddr); + if (!conn) { + hci_dev_unlock(hdev); + return; + } + + if (!ev->status) { + conn->handle = __le16_to_cpu(ev->handle); + conn->state = BT_CONNECTED; + + if (test_bit(HCI_AUTH, &hdev->flags)) + conn->link_mode |= HCI_LM_AUTH; + + if (test_bit(HCI_ENCRYPT, &hdev->flags)) + conn->link_mode |= HCI_LM_ENCRYPT; + + /* Set link policy */ + if (conn->type == ACL_LINK && hdev->link_policy) { + struct hci_cp_write_link_policy cp; + cp.handle = ev->handle; + cp.policy = __cpu_to_le16(hdev->link_policy); + hci_send_cmd(hdev, OGF_LINK_POLICY, OCF_WRITE_LINK_POLICY, sizeof(cp), &cp); + } + + /* Set packet type for incoming connection */ + if (!conn->out) { + struct hci_cp_change_conn_ptype cp; + cp.handle = ev->handle; + cp.pkt_type = (conn->type == ACL_LINK) ? + __cpu_to_le16(hdev->pkt_type & ACL_PTYPE_MASK): + __cpu_to_le16(hdev->pkt_type & SCO_PTYPE_MASK); + + hci_send_cmd(hdev, OGF_LINK_CTL, OCF_CHANGE_CONN_PTYPE, sizeof(cp), &cp); + } + } else + conn->state = BT_CLOSED; + + if (conn->type == ACL_LINK) { + struct hci_conn *sco = conn->link; + if (sco) { + if (!ev->status) + hci_add_sco(sco, conn->handle); + else { + hci_proto_connect_cfm(sco, ev->status); + hci_conn_del(sco); + } + } + } + + hci_proto_connect_cfm(conn, ev->status); + if (ev->status) + hci_conn_del(conn); + + hci_dev_unlock(hdev); +} + +/* Disconnect Complete */ +static inline void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_disconn_complete *ev = (struct hci_ev_disconn_complete *) skb->data; + struct hci_conn *conn = NULL; + __u16 handle = __le16_to_cpu(ev->handle); + + BT_DBG("%s status %d", hdev->name, ev->status); + + if (ev->status) + return; + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn) { + conn->state = BT_CLOSED; + hci_proto_disconn_ind(conn, ev->reason); + hci_conn_del(conn); + } + + hci_dev_unlock(hdev); +} + +/* Number of completed packets */ +static inline void hci_num_comp_pkts_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_num_comp_pkts *ev = (struct hci_ev_num_comp_pkts *) skb->data; + __u16 *ptr; + int i; + + skb_pull(skb, sizeof(*ev)); + + BT_DBG("%s num_hndl %d", hdev->name, ev->num_hndl); + + if (skb->len < ev->num_hndl * 4) { + BT_DBG("%s bad parameters", hdev->name); + return; + } + + tasklet_disable(&hdev->tx_task); + + for (i = 0, ptr = (__u16 *) skb->data; i < ev->num_hndl; i++) { + struct hci_conn *conn; + __u16 handle, count; + + handle = __le16_to_cpu(get_unaligned(ptr++)); + count = __le16_to_cpu(get_unaligned(ptr++)); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn) { + conn->sent -= count; + + if (conn->type == SCO_LINK) { + if ((hdev->sco_cnt += count) > hdev->sco_pkts) + hdev->sco_cnt = hdev->sco_pkts; + } else { + if ((hdev->acl_cnt += count) > hdev->acl_pkts) + hdev->acl_cnt = hdev->acl_pkts; + } + } + } + hci_sched_tx(hdev); + + tasklet_enable(&hdev->tx_task); +} + +/* Role Change */ +static inline void hci_role_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_role_change *ev = 
(struct hci_ev_role_change *) skb->data; + struct hci_conn *conn = NULL; + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr); + if (conn) { + if (!ev->status) { + if (ev->role) + conn->link_mode &= ~HCI_LM_MASTER; + else + conn->link_mode |= HCI_LM_MASTER; + } + + clear_bit(HCI_CONN_RSWITCH_PEND, &conn->pend); + + hci_role_switch_cfm(conn, ev->status, ev->role); + } + + hci_dev_unlock(hdev); +} + +/* Authentication Complete */ +static inline void hci_auth_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_auth_complete *ev = (struct hci_ev_auth_complete *) skb->data; + struct hci_conn *conn = NULL; + __u16 handle = __le16_to_cpu(ev->handle); + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn) { + if (!ev->status) + conn->link_mode |= HCI_LM_AUTH; + + clear_bit(HCI_CONN_AUTH_PEND, &conn->pend); + + hci_auth_cfm(conn, ev->status); + + if (test_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend)) { + if (!ev->status) { + struct hci_cp_set_conn_encrypt cp; + cp.handle = __cpu_to_le16(conn->handle); + cp.encrypt = 1; + hci_send_cmd(conn->hdev, OGF_LINK_CTL, + OCF_SET_CONN_ENCRYPT, + sizeof(cp), &cp); + } else { + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + hci_encrypt_cfm(conn, ev->status, 0x00); + } + } + } + + hci_dev_unlock(hdev); +} + +/* Encryption Change */ +static inline void hci_encrypt_change_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_encrypt_change *ev = (struct hci_ev_encrypt_change *) skb->data; + struct hci_conn *conn = NULL; + __u16 handle = __le16_to_cpu(ev->handle); + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn) { + if (!ev->status) { + if (ev->encrypt) + conn->link_mode |= HCI_LM_ENCRYPT; + else + conn->link_mode &= ~HCI_LM_ENCRYPT; + } + + clear_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + + hci_encrypt_cfm(conn, ev->status, ev->encrypt); + } + + hci_dev_unlock(hdev); +} + +/* Change Connection Link Key Complete */ +static inline void hci_change_conn_link_key_complete_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_change_conn_link_key_complete *ev = (struct hci_ev_change_conn_link_key_complete *) skb->data; + struct hci_conn *conn = NULL; + __u16 handle = __le16_to_cpu(ev->handle); + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn) { + if (!ev->status) + conn->link_mode |= HCI_LM_SECURE; + + clear_bit(HCI_CONN_AUTH_PEND, &conn->pend); + + hci_key_change_cfm(conn, ev->status); + } + + hci_dev_unlock(hdev); +} + +/* Pin Code Request*/ +static inline void hci_pin_code_request_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ +} + +/* Link Key Request */ +static inline void hci_link_key_request_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ +} + +/* Link Key Notification */ +static inline void hci_link_key_notify_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ +} + +/* Clock Offset */ +static inline void hci_clock_offset_evt(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_ev_clock_offset *ev = (struct hci_ev_clock_offset *) skb->data; + struct hci_conn *conn = NULL; + __u16 handle = __le16_to_cpu(ev->handle); + + BT_DBG("%s status %d", hdev->name, ev->status); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (conn && 
!ev->status) { + struct inquiry_entry *ie; + + if ((ie = hci_inquiry_cache_lookup(hdev, &conn->dst))) { + ie->data.clock_offset = ev->clock_offset; + ie->timestamp = jiffies; + } + } + + hci_dev_unlock(hdev); +} + +void hci_event_packet(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct hci_event_hdr *hdr = (struct hci_event_hdr *) skb->data; + struct hci_ev_cmd_complete *ec; + struct hci_ev_cmd_status *cs; + u16 opcode, ocf, ogf; + + skb_pull(skb, HCI_EVENT_HDR_SIZE); + + BT_DBG("%s evt 0x%x", hdev->name, hdr->evt); + + switch (hdr->evt) { + case HCI_EV_NUM_COMP_PKTS: + hci_num_comp_pkts_evt(hdev, skb); + break; + + case HCI_EV_INQUIRY_COMPLETE: + hci_inquiry_complete_evt(hdev, skb); + break; + + case HCI_EV_INQUIRY_RESULT: + hci_inquiry_result_evt(hdev, skb); + break; + + case HCI_EV_INQUIRY_RESULT_WITH_RSSI: + hci_inquiry_result_with_rssi_evt(hdev, skb); + break; + + case HCI_EV_CONN_REQUEST: + hci_conn_request_evt(hdev, skb); + break; + + case HCI_EV_CONN_COMPLETE: + hci_conn_complete_evt(hdev, skb); + break; + + case HCI_EV_DISCONN_COMPLETE: + hci_disconn_complete_evt(hdev, skb); + break; + + case HCI_EV_ROLE_CHANGE: + hci_role_change_evt(hdev, skb); + break; + + case HCI_EV_AUTH_COMPLETE: + hci_auth_complete_evt(hdev, skb); + break; + + case HCI_EV_ENCRYPT_CHANGE: + hci_encrypt_change_evt(hdev, skb); + break; + + case HCI_EV_CHANGE_CONN_LINK_KEY_COMPLETE: + hci_change_conn_link_key_complete_evt(hdev, skb); + break; + + case HCI_EV_PIN_CODE_REQ: + hci_pin_code_request_evt(hdev, skb); + break; + + case HCI_EV_LINK_KEY_REQ: + hci_link_key_request_evt(hdev, skb); + break; + + case HCI_EV_LINK_KEY_NOTIFY: + hci_link_key_notify_evt(hdev, skb); + break; + + case HCI_EV_CLOCK_OFFSET: + hci_clock_offset_evt(hdev, skb); + break; + + case HCI_EV_CMD_STATUS: + cs = (struct hci_ev_cmd_status *) skb->data; + skb_pull(skb, sizeof(cs)); + + opcode = __le16_to_cpu(cs->opcode); + ogf = hci_opcode_ogf(opcode); + ocf = hci_opcode_ocf(opcode); + + switch (ogf) { + case OGF_INFO_PARAM: + hci_cs_info_param(hdev, ocf, cs->status); + break; + + case OGF_HOST_CTL: + hci_cs_host_ctl(hdev, ocf, cs->status); + break; + + case OGF_LINK_CTL: + hci_cs_link_ctl(hdev, ocf, cs->status); + break; + + case OGF_LINK_POLICY: + hci_cs_link_policy(hdev, ocf, cs->status); + break; + + default: + BT_DBG("%s Command Status OGF %x", hdev->name, ogf); + break; + } + + if (cs->ncmd) { + atomic_set(&hdev->cmd_cnt, 1); + if (!skb_queue_empty(&hdev->cmd_q)) + hci_sched_cmd(hdev); + } + break; + + case HCI_EV_CMD_COMPLETE: + ec = (struct hci_ev_cmd_complete *) skb->data; + skb_pull(skb, sizeof(*ec)); + + opcode = __le16_to_cpu(ec->opcode); + ogf = hci_opcode_ogf(opcode); + ocf = hci_opcode_ocf(opcode); + + switch (ogf) { + case OGF_INFO_PARAM: + hci_cc_info_param(hdev, ocf, skb); + break; + + case OGF_HOST_CTL: + hci_cc_host_ctl(hdev, ocf, skb); + break; + + case OGF_LINK_CTL: + hci_cc_link_ctl(hdev, ocf, skb); + break; + + case OGF_LINK_POLICY: + hci_cc_link_policy(hdev, ocf, skb); + break; + + default: + BT_DBG("%s Command Completed OGF %x", hdev->name, ogf); + break; + } + + if (ec->ncmd) { + atomic_set(&hdev->cmd_cnt, 1); + if (!skb_queue_empty(&hdev->cmd_q)) + hci_sched_cmd(hdev); + } + break; + } + + kfree_skb(skb); + hdev->stat.evt_rx++; +} + +/* Generate internal stack event */ +void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data) +{ + struct hci_event_hdr *hdr; + struct hci_ev_stack_internal *ev; + struct sk_buff *skb; + + skb = bt_skb_alloc(HCI_EVENT_HDR_SIZE + sizeof(*ev) + dlen, GFP_ATOMIC); + if 
(!skb) + return; + + hdr = (void *) skb_put(skb, HCI_EVENT_HDR_SIZE); + hdr->evt = HCI_EV_STACK_INTERNAL; + hdr->plen = sizeof(*ev) + dlen; + + ev = (void *) skb_put(skb, sizeof(*ev) + dlen); + ev->type = type; + memcpy(ev->data, data, dlen); + + skb->pkt_type = HCI_EVENT_PKT; + skb->dev = (void *) hdev; + hci_send_to_sock(hdev, skb); + kfree_skb(skb); +} +EXPORT_SYMBOL(hci_si_event); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c new file mode 100644 index 000000000000..c9792ba75122 --- /dev/null +++ b/net/bluetooth/hci_sock.c @@ -0,0 +1,707 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth HCI sockets. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#ifndef CONFIG_BT_HCI_SOCK_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +/* ----- HCI socket interface ----- */ + +static inline int hci_test_bit(int nr, void *addr) +{ + return *((__u32 *) addr + (nr >> 5)) & ((__u32) 1 << (nr & 31)); +} + +/* Security filter */ +static struct hci_sec_filter hci_sec_filter = { + /* Packet types */ + 0x10, + /* Events */ + { 0x1000d9fe, 0x0000300c }, + /* Commands */ + { + { 0x0 }, + /* OGF_LINK_CTL */ + { 0xbe000006, 0x00000001, 0x0000, 0x00 }, + /* OGF_LINK_POLICY */ + { 0x00005200, 0x00000000, 0x0000, 0x00 }, + /* OGF_HOST_CTL */ + { 0xaab00200, 0x2b402aaa, 0x0154, 0x00 }, + /* OGF_INFO_PARAM */ + { 0x000002be, 0x00000000, 0x0000, 0x00 }, + /* OGF_STATUS_PARAM */ + { 0x000000ea, 0x00000000, 0x0000, 0x00 } + } +}; + +static struct bt_sock_list hci_sk_list = { + .lock = RW_LOCK_UNLOCKED +}; + +/* Send frame to RAW socket */ +void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb) +{ + struct sock *sk; + struct hlist_node *node; + + BT_DBG("hdev %p len %d", hdev, skb->len); + + read_lock(&hci_sk_list.lock); + sk_for_each(sk, node, &hci_sk_list.head) { + struct hci_filter *flt; + struct sk_buff *nskb; + + if (sk->sk_state != BT_BOUND || hci_pi(sk)->hdev != hdev) + continue; + + /* Don't send frame to the socket it came from */ + if (skb->sk == sk) + continue; + + /* Apply filter */ + flt = &hci_pi(sk)->filter; + + if (!test_bit((skb->pkt_type == HCI_VENDOR_PKT) ? 
+ 0 : (skb->pkt_type & HCI_FLT_TYPE_BITS), &flt->type_mask)) + continue; + + if (skb->pkt_type == HCI_EVENT_PKT) { + register int evt = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS); + + if (!hci_test_bit(evt, &flt->event_mask)) + continue; + + if (flt->opcode && ((evt == HCI_EV_CMD_COMPLETE && + flt->opcode != *(__u16 *)(skb->data + 3)) || + (evt == HCI_EV_CMD_STATUS && + flt->opcode != *(__u16 *)(skb->data + 4)))) + continue; + } + + if (!(nskb = skb_clone(skb, GFP_ATOMIC))) + continue; + + /* Put type byte before the data */ + memcpy(skb_push(nskb, 1), &nskb->pkt_type, 1); + + if (sock_queue_rcv_skb(sk, nskb)) + kfree_skb(nskb); + } + read_unlock(&hci_sk_list.lock); +} + +static int hci_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct hci_dev *hdev = hci_pi(sk)->hdev; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + bt_sock_unlink(&hci_sk_list, sk); + + if (hdev) { + atomic_dec(&hdev->promisc); + hci_dev_put(hdev); + } + + sock_orphan(sk); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + + sock_put(sk); + return 0; +} + +/* Ioctls that require bound socket */ +static inline int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd, unsigned long arg) +{ + struct hci_dev *hdev = hci_pi(sk)->hdev; + + if (!hdev) + return -EBADFD; + + switch (cmd) { + case HCISETRAW: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks)) + return -EPERM; + + if (arg) + set_bit(HCI_RAW, &hdev->flags); + else + clear_bit(HCI_RAW, &hdev->flags); + + return 0; + + case HCISETSECMGR: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (arg) + set_bit(HCI_SECMGR, &hdev->flags); + else + clear_bit(HCI_SECMGR, &hdev->flags); + + return 0; + + case HCIGETCONNINFO: + return hci_get_conn_info(hdev, (void __user *)arg); + + default: + if (hdev->ioctl) + return hdev->ioctl(hdev, cmd, arg); + return -EINVAL; + } +} + +static int hci_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case HCIGETDEVLIST: + return hci_get_dev_list(argp); + + case HCIGETDEVINFO: + return hci_get_dev_info(argp); + + case HCIGETCONNLIST: + return hci_get_conn_list(argp); + + case HCIDEVUP: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_open(arg); + + case HCIDEVDOWN: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_close(arg); + + case HCIDEVRESET: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_reset(arg); + + case HCIDEVRESTAT: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_reset_stat(arg); + + case HCISETSCAN: + case HCISETAUTH: + case HCISETENCRYPT: + case HCISETPTYPE: + case HCISETLINKPOL: + case HCISETLINKMODE: + case HCISETACLMTU: + case HCISETSCOMTU: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + return hci_dev_cmd(cmd, argp); + + case HCIINQUIRY: + return hci_inquiry(argp); + + default: + lock_sock(sk); + err = hci_sock_bound_ioctl(sk, cmd, arg); + release_sock(sk); + return err; + } +} + +static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr; + struct sock *sk = sock->sk; + struct hci_dev *hdev = NULL; + int err = 0; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!haddr || haddr->hci_family != AF_BLUETOOTH) + return -EINVAL; + + lock_sock(sk); + + if (hci_pi(sk)->hdev) { + 
err = -EALREADY; + goto done; + } + + if (haddr->hci_dev != HCI_DEV_NONE) { + if (!(hdev = hci_dev_get(haddr->hci_dev))) { + err = -ENODEV; + goto done; + } + + atomic_inc(&hdev->promisc); + } + + hci_pi(sk)->hdev = hdev; + sk->sk_state = BT_BOUND; + +done: + release_sock(sk); + return err; +} + +static int hci_sock_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) +{ + struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr; + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + lock_sock(sk); + + *addr_len = sizeof(*haddr); + haddr->hci_family = AF_BLUETOOTH; + haddr->hci_dev = hci_pi(sk)->hdev->id; + + release_sock(sk); + return 0; +} + +static inline void hci_sock_cmsg(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + __u32 mask = hci_pi(sk)->cmsg_mask; + + if (mask & HCI_CMSG_DIR) + put_cmsg(msg, SOL_HCI, HCI_CMSG_DIR, sizeof(int), &bt_cb(skb)->incoming); + + if (mask & HCI_CMSG_TSTAMP) + put_cmsg(msg, SOL_HCI, HCI_CMSG_TSTAMP, sizeof(skb->stamp), &skb->stamp); +} + +static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + int noblock = flags & MSG_DONTWAIT; + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (flags & (MSG_OOB)) + return -EOPNOTSUPP; + + if (sk->sk_state == BT_CLOSED) + return 0; + + if (!(skb = skb_recv_datagram(sk, flags, noblock, &err))) + return err; + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb->h.raw = skb->data; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + hci_sock_cmsg(sk, msg, skb); + + skb_free_datagram(sk, skb); + + return err ? 
: copied; +} + +static int hci_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct hci_dev *hdev; + struct sk_buff *skb; + int err; + + BT_DBG("sock %p sk %p", sock, sk); + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE)) + return -EINVAL; + + if (len < 4 || len > HCI_MAX_FRAME_SIZE) + return -EINVAL; + + lock_sock(sk); + + if (!(hdev = hci_pi(sk)->hdev)) { + err = -EBADFD; + goto done; + } + + if (!(skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err))) + goto done; + + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + goto drop; + } + + skb->pkt_type = *((unsigned char *) skb->data); + skb_pull(skb, 1); + skb->dev = (void *) hdev; + + if (skb->pkt_type == HCI_COMMAND_PKT) { + u16 opcode = __le16_to_cpu(get_unaligned((u16 *)skb->data)); + u16 ogf = hci_opcode_ogf(opcode); + u16 ocf = hci_opcode_ocf(opcode); + + if (((ogf > HCI_SFLT_MAX_OGF) || + !hci_test_bit(ocf & HCI_FLT_OCF_BITS, &hci_sec_filter.ocf_mask[ogf])) && + !capable(CAP_NET_RAW)) { + err = -EPERM; + goto drop; + } + + if (test_bit(HCI_RAW, &hdev->flags) || (ogf == OGF_VENDOR_CMD)) { + skb_queue_tail(&hdev->raw_q, skb); + hci_sched_tx(hdev); + } else { + skb_queue_tail(&hdev->cmd_q, skb); + hci_sched_cmd(hdev); + } + } else { + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto drop; + } + + skb_queue_tail(&hdev->raw_q, skb); + hci_sched_tx(hdev); + } + + err = len; + +done: + release_sock(sk); + return err; + +drop: + kfree_skb(skb); + goto done; +} + +static int hci_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int len) +{ + struct hci_ufilter uf = { .opcode = 0 }; + struct sock *sk = sock->sk; + int err = 0, opt = 0; + + BT_DBG("sk %p, opt %d", sk, optname); + + lock_sock(sk); + + switch (optname) { + case HCI_DATA_DIR: + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + break; + } + + if (opt) + hci_pi(sk)->cmsg_mask |= HCI_CMSG_DIR; + else + hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_DIR; + break; + + case HCI_TIME_STAMP: + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + break; + } + + if (opt) + hci_pi(sk)->cmsg_mask |= HCI_CMSG_TSTAMP; + else + hci_pi(sk)->cmsg_mask &= ~HCI_CMSG_TSTAMP; + break; + + case HCI_FILTER: + len = min_t(unsigned int, len, sizeof(uf)); + if (copy_from_user(&uf, optval, len)) { + err = -EFAULT; + break; + } + + if (!capable(CAP_NET_RAW)) { + uf.type_mask &= hci_sec_filter.type_mask; + uf.event_mask[0] &= *((u32 *) hci_sec_filter.event_mask + 0); + uf.event_mask[1] &= *((u32 *) hci_sec_filter.event_mask + 1); + } + + { + struct hci_filter *f = &hci_pi(sk)->filter; + + f->type_mask = uf.type_mask; + f->opcode = uf.opcode; + *((u32 *) f->event_mask + 0) = uf.event_mask[0]; + *((u32 *) f->event_mask + 1) = uf.event_mask[1]; + } + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct hci_ufilter uf; + struct sock *sk = sock->sk; + int len, opt; + + if (get_user(len, optlen)) + return -EFAULT; + + switch (optname) { + case HCI_DATA_DIR: + if (hci_pi(sk)->cmsg_mask & HCI_CMSG_DIR) + opt = 1; + else + opt = 0; + + if (put_user(opt, optval)) + return -EFAULT; + break; + + case HCI_TIME_STAMP: + if (hci_pi(sk)->cmsg_mask & HCI_CMSG_TSTAMP) + opt = 1; + else + opt = 0; + + if 
(put_user(opt, optval)) + return -EFAULT; + break; + + case HCI_FILTER: + { + struct hci_filter *f = &hci_pi(sk)->filter; + + uf.type_mask = f->type_mask; + uf.opcode = f->opcode; + uf.event_mask[0] = *((u32 *) f->event_mask + 0); + uf.event_mask[1] = *((u32 *) f->event_mask + 1); + } + + len = min_t(unsigned int, len, sizeof(uf)); + if (copy_to_user(optval, &uf, len)) + return -EFAULT; + break; + + default: + return -ENOPROTOOPT; + break; + } + + return 0; +} + +static struct proto_ops hci_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = hci_sock_release, + .bind = hci_sock_bind, + .getname = hci_sock_getname, + .sendmsg = hci_sock_sendmsg, + .recvmsg = hci_sock_recvmsg, + .ioctl = hci_sock_ioctl, + .poll = datagram_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = hci_sock_setsockopt, + .getsockopt = hci_sock_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto hci_sk_proto = { + .name = "HCI", + .owner = THIS_MODULE, + .obj_size = sizeof(struct hci_pinfo) +}; + +static int hci_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sock->ops = &hci_sock_ops; + + sk = sk_alloc(PF_BLUETOOTH, GFP_KERNEL, &hci_sk_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + + sock->state = SS_UNCONNECTED; + sk->sk_state = BT_OPEN; + + bt_sock_link(&hci_sk_list, sk); + return 0; +} + +static int hci_sock_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct hci_dev *hdev = (struct hci_dev *) ptr; + struct hci_ev_si_device ev; + + BT_DBG("hdev %s event %ld", hdev->name, event); + + /* Send event to sockets */ + ev.event = event; + ev.dev_id = hdev->id; + hci_si_event(NULL, HCI_EV_SI_DEVICE, sizeof(ev), &ev); + + if (event == HCI_DEV_UNREG) { + struct sock *sk; + struct hlist_node *node; + + /* Detach sockets from device */ + read_lock(&hci_sk_list.lock); + sk_for_each(sk, node, &hci_sk_list.head) { + bh_lock_sock(sk); + if (hci_pi(sk)->hdev == hdev) { + hci_pi(sk)->hdev = NULL; + sk->sk_err = EPIPE; + sk->sk_state = BT_OPEN; + sk->sk_state_change(sk); + + hci_dev_put(hdev); + } + bh_unlock_sock(sk); + } + read_unlock(&hci_sk_list.lock); + } + + return NOTIFY_DONE; +} + +static struct net_proto_family hci_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = hci_sock_create, +}; + +static struct notifier_block hci_sock_nblock = { + .notifier_call = hci_sock_dev_event +}; + +int __init hci_sock_init(void) +{ + int err; + + err = proto_register(&hci_sk_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_HCI, &hci_sock_family_ops); + if (err < 0) + goto error; + + hci_register_notifier(&hci_sock_nblock); + + BT_INFO("HCI socket layer initialized"); + + return 0; + +error: + BT_ERR("HCI socket registration failed"); + proto_unregister(&hci_sk_proto); + return err; +} + +int __exit hci_sock_cleanup(void) +{ + if (bt_sock_unregister(BTPROTO_HCI) < 0) + BT_ERR("HCI socket unregistration failed"); + + hci_unregister_notifier(&hci_sock_nblock); + + proto_unregister(&hci_sk_proto); + + return 0; +} diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c new file mode 100644 index 000000000000..7856bc26accb --- /dev/null +++ b/net/bluetooth/hci_sysfs.c @@ -0,0 +1,153 @@ +/* 
Bluetooth HCI driver model support. */ + +#include +#include +#include + +#include +#include + +#ifndef CONFIG_BT_HCI_CORE_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +static ssize_t show_name(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + return sprintf(buf, "%s\n", hdev->name); +} + +static ssize_t show_type(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + return sprintf(buf, "%d\n", hdev->type); +} + +static ssize_t show_address(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + bdaddr_t bdaddr; + baswap(&bdaddr, &hdev->bdaddr); + return sprintf(buf, "%s\n", batostr(&bdaddr)); +} + +static ssize_t show_flags(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + return sprintf(buf, "0x%lx\n", hdev->flags); +} + +static ssize_t show_inquiry_cache(struct class_device *cdev, char *buf) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + struct inquiry_cache *cache = &hdev->inq_cache; + struct inquiry_entry *e; + int n = 0; + + hci_dev_lock_bh(hdev); + + for (e = cache->list; e; e = e->next) { + struct inquiry_data *data = &e->data; + bdaddr_t bdaddr; + baswap(&bdaddr, &data->bdaddr); + n += sprintf(buf + n, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %u\n", + batostr(&bdaddr), + data->pscan_rep_mode, data->pscan_period_mode, data->pscan_mode, + data->dev_class[2], data->dev_class[1], data->dev_class[0], + __le16_to_cpu(data->clock_offset), data->rssi, e->timestamp); + } + + hci_dev_unlock_bh(hdev); + return n; +} + +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_name, NULL); +static CLASS_DEVICE_ATTR(type, S_IRUGO, show_type, NULL); +static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static CLASS_DEVICE_ATTR(flags, S_IRUGO, show_flags, NULL); +static CLASS_DEVICE_ATTR(inquiry_cache, S_IRUGO, show_inquiry_cache, NULL); + +static struct class_device_attribute *bt_attrs[] = { + &class_device_attr_name, + &class_device_attr_type, + &class_device_attr_address, + &class_device_attr_flags, + &class_device_attr_inquiry_cache, + NULL +}; + +#ifdef CONFIG_HOTPLUG +static int bt_hotplug(struct class_device *cdev, char **envp, int num_envp, char *buf, int size) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + int n, i = 0; + + envp[i++] = buf; + n = snprintf(buf, size, "INTERFACE=%s", hdev->name) + 1; + buf += n; + size -= n; + + if ((size <= 0) || (i >= num_envp)) + return -ENOMEM; + + envp[i] = NULL; + return 0; +} +#endif + +static void bt_release(struct class_device *cdev) +{ + struct hci_dev *hdev = class_get_devdata(cdev); + + kfree(hdev); +} + +static struct class bt_class = { + .name = "bluetooth", + .release = bt_release, +#ifdef CONFIG_HOTPLUG + .hotplug = bt_hotplug, +#endif +}; + +int hci_register_sysfs(struct hci_dev *hdev) +{ + struct class_device *cdev = &hdev->class_dev; + unsigned int i; + int err; + + BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type); + + cdev->class = &bt_class; + class_set_devdata(cdev, hdev); + + strlcpy(cdev->class_id, hdev->name, BUS_ID_SIZE); + err = class_device_register(cdev); + if (err < 0) + return err; + + for (i = 0; bt_attrs[i]; i++) + class_device_create_file(cdev, bt_attrs[i]); + + return 0; +} + +void hci_unregister_sysfs(struct hci_dev *hdev) +{ + struct class_device * cdev = &hdev->class_dev; + + BT_DBG("%p name %s type %d", hdev, hdev->name, hdev->type); + + class_device_del(cdev); +} + +int __init bt_sysfs_init(void) +{ + return 
class_register(&bt_class); +} + +void __exit bt_sysfs_cleanup(void) +{ + class_unregister(&bt_class); +} diff --git a/net/bluetooth/hidp/Kconfig b/net/bluetooth/hidp/Kconfig new file mode 100644 index 000000000000..4e958f7d9418 --- /dev/null +++ b/net/bluetooth/hidp/Kconfig @@ -0,0 +1,12 @@ +config BT_HIDP + tristate "HIDP protocol support" + depends on BT && BT_L2CAP + select INPUT + help + HIDP (Human Interface Device Protocol) is a transport layer + for HID reports. HIDP is required for the Bluetooth Human + Interface Device Profile. + + Say Y here to compile HIDP support into the kernel or say M to + compile it as module (hidp). + diff --git a/net/bluetooth/hidp/Makefile b/net/bluetooth/hidp/Makefile new file mode 100644 index 000000000000..a9ee115696ae --- /dev/null +++ b/net/bluetooth/hidp/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux Bluetooth HIDP layer +# + +obj-$(CONFIG_BT_HIDP) += hidp.o + +hidp-objs := core.o sock.o diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c new file mode 100644 index 000000000000..2cf98ceabcc7 --- /dev/null +++ b/net/bluetooth/hidp/core.c @@ -0,0 +1,772 @@ +/* + HIDP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2003-2004 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "hidp.h" + +#ifndef CONFIG_BT_HIDP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +#define VERSION "1.1" + +static DECLARE_RWSEM(hidp_session_sem); +static LIST_HEAD(hidp_session_list); + +static unsigned char hidp_keycode[256] = { + 0, 0, 0, 0, 30, 48, 46, 32, 18, 33, 34, 35, 23, 36, 37, 38, + 50, 49, 24, 25, 16, 19, 31, 20, 22, 47, 17, 45, 21, 44, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 28, 1, 14, 15, 57, 12, 13, 26, + 27, 43, 43, 39, 40, 41, 51, 52, 53, 58, 59, 60, 61, 62, 63, 64, + 65, 66, 67, 68, 87, 88, 99, 70,119,110,102,104,111,107,109,106, + 105,108,103, 69, 98, 55, 74, 78, 96, 79, 80, 81, 75, 76, 77, 71, + 72, 73, 82, 83, 86,127,116,117,183,184,185,186,187,188,189,190, + 191,192,193,194,134,138,130,132,128,129,131,137,133,135,136,113, + 115,114, 0, 0, 0,121, 0, 89, 93,124, 92, 94, 95, 0, 0, 0, + 122,123, 90, 91, 85, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 29, 42, 56,125, 97, 54,100,126,164,166,165,163,161,115,114,113, + 150,158,159,128,136,177,178,176,142,152,173,140 +}; + +static unsigned char hidp_mkeyspat[] = { 0x01, 0x01, 0x01, 0x01, 0x01, 0x01 }; + +static struct hidp_session *__hidp_get_session(bdaddr_t *bdaddr) +{ + struct hidp_session *session; + struct list_head *p; + + BT_DBG(""); + + list_for_each(p, &hidp_session_list) { + session = list_entry(p, struct hidp_session, list); + if (!bacmp(bdaddr, &session->bdaddr)) + return session; + } + return NULL; +} + +static void __hidp_link_session(struct hidp_session *session) +{ + __module_get(THIS_MODULE); + list_add(&session->list, &hidp_session_list); +} + +static void __hidp_unlink_session(struct hidp_session *session) +{ + list_del(&session->list); + module_put(THIS_MODULE); +} + +static void __hidp_copy_session(struct hidp_session *session, struct hidp_conninfo *ci) +{ + bacpy(&ci->bdaddr, &session->bdaddr); + + ci->flags = session->flags; + ci->state = session->state; + + ci->vendor = 0x0000; + ci->product = 0x0000; + ci->version = 0x0000; + memset(ci->name, 0, 128); + + if (session->input) { + ci->vendor = session->input->id.vendor; + ci->product = session->input->id.product; + ci->version = session->input->id.version; + if (session->input->name) + strncpy(ci->name, session->input->name, 128); + else + strncpy(ci->name, "HID Boot Device", 128); + } +} + +static int hidp_input_event(struct input_dev *dev, unsigned int type, unsigned int code, int value) +{ + struct hidp_session *session = dev->private; + struct sk_buff *skb; + unsigned char newleds; + + BT_DBG("input %p type %d code %d value %d", dev, type, code, value); + + if (type != EV_LED) + return -1; + + newleds = (!!test_bit(LED_KANA, dev->led) << 3) | + (!!test_bit(LED_COMPOSE, dev->led) << 3) | + (!!test_bit(LED_SCROLLL, dev->led) << 2) | + (!!test_bit(LED_CAPSL, dev->led) << 1) | + (!!test_bit(LED_NUML, dev->led)); + + if (session->leds == newleds) + return 0; + + session->leds = newleds; + + if (!(skb = alloc_skb(3, GFP_ATOMIC))) { + BT_ERR("Can't allocate memory for new frame"); + return -ENOMEM; + } + + *skb_put(skb, 1) = HIDP_TRANS_DATA | HIDP_DATA_RTYPE_OUPUT; + *skb_put(skb, 1) = 0x01; + *skb_put(skb, 1) = newleds; + + skb_queue_tail(&session->intr_transmit, skb); + + hidp_schedule(session); + + return 0; +} + +static void hidp_input_report(struct hidp_session *session, struct sk_buff *skb) +{ + struct input_dev *dev = session->input; + unsigned char *keys = session->keys; + unsigned char *udata = skb->data + 1; + signed char *sdata = 
skb->data + 1; + int i, size = skb->len - 1; + + switch (skb->data[0]) { + case 0x01: /* Keyboard report */ + for (i = 0; i < 8; i++) + input_report_key(dev, hidp_keycode[i + 224], (udata[0] >> i) & 1); + + /* If all the key codes have been set to 0x01, it means + * too many keys were pressed at the same time. */ + if (!memcmp(udata + 2, hidp_mkeyspat, 6)) + break; + + for (i = 2; i < 8; i++) { + if (keys[i] > 3 && memscan(udata + 2, keys[i], 6) == udata + 8) { + if (hidp_keycode[keys[i]]) + input_report_key(dev, hidp_keycode[keys[i]], 0); + else + BT_ERR("Unknown key (scancode %#x) released.", keys[i]); + } + + if (udata[i] > 3 && memscan(keys + 2, udata[i], 6) == keys + 8) { + if (hidp_keycode[udata[i]]) + input_report_key(dev, hidp_keycode[udata[i]], 1); + else + BT_ERR("Unknown key (scancode %#x) pressed.", udata[i]); + } + } + + memcpy(keys, udata, 8); + break; + + case 0x02: /* Mouse report */ + input_report_key(dev, BTN_LEFT, sdata[0] & 0x01); + input_report_key(dev, BTN_RIGHT, sdata[0] & 0x02); + input_report_key(dev, BTN_MIDDLE, sdata[0] & 0x04); + input_report_key(dev, BTN_SIDE, sdata[0] & 0x08); + input_report_key(dev, BTN_EXTRA, sdata[0] & 0x10); + + input_report_rel(dev, REL_X, sdata[1]); + input_report_rel(dev, REL_Y, sdata[2]); + + if (size > 3) + input_report_rel(dev, REL_WHEEL, sdata[3]); + break; + } + + input_sync(dev); +} + +static void hidp_idle_timeout(unsigned long arg) +{ + struct hidp_session *session = (struct hidp_session *) arg; + + atomic_inc(&session->terminate); + hidp_schedule(session); +} + +static inline void hidp_set_timer(struct hidp_session *session) +{ + if (session->idle_to > 0) + mod_timer(&session->timer, jiffies + HZ * session->idle_to); +} + +static inline void hidp_del_timer(struct hidp_session *session) +{ + if (session->idle_to > 0) + del_timer(&session->timer); +} + +static int __hidp_send_ctrl_message(struct hidp_session *session, + unsigned char hdr, unsigned char *data, int size) +{ + struct sk_buff *skb; + + BT_DBG("session %p data %p size %d", session, data, size); + + if (!(skb = alloc_skb(size + 1, GFP_ATOMIC))) { + BT_ERR("Can't allocate memory for new frame"); + return -ENOMEM; + } + + *skb_put(skb, 1) = hdr; + if (data && size > 0) + memcpy(skb_put(skb, size), data, size); + + skb_queue_tail(&session->ctrl_transmit, skb); + + return 0; +} + +static int inline hidp_send_ctrl_message(struct hidp_session *session, + unsigned char hdr, unsigned char *data, int size) +{ + int err; + + err = __hidp_send_ctrl_message(session, hdr, data, size); + + hidp_schedule(session); + + return err; +} + +static inline void hidp_process_handshake(struct hidp_session *session, unsigned char param) +{ + BT_DBG("session %p param 0x%02x", session, param); + + switch (param) { + case HIDP_HSHK_SUCCESSFUL: + /* FIXME: Call into SET_ GET_ handlers here */ + break; + + case HIDP_HSHK_NOT_READY: + case HIDP_HSHK_ERR_INVALID_REPORT_ID: + case HIDP_HSHK_ERR_UNSUPPORTED_REQUEST: + case HIDP_HSHK_ERR_INVALID_PARAMETER: + /* FIXME: Call into SET_ GET_ handlers here */ + break; + + case HIDP_HSHK_ERR_UNKNOWN: + break; + + case HIDP_HSHK_ERR_FATAL: + /* Device requests a reboot, as this is the only way this error + * can be recovered. 
*/ + __hidp_send_ctrl_message(session, + HIDP_TRANS_HID_CONTROL | HIDP_CTRL_SOFT_RESET, NULL, 0); + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); + break; + } +} + +static inline void hidp_process_hid_control(struct hidp_session *session, unsigned char param) +{ + BT_DBG("session %p param 0x%02x", session, param); + + switch (param) { + case HIDP_CTRL_NOP: + break; + + case HIDP_CTRL_VIRTUAL_CABLE_UNPLUG: + /* Flush the transmit queues */ + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + + /* Kill session thread */ + atomic_inc(&session->terminate); + break; + + case HIDP_CTRL_HARD_RESET: + case HIDP_CTRL_SOFT_RESET: + case HIDP_CTRL_SUSPEND: + case HIDP_CTRL_EXIT_SUSPEND: + /* FIXME: We have to parse these and return no error */ + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); + break; + } +} + +static inline void hidp_process_data(struct hidp_session *session, struct sk_buff *skb, unsigned char param) +{ + BT_DBG("session %p skb %p len %d param 0x%02x", session, skb, skb->len, param); + + switch (param) { + case HIDP_DATA_RTYPE_INPUT: + hidp_set_timer(session); + + if (session->input) + hidp_input_report(session, skb); + break; + + case HIDP_DATA_RTYPE_OTHER: + case HIDP_DATA_RTYPE_OUPUT: + case HIDP_DATA_RTYPE_FEATURE: + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_INVALID_PARAMETER, NULL, 0); + } +} + +static inline void hidp_recv_ctrl_frame(struct hidp_session *session, struct sk_buff *skb) +{ + unsigned char hdr, type, param; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + hdr = skb->data[0]; + skb_pull(skb, 1); + + type = hdr & HIDP_HEADER_TRANS_MASK; + param = hdr & HIDP_HEADER_PARAM_MASK; + + switch (type) { + case HIDP_TRANS_HANDSHAKE: + hidp_process_handshake(session, param); + break; + + case HIDP_TRANS_HID_CONTROL: + hidp_process_hid_control(session, param); + break; + + case HIDP_TRANS_DATA: + hidp_process_data(session, skb, param); + break; + + default: + __hidp_send_ctrl_message(session, + HIDP_TRANS_HANDSHAKE | HIDP_HSHK_ERR_UNSUPPORTED_REQUEST, NULL, 0); + break; + } + + kfree_skb(skb); +} + +static inline void hidp_recv_intr_frame(struct hidp_session *session, struct sk_buff *skb) +{ + unsigned char hdr; + + BT_DBG("session %p skb %p len %d", session, skb, skb->len); + + hdr = skb->data[0]; + skb_pull(skb, 1); + + if (hdr == (HIDP_TRANS_DATA | HIDP_DATA_RTYPE_INPUT)) { + hidp_set_timer(session); + if (session->input) + hidp_input_report(session, skb); + } else { + BT_DBG("Unsupported protocol header 0x%02x", hdr); + } + + kfree_skb(skb); +} + +static int hidp_send_frame(struct socket *sock, unsigned char *data, int len) +{ + struct kvec iv = { data, len }; + struct msghdr msg; + + BT_DBG("sock %p data %p len %d", sock, data, len); + + if (!len) + return 0; + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, &iv, 1, len); +} + +static int hidp_process_transmit(struct hidp_session *session) +{ + struct sk_buff *skb; + + BT_DBG("session %p", session); + + while ((skb = skb_dequeue(&session->ctrl_transmit))) { + if (hidp_send_frame(session->ctrl_sock, skb->data, skb->len) < 0) { + skb_queue_head(&session->ctrl_transmit, skb); + break; + } + + hidp_set_timer(session); + kfree_skb(skb); + } + + while ((skb = skb_dequeue(&session->intr_transmit))) { + if (hidp_send_frame(session->intr_sock, 
skb->data, skb->len) < 0) { + skb_queue_head(&session->intr_transmit, skb); + break; + } + + hidp_set_timer(session); + kfree_skb(skb); + } + + return skb_queue_len(&session->ctrl_transmit) + + skb_queue_len(&session->intr_transmit); +} + +static int hidp_session(void *arg) +{ + struct hidp_session *session = arg; + struct sock *ctrl_sk = session->ctrl_sock->sk; + struct sock *intr_sk = session->intr_sock->sk; + struct sk_buff *skb; + int vendor = 0x0000, product = 0x0000; + wait_queue_t ctrl_wait, intr_wait; + + BT_DBG("session %p", session); + + if (session->input) { + vendor = session->input->id.vendor; + product = session->input->id.product; + } + + daemonize("khidpd_%04x%04x", vendor, product); + set_user_nice(current, -15); + current->flags |= PF_NOFREEZE; + + init_waitqueue_entry(&ctrl_wait, current); + init_waitqueue_entry(&intr_wait, current); + add_wait_queue(ctrl_sk->sk_sleep, &ctrl_wait); + add_wait_queue(intr_sk->sk_sleep, &intr_wait); + while (!atomic_read(&session->terminate)) { + set_current_state(TASK_INTERRUPTIBLE); + + if (ctrl_sk->sk_state != BT_CONNECTED || intr_sk->sk_state != BT_CONNECTED) + break; + + while ((skb = skb_dequeue(&ctrl_sk->sk_receive_queue))) { + skb_orphan(skb); + hidp_recv_ctrl_frame(session, skb); + } + + while ((skb = skb_dequeue(&intr_sk->sk_receive_queue))) { + skb_orphan(skb); + hidp_recv_intr_frame(session, skb); + } + + hidp_process_transmit(session); + + schedule(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(intr_sk->sk_sleep, &intr_wait); + remove_wait_queue(ctrl_sk->sk_sleep, &ctrl_wait); + + down_write(&hidp_session_sem); + + hidp_del_timer(session); + + if (intr_sk->sk_state != BT_CONNECTED) + wait_event_timeout(*(ctrl_sk->sk_sleep), (ctrl_sk->sk_state == BT_CLOSED), HZ); + + fput(session->ctrl_sock->file); + + wait_event_timeout(*(intr_sk->sk_sleep), (intr_sk->sk_state == BT_CLOSED), HZ); + + fput(session->intr_sock->file); + + __hidp_unlink_session(session); + + if (session->input) { + input_unregister_device(session->input); + kfree(session->input); + } + + up_write(&hidp_session_sem); + + kfree(session); + return 0; +} + +static inline void hidp_setup_input(struct hidp_session *session, struct hidp_connadd_req *req) +{ + struct input_dev *input = session->input; + int i; + + input->private = session; + + input->id.bustype = BUS_BLUETOOTH; + input->id.vendor = req->vendor; + input->id.product = req->product; + input->id.version = req->version; + + if (req->subclass & 0x40) { + set_bit(EV_KEY, input->evbit); + set_bit(EV_LED, input->evbit); + set_bit(EV_REP, input->evbit); + + set_bit(LED_NUML, input->ledbit); + set_bit(LED_CAPSL, input->ledbit); + set_bit(LED_SCROLLL, input->ledbit); + set_bit(LED_COMPOSE, input->ledbit); + set_bit(LED_KANA, input->ledbit); + + for (i = 0; i < sizeof(hidp_keycode); i++) + set_bit(hidp_keycode[i], input->keybit); + clear_bit(0, input->keybit); + } + + if (req->subclass & 0x80) { + input->evbit[0] = BIT(EV_KEY) | BIT(EV_REL); + input->keybit[LONG(BTN_MOUSE)] = BIT(BTN_LEFT) | BIT(BTN_RIGHT) | BIT(BTN_MIDDLE); + input->relbit[0] = BIT(REL_X) | BIT(REL_Y); + input->keybit[LONG(BTN_MOUSE)] |= BIT(BTN_SIDE) | BIT(BTN_EXTRA); + input->relbit[0] |= BIT(REL_WHEEL); + } + + input->event = hidp_input_event; + + input_register_device(input); +} + +int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock) +{ + struct hidp_session *session, *s; + int err; + + BT_DBG(""); + + if (bacmp(&bt_sk(ctrl_sock->sk)->src, &bt_sk(intr_sock->sk)->src) || + 
bacmp(&bt_sk(ctrl_sock->sk)->dst, &bt_sk(intr_sock->sk)->dst)) + return -ENOTUNIQ; + + session = kmalloc(sizeof(struct hidp_session), GFP_KERNEL); + if (!session) + return -ENOMEM; + memset(session, 0, sizeof(struct hidp_session)); + + session->input = kmalloc(sizeof(struct input_dev), GFP_KERNEL); + if (!session->input) { + kfree(session); + return -ENOMEM; + } + memset(session->input, 0, sizeof(struct input_dev)); + + down_write(&hidp_session_sem); + + s = __hidp_get_session(&bt_sk(ctrl_sock->sk)->dst); + if (s && s->state == BT_CONNECTED) { + err = -EEXIST; + goto failed; + } + + bacpy(&session->bdaddr, &bt_sk(ctrl_sock->sk)->dst); + + session->ctrl_mtu = min_t(uint, l2cap_pi(ctrl_sock->sk)->omtu, l2cap_pi(ctrl_sock->sk)->imtu); + session->intr_mtu = min_t(uint, l2cap_pi(intr_sock->sk)->omtu, l2cap_pi(intr_sock->sk)->imtu); + + BT_DBG("ctrl mtu %d intr mtu %d", session->ctrl_mtu, session->intr_mtu); + + session->ctrl_sock = ctrl_sock; + session->intr_sock = intr_sock; + session->state = BT_CONNECTED; + + init_timer(&session->timer); + + session->timer.function = hidp_idle_timeout; + session->timer.data = (unsigned long) session; + + skb_queue_head_init(&session->ctrl_transmit); + skb_queue_head_init(&session->intr_transmit); + + session->flags = req->flags & (1 << HIDP_BLUETOOTH_VENDOR_ID); + session->idle_to = req->idle_to; + + if (session->input) + hidp_setup_input(session, req); + + __hidp_link_session(session); + + hidp_set_timer(session); + + err = kernel_thread(hidp_session, session, CLONE_KERNEL); + if (err < 0) + goto unlink; + + if (session->input) { + hidp_send_ctrl_message(session, + HIDP_TRANS_SET_PROTOCOL | HIDP_PROTO_BOOT, NULL, 0); + session->flags |= (1 << HIDP_BOOT_PROTOCOL_MODE); + + session->leds = 0xff; + hidp_input_event(session->input, EV_LED, 0, 0); + } + + up_write(&hidp_session_sem); + return 0; + +unlink: + hidp_del_timer(session); + + __hidp_unlink_session(session); + + if (session->input) + input_unregister_device(session->input); + +failed: + up_write(&hidp_session_sem); + + if (session->input) + kfree(session->input); + + kfree(session); + return err; +} + +int hidp_del_connection(struct hidp_conndel_req *req) +{ + struct hidp_session *session; + int err = 0; + + BT_DBG(""); + + down_read(&hidp_session_sem); + + session = __hidp_get_session(&req->bdaddr); + if (session) { + if (req->flags & (1 << HIDP_VIRTUAL_CABLE_UNPLUG)) { + hidp_send_ctrl_message(session, + HIDP_TRANS_HID_CONTROL | HIDP_CTRL_VIRTUAL_CABLE_UNPLUG, NULL, 0); + } else { + /* Flush the transmit queues */ + skb_queue_purge(&session->ctrl_transmit); + skb_queue_purge(&session->intr_transmit); + + /* Kill session thread */ + atomic_inc(&session->terminate); + hidp_schedule(session); + } + } else + err = -ENOENT; + + up_read(&hidp_session_sem); + return err; +} + +int hidp_get_connlist(struct hidp_connlist_req *req) +{ + struct list_head *p; + int err = 0, n = 0; + + BT_DBG(""); + + down_read(&hidp_session_sem); + + list_for_each(p, &hidp_session_list) { + struct hidp_session *session; + struct hidp_conninfo ci; + + session = list_entry(p, struct hidp_session, list); + + __hidp_copy_session(session, &ci); + + if (copy_to_user(req->ci, &ci, sizeof(ci))) { + err = -EFAULT; + break; + } + + if (++n >= req->cnum) + break; + + req->ci++; + } + req->cnum = n; + + up_read(&hidp_session_sem); + return err; +} + +int hidp_get_conninfo(struct hidp_conninfo *ci) +{ + struct hidp_session *session; + int err = 0; + + down_read(&hidp_session_sem); + + session = __hidp_get_session(&ci->bdaddr); + if (session) 
+ __hidp_copy_session(session, ci); + else + err = -ENOENT; + + up_read(&hidp_session_sem); + return err; +} + +static int __init hidp_init(void) +{ + l2cap_load(); + + BT_INFO("HIDP (Human Interface Emulation) ver %s", VERSION); + + return hidp_init_sockets(); +} + +static void __exit hidp_exit(void) +{ + hidp_cleanup_sockets(); +} + +module_init(hidp_init); +module_exit(hidp_exit); + +MODULE_AUTHOR("Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth HIDP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-6"); diff --git a/net/bluetooth/hidp/hidp.h b/net/bluetooth/hidp/hidp.h new file mode 100644 index 000000000000..c2775f587d2e --- /dev/null +++ b/net/bluetooth/hidp/hidp.h @@ -0,0 +1,167 @@ +/* + HIDP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2003-2004 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. 
+*/ + +#ifndef __HIDP_H +#define __HIDP_H + +#include +#include + +/* HIDP header masks */ +#define HIDP_HEADER_TRANS_MASK 0xf0 +#define HIDP_HEADER_PARAM_MASK 0x0f + +/* HIDP transaction types */ +#define HIDP_TRANS_HANDSHAKE 0x00 +#define HIDP_TRANS_HID_CONTROL 0x10 +#define HIDP_TRANS_GET_REPORT 0x40 +#define HIDP_TRANS_SET_REPORT 0x50 +#define HIDP_TRANS_GET_PROTOCOL 0x60 +#define HIDP_TRANS_SET_PROTOCOL 0x70 +#define HIDP_TRANS_GET_IDLE 0x80 +#define HIDP_TRANS_SET_IDLE 0x90 +#define HIDP_TRANS_DATA 0xa0 +#define HIDP_TRANS_DATC 0xb0 + +/* HIDP handshake results */ +#define HIDP_HSHK_SUCCESSFUL 0x00 +#define HIDP_HSHK_NOT_READY 0x01 +#define HIDP_HSHK_ERR_INVALID_REPORT_ID 0x02 +#define HIDP_HSHK_ERR_UNSUPPORTED_REQUEST 0x03 +#define HIDP_HSHK_ERR_INVALID_PARAMETER 0x04 +#define HIDP_HSHK_ERR_UNKNOWN 0x0e +#define HIDP_HSHK_ERR_FATAL 0x0f + +/* HIDP control operation parameters */ +#define HIDP_CTRL_NOP 0x00 +#define HIDP_CTRL_HARD_RESET 0x01 +#define HIDP_CTRL_SOFT_RESET 0x02 +#define HIDP_CTRL_SUSPEND 0x03 +#define HIDP_CTRL_EXIT_SUSPEND 0x04 +#define HIDP_CTRL_VIRTUAL_CABLE_UNPLUG 0x05 + +/* HIDP data transaction headers */ +#define HIDP_DATA_RTYPE_MASK 0x03 +#define HIDP_DATA_RSRVD_MASK 0x0c +#define HIDP_DATA_RTYPE_OTHER 0x00 +#define HIDP_DATA_RTYPE_INPUT 0x01 +#define HIDP_DATA_RTYPE_OUPUT 0x02 +#define HIDP_DATA_RTYPE_FEATURE 0x03 + +/* HIDP protocol header parameters */ +#define HIDP_PROTO_BOOT 0x00 +#define HIDP_PROTO_REPORT 0x01 + +/* HIDP ioctl defines */ +#define HIDPCONNADD _IOW('H', 200, int) +#define HIDPCONNDEL _IOW('H', 201, int) +#define HIDPGETCONNLIST _IOR('H', 210, int) +#define HIDPGETCONNINFO _IOR('H', 211, int) + +#define HIDP_VIRTUAL_CABLE_UNPLUG 0 +#define HIDP_BOOT_PROTOCOL_MODE 1 +#define HIDP_BLUETOOTH_VENDOR_ID 9 + +struct hidp_connadd_req { + int ctrl_sock; // Connected control socket + int intr_sock; // Connected interrupt socket + __u16 parser; + __u16 rd_size; + __u8 *rd_data; + __u8 country; + __u8 subclass; + __u16 vendor; + __u16 product; + __u16 version; + __u32 flags; + __u32 idle_to; + char name[128]; +}; + +struct hidp_conndel_req { + bdaddr_t bdaddr; + __u32 flags; +}; + +struct hidp_conninfo { + bdaddr_t bdaddr; + __u32 flags; + __u16 state; + __u16 vendor; + __u16 product; + __u16 version; + char name[128]; +}; + +struct hidp_connlist_req { + __u32 cnum; + struct hidp_conninfo __user *ci; +}; + +int hidp_add_connection(struct hidp_connadd_req *req, struct socket *ctrl_sock, struct socket *intr_sock); +int hidp_del_connection(struct hidp_conndel_req *req); +int hidp_get_connlist(struct hidp_connlist_req *req); +int hidp_get_conninfo(struct hidp_conninfo *ci); + +/* HIDP session defines */ +struct hidp_session { + struct list_head list; + + struct socket *ctrl_sock; + struct socket *intr_sock; + + bdaddr_t bdaddr; + + unsigned long state; + unsigned long flags; + unsigned long idle_to; + + uint ctrl_mtu; + uint intr_mtu; + + atomic_t terminate; + + unsigned char keys[8]; + unsigned char leds; + + struct input_dev *input; + + struct timer_list timer; + + struct sk_buff_head ctrl_transmit; + struct sk_buff_head intr_transmit; +}; + +static inline void hidp_schedule(struct hidp_session *session) +{ + struct sock *ctrl_sk = session->ctrl_sock->sk; + struct sock *intr_sk = session->intr_sock->sk; + + wake_up_interruptible(ctrl_sk->sk_sleep); + wake_up_interruptible(intr_sk->sk_sleep); +} + +/* HIDP init defines */ +extern int __init hidp_init_sockets(void); +extern void __exit hidp_cleanup_sockets(void); + +#endif /* __HIDP_H */ diff --git 
a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c new file mode 100644 index 000000000000..fabb36d4666b --- /dev/null +++ b/net/bluetooth/hidp/sock.c @@ -0,0 +1,232 @@ +/* + HIDP implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2003-2004 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "hidp.h" + +#ifndef CONFIG_BT_HIDP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +static int hidp_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + BT_DBG("sock %p sk %p", sock, sk); + + if (!sk) + return 0; + + sock_orphan(sk); + sock_put(sk); + + return 0; +} + +static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *) arg; + struct hidp_connadd_req ca; + struct hidp_conndel_req cd; + struct hidp_connlist_req cl; + struct hidp_conninfo ci; + struct socket *csock; + struct socket *isock; + int err; + + BT_DBG("cmd %x arg %lx", cmd, arg); + + switch (cmd) { + case HIDPCONNADD: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&ca, argp, sizeof(ca))) + return -EFAULT; + + csock = sockfd_lookup(ca.ctrl_sock, &err); + if (!csock) + return err; + + isock = sockfd_lookup(ca.intr_sock, &err); + if (!isock) { + fput(csock->file); + return err; + } + + if (csock->sk->sk_state != BT_CONNECTED || isock->sk->sk_state != BT_CONNECTED) { + fput(csock->file); + fput(isock->file); + return -EBADFD; + } + + err = hidp_add_connection(&ca, csock, isock); + if (!err) { + if (copy_to_user(argp, &ca, sizeof(ca))) + err = -EFAULT; + } else { + fput(csock->file); + fput(isock->file); + } + + return err; + + case HIDPCONNDEL: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + + if (copy_from_user(&cd, argp, sizeof(cd))) + return -EFAULT; + + return hidp_del_connection(&cd); + + case HIDPGETCONNLIST: + if (copy_from_user(&cl, argp, sizeof(cl))) + return -EFAULT; + + if (cl.cnum <= 0) + return -EINVAL; + + err = hidp_get_connlist(&cl); + if (!err && copy_to_user(argp, &cl, sizeof(cl))) + return -EFAULT; + + return err; + + case HIDPGETCONNINFO: + if (copy_from_user(&ci, argp, sizeof(ci))) + return -EFAULT; + + err = hidp_get_conninfo(&ci); + if (!err && copy_to_user(argp, &ci, sizeof(ci))) + return -EFAULT; + + return err; + } + + return -EINVAL; +} + +static struct proto_ops hidp_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = hidp_sock_release, + .ioctl = hidp_sock_ioctl, + .bind = sock_no_bind, + 
.getname = sock_no_getname, + .sendmsg = sock_no_sendmsg, + .recvmsg = sock_no_recvmsg, + .poll = sock_no_poll, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .mmap = sock_no_mmap +}; + +static struct proto hidp_proto = { + .name = "HIDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct bt_sock) +}; + +static int hidp_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sk = sk_alloc(PF_BLUETOOTH, GFP_KERNEL, &hidp_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + sock->ops = &hidp_sock_ops; + + sock->state = SS_UNCONNECTED; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = protocol; + sk->sk_state = BT_OPEN; + + return 0; +} + +static struct net_proto_family hidp_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = hidp_sock_create +}; + +int __init hidp_init_sockets(void) +{ + int err; + + err = proto_register(&hidp_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_HIDP, &hidp_sock_family_ops); + if (err < 0) + goto error; + + return 0; + +error: + BT_ERR("Can't register HIDP socket"); + proto_unregister(&hidp_proto); + return err; +} + +void __exit hidp_cleanup_sockets(void) +{ + if (bt_sock_unregister(BTPROTO_HIDP) < 0) + BT_ERR("Can't unregister HIDP socket"); + + proto_unregister(&hidp_proto); +} diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c new file mode 100644 index 000000000000..c12babcf0b3c --- /dev/null +++ b/net/bluetooth/l2cap.c @@ -0,0 +1,2329 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth L2CAP core and sockets. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#ifndef CONFIG_BT_L2CAP_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +#define VERSION "2.7" + +static struct proto_ops l2cap_sock_ops; + +static struct bt_sock_list l2cap_sk_list = { + .lock = RW_LOCK_UNLOCKED +}; + +static int l2cap_conn_del(struct hci_conn *conn, int err); + +static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct sock *parent); +static void l2cap_chan_del(struct sock *sk, int err); + +static void __l2cap_sock_close(struct sock *sk, int reason); +static void l2cap_sock_close(struct sock *sk); +static void l2cap_sock_kill(struct sock *sk); + +static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, + u8 code, u8 ident, u16 dlen, void *data); + +/* ---- L2CAP timers ---- */ +static void l2cap_sock_timeout(unsigned long arg) +{ + struct sock *sk = (struct sock *) arg; + + BT_DBG("sock %p state %d", sk, sk->sk_state); + + bh_lock_sock(sk); + __l2cap_sock_close(sk, ETIMEDOUT); + bh_unlock_sock(sk); + + l2cap_sock_kill(sk); + sock_put(sk); +} + +static void l2cap_sock_set_timer(struct sock *sk, long timeout) +{ + BT_DBG("sk %p state %d timeout %ld", sk, sk->sk_state, timeout); + sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); +} + +static void l2cap_sock_clear_timer(struct sock *sk) +{ + BT_DBG("sock %p state %d", sk, sk->sk_state); + sk_stop_timer(sk, &sk->sk_timer); +} + +static void l2cap_sock_init_timer(struct sock *sk) +{ + init_timer(&sk->sk_timer); + sk->sk_timer.function = l2cap_sock_timeout; + sk->sk_timer.data = (unsigned long)sk; +} + +/* ---- L2CAP connections ---- */ +static struct l2cap_conn *l2cap_conn_add(struct hci_conn *hcon, u8 status) +{ + struct l2cap_conn *conn; + + if ((conn = hcon->l2cap_data)) + return conn; + + if (status) + return conn; + + if (!(conn = kmalloc(sizeof(struct l2cap_conn), GFP_ATOMIC))) + return NULL; + memset(conn, 0, sizeof(struct l2cap_conn)); + + hcon->l2cap_data = conn; + conn->hcon = hcon; + + conn->mtu = hcon->hdev->acl_mtu; + conn->src = &hcon->hdev->bdaddr; + conn->dst = &hcon->dst; + + spin_lock_init(&conn->lock); + rwlock_init(&conn->chan_list.lock); + + BT_DBG("hcon %p conn %p", hcon, conn); + return conn; +} + +static int l2cap_conn_del(struct hci_conn *hcon, int err) +{ + struct l2cap_conn *conn; + struct sock *sk; + + if (!(conn = hcon->l2cap_data)) + return 0; + + BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + + if (conn->rx_skb) + kfree_skb(conn->rx_skb); + + /* Kill channels */ + while ((sk = conn->chan_list.head)) { + bh_lock_sock(sk); + l2cap_chan_del(sk, err); + bh_unlock_sock(sk); + l2cap_sock_kill(sk); + } + + hcon->l2cap_data = NULL; + kfree(conn); + return 0; +} + +static inline void l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct sock *parent) +{ + struct l2cap_chan_list *l = &conn->chan_list; + write_lock(&l->lock); + __l2cap_chan_add(conn, sk, parent); + write_unlock(&l->lock); +} + +static inline u8 l2cap_get_ident(struct l2cap_conn *conn) +{ + u8 id; + + /* Get next available identifier. + * 1 - 128 are used by kernel. + * 129 - 199 are reserved. + * 200 - 254 are used by utilities like l2ping, etc. 
+ */ + + spin_lock(&conn->lock); + + if (++conn->tx_ident > 128) + conn->tx_ident = 1; + + id = conn->tx_ident; + + spin_unlock(&conn->lock); + + return id; +} + +static inline int l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data) +{ + struct sk_buff *skb = l2cap_build_cmd(conn, code, ident, len, data); + + BT_DBG("code 0x%2.2x", code); + + if (!skb) + return -ENOMEM; + + return hci_send_acl(conn->hcon, skb, 0); +} + +/* ---- Socket interface ---- */ +static struct sock *__l2cap_get_sock_by_addr(u16 psm, bdaddr_t *src) +{ + struct sock *sk; + struct hlist_node *node; + sk_for_each(sk, node, &l2cap_sk_list.head) + if (l2cap_pi(sk)->sport == psm && !bacmp(&bt_sk(sk)->src, src)) + goto found; + sk = NULL; +found: + return sk; +} + +/* Find socket with psm and source bdaddr. + * Returns closest match. + */ +static struct sock *__l2cap_get_sock_by_psm(int state, u16 psm, bdaddr_t *src) +{ + struct sock *sk = NULL, *sk1 = NULL; + struct hlist_node *node; + + sk_for_each(sk, node, &l2cap_sk_list.head) { + if (state && sk->sk_state != state) + continue; + + if (l2cap_pi(sk)->psm == psm) { + /* Exact match. */ + if (!bacmp(&bt_sk(sk)->src, src)) + break; + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + sk1 = sk; + } + } + return node ? sk : sk1; +} + +/* Find socket with given address (psm, src). + * Returns locked socket */ +static inline struct sock *l2cap_get_sock_by_psm(int state, u16 psm, bdaddr_t *src) +{ + struct sock *s; + read_lock(&l2cap_sk_list.lock); + s = __l2cap_get_sock_by_psm(state, psm, src); + if (s) bh_lock_sock(s); + read_unlock(&l2cap_sk_list.lock); + return s; +} + +static void l2cap_sock_destruct(struct sock *sk) +{ + BT_DBG("sk %p", sk); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); +} + +static void l2cap_sock_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + BT_DBG("parent %p", parent); + + /* Close not yet accepted channels */ + while ((sk = bt_accept_dequeue(parent, NULL))) + l2cap_sock_close(sk); + + parent->sk_state = BT_CLOSED; + sock_set_flag(parent, SOCK_ZAPPED); +} + +/* Kill socket (only if zapped and orphan) + * Must be called on unlocked socket. + */ +static void l2cap_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + BT_DBG("sk %p state %d", sk, sk->sk_state); + + /* Kill poor orphan */ + bt_sock_unlink(&l2cap_sk_list, sk); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +static void __l2cap_sock_close(struct sock *sk, int reason) +{ + BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); + + switch (sk->sk_state) { + case BT_LISTEN: + l2cap_sock_cleanup_listen(sk); + break; + + case BT_CONNECTED: + case BT_CONFIG: + case BT_CONNECT2: + if (sk->sk_type == SOCK_SEQPACKET) { + struct l2cap_conn *conn = l2cap_pi(sk)->conn; + struct l2cap_disconn_req req; + + sk->sk_state = BT_DISCONN; + l2cap_sock_set_timer(sk, sk->sk_sndtimeo); + + req.dcid = __cpu_to_le16(l2cap_pi(sk)->dcid); + req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + l2cap_send_cmd(conn, l2cap_get_ident(conn), + L2CAP_DISCONN_REQ, sizeof(req), &req); + } else { + l2cap_chan_del(sk, reason); + } + break; + + case BT_CONNECT: + case BT_DISCONN: + l2cap_chan_del(sk, reason); + break; + + default: + sock_set_flag(sk, SOCK_ZAPPED); + break; + } +} + +/* Must be called on unlocked socket. 
*/ +static void l2cap_sock_close(struct sock *sk) +{ + l2cap_sock_clear_timer(sk); + lock_sock(sk); + __l2cap_sock_close(sk, ECONNRESET); + release_sock(sk); + l2cap_sock_kill(sk); +} + +static void l2cap_sock_init(struct sock *sk, struct sock *parent) +{ + struct l2cap_pinfo *pi = l2cap_pi(sk); + + BT_DBG("sk %p", sk); + + if (parent) { + sk->sk_type = parent->sk_type; + pi->imtu = l2cap_pi(parent)->imtu; + pi->omtu = l2cap_pi(parent)->omtu; + pi->link_mode = l2cap_pi(parent)->link_mode; + } else { + pi->imtu = L2CAP_DEFAULT_MTU; + pi->omtu = 0; + pi->link_mode = 0; + } + + /* Default config options */ + pi->conf_mtu = L2CAP_DEFAULT_MTU; + pi->flush_to = L2CAP_DEFAULT_FLUSH_TO; +} + +static struct proto l2cap_proto = { + .name = "L2CAP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct l2cap_pinfo) +}; + +static struct sock *l2cap_sock_alloc(struct socket *sock, int proto, int prio) +{ + struct sock *sk; + + sk = sk_alloc(PF_BLUETOOTH, prio, &l2cap_proto, 1); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + sk->sk_destruct = l2cap_sock_destruct; + sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + l2cap_sock_init_timer(sk); + + bt_sock_link(&l2cap_sk_list, sk); + return sk; +} + +static int l2cap_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_SEQPACKET && + sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + if (sock->type == SOCK_RAW && !capable(CAP_NET_RAW)) + return -EPERM; + + sock->ops = &l2cap_sock_ops; + + sk = l2cap_sock_alloc(sock, protocol, GFP_KERNEL); + if (!sk) + return -ENOMEM; + + l2cap_sock_init(sk, NULL); + return 0; +} + +static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p, %s %d", sk, batostr(&la->l2_bdaddr), la->l2_psm); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN) { + err = -EBADFD; + goto done; + } + + write_lock_bh(&l2cap_sk_list.lock); + + if (la->l2_psm && __l2cap_get_sock_by_addr(la->l2_psm, &la->l2_bdaddr)) { + err = -EADDRINUSE; + } else { + /* Save source address */ + bacpy(&bt_sk(sk)->src, &la->l2_bdaddr); + l2cap_pi(sk)->psm = la->l2_psm; + l2cap_pi(sk)->sport = la->l2_psm; + sk->sk_state = BT_BOUND; + } + + write_unlock_bh(&l2cap_sk_list.lock); + +done: + release_sock(sk); + return err; +} + +static int l2cap_do_connect(struct sock *sk) +{ + bdaddr_t *src = &bt_sk(sk)->src; + bdaddr_t *dst = &bt_sk(sk)->dst; + struct l2cap_conn *conn; + struct hci_conn *hcon; + struct hci_dev *hdev; + int err = 0; + + BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm); + + if (!(hdev = hci_get_route(dst, src))) + return -EHOSTUNREACH; + + hci_dev_lock_bh(hdev); + + err = -ENOMEM; + + hcon = hci_connect(hdev, ACL_LINK, dst); + if (!hcon) + goto done; + + conn = l2cap_conn_add(hcon, 0); + if (!conn) { + hci_conn_put(hcon); + goto done; + } + + err = 0; + + /* Update source addr of the socket */ + bacpy(src, conn->src); + + l2cap_chan_add(conn, sk, NULL); + + sk->sk_state = BT_CONNECT; + l2cap_sock_set_timer(sk, sk->sk_sndtimeo); + + if (hcon->state == BT_CONNECTED) { + if (sk->sk_type == SOCK_SEQPACKET) { + struct l2cap_conn_req req; + 
l2cap_pi(sk)->ident = l2cap_get_ident(conn); + req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + req.psm = l2cap_pi(sk)->psm; + l2cap_send_cmd(conn, l2cap_pi(sk)->ident, + L2CAP_CONN_REQ, sizeof(req), &req); + } else { + l2cap_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + } + } + +done: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + return err; +} + +static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +{ + struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; + struct sock *sk = sock->sk; + int err = 0; + + lock_sock(sk); + + BT_DBG("sk %p", sk); + + if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_l2)) { + err = -EINVAL; + goto done; + } + + if (sk->sk_type == SOCK_SEQPACKET && !la->l2_psm) { + err = -EINVAL; + goto done; + } + + switch(sk->sk_state) { + case BT_CONNECT: + case BT_CONNECT2: + case BT_CONFIG: + /* Already connecting */ + goto wait; + + case BT_CONNECTED: + /* Already connected */ + goto done; + + case BT_OPEN: + case BT_BOUND: + /* Can connect */ + break; + + default: + err = -EBADFD; + goto done; + } + + /* Set destination address and psm */ + bacpy(&bt_sk(sk)->dst, &la->l2_bdaddr); + l2cap_pi(sk)->psm = la->l2_psm; + + if ((err = l2cap_do_connect(sk))) + goto done; + +wait: + err = bt_sock_wait_state(sk, BT_CONNECTED, + sock_sndtimeo(sk, flags & O_NONBLOCK)); +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p backlog %d", sk, backlog); + + lock_sock(sk); + + if (sk->sk_state != BT_BOUND || sock->type != SOCK_SEQPACKET) { + err = -EBADFD; + goto done; + } + + if (!l2cap_pi(sk)->psm) { + bdaddr_t *src = &bt_sk(sk)->src; + u16 psm; + + err = -EINVAL; + + write_lock_bh(&l2cap_sk_list.lock); + + for (psm = 0x1001; psm < 0x1100; psm += 2) + if (!__l2cap_get_sock_by_addr(psm, src)) { + l2cap_pi(sk)->psm = htobs(psm); + l2cap_pi(sk)->sport = htobs(psm); + err = 0; + break; + } + + write_unlock_bh(&l2cap_sk_list.lock); + + if (err < 0) + goto done; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = BT_LISTEN; + +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *nsk; + long timeo; + int err = 0; + + lock_sock(sk); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + goto done; + } + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + BT_DBG("sk %p timeo %ld", sk, timeo); + + /* Wait for an incoming connection. (wake-one). 
*/ + add_wait_queue_exclusive(sk->sk_sleep, &wait); + while (!(nsk = bt_accept_dequeue(sk, newsock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + err = -EAGAIN; + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + + BT_DBG("new socket %p", nsk); + +done: + release_sock(sk); + return err; +} + +static int l2cap_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +{ + struct sockaddr_l2 *la = (struct sockaddr_l2 *) addr; + struct sock *sk = sock->sk; + + BT_DBG("sock %p, sk %p", sock, sk); + + addr->sa_family = AF_BLUETOOTH; + *len = sizeof(struct sockaddr_l2); + + if (peer) + bacpy(&la->l2_bdaddr, &bt_sk(sk)->dst); + else + bacpy(&la->l2_bdaddr, &bt_sk(sk)->src); + + la->l2_psm = l2cap_pi(sk)->psm; + return 0; +} + +static inline int l2cap_do_send(struct sock *sk, struct msghdr *msg, int len) +{ + struct l2cap_conn *conn = l2cap_pi(sk)->conn; + struct sk_buff *skb, **frag; + int err, hlen, count, sent=0; + struct l2cap_hdr *lh; + + BT_DBG("sk %p len %d", sk, len); + + /* First fragment (with L2CAP header) */ + if (sk->sk_type == SOCK_DGRAM) + hlen = L2CAP_HDR_SIZE + 2; + else + hlen = L2CAP_HDR_SIZE; + + count = min_t(unsigned int, (conn->mtu - hlen), len); + + skb = bt_skb_send_alloc(sk, hlen + count, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + /* Create L2CAP header */ + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->cid = __cpu_to_le16(l2cap_pi(sk)->dcid); + lh->len = __cpu_to_le16(len + (hlen - L2CAP_HDR_SIZE)); + + if (sk->sk_type == SOCK_DGRAM) + put_unaligned(l2cap_pi(sk)->psm, (u16 *) skb_put(skb, 2)); + + if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) { + err = -EFAULT; + goto fail; + } + + sent += count; + len -= count; + + /* Continuation fragments (no L2CAP header) */ + frag = &skb_shinfo(skb)->frag_list; + while (len) { + count = min_t(unsigned int, conn->mtu, len); + + *frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err); + if (!*frag) + goto fail; + + if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count)) { + err = -EFAULT; + goto fail; + } + + sent += count; + len -= count; + + frag = &(*frag)->next; + } + + if ((err = hci_send_acl(conn->hcon, skb, 0)) < 0) + goto fail; + + return sent; + +fail: + kfree_skb(skb); + return err; +} + +static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (sk->sk_err) + return sock_error(sk); + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* Check outgoing MTU */ + if (sk->sk_type != SOCK_RAW && len > l2cap_pi(sk)->omtu) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state == BT_CONNECTED) + err = l2cap_do_send(sk, msg, len); + else + err = -ENOTCONN; + + release_sock(sk); + return err; +} + +static int l2cap_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct l2cap_options opts; + int err = 0, len; + u32 opt; + + BT_DBG("sk %p", sk); + + lock_sock(sk); + + switch (optname) { + case L2CAP_OPTIONS: + len = min_t(unsigned int, sizeof(opts), optlen); + if 
(copy_from_user((char *) &opts, optval, len)) { + err = -EFAULT; + break; + } + l2cap_pi(sk)->imtu = opts.imtu; + l2cap_pi(sk)->omtu = opts.omtu; + break; + + case L2CAP_LM: + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + l2cap_pi(sk)->link_mode = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct l2cap_options opts; + struct l2cap_conninfo cinfo; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case L2CAP_OPTIONS: + opts.imtu = l2cap_pi(sk)->imtu; + opts.omtu = l2cap_pi(sk)->omtu; + opts.flush_to = l2cap_pi(sk)->flush_to; + opts.mode = 0x00; + + len = min_t(unsigned int, len, sizeof(opts)); + if (copy_to_user(optval, (char *) &opts, len)) + err = -EFAULT; + + break; + + case L2CAP_LM: + if (put_user(l2cap_pi(sk)->link_mode, (u32 __user *) optval)) + err = -EFAULT; + break; + + case L2CAP_CONNINFO: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + cinfo.hci_handle = l2cap_pi(sk)->conn->hcon->handle; + memcpy(cinfo.dev_class, l2cap_pi(sk)->conn->hcon->dev_class, 3); + + len = min_t(unsigned int, len, sizeof(cinfo)); + if (copy_to_user(optval, (char *) &cinfo, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int l2cap_sock_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + lock_sock(sk); + if (!sk->sk_shutdown) { + sk->sk_shutdown = SHUTDOWN_MASK; + l2cap_sock_clear_timer(sk); + __l2cap_sock_close(sk, 0); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + } + release_sock(sk); + return err; +} + +static int l2cap_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + err = l2cap_sock_shutdown(sock, 2); + + sock_orphan(sk); + l2cap_sock_kill(sk); + return err; +} + +/* ---- L2CAP channels ---- */ +static struct sock *__l2cap_get_chan_by_dcid(struct l2cap_chan_list *l, u16 cid) +{ + struct sock *s; + for (s = l->head; s; s = l2cap_pi(s)->next_c) { + if (l2cap_pi(s)->dcid == cid) + break; + } + return s; +} + +static struct sock *__l2cap_get_chan_by_scid(struct l2cap_chan_list *l, u16 cid) +{ + struct sock *s; + for (s = l->head; s; s = l2cap_pi(s)->next_c) { + if (l2cap_pi(s)->scid == cid) + break; + } + return s; +} + +/* Find channel with given SCID. 
+ * Returns locked socket */ +static inline struct sock *l2cap_get_chan_by_scid(struct l2cap_chan_list *l, u16 cid) +{ + struct sock *s; + read_lock(&l->lock); + s = __l2cap_get_chan_by_scid(l, cid); + if (s) bh_lock_sock(s); + read_unlock(&l->lock); + return s; +} + +static struct sock *__l2cap_get_chan_by_ident(struct l2cap_chan_list *l, u8 ident) +{ + struct sock *s; + for (s = l->head; s; s = l2cap_pi(s)->next_c) { + if (l2cap_pi(s)->ident == ident) + break; + } + return s; +} + +static inline struct sock *l2cap_get_chan_by_ident(struct l2cap_chan_list *l, u8 ident) +{ + struct sock *s; + read_lock(&l->lock); + s = __l2cap_get_chan_by_ident(l, ident); + if (s) bh_lock_sock(s); + read_unlock(&l->lock); + return s; +} + +static u16 l2cap_alloc_cid(struct l2cap_chan_list *l) +{ + u16 cid = 0x0040; + + for (; cid < 0xffff; cid++) { + if(!__l2cap_get_chan_by_scid(l, cid)) + return cid; + } + + return 0; +} + +static inline void __l2cap_chan_link(struct l2cap_chan_list *l, struct sock *sk) +{ + sock_hold(sk); + + if (l->head) + l2cap_pi(l->head)->prev_c = sk; + + l2cap_pi(sk)->next_c = l->head; + l2cap_pi(sk)->prev_c = NULL; + l->head = sk; +} + +static inline void l2cap_chan_unlink(struct l2cap_chan_list *l, struct sock *sk) +{ + struct sock *next = l2cap_pi(sk)->next_c, *prev = l2cap_pi(sk)->prev_c; + + write_lock(&l->lock); + if (sk == l->head) + l->head = next; + + if (next) + l2cap_pi(next)->prev_c = prev; + if (prev) + l2cap_pi(prev)->next_c = next; + write_unlock(&l->lock); + + __sock_put(sk); +} + +static void __l2cap_chan_add(struct l2cap_conn *conn, struct sock *sk, struct sock *parent) +{ + struct l2cap_chan_list *l = &conn->chan_list; + + BT_DBG("conn %p, psm 0x%2.2x, dcid 0x%4.4x", conn, l2cap_pi(sk)->psm, l2cap_pi(sk)->dcid); + + l2cap_pi(sk)->conn = conn; + + if (sk->sk_type == SOCK_SEQPACKET) { + /* Alloc CID for connection-oriented socket */ + l2cap_pi(sk)->scid = l2cap_alloc_cid(l); + } else if (sk->sk_type == SOCK_DGRAM) { + /* Connectionless socket */ + l2cap_pi(sk)->scid = 0x0002; + l2cap_pi(sk)->dcid = 0x0002; + l2cap_pi(sk)->omtu = L2CAP_DEFAULT_MTU; + } else { + /* Raw socket can send/recv signalling messages only */ + l2cap_pi(sk)->scid = 0x0001; + l2cap_pi(sk)->dcid = 0x0001; + l2cap_pi(sk)->omtu = L2CAP_DEFAULT_MTU; + } + + __l2cap_chan_link(l, sk); + + if (parent) + bt_accept_enqueue(parent, sk); +} + +/* Delete channel. + * Must be called on the locked socket. 
*/ +static void l2cap_chan_del(struct sock *sk, int err) +{ + struct l2cap_conn *conn = l2cap_pi(sk)->conn; + struct sock *parent = bt_sk(sk)->parent; + + l2cap_sock_clear_timer(sk); + + BT_DBG("sk %p, conn %p, err %d", sk, conn, err); + + if (conn) { + /* Unlink from channel list */ + l2cap_chan_unlink(&conn->chan_list, sk); + l2cap_pi(sk)->conn = NULL; + hci_conn_put(conn->hcon); + } + + sk->sk_state = BT_CLOSED; + sock_set_flag(sk, SOCK_ZAPPED); + + if (err) + sk->sk_err = err; + + if (parent) { + bt_accept_unlink(sk); + parent->sk_data_ready(parent, 0); + } else + sk->sk_state_change(sk); +} + +static void l2cap_conn_ready(struct l2cap_conn *conn) +{ + struct l2cap_chan_list *l = &conn->chan_list; + struct sock *sk; + + BT_DBG("conn %p", conn); + + read_lock(&l->lock); + + for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { + bh_lock_sock(sk); + + if (sk->sk_type != SOCK_SEQPACKET) { + l2cap_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + } else if (sk->sk_state == BT_CONNECT) { + struct l2cap_conn_req req; + l2cap_pi(sk)->ident = l2cap_get_ident(conn); + req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + req.psm = l2cap_pi(sk)->psm; + l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_REQ, sizeof(req), &req); + } + + bh_unlock_sock(sk); + } + + read_unlock(&l->lock); +} + +/* Notify sockets that we cannot guaranty reliability anymore */ +static void l2cap_conn_unreliable(struct l2cap_conn *conn, int err) +{ + struct l2cap_chan_list *l = &conn->chan_list; + struct sock *sk; + + BT_DBG("conn %p", conn); + + read_lock(&l->lock); + for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { + if (l2cap_pi(sk)->link_mode & L2CAP_LM_RELIABLE) + sk->sk_err = err; + } + read_unlock(&l->lock); +} + +static void l2cap_chan_ready(struct sock *sk) +{ + struct sock *parent = bt_sk(sk)->parent; + + BT_DBG("sk %p, parent %p", sk, parent); + + l2cap_pi(sk)->conf_state = 0; + l2cap_sock_clear_timer(sk); + + if (!parent) { + /* Outgoing channel. + * Wake up socket sleeping on connect. + */ + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + } else { + /* Incoming channel. + * Wake up socket sleeping on accept. 
+ */ + parent->sk_data_ready(parent, 0); + } +} + +/* Copy frame to all raw sockets on that connection */ +static void l2cap_raw_recv(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct l2cap_chan_list *l = &conn->chan_list; + struct sk_buff *nskb; + struct sock * sk; + + BT_DBG("conn %p", conn); + + read_lock(&l->lock); + for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { + if (sk->sk_type != SOCK_RAW) + continue; + + /* Don't send frame to the socket it came from */ + if (skb->sk == sk) + continue; + + if (!(nskb = skb_clone(skb, GFP_ATOMIC))) + continue; + + if (sock_queue_rcv_skb(sk, nskb)) + kfree_skb(nskb); + } + read_unlock(&l->lock); +} + +/* ---- L2CAP signalling commands ---- */ +static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, + u8 code, u8 ident, u16 dlen, void *data) +{ + struct sk_buff *skb, **frag; + struct l2cap_cmd_hdr *cmd; + struct l2cap_hdr *lh; + int len, count; + + BT_DBG("conn %p, code 0x%2.2x, ident 0x%2.2x, len %d", conn, code, ident, dlen); + + len = L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE + dlen; + count = min_t(unsigned int, conn->mtu, len); + + skb = bt_skb_alloc(count, GFP_ATOMIC); + if (!skb) + return NULL; + + lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); + lh->len = __cpu_to_le16(L2CAP_CMD_HDR_SIZE + dlen); + lh->cid = __cpu_to_le16(0x0001); + + cmd = (struct l2cap_cmd_hdr *) skb_put(skb, L2CAP_CMD_HDR_SIZE); + cmd->code = code; + cmd->ident = ident; + cmd->len = __cpu_to_le16(dlen); + + if (dlen) { + count -= L2CAP_HDR_SIZE + L2CAP_CMD_HDR_SIZE; + memcpy(skb_put(skb, count), data, count); + data += count; + } + + len -= skb->len; + + /* Continuation fragments (no L2CAP header) */ + frag = &skb_shinfo(skb)->frag_list; + while (len) { + count = min_t(unsigned int, conn->mtu, len); + + *frag = bt_skb_alloc(count, GFP_ATOMIC); + if (!*frag) + goto fail; + + memcpy(skb_put(*frag, count), data, count); + + len -= count; + data += count; + + frag = &(*frag)->next; + } + + return skb; + +fail: + kfree_skb(skb); + return NULL; +} + +static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, unsigned long *val) +{ + struct l2cap_conf_opt *opt = *ptr; + int len; + + len = L2CAP_CONF_OPT_SIZE + opt->len; + *ptr += len; + + *type = opt->type; + *olen = opt->len; + + switch (opt->len) { + case 1: + *val = *((u8 *) opt->val); + break; + + case 2: + *val = __le16_to_cpu(*((u16 *)opt->val)); + break; + + case 4: + *val = __le32_to_cpu(*((u32 *)opt->val)); + break; + + default: + *val = (unsigned long) opt->val; + break; + } + + BT_DBG("type 0x%2.2x len %d val 0x%lx", *type, opt->len, *val); + return len; +} + +static inline void l2cap_parse_conf_req(struct sock *sk, void *data, int len) +{ + int type, hint, olen; + unsigned long val; + void *ptr = data; + + BT_DBG("sk %p len %d", sk, len); + + while (len >= L2CAP_CONF_OPT_SIZE) { + len -= l2cap_get_conf_opt(&ptr, &type, &olen, &val); + + hint = type & 0x80; + type &= 0x7f; + + switch (type) { + case L2CAP_CONF_MTU: + l2cap_pi(sk)->conf_mtu = val; + break; + + case L2CAP_CONF_FLUSH_TO: + l2cap_pi(sk)->flush_to = val; + break; + + case L2CAP_CONF_QOS: + break; + + default: + if (hint) + break; + + /* FIXME: Reject unknown option */ + break; + } + } +} + +static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) +{ + struct l2cap_conf_opt *opt = *ptr; + + BT_DBG("type 0x%2.2x len %d val 0x%lx", type, len, val); + + opt->type = type; + opt->len = len; + + switch (len) { + case 1: + *((u8 *) opt->val) = val; + break; + + case 2: + *((u16 *) opt->val) = 
__cpu_to_le16(val); + break; + + case 4: + *((u32 *) opt->val) = __cpu_to_le32(val); + break; + + default: + memcpy(opt->val, (void *) val, len); + break; + } + + *ptr += L2CAP_CONF_OPT_SIZE + len; +} + +static int l2cap_build_conf_req(struct sock *sk, void *data) +{ + struct l2cap_pinfo *pi = l2cap_pi(sk); + struct l2cap_conf_req *req = data; + void *ptr = req->data; + + BT_DBG("sk %p", sk); + + if (pi->imtu != L2CAP_DEFAULT_MTU) + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, pi->imtu); + + /* FIXME: Need actual value of the flush timeout */ + //if (flush_to != L2CAP_DEFAULT_FLUSH_TO) + // l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, 2, pi->flush_to); + + req->dcid = __cpu_to_le16(pi->dcid); + req->flags = __cpu_to_le16(0); + + return ptr - data; +} + +static inline int l2cap_conf_output(struct sock *sk, void **ptr) +{ + struct l2cap_pinfo *pi = l2cap_pi(sk); + int result = 0; + + /* Configure output options and let the other side know + * which ones we don't like. */ + if (pi->conf_mtu < pi->omtu) { + l2cap_add_conf_opt(ptr, L2CAP_CONF_MTU, 2, pi->omtu); + result = L2CAP_CONF_UNACCEPT; + } else { + pi->omtu = pi->conf_mtu; + } + + BT_DBG("sk %p result %d", sk, result); + return result; +} + +static int l2cap_build_conf_rsp(struct sock *sk, void *data, int *result) +{ + struct l2cap_conf_rsp *rsp = data; + void *ptr = rsp->data; + u16 flags = 0; + + BT_DBG("sk %p complete %d", sk, result ? 1 : 0); + + if (result) + *result = l2cap_conf_output(sk, &ptr); + else + flags = 0x0001; + + rsp->scid = __cpu_to_le16(l2cap_pi(sk)->dcid); + rsp->result = __cpu_to_le16(result ? *result : 0); + rsp->flags = __cpu_to_le16(flags); + + return ptr - data; +} + +static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_chan_list *list = &conn->chan_list; + struct l2cap_conn_req *req = (struct l2cap_conn_req *) data; + struct l2cap_conn_rsp rsp; + struct sock *sk, *parent; + int result = 0, status = 0; + + u16 dcid = 0, scid = __le16_to_cpu(req->scid); + u16 psm = req->psm; + + BT_DBG("psm 0x%2.2x scid 0x%4.4x", psm, scid); + + /* Check if we have socket listening on psm */ + parent = l2cap_get_sock_by_psm(BT_LISTEN, psm, conn->src); + if (!parent) { + result = L2CAP_CR_BAD_PSM; + goto sendresp; + } + + result = L2CAP_CR_NO_MEM; + + /* Check for backlog size */ + if (sk_acceptq_is_full(parent)) { + BT_DBG("backlog full %d", parent->sk_ack_backlog); + goto response; + } + + sk = l2cap_sock_alloc(NULL, BTPROTO_L2CAP, GFP_ATOMIC); + if (!sk) + goto response; + + write_lock(&list->lock); + + /* Check if we already have channel with that dcid */ + if (__l2cap_get_chan_by_dcid(list, scid)) { + write_unlock(&list->lock); + sock_set_flag(sk, SOCK_ZAPPED); + l2cap_sock_kill(sk); + goto response; + } + + hci_conn_hold(conn->hcon); + + l2cap_sock_init(sk, parent); + bacpy(&bt_sk(sk)->src, conn->src); + bacpy(&bt_sk(sk)->dst, conn->dst); + l2cap_pi(sk)->psm = psm; + l2cap_pi(sk)->dcid = scid; + + __l2cap_chan_add(conn, sk, parent); + dcid = l2cap_pi(sk)->scid; + + l2cap_sock_set_timer(sk, sk->sk_sndtimeo); + + /* Service level security */ + result = L2CAP_CR_PEND; + status = L2CAP_CS_AUTHEN_PEND; + sk->sk_state = BT_CONNECT2; + l2cap_pi(sk)->ident = cmd->ident; + + if ((l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) || + (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) { + if (!hci_conn_encrypt(conn->hcon)) + goto done; + } else if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH) { + if (!hci_conn_auth(conn->hcon)) + goto done; + } + + sk->sk_state = BT_CONFIG; + result = 
status = 0; + +done: + write_unlock(&list->lock); + +response: + bh_unlock_sock(parent); + +sendresp: + rsp.scid = __cpu_to_le16(scid); + rsp.dcid = __cpu_to_le16(dcid); + rsp.result = __cpu_to_le16(result); + rsp.status = __cpu_to_le16(status); + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); + return 0; +} + +static inline int l2cap_connect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_conn_rsp *rsp = (struct l2cap_conn_rsp *) data; + u16 scid, dcid, result, status; + struct sock *sk; + u8 req[128]; + + scid = __le16_to_cpu(rsp->scid); + dcid = __le16_to_cpu(rsp->dcid); + result = __le16_to_cpu(rsp->result); + status = __le16_to_cpu(rsp->status); + + BT_DBG("dcid 0x%4.4x scid 0x%4.4x result 0x%2.2x status 0x%2.2x", dcid, scid, result, status); + + if (scid) { + if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, scid))) + return 0; + } else { + if (!(sk = l2cap_get_chan_by_ident(&conn->chan_list, cmd->ident))) + return 0; + } + + switch (result) { + case L2CAP_CR_SUCCESS: + sk->sk_state = BT_CONFIG; + l2cap_pi(sk)->ident = 0; + l2cap_pi(sk)->dcid = dcid; + l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; + + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(sk, req), req); + break; + + case L2CAP_CR_PEND: + break; + + default: + l2cap_chan_del(sk, ECONNREFUSED); + break; + } + + bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_conf_req *req = (struct l2cap_conf_req *) data; + u16 dcid, flags; + u8 rsp[64]; + struct sock *sk; + int result; + + dcid = __le16_to_cpu(req->dcid); + flags = __le16_to_cpu(req->flags); + + BT_DBG("dcid 0x%4.4x flags 0x%2.2x", dcid, flags); + + if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, dcid))) + return -ENOENT; + + l2cap_parse_conf_req(sk, req->data, cmd->len - sizeof(*req)); + + if (flags & 0x0001) { + /* Incomplete config. Send empty response. */ + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, + l2cap_build_conf_rsp(sk, rsp, NULL), rsp); + goto unlock; + } + + /* Complete config. */ + l2cap_send_cmd(conn, cmd->ident, L2CAP_CONF_RSP, + l2cap_build_conf_rsp(sk, rsp, &result), rsp); + + if (result) + goto unlock; + + /* Output config done */ + l2cap_pi(sk)->conf_state |= L2CAP_CONF_OUTPUT_DONE; + + if (l2cap_pi(sk)->conf_state & L2CAP_CONF_INPUT_DONE) { + sk->sk_state = BT_CONNECTED; + l2cap_chan_ready(sk); + } else if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) { + u8 req[64]; + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(sk, req), req); + } + +unlock: + bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_conf_rsp *rsp = (struct l2cap_conf_rsp *)data; + u16 scid, flags, result; + struct sock *sk; + + scid = __le16_to_cpu(rsp->scid); + flags = __le16_to_cpu(rsp->flags); + result = __le16_to_cpu(rsp->result); + + BT_DBG("scid 0x%4.4x flags 0x%2.2x result 0x%2.2x", scid, flags, result); + + if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, scid))) + return 0; + + switch (result) { + case L2CAP_CONF_SUCCESS: + break; + + case L2CAP_CONF_UNACCEPT: + if (++l2cap_pi(sk)->conf_retry < L2CAP_CONF_MAX_RETRIES) { + char req[128]; + /* It does not make sense to adjust L2CAP parameters + * that are currently defined in the spec. We simply + * resend config request that we sent earlier. 
It is + * stupid, but it helps qualification testing which + * expects at least some response from us. */ + l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, + l2cap_build_conf_req(sk, req), req); + goto done; + } + + default: + sk->sk_state = BT_DISCONN; + sk->sk_err = ECONNRESET; + l2cap_sock_set_timer(sk, HZ * 5); + { + struct l2cap_disconn_req req; + req.dcid = __cpu_to_le16(l2cap_pi(sk)->dcid); + req.scid = __cpu_to_le16(l2cap_pi(sk)->scid); + l2cap_send_cmd(conn, l2cap_get_ident(conn), + L2CAP_DISCONN_REQ, sizeof(req), &req); + } + goto done; + } + + if (flags & 0x01) + goto done; + + /* Input config done */ + l2cap_pi(sk)->conf_state |= L2CAP_CONF_INPUT_DONE; + + if (l2cap_pi(sk)->conf_state & L2CAP_CONF_OUTPUT_DONE) { + sk->sk_state = BT_CONNECTED; + l2cap_chan_ready(sk); + } + +done: + bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_disconnect_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_disconn_req *req = (struct l2cap_disconn_req *) data; + struct l2cap_disconn_rsp rsp; + u16 dcid, scid; + struct sock *sk; + + scid = __le16_to_cpu(req->scid); + dcid = __le16_to_cpu(req->dcid); + + BT_DBG("scid 0x%4.4x dcid 0x%4.4x", scid, dcid); + + if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, dcid))) + return 0; + + rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid); + rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid); + l2cap_send_cmd(conn, cmd->ident, L2CAP_DISCONN_RSP, sizeof(rsp), &rsp); + + sk->sk_shutdown = SHUTDOWN_MASK; + + l2cap_chan_del(sk, ECONNRESET); + bh_unlock_sock(sk); + + l2cap_sock_kill(sk); + return 0; +} + +static inline int l2cap_disconnect_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_disconn_rsp *rsp = (struct l2cap_disconn_rsp *) data; + u16 dcid, scid; + struct sock *sk; + + scid = __le16_to_cpu(rsp->scid); + dcid = __le16_to_cpu(rsp->dcid); + + BT_DBG("dcid 0x%4.4x scid 0x%4.4x", dcid, scid); + + if (!(sk = l2cap_get_chan_by_scid(&conn->chan_list, scid))) + return 0; + + l2cap_chan_del(sk, 0); + bh_unlock_sock(sk); + + l2cap_sock_kill(sk); + return 0; +} + +static inline int l2cap_information_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_info_req *req = (struct l2cap_info_req *) data; + struct l2cap_info_rsp rsp; + u16 type; + + type = __le16_to_cpu(req->type); + + BT_DBG("type 0x%4.4x", type); + + rsp.type = __cpu_to_le16(type); + rsp.result = __cpu_to_le16(L2CAP_IR_NOTSUPP); + l2cap_send_cmd(conn, cmd->ident, L2CAP_INFO_RSP, sizeof(rsp), &rsp); + + return 0; +} + +static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr *cmd, u8 *data) +{ + struct l2cap_info_rsp *rsp = (struct l2cap_info_rsp *) data; + u16 type, result; + + type = __le16_to_cpu(rsp->type); + result = __le16_to_cpu(rsp->result); + + BT_DBG("type 0x%4.4x result 0x%2.2x", type, result); + + return 0; +} + +static inline void l2cap_sig_channel(struct l2cap_conn *conn, struct sk_buff *skb) +{ + u8 *data = skb->data; + int len = skb->len; + struct l2cap_cmd_hdr cmd; + int err = 0; + + l2cap_raw_recv(conn, skb); + + while (len >= L2CAP_CMD_HDR_SIZE) { + memcpy(&cmd, data, L2CAP_CMD_HDR_SIZE); + data += L2CAP_CMD_HDR_SIZE; + len -= L2CAP_CMD_HDR_SIZE; + + cmd.len = __le16_to_cpu(cmd.len); + + BT_DBG("code 0x%2.2x len %d id 0x%2.2x", cmd.code, cmd.len, cmd.ident); + + if (cmd.len > len || !cmd.ident) { + BT_DBG("corrupted command"); + break; + } + + switch (cmd.code) { + case L2CAP_COMMAND_REJ: + /* FIXME: We should process this */ + break; + + 
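+		/* Connection, configuration, disconnection and information
+		 * PDUs are dispatched to their handlers below; if a handler
+		 * returns an error the command is answered with
+		 * L2CAP_COMMAND_REJ using the same identifier. */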
case L2CAP_CONN_REQ: + err = l2cap_connect_req(conn, &cmd, data); + break; + + case L2CAP_CONN_RSP: + err = l2cap_connect_rsp(conn, &cmd, data); + break; + + case L2CAP_CONF_REQ: + err = l2cap_config_req(conn, &cmd, data); + break; + + case L2CAP_CONF_RSP: + err = l2cap_config_rsp(conn, &cmd, data); + break; + + case L2CAP_DISCONN_REQ: + err = l2cap_disconnect_req(conn, &cmd, data); + break; + + case L2CAP_DISCONN_RSP: + err = l2cap_disconnect_rsp(conn, &cmd, data); + break; + + case L2CAP_ECHO_REQ: + l2cap_send_cmd(conn, cmd.ident, L2CAP_ECHO_RSP, cmd.len, data); + break; + + case L2CAP_ECHO_RSP: + break; + + case L2CAP_INFO_REQ: + err = l2cap_information_req(conn, &cmd, data); + break; + + case L2CAP_INFO_RSP: + err = l2cap_information_rsp(conn, &cmd, data); + break; + + default: + BT_ERR("Unknown signaling command 0x%2.2x", cmd.code); + err = -EINVAL; + break; + } + + if (err) { + struct l2cap_cmd_rej rej; + BT_DBG("error %d", err); + + /* FIXME: Map err to a valid reason */ + rej.reason = __cpu_to_le16(0); + l2cap_send_cmd(conn, cmd.ident, L2CAP_COMMAND_REJ, sizeof(rej), &rej); + } + + data += cmd.len; + len -= cmd.len; + } + + kfree_skb(skb); +} + +static inline int l2cap_data_channel(struct l2cap_conn *conn, u16 cid, struct sk_buff *skb) +{ + struct sock *sk; + + sk = l2cap_get_chan_by_scid(&conn->chan_list, cid); + if (!sk) { + BT_DBG("unknown cid 0x%4.4x", cid); + goto drop; + } + + BT_DBG("sk %p, len %d", sk, skb->len); + + if (sk->sk_state != BT_CONNECTED) + goto drop; + + if (l2cap_pi(sk)->imtu < skb->len) + goto drop; + + /* If socket recv buffers overflows we drop data here + * which is *bad* because L2CAP has to be reliable. + * But we don't have any other choice. L2CAP doesn't + * provide flow control mechanism. */ + + if (!sock_queue_rcv_skb(sk, skb)) + goto done; + +drop: + kfree_skb(skb); + +done: + if (sk) bh_unlock_sock(sk); + return 0; +} + +static inline int l2cap_conless_channel(struct l2cap_conn *conn, u16 psm, struct sk_buff *skb) +{ + struct sock *sk; + + sk = l2cap_get_sock_by_psm(0, psm, conn->src); + if (!sk) + goto drop; + + BT_DBG("sk %p, len %d", sk, skb->len); + + if (sk->sk_state != BT_BOUND && sk->sk_state != BT_CONNECTED) + goto drop; + + if (l2cap_pi(sk)->imtu < skb->len) + goto drop; + + if (!sock_queue_rcv_skb(sk, skb)) + goto done; + +drop: + kfree_skb(skb); + +done: + if (sk) bh_unlock_sock(sk); + return 0; +} + +static void l2cap_recv_frame(struct l2cap_conn *conn, struct sk_buff *skb) +{ + struct l2cap_hdr *lh = (void *) skb->data; + u16 cid, psm, len; + + skb_pull(skb, L2CAP_HDR_SIZE); + cid = __le16_to_cpu(lh->cid); + len = __le16_to_cpu(lh->len); + + BT_DBG("len %d, cid 0x%4.4x", len, cid); + + switch (cid) { + case 0x0001: + l2cap_sig_channel(conn, skb); + break; + + case 0x0002: + psm = get_unaligned((u16 *) skb->data); + skb_pull(skb, 2); + l2cap_conless_channel(conn, psm, skb); + break; + + default: + l2cap_data_channel(conn, cid, skb); + break; + } +} + +/* ---- L2CAP interface with lower layer (HCI) ---- */ + +static int l2cap_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 type) +{ + int exact = 0, lm1 = 0, lm2 = 0; + register struct sock *sk; + struct hlist_node *node; + + if (type != ACL_LINK) + return 0; + + BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); + + /* Find listening sockets and check their link_mode */ + read_lock(&l2cap_sk_list.lock); + sk_for_each(sk, node, &l2cap_sk_list.head) { + if (sk->sk_state != BT_LISTEN) + continue; + + if (!bacmp(&bt_sk(sk)->src, &hdev->bdaddr)) { + lm1 |= (HCI_LM_ACCEPT | 
l2cap_pi(sk)->link_mode); + exact++; + } else if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + lm2 |= (HCI_LM_ACCEPT | l2cap_pi(sk)->link_mode); + } + read_unlock(&l2cap_sk_list.lock); + + return exact ? lm1 : lm2; +} + +static int l2cap_connect_cfm(struct hci_conn *hcon, u8 status) +{ + BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status); + + if (hcon->type != ACL_LINK) + return 0; + + if (!status) { + struct l2cap_conn *conn; + + conn = l2cap_conn_add(hcon, status); + if (conn) + l2cap_conn_ready(conn); + } else + l2cap_conn_del(hcon, bt_err(status)); + + return 0; +} + +static int l2cap_disconn_ind(struct hci_conn *hcon, u8 reason) +{ + BT_DBG("hcon %p reason %d", hcon, reason); + + if (hcon->type != ACL_LINK) + return 0; + + l2cap_conn_del(hcon, bt_err(reason)); + return 0; +} + +static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) +{ + struct l2cap_chan_list *l; + struct l2cap_conn *conn; + struct l2cap_conn_rsp rsp; + struct sock *sk; + int result; + + if (!(conn = hcon->l2cap_data)) + return 0; + l = &conn->chan_list; + + BT_DBG("conn %p", conn); + + read_lock(&l->lock); + + for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { + bh_lock_sock(sk); + + if (sk->sk_state != BT_CONNECT2 || + (l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT) || + (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE)) { + bh_unlock_sock(sk); + continue; + } + + if (!status) { + sk->sk_state = BT_CONFIG; + result = 0; + } else { + sk->sk_state = BT_DISCONN; + l2cap_sock_set_timer(sk, HZ/10); + result = L2CAP_CR_SEC_BLOCK; + } + + rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = __cpu_to_le16(result); + rsp.status = __cpu_to_le16(0); + l2cap_send_cmd(conn, l2cap_pi(sk)->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + + bh_unlock_sock(sk); + } + + read_unlock(&l->lock); + return 0; +} + +static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status) +{ + struct l2cap_chan_list *l; + struct l2cap_conn *conn; + struct l2cap_conn_rsp rsp; + struct sock *sk; + int result; + + if (!(conn = hcon->l2cap_data)) + return 0; + l = &conn->chan_list; + + BT_DBG("conn %p", conn); + + read_lock(&l->lock); + + for (sk = l->head; sk; sk = l2cap_pi(sk)->next_c) { + bh_lock_sock(sk); + + if (sk->sk_state != BT_CONNECT2) { + bh_unlock_sock(sk); + continue; + } + + if (!status) { + sk->sk_state = BT_CONFIG; + result = 0; + } else { + sk->sk_state = BT_DISCONN; + l2cap_sock_set_timer(sk, HZ/10); + result = L2CAP_CR_SEC_BLOCK; + } + + rsp.scid = __cpu_to_le16(l2cap_pi(sk)->dcid); + rsp.dcid = __cpu_to_le16(l2cap_pi(sk)->scid); + rsp.result = __cpu_to_le16(result); + rsp.status = __cpu_to_le16(0); + l2cap_send_cmd(conn, l2cap_pi(sk)->ident, + L2CAP_CONN_RSP, sizeof(rsp), &rsp); + + if (l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) + hci_conn_change_link_key(hcon); + + bh_unlock_sock(sk); + } + + read_unlock(&l->lock); + return 0; +} + +static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 flags) +{ + struct l2cap_conn *conn = hcon->l2cap_data; + + if (!conn && !(conn = l2cap_conn_add(hcon, 0))) + goto drop; + + BT_DBG("conn %p len %d flags 0x%x", conn, skb->len, flags); + + if (flags & ACL_START) { + struct l2cap_hdr *hdr; + int len; + + if (conn->rx_len) { + BT_ERR("Unexpected start frame (len %d)", skb->len); + kfree_skb(conn->rx_skb); + conn->rx_skb = NULL; + conn->rx_len = 0; + l2cap_conn_unreliable(conn, ECOMM); + } + + if (skb->len < 2) { + BT_ERR("Frame is too short (len %d)", skb->len); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + 
} + + hdr = (struct l2cap_hdr *) skb->data; + len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE; + + if (len == skb->len) { + /* Complete frame received */ + l2cap_recv_frame(conn, skb); + return 0; + } + + BT_DBG("Start: total len %d, frag len %d", len, skb->len); + + if (skb->len > len) { + BT_ERR("Frame is too long (len %d, expected len %d)", + skb->len, len); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + /* Allocate skb for the complete frame (with header) */ + if (!(conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC))) + goto drop; + + memcpy(skb_put(conn->rx_skb, skb->len), skb->data, skb->len); + conn->rx_len = len - skb->len; + } else { + BT_DBG("Cont: frag len %d (expecting %d)", skb->len, conn->rx_len); + + if (!conn->rx_len) { + BT_ERR("Unexpected continuation frame (len %d)", skb->len); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + if (skb->len > conn->rx_len) { + BT_ERR("Fragment is too long (len %d, expected %d)", + skb->len, conn->rx_len); + kfree_skb(conn->rx_skb); + conn->rx_skb = NULL; + conn->rx_len = 0; + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + memcpy(skb_put(conn->rx_skb, skb->len), skb->data, skb->len); + conn->rx_len -= skb->len; + + if (!conn->rx_len) { + /* Complete frame received */ + l2cap_recv_frame(conn, conn->rx_skb); + conn->rx_skb = NULL; + } + } + +drop: + kfree_skb(skb); + return 0; +} + +/* ---- Proc fs support ---- */ +#ifdef CONFIG_PROC_FS +static void *l2cap_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock *sk; + struct hlist_node *node; + loff_t l = *pos; + + read_lock_bh(&l2cap_sk_list.lock); + + sk_for_each(sk, node, &l2cap_sk_list.head) + if (!l--) + goto found; + sk = NULL; +found: + return sk; +} + +static void *l2cap_seq_next(struct seq_file *seq, void *e, loff_t *pos) +{ + (*pos)++; + return sk_next(e); +} + +static void l2cap_seq_stop(struct seq_file *seq, void *e) +{ + read_unlock_bh(&l2cap_sk_list.lock); +} + +static int l2cap_seq_show(struct seq_file *seq, void *e) +{ + struct sock *sk = e; + struct l2cap_pinfo *pi = l2cap_pi(sk); + + seq_printf(seq, "%s %s %d %d 0x%4.4x 0x%4.4x %d %d 0x%x\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, pi->psm, pi->scid, pi->dcid, pi->imtu, + pi->omtu, pi->link_mode); + return 0; +} + +static struct seq_operations l2cap_seq_ops = { + .start = l2cap_seq_start, + .next = l2cap_seq_next, + .stop = l2cap_seq_stop, + .show = l2cap_seq_show +}; + +static int l2cap_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &l2cap_seq_ops); +} + +static struct file_operations l2cap_seq_fops = { + .owner = THIS_MODULE, + .open = l2cap_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init l2cap_proc_init(void) +{ + struct proc_dir_entry *p = create_proc_entry("l2cap", S_IRUGO, proc_bt); + if (!p) + return -ENOMEM; + p->owner = THIS_MODULE; + p->proc_fops = &l2cap_seq_fops; + return 0; +} + +static void __exit l2cap_proc_cleanup(void) +{ + remove_proc_entry("l2cap", proc_bt); +} + +#else /* CONFIG_PROC_FS */ + +static int __init l2cap_proc_init(void) +{ + return 0; +} + +static void __exit l2cap_proc_cleanup(void) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static struct proto_ops l2cap_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = l2cap_sock_release, + .bind = l2cap_sock_bind, + .connect = l2cap_sock_connect, + .listen = l2cap_sock_listen, + .accept = l2cap_sock_accept, + .getname = l2cap_sock_getname, + .sendmsg = l2cap_sock_sendmsg, + .recvmsg 
= bt_sock_recvmsg, + .poll = bt_sock_poll, + .mmap = sock_no_mmap, + .socketpair = sock_no_socketpair, + .ioctl = sock_no_ioctl, + .shutdown = l2cap_sock_shutdown, + .setsockopt = l2cap_sock_setsockopt, + .getsockopt = l2cap_sock_getsockopt +}; + +static struct net_proto_family l2cap_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = l2cap_sock_create, +}; + +static struct hci_proto l2cap_hci_proto = { + .name = "L2CAP", + .id = HCI_PROTO_L2CAP, + .connect_ind = l2cap_connect_ind, + .connect_cfm = l2cap_connect_cfm, + .disconn_ind = l2cap_disconn_ind, + .auth_cfm = l2cap_auth_cfm, + .encrypt_cfm = l2cap_encrypt_cfm, + .recv_acldata = l2cap_recv_acldata +}; + +static int __init l2cap_init(void) +{ + int err; + + err = proto_register(&l2cap_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_L2CAP, &l2cap_sock_family_ops); + if (err < 0) { + BT_ERR("L2CAP socket registration failed"); + goto error; + } + + err = hci_register_proto(&l2cap_hci_proto); + if (err < 0) { + BT_ERR("L2CAP protocol registration failed"); + bt_sock_unregister(BTPROTO_L2CAP); + goto error; + } + + l2cap_proc_init(); + + BT_INFO("L2CAP ver %s", VERSION); + BT_INFO("L2CAP socket layer initialized"); + + return 0; + +error: + proto_unregister(&l2cap_proto); + return err; +} + +static void __exit l2cap_exit(void) +{ + l2cap_proc_cleanup(); + + if (bt_sock_unregister(BTPROTO_L2CAP) < 0) + BT_ERR("L2CAP socket unregistration failed"); + + if (hci_unregister_proto(&l2cap_hci_proto) < 0) + BT_ERR("L2CAP protocol unregistration failed"); + + proto_unregister(&l2cap_proto); +} + +void l2cap_load(void) +{ + /* Dummy function to trigger automatic L2CAP module loading by + * other modules that use L2CAP sockets but don't use any other + * symbols from it. */ + return; +} +EXPORT_SYMBOL(l2cap_load); + +module_init(l2cap_init); +module_exit(l2cap_exit); + +MODULE_AUTHOR("Maxim Krasnyansky , Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth L2CAP ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-0"); diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c new file mode 100644 index 000000000000..9efb0a093612 --- /dev/null +++ b/net/bluetooth/lib.c @@ -0,0 +1,178 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth kernel library. 
*/ + +#include +#include + +#include +#include +#include +#include + +#include + +void bt_dump(char *pref, __u8 *buf, int count) +{ + char *ptr; + char line[100]; + unsigned int i; + + printk(KERN_INFO "%s: dump, len %d\n", pref, count); + + ptr = line; + *ptr = 0; + for (i = 0; i < count; i++) { + ptr += sprintf(ptr, " %2.2X", buf[i]); + + if (i && !((i + 1) % 20)) { + printk(KERN_INFO "%s:%s\n", pref, line); + ptr = line; + *ptr = 0; + } + } + + if (line[0]) + printk(KERN_INFO "%s:%s\n", pref, line); +} +EXPORT_SYMBOL(bt_dump); + +void baswap(bdaddr_t *dst, bdaddr_t *src) +{ + unsigned char *d = (unsigned char *) dst; + unsigned char *s = (unsigned char *) src; + unsigned int i; + + for (i = 0; i < 6; i++) + d[i] = s[5 - i]; +} +EXPORT_SYMBOL(baswap); + +char *batostr(bdaddr_t *ba) +{ + static char str[2][18]; + static int i = 1; + + i ^= 1; + sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", + ba->b[0], ba->b[1], ba->b[2], + ba->b[3], ba->b[4], ba->b[5]); + + return str[i]; +} +EXPORT_SYMBOL(batostr); + +/* Bluetooth error codes to Unix errno mapping */ +int bt_err(__u16 code) +{ + switch (code) { + case 0: + return 0; + + case 0x01: + return EBADRQC; + + case 0x02: + return ENOTCONN; + + case 0x03: + return EIO; + + case 0x04: + return EHOSTDOWN; + + case 0x05: + return EACCES; + + case 0x06: + return EBADE; + + case 0x07: + return ENOMEM; + + case 0x08: + return ETIMEDOUT; + + case 0x09: + return EMLINK; + + case 0x0a: + return EMLINK; + + case 0x0b: + return EALREADY; + + case 0x0c: + return EBUSY; + + case 0x0d: + case 0x0e: + case 0x0f: + return ECONNREFUSED; + + case 0x10: + return ETIMEDOUT; + + case 0x11: + case 0x27: + case 0x29: + case 0x20: + return EOPNOTSUPP; + + case 0x12: + return EINVAL; + + case 0x13: + case 0x14: + case 0x15: + return ECONNRESET; + + case 0x16: + return ECONNABORTED; + + case 0x17: + return ELOOP; + + case 0x18: + return EACCES; + + case 0x1a: + return EPROTONOSUPPORT; + + case 0x1b: + return ECONNREFUSED; + + case 0x19: + case 0x1e: + case 0x23: + case 0x24: + case 0x25: + return EPROTO; + + default: + return ENOSYS; + } +} +EXPORT_SYMBOL(bt_err); diff --git a/net/bluetooth/rfcomm/Kconfig b/net/bluetooth/rfcomm/Kconfig new file mode 100644 index 000000000000..405a0e61e7dc --- /dev/null +++ b/net/bluetooth/rfcomm/Kconfig @@ -0,0 +1,17 @@ +config BT_RFCOMM + tristate "RFCOMM protocol support" + depends on BT && BT_L2CAP + help + RFCOMM provides connection oriented stream transport. RFCOMM + support is required for Dialup Networking, OBEX and other Bluetooth + applications. + + Say Y here to compile RFCOMM support into the kernel or say M to + compile it as module (rfcomm). + +config BT_RFCOMM_TTY + bool "RFCOMM TTY support" + depends on BT_RFCOMM + help + This option enables TTY emulation support for RFCOMM channels. + diff --git a/net/bluetooth/rfcomm/Makefile b/net/bluetooth/rfcomm/Makefile new file mode 100644 index 000000000000..aecec45ec68d --- /dev/null +++ b/net/bluetooth/rfcomm/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux Bluetooth RFCOMM layer. +# + +obj-$(CONFIG_BT_RFCOMM) += rfcomm.o + +rfcomm-y := core.o sock.o crc.o +rfcomm-$(CONFIG_BT_RFCOMM_TTY) += tty.o diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c new file mode 100644 index 000000000000..e9e6fda66f1a --- /dev/null +++ b/net/bluetooth/rfcomm/core.c @@ -0,0 +1,2127 @@ +/* + RFCOMM implementation for Linux Bluetooth stack (BlueZ). 
+ Copyright (C) 2002 Maxim Krasnyansky + Copyright (C) 2002 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + RPN support - Dirk Husemann +*/ + +/* + * Bluetooth RFCOMM core. + * + * $Id: core.c,v 1.42 2002/10/01 23:26:25 maxk Exp $ + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define VERSION "1.5" + +#ifndef CONFIG_BT_RFCOMM_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +#ifdef CONFIG_PROC_FS +struct proc_dir_entry *proc_bt_rfcomm; +#endif + +static struct task_struct *rfcomm_thread; + +static DECLARE_MUTEX(rfcomm_sem); +#define rfcomm_lock() down(&rfcomm_sem); +#define rfcomm_unlock() up(&rfcomm_sem); + +static unsigned long rfcomm_event; + +static LIST_HEAD(session_list); +static atomic_t terminate, running; + +static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len); +static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci); +static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci); +static int rfcomm_queue_disc(struct rfcomm_dlc *d); +static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type); +static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d); +static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig); +static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len); +static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits); +static void rfcomm_make_uih(struct sk_buff *skb, u8 addr); + +static void rfcomm_process_connect(struct rfcomm_session *s); + +static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst, int *err); +static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst); +static void rfcomm_session_del(struct rfcomm_session *s); + +/* ---- RFCOMM frame parsing macros ---- */ +#define __get_dlci(b) ((b & 0xfc) >> 2) +#define __get_channel(b) ((b & 0xf8) >> 3) +#define __get_dir(b) ((b & 0x04) >> 2) +#define __get_type(b) ((b & 0xef)) + +#define __test_ea(b) ((b & 0x01)) +#define __test_cr(b) ((b & 0x02)) +#define __test_pf(b) ((b & 0x10)) + +#define __addr(cr, dlci) (((dlci & 0x3f) << 2) | (cr << 1) | 0x01) +#define __ctrl(type, pf) (((type & 0xef) | (pf << 4))) +#define __dlci(dir, chn) (((chn & 0x1f) << 1) | dir) +#define __srv_channel(dlci) (dlci >> 1) +#define __dir(dlci) (dlci & 0x01) + +#define __len8(len) (((len) << 1) | 1) +#define __len16(len) ((len) << 1) + +/* MCC macros */ +#define __mcc_type(cr, type) (((type << 2) | (cr << 1) | 
0x01)) +#define __get_mcc_type(b) ((b & 0xfc) >> 2) +#define __get_mcc_len(b) ((b & 0xfe) >> 1) + +/* RPN macros */ +#define __rpn_line_settings(data, stop, parity) ((data & 0x3) | ((stop & 0x1) << 2) | ((parity & 0x3) << 3)) +#define __get_rpn_data_bits(line) ((line) & 0x3) +#define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) +#define __get_rpn_parity(line) (((line) >> 3) & 0x3) + +static inline void rfcomm_schedule(uint event) +{ + if (!rfcomm_thread) + return; + //set_bit(event, &rfcomm_event); + set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event); + wake_up_process(rfcomm_thread); +} + +static inline void rfcomm_session_put(struct rfcomm_session *s) +{ + if (atomic_dec_and_test(&s->refcnt)) + rfcomm_session_del(s); +} + +/* ---- RFCOMM FCS computation ---- */ + +/* CRC on 2 bytes */ +#define __crc(data) (rfcomm_crc_table[rfcomm_crc_table[0xff ^ data[0]] ^ data[1]]) + +/* FCS on 2 bytes */ +static inline u8 __fcs(u8 *data) +{ + return (0xff - __crc(data)); +} + +/* FCS on 3 bytes */ +static inline u8 __fcs2(u8 *data) +{ + return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]); +} + +/* Check FCS */ +static inline int __check_fcs(u8 *data, int type, u8 fcs) +{ + u8 f = __crc(data); + + if (type != RFCOMM_UIH) + f = rfcomm_crc_table[f ^ data[2]]; + + return rfcomm_crc_table[f ^ fcs] != 0xcf; +} + +/* ---- L2CAP callbacks ---- */ +static void rfcomm_l2state_change(struct sock *sk) +{ + BT_DBG("%p state %d", sk, sk->sk_state); + rfcomm_schedule(RFCOMM_SCHED_STATE); +} + +static void rfcomm_l2data_ready(struct sock *sk, int bytes) +{ + BT_DBG("%p bytes %d", sk, bytes); + rfcomm_schedule(RFCOMM_SCHED_RX); +} + +static int rfcomm_l2sock_create(struct socket **sock) +{ + int err; + + BT_DBG(""); + + err = sock_create_kern(PF_BLUETOOTH, SOCK_SEQPACKET, BTPROTO_L2CAP, sock); + if (!err) { + struct sock *sk = (*sock)->sk; + sk->sk_data_ready = rfcomm_l2data_ready; + sk->sk_state_change = rfcomm_l2state_change; + } + return err; +} + +/* ---- RFCOMM DLCs ---- */ +static void rfcomm_dlc_timeout(unsigned long arg) +{ + struct rfcomm_dlc *d = (void *) arg; + + BT_DBG("dlc %p state %ld", d, d->state); + + set_bit(RFCOMM_TIMED_OUT, &d->flags); + rfcomm_dlc_put(d); + rfcomm_schedule(RFCOMM_SCHED_TIMEO); +} + +static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout) +{ + BT_DBG("dlc %p state %ld timeout %ld", d, d->state, timeout); + + if (!mod_timer(&d->timer, jiffies + timeout)) + rfcomm_dlc_hold(d); +} + +static void rfcomm_dlc_clear_timer(struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p state %ld", d, d->state); + + if (timer_pending(&d->timer) && del_timer(&d->timer)) + rfcomm_dlc_put(d); +} + +static void rfcomm_dlc_clear_state(struct rfcomm_dlc *d) +{ + BT_DBG("%p", d); + + d->state = BT_OPEN; + d->flags = 0; + d->mscex = 0; + d->mtu = RFCOMM_DEFAULT_MTU; + d->v24_sig = RFCOMM_V24_RTC | RFCOMM_V24_RTR | RFCOMM_V24_DV; + + d->cfc = RFCOMM_CFC_DISABLED; + d->rx_credits = RFCOMM_DEFAULT_CREDITS; +} + +struct rfcomm_dlc *rfcomm_dlc_alloc(int prio) +{ + struct rfcomm_dlc *d = kmalloc(sizeof(*d), prio); + if (!d) + return NULL; + memset(d, 0, sizeof(*d)); + + init_timer(&d->timer); + d->timer.function = rfcomm_dlc_timeout; + d->timer.data = (unsigned long) d; + + skb_queue_head_init(&d->tx_queue); + spin_lock_init(&d->lock); + atomic_set(&d->refcnt, 1); + + rfcomm_dlc_clear_state(d); + + BT_DBG("%p", d); + return d; +} + +void rfcomm_dlc_free(struct rfcomm_dlc *d) +{ + BT_DBG("%p", d); + + skb_queue_purge(&d->tx_queue); + kfree(d); +} + +static void rfcomm_dlc_link(struct rfcomm_session *s, struct 
rfcomm_dlc *d) +{ + BT_DBG("dlc %p session %p", d, s); + + rfcomm_session_hold(s); + + rfcomm_dlc_hold(d); + list_add(&d->list, &s->dlcs); + d->session = s; +} + +static void rfcomm_dlc_unlink(struct rfcomm_dlc *d) +{ + struct rfcomm_session *s = d->session; + + BT_DBG("dlc %p refcnt %d session %p", d, atomic_read(&d->refcnt), s); + + list_del(&d->list); + d->session = NULL; + rfcomm_dlc_put(d); + + rfcomm_session_put(s); +} + +static struct rfcomm_dlc *rfcomm_dlc_get(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_dlc *d; + struct list_head *p; + + list_for_each(p, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + if (d->dlci == dlci) + return d; + } + return NULL; +} + +static int __rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) +{ + struct rfcomm_session *s; + int err = 0; + u8 dlci; + + BT_DBG("dlc %p state %ld %s %s channel %d", + d, d->state, batostr(src), batostr(dst), channel); + + if (channel < 1 || channel > 30) + return -EINVAL; + + if (d->state != BT_OPEN && d->state != BT_CLOSED) + return 0; + + s = rfcomm_session_get(src, dst); + if (!s) { + s = rfcomm_session_create(src, dst, &err); + if (!s) + return err; + } + + dlci = __dlci(!s->initiator, channel); + + /* Check if DLCI already exists */ + if (rfcomm_dlc_get(s, dlci)) + return -EBUSY; + + rfcomm_dlc_clear_state(d); + + d->dlci = dlci; + d->addr = __addr(s->initiator, dlci); + d->priority = 7; + + d->state = BT_CONFIG; + rfcomm_dlc_link(s, d); + + d->mtu = s->mtu; + d->cfc = (s->cfc == RFCOMM_CFC_UNKNOWN) ? 0 : s->cfc; + + if (s->state == BT_CONNECTED) + rfcomm_send_pn(s, 1, d); + rfcomm_dlc_set_timer(d, RFCOMM_CONN_TIMEOUT); + return 0; +} + +int rfcomm_dlc_open(struct rfcomm_dlc *d, bdaddr_t *src, bdaddr_t *dst, u8 channel) +{ + int r; + + rfcomm_lock(); + + r = __rfcomm_dlc_open(d, src, dst, channel); + + rfcomm_unlock(); + return r; +} + +static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) +{ + struct rfcomm_session *s = d->session; + if (!s) + return 0; + + BT_DBG("dlc %p state %ld dlci %d err %d session %p", + d, d->state, d->dlci, err, s); + + switch (d->state) { + case BT_CONNECTED: + case BT_CONFIG: + case BT_CONNECT: + d->state = BT_DISCONN; + if (skb_queue_empty(&d->tx_queue)) { + rfcomm_send_disc(s, d->dlci); + rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT); + } else { + rfcomm_queue_disc(d); + rfcomm_dlc_set_timer(d, RFCOMM_DISC_TIMEOUT * 2); + } + break; + + default: + rfcomm_dlc_clear_timer(d); + + rfcomm_dlc_lock(d); + d->state = BT_CLOSED; + d->state_change(d, err); + rfcomm_dlc_unlock(d); + + skb_queue_purge(&d->tx_queue); + rfcomm_session_put(s); + + rfcomm_dlc_unlink(d); + } + + return 0; +} + +int rfcomm_dlc_close(struct rfcomm_dlc *d, int err) +{ + int r; + + rfcomm_lock(); + + r = __rfcomm_dlc_close(d, err); + + rfcomm_unlock(); + return r; +} + +int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) +{ + int len = skb->len; + + if (d->state != BT_CONNECTED) + return -ENOTCONN; + + BT_DBG("dlc %p mtu %d len %d", d, d->mtu, len); + + if (len > d->mtu) + return -EINVAL; + + rfcomm_make_uih(skb, d->addr); + skb_queue_tail(&d->tx_queue, skb); + + if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + rfcomm_schedule(RFCOMM_SCHED_TX); + return len; +} + +void fastcall __rfcomm_dlc_throttle(struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p state %ld", d, d->state); + + if (!d->cfc) { + d->v24_sig |= RFCOMM_V24_FC; + set_bit(RFCOMM_MSC_PENDING, &d->flags); + } + rfcomm_schedule(RFCOMM_SCHED_TX); +} + +void fastcall __rfcomm_dlc_unthrottle(struct rfcomm_dlc 
*d) +{ + BT_DBG("dlc %p state %ld", d, d->state); + + if (!d->cfc) { + d->v24_sig &= ~RFCOMM_V24_FC; + set_bit(RFCOMM_MSC_PENDING, &d->flags); + } + rfcomm_schedule(RFCOMM_SCHED_TX); +} + +/* + Set/get modem status functions use _local_ status i.e. what we report + to the other side. + Remote status is provided by dlc->modem_status() callback. + */ +int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig) +{ + BT_DBG("dlc %p state %ld v24_sig 0x%x", + d, d->state, v24_sig); + + if (test_bit(RFCOMM_RX_THROTTLED, &d->flags)) + v24_sig |= RFCOMM_V24_FC; + else + v24_sig &= ~RFCOMM_V24_FC; + + d->v24_sig = v24_sig; + + if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags)) + rfcomm_schedule(RFCOMM_SCHED_TX); + + return 0; +} + +int rfcomm_dlc_get_modem_status(struct rfcomm_dlc *d, u8 *v24_sig) +{ + BT_DBG("dlc %p state %ld v24_sig 0x%x", + d, d->state, d->v24_sig); + + *v24_sig = d->v24_sig; + return 0; +} + +/* ---- RFCOMM sessions ---- */ +static struct rfcomm_session *rfcomm_session_add(struct socket *sock, int state) +{ + struct rfcomm_session *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return NULL; + memset(s, 0, sizeof(*s)); + + BT_DBG("session %p sock %p", s, sock); + + INIT_LIST_HEAD(&s->dlcs); + s->state = state; + s->sock = sock; + + s->mtu = RFCOMM_DEFAULT_MTU; + s->cfc = RFCOMM_CFC_UNKNOWN; + + /* Do not increment module usage count for listening sessions. + * Otherwise we won't be able to unload the module. */ + if (state != BT_LISTEN) + if (!try_module_get(THIS_MODULE)) { + kfree(s); + return NULL; + } + + list_add(&s->list, &session_list); + + return s; +} + +static void rfcomm_session_del(struct rfcomm_session *s) +{ + int state = s->state; + + BT_DBG("session %p state %ld", s, s->state); + + list_del(&s->list); + + if (state == BT_CONNECTED) + rfcomm_send_disc(s, 0); + + sock_release(s->sock); + kfree(s); + + if (state != BT_LISTEN) + module_put(THIS_MODULE); +} + +static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst) +{ + struct rfcomm_session *s; + struct list_head *p, *n; + struct bt_sock *sk; + list_for_each_safe(p, n, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + sk = bt_sk(s->sock->sk); + + if ((!bacmp(src, BDADDR_ANY) || !bacmp(&sk->src, src)) && + !bacmp(&sk->dst, dst)) + return s; + } + return NULL; +} + +static void rfcomm_session_close(struct rfcomm_session *s, int err) +{ + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("session %p state %ld err %d", s, s->state, err); + + rfcomm_session_hold(s); + + s->state = BT_CLOSED; + + /* Close all dlcs */ + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, err); + } + + rfcomm_session_put(s); +} + +static struct rfcomm_session *rfcomm_session_create(bdaddr_t *src, bdaddr_t *dst, int *err) +{ + struct rfcomm_session *s = NULL; + struct sockaddr_l2 addr; + struct socket *sock; + struct sock *sk; + + BT_DBG("%s %s", batostr(src), batostr(dst)); + + *err = rfcomm_l2sock_create(&sock); + if (*err < 0) + return NULL; + + bacpy(&addr.l2_bdaddr, src); + addr.l2_family = AF_BLUETOOTH; + addr.l2_psm = 0; + *err = sock->ops->bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (*err < 0) + goto failed; + + /* Set L2CAP options */ + sk = sock->sk; + lock_sock(sk); + l2cap_pi(sk)->imtu = RFCOMM_MAX_L2CAP_MTU; + release_sock(sk); + + s = rfcomm_session_add(sock, BT_BOUND); + if (!s) { + *err = -ENOMEM; + goto failed; + } + + rfcomm_session_hold(s); + + s->initiator = 1; + + 
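+	/* Connect the underlying L2CAP socket to the remote RFCOMM PSM.
+	 * The connect is non-blocking: -EAGAIN means the connection is
+	 * still being established and the session is returned as is. */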
bacpy(&addr.l2_bdaddr, dst); + addr.l2_family = AF_BLUETOOTH; + addr.l2_psm = htobs(RFCOMM_PSM); + *err = sock->ops->connect(sock, (struct sockaddr *) &addr, sizeof(addr), O_NONBLOCK); + if (*err == 0 || *err == -EAGAIN) + return s; + + rfcomm_session_del(s); + return NULL; + +failed: + sock_release(sock); + return NULL; +} + +void rfcomm_session_getaddr(struct rfcomm_session *s, bdaddr_t *src, bdaddr_t *dst) +{ + struct sock *sk = s->sock->sk; + if (src) + bacpy(src, &bt_sk(sk)->src); + if (dst) + bacpy(dst, &bt_sk(sk)->dst); +} + +/* ---- RFCOMM frame sending ---- */ +static int rfcomm_send_frame(struct rfcomm_session *s, u8 *data, int len) +{ + struct socket *sock = s->sock; + struct kvec iv = { data, len }; + struct msghdr msg; + + BT_DBG("session %p len %d", s, len); + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, &iv, 1, len); +} + +static int rfcomm_send_sabm(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_SABM, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_send_ua(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(!s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_UA, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_send_disc(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_DISC, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_queue_disc(struct rfcomm_dlc *d) +{ + struct rfcomm_cmd *cmd; + struct sk_buff *skb; + + BT_DBG("dlc %p dlci %d", d, d->dlci); + + skb = alloc_skb(sizeof(*cmd), GFP_KERNEL); + if (!skb) + return -ENOMEM; + + cmd = (void *) __skb_put(skb, sizeof(*cmd)); + cmd->addr = d->addr; + cmd->ctrl = __ctrl(RFCOMM_DISC, 1); + cmd->len = __len8(0); + cmd->fcs = __fcs2((u8 *) cmd); + + skb_queue_tail(&d->tx_queue, skb); + rfcomm_schedule(RFCOMM_SCHED_TX); + return 0; +} + +static int rfcomm_send_dm(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_cmd cmd; + + BT_DBG("%p dlci %d", s, dlci); + + cmd.addr = __addr(!s->initiator, dlci); + cmd.ctrl = __ctrl(RFCOMM_DM, 1); + cmd.len = __len8(0); + cmd.fcs = __fcs2((u8 *) &cmd); + + return rfcomm_send_frame(s, (void *) &cmd, sizeof(cmd)); +} + +static int rfcomm_send_nsc(struct rfcomm_session *s, int cr, u8 type) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d type %d", s, cr, type); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + 1); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_NSC); + mcc->len = __len8(1); + + /* Type that we didn't like */ + *ptr = __mcc_type(cr, type); ptr++; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_pn(struct rfcomm_session *s, int cr, struct rfcomm_dlc *d) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_pn *pn; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d dlci %d mtu %d", s, cr, d->dlci, d->mtu); + + hdr = 
(void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*pn)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_PN); + mcc->len = __len8(sizeof(*pn)); + + pn = (void *) ptr; ptr += sizeof(*pn); + pn->dlci = d->dlci; + pn->priority = d->priority; + pn->ack_timer = 0; + pn->max_retrans = 0; + + if (s->cfc) { + pn->flow_ctrl = cr ? 0xf0 : 0xe0; + pn->credits = RFCOMM_DEFAULT_CREDITS; + } else { + pn->flow_ctrl = 0; + pn->credits = 0; + } + + pn->mtu = htobs(d->mtu); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_rpn(struct rfcomm_session *s, int cr, u8 dlci, + u8 bit_rate, u8 data_bits, u8 stop_bits, + u8 parity, u8 flow_ctrl_settings, + u8 xon_char, u8 xoff_char, u16 param_mask) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_rpn *rpn; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d dlci %d bit_r 0x%x data_b 0x%x stop_b 0x%x parity 0x%x" + "flwc_s 0x%x xon_c 0x%x xoff_c 0x%x p_mask 0x%x", + s, cr, dlci, bit_rate, data_bits, stop_bits, parity, + flow_ctrl_settings, xon_char, xoff_char, param_mask); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*rpn)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_RPN); + mcc->len = __len8(sizeof(*rpn)); + + rpn = (void *) ptr; ptr += sizeof(*rpn); + rpn->dlci = __addr(1, dlci); + rpn->bit_rate = bit_rate; + rpn->line_settings = __rpn_line_settings(data_bits, stop_bits, parity); + rpn->flow_ctrl = flow_ctrl_settings; + rpn->xon_char = xon_char; + rpn->xoff_char = xoff_char; + rpn->param_mask = param_mask; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_rls(struct rfcomm_session *s, int cr, u8 dlci, u8 status) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_rls *rls; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d status 0x%x", s, cr, status); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*rls)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_RLS); + mcc->len = __len8(sizeof(*rls)); + + rls = (void *) ptr; ptr += sizeof(*rls); + rls->dlci = __addr(1, dlci); + rls->status = status; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_msc(struct rfcomm_session *s, int cr, u8 dlci, u8 v24_sig) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + struct rfcomm_msc *msc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d v24 0x%x", s, cr, v24_sig); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc) + sizeof(*msc)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_MSC); + mcc->len = __len8(sizeof(*msc)); + + msc = (void *) ptr; ptr += sizeof(*msc); + msc->dlci = __addr(1, dlci); + msc->v24_sig = v24_sig | 0x01; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_fcoff(struct rfcomm_session *s, int cr) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d", s, cr); + + hdr = (void *) ptr; ptr 
+= sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_FCOFF); + mcc->len = __len8(0); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_fcon(struct rfcomm_session *s, int cr) +{ + struct rfcomm_hdr *hdr; + struct rfcomm_mcc *mcc; + u8 buf[16], *ptr = buf; + + BT_DBG("%p cr %d", s, cr); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = __addr(s->initiator, 0); + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + hdr->len = __len8(sizeof(*mcc)); + + mcc = (void *) ptr; ptr += sizeof(*mcc); + mcc->type = __mcc_type(cr, RFCOMM_FCON); + mcc->len = __len8(0); + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static int rfcomm_send_test(struct rfcomm_session *s, int cr, u8 *pattern, int len) +{ + struct socket *sock = s->sock; + struct kvec iv[3]; + struct msghdr msg; + unsigned char hdr[5], crc[1]; + + if (len > 125) + return -EINVAL; + + BT_DBG("%p cr %d", s, cr); + + hdr[0] = __addr(s->initiator, 0); + hdr[1] = __ctrl(RFCOMM_UIH, 0); + hdr[2] = 0x01 | ((len + 2) << 1); + hdr[3] = 0x01 | ((cr & 0x01) << 1) | (RFCOMM_TEST << 2); + hdr[4] = 0x01 | (len << 1); + + crc[0] = __fcs(hdr); + + iv[0].iov_base = hdr; + iv[0].iov_len = 5; + iv[1].iov_base = pattern; + iv[1].iov_len = len; + iv[2].iov_base = crc; + iv[2].iov_len = 1; + + memset(&msg, 0, sizeof(msg)); + + return kernel_sendmsg(sock, &msg, iv, 3, 6 + len); +} + +static int rfcomm_send_credits(struct rfcomm_session *s, u8 addr, u8 credits) +{ + struct rfcomm_hdr *hdr; + u8 buf[16], *ptr = buf; + + BT_DBG("%p addr %d credits %d", s, addr, credits); + + hdr = (void *) ptr; ptr += sizeof(*hdr); + hdr->addr = addr; + hdr->ctrl = __ctrl(RFCOMM_UIH, 1); + hdr->len = __len8(0); + + *ptr = credits; ptr++; + + *ptr = __fcs(buf); ptr++; + + return rfcomm_send_frame(s, buf, ptr - buf); +} + +static void rfcomm_make_uih(struct sk_buff *skb, u8 addr) +{ + struct rfcomm_hdr *hdr; + int len = skb->len; + u8 *crc; + + if (len > 127) { + hdr = (void *) skb_push(skb, 4); + put_unaligned(htobs(__len16(len)), (u16 *) &hdr->len); + } else { + hdr = (void *) skb_push(skb, 3); + hdr->len = __len8(len); + } + hdr->addr = addr; + hdr->ctrl = __ctrl(RFCOMM_UIH, 0); + + crc = skb_put(skb, 1); + *crc = __fcs((void *) hdr); +} + +/* ---- RFCOMM frame reception ---- */ +static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) +{ + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (dlci) { + /* Data channel */ + struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); + if (!d) { + rfcomm_send_dm(s, dlci); + return 0; + } + + switch (d->state) { + case BT_CONNECT: + rfcomm_dlc_clear_timer(d); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECTED; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + + rfcomm_send_msc(s, 1, dlci, d->v24_sig); + break; + + case BT_DISCONN: + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, 0); + break; + } + } else { + /* Control channel */ + switch (s->state) { + case BT_CONNECT: + s->state = BT_CONNECTED; + rfcomm_process_connect(s); + break; + } + } + return 0; +} + +static int rfcomm_recv_dm(struct rfcomm_session *s, u8 dlci) +{ + int err = 0; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (dlci) { + /* Data DLC */ + struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); + if (d) { + if (d->state == BT_CONNECT || d->state == BT_CONFIG) + err = ECONNREFUSED; + else + err = 
ECONNRESET; + + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, err); + } + } else { + if (s->state == BT_CONNECT) + err = ECONNREFUSED; + else + err = ECONNRESET; + + s->state = BT_CLOSED; + rfcomm_session_close(s, err); + } + return 0; +} + +static int rfcomm_recv_disc(struct rfcomm_session *s, u8 dlci) +{ + int err = 0; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (dlci) { + struct rfcomm_dlc *d = rfcomm_dlc_get(s, dlci); + if (d) { + rfcomm_send_ua(s, dlci); + + if (d->state == BT_CONNECT || d->state == BT_CONFIG) + err = ECONNREFUSED; + else + err = ECONNRESET; + + d->state = BT_CLOSED; + __rfcomm_dlc_close(d, err); + } else + rfcomm_send_dm(s, dlci); + + } else { + rfcomm_send_ua(s, 0); + + if (s->state == BT_CONNECT) + err = ECONNREFUSED; + else + err = ECONNRESET; + + s->state = BT_CLOSED; + rfcomm_session_close(s, err); + } + + return 0; +} + +static inline int rfcomm_check_link_mode(struct rfcomm_dlc *d) +{ + struct sock *sk = d->session->sock->sk; + + if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) { + if (!hci_conn_encrypt(l2cap_pi(sk)->conn->hcon)) + return 1; + } else if (d->link_mode & RFCOMM_LM_AUTH) { + if (!hci_conn_auth(l2cap_pi(sk)->conn->hcon)) + return 1; + } + + return 0; +} + +static void rfcomm_dlc_accept(struct rfcomm_dlc *d) +{ + BT_DBG("dlc %p", d); + + rfcomm_send_ua(d->session, d->dlci); + + rfcomm_dlc_lock(d); + d->state = BT_CONNECTED; + d->state_change(d, 0); + rfcomm_dlc_unlock(d); + + rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); +} + +static int rfcomm_recv_sabm(struct rfcomm_session *s, u8 dlci) +{ + struct rfcomm_dlc *d; + u8 channel; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (!dlci) { + rfcomm_send_ua(s, 0); + + if (s->state == BT_OPEN) { + s->state = BT_CONNECTED; + rfcomm_process_connect(s); + } + return 0; + } + + /* Check if DLC exists */ + d = rfcomm_dlc_get(s, dlci); + if (d) { + if (d->state == BT_OPEN) { + /* DLC was previously opened by PN request */ + if (rfcomm_check_link_mode(d)) { + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + return 0; + } + + rfcomm_dlc_accept(d); + } + return 0; + } + + /* Notify socket layer about incoming connection */ + channel = __srv_channel(dlci); + if (rfcomm_connect_ind(s, channel, &d)) { + d->dlci = dlci; + d->addr = __addr(s->initiator, dlci); + rfcomm_dlc_link(s, d); + + if (rfcomm_check_link_mode(d)) { + set_bit(RFCOMM_AUTH_PENDING, &d->flags); + rfcomm_dlc_set_timer(d, RFCOMM_AUTH_TIMEOUT); + return 0; + } + + rfcomm_dlc_accept(d); + } else { + rfcomm_send_dm(s, dlci); + } + + return 0; +} + +static int rfcomm_apply_pn(struct rfcomm_dlc *d, int cr, struct rfcomm_pn *pn) +{ + struct rfcomm_session *s = d->session; + + BT_DBG("dlc %p state %ld dlci %d mtu %d fc 0x%x credits %d", + d, d->state, d->dlci, pn->mtu, pn->flow_ctrl, pn->credits); + + if (pn->flow_ctrl == 0xf0 || pn->flow_ctrl == 0xe0) { + d->cfc = s->cfc = RFCOMM_CFC_ENABLED; + d->tx_credits = pn->credits; + } else { + d->cfc = s->cfc = RFCOMM_CFC_DISABLED; + set_bit(RFCOMM_TX_THROTTLED, &d->flags); + } + + d->priority = pn->priority; + + d->mtu = s->mtu = btohs(pn->mtu); + + return 0; +} + +static int rfcomm_recv_pn(struct rfcomm_session *s, int cr, struct sk_buff *skb) +{ + struct rfcomm_pn *pn = (void *) skb->data; + struct rfcomm_dlc *d; + u8 dlci = pn->dlci; + + BT_DBG("session %p state %ld dlci %d", s, s->state, dlci); + + if (!dlci) + return 0; + + d = rfcomm_dlc_get(s, dlci); + if (d) { + if (cr) { + /* PN request */ + 
rfcomm_apply_pn(d, cr, pn); + rfcomm_send_pn(s, 0, d); + } else { + /* PN response */ + switch (d->state) { + case BT_CONFIG: + rfcomm_apply_pn(d, cr, pn); + + d->state = BT_CONNECT; + rfcomm_send_sabm(s, d->dlci); + break; + } + } + } else { + u8 channel = __srv_channel(dlci); + + if (!cr) + return 0; + + /* PN request for non existing DLC. + * Assume incoming connection. */ + if (rfcomm_connect_ind(s, channel, &d)) { + d->dlci = dlci; + d->addr = __addr(s->initiator, dlci); + rfcomm_dlc_link(s, d); + + rfcomm_apply_pn(d, cr, pn); + + d->state = BT_OPEN; + rfcomm_send_pn(s, 0, d); + } else { + rfcomm_send_dm(s, dlci); + } + } + return 0; +} + +static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_buff *skb) +{ + struct rfcomm_rpn *rpn = (void *) skb->data; + u8 dlci = __get_dlci(rpn->dlci); + + u8 bit_rate = 0; + u8 data_bits = 0; + u8 stop_bits = 0; + u8 parity = 0; + u8 flow_ctrl = 0; + u8 xon_char = 0; + u8 xoff_char = 0; + u16 rpn_mask = RFCOMM_RPN_PM_ALL; + + BT_DBG("dlci %d cr %d len 0x%x bitr 0x%x line 0x%x flow 0x%x xonc 0x%x xoffc 0x%x pm 0x%x", + dlci, cr, len, rpn->bit_rate, rpn->line_settings, rpn->flow_ctrl, + rpn->xon_char, rpn->xoff_char, rpn->param_mask); + + if (!cr) + return 0; + + if (len == 1) { + /* request: return default setting */ + bit_rate = RFCOMM_RPN_BR_115200; + data_bits = RFCOMM_RPN_DATA_8; + stop_bits = RFCOMM_RPN_STOP_1; + parity = RFCOMM_RPN_PARITY_NONE; + flow_ctrl = RFCOMM_RPN_FLOW_NONE; + xon_char = RFCOMM_RPN_XON_CHAR; + xoff_char = RFCOMM_RPN_XOFF_CHAR; + + goto rpn_out; + } + /* check for sane values: ignore/accept bit_rate, 8 bits, 1 stop bit, no parity, + no flow control lines, normal XON/XOFF chars */ + if (rpn->param_mask & RFCOMM_RPN_PM_BITRATE) { + bit_rate = rpn->bit_rate; + if (bit_rate != RFCOMM_RPN_BR_115200) { + BT_DBG("RPN bit rate mismatch 0x%x", bit_rate); + bit_rate = RFCOMM_RPN_BR_115200; + rpn_mask ^= RFCOMM_RPN_PM_BITRATE; + } + } + if (rpn->param_mask & RFCOMM_RPN_PM_DATA) { + data_bits = __get_rpn_data_bits(rpn->line_settings); + if (data_bits != RFCOMM_RPN_DATA_8) { + BT_DBG("RPN data bits mismatch 0x%x", data_bits); + data_bits = RFCOMM_RPN_DATA_8; + rpn_mask ^= RFCOMM_RPN_PM_DATA; + } + } + if (rpn->param_mask & RFCOMM_RPN_PM_STOP) { + stop_bits = __get_rpn_stop_bits(rpn->line_settings); + if (stop_bits != RFCOMM_RPN_STOP_1) { + BT_DBG("RPN stop bits mismatch 0x%x", stop_bits); + stop_bits = RFCOMM_RPN_STOP_1; + rpn_mask ^= RFCOMM_RPN_PM_STOP; + } + } + if (rpn->param_mask & RFCOMM_RPN_PM_PARITY) { + parity = __get_rpn_parity(rpn->line_settings); + if (parity != RFCOMM_RPN_PARITY_NONE) { + BT_DBG("RPN parity mismatch 0x%x", parity); + parity = RFCOMM_RPN_PARITY_NONE; + rpn_mask ^= RFCOMM_RPN_PM_PARITY; + } + } + if (rpn->param_mask & RFCOMM_RPN_PM_FLOW) { + flow_ctrl = rpn->flow_ctrl; + if (flow_ctrl != RFCOMM_RPN_FLOW_NONE) { + BT_DBG("RPN flow ctrl mismatch 0x%x", flow_ctrl); + flow_ctrl = RFCOMM_RPN_FLOW_NONE; + rpn_mask ^= RFCOMM_RPN_PM_FLOW; + } + } + if (rpn->param_mask & RFCOMM_RPN_PM_XON) { + xon_char = rpn->xon_char; + if (xon_char != RFCOMM_RPN_XON_CHAR) { + BT_DBG("RPN XON char mismatch 0x%x", xon_char); + xon_char = RFCOMM_RPN_XON_CHAR; + rpn_mask ^= RFCOMM_RPN_PM_XON; + } + } + if (rpn->param_mask & RFCOMM_RPN_PM_XOFF) { + xoff_char = rpn->xoff_char; + if (xoff_char != RFCOMM_RPN_XOFF_CHAR) { + BT_DBG("RPN XOFF char mismatch 0x%x", xoff_char); + xoff_char = RFCOMM_RPN_XOFF_CHAR; + rpn_mask ^= RFCOMM_RPN_PM_XOFF; + } + } + +rpn_out: + rfcomm_send_rpn(s, 0, dlci, + bit_rate, data_bits, 
stop_bits, parity, flow_ctrl, + xon_char, xoff_char, rpn_mask); + + return 0; +} + +static int rfcomm_recv_rls(struct rfcomm_session *s, int cr, struct sk_buff *skb) +{ + struct rfcomm_rls *rls = (void *) skb->data; + u8 dlci = __get_dlci(rls->dlci); + + BT_DBG("dlci %d cr %d status 0x%x", dlci, cr, rls->status); + + if (!cr) + return 0; + + /* FIXME: We should probably do something with this + information here. But for now it's sufficient just + to reply -- Bluetooth 1.1 says it's mandatory to + recognise and respond to RLS */ + + rfcomm_send_rls(s, 0, dlci, rls->status); + + return 0; +} + +static int rfcomm_recv_msc(struct rfcomm_session *s, int cr, struct sk_buff *skb) +{ + struct rfcomm_msc *msc = (void *) skb->data; + struct rfcomm_dlc *d; + u8 dlci = __get_dlci(msc->dlci); + + BT_DBG("dlci %d cr %d v24 0x%x", dlci, cr, msc->v24_sig); + + d = rfcomm_dlc_get(s, dlci); + if (!d) + return 0; + + if (cr) { + if (msc->v24_sig & RFCOMM_V24_FC && !d->cfc) + set_bit(RFCOMM_TX_THROTTLED, &d->flags); + else + clear_bit(RFCOMM_TX_THROTTLED, &d->flags); + + rfcomm_dlc_lock(d); + if (d->modem_status) + d->modem_status(d, msc->v24_sig); + rfcomm_dlc_unlock(d); + + rfcomm_send_msc(s, 0, dlci, msc->v24_sig); + + d->mscex |= RFCOMM_MSCEX_RX; + } else + d->mscex |= RFCOMM_MSCEX_TX; + + return 0; +} + +static int rfcomm_recv_mcc(struct rfcomm_session *s, struct sk_buff *skb) +{ + struct rfcomm_mcc *mcc = (void *) skb->data; + u8 type, cr, len; + + cr = __test_cr(mcc->type); + type = __get_mcc_type(mcc->type); + len = __get_mcc_len(mcc->len); + + BT_DBG("%p type 0x%x cr %d", s, type, cr); + + skb_pull(skb, 2); + + switch (type) { + case RFCOMM_PN: + rfcomm_recv_pn(s, cr, skb); + break; + + case RFCOMM_RPN: + rfcomm_recv_rpn(s, cr, len, skb); + break; + + case RFCOMM_RLS: + rfcomm_recv_rls(s, cr, skb); + break; + + case RFCOMM_MSC: + rfcomm_recv_msc(s, cr, skb); + break; + + case RFCOMM_FCOFF: + if (cr) { + set_bit(RFCOMM_TX_THROTTLED, &s->flags); + rfcomm_send_fcoff(s, 0); + } + break; + + case RFCOMM_FCON: + if (cr) { + clear_bit(RFCOMM_TX_THROTTLED, &s->flags); + rfcomm_send_fcon(s, 0); + } + break; + + case RFCOMM_TEST: + if (cr) + rfcomm_send_test(s, 0, skb->data, skb->len); + break; + + case RFCOMM_NSC: + break; + + default: + BT_ERR("Unknown control type 0x%02x", type); + rfcomm_send_nsc(s, cr, type); + break; + } + return 0; +} + +static int rfcomm_recv_data(struct rfcomm_session *s, u8 dlci, int pf, struct sk_buff *skb) +{ + struct rfcomm_dlc *d; + + BT_DBG("session %p state %ld dlci %d pf %d", s, s->state, dlci, pf); + + d = rfcomm_dlc_get(s, dlci); + if (!d) { + rfcomm_send_dm(s, dlci); + goto drop; + } + + if (pf && d->cfc) { + u8 credits = *(u8 *) skb->data; skb_pull(skb, 1); + + d->tx_credits += credits; + if (d->tx_credits) + clear_bit(RFCOMM_TX_THROTTLED, &d->flags); + } + + if (skb->len && d->state == BT_CONNECTED) { + rfcomm_dlc_lock(d); + d->rx_credits--; + d->data_ready(d, skb); + rfcomm_dlc_unlock(d); + return 0; + } + +drop: + kfree_skb(skb); + return 0; +} + +static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) +{ + struct rfcomm_hdr *hdr = (void *) skb->data; + u8 type, dlci, fcs; + + dlci = __get_dlci(hdr->addr); + type = __get_type(hdr->ctrl); + + /* Trim FCS */ + skb->len--; skb->tail--; + fcs = *(u8 *) skb->tail; + + if (__check_fcs(skb->data, type, fcs)) { + BT_ERR("bad checksum in packet"); + kfree_skb(skb); + return -EILSEQ; + } + + if (__test_ea(hdr->len)) + skb_pull(skb, 3); + else + skb_pull(skb, 4); + + switch (type) { + case RFCOMM_SABM: + if 
(__test_pf(hdr->ctrl)) + rfcomm_recv_sabm(s, dlci); + break; + + case RFCOMM_DISC: + if (__test_pf(hdr->ctrl)) + rfcomm_recv_disc(s, dlci); + break; + + case RFCOMM_UA: + if (__test_pf(hdr->ctrl)) + rfcomm_recv_ua(s, dlci); + break; + + case RFCOMM_DM: + rfcomm_recv_dm(s, dlci); + break; + + case RFCOMM_UIH: + if (dlci) + return rfcomm_recv_data(s, dlci, __test_pf(hdr->ctrl), skb); + + rfcomm_recv_mcc(s, skb); + break; + + default: + BT_ERR("Unknown packet type 0x%02x\n", type); + break; + } + kfree_skb(skb); + return 0; +} + +/* ---- Connection and data processing ---- */ + +static void rfcomm_process_connect(struct rfcomm_session *s) +{ + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("session %p state %ld", s, s->state); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + if (d->state == BT_CONFIG) { + d->mtu = s->mtu; + rfcomm_send_pn(s, 1, d); + } + } +} + +/* Send data queued for the DLC. + * Return number of frames left in the queue. + */ +static inline int rfcomm_process_tx(struct rfcomm_dlc *d) +{ + struct sk_buff *skb; + int err; + + BT_DBG("dlc %p state %ld cfc %d rx_credits %d tx_credits %d", + d, d->state, d->cfc, d->rx_credits, d->tx_credits); + + /* Send pending MSC */ + if (test_and_clear_bit(RFCOMM_MSC_PENDING, &d->flags)) + rfcomm_send_msc(d->session, 1, d->dlci, d->v24_sig); + + if (d->cfc) { + /* CFC enabled. + * Give them some credits */ + if (!test_bit(RFCOMM_RX_THROTTLED, &d->flags) && + d->rx_credits <= (d->cfc >> 2)) { + rfcomm_send_credits(d->session, d->addr, d->cfc - d->rx_credits); + d->rx_credits = d->cfc; + } + } else { + /* CFC disabled. + * Give ourselves some credits */ + d->tx_credits = 5; + } + + if (test_bit(RFCOMM_TX_THROTTLED, &d->flags)) + return skb_queue_len(&d->tx_queue); + + while (d->tx_credits && (skb = skb_dequeue(&d->tx_queue))) { + err = rfcomm_send_frame(d->session, skb->data, skb->len); + if (err < 0) { + skb_queue_head(&d->tx_queue, skb); + break; + } + kfree_skb(skb); + d->tx_credits--; + } + + if (d->cfc && !d->tx_credits) { + /* We're out of TX credits. + * Set TX_THROTTLED flag to avoid unnesary wakeups by dlc_send. 
*/ + set_bit(RFCOMM_TX_THROTTLED, &d->flags); + } + + return skb_queue_len(&d->tx_queue); +} + +static inline void rfcomm_process_dlcs(struct rfcomm_session *s) +{ + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("session %p state %ld", s, s->state); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + + if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) { + __rfcomm_dlc_close(d, ETIMEDOUT); + continue; + } + + if (test_and_clear_bit(RFCOMM_AUTH_ACCEPT, &d->flags)) { + rfcomm_dlc_clear_timer(d); + rfcomm_dlc_accept(d); + if (d->link_mode & RFCOMM_LM_SECURE) { + struct sock *sk = s->sock->sk; + hci_conn_change_link_key(l2cap_pi(sk)->conn->hcon); + } + continue; + } else if (test_and_clear_bit(RFCOMM_AUTH_REJECT, &d->flags)) { + rfcomm_dlc_clear_timer(d); + rfcomm_send_dm(s, d->dlci); + __rfcomm_dlc_close(d, ECONNREFUSED); + continue; + } + + if (test_bit(RFCOMM_TX_THROTTLED, &s->flags)) + continue; + + if ((d->state == BT_CONNECTED || d->state == BT_DISCONN) && + d->mscex == RFCOMM_MSCEX_OK) + rfcomm_process_tx(d); + } +} + +static inline void rfcomm_process_rx(struct rfcomm_session *s) +{ + struct socket *sock = s->sock; + struct sock *sk = sock->sk; + struct sk_buff *skb; + + BT_DBG("session %p state %ld qlen %d", s, s->state, skb_queue_len(&sk->sk_receive_queue)); + + /* Get data directly from socket receive queue without copying it. */ + while ((skb = skb_dequeue(&sk->sk_receive_queue))) { + skb_orphan(skb); + rfcomm_recv_frame(s, skb); + } + + if (sk->sk_state == BT_CLOSED) { + if (!s->initiator) + rfcomm_session_put(s); + + rfcomm_session_close(s, sk->sk_err); + } +} + +static inline void rfcomm_accept_connection(struct rfcomm_session *s) +{ + struct socket *sock = s->sock, *nsock; + int err; + + /* Fast check for a new connection. + * Avoids unnesesary socket allocations. */ + if (list_empty(&bt_sk(sock->sk)->accept_q)) + return; + + BT_DBG("session %p", s); + + if (sock_create_lite(PF_BLUETOOTH, sock->type, BTPROTO_L2CAP, &nsock)) + return; + + nsock->ops = sock->ops; + + __module_get(nsock->ops->owner); + + err = sock->ops->accept(sock, nsock, O_NONBLOCK); + if (err < 0) { + sock_release(nsock); + return; + } + + /* Set our callbacks */ + nsock->sk->sk_data_ready = rfcomm_l2data_ready; + nsock->sk->sk_state_change = rfcomm_l2state_change; + + s = rfcomm_session_add(nsock, BT_OPEN); + if (s) { + rfcomm_session_hold(s); + rfcomm_schedule(RFCOMM_SCHED_RX); + } else + sock_release(nsock); +} + +static inline void rfcomm_check_connection(struct rfcomm_session *s) +{ + struct sock *sk = s->sock->sk; + + BT_DBG("%p state %ld", s, s->state); + + switch(sk->sk_state) { + case BT_CONNECTED: + s->state = BT_CONNECT; + + /* We can adjust MTU on outgoing sessions. + * L2CAP MTU minus UIH header and FCS. 
*/ + s->mtu = min(l2cap_pi(sk)->omtu, l2cap_pi(sk)->imtu) - 5; + + rfcomm_send_sabm(s, 0); + break; + + case BT_CLOSED: + s->state = BT_CLOSED; + rfcomm_session_close(s, sk->sk_err); + break; + } +} + +static inline void rfcomm_process_sessions(void) +{ + struct list_head *p, *n; + + rfcomm_lock(); + + list_for_each_safe(p, n, &session_list) { + struct rfcomm_session *s; + s = list_entry(p, struct rfcomm_session, list); + + if (s->state == BT_LISTEN) { + rfcomm_accept_connection(s); + continue; + } + + rfcomm_session_hold(s); + + switch (s->state) { + case BT_BOUND: + rfcomm_check_connection(s); + break; + + default: + rfcomm_process_rx(s); + break; + } + + rfcomm_process_dlcs(s); + + rfcomm_session_put(s); + } + + rfcomm_unlock(); +} + +static void rfcomm_worker(void) +{ + BT_DBG(""); + + while (!atomic_read(&terminate)) { + if (!test_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event)) { + /* No pending events. Let's sleep. + * Incoming connections and data will wake us up. */ + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } + + /* Process stuff */ + clear_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event); + rfcomm_process_sessions(); + } + set_current_state(TASK_RUNNING); + return; +} + +static int rfcomm_add_listener(bdaddr_t *ba) +{ + struct sockaddr_l2 addr; + struct socket *sock; + struct sock *sk; + struct rfcomm_session *s; + int err = 0; + + /* Create socket */ + err = rfcomm_l2sock_create(&sock); + if (err < 0) { + BT_ERR("Create socket failed %d", err); + return err; + } + + /* Bind socket */ + bacpy(&addr.l2_bdaddr, ba); + addr.l2_family = AF_BLUETOOTH; + addr.l2_psm = htobs(RFCOMM_PSM); + err = sock->ops->bind(sock, (struct sockaddr *) &addr, sizeof(addr)); + if (err < 0) { + BT_ERR("Bind failed %d", err); + goto failed; + } + + /* Set L2CAP options */ + sk = sock->sk; + lock_sock(sk); + l2cap_pi(sk)->imtu = RFCOMM_MAX_L2CAP_MTU; + release_sock(sk); + + /* Start listening on the socket */ + err = sock->ops->listen(sock, 10); + if (err) { + BT_ERR("Listen failed %d", err); + goto failed; + } + + /* Add listening session */ + s = rfcomm_session_add(sock, BT_LISTEN); + if (!s) + goto failed; + + rfcomm_session_hold(s); + return 0; +failed: + sock_release(sock); + return err; +} + +static void rfcomm_kill_listener(void) +{ + struct rfcomm_session *s; + struct list_head *p, *n; + + BT_DBG(""); + + list_for_each_safe(p, n, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + rfcomm_session_del(s); + } +} + +static int rfcomm_run(void *unused) +{ + rfcomm_thread = current; + + atomic_inc(&running); + + daemonize("krfcommd"); + set_user_nice(current, -10); + current->flags |= PF_NOFREEZE; + + BT_DBG(""); + + rfcomm_add_listener(BDADDR_ANY); + + rfcomm_worker(); + + rfcomm_kill_listener(); + + atomic_dec(&running); + return 0; +} + +static void rfcomm_auth_cfm(struct hci_conn *conn, u8 status) +{ + struct rfcomm_session *s; + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("conn %p status 0x%02x", conn, status); + + s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst); + if (!s) + return; + + rfcomm_session_hold(s); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + + if (d->link_mode & (RFCOMM_LM_ENCRYPT | RFCOMM_LM_SECURE)) + continue; + + if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) + continue; + + if (!status) + set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); + else + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + } + + rfcomm_session_put(s); + + rfcomm_schedule(RFCOMM_SCHED_AUTH); +} + +static void rfcomm_encrypt_cfm(struct 
hci_conn *conn, u8 status, u8 encrypt) +{ + struct rfcomm_session *s; + struct rfcomm_dlc *d; + struct list_head *p, *n; + + BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt); + + s = rfcomm_session_get(&conn->hdev->bdaddr, &conn->dst); + if (!s) + return; + + rfcomm_session_hold(s); + + list_for_each_safe(p, n, &s->dlcs) { + d = list_entry(p, struct rfcomm_dlc, list); + + if (!test_and_clear_bit(RFCOMM_AUTH_PENDING, &d->flags)) + continue; + + if (!status && encrypt) + set_bit(RFCOMM_AUTH_ACCEPT, &d->flags); + else + set_bit(RFCOMM_AUTH_REJECT, &d->flags); + } + + rfcomm_session_put(s); + + rfcomm_schedule(RFCOMM_SCHED_AUTH); +} + +static struct hci_cb rfcomm_cb = { + .name = "RFCOMM", + .auth_cfm = rfcomm_auth_cfm, + .encrypt_cfm = rfcomm_encrypt_cfm +}; + +/* ---- Proc fs support ---- */ +#ifdef CONFIG_PROC_FS +static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct rfcomm_session *s; + struct list_head *pp, *p; + loff_t l = *pos; + + rfcomm_lock(); + + list_for_each(p, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + list_for_each(pp, &s->dlcs) + if (!l--) { + seq->private = s; + return pp; + } + } + return NULL; +} + +static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) +{ + struct rfcomm_session *s = seq->private; + struct list_head *pp, *p = e; + (*pos)++; + + if (p->next != &s->dlcs) + return p->next; + + list_for_each(p, &session_list) { + s = list_entry(p, struct rfcomm_session, list); + __list_for_each(pp, &s->dlcs) { + seq->private = s; + return pp; + } + } + return NULL; +} + +static void rfcomm_seq_stop(struct seq_file *seq, void *e) +{ + rfcomm_unlock(); +} + +static int rfcomm_seq_show(struct seq_file *seq, void *e) +{ + struct rfcomm_session *s = seq->private; + struct sock *sk = s->sock->sk; + struct rfcomm_dlc *d = list_entry(e, struct rfcomm_dlc, list); + + seq_printf(seq, "%s %s %ld %d %d %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + d->state, d->dlci, d->mtu, d->rx_credits, d->tx_credits); + return 0; +} + +static struct seq_operations rfcomm_seq_ops = { + .start = rfcomm_seq_start, + .next = rfcomm_seq_next, + .stop = rfcomm_seq_stop, + .show = rfcomm_seq_show +}; + +static int rfcomm_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rfcomm_seq_ops); +} + +static struct file_operations rfcomm_seq_fops = { + .owner = THIS_MODULE, + .open = rfcomm_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init rfcomm_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_bt_rfcomm = proc_mkdir("rfcomm", proc_bt); + if (proc_bt_rfcomm) { + proc_bt_rfcomm->owner = THIS_MODULE; + + p = create_proc_entry("dlc", S_IRUGO, proc_bt_rfcomm); + if (p) + p->proc_fops = &rfcomm_seq_fops; + } + return 0; +} + +static void __exit rfcomm_proc_cleanup(void) +{ + remove_proc_entry("dlc", proc_bt_rfcomm); + + remove_proc_entry("rfcomm", proc_bt); +} + +#else /* CONFIG_PROC_FS */ + +static int __init rfcomm_proc_init(void) +{ + return 0; +} + +static void __exit rfcomm_proc_cleanup(void) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +/* ---- Initialization ---- */ +static int __init rfcomm_init(void) +{ + l2cap_load(); + + hci_register_cb(&rfcomm_cb); + + kernel_thread(rfcomm_run, NULL, CLONE_KERNEL); + + BT_INFO("RFCOMM ver %s", VERSION); + + rfcomm_proc_init(); + + rfcomm_init_sockets(); + +#ifdef CONFIG_BT_RFCOMM_TTY + rfcomm_init_ttys(); +#endif + + return 0; +} + +static void __exit rfcomm_exit(void) +{ + 
	hci_unregister_cb(&rfcomm_cb);
+
+	/* Terminate working thread.
+	 * i.e. Set terminate flag and wake it up */
+	atomic_inc(&terminate);
+	rfcomm_schedule(RFCOMM_SCHED_STATE);
+
+	/* Wait until the worker thread has stopped running */
+	while (atomic_read(&running))
+		schedule();
+
+#ifdef CONFIG_BT_RFCOMM_TTY
+	rfcomm_cleanup_ttys();
+#endif
+
+	rfcomm_cleanup_sockets();
+
+	rfcomm_proc_cleanup();
+}
+
+module_init(rfcomm_init);
+module_exit(rfcomm_exit);
+
+MODULE_AUTHOR("Maxim Krasnyansky , Marcel Holtmann ");
+MODULE_DESCRIPTION("Bluetooth RFCOMM ver " VERSION);
+MODULE_VERSION(VERSION);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("bt-proto-3");
diff --git a/net/bluetooth/rfcomm/crc.c b/net/bluetooth/rfcomm/crc.c
new file mode 100644
index 000000000000..1011bc4a8692
--- /dev/null
+++ b/net/bluetooth/rfcomm/crc.c
@@ -0,0 +1,71 @@
+/*
+   RFCOMM implementation for Linux Bluetooth stack (BlueZ).
+   Copyright (C) 2002 Maxim Krasnyansky
+   Copyright (C) 2002 Marcel Holtmann
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License version 2 as
+   published by the Free Software Foundation;
+
+   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+   OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+   FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+   IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY
+   CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES
+   WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+   ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+   OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+   ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS,
+   COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS
+   SOFTWARE IS DISCLAIMED.
+*/
+
+/*
+ * RFCOMM FCS calculation.
+ * + * $Id: crc.c,v 1.2 2002/09/21 09:54:32 holtmann Exp $ + */ + +/* reversed, 8-bit, poly=0x07 */ +unsigned char rfcomm_crc_table[256] = { + 0x00, 0x91, 0xe3, 0x72, 0x07, 0x96, 0xe4, 0x75, + 0x0e, 0x9f, 0xed, 0x7c, 0x09, 0x98, 0xea, 0x7b, + 0x1c, 0x8d, 0xff, 0x6e, 0x1b, 0x8a, 0xf8, 0x69, + 0x12, 0x83, 0xf1, 0x60, 0x15, 0x84, 0xf6, 0x67, + + 0x38, 0xa9, 0xdb, 0x4a, 0x3f, 0xae, 0xdc, 0x4d, + 0x36, 0xa7, 0xd5, 0x44, 0x31, 0xa0, 0xd2, 0x43, + 0x24, 0xb5, 0xc7, 0x56, 0x23, 0xb2, 0xc0, 0x51, + 0x2a, 0xbb, 0xc9, 0x58, 0x2d, 0xbc, 0xce, 0x5f, + + 0x70, 0xe1, 0x93, 0x02, 0x77, 0xe6, 0x94, 0x05, + 0x7e, 0xef, 0x9d, 0x0c, 0x79, 0xe8, 0x9a, 0x0b, + 0x6c, 0xfd, 0x8f, 0x1e, 0x6b, 0xfa, 0x88, 0x19, + 0x62, 0xf3, 0x81, 0x10, 0x65, 0xf4, 0x86, 0x17, + + 0x48, 0xd9, 0xab, 0x3a, 0x4f, 0xde, 0xac, 0x3d, + 0x46, 0xd7, 0xa5, 0x34, 0x41, 0xd0, 0xa2, 0x33, + 0x54, 0xc5, 0xb7, 0x26, 0x53, 0xc2, 0xb0, 0x21, + 0x5a, 0xcb, 0xb9, 0x28, 0x5d, 0xcc, 0xbe, 0x2f, + + 0xe0, 0x71, 0x03, 0x92, 0xe7, 0x76, 0x04, 0x95, + 0xee, 0x7f, 0x0d, 0x9c, 0xe9, 0x78, 0x0a, 0x9b, + 0xfc, 0x6d, 0x1f, 0x8e, 0xfb, 0x6a, 0x18, 0x89, + 0xf2, 0x63, 0x11, 0x80, 0xf5, 0x64, 0x16, 0x87, + + 0xd8, 0x49, 0x3b, 0xaa, 0xdf, 0x4e, 0x3c, 0xad, + 0xd6, 0x47, 0x35, 0xa4, 0xd1, 0x40, 0x32, 0xa3, + 0xc4, 0x55, 0x27, 0xb6, 0xc3, 0x52, 0x20, 0xb1, + 0xca, 0x5b, 0x29, 0xb8, 0xcd, 0x5c, 0x2e, 0xbf, + + 0x90, 0x01, 0x73, 0xe2, 0x97, 0x06, 0x74, 0xe5, + 0x9e, 0x0f, 0x7d, 0xec, 0x99, 0x08, 0x7a, 0xeb, + 0x8c, 0x1d, 0x6f, 0xfe, 0x8b, 0x1a, 0x68, 0xf9, + 0x82, 0x13, 0x61, 0xf0, 0x85, 0x14, 0x66, 0xf7, + + 0xa8, 0x39, 0x4b, 0xda, 0xaf, 0x3e, 0x4c, 0xdd, + 0xa6, 0x37, 0x45, 0xd4, 0xa1, 0x30, 0x42, 0xd3, + 0xb4, 0x25, 0x57, 0xc6, 0xb3, 0x22, 0x50, 0xc1, + 0xba, 0x2b, 0x59, 0xc8, 0xbd, 0x2c, 0x5e, 0xcf +}; diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c new file mode 100644 index 000000000000..640028a2183c --- /dev/null +++ b/net/bluetooth/rfcomm/sock.c @@ -0,0 +1,1010 @@ +/* + RFCOMM implementation for Linux Bluetooth stack (BlueZ). + Copyright (C) 2002 Maxim Krasnyansky + Copyright (C) 2002 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + * RFCOMM sockets. + * + * $Id: sock.c,v 1.24 2002/10/03 01:00:34 maxk Exp $ + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#ifndef CONFIG_BT_RFCOMM_DEBUG +#undef BT_DBG +#define BT_DBG(D...) 
+#endif + +static struct proto_ops rfcomm_sock_ops; + +static struct bt_sock_list rfcomm_sk_list = { + .lock = RW_LOCK_UNLOCKED +}; + +static void rfcomm_sock_close(struct sock *sk); +static void rfcomm_sock_kill(struct sock *sk); + +/* ---- DLC callbacks ---- + * + * called under rfcomm_dlc_lock() + */ +static void rfcomm_sk_data_ready(struct rfcomm_dlc *d, struct sk_buff *skb) +{ + struct sock *sk = d->owner; + if (!sk) + return; + + atomic_add(skb->len, &sk->sk_rmem_alloc); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + rfcomm_dlc_throttle(d); +} + +static void rfcomm_sk_state_change(struct rfcomm_dlc *d, int err) +{ + struct sock *sk = d->owner, *parent; + if (!sk) + return; + + BT_DBG("dlc %p state %ld err %d", d, d->state, err); + + bh_lock_sock(sk); + + if (err) + sk->sk_err = err; + + sk->sk_state = d->state; + + parent = bt_sk(sk)->parent; + if (parent) { + if (d->state == BT_CLOSED) { + sock_set_flag(sk, SOCK_ZAPPED); + bt_accept_unlink(sk); + } + parent->sk_data_ready(parent, 0); + } else { + if (d->state == BT_CONNECTED) + rfcomm_session_getaddr(d->session, &bt_sk(sk)->src, NULL); + sk->sk_state_change(sk); + } + + bh_unlock_sock(sk); + + if (parent && sock_flag(sk, SOCK_ZAPPED)) { + /* We have to drop DLC lock here, otherwise + * rfcomm_sock_destruct() will dead lock. */ + rfcomm_dlc_unlock(d); + rfcomm_sock_kill(sk); + rfcomm_dlc_lock(d); + } +} + +/* ---- Socket functions ---- */ +static struct sock *__rfcomm_get_sock_by_addr(u8 channel, bdaddr_t *src) +{ + struct sock *sk = NULL; + struct hlist_node *node; + + sk_for_each(sk, node, &rfcomm_sk_list.head) { + if (rfcomm_pi(sk)->channel == channel && + !bacmp(&bt_sk(sk)->src, src)) + break; + } + + return node ? sk : NULL; +} + +/* Find socket with channel and source bdaddr. + * Returns closest match. + */ +static struct sock *__rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src) +{ + struct sock *sk = NULL, *sk1 = NULL; + struct hlist_node *node; + + sk_for_each(sk, node, &rfcomm_sk_list.head) { + if (state && sk->sk_state != state) + continue; + + if (rfcomm_pi(sk)->channel == channel) { + /* Exact match. */ + if (!bacmp(&bt_sk(sk)->src, src)) + break; + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + sk1 = sk; + } + } + return node ? sk : sk1; +} + +/* Find socket with given address (channel, src). 
+ * Returns locked socket */ +static inline struct sock *rfcomm_get_sock_by_channel(int state, u8 channel, bdaddr_t *src) +{ + struct sock *s; + read_lock(&rfcomm_sk_list.lock); + s = __rfcomm_get_sock_by_channel(state, channel, src); + if (s) bh_lock_sock(s); + read_unlock(&rfcomm_sk_list.lock); + return s; +} + +static void rfcomm_sock_destruct(struct sock *sk) +{ + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + + BT_DBG("sk %p dlc %p", sk, d); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + + rfcomm_dlc_lock(d); + rfcomm_pi(sk)->dlc = NULL; + + /* Detach DLC if it's owned by this socket */ + if (d->owner == sk) + d->owner = NULL; + rfcomm_dlc_unlock(d); + + rfcomm_dlc_put(d); +} + +static void rfcomm_sock_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + BT_DBG("parent %p", parent); + + /* Close not yet accepted dlcs */ + while ((sk = bt_accept_dequeue(parent, NULL))) { + rfcomm_sock_close(sk); + rfcomm_sock_kill(sk); + } + + parent->sk_state = BT_CLOSED; + sock_set_flag(parent, SOCK_ZAPPED); +} + +/* Kill socket (only if zapped and orphan) + * Must be called on unlocked socket. + */ +static void rfcomm_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + BT_DBG("sk %p state %d refcnt %d", sk, sk->sk_state, atomic_read(&sk->sk_refcnt)); + + /* Kill poor orphan */ + bt_sock_unlink(&rfcomm_sk_list, sk); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +static void __rfcomm_sock_close(struct sock *sk) +{ + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + + BT_DBG("sk %p state %d socket %p", sk, sk->sk_state, sk->sk_socket); + + switch (sk->sk_state) { + case BT_LISTEN: + rfcomm_sock_cleanup_listen(sk); + break; + + case BT_CONNECT: + case BT_CONNECT2: + case BT_CONFIG: + case BT_CONNECTED: + rfcomm_dlc_close(d, 0); + + default: + sock_set_flag(sk, SOCK_ZAPPED); + break; + } +} + +/* Close socket. + * Must be called on unlocked socket. 
+ */ +static void rfcomm_sock_close(struct sock *sk) +{ + lock_sock(sk); + __rfcomm_sock_close(sk); + release_sock(sk); +} + +static void rfcomm_sock_init(struct sock *sk, struct sock *parent) +{ + struct rfcomm_pinfo *pi = rfcomm_pi(sk); + + BT_DBG("sk %p", sk); + + if (parent) { + sk->sk_type = parent->sk_type; + pi->link_mode = rfcomm_pi(parent)->link_mode; + } else { + pi->link_mode = 0; + } + + pi->dlc->link_mode = pi->link_mode; +} + +static struct proto rfcomm_proto = { + .name = "RFCOMM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rfcomm_pinfo) +}; + +static struct sock *rfcomm_sock_alloc(struct socket *sock, int proto, int prio) +{ + struct rfcomm_dlc *d; + struct sock *sk; + + sk = sk_alloc(PF_BLUETOOTH, prio, &rfcomm_proto, 1); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + d = rfcomm_dlc_alloc(prio); + if (!d) { + sk_free(sk); + return NULL; + } + + d->data_ready = rfcomm_sk_data_ready; + d->state_change = rfcomm_sk_state_change; + + rfcomm_pi(sk)->dlc = d; + d->owner = sk; + + sk->sk_destruct = rfcomm_sock_destruct; + sk->sk_sndtimeo = RFCOMM_CONN_TIMEOUT; + + sk->sk_sndbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; + sk->sk_rcvbuf = RFCOMM_MAX_CREDITS * RFCOMM_DEFAULT_MTU * 10; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + bt_sock_link(&rfcomm_sk_list, sk); + + BT_DBG("sk %p", sk); + return sk; +} + +static int rfcomm_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_STREAM && sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sock->ops = &rfcomm_sock_ops; + + if (!(sk = rfcomm_sock_alloc(sock, protocol, GFP_KERNEL))) + return -ENOMEM; + + rfcomm_sock_init(sk, NULL); + return 0; +} + +static int rfcomm_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p %s", sk, batostr(&sa->rc_bdaddr)); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN) { + err = -EBADFD; + goto done; + } + + write_lock_bh(&rfcomm_sk_list.lock); + + if (sa->rc_channel && __rfcomm_get_sock_by_addr(sa->rc_channel, &sa->rc_bdaddr)) { + err = -EADDRINUSE; + } else { + /* Save source address */ + bacpy(&bt_sk(sk)->src, &sa->rc_bdaddr); + rfcomm_pi(sk)->channel = sa->rc_channel; + sk->sk_state = BT_BOUND; + } + + write_unlock_bh(&rfcomm_sk_list.lock); + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +{ + struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + int err = 0; + + BT_DBG("sk %p", sk); + + if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_rc)) + return -EINVAL; + + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) + return -EBADFD; + + if (sk->sk_type != SOCK_STREAM) + return -EINVAL; + + lock_sock(sk); + + sk->sk_state = BT_CONNECT; + bacpy(&bt_sk(sk)->dst, &sa->rc_bdaddr); + rfcomm_pi(sk)->channel = sa->rc_channel; + + err = rfcomm_dlc_open(d, &bt_sk(sk)->src, &sa->rc_bdaddr, sa->rc_channel); + if (!err) + err = bt_sock_wait_state(sk, BT_CONNECTED, + sock_sndtimeo(sk, flags & O_NONBLOCK)); + + release_sock(sk); + return err; +} + +static int rfcomm_sock_listen(struct socket 
*sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p backlog %d", sk, backlog); + + lock_sock(sk); + + if (sk->sk_state != BT_BOUND) { + err = -EBADFD; + goto done; + } + + if (!rfcomm_pi(sk)->channel) { + bdaddr_t *src = &bt_sk(sk)->src; + u8 channel; + + err = -EINVAL; + + write_lock_bh(&rfcomm_sk_list.lock); + + for (channel = 1; channel < 31; channel++) + if (!__rfcomm_get_sock_by_addr(channel, src)) { + rfcomm_pi(sk)->channel = channel; + err = 0; + break; + } + + write_unlock_bh(&rfcomm_sk_list.lock); + + if (err < 0) + goto done; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = BT_LISTEN; + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *nsk; + long timeo; + int err = 0; + + lock_sock(sk); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + goto done; + } + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + BT_DBG("sk %p timeo %ld", sk, timeo); + + /* Wait for an incoming connection. (wake-one). */ + add_wait_queue_exclusive(sk->sk_sleep, &wait); + while (!(nsk = bt_accept_dequeue(sk, newsock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + err = -EAGAIN; + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + + BT_DBG("new socket %p", nsk); + +done: + release_sock(sk); + return err; +} + +static int rfcomm_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +{ + struct sockaddr_rc *sa = (struct sockaddr_rc *) addr; + struct sock *sk = sock->sk; + + BT_DBG("sock %p, sk %p", sock, sk); + + sa->rc_family = AF_BLUETOOTH; + sa->rc_channel = rfcomm_pi(sk)->channel; + if (peer) + bacpy(&sa->rc_bdaddr, &bt_sk(sk)->dst); + else + bacpy(&sa->rc_bdaddr, &bt_sk(sk)->src); + + *len = sizeof(struct sockaddr_rc); + return 0; +} + +static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; + struct sk_buff *skb; + int err; + int sent = 0; + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + if (sk->sk_shutdown & SEND_SHUTDOWN) + return -EPIPE; + + BT_DBG("sock %p, sk %p", sock, sk); + + lock_sock(sk); + + while (len) { + size_t size = min_t(size_t, len, d->mtu); + + skb = sock_alloc_send_skb(sk, size + RFCOMM_SKB_RESERVE, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + break; + skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); + + err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); + if (err) { + kfree_skb(skb); + sent = err; + break; + } + + err = rfcomm_dlc_send(d, skb); + if (err < 0) { + kfree_skb(skb); + break; + } + + sent += size; + len -= size; + } + + release_sock(sk); + + return sent ? 
sent : err; +} + +static long rfcomm_sock_data_wait(struct sock *sk, long timeo) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (skb_queue_len(&sk->sk_receive_queue) || sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current) || !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return timeo; +} + +static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int err = 0; + size_t target, copied = 0; + long timeo; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + msg->msg_namelen = 0; + + BT_DBG("sk %p size %d", sk, size); + + lock_sock(sk); + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + struct sk_buff *skb; + int chunk; + + skb = skb_dequeue(&sk->sk_receive_queue); + if (!skb) { + if (copied >= target) + break; + + if ((err = sock_error(sk)) != 0) + break; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + err = -EAGAIN; + if (!timeo) + break; + + timeo = rfcomm_sock_data_wait(sk, timeo); + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + continue; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (!copied) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + if (!(flags & MSG_PEEK)) { + atomic_sub(chunk, &sk->sk_rmem_alloc); + + skb_pull(skb, chunk); + if (skb->len) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + kfree_skb(skb); + + } else { + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + +out: + if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2)) + rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc); + + release_sock(sk); + return copied ? 
: err; +} + +static int rfcomm_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + int err = 0; + u32 opt; + + BT_DBG("sk %p", sk); + + lock_sock(sk); + + switch (optname) { + case RFCOMM_LM: + if (get_user(opt, (u32 __user *) optval)) { + err = -EFAULT; + break; + } + + rfcomm_pi(sk)->link_mode = opt; + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct sock *l2cap_sk; + struct rfcomm_conninfo cinfo; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (get_user(len, optlen)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case RFCOMM_LM: + if (put_user(rfcomm_pi(sk)->link_mode, (u32 __user *) optval)) + err = -EFAULT; + break; + + case RFCOMM_CONNINFO: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + l2cap_sk = rfcomm_pi(sk)->dlc->session->sock->sk; + + cinfo.hci_handle = l2cap_pi(l2cap_sk)->conn->hcon->handle; + memcpy(cinfo.dev_class, l2cap_pi(l2cap_sk)->conn->hcon->dev_class, 3); + + len = min_t(unsigned int, len, sizeof(cinfo)); + if (copy_to_user(optval, (char *) &cinfo, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + +#ifdef CONFIG_BT_RFCOMM_TTY + err = rfcomm_dev_ioctl(sk, cmd, (void __user *)arg); +#else + err = -EOPNOTSUPP; +#endif + + release_sock(sk); + return err; +} + +static int rfcomm_sock_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) return 0; + + lock_sock(sk); + if (!sk->sk_shutdown) { + sk->sk_shutdown = SHUTDOWN_MASK; + __rfcomm_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + } + release_sock(sk); + return err; +} + +static int rfcomm_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + err = rfcomm_sock_shutdown(sock, 2); + + sock_orphan(sk); + rfcomm_sock_kill(sk); + return err; +} + +/* ---- RFCOMM core layer callbacks ---- + * + * called under rfcomm_lock() + */ +int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc **d) +{ + struct sock *sk, *parent; + bdaddr_t src, dst; + int result = 0; + + BT_DBG("session %p channel %d", s, channel); + + rfcomm_session_getaddr(s, &src, &dst); + + /* Check if we have socket listening on channel */ + parent = rfcomm_get_sock_by_channel(BT_LISTEN, channel, &src); + if (!parent) + return 0; + + /* Check for backlog size */ + if (sk_acceptq_is_full(parent)) { + BT_DBG("backlog full %d", parent->sk_ack_backlog); + goto done; + } + + sk = rfcomm_sock_alloc(NULL, BTPROTO_RFCOMM, GFP_ATOMIC); + if (!sk) + goto done; + + rfcomm_sock_init(sk, parent); + bacpy(&bt_sk(sk)->src, &src); + bacpy(&bt_sk(sk)->dst, &dst); + rfcomm_pi(sk)->channel = channel; + + sk->sk_state = BT_CONFIG; + bt_accept_enqueue(parent, sk); + + /* Accept connection and return socket DLC */ + *d = rfcomm_pi(sk)->dlc; + result = 1; + +done: + bh_unlock_sock(parent); + return result; +} + +/* ---- Proc fs support ---- */ +#ifdef 
CONFIG_PROC_FS +static void *rfcomm_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock *sk; + struct hlist_node *node; + loff_t l = *pos; + + read_lock_bh(&rfcomm_sk_list.lock); + + sk_for_each(sk, node, &rfcomm_sk_list.head) + if (!l--) + return sk; + return NULL; +} + +static void *rfcomm_seq_next(struct seq_file *seq, void *e, loff_t *pos) +{ + struct sock *sk = e; + (*pos)++; + return sk_next(sk); +} + +static void rfcomm_seq_stop(struct seq_file *seq, void *e) +{ + read_unlock_bh(&rfcomm_sk_list.lock); +} + +static int rfcomm_seq_show(struct seq_file *seq, void *e) +{ + struct sock *sk = e; + seq_printf(seq, "%s %s %d %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), + sk->sk_state, rfcomm_pi(sk)->channel); + return 0; +} + +static struct seq_operations rfcomm_seq_ops = { + .start = rfcomm_seq_start, + .next = rfcomm_seq_next, + .stop = rfcomm_seq_stop, + .show = rfcomm_seq_show +}; + +static int rfcomm_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rfcomm_seq_ops); +} + +static struct file_operations rfcomm_seq_fops = { + .owner = THIS_MODULE, + .open = rfcomm_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init rfcomm_sock_proc_init(void) +{ + struct proc_dir_entry *p = create_proc_entry("sock", S_IRUGO, proc_bt_rfcomm); + if (!p) + return -ENOMEM; + p->proc_fops = &rfcomm_seq_fops; + return 0; +} + +static void __exit rfcomm_sock_proc_cleanup(void) +{ + remove_proc_entry("sock", proc_bt_rfcomm); +} + +#else /* CONFIG_PROC_FS */ + +static int __init rfcomm_sock_proc_init(void) +{ + return 0; +} + +static void __exit rfcomm_sock_proc_cleanup(void) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static struct proto_ops rfcomm_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = rfcomm_sock_release, + .bind = rfcomm_sock_bind, + .connect = rfcomm_sock_connect, + .listen = rfcomm_sock_listen, + .accept = rfcomm_sock_accept, + .getname = rfcomm_sock_getname, + .sendmsg = rfcomm_sock_sendmsg, + .recvmsg = rfcomm_sock_recvmsg, + .shutdown = rfcomm_sock_shutdown, + .setsockopt = rfcomm_sock_setsockopt, + .getsockopt = rfcomm_sock_getsockopt, + .ioctl = rfcomm_sock_ioctl, + .poll = bt_sock_poll, + .socketpair = sock_no_socketpair, + .mmap = sock_no_mmap +}; + +static struct net_proto_family rfcomm_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = rfcomm_sock_create +}; + +int __init rfcomm_init_sockets(void) +{ + int err; + + err = proto_register(&rfcomm_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_RFCOMM, &rfcomm_sock_family_ops); + if (err < 0) + goto error; + + rfcomm_sock_proc_init(); + + BT_INFO("RFCOMM socket layer initialized"); + + return 0; + +error: + BT_ERR("RFCOMM socket layer registration failed"); + proto_unregister(&rfcomm_proto); + return err; +} + +void __exit rfcomm_cleanup_sockets(void) +{ + rfcomm_sock_proc_cleanup(); + + if (bt_sock_unregister(BTPROTO_RFCOMM) < 0) + BT_ERR("RFCOMM socket layer unregistration failed"); + + proto_unregister(&rfcomm_proto); +} diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c new file mode 100644 index 000000000000..6d689200bcf3 --- /dev/null +++ b/net/bluetooth/rfcomm/tty.c @@ -0,0 +1,930 @@ +/* + RFCOMM implementation for Linux Bluetooth stack (BlueZ). 
+ Copyright (C) 2002 Maxim Krasnyansky + Copyright (C) 2002 Marcel Holtmann + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* + * RFCOMM TTY. + * + * $Id: tty.c,v 1.24 2002/10/03 01:54:38 holtmann Exp $ + */ + +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include + +#ifndef CONFIG_BT_RFCOMM_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +#define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */ +#define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */ +#define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */ +#define RFCOMM_TTY_MINOR 0 + +static struct tty_driver *rfcomm_tty_driver; + +struct rfcomm_dev { + struct list_head list; + atomic_t refcnt; + + char name[12]; + int id; + unsigned long flags; + int opened; + int err; + + bdaddr_t src; + bdaddr_t dst; + u8 channel; + + uint modem_status; + + struct rfcomm_dlc *dlc; + struct tty_struct *tty; + wait_queue_head_t wait; + struct tasklet_struct wakeup_task; + + atomic_t wmem_alloc; +}; + +static LIST_HEAD(rfcomm_dev_list); +static DEFINE_RWLOCK(rfcomm_dev_lock); + +static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb); +static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err); +static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig); + +static void rfcomm_tty_wakeup(unsigned long arg); + +/* ---- Device functions ---- */ +static void rfcomm_dev_destruct(struct rfcomm_dev *dev) +{ + struct rfcomm_dlc *dlc = dev->dlc; + + BT_DBG("dev %p dlc %p", dev, dlc); + + rfcomm_dlc_lock(dlc); + /* Detach DLC if it's owned by this dev */ + if (dlc->owner == dev) + dlc->owner = NULL; + rfcomm_dlc_unlock(dlc); + + rfcomm_dlc_put(dlc); + + tty_unregister_device(rfcomm_tty_driver, dev->id); + + /* Refcount should only hit zero when called from rfcomm_dev_del() + which will have taken us off the list. Everything else are + refcounting bugs. */ + BUG_ON(!list_empty(&dev->list)); + + kfree(dev); + + /* It's safe to call module_put() here because socket still + holds reference to this module. */ + module_put(THIS_MODULE); +} + +static inline void rfcomm_dev_hold(struct rfcomm_dev *dev) +{ + atomic_inc(&dev->refcnt); +} + +static inline void rfcomm_dev_put(struct rfcomm_dev *dev) +{ + /* The reason this isn't actually a race, as you no + doubt have a little voice screaming at you in your + head, is that the refcount should never actually + reach zero unless the device has already been taken + off the list, in rfcomm_dev_del(). And if that's not + true, we'll hit the BUG() in rfcomm_dev_destruct() + anyway. 
*/ + if (atomic_dec_and_test(&dev->refcnt)) + rfcomm_dev_destruct(dev); +} + +static struct rfcomm_dev *__rfcomm_dev_get(int id) +{ + struct rfcomm_dev *dev; + struct list_head *p; + + list_for_each(p, &rfcomm_dev_list) { + dev = list_entry(p, struct rfcomm_dev, list); + if (dev->id == id) + return dev; + } + + return NULL; +} + +static inline struct rfcomm_dev *rfcomm_dev_get(int id) +{ + struct rfcomm_dev *dev; + + read_lock(&rfcomm_dev_lock); + + dev = __rfcomm_dev_get(id); + if (dev) + rfcomm_dev_hold(dev); + + read_unlock(&rfcomm_dev_lock); + + return dev; +} + +static int rfcomm_dev_add(struct rfcomm_dev_req *req, struct rfcomm_dlc *dlc) +{ + struct rfcomm_dev *dev; + struct list_head *head = &rfcomm_dev_list, *p; + int err = 0; + + BT_DBG("id %d channel %d", req->dev_id, req->channel); + + dev = kmalloc(sizeof(struct rfcomm_dev), GFP_KERNEL); + if (!dev) + return -ENOMEM; + memset(dev, 0, sizeof(struct rfcomm_dev)); + + write_lock_bh(&rfcomm_dev_lock); + + if (req->dev_id < 0) { + dev->id = 0; + + list_for_each(p, &rfcomm_dev_list) { + if (list_entry(p, struct rfcomm_dev, list)->id != dev->id) + break; + + dev->id++; + head = p; + } + } else { + dev->id = req->dev_id; + + list_for_each(p, &rfcomm_dev_list) { + struct rfcomm_dev *entry = list_entry(p, struct rfcomm_dev, list); + + if (entry->id == dev->id) { + err = -EADDRINUSE; + goto out; + } + + if (entry->id > dev->id - 1) + break; + + head = p; + } + } + + if ((dev->id < 0) || (dev->id > RFCOMM_MAX_DEV - 1)) { + err = -ENFILE; + goto out; + } + + sprintf(dev->name, "rfcomm%d", dev->id); + + list_add(&dev->list, head); + atomic_set(&dev->refcnt, 1); + + bacpy(&dev->src, &req->src); + bacpy(&dev->dst, &req->dst); + dev->channel = req->channel; + + dev->flags = req->flags & + ((1 << RFCOMM_RELEASE_ONHUP) | (1 << RFCOMM_REUSE_DLC)); + + init_waitqueue_head(&dev->wait); + tasklet_init(&dev->wakeup_task, rfcomm_tty_wakeup, (unsigned long) dev); + + rfcomm_dlc_lock(dlc); + dlc->data_ready = rfcomm_dev_data_ready; + dlc->state_change = rfcomm_dev_state_change; + dlc->modem_status = rfcomm_dev_modem_status; + + dlc->owner = dev; + dev->dlc = dlc; + rfcomm_dlc_unlock(dlc); + + /* It's safe to call __module_get() here because socket already + holds reference to this module. 
*/ + __module_get(THIS_MODULE); + +out: + write_unlock_bh(&rfcomm_dev_lock); + + if (err) { + kfree(dev); + return err; + } + + tty_register_device(rfcomm_tty_driver, dev->id, NULL); + + return dev->id; +} + +static void rfcomm_dev_del(struct rfcomm_dev *dev) +{ + BT_DBG("dev %p", dev); + + write_lock_bh(&rfcomm_dev_lock); + list_del_init(&dev->list); + write_unlock_bh(&rfcomm_dev_lock); + + rfcomm_dev_put(dev); +} + +/* ---- Send buffer ---- */ +static inline unsigned int rfcomm_room(struct rfcomm_dlc *dlc) +{ + /* We can't let it be zero, because we don't get a callback + when tx_credits becomes nonzero, hence we'd never wake up */ + return dlc->mtu * (dlc->tx_credits?:1); +} + +static void rfcomm_wfree(struct sk_buff *skb) +{ + struct rfcomm_dev *dev = (void *) skb->sk; + atomic_sub(skb->truesize, &dev->wmem_alloc); + if (test_bit(RFCOMM_TTY_ATTACHED, &dev->flags)) + tasklet_schedule(&dev->wakeup_task); + rfcomm_dev_put(dev); +} + +static inline void rfcomm_set_owner_w(struct sk_buff *skb, struct rfcomm_dev *dev) +{ + rfcomm_dev_hold(dev); + atomic_add(skb->truesize, &dev->wmem_alloc); + skb->sk = (void *) dev; + skb->destructor = rfcomm_wfree; +} + +static struct sk_buff *rfcomm_wmalloc(struct rfcomm_dev *dev, unsigned long size, int priority) +{ + if (atomic_read(&dev->wmem_alloc) < rfcomm_room(dev->dlc)) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + rfcomm_set_owner_w(skb, dev); + return skb; + } + } + return NULL; +} + +/* ---- Device IOCTLs ---- */ + +#define NOCAP_FLAGS ((1 << RFCOMM_REUSE_DLC) | (1 << RFCOMM_RELEASE_ONHUP)) + +static int rfcomm_create_dev(struct sock *sk, void __user *arg) +{ + struct rfcomm_dev_req req; + struct rfcomm_dlc *dlc; + int id; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + BT_DBG("sk %p dev_id %id flags 0x%x", sk, req.dev_id, req.flags); + + if (req.flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) + return -EPERM; + + if (req.flags & (1 << RFCOMM_REUSE_DLC)) { + /* Socket must be connected */ + if (sk->sk_state != BT_CONNECTED) + return -EBADFD; + + dlc = rfcomm_pi(sk)->dlc; + rfcomm_dlc_hold(dlc); + } else { + dlc = rfcomm_dlc_alloc(GFP_KERNEL); + if (!dlc) + return -ENOMEM; + } + + id = rfcomm_dev_add(&req, dlc); + if (id < 0) { + rfcomm_dlc_put(dlc); + return id; + } + + if (req.flags & (1 << RFCOMM_REUSE_DLC)) { + /* DLC is now used by device. 
+ * Socket must be disconnected */ + sk->sk_state = BT_CLOSED; + } + + return id; +} + +static int rfcomm_release_dev(void __user *arg) +{ + struct rfcomm_dev_req req; + struct rfcomm_dev *dev; + + if (copy_from_user(&req, arg, sizeof(req))) + return -EFAULT; + + BT_DBG("dev_id %id flags 0x%x", req.dev_id, req.flags); + + if (!(dev = rfcomm_dev_get(req.dev_id))) + return -ENODEV; + + if (dev->flags != NOCAP_FLAGS && !capable(CAP_NET_ADMIN)) { + rfcomm_dev_put(dev); + return -EPERM; + } + + if (req.flags & (1 << RFCOMM_HANGUP_NOW)) + rfcomm_dlc_close(dev->dlc, 0); + + rfcomm_dev_del(dev); + rfcomm_dev_put(dev); + return 0; +} + +static int rfcomm_get_dev_list(void __user *arg) +{ + struct rfcomm_dev_list_req *dl; + struct rfcomm_dev_info *di; + struct list_head *p; + int n = 0, size, err; + u16 dev_num; + + BT_DBG(""); + + if (get_user(dev_num, (u16 __user *) arg)) + return -EFAULT; + + if (!dev_num || dev_num > (PAGE_SIZE * 4) / sizeof(*di)) + return -EINVAL; + + size = sizeof(*dl) + dev_num * sizeof(*di); + + if (!(dl = kmalloc(size, GFP_KERNEL))) + return -ENOMEM; + + di = dl->dev_info; + + read_lock_bh(&rfcomm_dev_lock); + + list_for_each(p, &rfcomm_dev_list) { + struct rfcomm_dev *dev = list_entry(p, struct rfcomm_dev, list); + (di + n)->id = dev->id; + (di + n)->flags = dev->flags; + (di + n)->state = dev->dlc->state; + (di + n)->channel = dev->channel; + bacpy(&(di + n)->src, &dev->src); + bacpy(&(di + n)->dst, &dev->dst); + if (++n >= dev_num) + break; + } + + read_unlock_bh(&rfcomm_dev_lock); + + dl->dev_num = n; + size = sizeof(*dl) + n * sizeof(*di); + + err = copy_to_user(arg, dl, size); + kfree(dl); + + return err ? -EFAULT : 0; +} + +static int rfcomm_get_dev_info(void __user *arg) +{ + struct rfcomm_dev *dev; + struct rfcomm_dev_info di; + int err = 0; + + BT_DBG(""); + + if (copy_from_user(&di, arg, sizeof(di))) + return -EFAULT; + + if (!(dev = rfcomm_dev_get(di.id))) + return -ENODEV; + + di.flags = dev->flags; + di.channel = dev->channel; + di.state = dev->dlc->state; + bacpy(&di.src, &dev->src); + bacpy(&di.dst, &dev->dst); + + if (copy_to_user(arg, &di, sizeof(di))) + err = -EFAULT; + + rfcomm_dev_put(dev); + return err; +} + +int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg) +{ + BT_DBG("cmd %d arg %p", cmd, arg); + + switch (cmd) { + case RFCOMMCREATEDEV: + return rfcomm_create_dev(sk, arg); + + case RFCOMMRELEASEDEV: + return rfcomm_release_dev(arg); + + case RFCOMMGETDEVLIST: + return rfcomm_get_dev_list(arg); + + case RFCOMMGETDEVINFO: + return rfcomm_get_dev_info(arg); + } + + return -EINVAL; +} + +/* ---- DLC callbacks ---- */ +static void rfcomm_dev_data_ready(struct rfcomm_dlc *dlc, struct sk_buff *skb) +{ + struct rfcomm_dev *dev = dlc->owner; + struct tty_struct *tty; + + if (!dev || !(tty = dev->tty)) { + kfree_skb(skb); + return; + } + + BT_DBG("dlc %p tty %p len %d", dlc, tty, skb->len); + + if (test_bit(TTY_DONT_FLIP, &tty->flags)) { + register int i; + for (i = 0; i < skb->len; i++) { + if (tty->flip.count >= TTY_FLIPBUF_SIZE) + tty_flip_buffer_push(tty); + + tty_insert_flip_char(tty, skb->data[i], 0); + } + tty_flip_buffer_push(tty); + } else + tty->ldisc.receive_buf(tty, skb->data, NULL, skb->len); + + kfree_skb(skb); +} + +static void rfcomm_dev_state_change(struct rfcomm_dlc *dlc, int err) +{ + struct rfcomm_dev *dev = dlc->owner; + if (!dev) + return; + + BT_DBG("dlc %p dev %p err %d", dlc, dev, err); + + dev->err = err; + wake_up_interruptible(&dev->wait); + + if (dlc->state == BT_CLOSED) { + if (!dev->tty) { + if 
(test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) { + rfcomm_dev_hold(dev); + rfcomm_dev_del(dev); + + /* We have to drop DLC lock here, otherwise + rfcomm_dev_put() will deadlock if it's + the last reference. */ + rfcomm_dlc_unlock(dlc); + rfcomm_dev_put(dev); + rfcomm_dlc_lock(dlc); + } + } else + tty_hangup(dev->tty); + } +} + +static void rfcomm_dev_modem_status(struct rfcomm_dlc *dlc, u8 v24_sig) +{ + struct rfcomm_dev *dev = dlc->owner; + if (!dev) + return; + + BT_DBG("dlc %p dev %p v24_sig 0x%02x", dlc, dev, v24_sig); + + dev->modem_status = + ((v24_sig & RFCOMM_V24_RTC) ? (TIOCM_DSR | TIOCM_DTR) : 0) | + ((v24_sig & RFCOMM_V24_RTR) ? (TIOCM_RTS | TIOCM_CTS) : 0) | + ((v24_sig & RFCOMM_V24_IC) ? TIOCM_RI : 0) | + ((v24_sig & RFCOMM_V24_DV) ? TIOCM_CD : 0); +} + +/* ---- TTY functions ---- */ +static void rfcomm_tty_wakeup(unsigned long arg) +{ + struct rfcomm_dev *dev = (void *) arg; + struct tty_struct *tty = dev->tty; + if (!tty) + return; + + BT_DBG("dev %p tty %p", dev, tty); + + if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && tty->ldisc.write_wakeup) + (tty->ldisc.write_wakeup)(tty); + + wake_up_interruptible(&tty->write_wait); +#ifdef SERIAL_HAVE_POLL_WAIT + wake_up_interruptible(&tty->poll_wait); +#endif +} + +static int rfcomm_tty_open(struct tty_struct *tty, struct file *filp) +{ + DECLARE_WAITQUEUE(wait, current); + struct rfcomm_dev *dev; + struct rfcomm_dlc *dlc; + int err, id; + + id = tty->index; + + BT_DBG("tty %p id %d", tty, id); + + /* We don't leak this refcount. For reasons which are not entirely + clear, the TTY layer will call our ->close() method even if the + open fails. We decrease the refcount there, and decreasing it + here too would cause breakage. */ + dev = rfcomm_dev_get(id); + if (!dev) + return -ENODEV; + + BT_DBG("dev %p dst %s channel %d opened %d", dev, batostr(&dev->dst), dev->channel, dev->opened); + + if (dev->opened++ != 0) + return 0; + + dlc = dev->dlc; + + /* Attach TTY and open DLC */ + + rfcomm_dlc_lock(dlc); + tty->driver_data = dev; + dev->tty = tty; + rfcomm_dlc_unlock(dlc); + set_bit(RFCOMM_TTY_ATTACHED, &dev->flags); + + err = rfcomm_dlc_open(dlc, &dev->src, &dev->dst, dev->channel); + if (err < 0) + return err; + + /* Wait for DLC to connect */ + add_wait_queue(&dev->wait, &wait); + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + + if (dlc->state == BT_CLOSED) { + err = -dev->err; + break; + } + + if (dlc->state == BT_CONNECTED) + break; + + if (signal_pending(current)) { + err = -EINTR; + break; + } + + schedule(); + } + set_current_state(TASK_RUNNING); + remove_wait_queue(&dev->wait, &wait); + + return err; +} + +static void rfcomm_tty_close(struct tty_struct *tty, struct file *filp) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + if (!dev) + return; + + BT_DBG("tty %p dev %p dlc %p opened %d", tty, dev, dev->dlc, dev->opened); + + if (--dev->opened == 0) { + /* Close DLC and detach TTY */ + rfcomm_dlc_close(dev->dlc, 0); + + clear_bit(RFCOMM_TTY_ATTACHED, &dev->flags); + tasklet_kill(&dev->wakeup_task); + + rfcomm_dlc_lock(dev->dlc); + tty->driver_data = NULL; + dev->tty = NULL; + rfcomm_dlc_unlock(dev->dlc); + } + + rfcomm_dev_put(dev); +} + +static int rfcomm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + struct sk_buff *skb; + int err = 0, sent = 0, size; + + BT_DBG("tty %p count %d", tty, count); + + while (count) { + size = min_t(uint, count, dlc->mtu); + + skb =
rfcomm_wmalloc(dev, size + RFCOMM_SKB_RESERVE, GFP_ATOMIC); + + if (!skb) + break; + + skb_reserve(skb, RFCOMM_SKB_HEAD_RESERVE); + + memcpy(skb_put(skb, size), buf + sent, size); + + if ((err = rfcomm_dlc_send(dlc, skb)) < 0) { + kfree_skb(skb); + break; + } + + sent += size; + count -= size; + } + + return sent ? sent : err; +} + +static int rfcomm_tty_write_room(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + int room; + + BT_DBG("tty %p", tty); + + room = rfcomm_room(dev->dlc) - atomic_read(&dev->wmem_alloc); + if (room < 0) + room = 0; + return room; +} + +static int rfcomm_tty_ioctl(struct tty_struct *tty, struct file *filp, unsigned int cmd, unsigned long arg) +{ + BT_DBG("tty %p cmd 0x%02x", tty, cmd); + + switch (cmd) { + case TCGETS: + BT_DBG("TCGETS is not supported"); + return -ENOIOCTLCMD; + + case TCSETS: + BT_DBG("TCSETS is not supported"); + return -ENOIOCTLCMD; + + case TIOCMIWAIT: + BT_DBG("TIOCMIWAIT"); + break; + + case TIOCGICOUNT: + BT_DBG("TIOCGICOUNT"); + break; + + case TIOCGSERIAL: + BT_ERR("TIOCGSERIAL is not supported"); + return -ENOIOCTLCMD; + + case TIOCSSERIAL: + BT_ERR("TIOCSSERIAL is not supported"); + return -ENOIOCTLCMD; + + case TIOCSERGSTRUCT: + BT_ERR("TIOCSERGSTRUCT is not supported"); + return -ENOIOCTLCMD; + + case TIOCSERGETLSR: + BT_ERR("TIOCSERGETLSR is not supported"); + return -ENOIOCTLCMD; + + case TIOCSERCONFIG: + BT_ERR("TIOCSERCONFIG is not supported"); + return -ENOIOCTLCMD; + + default: + return -ENOIOCTLCMD; /* ioctls which we must ignore */ + + } + + return -ENOIOCTLCMD; +} + +#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) + +static void rfcomm_tty_set_termios(struct tty_struct *tty, struct termios *old) +{ + BT_DBG("tty %p", tty); + + if ((tty->termios->c_cflag == old->c_cflag) && + (RELEVANT_IFLAG(tty->termios->c_iflag) == RELEVANT_IFLAG(old->c_iflag))) + return; + + /* handle turning off CRTSCTS */ + if ((old->c_cflag & CRTSCTS) && !(tty->termios->c_cflag & CRTSCTS)) { + BT_DBG("turning off CRTSCTS"); + } +} + +static void rfcomm_tty_throttle(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + rfcomm_dlc_throttle(dev->dlc); +} + +static void rfcomm_tty_unthrottle(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + rfcomm_dlc_unthrottle(dev->dlc); +} + +static int rfcomm_tty_chars_in_buffer(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + + BT_DBG("tty %p dev %p", tty, dev); + + if (skb_queue_len(&dlc->tx_queue)) + return dlc->mtu; + + return 0; +} + +static void rfcomm_tty_flush_buffer(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + if (!dev) + return; + + BT_DBG("tty %p dev %p", tty, dev); + + skb_queue_purge(&dev->dlc->tx_queue); + + if (test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags) && tty->ldisc.write_wakeup) + tty->ldisc.write_wakeup(tty); +} + +static void rfcomm_tty_send_xchar(struct tty_struct *tty, char ch) +{ + BT_DBG("tty %p ch %c", tty, ch); +} + +static void rfcomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) +{ + BT_DBG("tty %p timeout %d", tty, timeout); +} + +static void rfcomm_tty_hangup(struct tty_struct *tty) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + if (!dev) + return; + + BT_DBG("tty %p dev %p", tty, 
dev); + + rfcomm_tty_flush_buffer(tty); + + if (test_bit(RFCOMM_RELEASE_ONHUP, &dev->flags)) + rfcomm_dev_del(dev); +} + +static int rfcomm_tty_read_proc(char *buf, char **start, off_t offset, int len, int *eof, void *unused) +{ + return 0; +} + +static int rfcomm_tty_tiocmget(struct tty_struct *tty, struct file *filp) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + + BT_DBG("tty %p dev %p", tty, dev); + + return dev->modem_status; +} + +static int rfcomm_tty_tiocmset(struct tty_struct *tty, struct file *filp, unsigned int set, unsigned int clear) +{ + struct rfcomm_dev *dev = (struct rfcomm_dev *) tty->driver_data; + struct rfcomm_dlc *dlc = dev->dlc; + u8 v24_sig; + + BT_DBG("tty %p dev %p set 0x%02x clear 0x%02x", tty, dev, set, clear); + + rfcomm_dlc_get_modem_status(dlc, &v24_sig); + + if (set & TIOCM_DSR || set & TIOCM_DTR) + v24_sig |= RFCOMM_V24_RTC; + if (set & TIOCM_RTS || set & TIOCM_CTS) + v24_sig |= RFCOMM_V24_RTR; + if (set & TIOCM_RI) + v24_sig |= RFCOMM_V24_IC; + if (set & TIOCM_CD) + v24_sig |= RFCOMM_V24_DV; + + if (clear & TIOCM_DSR || clear & TIOCM_DTR) + v24_sig &= ~RFCOMM_V24_RTC; + if (clear & TIOCM_RTS || clear & TIOCM_CTS) + v24_sig &= ~RFCOMM_V24_RTR; + if (clear & TIOCM_RI) + v24_sig &= ~RFCOMM_V24_IC; + if (clear & TIOCM_CD) + v24_sig &= ~RFCOMM_V24_DV; + + rfcomm_dlc_set_modem_status(dlc, v24_sig); + + return 0; +} + +/* ---- TTY structure ---- */ + +static struct tty_operations rfcomm_ops = { + .open = rfcomm_tty_open, + .close = rfcomm_tty_close, + .write = rfcomm_tty_write, + .write_room = rfcomm_tty_write_room, + .chars_in_buffer = rfcomm_tty_chars_in_buffer, + .flush_buffer = rfcomm_tty_flush_buffer, + .ioctl = rfcomm_tty_ioctl, + .throttle = rfcomm_tty_throttle, + .unthrottle = rfcomm_tty_unthrottle, + .set_termios = rfcomm_tty_set_termios, + .send_xchar = rfcomm_tty_send_xchar, + .hangup = rfcomm_tty_hangup, + .wait_until_sent = rfcomm_tty_wait_until_sent, + .read_proc = rfcomm_tty_read_proc, + .tiocmget = rfcomm_tty_tiocmget, + .tiocmset = rfcomm_tty_tiocmset, +}; + +int rfcomm_init_ttys(void) +{ + rfcomm_tty_driver = alloc_tty_driver(RFCOMM_TTY_PORTS); + if (!rfcomm_tty_driver) + return -1; + + rfcomm_tty_driver->owner = THIS_MODULE; + rfcomm_tty_driver->driver_name = "rfcomm"; + rfcomm_tty_driver->devfs_name = "bluetooth/rfcomm/"; + rfcomm_tty_driver->name = "rfcomm"; + rfcomm_tty_driver->major = RFCOMM_TTY_MAJOR; + rfcomm_tty_driver->minor_start = RFCOMM_TTY_MINOR; + rfcomm_tty_driver->type = TTY_DRIVER_TYPE_SERIAL; + rfcomm_tty_driver->subtype = SERIAL_TYPE_NORMAL; + rfcomm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_NO_DEVFS; + rfcomm_tty_driver->init_termios = tty_std_termios; + rfcomm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; + tty_set_operations(rfcomm_tty_driver, &rfcomm_ops); + + if (tty_register_driver(rfcomm_tty_driver)) { + BT_ERR("Can't register RFCOMM TTY driver"); + put_tty_driver(rfcomm_tty_driver); + return -1; + } + + BT_INFO("RFCOMM TTY layer initialized"); + + return 0; +} + +void rfcomm_cleanup_ttys(void) +{ + tty_unregister_driver(rfcomm_tty_driver); + put_tty_driver(rfcomm_tty_driver); +} diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c new file mode 100644 index 000000000000..3e750ef09e60 --- /dev/null +++ b/net/bluetooth/sco.c @@ -0,0 +1,1071 @@ +/* + BlueZ - Bluetooth protocol stack for Linux + Copyright (C) 2000-2001 Qualcomm Incorporated + + Written 2000,2001 by Maxim Krasnyansky + + This program is free software; you can redistribute it and/or modify + it 
under the terms of the GNU General Public License version 2 as + published by the Free Software Foundation; + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + + ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + SOFTWARE IS DISCLAIMED. +*/ + +/* Bluetooth SCO sockets. */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#ifndef CONFIG_BT_SCO_DEBUG +#undef BT_DBG +#define BT_DBG(D...) +#endif + +#define VERSION "0.4" + +static struct proto_ops sco_sock_ops; + +static struct bt_sock_list sco_sk_list = { + .lock = RW_LOCK_UNLOCKED +}; + +static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent); +static void sco_chan_del(struct sock *sk, int err); + +static int sco_conn_del(struct hci_conn *conn, int err); + +static void sco_sock_close(struct sock *sk); +static void sco_sock_kill(struct sock *sk); + +/* ---- SCO timers ---- */ +static void sco_sock_timeout(unsigned long arg) +{ + struct sock *sk = (struct sock *) arg; + + BT_DBG("sock %p state %d", sk, sk->sk_state); + + bh_lock_sock(sk); + sk->sk_err = ETIMEDOUT; + sk->sk_state_change(sk); + bh_unlock_sock(sk); + + sco_sock_kill(sk); + sock_put(sk); +} + +static void sco_sock_set_timer(struct sock *sk, long timeout) +{ + BT_DBG("sock %p state %d timeout %ld", sk, sk->sk_state, timeout); + sk_reset_timer(sk, &sk->sk_timer, jiffies + timeout); +} + +static void sco_sock_clear_timer(struct sock *sk) +{ + BT_DBG("sock %p state %d", sk, sk->sk_state); + sk_stop_timer(sk, &sk->sk_timer); +} + +static void sco_sock_init_timer(struct sock *sk) +{ + init_timer(&sk->sk_timer); + sk->sk_timer.function = sco_sock_timeout; + sk->sk_timer.data = (unsigned long)sk; +} + +/* ---- SCO connections ---- */ +static struct sco_conn *sco_conn_add(struct hci_conn *hcon, __u8 status) +{ + struct hci_dev *hdev = hcon->hdev; + struct sco_conn *conn; + + if ((conn = hcon->sco_data)) + return conn; + + if (status) + return conn; + + if (!(conn = kmalloc(sizeof(struct sco_conn), GFP_ATOMIC))) + return NULL; + memset(conn, 0, sizeof(struct sco_conn)); + + spin_lock_init(&conn->lock); + + hcon->sco_data = conn; + conn->hcon = hcon; + + conn->src = &hdev->bdaddr; + conn->dst = &hcon->dst; + + if (hdev->sco_mtu > 0) + conn->mtu = hdev->sco_mtu; + else + conn->mtu = 60; + + BT_DBG("hcon %p conn %p", hcon, conn); + return conn; +} + +static inline struct sock *sco_chan_get(struct sco_conn *conn) +{ + struct sock *sk = NULL; + sco_conn_lock(conn); + sk = conn->sk; + sco_conn_unlock(conn); + return sk; +} + +static int sco_conn_del(struct hci_conn *hcon, int err) +{ + struct sco_conn *conn; + struct sock *sk; + + if (!(conn = hcon->sco_data)) + return 0; + + BT_DBG("hcon %p conn %p, err %d", hcon, conn, err); + + /* Kill socket */ + if ((sk = 
sco_chan_get(conn))) { + bh_lock_sock(sk); + sco_sock_clear_timer(sk); + sco_chan_del(sk, err); + bh_unlock_sock(sk); + sco_sock_kill(sk); + } + + hcon->sco_data = NULL; + kfree(conn); + return 0; +} + +static inline int sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) +{ + int err = 0; + + sco_conn_lock(conn); + if (conn->sk) { + err = -EBUSY; + } else { + __sco_chan_add(conn, sk, parent); + } + sco_conn_unlock(conn); + return err; +} + +static int sco_connect(struct sock *sk) +{ + bdaddr_t *src = &bt_sk(sk)->src; + bdaddr_t *dst = &bt_sk(sk)->dst; + struct sco_conn *conn; + struct hci_conn *hcon; + struct hci_dev *hdev; + int err = 0; + + BT_DBG("%s -> %s", batostr(src), batostr(dst)); + + if (!(hdev = hci_get_route(dst, src))) + return -EHOSTUNREACH; + + hci_dev_lock_bh(hdev); + + err = -ENOMEM; + + hcon = hci_connect(hdev, SCO_LINK, dst); + if (!hcon) + goto done; + + conn = sco_conn_add(hcon, 0); + if (!conn) { + hci_conn_put(hcon); + goto done; + } + + /* Update source addr of the socket */ + bacpy(src, conn->src); + + err = sco_chan_add(conn, sk, NULL); + if (err) + goto done; + + if (hcon->state == BT_CONNECTED) { + sco_sock_clear_timer(sk); + sk->sk_state = BT_CONNECTED; + } else { + sk->sk_state = BT_CONNECT; + sco_sock_set_timer(sk, sk->sk_sndtimeo); + } +done: + hci_dev_unlock_bh(hdev); + hci_dev_put(hdev); + return err; +} + +static inline int sco_send_frame(struct sock *sk, struct msghdr *msg, int len) +{ + struct sco_conn *conn = sco_pi(sk)->conn; + struct sk_buff *skb; + int err, count; + + /* Check outgoing MTU */ + if (len > conn->mtu) + return -EINVAL; + + BT_DBG("sk %p len %d", sk, len); + + count = min_t(unsigned int, conn->mtu, len); + if (!(skb = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err))) + return err; + + if (memcpy_fromiovec(skb_put(skb, count), msg->msg_iov, count)) { + err = -EFAULT; + goto fail; + } + + if ((err = hci_send_sco(conn->hcon, skb)) < 0) + goto fail; + + return count; + +fail: + kfree_skb(skb); + return err; +} + +static inline void sco_recv_frame(struct sco_conn *conn, struct sk_buff *skb) +{ + struct sock *sk = sco_chan_get(conn); + + if (!sk) + goto drop; + + BT_DBG("sk %p len %d", sk, skb->len); + + if (sk->sk_state != BT_CONNECTED) + goto drop; + + if (!sock_queue_rcv_skb(sk, skb)) + return; + +drop: + kfree_skb(skb); + return; +} + +/* -------- Socket interface ---------- */ +static struct sock *__sco_get_sock_by_addr(bdaddr_t *ba) +{ + struct sock *sk; + struct hlist_node *node; + + sk_for_each(sk, node, &sco_sk_list.head) + if (!bacmp(&bt_sk(sk)->src, ba)) + goto found; + sk = NULL; +found: + return sk; +} + +/* Find socket listening on source bdaddr. + * Returns closest match. + */ +static struct sock *sco_get_sock_listen(bdaddr_t *src) +{ + struct sock *sk = NULL, *sk1 = NULL; + struct hlist_node *node; + + read_lock(&sco_sk_list.lock); + + sk_for_each(sk, node, &sco_sk_list.head) { + if (sk->sk_state != BT_LISTEN) + continue; + + /* Exact match. */ + if (!bacmp(&bt_sk(sk)->src, src)) + break; + + /* Closest match */ + if (!bacmp(&bt_sk(sk)->src, BDADDR_ANY)) + sk1 = sk; + } + + read_unlock(&sco_sk_list.lock); + + return node ? 
sk : sk1; +} + +static void sco_sock_destruct(struct sock *sk) +{ + BT_DBG("sk %p", sk); + + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); +} + +static void sco_sock_cleanup_listen(struct sock *parent) +{ + struct sock *sk; + + BT_DBG("parent %p", parent); + + /* Close not yet accepted channels */ + while ((sk = bt_accept_dequeue(parent, NULL))) { + sco_sock_close(sk); + sco_sock_kill(sk); + } + + parent->sk_state = BT_CLOSED; + sock_set_flag(parent, SOCK_ZAPPED); +} + +/* Kill socket (only if zapped and orphan) + * Must be called on unlocked socket. + */ +static void sco_sock_kill(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_ZAPPED) || sk->sk_socket) + return; + + BT_DBG("sk %p state %d", sk, sk->sk_state); + + /* Kill poor orphan */ + bt_sock_unlink(&sco_sk_list, sk); + sock_set_flag(sk, SOCK_DEAD); + sock_put(sk); +} + +/* Close socket. + * Must be called on unlocked socket. + */ +static void sco_sock_close(struct sock *sk) +{ + struct sco_conn *conn; + + sco_sock_clear_timer(sk); + + lock_sock(sk); + + conn = sco_pi(sk)->conn; + + BT_DBG("sk %p state %d conn %p socket %p", sk, sk->sk_state, conn, sk->sk_socket); + + switch (sk->sk_state) { + case BT_LISTEN: + sco_sock_cleanup_listen(sk); + break; + + case BT_CONNECTED: + case BT_CONFIG: + case BT_CONNECT: + case BT_DISCONN: + sco_chan_del(sk, ECONNRESET); + break; + + default: + sock_set_flag(sk, SOCK_ZAPPED); + break; + }; + + release_sock(sk); + + sco_sock_kill(sk); +} + +static void sco_sock_init(struct sock *sk, struct sock *parent) +{ + BT_DBG("sk %p", sk); + + if (parent) + sk->sk_type = parent->sk_type; +} + +static struct proto sco_proto = { + .name = "SCO", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sco_pinfo) +}; + +static struct sock *sco_sock_alloc(struct socket *sock, int proto, int prio) +{ + struct sock *sk; + + sk = sk_alloc(PF_BLUETOOTH, prio, &sco_proto, 1); + if (!sk) + return NULL; + + sock_init_data(sock, sk); + INIT_LIST_HEAD(&bt_sk(sk)->accept_q); + + sk->sk_destruct = sco_sock_destruct; + sk->sk_sndtimeo = SCO_CONN_TIMEOUT; + + sock_reset_flag(sk, SOCK_ZAPPED); + + sk->sk_protocol = proto; + sk->sk_state = BT_OPEN; + + sco_sock_init_timer(sk); + + bt_sock_link(&sco_sk_list, sk); + return sk; +} + +static int sco_sock_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + BT_DBG("sock %p", sock); + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_SEQPACKET) + return -ESOCKTNOSUPPORT; + + sock->ops = &sco_sock_ops; + + if (!(sk = sco_sock_alloc(sock, protocol, GFP_KERNEL))) + return -ENOMEM; + + sco_sock_init(sk, NULL); + return 0; +} + +static int sco_sock_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; + struct sock *sk = sock->sk; + bdaddr_t *src = &sa->sco_bdaddr; + int err = 0; + + BT_DBG("sk %p %s", sk, batostr(&sa->sco_bdaddr)); + + if (!addr || addr->sa_family != AF_BLUETOOTH) + return -EINVAL; + + lock_sock(sk); + + if (sk->sk_state != BT_OPEN) { + err = -EBADFD; + goto done; + } + + write_lock_bh(&sco_sk_list.lock); + + if (bacmp(src, BDADDR_ANY) && __sco_get_sock_by_addr(src)) { + err = -EADDRINUSE; + } else { + /* Save source address */ + bacpy(&bt_sk(sk)->src, &sa->sco_bdaddr); + sk->sk_state = BT_BOUND; + } + + write_unlock_bh(&sco_sk_list.lock); + +done: + release_sock(sk); + return err; +} + +static int sco_sock_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) +{ + struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; + struct sock 
*sk = sock->sk; + int err = 0; + + + BT_DBG("sk %p", sk); + + if (addr->sa_family != AF_BLUETOOTH || alen < sizeof(struct sockaddr_sco)) + return -EINVAL; + + if (sk->sk_state != BT_OPEN && sk->sk_state != BT_BOUND) + return -EBADFD; + + if (sk->sk_type != SOCK_SEQPACKET) + return -EINVAL; + + lock_sock(sk); + + /* Set destination address and psm */ + bacpy(&bt_sk(sk)->dst, &sa->sco_bdaddr); + + if ((err = sco_connect(sk))) + goto done; + + err = bt_sock_wait_state(sk, BT_CONNECTED, + sock_sndtimeo(sk, flags & O_NONBLOCK)); + +done: + release_sock(sk); + return err; +} + +static int sco_sock_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p backlog %d", sk, backlog); + + lock_sock(sk); + + if (sk->sk_state != BT_BOUND || sock->type != SOCK_SEQPACKET) { + err = -EBADFD; + goto done; + } + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = BT_LISTEN; + +done: + release_sock(sk); + return err; +} + +static int sco_sock_accept(struct socket *sock, struct socket *newsock, int flags) +{ + DECLARE_WAITQUEUE(wait, current); + struct sock *sk = sock->sk, *ch; + long timeo; + int err = 0; + + lock_sock(sk); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + goto done; + } + + timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + BT_DBG("sk %p timeo %ld", sk, timeo); + + /* Wait for an incoming connection. (wake-one). */ + add_wait_queue_exclusive(sk->sk_sleep, &wait); + while (!(ch = bt_accept_dequeue(sk, newsock))) { + set_current_state(TASK_INTERRUPTIBLE); + if (!timeo) { + err = -EAGAIN; + break; + } + + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + + if (sk->sk_state != BT_LISTEN) { + err = -EBADFD; + break; + } + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + break; + } + } + set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + + if (err) + goto done; + + newsock->state = SS_CONNECTED; + + BT_DBG("new socket %p", ch); + +done: + release_sock(sk); + return err; +} + +static int sco_sock_getname(struct socket *sock, struct sockaddr *addr, int *len, int peer) +{ + struct sockaddr_sco *sa = (struct sockaddr_sco *) addr; + struct sock *sk = sock->sk; + + BT_DBG("sock %p, sk %p", sock, sk); + + addr->sa_family = AF_BLUETOOTH; + *len = sizeof(struct sockaddr_sco); + + if (peer) + bacpy(&sa->sco_bdaddr, &bt_sk(sk)->dst); + else + bacpy(&sa->sco_bdaddr, &bt_sk(sk)->src); + + return 0; +} + +static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (sk->sk_err) + return sock_error(sk); + + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + lock_sock(sk); + + if (sk->sk_state == BT_CONNECTED) + err = sco_send_frame(sk, msg, len); + else + err = -ENOTCONN; + + release_sock(sk); + return err; +} + +static int sco_sock_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sk %p", sk); + + lock_sock(sk); + + switch (optname) { + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int sco_sock_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct sco_options opts; + struct sco_conninfo cinfo; + int len, err = 0; + + BT_DBG("sk %p", sk); + + if (get_user(len, optlen)) + return -EFAULT; + + 
lock_sock(sk); + + switch (optname) { + case SCO_OPTIONS: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + opts.mtu = sco_pi(sk)->conn->mtu; + + BT_DBG("mtu %d", opts.mtu); + + len = min_t(unsigned int, len, sizeof(opts)); + if (copy_to_user(optval, (char *)&opts, len)) + err = -EFAULT; + + break; + + case SCO_CONNINFO: + if (sk->sk_state != BT_CONNECTED) { + err = -ENOTCONN; + break; + } + + cinfo.hci_handle = sco_pi(sk)->conn->hcon->handle; + memcpy(cinfo.dev_class, sco_pi(sk)->conn->hcon->dev_class, 3); + + len = min_t(unsigned int, len, sizeof(cinfo)); + if (copy_to_user(optval, (char *)&cinfo, len)) + err = -EFAULT; + + break; + + default: + err = -ENOPROTOOPT; + break; + } + + release_sock(sk); + return err; +} + +static int sco_sock_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + int err = 0; + + BT_DBG("sock %p, sk %p", sock, sk); + + if (!sk) + return 0; + + sco_sock_close(sk); + + if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime) { + lock_sock(sk); + err = bt_sock_wait_state(sk, BT_CLOSED, sk->sk_lingertime); + release_sock(sk); + } + + sock_orphan(sk); + sco_sock_kill(sk); + return err; +} + +static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent) +{ + BT_DBG("conn %p", conn); + + sco_pi(sk)->conn = conn; + conn->sk = sk; + + if (parent) + bt_accept_enqueue(parent, sk); +} + +/* Delete channel. + * Must be called on the locked socket. */ +static void sco_chan_del(struct sock *sk, int err) +{ + struct sco_conn *conn; + + conn = sco_pi(sk)->conn; + + BT_DBG("sk %p, conn %p, err %d", sk, conn, err); + + if (conn) { + sco_conn_lock(conn); + conn->sk = NULL; + sco_pi(sk)->conn = NULL; + sco_conn_unlock(conn); + hci_conn_put(conn->hcon); + } + + sk->sk_state = BT_CLOSED; + sk->sk_err = err; + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_ZAPPED); +} + +static void sco_conn_ready(struct sco_conn *conn) +{ + struct sock *parent, *sk; + + BT_DBG("conn %p", conn); + + sco_conn_lock(conn); + + if ((sk = conn->sk)) { + sco_sock_clear_timer(sk); + bh_lock_sock(sk); + sk->sk_state = BT_CONNECTED; + sk->sk_state_change(sk); + bh_unlock_sock(sk); + } else { + parent = sco_get_sock_listen(conn->src); + if (!parent) + goto done; + + bh_lock_sock(parent); + + sk = sco_sock_alloc(NULL, BTPROTO_SCO, GFP_ATOMIC); + if (!sk) { + bh_unlock_sock(parent); + goto done; + } + + sco_sock_init(sk, parent); + + bacpy(&bt_sk(sk)->src, conn->src); + bacpy(&bt_sk(sk)->dst, conn->dst); + + hci_conn_hold(conn->hcon); + __sco_chan_add(conn, sk, parent); + + sk->sk_state = BT_CONNECTED; + + /* Wake up parent */ + parent->sk_data_ready(parent, 1); + + bh_unlock_sock(parent); + } + +done: + sco_conn_unlock(conn); +} + +/* ----- SCO interface with lower layer (HCI) ----- */ +static int sco_connect_ind(struct hci_dev *hdev, bdaddr_t *bdaddr, __u8 type) +{ + BT_DBG("hdev %s, bdaddr %s", hdev->name, batostr(bdaddr)); + + /* Always accept connection */ + return HCI_LM_ACCEPT; +} + +static int sco_connect_cfm(struct hci_conn *hcon, __u8 status) +{ + BT_DBG("hcon %p bdaddr %s status %d", hcon, batostr(&hcon->dst), status); + + if (hcon->type != SCO_LINK) + return 0; + + if (!status) { + struct sco_conn *conn; + + conn = sco_conn_add(hcon, status); + if (conn) + sco_conn_ready(conn); + } else + sco_conn_del(hcon, bt_err(status)); + + return 0; +} + +static int sco_disconn_ind(struct hci_conn *hcon, __u8 reason) +{ + BT_DBG("hcon %p reason %d", hcon, reason); + + if (hcon->type != SCO_LINK) + return 0; + + sco_conn_del(hcon, bt_err(reason)); + 
return 0; +} + +static int sco_recv_scodata(struct hci_conn *hcon, struct sk_buff *skb) +{ + struct sco_conn *conn = hcon->sco_data; + + if (!conn) + goto drop; + + BT_DBG("conn %p len %d", conn, skb->len); + + if (skb->len) { + sco_recv_frame(conn, skb); + return 0; + } + +drop: + kfree_skb(skb); + return 0; +} + +/* ---- Proc fs support ---- */ +#ifdef CONFIG_PROC_FS +static void *sco_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct sock *sk; + struct hlist_node *node; + loff_t l = *pos; + + read_lock_bh(&sco_sk_list.lock); + + sk_for_each(sk, node, &sco_sk_list.head) + if (!l--) + goto found; + sk = NULL; +found: + return sk; +} + +static void *sco_seq_next(struct seq_file *seq, void *e, loff_t *pos) +{ + struct sock *sk = e; + (*pos)++; + return sk_next(sk); +} + +static void sco_seq_stop(struct seq_file *seq, void *e) +{ + read_unlock_bh(&sco_sk_list.lock); +} + +static int sco_seq_show(struct seq_file *seq, void *e) +{ + struct sock *sk = e; + seq_printf(seq, "%s %s %d\n", + batostr(&bt_sk(sk)->src), batostr(&bt_sk(sk)->dst), sk->sk_state); + return 0; +} + +static struct seq_operations sco_seq_ops = { + .start = sco_seq_start, + .next = sco_seq_next, + .stop = sco_seq_stop, + .show = sco_seq_show +}; + +static int sco_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &sco_seq_ops); +} + +static struct file_operations sco_seq_fops = { + .owner = THIS_MODULE, + .open = sco_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init sco_proc_init(void) +{ + struct proc_dir_entry *p = create_proc_entry("sco", S_IRUGO, proc_bt); + if (!p) + return -ENOMEM; + p->owner = THIS_MODULE; + p->proc_fops = &sco_seq_fops; + return 0; +} + +static void __exit sco_proc_cleanup(void) +{ + remove_proc_entry("sco", proc_bt); +} + +#else /* CONFIG_PROC_FS */ + +static int __init sco_proc_init(void) +{ + return 0; +} + +static void __exit sco_proc_cleanup(void) +{ + return; +} +#endif /* CONFIG_PROC_FS */ + +static struct proto_ops sco_sock_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .release = sco_sock_release, + .bind = sco_sock_bind, + .connect = sco_sock_connect, + .listen = sco_sock_listen, + .accept = sco_sock_accept, + .getname = sco_sock_getname, + .sendmsg = sco_sock_sendmsg, + .recvmsg = bt_sock_recvmsg, + .poll = bt_sock_poll, + .ioctl = sock_no_ioctl, + .mmap = sock_no_mmap, + .socketpair = sock_no_socketpair, + .shutdown = sock_no_shutdown, + .setsockopt = sco_sock_setsockopt, + .getsockopt = sco_sock_getsockopt +}; + +static struct net_proto_family sco_sock_family_ops = { + .family = PF_BLUETOOTH, + .owner = THIS_MODULE, + .create = sco_sock_create, +}; + +static struct hci_proto sco_hci_proto = { + .name = "SCO", + .id = HCI_PROTO_SCO, + .connect_ind = sco_connect_ind, + .connect_cfm = sco_connect_cfm, + .disconn_ind = sco_disconn_ind, + .recv_scodata = sco_recv_scodata +}; + +static int __init sco_init(void) +{ + int err; + + err = proto_register(&sco_proto, 0); + if (err < 0) + return err; + + err = bt_sock_register(BTPROTO_SCO, &sco_sock_family_ops); + if (err < 0) { + BT_ERR("SCO socket registration failed"); + goto error; + } + + err = hci_register_proto(&sco_hci_proto); + if (err < 0) { + BT_ERR("SCO protocol registration failed"); + bt_sock_unregister(BTPROTO_SCO); + goto error; + } + + sco_proc_init(); + + BT_INFO("SCO (Voice Link) ver %s", VERSION); + BT_INFO("SCO socket layer initialized"); + + return 0; + +error: + proto_unregister(&sco_proto); + return err; +} + +static void __exit 
sco_exit(void) +{ + sco_proc_cleanup(); + + if (bt_sock_unregister(BTPROTO_SCO) < 0) + BT_ERR("SCO socket unregistration failed"); + + if (hci_unregister_proto(&sco_hci_proto) < 0) + BT_ERR("SCO protocol unregistration failed"); + + proto_unregister(&sco_proto); +} + +module_init(sco_init); +module_exit(sco_exit); + +MODULE_AUTHOR("Maxim Krasnyansky , Marcel Holtmann "); +MODULE_DESCRIPTION("Bluetooth SCO ver " VERSION); +MODULE_VERSION(VERSION); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("bt-proto-2"); diff --git a/net/bridge/Makefile b/net/bridge/Makefile new file mode 100644 index 000000000000..59556e40e143 --- /dev/null +++ b/net/bridge/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for the IEEE 802.1d ethernet bridging layer. +# + +obj-$(CONFIG_BRIDGE) += bridge.o + +bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ + br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ + br_stp_if.o br_stp_timer.o + +bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o + +bridge-$(CONFIG_BRIDGE_NETFILTER) += br_netfilter.o + +obj-$(CONFIG_BRIDGE_NF_EBTABLES) += netfilter/ diff --git a/net/bridge/br.c b/net/bridge/br.c new file mode 100644 index 000000000000..f8f184942aaf --- /dev/null +++ b/net/bridge/br.c @@ -0,0 +1,69 @@ +/* + * Generic parts + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br.c,v 1.47 2001/12/24 00:56:41 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +int (*br_should_route_hook) (struct sk_buff **pskb) = NULL; + +static int __init br_init(void) +{ + br_fdb_init(); + +#ifdef CONFIG_BRIDGE_NETFILTER + if (br_netfilter_init()) + return 1; +#endif + brioctl_set(br_ioctl_deviceless_stub); + br_handle_frame_hook = br_handle_frame; + + br_fdb_get_hook = br_fdb_get; + br_fdb_put_hook = br_fdb_put; + + register_netdevice_notifier(&br_device_notifier); + + return 0; +} + +static void __exit br_deinit(void) +{ +#ifdef CONFIG_BRIDGE_NETFILTER + br_netfilter_fini(); +#endif + unregister_netdevice_notifier(&br_device_notifier); + brioctl_set(NULL); + + br_cleanup_bridges(); + + synchronize_net(); + + br_fdb_get_hook = NULL; + br_fdb_put_hook = NULL; + + br_handle_frame_hook = NULL; + br_fdb_fini(); +} + +EXPORT_SYMBOL(br_should_route_hook); + +module_init(br_init) +module_exit(br_deinit) +MODULE_LICENSE("GPL"); diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c new file mode 100644 index 000000000000..d9b72fde433c --- /dev/null +++ b/net/bridge/br_device.c @@ -0,0 +1,104 @@ +/* + * Device handling code + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_device.c,v 1.6 2001/12/24 00:59:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include "br_private.h" + +static struct net_device_stats *br_dev_get_stats(struct net_device *dev) +{ + struct net_bridge *br; + + br = dev->priv; + + return &br->statistics; +} + +int br_dev_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_bridge *br = netdev_priv(dev); + const unsigned char *dest = skb->data; + struct net_bridge_fdb_entry *dst; + + br->statistics.tx_packets++; + br->statistics.tx_bytes += skb->len; + + skb->mac.raw = skb->data; + skb_pull(skb, ETH_HLEN); + + rcu_read_lock(); + if (dest[0] & 1) + br_flood_deliver(br, skb, 0); + else if ((dst = __br_fdb_get(br, dest)) != NULL) + br_deliver(dst->dst, skb); + else + br_flood_deliver(br, skb, 0); + + rcu_read_unlock(); + return 0; +} + +static int br_dev_open(struct net_device *dev) +{ + netif_start_queue(dev); + + br_stp_enable_bridge(dev->priv); + + return 0; +} + +static void br_dev_set_multicast_list(struct net_device *dev) +{ +} + +static int br_dev_stop(struct net_device *dev) +{ + br_stp_disable_bridge(dev->priv); + + netif_stop_queue(dev); + + return 0; +} + +static int br_change_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < 68) || new_mtu > br_min_mtu(dev->priv)) + return -EINVAL; + + dev->mtu = new_mtu; + return 0; +} + +void br_dev_setup(struct net_device *dev) +{ + memset(dev->dev_addr, 0, ETH_ALEN); + + ether_setup(dev); + + dev->do_ioctl = br_dev_ioctl; + dev->get_stats = br_dev_get_stats; + dev->hard_start_xmit = br_dev_xmit; + dev->open = br_dev_open; + dev->set_multicast_list = br_dev_set_multicast_list; + dev->change_mtu = br_change_mtu; + dev->destructor = free_netdev; + SET_MODULE_OWNER(dev); + dev->stop = br_dev_stop; + dev->tx_queue_len = 0; + dev->set_mac_address = NULL; + dev->priv_flags = IFF_EBRIDGE; +} diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c new file mode 100644 index 000000000000..e6c2200b7ca3 --- /dev/null +++ b/net/bridge/br_fdb.c @@ -0,0 +1,368 @@ +/* + * Forwarding database + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_fdb.c,v 1.6 2002/01/17 00:57:07 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "br_private.h" + +static kmem_cache_t *br_fdb_cache; +static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr); + +void __init br_fdb_init(void) +{ + br_fdb_cache = kmem_cache_create("bridge_fdb_cache", + sizeof(struct net_bridge_fdb_entry), + 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); +} + +void __exit br_fdb_fini(void) +{ + kmem_cache_destroy(br_fdb_cache); +} + + +/* if topology_changing then use forward_delay (default 15 sec) + * otherwise keep longer (default 5 minutes) + */ +static __inline__ unsigned long hold_time(const struct net_bridge *br) +{ + return br->topology_change ? 
br->forward_delay : br->ageing_time; +} + +static __inline__ int has_expired(const struct net_bridge *br, + const struct net_bridge_fdb_entry *fdb) +{ + return !fdb->is_static + && time_before_eq(fdb->ageing_timer + hold_time(br), jiffies); +} + +static __inline__ int br_mac_hash(const unsigned char *mac) +{ + return jhash(mac, ETH_ALEN, 0) & (BR_HASH_SIZE - 1); +} + +static __inline__ void fdb_delete(struct net_bridge_fdb_entry *f) +{ + hlist_del_rcu(&f->hlist); + br_fdb_put(f); +} + +void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr) +{ + struct net_bridge *br = p->br; + int i; + + spin_lock_bh(&br->hash_lock); + + /* Search all chains since old address/hash is unknown */ + for (i = 0; i < BR_HASH_SIZE; i++) { + struct hlist_node *h; + hlist_for_each(h, &br->hash[i]) { + struct net_bridge_fdb_entry *f; + + f = hlist_entry(h, struct net_bridge_fdb_entry, hlist); + if (f->dst == p && f->is_local) { + /* maybe another port has same hw addr? */ + struct net_bridge_port *op; + list_for_each_entry(op, &br->port_list, list) { + if (op != p && + !memcmp(op->dev->dev_addr, + f->addr.addr, ETH_ALEN)) { + f->dst = op; + goto insert; + } + } + + /* delete old one */ + fdb_delete(f); + goto insert; + } + } + } + insert: + /* insert new address, may fail if invalid address or dup. */ + fdb_insert(br, p, newaddr); + + spin_unlock_bh(&br->hash_lock); +} + +void br_fdb_cleanup(unsigned long _data) +{ + struct net_bridge *br = (struct net_bridge *)_data; + unsigned long delay = hold_time(br); + int i; + + spin_lock_bh(&br->hash_lock); + for (i = 0; i < BR_HASH_SIZE; i++) { + struct net_bridge_fdb_entry *f; + struct hlist_node *h, *n; + + hlist_for_each_entry_safe(f, h, n, &br->hash[i], hlist) { + if (!f->is_static && + time_before_eq(f->ageing_timer + delay, jiffies)) + fdb_delete(f); + } + } + spin_unlock_bh(&br->hash_lock); + + mod_timer(&br->gc_timer, jiffies + HZ/10); +} + +void br_fdb_delete_by_port(struct net_bridge *br, struct net_bridge_port *p) +{ + int i; + + spin_lock_bh(&br->hash_lock); + for (i = 0; i < BR_HASH_SIZE; i++) { + struct hlist_node *h, *g; + + hlist_for_each_safe(h, g, &br->hash[i]) { + struct net_bridge_fdb_entry *f + = hlist_entry(h, struct net_bridge_fdb_entry, hlist); + if (f->dst != p) + continue; + + /* + * if multiple ports all have the same device address + * then when one port is deleted, assign + * the local entry to other port + */ + if (f->is_local) { + struct net_bridge_port *op; + list_for_each_entry(op, &br->port_list, list) { + if (op != p && + !memcmp(op->dev->dev_addr, + f->addr.addr, ETH_ALEN)) { + f->dst = op; + goto skip_delete; + } + } + } + + fdb_delete(f); + skip_delete: ; + } + } + spin_unlock_bh(&br->hash_lock); +} + +/* No locking or refcounting, assumes caller has no preempt (rcu_read_lock) */ +struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, + const unsigned char *addr) +{ + struct hlist_node *h; + struct net_bridge_fdb_entry *fdb; + + hlist_for_each_entry_rcu(fdb, h, &br->hash[br_mac_hash(addr)], hlist) { + if (!memcmp(fdb->addr.addr, addr, ETH_ALEN)) { + if (unlikely(has_expired(br, fdb))) + break; + return fdb; + } + } + + return NULL; +} + +/* Interface used by ATM hook that keeps a ref count */ +struct net_bridge_fdb_entry *br_fdb_get(struct net_bridge *br, + unsigned char *addr) +{ + struct net_bridge_fdb_entry *fdb; + + rcu_read_lock(); + fdb = __br_fdb_get(br, addr); + if (fdb) + atomic_inc(&fdb->use_count); + rcu_read_unlock(); + return fdb; +} + +static void fdb_rcu_free(struct rcu_head *head) +{ + 
struct net_bridge_fdb_entry *ent + = container_of(head, struct net_bridge_fdb_entry, rcu); + kmem_cache_free(br_fdb_cache, ent); +} + +/* Set entry up for deletion with RCU */ +void br_fdb_put(struct net_bridge_fdb_entry *ent) +{ + if (atomic_dec_and_test(&ent->use_count)) + call_rcu(&ent->rcu, fdb_rcu_free); +} + +/* + * Fill buffer with forwarding table records in + * the API format. + */ +int br_fdb_fillbuf(struct net_bridge *br, void *buf, + unsigned long maxnum, unsigned long skip) +{ + struct __fdb_entry *fe = buf; + int i, num = 0; + struct hlist_node *h; + struct net_bridge_fdb_entry *f; + + memset(buf, 0, maxnum*sizeof(struct __fdb_entry)); + + rcu_read_lock(); + for (i = 0; i < BR_HASH_SIZE; i++) { + hlist_for_each_entry_rcu(f, h, &br->hash[i], hlist) { + if (num >= maxnum) + goto out; + + if (has_expired(br, f)) + continue; + + if (skip) { + --skip; + continue; + } + + /* convert from internal format to API */ + memcpy(fe->mac_addr, f->addr.addr, ETH_ALEN); + fe->port_no = f->dst->port_no; + fe->is_local = f->is_local; + if (!f->is_static) + fe->ageing_timer_value = jiffies_to_clock_t(jiffies - f->ageing_timer); + ++fe; + ++num; + } + } + + out: + rcu_read_unlock(); + + return num; +} + +static inline struct net_bridge_fdb_entry *fdb_find(struct hlist_head *head, + const unsigned char *addr) +{ + struct hlist_node *h; + struct net_bridge_fdb_entry *fdb; + + hlist_for_each_entry_rcu(fdb, h, head, hlist) { + if (!memcmp(fdb->addr.addr, addr, ETH_ALEN)) + return fdb; + } + return NULL; +} + +static struct net_bridge_fdb_entry *fdb_create(struct hlist_head *head, + struct net_bridge_port *source, + const unsigned char *addr, + int is_local) +{ + struct net_bridge_fdb_entry *fdb; + + fdb = kmem_cache_alloc(br_fdb_cache, GFP_ATOMIC); + if (fdb) { + memcpy(fdb->addr.addr, addr, ETH_ALEN); + atomic_set(&fdb->use_count, 1); + hlist_add_head_rcu(&fdb->hlist, head); + + fdb->dst = source; + fdb->is_local = is_local; + fdb->is_static = is_local; + fdb->ageing_timer = jiffies; + } + return fdb; +} + +static int fdb_insert(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + if (!is_valid_ether_addr(addr)) + return -EINVAL; + + fdb = fdb_find(head, addr); + if (fdb) { + /* it is okay to have multiple ports with same + * address, just use the first one. + */ + if (fdb->is_local) + return 0; + + printk(KERN_WARNING "%s adding interface with same address " + "as a received packet\n", + source->dev->name); + fdb_delete(fdb); + } + + if (!fdb_create(head, source, addr, 1)) + return -ENOMEM; + + return 0; +} + +int br_fdb_insert(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr) +{ + int ret; + + spin_lock_bh(&br->hash_lock); + ret = fdb_insert(br, source, addr); + spin_unlock_bh(&br->hash_lock); + return ret; +} + +void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source, + const unsigned char *addr) +{ + struct hlist_head *head = &br->hash[br_mac_hash(addr)]; + struct net_bridge_fdb_entry *fdb; + + /* some users want to always flood. 
*/ + if (hold_time(br) == 0) + return; + + rcu_read_lock(); + fdb = fdb_find(head, addr); + if (likely(fdb)) { + /* attempt to update an entry for a local interface */ + if (unlikely(fdb->is_local)) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: received packet with " + " own address as source address\n", + source->dev->name); + } else { + /* fastpath: update of existing entry */ + fdb->dst = source; + fdb->ageing_timer = jiffies; + } + } else { + spin_lock_bh(&br->hash_lock); + if (!fdb_find(head, addr)) + fdb_create(head, source, addr, 0); + /* else we lose race and someone else inserts + * it first, don't bother updating + */ + spin_unlock_bh(&br->hash_lock); + } + rcu_read_unlock(); +} diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c new file mode 100644 index 000000000000..ef9f2095f96e --- /dev/null +++ b/net/bridge/br_forward.c @@ -0,0 +1,159 @@ +/* + * Forwarding decision + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_forward.c,v 1.4 2001/08/14 22:05:57 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include "br_private.h" + +static inline int should_deliver(const struct net_bridge_port *p, + const struct sk_buff *skb) +{ + if (skb->dev == p->dev || + p->state != BR_STATE_FORWARDING) + return 0; + + return 1; +} + +int br_dev_queue_push_xmit(struct sk_buff *skb) +{ + if (skb->len > skb->dev->mtu) + kfree_skb(skb); + else { +#ifdef CONFIG_BRIDGE_NETFILTER + /* ip_refrag calls ip_fragment, doesn't copy the MAC header. */ + nf_bridge_maybe_copy_header(skb); +#endif + skb_push(skb, ETH_HLEN); + + dev_queue_xmit(skb); + } + + return 0; +} + +int br_forward_finish(struct sk_buff *skb) +{ + NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, + br_dev_queue_push_xmit); + + return 0; +} + +static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +{ + skb->dev = to->dev; +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif + NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + br_forward_finish); +} + +static void __br_forward(const struct net_bridge_port *to, struct sk_buff *skb) +{ + struct net_device *indev; + + indev = skb->dev; + skb->dev = to->dev; + skb->ip_summed = CHECKSUM_NONE; + + NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, indev, skb->dev, + br_forward_finish); +} + +/* called with rcu_read_lock */ +void br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) +{ + if (should_deliver(to, skb)) { + __br_deliver(to, skb); + return; + } + + kfree_skb(skb); +} + +/* called with rcu_read_lock */ +void br_forward(const struct net_bridge_port *to, struct sk_buff *skb) +{ + if (should_deliver(to, skb)) { + __br_forward(to, skb); + return; + } + + kfree_skb(skb); +} + +/* called under bridge lock */ +static void br_flood(struct net_bridge *br, struct sk_buff *skb, int clone, + void (*__packet_hook)(const struct net_bridge_port *p, + struct sk_buff *skb)) +{ + struct net_bridge_port *p; + struct net_bridge_port *prev; + + if (clone) { + struct sk_buff *skb2; + + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + br->statistics.tx_dropped++; + return; + } + + skb = skb2; + } + + prev = NULL; + + list_for_each_entry_rcu(p, &br->port_list, list) { + if (should_deliver(p, skb)) { + if (prev != NULL) { + struct sk_buff *skb2; 
+ + if ((skb2 = skb_clone(skb, GFP_ATOMIC)) == NULL) { + br->statistics.tx_dropped++; + kfree_skb(skb); + return; + } + + __packet_hook(prev, skb2); + } + + prev = p; + } + } + + if (prev != NULL) { + __packet_hook(prev, skb); + return; + } + + kfree_skb(skb); +} + + +/* called with rcu_read_lock */ +void br_flood_deliver(struct net_bridge *br, struct sk_buff *skb, int clone) +{ + br_flood(br, skb, clone, __br_deliver); +} + +/* called under bridge lock */ +void br_flood_forward(struct net_bridge *br, struct sk_buff *skb, int clone) +{ + br_flood(br, skb, clone, __br_forward); +} diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c new file mode 100644 index 000000000000..69872bf3b87e --- /dev/null +++ b/net/bridge/br_if.c @@ -0,0 +1,388 @@ +/* + * Userspace interface + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_if.c,v 1.7 2001/12/24 00:59:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +/* + * Determine initial path cost based on speed. + * using recommendations from 802.1d standard + * + * Need to simulate user ioctl because not all device's that support + * ethtool, use ethtool_ops. Also, since driver might sleep need to + * not be holding any locks. + */ +static int br_initial_port_cost(struct net_device *dev) +{ + + struct ethtool_cmd ecmd = { ETHTOOL_GSET }; + struct ifreq ifr; + mm_segment_t old_fs; + int err; + + strncpy(ifr.ifr_name, dev->name, IFNAMSIZ); + ifr.ifr_data = (void __user *) &ecmd; + + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = dev_ethtool(&ifr); + set_fs(old_fs); + + if (!err) { + switch(ecmd.speed) { + case SPEED_100: + return 19; + case SPEED_1000: + return 4; + case SPEED_10000: + return 2; + case SPEED_10: + return 100; + default: + pr_info("bridge: can't decode speed from %s: %d\n", + dev->name, ecmd.speed); + return 100; + } + } + + /* Old silly heuristics based on name */ + if (!strncmp(dev->name, "lec", 3)) + return 7; + + if (!strncmp(dev->name, "plip", 4)) + return 2500; + + return 100; /* assume old 10Mbps */ +} + +static void destroy_nbp(struct net_bridge_port *p) +{ + struct net_device *dev = p->dev; + + dev->br_port = NULL; + p->br = NULL; + p->dev = NULL; + dev_put(dev); + + br_sysfs_freeif(p); +} + +static void destroy_nbp_rcu(struct rcu_head *head) +{ + struct net_bridge_port *p = + container_of(head, struct net_bridge_port, rcu); + destroy_nbp(p); +} + +/* called with RTNL */ +static void del_nbp(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + struct net_device *dev = p->dev; + + dev_set_promiscuity(dev, -1); + + spin_lock_bh(&br->lock); + br_stp_disable_port(p); + spin_unlock_bh(&br->lock); + + br_fdb_delete_by_port(br, p); + + list_del_rcu(&p->list); + + del_timer_sync(&p->message_age_timer); + del_timer_sync(&p->forward_delay_timer); + del_timer_sync(&p->hold_timer); + + call_rcu(&p->rcu, destroy_nbp_rcu); +} + +/* called with RTNL */ +static void del_br(struct net_bridge *br) +{ + struct net_bridge_port *p, *n; + + list_for_each_entry_safe(p, n, &br->port_list, list) { + br_sysfs_removeif(p); + del_nbp(p); + } + + del_timer_sync(&br->gc_timer); + + br_sysfs_delbr(br->dev); + unregister_netdevice(br->dev); +} + +static struct net_device 
*new_bridge_dev(const char *name) +{ + struct net_bridge *br; + struct net_device *dev; + + dev = alloc_netdev(sizeof(struct net_bridge), name, + br_dev_setup); + + if (!dev) + return NULL; + + br = netdev_priv(dev); + br->dev = dev; + + spin_lock_init(&br->lock); + INIT_LIST_HEAD(&br->port_list); + spin_lock_init(&br->hash_lock); + + br->bridge_id.prio[0] = 0x80; + br->bridge_id.prio[1] = 0x00; + memset(br->bridge_id.addr, 0, ETH_ALEN); + + br->stp_enabled = 0; + br->designated_root = br->bridge_id; + br->root_path_cost = 0; + br->root_port = 0; + br->bridge_max_age = br->max_age = 20 * HZ; + br->bridge_hello_time = br->hello_time = 2 * HZ; + br->bridge_forward_delay = br->forward_delay = 15 * HZ; + br->topology_change = 0; + br->topology_change_detected = 0; + br->ageing_time = 300 * HZ; + INIT_LIST_HEAD(&br->age_list); + + br_stp_timer_init(br); + + return dev; +} + +/* find an available port number */ +static int find_portno(struct net_bridge *br) +{ + int index; + struct net_bridge_port *p; + unsigned long *inuse; + + inuse = kmalloc(BITS_TO_LONGS(BR_MAX_PORTS)*sizeof(unsigned long), + GFP_KERNEL); + if (!inuse) + return -ENOMEM; + + memset(inuse, 0, BITS_TO_LONGS(BR_MAX_PORTS)*sizeof(unsigned long)); + set_bit(0, inuse); /* zero is reserved */ + list_for_each_entry(p, &br->port_list, list) { + set_bit(p->port_no, inuse); + } + index = find_first_zero_bit(inuse, BR_MAX_PORTS); + kfree(inuse); + + return (index >= BR_MAX_PORTS) ? -EXFULL : index; +} + +/* called with RTNL */ +static struct net_bridge_port *new_nbp(struct net_bridge *br, + struct net_device *dev, + unsigned long cost) +{ + int index; + struct net_bridge_port *p; + + index = find_portno(br); + if (index < 0) + return ERR_PTR(index); + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return ERR_PTR(-ENOMEM); + + memset(p, 0, sizeof(*p)); + p->br = br; + dev_hold(dev); + p->dev = dev; + p->path_cost = cost; + p->priority = 0x8000 >> BR_PORT_BITS; + dev->br_port = p; + p->port_no = index; + br_init_port(p); + p->state = BR_STATE_DISABLED; + kobject_init(&p->kobj); + + return p; +} + +int br_add_bridge(const char *name) +{ + struct net_device *dev; + int ret; + + dev = new_bridge_dev(name); + if (!dev) + return -ENOMEM; + + rtnl_lock(); + if (strchr(dev->name, '%')) { + ret = dev_alloc_name(dev, dev->name); + if (ret < 0) + goto err1; + } + + ret = register_netdevice(dev); + if (ret) + goto err2; + + /* network device kobject is not set up until + * after rtnl_unlock does its hotplug magic, + * so hold a reference to avoid a race. + */ + dev_hold(dev); + rtnl_unlock(); + + ret = br_sysfs_addbr(dev); + dev_put(dev); + + if (ret) + unregister_netdev(dev); + out: + return ret; + + err2: + free_netdev(dev); + err1: + rtnl_unlock(); + goto out; +} + +int br_del_bridge(const char *name) +{ + struct net_device *dev; + int ret = 0; + + rtnl_lock(); + dev = __dev_get_by_name(name); + if (dev == NULL) + ret = -ENXIO; /* Could not find device */ + + else if (!(dev->priv_flags & IFF_EBRIDGE)) { + /* Attempt to delete a non-bridge device! */ + ret = -EPERM; + } + + else if (dev->flags & IFF_UP) { + /* Not shut down yet.
*/ + ret = -EBUSY; + } + + else + del_br(netdev_priv(dev)); + + rtnl_unlock(); + return ret; +} + +/* Mtu of the bridge pseudo-device 1500 or the minimum of the ports */ +int br_min_mtu(const struct net_bridge *br) +{ + const struct net_bridge_port *p; + int mtu = 0; + + ASSERT_RTNL(); + + if (list_empty(&br->port_list)) + mtu = 1500; + else { + list_for_each_entry(p, &br->port_list, list) { + if (!mtu || p->dev->mtu < mtu) + mtu = p->dev->mtu; + } + } + return mtu; +} + +/* called with RTNL */ +int br_add_if(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p; + int err = 0; + + if (dev->flags & IFF_LOOPBACK || dev->type != ARPHRD_ETHER) + return -EINVAL; + + if (dev->hard_start_xmit == br_dev_xmit) + return -ELOOP; + + if (dev->br_port != NULL) + return -EBUSY; + + if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev)))) + return PTR_ERR(p); + + if ((err = br_fdb_insert(br, p, dev->dev_addr))) + destroy_nbp(p); + + else if ((err = br_sysfs_addif(p))) + del_nbp(p); + else { + dev_set_promiscuity(dev, 1); + + list_add_rcu(&p->list, &br->port_list); + + spin_lock_bh(&br->lock); + br_stp_recalculate_bridge_id(br); + if ((br->dev->flags & IFF_UP) + && (dev->flags & IFF_UP) && netif_carrier_ok(dev)) + br_stp_enable_port(p); + spin_unlock_bh(&br->lock); + + dev_set_mtu(br->dev, br_min_mtu(br)); + } + + return err; +} + +/* called with RTNL */ +int br_del_if(struct net_bridge *br, struct net_device *dev) +{ + struct net_bridge_port *p = dev->br_port; + + if (!p || p->br != br) + return -EINVAL; + + br_sysfs_removeif(p); + del_nbp(p); + + spin_lock_bh(&br->lock); + br_stp_recalculate_bridge_id(br); + spin_unlock_bh(&br->lock); + + return 0; +} + +void __exit br_cleanup_bridges(void) +{ + struct net_device *dev, *nxt; + + rtnl_lock(); + for (dev = dev_base; dev; dev = nxt) { + nxt = dev->next; + if (dev->priv_flags & IFF_EBRIDGE) + del_br(dev->priv); + } + rtnl_unlock(); + +} diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c new file mode 100644 index 000000000000..2b1cce46cab4 --- /dev/null +++ b/net/bridge/br_input.c @@ -0,0 +1,144 @@ +/* + * Handle incoming frames + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_input.c,v 1.10 2001/12/24 04:50:20 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include "br_private.h" + +const unsigned char bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + +static int br_pass_frame_up_finish(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug = 0; +#endif + netif_rx(skb); + + return 0; +} + +static void br_pass_frame_up(struct net_bridge *br, struct sk_buff *skb) +{ + struct net_device *indev; + + br->statistics.rx_packets++; + br->statistics.rx_bytes += skb->len; + + indev = skb->dev; + skb->dev = br->dev; + + NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, indev, NULL, + br_pass_frame_up_finish); +} + +/* note: already called with rcu_read_lock (preempt_disabled) */ +int br_handle_frame_finish(struct sk_buff *skb) +{ + const unsigned char *dest = eth_hdr(skb)->h_dest; + struct net_bridge_port *p = skb->dev->br_port; + struct net_bridge *br = p->br; + struct net_bridge_fdb_entry *dst; + int passedup = 0; + + if (br->dev->flags & IFF_PROMISC) { + struct sk_buff *skb2; + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 != NULL) { + passedup = 1; + br_pass_frame_up(br, skb2); + } + } + + if (dest[0] & 1) { + br_flood_forward(br, skb, !passedup); + if (!passedup) + br_pass_frame_up(br, skb); + goto out; + } + + dst = __br_fdb_get(br, dest); + if (dst != NULL && dst->is_local) { + if (!passedup) + br_pass_frame_up(br, skb); + else + kfree_skb(skb); + goto out; + } + + if (dst != NULL) { + br_forward(dst->dst, skb); + goto out; + } + + br_flood_forward(br, skb, 0); + +out: + return 0; +} + +/* + * Called via br_handle_frame_hook. + * Return 0 if *pskb should be processed furthur + * 1 if *pskb is handled + * note: already called with rcu_read_lock (preempt_disabled) + */ +int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + const unsigned char *dest = eth_hdr(skb)->h_dest; + + if (p->state == BR_STATE_DISABLED) + goto err; + + if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) + goto err; + + if (p->state == BR_STATE_LEARNING || + p->state == BR_STATE_FORWARDING) + br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + + if (p->br->stp_enabled && + !memcmp(dest, bridge_ula, 5) && + !(dest[5] & 0xF0)) { + if (!dest[5]) { + NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, + NULL, br_stp_handle_bpdu); + return 1; + } + } + + else if (p->state == BR_STATE_FORWARDING) { + if (br_should_route_hook) { + if (br_should_route_hook(pskb)) + return 0; + skb = *pskb; + dest = eth_hdr(skb)->h_dest; + } + + if (!memcmp(p->br->dev->dev_addr, dest, ETH_ALEN)) + skb->pkt_type = PACKET_HOST; + + NF_HOOK(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + br_handle_frame_finish); + return 1; + } + +err: + kfree_skb(skb); + return 1; +} diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c new file mode 100644 index 000000000000..b8ce14b22181 --- /dev/null +++ b/net/bridge/br_ioctl.c @@ -0,0 +1,410 @@ +/* + * Ioctl handler + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_ioctl.c,v 1.4 2000/11/08 05:16:40 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
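br_handle_frame_finish() above settles each frame in a fixed order: clone it up the stack when the bridge device is promiscuous, flood group traffic (low bit of the destination address set), deliver frames whose destination is a local forwarding-database entry, forward unicast to the port the database names, and flood unknown unicast. A condensed sketch of that ordering; fdb_lookup() is a hypothetical stand-in for __br_fdb_get(), and the promiscuous clone is left out:

enum frame_disposition { DELIVER_LOCAL, FORWARD_ONE, FLOOD_ALL };

struct fdb_result { int found; int is_local; };

/* fdb_lookup() stands in for the real forwarding-database query. */
static enum frame_disposition classify(const unsigned char *dest,
                struct fdb_result (*fdb_lookup)(const unsigned char *))
{
        struct fdb_result dst;

        if (dest[0] & 1)                /* multicast/broadcast group bit */
                return FLOOD_ALL;

        dst = fdb_lookup(dest);
        if (dst.found && dst.is_local)  /* addressed to the bridge itself */
                return DELIVER_LOCAL;
        if (dst.found)
                return FORWARD_ONE;     /* unicast to a known port */

        return FLOOD_ALL;               /* unknown unicast: flood */
}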
+ */ + +#include +#include +#include +#include +#include +#include "br_private.h" + +/* called with RTNL */ +static int get_bridge_ifindices(int *indices, int num) +{ + struct net_device *dev; + int i = 0; + + for (dev = dev_base; dev && i < num; dev = dev->next) { + if (dev->priv_flags & IFF_EBRIDGE) + indices[i++] = dev->ifindex; + } + + return i; +} + +/* called with RTNL */ +static void get_port_ifindices(struct net_bridge *br, int *ifindices, int num) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->port_no < num) + ifindices[p->port_no] = p->dev->ifindex; + } +} + +/* + * Format up to a page worth of forwarding table entries + * userbuf -- where to copy result + * maxnum -- maximum number of entries desired + * (limited to a page for sanity) + * offset -- number of records to skip + */ +static int get_fdb_entries(struct net_bridge *br, void __user *userbuf, + unsigned long maxnum, unsigned long offset) +{ + int num; + void *buf; + size_t size = maxnum * sizeof(struct __fdb_entry); + + if (size > PAGE_SIZE) { + size = PAGE_SIZE; + maxnum = PAGE_SIZE/sizeof(struct __fdb_entry); + } + + buf = kmalloc(size, GFP_USER); + if (!buf) + return -ENOMEM; + + num = br_fdb_fillbuf(br, buf, maxnum, offset); + if (num > 0) { + if (copy_to_user(userbuf, buf, num*sizeof(struct __fdb_entry))) + num = -EFAULT; + } + kfree(buf); + + return num; +} + +static int add_del_if(struct net_bridge *br, int ifindex, int isadd) +{ + struct net_device *dev; + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + dev = dev_get_by_index(ifindex); + if (dev == NULL) + return -EINVAL; + + if (isadd) + ret = br_add_if(br, dev); + else + ret = br_del_if(br, dev); + + dev_put(dev); + return ret; +} + +/* + * Legacy ioctl's through SIOCDEVPRIVATE + * This interface is deprecated because it was too difficult to + * to do the translation for 32/64bit ioctl compatability. 
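get_fdb_entries() above clamps the caller's count so the kernel-side buffer never exceeds one page, fills it from the forwarding database, and copies back only the entries actually produced. The clamping arithmetic in isolation; PAGE_SZ and ENTRY_SIZE are illustrative constants, not the kernel's values:

#define PAGE_SZ    4096UL
#define ENTRY_SIZE 16UL          /* placeholder for sizeof(struct __fdb_entry) */

/* Clamp a requested entry count so the backing buffer fits in one page. */
static unsigned long clamp_fdb_request(unsigned long maxnum, unsigned long *bytes)
{
        unsigned long size = maxnum * ENTRY_SIZE;

        if (size > PAGE_SZ) {
                size = PAGE_SZ;
                maxnum = PAGE_SZ / ENTRY_SIZE;
        }
        *bytes = size;
        return maxnum;
}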
+ */ +static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + struct net_bridge *br = netdev_priv(dev); + unsigned long args[4]; + + if (copy_from_user(args, rq->ifr_data, sizeof(args))) + return -EFAULT; + + switch (args[0]) { + case BRCTL_ADD_IF: + case BRCTL_DEL_IF: + return add_del_if(br, args[1], args[0] == BRCTL_ADD_IF); + + case BRCTL_GET_BRIDGE_INFO: + { + struct __bridge_info b; + + memset(&b, 0, sizeof(struct __bridge_info)); + rcu_read_lock(); + memcpy(&b.designated_root, &br->designated_root, 8); + memcpy(&b.bridge_id, &br->bridge_id, 8); + b.root_path_cost = br->root_path_cost; + b.max_age = jiffies_to_clock_t(br->max_age); + b.hello_time = jiffies_to_clock_t(br->hello_time); + b.forward_delay = br->forward_delay; + b.bridge_max_age = br->bridge_max_age; + b.bridge_hello_time = br->bridge_hello_time; + b.bridge_forward_delay = jiffies_to_clock_t(br->bridge_forward_delay); + b.topology_change = br->topology_change; + b.topology_change_detected = br->topology_change_detected; + b.root_port = br->root_port; + b.stp_enabled = br->stp_enabled; + b.ageing_time = jiffies_to_clock_t(br->ageing_time); + b.hello_timer_value = br_timer_value(&br->hello_timer); + b.tcn_timer_value = br_timer_value(&br->tcn_timer); + b.topology_change_timer_value = br_timer_value(&br->topology_change_timer); + b.gc_timer_value = br_timer_value(&br->gc_timer); + rcu_read_unlock(); + + if (copy_to_user((void __user *)args[1], &b, sizeof(b))) + return -EFAULT; + + return 0; + } + + case BRCTL_GET_PORT_LIST: + { + int num, *indices; + + num = args[2]; + if (num < 0) + return -EINVAL; + if (num == 0) + num = 256; + if (num > BR_MAX_PORTS) + num = BR_MAX_PORTS; + + indices = kmalloc(num*sizeof(int), GFP_KERNEL); + if (indices == NULL) + return -ENOMEM; + + memset(indices, 0, num*sizeof(int)); + + get_port_ifindices(br, indices, num); + if (copy_to_user((void __user *)args[1], indices, num*sizeof(int))) + num = -EFAULT; + kfree(indices); + return num; + } + + case BRCTL_SET_BRIDGE_FORWARD_DELAY: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + br->bridge_forward_delay = clock_t_to_jiffies(args[1]); + if (br_is_root_bridge(br)) + br->forward_delay = br->bridge_forward_delay; + spin_unlock_bh(&br->lock); + return 0; + + case BRCTL_SET_BRIDGE_HELLO_TIME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + br->bridge_hello_time = clock_t_to_jiffies(args[1]); + if (br_is_root_bridge(br)) + br->hello_time = br->bridge_hello_time; + spin_unlock_bh(&br->lock); + return 0; + + case BRCTL_SET_BRIDGE_MAX_AGE: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + br->bridge_max_age = clock_t_to_jiffies(args[1]); + if (br_is_root_bridge(br)) + br->max_age = br->bridge_max_age; + spin_unlock_bh(&br->lock); + return 0; + + case BRCTL_SET_AGEING_TIME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br->ageing_time = clock_t_to_jiffies(args[1]); + return 0; + + case BRCTL_GET_PORT_INFO: + { + struct __port_info p; + struct net_bridge_port *pt; + + rcu_read_lock(); + if ((pt = br_get_port(br, args[2])) == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + memset(&p, 0, sizeof(struct __port_info)); + memcpy(&p.designated_root, &pt->designated_root, 8); + memcpy(&p.designated_bridge, &pt->designated_bridge, 8); + p.port_id = pt->port_id; + p.designated_port = pt->designated_port; + p.path_cost = pt->path_cost; + p.designated_cost = pt->designated_cost; + p.state = pt->state; + p.top_change_ack = pt->topology_change_ack; 
+ p.config_pending = pt->config_pending; + p.message_age_timer_value = br_timer_value(&pt->message_age_timer); + p.forward_delay_timer_value = br_timer_value(&pt->forward_delay_timer); + p.hold_timer_value = br_timer_value(&pt->hold_timer); + + rcu_read_unlock(); + + if (copy_to_user((void __user *)args[1], &p, sizeof(p))) + return -EFAULT; + + return 0; + } + + case BRCTL_SET_BRIDGE_STP_STATE: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + br->stp_enabled = args[1]?1:0; + return 0; + + case BRCTL_SET_BRIDGE_PRIORITY: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + br_stp_set_bridge_priority(br, args[1]); + spin_unlock_bh(&br->lock); + return 0; + + case BRCTL_SET_PORT_PRIORITY: + { + struct net_bridge_port *p; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (args[2] >= (1<<(16-BR_PORT_BITS))) + return -ERANGE; + + spin_lock_bh(&br->lock); + if ((p = br_get_port(br, args[1])) == NULL) + ret = -EINVAL; + else + br_stp_set_port_priority(p, args[2]); + spin_unlock_bh(&br->lock); + return ret; + } + + case BRCTL_SET_PATH_COST: + { + struct net_bridge_port *p; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + spin_lock_bh(&br->lock); + if ((p = br_get_port(br, args[1])) == NULL) + ret = -EINVAL; + else + br_stp_set_path_cost(p, args[2]); + spin_unlock_bh(&br->lock); + return ret; + } + + case BRCTL_GET_FDB_ENTRIES: + return get_fdb_entries(br, (void __user *)args[1], + args[2], args[3]); + } + + return -EOPNOTSUPP; +} + +static int old_deviceless(void __user *uarg) +{ + unsigned long args[3]; + + if (copy_from_user(args, uarg, sizeof(args))) + return -EFAULT; + + switch (args[0]) { + case BRCTL_GET_VERSION: + return BRCTL_VERSION; + + case BRCTL_GET_BRIDGES: + { + int *indices; + int ret = 0; + + if (args[2] >= 2048) + return -ENOMEM; + indices = kmalloc(args[2]*sizeof(int), GFP_KERNEL); + if (indices == NULL) + return -ENOMEM; + + memset(indices, 0, args[2]*sizeof(int)); + args[2] = get_bridge_ifindices(indices, args[2]); + + ret = copy_to_user((void __user *)args[1], indices, args[2]*sizeof(int)) + ? 
-EFAULT : args[2]; + + kfree(indices); + return ret; + } + + case BRCTL_ADD_BRIDGE: + case BRCTL_DEL_BRIDGE: + { + char buf[IFNAMSIZ]; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(buf, (void __user *)args[1], IFNAMSIZ)) + return -EFAULT; + + buf[IFNAMSIZ-1] = 0; + + if (args[0] == BRCTL_ADD_BRIDGE) + return br_add_bridge(buf); + + return br_del_bridge(buf); + } + } + + return -EOPNOTSUPP; +} + +int br_ioctl_deviceless_stub(unsigned int cmd, void __user *uarg) +{ + switch (cmd) { + case SIOCGIFBR: + case SIOCSIFBR: + return old_deviceless(uarg); + + case SIOCBRADDBR: + case SIOCBRDELBR: + { + char buf[IFNAMSIZ]; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(buf, uarg, IFNAMSIZ)) + return -EFAULT; + + buf[IFNAMSIZ-1] = 0; + if (cmd == SIOCBRADDBR) + return br_add_bridge(buf); + + return br_del_bridge(buf); + } + } + return -EOPNOTSUPP; +} + +int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) +{ + struct net_bridge *br = netdev_priv(dev); + + switch(cmd) { + case SIOCDEVPRIVATE: + return old_dev_ioctl(dev, rq, cmd); + + case SIOCBRADDIF: + case SIOCBRDELIF: + return add_del_if(br, rq->ifr_ifindex, cmd == SIOCBRADDIF); + + } + + pr_debug("Bridge does not support ioctl 0x%x\n", cmd); + return -EOPNOTSUPP; +} diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c new file mode 100644 index 000000000000..be03d3ad2648 --- /dev/null +++ b/net/bridge/br_netfilter.c @@ -0,0 +1,1087 @@ +/* + * Handle firewalling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * Bart De Schuymer (maintainer) + * + * Changes: + * Apr 29 2003: physdev module support (bdschuym) + * Jun 19 2003: let arptables see bridged ARP traffic (bdschuym) + * Oct 06 2003: filter encapsulated IP/ARP VLAN traffic on untagged bridge + * (bdschuym) + * Sep 01 2004: add IPv6 filtering (bdschuym) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Lennert dedicates this file to Kerstin Wurdinger. 
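br_ioctl_deviceless_stub() above serves both the legacy SIOCGIFBR/SIOCSIFBR multiplexer and the newer SIOCBRADDBR/SIOCBRDELBR calls, which carry nothing but an interface name. From user space the newer path looks roughly like the sketch below (error handling trimmed; any socket that accepts ioctls will do, which is how brctl drives this interface):

#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/sockios.h>      /* SIOCBRADDBR, SIOCBRDELBR */
#include <unistd.h>

/* Create a bridge called "br0"; mirrors what brctl addbr does. */
static int create_bridge(void)
{
        int fd = socket(AF_LOCAL, SOCK_STREAM, 0);
        int err;

        if (fd < 0)
                return -1;
        err = ioctl(fd, SIOCBRADDBR, "br0");    /* needs CAP_NET_ADMIN */
        close(fd);
        return err;
}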
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "br_private.h" +#ifdef CONFIG_SYSCTL +#include +#endif + +#define skb_origaddr(skb) (((struct bridge_skb_cb *) \ + (skb->nf_bridge->data))->daddr.ipv4) +#define store_orig_dstaddr(skb) (skb_origaddr(skb) = (skb)->nh.iph->daddr) +#define dnat_took_place(skb) (skb_origaddr(skb) != (skb)->nh.iph->daddr) + +#define has_bridge_parent(device) ((device)->br_port != NULL) +#define bridge_parent(device) ((device)->br_port->br->dev) + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *brnf_sysctl_header; +static int brnf_call_iptables = 1; +static int brnf_call_ip6tables = 1; +static int brnf_call_arptables = 1; +static int brnf_filter_vlan_tagged = 1; +#else +#define brnf_filter_vlan_tagged 1 +#endif + +#define IS_VLAN_IP (skb->protocol == __constant_htons(ETH_P_8021Q) && \ + hdr->h_vlan_encapsulated_proto == __constant_htons(ETH_P_IP) && \ + brnf_filter_vlan_tagged) +#define IS_VLAN_IPV6 (skb->protocol == __constant_htons(ETH_P_8021Q) && \ + hdr->h_vlan_encapsulated_proto == __constant_htons(ETH_P_IPV6) && \ + brnf_filter_vlan_tagged) +#define IS_VLAN_ARP (skb->protocol == __constant_htons(ETH_P_8021Q) && \ + hdr->h_vlan_encapsulated_proto == __constant_htons(ETH_P_ARP) && \ + brnf_filter_vlan_tagged) + +/* We need these fake structures to make netfilter happy -- + * lots of places assume that skb->dst != NULL, which isn't + * all that unreasonable. + * + * Currently, we fill in the PMTU entry because netfilter + * refragmentation needs it, and the rt_flags entry because + * ipt_REJECT needs it. Future netfilter modules might + * require us to fill additional fields. */ +static struct net_device __fake_net_device = { + .hard_header_len = ETH_HLEN +}; + +static struct rtable __fake_rtable = { + .u = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .dev = &__fake_net_device, + .path = &__fake_rtable.u.dst, + .metrics = {[RTAX_MTU - 1] = 1500}, + } + }, + .rt_flags = 0, +}; + + +/* PF_BRIDGE/PRE_ROUTING *********************************************/ +/* Undo the changes made for ip6tables PREROUTING and continue the + * bridge PRE_ROUTING hook. */ +static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING); +#endif + + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + + skb->dst = (struct dst_entry *)&__fake_rtable; + dst_hold(skb->dst); + + skb->dev = nf_bridge->physindev; + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_push(skb, VLAN_HLEN); + skb->nh.raw -= VLAN_HLEN; + } + NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +static void __br_dnat_complain(void) +{ + static unsigned long last_complaint; + + if (jiffies - last_complaint >= 5 * HZ) { + printk(KERN_WARNING "Performing cross-bridge DNAT requires IP " + "forwarding to be enabled\n"); + last_complaint = jiffies; + } +} + +/* This requires some explaining. If DNAT has taken place, + * we will need to fix up the destination Ethernet address, + * and this is a tricky process. + * + * There are two cases to consider: + * 1. The packet was DNAT'ed to a device in the same bridge + * port group as it was received on. 
We can still bridge + * the packet. + * 2. The packet was DNAT'ed to a different device, either + * a non-bridged device or another bridge port group. + * The packet will need to be routed. + * + * The correct way of distinguishing between these two cases is to + * call ip_route_input() and to look at skb->dst->dev, which is + * changed to the destination device if ip_route_input() succeeds. + * + * Let us first consider the case that ip_route_input() succeeds: + * + * If skb->dst->dev equals the logical bridge device the packet + * came in on, we can consider this bridging. We then call + * skb->dst->output() which will make the packet enter br_nf_local_out() + * not much later. In that function it is assured that the iptables + * FORWARD chain is traversed for the packet. + * + * Otherwise, the packet is considered to be routed and we just + * change the destination MAC address so that the packet will + * later be passed up to the IP stack to be routed. + * + * Let us now consider the case that ip_route_input() fails: + * + * After a "echo '0' > /proc/sys/net/ipv4/ip_forward" ip_route_input() + * will fail, while __ip_route_output_key() will return success. The source + * address for __ip_route_output_key() is set to zero, so __ip_route_output_key + * thinks we're handling a locally generated packet and won't care + * if IP forwarding is allowed. We send a warning message to the users's + * log telling her to put IP forwarding on. + * + * ip_route_input() will also fail if there is no route available. + * In that case we just drop the packet. + * + * --Lennert, 20020411 + * --Bart, 20020416 (updated) + * --Bart, 20021007 (updated) */ +static int br_nf_pre_routing_finish_bridge(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug |= (1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_FORWARD); +#endif + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + skb->nf_bridge->mask |= BRNF_PKT_TYPE; + } + skb->nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + + skb->dev = bridge_parent(skb->dev); + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_pull(skb, VLAN_HLEN); + skb->nh.raw += VLAN_HLEN; + } + skb->dst->output(skb); + return 0; +} + +static int br_nf_pre_routing_finish(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct iphdr *iph = skb->nh.iph; + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_BR_PRE_ROUTING); +#endif + + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING; + + if (dnat_took_place(skb)) { + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, + dev)) { + struct rtable *rt; + struct flowi fl = { .nl_u = + { .ip4_u = { .daddr = iph->daddr, .saddr = 0 , + .tos = RT_TOS(iph->tos)} }, .proto = 0}; + + if (!ip_route_output_key(&rt, &fl)) { + /* Bridged-and-DNAT'ed traffic doesn't + * require ip_forwarding. 
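The commentary above reduces to a three-way decision once DNAT has been detected: an input-route lookup that lands back on the bridge device means keep bridging; one that lands elsewhere means rewrite the destination MAC and let the IP stack route the packet; a failed lookup is retried as an output route only to tell "forwarding disabled" (warn, then drop) apart from "no route" (drop silently). A schematic restatement, with route_input() and route_output() as stand-ins for the real routing calls:

enum dnat_action { KEEP_BRIDGING, HAND_TO_ROUTER, WARN_AND_DROP, DROP_NO_ROUTE };

/* route_input()/route_output() stand in for the real lookups; each reports
 * via *dst_is_bridge whether the chosen output device is the logical bridge
 * the frame arrived on. */
static enum dnat_action decide_after_dnat(int (*route_input)(int *dst_is_bridge),
                                          int (*route_output)(int *dst_is_bridge))
{
        int dst_is_bridge;

        if (route_input(&dst_is_bridge) == 0)
                return dst_is_bridge ? KEEP_BRIDGING : HAND_TO_ROUTER;

        /* input lookup failed: either IP forwarding is off, or no route exists */
        if (route_output(&dst_is_bridge) == 0)
                return dst_is_bridge ? KEEP_BRIDGING : WARN_AND_DROP;

        return DROP_NO_ROUTE;
}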
*/ + if (((struct dst_entry *)rt)->dev == dev) { + skb->dst = (struct dst_entry *)rt; + goto bridged_dnat; + } + __br_dnat_complain(); + dst_release((struct dst_entry *)rt); + } + kfree_skb(skb); + return 0; + } else { + if (skb->dst->dev == dev) { +bridged_dnat: + /* Tell br_nf_local_out this is a + * bridged frame */ + nf_bridge->mask |= BRNF_BRIDGED_DNAT; + skb->dev = nf_bridge->physindev; + if (skb->protocol == + __constant_htons(ETH_P_8021Q)) { + skb_push(skb, VLAN_HLEN); + skb->nh.raw -= VLAN_HLEN; + } + NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, + skb, skb->dev, NULL, + br_nf_pre_routing_finish_bridge, + 1); + return 0; + } + memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, + ETH_ALEN); + skb->pkt_type = PACKET_HOST; + } + } else { + skb->dst = (struct dst_entry *)&__fake_rtable; + dst_hold(skb->dst); + } + + skb->dev = nf_bridge->physindev; + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_push(skb, VLAN_HLEN); + skb->nh.raw -= VLAN_HLEN; + } + NF_HOOK_THRESH(PF_BRIDGE, NF_BR_PRE_ROUTING, skb, skb->dev, NULL, + br_handle_frame_finish, 1); + + return 0; +} + +/* Some common code for IPv4/IPv6 */ +static void setup_pre_routing(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->mask |= BRNF_PKT_TYPE; + } + + nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING; + nf_bridge->physindev = skb->dev; + skb->dev = bridge_parent(skb->dev); +} + +/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */ +static int check_hbh_len(struct sk_buff *skb) +{ + unsigned char *raw = (u8*)(skb->nh.ipv6h+1); + u32 pkt_len; + int off = raw - skb->nh.raw; + int len = (raw[1]+1)<<3; + + if ((raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = raw[off+1]+2; + + switch (skb->nh.raw[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + case IPV6_TLV_JUMBO: + if (skb->nh.raw[off+1] != 4 || (off&3) != 2) + goto bad; + + pkt_len = ntohl(*(u32*)(skb->nh.raw+off+2)); + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) + goto bad; + if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { + if (__pskb_trim(skb, + pkt_len + sizeof(struct ipv6hdr))) + goto bad; + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } + break; + default: + if (optlen > len) + goto bad; + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 0; +bad: + return -1; + +} + +/* Replicate the checks that IPv6 does on packet reception and pass the packet + * to ip6tables, which doesn't support NAT, so things are fairly simple. 
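check_hbh_len() above walks the hop-by-hop options only far enough to validate their lengths (plus the jumbo-payload special case). The TLV iteration pattern on its own, over a plain buffer and without the jumbo handling; Pad1 is the single-byte option with type 0, everything else is type, length, then that many data bytes:

#define TLV_PAD1 0

/* Walk hop-by-hop option TLVs in buf[0..len); return 0 if the lengths are
 * consistent, -1 otherwise. */
static int walk_hbh_options(const unsigned char *buf, int len)
{
        int off = 0;

        while (off < len) {
                int optlen;

                if (buf[off] == TLV_PAD1) {
                        optlen = 1;
                } else {
                        if (off + 2 > len)
                                return -1;
                        optlen = buf[off + 1] + 2;
                }
                if (off + optlen > len)
                        return -1;
                off += optlen;
        }
        return 0;
}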
*/ +static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, + struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, int (*okfn)(struct sk_buff *)) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + struct nf_bridge_info *nf_bridge; + + if (skb->len < sizeof(struct ipv6hdr)) + goto inhdr_error; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto inhdr_error; + + hdr = skb->nh.ipv6h; + + if (hdr->version != 6) + goto inhdr_error; + + pkt_len = ntohs(hdr->payload_len); + + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + goto inhdr_error; + if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { + if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr))) + goto inhdr_error; + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } + } + if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) + goto inhdr_error; + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_IP6_PRE_ROUTING); +#endif + if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) + return NF_DROP; + setup_pre_routing(skb); + + NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL, + br_nf_pre_routing_finish_ipv6); + + return NF_STOLEN; + +inhdr_error: + return NF_DROP; +} + +/* Direct IPv6 traffic to br_nf_pre_routing_ipv6. + * Replicate the checks that IPv4 does on packet reception. + * Set skb->dev to the bridge device (i.e. parent of the + * receiving device) to make netfilter happy, the REDIRECT + * target in particular. Save the original destination IP + * address to be able to detect DNAT afterwards. */ +static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct iphdr *iph; + __u32 len; + struct sk_buff *skb = *pskb; + struct nf_bridge_info *nf_bridge; + struct vlan_ethhdr *hdr = vlan_eth_hdr(*pskb); + + if (skb->protocol == __constant_htons(ETH_P_IPV6) || IS_VLAN_IPV6) { +#ifdef CONFIG_SYSCTL + if (!brnf_call_ip6tables) + return NF_ACCEPT; +#endif + if ((skb = skb_share_check(*pskb, GFP_ATOMIC)) == NULL) + goto out; + + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_pull(skb, VLAN_HLEN); + (skb)->nh.raw += VLAN_HLEN; + } + return br_nf_pre_routing_ipv6(hook, skb, in, out, okfn); + } +#ifdef CONFIG_SYSCTL + if (!brnf_call_iptables) + return NF_ACCEPT; +#endif + + if (skb->protocol != __constant_htons(ETH_P_IP) && !IS_VLAN_IP) + return NF_ACCEPT; + + if ((skb = skb_share_check(*pskb, GFP_ATOMIC)) == NULL) + goto out; + + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_pull(skb, VLAN_HLEN); + (skb)->nh.raw += VLAN_HLEN; + } + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = skb->nh.iph; + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, 4*iph->ihl)) + goto inhdr_error; + + iph = skb->nh.iph; + if (ip_fast_csum((__u8 *)iph, iph->ihl) != 0) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < 4*iph->ihl) + goto inhdr_error; + + if (skb->len > len) { + __pskb_trim(skb, len); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_IP_PRE_ROUTING); +#endif + if ((nf_bridge = nf_bridge_alloc(skb)) == NULL) + return NF_DROP; + setup_pre_routing(skb); + store_orig_dstaddr(skb); + + NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL, + br_nf_pre_routing_finish); + + return NF_STOLEN; + +inhdr_error: +// 
IP_INC_STATS_BH(IpInHdrErrors); +out: + return NF_DROP; +} + + +/* PF_BRIDGE/LOCAL_IN ************************************************/ +/* The packet is locally destined, which requires a real + * dst_entry, so detach the fake one. On the way up, the + * packet would pass through PRE_ROUTING again (which already + * took place when the packet entered the bridge), but we + * register an IPv4 PRE_ROUTING 'sabotage' hook that will + * prevent this from happening. */ +static unsigned int br_nf_local_in(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + + if (skb->dst == (struct dst_entry *)&__fake_rtable) { + dst_release(skb->dst); + skb->dst = NULL; + } + + return NF_ACCEPT; +} + + +/* PF_BRIDGE/FORWARD *************************************************/ +static int br_nf_forward_finish(struct sk_buff *skb) +{ + struct nf_bridge_info *nf_bridge = skb->nf_bridge; + struct net_device *in; + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_BR_FORWARD); +#endif + + if (skb->protocol != __constant_htons(ETH_P_ARP) && !IS_VLAN_ARP) { + in = nf_bridge->physindev; + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + } else { + in = *((struct net_device **)(skb->cb)); + } + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_push(skb, VLAN_HLEN); + skb->nh.raw -= VLAN_HLEN; + } + NF_HOOK_THRESH(PF_BRIDGE, NF_BR_FORWARD, skb, in, + skb->dev, br_forward_finish, 1); + return 0; +} + +/* This is the 'purely bridged' case. For IP, we pass the packet to + * netfilter with indev and outdev set to the bridge device, + * but we are still able to filter on the 'real' indev/outdev + * because of the physdev module. For ARP, indev and outdev are the + * bridge ports. 
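Every hook above that hands an 802.1Q frame to IP netfilter first pulls the 4-byte VLAN header off (the skb_pull/VLAN_HLEN pairs) and pushes it back afterwards, which is also what the IS_VLAN_* macros peek through. The same offset arithmetic on a raw frame buffer, outside the skb machinery; the constants are the standard Ethernet and 802.1Q values:

#include <stdint.h>

#define ETH_ADDR_LEN   6
#define ETHERTYPE_VLAN 0x8100
#define VLAN_TAG_LEN   4        /* TPID + TCI, i.e. VLAN_HLEN */

/* If the frame carries an 802.1Q tag, return a pointer to the encapsulated
 * ethertype field; otherwise return the outer one. */
static const uint8_t *inner_ethertype(const uint8_t *frame)
{
        const uint8_t *type = frame + 2 * ETH_ADDR_LEN;
        uint16_t tpid = ((uint16_t)type[0] << 8) | type[1];

        if (tpid == ETHERTYPE_VLAN)
                return type + VLAN_TAG_LEN;     /* skip TPID + TCI */
        return type;
}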
*/ +static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct nf_bridge_info *nf_bridge; + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + int pf; + + if (!skb->nf_bridge) + return NF_ACCEPT; + + if (skb->protocol == __constant_htons(ETH_P_IP) || IS_VLAN_IP) + pf = PF_INET; + else + pf = PF_INET6; + + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_pull(*pskb, VLAN_HLEN); + (*pskb)->nh.raw += VLAN_HLEN; + } + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_BR_FORWARD); +#endif + nf_bridge = skb->nf_bridge; + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->mask |= BRNF_PKT_TYPE; + } + + /* The physdev module checks on this */ + nf_bridge->mask |= BRNF_BRIDGED; + nf_bridge->physoutdev = skb->dev; + + NF_HOOK(pf, NF_IP_FORWARD, skb, bridge_parent(in), + bridge_parent(out), br_nf_forward_finish); + + return NF_STOLEN; +} + +static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + struct net_device **d = (struct net_device **)(skb->cb); + +#ifdef CONFIG_SYSCTL + if (!brnf_call_arptables) + return NF_ACCEPT; +#endif + + if (skb->protocol != __constant_htons(ETH_P_ARP)) { + if (!IS_VLAN_ARP) + return NF_ACCEPT; + skb_pull(*pskb, VLAN_HLEN); + (*pskb)->nh.raw += VLAN_HLEN; + } + +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_BR_FORWARD); +#endif + + if (skb->nh.arph->ar_pln != 4) { + if (IS_VLAN_ARP) { + skb_push(*pskb, VLAN_HLEN); + (*pskb)->nh.raw -= VLAN_HLEN; + } + return NF_ACCEPT; + } + *d = (struct net_device *)in; + NF_HOOK(NF_ARP, NF_ARP_FORWARD, skb, (struct net_device *)in, + (struct net_device *)out, br_nf_forward_finish); + + return NF_STOLEN; +} + + +/* PF_BRIDGE/LOCAL_OUT ***********************************************/ +static int br_nf_local_out_finish(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug &= ~(1 << NF_BR_LOCAL_OUT); +#endif + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_push(skb, VLAN_HLEN); + skb->nh.raw -= VLAN_HLEN; + } + + NF_HOOK_THRESH(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + br_forward_finish, NF_BR_PRI_FIRST + 1); + + return 0; +} + +/* This function sees both locally originated IP packets and forwarded + * IP packets (in both cases the destination device is a bridge + * device). It also sees bridged-and-DNAT'ed packets. + * To be able to filter on the physical bridge devices (with the physdev + * module), we steal packets destined to a bridge device away from the + * PF_INET/FORWARD and PF_INET/OUTPUT hook functions, and give them back later, + * when we have determined the real output device. This is done in here. + * + * If (nf_bridge->mask & BRNF_BRIDGED_DNAT) then the packet is bridged + * and we fake the PF_BRIDGE/FORWARD hook. The function br_nf_forward() + * will then fake the PF_INET/FORWARD hook. br_nf_local_out() has priority + * NF_BR_PRI_FIRST, so no relevant PF_BRIDGE/INPUT functions have been nor + * will be executed. + * Otherwise, if nf_bridge->physindev is NULL, the bridge-nf code never touched + * this packet before, and so the packet was locally originated. We fake + * the PF_INET/LOCAL_OUT hook. 
+ * Finally, if nf_bridge->physindev isn't NULL, then the packet was IP routed, + * so we fake the PF_INET/FORWARD hook. ip_sabotage_out() makes sure + * even routed packets that didn't arrive on a bridge interface have their + * nf_bridge->physindev set. */ +static unsigned int br_nf_local_out(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct net_device *realindev, *realoutdev; + struct sk_buff *skb = *pskb; + struct nf_bridge_info *nf_bridge; + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + int pf; + + if (!skb->nf_bridge) + return NF_ACCEPT; + + if (skb->protocol == __constant_htons(ETH_P_IP) || IS_VLAN_IP) + pf = PF_INET; + else + pf = PF_INET6; + +#ifdef CONFIG_NETFILTER_DEBUG + /* Sometimes we get packets with NULL ->dst here (for example, + * running a dhcp client daemon triggers this). This should now + * be fixed, but let's keep the check around. */ + if (skb->dst == NULL) { + printk(KERN_CRIT "br_netfilter: skb->dst == NULL."); + return NF_ACCEPT; + } +#endif + + nf_bridge = skb->nf_bridge; + nf_bridge->physoutdev = skb->dev; + realindev = nf_bridge->physindev; + + /* Bridged, take PF_BRIDGE/FORWARD. + * (see big note in front of br_nf_pre_routing_finish) */ + if (nf_bridge->mask & BRNF_BRIDGED_DNAT) { + if (nf_bridge->mask & BRNF_PKT_TYPE) { + skb->pkt_type = PACKET_OTHERHOST; + nf_bridge->mask ^= BRNF_PKT_TYPE; + } + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_push(skb, VLAN_HLEN); + skb->nh.raw -= VLAN_HLEN; + } + + NF_HOOK(PF_BRIDGE, NF_BR_FORWARD, skb, realindev, + skb->dev, br_forward_finish); + goto out; + } + realoutdev = bridge_parent(skb->dev); + +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + /* iptables should match -o br0.x */ + if (nf_bridge->netoutdev) + realoutdev = nf_bridge->netoutdev; +#endif + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_pull(skb, VLAN_HLEN); + (*pskb)->nh.raw += VLAN_HLEN; + } + /* IP forwarded traffic has a physindev, locally + * generated traffic hasn't. */ + if (realindev != NULL) { + if (!(nf_bridge->mask & BRNF_DONT_TAKE_PARENT) && + has_bridge_parent(realindev)) + realindev = bridge_parent(realindev); + + NF_HOOK_THRESH(pf, NF_IP_FORWARD, skb, realindev, + realoutdev, br_nf_local_out_finish, + NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD + 1); + } else { +#ifdef CONFIG_NETFILTER_DEBUG + skb->nf_debug ^= (1 << NF_IP_LOCAL_OUT); +#endif + + NF_HOOK_THRESH(pf, NF_IP_LOCAL_OUT, skb, realindev, + realoutdev, br_nf_local_out_finish, + NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT + 1); + } + +out: + return NF_STOLEN; +} + + +/* PF_BRIDGE/POST_ROUTING ********************************************/ +static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct nf_bridge_info *nf_bridge = (*pskb)->nf_bridge; + struct vlan_ethhdr *hdr = vlan_eth_hdr(skb); + struct net_device *realoutdev = bridge_parent(skb->dev); + int pf; + +#ifdef CONFIG_NETFILTER_DEBUG + /* Be very paranoid. This probably won't happen anymore, but let's + * keep the check just to be sure... */ + if (skb->mac.raw < skb->head || skb->mac.raw + ETH_HLEN > skb->data) { + printk(KERN_CRIT "br_netfilter: Argh!! 
br_nf_post_routing: " + "bad mac.raw pointer."); + goto print_error; + } +#endif + + if (!nf_bridge) + return NF_ACCEPT; + + if (skb->protocol == __constant_htons(ETH_P_IP) || IS_VLAN_IP) + pf = PF_INET; + else + pf = PF_INET6; + +#ifdef CONFIG_NETFILTER_DEBUG + if (skb->dst == NULL) { + printk(KERN_CRIT "br_netfilter: skb->dst == NULL."); + goto print_error; + } + + skb->nf_debug ^= (1 << NF_IP_POST_ROUTING); +#endif + + /* We assume any code from br_dev_queue_push_xmit onwards doesn't care + * about the value of skb->pkt_type. */ + if (skb->pkt_type == PACKET_OTHERHOST) { + skb->pkt_type = PACKET_HOST; + nf_bridge->mask |= BRNF_PKT_TYPE; + } + + if (skb->protocol == __constant_htons(ETH_P_8021Q)) { + skb_pull(skb, VLAN_HLEN); + skb->nh.raw += VLAN_HLEN; + } + + nf_bridge_save_header(skb); + +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + if (nf_bridge->netoutdev) + realoutdev = nf_bridge->netoutdev; +#endif + NF_HOOK(pf, NF_IP_POST_ROUTING, skb, NULL, realoutdev, + br_dev_queue_push_xmit); + + return NF_STOLEN; + +#ifdef CONFIG_NETFILTER_DEBUG +print_error: + if (skb->dev != NULL) { + printk("[%s]", skb->dev->name); + if (has_bridge_parent(skb->dev)) + printk("[%s]", bridge_parent(skb->dev)->name); + } + printk(" head:%p, raw:%p, data:%p\n", skb->head, skb->mac.raw, + skb->data); + return NF_ACCEPT; +#endif +} + + +/* IP/SABOTAGE *****************************************************/ +/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING + * for the second time. */ +static unsigned int ip_sabotage_in(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if ((*pskb)->nf_bridge && + !((*pskb)->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) { + return NF_STOP; + } + + return NF_ACCEPT; +} + +/* Postpone execution of PF_INET(6)/FORWARD, PF_INET(6)/LOCAL_OUT + * and PF_INET(6)/POST_ROUTING until we have done the forwarding + * decision in the bridge code and have determined nf_bridge->physoutdev. */ +static unsigned int ip_sabotage_out(unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + + if ((out->hard_start_xmit == br_dev_xmit && + okfn != br_nf_forward_finish && + okfn != br_nf_local_out_finish && + okfn != br_dev_queue_push_xmit) +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + || ((out->priv_flags & IFF_802_1Q_VLAN) && + VLAN_DEV_INFO(out)->real_dev->hard_start_xmit == br_dev_xmit) +#endif + ) { + struct nf_bridge_info *nf_bridge; + + if (!skb->nf_bridge) { +#ifdef CONFIG_SYSCTL + /* This code is executed while in the IP(v6) stack, + the version should be 4 or 6. We can't use + skb->protocol because that isn't set on + PF_INET(6)/LOCAL_OUT. */ + struct iphdr *ip = skb->nh.iph; + + if (ip->version == 4 && !brnf_call_iptables) + return NF_ACCEPT; + else if (ip->version == 6 && !brnf_call_ip6tables) + return NF_ACCEPT; +#endif + if (hook == NF_IP_POST_ROUTING) + return NF_ACCEPT; + if (!nf_bridge_alloc(skb)) + return NF_DROP; + } + + nf_bridge = skb->nf_bridge; + + /* This frame will arrive on PF_BRIDGE/LOCAL_OUT and we + * will need the indev then. For a brouter, the real indev + * can be a bridge port, so we make sure br_nf_local_out() + * doesn't use the bridge parent of the indev by using + * the BRNF_DONT_TAKE_PARENT mask. 
*/ + if (hook == NF_IP_FORWARD && nf_bridge->physindev == NULL) { + nf_bridge->mask &= BRNF_DONT_TAKE_PARENT; + nf_bridge->physindev = (struct net_device *)in; + } +#if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) + /* the iptables outdev is br0.x, not br0 */ + if (out->priv_flags & IFF_802_1Q_VLAN) + nf_bridge->netoutdev = (struct net_device *)out; +#endif + return NF_STOP; + } + + return NF_ACCEPT; +} + +/* For br_nf_local_out we need (prio = NF_BR_PRI_FIRST), to insure that innocent + * PF_BRIDGE/NF_BR_LOCAL_OUT functions don't get bridged traffic as input. + * For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because + * ip_refrag() can return NF_STOLEN. */ +static struct nf_hook_ops br_nf_ops[] = { + { .hook = br_nf_pre_routing, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_BRNF, }, + { .hook = br_nf_local_in, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_BRNF, }, + { .hook = br_nf_forward_ip, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF - 1, }, + { .hook = br_nf_forward_arp, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_BRNF, }, + { .hook = br_nf_local_out, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_LOCAL_OUT, + .priority = NF_BR_PRI_FIRST, }, + { .hook = br_nf_post_routing, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_BR_PRI_LAST, }, + { .hook = ip_sabotage_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_FIRST, }, + { .hook = ip_sabotage_in, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_FIRST, }, + { .hook = ip_sabotage_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_BRIDGE_SABOTAGE_FORWARD, }, + { .hook = ip_sabotage_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_FORWARD, + .priority = NF_IP6_PRI_BRIDGE_SABOTAGE_FORWARD, }, + { .hook = ip_sabotage_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, }, + { .hook = ip_sabotage_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_BRIDGE_SABOTAGE_LOCAL_OUT, }, + { .hook = ip_sabotage_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_FIRST, }, + { .hook = ip_sabotage_out, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_POST_ROUTING, + .priority = NF_IP6_PRI_FIRST, }, +}; + +#ifdef CONFIG_SYSCTL +static +int brnf_sysctl_call_tables(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *(int *)(ctl->data)) + *(int *)(ctl->data) = 1; + return ret; +} + +static ctl_table brnf_table[] = { + { + .ctl_name = NET_BRIDGE_NF_CALL_ARPTABLES, + .procname = "bridge-nf-call-arptables", + .data = &brnf_call_arptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &brnf_sysctl_call_tables, + }, + { + .ctl_name = NET_BRIDGE_NF_CALL_IPTABLES, + .procname = "bridge-nf-call-iptables", + .data = &brnf_call_iptables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &brnf_sysctl_call_tables, + }, + { + .ctl_name = NET_BRIDGE_NF_CALL_IP6TABLES, + 
.procname = "bridge-nf-call-ip6tables", + .data = &brnf_call_ip6tables, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &brnf_sysctl_call_tables, + }, + { + .ctl_name = NET_BRIDGE_NF_FILTER_VLAN_TAGGED, + .procname = "bridge-nf-filter-vlan-tagged", + .data = &brnf_filter_vlan_tagged, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &brnf_sysctl_call_tables, + }, + { .ctl_name = 0 } +}; + +static ctl_table brnf_bridge_table[] = { + { + .ctl_name = NET_BRIDGE, + .procname = "bridge", + .mode = 0555, + .child = brnf_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table brnf_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = brnf_bridge_table, + }, + { .ctl_name = 0 } +}; +#endif + +int br_netfilter_init(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++) { + int ret; + + if ((ret = nf_register_hook(&br_nf_ops[i])) >= 0) + continue; + + while (i--) + nf_unregister_hook(&br_nf_ops[i]); + + return ret; + } + +#ifdef CONFIG_SYSCTL + brnf_sysctl_header = register_sysctl_table(brnf_net_table, 0); + if (brnf_sysctl_header == NULL) { + printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); + for (i = 0; i < ARRAY_SIZE(br_nf_ops); i++) + nf_unregister_hook(&br_nf_ops[i]); + return -EFAULT; + } +#endif + + printk(KERN_NOTICE "Bridge firewalling registered\n"); + + return 0; +} + +void br_netfilter_fini(void) +{ + int i; + + for (i = ARRAY_SIZE(br_nf_ops) - 1; i >= 0; i--) + nf_unregister_hook(&br_nf_ops[i]); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(brnf_sysctl_header); +#endif +} diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c new file mode 100644 index 000000000000..f8fb49e34764 --- /dev/null +++ b/net/bridge/br_notify.c @@ -0,0 +1,87 @@ +/* + * Device event handling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_notify.c,v 1.2 2000/02/21 15:51:34 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include + +#include "br_private.h" + +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr); + +struct notifier_block br_device_notifier = { + .notifier_call = br_device_event +}; + +/* + * Handle changes in state of network devices enslaved to a bridge. + * + * Note: don't care about up/down if bridge itself is down, because + * port state is checked when bridge is brought up. 
+ */ +static int br_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct net_bridge_port *p = dev->br_port; + struct net_bridge *br; + + /* not a port of a bridge */ + if (p == NULL) + return NOTIFY_DONE; + + br = p->br; + + spin_lock_bh(&br->lock); + switch (event) { + case NETDEV_CHANGEMTU: + dev_set_mtu(br->dev, br_min_mtu(br)); + break; + + case NETDEV_CHANGEADDR: + br_fdb_changeaddr(p, dev->dev_addr); + br_stp_recalculate_bridge_id(br); + break; + + case NETDEV_CHANGE: /* device is up but carrier changed */ + if (!(br->dev->flags & IFF_UP)) + break; + + if (netif_carrier_ok(dev)) { + if (p->state == BR_STATE_DISABLED) + br_stp_enable_port(p); + } else { + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + } + break; + + case NETDEV_DOWN: + if (br->dev->flags & IFF_UP) + br_stp_disable_port(p); + break; + + case NETDEV_UP: + if (netif_carrier_ok(dev) && (br->dev->flags & IFF_UP)) + br_stp_enable_port(p); + break; + + case NETDEV_UNREGISTER: + spin_unlock_bh(&br->lock); + br_del_if(br, dev); + goto done; + } + spin_unlock_bh(&br->lock); + + done: + return NOTIFY_DONE; +} diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h new file mode 100644 index 000000000000..54d63f1372a0 --- /dev/null +++ b/net/bridge/br_private.h @@ -0,0 +1,244 @@ +/* + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_private.h,v 1.7 2001/12/24 00:59:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _BR_PRIVATE_H +#define _BR_PRIVATE_H + +#include +#include +#include + +#define BR_HASH_BITS 8 +#define BR_HASH_SIZE (1 << BR_HASH_BITS) + +#define BR_HOLD_TIME (1*HZ) + +#define BR_PORT_BITS 10 +#define BR_MAX_PORTS (1<bridge_id, &br->designated_root, 8); +} + + +/* br_device.c */ +extern void br_dev_setup(struct net_device *dev); +extern int br_dev_xmit(struct sk_buff *skb, struct net_device *dev); + +/* br_fdb.c */ +extern void br_fdb_init(void); +extern void br_fdb_fini(void); +extern void br_fdb_changeaddr(struct net_bridge_port *p, + const unsigned char *newaddr); +extern void br_fdb_cleanup(unsigned long arg); +extern void br_fdb_delete_by_port(struct net_bridge *br, + struct net_bridge_port *p); +extern struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br, + const unsigned char *addr); +extern struct net_bridge_fdb_entry *br_fdb_get(struct net_bridge *br, + unsigned char *addr); +extern void br_fdb_put(struct net_bridge_fdb_entry *ent); +extern int br_fdb_fillbuf(struct net_bridge *br, void *buf, + unsigned long count, unsigned long off); +extern int br_fdb_insert(struct net_bridge *br, + struct net_bridge_port *source, + const unsigned char *addr); +extern void br_fdb_update(struct net_bridge *br, + struct net_bridge_port *source, + const unsigned char *addr); + +/* br_forward.c */ +extern void br_deliver(const struct net_bridge_port *to, + struct sk_buff *skb); +extern int br_dev_queue_push_xmit(struct sk_buff *skb); +extern void br_forward(const struct net_bridge_port *to, + struct sk_buff *skb); +extern int br_forward_finish(struct sk_buff *skb); +extern void br_flood_deliver(struct net_bridge *br, + struct sk_buff *skb, + int clone); +extern void br_flood_forward(struct net_bridge *br, + struct sk_buff *skb, + int clone); + +/* br_if.c */ 
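With BR_PORT_BITS at 10, the 16-bit STP port identifier leaves the low 10 bits for the port number and the high 6 bits for priority, which is why the ioctl path earlier rejects priorities of 1<<(16-BR_PORT_BITS) and above. The packing spelled out; make_port_id() is an illustration, not the kernel helper:

#include <stdint.h>

#define PORT_BITS 10                            /* mirrors BR_PORT_BITS */
#define PORT_MASK ((1u << PORT_BITS) - 1)

/* Pack a 6-bit priority and a 10-bit port number into one 16-bit port id. */
static uint16_t make_port_id(uint8_t priority, uint16_t port_no)
{
        return (uint16_t)(((uint16_t)priority << PORT_BITS) | (port_no & PORT_MASK));
}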
+extern int br_add_bridge(const char *name); +extern int br_del_bridge(const char *name); +extern void br_cleanup_bridges(void); +extern int br_add_if(struct net_bridge *br, + struct net_device *dev); +extern int br_del_if(struct net_bridge *br, + struct net_device *dev); +extern int br_min_mtu(const struct net_bridge *br); + +/* br_input.c */ +extern int br_handle_frame_finish(struct sk_buff *skb); +extern int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb); + +/* br_ioctl.c */ +extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); +extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); + +/* br_netfilter.c */ +extern int br_netfilter_init(void); +extern void br_netfilter_fini(void); + +/* br_stp.c */ +extern void br_log_state(const struct net_bridge_port *p); +extern struct net_bridge_port *br_get_port(struct net_bridge *br, + u16 port_no); +extern void br_init_port(struct net_bridge_port *p); +extern void br_become_designated_port(struct net_bridge_port *p); + +/* br_stp_if.c */ +extern void br_stp_enable_bridge(struct net_bridge *br); +extern void br_stp_disable_bridge(struct net_bridge *br); +extern void br_stp_enable_port(struct net_bridge_port *p); +extern void br_stp_disable_port(struct net_bridge_port *p); +extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern void br_stp_set_bridge_priority(struct net_bridge *br, + u16 newprio); +extern void br_stp_set_port_priority(struct net_bridge_port *p, + u8 newprio); +extern void br_stp_set_path_cost(struct net_bridge_port *p, + u32 path_cost); +extern ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id); + +/* br_stp_bpdu.c */ +extern int br_stp_handle_bpdu(struct sk_buff *skb); + +/* br_stp_timer.c */ +extern void br_stp_timer_init(struct net_bridge *br); +extern void br_stp_port_timer_init(struct net_bridge_port *p); +extern unsigned long br_timer_value(const struct timer_list *timer); + +/* br.c */ +extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, + unsigned char *addr); +extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); + + +#ifdef CONFIG_SYSFS +/* br_sysfs_if.c */ +extern int br_sysfs_addif(struct net_bridge_port *p); +extern void br_sysfs_removeif(struct net_bridge_port *p); +extern void br_sysfs_freeif(struct net_bridge_port *p); + +/* br_sysfs_br.c */ +extern int br_sysfs_addbr(struct net_device *dev); +extern void br_sysfs_delbr(struct net_device *dev); + +#else + +#define br_sysfs_addif(p) (0) +#define br_sysfs_removeif(p) do { } while(0) +#define br_sysfs_freeif(p) kfree(p) +#define br_sysfs_addbr(dev) (0) +#define br_sysfs_delbr(dev) do { } while(0) +#endif /* CONFIG_SYSFS */ + +#endif diff --git a/net/bridge/br_private_stp.h b/net/bridge/br_private_stp.h new file mode 100644 index 000000000000..e29f01ac1adf --- /dev/null +++ b/net/bridge/br_private_stp.h @@ -0,0 +1,58 @@ +/* + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_private_stp.h,v 1.3 2001/02/05 06:03:47 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _BR_PRIVATE_STP_H +#define _BR_PRIVATE_STP_H + +#define BPDU_TYPE_CONFIG 0 +#define BPDU_TYPE_TCN 0x80 + +struct br_config_bpdu +{ + unsigned topology_change:1; + unsigned topology_change_ack:1; + bridge_id root; + int root_path_cost; + bridge_id bridge_id; + port_id port_id; + int message_age; + int max_age; + int hello_time; + int forward_delay; +}; + +/* called under bridge lock */ +static inline int br_is_designated_port(const struct net_bridge_port *p) +{ + return !memcmp(&p->designated_bridge, &p->br->bridge_id, 8) && + (p->designated_port == p->port_id); +} + + +/* br_stp.c */ +extern void br_become_root_bridge(struct net_bridge *br); +extern void br_config_bpdu_generation(struct net_bridge *); +extern void br_configuration_update(struct net_bridge *); +extern void br_port_state_selection(struct net_bridge *); +extern void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu); +extern void br_received_tcn_bpdu(struct net_bridge_port *p); +extern void br_transmit_config(struct net_bridge_port *p); +extern void br_transmit_tcn(struct net_bridge *br); +extern void br_topology_change_detection(struct net_bridge *br); + +/* br_stp_bpdu.c */ +extern void br_send_config_bpdu(struct net_bridge_port *, struct br_config_bpdu *); +extern void br_send_tcn_bpdu(struct net_bridge_port *); + +#endif diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c new file mode 100644 index 000000000000..04ca0639a95a --- /dev/null +++ b/net/bridge/br_stp.c @@ -0,0 +1,459 @@ +/* + * Spanning tree protocol; generic parts + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_stp.c,v 1.4 2000/06/19 10:13:35 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + +/* since time values in bpdu are in jiffies and then scaled (1/256) + * before sending, make sure that is at least one. + */ +#define MESSAGE_AGE_INCR ((HZ < 256) ? 
1 : (HZ/256)) + +static const char *br_port_state_names[] = { + [BR_STATE_DISABLED] = "disabled", + [BR_STATE_LISTENING] = "listening", + [BR_STATE_LEARNING] = "learning", + [BR_STATE_FORWARDING] = "forwarding", + [BR_STATE_BLOCKING] = "blocking", +}; + +void br_log_state(const struct net_bridge_port *p) +{ + pr_info("%s: port %d(%s) entering %s state\n", + p->br->dev->name, p->port_no, p->dev->name, + br_port_state_names[p->state]); + +} + +/* called under bridge lock */ +struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no) +{ + struct net_bridge_port *p; + + list_for_each_entry_rcu(p, &br->port_list, list) { + if (p->port_no == port_no) + return p; + } + + return NULL; +} + +/* called under bridge lock */ +static int br_should_become_root_port(const struct net_bridge_port *p, + u16 root_port) +{ + struct net_bridge *br; + struct net_bridge_port *rp; + int t; + + br = p->br; + if (p->state == BR_STATE_DISABLED || + br_is_designated_port(p)) + return 0; + + if (memcmp(&br->bridge_id, &p->designated_root, 8) <= 0) + return 0; + + if (!root_port) + return 1; + + rp = br_get_port(br, root_port); + + t = memcmp(&p->designated_root, &rp->designated_root, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (p->designated_cost + p->path_cost < + rp->designated_cost + rp->path_cost) + return 1; + else if (p->designated_cost + p->path_cost > + rp->designated_cost + rp->path_cost) + return 0; + + t = memcmp(&p->designated_bridge, &rp->designated_bridge, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (p->designated_port < rp->designated_port) + return 1; + else if (p->designated_port > rp->designated_port) + return 0; + + if (p->port_id < rp->port_id) + return 1; + + return 0; +} + +/* called under bridge lock */ +static void br_root_selection(struct net_bridge *br) +{ + struct net_bridge_port *p; + u16 root_port = 0; + + list_for_each_entry(p, &br->port_list, list) { + if (br_should_become_root_port(p, root_port)) + root_port = p->port_no; + + } + + br->root_port = root_port; + + if (!root_port) { + br->designated_root = br->bridge_id; + br->root_path_cost = 0; + } else { + p = br_get_port(br, root_port); + br->designated_root = p->designated_root; + br->root_path_cost = p->designated_cost + p->path_cost; + } +} + +/* called under bridge lock */ +void br_become_root_bridge(struct net_bridge *br) +{ + br->max_age = br->bridge_max_age; + br->hello_time = br->bridge_hello_time; + br->forward_delay = br->bridge_forward_delay; + br_topology_change_detection(br); + del_timer(&br->tcn_timer); + + if (br->dev->flags & IFF_UP) { + br_config_bpdu_generation(br); + mod_timer(&br->hello_timer, jiffies + br->hello_time); + } +} + +/* called under bridge lock */ +void br_transmit_config(struct net_bridge_port *p) +{ + struct br_config_bpdu bpdu; + struct net_bridge *br; + + + if (timer_pending(&p->hold_timer)) { + p->config_pending = 1; + return; + } + + br = p->br; + + bpdu.topology_change = br->topology_change; + bpdu.topology_change_ack = p->topology_change_ack; + bpdu.root = br->designated_root; + bpdu.root_path_cost = br->root_path_cost; + bpdu.bridge_id = br->bridge_id; + bpdu.port_id = p->port_id; + if (br_is_root_bridge(br)) + bpdu.message_age = 0; + else { + struct net_bridge_port *root + = br_get_port(br, br->root_port); + bpdu.message_age = br->max_age + - (root->message_age_timer.expires - jiffies) + + MESSAGE_AGE_INCR; + } + bpdu.max_age = br->max_age; + bpdu.hello_time = br->hello_time; + bpdu.forward_delay = br->forward_delay; + + if 
(bpdu.message_age < br->max_age) { + br_send_config_bpdu(p, &bpdu); + p->topology_change_ack = 0; + p->config_pending = 0; + mod_timer(&p->hold_timer, jiffies + BR_HOLD_TIME); + } +} + +/* called under bridge lock */ +static inline void br_record_config_information(struct net_bridge_port *p, + const struct br_config_bpdu *bpdu) +{ + p->designated_root = bpdu->root; + p->designated_cost = bpdu->root_path_cost; + p->designated_bridge = bpdu->bridge_id; + p->designated_port = bpdu->port_id; + + mod_timer(&p->message_age_timer, jiffies + + (p->br->max_age - bpdu->message_age)); +} + +/* called under bridge lock */ +static inline void br_record_config_timeout_values(struct net_bridge *br, + const struct br_config_bpdu *bpdu) +{ + br->max_age = bpdu->max_age; + br->hello_time = bpdu->hello_time; + br->forward_delay = bpdu->forward_delay; + br->topology_change = bpdu->topology_change; +} + +/* called under bridge lock */ +void br_transmit_tcn(struct net_bridge *br) +{ + br_send_tcn_bpdu(br_get_port(br, br->root_port)); +} + +/* called under bridge lock */ +static int br_should_become_designated_port(const struct net_bridge_port *p) +{ + struct net_bridge *br; + int t; + + br = p->br; + if (br_is_designated_port(p)) + return 1; + + if (memcmp(&p->designated_root, &br->designated_root, 8)) + return 1; + + if (br->root_path_cost < p->designated_cost) + return 1; + else if (br->root_path_cost > p->designated_cost) + return 0; + + t = memcmp(&br->bridge_id, &p->designated_bridge, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (p->port_id < p->designated_port) + return 1; + + return 0; +} + +/* called under bridge lock */ +static void br_designated_port_selection(struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + br_should_become_designated_port(p)) + br_become_designated_port(p); + + } +} + +/* called under bridge lock */ +static int br_supersedes_port_info(struct net_bridge_port *p, struct br_config_bpdu *bpdu) +{ + int t; + + t = memcmp(&bpdu->root, &p->designated_root, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (bpdu->root_path_cost < p->designated_cost) + return 1; + else if (bpdu->root_path_cost > p->designated_cost) + return 0; + + t = memcmp(&bpdu->bridge_id, &p->designated_bridge, 8); + if (t < 0) + return 1; + else if (t > 0) + return 0; + + if (memcmp(&bpdu->bridge_id, &p->br->bridge_id, 8)) + return 1; + + if (bpdu->port_id <= p->designated_port) + return 1; + + return 0; +} + +/* called under bridge lock */ +static inline void br_topology_change_acknowledged(struct net_bridge *br) +{ + br->topology_change_detected = 0; + del_timer(&br->tcn_timer); +} + +/* called under bridge lock */ +void br_topology_change_detection(struct net_bridge *br) +{ + int isroot = br_is_root_bridge(br); + + pr_info("%s: topology change detected, %s\n", br->dev->name, + isroot ? 
"propagating" : "sending tcn bpdu"); + + if (isroot) { + br->topology_change = 1; + mod_timer(&br->topology_change_timer, jiffies + + br->bridge_forward_delay + br->bridge_max_age); + } else if (!br->topology_change_detected) { + br_transmit_tcn(br); + mod_timer(&br->tcn_timer, jiffies + br->bridge_hello_time); + } + + br->topology_change_detected = 1; +} + +/* called under bridge lock */ +void br_config_bpdu_generation(struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + br_is_designated_port(p)) + br_transmit_config(p); + } +} + +/* called under bridge lock */ +static inline void br_reply(struct net_bridge_port *p) +{ + br_transmit_config(p); +} + +/* called under bridge lock */ +void br_configuration_update(struct net_bridge *br) +{ + br_root_selection(br); + br_designated_port_selection(br); +} + +/* called under bridge lock */ +void br_become_designated_port(struct net_bridge_port *p) +{ + struct net_bridge *br; + + br = p->br; + p->designated_root = br->designated_root; + p->designated_cost = br->root_path_cost; + p->designated_bridge = br->bridge_id; + p->designated_port = p->port_id; +} + + +/* called under bridge lock */ +static void br_make_blocking(struct net_bridge_port *p) +{ + if (p->state != BR_STATE_DISABLED && + p->state != BR_STATE_BLOCKING) { + if (p->state == BR_STATE_FORWARDING || + p->state == BR_STATE_LEARNING) + br_topology_change_detection(p->br); + + p->state = BR_STATE_BLOCKING; + br_log_state(p); + del_timer(&p->forward_delay_timer); + } +} + +/* called under bridge lock */ +static void br_make_forwarding(struct net_bridge_port *p) +{ + if (p->state == BR_STATE_BLOCKING) { + if (p->br->stp_enabled) { + p->state = BR_STATE_LISTENING; + } else { + p->state = BR_STATE_LEARNING; + } + br_log_state(p); + mod_timer(&p->forward_delay_timer, jiffies + p->br->forward_delay); } +} + +/* called under bridge lock */ +void br_port_state_selection(struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED) { + if (p->port_no == br->root_port) { + p->config_pending = 0; + p->topology_change_ack = 0; + br_make_forwarding(p); + } else if (br_is_designated_port(p)) { + del_timer(&p->message_age_timer); + br_make_forwarding(p); + } else { + p->config_pending = 0; + p->topology_change_ack = 0; + br_make_blocking(p); + } + } + + } +} + +/* called under bridge lock */ +static inline void br_topology_change_acknowledge(struct net_bridge_port *p) +{ + p->topology_change_ack = 1; + br_transmit_config(p); +} + +/* called under bridge lock */ +void br_received_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) +{ + struct net_bridge *br; + int was_root; + + br = p->br; + was_root = br_is_root_bridge(br); + + if (br_supersedes_port_info(p, bpdu)) { + br_record_config_information(p, bpdu); + br_configuration_update(br); + br_port_state_selection(br); + + if (!br_is_root_bridge(br) && was_root) { + del_timer(&br->hello_timer); + if (br->topology_change_detected) { + del_timer(&br->topology_change_timer); + br_transmit_tcn(br); + + mod_timer(&br->tcn_timer, + jiffies + br->bridge_hello_time); + } + } + + if (p->port_no == br->root_port) { + br_record_config_timeout_values(br, bpdu); + br_config_bpdu_generation(br); + if (bpdu->topology_change_ack) + br_topology_change_acknowledged(br); + } + } else if (br_is_designated_port(p)) { + br_reply(p); + } +} + +/* called under bridge lock */ +void 
br_received_tcn_bpdu(struct net_bridge_port *p) +{ + if (br_is_designated_port(p)) { + pr_info("%s: received tcn bpdu on port %i(%s)\n", + p->br->dev->name, p->port_no, p->dev->name); + + br_topology_change_detection(p->br); + br_topology_change_acknowledge(p); + } +} diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c new file mode 100644 index 000000000000..b91a875aca01 --- /dev/null +++ b/net/bridge/br_stp_bpdu.c @@ -0,0 +1,205 @@ +/* + * Spanning tree protocol; BPDU handling + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_stp_bpdu.c,v 1.3 2001/11/10 02:35:25 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + +#define JIFFIES_TO_TICKS(j) (((j) << 8) / HZ) +#define TICKS_TO_JIFFIES(j) (((j) * HZ) >> 8) + +static void br_send_bpdu(struct net_bridge_port *p, unsigned char *data, int length) +{ + struct net_device *dev; + struct sk_buff *skb; + int size; + + if (!p->br->stp_enabled) + return; + + size = length + 2*ETH_ALEN + 2; + if (size < 60) + size = 60; + + dev = p->dev; + + if ((skb = dev_alloc_skb(size)) == NULL) { + printk(KERN_INFO "br: memory squeeze!\n"); + return; + } + + skb->dev = dev; + skb->protocol = htons(ETH_P_802_2); + skb->mac.raw = skb_put(skb, size); + memcpy(skb->mac.raw, bridge_ula, ETH_ALEN); + memcpy(skb->mac.raw+ETH_ALEN, dev->dev_addr, ETH_ALEN); + skb->mac.raw[2*ETH_ALEN] = 0; + skb->mac.raw[2*ETH_ALEN+1] = length; + skb->nh.raw = skb->mac.raw + 2*ETH_ALEN + 2; + memcpy(skb->nh.raw, data, length); + memset(skb->nh.raw + length, 0xa5, size - length - 2*ETH_ALEN - 2); + + NF_HOOK(PF_BRIDGE, NF_BR_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); +} + +static __inline__ void br_set_ticks(unsigned char *dest, int jiff) +{ + __u16 ticks; + + ticks = JIFFIES_TO_TICKS(jiff); + dest[0] = (ticks >> 8) & 0xFF; + dest[1] = ticks & 0xFF; +} + +static __inline__ int br_get_ticks(unsigned char *dest) +{ + return TICKS_TO_JIFFIES((dest[0] << 8) | dest[1]); +} + +/* called under bridge lock */ +void br_send_config_bpdu(struct net_bridge_port *p, struct br_config_bpdu *bpdu) +{ + unsigned char buf[38]; + + buf[0] = 0x42; + buf[1] = 0x42; + buf[2] = 0x03; + buf[3] = 0; + buf[4] = 0; + buf[5] = 0; + buf[6] = BPDU_TYPE_CONFIG; + buf[7] = (bpdu->topology_change ? 0x01 : 0) | + (bpdu->topology_change_ack ? 
0x80 : 0); + buf[8] = bpdu->root.prio[0]; + buf[9] = bpdu->root.prio[1]; + buf[10] = bpdu->root.addr[0]; + buf[11] = bpdu->root.addr[1]; + buf[12] = bpdu->root.addr[2]; + buf[13] = bpdu->root.addr[3]; + buf[14] = bpdu->root.addr[4]; + buf[15] = bpdu->root.addr[5]; + buf[16] = (bpdu->root_path_cost >> 24) & 0xFF; + buf[17] = (bpdu->root_path_cost >> 16) & 0xFF; + buf[18] = (bpdu->root_path_cost >> 8) & 0xFF; + buf[19] = bpdu->root_path_cost & 0xFF; + buf[20] = bpdu->bridge_id.prio[0]; + buf[21] = bpdu->bridge_id.prio[1]; + buf[22] = bpdu->bridge_id.addr[0]; + buf[23] = bpdu->bridge_id.addr[1]; + buf[24] = bpdu->bridge_id.addr[2]; + buf[25] = bpdu->bridge_id.addr[3]; + buf[26] = bpdu->bridge_id.addr[4]; + buf[27] = bpdu->bridge_id.addr[5]; + buf[28] = (bpdu->port_id >> 8) & 0xFF; + buf[29] = bpdu->port_id & 0xFF; + + br_set_ticks(buf+30, bpdu->message_age); + br_set_ticks(buf+32, bpdu->max_age); + br_set_ticks(buf+34, bpdu->hello_time); + br_set_ticks(buf+36, bpdu->forward_delay); + + br_send_bpdu(p, buf, 38); +} + +/* called under bridge lock */ +void br_send_tcn_bpdu(struct net_bridge_port *p) +{ + unsigned char buf[7]; + + buf[0] = 0x42; + buf[1] = 0x42; + buf[2] = 0x03; + buf[3] = 0; + buf[4] = 0; + buf[5] = 0; + buf[6] = BPDU_TYPE_TCN; + br_send_bpdu(p, buf, 7); +} + +static const unsigned char header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; + +/* NO locks */ +int br_stp_handle_bpdu(struct sk_buff *skb) +{ + struct net_bridge_port *p = skb->dev->br_port; + struct net_bridge *br = p->br; + unsigned char *buf; + + /* need at least the 802 and STP headers */ + if (!pskb_may_pull(skb, sizeof(header)+1) || + memcmp(skb->data, header, sizeof(header))) + goto err; + + buf = skb_pull(skb, sizeof(header)); + + spin_lock_bh(&br->lock); + if (p->state == BR_STATE_DISABLED + || !(br->dev->flags & IFF_UP) + || !br->stp_enabled) + goto out; + + if (buf[0] == BPDU_TYPE_CONFIG) { + struct br_config_bpdu bpdu; + + if (!pskb_may_pull(skb, 32)) + goto out; + + buf = skb->data; + bpdu.topology_change = (buf[1] & 0x01) ? 1 : 0; + bpdu.topology_change_ack = (buf[1] & 0x80) ? 
1 : 0; + + bpdu.root.prio[0] = buf[2]; + bpdu.root.prio[1] = buf[3]; + bpdu.root.addr[0] = buf[4]; + bpdu.root.addr[1] = buf[5]; + bpdu.root.addr[2] = buf[6]; + bpdu.root.addr[3] = buf[7]; + bpdu.root.addr[4] = buf[8]; + bpdu.root.addr[5] = buf[9]; + bpdu.root_path_cost = + (buf[10] << 24) | + (buf[11] << 16) | + (buf[12] << 8) | + buf[13]; + bpdu.bridge_id.prio[0] = buf[14]; + bpdu.bridge_id.prio[1] = buf[15]; + bpdu.bridge_id.addr[0] = buf[16]; + bpdu.bridge_id.addr[1] = buf[17]; + bpdu.bridge_id.addr[2] = buf[18]; + bpdu.bridge_id.addr[3] = buf[19]; + bpdu.bridge_id.addr[4] = buf[20]; + bpdu.bridge_id.addr[5] = buf[21]; + bpdu.port_id = (buf[22] << 8) | buf[23]; + + bpdu.message_age = br_get_ticks(buf+24); + bpdu.max_age = br_get_ticks(buf+26); + bpdu.hello_time = br_get_ticks(buf+28); + bpdu.forward_delay = br_get_ticks(buf+30); + + br_received_config_bpdu(p, &bpdu); + } + + else if (buf[0] == BPDU_TYPE_TCN) { + br_received_tcn_bpdu(p); + } + out: + spin_unlock_bh(&br->lock); + err: + kfree_skb(skb); + return 0; +} diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c new file mode 100644 index 000000000000..0da11ff05fa3 --- /dev/null +++ b/net/bridge/br_stp_if.c @@ -0,0 +1,225 @@ +/* + * Spanning tree protocol; interface code + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_stp_if.c,v 1.4 2001/04/14 21:14:39 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + + +/* Port id is composed of priority and port number. + * NB: least significant bits of priority are dropped to + * make room for more ports. 
+ */ +static inline port_id br_make_port_id(__u8 priority, __u16 port_no) +{ + return ((u16)priority << BR_PORT_BITS) + | (port_no & ((1<<BR_PORT_BITS)-1)); +} + +/* called under bridge lock */ +void br_init_port(struct net_bridge_port *p) +{ + p->port_id = br_make_port_id(p->priority, p->port_no); + br_become_designated_port(p); + p->state = BR_STATE_BLOCKING; + p->topology_change_ack = 0; + p->config_pending = 0; + + br_stp_port_timer_init(p); +} + +/* called under bridge lock */ +void br_stp_enable_bridge(struct net_bridge *br) +{ + struct net_bridge_port *p; + + spin_lock_bh(&br->lock); + mod_timer(&br->hello_timer, jiffies + br->hello_time); + mod_timer(&br->gc_timer, jiffies + HZ/10); + + br_config_bpdu_generation(br); + + list_for_each_entry(p, &br->port_list, list) { + if ((p->dev->flags & IFF_UP) && netif_carrier_ok(p->dev)) + br_stp_enable_port(p); + + } + spin_unlock_bh(&br->lock); +} + +/* NO locks held */ +void br_stp_disable_bridge(struct net_bridge *br) +{ + struct net_bridge_port *p; + + spin_lock(&br->lock); + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + + } + + br->topology_change = 0; + br->topology_change_detected = 0; + spin_unlock(&br->lock); + + del_timer_sync(&br->hello_timer); + del_timer_sync(&br->topology_change_timer); + del_timer_sync(&br->tcn_timer); + del_timer_sync(&br->gc_timer); +} + +/* called under bridge lock */ +void br_stp_enable_port(struct net_bridge_port *p) +{ + br_init_port(p); + br_port_state_selection(p->br); +} + +/* called under bridge lock */ +void br_stp_disable_port(struct net_bridge_port *p) +{ + struct net_bridge *br; + int wasroot; + + br = p->br; + printk(KERN_INFO "%s: port %i(%s) entering %s state\n", + br->dev->name, p->port_no, p->dev->name, "disabled"); + + wasroot = br_is_root_bridge(br); + br_become_designated_port(p); + p->state = BR_STATE_DISABLED; + p->topology_change_ack = 0; + p->config_pending = 0; + + del_timer(&p->message_age_timer); + del_timer(&p->forward_delay_timer); + del_timer(&p->hold_timer); + + br_configuration_update(br); + + br_port_state_selection(br); + + if (br_is_root_bridge(br) && !wasroot) + br_become_root_bridge(br); +} + +/* called under bridge lock */ +static void br_stp_change_bridge_id(struct net_bridge *br, + const unsigned char *addr) +{ + unsigned char oldaddr[6]; + struct net_bridge_port *p; + int wasroot; + + wasroot = br_is_root_bridge(br); + + memcpy(oldaddr, br->bridge_id.addr, ETH_ALEN); + memcpy(br->bridge_id.addr, addr, ETH_ALEN); + memcpy(br->dev->dev_addr, addr, ETH_ALEN); + + list_for_each_entry(p, &br->port_list, list) { + if (!memcmp(p->designated_bridge.addr, oldaddr, ETH_ALEN)) + memcpy(p->designated_bridge.addr, addr, ETH_ALEN); + + if (!memcmp(p->designated_root.addr, oldaddr, ETH_ALEN)) + memcpy(p->designated_root.addr, addr, ETH_ALEN); + + } + + br_configuration_update(br); + br_port_state_selection(br); + if (br_is_root_bridge(br) && !wasroot) + br_become_root_bridge(br); +} + +static const unsigned char br_mac_zero[6]; + +/* called under bridge lock */ +void br_stp_recalculate_bridge_id(struct net_bridge *br) +{ + const unsigned char *addr = br_mac_zero; + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (addr == br_mac_zero || + memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) + addr = p->dev->dev_addr; + + } + + if (memcmp(br->bridge_id.addr, addr, ETH_ALEN)) + br_stp_change_bridge_id(br, addr); +} + +/* called under bridge lock */ +void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio) +{ + struct net_bridge_port *p; + int wasroot; + + wasroot = br_is_root_bridge(br); + + 
list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + br_is_designated_port(p)) { + p->designated_bridge.prio[0] = (newprio >> 8) & 0xFF; + p->designated_bridge.prio[1] = newprio & 0xFF; + } + + } + + br->bridge_id.prio[0] = (newprio >> 8) & 0xFF; + br->bridge_id.prio[1] = newprio & 0xFF; + br_configuration_update(br); + br_port_state_selection(br); + if (br_is_root_bridge(br) && !wasroot) + br_become_root_bridge(br); +} + +/* called under bridge lock */ +void br_stp_set_port_priority(struct net_bridge_port *p, u8 newprio) +{ + port_id new_port_id = br_make_port_id(newprio, p->port_no); + + if (br_is_designated_port(p)) + p->designated_port = new_port_id; + + p->port_id = new_port_id; + p->priority = newprio; + if (!memcmp(&p->br->bridge_id, &p->designated_bridge, 8) && + p->port_id < p->designated_port) { + br_become_designated_port(p); + br_port_state_selection(p->br); + } +} + +/* called under bridge lock */ +void br_stp_set_path_cost(struct net_bridge_port *p, u32 path_cost) +{ + p->path_cost = path_cost; + br_configuration_update(p->br); + br_port_state_selection(p->br); +} + +ssize_t br_show_bridge_id(char *buf, const struct bridge_id *id) +{ + return sprintf(buf, "%.2x%.2x.%.2x%.2x%.2x%.2x%.2x%.2x\n", + id->prio[0], id->prio[1], + id->addr[0], id->addr[1], id->addr[2], + id->addr[3], id->addr[4], id->addr[5]); +} diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c new file mode 100644 index 000000000000..9bef55f56425 --- /dev/null +++ b/net/bridge/br_stp_timer.c @@ -0,0 +1,188 @@ +/* + * Spanning tree protocol; timer-related code + * Linux ethernet bridge + * + * Authors: + * Lennert Buytenhek + * + * $Id: br_stp_timer.c,v 1.3 2000/05/05 02:17:17 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +#include "br_private.h" +#include "br_private_stp.h" + +/* called under bridge lock */ +static int br_is_designated_for_some_port(const struct net_bridge *br) +{ + struct net_bridge_port *p; + + list_for_each_entry(p, &br->port_list, list) { + if (p->state != BR_STATE_DISABLED && + !memcmp(&p->designated_bridge, &br->bridge_id, 8)) + return 1; + } + + return 0; +} + +static void br_hello_timer_expired(unsigned long arg) +{ + struct net_bridge *br = (struct net_bridge *)arg; + + pr_debug("%s: hello timer expired\n", br->dev->name); + spin_lock_bh(&br->lock); + if (br->dev->flags & IFF_UP) { + br_config_bpdu_generation(br); + + mod_timer(&br->hello_timer, jiffies + br->hello_time); + } + spin_unlock_bh(&br->lock); +} + +static void br_message_age_timer_expired(unsigned long arg) +{ + struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge *br = p->br; + const bridge_id *id = &p->designated_bridge; + int was_root; + + if (p->state == BR_STATE_DISABLED) + return; + + + pr_info("%s: neighbor %.2x%.2x.%.2x:%.2x:%.2x:%.2x:%.2x:%.2x lost on port %d(%s)\n", + br->dev->name, + id->prio[0], id->prio[1], + id->addr[0], id->addr[1], id->addr[2], + id->addr[3], id->addr[4], id->addr[5], + p->port_no, p->dev->name); + + /* + * According to the spec, the message age timer cannot be + * running when we are the root bridge. So.. this was_root + * check is redundant. I'm leaving it in for now, though. 
+ */ + spin_lock_bh(&br->lock); + if (p->state == BR_STATE_DISABLED) + goto unlock; + was_root = br_is_root_bridge(br); + + br_become_designated_port(p); + br_configuration_update(br); + br_port_state_selection(br); + if (br_is_root_bridge(br) && !was_root) + br_become_root_bridge(br); + unlock: + spin_unlock_bh(&br->lock); +} + +static void br_forward_delay_timer_expired(unsigned long arg) +{ + struct net_bridge_port *p = (struct net_bridge_port *) arg; + struct net_bridge *br = p->br; + + pr_debug("%s: %d(%s) forward delay timer\n", + br->dev->name, p->port_no, p->dev->name); + spin_lock_bh(&br->lock); + if (p->state == BR_STATE_LISTENING) { + p->state = BR_STATE_LEARNING; + mod_timer(&p->forward_delay_timer, + jiffies + br->forward_delay); + } else if (p->state == BR_STATE_LEARNING) { + p->state = BR_STATE_FORWARDING; + if (br_is_designated_for_some_port(br)) + br_topology_change_detection(br); + } + br_log_state(p); + spin_unlock_bh(&br->lock); +} + +static void br_tcn_timer_expired(unsigned long arg) +{ + struct net_bridge *br = (struct net_bridge *) arg; + + pr_debug("%s: tcn timer expired\n", br->dev->name); + spin_lock_bh(&br->lock); + if (br->dev->flags & IFF_UP) { + br_transmit_tcn(br); + + mod_timer(&br->tcn_timer,jiffies + br->bridge_hello_time); + } + spin_unlock_bh(&br->lock); +} + +static void br_topology_change_timer_expired(unsigned long arg) +{ + struct net_bridge *br = (struct net_bridge *) arg; + + pr_debug("%s: topo change timer expired\n", br->dev->name); + spin_lock_bh(&br->lock); + br->topology_change_detected = 0; + br->topology_change = 0; + spin_unlock_bh(&br->lock); +} + +static void br_hold_timer_expired(unsigned long arg) +{ + struct net_bridge_port *p = (struct net_bridge_port *) arg; + + pr_debug("%s: %d(%s) hold timer expired\n", + p->br->dev->name, p->port_no, p->dev->name); + + spin_lock_bh(&p->br->lock); + if (p->config_pending) + br_transmit_config(p); + spin_unlock_bh(&p->br->lock); +} + +static inline void br_timer_init(struct timer_list *timer, + void (*_function)(unsigned long), + unsigned long _data) +{ + init_timer(timer); + timer->function = _function; + timer->data = _data; +} + +void br_stp_timer_init(struct net_bridge *br) +{ + br_timer_init(&br->hello_timer, br_hello_timer_expired, + (unsigned long) br); + + br_timer_init(&br->tcn_timer, br_tcn_timer_expired, + (unsigned long) br); + + br_timer_init(&br->topology_change_timer, + br_topology_change_timer_expired, + (unsigned long) br); + + br_timer_init(&br->gc_timer, br_fdb_cleanup, (unsigned long) br); +} + +void br_stp_port_timer_init(struct net_bridge_port *p) +{ + br_timer_init(&p->message_age_timer, br_message_age_timer_expired, + (unsigned long) p); + + br_timer_init(&p->forward_delay_timer, br_forward_delay_timer_expired, + (unsigned long) p); + + br_timer_init(&p->hold_timer, br_hold_timer_expired, + (unsigned long) p); +} + +/* Report ticks left (in USER_HZ) used for API */ +unsigned long br_timer_value(const struct timer_list *timer) +{ + return timer_pending(timer) + ? 
jiffies_to_clock_t(timer->expires - jiffies) : 0; +} diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c new file mode 100644 index 000000000000..98cf53c81fad --- /dev/null +++ b/net/bridge/br_sysfs_br.c @@ -0,0 +1,364 @@ +/* + * Sysfs attributes of bridge ports + * Linux ethernet bridge + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +#include "br_private.h" + +#define to_class_dev(obj) container_of(obj,struct class_device,kobj) +#define to_net_dev(class) container_of(class, struct net_device, class_dev) +#define to_bridge(cd) ((struct net_bridge *)(to_net_dev(cd)->priv)) + +/* + * Common code for storing bridge parameters. + */ +static ssize_t store_bridge_parm(struct class_device *cd, + const char *buf, size_t len, + void (*set)(struct net_bridge *, unsigned long)) +{ + struct net_bridge *br = to_bridge(cd); + char *endp; + unsigned long val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp == buf) + return -EINVAL; + + spin_lock_bh(&br->lock); + (*set)(br, val); + spin_unlock_bh(&br->lock); + return len; +} + + +static ssize_t show_forward_delay(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); +} + +static void set_forward_delay(struct net_bridge *br, unsigned long val) +{ + unsigned long delay = clock_t_to_jiffies(val); + br->forward_delay = delay; + if (br_is_root_bridge(br)) + br->bridge_forward_delay = delay; +} + +static ssize_t store_forward_delay(struct class_device *cd, const char *buf, + size_t len) +{ + return store_bridge_parm(cd, buf, len, set_forward_delay); +} +static CLASS_DEVICE_ATTR(forward_delay, S_IRUGO | S_IWUSR, + show_forward_delay, store_forward_delay); + +static ssize_t show_hello_time(struct class_device *cd, char *buf) +{ + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(cd)->hello_time)); +} + +static void set_hello_time(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + br->hello_time = t; + if (br_is_root_bridge(br)) + br->bridge_hello_time = t; +} + +static ssize_t store_hello_time(struct class_device *cd, const char *buf, + size_t len) +{ + return store_bridge_parm(cd, buf, len, set_hello_time); +} + +static CLASS_DEVICE_ATTR(hello_time, S_IRUGO | S_IWUSR, show_hello_time, + store_hello_time); + +static ssize_t show_max_age(struct class_device *cd, char *buf) +{ + return sprintf(buf, "%lu\n", + jiffies_to_clock_t(to_bridge(cd)->max_age)); +} + +static void set_max_age(struct net_bridge *br, unsigned long val) +{ + unsigned long t = clock_t_to_jiffies(val); + br->max_age = t; + if (br_is_root_bridge(br)) + br->bridge_max_age = t; +} + +static ssize_t store_max_age(struct class_device *cd, const char *buf, + size_t len) +{ + return store_bridge_parm(cd, buf, len, set_max_age); +} + +static CLASS_DEVICE_ATTR(max_age, S_IRUGO | S_IWUSR, show_max_age, + store_max_age); + +static ssize_t show_ageing_time(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); +} + +static void set_ageing_time(struct net_bridge *br, unsigned long val) +{ + 
br->ageing_time = clock_t_to_jiffies(val); +} + +static ssize_t store_ageing_time(struct class_device *cd, const char *buf, + size_t len) +{ + return store_bridge_parm(cd, buf, len, set_ageing_time); +} + +static CLASS_DEVICE_ATTR(ageing_time, S_IRUGO | S_IWUSR, show_ageing_time, + store_ageing_time); +static ssize_t show_stp_state(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%d\n", br->stp_enabled); +} + +static void set_stp_state(struct net_bridge *br, unsigned long val) +{ + br->stp_enabled = val; +} + +static ssize_t store_stp_state(struct class_device *cd, + const char *buf, size_t len) +{ + return store_bridge_parm(cd, buf, len, set_stp_state); +} + +static CLASS_DEVICE_ATTR(stp_state, S_IRUGO | S_IWUSR, show_stp_state, + store_stp_state); + +static ssize_t show_priority(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%d\n", + (br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); +} + +static void set_priority(struct net_bridge *br, unsigned long val) +{ + br_stp_set_bridge_priority(br, (u16) val); +} + +static ssize_t store_priority(struct class_device *cd, + const char *buf, size_t len) +{ + return store_bridge_parm(cd, buf, len, set_priority); +} +static CLASS_DEVICE_ATTR(priority, S_IRUGO | S_IWUSR, show_priority, + store_priority); + +static ssize_t show_root_id(struct class_device *cd, char *buf) +{ + return br_show_bridge_id(buf, &to_bridge(cd)->designated_root); +} +static CLASS_DEVICE_ATTR(root_id, S_IRUGO, show_root_id, NULL); + +static ssize_t show_bridge_id(struct class_device *cd, char *buf) +{ + return br_show_bridge_id(buf, &to_bridge(cd)->bridge_id); +} +static CLASS_DEVICE_ATTR(bridge_id, S_IRUGO, show_bridge_id, NULL); + +static ssize_t show_root_port(struct class_device *cd, char *buf) +{ + return sprintf(buf, "%d\n", to_bridge(cd)->root_port); +} +static CLASS_DEVICE_ATTR(root_port, S_IRUGO, show_root_port, NULL); + +static ssize_t show_root_path_cost(struct class_device *cd, char *buf) +{ + return sprintf(buf, "%d\n", to_bridge(cd)->root_path_cost); +} +static CLASS_DEVICE_ATTR(root_path_cost, S_IRUGO, show_root_path_cost, NULL); + +static ssize_t show_topology_change(struct class_device *cd, char *buf) +{ + return sprintf(buf, "%d\n", to_bridge(cd)->topology_change); +} +static CLASS_DEVICE_ATTR(topology_change, S_IRUGO, show_topology_change, NULL); + +static ssize_t show_topology_change_detected(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%d\n", br->topology_change_detected); +} +static CLASS_DEVICE_ATTR(topology_change_detected, S_IRUGO, show_topology_change_detected, NULL); + +static ssize_t show_hello_timer(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%ld\n", br_timer_value(&br->hello_timer)); +} +static CLASS_DEVICE_ATTR(hello_timer, S_IRUGO, show_hello_timer, NULL); + +static ssize_t show_tcn_timer(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%ld\n", br_timer_value(&br->tcn_timer)); +} +static CLASS_DEVICE_ATTR(tcn_timer, S_IRUGO, show_tcn_timer, NULL); + +static ssize_t show_topology_change_timer(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%ld\n", br_timer_value(&br->topology_change_timer)); +} +static CLASS_DEVICE_ATTR(topology_change_timer, S_IRUGO, show_topology_change_timer, NULL); + +static ssize_t 
show_gc_timer(struct class_device *cd, char *buf) +{ + struct net_bridge *br = to_bridge(cd); + return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer)); +} +static CLASS_DEVICE_ATTR(gc_timer, S_IRUGO, show_gc_timer, NULL); + +static struct attribute *bridge_attrs[] = { + &class_device_attr_forward_delay.attr, + &class_device_attr_hello_time.attr, + &class_device_attr_max_age.attr, + &class_device_attr_ageing_time.attr, + &class_device_attr_stp_state.attr, + &class_device_attr_priority.attr, + &class_device_attr_bridge_id.attr, + &class_device_attr_root_id.attr, + &class_device_attr_root_path_cost.attr, + &class_device_attr_root_port.attr, + &class_device_attr_topology_change.attr, + &class_device_attr_topology_change_detected.attr, + &class_device_attr_hello_timer.attr, + &class_device_attr_tcn_timer.attr, + &class_device_attr_topology_change_timer.attr, + &class_device_attr_gc_timer.attr, + NULL +}; + +static struct attribute_group bridge_group = { + .name = SYSFS_BRIDGE_ATTR, + .attrs = bridge_attrs, +}; + +/* + * Export the forwarding information table as a binary file + * The records are struct __fdb_entry. + * + * Returns the number of bytes read. + */ +static ssize_t brforward_read(struct kobject *kobj, char *buf, + loff_t off, size_t count) +{ + struct class_device *cdev = to_class_dev(kobj); + struct net_bridge *br = to_bridge(cdev); + int n; + + /* must read whole records */ + if (off % sizeof(struct __fdb_entry) != 0) + return -EINVAL; + + n = br_fdb_fillbuf(br, buf, + count / sizeof(struct __fdb_entry), + off / sizeof(struct __fdb_entry)); + + if (n > 0) + n *= sizeof(struct __fdb_entry); + + return n; +} + +static struct bin_attribute bridge_forward = { + .attr = { .name = SYSFS_BRIDGE_FDB, + .mode = S_IRUGO, + .owner = THIS_MODULE, }, + .read = brforward_read, +}; + +/* + * Add entries in sysfs onto the existing network class device + * for the bridge. + * Adds a attribute group "bridge" containing tuning parameters. + * Binary attribute containing the forward table + * Sub directory to hold links to interfaces. + * + * Note: the ifobj exists only to be a subdirectory + * to hold links. The ifobj exists in same data structure + * as it's parent the bridge so reference counting works. 
+ */ +int br_sysfs_addbr(struct net_device *dev) +{ + struct kobject *brobj = &dev->class_dev.kobj; + struct net_bridge *br = netdev_priv(dev); + int err; + + err = sysfs_create_group(brobj, &bridge_group); + if (err) { + pr_info("%s: can't create group %s/%s\n", + __FUNCTION__, dev->name, bridge_group.name); + goto out1; + } + + err = sysfs_create_bin_file(brobj, &bridge_forward); + if (err) { + pr_info("%s: can't create attribue file %s/%s\n", + __FUNCTION__, dev->name, bridge_forward.attr.name); + goto out2; + } + + + kobject_set_name(&br->ifobj, SYSFS_BRIDGE_PORT_SUBDIR); + br->ifobj.ktype = NULL; + br->ifobj.kset = NULL; + br->ifobj.parent = brobj; + + err = kobject_register(&br->ifobj); + if (err) { + pr_info("%s: can't add kobject (directory) %s/%s\n", + __FUNCTION__, dev->name, br->ifobj.name); + goto out3; + } + return 0; + out3: + sysfs_remove_bin_file(&dev->class_dev.kobj, &bridge_forward); + out2: + sysfs_remove_group(&dev->class_dev.kobj, &bridge_group); + out1: + return err; + +} + +void br_sysfs_delbr(struct net_device *dev) +{ + struct kobject *kobj = &dev->class_dev.kobj; + struct net_bridge *br = netdev_priv(dev); + + kobject_unregister(&br->ifobj); + sysfs_remove_bin_file(kobj, &bridge_forward); + sysfs_remove_group(kobj, &bridge_group); +} diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c new file mode 100644 index 000000000000..567249bf9331 --- /dev/null +++ b/net/bridge/br_sysfs_if.c @@ -0,0 +1,269 @@ +/* + * Sysfs attributes of bridge ports + * Linux ethernet bridge + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include + +#include "br_private.h" + +struct brport_attribute { + struct attribute attr; + ssize_t (*show)(struct net_bridge_port *, char *); + ssize_t (*store)(struct net_bridge_port *, unsigned long); +}; + +#define BRPORT_ATTR(_name,_mode,_show,_store) \ +struct brport_attribute brport_attr_##_name = { \ + .attr = {.name = __stringify(_name), \ + .mode = _mode, \ + .owner = THIS_MODULE, }, \ + .show = _show, \ + .store = _store, \ +}; + +static ssize_t show_path_cost(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->path_cost); +} +static ssize_t store_path_cost(struct net_bridge_port *p, unsigned long v) +{ + br_stp_set_path_cost(p, v); + return 0; +} +static BRPORT_ATTR(path_cost, S_IRUGO | S_IWUSR, + show_path_cost, store_path_cost); + +static ssize_t show_priority(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->priority); +} +static ssize_t store_priority(struct net_bridge_port *p, unsigned long v) +{ + if (v >= (1<<(16-BR_PORT_BITS))) + return -ERANGE; + br_stp_set_port_priority(p, v); + return 0; +} +static BRPORT_ATTR(priority, S_IRUGO | S_IWUSR, + show_priority, store_priority); + +static ssize_t show_designated_root(struct net_bridge_port *p, char *buf) +{ + return br_show_bridge_id(buf, &p->designated_root); +} +static BRPORT_ATTR(designated_root, S_IRUGO, show_designated_root, NULL); + +static ssize_t show_designated_bridge(struct net_bridge_port *p, char *buf) +{ + return br_show_bridge_id(buf, &p->designated_bridge); +} +static BRPORT_ATTR(designated_bridge, S_IRUGO, show_designated_bridge, NULL); + +static ssize_t show_designated_port(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->designated_port); +} +static BRPORT_ATTR(designated_port, S_IRUGO, show_designated_port, NULL); + +static ssize_t show_designated_cost(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->designated_cost); +} +static BRPORT_ATTR(designated_cost, S_IRUGO, show_designated_cost, NULL); + +static ssize_t show_port_id(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "0x%x\n", p->port_id); +} +static BRPORT_ATTR(port_id, S_IRUGO, show_port_id, NULL); + +static ssize_t show_port_no(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "0x%x\n", p->port_no); +} + +static BRPORT_ATTR(port_no, S_IRUGO, show_port_no, NULL); + +static ssize_t show_change_ack(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->topology_change_ack); +} +static BRPORT_ATTR(change_ack, S_IRUGO, show_change_ack, NULL); + +static ssize_t show_config_pending(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->config_pending); +} +static BRPORT_ATTR(config_pending, S_IRUGO, show_config_pending, NULL); + +static ssize_t show_port_state(struct net_bridge_port *p, char *buf) +{ + return sprintf(buf, "%d\n", p->state); +} +static BRPORT_ATTR(state, S_IRUGO, show_port_state, NULL); + +static ssize_t show_message_age_timer(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%ld\n", br_timer_value(&p->message_age_timer)); +} +static BRPORT_ATTR(message_age_timer, S_IRUGO, show_message_age_timer, NULL); + +static ssize_t show_forward_delay_timer(struct net_bridge_port *p, + char *buf) +{ + return sprintf(buf, "%ld\n", br_timer_value(&p->forward_delay_timer)); +} +static BRPORT_ATTR(forward_delay_timer, S_IRUGO, show_forward_delay_timer, NULL); + +static ssize_t show_hold_timer(struct net_bridge_port *p, + char *buf) 
+{ + return sprintf(buf, "%ld\n", br_timer_value(&p->hold_timer)); +} +static BRPORT_ATTR(hold_timer, S_IRUGO, show_hold_timer, NULL); + +static struct brport_attribute *brport_attrs[] = { + &brport_attr_path_cost, + &brport_attr_priority, + &brport_attr_port_id, + &brport_attr_port_no, + &brport_attr_designated_root, + &brport_attr_designated_bridge, + &brport_attr_designated_port, + &brport_attr_designated_cost, + &brport_attr_state, + &brport_attr_change_ack, + &brport_attr_config_pending, + &brport_attr_message_age_timer, + &brport_attr_forward_delay_timer, + &brport_attr_hold_timer, + NULL +}; + +#define to_brport_attr(_at) container_of(_at, struct brport_attribute, attr) +#define to_brport(obj) container_of(obj, struct net_bridge_port, kobj) + +static ssize_t brport_show(struct kobject * kobj, + struct attribute * attr, char * buf) +{ + struct brport_attribute * brport_attr = to_brport_attr(attr); + struct net_bridge_port * p = to_brport(kobj); + + return brport_attr->show(p, buf); +} + +static ssize_t brport_store(struct kobject * kobj, + struct attribute * attr, + const char * buf, size_t count) +{ + struct brport_attribute * brport_attr = to_brport_attr(attr); + struct net_bridge_port * p = to_brport(kobj); + ssize_t ret = -EINVAL; + char *endp; + unsigned long val; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + val = simple_strtoul(buf, &endp, 0); + if (endp != buf) { + rtnl_lock(); + if (p->dev && p->br && brport_attr->store) { + spin_lock_bh(&p->br->lock); + ret = brport_attr->store(p, val); + spin_unlock_bh(&p->br->lock); + if (ret == 0) + ret = count; + } + rtnl_unlock(); + } + return ret; +} + +/* called from kobject_put when port ref count goes to zero. */ +static void brport_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct net_bridge_port, kobj)); +} + +static struct sysfs_ops brport_sysfs_ops = { + .show = brport_show, + .store = brport_store, +}; + +static struct kobj_type brport_ktype = { + .sysfs_ops = &brport_sysfs_ops, + .release = brport_release, +}; + + +/* + * Add sysfs entries to ethernet device added to a bridge. + * Creates a brport subdirectory with bridge attributes. 
+ * Puts symlink in bridge's brport subdirectory + */ +int br_sysfs_addif(struct net_bridge_port *p) +{ + struct net_bridge *br = p->br; + struct brport_attribute **a; + int err; + + ASSERT_RTNL(); + + kobject_set_name(&p->kobj, SYSFS_BRIDGE_PORT_ATTR); + p->kobj.ktype = &brport_ktype; + p->kobj.parent = &(p->dev->class_dev.kobj); + p->kobj.kset = NULL; + + err = kobject_add(&p->kobj); + if(err) + goto out1; + + err = sysfs_create_link(&p->kobj, &br->dev->class_dev.kobj, + SYSFS_BRIDGE_PORT_LINK); + if (err) + goto out2; + + for (a = brport_attrs; *a; ++a) { + err = sysfs_create_file(&p->kobj, &((*a)->attr)); + if (err) + goto out2; + } + + err = sysfs_create_link(&br->ifobj, &p->kobj, p->dev->name); + if (err) + goto out2; + + return 0; + out2: + kobject_del(&p->kobj); + out1: + return err; +} + +void br_sysfs_removeif(struct net_bridge_port *p) +{ + pr_debug("br_sysfs_removeif\n"); + sysfs_remove_link(&p->br->ifobj, p->dev->name); + kobject_del(&p->kobj); +} + +void br_sysfs_freeif(struct net_bridge_port *p) +{ + pr_debug("br_sysfs_freeif\n"); + kobject_put(&p->kobj); +} diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig new file mode 100644 index 000000000000..68ccef507b49 --- /dev/null +++ b/net/bridge/netfilter/Kconfig @@ -0,0 +1,211 @@ +# +# Bridge netfilter configuration +# + +menu "Bridge: Netfilter Configuration" + depends on BRIDGE && NETFILTER + +config BRIDGE_NF_EBTABLES + tristate "Ethernet Bridge tables (ebtables) support" + help + ebtables is a general, extensible frame/packet identification + framework. Say 'Y' or 'M' here if you want to do Ethernet + filtering/NAT/brouting on the Ethernet bridge. +# +# tables +# +config BRIDGE_EBT_BROUTE + tristate "ebt: broute table support" + depends on BRIDGE_NF_EBTABLES + help + The ebtables broute table is used to define rules that decide between + bridging and routing frames, giving Linux the functionality of a + brouter. See the man page for ebtables(8) and examples on the ebtables + website. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_T_FILTER + tristate "ebt: filter table support" + depends on BRIDGE_NF_EBTABLES + help + The ebtables filter table is used to define frame filtering rules at + local input, forwarding and local output. See the man page for + ebtables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_T_NAT + tristate "ebt: nat table support" + depends on BRIDGE_NF_EBTABLES + help + The ebtables nat table is used to define rules that alter the MAC + source address (MAC SNAT) or the MAC destination address (MAC DNAT). + See the man page for ebtables(8). + + To compile it as a module, choose M here. If unsure, say N. +# +# matches +# +config BRIDGE_EBT_802_3 + tristate "ebt: 802.3 filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds matching support for 802.3 Ethernet frames. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_AMONG + tristate "ebt: among filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the among match, which allows matching the MAC source + and/or destination address on a list of addresses. Optionally, + MAC/IP address pairs can be matched, f.e. for anti-spoofing rules. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_ARP + tristate "ebt: ARP filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the ARP match, which allows ARP and RARP header field + filtering. 
+ + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_IP + tristate "ebt: IP filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the IP match, which allows basic IP header field + filtering. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_LIMIT + tristate "ebt: limit match support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the limit match, which allows you to control + the rate at which a rule can be matched. This match is the + equivalent of the iptables limit match. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config BRIDGE_EBT_MARK + tristate "ebt: mark filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the mark match, which allows matching frames based on + the 'nfmark' value in the frame. This can be set by the mark target. + This value is the same as the one used in the iptables mark match and + target. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_PKTTYPE + tristate "ebt: packet type filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the packet type match, which allows matching on the + type of packet based on its Ethernet "class" (as determined by + the generic networking code): broadcast, multicast, + for this host alone or for another host. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_STP + tristate "ebt: STP filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the Spanning Tree Protocol match, which + allows STP header field filtering. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_VLAN + tristate "ebt: 802.1Q VLAN filter support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the 802.1Q vlan match, which allows the filtering of + 802.1Q vlan fields. + + To compile it as a module, choose M here. If unsure, say N. +# +# targets +# +config BRIDGE_EBT_ARPREPLY + tristate "ebt: arp reply target support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the arp reply target, which allows + automatically sending arp replies to arp requests. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_DNAT + tristate "ebt: dnat target support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the MAC DNAT target, which allows altering the MAC + destination address of frames. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_MARK_T + tristate "ebt: mark target support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the mark target, which allows marking frames by + setting the 'nfmark' value in the frame. + This value is the same as the one used in the iptables mark match and + target. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_REDIRECT + tristate "ebt: redirect target support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the MAC redirect target, which allows altering the MAC + destination address of a frame to that of the device it arrived on. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_SNAT + tristate "ebt: snat target support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the MAC SNAT target, which allows altering the MAC + source address of frames. + + To compile it as a module, choose M here. If unsure, say N. 
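The target options above (arpreply, dnat, mark, redirect, snat) each build a small kernel module, and the ebt_* sources added later in this patch all follow the same registration pattern. The sketch below condenses that shape into a hypothetical "example" target: the struct ebt_target fields, the verdict, and the ebt_register_target()/ebt_unregister_target() calls mirror the code in this patch, while the module name, the include lines, and the trivial check logic are illustrative assumptions only, not part of the original submission.

/* Minimal sketch of an ebtables target module; illustrative, not in the patch. */
#include <linux/module.h>
#include <linux/netfilter_bridge/ebtables.h>

/* Runs for every frame the rule matches; returns an ebtables verdict. */
static int ebt_target_example(struct sk_buff **pskb, unsigned int hooknr,
	const struct net_device *in, const struct net_device *out,
	const void *data, unsigned int datalen)
{
	/* A real target would rewrite *pskb here (compare ebt_dnat/ebt_snat). */
	return EBT_CONTINUE;
}

/* Called at rule-load time to validate option data and rule placement. */
static int ebt_target_example_check(const char *tablename, unsigned int hookmask,
	const struct ebt_entry *e, void *data, unsigned int datalen)
{
	/* A real target checks datalen, tablename and hookmask here. */
	return 0;
}

static struct ebt_target example_target =
{
	.name		= "example",
	.target		= ebt_target_example,
	.check		= ebt_target_example_check,
	.me		= THIS_MODULE,
};

static int __init init(void)
{
	return ebt_register_target(&example_target);
}

static void __exit fini(void)
{
	ebt_unregister_target(&example_target);
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");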
+# +# watchers +# +config BRIDGE_EBT_LOG + tristate "ebt: log support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the log watcher, that you can use in any rule + in any ebtables table. It records info about the frame header + to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config BRIDGE_EBT_ULOG + tristate "ebt: ulog support" + depends on BRIDGE_NF_EBTABLES + help + This option adds the ulog watcher, that you can use in any rule + in any ebtables table. The packet is passed to a userspace + logging daemon using netlink multicast sockets. This differs + from the log watcher in the sense that the complete packet is + sent to userspace instead of a descriptive text and that + netlink multicast sockets are used instead of the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +endmenu diff --git a/net/bridge/netfilter/Makefile b/net/bridge/netfilter/Makefile new file mode 100644 index 000000000000..8bf6d9f6e9d3 --- /dev/null +++ b/net/bridge/netfilter/Makefile @@ -0,0 +1,32 @@ +# +# Makefile for the netfilter modules for Link Layer filtering on a bridge. +# + +obj-$(CONFIG_BRIDGE_NF_EBTABLES) += ebtables.o + +# tables +obj-$(CONFIG_BRIDGE_EBT_BROUTE) += ebtable_broute.o +obj-$(CONFIG_BRIDGE_EBT_T_FILTER) += ebtable_filter.o +obj-$(CONFIG_BRIDGE_EBT_T_NAT) += ebtable_nat.o + +#matches +obj-$(CONFIG_BRIDGE_EBT_802_3) += ebt_802_3.o +obj-$(CONFIG_BRIDGE_EBT_AMONG) += ebt_among.o +obj-$(CONFIG_BRIDGE_EBT_ARP) += ebt_arp.o +obj-$(CONFIG_BRIDGE_EBT_IP) += ebt_ip.o +obj-$(CONFIG_BRIDGE_EBT_LIMIT) += ebt_limit.o +obj-$(CONFIG_BRIDGE_EBT_MARK) += ebt_mark_m.o +obj-$(CONFIG_BRIDGE_EBT_PKTTYPE) += ebt_pkttype.o +obj-$(CONFIG_BRIDGE_EBT_STP) += ebt_stp.o +obj-$(CONFIG_BRIDGE_EBT_VLAN) += ebt_vlan.o + +# targets +obj-$(CONFIG_BRIDGE_EBT_ARPREPLY) += ebt_arpreply.o +obj-$(CONFIG_BRIDGE_EBT_MARK_T) += ebt_mark.o +obj-$(CONFIG_BRIDGE_EBT_DNAT) += ebt_dnat.o +obj-$(CONFIG_BRIDGE_EBT_REDIRECT) += ebt_redirect.o +obj-$(CONFIG_BRIDGE_EBT_SNAT) += ebt_snat.o + +# watchers +obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_log.o +obj-$(CONFIG_BRIDGE_EBT_LOG) += ebt_ulog.o diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c new file mode 100644 index 000000000000..468ebdf4bc1c --- /dev/null +++ b/net/bridge/netfilter/ebt_802_3.c @@ -0,0 +1,73 @@ +/* + * 802_3 + * + * Author: + * Chris Vitale csv@bluetail.com + * + * May 2003 + * + */ + +#include +#include +#include + +static int ebt_filter_802_3(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *data, unsigned int datalen) +{ + struct ebt_802_3_info *info = (struct ebt_802_3_info *)data; + struct ebt_802_3_hdr *hdr = ebt_802_3_hdr(skb); + uint16_t type = hdr->llc.ui.ctrl & IS_UI ? 
hdr->llc.ui.type : hdr->llc.ni.type; + + if (info->bitmask & EBT_802_3_SAP) { + if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP)) + return EBT_NOMATCH; + if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP)) + return EBT_NOMATCH; + } + + if (info->bitmask & EBT_802_3_TYPE) { + if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == CHECK_TYPE)) + return EBT_NOMATCH; + if (FWINV(info->type != type, EBT_802_3_TYPE)) + return EBT_NOMATCH; + } + + return EBT_MATCH; +} + +static struct ebt_match filter_802_3; +static int ebt_802_3_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_802_3_info *info = (struct ebt_802_3_info *)data; + + if (datalen < sizeof(struct ebt_802_3_info)) + return -EINVAL; + if (info->bitmask & ~EBT_802_3_MASK || info->invflags & ~EBT_802_3_MASK) + return -EINVAL; + + return 0; +} + +static struct ebt_match filter_802_3 = +{ + .name = EBT_802_3_MATCH, + .match = ebt_filter_802_3, + .check = ebt_802_3_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_802_3); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_802_3); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_among.c b/net/bridge/netfilter/ebt_among.c new file mode 100644 index 000000000000..5a1f5e3bff15 --- /dev/null +++ b/net/bridge/netfilter/ebt_among.c @@ -0,0 +1,228 @@ +/* + * ebt_among + * + * Authors: + * Grzegorz Borowiak + * + * August, 2003 + * + */ + +#include +#include +#include +#include +#include + +static int ebt_mac_wormhash_contains(const struct ebt_mac_wormhash *wh, + const char *mac, uint32_t ip) +{ + /* You may be puzzled as to how this code works. + * Some tricks were used, refer to + * include/linux/netfilter_bridge/ebt_among.h + * as there you can find a solution of this mystery. 
+ */ + const struct ebt_mac_wormhash_tuple *p; + int start, limit, i; + uint32_t cmp[2] = { 0, 0 }; + int key = (const unsigned char) mac[5]; + + memcpy(((char *) cmp) + 2, mac, 6); + start = wh->table[key]; + limit = wh->table[key + 1]; + if (ip) { + for (i = start; i < limit; i++) { + p = &wh->pool[i]; + if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) { + if (p->ip == 0 || p->ip == ip) { + return 1; + } + } + } + } else { + for (i = start; i < limit; i++) { + p = &wh->pool[i]; + if (cmp[1] == p->cmp[1] && cmp[0] == p->cmp[0]) { + if (p->ip == 0) { + return 1; + } + } + } + } + return 0; +} + +static int ebt_mac_wormhash_check_integrity(const struct ebt_mac_wormhash + *wh) +{ + int i; + + for (i = 0; i < 256; i++) { + if (wh->table[i] > wh->table[i + 1]) + return -0x100 - i; + if (wh->table[i] < 0) + return -0x200 - i; + if (wh->table[i] > wh->poolsize) + return -0x300 - i; + } + if (wh->table[256] > wh->poolsize) + return -0xc00; + return 0; +} + +static int get_ip_dst(const struct sk_buff *skb, uint32_t *addr) +{ + if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) + return -1; + *addr = ih->daddr; + } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { + struct arphdr _arph, *ah; + uint32_t buf, *bp; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL || + ah->ar_pln != sizeof(uint32_t) || + ah->ar_hln != ETH_ALEN) + return -1; + bp = skb_header_pointer(skb, sizeof(struct arphdr) + + 2 * ETH_ALEN + sizeof(uint32_t), + sizeof(uint32_t), &buf); + if (bp == NULL) + return -1; + *addr = *bp; + } + return 0; +} + +static int get_ip_src(const struct sk_buff *skb, uint32_t *addr) +{ + if (eth_hdr(skb)->h_proto == htons(ETH_P_IP)) { + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) + return -1; + *addr = ih->saddr; + } else if (eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) { + struct arphdr _arph, *ah; + uint32_t buf, *bp; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL || + ah->ar_pln != sizeof(uint32_t) || + ah->ar_hln != ETH_ALEN) + return -1; + bp = skb_header_pointer(skb, sizeof(struct arphdr) + + ETH_ALEN, sizeof(uint32_t), &buf); + if (bp == NULL) + return -1; + *addr = *bp; + } + return 0; +} + +static int ebt_filter_among(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, const void *data, + unsigned int datalen) +{ + struct ebt_among_info *info = (struct ebt_among_info *) data; + const char *dmac, *smac; + const struct ebt_mac_wormhash *wh_dst, *wh_src; + uint32_t dip = 0, sip = 0; + + wh_dst = ebt_among_wh_dst(info); + wh_src = ebt_among_wh_src(info); + + if (wh_src) { + smac = eth_hdr(skb)->h_source; + if (get_ip_src(skb, &sip)) + return EBT_NOMATCH; + if (!(info->bitmask & EBT_AMONG_SRC_NEG)) { + /* we match only if it contains */ + if (!ebt_mac_wormhash_contains(wh_src, smac, sip)) + return EBT_NOMATCH; + } else { + /* we match only if it DOES NOT contain */ + if (ebt_mac_wormhash_contains(wh_src, smac, sip)) + return EBT_NOMATCH; + } + } + + if (wh_dst) { + dmac = eth_hdr(skb)->h_dest; + if (get_ip_dst(skb, &dip)) + return EBT_NOMATCH; + if (!(info->bitmask & EBT_AMONG_DST_NEG)) { + /* we match only if it contains */ + if (!ebt_mac_wormhash_contains(wh_dst, dmac, dip)) + return EBT_NOMATCH; + } else { + /* we match only if it DOES NOT contain */ + if (ebt_mac_wormhash_contains(wh_dst, dmac, dip)) + return EBT_NOMATCH; + } + } + + return 
EBT_MATCH; +} + +static int ebt_among_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, + unsigned int datalen) +{ + struct ebt_among_info *info = (struct ebt_among_info *) data; + int expected_length = sizeof(struct ebt_among_info); + const struct ebt_mac_wormhash *wh_dst, *wh_src; + int err; + + wh_dst = ebt_among_wh_dst(info); + wh_src = ebt_among_wh_src(info); + expected_length += ebt_mac_wormhash_size(wh_dst); + expected_length += ebt_mac_wormhash_size(wh_src); + + if (datalen != EBT_ALIGN(expected_length)) { + printk(KERN_WARNING + "ebtables: among: wrong size: %d" + "against expected %d, rounded to %Zd\n", + datalen, expected_length, + EBT_ALIGN(expected_length)); + return -EINVAL; + } + if (wh_dst && (err = ebt_mac_wormhash_check_integrity(wh_dst))) { + printk(KERN_WARNING + "ebtables: among: dst integrity fail: %x\n", -err); + return -EINVAL; + } + if (wh_src && (err = ebt_mac_wormhash_check_integrity(wh_src))) { + printk(KERN_WARNING + "ebtables: among: src integrity fail: %x\n", -err); + return -EINVAL; + } + return 0; +} + +static struct ebt_match filter_among = { + .name = EBT_AMONG_MATCH, + .match = ebt_filter_among, + .check = ebt_among_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_among); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_among); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c new file mode 100644 index 000000000000..b94c48cb6e4b --- /dev/null +++ b/net/bridge/netfilter/ebt_arp.c @@ -0,0 +1,140 @@ +/* + * ebt_arp + * + * Authors: + * Bart De Schuymer + * Tim Gardner + * + * April, 2002 + * + */ + +#include +#include +#include +#include +#include + +static int ebt_filter_arp(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *data, unsigned int datalen) +{ + struct ebt_arp_info *info = (struct ebt_arp_info *)data; + struct arphdr _arph, *ah; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode != + ah->ar_op, EBT_ARP_OPCODE)) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype != + ah->ar_hrd, EBT_ARP_HTYPE)) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_PTYPE && FWINV(info->ptype != + ah->ar_pro, EBT_ARP_PTYPE)) + return EBT_NOMATCH; + + if (info->bitmask & (EBT_ARP_SRC_IP | EBT_ARP_DST_IP)) { + uint32_t _addr, *ap; + + /* IPv4 addresses are always 4 bytes */ + if (ah->ar_pln != sizeof(uint32_t)) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_SRC_IP) { + ap = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln, sizeof(_addr), + &_addr); + if (ap == NULL) + return EBT_NOMATCH; + if (FWINV(info->saddr != (*ap & info->smsk), + EBT_ARP_SRC_IP)) + return EBT_NOMATCH; + } + + if (info->bitmask & EBT_ARP_DST_IP) { + ap = skb_header_pointer(skb, sizeof(struct arphdr) + + 2*ah->ar_hln+sizeof(uint32_t), + sizeof(_addr), &_addr); + if (ap == NULL) + return EBT_NOMATCH; + if (FWINV(info->daddr != (*ap & info->dmsk), + EBT_ARP_DST_IP)) + return EBT_NOMATCH; + } + } + + if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) { + unsigned char _mac[ETH_ALEN], *mp; + uint8_t verdict, i; + + /* MAC addresses are 6 bytes */ + if (ah->ar_hln != ETH_ALEN) + return EBT_NOMATCH; + if (info->bitmask & EBT_ARP_SRC_MAC) { + mp = skb_header_pointer(skb, sizeof(struct 
arphdr), + sizeof(_mac), &_mac); + if (mp == NULL) + return EBT_NOMATCH; + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (mp[i] ^ info->smaddr[i]) & + info->smmsk[i]; + if (FWINV(verdict != 0, EBT_ARP_SRC_MAC)) + return EBT_NOMATCH; + } + + if (info->bitmask & EBT_ARP_DST_MAC) { + mp = skb_header_pointer(skb, sizeof(struct arphdr) + + ah->ar_hln + ah->ar_pln, + sizeof(_mac), &_mac); + if (mp == NULL) + return EBT_NOMATCH; + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (mp[i] ^ info->dmaddr[i]) & + info->dmmsk[i]; + if (FWINV(verdict != 0, EBT_ARP_DST_MAC)) + return EBT_NOMATCH; + } + } + + return EBT_MATCH; +} + +static int ebt_arp_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_arp_info *info = (struct ebt_arp_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_arp_info))) + return -EINVAL; + if ((e->ethproto != htons(ETH_P_ARP) && + e->ethproto != htons(ETH_P_RARP)) || + e->invflags & EBT_IPROTO) + return -EINVAL; + if (info->bitmask & ~EBT_ARP_MASK || info->invflags & ~EBT_ARP_MASK) + return -EINVAL; + return 0; +} + +static struct ebt_match filter_arp = +{ + .name = EBT_ARP_MATCH, + .match = ebt_filter_arp, + .check = ebt_arp_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_arp); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_arp); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_arpreply.c b/net/bridge/netfilter/ebt_arpreply.c new file mode 100644 index 000000000000..b934de90f7c5 --- /dev/null +++ b/net/bridge/netfilter/ebt_arpreply.c @@ -0,0 +1,97 @@ +/* + * ebt_arpreply + * + * Authors: + * Grzegorz Borowiak + * Bart De Schuymer + * + * August, 2003 + * + */ + +#include +#include +#include +#include +#include + +static int ebt_target_reply(struct sk_buff **pskb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_arpreply_info *info = (struct ebt_arpreply_info *)data; + u32 _sip, *siptr, _dip, *diptr; + struct arphdr _ah, *ap; + unsigned char _sha[ETH_ALEN], *shp; + struct sk_buff *skb = *pskb; + + ap = skb_header_pointer(skb, 0, sizeof(_ah), &_ah); + if (ap == NULL) + return EBT_DROP; + + if (ap->ar_op != htons(ARPOP_REQUEST) || + ap->ar_hln != ETH_ALEN || + ap->ar_pro != htons(ETH_P_IP) || + ap->ar_pln != 4) + return EBT_CONTINUE; + + shp = skb_header_pointer(skb, sizeof(_ah), ETH_ALEN, &_sha); + if (shp == NULL) + return EBT_DROP; + + siptr = skb_header_pointer(skb, sizeof(_ah) + ETH_ALEN, + sizeof(_sip), &_sip); + if (siptr == NULL) + return EBT_DROP; + + diptr = skb_header_pointer(skb, + sizeof(_ah) + 2 * ETH_ALEN + sizeof(_sip), + sizeof(_dip), &_dip); + if (diptr == NULL) + return EBT_DROP; + + arp_send(ARPOP_REPLY, ETH_P_ARP, *siptr, (struct net_device *)in, + *diptr, shp, info->mac, shp); + + return info->target; +} + +static int ebt_target_reply_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_arpreply_info *info = (struct ebt_arpreply_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_arpreply_info))) + return -EINVAL; + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + if (e->ethproto != htons(ETH_P_ARP) || + e->invflags & EBT_IPROTO) + return -EINVAL; + CLEAR_BASE_CHAIN_BIT; + if (strcmp(tablename, "nat") || hookmask & ~(1 << 
NF_BR_PRE_ROUTING)) + return -EINVAL; + return 0; +} + +static struct ebt_target reply_target = +{ + .name = EBT_ARPREPLY_TARGET, + .target = ebt_target_reply, + .check = ebt_target_reply_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_target(&reply_target); +} + +static void __exit fini(void) +{ + ebt_unregister_target(&reply_target); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c new file mode 100644 index 000000000000..f5463086c7bd --- /dev/null +++ b/net/bridge/netfilter/ebt_dnat.c @@ -0,0 +1,76 @@ +/* + * ebt_dnat + * + * Authors: + * Bart De Schuymer + * + * June, 2002 + * + */ + +#include +#include +#include +#include + +static int ebt_target_dnat(struct sk_buff **pskb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_nat_info *info = (struct ebt_nat_info *)data; + + if (skb_shared(*pskb) || skb_cloned(*pskb)) { + struct sk_buff *nskb; + + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return NF_DROP; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + memcpy(eth_hdr(*pskb)->h_dest, info->mac, ETH_ALEN); + return info->target; +} + +static int ebt_target_dnat_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_nat_info *info = (struct ebt_nat_info *)data; + + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + CLEAR_BASE_CHAIN_BIT; + if ( (strcmp(tablename, "nat") || + (hookmask & ~((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT)))) && + (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) ) + return -EINVAL; + if (datalen != EBT_ALIGN(sizeof(struct ebt_nat_info))) + return -EINVAL; + if (INVALID_TARGET) + return -EINVAL; + return 0; +} + +static struct ebt_target dnat = +{ + .name = EBT_DNAT_TARGET, + .target = ebt_target_dnat, + .check = ebt_target_dnat_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_target(&dnat); +} + +static void __exit fini(void) +{ + ebt_unregister_target(&dnat); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c new file mode 100644 index 000000000000..7323805b9726 --- /dev/null +++ b/net/bridge/netfilter/ebt_ip.c @@ -0,0 +1,122 @@ +/* + * ebt_ip + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + * Changes: + * added ip-sport and ip-dport + * Innominate Security Technologies AG + * September, 2002 + */ + +#include +#include +#include +#include +#include + +struct tcpudphdr { + uint16_t src; + uint16_t dst; +}; + +static int ebt_filter_ip(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *data, + unsigned int datalen) +{ + struct ebt_ip_info *info = (struct ebt_ip_info *)data; + struct iphdr _iph, *ih; + struct tcpudphdr _ports, *pptr; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) + return EBT_NOMATCH; + if (info->bitmask & EBT_IP_TOS && + FWINV(info->tos != ih->tos, EBT_IP_TOS)) + return EBT_NOMATCH; + if (info->bitmask & EBT_IP_SOURCE && + FWINV((ih->saddr & info->smsk) != + info->saddr, EBT_IP_SOURCE)) + return EBT_NOMATCH; + if ((info->bitmask & EBT_IP_DEST) && + FWINV((ih->daddr & info->dmsk) != + info->daddr, EBT_IP_DEST)) + return EBT_NOMATCH; + 
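+	/* Layer-4 port ranges are only consulted inside the EBT_IP_PROTO
+	 * branch below; ebt_ip_check() additionally restricts port matching
+	 * to TCP and UDP. */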
if (info->bitmask & EBT_IP_PROTO) { + if (FWINV(info->protocol != ih->protocol, EBT_IP_PROTO)) + return EBT_NOMATCH; + if (!(info->bitmask & EBT_IP_DPORT) && + !(info->bitmask & EBT_IP_SPORT)) + return EBT_MATCH; + pptr = skb_header_pointer(skb, ih->ihl*4, + sizeof(_ports), &_ports); + if (pptr == NULL) + return EBT_NOMATCH; + if (info->bitmask & EBT_IP_DPORT) { + u32 dst = ntohs(pptr->dst); + if (FWINV(dst < info->dport[0] || + dst > info->dport[1], + EBT_IP_DPORT)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_IP_SPORT) { + u32 src = ntohs(pptr->src); + if (FWINV(src < info->sport[0] || + src > info->sport[1], + EBT_IP_SPORT)) + return EBT_NOMATCH; + } + } + return EBT_MATCH; +} + +static int ebt_ip_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_ip_info *info = (struct ebt_ip_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_ip_info))) + return -EINVAL; + if (e->ethproto != htons(ETH_P_IP) || + e->invflags & EBT_IPROTO) + return -EINVAL; + if (info->bitmask & ~EBT_IP_MASK || info->invflags & ~EBT_IP_MASK) + return -EINVAL; + if (info->bitmask & (EBT_IP_DPORT | EBT_IP_SPORT)) { + if (info->invflags & EBT_IP_PROTO) + return -EINVAL; + if (info->protocol != IPPROTO_TCP && + info->protocol != IPPROTO_UDP) + return -EINVAL; + } + if (info->bitmask & EBT_IP_DPORT && info->dport[0] > info->dport[1]) + return -EINVAL; + if (info->bitmask & EBT_IP_SPORT && info->sport[0] > info->sport[1]) + return -EINVAL; + return 0; +} + +static struct ebt_match filter_ip = +{ + .name = EBT_IP_MATCH, + .match = ebt_filter_ip, + .check = ebt_ip_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_ip); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_ip); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_limit.c b/net/bridge/netfilter/ebt_limit.c new file mode 100644 index 000000000000..637c8844cd5f --- /dev/null +++ b/net/bridge/netfilter/ebt_limit.c @@ -0,0 +1,113 @@ +/* + * ebt_limit + * + * Authors: + * Tom Marshall + * + * Mostly copied from netfilter's ipt_limit.c, see that file for + * more explanation + * + * September, 2003 + * + */ + +#include +#include +#include + +#include +#include + +static DEFINE_SPINLOCK(limit_lock); + +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static int ebt_limit_match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_limit_info *info = (struct ebt_limit_info *)data; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + info->credit += (now - xchg(&info->prev, now)) * CREDITS_PER_JIFFY; + if (info->credit > info->credit_cap) + info->credit = info->credit_cap; + + if (info->credit >= info->cost) { + /* We're not limited. */ + info->credit -= info->cost; + spin_unlock_bh(&limit_lock); + return EBT_MATCH; + } + + spin_unlock_bh(&limit_lock); + return EBT_NOMATCH; +} + +/* Precision saver. 
*/ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / EBT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / EBT_LIMIT_SCALE; +} + +static int ebt_limit_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_limit_info *info = (struct ebt_limit_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_limit_info))) + return -EINVAL; + + /* Check for overflow. */ + if (info->burst == 0 || + user2credits(info->avg * info->burst) < user2credits(info->avg)) { + printk("Overflow in ebt_limit, try lower: %u/%u\n", + info->avg, info->burst); + return -EINVAL; + } + + /* User avg in seconds * EBT_LIMIT_SCALE: convert to jiffies * 128. */ + info->prev = jiffies; + info->credit = user2credits(info->avg * info->burst); + info->credit_cap = user2credits(info->avg * info->burst); + info->cost = user2credits(info->avg); + return 0; +} + +static struct ebt_match ebt_limit_reg = +{ + .name = EBT_LIMIT_MATCH, + .match = ebt_limit_match, + .check = ebt_limit_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&ebt_limit_reg); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&ebt_limit_reg); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c new file mode 100644 index 000000000000..e4ae34b88925 --- /dev/null +++ b/net/bridge/netfilter/ebt_log.c @@ -0,0 +1,171 @@ +/* + * ebt_log + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ + +#include +#include +#include +#include +#include +#include + +static DEFINE_SPINLOCK(ebt_log_lock); + +static int ebt_log_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_log_info *info = (struct ebt_log_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_log_info))) + return -EINVAL; + if (info->bitmask & ~EBT_LOG_MASK) + return -EINVAL; + if (info->loglevel >= 8) + return -EINVAL; + info->prefix[EBT_LOG_PREFIX_SIZE - 1] = '\0'; + return 0; +} + +struct tcpudphdr +{ + uint16_t src; + uint16_t dst; +}; + +struct arppayload +{ + unsigned char mac_src[ETH_ALEN]; + unsigned char ip_src[4]; + unsigned char mac_dst[ETH_ALEN]; + unsigned char ip_dst[4]; +}; + +static void print_MAC(unsigned char *p) +{ + int i; + + for (i = 0; i < ETH_ALEN; i++, p++) + printk("%02x%c", *p, i == ETH_ALEN - 1 ? ' ':':'); +} + +#define myNIPQUAD(a) a[0], a[1], a[2], a[3] +static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_log_info *info = (struct ebt_log_info *)data; + char level_string[4] = "< >"; + union {struct iphdr iph; struct tcpudphdr ports; + struct arphdr arph; struct arppayload arpp;} u; + + level_string[1] = '0' + info->loglevel; + spin_lock_bh(&ebt_log_lock); + printk(level_string); + printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "", + out ? 
out->name : ""); + + printk("MAC source = "); + print_MAC(eth_hdr(skb)->h_source); + printk("MAC dest = "); + print_MAC(eth_hdr(skb)->h_dest); + + printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto)); + + if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == + htons(ETH_P_IP)){ + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); + if (ih == NULL) { + printk(" INCOMPLETE IP header"); + goto out; + } + printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); + printk(" IP tos=0x%02X, IP proto=%d", u.iph.tos, + ih->protocol); + if (ih->protocol == IPPROTO_TCP || + ih->protocol == IPPROTO_UDP) { + struct tcpudphdr _ports, *pptr; + + pptr = skb_header_pointer(skb, ih->ihl*4, + sizeof(_ports), &_ports); + if (pptr == NULL) { + printk(" INCOMPLETE TCP/UDP header"); + goto out; + } + printk(" SPT=%u DPT=%u", ntohs(pptr->src), + ntohs(pptr->dst)); + } + goto out; + } + + if ((info->bitmask & EBT_LOG_ARP) && + ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || + (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { + struct arphdr _arph, *ah; + + ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); + if (ah == NULL) { + printk(" INCOMPLETE ARP header"); + goto out; + } + printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), + ntohs(ah->ar_op)); + + /* If it's for Ethernet and the lengths are OK, + * then log the ARP payload */ + if (ah->ar_hrd == htons(1) && + ah->ar_hln == ETH_ALEN && + ah->ar_pln == sizeof(uint32_t)) { + struct arppayload _arpp, *ap; + + ap = skb_header_pointer(skb, sizeof(u.arph), + sizeof(_arpp), &_arpp); + if (ap == NULL) { + printk(" INCOMPLETE ARP payload"); + goto out; + } + printk(" ARP MAC SRC="); + print_MAC(ap->mac_src); + printk(" ARP IP SRC=%u.%u.%u.%u", + myNIPQUAD(ap->ip_src)); + printk(" ARP MAC DST="); + print_MAC(ap->mac_dst); + printk(" ARP IP DST=%u.%u.%u.%u", + myNIPQUAD(ap->ip_dst)); + } + } +out: + printk("\n"); + spin_unlock_bh(&ebt_log_lock); +} + +static struct ebt_watcher log = +{ + .name = EBT_LOG_WATCHER, + .watcher = ebt_log, + .check = ebt_log_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_watcher(&log); +} + +static void __exit fini(void) +{ + ebt_unregister_watcher(&log); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_mark.c b/net/bridge/netfilter/ebt_mark.c new file mode 100644 index 000000000000..02c632b4d325 --- /dev/null +++ b/net/bridge/netfilter/ebt_mark.c @@ -0,0 +1,68 @@ +/* + * ebt_mark + * + * Authors: + * Bart De Schuymer + * + * July, 2002 + * + */ + +/* The mark target can be used in any chain, + * I believe adding a mangle table just for marking is total overkill. + * Marking a frame doesn't really change anything in the frame anyway. 
+ */ + +#include +#include +#include + +static int ebt_target_mark(struct sk_buff **pskb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; + + if ((*pskb)->nfmark != info->mark) { + (*pskb)->nfmark = info->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return info->target; +} + +static int ebt_target_mark_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_mark_t_info *info = (struct ebt_mark_t_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_t_info))) + return -EINVAL; + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + CLEAR_BASE_CHAIN_BIT; + if (INVALID_TARGET) + return -EINVAL; + return 0; +} + +static struct ebt_target mark_target = +{ + .name = EBT_MARK_TARGET, + .target = ebt_target_mark, + .check = ebt_target_mark_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_target(&mark_target); +} + +static void __exit fini(void) +{ + ebt_unregister_target(&mark_target); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_mark_m.c b/net/bridge/netfilter/ebt_mark_m.c new file mode 100644 index 000000000000..625102de1495 --- /dev/null +++ b/net/bridge/netfilter/ebt_mark_m.c @@ -0,0 +1,62 @@ +/* + * ebt_mark_m + * + * Authors: + * Bart De Schuymer + * + * July, 2002 + * + */ + +#include +#include +#include + +static int ebt_filter_mark(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, const void *data, + unsigned int datalen) +{ + struct ebt_mark_m_info *info = (struct ebt_mark_m_info *) data; + + if (info->bitmask & EBT_MARK_OR) + return !(!!(skb->nfmark & info->mask) ^ info->invert); + return !(((skb->nfmark & info->mask) == info->mark) ^ info->invert); +} + +static int ebt_mark_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_mark_m_info *info = (struct ebt_mark_m_info *) data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_mark_m_info))) + return -EINVAL; + if (info->bitmask & ~EBT_MARK_MASK) + return -EINVAL; + if ((info->bitmask & EBT_MARK_OR) && (info->bitmask & EBT_MARK_AND)) + return -EINVAL; + if (!info->bitmask) + return -EINVAL; + return 0; +} + +static struct ebt_match filter_mark = +{ + .name = EBT_MARK_MATCH, + .match = ebt_filter_mark, + .check = ebt_mark_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_mark); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_mark); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_pkttype.c b/net/bridge/netfilter/ebt_pkttype.c new file mode 100644 index 000000000000..ecd3b42b19b0 --- /dev/null +++ b/net/bridge/netfilter/ebt_pkttype.c @@ -0,0 +1,59 @@ +/* + * ebt_pkttype + * + * Authors: + * Bart De Schuymer + * + * April, 2003 + * + */ + +#include +#include +#include + +static int ebt_filter_pkttype(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *data, + unsigned int datalen) +{ + struct ebt_pkttype_info *info = (struct ebt_pkttype_info *)data; + + return (skb->pkt_type != info->pkt_type) ^ info->invert; +} + +static int ebt_pkttype_check(const char *tablename, unsigned int hookmask, + const struct 
ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_pkttype_info *info = (struct ebt_pkttype_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_pkttype_info))) + return -EINVAL; + if (info->invert != 0 && info->invert != 1) + return -EINVAL; + /* Allow any pkt_type value */ + return 0; +} + +static struct ebt_match filter_pkttype = +{ + .name = EBT_PKTTYPE_MATCH, + .match = ebt_filter_pkttype, + .check = ebt_pkttype_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_pkttype); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_pkttype); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_redirect.c b/net/bridge/netfilter/ebt_redirect.c new file mode 100644 index 000000000000..1538b4386662 --- /dev/null +++ b/net/bridge/netfilter/ebt_redirect.c @@ -0,0 +1,81 @@ +/* + * ebt_redirect + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ + +#include +#include +#include +#include +#include "../br_private.h" + +static int ebt_target_redirect(struct sk_buff **pskb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_redirect_info *info = (struct ebt_redirect_info *)data; + + if (skb_shared(*pskb) || skb_cloned(*pskb)) { + struct sk_buff *nskb; + + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return NF_DROP; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + if (hooknr != NF_BR_BROUTING) + memcpy(eth_hdr(*pskb)->h_dest, + in->br_port->br->dev->dev_addr, ETH_ALEN); + else + memcpy(eth_hdr(*pskb)->h_dest, in->dev_addr, ETH_ALEN); + (*pskb)->pkt_type = PACKET_HOST; + return info->target; +} + +static int ebt_target_redirect_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_redirect_info *info = (struct ebt_redirect_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_redirect_info))) + return -EINVAL; + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + CLEAR_BASE_CHAIN_BIT; + if ( (strcmp(tablename, "nat") || hookmask & ~(1 << NF_BR_PRE_ROUTING)) && + (strcmp(tablename, "broute") || hookmask & ~(1 << NF_BR_BROUTING)) ) + return -EINVAL; + if (INVALID_TARGET) + return -EINVAL; + return 0; +} + +static struct ebt_target redirect_target = +{ + .name = EBT_REDIRECT_TARGET, + .target = ebt_target_redirect, + .check = ebt_target_redirect_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_target(&redirect_target); +} + +static void __exit fini(void) +{ + ebt_unregister_target(&redirect_target); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_snat.c b/net/bridge/netfilter/ebt_snat.c new file mode 100644 index 000000000000..1529bdcb9a48 --- /dev/null +++ b/net/bridge/netfilter/ebt_snat.c @@ -0,0 +1,76 @@ +/* + * ebt_snat + * + * Authors: + * Bart De Schuymer + * + * June, 2002 + * + */ + +#include +#include +#include +#include + +static int ebt_target_snat(struct sk_buff **pskb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_nat_info *info = (struct ebt_nat_info *) data; + + if (skb_shared(*pskb) || skb_cloned(*pskb)) { + struct sk_buff *nskb; + + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return NF_DROP; + if 
((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + memcpy(eth_hdr(*pskb)->h_source, info->mac, ETH_ALEN); + return info->target; +} + +static int ebt_target_snat_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_nat_info *info = (struct ebt_nat_info *) data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_nat_info))) + return -EINVAL; + if (BASE_CHAIN && info->target == EBT_RETURN) + return -EINVAL; + CLEAR_BASE_CHAIN_BIT; + if (strcmp(tablename, "nat")) + return -EINVAL; + if (hookmask & ~(1 << NF_BR_POST_ROUTING)) + return -EINVAL; + if (INVALID_TARGET) + return -EINVAL; + return 0; +} + +static struct ebt_target snat = +{ + .name = EBT_SNAT_TARGET, + .target = ebt_target_snat, + .check = ebt_target_snat_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_target(&snat); +} + +static void __exit fini(void) +{ + ebt_unregister_target(&snat); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c new file mode 100644 index 000000000000..f8a8cdec16ee --- /dev/null +++ b/net/bridge/netfilter/ebt_stp.c @@ -0,0 +1,194 @@ +/* + * ebt_stp + * + * Authors: + * Bart De Schuymer + * Stephen Hemminger + * + * July, 2003 + */ + +#include +#include +#include + +#define BPDU_TYPE_CONFIG 0 +#define BPDU_TYPE_TCN 0x80 + +struct stp_header { + uint8_t dsap; + uint8_t ssap; + uint8_t ctrl; + uint8_t pid; + uint8_t vers; + uint8_t type; +}; + +struct stp_config_pdu { + uint8_t flags; + uint8_t root[8]; + uint8_t root_cost[4]; + uint8_t sender[8]; + uint8_t port[2]; + uint8_t msg_age[2]; + uint8_t max_age[2]; + uint8_t hello_time[2]; + uint8_t forward_delay[2]; +}; + +#define NR16(p) (p[0] << 8 | p[1]) +#define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]) + +static int ebt_filter_config(struct ebt_stp_info *info, + struct stp_config_pdu *stpc) +{ + struct ebt_stp_config_info *c; + uint16_t v16; + uint32_t v32; + int verdict, i; + + c = &info->config; + if ((info->bitmask & EBT_STP_FLAGS) && + FWINV(c->flags != stpc->flags, EBT_STP_FLAGS)) + return EBT_NOMATCH; + if (info->bitmask & EBT_STP_ROOTPRIO) { + v16 = NR16(stpc->root); + if (FWINV(v16 < c->root_priol || + v16 > c->root_priou, EBT_STP_ROOTPRIO)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_ROOTADDR) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (stpc->root[2+i] ^ c->root_addr[i]) & + c->root_addrmsk[i]; + if (FWINV(verdict != 0, EBT_STP_ROOTADDR)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_ROOTCOST) { + v32 = NR32(stpc->root_cost); + if (FWINV(v32 < c->root_costl || + v32 > c->root_costu, EBT_STP_ROOTCOST)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_SENDERPRIO) { + v16 = NR16(stpc->sender); + if (FWINV(v16 < c->sender_priol || + v16 > c->sender_priou, EBT_STP_SENDERPRIO)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_SENDERADDR) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (stpc->sender[2+i] ^ c->sender_addr[i]) & + c->sender_addrmsk[i]; + if (FWINV(verdict != 0, EBT_STP_SENDERADDR)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_PORT) { + v16 = NR16(stpc->port); + if (FWINV(v16 < c->portl || + v16 > c->portu, EBT_STP_PORT)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_MSGAGE) { + v16 = NR16(stpc->msg_age); + if (FWINV(v16 < c->msg_agel || + v16 > c->msg_ageu, EBT_STP_MSGAGE)) + return EBT_NOMATCH; + } + 
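+	/* The remaining BPDU timer fields (max age, hello time, forward
+	 * delay) are range-checked against [lower, upper] bounds in the
+	 * same way as message age. */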
if (info->bitmask & EBT_STP_MAXAGE) { + v16 = NR16(stpc->max_age); + if (FWINV(v16 < c->max_agel || + v16 > c->max_ageu, EBT_STP_MAXAGE)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_HELLOTIME) { + v16 = NR16(stpc->hello_time); + if (FWINV(v16 < c->hello_timel || + v16 > c->hello_timeu, EBT_STP_HELLOTIME)) + return EBT_NOMATCH; + } + if (info->bitmask & EBT_STP_FWDD) { + v16 = NR16(stpc->forward_delay); + if (FWINV(v16 < c->forward_delayl || + v16 > c->forward_delayu, EBT_STP_FWDD)) + return EBT_NOMATCH; + } + return EBT_MATCH; +} + +static int ebt_filter_stp(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *data, unsigned int datalen) +{ + struct ebt_stp_info *info = (struct ebt_stp_info *)data; + struct stp_header _stph, *sp; + uint8_t header[6] = {0x42, 0x42, 0x03, 0x00, 0x00, 0x00}; + + sp = skb_header_pointer(skb, 0, sizeof(_stph), &_stph); + if (sp == NULL) + return EBT_NOMATCH; + + /* The stp code only considers these */ + if (memcmp(sp, header, sizeof(header))) + return EBT_NOMATCH; + + if (info->bitmask & EBT_STP_TYPE + && FWINV(info->type != sp->type, EBT_STP_TYPE)) + return EBT_NOMATCH; + + if (sp->type == BPDU_TYPE_CONFIG && + info->bitmask & EBT_STP_CONFIG_MASK) { + struct stp_config_pdu _stpc, *st; + + st = skb_header_pointer(skb, sizeof(_stph), + sizeof(_stpc), &_stpc); + if (st == NULL) + return EBT_NOMATCH; + return ebt_filter_config(info, st); + } + return EBT_MATCH; +} + +static int ebt_stp_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_stp_info *info = (struct ebt_stp_info *)data; + int len = EBT_ALIGN(sizeof(struct ebt_stp_info)); + uint8_t bridge_ula[6] = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; + uint8_t msk[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + + if (info->bitmask & ~EBT_STP_MASK || info->invflags & ~EBT_STP_MASK || + !(info->bitmask & EBT_STP_MASK)) + return -EINVAL; + if (datalen != len) + return -EINVAL; + /* Make sure the match only receives stp frames */ + if (memcmp(e->destmac, bridge_ula, ETH_ALEN) || + memcmp(e->destmsk, msk, ETH_ALEN) || !(e->bitmask & EBT_DESTMAC)) + return -EINVAL; + + return 0; +} + +static struct ebt_match filter_stp = +{ + .name = EBT_STP_MATCH, + .match = ebt_filter_stp, + .check = ebt_stp_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ebt_register_match(&filter_stp); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_stp); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c new file mode 100644 index 000000000000..01af4fcef26d --- /dev/null +++ b/net/bridge/netfilter/ebt_ulog.c @@ -0,0 +1,295 @@ +/* + * netfilter module for userspace bridged Ethernet frames logging daemons + * + * Authors: + * Bart De Schuymer + * + * November, 2004 + * + * Based on ipt_ULOG.c, which is + * (C) 2000-2002 by Harald Welte + * + * This module accepts two parameters: + * + * nlbufsiz: + * The parameter specifies how big the buffer for each netlink multicast + * group is. e.g. If you say nlbufsiz=8192, up to eight kb of packets will + * get accumulated in the kernel until they are sent to userspace. It is + * NOT possible to allocate more than 128kB, and it is strongly discouraged, + * because atomically allocating 128kB inside the network rx softirq is not + * reliable. 
Please also keep in mind that this buffer size is allocated for + * each nlgroup you are using, so the total kernel memory usage increases + * by that factor. + * + * flushtimeout: + * Specify, after how many hundredths of a second the queue should be + * flushed even if it is not full yet. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../br_private.h" + +#define PRINTR(format, args...) do { if (net_ratelimit()) \ + printk(format , ## args); } while (0) + +static unsigned int nlbufsiz = 4096; +module_param(nlbufsiz, uint, 0600); +MODULE_PARM_DESC(nlbufsiz, "netlink buffer size (number of bytes) " + "(defaults to 4096)"); + +static unsigned int flushtimeout = 10; +module_param(flushtimeout, uint, 0600); +MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths ofa second) " + "(defaults to 10)"); + +typedef struct { + unsigned int qlen; /* number of nlmsgs' in the skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct sk_buff *skb; /* the pre-allocated skb */ + struct timer_list timer; /* the timer function */ + spinlock_t lock; /* the per-queue lock */ +} ebt_ulog_buff_t; + +static ebt_ulog_buff_t ulog_buffers[EBT_ULOG_MAXNLGROUPS]; +static struct sock *ebtulognl; + +/* send one ulog_buff_t to userspace */ +static void ulog_send(unsigned int nlgroup) +{ + ebt_ulog_buff_t *ub = &ulog_buffers[nlgroup]; + + if (timer_pending(&ub->timer)) + del_timer(&ub->timer); + + /* last nlmsg needs NLMSG_DONE */ + if (ub->qlen > 1) + ub->lastnlh->nlmsg_type = NLMSG_DONE; + + NETLINK_CB(ub->skb).dst_groups = 1 << nlgroup; + netlink_broadcast(ebtulognl, ub->skb, 0, 1 << nlgroup, GFP_ATOMIC); + + ub->qlen = 0; + ub->skb = NULL; +} + +/* timer function to flush queue in flushtimeout time */ +static void ulog_timer(unsigned long data) +{ + spin_lock_bh(&ulog_buffers[data].lock); + if (ulog_buffers[data].skb) + ulog_send(data); + spin_unlock_bh(&ulog_buffers[data].lock); +} + +static struct sk_buff *ulog_alloc_skb(unsigned int size) +{ + struct sk_buff *skb; + + skb = alloc_skb(nlbufsiz, GFP_ATOMIC); + if (!skb) { + PRINTR(KERN_ERR "ebt_ulog: can't alloc whole buffer " + "of size %ub!\n", nlbufsiz); + if (size < nlbufsiz) { + /* try to allocate only as much as we need for + * current packet */ + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + PRINTR(KERN_ERR "ebt_ulog: can't even allocate " + "buffer of size %ub\n", size); + } + } + + return skb; +} + +static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + ebt_ulog_packet_msg_t *pm; + size_t size, copy_len; + struct nlmsghdr *nlh; + struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; + unsigned int group = uloginfo->nlgroup; + ebt_ulog_buff_t *ub = &ulog_buffers[group]; + spinlock_t *lock = &ub->lock; + + if ((uloginfo->cprange == 0) || + (uloginfo->cprange > skb->len + ETH_HLEN)) + copy_len = skb->len + ETH_HLEN; + else + copy_len = uloginfo->cprange; + + size = NLMSG_SPACE(sizeof(*pm) + copy_len); + if (size > nlbufsiz) { + PRINTR("ebt_ulog: Size %Zd needed, but nlbufsiz=%d\n", + size, nlbufsiz); + return; + } + + spin_lock_bh(lock); + + if (!ub->skb) { + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } else if (size > skb_tailroom(ub->skb)) { + ulog_send(group); + + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } + + nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, 
0, + size - NLMSG_ALIGN(sizeof(*nlh))); + ub->qlen++; + + pm = NLMSG_DATA(nlh); + + /* Fill in the ulog data */ + pm->version = EBT_ULOG_VERSION; + do_gettimeofday(&pm->stamp); + if (ub->qlen == 1) + ub->skb->stamp = pm->stamp; + pm->data_len = copy_len; + pm->mark = skb->nfmark; + pm->hook = hooknr; + if (uloginfo->prefix != NULL) + strcpy(pm->prefix, uloginfo->prefix); + else + *(pm->prefix) = '\0'; + + if (in) { + strcpy(pm->physindev, in->name); + /* If in isn't a bridge, then physindev==indev */ + if (in->br_port) + strcpy(pm->indev, in->br_port->br->dev->name); + else + strcpy(pm->indev, in->name); + } else + pm->indev[0] = pm->physindev[0] = '\0'; + + if (out) { + /* If out exists, then out is a bridge port */ + strcpy(pm->physoutdev, out->name); + strcpy(pm->outdev, out->br_port->br->dev->name); + } else + pm->outdev[0] = pm->physoutdev[0] = '\0'; + + if (skb_copy_bits(skb, -ETH_HLEN, pm->data, copy_len) < 0) + BUG(); + + if (ub->qlen > 1) + ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; + + ub->lastnlh = nlh; + + if (ub->qlen >= uloginfo->qthreshold) + ulog_send(group); + else if (!timer_pending(&ub->timer)) { + ub->timer.expires = jiffies + flushtimeout * HZ / 100; + add_timer(&ub->timer); + } + +unlock: + spin_unlock_bh(lock); + + return; + +nlmsg_failure: + printk(KERN_CRIT "ebt_ulog: error during NLMSG_PUT. This should " + "not happen, please report to author.\n"); + goto unlock; +alloc_failure: + goto unlock; +} + +static int ebt_ulog_check(const char *tablename, unsigned int hookmask, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; + + if (datalen != EBT_ALIGN(sizeof(struct ebt_ulog_info)) || + uloginfo->nlgroup > 31) + return -EINVAL; + + uloginfo->prefix[EBT_ULOG_PREFIX_LEN - 1] = '\0'; + + if (uloginfo->qthreshold > EBT_ULOG_MAX_QLEN) + uloginfo->qthreshold = EBT_ULOG_MAX_QLEN; + + return 0; +} + +static struct ebt_watcher ulog = { + .name = EBT_ULOG_WATCHER, + .watcher = ebt_ulog, + .check = ebt_ulog_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int i, ret = 0; + + if (nlbufsiz >= 128*1024) { + printk(KERN_NOTICE "ebt_ulog: Netlink buffer has to be <= 128kB," + " please try a smaller nlbufsiz parameter.\n"); + return -EINVAL; + } + + /* initialize ulog_buffers */ + for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { + init_timer(&ulog_buffers[i].timer); + ulog_buffers[i].timer.function = ulog_timer; + ulog_buffers[i].timer.data = i; + spin_lock_init(&ulog_buffers[i].lock); + } + + ebtulognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + if (!ebtulognl) + ret = -ENOMEM; + else if ((ret = ebt_register_watcher(&ulog))) + sock_release(ebtulognl->sk_socket); + + return ret; +} + +static void __exit fini(void) +{ + ebt_ulog_buff_t *ub; + int i; + + ebt_unregister_watcher(&ulog); + for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { + ub = &ulog_buffers[i]; + if (timer_pending(&ub->timer)) + del_timer(&ub->timer); + spin_lock_bh(&ub->lock); + if (ub->skb) { + kfree_skb(ub->skb); + ub->skb = NULL; + } + spin_unlock_bh(&ub->lock); + } + sock_release(ebtulognl->sk_socket); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("ebtables userspace logging module for bridged Ethernet" + " frames"); diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c new file mode 100644 index 000000000000..db60d734908b --- /dev/null +++ b/net/bridge/netfilter/ebt_vlan.c @@ -0,0 +1,195 @@ +/* + * Description: EBTables 
802.1Q match extension kernelspace module. + * Authors: Nick Fedchik + * Bart De Schuymer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include + +static int debug; +#define MODULE_VERS "0.6" + +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "debug=1 is turn on debug messages"); +MODULE_AUTHOR("Nick Fedchik "); +MODULE_DESCRIPTION("802.1Q match module (ebtables extension), v" + MODULE_VERS); +MODULE_LICENSE("GPL"); + + +#define DEBUG_MSG(args...) if (debug) printk (KERN_DEBUG "ebt_vlan: " args) +#define INV_FLAG(_inv_flag_) (info->invflags & _inv_flag_) ? "!" : "" +#define GET_BITMASK(_BIT_MASK_) info->bitmask & _BIT_MASK_ +#define SET_BITMASK(_BIT_MASK_) info->bitmask |= _BIT_MASK_ +#define EXIT_ON_MISMATCH(_MATCH_,_MASK_) {if (!((info->_MATCH_ == _MATCH_)^!!(info->invflags & _MASK_))) return EBT_NOMATCH;} + +static int +ebt_filter_vlan(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_vlan_info *info = (struct ebt_vlan_info *) data; + struct vlan_hdr _frame, *fp; + + unsigned short TCI; /* Whole TCI, given from parsed frame */ + unsigned short id; /* VLAN ID, given from frame TCI */ + unsigned char prio; /* user_priority, given from frame TCI */ + /* VLAN encapsulated Type/Length field, given from orig frame */ + unsigned short encap; + + fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); + if (fp == NULL) + return EBT_NOMATCH; + + /* Tag Control Information (TCI) consists of the following elements: + * - User_priority. The user_priority field is three bits in length, + * interpreted as a binary number. + * - Canonical Format Indicator (CFI). The Canonical Format Indicator + * (CFI) is a single bit flag value. Currently ignored. + * - VLAN Identifier (VID). The VID is encoded as + * an unsigned binary number. 
*/ + TCI = ntohs(fp->h_vlan_TCI); + id = TCI & VLAN_VID_MASK; + prio = (TCI >> 13) & 0x7; + encap = fp->h_vlan_encapsulated_proto; + + /* Checking VLAN Identifier (VID) */ + if (GET_BITMASK(EBT_VLAN_ID)) + EXIT_ON_MISMATCH(id, EBT_VLAN_ID); + + /* Checking user_priority */ + if (GET_BITMASK(EBT_VLAN_PRIO)) + EXIT_ON_MISMATCH(prio, EBT_VLAN_PRIO); + + /* Checking Encapsulated Proto (Length/Type) field */ + if (GET_BITMASK(EBT_VLAN_ENCAP)) + EXIT_ON_MISMATCH(encap, EBT_VLAN_ENCAP); + + return EBT_MATCH; +} + +static int +ebt_check_vlan(const char *tablename, + unsigned int hooknr, + const struct ebt_entry *e, void *data, unsigned int datalen) +{ + struct ebt_vlan_info *info = (struct ebt_vlan_info *) data; + + /* Parameters buffer overflow check */ + if (datalen != EBT_ALIGN(sizeof(struct ebt_vlan_info))) { + DEBUG_MSG + ("passed size %d is not eq to ebt_vlan_info (%Zd)\n", + datalen, sizeof(struct ebt_vlan_info)); + return -EINVAL; + } + + /* Is it 802.1Q frame checked? */ + if (e->ethproto != htons(ETH_P_8021Q)) { + DEBUG_MSG + ("passed entry proto %2.4X is not 802.1Q (8100)\n", + (unsigned short) ntohs(e->ethproto)); + return -EINVAL; + } + + /* Check for bitmask range + * True if even one bit is out of mask */ + if (info->bitmask & ~EBT_VLAN_MASK) { + DEBUG_MSG("bitmask %2X is out of mask (%2X)\n", + info->bitmask, EBT_VLAN_MASK); + return -EINVAL; + } + + /* Check for inversion flags range */ + if (info->invflags & ~EBT_VLAN_MASK) { + DEBUG_MSG("inversion flags %2X is out of mask (%2X)\n", + info->invflags, EBT_VLAN_MASK); + return -EINVAL; + } + + /* Reserved VLAN ID (VID) values + * ----------------------------- + * 0 - The null VLAN ID. + * 1 - The default Port VID (PVID) + * 0x0FFF - Reserved for implementation use. + * if_vlan.h: VLAN_GROUP_ARRAY_LEN 4096. */ + if (GET_BITMASK(EBT_VLAN_ID)) { + if (!!info->id) { /* if id!=0 => check vid range */ + if (info->id > VLAN_GROUP_ARRAY_LEN) { + DEBUG_MSG + ("id %d is out of range (1-4096)\n", + info->id); + return -EINVAL; + } + /* Note: This is valid VLAN-tagged frame point. + * Any value of user_priority are acceptable, + * but should be ignored according to 802.1Q Std. + * So we just drop the prio flag. */ + info->bitmask &= ~EBT_VLAN_PRIO; + } + /* Else, id=0 (null VLAN ID) => user_priority range (any?) */ + } + + if (GET_BITMASK(EBT_VLAN_PRIO)) { + if ((unsigned char) info->prio > 7) { + DEBUG_MSG("prio %d is out of range (0-7)\n", + info->prio); + return -EINVAL; + } + } + /* Check for encapsulated proto range - it is possible to be + * any value for u_short range. + * if_ether.h: ETH_ZLEN 60 - Min. 
octets in frame sans FCS */ + if (GET_BITMASK(EBT_VLAN_ENCAP)) { + if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) { + DEBUG_MSG + ("encap frame length %d is less than minimal\n", + ntohs(info->encap)); + return -EINVAL; + } + } + + return 0; +} + +static struct ebt_match filter_vlan = { + .name = EBT_VLAN_MATCH, + .match = ebt_filter_vlan, + .check = ebt_check_vlan, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + DEBUG_MSG("ebtables 802.1Q extension module v" + MODULE_VERS "\n"); + DEBUG_MSG("module debug=%d\n", !!debug); + return ebt_register_match(&filter_vlan); +} + +static void __exit fini(void) +{ + ebt_unregister_match(&filter_vlan); +} + +module_init(init); +module_exit(fini); diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c new file mode 100644 index 000000000000..1767c94cd3de --- /dev/null +++ b/net/bridge/netfilter/ebtable_broute.c @@ -0,0 +1,86 @@ +/* + * ebtable_broute + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + * This table lets you choose between routing and bridging for frames + * entering on a bridge enslaved nic. This table is traversed before any + * other ebtables table. See net/bridge/br_input.c. + */ + +#include +#include +#include + +/* EBT_ACCEPT means the frame will be bridged + * EBT_DROP means the frame will be routed + */ +static struct ebt_entries initial_chain = { + .name = "BROUTING", + .policy = EBT_ACCEPT, +}; + +static struct ebt_replace initial_table = +{ + .name = "broute", + .valid_hooks = 1 << NF_BR_BROUTING, + .entries_size = sizeof(struct ebt_entries), + .hook_entry = { + [NF_BR_BROUTING] = &initial_chain, + }, + .entries = (char *)&initial_chain, +}; + +static int check(const struct ebt_table_info *info, unsigned int valid_hooks) +{ + if (valid_hooks & ~(1 << NF_BR_BROUTING)) + return -EINVAL; + return 0; +} + +static struct ebt_table broute_table = +{ + .name = "broute", + .table = &initial_table, + .valid_hooks = 1 << NF_BR_BROUTING, + .lock = RW_LOCK_UNLOCKED, + .check = check, + .me = THIS_MODULE, +}; + +static int ebt_broute(struct sk_buff **pskb) +{ + int ret; + + ret = ebt_do_table(NF_BR_BROUTING, pskb, (*pskb)->dev, NULL, + &broute_table); + if (ret == NF_DROP) + return 1; /* route it */ + return 0; /* bridge it */ +} + +static int __init init(void) +{ + int ret; + + ret = ebt_register_table(&broute_table); + if (ret < 0) + return ret; + /* see br_input.c */ + br_should_route_hook = ebt_broute; + return ret; +} + +static void __exit fini(void) +{ + br_should_route_hook = NULL; + synchronize_net(); + ebt_unregister_table(&broute_table); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c new file mode 100644 index 000000000000..c18666e0392b --- /dev/null +++ b/net/bridge/netfilter/ebtable_filter.c @@ -0,0 +1,123 @@ +/* + * ebtable_filter + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ + +#include +#include + +#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \ + (1 << NF_BR_LOCAL_OUT)) + +static struct ebt_entries initial_chains[] = +{ + { + .name = "INPUT", + .policy = EBT_ACCEPT, + }, + { + .name = "FORWARD", + .policy = EBT_ACCEPT, + }, + { + .name = "OUTPUT", + .policy = EBT_ACCEPT, + }, +}; + +static struct ebt_replace initial_table = +{ + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .entries_size = 3 * sizeof(struct ebt_entries), + .hook_entry = { + [NF_BR_LOCAL_IN] = &initial_chains[0], + 
[NF_BR_FORWARD] = &initial_chains[1], + [NF_BR_LOCAL_OUT] = &initial_chains[2], + }, + .entries = (char *)initial_chains, +}; + +static int check(const struct ebt_table_info *info, unsigned int valid_hooks) +{ + if (valid_hooks & ~FILTER_VALID_HOOKS) + return -EINVAL; + return 0; +} + +static struct ebt_table frame_filter = +{ + .name = "filter", + .table = &initial_table, + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .check = check, + .me = THIS_MODULE, +}; + +static unsigned int +ebt_hook (unsigned int hook, struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, int (*okfn)(struct sk_buff *)) +{ + return ebt_do_table(hook, pskb, in, out, &frame_filter); +} + +static struct nf_hook_ops ebt_ops_filter[] = { + { + .hook = ebt_hook, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_LOCAL_IN, + .priority = NF_BR_PRI_FILTER_BRIDGED, + }, + { + .hook = ebt_hook, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_FORWARD, + .priority = NF_BR_PRI_FILTER_BRIDGED, + }, + { + .hook = ebt_hook, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_LOCAL_OUT, + .priority = NF_BR_PRI_FILTER_OTHER, + }, +}; + +static int __init init(void) +{ + int i, j, ret; + + ret = ebt_register_table(&frame_filter); + if (ret < 0) + return ret; + for (i = 0; i < ARRAY_SIZE(ebt_ops_filter); i++) + if ((ret = nf_register_hook(&ebt_ops_filter[i])) < 0) + goto cleanup; + return ret; +cleanup: + for (j = 0; j < i; j++) + nf_unregister_hook(&ebt_ops_filter[j]); + ebt_unregister_table(&frame_filter); + return ret; +} + +static void __exit fini(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ebt_ops_filter); i++) + nf_unregister_hook(&ebt_ops_filter[i]); + ebt_unregister_table(&frame_filter); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c new file mode 100644 index 000000000000..828cac2cc4a3 --- /dev/null +++ b/net/bridge/netfilter/ebtable_nat.c @@ -0,0 +1,130 @@ +/* + * ebtable_nat + * + * Authors: + * Bart De Schuymer + * + * April, 2002 + * + */ + +#include +#include + +#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \ + (1 << NF_BR_POST_ROUTING)) + +static struct ebt_entries initial_chains[] = +{ + { + .name = "PREROUTING", + .policy = EBT_ACCEPT, + }, + { + .name = "OUTPUT", + .policy = EBT_ACCEPT, + }, + { + .name = "POSTROUTING", + .policy = EBT_ACCEPT, + } +}; + +static struct ebt_replace initial_table = +{ + .name = "nat", + .valid_hooks = NAT_VALID_HOOKS, + .entries_size = 3 * sizeof(struct ebt_entries), + .hook_entry = { + [NF_BR_PRE_ROUTING] = &initial_chains[0], + [NF_BR_LOCAL_OUT] = &initial_chains[1], + [NF_BR_POST_ROUTING] = &initial_chains[2], + }, + .entries = (char *)initial_chains, +}; + +static int check(const struct ebt_table_info *info, unsigned int valid_hooks) +{ + if (valid_hooks & ~NAT_VALID_HOOKS) + return -EINVAL; + return 0; +} + +static struct ebt_table frame_nat = +{ + .name = "nat", + .table = &initial_table, + .valid_hooks = NAT_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .check = check, + .me = THIS_MODULE, +}; + +static unsigned int +ebt_nat_dst(unsigned int hook, struct sk_buff **pskb, const struct net_device *in + , const struct net_device *out, int (*okfn)(struct sk_buff *)) +{ + return ebt_do_table(hook, pskb, in, out, &frame_nat); +} + +static unsigned int +ebt_nat_src(unsigned int hook, struct sk_buff **pskb, const struct net_device *in + , const struct net_device *out, 
int (*okfn)(struct sk_buff *)) +{ + return ebt_do_table(hook, pskb, in, out, &frame_nat); +} + +static struct nf_hook_ops ebt_ops_nat[] = { + { + .hook = ebt_nat_dst, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_LOCAL_OUT, + .priority = NF_BR_PRI_NAT_DST_OTHER, + }, + { + .hook = ebt_nat_src, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_POST_ROUTING, + .priority = NF_BR_PRI_NAT_SRC, + }, + { + .hook = ebt_nat_dst, + .owner = THIS_MODULE, + .pf = PF_BRIDGE, + .hooknum = NF_BR_PRE_ROUTING, + .priority = NF_BR_PRI_NAT_DST_BRIDGED, + }, +}; + +static int __init init(void) +{ + int i, ret, j; + + ret = ebt_register_table(&frame_nat); + if (ret < 0) + return ret; + for (i = 0; i < ARRAY_SIZE(ebt_ops_nat); i++) + if ((ret = nf_register_hook(&ebt_ops_nat[i])) < 0) + goto cleanup; + return ret; +cleanup: + for (j = 0; j < i; j++) + nf_unregister_hook(&ebt_ops_nat[j]); + ebt_unregister_table(&frame_nat); + return ret; +} + +static void __exit fini(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(ebt_ops_nat); i++) + nf_unregister_hook(&ebt_ops_nat[i]); + ebt_unregister_table(&frame_nat); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c new file mode 100644 index 000000000000..18ebc664769b --- /dev/null +++ b/net/bridge/netfilter/ebtables.c @@ -0,0 +1,1507 @@ +/* + * ebtables + * + * Author: + * Bart De Schuymer + * + * ebtables.c,v 2.0, July, 2002 + * + * This code is stongly inspired on the iptables code which is + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* used for print_string */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +/* needed for logical [in,out]-dev filtering */ +#include "../br_private.h" + +/* list_named_find */ +#define ASSERT_READ_LOCK(x) +#define ASSERT_WRITE_LOCK(x) +#include + +#if 0 +/* use this for remote debugging + * Copyright (C) 1998 by Ori Pomerantz + * Print the string to the appropriate tty, the one + * the current task uses + */ +static void print_string(char *str) +{ + struct tty_struct *my_tty; + + /* The tty for the current task */ + my_tty = current->signal->tty; + if (my_tty != NULL) { + my_tty->driver->write(my_tty, 0, str, strlen(str)); + my_tty->driver->write(my_tty, 0, "\015\012", 2); + } +} + +#define BUGPRINT(args) print_string(args); +#else +#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\ + "report to author: "format, ## args) +/* #define BUGPRINT(format, args...) */ +#endif +#define MEMPRINT(format, args...) printk("kernel msg: ebtables "\ + ": out of memory: "format, ## args) +/* #define MEMPRINT(format, args...) 
*/ + + + +/* + * Each cpu has its own set of counters, so there is no need for write_lock in + * the softirq + * For reading or updating the counters, the user context needs to + * get a write_lock + */ + +/* The size of each set of counters is altered to get cache alignment */ +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) +#define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter))) +#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \ + COUNTER_OFFSET(n) * cpu)) + + + +static DECLARE_MUTEX(ebt_mutex); +static LIST_HEAD(ebt_tables); +static LIST_HEAD(ebt_targets); +static LIST_HEAD(ebt_matches); +static LIST_HEAD(ebt_watchers); + +static struct ebt_target ebt_standard_target = +{ {NULL, NULL}, EBT_STANDARD_TARGET, NULL, NULL, NULL, NULL}; + +static inline int ebt_do_watcher (struct ebt_entry_watcher *w, + const struct sk_buff *skb, unsigned int hooknr, const struct net_device *in, + const struct net_device *out) +{ + w->u.watcher->watcher(skb, hooknr, in, out, w->data, + w->watcher_size); + /* watchers don't give a verdict */ + return 0; +} + +static inline int ebt_do_match (struct ebt_entry_match *m, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out) +{ + return m->u.match->match(skb, in, out, m->data, + m->match_size); +} + +static inline int ebt_dev_check(char *entry, const struct net_device *device) +{ + int i = 0; + char *devname = device->name; + + if (*entry == '\0') + return 0; + if (!device) + return 1; + /* 1 is the wildcard token */ + while (entry[i] != '\0' && entry[i] != 1 && entry[i] == devname[i]) + i++; + return (devname[i] != entry[i] && entry[i] != 1); +} + +#define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg)) +/* process standard matches */ +static inline int ebt_basic_match(struct ebt_entry *e, struct ethhdr *h, + const struct net_device *in, const struct net_device *out) +{ + int verdict, i; + + if (e->bitmask & EBT_802_3) { + if (FWINV2(ntohs(h->h_proto) >= 1536, EBT_IPROTO)) + return 1; + } else if (!(e->bitmask & EBT_NOPROTO) && + FWINV2(e->ethproto != h->h_proto, EBT_IPROTO)) + return 1; + + if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN)) + return 1; + if (FWINV2(ebt_dev_check(e->out, out), EBT_IOUT)) + return 1; + if ((!in || !in->br_port) ? 0 : FWINV2(ebt_dev_check( + e->logical_in, in->br_port->br->dev), EBT_ILOGICALIN)) + return 1; + if ((!out || !out->br_port) ? 
0 : FWINV2(ebt_dev_check( + e->logical_out, out->br_port->br->dev), EBT_ILOGICALOUT)) + return 1; + + if (e->bitmask & EBT_SOURCEMAC) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (h->h_source[i] ^ e->sourcemac[i]) & + e->sourcemsk[i]; + if (FWINV2(verdict != 0, EBT_ISOURCE) ) + return 1; + } + if (e->bitmask & EBT_DESTMAC) { + verdict = 0; + for (i = 0; i < 6; i++) + verdict |= (h->h_dest[i] ^ e->destmac[i]) & + e->destmsk[i]; + if (FWINV2(verdict != 0, EBT_IDEST) ) + return 1; + } + return 0; +} + +/* Do some firewalling */ +unsigned int ebt_do_table (unsigned int hook, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + struct ebt_table *table) +{ + int i, nentries; + struct ebt_entry *point; + struct ebt_counter *counter_base, *cb_base; + struct ebt_entry_target *t; + int verdict, sp = 0; + struct ebt_chainstack *cs; + struct ebt_entries *chaininfo; + char *base; + struct ebt_table_info *private; + + read_lock_bh(&table->lock); + private = table->private; + cb_base = COUNTER_BASE(private->counters, private->nentries, + smp_processor_id()); + if (private->chainstack) + cs = private->chainstack[smp_processor_id()]; + else + cs = NULL; + chaininfo = private->hook_entry[hook]; + nentries = private->hook_entry[hook]->nentries; + point = (struct ebt_entry *)(private->hook_entry[hook]->data); + counter_base = cb_base + private->hook_entry[hook]->counter_offset; + /* base for chain jumps */ + base = private->entries; + i = 0; + while (i < nentries) { + if (ebt_basic_match(point, eth_hdr(*pskb), in, out)) + goto letscontinue; + + if (EBT_MATCH_ITERATE(point, ebt_do_match, *pskb, in, out) != 0) + goto letscontinue; + + /* increase counter */ + (*(counter_base + i)).pcnt++; + (*(counter_base + i)).bcnt+=(**pskb).len; + + /* these should only watch: not modify, nor tell us + what to do with the packet */ + EBT_WATCHER_ITERATE(point, ebt_do_watcher, *pskb, hook, in, + out); + + t = (struct ebt_entry_target *) + (((char *)point) + point->target_offset); + /* standard target */ + if (!t->u.target->target) + verdict = ((struct ebt_standard_target *)t)->verdict; + else + verdict = t->u.target->target(pskb, hook, + in, out, t->data, t->target_size); + if (verdict == EBT_ACCEPT) { + read_unlock_bh(&table->lock); + return NF_ACCEPT; + } + if (verdict == EBT_DROP) { + read_unlock_bh(&table->lock); + return NF_DROP; + } + if (verdict == EBT_RETURN) { +letsreturn: +#ifdef CONFIG_NETFILTER_DEBUG + if (sp == 0) { + BUGPRINT("RETURN on base chain"); + /* act like this is EBT_CONTINUE */ + goto letscontinue; + } +#endif + sp--; + /* put all the local variables right */ + i = cs[sp].n; + chaininfo = cs[sp].chaininfo; + nentries = chaininfo->nentries; + point = cs[sp].e; + counter_base = cb_base + + chaininfo->counter_offset; + continue; + } + if (verdict == EBT_CONTINUE) + goto letscontinue; +#ifdef CONFIG_NETFILTER_DEBUG + if (verdict < 0) { + BUGPRINT("bogus standard verdict\n"); + read_unlock_bh(&table->lock); + return NF_DROP; + } +#endif + /* jump to a udc */ + cs[sp].n = i + 1; + cs[sp].chaininfo = chaininfo; + cs[sp].e = (struct ebt_entry *) + (((char *)point) + point->next_offset); + i = 0; + chaininfo = (struct ebt_entries *) (base + verdict); +#ifdef CONFIG_NETFILTER_DEBUG + if (chaininfo->distinguisher) { + BUGPRINT("jump to non-chain\n"); + read_unlock_bh(&table->lock); + return NF_DROP; + } +#endif + nentries = chaininfo->nentries; + point = (struct ebt_entry *)chaininfo->data; + counter_base = cb_base + chaininfo->counter_offset; + sp++; + continue; 
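+		/* letscontinue: advance to the next entry in the current chain. */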
+letscontinue: + point = (struct ebt_entry *) + (((char *)point) + point->next_offset); + i++; + } + + /* I actually like this :) */ + if (chaininfo->policy == EBT_RETURN) + goto letsreturn; + if (chaininfo->policy == EBT_ACCEPT) { + read_unlock_bh(&table->lock); + return NF_ACCEPT; + } + read_unlock_bh(&table->lock); + return NF_DROP; +} + +/* If it succeeds, returns element and locks mutex */ +static inline void * +find_inlist_lock_noload(struct list_head *head, const char *name, int *error, + struct semaphore *mutex) +{ + void *ret; + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, const char *name, const char *prefix, + int *error, struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + request_module("%s%s", prefix, name); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + return ret; +} +#endif + +static inline struct ebt_table * +find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ebt_tables, name, "ebtable_", error, mutex); +} + +static inline struct ebt_match * +find_match_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ebt_matches, name, "ebt_", error, mutex); +} + +static inline struct ebt_watcher * +find_watcher_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ebt_watchers, name, "ebt_", error, mutex); +} + +static inline struct ebt_target * +find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ebt_targets, name, "ebt_", error, mutex); +} + +static inline int +ebt_check_match(struct ebt_entry_match *m, struct ebt_entry *e, + const char *name, unsigned int hookmask, unsigned int *cnt) +{ + struct ebt_match *match; + int ret; + + if (((char *)m) + m->match_size + sizeof(struct ebt_entry_match) > + ((char *)e) + e->watchers_offset) + return -EINVAL; + match = find_match_lock(m->u.name, &ret, &ebt_mutex); + if (!match) + return ret; + m->u.match = match; + if (!try_module_get(match->me)) { + up(&ebt_mutex); + return -ENOENT; + } + up(&ebt_mutex); + if (match->check && + match->check(name, hookmask, e, m->data, m->match_size) != 0) { + BUGPRINT("match->check failed\n"); + module_put(match->me); + return -EINVAL; + } + (*cnt)++; + return 0; +} + +static inline int +ebt_check_watcher(struct ebt_entry_watcher *w, struct ebt_entry *e, + const char *name, unsigned int hookmask, unsigned int *cnt) +{ + struct ebt_watcher *watcher; + int ret; + + if (((char *)w) + w->watcher_size + sizeof(struct ebt_entry_watcher) > + ((char *)e) + e->target_offset) + return -EINVAL; + watcher = find_watcher_lock(w->u.name, &ret, &ebt_mutex); + if (!watcher) + return ret; + w->u.watcher = watcher; + if (!try_module_get(watcher->me)) { + up(&ebt_mutex); + return -ENOENT; + } + up(&ebt_mutex); + if (watcher->check && + watcher->check(name, hookmask, e, w->data, w->watcher_size) != 0) { + BUGPRINT("watcher->check failed\n"); + module_put(watcher->me); + return -EINVAL; + } + (*cnt)++; + return 0; +} + +/* + * this one is very careful, as it is the first function + * to parse the userspace data + */ +static inline int +ebt_check_entry_size_and_hooks(struct ebt_entry 
*e, + struct ebt_table_info *newinfo, char *base, char *limit, + struct ebt_entries **hook_entries, unsigned int *n, unsigned int *cnt, + unsigned int *totalcnt, unsigned int *udc_cnt, unsigned int valid_hooks) +{ + int i; + + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if ((valid_hooks & (1 << i)) == 0) + continue; + if ( (char *)hook_entries[i] - base == + (char *)e - newinfo->entries) + break; + } + /* beginning of a new chain + if i == NF_BR_NUMHOOKS it must be a user defined chain */ + if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) { + if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) != 0) { + /* we make userspace set this right, + so there is no misunderstanding */ + BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set " + "in distinguisher\n"); + return -EINVAL; + } + /* this checks if the previous chain has as many entries + as it said it has */ + if (*n != *cnt) { + BUGPRINT("nentries does not equal the nr of entries " + "in the chain\n"); + return -EINVAL; + } + /* before we look at the struct, be sure it is not too big */ + if ((char *)hook_entries[i] + sizeof(struct ebt_entries) + > limit) { + BUGPRINT("entries_size too small\n"); + return -EINVAL; + } + if (((struct ebt_entries *)e)->policy != EBT_DROP && + ((struct ebt_entries *)e)->policy != EBT_ACCEPT) { + /* only RETURN from udc */ + if (i != NF_BR_NUMHOOKS || + ((struct ebt_entries *)e)->policy != EBT_RETURN) { + BUGPRINT("bad policy\n"); + return -EINVAL; + } + } + if (i == NF_BR_NUMHOOKS) /* it's a user defined chain */ + (*udc_cnt)++; + else + newinfo->hook_entry[i] = (struct ebt_entries *)e; + if (((struct ebt_entries *)e)->counter_offset != *totalcnt) { + BUGPRINT("counter_offset != totalcnt"); + return -EINVAL; + } + *n = ((struct ebt_entries *)e)->nentries; + *cnt = 0; + return 0; + } + /* a plain old entry, heh */ + if (sizeof(struct ebt_entry) > e->watchers_offset || + e->watchers_offset > e->target_offset || + e->target_offset >= e->next_offset) { + BUGPRINT("entry offsets not in right order\n"); + return -EINVAL; + } + /* this is not checked anywhere else */ + if (e->next_offset - e->target_offset < sizeof(struct ebt_entry_target)) { + BUGPRINT("target size too small\n"); + return -EINVAL; + } + + (*cnt)++; + (*totalcnt)++; + return 0; +} + +struct ebt_cl_stack +{ + struct ebt_chainstack cs; + int from; + unsigned int hookmask; +}; + +/* + * we need these positions to check that the jumps to a different part of the + * entries is a jump to the beginning of a new chain. 
+ */ +static inline int +ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo, + struct ebt_entries **hook_entries, unsigned int *n, unsigned int valid_hooks, + struct ebt_cl_stack *udc) +{ + int i; + + /* we're only interested in chain starts */ + if (e->bitmask & EBT_ENTRY_OR_ENTRIES) + return 0; + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if ((valid_hooks & (1 << i)) == 0) + continue; + if (newinfo->hook_entry[i] == (struct ebt_entries *)e) + break; + } + /* only care about udc */ + if (i != NF_BR_NUMHOOKS) + return 0; + + udc[*n].cs.chaininfo = (struct ebt_entries *)e; + /* these initialisations are depended on later in check_chainloops() */ + udc[*n].cs.n = 0; + udc[*n].hookmask = 0; + + (*n)++; + return 0; +} + +static inline int +ebt_cleanup_match(struct ebt_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + if (m->u.match->destroy) + m->u.match->destroy(m->data, m->match_size); + module_put(m->u.match->me); + + return 0; +} + +static inline int +ebt_cleanup_watcher(struct ebt_entry_watcher *w, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + if (w->u.watcher->destroy) + w->u.watcher->destroy(w->data, w->watcher_size); + module_put(w->u.watcher->me); + + return 0; +} + +static inline int +ebt_cleanup_entry(struct ebt_entry *e, unsigned int *cnt) +{ + struct ebt_entry_target *t; + + if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0) + return 0; + /* we're done */ + if (cnt && (*cnt)-- == 0) + return 1; + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, NULL); + EBT_MATCH_ITERATE(e, ebt_cleanup_match, NULL); + t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + if (t->u.target->destroy) + t->u.target->destroy(t->data, t->target_size); + module_put(t->u.target->me); + + return 0; +} + +static inline int +ebt_check_entry(struct ebt_entry *e, struct ebt_table_info *newinfo, + const char *name, unsigned int *cnt, unsigned int valid_hooks, + struct ebt_cl_stack *cl_s, unsigned int udc_cnt) +{ + struct ebt_entry_target *t; + struct ebt_target *target; + unsigned int i, j, hook = 0, hookmask = 0; + int ret; + + /* don't mess with the struct ebt_entries */ + if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0) + return 0; + + if (e->bitmask & ~EBT_F_MASK) { + BUGPRINT("Unknown flag for bitmask\n"); + return -EINVAL; + } + if (e->invflags & ~EBT_INV_MASK) { + BUGPRINT("Unknown flag for inv bitmask\n"); + return -EINVAL; + } + if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) { + BUGPRINT("NOPROTO & 802_3 not allowed\n"); + return -EINVAL; + } + /* what hook do we belong to? 
*/ + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if ((valid_hooks & (1 << i)) == 0) + continue; + if ((char *)newinfo->hook_entry[i] < (char *)e) + hook = i; + else + break; + } + /* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on + a base chain */ + if (i < NF_BR_NUMHOOKS) + hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); + else { + for (i = 0; i < udc_cnt; i++) + if ((char *)(cl_s[i].cs.chaininfo) > (char *)e) + break; + if (i == 0) + hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS); + else + hookmask = cl_s[i - 1].hookmask; + } + i = 0; + ret = EBT_MATCH_ITERATE(e, ebt_check_match, e, name, hookmask, &i); + if (ret != 0) + goto cleanup_matches; + j = 0; + ret = EBT_WATCHER_ITERATE(e, ebt_check_watcher, e, name, hookmask, &j); + if (ret != 0) + goto cleanup_watchers; + t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + target = find_target_lock(t->u.name, &ret, &ebt_mutex); + if (!target) + goto cleanup_watchers; + if (!try_module_get(target->me)) { + up(&ebt_mutex); + ret = -ENOENT; + goto cleanup_watchers; + } + up(&ebt_mutex); + + t->u.target = target; + if (t->u.target == &ebt_standard_target) { + if (e->target_offset + sizeof(struct ebt_standard_target) > + e->next_offset) { + BUGPRINT("Standard target size too big\n"); + ret = -EFAULT; + goto cleanup_watchers; + } + if (((struct ebt_standard_target *)t)->verdict < + -NUM_STANDARD_TARGETS) { + BUGPRINT("Invalid standard target\n"); + ret = -EFAULT; + goto cleanup_watchers; + } + } else if ((e->target_offset + t->target_size + + sizeof(struct ebt_entry_target) > e->next_offset) || + (t->u.target->check && + t->u.target->check(name, hookmask, e, t->data, t->target_size) != 0)){ + module_put(t->u.target->me); + ret = -EFAULT; + goto cleanup_watchers; + } + (*cnt)++; + return 0; +cleanup_watchers: + EBT_WATCHER_ITERATE(e, ebt_cleanup_watcher, &j); +cleanup_matches: + EBT_MATCH_ITERATE(e, ebt_cleanup_match, &i); + return ret; +} + +/* + * checks for loops and sets the hook mask for udc + * the hook mask for udc tells us from which base chains the udc can be + * accessed. 
This mask is a parameter to the check() functions of the extensions + */ +static int check_chainloops(struct ebt_entries *chain, struct ebt_cl_stack *cl_s, + unsigned int udc_cnt, unsigned int hooknr, char *base) +{ + int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict; + struct ebt_entry *e = (struct ebt_entry *)chain->data; + struct ebt_entry_target *t; + + while (pos < nentries || chain_nr != -1) { + /* end of udc, go back one 'recursion' step */ + if (pos == nentries) { + /* put back values of the time when this chain was called */ + e = cl_s[chain_nr].cs.e; + if (cl_s[chain_nr].from != -1) + nentries = + cl_s[cl_s[chain_nr].from].cs.chaininfo->nentries; + else + nentries = chain->nentries; + pos = cl_s[chain_nr].cs.n; + /* make sure we won't see a loop that isn't one */ + cl_s[chain_nr].cs.n = 0; + chain_nr = cl_s[chain_nr].from; + if (pos == nentries) + continue; + } + t = (struct ebt_entry_target *) + (((char *)e) + e->target_offset); + if (strcmp(t->u.name, EBT_STANDARD_TARGET)) + goto letscontinue; + if (e->target_offset + sizeof(struct ebt_standard_target) > + e->next_offset) { + BUGPRINT("Standard target size too big\n"); + return -1; + } + verdict = ((struct ebt_standard_target *)t)->verdict; + if (verdict >= 0) { /* jump to another chain */ + struct ebt_entries *hlp2 = + (struct ebt_entries *)(base + verdict); + for (i = 0; i < udc_cnt; i++) + if (hlp2 == cl_s[i].cs.chaininfo) + break; + /* bad destination or loop */ + if (i == udc_cnt) { + BUGPRINT("bad destination\n"); + return -1; + } + if (cl_s[i].cs.n) { + BUGPRINT("loop\n"); + return -1; + } + /* this can't be 0, so the above test is correct */ + cl_s[i].cs.n = pos + 1; + pos = 0; + cl_s[i].cs.e = ((void *)e + e->next_offset); + e = (struct ebt_entry *)(hlp2->data); + nentries = hlp2->nentries; + cl_s[i].from = chain_nr; + chain_nr = i; + /* this udc is accessible from the base chain for hooknr */ + cl_s[i].hookmask |= (1 << hooknr); + continue; + } +letscontinue: + e = (void *)e + e->next_offset; + pos++; + } + return 0; +} + +/* do the parsing of the table/chains/entries/matches/watchers/targets, heh */ +static int translate_table(struct ebt_replace *repl, + struct ebt_table_info *newinfo) +{ + unsigned int i, j, k, udc_cnt; + int ret; + struct ebt_cl_stack *cl_s = NULL; /* used in the checking for chain loops */ + + i = 0; + while (i < NF_BR_NUMHOOKS && !(repl->valid_hooks & (1 << i))) + i++; + if (i == NF_BR_NUMHOOKS) { + BUGPRINT("No valid hooks specified\n"); + return -EINVAL; + } + if (repl->hook_entry[i] != (struct ebt_entries *)repl->entries) { + BUGPRINT("Chains don't start at beginning\n"); + return -EINVAL; + } + /* make sure chains are ordered after each other in same order + as their corresponding hooks */ + for (j = i + 1; j < NF_BR_NUMHOOKS; j++) { + if (!(repl->valid_hooks & (1 << j))) + continue; + if ( repl->hook_entry[j] <= repl->hook_entry[i] ) { + BUGPRINT("Hook order must be followed\n"); + return -EINVAL; + } + i = j; + } + + for (i = 0; i < NF_BR_NUMHOOKS; i++) + newinfo->hook_entry[i] = NULL; + + newinfo->entries_size = repl->entries_size; + newinfo->nentries = repl->nentries; + + /* do some early checkings and initialize some things */ + i = 0; /* holds the expected nr. of entries for the chain */ + j = 0; /* holds the up to now counted entries for the chain */ + k = 0; /* holds the total nr. of entries, should equal + newinfo->nentries afterwards */ + udc_cnt = 0; /* will hold the nr. 
of user defined chains (udc) */ + ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_check_entry_size_and_hooks, newinfo, repl->entries, + repl->entries + repl->entries_size, repl->hook_entry, &i, &j, &k, + &udc_cnt, repl->valid_hooks); + + if (ret != 0) + return ret; + + if (i != j) { + BUGPRINT("nentries does not equal the nr of entries in the " + "(last) chain\n"); + return -EINVAL; + } + if (k != newinfo->nentries) { + BUGPRINT("Total nentries is wrong\n"); + return -EINVAL; + } + + /* check if all valid hooks have a chain */ + for (i = 0; i < NF_BR_NUMHOOKS; i++) { + if (newinfo->hook_entry[i] == NULL && + (repl->valid_hooks & (1 << i))) { + BUGPRINT("Valid hook without chain\n"); + return -EINVAL; + } + } + + /* get the location of the udc, put them in an array + while we're at it, allocate the chainstack */ + if (udc_cnt) { + /* this will get free'd in do_replace()/ebt_register_table() + if an error occurs */ + newinfo->chainstack = (struct ebt_chainstack **) + vmalloc(num_possible_cpus() * sizeof(struct ebt_chainstack)); + if (!newinfo->chainstack) + return -ENOMEM; + for (i = 0; i < num_possible_cpus(); i++) { + newinfo->chainstack[i] = + vmalloc(udc_cnt * sizeof(struct ebt_chainstack)); + if (!newinfo->chainstack[i]) { + while (i) + vfree(newinfo->chainstack[--i]); + vfree(newinfo->chainstack); + newinfo->chainstack = NULL; + return -ENOMEM; + } + } + + cl_s = (struct ebt_cl_stack *) + vmalloc(udc_cnt * sizeof(struct ebt_cl_stack)); + if (!cl_s) + return -ENOMEM; + i = 0; /* the i'th udc */ + EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_get_udc_positions, newinfo, repl->hook_entry, &i, + repl->valid_hooks, cl_s); + /* sanity check */ + if (i != udc_cnt) { + BUGPRINT("i != udc_cnt\n"); + vfree(cl_s); + return -EFAULT; + } + } + + /* Check for loops */ + for (i = 0; i < NF_BR_NUMHOOKS; i++) + if (repl->valid_hooks & (1 << i)) + if (check_chainloops(newinfo->hook_entry[i], + cl_s, udc_cnt, i, newinfo->entries)) { + if (cl_s) + vfree(cl_s); + return -EINVAL; + } + + /* we now know the following (along with E=mc²): + - the nr of entries in each chain is right + - the size of the allocated space is right + - all valid hooks have a corresponding chain + - there are no loops + - wrong data can still be on the level of a single entry + - could be there are jumps to places that are not the + beginning of a chain. This can only occur in chains that + are not accessible from any base chains, so we don't care. 
*/ + + /* used to know what we need to clean up if something goes wrong */ + i = 0; + ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_check_entry, newinfo, repl->name, &i, repl->valid_hooks, + cl_s, udc_cnt); + if (ret != 0) { + EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_cleanup_entry, &i); + } + if (cl_s) + vfree(cl_s); + return ret; +} + +/* called under write_lock */ +static void get_counters(struct ebt_counter *oldcounters, + struct ebt_counter *counters, unsigned int nentries) +{ + int i, cpu; + struct ebt_counter *counter_base; + + /* counters of cpu 0 */ + memcpy(counters, oldcounters, + sizeof(struct ebt_counter) * nentries); + /* add other counters to those of cpu 0 */ + for (cpu = 1; cpu < num_possible_cpus(); cpu++) { + counter_base = COUNTER_BASE(oldcounters, nentries, cpu); + for (i = 0; i < nentries; i++) { + counters[i].pcnt += counter_base[i].pcnt; + counters[i].bcnt += counter_base[i].bcnt; + } + } +} + +/* replace the table */ +static int do_replace(void __user *user, unsigned int len) +{ + int ret, i, countersize; + struct ebt_table_info *newinfo; + struct ebt_replace tmp; + struct ebt_table *t; + struct ebt_counter *counterstmp = NULL; + /* used to be able to unlock earlier */ + struct ebt_table_info *table; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.entries_size) { + BUGPRINT("Wrong len argument\n"); + return -EINVAL; + } + + if (tmp.entries_size == 0) { + BUGPRINT("Entries_size never zero\n"); + return -EINVAL; + } + countersize = COUNTER_OFFSET(tmp.nentries) * num_possible_cpus(); + newinfo = (struct ebt_table_info *) + vmalloc(sizeof(struct ebt_table_info) + countersize); + if (!newinfo) + return -ENOMEM; + + if (countersize) + memset(newinfo->counters, 0, countersize); + + newinfo->entries = (char *)vmalloc(tmp.entries_size); + if (!newinfo->entries) { + ret = -ENOMEM; + goto free_newinfo; + } + if (copy_from_user( + newinfo->entries, tmp.entries, tmp.entries_size) != 0) { + BUGPRINT("Couldn't copy entries from userspace\n"); + ret = -EFAULT; + goto free_entries; + } + + /* the user wants counters back + the check on the size is done later, when we have the lock */ + if (tmp.num_counters) { + counterstmp = (struct ebt_counter *) + vmalloc(tmp.num_counters * sizeof(struct ebt_counter)); + if (!counterstmp) { + ret = -ENOMEM; + goto free_entries; + } + } + else + counterstmp = NULL; + + /* this can get initialized by translate_table() */ + newinfo->chainstack = NULL; + ret = translate_table(&tmp, newinfo); + + if (ret != 0) + goto free_counterstmp; + + t = find_table_lock(tmp.name, &ret, &ebt_mutex); + if (!t) { + ret = -ENOENT; + goto free_iterate; + } + + /* the table doesn't like it */ + if (t->check && (ret = t->check(newinfo, tmp.valid_hooks))) + goto free_unlock; + + if (tmp.num_counters && tmp.num_counters != t->private->nentries) { + BUGPRINT("Wrong nr. 
of counters requested\n"); + ret = -EINVAL; + goto free_unlock; + } + + /* we have the mutex lock, so no danger in reading this pointer */ + table = t->private; + /* make sure the table can only be rmmod'ed if it contains no rules */ + if (!table->nentries && newinfo->nentries && !try_module_get(t->me)) { + ret = -ENOENT; + goto free_unlock; + } else if (table->nentries && !newinfo->nentries) + module_put(t->me); + /* we need an atomic snapshot of the counters */ + write_lock_bh(&t->lock); + if (tmp.num_counters) + get_counters(t->private->counters, counterstmp, + t->private->nentries); + + t->private = newinfo; + write_unlock_bh(&t->lock); + up(&ebt_mutex); + /* so, a user can change the chains while having messed up her counter + allocation. Only reason why this is done is because this way the lock + is held only once, while this doesn't bring the kernel into a + dangerous state. */ + if (tmp.num_counters && + copy_to_user(tmp.counters, counterstmp, + tmp.num_counters * sizeof(struct ebt_counter))) { + BUGPRINT("Couldn't copy counters to userspace\n"); + ret = -EFAULT; + } + else + ret = 0; + + /* decrease module count and free resources */ + EBT_ENTRY_ITERATE(table->entries, table->entries_size, + ebt_cleanup_entry, NULL); + + vfree(table->entries); + if (table->chainstack) { + for (i = 0; i < num_possible_cpus(); i++) + vfree(table->chainstack[i]); + vfree(table->chainstack); + } + vfree(table); + + if (counterstmp) + vfree(counterstmp); + return ret; + +free_unlock: + up(&ebt_mutex); +free_iterate: + EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size, + ebt_cleanup_entry, NULL); +free_counterstmp: + if (counterstmp) + vfree(counterstmp); + /* can be initialized in translate_table() */ + if (newinfo->chainstack) { + for (i = 0; i < num_possible_cpus(); i++) + vfree(newinfo->chainstack[i]); + vfree(newinfo->chainstack); + } +free_entries: + if (newinfo->entries) + vfree(newinfo->entries); +free_newinfo: + if (newinfo) + vfree(newinfo); + return ret; +} + +int ebt_register_target(struct ebt_target *target) +{ + int ret; + + ret = down_interruptible(&ebt_mutex); + if (ret != 0) + return ret; + if (!list_named_insert(&ebt_targets, target)) { + up(&ebt_mutex); + return -EEXIST; + } + up(&ebt_mutex); + + return 0; +} + +void ebt_unregister_target(struct ebt_target *target) +{ + down(&ebt_mutex); + LIST_DELETE(&ebt_targets, target); + up(&ebt_mutex); +} + +int ebt_register_match(struct ebt_match *match) +{ + int ret; + + ret = down_interruptible(&ebt_mutex); + if (ret != 0) + return ret; + if (!list_named_insert(&ebt_matches, match)) { + up(&ebt_mutex); + return -EEXIST; + } + up(&ebt_mutex); + + return 0; +} + +void ebt_unregister_match(struct ebt_match *match) +{ + down(&ebt_mutex); + LIST_DELETE(&ebt_matches, match); + up(&ebt_mutex); +} + +int ebt_register_watcher(struct ebt_watcher *watcher) +{ + int ret; + + ret = down_interruptible(&ebt_mutex); + if (ret != 0) + return ret; + if (!list_named_insert(&ebt_watchers, watcher)) { + up(&ebt_mutex); + return -EEXIST; + } + up(&ebt_mutex); + + return 0; +} + +void ebt_unregister_watcher(struct ebt_watcher *watcher) +{ + down(&ebt_mutex); + LIST_DELETE(&ebt_watchers, watcher); + up(&ebt_mutex); +} + +int ebt_register_table(struct ebt_table *table) +{ + struct ebt_table_info *newinfo; + int ret, i, countersize; + + if (!table || !table->table ||!table->table->entries || + table->table->entries_size == 0 || + table->table->counters || table->private) { + BUGPRINT("Bad table data for ebt_register_table!!!\n"); + return -EINVAL; + } + + 
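Both do_replace() and the read-back path depend on the counter layout used here: the vmalloc'ed area holds one bank of nentries counters per possible CPU (see COUNTER_BASE above), so ebt_do_table() bumps its own CPU's bank with no atomic operations and get_counters() only has to fold banks 1..N-1 into a copy of bank 0 while holding the write lock. A standalone sketch of that aggregation, with invented toy_* names (an illustration, not kernel code):

/*
 * Standalone sketch, not kernel code: per-"CPU" counter banks laid out
 * contiguously, summed into a single snapshot the way get_counters()
 * does above. All toy_* names are invented for illustration only.
 */
#include <stdio.h>
#include <string.h>

struct toy_counter { unsigned long pcnt, bcnt; };

#define TOY_BANK(base, nentries, cpu) ((base) + (nentries) * (cpu))

static void toy_snapshot(const struct toy_counter *percpu, struct toy_counter *out,
			 unsigned int nentries, unsigned int ncpus)
{
	unsigned int cpu, i;

	/* start from CPU 0's bank, then add the other banks on top */
	memcpy(out, TOY_BANK(percpu, nentries, 0), nentries * sizeof(*out));
	for (cpu = 1; cpu < ncpus; cpu++) {
		const struct toy_counter *bank = TOY_BANK(percpu, nentries, cpu);
		for (i = 0; i < nentries; i++) {
			out[i].pcnt += bank[i].pcnt;
			out[i].bcnt += bank[i].bcnt;
		}
	}
}

int main(void)
{
	enum { NENTRIES = 2, NCPUS = 3 };
	struct toy_counter percpu[NENTRIES * NCPUS] = {
		{ 1, 100 }, { 2, 200 },		/* cpu 0 */
		{ 3, 300 }, { 0, 0   },		/* cpu 1 */
		{ 0, 0   }, { 5, 500 },		/* cpu 2 */
	};
	struct toy_counter total[NENTRIES];

	toy_snapshot(percpu, total, NENTRIES, NCPUS);
	printf("rule 0: %lu pkts / %lu bytes\n", total[0].pcnt, total[0].bcnt);
	printf("rule 1: %lu pkts / %lu bytes\n", total[1].pcnt, total[1].bcnt);
	return 0;
}

The per-CPU banks trade memory (countersize scales with num_possible_cpus()) for an increment path that needs no atomics; only the snapshot and the copy back to userspace take the write_lock_bh() on the table.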
countersize = COUNTER_OFFSET(table->table->nentries) * num_possible_cpus(); + newinfo = (struct ebt_table_info *) + vmalloc(sizeof(struct ebt_table_info) + countersize); + ret = -ENOMEM; + if (!newinfo) + return -ENOMEM; + + newinfo->entries = (char *)vmalloc(table->table->entries_size); + if (!(newinfo->entries)) + goto free_newinfo; + + memcpy(newinfo->entries, table->table->entries, + table->table->entries_size); + + if (countersize) + memset(newinfo->counters, 0, countersize); + + /* fill in newinfo and parse the entries */ + newinfo->chainstack = NULL; + ret = translate_table(table->table, newinfo); + if (ret != 0) { + BUGPRINT("Translate_table failed\n"); + goto free_chainstack; + } + + if (table->check && table->check(newinfo, table->valid_hooks)) { + BUGPRINT("The table doesn't like its own initial data, lol\n"); + return -EINVAL; + } + + table->private = newinfo; + rwlock_init(&table->lock); + ret = down_interruptible(&ebt_mutex); + if (ret != 0) + goto free_chainstack; + + if (list_named_find(&ebt_tables, table->name)) { + ret = -EEXIST; + BUGPRINT("Table name already exists\n"); + goto free_unlock; + } + + /* Hold a reference count if the chains aren't empty */ + if (newinfo->nentries && !try_module_get(table->me)) { + ret = -ENOENT; + goto free_unlock; + } + list_prepend(&ebt_tables, table); + up(&ebt_mutex); + return 0; +free_unlock: + up(&ebt_mutex); +free_chainstack: + if (newinfo->chainstack) { + for (i = 0; i < num_possible_cpus(); i++) + vfree(newinfo->chainstack[i]); + vfree(newinfo->chainstack); + } + vfree(newinfo->entries); +free_newinfo: + vfree(newinfo); + return ret; +} + +void ebt_unregister_table(struct ebt_table *table) +{ + int i; + + if (!table) { + BUGPRINT("Request to unregister NULL table!!!\n"); + return; + } + down(&ebt_mutex); + LIST_DELETE(&ebt_tables, table); + up(&ebt_mutex); + if (table->private->entries) + vfree(table->private->entries); + if (table->private->chainstack) { + for (i = 0; i < num_possible_cpus(); i++) + vfree(table->private->chainstack[i]); + vfree(table->private->chainstack); + } + vfree(table->private); +} + +/* userspace just supplied us with counters */ +static int update_counters(void __user *user, unsigned int len) +{ + int i, ret; + struct ebt_counter *tmp; + struct ebt_replace hlp; + struct ebt_table *t; + + if (copy_from_user(&hlp, user, sizeof(hlp))) + return -EFAULT; + + if (len != sizeof(hlp) + hlp.num_counters * sizeof(struct ebt_counter)) + return -EINVAL; + if (hlp.num_counters == 0) + return -EINVAL; + + if ( !(tmp = (struct ebt_counter *) + vmalloc(hlp.num_counters * sizeof(struct ebt_counter))) ){ + MEMPRINT("Update_counters && nomemory\n"); + return -ENOMEM; + } + + t = find_table_lock(hlp.name, &ret, &ebt_mutex); + if (!t) + goto free_tmp; + + if (hlp.num_counters != t->private->nentries) { + BUGPRINT("Wrong nr of counters\n"); + ret = -EINVAL; + goto unlock_mutex; + } + + if ( copy_from_user(tmp, hlp.counters, + hlp.num_counters * sizeof(struct ebt_counter)) ) { + BUGPRINT("Updata_counters && !cfu\n"); + ret = -EFAULT; + goto unlock_mutex; + } + + /* we want an atomic add of the counters */ + write_lock_bh(&t->lock); + + /* we add to the counters of the first cpu */ + for (i = 0; i < hlp.num_counters; i++) { + t->private->counters[i].pcnt += tmp[i].pcnt; + t->private->counters[i].bcnt += tmp[i].bcnt; + } + + write_unlock_bh(&t->lock); + ret = 0; +unlock_mutex: + up(&ebt_mutex); +free_tmp: + vfree(tmp); + return ret; +} + +static inline int ebt_make_matchname(struct ebt_entry_match *m, + char *base, char *ubase) +{ 
+ char *hlp = ubase - base + (char *)m; + if (copy_to_user(hlp, m->u.match->name, EBT_FUNCTION_MAXNAMELEN)) + return -EFAULT; + return 0; +} + +static inline int ebt_make_watchername(struct ebt_entry_watcher *w, + char *base, char *ubase) +{ + char *hlp = ubase - base + (char *)w; + if (copy_to_user(hlp , w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN)) + return -EFAULT; + return 0; +} + +static inline int ebt_make_names(struct ebt_entry *e, char *base, char *ubase) +{ + int ret; + char *hlp; + struct ebt_entry_target *t; + + if ((e->bitmask & EBT_ENTRY_OR_ENTRIES) == 0) + return 0; + + hlp = ubase - base + (char *)e + e->target_offset; + t = (struct ebt_entry_target *)(((char *)e) + e->target_offset); + + ret = EBT_MATCH_ITERATE(e, ebt_make_matchname, base, ubase); + if (ret != 0) + return ret; + ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); + if (ret != 0) + return ret; + if (copy_to_user(hlp, t->u.target->name, EBT_FUNCTION_MAXNAMELEN)) + return -EFAULT; + return 0; +} + +/* called with ebt_mutex down */ +static int copy_everything_to_user(struct ebt_table *t, void __user *user, + int *len, int cmd) +{ + struct ebt_replace tmp; + struct ebt_counter *counterstmp, *oldcounters; + unsigned int entries_size, nentries; + char *entries; + + if (cmd == EBT_SO_GET_ENTRIES) { + entries_size = t->private->entries_size; + nentries = t->private->nentries; + entries = t->private->entries; + oldcounters = t->private->counters; + } else { + entries_size = t->table->entries_size; + nentries = t->table->nentries; + entries = t->table->entries; + oldcounters = t->table->counters; + } + + if (copy_from_user(&tmp, user, sizeof(tmp))) { + BUGPRINT("Cfu didn't work\n"); + return -EFAULT; + } + + if (*len != sizeof(struct ebt_replace) + entries_size + + (tmp.num_counters? 
nentries * sizeof(struct ebt_counter): 0)) { + BUGPRINT("Wrong size\n"); + return -EINVAL; + } + + if (tmp.nentries != nentries) { + BUGPRINT("Nentries wrong\n"); + return -EINVAL; + } + + if (tmp.entries_size != entries_size) { + BUGPRINT("Wrong size\n"); + return -EINVAL; + } + + /* userspace might not need the counters */ + if (tmp.num_counters) { + if (tmp.num_counters != nentries) { + BUGPRINT("Num_counters wrong\n"); + return -EINVAL; + } + counterstmp = (struct ebt_counter *) + vmalloc(nentries * sizeof(struct ebt_counter)); + if (!counterstmp) { + MEMPRINT("Couldn't copy counters, out of memory\n"); + return -ENOMEM; + } + write_lock_bh(&t->lock); + get_counters(oldcounters, counterstmp, nentries); + write_unlock_bh(&t->lock); + + if (copy_to_user(tmp.counters, counterstmp, + nentries * sizeof(struct ebt_counter))) { + BUGPRINT("Couldn't copy counters to userspace\n"); + vfree(counterstmp); + return -EFAULT; + } + vfree(counterstmp); + } + + if (copy_to_user(tmp.entries, entries, entries_size)) { + BUGPRINT("Couldn't copy entries to userspace\n"); + return -EFAULT; + } + /* set the match/watcher/target names right */ + return EBT_ENTRY_ITERATE(entries, entries_size, + ebt_make_names, entries, tmp.entries); +} + +static int do_ebt_set_ctl(struct sock *sk, + int cmd, void __user *user, unsigned int len) +{ + int ret; + + switch(cmd) { + case EBT_SO_SET_ENTRIES: + ret = do_replace(user, len); + break; + case EBT_SO_SET_COUNTERS: + ret = update_counters(user, len); + break; + default: + ret = -EINVAL; + } + return ret; +} + +static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + struct ebt_replace tmp; + struct ebt_table *t; + + if (copy_from_user(&tmp, user, sizeof(tmp))) + return -EFAULT; + + t = find_table_lock(tmp.name, &ret, &ebt_mutex); + if (!t) + return ret; + + switch(cmd) { + case EBT_SO_GET_INFO: + case EBT_SO_GET_INIT_INFO: + if (*len != sizeof(struct ebt_replace)){ + ret = -EINVAL; + up(&ebt_mutex); + break; + } + if (cmd == EBT_SO_GET_INFO) { + tmp.nentries = t->private->nentries; + tmp.entries_size = t->private->entries_size; + tmp.valid_hooks = t->valid_hooks; + } else { + tmp.nentries = t->table->nentries; + tmp.entries_size = t->table->entries_size; + tmp.valid_hooks = t->table->valid_hooks; + } + up(&ebt_mutex); + if (copy_to_user(user, &tmp, *len) != 0){ + BUGPRINT("c2u Didn't work\n"); + ret = -EFAULT; + break; + } + ret = 0; + break; + + case EBT_SO_GET_ENTRIES: + case EBT_SO_GET_INIT_ENTRIES: + ret = copy_everything_to_user(t, user, len, cmd); + up(&ebt_mutex); + break; + + default: + up(&ebt_mutex); + ret = -EINVAL; + } + + return ret; +} + +static struct nf_sockopt_ops ebt_sockopts = +{ { NULL, NULL }, PF_INET, EBT_BASE_CTL, EBT_SO_SET_MAX + 1, do_ebt_set_ctl, + EBT_BASE_CTL, EBT_SO_GET_MAX + 1, do_ebt_get_ctl, 0, NULL +}; + +static int __init init(void) +{ + int ret; + + down(&ebt_mutex); + list_named_insert(&ebt_targets, &ebt_standard_target); + up(&ebt_mutex); + if ((ret = nf_register_sockopt(&ebt_sockopts)) < 0) + return ret; + + printk(KERN_NOTICE "Ebtables v2.0 registered\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ebt_sockopts); + printk(KERN_NOTICE "Ebtables v2.0 unregistered\n"); +} + +EXPORT_SYMBOL(ebt_register_table); +EXPORT_SYMBOL(ebt_unregister_table); +EXPORT_SYMBOL(ebt_register_match); +EXPORT_SYMBOL(ebt_unregister_match); +EXPORT_SYMBOL(ebt_register_watcher); +EXPORT_SYMBOL(ebt_unregister_watcher); +EXPORT_SYMBOL(ebt_register_target); 
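Userspace reaches all of the above through the nf_sockopt range registered in ebt_sockopts: EBT_SO_SET_* arrives via setsockopt() and EBT_SO_GET_* via getsockopt() on an ordinary AF_INET socket, with do_ebt_get_ctl() insisting that *len be exactly sizeof(struct ebt_replace) for the INFO calls. A hedged sketch of a minimal client follows; it assumes the uapi header <linux/netfilter_bridge/ebtables.h> and that the PF_INET sockopt range is reached at the IPPROTO_IP level from a raw socket, so treat it as illustrative rather than as the canonical ebtables tool.

/*
 * Hedged userspace sketch: query table geometry through EBT_SO_GET_INFO.
 * Assumptions (not guaranteed by the patch above): the standard uapi
 * header path and that an AF_INET raw socket at IPPROTO_IP level reaches
 * the PF_INET nf_sockopt handlers. Needs CAP_NET_RAW/root to run.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_bridge/ebtables.h>

int main(void)
{
	struct ebt_replace repl;
	socklen_t len = sizeof(repl);
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);

	if (fd < 0) {
		perror("socket (needs CAP_NET_RAW/root)");
		return 1;
	}

	memset(&repl, 0, sizeof(repl));
	snprintf(repl.name, sizeof(repl.name), "filter");

	/* the kernel requires len == sizeof(struct ebt_replace) here */
	if (getsockopt(fd, IPPROTO_IP, EBT_SO_GET_INFO, &repl, &len) < 0) {
		perror("EBT_SO_GET_INFO (is the ebtables module loaded?)");
		return 1;
	}

	printf("table %s: %u entries, %u bytes, valid_hooks 0x%x\n",
	       repl.name, repl.nentries, repl.entries_size, repl.valid_hooks);
	return 0;
}

Passing the table name in through the same struct ebt_replace that the kernel fills back in is what lets find_table_lock() pick, and with CONFIG_KMOD demand-load, the matching ebtable_* module.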
+EXPORT_SYMBOL(ebt_unregister_target); +EXPORT_SYMBOL(ebt_do_table); +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/compat.c b/net/compat.c new file mode 100644 index 000000000000..be5d936dc423 --- /dev/null +++ b/net/compat.c @@ -0,0 +1,605 @@ +/* + * 32bit Socket syscall emulation. Based on arch/sparc64/kernel/sys_sparc32.c. + * + * Copyright (C) 2000 VA Linux Co + * Copyright (C) 2000 Don Dugger + * Copyright (C) 1999 Arun Sharma + * Copyright (C) 1997,1998 Jakub Jelinek (jj@sunsite.mff.cuni.cz) + * Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu) + * Copyright (C) 2000 Hewlett-Packard Co. + * Copyright (C) 2000 David Mosberger-Tang + * Copyright (C) 2000,2001 Andi Kleen, SuSE Labs + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static inline int iov_from_user_compat_to_kern(struct iovec *kiov, + struct compat_iovec __user *uiov32, + int niov) +{ + int tot_len = 0; + + while(niov > 0) { + compat_uptr_t buf; + compat_size_t len; + + if(get_user(len, &uiov32->iov_len) || + get_user(buf, &uiov32->iov_base)) { + tot_len = -EFAULT; + break; + } + tot_len += len; + kiov->iov_base = compat_ptr(buf); + kiov->iov_len = (__kernel_size_t) len; + uiov32++; + kiov++; + niov--; + } + return tot_len; +} + +int get_compat_msghdr(struct msghdr *kmsg, struct compat_msghdr __user *umsg) +{ + compat_uptr_t tmp1, tmp2, tmp3; + + if (!access_ok(VERIFY_READ, umsg, sizeof(*umsg)) || + __get_user(tmp1, &umsg->msg_name) || + __get_user(kmsg->msg_namelen, &umsg->msg_namelen) || + __get_user(tmp2, &umsg->msg_iov) || + __get_user(kmsg->msg_iovlen, &umsg->msg_iovlen) || + __get_user(tmp3, &umsg->msg_control) || + __get_user(kmsg->msg_controllen, &umsg->msg_controllen) || + __get_user(kmsg->msg_flags, &umsg->msg_flags)) + return -EFAULT; + kmsg->msg_name = compat_ptr(tmp1); + kmsg->msg_iov = compat_ptr(tmp2); + kmsg->msg_control = compat_ptr(tmp3); + return 0; +} + +/* I've named the args so it is easy to tell whose space the pointers are in. */ +int verify_compat_iovec(struct msghdr *kern_msg, struct iovec *kern_iov, + char *kern_address, int mode) +{ + int tot_len; + + if(kern_msg->msg_namelen) { + if(mode==VERIFY_READ) { + int err = move_addr_to_kernel(kern_msg->msg_name, + kern_msg->msg_namelen, + kern_address); + if(err < 0) + return err; + } + kern_msg->msg_name = kern_address; + } else + kern_msg->msg_name = NULL; + + if(kern_msg->msg_iovlen > UIO_FASTIOV) { + kern_iov = kmalloc(kern_msg->msg_iovlen * sizeof(struct iovec), + GFP_KERNEL); + if(!kern_iov) + return -ENOMEM; + } + + tot_len = iov_from_user_compat_to_kern(kern_iov, + (struct compat_iovec __user *)kern_msg->msg_iov, + kern_msg->msg_iovlen); + if(tot_len >= 0) + kern_msg->msg_iov = kern_iov; + else if(kern_msg->msg_iovlen > UIO_FASTIOV) + kfree(kern_iov); + + return tot_len; +} + +/* Bleech... */ +#define CMSG_COMPAT_ALIGN(len) ALIGN((len), sizeof(s32)) + +#define CMSG_COMPAT_DATA(cmsg) \ + ((void __user *)((char __user *)(cmsg) + CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)))) +#define CMSG_COMPAT_SPACE(len) \ + (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + CMSG_COMPAT_ALIGN(len)) +#define CMSG_COMPAT_LEN(len) \ + (CMSG_COMPAT_ALIGN(sizeof(struct compat_cmsghdr)) + (len)) + +#define CMSG_COMPAT_FIRSTHDR(msg) \ + (((msg)->msg_controllen) >= sizeof(struct compat_cmsghdr) ? 
\ + (struct compat_cmsghdr __user *)((msg)->msg_control) : \ + (struct compat_cmsghdr __user *)NULL) + +#define CMSG_COMPAT_OK(ucmlen, ucmsg, mhdr) \ + ((ucmlen) >= sizeof(struct compat_cmsghdr) && \ + (ucmlen) <= (unsigned long) \ + ((mhdr)->msg_controllen - \ + ((char *)(ucmsg) - (char *)(mhdr)->msg_control))) + +static inline struct compat_cmsghdr __user *cmsg_compat_nxthdr(struct msghdr *msg, + struct compat_cmsghdr __user *cmsg, int cmsg_len) +{ + char __user *ptr = (char __user *)cmsg + CMSG_COMPAT_ALIGN(cmsg_len); + if ((unsigned long)(ptr + 1 - (char __user *)msg->msg_control) > + msg->msg_controllen) + return NULL; + return (struct compat_cmsghdr __user *)ptr; +} + +/* There is a lot of hair here because the alignment rules (and + * thus placement) of cmsg headers and length are different for + * 32-bit apps. -DaveM + */ +int cmsghdr_from_user_compat_to_kern(struct msghdr *kmsg, + unsigned char *stackbuf, int stackbuf_size) +{ + struct compat_cmsghdr __user *ucmsg; + struct cmsghdr *kcmsg, *kcmsg_base; + compat_size_t ucmlen; + __kernel_size_t kcmlen, tmp; + + kcmlen = 0; + kcmsg_base = kcmsg = (struct cmsghdr *)stackbuf; + ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); + while(ucmsg != NULL) { + if(get_user(ucmlen, &ucmsg->cmsg_len)) + return -EFAULT; + + /* Catch bogons. */ + if (!CMSG_COMPAT_OK(ucmlen, ucmsg, kmsg)) + return -EINVAL; + + tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + + CMSG_ALIGN(sizeof(struct cmsghdr))); + kcmlen += tmp; + ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); + } + if(kcmlen == 0) + return -EINVAL; + + /* The kcmlen holds the 64-bit version of the control length. + * It may not be modified as we do not stick it into the kmsg + * until we have successfully copied over all of the data + * from the user. + */ + if(kcmlen > stackbuf_size) + kcmsg_base = kcmsg = kmalloc(kcmlen, GFP_KERNEL); + if(kcmsg == NULL) + return -ENOBUFS; + + /* Now copy them over neatly. */ + memset(kcmsg, 0, kcmlen); + ucmsg = CMSG_COMPAT_FIRSTHDR(kmsg); + while(ucmsg != NULL) { + __get_user(ucmlen, &ucmsg->cmsg_len); + tmp = ((ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))) + + CMSG_ALIGN(sizeof(struct cmsghdr))); + kcmsg->cmsg_len = tmp; + __get_user(kcmsg->cmsg_level, &ucmsg->cmsg_level); + __get_user(kcmsg->cmsg_type, &ucmsg->cmsg_type); + + /* Copy over the data. */ + if(copy_from_user(CMSG_DATA(kcmsg), + CMSG_COMPAT_DATA(ucmsg), + (ucmlen - CMSG_COMPAT_ALIGN(sizeof(*ucmsg))))) + goto out_free_efault; + + /* Advance. */ + kcmsg = (struct cmsghdr *)((char *)kcmsg + CMSG_ALIGN(tmp)); + ucmsg = cmsg_compat_nxthdr(kmsg, ucmsg, ucmlen); + } + + /* Ok, looks like we made it. Hook it up and return success. */ + kmsg->msg_control = kcmsg_base; + kmsg->msg_controllen = kcmlen; + return 0; + +out_free_efault: + if(kcmsg_base != (struct cmsghdr *)stackbuf) + kfree(kcmsg_base); + return -EFAULT; +} + +int put_cmsg_compat(struct msghdr *kmsg, int level, int type, int len, void *data) +{ + struct compat_timeval ctv; + struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; + struct compat_cmsghdr cmhdr; + int cmlen; + + if(cm == NULL || kmsg->msg_controllen < sizeof(*cm)) { + kmsg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. 
*/ + } + + if (level == SOL_SOCKET && type == SO_TIMESTAMP) { + struct timeval *tv = (struct timeval *)data; + ctv.tv_sec = tv->tv_sec; + ctv.tv_usec = tv->tv_usec; + data = &ctv; + len = sizeof(struct compat_timeval); + } + + cmlen = CMSG_COMPAT_LEN(len); + if(kmsg->msg_controllen < cmlen) { + kmsg->msg_flags |= MSG_CTRUNC; + cmlen = kmsg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + if(copy_to_user(cm, &cmhdr, sizeof cmhdr)) + return -EFAULT; + if(copy_to_user(CMSG_COMPAT_DATA(cm), data, cmlen - sizeof(struct compat_cmsghdr))) + return -EFAULT; + cmlen = CMSG_COMPAT_SPACE(len); + kmsg->msg_control += cmlen; + kmsg->msg_controllen -= cmlen; + return 0; +} + +void scm_detach_fds_compat(struct msghdr *kmsg, struct scm_cookie *scm) +{ + struct compat_cmsghdr __user *cm = (struct compat_cmsghdr __user *) kmsg->msg_control; + int fdmax = (kmsg->msg_controllen - sizeof(struct compat_cmsghdr)) / sizeof(int); + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int __user *cmfptr; + int err = 0, i; + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i = 0, cmfptr = (int __user *) CMSG_COMPAT_DATA(cm); i < fdmax; i++, cmfptr++) { + int new_fd; + err = security_file_receive(fp[i]); + if (err) + break; + err = get_unused_fd(); + if (err < 0) + break; + new_fd = err; + err = put_user(new_fd, cmfptr); + if (err) { + put_unused_fd(new_fd); + break; + } + /* Bump the usage count and install the file. */ + get_file(fp[i]); + fd_install(new_fd, fp[i]); + } + + if (i > 0) { + int cmlen = CMSG_COMPAT_LEN(i * sizeof(int)); + if (!err) + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_COMPAT_SPACE(i * sizeof(int)); + kmsg->msg_control += cmlen; + kmsg->msg_controllen -= cmlen; + } + } + if (i < fdnum) + kmsg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} + +/* + * For now, we assume that the compatibility and native version + * of struct ipt_entry are the same - sfr. FIXME + */ +struct compat_ipt_replace { + char name[IPT_TABLE_MAXNAMELEN]; + u32 valid_hooks; + u32 num_entries; + u32 size; + u32 hook_entry[NF_IP_NUMHOOKS]; + u32 underflow[NF_IP_NUMHOOKS]; + u32 num_counters; + compat_uptr_t counters; /* struct ipt_counters * */ + struct ipt_entry entries[0]; +}; + +static int do_netfilter_replace(int fd, int level, int optname, + char __user *optval, int optlen) +{ + struct compat_ipt_replace __user *urepl; + struct ipt_replace __user *repl_nat; + char name[IPT_TABLE_MAXNAMELEN]; + u32 origsize, tmp32, num_counters; + unsigned int repl_nat_size; + int ret; + int i; + compat_uptr_t ucntrs; + + urepl = (struct compat_ipt_replace __user *)optval; + if (get_user(origsize, &urepl->size)) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (optlen != sizeof(*urepl) + origsize) + return -ENOPROTOOPT; + + /* XXX Assumes that size of ipt_entry is the same both in + * native and compat environments. 
+ */ + repl_nat_size = sizeof(*repl_nat) + origsize; + repl_nat = compat_alloc_user_space(repl_nat_size); + + ret = -EFAULT; + if (put_user(origsize, &repl_nat->size)) + goto out; + + if (!access_ok(VERIFY_READ, urepl, optlen) || + !access_ok(VERIFY_WRITE, repl_nat, optlen)) + goto out; + + if (__copy_from_user(name, urepl->name, sizeof(urepl->name)) || + __copy_to_user(repl_nat->name, name, sizeof(repl_nat->name))) + goto out; + + if (__get_user(tmp32, &urepl->valid_hooks) || + __put_user(tmp32, &repl_nat->valid_hooks)) + goto out; + + if (__get_user(tmp32, &urepl->num_entries) || + __put_user(tmp32, &repl_nat->num_entries)) + goto out; + + if (__get_user(num_counters, &urepl->num_counters) || + __put_user(num_counters, &repl_nat->num_counters)) + goto out; + + if (__get_user(ucntrs, &urepl->counters) || + __put_user(compat_ptr(ucntrs), &repl_nat->counters)) + goto out; + + if (__copy_in_user(&repl_nat->entries[0], + &urepl->entries[0], + origsize)) + goto out; + + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + if (__get_user(tmp32, &urepl->hook_entry[i]) || + __put_user(tmp32, &repl_nat->hook_entry[i]) || + __get_user(tmp32, &urepl->underflow[i]) || + __put_user(tmp32, &repl_nat->underflow[i])) + goto out; + } + + /* + * Since struct ipt_counters just contains two u_int64_t members + * we can just do the access_ok check here and pass the (converted) + * pointer into the standard syscall. We hope that the pointer is + * not misaligned ... + */ + if (!access_ok(VERIFY_WRITE, compat_ptr(ucntrs), + num_counters * sizeof(struct ipt_counters))) + goto out; + + + ret = sys_setsockopt(fd, level, optname, + (char __user *)repl_nat, repl_nat_size); + +out: + return ret; +} + +/* + * A struct sock_filter is architecture independent. + */ +struct compat_sock_fprog { + u16 len; + compat_uptr_t filter; /* struct sock_filter * */ +}; + +static int do_set_attach_filter(int fd, int level, int optname, + char __user *optval, int optlen) +{ + struct compat_sock_fprog __user *fprog32 = (struct compat_sock_fprog __user *)optval; + struct sock_fprog __user *kfprog = compat_alloc_user_space(sizeof(struct sock_fprog)); + compat_uptr_t ptr; + u16 len; + + if (!access_ok(VERIFY_READ, fprog32, sizeof(*fprog32)) || + !access_ok(VERIFY_WRITE, kfprog, sizeof(struct sock_fprog)) || + __get_user(len, &fprog32->len) || + __get_user(ptr, &fprog32->filter) || + __put_user(len, &kfprog->len) || + __put_user(compat_ptr(ptr), &kfprog->filter)) + return -EFAULT; + + return sys_setsockopt(fd, level, optname, (char __user *)kfprog, + sizeof(struct sock_fprog)); +} + +static int do_set_sock_timeout(int fd, int level, int optname, char __user *optval, int optlen) +{ + struct compat_timeval __user *up = (struct compat_timeval __user *) optval; + struct timeval ktime; + mm_segment_t old_fs; + int err; + + if (optlen < sizeof(*up)) + return -EINVAL; + if (!access_ok(VERIFY_READ, up, sizeof(*up)) || + __get_user(ktime.tv_sec, &up->tv_sec) || + __get_user(ktime.tv_usec, &up->tv_usec)) + return -EFAULT; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_setsockopt(fd, level, optname, (char *) &ktime, sizeof(ktime)); + set_fs(old_fs); + + return err; +} + +asmlinkage long compat_sys_setsockopt(int fd, int level, int optname, + char __user *optval, int optlen) +{ + /* SO_SET_REPLACE seems to be the same in all levels */ + if (optname == IPT_SO_SET_REPLACE) + return do_netfilter_replace(fd, level, optname, + optval, optlen); + if (level == SOL_SOCKET && optname == SO_ATTACH_FILTER) + return do_set_attach_filter(fd, level, optname, + optval, 
optlen); + if (level == SOL_SOCKET && + (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) + return do_set_sock_timeout(fd, level, optname, optval, optlen); + + return sys_setsockopt(fd, level, optname, optval, optlen); +} + +static int do_get_sock_timeout(int fd, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct compat_timeval __user *up; + struct timeval ktime; + mm_segment_t old_fs; + int len, err; + + up = (struct compat_timeval __user *) optval; + if (get_user(len, optlen)) + return -EFAULT; + if (len < sizeof(*up)) + return -EINVAL; + len = sizeof(ktime); + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_getsockopt(fd, level, optname, (char *) &ktime, &len); + set_fs(old_fs); + + if (!err) { + if (put_user(sizeof(*up), optlen) || + !access_ok(VERIFY_WRITE, up, sizeof(*up)) || + __put_user(ktime.tv_sec, &up->tv_sec) || + __put_user(ktime.tv_usec, &up->tv_usec)) + err = -EFAULT; + } + return err; +} + +asmlinkage long compat_sys_getsockopt(int fd, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level == SOL_SOCKET && + (optname == SO_RCVTIMEO || optname == SO_SNDTIMEO)) + return do_get_sock_timeout(fd, level, optname, optval, optlen); + return sys_getsockopt(fd, level, optname, optval, optlen); +} + +/* Argument list sizes for compat_sys_socketcall */ +#define AL(x) ((x) * sizeof(u32)) +static unsigned char nas[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), + AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), + AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)}; +#undef AL + +asmlinkage long compat_sys_sendmsg(int fd, struct compat_msghdr __user *msg, unsigned flags) +{ + return sys_sendmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); +} + +asmlinkage long compat_sys_recvmsg(int fd, struct compat_msghdr __user *msg, unsigned int flags) +{ + return sys_recvmsg(fd, (struct msghdr __user *)msg, flags | MSG_CMSG_COMPAT); +} + +asmlinkage long compat_sys_socketcall(int call, u32 __user *args) +{ + int ret; + u32 a[6]; + u32 a0, a1; + + if (call < SYS_SOCKET || call > SYS_RECVMSG) + return -EINVAL; + if (copy_from_user(a, args, nas[call])) + return -EFAULT; + a0 = a[0]; + a1 = a[1]; + + switch(call) { + case SYS_SOCKET: + ret = sys_socket(a0, a1, a[2]); + break; + case SYS_BIND: + ret = sys_bind(a0, compat_ptr(a1), a[2]); + break; + case SYS_CONNECT: + ret = sys_connect(a0, compat_ptr(a1), a[2]); + break; + case SYS_LISTEN: + ret = sys_listen(a0, a1); + break; + case SYS_ACCEPT: + ret = sys_accept(a0, compat_ptr(a1), compat_ptr(a[2])); + break; + case SYS_GETSOCKNAME: + ret = sys_getsockname(a0, compat_ptr(a1), compat_ptr(a[2])); + break; + case SYS_GETPEERNAME: + ret = sys_getpeername(a0, compat_ptr(a1), compat_ptr(a[2])); + break; + case SYS_SOCKETPAIR: + ret = sys_socketpair(a0, a1, a[2], compat_ptr(a[3])); + break; + case SYS_SEND: + ret = sys_send(a0, compat_ptr(a1), a[2], a[3]); + break; + case SYS_SENDTO: + ret = sys_sendto(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), a[5]); + break; + case SYS_RECV: + ret = sys_recv(a0, compat_ptr(a1), a[2], a[3]); + break; + case SYS_RECVFROM: + ret = sys_recvfrom(a0, compat_ptr(a1), a[2], a[3], compat_ptr(a[4]), compat_ptr(a[5])); + break; + case SYS_SHUTDOWN: + ret = sys_shutdown(a0,a1); + break; + case SYS_SETSOCKOPT: + ret = compat_sys_setsockopt(a0, a1, a[2], + compat_ptr(a[3]), a[4]); + break; + case SYS_GETSOCKOPT: + ret = compat_sys_getsockopt(a0, a1, a[2], + compat_ptr(a[3]), compat_ptr(a[4])); + break; + case SYS_SENDMSG: + ret = compat_sys_sendmsg(a0, compat_ptr(a1), a[2]); + break; + case 
SYS_RECVMSG: + ret = compat_sys_recvmsg(a0, compat_ptr(a1), a[2]); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} diff --git a/net/core/Makefile b/net/core/Makefile new file mode 100644 index 000000000000..81f03243fe2f --- /dev/null +++ b/net/core/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for the Linux networking core. +# + +obj-y := sock.o skbuff.o iovec.o datagram.o stream.o scm.o gen_stats.o gen_estimator.o + +obj-$(CONFIG_SYSCTL) += sysctl_net_core.o + +obj-y += flow.o dev.o ethtool.o dev_mcast.o dst.o \ + neighbour.o rtnetlink.o utils.o link_watch.o filter.o + +obj-$(CONFIG_SYSFS) += net-sysfs.o +obj-$(CONFIG_NETFILTER) += netfilter.o +obj-$(CONFIG_NET_DIVERT) += dv.o +obj-$(CONFIG_NET_PKTGEN) += pktgen.o +obj-$(CONFIG_NET_RADIO) += wireless.o +obj-$(CONFIG_NETPOLL) += netpoll.o diff --git a/net/core/datagram.c b/net/core/datagram.c new file mode 100644 index 000000000000..d1bfd279cc1a --- /dev/null +++ b/net/core/datagram.c @@ -0,0 +1,482 @@ +/* + * SUCS NET3: + * + * Generic datagram handling routines. These are generic for all + * protocols. Possibly a generic IP version on top of these would + * make sense. Not tonight however 8-). + * This is used because UDP, RAW, PACKET, DDP, IPX, AX.25 and + * NetROM layer all have identical poll code and mostly + * identical recvmsg() code. So we share it here. The poll was + * shared before but buried in udp.c so I moved it. + * + * Authors: Alan Cox . (datagram_poll() from old + * udp.c code) + * + * Fixes: + * Alan Cox : NULL return from skb_peek_copy() + * understood + * Alan Cox : Rewrote skb_read_datagram to avoid the + * skb_peek_copy stuff. + * Alan Cox : Added support for SOCK_SEQPACKET. + * IPX can no longer use the SO_TYPE hack + * but AX.25 now works right, and SPX is + * feasible. + * Alan Cox : Fixed write poll of non IP protocol + * crash. + * Florian La Roche: Changed for my new skbuff handling. + * Darryl Miles : Fixed non-blocking SOCK_SEQPACKET. + * Linus Torvalds : BSD semantic fixes. + * Alan Cox : Datagram iovec handling + * Darryl Miles : Fixed non-blocking SOCK_STREAM. + * Alan Cox : POSIXisms + * Pete Wyckoff : Unconnected accept() fix. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +/* + * Is a socket 'connection oriented' ? + */ +static inline int connection_based(struct sock *sk) +{ + return sk->sk_type == SOCK_SEQPACKET || sk->sk_type == SOCK_STREAM; +} + +/* + * Wait for a packet.. + */ +static int wait_for_packet(struct sock *sk, int *err, long *timeo_p) +{ + int error; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out_err; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + goto out; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out_noerr; + + /* Sequenced packets can come disconnected. 
+ * If so we report the problem + */ + error = -ENOTCONN; + if (connection_based(sk) && + !(sk->sk_state == TCP_ESTABLISHED || sk->sk_state == TCP_LISTEN)) + goto out_err; + + /* handle signals */ + if (signal_pending(current)) + goto interrupted; + + error = 0; + *timeo_p = schedule_timeout(*timeo_p); +out: + finish_wait(sk->sk_sleep, &wait); + return error; +interrupted: + error = sock_intr_errno(*timeo_p); +out_err: + *err = error; + goto out; +out_noerr: + *err = 0; + error = 1; + goto out; +} + +/** + * skb_recv_datagram - Receive a datagram skbuff + * @sk - socket + * @flags - MSG_ flags + * @noblock - blocking operation? + * @err - error code returned + * + * Get a datagram skbuff, understands the peeking, nonblocking wakeups + * and possible races. This replaces identical code in packet, raw and + * udp, as well as the IPX AX.25 and Appletalk. It also finally fixes + * the long standing peek and read race for datagram sockets. If you + * alter this routine remember it must be re-entrant. + * + * This function will lock the socket if a skb is returned, so the caller + * needs to unlock the socket in that case (usually by calling + * skb_free_datagram) + * + * * It does not lock socket since today. This function is + * * free of race conditions. This measure should/can improve + * * significantly datagram socket latencies at high loads, + * * when data copying to user space takes lots of time. + * * (BTW I've just killed the last cli() in IP/IPv6/core/netlink/packet + * * 8) Great win.) + * * --ANK (980729) + * + * The order of the tests when we find no data waiting are specified + * quite explicitly by POSIX 1003.1g, don't change them without having + * the standard around please. + */ +struct sk_buff *skb_recv_datagram(struct sock *sk, unsigned flags, + int noblock, int *err) +{ + struct sk_buff *skb; + long timeo; + /* + * Caller is allowed not to check sk->sk_err before skb_recv_datagram() + */ + int error = sock_error(sk); + + if (error) + goto no_packet; + + timeo = sock_rcvtimeo(sk, noblock); + + do { + /* Again only user level code calls this function, so nothing + * interrupt level will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... + * However, this function was corrent in any case. 8) + */ + if (flags & MSG_PEEK) { + unsigned long cpu_flags; + + spin_lock_irqsave(&sk->sk_receive_queue.lock, + cpu_flags); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + atomic_inc(&skb->users); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, + cpu_flags); + } else + skb = skb_dequeue(&sk->sk_receive_queue); + + if (skb) + return skb; + + /* User doesn't want to wait */ + error = -EAGAIN; + if (!timeo) + goto no_packet; + + } while (!wait_for_packet(sk, err, &timeo)); + + return NULL; + +no_packet: + *err = error; + return NULL; +} + +void skb_free_datagram(struct sock *sk, struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/** + * skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * + * Note: the iovec is modified during the copy. + */ +int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset, + struct iovec *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + + /* Copy header. 
*/ + if (copy > 0) { + if (copy > len) + copy = len; + if (memcpy_toiovec(to, skb->data + offset, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = memcpy_toiovec(to, vaddr + frag->page_offset + + offset - start, copy); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iovec(list, + offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset, + u8 __user *to, int len, + unsigned int *csump) +{ + int start = skb_headlen(skb); + int pos = 0; + int i, copy = start - offset; + + /* Copy header. */ + if (copy > 0) { + int err = 0; + if (copy > len) + copy = len; + *csump = csum_and_copy_to_user(skb->data + offset, to, copy, + *csump, &err); + if (err) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + unsigned int csum2; + int err = 0; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + vaddr = kmap(page); + csum2 = csum_and_copy_to_user(vaddr + + frag->page_offset + + offset - start, + to, copy, 0, &err); + kunmap(page); + if (err) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if (!(len -= copy)) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list=list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + unsigned int csum2 = 0; + if (copy > len) + copy = len; + if (skb_copy_and_csum_datagram(list, + offset - start, + to, copy, + &csum2)) + goto fault; + *csump = csum_block_add(*csump, csum2, pos); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +/** + * skb_copy_and_csum_datagram_iovec - Copy and checkum skb to user iovec. + * @skb - skbuff + * @hlen - hardware length + * @iovec - io vector + * + * Caller _must_ check that skb will fit to this iovec. + * + * Returns: 0 - success. + * -EINVAL - checksum failure. + * -EFAULT - fault during copy. Beware, in this case iovec + * can be modified! 
+ */ +int skb_copy_and_csum_datagram_iovec(const struct sk_buff *skb, + int hlen, struct iovec *iov) +{ + unsigned int csum; + int chunk = skb->len - hlen; + + /* Skip filled elements. + * Pretty silly, look at memcpy_toiovec, though 8) + */ + while (!iov->iov_len) + iov++; + + if (iov->iov_len < chunk) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, chunk + hlen, + skb->csum))) + goto csum_error; + if (skb_copy_datagram_iovec(skb, hlen, iov, chunk)) + goto fault; + } else { + csum = csum_partial(skb->data, hlen, skb->csum); + if (skb_copy_and_csum_datagram(skb, hlen, iov->iov_base, + chunk, &csum)) + goto fault; + if ((unsigned short)csum_fold(csum)) + goto csum_error; + iov->iov_len -= chunk; + iov->iov_base += chunk; + } + return 0; +csum_error: + return -EINVAL; +fault: + return -EFAULT; +} + +/** + * datagram_poll - generic datagram poll + * @file - file struct + * @sock - socket + * @wait - poll table + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + */ +unsigned int datagram_poll(struct file *file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { + if (sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + /* connection hasn't started yet? */ + if (sk->sk_state == TCP_SYN_SENT) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + else + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + return mask; +} + +EXPORT_SYMBOL(datagram_poll); +EXPORT_SYMBOL(skb_copy_and_csum_datagram_iovec); +EXPORT_SYMBOL(skb_copy_datagram_iovec); +EXPORT_SYMBOL(skb_free_datagram); +EXPORT_SYMBOL(skb_recv_datagram); diff --git a/net/core/dev.c b/net/core/dev.c new file mode 100644 index 000000000000..42344d903692 --- /dev/null +++ b/net/core/dev.c @@ -0,0 +1,3359 @@ +/* + * NET3 Protocol independent device support routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the non IP parts of dev.c 1.0.19 + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * + * Additional Authors: + * Florian la Roche + * Alan Cox + * David Hinds + * Alexey Kuznetsov + * Adam Sulmicki + * Pekka Riikonen + * + * Changes: + * D.J. Barrow : Fixed bug where dev->refcnt gets set + * to 2 if register_netdev gets called + * before net_dev_init & also removed a + * few lines of code in the process. + * Alan Cox : device private ioctl copies fields back. + * Alan Cox : Transmit queue code does relevant + * stunts to keep the queue safe. + * Alan Cox : Fixed double lock. + * Alan Cox : Fixed promisc NULL pointer trap + * ???????? 
: Support the full private ioctl range + * Alan Cox : Moved ioctl permission check into + * drivers + * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI + * Alan Cox : 100 backlog just doesn't cut it when + * you start doing multicast video 8) + * Alan Cox : Rewrote net_bh and list manager. + * Alan Cox : Fix ETH_P_ALL echoback lengths. + * Alan Cox : Took out transmit every packet pass + * Saved a few bytes in the ioctl handler + * Alan Cox : Network driver sets packet type before + * calling netif_rx. Saves a function + * call a packet. + * Alan Cox : Hashed net_bh() + * Richard Kooijman: Timestamp fixes. + * Alan Cox : Wrong field in SIOCGIFDSTADDR + * Alan Cox : Device lock protection. + * Alan Cox : Fixed nasty side effect of device close + * changes. + * Rudi Cilibrasi : Pass the right thing to + * set_mac_address() + * Dave Miller : 32bit quantity for the device lock to + * make it work out on a Sparc. + * Bjorn Ekwall : Added KERNELD hack. + * Alan Cox : Cleaned up the backlog initialise. + * Craig Metz : SIOCGIFCONF fix if space for under + * 1 device. + * Thomas Bogendoerfer : Return ENODEV for dev_open, if there + * is no device open function. + * Andi Kleen : Fix error reporting for SIOCGIFCONF + * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF + * Cyrus Durgin : Cleaned for KMOD + * Adam Sulmicki : Bug Fix : Network Device Unload + * A network device unload needs to purge + * the backlog queue. + * Paul Rusty Russell : SIOCSIFNAME + * Pekka Riikonen : Netdev boot-time settings code + * Andrew Morton : Make unregister_netdevice wait + * indefinitely on dev->refcnt + * J Hadi Salim : - Backlog queue sampling + * - netif_rx() feedback + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NET_RADIO +#include /* Note : will define WIRELESS_EXT */ +#include +#endif /* CONFIG_NET_RADIO */ +#include + +/* This define, if set, will randomly drop a packet when congestion + * is more than moderate. It helps fairness in the multi-interface + * case when one of them is a hog, but it kills performance for the + * single interface case so it is off now by default. + */ +#undef RAND_LIE + +/* Setting this will sample the queue lengths and thus congestion + * via a timer instead of as each packet is received. + */ +#undef OFFLINE_SAMPLE + +/* + * The list of packet types we will receive (as opposed to discard) + * and the routines to invoke. + * + * Why 16. Because with 16 the only overlap we get on a hash of the + * low nibble of the protocol value is RARP/SNAP/X.25. + * + * NOTE: That is no longer true with the addition of VLAN tags. Not + * sure which should go first, but I bet it won't make much + * difference if we are running VLANs. The good news is that + * this protocol won't be in the list unless compiled in, so + * the average user (w/out VLANs) will not be adversly affected. 
+ * --BLG + * + * 0800 IP + * 8100 802.1Q VLAN + * 0001 802.3 + * 0002 AX.25 + * 0004 802.2 + * 8035 RARP + * 0005 SNAP + * 0805 X.25 + * 0806 ARP + * 8137 IPX + * 0009 Localtalk + * 86DD IPv6 + */ + +static DEFINE_SPINLOCK(ptype_lock); +static struct list_head ptype_base[16]; /* 16 way hashed list */ +static struct list_head ptype_all; /* Taps */ + +#ifdef OFFLINE_SAMPLE +static void sample_queue(unsigned long dummy); +static struct timer_list samp_timer = TIMER_INITIALIZER(sample_queue, 0, 0); +#endif + +/* + * The @dev_base list is protected by @dev_base_lock and the rtln + * semaphore. + * + * Pure readers hold dev_base_lock for reading. + * + * Writers must hold the rtnl semaphore while they loop through the + * dev_base list, and hold dev_base_lock for writing when they do the + * actual updates. This allows pure readers to access the list even + * while a writer is preparing to update it. + * + * To put it another way, dev_base_lock is held for writing only to + * protect against pure readers; the rtnl semaphore provides the + * protection against other writers. + * + * See, for example usages, register_netdevice() and + * unregister_netdevice(), which must be called with the rtnl + * semaphore held. + */ +struct net_device *dev_base; +static struct net_device **dev_tail = &dev_base; +DEFINE_RWLOCK(dev_base_lock); + +EXPORT_SYMBOL(dev_base); +EXPORT_SYMBOL(dev_base_lock); + +#define NETDEV_HASHBITS 8 +static struct hlist_head dev_name_head[1<type == htons(ETH_P_ALL)) { + netdev_nit++; + list_add_rcu(&pt->list, &ptype_all); + } else { + hash = ntohs(pt->type) & 15; + list_add_rcu(&pt->list, &ptype_base[hash]); + } + spin_unlock_bh(&ptype_lock); +} + +extern void linkwatch_run_queue(void); + + + +/** + * __dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * The packet type might still be in use by receivers + * and must not be freed until after all the CPU's have gone + * through a quiescent state. + */ +void __dev_remove_pack(struct packet_type *pt) +{ + struct list_head *head; + struct packet_type *pt1; + + spin_lock_bh(&ptype_lock); + + if (pt->type == htons(ETH_P_ALL)) { + netdev_nit--; + head = &ptype_all; + } else + head = &ptype_base[ntohs(pt->type) & 15]; + + list_for_each_entry(pt1, head, list) { + if (pt == pt1) { + list_del_rcu(&pt->list); + goto out; + } + } + + printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); +out: + spin_unlock_bh(&ptype_lock); +} +/** + * dev_remove_pack - remove packet handler + * @pt: packet type declaration + * + * Remove a protocol handler that was previously added to the kernel + * protocol handlers by dev_add_pack(). The passed &packet_type is removed + * from the kernel lists and can be freed or reused once this function + * returns. + * + * This call sleeps to guarantee that no CPU is looking at the packet + * type after return. 
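+ *
+ * Pairing sketch (illustrative, not part of this patch): a handler is
+ * normally installed and torn down with a static &packet_type, e.g.
+ *
+ *	static struct packet_type my_pt = {
+ *		.type	= htons(ETH_P_IP),
+ *		.func	= my_rcv,
+ *	};
+ *
+ *	dev_add_pack(&my_pt);
+ *	...
+ *	dev_remove_pack(&my_pt);
+ *
+ * my_pt and my_rcv are made-up names; leaving .dev NULL means the
+ * handler sees matching packets from every device.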
+ */ +void dev_remove_pack(struct packet_type *pt) +{ + __dev_remove_pack(pt); + + synchronize_net(); +} + +/****************************************************************************** + + Device Boot-time Settings Routines + +*******************************************************************************/ + +/* Boot time configuration table */ +static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX]; + +/** + * netdev_boot_setup_add - add new setup entry + * @name: name of the device + * @map: configured settings for the device + * + * Adds new setup entry to the dev_boot_setup list. The function + * returns 0 on error and 1 on success. This is a generic routine to + * all netdevices. + */ +static int netdev_boot_setup_add(char *name, struct ifmap *map) +{ + struct netdev_boot_setup *s; + int i; + + s = dev_boot_setup; + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] == '\0' || s[i].name[0] == ' ') { + memset(s[i].name, 0, sizeof(s[i].name)); + strcpy(s[i].name, name); + memcpy(&s[i].map, map, sizeof(s[i].map)); + break; + } + } + + return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1; +} + +/** + * netdev_boot_setup_check - check boot time settings + * @dev: the netdevice + * + * Check boot time settings for the device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found, 1 if they are. + */ +int netdev_boot_setup_check(struct net_device *dev) +{ + struct netdev_boot_setup *s = dev_boot_setup; + int i; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) { + if (s[i].name[0] != '\0' && s[i].name[0] != ' ' && + !strncmp(dev->name, s[i].name, strlen(s[i].name))) { + dev->irq = s[i].map.irq; + dev->base_addr = s[i].map.base_addr; + dev->mem_start = s[i].map.mem_start; + dev->mem_end = s[i].map.mem_end; + return 1; + } + } + return 0; +} + + +/** + * netdev_boot_base - get address from boot time settings + * @prefix: prefix for network device + * @unit: id for network device + * + * Check boot time settings for the base address of device. + * The found settings are set for the device to be used + * later in the device probing. + * Returns 0 if no settings found. + */ +unsigned long netdev_boot_base(const char *prefix, int unit) +{ + const struct netdev_boot_setup *s = dev_boot_setup; + char name[IFNAMSIZ]; + int i; + + sprintf(name, "%s%d", prefix, unit); + + /* + * If device already registered then return base of 1 + * to indicate not to probe for this interface + */ + if (__dev_get_by_name(name)) + return 1; + + for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) + if (!strcmp(name, s[i].name)) + return s[i].map.base_addr; + return 0; +} + +/* + * Saves at boot time configured settings for any netdevice. 
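+ *
+ * The string is whatever follows "netdev=" on the kernel command line:
+ * up to four integers (irq, base I/O address, memory start, memory end)
+ * and then the device name, e.g. (illustrative values)
+ *
+ *	netdev=9,0x300,0,0,eth0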
+ */ +int __init netdev_boot_setup(char *str) +{ + int ints[5]; + struct ifmap map; + + str = get_options(str, ARRAY_SIZE(ints), ints); + if (!str || !*str) + return 0; + + /* Save settings */ + memset(&map, 0, sizeof(map)); + if (ints[0] > 0) + map.irq = ints[1]; + if (ints[0] > 1) + map.base_addr = ints[2]; + if (ints[0] > 2) + map.mem_start = ints[3]; + if (ints[0] > 3) + map.mem_end = ints[4]; + + /* Add new entry to the list */ + return netdev_boot_setup_add(str, &map); +} + +__setup("netdev=", netdev_boot_setup); + +/******************************************************************************* + + Device Interface Subroutines + +*******************************************************************************/ + +/** + * __dev_get_by_name - find a device by its name + * @name: name to find + * + * Find an interface by name. Must be called under RTNL semaphore + * or @dev_base_lock. If the name is found a pointer to the device + * is returned. If the name is not found then %NULL is returned. The + * reference counters are not incremented so the caller must be + * careful with locks. + */ + +struct net_device *__dev_get_by_name(const char *name) +{ + struct hlist_node *p; + + hlist_for_each(p, dev_name_hash(name)) { + struct net_device *dev + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(dev->name, name, IFNAMSIZ)) + return dev; + } + return NULL; +} + +/** + * dev_get_by_name - find a device by its name + * @name: name to find + * + * Find an interface by name. This can be called from any + * context and does its own locking. The returned handle has + * the usage count incremented and the caller must use dev_put() to + * release it when it is no longer needed. %NULL is returned if no + * matching device is found. + */ + +struct net_device *dev_get_by_name(const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(name); + if (dev) + dev_hold(dev); + read_unlock(&dev_base_lock); + return dev; +} + +/** + * __dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * + * Search for an interface by index. Returns %NULL if the device + * is not found or a pointer to the device. The device has not + * had its reference counter increased so the caller must be careful + * about locking. The caller must hold either the RTNL semaphore + * or @dev_base_lock. + */ + +struct net_device *__dev_get_by_index(int ifindex) +{ + struct hlist_node *p; + + hlist_for_each(p, dev_index_hash(ifindex)) { + struct net_device *dev + = hlist_entry(p, struct net_device, index_hlist); + if (dev->ifindex == ifindex) + return dev; + } + return NULL; +} + + +/** + * dev_get_by_index - find a device by its ifindex + * @ifindex: index of device + * + * Search for an interface by index. Returns NULL if the device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device *dev_get_by_index(int ifindex) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifindex); + if (dev) + dev_hold(dev); + read_unlock(&dev_base_lock); + return dev; +} + +/** + * dev_getbyhwaddr - find a device by its hardware address + * @type: media type of device + * @ha: hardware address + * + * Search for an interface by MAC address. Returns NULL if the device + * is not found or a pointer to the device. The caller must hold the + * rtnl semaphore. 
The returned device has not had its ref count increased + * and the caller must therefore be careful about locking + * + * BUGS: + * If the API was consistent this would be __dev_get_by_hwaddr + */ + +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) +{ + struct net_device *dev; + + ASSERT_RTNL(); + + for (dev = dev_base; dev; dev = dev->next) + if (dev->type == type && + !memcmp(dev->dev_addr, ha, dev->addr_len)) + break; + return dev; +} + +struct net_device *dev_getfirstbyhwtype(unsigned short type) +{ + struct net_device *dev; + + rtnl_lock(); + for (dev = dev_base; dev; dev = dev->next) { + if (dev->type == type) { + dev_hold(dev); + break; + } + } + rtnl_unlock(); + return dev; +} + +EXPORT_SYMBOL(dev_getfirstbyhwtype); + +/** + * dev_get_by_flags - find any device with given flags + * @if_flags: IFF_* values + * @mask: bitmask of bits in if_flags to check + * + * Search for any interface with the given flags. Returns NULL if a device + * is not found or a pointer to the device. The device returned has + * had a reference added and the pointer is safe until the user calls + * dev_put to indicate they have finished with it. + */ + +struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if (((dev->flags ^ if_flags) & mask) == 0) { + dev_hold(dev); + break; + } + } + read_unlock(&dev_base_lock); + return dev; +} + +/** + * dev_valid_name - check if name is okay for network device + * @name: name string + * + * Network device names need to be valid file names to + * to allow sysfs to work + */ +static int dev_valid_name(const char *name) +{ + return !(*name == '\0' + || !strcmp(name, ".") + || !strcmp(name, "..") + || strchr(name, '/')); +} + +/** + * dev_alloc_name - allocate a name for a device + * @dev: device + * @name: name format string + * + * Passed a format string - eg "lt%d" it will try and find a suitable + * id. Not efficient for many devices, not called a lot. The caller + * must hold the dev_base or rtnl lock while allocating the name and + * adding the device in order to avoid duplicates. Returns the number + * of the unit assigned or a negative errno code. + */ + +int dev_alloc_name(struct net_device *dev, const char *name) +{ + int i = 0; + char buf[IFNAMSIZ]; + const char *p; + const int max_netdevices = 8*PAGE_SIZE; + long *inuse; + struct net_device *d; + + p = strnchr(name, IFNAMSIZ-1, '%'); + if (p) { + /* + * Verify the string as this thing may have come from + * the user. There must be either one "%d" and no other "%" + * characters. + */ + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + + /* Use one page as a bit array of possible slots */ + inuse = (long *) get_zeroed_page(GFP_ATOMIC); + if (!inuse) + return -ENOMEM; + + for (d = dev_base; d; d = d->next) { + if (!sscanf(d->name, name, &i)) + continue; + if (i < 0 || i >= max_netdevices) + continue; + + /* avoid cases where sscanf is not exact inverse of printf */ + snprintf(buf, sizeof(buf), name, i); + if (!strncmp(buf, d->name, IFNAMSIZ)) + set_bit(i, inuse); + } + + i = find_first_zero_bit(inuse, max_netdevices); + free_page((unsigned long) inuse); + } + + snprintf(buf, sizeof(buf), name, i); + if (!__dev_get_by_name(buf)) { + strlcpy(dev->name, buf, IFNAMSIZ); + return i; + } + + /* It is possible to run out of possible slots + * when the name is long and there isn't enough space left + * for the digits, or if all bits are used. 
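+	 * With a format such as "eth%d" (illustrative) this shows up as the
+	 * expanded name colliding with an existing device, e.g. because the
+	 * digits were truncated away by the IFNAMSIZ limit.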
+ */ + return -ENFILE; +} + + +/** + * dev_change_name - change name of a device + * @dev: device + * @newname: name (or format string) must be at least IFNAMSIZ + * + * Change name of a device, can pass format strings "eth%d". + * for wildcarding. + */ +int dev_change_name(struct net_device *dev, char *newname) +{ + int err = 0; + + ASSERT_RTNL(); + + if (dev->flags & IFF_UP) + return -EBUSY; + + if (!dev_valid_name(newname)) + return -EINVAL; + + if (strchr(newname, '%')) { + err = dev_alloc_name(dev, newname); + if (err < 0) + return err; + strcpy(newname, dev->name); + } + else if (__dev_get_by_name(newname)) + return -EEXIST; + else + strlcpy(dev->name, newname, IFNAMSIZ); + + err = class_device_rename(&dev->class_dev, dev->name); + if (!err) { + hlist_del(&dev->name_hlist); + hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name)); + notifier_call_chain(&netdev_chain, NETDEV_CHANGENAME, dev); + } + + return err; +} + +/** + * netdev_state_change - device changes state + * @dev: device to cause notification + * + * Called to indicate a device has changed state. This function calls + * the notifier chains for netdev_chain and sends a NEWLINK message + * to the routing socket. + */ +void netdev_state_change(struct net_device *dev) +{ + if (dev->flags & IFF_UP) { + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + } +} + +/** + * dev_load - load a network module + * @name: name of interface + * + * If a network interface is not present and the process has suitable + * privileges this function loads the module. If module loading is not + * available in this kernel then it becomes a nop. + */ + +void dev_load(const char *name) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + dev = __dev_get_by_name(name); + read_unlock(&dev_base_lock); + + if (!dev && capable(CAP_SYS_MODULE)) + request_module("%s", name); +} + +static int default_rebuild_header(struct sk_buff *skb) +{ + printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n", + skb->dev ? skb->dev->name : "NULL!!!"); + kfree_skb(skb); + return 1; +} + + +/** + * dev_open - prepare an interface for use. + * @dev: device to open + * + * Takes a device from down to up state. The device's private open + * function is invoked and then the multicast lists are loaded. Finally + * the device is moved into the up state and a %NETDEV_UP message is + * sent to the netdev notifier chain. + * + * Calling this function on an active interface is a nop. On a failure + * a negative errno code is returned. + */ +int dev_open(struct net_device *dev) +{ + int ret = 0; + + /* + * Is it already up? + */ + + if (dev->flags & IFF_UP) + return 0; + + /* + * Is it even present? + */ + if (!netif_device_present(dev)) + return -ENODEV; + + /* + * Call device private open method + */ + set_bit(__LINK_STATE_START, &dev->state); + if (dev->open) { + ret = dev->open(dev); + if (ret) + clear_bit(__LINK_STATE_START, &dev->state); + } + + /* + * If it went open OK then: + */ + + if (!ret) { + /* + * Set the flags. + */ + dev->flags |= IFF_UP; + + /* + * Initialize multicasting status + */ + dev_mc_upload(dev); + + /* + * Wakeup transmit queue engine + */ + dev_activate(dev); + + /* + * ... and announce new interface. + */ + notifier_call_chain(&netdev_chain, NETDEV_UP, dev); + } + return ret; +} + +/** + * dev_close - shutdown an interface. + * @dev: device to shutdown + * + * This function moves an active device into down state. A + * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. 
The device + * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier + * chain. + */ +int dev_close(struct net_device *dev) +{ + if (!(dev->flags & IFF_UP)) + return 0; + + /* + * Tell people we are going down, so that they can + * prepare to death, when device is still operating. + */ + notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev); + + dev_deactivate(dev); + + clear_bit(__LINK_STATE_START, &dev->state); + + /* Synchronize to scheduled poll. We cannot touch poll list, + * it can be even on different cpu. So just clear netif_running(), + * and wait when poll really will happen. Actually, the best place + * for this is inside dev->stop() after device stopped its irq + * engine, but this requires more changes in devices. */ + + smp_mb__after_clear_bit(); /* Commit netif_running(). */ + while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) { + /* No hurry. */ + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } + + /* + * Call the device specific close. This cannot fail. + * Only if device is UP + * + * We allow it to be called even after a DETACH hot-plug + * event. + */ + if (dev->stop) + dev->stop(dev); + + /* + * Device is now down. + */ + + dev->flags &= ~IFF_UP; + + /* + * Tell people we are down + */ + notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev); + + return 0; +} + + +/* + * Device change register/unregister. These are not inline or static + * as we export them to the world. + */ + +/** + * register_netdevice_notifier - register a network notifier block + * @nb: notifier + * + * Register a notifier to be called when network device events occur. + * The notifier passed is linked into the kernel structures and must + * not be reused until it has been unregistered. A negative errno code + * is returned on a failure. + * + * When registered all registration and up events are replayed + * to the new notifier to allow device to have a race free + * view of the network device list. + */ + +int register_netdevice_notifier(struct notifier_block *nb) +{ + struct net_device *dev; + int err; + + rtnl_lock(); + err = notifier_chain_register(&netdev_chain, nb); + if (!err) { + for (dev = dev_base; dev; dev = dev->next) { + nb->notifier_call(nb, NETDEV_REGISTER, dev); + + if (dev->flags & IFF_UP) + nb->notifier_call(nb, NETDEV_UP, dev); + } + } + rtnl_unlock(); + return err; +} + +/** + * unregister_netdevice_notifier - unregister a network notifier block + * @nb: notifier + * + * Unregister a notifier previously registered by + * register_netdevice_notifier(). The notifier is unlinked into the + * kernel structures and may then be reused. A negative errno code + * is returned on a failure. + */ + +int unregister_netdevice_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&netdev_chain, nb); +} + +/** + * call_netdevice_notifiers - call all network notifier blocks + * @val: value passed unmodified to notifier function + * @v: pointer passed unmodified to notifier function + * + * Call all network notifier blocks. Parameters and return value + * are as for notifier_call_chain(). 
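+ *
+ * Listener sketch (illustrative, not part of this patch): each registered
+ * block is invoked as nb->notifier_call(nb, val, v), with v normally the
+ * &struct net_device concerned, so a consumer looks roughly like
+ *
+ *	static int my_netdev_event(struct notifier_block *nb,
+ *				   unsigned long event, void *ptr)
+ *	{
+ *		struct net_device *dev = ptr;
+ *
+ *		if (event == NETDEV_UP)
+ *			...
+ *		return NOTIFY_DONE;
+ *	}
+ *
+ *	static struct notifier_block my_nb = {
+ *		.notifier_call = my_netdev_event,
+ *	};
+ *
+ *	register_netdevice_notifier(&my_nb);
+ *
+ * my_netdev_event and my_nb are made-up names.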
+ */ + +int call_netdevice_notifiers(unsigned long val, void *v) +{ + return notifier_call_chain(&netdev_chain, val, v); +} + +/* When > 0 there are consumers of rx skb time stamps */ +static atomic_t netstamp_needed = ATOMIC_INIT(0); + +void net_enable_timestamp(void) +{ + atomic_inc(&netstamp_needed); +} + +void net_disable_timestamp(void) +{ + atomic_dec(&netstamp_needed); +} + +static inline void net_timestamp(struct timeval *stamp) +{ + if (atomic_read(&netstamp_needed)) + do_gettimeofday(stamp); + else { + stamp->tv_sec = 0; + stamp->tv_usec = 0; + } +} + +/* + * Support routine. Sends outgoing frames to any network + * taps currently in use. + */ + +void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) +{ + struct packet_type *ptype; + net_timestamp(&skb->stamp); + + rcu_read_lock(); + list_for_each_entry_rcu(ptype, &ptype_all, list) { + /* Never send packets back to the socket + * they originated from - MvS (miquels@drinkel.ow.org) + */ + if ((ptype->dev == dev || !ptype->dev) && + (ptype->af_packet_priv == NULL || + (struct sock *)ptype->af_packet_priv != skb->sk)) { + struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC); + if (!skb2) + break; + + /* skb->nh should be correctly + set by sender, so that the second statement is + just protection against buggy protocols. + */ + skb2->mac.raw = skb2->data; + + if (skb2->nh.raw < skb2->data || + skb2->nh.raw > skb2->tail) { + if (net_ratelimit()) + printk(KERN_CRIT "protocol %04x is " + "buggy, dev %s\n", + skb2->protocol, dev->name); + skb2->nh.raw = skb2->data; + } + + skb2->h.raw = skb2->nh.raw; + skb2->pkt_type = PACKET_OUTGOING; + ptype->func(skb2, skb->dev, ptype); + } + } + rcu_read_unlock(); +} + +/* + * Invalidate hardware checksum when packet is to be mangled, and + * complete checksum manually on outgoing path. + */ +int skb_checksum_help(struct sk_buff *skb, int inward) +{ + unsigned int csum; + int ret = 0, offset = skb->h.raw - skb->data; + + if (inward) { + skb->ip_summed = CHECKSUM_NONE; + goto out; + } + + if (skb_cloned(skb)) { + ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + if (ret) + goto out; + } + + if (offset > (int)skb->len) + BUG(); + csum = skb_checksum(skb, offset, skb->len-offset, 0); + + offset = skb->tail - skb->h.raw; + if (offset <= 0) + BUG(); + if (skb->csum + 2 > offset) + BUG(); + + *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; +out: + return ret; +} + +#ifdef CONFIG_HIGHMEM +/* Actually, we should eliminate this check as soon as we know, that: + * 1. IOMMU is present and allows to map all the memory. + * 2. No high memory really exists on this machine. 
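+ *
+ * (Until then the check only exists when CONFIG_HIGHMEM is set; the
+ * !CONFIG_HIGHMEM build below collapses illegal_highdma() to 0.)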
+ */ + +static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) +{ + int i; + + if (dev->features & NETIF_F_HIGHDMA) + return 0; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + if (PageHighMem(skb_shinfo(skb)->frags[i].page)) + return 1; + + return 0; +} +#else +#define illegal_highdma(dev, skb) (0) +#endif + +extern void skb_release_data(struct sk_buff *); + +/* Keep head the same: replace data */ +int __skb_linearize(struct sk_buff *skb, int gfp_mask) +{ + unsigned int size; + u8 *data; + long offset; + struct skb_shared_info *ninfo; + int headerlen = skb->data - skb->head; + int expand = (skb->tail + skb->data_len) - skb->end; + + if (skb_shared(skb)) + BUG(); + + if (expand <= 0) + expand = 0; + + size = skb->end - skb->head + expand; + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + return -ENOMEM; + + /* Copy entire thing */ + if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) + BUG(); + + /* Set up shinfo */ + ninfo = (struct skb_shared_info*)(data + size); + atomic_set(&ninfo->dataref, 1); + ninfo->tso_size = skb_shinfo(skb)->tso_size; + ninfo->tso_segs = skb_shinfo(skb)->tso_segs; + ninfo->nr_frags = 0; + ninfo->frag_list = NULL; + + /* Offset between the two in bytes */ + offset = data - skb->head; + + /* Free old data. */ + skb_release_data(skb); + + skb->head = data; + skb->end = data + size; + + /* Set up new pointers */ + skb->h.raw += offset; + skb->nh.raw += offset; + skb->mac.raw += offset; + skb->tail += offset; + skb->data += offset; + + /* We are no longer a clone, even if we were. */ + skb->cloned = 0; + + skb->tail += skb->data_len; + skb->data_len = 0; + return 0; +} + +#define HARD_TX_LOCK(dev, cpu) { \ + if ((dev->features & NETIF_F_LLTX) == 0) { \ + spin_lock(&dev->xmit_lock); \ + dev->xmit_lock_owner = cpu; \ + } \ +} + +#define HARD_TX_UNLOCK(dev) { \ + if ((dev->features & NETIF_F_LLTX) == 0) { \ + dev->xmit_lock_owner = -1; \ + spin_unlock(&dev->xmit_lock); \ + } \ +} + +/** + * dev_queue_xmit - transmit a buffer + * @skb: buffer to transmit + * + * Queue a buffer for transmission to a network device. The caller must + * have set the device and priority and built the buffer before calling + * this function. The function can be called from an interrupt. + * + * A negative errno code is returned on a failure. A success does not + * guarantee the frame will be transmitted as it may be dropped due + * to congestion or traffic shaping. + */ + +int dev_queue_xmit(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct Qdisc *q; + int rc = -ENOMEM; + + if (skb_shinfo(skb)->frag_list && + !(dev->features & NETIF_F_FRAGLIST) && + __skb_linearize(skb, GFP_ATOMIC)) + goto out_kfree_skb; + + /* Fragmented skb is linearized if device does not support SG, + * or if at least one of fragments is in highmem and device + * does not support DMA from it. + */ + if (skb_shinfo(skb)->nr_frags && + (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && + __skb_linearize(skb, GFP_ATOMIC)) + goto out_kfree_skb; + + /* If packet is not checksummed and device does not support + * checksumming for this protocol, complete checksumming here. + */ + if (skb->ip_summed == CHECKSUM_HW && + (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && + (!(dev->features & NETIF_F_IP_CSUM) || + skb->protocol != htons(ETH_P_IP)))) + if (skb_checksum_help(skb, 0)) + goto out_kfree_skb; + + /* Disable soft irqs for various locks below. Also + * stops preemption for RCU. 
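+ *
+ * In other words, the rcu_dereference(dev->qdisc) below stays valid until
+ * the matching local_bh_enable(), because this CPU cannot pass through a
+ * quiescent state while bottom halves are off.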
+ */ + local_bh_disable(); + + /* Updates of qdisc are serialized by queue_lock. + * The struct Qdisc which is pointed to by qdisc is now a + * rcu structure - it may be accessed without acquiring + * a lock (but the structure may be stale.) The freeing of the + * qdisc will be deferred until it's known that there are no + * more references to it. + * + * If the qdisc has an enqueue function, we still need to + * hold the queue_lock before calling it, since queue_lock + * also serializes access to the device queue. + */ + + q = rcu_dereference(dev->qdisc); +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS); +#endif + if (q->enqueue) { + /* Grab device queue */ + spin_lock(&dev->queue_lock); + + rc = q->enqueue(skb, q); + + qdisc_run(dev); + + spin_unlock(&dev->queue_lock); + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; + } + + /* The device has no queue. Common case for software devices: + loopback, all the sorts of tunnels... + + Really, it is unlikely that xmit_lock protection is necessary here. + (f.e. loopback and IP tunnels are clean ignoring statistics + counters.) + However, it is possible, that they rely on protection + made by us here. + + Check this and shot the lock. It is not prone from deadlocks. + Either shot noqueue qdisc, it is even simpler 8) + */ + if (dev->flags & IFF_UP) { + int cpu = smp_processor_id(); /* ok because BHs are off */ + + if (dev->xmit_lock_owner != cpu) { + + HARD_TX_LOCK(dev, cpu); + + if (!netif_queue_stopped(dev)) { + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + rc = 0; + if (!dev->hard_start_xmit(skb, dev)) { + HARD_TX_UNLOCK(dev); + goto out; + } + } + HARD_TX_UNLOCK(dev); + if (net_ratelimit()) + printk(KERN_CRIT "Virtual device %s asks to " + "queue packet!\n", dev->name); + } else { + /* Recursion is detected! It is possible, + * unfortunately */ + if (net_ratelimit()) + printk(KERN_CRIT "Dead loop on virtual device " + "%s, fix it urgently!\n", dev->name); + } + } + + rc = -ENETDOWN; + local_bh_enable(); + +out_kfree_skb: + kfree_skb(skb); + return rc; +out: + local_bh_enable(); + return rc; +} + + +/*======================================================================= + Receiver routines + =======================================================================*/ + +int netdev_max_backlog = 300; +int weight_p = 64; /* old backlog weight */ +/* These numbers are selected based on intuition and some + * experimentatiom, if you have more scientific way of doing this + * please go ahead and fix things. + */ +int no_cong_thresh = 10; +int no_cong = 20; +int lo_cong = 100; +int mod_cong = 290; + +DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, }; + + +static void get_sample_stats(int cpu) +{ +#ifdef RAND_LIE + unsigned long rd; + int rq; +#endif + struct softnet_data *sd = &per_cpu(softnet_data, cpu); + int blog = sd->input_pkt_queue.qlen; + int avg_blog = sd->avg_blog; + + avg_blog = (avg_blog >> 1) + (blog >> 1); + + if (avg_blog > mod_cong) { + /* Above moderate congestion levels. 
*/ + sd->cng_level = NET_RX_CN_HIGH; +#ifdef RAND_LIE + rd = net_random(); + rq = rd % netdev_max_backlog; + if (rq < avg_blog) /* unlucky bastard */ + sd->cng_level = NET_RX_DROP; +#endif + } else if (avg_blog > lo_cong) { + sd->cng_level = NET_RX_CN_MOD; +#ifdef RAND_LIE + rd = net_random(); + rq = rd % netdev_max_backlog; + if (rq < avg_blog) /* unlucky bastard */ + sd->cng_level = NET_RX_CN_HIGH; +#endif + } else if (avg_blog > no_cong) + sd->cng_level = NET_RX_CN_LOW; + else /* no congestion */ + sd->cng_level = NET_RX_SUCCESS; + + sd->avg_blog = avg_blog; +} + +#ifdef OFFLINE_SAMPLE +static void sample_queue(unsigned long dummy) +{ +/* 10 ms 0r 1ms -- i don't care -- JHS */ + int next_tick = 1; + int cpu = smp_processor_id(); + + get_sample_stats(cpu); + next_tick += jiffies; + mod_timer(&samp_timer, next_tick); +} +#endif + + +/** + * netif_rx - post buffer to the network code + * @skb: buffer to post + * + * This function receives a packet from a device driver and queues it for + * the upper (protocol) levels to process. It always succeeds. The buffer + * may be dropped during processing for congestion control or by the + * protocol layers. + * + * return values: + * NET_RX_SUCCESS (no congestion) + * NET_RX_CN_LOW (low congestion) + * NET_RX_CN_MOD (moderate congestion) + * NET_RX_CN_HIGH (high congestion) + * NET_RX_DROP (packet was dropped) + * + */ + +int netif_rx(struct sk_buff *skb) +{ + int this_cpu; + struct softnet_data *queue; + unsigned long flags; + + /* if netpoll wants it, pretend we never saw it */ + if (netpoll_rx(skb)) + return NET_RX_DROP; + + if (!skb->stamp.tv_sec) + net_timestamp(&skb->stamp); + + /* + * The code is rearranged so that the path is the most + * short when CPU is congested, but is still operating. + */ + local_irq_save(flags); + this_cpu = smp_processor_id(); + queue = &__get_cpu_var(softnet_data); + + __get_cpu_var(netdev_rx_stat).total++; + if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { + if (queue->input_pkt_queue.qlen) { + if (queue->throttle) + goto drop; + +enqueue: + dev_hold(skb->dev); + __skb_queue_tail(&queue->input_pkt_queue, skb); +#ifndef OFFLINE_SAMPLE + get_sample_stats(this_cpu); +#endif + local_irq_restore(flags); + return queue->cng_level; + } + + if (queue->throttle) + queue->throttle = 0; + + netif_rx_schedule(&queue->backlog_dev); + goto enqueue; + } + + if (!queue->throttle) { + queue->throttle = 1; + __get_cpu_var(netdev_rx_stat).throttled++; + } + +drop: + __get_cpu_var(netdev_rx_stat).dropped++; + local_irq_restore(flags); + + kfree_skb(skb); + return NET_RX_DROP; +} + +int netif_rx_ni(struct sk_buff *skb) +{ + int err; + + preempt_disable(); + err = netif_rx(skb); + if (local_softirq_pending()) + do_softirq(); + preempt_enable(); + + return err; +} + +EXPORT_SYMBOL(netif_rx_ni); + +static __inline__ void skb_bond(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + if (dev->master) { + skb->real_dev = skb->dev; + skb->dev = dev->master; + } +} + +static void net_tx_action(struct softirq_action *h) +{ + struct softnet_data *sd = &__get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_disable(); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_enable(); + + while (clist) { + struct sk_buff *skb = clist; + clist = clist->next; + + BUG_TRAP(!atomic_read(&skb->users)); + __kfree_skb(skb); + } + } + + if (sd->output_queue) { + struct net_device *head; + + local_irq_disable(); + head = sd->output_queue; + sd->output_queue = NULL; + 
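+
+		/* The list was detached with IRQs off; it is now private to
+		 * this CPU and can be walked with IRQs enabled again.
+		 */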
local_irq_enable(); + + while (head) { + struct net_device *dev = head; + head = head->next_sched; + + smp_mb__before_clear_bit(); + clear_bit(__LINK_STATE_SCHED, &dev->state); + + if (spin_trylock(&dev->queue_lock)) { + qdisc_run(dev); + spin_unlock(&dev->queue_lock); + } else { + netif_schedule(dev); + } + } + } +} + +static __inline__ int deliver_skb(struct sk_buff *skb, + struct packet_type *pt_prev) +{ + atomic_inc(&skb->users); + return pt_prev->func(skb, skb->dev, pt_prev); +} + +#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE) +int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb); +struct net_bridge; +struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, + unsigned char *addr); +void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); + +static __inline__ int handle_bridge(struct sk_buff **pskb, + struct packet_type **pt_prev, int *ret) +{ + struct net_bridge_port *port; + + if ((*pskb)->pkt_type == PACKET_LOOPBACK || + (port = rcu_dereference((*pskb)->dev->br_port)) == NULL) + return 0; + + if (*pt_prev) { + *ret = deliver_skb(*pskb, *pt_prev); + *pt_prev = NULL; + } + + return br_handle_frame_hook(port, pskb); +} +#else +#define handle_bridge(skb, pt_prev, ret) (0) +#endif + +#ifdef CONFIG_NET_CLS_ACT +/* TODO: Maybe we should just force sch_ingress to be compiled in + * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions + * a compare and 2 stores extra right now if we dont have it on + * but have CONFIG_NET_CLS_ACT + * NOTE: This doesnt stop any functionality; if you dont have + * the ingress scheduler, you just cant add policies on ingress. + * + */ +static int ing_filter(struct sk_buff *skb) +{ + struct Qdisc *q; + struct net_device *dev = skb->dev; + int result = TC_ACT_OK; + + if (dev->qdisc_ingress) { + __u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd); + if (MAX_RED_LOOP < ttl++) { + printk("Redir loop detected Dropping packet (%s->%s)\n", + skb->input_dev?skb->input_dev->name:"??",skb->dev->name); + return TC_ACT_SHOT; + } + + skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl); + + skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS); + if (NULL == skb->input_dev) { + skb->input_dev = skb->dev; + printk("ing_filter: fixed %s out %s\n",skb->input_dev->name,skb->dev->name); + } + spin_lock(&dev->ingress_lock); + if ((q = dev->qdisc_ingress) != NULL) + result = q->enqueue(skb, q); + spin_unlock(&dev->ingress_lock); + + } + + return result; +} +#endif + +int netif_receive_skb(struct sk_buff *skb) +{ + struct packet_type *ptype, *pt_prev; + int ret = NET_RX_DROP; + unsigned short type; + + /* if we've gotten here through NAPI, check netpoll */ + if (skb->dev->poll && netpoll_rx(skb)) + return NET_RX_DROP; + + if (!skb->stamp.tv_sec) + net_timestamp(&skb->stamp); + + skb_bond(skb); + + __get_cpu_var(netdev_rx_stat).total++; + + skb->h.raw = skb->nh.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->mac.raw; + + pt_prev = NULL; + + rcu_read_lock(); + +#ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + goto ncls; + } +#endif + + list_for_each_entry_rcu(ptype, &ptype_all, list) { + if (!ptype->dev || ptype->dev == skb->dev) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev); + pt_prev = ptype; + } + } + +#ifdef CONFIG_NET_CLS_ACT + if (pt_prev) { + ret = deliver_skb(skb, pt_prev); + pt_prev = NULL; /* noone else should process this after*/ + } else { + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + } + + ret = ing_filter(skb); + + if (ret == TC_ACT_SHOT || (ret == 
TC_ACT_STOLEN)) { + kfree_skb(skb); + goto out; + } + + skb->tc_verd = 0; +ncls: +#endif + + handle_diverter(skb); + + if (handle_bridge(&skb, &pt_prev, &ret)) + goto out; + + type = skb->protocol; + list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) { + if (ptype->type == type && + (!ptype->dev || ptype->dev == skb->dev)) { + if (pt_prev) + ret = deliver_skb(skb, pt_prev); + pt_prev = ptype; + } + } + + if (pt_prev) { + ret = pt_prev->func(skb, skb->dev, pt_prev); + } else { + kfree_skb(skb); + /* Jamal, now you will not able to escape explaining + * me how you were going to use this. :-) + */ + ret = NET_RX_DROP; + } + +out: + rcu_read_unlock(); + return ret; +} + +static int process_backlog(struct net_device *backlog_dev, int *budget) +{ + int work = 0; + int quota = min(backlog_dev->quota, *budget); + struct softnet_data *queue = &__get_cpu_var(softnet_data); + unsigned long start_time = jiffies; + + for (;;) { + struct sk_buff *skb; + struct net_device *dev; + + local_irq_disable(); + skb = __skb_dequeue(&queue->input_pkt_queue); + if (!skb) + goto job_done; + local_irq_enable(); + + dev = skb->dev; + + netif_receive_skb(skb); + + dev_put(dev); + + work++; + + if (work >= quota || jiffies - start_time > 1) + break; + + } + + backlog_dev->quota -= work; + *budget -= work; + return -1; + +job_done: + backlog_dev->quota -= work; + *budget -= work; + + list_del(&backlog_dev->poll_list); + smp_mb__before_clear_bit(); + netif_poll_enable(backlog_dev); + + if (queue->throttle) + queue->throttle = 0; + local_irq_enable(); + return 0; +} + +static void net_rx_action(struct softirq_action *h) +{ + struct softnet_data *queue = &__get_cpu_var(softnet_data); + unsigned long start_time = jiffies; + int budget = netdev_max_backlog; + + + local_irq_disable(); + + while (!list_empty(&queue->poll_list)) { + struct net_device *dev; + + if (budget <= 0 || jiffies - start_time > 1) + goto softnet_break; + + local_irq_enable(); + + dev = list_entry(queue->poll_list.next, + struct net_device, poll_list); + netpoll_poll_lock(dev); + + if (dev->quota <= 0 || dev->poll(dev, &budget)) { + netpoll_poll_unlock(dev); + local_irq_disable(); + list_del(&dev->poll_list); + list_add_tail(&dev->poll_list, &queue->poll_list); + if (dev->quota < 0) + dev->quota += dev->weight; + else + dev->quota = dev->weight; + } else { + netpoll_poll_unlock(dev); + dev_put(dev); + local_irq_disable(); + } + } +out: + local_irq_enable(); + return; + +softnet_break: + __get_cpu_var(netdev_rx_stat).time_squeeze++; + __raise_softirq_irqoff(NET_RX_SOFTIRQ); + goto out; +} + +static gifconf_func_t * gifconf_list [NPROTO]; + +/** + * register_gifconf - register a SIOCGIF handler + * @family: Address family + * @gifconf: Function handler + * + * Register protocol dependent address dumping routines. The handler + * that is passed must not be freed or reused until it has been replaced + * by another handler. + */ +int register_gifconf(unsigned int family, gifconf_func_t * gifconf) +{ + if (family >= NPROTO) + return -EINVAL; + gifconf_list[family] = gifconf; + return 0; +} + + +/* + * Map an interface index to its name (SIOCGIFNAME) + */ + +/* + * We need this ioctl for efficient implementation of the + * if_indextoname() function required by the IPv6 API. Without + * it, we would have to search all the interfaces to find a + * match. --pb + */ + +static int dev_ifname(struct ifreq __user *arg) +{ + struct net_device *dev; + struct ifreq ifr; + + /* + * Fetch the caller's info block. 
+ */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifr.ifr_ifindex); + if (!dev) { + read_unlock(&dev_base_lock); + return -ENODEV; + } + + strcpy(ifr.ifr_name, dev->name); + read_unlock(&dev_base_lock); + + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; +} + +/* + * Perform a SIOCGIFCONF call. This structure will change + * size eventually, and there is nothing I can do about it. + * Thus we will need a 'compatibility mode'. + */ + +static int dev_ifconf(char __user *arg) +{ + struct ifconf ifc; + struct net_device *dev; + char __user *pos; + int len; + int total; + int i; + + /* + * Fetch the caller's info block. + */ + + if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) + return -EFAULT; + + pos = ifc.ifc_buf; + len = ifc.ifc_len; + + /* + * Loop over the interfaces, and write an info block for each. + */ + + total = 0; + for (dev = dev_base; dev; dev = dev->next) { + for (i = 0; i < NPROTO; i++) { + if (gifconf_list[i]) { + int done; + if (!pos) + done = gifconf_list[i](dev, NULL, 0); + else + done = gifconf_list[i](dev, pos + total, + len - total); + if (done < 0) + return -EFAULT; + total += done; + } + } + } + + /* + * All done. Write the updated control block back to the caller. + */ + ifc.ifc_len = total; + + /* + * Both BSD and Solaris return 0 here, so we do too. + */ + return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; +} + +#ifdef CONFIG_PROC_FS +/* + * This is invoked by the /proc filesystem handler to display a device + * in detail. + */ +static __inline__ struct net_device *dev_get_idx(loff_t pos) +{ + struct net_device *dev; + loff_t i; + + for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next); + + return i == pos ? dev : NULL; +} + +void *dev_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next; +} + +void dev_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&dev_base_lock); +} + +static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) +{ + if (dev->get_stats) { + struct net_device_stats *stats = dev->get_stats(dev); + + seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu " + "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n", + dev->name, stats->rx_bytes, stats->rx_packets, + stats->rx_errors, + stats->rx_dropped + stats->rx_missed_errors, + stats->rx_fifo_errors, + stats->rx_length_errors + stats->rx_over_errors + + stats->rx_crc_errors + stats->rx_frame_errors, + stats->rx_compressed, stats->multicast, + stats->tx_bytes, stats->tx_packets, + stats->tx_errors, stats->tx_dropped, + stats->tx_fifo_errors, stats->collisions, + stats->tx_carrier_errors + + stats->tx_aborted_errors + + stats->tx_window_errors + + stats->tx_heartbeat_errors, + stats->tx_compressed); + } else + seq_printf(seq, "%6s: No statistics available.\n", dev->name); +} + +/* + * Called from the PROCfs module. 
This now uses the new arbitrary sized + * /proc/net interface to create /proc/net/dev + */ +static int dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Inter-| Receive " + " | Transmit\n" + " face |bytes packets errs drop fifo frame " + "compressed multicast|bytes packets errs " + "drop fifo colls carrier compressed\n"); + else + dev_seq_printf_stats(seq, v); + return 0; +} + +static struct netif_rx_stats *softnet_get_online(loff_t *pos) +{ + struct netif_rx_stats *rc = NULL; + + while (*pos < NR_CPUS) + if (cpu_online(*pos)) { + rc = &per_cpu(netdev_rx_stat, *pos); + break; + } else + ++*pos; + return rc; +} + +static void *softnet_seq_start(struct seq_file *seq, loff_t *pos) +{ + return softnet_get_online(pos); +} + +static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return softnet_get_online(pos); +} + +static void softnet_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int softnet_seq_show(struct seq_file *seq, void *v) +{ + struct netif_rx_stats *s = v; + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n", + s->total, s->dropped, s->time_squeeze, s->throttled, + s->fastroute_hit, s->fastroute_success, s->fastroute_defer, + s->fastroute_deferred_out, +#if 0 + s->fastroute_latency_reduction +#else + s->cpu_collision +#endif + ); + return 0; +} + +static struct seq_operations dev_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = dev_seq_show, +}; + +static int dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dev_seq_ops); +} + +static struct file_operations dev_seq_fops = { + .owner = THIS_MODULE, + .open = dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct seq_operations softnet_seq_ops = { + .start = softnet_seq_start, + .next = softnet_seq_next, + .stop = softnet_seq_stop, + .show = softnet_seq_show, +}; + +static int softnet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &softnet_seq_ops); +} + +static struct file_operations softnet_seq_fops = { + .owner = THIS_MODULE, + .open = softnet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#ifdef WIRELESS_EXT +extern int wireless_proc_init(void); +#else +#define wireless_proc_init() 0 +#endif + +static int __init dev_proc_init(void) +{ + int rc = -ENOMEM; + + if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops)) + goto out; + if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops)) + goto out_dev; + if (wireless_proc_init()) + goto out_softnet; + rc = 0; +out: + return rc; +out_softnet: + proc_net_remove("softnet_stat"); +out_dev: + proc_net_remove("dev"); + goto out; +} +#else +#define dev_proc_init() 0 +#endif /* CONFIG_PROC_FS */ + + +/** + * netdev_set_master - set up master/slave pair + * @slave: slave device + * @master: new master device + * + * Changes the master device of the slave. Pass %NULL to break the + * bonding. The caller must hold the RTNL semaphore. On a failure + * a negative errno code is returned. On success the reference counts + * are adjusted, %RTM_NEWLINK is sent to the routing socket and the + * function returns zero. 
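+ *
+ * Sketch (illustrative, not part of this patch): a bonding-style driver
+ * enslaves and later releases a port with
+ *
+ *	err = netdev_set_master(slave_dev, bond_dev);
+ *	...
+ *	netdev_set_master(slave_dev, NULL);
+ *
+ * both under rtnl_lock(); slave_dev and bond_dev are made-up names.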
+ */ +int netdev_set_master(struct net_device *slave, struct net_device *master) +{ + struct net_device *old = slave->master; + + ASSERT_RTNL(); + + if (master) { + if (old) + return -EBUSY; + dev_hold(master); + } + + slave->master = master; + + synchronize_net(); + + if (old) + dev_put(old); + + if (master) + slave->flags |= IFF_SLAVE; + else + slave->flags &= ~IFF_SLAVE; + + rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE); + return 0; +} + +/** + * dev_set_promiscuity - update promiscuity count on a device + * @dev: device + * @inc: modifier + * + * Add or remove promsicuity from a device. While the count in the device + * remains above zero the interface remains promiscuous. Once it hits zero + * the device reverts back to normal filtering operation. A negative inc + * value is used to drop promiscuity on the device. + */ +void dev_set_promiscuity(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_PROMISC; + if ((dev->promiscuity += inc) == 0) + dev->flags &= ~IFF_PROMISC; + if (dev->flags ^ old_flags) { + dev_mc_upload(dev); + printk(KERN_INFO "device %s %s promiscuous mode\n", + dev->name, (dev->flags & IFF_PROMISC) ? "entered" : + "left"); + } +} + +/** + * dev_set_allmulti - update allmulti count on a device + * @dev: device + * @inc: modifier + * + * Add or remove reception of all multicast frames to a device. While the + * count in the device remains above zero the interface remains listening + * to all interfaces. Once it hits zero the device reverts back to normal + * filtering operation. A negative @inc value is used to drop the counter + * when releasing a resource needing all multicasts. + */ + +void dev_set_allmulti(struct net_device *dev, int inc) +{ + unsigned short old_flags = dev->flags; + + dev->flags |= IFF_ALLMULTI; + if ((dev->allmulti += inc) == 0) + dev->flags &= ~IFF_ALLMULTI; + if (dev->flags ^ old_flags) + dev_mc_upload(dev); +} + +unsigned dev_get_flags(const struct net_device *dev) +{ + unsigned flags; + + flags = (dev->flags & ~(IFF_PROMISC | + IFF_ALLMULTI | + IFF_RUNNING)) | + (dev->gflags & (IFF_PROMISC | + IFF_ALLMULTI)); + + if (netif_running(dev) && netif_carrier_ok(dev)) + flags |= IFF_RUNNING; + + return flags; +} + +int dev_change_flags(struct net_device *dev, unsigned flags) +{ + int ret; + int old_flags = dev->flags; + + /* + * Set the flags on our device. + */ + + dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP | + IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL | + IFF_AUTOMEDIA)) | + (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC | + IFF_ALLMULTI)); + + /* + * Load in the correct multicast list now the flags have changed. + */ + + dev_mc_upload(dev); + + /* + * Have we downed the interface. We handle IFF_UP ourselves + * according to user attempts to set it, rather than blindly + * setting it. + */ + + ret = 0; + if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */ + ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev); + + if (!ret) + dev_mc_upload(dev); + } + + if (dev->flags & IFF_UP && + ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI | + IFF_VOLATILE))) + notifier_call_chain(&netdev_chain, NETDEV_CHANGE, dev); + + if ((flags ^ dev->gflags) & IFF_PROMISC) { + int inc = (flags & IFF_PROMISC) ? +1 : -1; + dev->gflags ^= IFF_PROMISC; + dev_set_promiscuity(dev, inc); + } + + /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI + is important. Some (broken) drivers set IFF_PROMISC, when + IFF_ALLMULTI is requested not asking us and not reporting. 
+ */ + if ((flags ^ dev->gflags) & IFF_ALLMULTI) { + int inc = (flags & IFF_ALLMULTI) ? +1 : -1; + dev->gflags ^= IFF_ALLMULTI; + dev_set_allmulti(dev, inc); + } + + if (old_flags ^ dev->flags) + rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags); + + return ret; +} + +int dev_set_mtu(struct net_device *dev, int new_mtu) +{ + int err; + + if (new_mtu == dev->mtu) + return 0; + + /* MTU must be positive. */ + if (new_mtu < 0) + return -EINVAL; + + if (!netif_device_present(dev)) + return -ENODEV; + + err = 0; + if (dev->change_mtu) + err = dev->change_mtu(dev, new_mtu); + else + dev->mtu = new_mtu; + if (!err && dev->flags & IFF_UP) + notifier_call_chain(&netdev_chain, + NETDEV_CHANGEMTU, dev); + return err; +} + +int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa) +{ + int err; + + if (!dev->set_mac_address) + return -EOPNOTSUPP; + if (sa->sa_family != dev->type) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + err = dev->set_mac_address(dev, sa); + if (!err) + notifier_call_chain(&netdev_chain, NETDEV_CHANGEADDR, dev); + return err; +} + +/* + * Perform the SIOCxIFxxx calls. + */ +static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd) +{ + int err; + struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + + if (!dev) + return -ENODEV; + + switch (cmd) { + case SIOCGIFFLAGS: /* Get interface flags */ + ifr->ifr_flags = dev_get_flags(dev); + return 0; + + case SIOCSIFFLAGS: /* Set interface flags */ + return dev_change_flags(dev, ifr->ifr_flags); + + case SIOCGIFMETRIC: /* Get the metric on the interface + (currently unused) */ + ifr->ifr_metric = 0; + return 0; + + case SIOCSIFMETRIC: /* Set the metric on the interface + (currently unused) */ + return -EOPNOTSUPP; + + case SIOCGIFMTU: /* Get the MTU of a device */ + ifr->ifr_mtu = dev->mtu; + return 0; + + case SIOCSIFMTU: /* Set the MTU of a device */ + return dev_set_mtu(dev, ifr->ifr_mtu); + + case SIOCGIFHWADDR: + if (!dev->addr_len) + memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data); + else + memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + ifr->ifr_hwaddr.sa_family = dev->type; + return 0; + + case SIOCSIFHWADDR: + return dev_set_mac_address(dev, &ifr->ifr_hwaddr); + + case SIOCSIFHWBROADCAST: + if (ifr->ifr_hwaddr.sa_family != dev->type) + return -EINVAL; + memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, + min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len)); + notifier_call_chain(&netdev_chain, + NETDEV_CHANGEADDR, dev); + return 0; + + case SIOCGIFMAP: + ifr->ifr_map.mem_start = dev->mem_start; + ifr->ifr_map.mem_end = dev->mem_end; + ifr->ifr_map.base_addr = dev->base_addr; + ifr->ifr_map.irq = dev->irq; + ifr->ifr_map.dma = dev->dma; + ifr->ifr_map.port = dev->if_port; + return 0; + + case SIOCSIFMAP: + if (dev->set_config) { + if (!netif_device_present(dev)) + return -ENODEV; + return dev->set_config(dev, &ifr->ifr_map); + } + return -EOPNOTSUPP; + + case SIOCADDMULTI: + if (!dev->set_multicast_list || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data, + dev->addr_len, 1); + + case SIOCDELMULTI: + if (!dev->set_multicast_list || + ifr->ifr_hwaddr.sa_family != AF_UNSPEC) + return -EINVAL; + if (!netif_device_present(dev)) + return -ENODEV; + return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data, + dev->addr_len, 1); + + case SIOCGIFINDEX: + ifr->ifr_ifindex = dev->ifindex; + return 
0; + + case SIOCGIFTXQLEN: + ifr->ifr_qlen = dev->tx_queue_len; + return 0; + + case SIOCSIFTXQLEN: + if (ifr->ifr_qlen < 0) + return -EINVAL; + dev->tx_queue_len = ifr->ifr_qlen; + return 0; + + case SIOCSIFNAME: + ifr->ifr_newname[IFNAMSIZ-1] = '\0'; + return dev_change_name(dev, ifr->ifr_newname); + + /* + * Unknown or private ioctl + */ + + default: + if ((cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15) || + cmd == SIOCBONDENSLAVE || + cmd == SIOCBONDRELEASE || + cmd == SIOCBONDSETHWADDR || + cmd == SIOCBONDSLAVEINFOQUERY || + cmd == SIOCBONDINFOQUERY || + cmd == SIOCBONDCHANGEACTIVE || + cmd == SIOCGMIIPHY || + cmd == SIOCGMIIREG || + cmd == SIOCSMIIREG || + cmd == SIOCBRADDIF || + cmd == SIOCBRDELIF || + cmd == SIOCWANDEV) { + err = -EOPNOTSUPP; + if (dev->do_ioctl) { + if (netif_device_present(dev)) + err = dev->do_ioctl(dev, ifr, + cmd); + else + err = -ENODEV; + } + } else + err = -EINVAL; + + } + return err; +} + +/* + * This function handles all "interface"-type I/O control requests. The actual + * 'doing' part of this is dev_ifsioc above. + */ + +/** + * dev_ioctl - network device ioctl + * @cmd: command to issue + * @arg: pointer to a struct ifreq in user space + * + * Issue ioctl functions to devices. This is normally called by the + * user space syscall interfaces but can sometimes be useful for + * other purposes. The return value is the return from the syscall if + * positive or a negative errno code on error. + */ + +int dev_ioctl(unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + int ret; + char *colon; + + /* One special case: SIOCGIFCONF takes ifconf argument + and requires shared lock, because it sleeps writing + to user space. + */ + + if (cmd == SIOCGIFCONF) { + rtnl_shlock(); + ret = dev_ifconf((char __user *) arg); + rtnl_shunlock(); + return ret; + } + if (cmd == SIOCGIFNAME) + return dev_ifname((struct ifreq __user *)arg); + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + ifr.ifr_name[IFNAMSIZ-1] = 0; + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + + /* + * See which interface the caller is talking about. + */ + + switch (cmd) { + /* + * These ioctl calls: + * - can be done by all. + * - atomic and do not require locking. + * - return a value + */ + case SIOCGIFFLAGS: + case SIOCGIFMETRIC: + case SIOCGIFMTU: + case SIOCGIFHWADDR: + case SIOCGIFSLAVE: + case SIOCGIFMAP: + case SIOCGIFINDEX: + case SIOCGIFTXQLEN: + dev_load(ifr.ifr_name); + read_lock(&dev_base_lock); + ret = dev_ifsioc(&ifr, cmd); + read_unlock(&dev_base_lock); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + case SIOCETHTOOL: + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ethtool(&ifr); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. + * - return a value + */ + case SIOCGMIIPHY: + case SIOCGMIIREG: + case SIOCSIFNAME: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret) { + if (colon) + *colon = ':'; + if (copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + } + return ret; + + /* + * These ioctl calls: + * - require superuser power. + * - require strict serialization. 
+ * - do not return a value + */ + case SIOCSIFFLAGS: + case SIOCSIFMETRIC: + case SIOCSIFMTU: + case SIOCSIFMAP: + case SIOCSIFHWADDR: + case SIOCSIFSLAVE: + case SIOCADDMULTI: + case SIOCDELMULTI: + case SIOCSIFHWBROADCAST: + case SIOCSIFTXQLEN: + case SIOCSMIIREG: + case SIOCBONDENSLAVE: + case SIOCBONDRELEASE: + case SIOCBONDSETHWADDR: + case SIOCBONDSLAVEINFOQUERY: + case SIOCBONDINFOQUERY: + case SIOCBONDCHANGEACTIVE: + case SIOCBRADDIF: + case SIOCBRDELIF: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + return ret; + + case SIOCGIFMEM: + /* Get the per device memory space. We can add this but + * currently do not support it */ + case SIOCSIFMEM: + /* Set the per device memory buffer space. + * Not applicable in our case */ + case SIOCSIFLINK: + return -EINVAL; + + /* + * Unknown or private ioctl. + */ + default: + if (cmd == SIOCWANDEV || + (cmd >= SIOCDEVPRIVATE && + cmd <= SIOCDEVPRIVATE + 15)) { + dev_load(ifr.ifr_name); + rtnl_lock(); + ret = dev_ifsioc(&ifr, cmd); + rtnl_unlock(); + if (!ret && copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } +#ifdef WIRELESS_EXT + /* Take care of Wireless Extensions */ + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + /* If command is `set a parameter', or + * `get the encoding parameters', check if + * the user has the right to do it */ + if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + } + dev_load(ifr.ifr_name); + rtnl_lock(); + /* Follow me in net/core/wireless.c */ + ret = wireless_process_ioctl(&ifr, cmd); + rtnl_unlock(); + if (IW_IS_GET(cmd) && + copy_to_user(arg, &ifr, + sizeof(struct ifreq))) + ret = -EFAULT; + return ret; + } +#endif /* WIRELESS_EXT */ + return -EINVAL; + } +} + + +/** + * dev_new_index - allocate an ifindex + * + * Returns a suitable unique value for a new device interface + * number. The caller must hold the rtnl semaphore or the + * dev_base_lock to be sure it remains unique. + */ +static int dev_new_index(void) +{ + static int ifindex; + for (;;) { + if (++ifindex <= 0) + ifindex = 1; + if (!__dev_get_by_index(ifindex)) + return ifindex; + } +} + +static int dev_boot_phase = 1; + +/* Delayed registration/unregisteration */ +static DEFINE_SPINLOCK(net_todo_list_lock); +static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list); + +static inline void net_set_todo(struct net_device *dev) +{ + spin_lock(&net_todo_list_lock); + list_add_tail(&dev->todo_list, &net_todo_list); + spin_unlock(&net_todo_list_lock); +} + +/** + * register_netdevice - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * Callers must hold the rtnl semaphore. You may want + * register_netdev() instead of this. + * + * BUGS: + * The locking appears insufficient to guarantee two parallel registers + * will not get the same name. + */ + +int register_netdevice(struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *p; + int ret; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + /* When net_device's are persistent, this will be fatal. 
*/ + BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); + + spin_lock_init(&dev->queue_lock); + spin_lock_init(&dev->xmit_lock); + dev->xmit_lock_owner = -1; +#ifdef CONFIG_NET_CLS_ACT + spin_lock_init(&dev->ingress_lock); +#endif + + ret = alloc_divert_blk(dev); + if (ret) + goto out; + + dev->iflink = -1; + + /* Init, if this function is available */ + if (dev->init) { + ret = dev->init(dev); + if (ret) { + if (ret > 0) + ret = -EIO; + goto out_err; + } + } + + if (!dev_valid_name(dev->name)) { + ret = -EINVAL; + goto out_err; + } + + dev->ifindex = dev_new_index(); + if (dev->iflink == -1) + dev->iflink = dev->ifindex; + + /* Check for existence of name */ + head = dev_name_hash(dev->name); + hlist_for_each(p, head) { + struct net_device *d + = hlist_entry(p, struct net_device, name_hlist); + if (!strncmp(d->name, dev->name, IFNAMSIZ)) { + ret = -EEXIST; + goto out_err; + } + } + + /* Fix illegal SG+CSUM combinations. */ + if ((dev->features & NETIF_F_SG) && + !(dev->features & (NETIF_F_IP_CSUM | + NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM))) { + printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", + dev->name); + dev->features &= ~NETIF_F_SG; + } + + /* TSO requires that SG is present as well. */ + if ((dev->features & NETIF_F_TSO) && + !(dev->features & NETIF_F_SG)) { + printk("%s: Dropping NETIF_F_TSO since no SG feature.\n", + dev->name); + dev->features &= ~NETIF_F_TSO; + } + + /* + * nil rebuild_header routine, + * that should be never called and used as just bug trap. + */ + + if (!dev->rebuild_header) + dev->rebuild_header = default_rebuild_header; + + /* + * Default initial state at registry is that the + * device is present. + */ + + set_bit(__LINK_STATE_PRESENT, &dev->state); + + dev->next = NULL; + dev_init_scheduler(dev); + write_lock_bh(&dev_base_lock); + *dev_tail = dev; + dev_tail = &dev->next; + hlist_add_head(&dev->name_hlist, head); + hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex)); + dev_hold(dev); + dev->reg_state = NETREG_REGISTERING; + write_unlock_bh(&dev_base_lock); + + /* Notify protocols, that a new device appeared. */ + notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev); + + /* Finish registration after unlock */ + net_set_todo(dev); + ret = 0; + +out: + return ret; +out_err: + free_divert_blk(dev); + goto out; +} + +/** + * register_netdev - register a network device + * @dev: device to register + * + * Take a completed network device structure and add it to the kernel + * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier + * chain. 0 is returned on success. A negative errno code is returned + * on a failure to set up the device, or if the name is a duplicate. + * + * This is a wrapper around register_netdev that takes the rtnl semaphore + * and expands the device name if you passed a format string to + * alloc_netdev. + */ +int register_netdev(struct net_device *dev) +{ + int err; + + rtnl_lock(); + + /* + * If the name is a format string the caller wants us to do a + * name allocation. + */ + if (strchr(dev->name, '%')) { + err = dev_alloc_name(dev, dev->name); + if (err < 0) + goto out; + } + + /* + * Back compatibility hook. Kill this one in 2.5 + */ + if (dev->name[0] == 0 || dev->name[0] == ' ') { + err = dev_alloc_name(dev, "eth%d"); + if (err < 0) + goto out; + } + + err = register_netdevice(dev); +out: + rtnl_unlock(); + return err; +} +EXPORT_SYMBOL(register_netdev); + +/* + * netdev_wait_allrefs - wait until all references are gone. + * + * This is called when unregistering network devices. 
+ * + * Any protocol or device that holds a reference should register + * for netdevice notification, and cleanup and put back the + * reference if they receive an UNREGISTER event. + * We can get stuck here if buggy protocols don't correctly + * call dev_put. + */ +static void netdev_wait_allrefs(struct net_device *dev) +{ + unsigned long rebroadcast_time, warning_time; + + rebroadcast_time = warning_time = jiffies; + while (atomic_read(&dev->refcnt) != 0) { + if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { + rtnl_shlock(); + + /* Rebroadcast unregister notification */ + notifier_call_chain(&netdev_chain, + NETDEV_UNREGISTER, dev); + + if (test_bit(__LINK_STATE_LINKWATCH_PENDING, + &dev->state)) { + /* We must not have linkwatch events + * pending on unregister. If this + * happens, we simply run the queue + * unscheduled, resulting in a noop + * for this device. + */ + linkwatch_run_queue(); + } + + rtnl_shunlock(); + + rebroadcast_time = jiffies; + } + + msleep(250); + + if (time_after(jiffies, warning_time + 10 * HZ)) { + printk(KERN_EMERG "unregister_netdevice: " + "waiting for %s to become free. Usage " + "count = %d\n", + dev->name, atomic_read(&dev->refcnt)); + warning_time = jiffies; + } + } +} + +/* The sequence is: + * + * rtnl_lock(); + * ... + * register_netdevice(x1); + * register_netdevice(x2); + * ... + * unregister_netdevice(y1); + * unregister_netdevice(y2); + * ... + * rtnl_unlock(); + * free_netdev(y1); + * free_netdev(y2); + * + * We are invoked by rtnl_unlock() after it drops the semaphore. + * This allows us to deal with problems: + * 1) We can create/delete sysfs objects which invoke hotplug + * without deadlocking with linkwatch via keventd. + * 2) Since we run with the RTNL semaphore not held, we can sleep + * safely in order to wait for the netdev refcnt to drop to zero. + */ +static DECLARE_MUTEX(net_todo_run_mutex); +void netdev_run_todo(void) +{ + struct list_head list = LIST_HEAD_INIT(list); + int err; + + + /* Need to guard against multiple cpu's getting out of order. */ + down(&net_todo_run_mutex); + + /* Not safe to do outside the semaphore. We must not return + * until all unregister events invoked by the local processor + * have been completed (either by this todo run, or one on + * another cpu). + */ + if (list_empty(&net_todo_list)) + goto out; + + /* Snapshot list, allow later requests */ + spin_lock(&net_todo_list_lock); + list_splice_init(&net_todo_list, &list); + spin_unlock(&net_todo_list_lock); + + while (!list_empty(&list)) { + struct net_device *dev + = list_entry(list.next, struct net_device, todo_list); + list_del(&dev->todo_list); + + switch(dev->reg_state) { + case NETREG_REGISTERING: + err = netdev_register_sysfs(dev); + if (err) + printk(KERN_ERR "%s: failed sysfs registration (%d)\n", + dev->name, err); + dev->reg_state = NETREG_REGISTERED; + break; + + case NETREG_UNREGISTERING: + netdev_unregister_sysfs(dev); + dev->reg_state = NETREG_UNREGISTERED; + + netdev_wait_allrefs(dev); + + /* paranoia */ + BUG_ON(atomic_read(&dev->refcnt)); + BUG_TRAP(!dev->ip_ptr); + BUG_TRAP(!dev->ip6_ptr); + BUG_TRAP(!dev->dn_ptr); + + + /* It must be the very last action, + * after this 'dev' may point to freed up memory. 
+ */ + if (dev->destructor) + dev->destructor(dev); + break; + + default: + printk(KERN_ERR "network todo '%s' but state %d\n", + dev->name, dev->reg_state); + break; + } + } + +out: + up(&net_todo_run_mutex); +} + +/** + * alloc_netdev - allocate network device + * @sizeof_priv: size of private data to allocate space for + * @name: device name format string + * @setup: callback to initialize device + * + * Allocates a struct net_device with private data area for driver use + * and performs basic initialization. + */ +struct net_device *alloc_netdev(int sizeof_priv, const char *name, + void (*setup)(struct net_device *)) +{ + void *p; + struct net_device *dev; + int alloc_size; + + /* ensure 32-byte alignment of both the device and private area */ + alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST; + alloc_size += sizeof_priv + NETDEV_ALIGN_CONST; + + p = kmalloc(alloc_size, GFP_KERNEL); + if (!p) { + printk(KERN_ERR "alloc_dev: Unable to allocate device.\n"); + return NULL; + } + memset(p, 0, alloc_size); + + dev = (struct net_device *) + (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST); + dev->padded = (char *)dev - (char *)p; + + if (sizeof_priv) + dev->priv = netdev_priv(dev); + + setup(dev); + strcpy(dev->name, name); + return dev; +} +EXPORT_SYMBOL(alloc_netdev); + +/** + * free_netdev - free network device + * @dev: device + * + * This function does the last stage of destroying an allocated device + * interface. The reference to the device object is released. + * If this is the last reference then it will be freed. + */ +void free_netdev(struct net_device *dev) +{ +#ifdef CONFIG_SYSFS + /* Compatiablity with error handling in drivers */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + kfree((char *)dev - dev->padded); + return; + } + + BUG_ON(dev->reg_state != NETREG_UNREGISTERED); + dev->reg_state = NETREG_RELEASED; + + /* will free via class release */ + class_device_put(&dev->class_dev); +#else + kfree((char *)dev - dev->padded); +#endif +} + +/* Synchronize with packet receive processing. */ +void synchronize_net(void) +{ + might_sleep(); + synchronize_kernel(); +} + +/** + * unregister_netdevice - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. On success 0 is returned, on a failure + * a negative errno code is returned. + * + * Callers must hold the rtnl semaphore. You may want + * unregister_netdev() instead of this. + */ + +int unregister_netdevice(struct net_device *dev) +{ + struct net_device *d, **dp; + + BUG_ON(dev_boot_phase); + ASSERT_RTNL(); + + /* Some devices call without registering for initialization unwind. */ + if (dev->reg_state == NETREG_UNINITIALIZED) { + printk(KERN_DEBUG "unregister_netdevice: device %s/%p never " + "was registered\n", dev->name, dev); + return -ENODEV; + } + + BUG_ON(dev->reg_state != NETREG_REGISTERED); + + /* If device is running, close it first. */ + if (dev->flags & IFF_UP) + dev_close(dev); + + /* And unlink it from device chain. 
*/ + for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) { + if (d == dev) { + write_lock_bh(&dev_base_lock); + hlist_del(&dev->name_hlist); + hlist_del(&dev->index_hlist); + if (dev_tail == &dev->next) + dev_tail = dp; + *dp = d->next; + write_unlock_bh(&dev_base_lock); + break; + } + } + if (!d) { + printk(KERN_ERR "unregister net_device: '%s' not found\n", + dev->name); + return -ENODEV; + } + + dev->reg_state = NETREG_UNREGISTERING; + + synchronize_net(); + + /* Shutdown queueing discipline. */ + dev_shutdown(dev); + + + /* Notify protocols, that we are about to destroy + this device. They should clean all the things. + */ + notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev); + + /* + * Flush the multicast chain + */ + dev_mc_discard(dev); + + if (dev->uninit) + dev->uninit(dev); + + /* Notifier chain MUST detach us from master device. */ + BUG_TRAP(!dev->master); + + free_divert_blk(dev); + + /* Finish processing unregister after unlock */ + net_set_todo(dev); + + synchronize_net(); + + dev_put(dev); + return 0; +} + +/** + * unregister_netdev - remove device from the kernel + * @dev: device + * + * This function shuts down a device interface and removes it + * from the kernel tables. On success 0 is returned, on a failure + * a negative errno code is returned. + * + * This is just a wrapper for unregister_netdevice that takes + * the rtnl semaphore. In general you want to use this and not + * unregister_netdevice. + */ +void unregister_netdev(struct net_device *dev) +{ + rtnl_lock(); + unregister_netdevice(dev); + rtnl_unlock(); +} + +EXPORT_SYMBOL(unregister_netdev); + +#ifdef CONFIG_HOTPLUG_CPU +static int dev_cpu_callback(struct notifier_block *nfb, + unsigned long action, + void *ocpu) +{ + struct sk_buff **list_skb; + struct net_device **list_net; + struct sk_buff *skb; + unsigned int cpu, oldcpu = (unsigned long)ocpu; + struct softnet_data *sd, *oldsd; + + if (action != CPU_DEAD) + return NOTIFY_OK; + + local_irq_disable(); + cpu = smp_processor_id(); + sd = &per_cpu(softnet_data, cpu); + oldsd = &per_cpu(softnet_data, oldcpu); + + /* Find end of our completion_queue. */ + list_skb = &sd->completion_queue; + while (*list_skb) + list_skb = &(*list_skb)->next; + /* Append completion queue from offline CPU. */ + *list_skb = oldsd->completion_queue; + oldsd->completion_queue = NULL; + + /* Find end of our output_queue. */ + list_net = &sd->output_queue; + while (*list_net) + list_net = &(*list_net)->next_sched; + /* Append output queue from offline CPU. */ + *list_net = oldsd->output_queue; + oldsd->output_queue = NULL; + + raise_softirq_irqoff(NET_TX_SOFTIRQ); + local_irq_enable(); + + /* Process offline CPU's input_pkt_queue */ + while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) + netif_rx(skb); + + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + + +/* + * Initialize the DEV module. At boot time this walks the device list and + * unhooks any devices that fail to initialise (normally hardware not + * present) and leaves us with a valid list of present and active devices. + * + */ + +/* + * This is called single threaded during boot, so no need + * to take the rtnl semaphore. 
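/*
 * Illustrative sketch, not part of this patch: the driver-side lifecycle
 * implied by alloc_netdev(), register_netdev(), unregister_netdev() and
 * free_netdev() above.  The "demo%d" name and demo_* identifiers are
 * hypothetical; ether_setup() is the generic Ethernet init helper from
 * net/ethernet/eth.c.  A real driver would also fill in hard_start_xmit,
 * open/stop and friends before the interface could actually be used.
 */
#include <linux/module.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>

struct demo_priv {
	int dummy;		/* driver-private state, reached via netdev_priv(dev) */
};

static struct net_device *demo_dev;

static int __init demo_init(void)
{
	int err;

	/* "%d" asks register_netdev() to pick a free instance number. */
	demo_dev = alloc_netdev(sizeof(struct demo_priv), "demo%d", ether_setup);
	if (!demo_dev)
		return -ENOMEM;

	err = register_netdev(demo_dev);	/* takes the rtnl semaphore */
	if (err) {
		/* Still NETREG_UNINITIALIZED here, so free_netdev() just frees. */
		free_netdev(demo_dev);
		return err;
	}
	return 0;
}

static void __exit demo_exit(void)
{
	/* Unregistration completes via the todo list run from rtnl_unlock();
	 * free_netdev() must only be called afterwards. */
	unregister_netdev(demo_dev);
	free_netdev(demo_dev);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");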
+ */ +static int __init net_dev_init(void) +{ + int i, rc = -ENOMEM; + + BUG_ON(!dev_boot_phase); + + net_random_init(); + + if (dev_proc_init()) + goto out; + + if (netdev_sysfs_init()) + goto out; + + INIT_LIST_HEAD(&ptype_all); + for (i = 0; i < 16; i++) + INIT_LIST_HEAD(&ptype_base[i]); + + for (i = 0; i < ARRAY_SIZE(dev_name_head); i++) + INIT_HLIST_HEAD(&dev_name_head[i]); + + for (i = 0; i < ARRAY_SIZE(dev_index_head); i++) + INIT_HLIST_HEAD(&dev_index_head[i]); + + /* + * Initialise the packet receive queues. + */ + + for (i = 0; i < NR_CPUS; i++) { + struct softnet_data *queue; + + queue = &per_cpu(softnet_data, i); + skb_queue_head_init(&queue->input_pkt_queue); + queue->throttle = 0; + queue->cng_level = 0; + queue->avg_blog = 10; /* arbitrary non-zero */ + queue->completion_queue = NULL; + INIT_LIST_HEAD(&queue->poll_list); + set_bit(__LINK_STATE_START, &queue->backlog_dev.state); + queue->backlog_dev.weight = weight_p; + queue->backlog_dev.poll = process_backlog; + atomic_set(&queue->backlog_dev.refcnt, 1); + } + +#ifdef OFFLINE_SAMPLE + samp_timer.expires = jiffies + (10 * HZ); + add_timer(&samp_timer); +#endif + + dev_boot_phase = 0; + + open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); + open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL); + + hotcpu_notifier(dev_cpu_callback, 0); + dst_init(); + dev_mcast_init(); + rc = 0; +out: + return rc; +} + +subsys_initcall(net_dev_init); + +EXPORT_SYMBOL(__dev_get_by_index); +EXPORT_SYMBOL(__dev_get_by_name); +EXPORT_SYMBOL(__dev_remove_pack); +EXPORT_SYMBOL(__skb_linearize); +EXPORT_SYMBOL(dev_add_pack); +EXPORT_SYMBOL(dev_alloc_name); +EXPORT_SYMBOL(dev_close); +EXPORT_SYMBOL(dev_get_by_flags); +EXPORT_SYMBOL(dev_get_by_index); +EXPORT_SYMBOL(dev_get_by_name); +EXPORT_SYMBOL(dev_ioctl); +EXPORT_SYMBOL(dev_open); +EXPORT_SYMBOL(dev_queue_xmit); +EXPORT_SYMBOL(dev_remove_pack); +EXPORT_SYMBOL(dev_set_allmulti); +EXPORT_SYMBOL(dev_set_promiscuity); +EXPORT_SYMBOL(dev_change_flags); +EXPORT_SYMBOL(dev_set_mtu); +EXPORT_SYMBOL(dev_set_mac_address); +EXPORT_SYMBOL(free_netdev); +EXPORT_SYMBOL(netdev_boot_setup_check); +EXPORT_SYMBOL(netdev_set_master); +EXPORT_SYMBOL(netdev_state_change); +EXPORT_SYMBOL(netif_receive_skb); +EXPORT_SYMBOL(netif_rx); +EXPORT_SYMBOL(register_gifconf); +EXPORT_SYMBOL(register_netdevice); +EXPORT_SYMBOL(register_netdevice_notifier); +EXPORT_SYMBOL(skb_checksum_help); +EXPORT_SYMBOL(synchronize_net); +EXPORT_SYMBOL(unregister_netdevice); +EXPORT_SYMBOL(unregister_netdevice_notifier); +EXPORT_SYMBOL(net_enable_timestamp); +EXPORT_SYMBOL(net_disable_timestamp); +EXPORT_SYMBOL(dev_get_flags); + +#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE) +EXPORT_SYMBOL(br_handle_frame_hook); +EXPORT_SYMBOL(br_fdb_get_hook); +EXPORT_SYMBOL(br_fdb_put_hook); +#endif + +#ifdef CONFIG_KMOD +EXPORT_SYMBOL(dev_load); +#endif + +EXPORT_PER_CPU_SYMBOL(softnet_data); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c new file mode 100644 index 000000000000..db098ff3cd6a --- /dev/null +++ b/net/core/dev_mcast.c @@ -0,0 +1,299 @@ +/* + * Linux NET3: Multicast List maintenance. + * + * Authors: + * Tim Kordas + * Richard Underwood + * + * Stir fried together from the IP multicast and CAP patches above + * Alan Cox + * + * Fixes: + * Alan Cox : Update the device on a real delete + * rather than any time but... + * Alan Cox : IFF_ALLMULTI support. + * Alan Cox : New format set_multicast_list() calls. + * Gleb Natapov : Remove dev_mc_lock. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Device multicast list maintenance. + * + * This is used both by IP and by the user level maintenance functions. + * Unlike BSD we maintain a usage count on a given multicast address so + * that a casual user application can add/delete multicasts used by + * protocols without doing damage to the protocols when it deletes the + * entries. It also helps IP as it tracks overlapping maps. + * + * Device mc lists are changed by bh at least if IPv6 is enabled, + * so that it must be bh protected. + * + * We block accesses to device mc filters with dev->xmit_lock. + */ + +/* + * Update the multicast list into the physical NIC controller. + */ + +static void __dev_mc_upload(struct net_device *dev) +{ + /* Don't do anything till we up the interface + * [dev_open will call this function so the list will + * stay sane] + */ + + if (!(dev->flags&IFF_UP)) + return; + + /* + * Devices with no set multicast or which have been + * detached don't get set. + */ + + if (dev->set_multicast_list == NULL || + !netif_device_present(dev)) + return; + + dev->set_multicast_list(dev); +} + +void dev_mc_upload(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + __dev_mc_upload(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +/* + * Delete a device level multicast + */ + +int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, **dmip; + + spin_lock_bh(&dev->xmit_lock); + + for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { + /* + * Find the entry we want to delete. The device could + * have variable length entries so check these too. + */ + if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && + alen == dmi->dmi_addrlen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 0; + if (old_glbl == 0) + break; + } + if (--dmi->dmi_users) + goto done; + + /* + * Last user. So delete the entry. + */ + *dmip = dmi->next; + dev->mc_count--; + + kfree(dmi); + + /* + * We have altered the list, so the card + * loaded filter is now wrong. 
Fix it + */ + __dev_mc_upload(dev); + + spin_unlock_bh(&dev->xmit_lock); + return 0; + } + } + err = -ENOENT; +done: + spin_unlock_bh(&dev->xmit_lock); + return err; +} + +/* + * Add a device level multicast + */ + +int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) +{ + int err = 0; + struct dev_mc_list *dmi, *dmi1; + + dmi1 = (struct dev_mc_list *)kmalloc(sizeof(*dmi), GFP_ATOMIC); + + spin_lock_bh(&dev->xmit_lock); + for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { + if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && + dmi->dmi_addrlen == alen) { + if (glbl) { + int old_glbl = dmi->dmi_gusers; + dmi->dmi_gusers = 1; + if (old_glbl) + goto done; + } + dmi->dmi_users++; + goto done; + } + } + + if ((dmi = dmi1) == NULL) { + spin_unlock_bh(&dev->xmit_lock); + return -ENOMEM; + } + memcpy(dmi->dmi_addr, addr, alen); + dmi->dmi_addrlen = alen; + dmi->next = dev->mc_list; + dmi->dmi_users = 1; + dmi->dmi_gusers = glbl ? 1 : 0; + dev->mc_list = dmi; + dev->mc_count++; + + __dev_mc_upload(dev); + + spin_unlock_bh(&dev->xmit_lock); + return 0; + +done: + spin_unlock_bh(&dev->xmit_lock); + if (dmi1) + kfree(dmi1); + return err; +} + +/* + * Discard multicast list when a device is downed + */ + +void dev_mc_discard(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + + while (dev->mc_list != NULL) { + struct dev_mc_list *tmp = dev->mc_list; + dev->mc_list = tmp->next; + if (tmp->dmi_users > tmp->dmi_gusers) + printk("dev_mc_discard: multicast leakage! dmi_users=%d\n", tmp->dmi_users); + kfree(tmp); + } + dev->mc_count = 0; + + spin_unlock_bh(&dev->xmit_lock); +} + +#ifdef CONFIG_PROC_FS +static void *dev_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct net_device *dev; + loff_t off = 0; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { + if (off++ == *pos) + return dev; + } + return NULL; +} + +static void *dev_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev = v; + ++*pos; + return dev->next; +} + +static void dev_mc_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&dev_base_lock); +} + + +static int dev_mc_seq_show(struct seq_file *seq, void *v) +{ + struct dev_mc_list *m; + struct net_device *dev = v; + + spin_lock_bh(&dev->xmit_lock); + for (m = dev->mc_list; m; m = m->next) { + int i; + + seq_printf(seq, "%-4d %-15s %-5d %-5d ", dev->ifindex, + dev->name, m->dmi_users, m->dmi_gusers); + + for (i = 0; i < m->dmi_addrlen; i++) + seq_printf(seq, "%02x", m->dmi_addr[i]); + + seq_putc(seq, '\n'); + } + spin_unlock_bh(&dev->xmit_lock); + return 0; +} + +static struct seq_operations dev_mc_seq_ops = { + .start = dev_mc_seq_start, + .next = dev_mc_seq_next, + .stop = dev_mc_seq_stop, + .show = dev_mc_seq_show, +}; + +static int dev_mc_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dev_mc_seq_ops); +} + +static struct file_operations dev_mc_seq_fops = { + .owner = THIS_MODULE, + .open = dev_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +void __init dev_mcast_init(void) +{ + proc_net_fops_create("dev_mcast", 0, &dev_mc_seq_fops); +} + +EXPORT_SYMBOL(dev_mc_add); +EXPORT_SYMBOL(dev_mc_delete); +EXPORT_SYMBOL(dev_mc_upload); diff --git a/net/core/dst.c b/net/core/dst.c new file mode 100644 index 000000000000..3bf6cc434814 --- /dev/null +++ b/net/core/dst.c @@ -0,0 +1,276 @@ +/* + * net/core/dst.c Protocol independent destination cache. 
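/*
 * Illustrative sketch for the dev_mcast.c interface above (not part of this
 * patch, and unrelated to the dst.c code that follows): each address carries
 * a usage count, so independent users may add and delete the same multicast
 * address without disturbing each other, and the NIC filter is only refreshed
 * when an entry actually appears or disappears (and only while the device is
 * up).  The demo_* name and the example group address are hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/if_ether.h>

static void demo_mc_refcount(struct net_device *dev)
{
	/* 01:00:5e:00:00:01 is the all-hosts IPv4 multicast group. */
	u8 group[ETH_ALEN] = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 };

	dev_mc_add(dev, group, ETH_ALEN, 0);	/* new entry, dmi_users == 1 */
	dev_mc_add(dev, group, ETH_ALEN, 0);	/* same entry, dmi_users == 2 */

	dev_mc_delete(dev, group, ETH_ALEN, 0);	/* dmi_users back to 1, kept */
	dev_mc_delete(dev, group, ETH_ALEN, 0);	/* last user gone: entry freed,
						 * __dev_mc_upload() refreshes
						 * the hardware filter */
}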
+ * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Locking strategy: + * 1) Garbage collection state of dead destination cache + * entries is protected by dst_lock. + * 2) GC is run only from BH context, and is the only remover + * of entries. + * 3) Entries are added to the garbage list from both BH + * and non-BH context, so local BH disabling is needed. + * 4) All operations modify state, so a spinlock is used. + */ +static struct dst_entry *dst_garbage_list; +#if RT_CACHE_DEBUG >= 2 +static atomic_t dst_total = ATOMIC_INIT(0); +#endif +static DEFINE_SPINLOCK(dst_lock); + +static unsigned long dst_gc_timer_expires; +static unsigned long dst_gc_timer_inc = DST_GC_MAX; +static void dst_run_gc(unsigned long); +static void ___dst_free(struct dst_entry * dst); + +static struct timer_list dst_gc_timer = + TIMER_INITIALIZER(dst_run_gc, DST_GC_MIN, 0); + +static void dst_run_gc(unsigned long dummy) +{ + int delayed = 0; + struct dst_entry * dst, **dstp; + + if (!spin_trylock(&dst_lock)) { + mod_timer(&dst_gc_timer, jiffies + HZ/10); + return; + } + + + del_timer(&dst_gc_timer); + dstp = &dst_garbage_list; + while ((dst = *dstp) != NULL) { + if (atomic_read(&dst->__refcnt)) { + dstp = &dst->next; + delayed++; + continue; + } + *dstp = dst->next; + + dst = dst_destroy(dst); + if (dst) { + /* NOHASH and still referenced. Unless it is already + * on gc list, invalidate it and add to gc list. + * + * Note: this is temporary. Actually, NOHASH dst's + * must be obsoleted when parent is obsoleted. + * But we do not have state "obsoleted, but + * referenced by parent", so it is right. + */ + if (dst->obsolete > 1) + continue; + + ___dst_free(dst); + dst->next = *dstp; + *dstp = dst; + dstp = &dst->next; + } + } + if (!dst_garbage_list) { + dst_gc_timer_inc = DST_GC_MAX; + goto out; + } + if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX) + dst_gc_timer_expires = DST_GC_MAX; + dst_gc_timer_inc += DST_GC_INC; + dst_gc_timer.expires = jiffies + dst_gc_timer_expires; +#if RT_CACHE_DEBUG >= 2 + printk("dst_total: %d/%d %ld\n", + atomic_read(&dst_total), delayed, dst_gc_timer_expires); +#endif + add_timer(&dst_gc_timer); + +out: + spin_unlock(&dst_lock); +} + +static int dst_discard_in(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +static int dst_discard_out(struct sk_buff *skb) +{ + kfree_skb(skb); + return 0; +} + +void * dst_alloc(struct dst_ops * ops) +{ + struct dst_entry * dst; + + if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc()) + return NULL; + } + dst = kmem_cache_alloc(ops->kmem_cachep, SLAB_ATOMIC); + if (!dst) + return NULL; + memset(dst, 0, ops->entry_size); + atomic_set(&dst->__refcnt, 0); + dst->ops = ops; + dst->lastuse = jiffies; + dst->path = dst; + dst->input = dst_discard_in; + dst->output = dst_discard_out; +#if RT_CACHE_DEBUG >= 2 + atomic_inc(&dst_total); +#endif + atomic_inc(&ops->entries); + return dst; +} + +static void ___dst_free(struct dst_entry * dst) +{ + /* The first case (dev==NULL) is required, when + protocol module is unloaded. 
+ */ + if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) { + dst->input = dst_discard_in; + dst->output = dst_discard_out; + } + dst->obsolete = 2; +} + +void __dst_free(struct dst_entry * dst) +{ + spin_lock_bh(&dst_lock); + ___dst_free(dst); + dst->next = dst_garbage_list; + dst_garbage_list = dst; + if (dst_gc_timer_inc > DST_GC_INC) { + dst_gc_timer_inc = DST_GC_INC; + dst_gc_timer_expires = DST_GC_MIN; + mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); + } + spin_unlock_bh(&dst_lock); +} + +struct dst_entry *dst_destroy(struct dst_entry * dst) +{ + struct dst_entry *child; + struct neighbour *neigh; + struct hh_cache *hh; + + smp_rmb(); + +again: + neigh = dst->neighbour; + hh = dst->hh; + child = dst->child; + + dst->hh = NULL; + if (hh && atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + + if (neigh) { + dst->neighbour = NULL; + neigh_release(neigh); + } + + atomic_dec(&dst->ops->entries); + + if (dst->ops->destroy) + dst->ops->destroy(dst); + if (dst->dev) + dev_put(dst->dev); +#if RT_CACHE_DEBUG >= 2 + atomic_dec(&dst_total); +#endif + kmem_cache_free(dst->ops->kmem_cachep, dst); + + dst = child; + if (dst) { + if (atomic_dec_and_test(&dst->__refcnt)) { + /* We were real parent of this dst, so kill child. */ + if (dst->flags&DST_NOHASH) + goto again; + } else { + /* Child is still referenced, return it for freeing. */ + if (dst->flags&DST_NOHASH) + return dst; + /* Child is still in his hash table */ + } + } + return NULL; +} + +/* Dirty hack. We did it in 2.2 (in __dst_free), + * we have _very_ good reasons not to repeat + * this mistake in 2.3, but we have no choice + * now. _It_ _is_ _explicit_ _deliberate_ + * _race_ _condition_. + * + * Commented and originally written by Alexey. + */ +static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + if (dst->ops->ifdown) + dst->ops->ifdown(dst, dev, unregister); + + if (dev != dst->dev) + return; + + if (!unregister) { + dst->input = dst_discard_in; + dst->output = dst_discard_out; + } else { + dst->dev = &loopback_dev; + dev_hold(&loopback_dev); + dev_put(dev); + if (dst->neighbour && dst->neighbour->dev == dev) { + dst->neighbour->dev = &loopback_dev; + dev_put(dev); + dev_hold(&loopback_dev); + } + } +} + +static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct dst_entry *dst; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + spin_lock_bh(&dst_lock); + for (dst = dst_garbage_list; dst; dst = dst->next) { + dst_ifdown(dst, dev, event != NETDEV_DOWN); + } + spin_unlock_bh(&dst_lock); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block dst_dev_notifier = { + .notifier_call = dst_dev_event, +}; + +void __init dst_init(void) +{ + register_netdevice_notifier(&dst_dev_notifier); +} + +EXPORT_SYMBOL(__dst_free); +EXPORT_SYMBOL(dst_alloc); +EXPORT_SYMBOL(dst_destroy); diff --git a/net/core/dv.c b/net/core/dv.c new file mode 100644 index 000000000000..3f25f4aa4e66 --- /dev/null +++ b/net/core/dv.c @@ -0,0 +1,548 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * Generic frame diversion + * + * Authors: + * Benoit LOCHER: initial integration within the kernel with support for ethernet + * Dave Miller: improvement on the code (correctness, performance and source files) + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char sysctl_divert_version[32]="0.46"; /* Current version */ + +static int __init dv_init(void) +{ + return 0; +} +module_init(dv_init); + +/* + * Allocate a divert_blk for a device. This must be an ethernet nic. + */ +int alloc_divert_blk(struct net_device *dev) +{ + int alloc_size = (sizeof(struct divert_blk) + 3) & ~3; + + dev->divert = NULL; + if (dev->type == ARPHRD_ETHER) { + dev->divert = (struct divert_blk *) + kmalloc(alloc_size, GFP_KERNEL); + if (dev->divert == NULL) { + printk(KERN_INFO "divert: unable to allocate divert_blk for %s\n", + dev->name); + return -ENOMEM; + } + + memset(dev->divert, 0, sizeof(struct divert_blk)); + dev_hold(dev); + } + + return 0; +} + +/* + * Free a divert_blk allocated by the above function, if it was + * allocated on that device. + */ +void free_divert_blk(struct net_device *dev) +{ + if (dev->divert) { + kfree(dev->divert); + dev->divert=NULL; + dev_put(dev); + } +} + +/* + * Adds a tcp/udp (source or dest) port to an array + */ +static int add_port(u16 ports[], u16 port) +{ + int i; + + if (port == 0) + return -EINVAL; + + /* Storing directly in network format for performance, + * thanks Dave :) + */ + port = htons(port); + + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + if (ports[i] == port) + return -EALREADY; + } + + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + if (ports[i] == 0) { + ports[i] = port; + return 0; + } + } + + return -ENOBUFS; +} + +/* + * Removes a port from an array tcp/udp (source or dest) + */ +static int remove_port(u16 ports[], u16 port) +{ + int i; + + if (port == 0) + return -EINVAL; + + /* Storing directly in network format for performance, + * thanks Dave ! 
+ */ + port = htons(port); + + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + if (ports[i] == port) { + ports[i] = 0; + return 0; + } + } + + return -EINVAL; +} + +/* Some basic sanity checks on the arguments passed to divert_ioctl() */ +static int check_args(struct divert_cf *div_cf, struct net_device **dev) +{ + char devname[32]; + int ret; + + if (dev == NULL) + return -EFAULT; + + /* GETVERSION: all other args are unused */ + if (div_cf->cmd == DIVCMD_GETVERSION) + return 0; + + /* Network device index should reasonably be between 0 and 1000 :) */ + if (div_cf->dev_index < 0 || div_cf->dev_index > 1000) + return -EINVAL; + + /* Let's try to find the ifname */ + sprintf(devname, "eth%d", div_cf->dev_index); + *dev = dev_get_by_name(devname); + + /* dev should NOT be null */ + if (*dev == NULL) + return -EINVAL; + + ret = 0; + + /* user issuing the ioctl must be a super one :) */ + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + + /* Device must have a divert_blk member NOT null */ + if ((*dev)->divert == NULL) + ret = -EINVAL; +out: + dev_put(*dev); + return ret; +} + +/* + * control function of the diverter + */ +#if 0 +#define DVDBG(a) \ + printk(KERN_DEBUG "divert_ioctl() line %d %s\n", __LINE__, (a)) +#else +#define DVDBG(a) +#endif + +int divert_ioctl(unsigned int cmd, struct divert_cf __user *arg) +{ + struct divert_cf div_cf; + struct divert_blk *div_blk; + struct net_device *dev; + int ret; + + switch (cmd) { + case SIOCGIFDIVERT: + DVDBG("SIOCGIFDIVERT, copy_from_user"); + if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf))) + return -EFAULT; + DVDBG("before check_args"); + ret = check_args(&div_cf, &dev); + if (ret) + return ret; + DVDBG("after checkargs"); + div_blk = dev->divert; + + DVDBG("befre switch()"); + switch (div_cf.cmd) { + case DIVCMD_GETSTATUS: + /* Now, just give the user the raw divert block + * for him to play with :) + */ + if (copy_to_user(div_cf.arg1.ptr, dev->divert, + sizeof(struct divert_blk))) + return -EFAULT; + break; + + case DIVCMD_GETVERSION: + DVDBG("GETVERSION: checking ptr"); + if (div_cf.arg1.ptr == NULL) + return -EINVAL; + DVDBG("GETVERSION: copying data to userland"); + if (copy_to_user(div_cf.arg1.ptr, + sysctl_divert_version, 32)) + return -EFAULT; + DVDBG("GETVERSION: data copied"); + break; + + default: + return -EINVAL; + } + + break; + + case SIOCSIFDIVERT: + if (copy_from_user(&div_cf, arg, sizeof(struct divert_cf))) + return -EFAULT; + + ret = check_args(&div_cf, &dev); + if (ret) + return ret; + + div_blk = dev->divert; + + switch(div_cf.cmd) { + case DIVCMD_RESET: + div_blk->divert = 0; + div_blk->protos = DIVERT_PROTO_NONE; + memset(div_blk->tcp_dst, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + memset(div_blk->tcp_src, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + memset(div_blk->udp_dst, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + memset(div_blk->udp_src, 0, + MAX_DIVERT_PORTS * sizeof(u16)); + return 0; + + case DIVCMD_DIVERT: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->divert) + return -EALREADY; + div_blk->divert = 1; + break; + + case DIVARG1_DISABLE: + if (!div_blk->divert) + return -EALREADY; + div_blk->divert = 0; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_IP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_IP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_IP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_IP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_IP; + break; + + default: 
+ return -EINVAL; + } + + break; + + case DIVCMD_TCP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_TCP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_TCP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_TCP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_TCP; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_TCPDST: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->tcp_dst, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->tcp_dst, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_TCPSRC: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->tcp_src, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->tcp_src, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_UDP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_UDP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_UDP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_UDP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_UDP; + break; + + default: + return -EINVAL; + } + + break; + + case DIVCMD_UDPDST: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->udp_dst, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->udp_dst, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_UDPSRC: + switch(div_cf.arg1.int32) { + case DIVARG1_ADD: + return add_port(div_blk->udp_src, + div_cf.arg2.uint16); + + case DIVARG1_REMOVE: + return remove_port(div_blk->udp_src, + div_cf.arg2.uint16); + + default: + return -EINVAL; + } + + break; + + case DIVCMD_ICMP: + switch(div_cf.arg1.int32) { + case DIVARG1_ENABLE: + if (div_blk->protos & DIVERT_PROTO_ICMP) + return -EALREADY; + div_blk->protos |= DIVERT_PROTO_ICMP; + break; + + case DIVARG1_DISABLE: + if (!(div_blk->protos & DIVERT_PROTO_ICMP)) + return -EALREADY; + div_blk->protos &= ~DIVERT_PROTO_ICMP; + break; + + default: + return -EINVAL; + } + + break; + + default: + return -EINVAL; + } + + break; + + default: + return -EINVAL; + } + + return 0; +} + + +/* + * Check if packet should have its dest mac address set to the box itself + * for diversion + */ + +#define ETH_DIVERT_FRAME(skb) \ + memcpy(eth_hdr(skb), skb->dev->dev_addr, ETH_ALEN); \ + skb->pkt_type=PACKET_HOST + +void divert_frame(struct sk_buff *skb) +{ + struct ethhdr *eth = eth_hdr(skb); + struct iphdr *iph; + struct tcphdr *tcph; + struct udphdr *udph; + struct divert_blk *divert = skb->dev->divert; + int i, src, dst; + unsigned char *skb_data_end = skb->data + skb->len; + + /* Packet is already aimed at us, return */ + if (!memcmp(eth, skb->dev->dev_addr, ETH_ALEN)) + return; + + /* proto is not IP, do nothing */ + if (eth->h_proto != htons(ETH_P_IP)) + return; + + /* Divert all IP frames ? */ + if (divert->protos & DIVERT_PROTO_IP) { + ETH_DIVERT_FRAME(skb); + return; + } + + /* Check for possible (maliciously) malformed IP frame (thanks Dave) */ + iph = (struct iphdr *) skb->data; + if (((iph->ihl<<2)+(unsigned char*)(iph)) >= skb_data_end) { + printk(KERN_INFO "divert: malformed IP packet !\n"); + return; + } + + switch (iph->protocol) { + /* Divert all ICMP frames ? 
*/ + case IPPROTO_ICMP: + if (divert->protos & DIVERT_PROTO_ICMP) { + ETH_DIVERT_FRAME(skb); + return; + } + break; + + /* Divert all TCP frames ? */ + case IPPROTO_TCP: + if (divert->protos & DIVERT_PROTO_TCP) { + ETH_DIVERT_FRAME(skb); + return; + } + + /* Check for possible (maliciously) malformed IP + * frame (thanx Dave) + */ + tcph = (struct tcphdr *) + (((unsigned char *)iph) + (iph->ihl<<2)); + if (((unsigned char *)(tcph+1)) >= skb_data_end) { + printk(KERN_INFO "divert: malformed TCP packet !\n"); + return; + } + + /* Divert some tcp dst/src ports only ?*/ + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + dst = divert->tcp_dst[i]; + src = divert->tcp_src[i]; + if ((dst && dst == tcph->dest) || + (src && src == tcph->source)) { + ETH_DIVERT_FRAME(skb); + return; + } + } + break; + + /* Divert all UDP frames ? */ + case IPPROTO_UDP: + if (divert->protos & DIVERT_PROTO_UDP) { + ETH_DIVERT_FRAME(skb); + return; + } + + /* Check for possible (maliciously) malformed IP + * packet (thanks Dave) + */ + udph = (struct udphdr *) + (((unsigned char *)iph) + (iph->ihl<<2)); + if (((unsigned char *)(udph+1)) >= skb_data_end) { + printk(KERN_INFO + "divert: malformed UDP packet !\n"); + return; + } + + /* Divert some udp dst/src ports only ? */ + for (i = 0; i < MAX_DIVERT_PORTS; i++) { + dst = divert->udp_dst[i]; + src = divert->udp_src[i]; + if ((dst && dst == udph->dest) || + (src && src == udph->source)) { + ETH_DIVERT_FRAME(skb); + return; + } + } + break; + } +} diff --git a/net/core/ethtool.c b/net/core/ethtool.c new file mode 100644 index 000000000000..f05fde97c43d --- /dev/null +++ b/net/core/ethtool.c @@ -0,0 +1,819 @@ +/* + * net/core/ethtool.c - Ethtool ioctl handler + * Copyright (c) 2003 Matthew Wilcox + * + * This file is where we call all the ethtool_ops commands to get + * the information ethtool needs. We fall back to calling do_ioctl() + * for drivers which haven't been converted to ethtool_ops yet. + * + * It's GPL, stupid. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Some useful ethtool_ops methods that're device independent. + * If we find that all drivers want to do the same thing here, + * we can turn these into dev_() function calls. + */ + +u32 ethtool_op_get_link(struct net_device *dev) +{ + return netif_carrier_ok(dev) ? 
1 : 0; +} + +u32 ethtool_op_get_tx_csum(struct net_device *dev) +{ + return (dev->features & NETIF_F_IP_CSUM) != 0; +} + +int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_IP_CSUM; + else + dev->features &= ~NETIF_F_IP_CSUM; + + return 0; +} + +u32 ethtool_op_get_sg(struct net_device *dev) +{ + return (dev->features & NETIF_F_SG) != 0; +} + +int ethtool_op_set_sg(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_SG; + else + dev->features &= ~NETIF_F_SG; + + return 0; +} + +u32 ethtool_op_get_tso(struct net_device *dev) +{ + return (dev->features & NETIF_F_TSO) != 0; +} + +int ethtool_op_set_tso(struct net_device *dev, u32 data) +{ + if (data) + dev->features |= NETIF_F_TSO; + else + dev->features &= ~NETIF_F_TSO; + + return 0; +} + +/* Handlers for each ethtool command */ + +static int ethtool_get_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd = { ETHTOOL_GSET }; + int err; + + if (!dev->ethtool_ops->get_settings) + return -EOPNOTSUPP; + + err = dev->ethtool_ops->get_settings(dev, &cmd); + if (err < 0) + return err; + + if (copy_to_user(useraddr, &cmd, sizeof(cmd))) + return -EFAULT; + return 0; +} + +static int ethtool_set_settings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_cmd cmd; + + if (!dev->ethtool_ops->set_settings) + return -EOPNOTSUPP; + + if (copy_from_user(&cmd, useraddr, sizeof(cmd))) + return -EFAULT; + + return dev->ethtool_ops->set_settings(dev, &cmd); +} + +static int ethtool_get_drvinfo(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_drvinfo info; + struct ethtool_ops *ops = dev->ethtool_ops; + + if (!ops->get_drvinfo) + return -EOPNOTSUPP; + + memset(&info, 0, sizeof(info)); + info.cmd = ETHTOOL_GDRVINFO; + ops->get_drvinfo(dev, &info); + + if (ops->self_test_count) + info.testinfo_len = ops->self_test_count(dev); + if (ops->get_stats_count) + info.n_stats = ops->get_stats_count(dev); + if (ops->get_regs_len) + info.regdump_len = ops->get_regs_len(dev); + if (ops->get_eeprom_len) + info.eedump_len = ops->get_eeprom_len(dev); + + if (copy_to_user(useraddr, &info, sizeof(info))) + return -EFAULT; + return 0; +} + +static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_regs regs; + struct ethtool_ops *ops = dev->ethtool_ops; + void *regbuf; + int reglen, ret; + + if (!ops->get_regs || !ops->get_regs_len) + return -EOPNOTSUPP; + + if (copy_from_user(®s, useraddr, sizeof(regs))) + return -EFAULT; + + reglen = ops->get_regs_len(dev); + if (regs.len > reglen) + regs.len = reglen; + + regbuf = kmalloc(reglen, GFP_USER); + if (!regbuf) + return -ENOMEM; + + ops->get_regs(dev, ®s, regbuf); + + ret = -EFAULT; + if (copy_to_user(useraddr, ®s, sizeof(regs))) + goto out; + useraddr += offsetof(struct ethtool_regs, data); + if (copy_to_user(useraddr, regbuf, regs.len)) + goto out; + ret = 0; + + out: + kfree(regbuf); + return ret; +} + +static int ethtool_get_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; + + if (!dev->ethtool_ops->get_wol) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_wol(dev, &wol); + + if (copy_to_user(useraddr, &wol, sizeof(wol))) + return -EFAULT; + return 0; +} + +static int ethtool_set_wol(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_wolinfo wol; + + if (!dev->ethtool_ops->set_wol) + return -EOPNOTSUPP; + + if (copy_from_user(&wol, useraddr, sizeof(wol))) + return -EFAULT; + + return 
dev->ethtool_ops->set_wol(dev, &wol); +} + +static int ethtool_get_msglevel(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GMSGLVL }; + + if (!dev->ethtool_ops->get_msglevel) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_msglevel(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_msglevel(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_msglevel) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + dev->ethtool_ops->set_msglevel(dev, edata.data); + return 0; +} + +static int ethtool_nway_reset(struct net_device *dev) +{ + if (!dev->ethtool_ops->nway_reset) + return -EOPNOTSUPP; + + return dev->ethtool_ops->nway_reset(dev); +} + +static int ethtool_get_link(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GLINK }; + + if (!dev->ethtool_ops->get_link) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_link(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_get_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->get_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(eeprom.len, GFP_USER); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) + goto out; + + ret = ops->get_eeprom(dev, &eeprom, data); + if (ret) + goto out; + + ret = -EFAULT; + if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) + goto out; + if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_set_eeprom(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_eeprom eeprom; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->set_eeprom || !ops->get_eeprom_len) + return -EOPNOTSUPP; + + if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) + return -EFAULT; + + /* Check for wrap and zero */ + if (eeprom.offset + eeprom.len <= eeprom.offset) + return -EINVAL; + + /* Check for exceeding total eeprom len */ + if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) + return -EINVAL; + + data = kmalloc(eeprom.len, GFP_USER); + if (!data) + return -ENOMEM; + + ret = -EFAULT; + if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) + goto out; + + ret = ops->set_eeprom(dev, &eeprom, data); + if (ret) + goto out; + + if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) + ret = -EFAULT; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_coalesce(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_coalesce(dev, &coalesce); + + if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) + return -EFAULT; + return 0; +} + +static int ethtool_set_coalesce(struct net_device 
*dev, void __user *useraddr) +{ + struct ethtool_coalesce coalesce; + + if (!dev->ethtool_ops->get_coalesce) + return -EOPNOTSUPP; + + if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) + return -EFAULT; + + return dev->ethtool_ops->set_coalesce(dev, &coalesce); +} + +static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; + + if (!dev->ethtool_ops->get_ringparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_ringparam(dev, &ringparam); + + if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_ringparam ringparam; + + if (!dev->ethtool_ops->set_ringparam) + return -EOPNOTSUPP; + + if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) + return -EFAULT; + + return dev->ethtool_ops->set_ringparam(dev, &ringparam); +} + +static int ethtool_get_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + dev->ethtool_ops->get_pauseparam(dev, &pauseparam); + + if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) + return -EFAULT; + return 0; +} + +static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_pauseparam pauseparam; + + if (!dev->ethtool_ops->get_pauseparam) + return -EOPNOTSUPP; + + if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) + return -EFAULT; + + return dev->ethtool_ops->set_pauseparam(dev, &pauseparam); +} + +static int ethtool_get_rx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GRXCSUM }; + + if (!dev->ethtool_ops->get_rx_csum) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_rx_csum(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_rx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_rx_csum) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + dev->ethtool_ops->set_rx_csum(dev, edata.data); + return 0; +} + +static int ethtool_get_tx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GTXCSUM }; + + if (!dev->ethtool_ops->get_tx_csum) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_tx_csum(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int __ethtool_set_sg(struct net_device *dev, u32 data) +{ + int err; + + if (!data && dev->ethtool_ops->set_tso) { + err = dev->ethtool_ops->set_tso(dev, 0); + if (err) + return err; + } + + return dev->ethtool_ops->set_sg(dev, data); +} + +static int ethtool_set_tx_csum(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + int err; + + if (!dev->ethtool_ops->set_tx_csum) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + if (!edata.data && dev->ethtool_ops->set_sg) { + err = __ethtool_set_sg(dev, 0); + if (err) + return err; + } + + return dev->ethtool_ops->set_tx_csum(dev, edata.data); +} + +static int ethtool_get_sg(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GSG }; + + if (!dev->ethtool_ops->get_sg) + return -EOPNOTSUPP; + 
+ edata.data = dev->ethtool_ops->get_sg(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_sg) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + if (edata.data && + !(dev->features & (NETIF_F_IP_CSUM | + NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM))) + return -EINVAL; + + return __ethtool_set_sg(dev, edata.data); +} + +static int ethtool_get_tso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata = { ETHTOOL_GTSO }; + + if (!dev->ethtool_ops->get_tso) + return -EOPNOTSUPP; + + edata.data = dev->ethtool_ops->get_tso(dev); + + if (copy_to_user(useraddr, &edata, sizeof(edata))) + return -EFAULT; + return 0; +} + +static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_value edata; + + if (!dev->ethtool_ops->set_tso) + return -EOPNOTSUPP; + + if (copy_from_user(&edata, useraddr, sizeof(edata))) + return -EFAULT; + + if (edata.data && !(dev->features & NETIF_F_SG)) + return -EINVAL; + + return dev->ethtool_ops->set_tso(dev, edata.data); +} + +static int ethtool_self_test(struct net_device *dev, char __user *useraddr) +{ + struct ethtool_test test; + struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret; + + if (!ops->self_test || !ops->self_test_count) + return -EOPNOTSUPP; + + if (copy_from_user(&test, useraddr, sizeof(test))) + return -EFAULT; + + test.len = ops->self_test_count(dev); + data = kmalloc(test.len * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->self_test(dev, &test, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &test, sizeof(test))) + goto out; + useraddr += sizeof(test); + if (copy_to_user(useraddr, data, test.len * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_get_strings(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_gstrings gstrings; + struct ethtool_ops *ops = dev->ethtool_ops; + u8 *data; + int ret; + + if (!ops->get_strings) + return -EOPNOTSUPP; + + if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) + return -EFAULT; + + switch (gstrings.string_set) { + case ETH_SS_TEST: + if (!ops->self_test_count) + return -EOPNOTSUPP; + gstrings.len = ops->self_test_count(dev); + break; + case ETH_SS_STATS: + if (!ops->get_stats_count) + return -EOPNOTSUPP; + gstrings.len = ops->get_stats_count(dev); + break; + default: + return -EINVAL; + } + + data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_strings(dev, gstrings.string_set, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) + goto out; + useraddr += sizeof(gstrings); + if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +static int ethtool_phys_id(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_value id; + + if (!dev->ethtool_ops->phys_id) + return -EOPNOTSUPP; + + if (copy_from_user(&id, useraddr, sizeof(id))) + return -EFAULT; + + return dev->ethtool_ops->phys_id(dev, id.data); +} + +static int ethtool_get_stats(struct net_device *dev, void __user *useraddr) +{ + struct ethtool_stats stats; + struct ethtool_ops *ops = dev->ethtool_ops; + u64 *data; + int ret; + + if (!ops->get_ethtool_stats || !ops->get_stats_count) + return -EOPNOTSUPP; + + 
if (copy_from_user(&stats, useraddr, sizeof(stats))) + return -EFAULT; + + stats.n_stats = ops->get_stats_count(dev); + data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); + if (!data) + return -ENOMEM; + + ops->get_ethtool_stats(dev, &stats, data); + + ret = -EFAULT; + if (copy_to_user(useraddr, &stats, sizeof(stats))) + goto out; + useraddr += sizeof(stats); + if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) + goto out; + ret = 0; + + out: + kfree(data); + return ret; +} + +/* The main entry point in this file. Called from net/core/dev.c */ + +int dev_ethtool(struct ifreq *ifr) +{ + struct net_device *dev = __dev_get_by_name(ifr->ifr_name); + void __user *useraddr = ifr->ifr_data; + u32 ethcmd; + int rc; + + /* + * XXX: This can be pushed down into the ethtool_* handlers that + * need it. Keep existing behaviour for the moment. + */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!dev || !netif_device_present(dev)) + return -ENODEV; + + if (!dev->ethtool_ops) + goto ioctl; + + if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd))) + return -EFAULT; + + if(dev->ethtool_ops->begin) + if ((rc = dev->ethtool_ops->begin(dev)) < 0) + return rc; + + switch (ethcmd) { + case ETHTOOL_GSET: + rc = ethtool_get_settings(dev, useraddr); + break; + case ETHTOOL_SSET: + rc = ethtool_set_settings(dev, useraddr); + break; + case ETHTOOL_GDRVINFO: + rc = ethtool_get_drvinfo(dev, useraddr); + + break; + case ETHTOOL_GREGS: + rc = ethtool_get_regs(dev, useraddr); + break; + case ETHTOOL_GWOL: + rc = ethtool_get_wol(dev, useraddr); + break; + case ETHTOOL_SWOL: + rc = ethtool_set_wol(dev, useraddr); + break; + case ETHTOOL_GMSGLVL: + rc = ethtool_get_msglevel(dev, useraddr); + break; + case ETHTOOL_SMSGLVL: + rc = ethtool_set_msglevel(dev, useraddr); + break; + case ETHTOOL_NWAY_RST: + rc = ethtool_nway_reset(dev); + break; + case ETHTOOL_GLINK: + rc = ethtool_get_link(dev, useraddr); + break; + case ETHTOOL_GEEPROM: + rc = ethtool_get_eeprom(dev, useraddr); + break; + case ETHTOOL_SEEPROM: + rc = ethtool_set_eeprom(dev, useraddr); + break; + case ETHTOOL_GCOALESCE: + rc = ethtool_get_coalesce(dev, useraddr); + break; + case ETHTOOL_SCOALESCE: + rc = ethtool_set_coalesce(dev, useraddr); + break; + case ETHTOOL_GRINGPARAM: + rc = ethtool_get_ringparam(dev, useraddr); + break; + case ETHTOOL_SRINGPARAM: + rc = ethtool_set_ringparam(dev, useraddr); + break; + case ETHTOOL_GPAUSEPARAM: + rc = ethtool_get_pauseparam(dev, useraddr); + break; + case ETHTOOL_SPAUSEPARAM: + rc = ethtool_set_pauseparam(dev, useraddr); + break; + case ETHTOOL_GRXCSUM: + rc = ethtool_get_rx_csum(dev, useraddr); + break; + case ETHTOOL_SRXCSUM: + rc = ethtool_set_rx_csum(dev, useraddr); + break; + case ETHTOOL_GTXCSUM: + rc = ethtool_get_tx_csum(dev, useraddr); + break; + case ETHTOOL_STXCSUM: + rc = ethtool_set_tx_csum(dev, useraddr); + break; + case ETHTOOL_GSG: + rc = ethtool_get_sg(dev, useraddr); + break; + case ETHTOOL_SSG: + rc = ethtool_set_sg(dev, useraddr); + break; + case ETHTOOL_GTSO: + rc = ethtool_get_tso(dev, useraddr); + break; + case ETHTOOL_STSO: + rc = ethtool_set_tso(dev, useraddr); + break; + case ETHTOOL_TEST: + rc = ethtool_self_test(dev, useraddr); + break; + case ETHTOOL_GSTRINGS: + rc = ethtool_get_strings(dev, useraddr); + break; + case ETHTOOL_PHYS_ID: + rc = ethtool_phys_id(dev, useraddr); + break; + case ETHTOOL_GSTATS: + rc = ethtool_get_stats(dev, useraddr); + break; + default: + rc = -EOPNOTSUPP; + } + + if(dev->ethtool_ops->complete) + dev->ethtool_ops->complete(dev); + return rc;
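/*
 * Illustrative aside (not part of the original patch): the switch above is
 * reached from userspace through the SIOCETHTOOL ioctl, with ifr_data
 * pointing at a command-specific ethtool structure whose first word is the
 * ETHTOOL_* command.  A minimal standalone sketch of that calling
 * convention, querying link state with ETHTOOL_GLINK; note that, per the
 * XXX comment above, this dispatcher requires CAP_NET_ADMIN for every
 * command, so the program below has to run with that capability.  The
 * device name "eth0" is only an example.
 */

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <linux/ethtool.h>
#include <linux/sockios.h>

int main(void)
{
        struct ethtool_value edata = { .cmd = ETHTOOL_GLINK };
        struct ifreq ifr;
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        memset(&ifr, 0, sizeof(ifr));
        strncpy(ifr.ifr_name, "eth0", IFNAMSIZ - 1);   /* example device */
        ifr.ifr_data = (char *)&edata;                 /* kernel copies it back */

        if (ioctl(fd, SIOCETHTOOL, &ifr) == 0)
                printf("link is %s\n", edata.data ? "up" : "down");
        return 0;
}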
+ + ioctl: + if (dev->do_ioctl) + return dev->do_ioctl(dev, ifr, SIOCETHTOOL); + return -EOPNOTSUPP; +} + +EXPORT_SYMBOL(dev_ethtool); +EXPORT_SYMBOL(ethtool_op_get_link); +EXPORT_SYMBOL(ethtool_op_get_sg); +EXPORT_SYMBOL(ethtool_op_get_tso); +EXPORT_SYMBOL(ethtool_op_get_tx_csum); +EXPORT_SYMBOL(ethtool_op_set_sg); +EXPORT_SYMBOL(ethtool_op_set_tso); +EXPORT_SYMBOL(ethtool_op_set_tx_csum); diff --git a/net/core/filter.c b/net/core/filter.c new file mode 100644 index 000000000000..f3b88205ace2 --- /dev/null +++ b/net/core/filter.c @@ -0,0 +1,432 @@ +/* + * Linux Socket Filter - Kernel level socket filtering + * + * Author: + * Jay Schulist + * + * Based on the design of: + * - The Berkeley Packet Filter + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Andi Kleen - Fix a few bad bugs and races. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* No hurry in this branch */ +static u8 *load_pointer(struct sk_buff *skb, int k) +{ + u8 *ptr = NULL; + + if (k >= SKF_NET_OFF) + ptr = skb->nh.raw + k - SKF_NET_OFF; + else if (k >= SKF_LL_OFF) + ptr = skb->mac.raw + k - SKF_LL_OFF; + + if (ptr >= skb->head && ptr < skb->tail) + return ptr; + return NULL; +} + +/** + * sk_run_filter - run a filter on a socket + * @skb: buffer to run the filter on + * @filter: filter to apply + * @flen: length of filter + * + * Decode and apply filter instructions to the skb->data. + * Return length to keep, 0 for none. skb is the data we are + * filtering, filter is the array of filter instructions, and + * len is the number of filter blocks in the array. + */ + +int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen) +{ + unsigned char *data = skb->data; + /* len is UNSIGNED. Byte wide insns relies only on implicit + type casts to prevent reading arbitrary memory locations. + */ + unsigned int len = skb->len-skb->data_len; + struct sock_filter *fentry; /* We walk down these */ + u32 A = 0; /* Accumulator */ + u32 X = 0; /* Index Register */ + u32 mem[BPF_MEMWORDS]; /* Scratch Memory Store */ + int k; + int pc; + + /* + * Process array of filter instructions. 
+ */ + for (pc = 0; pc < flen; pc++) { + fentry = &filter[pc]; + + switch (fentry->code) { + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + case BPF_ALU|BPF_ADD|BPF_K: + A += fentry->k; + continue; + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + case BPF_ALU|BPF_SUB|BPF_K: + A -= fentry->k; + continue; + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + case BPF_ALU|BPF_MUL|BPF_K: + A *= fentry->k; + continue; + case BPF_ALU|BPF_DIV|BPF_X: + if (X == 0) + return 0; + A /= X; + continue; + case BPF_ALU|BPF_DIV|BPF_K: + if (fentry->k == 0) + return 0; + A /= fentry->k; + continue; + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + case BPF_ALU|BPF_AND|BPF_K: + A &= fentry->k; + continue; + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + case BPF_ALU|BPF_OR|BPF_K: + A |= fentry->k; + continue; + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + case BPF_ALU|BPF_LSH|BPF_K: + A <<= fentry->k; + continue; + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + case BPF_ALU|BPF_RSH|BPF_K: + A >>= fentry->k; + continue; + case BPF_ALU|BPF_NEG: + A = -A; + continue; + case BPF_JMP|BPF_JA: + pc += fentry->k; + continue; + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & fentry->k) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? fentry->jt : fentry->jf; + continue; + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
fentry->jt : fentry->jf; + continue; + case BPF_LD|BPF_W|BPF_ABS: + k = fentry->k; + load_w: + if (k >= 0 && (unsigned int)(k+sizeof(u32)) <= len) { + A = ntohl(*(u32*)&data[k]); + continue; + } + if (k < 0) { + u8 *ptr; + + if (k >= SKF_AD_OFF) + break; + ptr = load_pointer(skb, k); + if (ptr) { + A = ntohl(*(u32*)ptr); + continue; + } + } else { + u32 _tmp, *p; + p = skb_header_pointer(skb, k, 4, &_tmp); + if (p != NULL) { + A = ntohl(*p); + continue; + } + } + return 0; + case BPF_LD|BPF_H|BPF_ABS: + k = fentry->k; + load_h: + if (k >= 0 && (unsigned int)(k + sizeof(u16)) <= len) { + A = ntohs(*(u16*)&data[k]); + continue; + } + if (k < 0) { + u8 *ptr; + + if (k >= SKF_AD_OFF) + break; + ptr = load_pointer(skb, k); + if (ptr) { + A = ntohs(*(u16*)ptr); + continue; + } + } else { + u16 _tmp, *p; + p = skb_header_pointer(skb, k, 2, &_tmp); + if (p != NULL) { + A = ntohs(*p); + continue; + } + } + return 0; + case BPF_LD|BPF_B|BPF_ABS: + k = fentry->k; +load_b: + if (k >= 0 && (unsigned int)k < len) { + A = data[k]; + continue; + } + if (k < 0) { + u8 *ptr; + + if (k >= SKF_AD_OFF) + break; + ptr = load_pointer(skb, k); + if (ptr) { + A = *ptr; + continue; + } + } else { + u8 _tmp, *p; + p = skb_header_pointer(skb, k, 1, &_tmp); + if (p != NULL) { + A = *p; + continue; + } + } + return 0; + case BPF_LD|BPF_W|BPF_LEN: + A = len; + continue; + case BPF_LDX|BPF_W|BPF_LEN: + X = len; + continue; + case BPF_LD|BPF_W|BPF_IND: + k = X + fentry->k; + goto load_w; + case BPF_LD|BPF_H|BPF_IND: + k = X + fentry->k; + goto load_h; + case BPF_LD|BPF_B|BPF_IND: + k = X + fentry->k; + goto load_b; + case BPF_LDX|BPF_B|BPF_MSH: + if (fentry->k >= len) + return 0; + X = (data[fentry->k] & 0xf) << 2; + continue; + case BPF_LD|BPF_IMM: + A = fentry->k; + continue; + case BPF_LDX|BPF_IMM: + X = fentry->k; + continue; + case BPF_LD|BPF_MEM: + A = mem[fentry->k]; + continue; + case BPF_LDX|BPF_MEM: + X = mem[fentry->k]; + continue; + case BPF_MISC|BPF_TAX: + X = A; + continue; + case BPF_MISC|BPF_TXA: + A = X; + continue; + case BPF_RET|BPF_K: + return ((unsigned int)fentry->k); + case BPF_RET|BPF_A: + return ((unsigned int)A); + case BPF_ST: + mem[fentry->k] = A; + continue; + case BPF_STX: + mem[fentry->k] = X; + continue; + default: + /* Invalid instruction counts as RET */ + return 0; + } + + /* + * Handle ancillary data, which are impossible + * (or very difficult) to get parsing packet contents. + */ + switch (k-SKF_AD_OFF) { + case SKF_AD_PROTOCOL: + A = htons(skb->protocol); + continue; + case SKF_AD_PKTTYPE: + A = skb->pkt_type; + continue; + case SKF_AD_IFINDEX: + A = skb->dev->ifindex; + continue; + default: + return 0; + } + } + + return 0; +} + +/** + * sk_chk_filter - verify socket filter code + * @filter: filter to verify + * @flen: length of filter + * + * Check the user's filter code. If we let some ugly + * filter code slip through kaboom! The filter must contain + * no references or jumps that are out of range, no illegal instructions + * and no backward jumps. It must end with a RET instruction + * + * Returns 0 if the rule set is legal or a negative errno code if not. 
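 *
 * (Illustrative aside, not part of the original comment: a classic program
 *  that this checker accepts is "pass only IPv4 frames" on a packet socket,
 *  where skb->data starts at the Ethernet header.  In struct sock_filter
 *  order {code, jt, jf, k}:
 *
 *      { BPF_LD  | BPF_H   | BPF_ABS, 0, 0, 12     }   load EtherType
 *      { BPF_JMP | BPF_JEQ | BPF_K,   0, 1, 0x0800 }   IPv4?
 *      { BPF_RET | BPF_K,             0, 0, 0xffff }   yes: keep up to 0xffff bytes
 *      { BPF_RET | BPF_K,             0, 0, 0      }   no: drop
 *
 *  every jump is forward and in range and the program ends in BPF_RET, so
 *  sk_chk_filter() returns 0 for it, and sk_run_filter() above then returns
 *  either 0xffff or 0 per packet.)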
+ */ +int sk_chk_filter(struct sock_filter *filter, int flen) +{ + struct sock_filter *ftest; + int pc; + + if (((unsigned int)flen >= (~0U / sizeof(struct sock_filter))) || flen == 0) + return -EINVAL; + + /* check the filter code now */ + for (pc = 0; pc < flen; pc++) { + /* all jumps are forward as they are not signed */ + ftest = &filter[pc]; + if (BPF_CLASS(ftest->code) == BPF_JMP) { + /* but they mustn't jump off the end */ + if (BPF_OP(ftest->code) == BPF_JA) { + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + } else { + /* for conditionals both must be safe */ + if (pc + ftest->jt +1 >= flen || + pc + ftest->jf +1 >= flen) + return -EINVAL; + } + } + + /* check that memory operations use valid addresses. */ + if (ftest->k >= BPF_MEMWORDS) { + /* but it might not be a memory operation... */ + switch (ftest->code) { + case BPF_ST: + case BPF_STX: + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + return -EINVAL; + } + } + } + + /* + * The program must end with a return. We don't care where they + * jumped within the script (its always forwards) but in the end + * they _will_ hit this. + */ + return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL; +} + +/** + * sk_attach_filter - attach a socket filter + * @fprog: the filter program + * @sk: the socket to use + * + * Attach the user's filter code. We first run some sanity checks on + * it to make sure it does not explode on us later. If an error + * occurs or there is insufficient memory for the filter a negative + * errno code is returned. On success the return is zero. + */ +int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) +{ + struct sk_filter *fp; + unsigned int fsize = sizeof(struct sock_filter) * fprog->len; + int err; + + /* Make sure new filter is there and in the right amounts. */ + if (fprog->filter == NULL || fprog->len > BPF_MAXINSNS) + return -EINVAL; + + fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL); + if (!fp) + return -ENOMEM; + if (copy_from_user(fp->insns, fprog->filter, fsize)) { + sock_kfree_s(sk, fp, fsize+sizeof(*fp)); + return -EFAULT; + } + + atomic_set(&fp->refcnt, 1); + fp->len = fprog->len; + + err = sk_chk_filter(fp->insns, fp->len); + if (!err) { + struct sk_filter *old_fp; + + spin_lock_bh(&sk->sk_lock.slock); + old_fp = sk->sk_filter; + sk->sk_filter = fp; + spin_unlock_bh(&sk->sk_lock.slock); + fp = old_fp; + } + + if (fp) + sk_filter_release(sk, fp); + return err; +} + +EXPORT_SYMBOL(sk_chk_filter); +EXPORT_SYMBOL(sk_run_filter); diff --git a/net/core/flow.c b/net/core/flow.c new file mode 100644 index 000000000000..f289570b15a3 --- /dev/null +++ b/net/core/flow.c @@ -0,0 +1,371 @@ +/* flow.c: Generic flow cache. + * + * Copyright (C) 2003 Alexey N. Kuznetsov (kuznet@ms2.inr.ac.ru) + * Copyright (C) 2003 David S. 
Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct flow_cache_entry { + struct flow_cache_entry *next; + u16 family; + u8 dir; + struct flowi key; + u32 genid; + void *object; + atomic_t *object_ref; +}; + +atomic_t flow_cache_genid = ATOMIC_INIT(0); + +static u32 flow_hash_shift; +#define flow_hash_size (1 << flow_hash_shift) +static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL }; + +#define flow_table(cpu) (per_cpu(flow_tables, cpu)) + +static kmem_cache_t *flow_cachep; + +static int flow_lwm, flow_hwm; + +struct flow_percpu_info { + int hash_rnd_recalc; + u32 hash_rnd; + int count; +} ____cacheline_aligned; +static DEFINE_PER_CPU(struct flow_percpu_info, flow_hash_info) = { 0 }; + +#define flow_hash_rnd_recalc(cpu) \ + (per_cpu(flow_hash_info, cpu).hash_rnd_recalc) +#define flow_hash_rnd(cpu) \ + (per_cpu(flow_hash_info, cpu).hash_rnd) +#define flow_count(cpu) \ + (per_cpu(flow_hash_info, cpu).count) + +static struct timer_list flow_hash_rnd_timer; + +#define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) + +struct flow_flush_info { + atomic_t cpuleft; + struct completion completion; +}; +static DEFINE_PER_CPU(struct tasklet_struct, flow_flush_tasklets) = { NULL }; + +#define flow_flush_tasklet(cpu) (&per_cpu(flow_flush_tasklets, cpu)) + +static void flow_cache_new_hashrnd(unsigned long arg) +{ + int i; + + for_each_cpu(i) + flow_hash_rnd_recalc(i) = 1; + + flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&flow_hash_rnd_timer); +} + +static void __flow_cache_shrink(int cpu, int shrink_to) +{ + struct flow_cache_entry *fle, **flp; + int i; + + for (i = 0; i < flow_hash_size; i++) { + int k = 0; + + flp = &flow_table(cpu)[i]; + while ((fle = *flp) != NULL && k < shrink_to) { + k++; + flp = &fle->next; + } + while ((fle = *flp) != NULL) { + *flp = fle->next; + if (fle->object) + atomic_dec(fle->object_ref); + kmem_cache_free(flow_cachep, fle); + flow_count(cpu)--; + } + } +} + +static void flow_cache_shrink(int cpu) +{ + int shrink_to = flow_lwm / flow_hash_size; + + __flow_cache_shrink(cpu, shrink_to); +} + +static void flow_new_hash_rnd(int cpu) +{ + get_random_bytes(&flow_hash_rnd(cpu), sizeof(u32)); + flow_hash_rnd_recalc(cpu) = 0; + + __flow_cache_shrink(cpu, 0); +} + +static u32 flow_hash_code(struct flowi *key, int cpu) +{ + u32 *k = (u32 *) key; + + return (jhash2(k, (sizeof(*key) / sizeof(u32)), flow_hash_rnd(cpu)) & + (flow_hash_size - 1)); +} + +#if (BITS_PER_LONG == 64) +typedef u64 flow_compare_t; +#else +typedef u32 flow_compare_t; +#endif + +extern void flowi_is_missized(void); + +/* I hear what you're saying, use memcmp. But memcmp cannot make + * important assumptions that we can here, such as alignment and + * constant size. 
+ */ +static int flow_key_compare(struct flowi *key1, struct flowi *key2) +{ + flow_compare_t *k1, *k1_lim, *k2; + const int n_elem = sizeof(struct flowi) / sizeof(flow_compare_t); + + if (sizeof(struct flowi) % sizeof(flow_compare_t)) + flowi_is_missized(); + + k1 = (flow_compare_t *) key1; + k1_lim = k1 + n_elem; + + k2 = (flow_compare_t *) key2; + + do { + if (*k1++ != *k2++) + return 1; + } while (k1 < k1_lim); + + return 0; +} + +void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, + flow_resolve_t resolver) +{ + struct flow_cache_entry *fle, **head; + unsigned int hash; + int cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + + fle = NULL; + /* Packet really early in init? Making flow_cache_init a + * pre-smp initcall would solve this. --RR */ + if (!flow_table(cpu)) + goto nocache; + + if (flow_hash_rnd_recalc(cpu)) + flow_new_hash_rnd(cpu); + hash = flow_hash_code(key, cpu); + + head = &flow_table(cpu)[hash]; + for (fle = *head; fle; fle = fle->next) { + if (fle->family == family && + fle->dir == dir && + flow_key_compare(key, &fle->key) == 0) { + if (fle->genid == atomic_read(&flow_cache_genid)) { + void *ret = fle->object; + + if (ret) + atomic_inc(fle->object_ref); + local_bh_enable(); + + return ret; + } + break; + } + } + + if (!fle) { + if (flow_count(cpu) > flow_hwm) + flow_cache_shrink(cpu); + + fle = kmem_cache_alloc(flow_cachep, SLAB_ATOMIC); + if (fle) { + fle->next = *head; + *head = fle; + fle->family = family; + fle->dir = dir; + memcpy(&fle->key, key, sizeof(*key)); + fle->object = NULL; + flow_count(cpu)++; + } + } + +nocache: + { + void *obj; + atomic_t *obj_ref; + + resolver(key, family, dir, &obj, &obj_ref); + + if (fle) { + fle->genid = atomic_read(&flow_cache_genid); + + if (fle->object) + atomic_dec(fle->object_ref); + + fle->object = obj; + fle->object_ref = obj_ref; + if (obj) + atomic_inc(fle->object_ref); + } + local_bh_enable(); + + return obj; + } +} + +static void flow_cache_flush_tasklet(unsigned long data) +{ + struct flow_flush_info *info = (void *)data; + int i; + int cpu; + + cpu = smp_processor_id(); + for (i = 0; i < flow_hash_size; i++) { + struct flow_cache_entry *fle; + + fle = flow_table(cpu)[i]; + for (; fle; fle = fle->next) { + unsigned genid = atomic_read(&flow_cache_genid); + + if (!fle->object || fle->genid == genid) + continue; + + fle->object = NULL; + atomic_dec(fle->object_ref); + } + } + + if (atomic_dec_and_test(&info->cpuleft)) + complete(&info->completion); +} + +static void flow_cache_flush_per_cpu(void *) __attribute__((__unused__)); +static void flow_cache_flush_per_cpu(void *data) +{ + struct flow_flush_info *info = data; + int cpu; + struct tasklet_struct *tasklet; + + cpu = smp_processor_id(); + + tasklet = flow_flush_tasklet(cpu); + tasklet->data = (unsigned long)info; + tasklet_schedule(tasklet); +} + +void flow_cache_flush(void) +{ + struct flow_flush_info info; + static DECLARE_MUTEX(flow_flush_sem); + + /* Don't want cpus going down or up during this. 
*/ + lock_cpu_hotplug(); + down(&flow_flush_sem); + atomic_set(&info.cpuleft, num_online_cpus()); + init_completion(&info.completion); + + local_bh_disable(); + smp_call_function(flow_cache_flush_per_cpu, &info, 1, 0); + flow_cache_flush_tasklet((unsigned long)&info); + local_bh_enable(); + + wait_for_completion(&info.completion); + up(&flow_flush_sem); + unlock_cpu_hotplug(); +} + +static void __devinit flow_cache_cpu_prepare(int cpu) +{ + struct tasklet_struct *tasklet; + unsigned long order; + + for (order = 0; + (PAGE_SIZE << order) < + (sizeof(struct flow_cache_entry *)*flow_hash_size); + order++) + /* NOTHING */; + + flow_table(cpu) = (struct flow_cache_entry **) + __get_free_pages(GFP_KERNEL, order); + if (!flow_table(cpu)) + panic("NET: failed to allocate flow cache order %lu\n", order); + + memset(flow_table(cpu), 0, PAGE_SIZE << order); + + flow_hash_rnd_recalc(cpu) = 1; + flow_count(cpu) = 0; + + tasklet = flow_flush_tasklet(cpu); + tasklet_init(tasklet, flow_cache_flush_tasklet, 0); +} + +#ifdef CONFIG_HOTPLUG_CPU +static int flow_cache_cpu(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + if (action == CPU_DEAD) + __flow_cache_shrink((unsigned long)hcpu, 0); + return NOTIFY_OK; +} +#endif /* CONFIG_HOTPLUG_CPU */ + +static int __init flow_cache_init(void) +{ + int i; + + flow_cachep = kmem_cache_create("flow_cache", + sizeof(struct flow_cache_entry), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!flow_cachep) + panic("NET: failed to allocate flow cache slab\n"); + + flow_hash_shift = 10; + flow_lwm = 2 * flow_hash_size; + flow_hwm = 4 * flow_hash_size; + + init_timer(&flow_hash_rnd_timer); + flow_hash_rnd_timer.function = flow_cache_new_hashrnd; + flow_hash_rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&flow_hash_rnd_timer); + + for_each_cpu(i) + flow_cache_cpu_prepare(i); + + hotcpu_notifier(flow_cache_cpu, 0); + return 0; +} + +module_init(flow_cache_init); + +EXPORT_SYMBOL(flow_cache_genid); +EXPORT_SYMBOL(flow_cache_lookup); diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c new file mode 100644 index 000000000000..b07c029e8219 --- /dev/null +++ b/net/core/gen_estimator.c @@ -0,0 +1,250 @@ +/* + * net/sched/gen_estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * Jamal Hadi Salim - moved it to net/core and reshulfed + * names to make it usable in general net subsystem. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to built an estimator into the scheduler, + but run this task separately. 
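   (Illustrative aside, not part of the original comment: the surviving code
    further down shows the actual update step, an exponentially weighted
    moving average of the form

        avrate += (rate - avrate) >> ewma_log;

    i.e. each new rate sample is folded in with weight 2^-ewma_log, and the
    averaging window is selected by the per-estimator interval parameter.
    The stored averages are kept scaled, avbps by 2^5 and avpps by 2^10,
    as seen in the ">>5" and ">>10" where bps and pps are reported.)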
+ Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<next) { + u64 nbytes; + u32 npackets; + u32 rate; + + spin_lock(e->stats_lock); + nbytes = e->bstats->bytes; + npackets = e->bstats->packets; + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + e->rate_est->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->rate_est->pps = (e->avpps+0x1FF)>>10; + spin_unlock(e->stats_lock); + } + + mod_timer(&elist[idx].timer, jiffies + ((HZ<interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->bstats = bstats; + est->rate_est = rate_est; + est->stats_lock = stats_lock; + est->ewma_log = parm->ewma_log; + est->last_bytes = bstats->bytes; + est->avbps = rate_est->bps<<5; + est->last_packets = bstats->packets; + est->avpps = rate_est->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ<interval)/4); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + write_lock_bh(&est_lock); + elist[est->interval].list = est; + write_unlock_bh(&est_lock); + return 0; +} + +/** + * gen_kill_estimator - remove a rate estimator + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * + * Removes the rate estimator specified by &bstats and &rate_est + * and deletes the timer. + */ +void gen_kill_estimator(struct gnet_stats_basic *bstats, + struct gnet_stats_rate_est *rate_est) +{ + int idx; + struct gen_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->rate_est != rate_est || est->bstats != bstats) { + pest = &est->next; + continue; + } + + write_lock_bh(&est_lock); + *pest = est->next; + write_unlock_bh(&est_lock); + + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + +/** + * gen_replace_estimator - replace rate estimator configruation + * @bstats: basic statistics + * @rate_est: rate estimator statistics + * @stats_lock: statistics lock + * @opt: rate estimator configuration TLV + * + * Replaces the configuration of a rate estimator by calling + * gen_kill_estimator() and gen_new_estimator(). + * + * Returns 0 on success or a negative error code. 
+ */ +int +gen_replace_estimator(struct gnet_stats_basic *bstats, + struct gnet_stats_rate_est *rate_est, spinlock_t *stats_lock, + struct rtattr *opt) +{ + gen_kill_estimator(bstats, rate_est); + return gen_new_estimator(bstats, rate_est, stats_lock, opt); +} + + +EXPORT_SYMBOL(gen_kill_estimator); +EXPORT_SYMBOL(gen_new_estimator); +EXPORT_SYMBOL(gen_replace_estimator); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c new file mode 100644 index 000000000000..8f21490355fa --- /dev/null +++ b/net/core/gen_stats.c @@ -0,0 +1,239 @@ +/* + * net/core/gen_stats.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * Jamal Hadi Salim + * Alexey Kuznetsov, + * + * See Documentation/networking/gen_stats.txt + */ + +#include +#include +#include +#include +#include +#include +#include +#include + + +static inline int +gnet_stats_copy(struct gnet_dump *d, int type, void *buf, int size) +{ + RTA_PUT(d->skb, type, size, buf); + return 0; + +rtattr_failure: + spin_unlock_bh(d->lock); + return -1; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @tc_stats_type: TLV type for backward compatibility struct tc_stats TLV + * @xstats_type: TLV type for backward compatibility xstats TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * The dumping handle is marked to be in backward compatibility mode telling + * all gnet_stats_copy_XXX() functions to fill a local copy of struct tc_stats. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy_compat(struct sk_buff *skb, int type, int tc_stats_type, + int xstats_type, spinlock_t *lock, struct gnet_dump *d) +{ + memset(d, 0, sizeof(*d)); + + spin_lock_bh(lock); + d->lock = lock; + if (type) + d->tail = (struct rtattr *) skb->tail; + d->skb = skb; + d->compat_tc_stats = tc_stats_type; + d->compat_xstats = xstats_type; + + if (d->tail) + return gnet_stats_copy(d, type, NULL, 0); + + return 0; +} + +/** + * gnet_stats_start_copy_compat - start dumping procedure in compatibility mode + * @skb: socket buffer to put statistics TLVs into + * @type: TLV type for top level statistic TLV + * @lock: statistics lock + * @d: dumping handle + * + * Initializes the dumping handle, grabs the statistic lock and appends + * an empty TLV header to the socket buffer for use a container for all + * other statistic TLVS. + * + * Returns 0 on success or -1 if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_start_copy(struct sk_buff *skb, int type, spinlock_t *lock, + struct gnet_dump *d) +{ + return gnet_stats_start_copy_compat(skb, type, 0, 0, lock, d); +} + +/** + * gnet_stats_copy_basic - copy basic statistics into statistic TLV + * @d: dumping handle + * @b: basic statistics + * + * Appends the basic statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. 
+ */ +int +gnet_stats_copy_basic(struct gnet_dump *d, struct gnet_stats_basic *b) +{ + if (d->compat_tc_stats) { + d->tc_stats.bytes = b->bytes; + d->tc_stats.packets = b->packets; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_BASIC, b, sizeof(*b)); + + return 0; +} + +/** + * gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV + * @d: dumping handle + * @r: rate estimator statistics + * + * Appends the rate estimator statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_rate_est(struct gnet_dump *d, struct gnet_stats_rate_est *r) +{ + if (d->compat_tc_stats) { + d->tc_stats.bps = r->bps; + d->tc_stats.pps = r->pps; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_RATE_EST, r, sizeof(*r)); + + return 0; +} + +/** + * gnet_stats_copy_queue - copy queue statistics into statistics TLV + * @d: dumping handle + * @q: queue statistics + * + * Appends the queue statistics to the top level TLV created by + * gnet_stats_start_copy(). + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_queue(struct gnet_dump *d, struct gnet_stats_queue *q) +{ + if (d->compat_tc_stats) { + d->tc_stats.drops = q->drops; + d->tc_stats.qlen = q->qlen; + d->tc_stats.backlog = q->backlog; + d->tc_stats.overlimits = q->overlimits; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_QUEUE, q, sizeof(*q)); + + return 0; +} + +/** + * gnet_stats_copy_app - copy application specific statistics into statistics TLV + * @d: dumping handle + * @st: application specific statistics data + * @len: length of data + * + * Appends the application sepecific statistics to the top level TLV created by + * gnet_stats_start_copy() and remembers the data for XSTATS if the dumping + * handle is in backward compatibility mode. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. + */ +int +gnet_stats_copy_app(struct gnet_dump *d, void *st, int len) +{ + if (d->compat_xstats) { + d->xstats = st; + d->xstats_len = len; + } + + if (d->tail) + return gnet_stats_copy(d, TCA_STATS_APP, st, len); + + return 0; +} + +/** + * gnet_stats_finish_copy - finish dumping procedure + * @d: dumping handle + * + * Corrects the length of the top level TLV to include all TLVs added + * by gnet_stats_copy_XXX() calls. Adds the backward compatibility TLVs + * if gnet_stats_start_copy_compat() was used and releases the statistics + * lock. + * + * Returns 0 on success or -1 with the statistic lock released + * if the room in the socket buffer was not sufficient. 
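 *
 * (Illustrative aside, not part of the original comment: a typical dump
 *  routine, e.g. a qdisc dump, is expected to chain the helpers in this
 *  file roughly as
 *
 *      struct gnet_dump d;
 *
 *      if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS,
 *                                       TCA_XSTATS, stats_lock, &d) < 0 ||
 *          gnet_stats_copy_basic(&d, &bstats) < 0 ||
 *          gnet_stats_copy_queue(&d, &qstats) < 0 ||
 *          gnet_stats_finish_copy(&d) < 0)
 *              goto rtattr_failure;
 *
 *  where TCA_STATS2/TCA_STATS/TCA_XSTATS, stats_lock, bstats and qstats
 *  stand in for the caller's own attribute types, lock and counters; on
 *  any failure the helpers have already dropped the statistics lock, so
 *  the caller must not unlock it again.)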
+ */ +int +gnet_stats_finish_copy(struct gnet_dump *d) +{ + if (d->tail) + d->tail->rta_len = d->skb->tail - (u8 *) d->tail; + + if (d->compat_tc_stats) + if (gnet_stats_copy(d, d->compat_tc_stats, &d->tc_stats, + sizeof(d->tc_stats)) < 0) + return -1; + + if (d->compat_xstats && d->xstats) { + if (gnet_stats_copy(d, d->compat_xstats, d->xstats, + d->xstats_len) < 0) + return -1; + } + + spin_unlock_bh(d->lock); + return 0; +} + + +EXPORT_SYMBOL(gnet_stats_start_copy); +EXPORT_SYMBOL(gnet_stats_start_copy_compat); +EXPORT_SYMBOL(gnet_stats_copy_basic); +EXPORT_SYMBOL(gnet_stats_copy_rate_est); +EXPORT_SYMBOL(gnet_stats_copy_queue); +EXPORT_SYMBOL(gnet_stats_copy_app); +EXPORT_SYMBOL(gnet_stats_finish_copy); diff --git a/net/core/iovec.c b/net/core/iovec.c new file mode 100644 index 000000000000..d57ace949ab8 --- /dev/null +++ b/net/core/iovec.c @@ -0,0 +1,239 @@ +/* + * iovec manipulation routines. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Andrew Lunn : Errors in iovec copying. + * Pedro Roque : Added memcpy_fromiovecend and + * csum_..._fromiovecend. + * Andi Kleen : fixed error handling for 2.1 + * Alexey Kuznetsov: 2.1 optimisations + * Andi Kleen : Fix csum*fromiovecend for IPv6. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Verify iovec. The caller must ensure that the iovec is big enough + * to hold the message iovec. + * + * Save time not doing verify_area. copy_*_user will make this work + * in any case. + */ + +int verify_iovec(struct msghdr *m, struct iovec *iov, char *address, int mode) +{ + int size, err, ct; + + if (m->msg_namelen) { + if (mode == VERIFY_READ) { + err = move_addr_to_kernel(m->msg_name, m->msg_namelen, + address); + if (err < 0) + return err; + } + m->msg_name = address; + } else { + m->msg_name = NULL; + } + + size = m->msg_iovlen * sizeof(struct iovec); + if (copy_from_user(iov, m->msg_iov, size)) + return -EFAULT; + + m->msg_iov = iov; + err = 0; + + for (ct = 0; ct < m->msg_iovlen; ct++) { + err += iov[ct].iov_len; + /* + * Goal is not to verify user data, but to prevent returning + * negative value, which is interpreted as errno. + * Overflow is still possible, but it is harmless. + */ + if (err < 0) + return -EMSGSIZE; + } + + return err; +} + +/* + * Copy kernel to iovec. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. + */ + +int memcpy_toiovec(struct iovec *iov, unsigned char *kdata, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + if (copy_to_user(iov->iov_base, kdata, copy)) + return -EFAULT; + kdata += copy; + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return 0; +} + +/* + * Copy iovec to kernel. Returns -EFAULT on error. + * + * Note: this modifies the original iovec. 
+ */ + +int memcpy_fromiovec(unsigned char *kdata, struct iovec *iov, int len) +{ + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, len, iov->iov_len); + if (copy_from_user(kdata, iov->iov_base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov->iov_base += copy; + iov->iov_len -= copy; + } + iov++; + } + + return 0; +} + +/* + * For use with ip_build_xmit + */ +int memcpy_fromiovecend(unsigned char *kdata, struct iovec *iov, int offset, + int len) +{ + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + if (copy_from_user(kdata, base, copy)) + return -EFAULT; + len -= copy; + kdata += copy; + iov++; + } + + return 0; +} + +/* + * And now for the all-in-one: copy and checksum from a user iovec + * directly to a datagram + * Calls to csum_partial but the last must be in 32 bit chunks + * + * ip_build_xmit must ensure that when fragmenting only the last + * call to this function will be unaligned also. + */ +int csum_partial_copy_fromiovecend(unsigned char *kdata, struct iovec *iov, + int offset, unsigned int len, int *csump) +{ + int csum = *csump; + int partial_cnt = 0, err = 0; + + /* Skip over the finished iovecs */ + while (offset >= iov->iov_len) { + offset -= iov->iov_len; + iov++; + } + + while (len > 0) { + u8 __user *base = iov->iov_base + offset; + int copy = min_t(unsigned int, len, iov->iov_len - offset); + + offset = 0; + + /* There is a remnant from previous iov. */ + if (partial_cnt) { + int par_len = 4 - partial_cnt; + + /* iov component is too short ... */ + if (par_len > copy) { + if (copy_from_user(kdata, base, copy)) + goto out_fault; + kdata += copy; + base += copy; + partial_cnt += copy; + len -= copy; + iov++; + if (len) + continue; + *csump = csum_partial(kdata - partial_cnt, + partial_cnt, csum); + goto out; + } + if (copy_from_user(kdata, base, par_len)) + goto out_fault; + csum = csum_partial(kdata - partial_cnt, 4, csum); + kdata += par_len; + base += par_len; + copy -= par_len; + len -= par_len; + partial_cnt = 0; + } + + if (len > copy) { + partial_cnt = copy % 4; + if (partial_cnt) { + copy -= partial_cnt; + if (copy_from_user(kdata + copy, base + copy, + partial_cnt)) + goto out_fault; + } + } + + if (copy) { + csum = csum_and_copy_from_user(base, kdata, copy, + csum, &err); + if (err) + goto out; + } + len -= copy + partial_cnt; + kdata += copy + partial_cnt; + iov++; + } + *csump = csum; +out: + return err; + +out_fault: + err = -EFAULT; + goto out; +} + +EXPORT_SYMBOL(csum_partial_copy_fromiovecend); +EXPORT_SYMBOL(memcpy_fromiovec); +EXPORT_SYMBOL(memcpy_fromiovecend); +EXPORT_SYMBOL(memcpy_toiovec); diff --git a/net/core/link_watch.c b/net/core/link_watch.c new file mode 100644 index 000000000000..4859b7446c6f --- /dev/null +++ b/net/core/link_watch.c @@ -0,0 +1,137 @@ +/* + * Linux network device link state notification + * + * Author: + * Stefan Rompf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +enum lw_bits { + LW_RUNNING = 0, + LW_SE_USED +}; + +static unsigned long linkwatch_flags; +static unsigned long linkwatch_nextevent; + +static void linkwatch_event(void *dummy); +static DECLARE_WORK(linkwatch_work, linkwatch_event, NULL); + +static LIST_HEAD(lweventlist); +static DEFINE_SPINLOCK(lweventlist_lock); + +struct lw_event { + struct list_head list; + struct net_device *dev; +}; + +/* Avoid kmalloc() for most systems */ +static struct lw_event singleevent; + +/* Must be called with the rtnl semaphore held */ +void linkwatch_run_queue(void) +{ + LIST_HEAD(head); + struct list_head *n, *next; + + spin_lock_irq(&lweventlist_lock); + list_splice_init(&lweventlist, &head); + spin_unlock_irq(&lweventlist_lock); + + list_for_each_safe(n, next, &head) { + struct lw_event *event = list_entry(n, struct lw_event, list); + struct net_device *dev = event->dev; + + if (event == &singleevent) { + clear_bit(LW_SE_USED, &linkwatch_flags); + } else { + kfree(event); + } + + /* We are about to handle this device, + * so new events can be accepted + */ + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + + if (dev->flags & IFF_UP) { + netdev_state_change(dev); + } + + dev_put(dev); + } +} + + +static void linkwatch_event(void *dummy) +{ + /* Limit the number of linkwatch events to one + * per second so that a runaway driver does not + * cause a storm of messages on the netlink + * socket + */ + linkwatch_nextevent = jiffies + HZ; + clear_bit(LW_RUNNING, &linkwatch_flags); + + rtnl_shlock(); + linkwatch_run_queue(); + rtnl_shunlock(); +} + + +void linkwatch_fire_event(struct net_device *dev) +{ + if (!test_and_set_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state)) { + unsigned long flags; + struct lw_event *event; + + if (test_and_set_bit(LW_SE_USED, &linkwatch_flags)) { + event = kmalloc(sizeof(struct lw_event), GFP_ATOMIC); + + if (unlikely(event == NULL)) { + clear_bit(__LINK_STATE_LINKWATCH_PENDING, &dev->state); + return; + } + } else { + event = &singleevent; + } + + dev_hold(dev); + event->dev = dev; + + spin_lock_irqsave(&lweventlist_lock, flags); + list_add_tail(&event->list, &lweventlist); + spin_unlock_irqrestore(&lweventlist_lock, flags); + + if (!test_and_set_bit(LW_RUNNING, &linkwatch_flags)) { + unsigned long thisevent = jiffies; + + if (thisevent >= linkwatch_nextevent) { + schedule_work(&linkwatch_work); + } else { + schedule_delayed_work(&linkwatch_work, linkwatch_nextevent - thisevent); + } + } + } +} + +EXPORT_SYMBOL(linkwatch_fire_event); diff --git a/net/core/neighbour.c b/net/core/neighbour.c new file mode 100644 index 000000000000..0a2f67bbef2e --- /dev/null +++ b/net/core/neighbour.c @@ -0,0 +1,2362 @@ +/* + * Generic address resolution entity + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov releasing NULL neighbor in neigh_add. + * Harald Welte Add neighbour cache statistics like rtstat + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include +#include +#include +#include +#include + +#define NEIGH_DEBUG 1 + +#define NEIGH_PRINTK(x...) 
printk(x) +#define NEIGH_NOPRINTK(x...) do { ; } while(0) +#define NEIGH_PRINTK0 NEIGH_PRINTK +#define NEIGH_PRINTK1 NEIGH_NOPRINTK +#define NEIGH_PRINTK2 NEIGH_NOPRINTK + +#if NEIGH_DEBUG >= 1 +#undef NEIGH_PRINTK1 +#define NEIGH_PRINTK1 NEIGH_PRINTK +#endif +#if NEIGH_DEBUG >= 2 +#undef NEIGH_PRINTK2 +#define NEIGH_PRINTK2 NEIGH_PRINTK +#endif + +#define PNEIGH_HASHMASK 0xF + +static void neigh_timer_handler(unsigned long arg); +#ifdef CONFIG_ARPD +static void neigh_app_notify(struct neighbour *n); +#endif +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); + +static struct neigh_table *neigh_tables; +static struct file_operations neigh_stat_seq_fops; + +/* + Neighbour hash table buckets are protected with rwlock tbl->lock. + + - All the scans/updates to hash buckets MUST be made under this lock. + - NOTHING clever should be made under this lock: no callbacks + to protocol backends, no attempts to send something to network. + It will result in deadlocks, if backend/driver wants to use neighbour + cache. + - If the entry requires some non-trivial actions, increase + its reference count and release table lock. + + Neighbour entries are protected: + - with reference count. + - with rwlock neigh->lock + + Reference count prevents destruction. + + neigh->lock mainly serializes ll address data and its validity state. + However, the same lock is used to protect another entry fields: + - timer + - resolution queue + + Again, nothing clever shall be made under neigh->lock, + the most complicated procedure, which we allow is dev->hard_header. + It is supposed, that dev->hard_header is simplistic and does + not make callbacks to neighbour tables. + + The last lock is neigh_tbl_lock. It is pure SMP lock, protecting + list of neighbour tables. This list is used only in process context, + */ + +static DEFINE_RWLOCK(neigh_tbl_lock); + +static int neigh_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return -ENETDOWN; +} + +/* + * It is random distribution in the interval (1/2)*base...(3/2)*base. + * It corresponds to default IPv6 settings and is not overridable, + * because it is really reasonable choice. + */ + +unsigned long neigh_rand_reach_time(unsigned long base) +{ + return (base ? (net_random() % base) + (base >> 1) : 0); +} + + +static int neigh_forced_gc(struct neigh_table *tbl) +{ + int shrunk = 0; + int i; + + NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); + + write_lock_bh(&tbl->lock); + for (i = 0; i <= tbl->hash_mask; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + /* Neighbour record may be discarded if: + * - nobody refers to it. 
+ * - it is not permanent + */ + write_lock(&n->lock); + if (atomic_read(&n->refcnt) == 1 && + !(n->nud_state & NUD_PERMANENT)) { + *np = n->next; + n->dead = 1; + shrunk = 1; + write_unlock(&n->lock); + neigh_release(n); + continue; + } + write_unlock(&n->lock); + np = &n->next; + } + } + + tbl->last_flush = jiffies; + + write_unlock_bh(&tbl->lock); + + return shrunk; +} + +static int neigh_del_timer(struct neighbour *n) +{ + if ((n->nud_state & NUD_IN_TIMER) && + del_timer(&n->timer)) { + neigh_release(n); + return 1; + } + return 0; +} + +static void pneigh_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(list)) != NULL) { + dev_put(skb->dev); + kfree_skb(skb); + } +} + +void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + + write_lock_bh(&tbl->lock); + + for (i=0; i <= tbl->hash_mask; i++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[i]; + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + write_lock_bh(&n->lock); + n->dead = 1; + neigh_del_timer(n); + write_unlock_bh(&n->lock); + neigh_release(n); + } + } + + write_unlock_bh(&tbl->lock); +} + +int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + int i; + + write_lock_bh(&tbl->lock); + + for (i = 0; i <= tbl->hash_mask; i++) { + struct neighbour *n, **np = &tbl->hash_buckets[i]; + + while ((n = *np) != NULL) { + if (dev && n->dev != dev) { + np = &n->next; + continue; + } + *np = n->next; + write_lock(&n->lock); + neigh_del_timer(n); + n->dead = 1; + + if (atomic_read(&n->refcnt) != 1) { + /* The most unpleasant situation. + We must destroy neighbour entry, + but someone still uses it. + + The destroy will be delayed until + the last user releases us, but + we must kill timers etc. and move + it to safe state. 
+ */ + skb_queue_purge(&n->arp_queue); + n->output = neigh_blackhole; + if (n->nud_state & NUD_VALID) + n->nud_state = NUD_NOARP; + else + n->nud_state = NUD_NONE; + NEIGH_PRINTK2("neigh %p is stray.\n", n); + } + write_unlock(&n->lock); + neigh_release(n); + } + } + + pneigh_ifdown(tbl, dev); + write_unlock_bh(&tbl->lock); + + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + return 0; +} + +static struct neighbour *neigh_alloc(struct neigh_table *tbl) +{ + struct neighbour *n = NULL; + unsigned long now = jiffies; + int entries; + + entries = atomic_inc_return(&tbl->entries) - 1; + if (entries >= tbl->gc_thresh3 || + (entries >= tbl->gc_thresh2 && + time_after(now, tbl->last_flush + 5 * HZ))) { + if (!neigh_forced_gc(tbl) && + entries >= tbl->gc_thresh3) + goto out_entries; + } + + n = kmem_cache_alloc(tbl->kmem_cachep, SLAB_ATOMIC); + if (!n) + goto out_entries; + + memset(n, 0, tbl->entry_size); + + skb_queue_head_init(&n->arp_queue); + rwlock_init(&n->lock); + n->updated = n->used = now; + n->nud_state = NUD_NONE; + n->output = neigh_blackhole; + n->parms = neigh_parms_clone(&tbl->parms); + init_timer(&n->timer); + n->timer.function = neigh_timer_handler; + n->timer.data = (unsigned long)n; + + NEIGH_CACHE_STAT_INC(tbl, allocs); + n->tbl = tbl; + atomic_set(&n->refcnt, 1); + n->dead = 1; +out: + return n; + +out_entries: + atomic_dec(&tbl->entries); + goto out; +} + +static struct neighbour **neigh_hash_alloc(unsigned int entries) +{ + unsigned long size = entries * sizeof(struct neighbour *); + struct neighbour **ret; + + if (size <= PAGE_SIZE) { + ret = kmalloc(size, GFP_ATOMIC); + } else { + ret = (struct neighbour **) + __get_free_pages(GFP_ATOMIC, get_order(size)); + } + if (ret) + memset(ret, 0, size); + + return ret; +} + +static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +{ + unsigned long size = entries * sizeof(struct neighbour *); + + if (size <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long)hash, get_order(size)); +} + +static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +{ + struct neighbour **new_hash, **old_hash; + unsigned int i, new_hash_mask, old_entries; + + NEIGH_CACHE_STAT_INC(tbl, hash_grows); + + BUG_ON(new_entries & (new_entries - 1)); + new_hash = neigh_hash_alloc(new_entries); + if (!new_hash) + return; + + old_entries = tbl->hash_mask + 1; + new_hash_mask = new_entries - 1; + old_hash = tbl->hash_buckets; + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + for (i = 0; i < old_entries; i++) { + struct neighbour *n, *next; + + for (n = old_hash[i]; n; n = next) { + unsigned int hash_val = tbl->hash(n->primary_key, n->dev); + + hash_val &= new_hash_mask; + next = n->next; + + n->next = new_hash[hash_val]; + new_hash[hash_val] = n; + } + } + tbl->hash_buckets = new_hash; + tbl->hash_mask = new_hash_mask; + + neigh_hash_free(old_hash, old_entries); +} + +struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct neighbour *n; + int key_len = tbl->key_len; + u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + read_lock_bh(&tbl->lock); + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { + neigh_hold(n); + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + read_unlock_bh(&tbl->lock); + return n; +} + +struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, const void *pkey) +{ + struct 
neighbour *n; + int key_len = tbl->key_len; + u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; + + NEIGH_CACHE_STAT_INC(tbl, lookups); + + read_lock_bh(&tbl->lock); + for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { + if (!memcmp(n->primary_key, pkey, key_len)) { + neigh_hold(n); + NEIGH_CACHE_STAT_INC(tbl, hits); + break; + } + } + read_unlock_bh(&tbl->lock); + return n; +} + +struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + u32 hash_val; + int key_len = tbl->key_len; + int error; + struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + + if (!n) { + rc = ERR_PTR(-ENOBUFS); + goto out; + } + + memcpy(n->primary_key, pkey, key_len); + n->dev = dev; + dev_hold(dev); + + /* Protocol specific setup. */ + if (tbl->constructor && (error = tbl->constructor(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + /* Device specific setup. */ + if (n->parms->neigh_setup && + (error = n->parms->neigh_setup(n)) < 0) { + rc = ERR_PTR(error); + goto out_neigh_release; + } + + n->confirmed = jiffies - (n->parms->base_reachable_time << 1); + + write_lock_bh(&tbl->lock); + + if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) + neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + + hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + + if (n->parms->dead) { + rc = ERR_PTR(-EINVAL); + goto out_tbl_unlock; + } + + for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { + neigh_hold(n1); + rc = n1; + goto out_tbl_unlock; + } + } + + n->next = tbl->hash_buckets[hash_val]; + tbl->hash_buckets[hash_val] = n; + n->dead = 0; + neigh_hold(n); + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK2("neigh %p is created.\n", n); + rc = n; +out: + return rc; +out_tbl_unlock: + write_unlock_bh(&tbl->lock); +out_neigh_release: + neigh_release(n); + goto out; +} + +struct pneigh_entry * pneigh_lookup(struct neigh_table *tbl, const void *pkey, + struct net_device *dev, int creat) +{ + struct pneigh_entry *n; + int key_len = tbl->key_len; + u32 hash_val = *(u32 *)(pkey + key_len - 4); + + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + + read_lock_bh(&tbl->lock); + + for (n = tbl->phash_buckets[hash_val]; n; n = n->next) { + if (!memcmp(n->key, pkey, key_len) && + (n->dev == dev || !n->dev)) { + read_unlock_bh(&tbl->lock); + goto out; + } + } + read_unlock_bh(&tbl->lock); + n = NULL; + if (!creat) + goto out; + + n = kmalloc(sizeof(*n) + key_len, GFP_KERNEL); + if (!n) + goto out; + + memcpy(n->key, pkey, key_len); + n->dev = dev; + if (dev) + dev_hold(dev); + + if (tbl->pconstructor && tbl->pconstructor(n)) { + if (dev) + dev_put(dev); + kfree(n); + n = NULL; + goto out; + } + + write_lock_bh(&tbl->lock); + n->next = tbl->phash_buckets[hash_val]; + tbl->phash_buckets[hash_val] = n; + write_unlock_bh(&tbl->lock); +out: + return n; +} + + +int pneigh_delete(struct neigh_table *tbl, const void *pkey, + struct net_device *dev) +{ + struct pneigh_entry *n, **np; + int key_len = tbl->key_len; + u32 hash_val = *(u32 *)(pkey + key_len - 4); + + hash_val ^= (hash_val >> 16); + hash_val ^= hash_val >> 8; + hash_val ^= hash_val >> 4; + hash_val &= PNEIGH_HASHMASK; + + write_lock_bh(&tbl->lock); + for (np = &tbl->phash_buckets[hash_val]; (n = *np) != NULL; + np = &n->next) { + if (!memcmp(n->key, pkey, key_len) && n->dev == dev) { + *np = n->next; + write_unlock_bh(&tbl->lock); + if (tbl->pdestructor) + tbl->pdestructor(n); 
+ if (n->dev) + dev_put(n->dev); + kfree(n); + return 0; + } + } + write_unlock_bh(&tbl->lock); + return -ENOENT; +} + +static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev) +{ + struct pneigh_entry *n, **np; + u32 h; + + for (h = 0; h <= PNEIGH_HASHMASK; h++) { + np = &tbl->phash_buckets[h]; + while ((n = *np) != NULL) { + if (!dev || n->dev == dev) { + *np = n->next; + if (tbl->pdestructor) + tbl->pdestructor(n); + if (n->dev) + dev_put(n->dev); + kfree(n); + continue; + } + np = &n->next; + } + } + return -ENOENT; +} + + +/* + * neighbour must already be out of the table; + * + */ +void neigh_destroy(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_CACHE_STAT_INC(neigh->tbl, destroys); + + if (!neigh->dead) { + printk(KERN_WARNING + "Destroying alive neighbour %p\n", neigh); + dump_stack(); + return; + } + + if (neigh_del_timer(neigh)) + printk(KERN_WARNING "Impossible event.\n"); + + while ((hh = neigh->hh) != NULL) { + neigh->hh = hh->hh_next; + hh->hh_next = NULL; + write_lock_bh(&hh->hh_lock); + hh->hh_output = neigh_blackhole; + write_unlock_bh(&hh->hh_lock); + if (atomic_dec_and_test(&hh->hh_refcnt)) + kfree(hh); + } + + if (neigh->ops && neigh->ops->destructor) + (neigh->ops->destructor)(neigh); + + skb_queue_purge(&neigh->arp_queue); + + dev_put(neigh->dev); + neigh_parms_put(neigh->parms); + + NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); + + atomic_dec(&neigh->tbl->entries); + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} + +/* Neighbour state is suspicious; + disable fast path. + + Called with write_locked neigh. + */ +static void neigh_suspect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + + neigh->output = neigh->ops->output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->output; +} + +/* Neighbour state is OK; + enable fast path. + + Called with write_locked neigh. + */ +static void neigh_connect(struct neighbour *neigh) +{ + struct hh_cache *hh; + + NEIGH_PRINTK2("neigh %p is connected.\n", neigh); + + neigh->output = neigh->ops->connected_output; + + for (hh = neigh->hh; hh; hh = hh->hh_next) + hh->hh_output = neigh->ops->hh_output; +} + +static void neigh_periodic_timer(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + struct neighbour *n, **np; + unsigned long expire, now = jiffies; + + NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); + + write_lock(&tbl->lock); + + /* + * periodically recompute ReachableTime from random function + */ + + if (time_after(now, tbl->last_rand + 300 * HZ)) { + struct neigh_parms *p; + tbl->last_rand = now; + for (p = &tbl->parms; p; p = p->next) + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + } + + np = &tbl->hash_buckets[tbl->hash_chain_gc]; + tbl->hash_chain_gc = ((tbl->hash_chain_gc + 1) & tbl->hash_mask); + + while ((n = *np) != NULL) { + unsigned int state; + + write_lock(&n->lock); + + state = n->nud_state; + if (state & (NUD_PERMANENT | NUD_IN_TIMER)) { + write_unlock(&n->lock); + goto next_elt; + } + + if (time_before(n->used, n->confirmed)) + n->used = n->confirmed; + + if (atomic_read(&n->refcnt) == 1 && + (state == NUD_FAILED || + time_after(now, n->used + n->parms->gc_staletime))) { + *np = n->next; + n->dead = 1; + write_unlock(&n->lock); + neigh_release(n); + continue; + } + write_unlock(&n->lock); + +next_elt: + np = &n->next; + } + + /* Cycle through all hash buckets every base_reachable_time/2 ticks. 
+ * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 + * base_reachable_time. + */ + expire = tbl->parms.base_reachable_time >> 1; + expire /= (tbl->hash_mask + 1); + if (!expire) + expire = 1; + + mod_timer(&tbl->gc_timer, now + expire); + + write_unlock(&tbl->lock); +} + +static __inline__ int neigh_max_probes(struct neighbour *n) +{ + struct neigh_parms *p = n->parms; + return (n->nud_state & NUD_PROBE ? + p->ucast_probes : + p->ucast_probes + p->app_probes + p->mcast_probes); +} + + +/* Called when a timer expires for a neighbour entry. */ + +static void neigh_timer_handler(unsigned long arg) +{ + unsigned long now, next; + struct neighbour *neigh = (struct neighbour *)arg; + unsigned state; + int notify = 0; + + write_lock(&neigh->lock); + + state = neigh->nud_state; + now = jiffies; + next = now + HZ; + + if (!(state & NUD_IN_TIMER)) { +#ifndef CONFIG_SMP + printk(KERN_WARNING "neigh: timer & !nud_in_timer\n"); +#endif + goto out; + } + + if (state & NUD_REACHABLE) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->reachable_time)) { + NEIGH_PRINTK2("neigh %p is still alive.\n", neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else if (time_before_eq(now, + neigh->used + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh->nud_state = NUD_DELAY; + neigh_suspect(neigh); + next = now + neigh->parms->delay_probe_time; + } else { + NEIGH_PRINTK2("neigh %p is suspected.\n", neigh); + neigh->nud_state = NUD_STALE; + neigh_suspect(neigh); + } + } else if (state & NUD_DELAY) { + if (time_before_eq(now, + neigh->confirmed + neigh->parms->delay_probe_time)) { + NEIGH_PRINTK2("neigh %p is now reachable.\n", neigh); + neigh->nud_state = NUD_REACHABLE; + neigh_connect(neigh); + next = neigh->confirmed + neigh->parms->reachable_time; + } else { + NEIGH_PRINTK2("neigh %p is probed.\n", neigh); + neigh->nud_state = NUD_PROBE; + atomic_set(&neigh->probes, 0); + next = now + neigh->parms->retrans_time; + } + } else { + /* NUD_PROBE|NUD_INCOMPLETE */ + next = now + neigh->parms->retrans_time; + } + + if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) && + atomic_read(&neigh->probes) >= neigh_max_probes(neigh)) { + struct sk_buff *skb; + + neigh->nud_state = NUD_FAILED; + notify = 1; + NEIGH_CACHE_STAT_INC(neigh->tbl, res_failed); + NEIGH_PRINTK2("neigh %p is failed.\n", neigh); + + /* It is very thin place. report_unreachable is very complicated + routine. Particularly, it can hit the same neighbour entry! + + So that, we try to be accurate and avoid dead loop. 
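+
+	   [Editor's note, added text: the give-up threshold above comes from
+	    neigh_max_probes(): ucast_probes alone once the entry is in
+	    NUD_PROBE, or ucast_probes + app_probes + mcast_probes while it
+	    is still NUD_INCOMPLETE.  With the usual ARP defaults
+	    (ucast_solicit = 3, retrans_time = 1 s, given here only as an
+	    illustration) a NUD_PROBE entry therefore fails after three
+	    unicast solicitations sent one retrans_time apart.]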
--ANK + */ + while (neigh->nud_state == NUD_FAILED && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + write_unlock(&neigh->lock); + neigh->ops->error_report(neigh, skb); + write_lock(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + } + + if (neigh->nud_state & NUD_IN_TIMER) { + neigh_hold(neigh); + if (time_before(next, jiffies + HZ/2)) + next = jiffies + HZ/2; + neigh->timer.expires = next; + add_timer(&neigh->timer); + } + if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) { + struct sk_buff *skb = skb_peek(&neigh->arp_queue); + /* keep skb alive even if arp_queue overflows */ + if (skb) + skb_get(skb); + write_unlock(&neigh->lock); + neigh->ops->solicit(neigh, skb); + atomic_inc(&neigh->probes); + if (skb) + kfree_skb(skb); + } else { +out: + write_unlock(&neigh->lock); + } + +#ifdef CONFIG_ARPD + if (notify && neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + neigh_release(neigh); +} + +int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) +{ + int rc; + unsigned long now; + + write_lock_bh(&neigh->lock); + + rc = 0; + if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE)) + goto out_unlock_bh; + + now = jiffies; + + if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) { + if (neigh->parms->mcast_probes + neigh->parms->app_probes) { + atomic_set(&neigh->probes, neigh->parms->ucast_probes); + neigh->nud_state = NUD_INCOMPLETE; + neigh_hold(neigh); + neigh->timer.expires = now + 1; + add_timer(&neigh->timer); + } else { + neigh->nud_state = NUD_FAILED; + write_unlock_bh(&neigh->lock); + + if (skb) + kfree_skb(skb); + return 1; + } + } else if (neigh->nud_state & NUD_STALE) { + NEIGH_PRINTK2("neigh %p is delayed.\n", neigh); + neigh_hold(neigh); + neigh->nud_state = NUD_DELAY; + neigh->timer.expires = jiffies + neigh->parms->delay_probe_time; + add_timer(&neigh->timer); + } + + if (neigh->nud_state == NUD_INCOMPLETE) { + if (skb) { + if (skb_queue_len(&neigh->arp_queue) >= + neigh->parms->queue_len) { + struct sk_buff *buff; + buff = neigh->arp_queue.next; + __skb_unlink(buff, &neigh->arp_queue); + kfree_skb(buff); + } + __skb_queue_tail(&neigh->arp_queue, skb); + } + rc = 1; + } +out_unlock_bh: + write_unlock_bh(&neigh->lock); + return rc; +} + +static __inline__ void neigh_update_hhs(struct neighbour *neigh) +{ + struct hh_cache *hh; + void (*update)(struct hh_cache*, struct net_device*, unsigned char *) = + neigh->dev->header_cache_update; + + if (update) { + for (hh = neigh->hh; hh; hh = hh->hh_next) { + write_lock_bh(&hh->hh_lock); + update(hh, neigh->dev, neigh->ha); + write_unlock_bh(&hh->hh_lock); + } + } +} + + + +/* Generic update routine. + -- lladdr is new lladdr or NULL, if it is not supplied. + -- new is new state. + -- flags + NEIGH_UPDATE_F_OVERRIDE allows to override existing lladdr, + if it is different. + NEIGH_UPDATE_F_WEAK_OVERRIDE will suspect existing "connected" + lladdr instead of overriding it + if it is different. + It also allows to retain current state + if lladdr is unchanged. + NEIGH_UPDATE_F_ADMIN means that the change is administrative. + + NEIGH_UPDATE_F_OVERRIDE_ISROUTER allows to override existing + NTF_ROUTER flag. + NEIGH_UPDATE_F_ISROUTER indicates if the neighbour is known as + a router. + + Caller MUST hold reference count on the entry. 
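+
+   [Editor's note, added text: a minimal usage sketch, assuming an
+    ARP-style caller that has just validated a reply from lladdr for an
+    entry it holds a reference on:
+
+	neigh_update(neigh, lladdr, NUD_REACHABLE, NEIGH_UPDATE_F_OVERRIDE);
+
+    Passing NUD_STALE without NEIGH_UPDATE_F_OVERRIDE is the weaker form
+    used for passively learned addresses, and NEIGH_UPDATE_F_ADMIN marks
+    netlink-driven changes, as in neigh_add()/neigh_delete() further down
+    in this file.]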
+ */ + +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, + u32 flags) +{ + u8 old; + int err; +#ifdef CONFIG_ARPD + int notify = 0; +#endif + struct net_device *dev; + int update_isrouter = 0; + + write_lock_bh(&neigh->lock); + + dev = neigh->dev; + old = neigh->nud_state; + err = -EPERM; + + if (!(flags & NEIGH_UPDATE_F_ADMIN) && + (old & (NUD_NOARP | NUD_PERMANENT))) + goto out; + + if (!(new & NUD_VALID)) { + neigh_del_timer(neigh); + if (old & NUD_CONNECTED) + neigh_suspect(neigh); + neigh->nud_state = new; + err = 0; +#ifdef CONFIG_ARPD + notify = old & NUD_VALID; +#endif + goto out; + } + + /* Compare new lladdr with cached one */ + if (!dev->addr_len) { + /* First case: device needs no address. */ + lladdr = neigh->ha; + } else if (lladdr) { + /* The second case: if something is already cached + and a new address is proposed: + - compare new & old + - if they are different, check override flag + */ + if ((old & NUD_VALID) && + !memcmp(lladdr, neigh->ha, dev->addr_len)) + lladdr = neigh->ha; + } else { + /* No address is supplied; if we know something, + use it, otherwise discard the request. + */ + err = -EINVAL; + if (!(old & NUD_VALID)) + goto out; + lladdr = neigh->ha; + } + + if (new & NUD_CONNECTED) + neigh->confirmed = jiffies; + neigh->updated = jiffies; + + /* If entry was valid and address is not changed, + do not change entry state, if new one is STALE. + */ + err = 0; + update_isrouter = flags & NEIGH_UPDATE_F_OVERRIDE_ISROUTER; + if (old & NUD_VALID) { + if (lladdr != neigh->ha && !(flags & NEIGH_UPDATE_F_OVERRIDE)) { + update_isrouter = 0; + if ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) && + (old & NUD_CONNECTED)) { + lladdr = neigh->ha; + new = NUD_STALE; + } else + goto out; + } else { + if (lladdr == neigh->ha && new == NUD_STALE && + ((flags & NEIGH_UPDATE_F_WEAK_OVERRIDE) || + (old & NUD_CONNECTED)) + ) + new = old; + } + } + + if (new != old) { + neigh_del_timer(neigh); + if (new & NUD_IN_TIMER) { + neigh_hold(neigh); + neigh->timer.expires = jiffies + + ((new & NUD_REACHABLE) ? + neigh->parms->reachable_time : 0); + add_timer(&neigh->timer); + } + neigh->nud_state = new; + } + + if (lladdr != neigh->ha) { + memcpy(&neigh->ha, lladdr, dev->addr_len); + neigh_update_hhs(neigh); + if (!(new & NUD_CONNECTED)) + neigh->confirmed = jiffies - + (neigh->parms->base_reachable_time << 1); +#ifdef CONFIG_ARPD + notify = 1; +#endif + } + if (new == old) + goto out; + if (new & NUD_CONNECTED) + neigh_connect(neigh); + else + neigh_suspect(neigh); + if (!(old & NUD_VALID)) { + struct sk_buff *skb; + + /* Again: avoid dead loop if something went wrong */ + + while (neigh->nud_state & NUD_VALID && + (skb = __skb_dequeue(&neigh->arp_queue)) != NULL) { + struct neighbour *n1 = neigh; + write_unlock_bh(&neigh->lock); + /* On shaper/eql skb->dst->neighbour != neigh :( */ + if (skb->dst && skb->dst->neighbour) + n1 = skb->dst->neighbour; + n1->output(skb); + write_lock_bh(&neigh->lock); + } + skb_queue_purge(&neigh->arp_queue); + } +out: + if (update_isrouter) { + neigh->flags = (flags & NEIGH_UPDATE_F_ISROUTER) ? 
+ (neigh->flags | NTF_ROUTER) : + (neigh->flags & ~NTF_ROUTER); + } + write_unlock_bh(&neigh->lock); +#ifdef CONFIG_ARPD + if (notify && neigh->parms->app_probes) + neigh_app_notify(neigh); +#endif + return err; +} + +struct neighbour *neigh_event_ns(struct neigh_table *tbl, + u8 *lladdr, void *saddr, + struct net_device *dev) +{ + struct neighbour *neigh = __neigh_lookup(tbl, saddr, dev, + lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_OVERRIDE); + return neigh; +} + +static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, + u16 protocol) +{ + struct hh_cache *hh; + struct net_device *dev = dst->dev; + + for (hh = n->hh; hh; hh = hh->hh_next) + if (hh->hh_type == protocol) + break; + + if (!hh && (hh = kmalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { + memset(hh, 0, sizeof(struct hh_cache)); + rwlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 0); + hh->hh_next = NULL; + if (dev->hard_header_cache(n, hh)) { + kfree(hh); + hh = NULL; + } else { + atomic_inc(&hh->hh_refcnt); + hh->hh_next = n->hh; + n->hh = hh; + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + } + } + if (hh) { + atomic_inc(&hh->hh_refcnt); + dst->hh = hh; + } +} + +/* This function can be used in contexts, where only old dev_queue_xmit + worked, f.e. if you want to override normal output path (eql, shaper), + but resolution is not made yet. + */ + +int neigh_compat_output(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), NULL, NULL, + skb->len) < 0 && + dev->rebuild_header(skb)) + return 0; + + return dev_queue_xmit(skb); +} + +/* Slow and careful. */ + +int neigh_resolve_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh; + int rc = 0; + + if (!dst || !(neigh = dst->neighbour)) + goto discard; + + __skb_pull(skb, skb->nh.raw - skb->data); + + if (!neigh_event_send(neigh, skb)) { + int err; + struct net_device *dev = neigh->dev; + if (dev->hard_header_cache && !dst->hh) { + write_lock_bh(&neigh->lock); + if (!dst->hh) + neigh_hh_init(neigh, dst, dst->ops->protocol); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + write_unlock_bh(&neigh->lock); + } else { + read_lock_bh(&neigh->lock); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + } + if (err >= 0) + rc = neigh->ops->queue_xmit(skb); + else + goto out_kfree_skb; + } +out: + return rc; +discard: + NEIGH_PRINTK1("neigh_resolve_output: dst=%p neigh=%p\n", + dst, dst ? 
dst->neighbour : NULL); +out_kfree_skb: + rc = -EINVAL; + kfree_skb(skb); + goto out; +} + +/* As fast as possible without hh cache */ + +int neigh_connected_output(struct sk_buff *skb) +{ + int err; + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct net_device *dev = neigh->dev; + + __skb_pull(skb, skb->nh.raw - skb->data); + + read_lock_bh(&neigh->lock); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + if (err >= 0) + err = neigh->ops->queue_xmit(skb); + else { + err = -EINVAL; + kfree_skb(skb); + } + return err; +} + +static void neigh_proxy_process(unsigned long arg) +{ + struct neigh_table *tbl = (struct neigh_table *)arg; + long sched_next = 0; + unsigned long now = jiffies; + struct sk_buff *skb; + + spin_lock(&tbl->proxy_queue.lock); + + skb = tbl->proxy_queue.next; + + while (skb != (struct sk_buff *)&tbl->proxy_queue) { + struct sk_buff *back = skb; + long tdif = back->stamp.tv_usec - now; + + skb = skb->next; + if (tdif <= 0) { + struct net_device *dev = back->dev; + __skb_unlink(back, &tbl->proxy_queue); + if (tbl->proxy_redo && netif_running(dev)) + tbl->proxy_redo(back); + else + kfree_skb(back); + + dev_put(dev); + } else if (!sched_next || tdif < sched_next) + sched_next = tdif; + } + del_timer(&tbl->proxy_timer); + if (sched_next) + mod_timer(&tbl->proxy_timer, jiffies + sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + +void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, + struct sk_buff *skb) +{ + unsigned long now = jiffies; + unsigned long sched_next = now + (net_random() % p->proxy_delay); + + if (tbl->proxy_queue.qlen > p->proxy_qlen) { + kfree_skb(skb); + return; + } + skb->stamp.tv_sec = LOCALLY_ENQUEUED; + skb->stamp.tv_usec = sched_next; + + spin_lock(&tbl->proxy_queue.lock); + if (del_timer(&tbl->proxy_timer)) { + if (time_before(tbl->proxy_timer.expires, sched_next)) + sched_next = tbl->proxy_timer.expires; + } + dst_release(skb->dst); + skb->dst = NULL; + dev_hold(skb->dev); + __skb_queue_tail(&tbl->proxy_queue, skb); + mod_timer(&tbl->proxy_timer, sched_next); + spin_unlock(&tbl->proxy_queue.lock); +} + + +struct neigh_parms *neigh_parms_alloc(struct net_device *dev, + struct neigh_table *tbl) +{ + struct neigh_parms *p = kmalloc(sizeof(*p), GFP_KERNEL); + + if (p) { + memcpy(p, &tbl->parms, sizeof(*p)); + p->tbl = tbl; + atomic_set(&p->refcnt, 1); + INIT_RCU_HEAD(&p->rcu_head); + p->reachable_time = + neigh_rand_reach_time(p->base_reachable_time); + if (dev && dev->neigh_setup && dev->neigh_setup(dev, p)) { + kfree(p); + return NULL; + } + p->sysctl_table = NULL; + write_lock_bh(&tbl->lock); + p->next = tbl->parms.next; + tbl->parms.next = p; + write_unlock_bh(&tbl->lock); + } + return p; +} + +static void neigh_rcu_free_parms(struct rcu_head *head) +{ + struct neigh_parms *parms = + container_of(head, struct neigh_parms, rcu_head); + + neigh_parms_put(parms); +} + +void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms) +{ + struct neigh_parms **p; + + if (!parms || parms == &tbl->parms) + return; + write_lock_bh(&tbl->lock); + for (p = &tbl->parms.next; *p; p = &(*p)->next) { + if (*p == parms) { + *p = parms->next; + parms->dead = 1; + write_unlock_bh(&tbl->lock); + call_rcu(&parms->rcu_head, neigh_rcu_free_parms); + return; + } + } + write_unlock_bh(&tbl->lock); + NEIGH_PRINTK1("neigh_parms_release: not found\n"); +} + +void neigh_parms_destroy(struct neigh_parms *parms) +{ + kfree(parms); +} + + +void 
neigh_table_init(struct neigh_table *tbl) +{ + unsigned long now = jiffies; + unsigned long phsize; + + atomic_set(&tbl->parms.refcnt, 1); + INIT_RCU_HEAD(&tbl->parms.rcu_head); + tbl->parms.reachable_time = + neigh_rand_reach_time(tbl->parms.base_reachable_time); + + if (!tbl->kmem_cachep) + tbl->kmem_cachep = kmem_cache_create(tbl->id, + tbl->entry_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!tbl->kmem_cachep) + panic("cannot create neighbour cache"); + + tbl->stats = alloc_percpu(struct neigh_statistics); + if (!tbl->stats) + panic("cannot create neighbour cache statistics"); + +#ifdef CONFIG_PROC_FS + tbl->pde = create_proc_entry(tbl->id, 0, proc_net_stat); + if (!tbl->pde) + panic("cannot create neighbour proc dir entry"); + tbl->pde->proc_fops = &neigh_stat_seq_fops; + tbl->pde->data = tbl; +#endif + + tbl->hash_mask = 1; + tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + + phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); + tbl->phash_buckets = kmalloc(phsize, GFP_KERNEL); + + if (!tbl->hash_buckets || !tbl->phash_buckets) + panic("cannot allocate neighbour cache hashes"); + + memset(tbl->phash_buckets, 0, phsize); + + get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); + + rwlock_init(&tbl->lock); + init_timer(&tbl->gc_timer); + tbl->gc_timer.data = (unsigned long)tbl; + tbl->gc_timer.function = neigh_periodic_timer; + tbl->gc_timer.expires = now + 1; + add_timer(&tbl->gc_timer); + + init_timer(&tbl->proxy_timer); + tbl->proxy_timer.data = (unsigned long)tbl; + tbl->proxy_timer.function = neigh_proxy_process; + skb_queue_head_init(&tbl->proxy_queue); + + tbl->last_flush = now; + tbl->last_rand = now + tbl->parms.reachable_time * 20; + write_lock(&neigh_tbl_lock); + tbl->next = neigh_tables; + neigh_tables = tbl; + write_unlock(&neigh_tbl_lock); +} + +int neigh_table_clear(struct neigh_table *tbl) +{ + struct neigh_table **tp; + + /* It is not clean... 
Fix it to unload IPv6 module safely */ + del_timer_sync(&tbl->gc_timer); + del_timer_sync(&tbl->proxy_timer); + pneigh_queue_purge(&tbl->proxy_queue); + neigh_ifdown(tbl, NULL); + if (atomic_read(&tbl->entries)) + printk(KERN_CRIT "neighbour leakage\n"); + write_lock(&neigh_tbl_lock); + for (tp = &neigh_tables; *tp; tp = &(*tp)->next) { + if (*tp == tbl) { + *tp = tbl->next; + break; + } + } + write_unlock(&neigh_tbl_lock); + + neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); + tbl->hash_buckets = NULL; + + kfree(tbl->phash_buckets); + tbl->phash_buckets = NULL; + + return 0; +} + +int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err = -ENODEV; + + if (ndm->ndm_ifindex && + (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + goto out; + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + struct rtattr *dst_attr = nda[NDA_DST - 1]; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + err = -EINVAL; + if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) + goto out_dev_put; + + if (ndm->ndm_flags & NTF_PROXY) { + err = pneigh_delete(tbl, RTA_DATA(dst_attr), dev); + goto out_dev_put; + } + + if (!dev) + goto out; + + n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); + if (n) { + err = neigh_update(n, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(n); + } + goto out_dev_put; + } + read_unlock(&neigh_tbl_lock); + err = -EADDRNOTAVAIL; +out_dev_put: + if (dev) + dev_put(dev); +out: + return err; +} + +int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct ndmsg *ndm = NLMSG_DATA(nlh); + struct rtattr **nda = arg; + struct neigh_table *tbl; + struct net_device *dev = NULL; + int err = -ENODEV; + + if (ndm->ndm_ifindex && + (dev = dev_get_by_index(ndm->ndm_ifindex)) == NULL) + goto out; + + read_lock(&neigh_tbl_lock); + for (tbl = neigh_tables; tbl; tbl = tbl->next) { + struct rtattr *lladdr_attr = nda[NDA_LLADDR - 1]; + struct rtattr *dst_attr = nda[NDA_DST - 1]; + int override = 1; + struct neighbour *n; + + if (tbl->family != ndm->ndm_family) + continue; + read_unlock(&neigh_tbl_lock); + + err = -EINVAL; + if (!dst_attr || RTA_PAYLOAD(dst_attr) < tbl->key_len) + goto out_dev_put; + + if (ndm->ndm_flags & NTF_PROXY) { + err = -ENOBUFS; + if (pneigh_lookup(tbl, RTA_DATA(dst_attr), dev, 1)) + err = 0; + goto out_dev_put; + } + + err = -EINVAL; + if (!dev) + goto out; + if (lladdr_attr && RTA_PAYLOAD(lladdr_attr) < dev->addr_len) + goto out_dev_put; + + n = neigh_lookup(tbl, RTA_DATA(dst_attr), dev); + if (n) { + if (nlh->nlmsg_flags & NLM_F_EXCL) { + err = -EEXIST; + neigh_release(n); + goto out_dev_put; + } + + override = nlh->nlmsg_flags & NLM_F_REPLACE; + } else if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { + err = -ENOENT; + goto out_dev_put; + } else { + n = __neigh_lookup_errno(tbl, RTA_DATA(dst_attr), dev); + if (IS_ERR(n)) { + err = PTR_ERR(n); + goto out_dev_put; + } + } + + err = neigh_update(n, + lladdr_attr ? RTA_DATA(lladdr_attr) : NULL, + ndm->ndm_state, + (override ? 
NEIGH_UPDATE_F_OVERRIDE : 0) | + NEIGH_UPDATE_F_ADMIN); + + neigh_release(n); + goto out_dev_put; + } + + read_unlock(&neigh_tbl_lock); + err = -EADDRNOTAVAIL; +out_dev_put: + if (dev) + dev_put(dev); +out: + return err; +} + + +static int neigh_fill_info(struct sk_buff *skb, struct neighbour *n, + u32 pid, u32 seq, int event) +{ + unsigned long now = jiffies; + unsigned char *b = skb->tail; + struct nda_cacheinfo ci; + int locked = 0; + u32 probes; + struct nlmsghdr *nlh = NLMSG_PUT(skb, pid, seq, event, + sizeof(struct ndmsg)); + struct ndmsg *ndm = NLMSG_DATA(nlh); + + nlh->nlmsg_flags = pid ? NLM_F_MULTI : 0; + ndm->ndm_family = n->ops->family; + ndm->ndm_flags = n->flags; + ndm->ndm_type = n->type; + ndm->ndm_ifindex = n->dev->ifindex; + RTA_PUT(skb, NDA_DST, n->tbl->key_len, n->primary_key); + read_lock_bh(&n->lock); + locked = 1; + ndm->ndm_state = n->nud_state; + if (n->nud_state & NUD_VALID) + RTA_PUT(skb, NDA_LLADDR, n->dev->addr_len, n->ha); + ci.ndm_used = now - n->used; + ci.ndm_confirmed = now - n->confirmed; + ci.ndm_updated = now - n->updated; + ci.ndm_refcnt = atomic_read(&n->refcnt) - 1; + probes = atomic_read(&n->probes); + read_unlock_bh(&n->lock); + locked = 0; + RTA_PUT(skb, NDA_CACHEINFO, sizeof(ci), &ci); + RTA_PUT(skb, NDA_PROBES, sizeof(probes), &probes); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + if (locked) + read_unlock_bh(&n->lock); + skb_trim(skb, b - skb->data); + return -1; +} + + +static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, + struct netlink_callback *cb) +{ + struct neighbour *n; + int rc, h, s_h = cb->args[1]; + int idx, s_idx = idx = cb->args[2]; + + for (h = 0; h <= tbl->hash_mask; h++) { + if (h < s_h) + continue; + if (h > s_h) + s_idx = 0; + read_lock_bh(&tbl->lock); + for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next, idx++) { + if (idx < s_idx) + continue; + if (neigh_fill_info(skb, n, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWNEIGH) <= 0) { + read_unlock_bh(&tbl->lock); + rc = -1; + goto out; + } + } + read_unlock_bh(&tbl->lock); + } + rc = skb->len; +out: + cb->args[1] = h; + cb->args[2] = idx; + return rc; +} + +int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct neigh_table *tbl; + int t, family, s_t; + + read_lock(&neigh_tbl_lock); + family = ((struct rtgenmsg *)NLMSG_DATA(cb->nlh))->rtgen_family; + s_t = cb->args[0]; + + for (tbl = neigh_tables, t = 0; tbl; tbl = tbl->next, t++) { + if (t < s_t || (family && tbl->family != family)) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args) - + sizeof(cb->args[0])); + if (neigh_dump_table(tbl, skb, cb) < 0) + break; + } + read_unlock(&neigh_tbl_lock); + + cb->args[0] = t; + return skb->len; +} + +void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) +{ + int chain; + + read_lock_bh(&tbl->lock); + for (chain = 0; chain <= tbl->hash_mask; chain++) { + struct neighbour *n; + + for (n = tbl->hash_buckets[chain]; n; n = n->next) + cb(n, cookie); + } + read_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_for_each); + +/* The tbl->lock must be held as a writer and BH disabled. 
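+
+   [Editor's note, added text: a minimal calling sketch, assuming a
+    hypothetical release_all() callback; a non-zero return from the
+    callback asks for the entry to be unlinked and released:
+
+	static int release_all(struct neighbour *n)
+	{
+		return 1;
+	}
+
+	write_lock_bh(&tbl->lock);
+	__neigh_for_each_release(tbl, release_all);
+	write_unlock_bh(&tbl->lock);
+
+    as required by the locking rule stated above.]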
*/ +void __neigh_for_each_release(struct neigh_table *tbl, + int (*cb)(struct neighbour *)) +{ + int chain; + + for (chain = 0; chain <= tbl->hash_mask; chain++) { + struct neighbour *n, **np; + + np = &tbl->hash_buckets[chain]; + while ((n = *np) != NULL) { + int release; + + write_lock(&n->lock); + release = cb(n); + if (release) { + *np = n->next; + n->dead = 1; + } else + np = &n->next; + write_unlock(&n->lock); + if (release) + neigh_release(n); + } + } +} +EXPORT_SYMBOL(__neigh_for_each_release); + +#ifdef CONFIG_PROC_FS + +static struct neighbour *neigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + struct neighbour *n = NULL; + int bucket = state->bucket; + + state->flags &= ~NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { + n = tbl->hash_buckets[bucket]; + + while (n) { + if (state->neigh_sub_iter) { + loff_t fakep = 0; + void *v; + + v = state->neigh_sub_iter(state, n, &fakep); + if (!v) + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + if (n->nud_state & ~NUD_NOARP) + break; + next: + n = n->next; + } + + if (n) + break; + } + state->bucket = bucket; + + return n; +} + +static struct neighbour *neigh_get_next(struct seq_file *seq, + struct neighbour *n, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + } + n = n->next; + + while (1) { + while (n) { + if (state->neigh_sub_iter) { + void *v = state->neigh_sub_iter(state, n, pos); + if (v) + return n; + goto next; + } + if (!(state->flags & NEIGH_SEQ_SKIP_NOARP)) + break; + + if (n->nud_state & ~NUD_NOARP) + break; + next: + n = n->next; + } + + if (n) + break; + + if (++state->bucket > tbl->hash_mask) + break; + + n = tbl->hash_buckets[state->bucket]; + } + + if (n && pos) + --(*pos); + return n; +} + +static struct neighbour *neigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct neighbour *n = neigh_get_first(seq); + + if (n) { + while (*pos) { + n = neigh_get_next(seq, n, pos); + if (!n) + break; + } + } + return *pos ? NULL : n; +} + +static struct pneigh_entry *pneigh_get_first(struct seq_file *seq) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + struct pneigh_entry *pn = NULL; + int bucket = state->bucket; + + state->flags |= NEIGH_SEQ_IS_PNEIGH; + for (bucket = 0; bucket <= PNEIGH_HASHMASK; bucket++) { + pn = tbl->phash_buckets[bucket]; + if (pn) + break; + } + state->bucket = bucket; + + return pn; +} + +static struct pneigh_entry *pneigh_get_next(struct seq_file *seq, + struct pneigh_entry *pn, + loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + pn = pn->next; + while (!pn) { + if (++state->bucket > PNEIGH_HASHMASK) + break; + pn = tbl->phash_buckets[state->bucket]; + if (pn) + break; + } + + if (pn && pos) + --(*pos); + + return pn; +} + +static struct pneigh_entry *pneigh_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct pneigh_entry *pn = pneigh_get_first(seq); + + if (pn) { + while (*pos) { + pn = pneigh_get_next(seq, pn, pos); + if (!pn) + break; + } + } + return *pos ? 
NULL : pn; +} + +static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) +{ + struct neigh_seq_state *state = seq->private; + void *rc; + + rc = neigh_get_idx(seq, pos); + if (!rc && !(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_idx(seq, pos); + + return rc; +} + +void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) +{ + struct neigh_seq_state *state = seq->private; + loff_t pos_minus_one; + + state->tbl = tbl; + state->bucket = 0; + state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); + + read_lock_bh(&tbl->lock); + + pos_minus_one = *pos - 1; + return *pos ? neigh_get_idx_any(seq, &pos_minus_one) : SEQ_START_TOKEN; +} +EXPORT_SYMBOL(neigh_seq_start); + +void *neigh_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct neigh_seq_state *state; + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = neigh_get_idx(seq, pos); + goto out; + } + + state = seq->private; + if (!(state->flags & NEIGH_SEQ_IS_PNEIGH)) { + rc = neigh_get_next(seq, v, NULL); + if (rc) + goto out; + if (!(state->flags & NEIGH_SEQ_NEIGH_ONLY)) + rc = pneigh_get_first(seq); + } else { + BUG_ON(state->flags & NEIGH_SEQ_NEIGH_ONLY); + rc = pneigh_get_next(seq, v, NULL); + } +out: + ++(*pos); + return rc; +} +EXPORT_SYMBOL(neigh_seq_next); + +void neigh_seq_stop(struct seq_file *seq, void *v) +{ + struct neigh_seq_state *state = seq->private; + struct neigh_table *tbl = state->tbl; + + read_unlock_bh(&tbl->lock); +} +EXPORT_SYMBOL(neigh_seq_stop); + +/* statistics via seq_file */ + +static void *neigh_stat_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct proc_dir_entry *pde = seq->private; + struct neigh_table *tbl = pde->data; + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void *neigh_stat_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = seq->private; + struct neigh_table *tbl = pde->data; + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(tbl->stats, cpu); + } + return NULL; +} + +static void neigh_stat_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int neigh_stat_seq_show(struct seq_file *seq, void *v) +{ + struct proc_dir_entry *pde = seq->private; + struct neigh_table *tbl = pde->data; + struct neigh_statistics *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries allocs destroys hash_grows lookups hits res_failed rcv_probes_mcast rcv_probes_ucast periodic_gc_runs forced_gc_runs forced_gc_goal_miss\n"); + return 0; + } + + seq_printf(seq, "%08x %08lx %08lx %08lx %08lx %08lx %08lx " + "%08lx %08lx %08lx %08lx\n", + atomic_read(&tbl->entries), + + st->allocs, + st->destroys, + st->hash_grows, + + st->lookups, + st->hits, + + st->res_failed, + + st->rcv_probes_mcast, + st->rcv_probes_ucast, + + st->periodic_gc_runs, + st->forced_gc_runs + ); + + return 0; +} + +static struct seq_operations neigh_stat_seq_ops = { + .start = neigh_stat_seq_start, + .next = neigh_stat_seq_next, + .stop = neigh_stat_seq_stop, + .show = neigh_stat_seq_show, +}; + +static int neigh_stat_seq_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &neigh_stat_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode); + } + return ret; +}; + +static struct file_operations 
neigh_stat_seq_fops = { + .owner = THIS_MODULE, + .open = neigh_stat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_ARPD +void neigh_app_ns(struct neighbour *n) +{ + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); + struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_GETNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr *)skb->data; + nlh->nlmsg_flags = NLM_F_REQUEST; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +static void neigh_app_notify(struct neighbour *n) +{ + struct nlmsghdr *nlh; + int size = NLMSG_SPACE(sizeof(struct ndmsg) + 256); + struct sk_buff *skb = alloc_skb(size, GFP_ATOMIC); + + if (!skb) + return; + + if (neigh_fill_info(skb, n, 0, 0, RTM_NEWNEIGH) < 0) { + kfree_skb(skb); + return; + } + nlh = (struct nlmsghdr *)skb->data; + NETLINK_CB(skb).dst_groups = RTMGRP_NEIGH; + netlink_broadcast(rtnl, skb, 0, RTMGRP_NEIGH, GFP_ATOMIC); +} + +#endif /* CONFIG_ARPD */ + +#ifdef CONFIG_SYSCTL + +static struct neigh_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table neigh_vars[__NET_NEIGH_MAX]; + ctl_table neigh_dev[2]; + ctl_table neigh_neigh_dir[2]; + ctl_table neigh_proto_dir[2]; + ctl_table neigh_root_dir[2]; +} neigh_sysctl_template = { + .neigh_vars = { + { + .ctl_name = NET_NEIGH_MCAST_SOLICIT, + .procname = "mcast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_UCAST_SOLICIT, + .procname = "ucast_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_APP_SOLICIT, + .procname = "app_solicit", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_RETRANS_TIME, + .procname = "retrans_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_REACHABLE_TIME, + .procname = "base_reachable_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_DELAY_PROBE_TIME, + .procname = "delay_first_probe_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_GC_STALE_TIME, + .procname = "gc_stale_time", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_UNRES_QLEN, + .procname = "unres_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_PROXY_QLEN, + .procname = "proxy_qlen", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_ANYCAST_DELAY, + .procname = "anycast_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_PROXY_DELAY, + .procname = "proxy_delay", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_LOCKTIME, + .procname = "locktime", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_userhz_jiffies, + }, + { + .ctl_name = NET_NEIGH_GC_INTERVAL, + .procname = "gc_interval", + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_NEIGH_GC_THRESH1, + .procname = "gc_thresh1", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_GC_THRESH2, + .procname = "gc_thresh2", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_GC_THRESH3, + .procname = "gc_thresh3", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_NEIGH_RETRANS_TIME_MS, + .procname = "retrans_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { + .ctl_name = NET_NEIGH_REACHABLE_TIME_MS, + .procname = "base_reachable_time_ms", + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + }, + .neigh_dev = { + { + .ctl_name = NET_PROTO_CONF_DEFAULT, + .procname = "default", + .mode = 0555, + }, + }, + .neigh_neigh_dir = { + { + .procname = "neigh", + .mode = 0555, + }, + }, + .neigh_proto_dir = { + { + .mode = 0555, + }, + }, + .neigh_root_dir = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + }, + }, +}; + +int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, + int p_id, int pdev_id, char *p_name, + proc_handler *handler, ctl_handler *strategy) +{ + struct neigh_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + const char *dev_name_source = NULL; + char *dev_name = NULL; + int err = 0; + + if (!t) + return -ENOBUFS; + memcpy(t, &neigh_sysctl_template, sizeof(*t)); + t->neigh_vars[0].data = &p->mcast_probes; + t->neigh_vars[1].data = &p->ucast_probes; + t->neigh_vars[2].data = &p->app_probes; + t->neigh_vars[3].data = &p->retrans_time; + t->neigh_vars[4].data = &p->base_reachable_time; + t->neigh_vars[5].data = &p->delay_probe_time; + t->neigh_vars[6].data = &p->gc_staletime; + t->neigh_vars[7].data = &p->queue_len; + t->neigh_vars[8].data = &p->proxy_qlen; + t->neigh_vars[9].data = &p->anycast_delay; + t->neigh_vars[10].data = &p->proxy_delay; + t->neigh_vars[11].data = &p->locktime; + + if (dev) { + dev_name_source = dev->name; + t->neigh_dev[0].ctl_name = dev->ifindex; + t->neigh_vars[12].procname = NULL; + t->neigh_vars[13].procname = NULL; + t->neigh_vars[14].procname = NULL; + t->neigh_vars[15].procname = NULL; + } else { + dev_name_source = t->neigh_dev[0].procname; + t->neigh_vars[12].data = (int *)(p + 1); + t->neigh_vars[13].data = (int *)(p + 1) + 1; + t->neigh_vars[14].data = (int *)(p + 1) + 2; + t->neigh_vars[15].data = (int *)(p + 1) + 3; + } + + t->neigh_vars[16].data = &p->retrans_time; + t->neigh_vars[17].data = &p->base_reachable_time; + + if (handler || strategy) { + /* RetransTime */ + t->neigh_vars[3].proc_handler = handler; + t->neigh_vars[3].strategy = strategy; + t->neigh_vars[3].extra1 = dev; + /* ReachableTime */ + t->neigh_vars[4].proc_handler = handler; + t->neigh_vars[4].strategy = strategy; + t->neigh_vars[4].extra1 = dev; + /* RetransTime (in milliseconds)*/ + t->neigh_vars[16].proc_handler = handler; + t->neigh_vars[16].strategy = strategy; + t->neigh_vars[16].extra1 = dev; + /* ReachableTime (in milliseconds) */ + t->neigh_vars[17].proc_handler = handler; + t->neigh_vars[17].strategy = strategy; + t->neigh_vars[17].extra1 = dev; + } + + dev_name = net_sysctl_strdup(dev_name_source); + if (!dev_name) { + err = -ENOBUFS; + goto free; + } + + 
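+	/* Editor's note (not in the original source): the child pointers set
+	 * up just below nest the tables as net -> <p_name> -> neigh ->
+	 * <device>, so for a hypothetical device "eth0" registered by a
+	 * caller passing p_name = "ipv4" the variables appear as e.g.
+	 *
+	 *	/proc/sys/net/ipv4/neigh/eth0/retrans_time
+	 *	/proc/sys/net/ipv4/neigh/default/base_reachable_time_ms
+	 *
+	 * (paths shown as an illustrative sketch; only the "default"
+	 * directory keeps the gc_interval/gc_thresh* entries, whose
+	 * procnames are cleared per device above).
+	 */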
t->neigh_dev[0].procname = dev_name; + + t->neigh_neigh_dir[0].ctl_name = pdev_id; + + t->neigh_proto_dir[0].procname = p_name; + t->neigh_proto_dir[0].ctl_name = p_id; + + t->neigh_dev[0].child = t->neigh_vars; + t->neigh_neigh_dir[0].child = t->neigh_dev; + t->neigh_proto_dir[0].child = t->neigh_neigh_dir; + t->neigh_root_dir[0].child = t->neigh_proto_dir; + + t->sysctl_header = register_sysctl_table(t->neigh_root_dir, 0); + if (!t->sysctl_header) { + err = -ENOBUFS; + goto free_procname; + } + p->sysctl_table = t; + return 0; + + /* error path */ + free_procname: + kfree(dev_name); + free: + kfree(t); + + return err; +} + +void neigh_sysctl_unregister(struct neigh_parms *p) +{ + if (p->sysctl_table) { + struct neigh_sysctl_table *t = p->sysctl_table; + p->sysctl_table = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->neigh_dev[0].procname); + kfree(t); + } +} + +#endif /* CONFIG_SYSCTL */ + +EXPORT_SYMBOL(__neigh_event_send); +EXPORT_SYMBOL(neigh_add); +EXPORT_SYMBOL(neigh_changeaddr); +EXPORT_SYMBOL(neigh_compat_output); +EXPORT_SYMBOL(neigh_connected_output); +EXPORT_SYMBOL(neigh_create); +EXPORT_SYMBOL(neigh_delete); +EXPORT_SYMBOL(neigh_destroy); +EXPORT_SYMBOL(neigh_dump_info); +EXPORT_SYMBOL(neigh_event_ns); +EXPORT_SYMBOL(neigh_ifdown); +EXPORT_SYMBOL(neigh_lookup); +EXPORT_SYMBOL(neigh_lookup_nodev); +EXPORT_SYMBOL(neigh_parms_alloc); +EXPORT_SYMBOL(neigh_parms_release); +EXPORT_SYMBOL(neigh_rand_reach_time); +EXPORT_SYMBOL(neigh_resolve_output); +EXPORT_SYMBOL(neigh_table_clear); +EXPORT_SYMBOL(neigh_table_init); +EXPORT_SYMBOL(neigh_update); +EXPORT_SYMBOL(neigh_update_hhs); +EXPORT_SYMBOL(pneigh_enqueue); +EXPORT_SYMBOL(pneigh_lookup); + +#ifdef CONFIG_ARPD +EXPORT_SYMBOL(neigh_app_ns); +#endif +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(neigh_sysctl_register); +EXPORT_SYMBOL(neigh_sysctl_unregister); +#endif diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c new file mode 100644 index 000000000000..060f703659e8 --- /dev/null +++ b/net/core/net-sysfs.c @@ -0,0 +1,461 @@ +/* + * net-sysfs.c - network device class and attributes + * + * Copyright (c) 2003 Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define to_class_dev(obj) container_of(obj,struct class_device,kobj) +#define to_net_dev(class) container_of(class, struct net_device, class_dev) + +static const char fmt_hex[] = "%#x\n"; +static const char fmt_dec[] = "%d\n"; +static const char fmt_ulong[] = "%lu\n"; + +static inline int dev_isalive(const struct net_device *dev) +{ + return dev->reg_state == NETREG_REGISTERED; +} + +/* use same locking rules as GIF* ioctl's */ +static ssize_t netdev_show(const struct class_device *cd, char *buf, + ssize_t (*format)(const struct net_device *, char *)) +{ + struct net_device *net = to_net_dev(cd); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = (*format)(net, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* generate a show function for simple field */ +#define NETDEVICE_SHOW(field, format_string) \ +static ssize_t format_##field(const struct net_device *net, char *buf) \ +{ \ + return sprintf(buf, format_string, net->field); \ +} \ +static ssize_t show_##field(struct class_device *cd, char *buf) \ +{ \ + return netdev_show(cd, buf, format_##field); \ +} + + +/* use same locking and permission rules as SIF* ioctl's */ +static ssize_t netdev_store(struct class_device *dev, + const char *buf, size_t len, + int (*set)(struct net_device *, unsigned long)) +{ + struct net_device *net = to_net_dev(dev); + char *endp; + unsigned long new; + int ret = -EINVAL; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + new = simple_strtoul(buf, &endp, 0); + if (endp == buf) + goto err; + + rtnl_lock(); + if (dev_isalive(net)) { + if ((ret = (*set)(net, new)) == 0) + ret = len; + } + rtnl_unlock(); + err: + return ret; +} + +/* generate a read-only network device class attribute */ +#define NETDEVICE_ATTR(field, format_string) \ +NETDEVICE_SHOW(field, format_string) \ +static CLASS_DEVICE_ATTR(field, S_IRUGO, show_##field, NULL) \ + +NETDEVICE_ATTR(addr_len, fmt_dec); +NETDEVICE_ATTR(iflink, fmt_dec); +NETDEVICE_ATTR(ifindex, fmt_dec); +NETDEVICE_ATTR(features, fmt_hex); +NETDEVICE_ATTR(type, fmt_dec); + +/* use same locking rules as GIFHWADDR ioctl's */ +static ssize_t format_addr(char *buf, const unsigned char *addr, int len) +{ + int i; + char *cp = buf; + + for (i = 0; i < len; i++) + cp += sprintf(cp, "%02x%c", addr[i], + i == (len - 1) ? 
'\n' : ':'); + return cp - buf; +} + +static ssize_t show_address(struct class_device *dev, char *buf) +{ + struct net_device *net = to_net_dev(dev); + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(net)) + ret = format_addr(buf, net->dev_addr, net->addr_len); + read_unlock(&dev_base_lock); + return ret; +} + +static ssize_t show_broadcast(struct class_device *dev, char *buf) +{ + struct net_device *net = to_net_dev(dev); + if (dev_isalive(net)) + return format_addr(buf, net->broadcast, net->addr_len); + return -EINVAL; +} + +static ssize_t show_carrier(struct class_device *dev, char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + if (netif_running(netdev)) { + return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + } + return -EINVAL; +} + +static CLASS_DEVICE_ATTR(address, S_IRUGO, show_address, NULL); +static CLASS_DEVICE_ATTR(broadcast, S_IRUGO, show_broadcast, NULL); +static CLASS_DEVICE_ATTR(carrier, S_IRUGO, show_carrier, NULL); + +/* read-write attributes */ +NETDEVICE_SHOW(mtu, fmt_dec); + +static int change_mtu(struct net_device *net, unsigned long new_mtu) +{ + return dev_set_mtu(net, (int) new_mtu); +} + +static ssize_t store_mtu(struct class_device *dev, const char *buf, size_t len) +{ + return netdev_store(dev, buf, len, change_mtu); +} + +static CLASS_DEVICE_ATTR(mtu, S_IRUGO | S_IWUSR, show_mtu, store_mtu); + +NETDEVICE_SHOW(flags, fmt_hex); + +static int change_flags(struct net_device *net, unsigned long new_flags) +{ + return dev_change_flags(net, (unsigned) new_flags); +} + +static ssize_t store_flags(struct class_device *dev, const char *buf, size_t len) +{ + return netdev_store(dev, buf, len, change_flags); +} + +static CLASS_DEVICE_ATTR(flags, S_IRUGO | S_IWUSR, show_flags, store_flags); + +NETDEVICE_SHOW(tx_queue_len, fmt_ulong); + +static int change_tx_queue_len(struct net_device *net, unsigned long new_len) +{ + net->tx_queue_len = new_len; + return 0; +} + +static ssize_t store_tx_queue_len(struct class_device *dev, const char *buf, size_t len) +{ + return netdev_store(dev, buf, len, change_tx_queue_len); +} + +static CLASS_DEVICE_ATTR(tx_queue_len, S_IRUGO | S_IWUSR, show_tx_queue_len, + store_tx_queue_len); + + +static struct class_device_attribute *net_class_attributes[] = { + &class_device_attr_ifindex, + &class_device_attr_iflink, + &class_device_attr_addr_len, + &class_device_attr_tx_queue_len, + &class_device_attr_features, + &class_device_attr_mtu, + &class_device_attr_flags, + &class_device_attr_type, + &class_device_attr_address, + &class_device_attr_broadcast, + &class_device_attr_carrier, + NULL +}; + +/* Show a given an attribute in the statistics group */ +static ssize_t netstat_show(const struct class_device *cd, char *buf, + unsigned long offset) +{ + struct net_device *dev = to_net_dev(cd); + struct net_device_stats *stats; + ssize_t ret = -EINVAL; + + if (offset > sizeof(struct net_device_stats) || + offset % sizeof(unsigned long) != 0) + WARN_ON(1); + + read_lock(&dev_base_lock); + if (dev_isalive(dev) && dev->get_stats && + (stats = (*dev->get_stats)(dev))) + ret = sprintf(buf, fmt_ulong, + *(unsigned long *)(((u8 *) stats) + offset)); + + read_unlock(&dev_base_lock); + return ret; +} + +/* generate a read-only statistics attribute */ +#define NETSTAT_ENTRY(name) \ +static ssize_t show_##name(struct class_device *cd, char *buf) \ +{ \ + return netstat_show(cd, buf, \ + offsetof(struct net_device_stats, name)); \ +} \ +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +NETSTAT_ENTRY(rx_packets); 
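+/* Editor's note (not in the original source): a sketch of what the
+ * NETSTAT_ENTRY(rx_packets) line above expands to (the show_rx_packets
+ * name is generated by the macro, shown here only for clarity):
+ *
+ *	static ssize_t show_rx_packets(struct class_device *cd, char *buf)
+ *	{
+ *		return netstat_show(cd, buf,
+ *			offsetof(struct net_device_stats, rx_packets));
+ *	}
+ *	static CLASS_DEVICE_ATTR(rx_packets, S_IRUGO, show_rx_packets, NULL);
+ *
+ * i.e. each entry becomes a read-only sysfs attribute that reads one
+ * unsigned long out of the device's net_device_stats.
+ */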
+NETSTAT_ENTRY(tx_packets); +NETSTAT_ENTRY(rx_bytes); +NETSTAT_ENTRY(tx_bytes); +NETSTAT_ENTRY(rx_errors); +NETSTAT_ENTRY(tx_errors); +NETSTAT_ENTRY(rx_dropped); +NETSTAT_ENTRY(tx_dropped); +NETSTAT_ENTRY(multicast); +NETSTAT_ENTRY(collisions); +NETSTAT_ENTRY(rx_length_errors); +NETSTAT_ENTRY(rx_over_errors); +NETSTAT_ENTRY(rx_crc_errors); +NETSTAT_ENTRY(rx_frame_errors); +NETSTAT_ENTRY(rx_fifo_errors); +NETSTAT_ENTRY(rx_missed_errors); +NETSTAT_ENTRY(tx_aborted_errors); +NETSTAT_ENTRY(tx_carrier_errors); +NETSTAT_ENTRY(tx_fifo_errors); +NETSTAT_ENTRY(tx_heartbeat_errors); +NETSTAT_ENTRY(tx_window_errors); +NETSTAT_ENTRY(rx_compressed); +NETSTAT_ENTRY(tx_compressed); + +static struct attribute *netstat_attrs[] = { + &class_device_attr_rx_packets.attr, + &class_device_attr_tx_packets.attr, + &class_device_attr_rx_bytes.attr, + &class_device_attr_tx_bytes.attr, + &class_device_attr_rx_errors.attr, + &class_device_attr_tx_errors.attr, + &class_device_attr_rx_dropped.attr, + &class_device_attr_tx_dropped.attr, + &class_device_attr_multicast.attr, + &class_device_attr_collisions.attr, + &class_device_attr_rx_length_errors.attr, + &class_device_attr_rx_over_errors.attr, + &class_device_attr_rx_crc_errors.attr, + &class_device_attr_rx_frame_errors.attr, + &class_device_attr_rx_fifo_errors.attr, + &class_device_attr_rx_missed_errors.attr, + &class_device_attr_tx_aborted_errors.attr, + &class_device_attr_tx_carrier_errors.attr, + &class_device_attr_tx_fifo_errors.attr, + &class_device_attr_tx_heartbeat_errors.attr, + &class_device_attr_tx_window_errors.attr, + &class_device_attr_rx_compressed.attr, + &class_device_attr_tx_compressed.attr, + NULL +}; + + +static struct attribute_group netstat_group = { + .name = "statistics", + .attrs = netstat_attrs, +}; + +#ifdef WIRELESS_EXT +/* helper function that does all the locking etc for wireless stats */ +static ssize_t wireless_show(struct class_device *cd, char *buf, + ssize_t (*format)(const struct iw_statistics *, + char *)) +{ + struct net_device *dev = to_net_dev(cd); + const struct iw_statistics *iw; + ssize_t ret = -EINVAL; + + read_lock(&dev_base_lock); + if (dev_isalive(dev) && dev->get_wireless_stats + && (iw = dev->get_wireless_stats(dev)) != NULL) + ret = (*format)(iw, buf); + read_unlock(&dev_base_lock); + + return ret; +} + +/* show function template for wireless fields */ +#define WIRELESS_SHOW(name, field, format_string) \ +static ssize_t format_iw_##name(const struct iw_statistics *iw, char *buf) \ +{ \ + return sprintf(buf, format_string, iw->field); \ +} \ +static ssize_t show_iw_##name(struct class_device *cd, char *buf) \ +{ \ + return wireless_show(cd, buf, format_iw_##name); \ +} \ +static CLASS_DEVICE_ATTR(name, S_IRUGO, show_iw_##name, NULL) + +WIRELESS_SHOW(status, status, fmt_hex); +WIRELESS_SHOW(link, qual.qual, fmt_dec); +WIRELESS_SHOW(level, qual.level, fmt_dec); +WIRELESS_SHOW(noise, qual.noise, fmt_dec); +WIRELESS_SHOW(nwid, discard.nwid, fmt_dec); +WIRELESS_SHOW(crypt, discard.code, fmt_dec); +WIRELESS_SHOW(fragment, discard.fragment, fmt_dec); +WIRELESS_SHOW(misc, discard.misc, fmt_dec); +WIRELESS_SHOW(retries, discard.retries, fmt_dec); +WIRELESS_SHOW(beacon, miss.beacon, fmt_dec); + +static struct attribute *wireless_attrs[] = { + &class_device_attr_status.attr, + &class_device_attr_link.attr, + &class_device_attr_level.attr, + &class_device_attr_noise.attr, + &class_device_attr_nwid.attr, + &class_device_attr_crypt.attr, + &class_device_attr_fragment.attr, + &class_device_attr_retries.attr, + 
&class_device_attr_misc.attr, + &class_device_attr_beacon.attr, + NULL +}; + +static struct attribute_group wireless_group = { + .name = "wireless", + .attrs = wireless_attrs, +}; +#endif + +#ifdef CONFIG_HOTPLUG +static int netdev_hotplug(struct class_device *cd, char **envp, + int num_envp, char *buf, int size) +{ + struct net_device *dev = to_net_dev(cd); + int i = 0; + int n; + + /* pass interface in env to hotplug. */ + envp[i++] = buf; + n = snprintf(buf, size, "INTERFACE=%s", dev->name) + 1; + buf += n; + size -= n; + + if ((size <= 0) || (i >= num_envp)) + return -ENOMEM; + + envp[i] = NULL; + return 0; +} +#endif + +/* + * netdev_release -- destroy and free a dead device. + * Called when last reference to class_device kobject is gone. + */ +static void netdev_release(struct class_device *cd) +{ + struct net_device *dev + = container_of(cd, struct net_device, class_dev); + + BUG_ON(dev->reg_state != NETREG_RELEASED); + + kfree((char *)dev - dev->padded); +} + +static struct class net_class = { + .name = "net", + .release = netdev_release, +#ifdef CONFIG_HOTPLUG + .hotplug = netdev_hotplug, +#endif +}; + +void netdev_unregister_sysfs(struct net_device * net) +{ + struct class_device * class_dev = &(net->class_dev); + + if (net->get_stats) + sysfs_remove_group(&class_dev->kobj, &netstat_group); + +#ifdef WIRELESS_EXT + if (net->get_wireless_stats) + sysfs_remove_group(&class_dev->kobj, &wireless_group); +#endif + class_device_del(class_dev); + +} + +/* Create sysfs entries for network device. */ +int netdev_register_sysfs(struct net_device *net) +{ + struct class_device *class_dev = &(net->class_dev); + int i; + struct class_device_attribute *attr; + int ret; + + class_dev->class = &net_class; + class_dev->class_data = net; + + strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); + if ((ret = class_device_register(class_dev))) + goto out; + + for (i = 0; (attr = net_class_attributes[i]) != NULL; i++) { + if ((ret = class_device_create_file(class_dev, attr))) + goto out_unreg; + } + + + if (net->get_stats && + (ret = sysfs_create_group(&class_dev->kobj, &netstat_group))) + goto out_unreg; + +#ifdef WIRELESS_EXT + if (net->get_wireless_stats && + (ret = sysfs_create_group(&class_dev->kobj, &wireless_group))) + goto out_cleanup; + + return 0; +out_cleanup: + if (net->get_stats) + sysfs_remove_group(&class_dev->kobj, &netstat_group); +#else + return 0; +#endif + +out_unreg: + printk(KERN_WARNING "%s: sysfs attribute registration failed %d\n", + net->name, ret); + class_device_unregister(class_dev); +out: + return ret; +} + +int netdev_sysfs_init(void) +{ + return class_register(&net_class); +} diff --git a/net/core/netfilter.c b/net/core/netfilter.c new file mode 100644 index 000000000000..e51cfa46950c --- /dev/null +++ b/net/core/netfilter.c @@ -0,0 +1,799 @@ +/* netfilter.c: look after the filters for various protocols. + * Heavily influenced by the old firewall.c by David Bonn and Alan Cox. + * + * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any + * way. + * + * Rusty Russell (C)2000 -- This code is GPL. + * + * February 2000: Modified by James Morris to have 1 queue per protocol. + * 15-Mar-2000: Added NF_REPEAT --RR. + * 08-May-2003: Internal logging interface added by Jozsef Kadlecsik. 
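+ *
+ * [Editor's note, added text: a minimal registration sketch, assuming a
+ *  hypothetical hook that merely accepts every IPv4 packet at
+ *  PRE_ROUTING (signature as invoked by nf_iterate() below):
+ *
+ *	static unsigned int accept_all(unsigned int hooknum,
+ *				       struct sk_buff **pskb,
+ *				       const struct net_device *in,
+ *				       const struct net_device *out,
+ *				       int (*okfn)(struct sk_buff *))
+ *	{
+ *		return NF_ACCEPT;
+ *	}
+ *
+ *	static struct nf_hook_ops accept_ops = {
+ *		.hook     = accept_all,
+ *		.pf       = PF_INET,
+ *		.hooknum  = NF_IP_PRE_ROUTING,
+ *		.priority = NF_IP_PRI_FIRST,
+ *	};
+ *
+ *	nf_register_hook(&accept_ops);
+ *
+ *  nf_register_hook() keeps each chain sorted by ascending priority.]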
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* In this code, we can be waiting indefinitely for userspace to + * service a packet if a hook returns NF_QUEUE. We could keep a count + * of skbuffs queued for userspace, and not deregister a hook unless + * this is zero, but that sucks. Now, we simply check when the + * packets come back: if the hook is gone, the packet is discarded. */ +#ifdef CONFIG_NETFILTER_DEBUG +#define NFDEBUG(format, args...) printk(format , ## args) +#else +#define NFDEBUG(format, args...) +#endif + +/* Sockopts only registered and called from user context, so + net locking would be overkill. Also, [gs]etsockopt calls may + sleep. */ +static DECLARE_MUTEX(nf_sockopt_mutex); + +struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS]; +static LIST_HEAD(nf_sockopts); +static DEFINE_SPINLOCK(nf_hook_lock); + +/* + * A queue handler may be registered for each protocol. Each is protected by + * long term mutex. The handler must provide an an outfn() to accept packets + * for queueing and must reinject all packets it receives, no matter what. + */ +static struct nf_queue_handler_t { + nf_queue_outfn_t outfn; + void *data; +} queue_handler[NPROTO]; +static DEFINE_RWLOCK(queue_handler_lock); + +int nf_register_hook(struct nf_hook_ops *reg) +{ + struct list_head *i; + + spin_lock_bh(&nf_hook_lock); + list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) { + if (reg->priority < ((struct nf_hook_ops *)i)->priority) + break; + } + list_add_rcu(®->list, i->prev); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); + return 0; +} + +void nf_unregister_hook(struct nf_hook_ops *reg) +{ + spin_lock_bh(&nf_hook_lock); + list_del_rcu(®->list); + spin_unlock_bh(&nf_hook_lock); + + synchronize_net(); +} + +/* Do exclusive ranges overlap? */ +static inline int overlap(int min1, int max1, int min2, int max2) +{ + return max1 > min2 && min1 < max2; +} + +/* Functions to register sockopt ranges (exclusive). */ +int nf_register_sockopt(struct nf_sockopt_ops *reg) +{ + struct list_head *i; + int ret = 0; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i; + if (ops->pf == reg->pf + && (overlap(ops->set_optmin, ops->set_optmax, + reg->set_optmin, reg->set_optmax) + || overlap(ops->get_optmin, ops->get_optmax, + reg->get_optmin, reg->get_optmax))) { + NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n", + ops->set_optmin, ops->set_optmax, + ops->get_optmin, ops->get_optmax, + reg->set_optmin, reg->set_optmax, + reg->get_optmin, reg->get_optmax); + ret = -EBUSY; + goto out; + } + } + + list_add(®->list, &nf_sockopts); +out: + up(&nf_sockopt_mutex); + return ret; +} + +void nf_unregister_sockopt(struct nf_sockopt_ops *reg) +{ + /* No point being interruptible: we're probably in cleanup_module() */ + restart: + down(&nf_sockopt_mutex); + if (reg->use != 0) { + /* To be woken by nf_sockopt call... */ + /* FIXME: Stuart Young's name appears gratuitously. 
*/ + set_current_state(TASK_UNINTERRUPTIBLE); + reg->cleanup_task = current; + up(&nf_sockopt_mutex); + schedule(); + goto restart; + } + list_del(®->list); + up(&nf_sockopt_mutex); +} + +#ifdef CONFIG_NETFILTER_DEBUG +#include +#include +#include + +static void debug_print_hooks_ip(unsigned int nf_debug) +{ + if (nf_debug & (1 << NF_IP_PRE_ROUTING)) { + printk("PRE_ROUTING "); + nf_debug ^= (1 << NF_IP_PRE_ROUTING); + } + if (nf_debug & (1 << NF_IP_LOCAL_IN)) { + printk("LOCAL_IN "); + nf_debug ^= (1 << NF_IP_LOCAL_IN); + } + if (nf_debug & (1 << NF_IP_FORWARD)) { + printk("FORWARD "); + nf_debug ^= (1 << NF_IP_FORWARD); + } + if (nf_debug & (1 << NF_IP_LOCAL_OUT)) { + printk("LOCAL_OUT "); + nf_debug ^= (1 << NF_IP_LOCAL_OUT); + } + if (nf_debug & (1 << NF_IP_POST_ROUTING)) { + printk("POST_ROUTING "); + nf_debug ^= (1 << NF_IP_POST_ROUTING); + } + if (nf_debug) + printk("Crap bits: 0x%04X", nf_debug); + printk("\n"); +} + +static void nf_dump_skb(int pf, struct sk_buff *skb) +{ + printk("skb: pf=%i %s dev=%s len=%u\n", + pf, + skb->sk ? "(owned)" : "(unowned)", + skb->dev ? skb->dev->name : "(no dev)", + skb->len); + switch (pf) { + case PF_INET: { + const struct iphdr *ip = skb->nh.iph; + __u32 *opt = (__u32 *) (ip + 1); + int opti; + __u16 src_port = 0, dst_port = 0; + + if (ip->protocol == IPPROTO_TCP + || ip->protocol == IPPROTO_UDP) { + struct tcphdr *tcp=(struct tcphdr *)((__u32 *)ip+ip->ihl); + src_port = ntohs(tcp->source); + dst_port = ntohs(tcp->dest); + } + + printk("PROTO=%d %u.%u.%u.%u:%hu %u.%u.%u.%u:%hu" + " L=%hu S=0x%2.2hX I=%hu F=0x%4.4hX T=%hu", + ip->protocol, NIPQUAD(ip->saddr), + src_port, NIPQUAD(ip->daddr), + dst_port, + ntohs(ip->tot_len), ip->tos, ntohs(ip->id), + ntohs(ip->frag_off), ip->ttl); + + for (opti = 0; opti < (ip->ihl - sizeof(struct iphdr) / 4); opti++) + printk(" O=0x%8.8X", *opt++); + printk("\n"); + } + } +} + +void nf_debug_ip_local_deliver(struct sk_buff *skb) +{ + /* If it's a loopback packet, it must have come through + * NF_IP_LOCAL_OUT, NF_IP_RAW_INPUT, NF_IP_PRE_ROUTING and + * NF_IP_LOCAL_IN. Otherwise, must have gone through + * NF_IP_RAW_INPUT and NF_IP_PRE_ROUTING. */ + if (!skb->dev) { + printk("ip_local_deliver: skb->dev is NULL.\n"); + } + else if (strcmp(skb->dev->name, "lo") == 0) { + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING) + | (1 << NF_IP_PRE_ROUTING) + | (1 << NF_IP_LOCAL_IN))) { + printk("ip_local_deliver: bad loopback skb: "); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } + else { + if (skb->nf_debug != ((1<nf_debug); + nf_dump_skb(PF_INET, skb); + } + } +} + +void nf_debug_ip_loopback_xmit(struct sk_buff *newskb) +{ + if (newskb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_dev_loopback_xmit: bad owned skb = %p: ", + newskb); + debug_print_hooks_ip(newskb->nf_debug); + nf_dump_skb(PF_INET, newskb); + } + /* Clear to avoid confusing input check */ + newskb->nf_debug = 0; +} + +void nf_debug_ip_finish_output2(struct sk_buff *skb) +{ + /* If it's owned, it must have gone through the + * NF_IP_LOCAL_OUT and NF_IP_POST_ROUTING. + * Otherwise, must have gone through + * NF_IP_PRE_ROUTING, NF_IP_FORWARD and NF_IP_POST_ROUTING. 
+ */ + if (skb->sk) { + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) { + printk("ip_finish_output: bad owned skb = %p: ", skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } else { + if (skb->nf_debug != ((1 << NF_IP_PRE_ROUTING) + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_POST_ROUTING))) { + /* Fragments, entunnelled packets, TCP RSTs + generated by ipt_REJECT will have no + owners, but still may be local */ + if (skb->nf_debug != ((1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))){ + printk("ip_finish_output:" + " bad unowned skb = %p: ",skb); + debug_print_hooks_ip(skb->nf_debug); + nf_dump_skb(PF_INET, skb); + } + } + } +} +#endif /*CONFIG_NETFILTER_DEBUG*/ + +/* Call get/setsockopt() */ +static int nf_sockopt(struct sock *sk, int pf, int val, + char __user *opt, int *len, int get) +{ + struct list_head *i; + struct nf_sockopt_ops *ops; + int ret; + + if (down_interruptible(&nf_sockopt_mutex) != 0) + return -EINTR; + + list_for_each(i, &nf_sockopts) { + ops = (struct nf_sockopt_ops *)i; + if (ops->pf == pf) { + if (get) { + if (val >= ops->get_optmin + && val < ops->get_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->get(sk, val, opt, len); + goto out; + } + } else { + if (val >= ops->set_optmin + && val < ops->set_optmax) { + ops->use++; + up(&nf_sockopt_mutex); + ret = ops->set(sk, val, opt, *len); + goto out; + } + } + } + } + up(&nf_sockopt_mutex); + return -ENOPROTOOPT; + + out: + down(&nf_sockopt_mutex); + ops->use--; + if (ops->cleanup_task) + wake_up_process(ops->cleanup_task); + up(&nf_sockopt_mutex); + return ret; +} + +int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt, + int len) +{ + return nf_sockopt(sk, pf, val, opt, &len, 0); +} + +int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len) +{ + return nf_sockopt(sk, pf, val, opt, len, 1); +} + +static unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + unsigned int verdict; + + /* + * The caller must not block between calls to this + * function because of risk of continuing from deleted element. + */ + list_for_each_continue_rcu(*i, head) { + struct nf_hook_ops *elem = (struct nf_hook_ops *)*i; + + if (hook_thresh > elem->priority) + continue; + + /* Optimization: we don't need to hold module + reference here, since function can't sleep. 
--RR */ + verdict = elem->hook(hook, skb, indev, outdev, okfn); + if (verdict != NF_ACCEPT) { +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely(verdict > NF_MAX_VERDICT)) { + NFDEBUG("Evil return from %p(%u).\n", + elem->hook, hook); + continue; + } +#endif + if (verdict != NF_REPEAT) + return verdict; + *i = (*i)->prev; + } + } + return NF_ACCEPT; +} + +int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data) +{ + int ret; + + write_lock_bh(&queue_handler_lock); + if (queue_handler[pf].outfn) + ret = -EBUSY; + else { + queue_handler[pf].outfn = outfn; + queue_handler[pf].data = data; + ret = 0; + } + write_unlock_bh(&queue_handler_lock); + + return ret; +} + +/* The caller must flush their queue before this */ +int nf_unregister_queue_handler(int pf) +{ + write_lock_bh(&queue_handler_lock); + queue_handler[pf].outfn = NULL; + queue_handler[pf].data = NULL; + write_unlock_bh(&queue_handler_lock); + + return 0; +} + +/* + * Any packet that leaves via this function must come back + * through nf_reinject(). + */ +static int nf_queue(struct sk_buff *skb, + struct list_head *elem, + int pf, unsigned int hook, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + int status; + struct nf_info *info; +#ifdef CONFIG_BRIDGE_NETFILTER + struct net_device *physindev = NULL; + struct net_device *physoutdev = NULL; +#endif + + /* QUEUE == DROP if noone is waiting, to be safe. */ + read_lock(&queue_handler_lock); + if (!queue_handler[pf].outfn) { + read_unlock(&queue_handler_lock); + kfree_skb(skb); + return 1; + } + + info = kmalloc(sizeof(*info), GFP_ATOMIC); + if (!info) { + if (net_ratelimit()) + printk(KERN_ERR "OOM queueing packet %p\n", + skb); + read_unlock(&queue_handler_lock); + kfree_skb(skb); + return 1; + } + + *info = (struct nf_info) { + (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn }; + + /* If it's going away, ignore hook. */ + if (!try_module_get(info->elem->owner)) { + read_unlock(&queue_handler_lock); + kfree(info); + return 0; + } + + /* Bump dev refs so they don't vanish while packet is out */ + if (indev) dev_hold(indev); + if (outdev) dev_hold(outdev); + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + physindev = skb->nf_bridge->physindev; + if (physindev) dev_hold(physindev); + physoutdev = skb->nf_bridge->physoutdev; + if (physoutdev) dev_hold(physoutdev); + } +#endif + + status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data); + read_unlock(&queue_handler_lock); + + if (status < 0) { + /* James M doesn't say fuck enough. */ + if (indev) dev_put(indev); + if (outdev) dev_put(outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (physindev) dev_put(physindev); + if (physoutdev) dev_put(physoutdev); +#endif + module_put(info->elem->owner); + kfree(info); + kfree_skb(skb); + return 1; + } + return 1; +} + +/* Returns 1 if okfn() needs to be executed by the caller, + * -EPERM for NF_DROP, 0 otherwise. 
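nf_iterate() above walks the hook list by verdict: NF_ACCEPT advances to the next element, NF_REPEAT re-runs the same element, and any other verdict ends the traversal. A toy userspace model of that control flow; the two hooks and the verdicts they return are invented for illustration:

#include <stdio.h>

enum verdict { V_DROP, V_ACCEPT, V_REPEAT };

static int b_calls;

static enum verdict hook_a(void) { return V_ACCEPT; }
static enum verdict hook_b(void) { return ++b_calls < 2 ? V_REPEAT : V_ACCEPT; }

int main(void)
{
        enum verdict (*hooks[])(void) = { hook_a, hook_b };
        unsigned int i = 0;

        while (i < sizeof(hooks) / sizeof(hooks[0])) {
                enum verdict v = hooks[i]();

                if (v == V_REPEAT)
                        continue;       /* run the same hook again */
                if (v != V_ACCEPT) {
                        printf("packet dropped at hook %u\n", i);
                        return 0;
                }
                i++;                    /* V_ACCEPT: move to the next hook */
        }
        printf("packet accepted; hook_b ran %d times\n", b_calls);
        return 0;
}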
*/ +int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb, + struct net_device *indev, + struct net_device *outdev, + int (*okfn)(struct sk_buff *), + int hook_thresh) +{ + struct list_head *elem; + unsigned int verdict; + int ret = 0; + + /* We may already have this, but read-locks nest anyway */ + rcu_read_lock(); + +#ifdef CONFIG_NETFILTER_DEBUG + if (unlikely((*pskb)->nf_debug & (1 << hook))) { + printk("nf_hook: hook %i already set.\n", hook); + nf_dump_skb(pf, *pskb); + } + (*pskb)->nf_debug |= (1 << hook); +#endif + + elem = &nf_hooks[pf][hook]; +next_hook: + verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev, + outdev, &elem, okfn, hook_thresh); + if (verdict == NF_ACCEPT || verdict == NF_STOP) { + ret = 1; + goto unlock; + } else if (verdict == NF_DROP) { + kfree_skb(*pskb); + ret = -EPERM; + } else if (verdict == NF_QUEUE) { + NFDEBUG("nf_hook: Verdict = QUEUE.\n"); + if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn)) + goto next_hook; + } +unlock: + rcu_read_unlock(); + return ret; +} + +void nf_reinject(struct sk_buff *skb, struct nf_info *info, + unsigned int verdict) +{ + struct list_head *elem = &info->elem->list; + struct list_head *i; + + rcu_read_lock(); + + /* Release those devices we held, or Alexey will kill me. */ + if (info->indev) dev_put(info->indev); + if (info->outdev) dev_put(info->outdev); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + if (skb->nf_bridge->physindev) + dev_put(skb->nf_bridge->physindev); + if (skb->nf_bridge->physoutdev) + dev_put(skb->nf_bridge->physoutdev); + } +#endif + + /* Drop reference to owner of hook which queued us. */ + module_put(info->elem->owner); + + list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) { + if (i == elem) + break; + } + + if (elem == &nf_hooks[info->pf][info->hook]) { + /* The module which sent it to userspace is gone. */ + NFDEBUG("%s: module disappeared, dropping packet.\n", + __FUNCTION__); + verdict = NF_DROP; + } + + /* Continue traversal iff userspace said ok... */ + if (verdict == NF_REPEAT) { + elem = elem->prev; + verdict = NF_ACCEPT; + } + + if (verdict == NF_ACCEPT) { + next_hook: + verdict = nf_iterate(&nf_hooks[info->pf][info->hook], + &skb, info->hook, + info->indev, info->outdev, &elem, + info->okfn, INT_MIN); + } + + switch (verdict) { + case NF_ACCEPT: + info->okfn(skb); + break; + + case NF_QUEUE: + if (!nf_queue(skb, elem, info->pf, info->hook, + info->indev, info->outdev, info->okfn)) + goto next_hook; + break; + } + rcu_read_unlock(); + + if (verdict == NF_DROP) + kfree_skb(skb); + + kfree(info); + return; +} + +#ifdef CONFIG_INET +/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */ +int ip_route_me_harder(struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct rtable *rt; + struct flowi fl = {}; + struct dst_entry *odst; + unsigned int hh_len; + + /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause + * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. + */ + if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + fl.nl_u.ip4_u.daddr = iph->daddr; + fl.nl_u.ip4_u.saddr = iph->saddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0; +#ifdef CONFIG_IP_ROUTE_FWMARK + fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; +#endif + fl.proto = iph->protocol; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + /* Drop old route. 
*/ + dst_release((*pskb)->dst); + (*pskb)->dst = &rt->u.dst; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. */ + fl.nl_u.ip4_u.daddr = iph->saddr; + if (ip_route_output_key(&rt, &fl) != 0) + return -1; + + odst = (*pskb)->dst; + if (ip_route_input(*pskb, iph->daddr, iph->saddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return -1; + } + dst_release(&rt->u.dst); + dst_release(odst); + } + + if ((*pskb)->dst->error) + return -1; + + /* Change in oif may mean change in hh_len. */ + hh_len = (*pskb)->dst->dev->hard_header_len; + if (skb_headroom(*pskb) < hh_len) { + struct sk_buff *nskb; + + nskb = skb_realloc_headroom(*pskb, hh_len); + if (!nskb) + return -1; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + return 0; +} +EXPORT_SYMBOL(ip_route_me_harder); + +int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len) +{ + struct sk_buff *nskb; + + if (writable_len > (*pskb)->len) + return 0; + + /* Not exclusive use of packet? Must copy. */ + if (skb_shared(*pskb) || skb_cloned(*pskb)) + goto copy_skb; + + return pskb_may_pull(*pskb, writable_len); + +copy_skb: + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return 0; + BUG_ON(skb_is_nonlinear(nskb)); + + /* Rest of kernel will get very unhappy if we pass it a + suddenly-orphaned skbuff */ + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + return 1; +} +EXPORT_SYMBOL(skb_ip_make_writable); +#endif /*CONFIG_INET*/ + +/* Internal logging interface, which relies on the real + LOG target modules */ + +#define NF_LOG_PREFIXLEN 128 + +static nf_logfn *nf_logging[NPROTO]; /* = NULL */ +static int reported = 0; +static DEFINE_SPINLOCK(nf_log_lock); + +int nf_log_register(int pf, nf_logfn *logfn) +{ + int ret = -EBUSY; + + /* Any setup of logging members must be done before + * substituting pointer. */ + spin_lock(&nf_log_lock); + if (!nf_logging[pf]) { + rcu_assign_pointer(nf_logging[pf], logfn); + ret = 0; + } + spin_unlock(&nf_log_lock); + return ret; +} + +void nf_log_unregister(int pf, nf_logfn *logfn) +{ + spin_lock(&nf_log_lock); + if (nf_logging[pf] == logfn) + nf_logging[pf] = NULL; + spin_unlock(&nf_log_lock); + + /* Give time to concurrent readers. */ + synchronize_net(); +} + +void nf_log_packet(int pf, + unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *fmt, ...) +{ + va_list args; + char prefix[NF_LOG_PREFIXLEN]; + nf_logfn *logfn; + + rcu_read_lock(); + logfn = rcu_dereference(nf_logging[pf]); + if (logfn) { + va_start(args, fmt); + vsnprintf(prefix, sizeof(prefix), fmt, args); + va_end(args); + /* We must read logging before nf_logfn[pf] */ + logfn(hooknum, skb, in, out, prefix); + } else if (!reported) { + printk(KERN_WARNING "nf_log_packet: can\'t log yet, " + "no backend logging module loaded in!\n"); + reported++; + } + rcu_read_unlock(); +} +EXPORT_SYMBOL(nf_log_register); +EXPORT_SYMBOL(nf_log_unregister); +EXPORT_SYMBOL(nf_log_packet); + +/* This does not belong here, but locally generated errors need it if connection + tracking in use: without this, connection may not be in hash table, and hence + manufactured ICMP or RST packets will not be associated with it. 
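nf_log_register()/nf_log_packet() above keep one logging backend pointer per protocol family, refuse to replace a backend that is already registered, and warn once when a packet should be logged but no backend module is loaded. A simplified userspace sketch of that registration shape; the real code additionally takes nf_log_lock and publishes the pointer with RCU:

#include <stdio.h>

#define DEMO_NPROTO 4

typedef void (*logfn_t)(const char *prefix);

static logfn_t backends[DEMO_NPROTO];

static int demo_log_register(int pf, logfn_t fn)
{
        if (backends[pf])
                return -1;              /* -EBUSY in the code above */
        backends[pf] = fn;
        return 0;
}

static void demo_log_packet(int pf, const char *prefix)
{
        if (backends[pf])
                backends[pf](prefix);
        else
                fprintf(stderr, "can't log yet, no backend for pf %d\n", pf);
}

static void print_backend(const char *prefix)
{
        printf("LOG %s\n", prefix);
}

int main(void)
{
        demo_log_packet(2, "before register");
        demo_log_register(2, print_backend);
        demo_log_packet(2, "after register");
        return 0;
}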
*/ +void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *); + +void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) +{ + void (*attach)(struct sk_buff *, struct sk_buff *); + + if (skb->nfct && (attach = ip_ct_attach) != NULL) { + mb(); /* Just to be sure: must be read before executing this */ + attach(new, skb); + } +} + +void __init netfilter_init(void) +{ + int i, h; + + for (i = 0; i < NPROTO; i++) { + for (h = 0; h < NF_MAX_HOOKS; h++) + INIT_LIST_HEAD(&nf_hooks[i][h]); + } +} + +EXPORT_SYMBOL(ip_ct_attach); +EXPORT_SYMBOL(nf_ct_attach); +EXPORT_SYMBOL(nf_getsockopt); +EXPORT_SYMBOL(nf_hook_slow); +EXPORT_SYMBOL(nf_hooks); +EXPORT_SYMBOL(nf_register_hook); +EXPORT_SYMBOL(nf_register_queue_handler); +EXPORT_SYMBOL(nf_register_sockopt); +EXPORT_SYMBOL(nf_reinject); +EXPORT_SYMBOL(nf_setsockopt); +EXPORT_SYMBOL(nf_unregister_hook); +EXPORT_SYMBOL(nf_unregister_queue_handler); +EXPORT_SYMBOL(nf_unregister_sockopt); diff --git a/net/core/netpoll.c b/net/core/netpoll.c new file mode 100644 index 000000000000..a119696d5521 --- /dev/null +++ b/net/core/netpoll.c @@ -0,0 +1,735 @@ +/* + * Common framework for low-level network console, dump, and debugger code + * + * Sep 8 2003 Matt Mackall + * + * based on the netconsole code from: + * + * Copyright (C) 2001 Ingo Molnar + * Copyright (C) 2002 Red Hat, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * We maintain a small pool of fully-sized skbs, to make sure the + * message gets out even in extreme OOM situations. + */ + +#define MAX_UDP_CHUNK 1460 +#define MAX_SKBS 32 +#define MAX_QUEUE_DEPTH (MAX_SKBS / 2) + +static DEFINE_SPINLOCK(skb_list_lock); +static int nr_skbs; +static struct sk_buff *skbs; + +static DEFINE_SPINLOCK(queue_lock); +static int queue_depth; +static struct sk_buff *queue_head, *queue_tail; + +static atomic_t trapped; + +#define NETPOLL_RX_ENABLED 1 +#define NETPOLL_RX_DROP 2 + +#define MAX_SKB_SIZE \ + (MAX_UDP_CHUNK + sizeof(struct udphdr) + \ + sizeof(struct iphdr) + sizeof(struct ethhdr)) + +static void zap_completion_queue(void); + +static void queue_process(void *p) +{ + unsigned long flags; + struct sk_buff *skb; + + while (queue_head) { + spin_lock_irqsave(&queue_lock, flags); + + skb = queue_head; + queue_head = skb->next; + if (skb == queue_tail) + queue_head = NULL; + + queue_depth--; + + spin_unlock_irqrestore(&queue_lock, flags); + + dev_queue_xmit(skb); + } +} + +static DECLARE_WORK(send_queue, queue_process, NULL); + +void netpoll_queue(struct sk_buff *skb) +{ + unsigned long flags; + + if (queue_depth == MAX_QUEUE_DEPTH) { + __kfree_skb(skb); + return; + } + + spin_lock_irqsave(&queue_lock, flags); + if (!queue_head) + queue_head = skb; + else + queue_tail->next = skb; + queue_tail = skb; + queue_depth++; + spin_unlock_irqrestore(&queue_lock, flags); + + schedule_work(&send_queue); +} + +static int checksum_udp(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) +{ + if (uh->check == 0) + return 0; + + if (skb->ip_summed == CHECKSUM_HW) + return csum_tcpudp_magic( + saddr, daddr, ulen, IPPROTO_UDP, skb->csum); + + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + + return csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); +} + +/* + * Check whether delayed processing was scheduled for our NIC. If so, + * we attempt to grab the poll lock and use ->poll() to pump the card. 
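netpoll_queue() above appends deferred packets to a plain head/tail singly linked list, drops new work once MAX_QUEUE_DEPTH is reached, and lets queue_process() drain the list from a workqueue. The same bounded FIFO in standalone form; locking is omitted and malloc() stands in for the skb:

#include <stdio.h>
#include <stdlib.h>

#define DEMO_QUEUE_DEPTH 4

struct item {
        int id;
        struct item *next;
};

static struct item *head, *tail;
static int depth;

static int enqueue(int id)
{
        struct item *it;

        if (depth == DEMO_QUEUE_DEPTH)
                return -1;              /* netpoll_queue() frees the skb here */
        it = malloc(sizeof(*it));
        if (!it)
                return -1;
        it->id = id;
        it->next = NULL;
        if (!head)
                head = it;
        else
                tail->next = it;
        tail = it;
        depth++;
        return 0;
}

static void drain(void)                 /* queue_process() analogue */
{
        while (head) {
                struct item *it = head;

                head = it->next;
                depth--;
                printf("xmit item %d\n", it->id);
                free(it);
        }
        tail = NULL;
}

int main(void)
{
        int i;

        for (i = 0; i < 6; i++)
                if (enqueue(i) < 0)
                        printf("dropped item %d, queue full\n", i);
        drain();
        return 0;
}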
+ * If this fails, either we've recursed in ->poll() or it's already + * running on another CPU. + * + * Note: we don't mask interrupts with this lock because we're using + * trylock here and interrupts are already disabled in the softirq + * case. Further, we test the poll_owner to avoid recursion on UP + * systems where the lock doesn't exist. + * + * In cases where there is bi-directional communications, reading only + * one message at a time can lead to packets being dropped by the + * network adapter, forcing superfluous retries and possibly timeouts. + * Thus, we set our budget to greater than 1. + */ +static void poll_napi(struct netpoll *np) +{ + int budget = 16; + + if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) && + np->poll_owner != smp_processor_id() && + spin_trylock(&np->poll_lock)) { + np->rx_flags |= NETPOLL_RX_DROP; + atomic_inc(&trapped); + + np->dev->poll(np->dev, &budget); + + atomic_dec(&trapped); + np->rx_flags &= ~NETPOLL_RX_DROP; + spin_unlock(&np->poll_lock); + } +} + +void netpoll_poll(struct netpoll *np) +{ + if(!np->dev || !netif_running(np->dev) || !np->dev->poll_controller) + return; + + /* Process pending work on NIC */ + np->dev->poll_controller(np->dev); + if (np->dev->poll) + poll_napi(np); + + zap_completion_queue(); +} + +static void refill_skbs(void) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&skb_list_lock, flags); + while (nr_skbs < MAX_SKBS) { + skb = alloc_skb(MAX_SKB_SIZE, GFP_ATOMIC); + if (!skb) + break; + + skb->next = skbs; + skbs = skb; + nr_skbs++; + } + spin_unlock_irqrestore(&skb_list_lock, flags); +} + +static void zap_completion_queue(void) +{ + unsigned long flags; + struct softnet_data *sd = &get_cpu_var(softnet_data); + + if (sd->completion_queue) { + struct sk_buff *clist; + + local_irq_save(flags); + clist = sd->completion_queue; + sd->completion_queue = NULL; + local_irq_restore(flags); + + while (clist != NULL) { + struct sk_buff *skb = clist; + clist = clist->next; + if(skb->destructor) + dev_kfree_skb_any(skb); /* put this one back */ + else + __kfree_skb(skb); + } + } + + put_cpu_var(softnet_data); +} + +static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve) +{ + int once = 1, count = 0; + unsigned long flags; + struct sk_buff *skb = NULL; + + zap_completion_queue(); +repeat: + if (nr_skbs < MAX_SKBS) + refill_skbs(); + + skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) { + spin_lock_irqsave(&skb_list_lock, flags); + skb = skbs; + if (skb) { + skbs = skb->next; + skb->next = NULL; + nr_skbs--; + } + spin_unlock_irqrestore(&skb_list_lock, flags); + } + + if(!skb) { + count++; + if (once && (count == 1000000)) { + printk("out of netpoll skbs!\n"); + once = 0; + } + netpoll_poll(np); + goto repeat; + } + + atomic_set(&skb->users, 1); + skb_reserve(skb, reserve); + return skb; +} + +static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +{ + int status; + +repeat: + if(!np || !np->dev || !netif_running(np->dev)) { + __kfree_skb(skb); + return; + } + + /* avoid recursion */ + if(np->poll_owner == smp_processor_id() || + np->dev->xmit_lock_owner == smp_processor_id()) { + if (np->drop) + np->drop(skb); + else + __kfree_skb(skb); + return; + } + + spin_lock(&np->dev->xmit_lock); + np->dev->xmit_lock_owner = smp_processor_id(); + + /* + * network drivers do not expect to be called if the queue is + * stopped. 
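find_skb() and refill_skbs() above try a normal allocation first and only then dip into a small pool of buffers that was filled ahead of time, so console output can still be sent under memory pressure. A hedged userspace sketch of that fallback; malloc() stands in for alloc_skb() and the pool size is arbitrary:

#include <stdio.h>
#include <stdlib.h>

#define DEMO_POOL_MAX 4
#define DEMO_BUF_SIZE 1514

static void *pool[DEMO_POOL_MAX];
static int pool_count;

static void refill_pool(void)
{
        while (pool_count < DEMO_POOL_MAX) {
                void *b = malloc(DEMO_BUF_SIZE);

                if (!b)
                        break;
                pool[pool_count++] = b;
        }
}

static void *get_buf(int simulate_oom)
{
        void *b = simulate_oom ? NULL : malloc(DEMO_BUF_SIZE);

        if (!b && pool_count)           /* fall back to the reserve pool */
                b = pool[--pool_count];
        return b;
}

int main(void)
{
        void *a, *b;

        refill_pool();
        a = get_buf(0);
        b = get_buf(1);                 /* pretend the allocator failed */
        printf("normal alloc %s, fallback alloc %s, %d buffer(s) left in pool\n",
               a ? "ok" : "failed", b ? "ok" : "failed", pool_count);
        free(a);
        free(b);
        return 0;
}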
+ */ + if (netif_queue_stopped(np->dev)) { + np->dev->xmit_lock_owner = -1; + spin_unlock(&np->dev->xmit_lock); + + netpoll_poll(np); + goto repeat; + } + + status = np->dev->hard_start_xmit(skb, np->dev); + np->dev->xmit_lock_owner = -1; + spin_unlock(&np->dev->xmit_lock); + + /* transmit busy */ + if(status) { + netpoll_poll(np); + goto repeat; + } +} + +void netpoll_send_udp(struct netpoll *np, const char *msg, int len) +{ + int total_len, eth_len, ip_len, udp_len; + struct sk_buff *skb; + struct udphdr *udph; + struct iphdr *iph; + struct ethhdr *eth; + + udp_len = len + sizeof(*udph); + ip_len = eth_len = udp_len + sizeof(*iph); + total_len = eth_len + ETH_HLEN + NET_IP_ALIGN; + + skb = find_skb(np, total_len, total_len - len); + if (!skb) + return; + + memcpy(skb->data, msg, len); + skb->len += len; + + udph = (struct udphdr *) skb_push(skb, sizeof(*udph)); + udph->source = htons(np->local_port); + udph->dest = htons(np->remote_port); + udph->len = htons(udp_len); + udph->check = 0; + + iph = (struct iphdr *)skb_push(skb, sizeof(*iph)); + + /* iph->version = 4; iph->ihl = 5; */ + put_unaligned(0x45, (unsigned char *)iph); + iph->tos = 0; + put_unaligned(htons(ip_len), &(iph->tot_len)); + iph->id = 0; + iph->frag_off = 0; + iph->ttl = 64; + iph->protocol = IPPROTO_UDP; + iph->check = 0; + put_unaligned(htonl(np->local_ip), &(iph->saddr)); + put_unaligned(htonl(np->remote_ip), &(iph->daddr)); + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + + eth = (struct ethhdr *) skb_push(skb, ETH_HLEN); + + eth->h_proto = htons(ETH_P_IP); + memcpy(eth->h_source, np->local_mac, 6); + memcpy(eth->h_dest, np->remote_mac, 6); + + skb->dev = np->dev; + + netpoll_send_skb(np, skb); +} + +static void arp_reply(struct sk_buff *skb) +{ + struct arphdr *arp; + unsigned char *arp_ptr; + int size, type = ARPOP_REPLY, ptype = ETH_P_ARP; + u32 sip, tip; + struct sk_buff *send_skb; + struct netpoll *np = skb->dev->np; + + if (!np) return; + + /* No arp on this interface */ + if (skb->dev->flags & IFF_NOARP) + return; + + if (!pskb_may_pull(skb, (sizeof(struct arphdr) + + (2 * skb->dev->addr_len) + + (2 * sizeof(u32))))) + return; + + skb->h.raw = skb->nh.raw = skb->data; + arp = skb->nh.arph; + + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_op != htons(ARPOP_REQUEST)) + return; + + arp_ptr = (unsigned char *)(arp+1) + skb->dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4 + skb->dev->addr_len; + memcpy(&tip, arp_ptr, 4); + + /* Should we ignore arp? */ + if (tip != htonl(np->local_ip) || LOOPBACK(tip) || MULTICAST(tip)) + return; + + size = sizeof(struct arphdr) + 2 * (skb->dev->addr_len + 4); + send_skb = find_skb(np, size + LL_RESERVED_SPACE(np->dev), + LL_RESERVED_SPACE(np->dev)); + + if (!send_skb) + return; + + send_skb->nh.raw = send_skb->data; + arp = (struct arphdr *) skb_put(send_skb, size); + send_skb->dev = skb->dev; + send_skb->protocol = htons(ETH_P_ARP); + + /* Fill the device header for the ARP frame */ + + if (np->dev->hard_header && + np->dev->hard_header(send_skb, skb->dev, ptype, + np->remote_mac, np->local_mac, + send_skb->len) < 0) { + kfree_skb(send_skb); + return; + } + + /* + * Fill out the arp protocol part. + * + * we only support ethernet device type, + * which (according to RFC 1390) should always equal 1 (Ethernet). 
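netpoll_send_udp() above builds the frame inside out: the payload, then the 8-byte UDP header, then the 20-byte IPv4 header with its checksum, then the 14-byte Ethernet header. A standalone sketch of the length arithmetic and the one's-complement header checksum (RFC 1071); all field values are illustrative:

#include <stdint.h>
#include <stdio.h>

/* One's-complement sum over an even number of header bytes. */
static uint16_t ip_checksum(const uint8_t *hdr, unsigned int len)
{
        uint32_t sum = 0;
        unsigned int i;

        for (i = 0; i < len; i += 2)
                sum += (uint32_t)((hdr[i] << 8) | hdr[i + 1]);
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        int payload = 100;                      /* message bytes */
        int udp_len = payload + 8;              /* + UDP header */
        int ip_len = udp_len + 20;              /* + IPv4 header, no options */
        int frame_len = ip_len + 14;            /* + Ethernet header */

        uint8_t iph[20] = {
                0x45, 0x00,                     /* version 4, ihl 5, tos 0 */
                (uint8_t)(ip_len >> 8), (uint8_t)ip_len,
                0x00, 0x00, 0x00, 0x00,         /* id, frag_off */
                64, 17,                         /* ttl 64, protocol UDP */
                0x00, 0x00,                     /* checksum placeholder */
                10, 0, 0, 1,                    /* saddr 10.0.0.1 (made up) */
                10, 0, 0, 2,                    /* daddr 10.0.0.2 (made up) */
        };

        printf("udp=%d ip=%d frame=%d bytes, header checksum=0x%04x\n",
               udp_len, ip_len, frame_len, ip_checksum(iph, sizeof(iph)));
        return 0;
}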
+ */ + + arp->ar_hrd = htons(np->dev->type); + arp->ar_pro = htons(ETH_P_IP); + arp->ar_hln = np->dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr=(unsigned char *)(arp + 1); + memcpy(arp_ptr, np->dev->dev_addr, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &tip, 4); + arp_ptr += 4; + memcpy(arp_ptr, np->remote_mac, np->dev->addr_len); + arp_ptr += np->dev->addr_len; + memcpy(arp_ptr, &sip, 4); + + netpoll_send_skb(np, send_skb); +} + +int __netpoll_rx(struct sk_buff *skb) +{ + int proto, len, ulen; + struct iphdr *iph; + struct udphdr *uh; + struct netpoll *np = skb->dev->np; + + if (!np->rx_hook) + goto out; + if (skb->dev->type != ARPHRD_ETHER) + goto out; + + /* check if netpoll clients need ARP */ + if (skb->protocol == __constant_htons(ETH_P_ARP) && + atomic_read(&trapped)) { + arp_reply(skb); + return 1; + } + + proto = ntohs(eth_hdr(skb)->h_proto); + if (proto != ETH_P_IP) + goto out; + if (skb->pkt_type == PACKET_OTHERHOST) + goto out; + if (skb_shared(skb)) + goto out; + + iph = (struct iphdr *)skb->data; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + if (iph->ihl < 5 || iph->version != 4) + goto out; + if (!pskb_may_pull(skb, iph->ihl*4)) + goto out; + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto out; + + len = ntohs(iph->tot_len); + if (skb->len < len || len < iph->ihl*4) + goto out; + + if (iph->protocol != IPPROTO_UDP) + goto out; + + len -= iph->ihl*4; + uh = (struct udphdr *)(((char *)iph) + iph->ihl*4); + ulen = ntohs(uh->len); + + if (ulen != len) + goto out; + if (checksum_udp(skb, uh, ulen, iph->saddr, iph->daddr) < 0) + goto out; + if (np->local_ip && np->local_ip != ntohl(iph->daddr)) + goto out; + if (np->remote_ip && np->remote_ip != ntohl(iph->saddr)) + goto out; + if (np->local_port && np->local_port != ntohs(uh->dest)) + goto out; + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), + ulen - sizeof(struct udphdr)); + + kfree_skb(skb); + return 1; + +out: + if (atomic_read(&trapped)) { + kfree_skb(skb); + return 1; + } + + return 0; +} + +int netpoll_parse_options(struct netpoll *np, char *opt) +{ + char *cur=opt, *delim; + + if(*cur != '@') { + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim=0; + np->local_port=simple_strtol(cur, NULL, 10); + cur=delim; + } + cur++; + printk(KERN_INFO "%s: local port %d\n", np->name, np->local_port); + + if(*cur != '/') { + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim=0; + np->local_ip=ntohl(in_aton(cur)); + cur=delim; + + printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->local_ip)); + } + cur++; + + if ( *cur != ',') { + /* parse out dev name */ + if ((delim = strchr(cur, ',')) == NULL) + goto parse_failed; + *delim=0; + strlcpy(np->dev_name, cur, sizeof(np->dev_name)); + cur=delim; + } + cur++; + + printk(KERN_INFO "%s: interface %s\n", np->name, np->dev_name); + + if ( *cur != '@' ) { + /* dst port */ + if ((delim = strchr(cur, '@')) == NULL) + goto parse_failed; + *delim=0; + np->remote_port=simple_strtol(cur, NULL, 10); + cur=delim; + } + cur++; + printk(KERN_INFO "%s: remote port %d\n", np->name, np->remote_port); + + /* dst ip */ + if ((delim = strchr(cur, '/')) == NULL) + goto parse_failed; + *delim=0; + np->remote_ip=ntohl(in_aton(cur)); + cur=delim+1; + + printk(KERN_INFO "%s: remote IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->remote_ip)); + + if( *cur != 0 ) + { + /* MAC address */ + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + 
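netpoll_parse_options() above consumes a configuration string of the form local_port@local_ip/dev,remote_port@remote_ip/remote_mac by repeatedly locating the next delimiter, NUL-terminating the piece and converting it. A userspace sketch of the same strategy; the example string is made up:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        /* local_port@local_ip/dev,remote_port@remote_ip/remote_mac
         * (delimiter error handling omitted for brevity) */
        char opt[] = "6665@10.0.0.1/eth0,6666@10.0.0.2/00:11:22:33:44:55";
        char *cur = opt, *delim;
        long local_port, remote_port;

        delim = strchr(cur, '@'); *delim = 0;
        local_port = strtol(cur, NULL, 10);
        cur = delim + 1;

        delim = strchr(cur, '/'); *delim = 0;
        printf("local %ld@%s", local_port, cur);
        cur = delim + 1;

        delim = strchr(cur, ','); *delim = 0;
        printf(" dev %s", cur);
        cur = delim + 1;

        delim = strchr(cur, '@'); *delim = 0;
        remote_port = strtol(cur, NULL, 10);
        cur = delim + 1;

        delim = strchr(cur, '/'); *delim = 0;
        printf(" remote %ld@%s mac %s\n", remote_port, cur, delim + 1);
        return 0;
}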
np->remote_mac[0]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[1]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[2]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[3]=simple_strtol(cur, NULL, 16); + cur=delim+1; + if ((delim = strchr(cur, ':')) == NULL) + goto parse_failed; + *delim=0; + np->remote_mac[4]=simple_strtol(cur, NULL, 16); + cur=delim+1; + np->remote_mac[5]=simple_strtol(cur, NULL, 16); + } + + printk(KERN_INFO "%s: remote ethernet address " + "%02x:%02x:%02x:%02x:%02x:%02x\n", + np->name, + np->remote_mac[0], + np->remote_mac[1], + np->remote_mac[2], + np->remote_mac[3], + np->remote_mac[4], + np->remote_mac[5]); + + return 0; + + parse_failed: + printk(KERN_INFO "%s: couldn't parse config at %s!\n", + np->name, cur); + return -1; +} + +int netpoll_setup(struct netpoll *np) +{ + struct net_device *ndev = NULL; + struct in_device *in_dev; + + np->poll_lock = SPIN_LOCK_UNLOCKED; + np->poll_owner = -1; + + if (np->dev_name) + ndev = dev_get_by_name(np->dev_name); + if (!ndev) { + printk(KERN_ERR "%s: %s doesn't exist, aborting.\n", + np->name, np->dev_name); + return -1; + } + + np->dev = ndev; + ndev->np = np; + + if (!ndev->poll_controller) { + printk(KERN_ERR "%s: %s doesn't support polling, aborting.\n", + np->name, np->dev_name); + goto release; + } + + if (!netif_running(ndev)) { + unsigned long atmost, atleast; + + printk(KERN_INFO "%s: device %s not up yet, forcing it\n", + np->name, np->dev_name); + + rtnl_shlock(); + if (dev_change_flags(ndev, ndev->flags | IFF_UP) < 0) { + printk(KERN_ERR "%s: failed to open %s\n", + np->name, np->dev_name); + rtnl_shunlock(); + goto release; + } + rtnl_shunlock(); + + atleast = jiffies + HZ/10; + atmost = jiffies + 4*HZ; + while (!netif_carrier_ok(ndev)) { + if (time_after(jiffies, atmost)) { + printk(KERN_NOTICE + "%s: timeout waiting for carrier\n", + np->name); + break; + } + cond_resched(); + } + + /* If carrier appears to come up instantly, we don't + * trust it and pause so that we don't pump all our + * queued console messages into the bitbucket. 
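The block above fills np->remote_mac one byte at a time, cutting the string at each ':' and converting with base-16 simple_strtol(). The same loop as self-contained userspace C:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
        char in[] = "00:11:22:33:44:55";
        unsigned char mac[6];
        char *cur = in;
        int i;

        for (i = 0; i < 6; i++) {
                char *delim = strchr(cur, ':');

                if (delim)
                        *delim = 0;
                mac[i] = (unsigned char)strtol(cur, NULL, 16);
                if (delim)
                        cur = delim + 1;
        }
        printf("%02x:%02x:%02x:%02x:%02x:%02x\n",
               mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
        return 0;
}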
+ */ + + if (time_before(jiffies, atleast)) { + printk(KERN_NOTICE "%s: carrier detect appears" + " untrustworthy, waiting 4 seconds\n", + np->name); + msleep(4000); + } + } + + if (!memcmp(np->local_mac, "\0\0\0\0\0\0", 6) && ndev->dev_addr) + memcpy(np->local_mac, ndev->dev_addr, 6); + + if (!np->local_ip) { + rcu_read_lock(); + in_dev = __in_dev_get(ndev); + + if (!in_dev || !in_dev->ifa_list) { + rcu_read_unlock(); + printk(KERN_ERR "%s: no IP address for %s, aborting\n", + np->name, np->dev_name); + goto release; + } + + np->local_ip = ntohl(in_dev->ifa_list->ifa_local); + rcu_read_unlock(); + printk(KERN_INFO "%s: local IP %d.%d.%d.%d\n", + np->name, HIPQUAD(np->local_ip)); + } + + if(np->rx_hook) + np->rx_flags = NETPOLL_RX_ENABLED; + + return 0; + + release: + ndev->np = NULL; + np->dev = NULL; + dev_put(ndev); + return -1; +} + +void netpoll_cleanup(struct netpoll *np) +{ + if (np->dev) + np->dev->np = NULL; + dev_put(np->dev); + np->dev = NULL; +} + +int netpoll_trap(void) +{ + return atomic_read(&trapped); +} + +void netpoll_set_trap(int trap) +{ + if (trap) + atomic_inc(&trapped); + else + atomic_dec(&trapped); +} + +EXPORT_SYMBOL(netpoll_set_trap); +EXPORT_SYMBOL(netpoll_trap); +EXPORT_SYMBOL(netpoll_parse_options); +EXPORT_SYMBOL(netpoll_setup); +EXPORT_SYMBOL(netpoll_cleanup); +EXPORT_SYMBOL(netpoll_send_udp); +EXPORT_SYMBOL(netpoll_poll); +EXPORT_SYMBOL(netpoll_queue); diff --git a/net/core/pktgen.c b/net/core/pktgen.c new file mode 100644 index 000000000000..c57b06bc79f3 --- /dev/null +++ b/net/core/pktgen.c @@ -0,0 +1,3132 @@ +/* + * Authors: + * Copyright 2001, 2002 by Robert Olsson + * Uppsala University and + * Swedish University of Agricultural Sciences + * + * Alexey Kuznetsov + * Ben Greear + * Jens Låås + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * A tool for loading the network with preconfigurated packets. + * The tool is implemented as a linux module. Parameters are output + * device, delay (to hard_xmit), number of packets, and whether + * to use multiple SKBs or just the same one. + * pktgen uses the installed interface's output routine. + * + * Additional hacking by: + * + * Jens.Laas@data.slu.se + * Improved by ANK. 010120. + * Improved by ANK even more. 010212. + * MAC address typo fixed. 010417 --ro + * Integrated. 020301 --DaveM + * Added multiskb option 020301 --DaveM + * Scaling of results. 020417--sigurdur@linpro.no + * Significant re-work of the module: + * * Convert to threaded model to more efficiently be able to transmit + * and receive on multiple interfaces at once. + * * Converted many counters to __u64 to allow longer runs. + * * Allow configuration of ranges, like min/max IP address, MACs, + * and UDP-ports, for both source and destination, and can + * set to use a random distribution or sequentially walk the range. + * * Can now change most values after starting. + * * Place 12-byte packet in UDP payload with magic number, + * sequence number, and timestamp. + * * Add receiver code that detects dropped pkts, re-ordered pkts, and + * latencies (with micro-second) precision. + * * Add IOCTL interface to easily get counters & configuration. + * --Ben Greear + * + * Renamed multiskb to clone_skb and cleaned up sending core for two distinct + * skb modes. 
A clone_skb=0 mode for Ben's "ranges" work and a clone_skb != 0 + * mode as a "fastpath" with a configurable number of clones after each alloc. + * clone_skb=0 means all packets are allocated; this also means ranges, time + * stamps etc. can be used. clone_skb=100 means one allocation is followed by 100 + * clones. + * + * Also moved to /proc/net/pktgen/ + * --ro + * + * Sept 10: Fixed threading/locking. Lots of bone-headed and more clever + * mistakes. Also merged in DaveM's patch in the -pre6 patch. + * --Ben Greear + * + * Integrated to 2.5.x 021029 --Lucio Maciel (luciomaciel@zipmail.com.br) + * + * + * 021124 Finished major redesign and rewrite for new functionality. + * See Documentation/networking/pktgen.txt for how to use this. + * + * The new operation: + * For each CPU one thread/process is created at start. This process checks + * for running devices in the if_list and sends packets until count is 0; the + * thread also checks thread->control, which is used for inter-process + * communication. The controlling process "posts" operations to the threads this + * way. The if_lock should be possible to remove when add/rem_device is merged + * into this too. + * + * By design there should only be *one* "controlling" process. In practice + * multiple write accesses give unpredictable results. Each "write" + * to /proc returns a result code that should be read back by the "writer". + * For practical use this should be no problem. + * + * Note: when adding devices to a specific CPU it is a good idea to also assign + * /proc/irq/XX/smp_affinity so TX interrupts get bound to the same CPU. + * --ro + * + * Fixed refcount off-by-one if first packet fails, potential null deref, + * memleak 030710 --KJP + * + * First "ranges" functionality for ipv6 030726 --ro + * + * Included flow support. 030802 ANK. + * + * Fixed unaligned access on IA-64. Grant Grundler + * + * Remove if fix from added Harald Welte 040419 + * ia64 compilation fix from Aron Griffis 040604 + * + * New xmit() return, do_div and misc clean up by Stephen Hemminger + * 040923 + * + * Randy Dunlap fixed u64 printk compiler warning + * + * Removed FCS from BW calculation. Lennert Buytenhek + * New time handling.
Lennert Buytenhek 041213 + * + * Corrections from Nikolai Malykh (nmalykh@bilim.com) + * Removed unused flags F_SET_SRCMAC & F_SET_SRCIP 041230 + * + * interruptible_sleep_on_timeout() replaced Nishanth Aravamudan + * 050103 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* do_div */ +#include + + +#define VERSION "pktgen v2.61: Packet Generator for packet performance testing.\n" + +/* #define PG_DEBUG(a) a */ +#define PG_DEBUG(a) + +/* The buckets are exponential in 'width' */ +#define LAT_BUCKETS_MAX 32 +#define IP_NAME_SZ 32 + +/* Device flag bits */ +#define F_IPSRC_RND (1<<0) /* IP-Src Random */ +#define F_IPDST_RND (1<<1) /* IP-Dst Random */ +#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ +#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ +#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ +#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ +#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ +#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ + +/* Thread control flag bits */ +#define T_TERMINATE (1<<0) +#define T_STOP (1<<1) /* Stop run */ +#define T_RUN (1<<2) /* Start run */ +#define T_REMDEV (1<<3) /* Remove all devs */ + +/* Locks */ +#define thread_lock() spin_lock(&_thread_lock) +#define thread_unlock() spin_unlock(&_thread_lock) + +/* If lock -- can be removed after some work */ +#define if_lock(t) spin_lock(&(t->if_lock)); +#define if_unlock(t) spin_unlock(&(t->if_lock)); + +/* Used to help with determining the pkts on receive */ +#define PKTGEN_MAGIC 0xbe9be955 +#define PG_PROC_DIR "pktgen" + +#define MAX_CFLOWS 65536 + +struct flow_state +{ + __u32 cur_daddr; + int count; +}; + +struct pktgen_dev { + + /* + * Try to keep frequent/infrequent used vars. separated. + */ + + char ifname[32]; + struct proc_dir_entry *proc_ent; + char result[512]; + /* proc file names */ + char fname[80]; + + struct pktgen_thread* pg_thread; /* the owner */ + struct pktgen_dev *next; /* Used for chaining in the thread's run-queue */ + + int running; /* if this changes to false, the test will stop */ + + /* If min != max, then we will either do a linear iteration, or + * we will do a random selection from within the range. + */ + __u32 flags; + + int min_pkt_size; /* = ETH_ZLEN; */ + int max_pkt_size; /* = ETH_ZLEN; */ + int nfrags; + __u32 delay_us; /* Default delay */ + __u32 delay_ns; + __u64 count; /* Default No packets to send */ + __u64 sofar; /* How many pkts we've sent so far */ + __u64 tx_bytes; /* How many bytes we've transmitted */ + __u64 errors; /* Errors when trying to transmit, pkts will be re-sent */ + + /* runtime counters relating to clone_skb */ + __u64 next_tx_us; /* timestamp of when to tx next */ + __u32 next_tx_ns; + + __u64 allocated_skbs; + __u32 clone_count; + int last_ok; /* Was last skb sent? + * Or a failed transmit of some sort? This will keep + * sequence numbers in order, for example. + */ + __u64 started_at; /* micro-seconds */ + __u64 stopped_at; /* micro-seconds */ + __u64 idle_acc; /* micro-seconds */ + __u32 seq_num; + + int clone_skb; /* Use multiple SKBs during packet gen. 
If this number + * is greater than 1, then that many coppies of the same + * packet will be sent before a new packet is allocated. + * For instance, if you want to send 1024 identical packets + * before creating a new packet, set clone_skb to 1024. + */ + + char dst_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char dst_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_min[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + char src_max[IP_NAME_SZ]; /* IP, ie 1.2.3.4 */ + + struct in6_addr in6_saddr; + struct in6_addr in6_daddr; + struct in6_addr cur_in6_daddr; + struct in6_addr cur_in6_saddr; + /* For ranges */ + struct in6_addr min_in6_daddr; + struct in6_addr max_in6_daddr; + struct in6_addr min_in6_saddr; + struct in6_addr max_in6_saddr; + + /* If we're doing ranges, random or incremental, then this + * defines the min/max for those ranges. + */ + __u32 saddr_min; /* inclusive, source IP address */ + __u32 saddr_max; /* exclusive, source IP address */ + __u32 daddr_min; /* inclusive, dest IP address */ + __u32 daddr_max; /* exclusive, dest IP address */ + + __u16 udp_src_min; /* inclusive, source UDP port */ + __u16 udp_src_max; /* exclusive, source UDP port */ + __u16 udp_dst_min; /* inclusive, dest UDP port */ + __u16 udp_dst_max; /* exclusive, dest UDP port */ + + __u32 src_mac_count; /* How many MACs to iterate through */ + __u32 dst_mac_count; /* How many MACs to iterate through */ + + unsigned char dst_mac[6]; + unsigned char src_mac[6]; + + __u32 cur_dst_mac_offset; + __u32 cur_src_mac_offset; + __u32 cur_saddr; + __u32 cur_daddr; + __u16 cur_udp_dst; + __u16 cur_udp_src; + __u32 cur_pkt_size; + + __u8 hh[14]; + /* = { + 0x00, 0x80, 0xC8, 0x79, 0xB3, 0xCB, + + We fill in SRC address later + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x08, 0x00 + }; + */ + __u16 pad; /* pad out the hh struct to an even 16 bytes */ + + struct sk_buff* skb; /* skb we are to transmit next, mainly used for when we + * are transmitting the same one multiple times + */ + struct net_device* odev; /* The out-going device. Note that the device should + * have it's pg_info pointer pointing back to this + * device. This will be set when the user specifies + * the out-going device name (not when the inject is + * started as it used to do.) + */ + struct flow_state *flows; + unsigned cflows; /* Concurrent flows (config) */ + unsigned lflow; /* Flow length (config) */ + unsigned nflows; /* accumulated flows (stats) */ +}; + +struct pktgen_hdr { + __u32 pgh_magic; + __u32 seq_num; + __u32 tv_sec; + __u32 tv_usec; +}; + +struct pktgen_thread { + spinlock_t if_lock; + struct pktgen_dev *if_list; /* All device here */ + struct pktgen_thread* next; + char name[32]; + char fname[128]; /* name of proc file */ + struct proc_dir_entry *proc_ent; + char result[512]; + u32 max_before_softirq; /* We'll call do_softirq to prevent starvation. */ + + /* Field for thread to receive "posted" events terminate, stop ifs etc.*/ + + u32 control; + int pid; + int cpu; + + wait_queue_head_t queue; +}; + +#define REMOVE 1 +#define FIND 0 + +/* This code works around the fact that do_div cannot handle two 64-bit + numbers, and regular 64-bit division doesn't work on x86 kernels. + --Ben +*/ + +#define PG_DIV 0 + +/* This was emailed to LMKL by: Chris Caputo + * Function copied/adapted/optimized from: + * + * nemesis.sourceforge.net/browse/lib/static/intmath/ix86/intmath.c.html + * + * Copyright 1994, University of Cambridge Computer Laboratory + * All Rights Reserved. + * + */ +inline static s64 divremdi3(s64 x, s64 y, int type) +{ + u64 a = (x < 0) ? 
-x : x; + u64 b = (y < 0) ? -y : y; + u64 res = 0, d = 1; + + if (b > 0) { + while (b < a) { + b <<= 1; + d <<= 1; + } + } + + do { + if ( a >= b ) { + a -= b; + res += d; + } + b >>= 1; + d >>= 1; + } + while (d); + + if (PG_DIV == type) { + return (((x ^ y) & (1ll<<63)) == 0) ? res : -(s64)res; + } + else { + return ((x & (1ll<<63)) == 0) ? a : -(s64)a; + } +} + +/* End of hacks to deal with 64-bit math on x86 */ + +/** Convert to miliseconds */ +static inline __u64 tv_to_ms(const struct timeval* tv) +{ + __u64 ms = tv->tv_usec / 1000; + ms += (__u64)tv->tv_sec * (__u64)1000; + return ms; +} + + +/** Convert to micro-seconds */ +static inline __u64 tv_to_us(const struct timeval* tv) +{ + __u64 us = tv->tv_usec; + us += (__u64)tv->tv_sec * (__u64)1000000; + return us; +} + +static inline __u64 pg_div(__u64 n, __u32 base) { + __u64 tmp = n; + do_div(tmp, base); + /* printk("pktgen: pg_div, n: %llu base: %d rv: %llu\n", + n, base, tmp); */ + return tmp; +} + +static inline __u64 pg_div64(__u64 n, __u64 base) +{ + __u64 tmp = n; +/* + * How do we know if the architectrure we are running on + * supports division with 64 bit base? + * + */ +#if defined(__sparc_v9__) || defined(__powerpc64__) || defined(__alpha__) || defined(__x86_64__) || defined(__ia64__) + + do_div(tmp, base); +#else + tmp = divremdi3(n, base, PG_DIV); +#endif + return tmp; +} + +static inline u32 pktgen_random(void) +{ +#if 0 + __u32 n; + get_random_bytes(&n, 4); + return n; +#else + return net_random(); +#endif +} + +static inline __u64 getCurMs(void) +{ + struct timeval tv; + do_gettimeofday(&tv); + return tv_to_ms(&tv); +} + +static inline __u64 getCurUs(void) +{ + struct timeval tv; + do_gettimeofday(&tv); + return tv_to_us(&tv); +} + +static inline __u64 tv_diff(const struct timeval* a, const struct timeval* b) +{ + return tv_to_us(a) - tv_to_us(b); +} + + +/* old include end */ + +static char version[] __initdata = VERSION; + +static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, size_t count, loff_t *ppos); +static ssize_t proc_pgctrl_write(struct file* file, const char __user * buf, size_t count, loff_t *ppos); +static int proc_if_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); + +static int proc_thread_read(char *buf , char **start, off_t offset, int len, int *eof, void *data); +static int proc_if_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); +static int proc_thread_write(struct file *file, const char __user *user_buffer, unsigned long count, void *data); +static int create_proc_dir(void); +static int remove_proc_dir(void); + +static int pktgen_remove_device(struct pktgen_thread* t, struct pktgen_dev *i); +static int pktgen_add_device(struct pktgen_thread* t, const char* ifname); +static struct pktgen_thread* pktgen_find_thread(const char* name); +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread* t, const char* ifname); +static int pktgen_device_event(struct notifier_block *, unsigned long, void *); +static void pktgen_run_all_threads(void); +static void pktgen_stop_all_threads_ifs(void); +static int pktgen_stop_device(struct pktgen_dev *pkt_dev); +static void pktgen_stop(struct pktgen_thread* t); +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev); +static struct pktgen_dev *pktgen_NN_threads(const char* dev_name, int remove); +static unsigned int scan_ip6(const char *s,char ip[16]); +static unsigned int fmt_ip6(char *s,const char ip[16]); + +/* Module parameters, defaults. 
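divremdi3() above does 64-bit division by shifting the divisor up until it covers the dividend and then subtracting it back out at decreasing weights, for configurations where do_div() cannot take a 64-bit base. A standalone unsigned version of the same shift-and-subtract idea, checked against the compiler's native 64-bit division; the explicit top-bit overflow guard is an addition of this sketch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t div_shift_subtract(uint64_t a, uint64_t b)
{
        uint64_t res = 0, d = 1;

        /* b must be non-zero.  Scale the divisor up towards the dividend. */
        while (b < a && !(b & (1ull << 63))) {
                b <<= 1;
                d <<= 1;
        }
        /* Subtract it back out at decreasing weights. */
        do {
                if (a >= b) {
                        a -= b;
                        res += d;
                }
                b >>= 1;
                d >>= 1;
        } while (d);

        return res;
}

int main(void)
{
        assert(div_shift_subtract(1000000007ull, 13) == 1000000007ull / 13);
        assert(div_shift_subtract(42, 42) == 1);
        assert(div_shift_subtract(5, 7) == 0);
        printf("shift-and-subtract division matches native division\n");
        return 0;
}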
*/ +static int pg_count_d = 1000; /* 1000 pkts by default */ +static int pg_delay_d = 0; +static int pg_clone_skb_d = 0; +static int debug = 0; + +static spinlock_t _thread_lock = SPIN_LOCK_UNLOCKED; +static struct pktgen_thread *pktgen_threads = NULL; + +static char module_fname[128]; +static struct proc_dir_entry *module_proc_ent = NULL; + +static struct notifier_block pktgen_notifier_block = { + .notifier_call = pktgen_device_event, +}; + +static struct file_operations pktgen_fops = { + .read = proc_pgctrl_read, + .write = proc_pgctrl_write, + /* .ioctl = pktgen_ioctl, later maybe */ +}; + +/* + * /proc handling functions + * + */ + +static struct proc_dir_entry *pg_proc_dir = NULL; +static int proc_pgctrl_read_eof=0; + +static ssize_t proc_pgctrl_read(struct file* file, char __user * buf, + size_t count, loff_t *ppos) +{ + char data[200]; + int len = 0; + + if(proc_pgctrl_read_eof) { + proc_pgctrl_read_eof=0; + len = 0; + goto out; + } + + sprintf(data, "%s", VERSION); + + len = strlen(data); + + if(len > count) { + len =-EFAULT; + goto out; + } + + if (copy_to_user(buf, data, len)) { + len =-EFAULT; + goto out; + } + + *ppos += len; + proc_pgctrl_read_eof=1; /* EOF next call */ + + out: + return len; +} + +static ssize_t proc_pgctrl_write(struct file* file,const char __user * buf, + size_t count, loff_t *ppos) +{ + char *data = NULL; + int err = 0; + + if (!capable(CAP_NET_ADMIN)){ + err = -EPERM; + goto out; + } + + data = (void*)vmalloc ((unsigned int)count); + + if(!data) { + err = -ENOMEM; + goto out; + } + if (copy_from_user(data, buf, count)) { + err =-EFAULT; + goto out_free; + } + data[count-1] = 0; /* Make string */ + + if (!strcmp(data, "stop")) + pktgen_stop_all_threads_ifs(); + + else if (!strcmp(data, "start")) + pktgen_run_all_threads(); + + else + printk("pktgen: Unknown command: %s\n", data); + + err = count; + + out_free: + vfree (data); + out: + return err; +} + +static int proc_if_read(char *buf , char **start, off_t offset, + int len, int *eof, void *data) +{ + char *p; + int i; + struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); + __u64 sa; + __u64 stopped; + __u64 now = getCurUs(); + + p = buf; + p += sprintf(p, "Params: count %llu min_pkt_size: %u max_pkt_size: %u\n", + (unsigned long long) pkt_dev->count, + pkt_dev->min_pkt_size, pkt_dev->max_pkt_size); + + p += sprintf(p, " frags: %d delay: %u clone_skb: %d ifname: %s\n", + pkt_dev->nfrags, 1000*pkt_dev->delay_us+pkt_dev->delay_ns, pkt_dev->clone_skb, pkt_dev->ifname); + + p += sprintf(p, " flows: %u flowlen: %u\n", pkt_dev->cflows, pkt_dev->lflow); + + + if(pkt_dev->flags & F_IPV6) { + char b1[128], b2[128], b3[128]; + fmt_ip6(b1, pkt_dev->in6_saddr.s6_addr); + fmt_ip6(b2, pkt_dev->min_in6_saddr.s6_addr); + fmt_ip6(b3, pkt_dev->max_in6_saddr.s6_addr); + p += sprintf(p, " saddr: %s min_saddr: %s max_saddr: %s\n", b1, b2, b3); + + fmt_ip6(b1, pkt_dev->in6_daddr.s6_addr); + fmt_ip6(b2, pkt_dev->min_in6_daddr.s6_addr); + fmt_ip6(b3, pkt_dev->max_in6_daddr.s6_addr); + p += sprintf(p, " daddr: %s min_daddr: %s max_daddr: %s\n", b1, b2, b3); + + } + else + p += sprintf(p, " dst_min: %s dst_max: %s\n src_min: %s src_max: %s\n", + pkt_dev->dst_min, pkt_dev->dst_max, pkt_dev->src_min, pkt_dev->src_max); + + p += sprintf(p, " src_mac: "); + + if ((pkt_dev->src_mac[0] == 0) && + (pkt_dev->src_mac[1] == 0) && + (pkt_dev->src_mac[2] == 0) && + (pkt_dev->src_mac[3] == 0) && + (pkt_dev->src_mac[4] == 0) && + (pkt_dev->src_mac[5] == 0)) + + for (i = 0; i < 6; i++) + p += sprintf(p, "%02X%s", pkt_dev->odev->dev_addr[i], 
i == 5 ? " " : ":"); + + else + for (i = 0; i < 6; i++) + p += sprintf(p, "%02X%s", pkt_dev->src_mac[i], i == 5 ? " " : ":"); + + p += sprintf(p, "dst_mac: "); + for (i = 0; i < 6; i++) + p += sprintf(p, "%02X%s", pkt_dev->dst_mac[i], i == 5 ? "\n" : ":"); + + p += sprintf(p, " udp_src_min: %d udp_src_max: %d udp_dst_min: %d udp_dst_max: %d\n", + pkt_dev->udp_src_min, pkt_dev->udp_src_max, pkt_dev->udp_dst_min, + pkt_dev->udp_dst_max); + + p += sprintf(p, " src_mac_count: %d dst_mac_count: %d \n Flags: ", + pkt_dev->src_mac_count, pkt_dev->dst_mac_count); + + + if (pkt_dev->flags & F_IPV6) + p += sprintf(p, "IPV6 "); + + if (pkt_dev->flags & F_IPSRC_RND) + p += sprintf(p, "IPSRC_RND "); + + if (pkt_dev->flags & F_IPDST_RND) + p += sprintf(p, "IPDST_RND "); + + if (pkt_dev->flags & F_TXSIZE_RND) + p += sprintf(p, "TXSIZE_RND "); + + if (pkt_dev->flags & F_UDPSRC_RND) + p += sprintf(p, "UDPSRC_RND "); + + if (pkt_dev->flags & F_UDPDST_RND) + p += sprintf(p, "UDPDST_RND "); + + if (pkt_dev->flags & F_MACSRC_RND) + p += sprintf(p, "MACSRC_RND "); + + if (pkt_dev->flags & F_MACDST_RND) + p += sprintf(p, "MACDST_RND "); + + + p += sprintf(p, "\n"); + + sa = pkt_dev->started_at; + stopped = pkt_dev->stopped_at; + if (pkt_dev->running) + stopped = now; /* not really stopped, more like last-running-at */ + + p += sprintf(p, "Current:\n pkts-sofar: %llu errors: %llu\n started: %lluus stopped: %lluus idle: %lluus\n", + (unsigned long long) pkt_dev->sofar, + (unsigned long long) pkt_dev->errors, + (unsigned long long) sa, + (unsigned long long) stopped, + (unsigned long long) pkt_dev->idle_acc); + + p += sprintf(p, " seq_num: %d cur_dst_mac_offset: %d cur_src_mac_offset: %d\n", + pkt_dev->seq_num, pkt_dev->cur_dst_mac_offset, pkt_dev->cur_src_mac_offset); + + if(pkt_dev->flags & F_IPV6) { + char b1[128], b2[128]; + fmt_ip6(b1, pkt_dev->cur_in6_daddr.s6_addr); + fmt_ip6(b2, pkt_dev->cur_in6_saddr.s6_addr); + p += sprintf(p, " cur_saddr: %s cur_daddr: %s\n", b2, b1); + } + else + p += sprintf(p, " cur_saddr: 0x%x cur_daddr: 0x%x\n", + pkt_dev->cur_saddr, pkt_dev->cur_daddr); + + + p += sprintf(p, " cur_udp_dst: %d cur_udp_src: %d\n", + pkt_dev->cur_udp_dst, pkt_dev->cur_udp_src); + + p += sprintf(p, " flows: %u\n", pkt_dev->nflows); + + if (pkt_dev->result[0]) + p += sprintf(p, "Result: %s\n", pkt_dev->result); + else + p += sprintf(p, "Result: Idle\n"); + *eof = 1; + + return p - buf; +} + + +static int count_trail_chars(const char __user *user_buffer, unsigned int maxlen) +{ + int i; + + for (i = 0; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + case '=': + break; + default: + goto done; + }; + } +done: + return i; +} + +static unsigned long num_arg(const char __user *user_buffer, unsigned long maxlen, + unsigned long *num) +{ + int i = 0; + *num = 0; + + for(; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + if ((c >= '0') && (c <= '9')) { + *num *= 10; + *num += c -'0'; + } else + break; + } + return i; +} + +static int strn_len(const char __user *user_buffer, unsigned int maxlen) +{ + int i = 0; + + for(; i < maxlen; i++) { + char c; + if (get_user(c, &user_buffer[i])) + return -EFAULT; + switch (c) { + case '\"': + case '\n': + case '\r': + case '\t': + case ' ': + goto done_str; + break; + default: + break; + }; + } +done_str: + + return i; +} + +static int proc_if_write(struct file *file, const char __user *user_buffer, + unsigned long count, 
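proc_if_write() above tokenizes each command with three helpers: count_trail_chars() skips separator characters, strn_len() measures the parameter name, and num_arg() reads a decimal argument. A compressed userspace rendering of that tokenizer; the input string is illustrative:

#include <ctype.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *in = "  pkt_size 300\n";
        char name[16];
        unsigned long value = 0;
        int i = 0, n = 0;

        while (in[i] && strchr(" \t\r\n\"=", in[i]))            /* count_trail_chars() */
                i++;
        while (in[i] && !strchr(" \t\r\n\"", in[i]) && n < 15)  /* strn_len() */
                name[n++] = in[i++];
        name[n] = 0;
        while (in[i] && strchr(" \t\r\n\"=", in[i]))            /* separators again */
                i++;
        while (isdigit((unsigned char)in[i]))                   /* num_arg() */
                value = value * 10 + (unsigned long)(in[i++] - '0');

        printf("name=%s value=%lu\n", name, value);
        return 0;
}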
void *data) +{ + int i = 0, max, len; + char name[16], valstr[32]; + unsigned long value = 0; + struct pktgen_dev *pkt_dev = (struct pktgen_dev*)(data); + char* pg_result = NULL; + int tmp = 0; + char buf[128]; + + pg_result = &(pkt_dev->result[0]); + + if (count < 1) { + printk("pktgen: wrong command format\n"); + return -EINVAL; + } + + max = count - i; + tmp = count_trail_chars(&user_buffer[i], max); + if (tmp < 0) { + printk("pktgen: illegal format\n"); + return tmp; + } + i += tmp; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) { return len; } + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len) ) + return -EFAULT; + i += len; + + max = count -i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) { + char tb[count + 1]; + if (copy_from_user(tb, user_buffer, count)) + return -EFAULT; + tb[count] = 0; + printk("pktgen: %s,%lu buffer -:%s:-\n", name, count, tb); + } + + if (!strcmp(name, "min_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value < 14+20+8) + value = 14+20+8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: min_pkt_size=%u", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "max_pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value < 14+20+8) + value = 14+20+8; + if (value != pkt_dev->max_pkt_size) { + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: max_pkt_size=%u", pkt_dev->max_pkt_size); + return count; + } + + /* Shortcut for min = max */ + + if (!strcmp(name, "pkt_size")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value < 14+20+8) + value = 14+20+8; + if (value != pkt_dev->min_pkt_size) { + pkt_dev->min_pkt_size = value; + pkt_dev->max_pkt_size = value; + pkt_dev->cur_pkt_size = value; + } + sprintf(pg_result, "OK: pkt_size=%u", pkt_dev->min_pkt_size); + return count; + } + + if (!strcmp(name, "debug")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + debug = value; + sprintf(pg_result, "OK: debug=%u", debug); + return count; + } + + if (!strcmp(name, "frags")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->nfrags = value; + sprintf(pg_result, "OK: frags=%u", pkt_dev->nfrags); + return count; + } + if (!strcmp(name, "delay")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value == 0x7FFFFFFF) { + pkt_dev->delay_us = 0x7FFFFFFF; + pkt_dev->delay_ns = 0; + } else { + pkt_dev->delay_us = value / 1000; + pkt_dev->delay_ns = value % 1000; + } + sprintf(pg_result, "OK: delay=%u", 1000*pkt_dev->delay_us+pkt_dev->delay_ns); + return count; + } + if (!strcmp(name, "udp_src_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_src_min) { + pkt_dev->udp_src_min = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_min=%u", pkt_dev->udp_src_min); + return count; + } + if (!strcmp(name, "udp_dst_min")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_dst_min) { + pkt_dev->udp_dst_min = value; + pkt_dev->cur_udp_dst = 
value; + } + sprintf(pg_result, "OK: udp_dst_min=%u", pkt_dev->udp_dst_min); + return count; + } + if (!strcmp(name, "udp_src_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_src_max) { + pkt_dev->udp_src_max = value; + pkt_dev->cur_udp_src = value; + } + sprintf(pg_result, "OK: udp_src_max=%u", pkt_dev->udp_src_max); + return count; + } + if (!strcmp(name, "udp_dst_max")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value != pkt_dev->udp_dst_max) { + pkt_dev->udp_dst_max = value; + pkt_dev->cur_udp_dst = value; + } + sprintf(pg_result, "OK: udp_dst_max=%u", pkt_dev->udp_dst_max); + return count; + } + if (!strcmp(name, "clone_skb")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->clone_skb = value; + + sprintf(pg_result, "OK: clone_skb=%d", pkt_dev->clone_skb); + return count; + } + if (!strcmp(name, "count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->count = value; + sprintf(pg_result, "OK: count=%llu", + (unsigned long long) pkt_dev->count); + return count; + } + if (!strcmp(name, "src_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (pkt_dev->src_mac_count != value) { + pkt_dev->src_mac_count = value; + pkt_dev->cur_src_mac_offset = 0; + } + sprintf(pg_result, "OK: src_mac_count=%d", pkt_dev->src_mac_count); + return count; + } + if (!strcmp(name, "dst_mac_count")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (pkt_dev->dst_mac_count != value) { + pkt_dev->dst_mac_count = value; + pkt_dev->cur_dst_mac_offset = 0; + } + sprintf(pg_result, "OK: dst_mac_count=%d", pkt_dev->dst_mac_count); + return count; + } + if (!strcmp(name, "flag")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { return len; } + if (copy_from_user(f, &user_buffer[i], len)) + return -EFAULT; + i += len; + if (strcmp(f, "IPSRC_RND") == 0) + pkt_dev->flags |= F_IPSRC_RND; + + else if (strcmp(f, "!IPSRC_RND") == 0) + pkt_dev->flags &= ~F_IPSRC_RND; + + else if (strcmp(f, "TXSIZE_RND") == 0) + pkt_dev->flags |= F_TXSIZE_RND; + + else if (strcmp(f, "!TXSIZE_RND") == 0) + pkt_dev->flags &= ~F_TXSIZE_RND; + + else if (strcmp(f, "IPDST_RND") == 0) + pkt_dev->flags |= F_IPDST_RND; + + else if (strcmp(f, "!IPDST_RND") == 0) + pkt_dev->flags &= ~F_IPDST_RND; + + else if (strcmp(f, "UDPSRC_RND") == 0) + pkt_dev->flags |= F_UDPSRC_RND; + + else if (strcmp(f, "!UDPSRC_RND") == 0) + pkt_dev->flags &= ~F_UDPSRC_RND; + + else if (strcmp(f, "UDPDST_RND") == 0) + pkt_dev->flags |= F_UDPDST_RND; + + else if (strcmp(f, "!UDPDST_RND") == 0) + pkt_dev->flags &= ~F_UDPDST_RND; + + else if (strcmp(f, "MACSRC_RND") == 0) + pkt_dev->flags |= F_MACSRC_RND; + + else if (strcmp(f, "!MACSRC_RND") == 0) + pkt_dev->flags &= ~F_MACSRC_RND; + + else if (strcmp(f, "MACDST_RND") == 0) + pkt_dev->flags |= F_MACDST_RND; + + else if (strcmp(f, "!MACDST_RND") == 0) + pkt_dev->flags &= ~F_MACDST_RND; + + else { + sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! 
to un-set flag):\n%s", + f, + "IPSRC_RND, IPDST_RND, TXSIZE_RND, UDPSRC_RND, UDPDST_RND, MACSRC_RND, MACDST_RND\n"); + return count; + } + sprintf(pg_result, "OK: flags=0x%x", pkt_dev->flags); + return count; + } + if (!strcmp(name, "dst_min") || !strcmp(name, "dst")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_min) - 1); + if (len < 0) { return len; } + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_min) != 0) { + memset(pkt_dev->dst_min, 0, sizeof(pkt_dev->dst_min)); + strncpy(pkt_dev->dst_min, buf, len); + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->cur_daddr = pkt_dev->daddr_min; + } + if(debug) + printk("pktgen: dst_min set to: %s\n", pkt_dev->dst_min); + i += len; + sprintf(pg_result, "OK: dst_min=%s", pkt_dev->dst_min); + return count; + } + if (!strcmp(name, "dst_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->dst_max) - 1); + if (len < 0) { return len; } + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + + buf[len] = 0; + if (strcmp(buf, pkt_dev->dst_max) != 0) { + memset(pkt_dev->dst_max, 0, sizeof(pkt_dev->dst_max)); + strncpy(pkt_dev->dst_max, buf, len); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + pkt_dev->cur_daddr = pkt_dev->daddr_max; + } + if(debug) + printk("pktgen: dst_max set to: %s\n", pkt_dev->dst_max); + i += len; + sprintf(pg_result, "OK: dst_max=%s", pkt_dev->dst_max); + return count; + } + if (!strcmp(name, "dst6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_daddr.s6_addr); + fmt_ip6(buf, pkt_dev->in6_daddr.s6_addr); + + ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr); + + if(debug) + printk("pktgen: dst6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6=%s", buf); + return count; + } + if (!strcmp(name, "dst6_min")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + fmt_ip6(buf, pkt_dev->min_in6_daddr.s6_addr); + + ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->min_in6_daddr); + if(debug) + printk("pktgen: dst6_min set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_min=%s", buf); + return count; + } + if (!strcmp(name, "dst6_max")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + fmt_ip6(buf, pkt_dev->max_in6_daddr.s6_addr); + + if(debug) + printk("pktgen: dst6_max set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: dst6_max=%s", buf); + return count; + } + if (!strcmp(name, "src6")) { + len = strn_len(&user_buffer[i], sizeof(buf) - 1); + if (len < 0) return len; + + pkt_dev->flags |= F_IPV6; + + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + + scan_ip6(buf, pkt_dev->in6_saddr.s6_addr); + fmt_ip6(buf, pkt_dev->in6_saddr.s6_addr); + + ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr); + + if(debug) + printk("pktgen: src6 set to: %s\n", buf); + + i += len; + sprintf(pg_result, "OK: src6=%s", buf); + return count; + } + if (!strcmp(name, "src_min")) { + len = 
strn_len(&user_buffer[i], sizeof(pkt_dev->src_min) - 1); + if (len < 0) { return len; } + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_min) != 0) { + memset(pkt_dev->src_min, 0, sizeof(pkt_dev->src_min)); + strncpy(pkt_dev->src_min, buf, len); + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->cur_saddr = pkt_dev->saddr_min; + } + if(debug) + printk("pktgen: src_min set to: %s\n", pkt_dev->src_min); + i += len; + sprintf(pg_result, "OK: src_min=%s", pkt_dev->src_min); + return count; + } + if (!strcmp(name, "src_max")) { + len = strn_len(&user_buffer[i], sizeof(pkt_dev->src_max) - 1); + if (len < 0) { return len; } + if (copy_from_user(buf, &user_buffer[i], len)) + return -EFAULT; + buf[len] = 0; + if (strcmp(buf, pkt_dev->src_max) != 0) { + memset(pkt_dev->src_max, 0, sizeof(pkt_dev->src_max)); + strncpy(pkt_dev->src_max, buf, len); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + pkt_dev->cur_saddr = pkt_dev->saddr_max; + } + if(debug) + printk("pktgen: src_max set to: %s\n", pkt_dev->src_max); + i += len; + sprintf(pg_result, "OK: src_max=%s", pkt_dev->src_max); + return count; + } + if (!strcmp(name, "dst_mac")) { + char *v = valstr; + unsigned char old_dmac[6]; + unsigned char *m = pkt_dev->dst_mac; + memcpy(old_dmac, pkt_dev->dst_mac, 6); + + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) { return len; } + memset(valstr, 0, sizeof(valstr)); + if( copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + i += len; + + for(*m = 0;*v && m < pkt_dev->dst_mac + 6; v++) { + if (*v >= '0' && *v <= '9') { + *m *= 16; + *m += *v - '0'; + } + if (*v >= 'A' && *v <= 'F') { + *m *= 16; + *m += *v - 'A' + 10; + } + if (*v >= 'a' && *v <= 'f') { + *m *= 16; + *m += *v - 'a' + 10; + } + if (*v == ':') { + m++; + *m = 0; + } + } + + /* Set up Dest MAC */ + if (memcmp(old_dmac, pkt_dev->dst_mac, 6) != 0) + memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6); + + sprintf(pg_result, "OK: dstmac"); + return count; + } + if (!strcmp(name, "src_mac")) { + char *v = valstr; + unsigned char *m = pkt_dev->src_mac; + + len = strn_len(&user_buffer[i], sizeof(valstr) - 1); + if (len < 0) { return len; } + memset(valstr, 0, sizeof(valstr)); + if( copy_from_user(valstr, &user_buffer[i], len)) + return -EFAULT; + i += len; + + for(*m = 0;*v && m < pkt_dev->src_mac + 6; v++) { + if (*v >= '0' && *v <= '9') { + *m *= 16; + *m += *v - '0'; + } + if (*v >= 'A' && *v <= 'F') { + *m *= 16; + *m += *v - 'A' + 10; + } + if (*v >= 'a' && *v <= 'f') { + *m *= 16; + *m += *v - 'a' + 10; + } + if (*v == ':') { + m++; + *m = 0; + } + } + + sprintf(pg_result, "OK: srcmac"); + return count; + } + + if (!strcmp(name, "clear_counters")) { + pktgen_clear_counters(pkt_dev); + sprintf(pg_result, "OK: Clearing counters.\n"); + return count; + } + + if (!strcmp(name, "flows")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + if (value > MAX_CFLOWS) + value = MAX_CFLOWS; + + pkt_dev->cflows = value; + sprintf(pg_result, "OK: flows=%u", pkt_dev->cflows); + return count; + } + + if (!strcmp(name, "flowlen")) { + len = num_arg(&user_buffer[i], 10, &value); + if (len < 0) { return len; } + i += len; + pkt_dev->lflow = value; + sprintf(pg_result, "OK: flowlen=%u", pkt_dev->lflow); + return count; + } + + sprintf(pkt_dev->result, "No such parameter \"%s\"", name); + return -EINVAL; +} + +static int proc_thread_read(char *buf , char **start, off_t offset, + int len, int *eof, void *data) +{ + char 
*p; + struct pktgen_thread *t = (struct pktgen_thread*)(data); + struct pktgen_dev *pkt_dev = NULL; + + + if (!t) { + printk("pktgen: ERROR: could not find thread in proc_thread_read\n"); + return -EINVAL; + } + + p = buf; + p += sprintf(p, "Name: %s max_before_softirq: %d\n", + t->name, t->max_before_softirq); + + p += sprintf(p, "Running: "); + + if_lock(t); + for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next) + if(pkt_dev->running) + p += sprintf(p, "%s ", pkt_dev->ifname); + + p += sprintf(p, "\nStopped: "); + + for(pkt_dev = t->if_list;pkt_dev; pkt_dev = pkt_dev->next) + if(!pkt_dev->running) + p += sprintf(p, "%s ", pkt_dev->ifname); + + if (t->result[0]) + p += sprintf(p, "\nResult: %s\n", t->result); + else + p += sprintf(p, "\nResult: NA\n"); + + *eof = 1; + + if_unlock(t); + + return p - buf; +} + +static int proc_thread_write(struct file *file, const char __user *user_buffer, + unsigned long count, void *data) +{ + int i = 0, max, len, ret; + char name[40]; + struct pktgen_thread *t; + char *pg_result; + unsigned long value = 0; + + if (count < 1) { + // sprintf(pg_result, "Wrong command format"); + return -EINVAL; + } + + max = count - i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + /* Read variable name */ + + len = strn_len(&user_buffer[i], sizeof(name) - 1); + if (len < 0) + return len; + + memset(name, 0, sizeof(name)); + if (copy_from_user(name, &user_buffer[i], len)) + return -EFAULT; + i += len; + + max = count -i; + len = count_trail_chars(&user_buffer[i], max); + if (len < 0) + return len; + + i += len; + + if (debug) + printk("pktgen: t=%s, count=%lu\n", name, count); + + + t = (struct pktgen_thread*)(data); + if(!t) { + printk("pktgen: ERROR: No thread\n"); + ret = -EINVAL; + goto out; + } + + pg_result = &(t->result[0]); + + if (!strcmp(name, "add_device")) { + char f[32]; + memset(f, 0, 32); + len = strn_len(&user_buffer[i], sizeof(f) - 1); + if (len < 0) { + ret = len; + goto out; + } + if( copy_from_user(f, &user_buffer[i], len) ) + return -EFAULT; + i += len; + thread_lock(); + pktgen_add_device(t, f); + thread_unlock(); + ret = count; + sprintf(pg_result, "OK: add_device=%s", f); + goto out; + } + + if (!strcmp(name, "rem_device_all")) { + thread_lock(); + t->control |= T_REMDEV; + thread_unlock(); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/8); /* Propagate thread->control */ + ret = count; + sprintf(pg_result, "OK: rem_device_all"); + goto out; + } + + if (!strcmp(name, "max_before_softirq")) { + len = num_arg(&user_buffer[i], 10, &value); + thread_lock(); + t->max_before_softirq = value; + thread_unlock(); + ret = count; + sprintf(pg_result, "OK: max_before_softirq=%lu", value); + goto out; + } + + ret = -EINVAL; + out: + + return ret; +} + +static int create_proc_dir(void) +{ + int len; + /* does proc_dir already exists */ + len = strlen(PG_PROC_DIR); + + for (pg_proc_dir = proc_net->subdir; pg_proc_dir; pg_proc_dir=pg_proc_dir->next) { + if ((pg_proc_dir->namelen == len) && + (! 
memcmp(pg_proc_dir->name, PG_PROC_DIR, len))) + break; + } + + if (!pg_proc_dir) + pg_proc_dir = create_proc_entry(PG_PROC_DIR, S_IFDIR, proc_net); + + if (!pg_proc_dir) + return -ENODEV; + + return 0; +} + +static int remove_proc_dir(void) +{ + remove_proc_entry(PG_PROC_DIR, proc_net); + return 0; +} + +/* Think find or remove for NN */ +static struct pktgen_dev *__pktgen_NN_threads(const char* ifname, int remove) +{ + struct pktgen_thread *t; + struct pktgen_dev *pkt_dev = NULL; + + t = pktgen_threads; + + while (t) { + pkt_dev = pktgen_find_dev(t, ifname); + if (pkt_dev) { + if(remove) { + if_lock(t); + pktgen_remove_device(t, pkt_dev); + if_unlock(t); + } + break; + } + t = t->next; + } + return pkt_dev; +} + +static struct pktgen_dev *pktgen_NN_threads(const char* ifname, int remove) +{ + struct pktgen_dev *pkt_dev = NULL; + thread_lock(); + pkt_dev = __pktgen_NN_threads(ifname, remove); + thread_unlock(); + return pkt_dev; +} + +static int pktgen_device_event(struct notifier_block *unused, unsigned long event, void *ptr) +{ + struct net_device *dev = (struct net_device *)(ptr); + + /* It is OK that we do not hold the group lock right now, + * as we run under the RTNL lock. + */ + + switch (event) { + case NETDEV_CHANGEADDR: + case NETDEV_GOING_DOWN: + case NETDEV_DOWN: + case NETDEV_UP: + /* Ignore for now */ + break; + + case NETDEV_UNREGISTER: + pktgen_NN_threads(dev->name, REMOVE); + break; + }; + + return NOTIFY_DONE; +} + +/* Associate pktgen_dev with a device. */ + +static struct net_device* pktgen_setup_dev(struct pktgen_dev *pkt_dev) { + struct net_device *odev; + + /* Clean old setups */ + + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + odev = dev_get_by_name(pkt_dev->ifname); + + if (!odev) { + printk("pktgen: no such netdevice: \"%s\"\n", pkt_dev->ifname); + goto out; + } + if (odev->type != ARPHRD_ETHER) { + printk("pktgen: not an ethernet device: \"%s\"\n", pkt_dev->ifname); + goto out_put; + } + if (!netif_running(odev)) { + printk("pktgen: device is down: \"%s\"\n", pkt_dev->ifname); + goto out_put; + } + pkt_dev->odev = odev; + + return pkt_dev->odev; + +out_put: + dev_put(odev); +out: + return NULL; + +} + +/* Read pkt_dev from the interface and set up internal pktgen_dev + * structure to have the right information to create/send packets + */ +static void pktgen_setup_inject(struct pktgen_dev *pkt_dev) +{ + /* Try once more, just in case it works now. */ + if (!pkt_dev->odev) + pktgen_setup_dev(pkt_dev); + + if (!pkt_dev->odev) { + printk("pktgen: ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + sprintf(pkt_dev->result, "ERROR: pkt_dev->odev == NULL in setup_inject.\n"); + return; + } + + /* Default to the interface's mac if not explicitly set. */ + + if ((pkt_dev->src_mac[0] == 0) && + (pkt_dev->src_mac[1] == 0) && + (pkt_dev->src_mac[2] == 0) && + (pkt_dev->src_mac[3] == 0) && + (pkt_dev->src_mac[4] == 0) && + (pkt_dev->src_mac[5] == 0)) { + + memcpy(&(pkt_dev->hh[6]), pkt_dev->odev->dev_addr, 6); + } + /* Set up Dest MAC */ + memcpy(&(pkt_dev->hh[0]), pkt_dev->dst_mac, 6); + + /* Set up pkt size */ + pkt_dev->cur_pkt_size = pkt_dev->min_pkt_size; + + if(pkt_dev->flags & F_IPV6) { + /* + * Skip this automatic address setting until locks or functions + * gets exported + */ + +#ifdef NOTNOW + int i, set = 0, err=1; + struct inet6_dev *idev; + + for(i=0; i< IN6_ADDR_HSIZE; i++) + if(pkt_dev->cur_in6_saddr.s6_addr[i]) { + set = 1; + break; + } + + if(!set) { + + /* + * Use linklevel address if unconfigured. 
+ * + * use ipv6_get_lladdr if/when it's get exported + */ + + + read_lock(&addrconf_lock); + if ((idev = __in6_dev_get(pkt_dev->odev)) != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &ifp->addr); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + if(err) printk("pktgen: ERROR: IPv6 link address not availble.\n"); + } +#endif + } + else { + pkt_dev->saddr_min = 0; + pkt_dev->saddr_max = 0; + if (strlen(pkt_dev->src_min) == 0) { + + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get(pkt_dev->odev); + if (in_dev) { + if (in_dev->ifa_list) { + pkt_dev->saddr_min = in_dev->ifa_list->ifa_address; + pkt_dev->saddr_max = pkt_dev->saddr_min; + } + __in_dev_put(in_dev); + } + rcu_read_unlock(); + } + else { + pkt_dev->saddr_min = in_aton(pkt_dev->src_min); + pkt_dev->saddr_max = in_aton(pkt_dev->src_max); + } + + pkt_dev->daddr_min = in_aton(pkt_dev->dst_min); + pkt_dev->daddr_max = in_aton(pkt_dev->dst_max); + } + /* Initialize current values. */ + pkt_dev->cur_dst_mac_offset = 0; + pkt_dev->cur_src_mac_offset = 0; + pkt_dev->cur_saddr = pkt_dev->saddr_min; + pkt_dev->cur_daddr = pkt_dev->daddr_min; + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + pkt_dev->nflows = 0; +} + +static void spin(struct pktgen_dev *pkt_dev, __u64 spin_until_us) +{ + __u64 start; + __u64 now; + + start = now = getCurUs(); + printk(KERN_INFO "sleeping for %d\n", (int)(spin_until_us - now)); + while (now < spin_until_us) { + /* TODO: optimise sleeping behavior */ + if (spin_until_us - now > (1000000/HZ)+1) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(1); + } else if (spin_until_us - now > 100) { + do_softirq(); + if (!pkt_dev->running) + return; + if (need_resched()) + schedule(); + } + + now = getCurUs(); + } + + pkt_dev->idle_acc += now - start; +} + + +/* Increment/randomize headers according to flags and current values + * for IP src/dest, UDP src/dst port, MAC-Addr src/dst + */ +static void mod_cur_headers(struct pktgen_dev *pkt_dev) { + __u32 imn; + __u32 imx; + int flow = 0; + + if(pkt_dev->cflows) { + flow = pktgen_random() % pkt_dev->cflows; + + if (pkt_dev->flows[flow].count > pkt_dev->lflow) + pkt_dev->flows[flow].count = 0; + } + + + /* Deal with source MAC */ + if (pkt_dev->src_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACSRC_RND) + mc = pktgen_random() % (pkt_dev->src_mac_count); + else { + mc = pkt_dev->cur_src_mac_offset++; + if (pkt_dev->cur_src_mac_offset > pkt_dev->src_mac_count) + pkt_dev->cur_src_mac_offset = 0; + } + + tmp = pkt_dev->src_mac[5] + (mc & 0xFF); + pkt_dev->hh[11] = tmp; + tmp = (pkt_dev->src_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[10] = tmp; + tmp = (pkt_dev->src_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[9] = tmp; + tmp = (pkt_dev->src_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[8] = tmp; + tmp = (pkt_dev->src_mac[1] + (tmp >> 8)); + pkt_dev->hh[7] = tmp; + } + + /* Deal with Destination MAC */ + if (pkt_dev->dst_mac_count > 1) { + __u32 mc; + __u32 tmp; + + if (pkt_dev->flags & F_MACDST_RND) + mc = pktgen_random() % (pkt_dev->dst_mac_count); + + else { + mc = pkt_dev->cur_dst_mac_offset++; + if (pkt_dev->cur_dst_mac_offset > pkt_dev->dst_mac_count) { + pkt_dev->cur_dst_mac_offset = 0; + } + } + + tmp = 
pkt_dev->dst_mac[5] + (mc & 0xFF); + pkt_dev->hh[5] = tmp; + tmp = (pkt_dev->dst_mac[4] + ((mc >> 8) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[4] = tmp; + tmp = (pkt_dev->dst_mac[3] + ((mc >> 16) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[3] = tmp; + tmp = (pkt_dev->dst_mac[2] + ((mc >> 24) & 0xFF) + (tmp >> 8)); + pkt_dev->hh[2] = tmp; + tmp = (pkt_dev->dst_mac[1] + (tmp >> 8)); + pkt_dev->hh[1] = tmp; + } + + if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { + if (pkt_dev->flags & F_UDPSRC_RND) + pkt_dev->cur_udp_src = ((pktgen_random() % (pkt_dev->udp_src_max - pkt_dev->udp_src_min)) + pkt_dev->udp_src_min); + + else { + pkt_dev->cur_udp_src++; + if (pkt_dev->cur_udp_src >= pkt_dev->udp_src_max) + pkt_dev->cur_udp_src = pkt_dev->udp_src_min; + } + } + + if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { + if (pkt_dev->flags & F_UDPDST_RND) { + pkt_dev->cur_udp_dst = ((pktgen_random() % (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min)) + pkt_dev->udp_dst_min); + } + else { + pkt_dev->cur_udp_dst++; + if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) + pkt_dev->cur_udp_dst = pkt_dev->udp_dst_min; + } + } + + if (!(pkt_dev->flags & F_IPV6)) { + + if ((imn = ntohl(pkt_dev->saddr_min)) < (imx = ntohl(pkt_dev->saddr_max))) { + __u32 t; + if (pkt_dev->flags & F_IPSRC_RND) + t = ((pktgen_random() % (imx - imn)) + imn); + else { + t = ntohl(pkt_dev->cur_saddr); + t++; + if (t > imx) { + t = imn; + } + } + pkt_dev->cur_saddr = htonl(t); + } + + if (pkt_dev->cflows && pkt_dev->flows[flow].count != 0) { + pkt_dev->cur_daddr = pkt_dev->flows[flow].cur_daddr; + } else { + + if ((imn = ntohl(pkt_dev->daddr_min)) < (imx = ntohl(pkt_dev->daddr_max))) { + __u32 t; + if (pkt_dev->flags & F_IPDST_RND) { + + t = ((pktgen_random() % (imx - imn)) + imn); + t = htonl(t); + + while( LOOPBACK(t) || MULTICAST(t) || BADCLASS(t) || ZERONET(t) || LOCAL_MCAST(t) ) { + t = ((pktgen_random() % (imx - imn)) + imn); + t = htonl(t); + } + pkt_dev->cur_daddr = t; + } + + else { + t = ntohl(pkt_dev->cur_daddr); + t++; + if (t > imx) { + t = imn; + } + pkt_dev->cur_daddr = htonl(t); + } + } + if(pkt_dev->cflows) { + pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; + pkt_dev->nflows++; + } + } + } + else /* IPV6 * */ + { + if(pkt_dev->min_in6_daddr.s6_addr32[0] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[1] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[2] == 0 && + pkt_dev->min_in6_daddr.s6_addr32[3] == 0); + else { + int i; + + /* Only random destinations yet */ + + for(i=0; i < 4; i++) { + pkt_dev->cur_in6_daddr.s6_addr32[i] = + ((pktgen_random() | + pkt_dev->min_in6_daddr.s6_addr32[i]) & + pkt_dev->max_in6_daddr.s6_addr32[i]); + } + } + } + + if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { + __u32 t; + if (pkt_dev->flags & F_TXSIZE_RND) { + t = ((pktgen_random() % (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size)) + + pkt_dev->min_pkt_size); + } + else { + t = pkt_dev->cur_pkt_size + 1; + if (t > pkt_dev->max_pkt_size) + t = pkt_dev->min_pkt_size; + } + pkt_dev->cur_pkt_size = t; + } + + pkt_dev->flows[flow].count++; +} + + +static struct sk_buff *fill_packet_ipv4(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + struct sk_buff *skb = NULL; + __u8 *eth; + struct udphdr *udph; + int datalen, iplen; + struct iphdr *iph; + struct pktgen_hdr *pgh = NULL; + + skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC); + if (!skb) { + sprintf(pkt_dev->result, "No memory"); + return NULL; + } + + skb_reserve(skb, 16); + + /* Reserve for ethernet and IP header */ + eth = (__u8 *) skb_push(skb, 14); + iph = (struct iphdr 
*)skb_put(skb, sizeof(struct iphdr)); + udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr)); + + /* Update any of the values, used when we're incrementing various + * fields. + */ + mod_cur_headers(pkt_dev); + + memcpy(eth, pkt_dev->hh, 12); + *(u16*)ð[12] = __constant_htons(ETH_P_IP); + + datalen = pkt_dev->cur_pkt_size - 14 - 20 - 8; /* Eth + IPh + UDPh */ + if (datalen < sizeof(struct pktgen_hdr)) + datalen = sizeof(struct pktgen_hdr); + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + 8); /* DATA + udphdr */ + udph->check = 0; /* No checksum */ + + iph->ihl = 5; + iph->version = 4; + iph->ttl = 32; + iph->tos = 0; + iph->protocol = IPPROTO_UDP; /* UDP */ + iph->saddr = pkt_dev->cur_saddr; + iph->daddr = pkt_dev->cur_daddr; + iph->frag_off = 0; + iplen = 20 + 8 + datalen; + iph->tot_len = htons(iplen); + iph->check = 0; + iph->check = ip_fast_csum((void *) iph, iph->ihl); + skb->protocol = __constant_htons(ETH_P_IP); + skb->mac.raw = ((u8 *)iph) - 14; + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + if (pkt_dev->nfrags <= 0) + pgh = (struct pktgen_hdr *)skb_put(skb, datalen); + else { + int frags = pkt_dev->nfrags; + int i; + + pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8); + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + if (datalen > frags*PAGE_SIZE) { + skb_put(skb, datalen-frags*PAGE_SIZE); + datalen = frags*PAGE_SIZE; + } + + i = 0; + while (datalen > 0) { + struct page *page = alloc_pages(GFP_KERNEL, 0); + skb_shinfo(skb)->frags[i].page = page; + skb_shinfo(skb)->frags[i].page_offset = 0; + skb_shinfo(skb)->frags[i].size = + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + datalen -= skb_shinfo(skb)->frags[i].size; + skb->len += skb_shinfo(skb)->frags[i].size; + skb->data_len += skb_shinfo(skb)->frags[i].size; + i++; + skb_shinfo(skb)->nr_frags = i; + } + + while (i < frags) { + int rem; + + if (i == 0) + break; + + rem = skb_shinfo(skb)->frags[i - 1].size / 2; + if (rem == 0) + break; + + skb_shinfo(skb)->frags[i - 1].size -= rem; + + skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1]; + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page; + skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size; + skb_shinfo(skb)->frags[i].size = rem; + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, convert them to network byte order */ + + if (pgh) { + struct timeval timestamp; + + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } + pkt_dev->seq_num++; + + return skb; +} + +/* + * scan_ip6, fmt_ip taken from dietlibc-0.21 + * Author Felix von Leitner + * + * Slightly modified for kernel. 
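+ *
+ * A minimal usage sketch (illustrative only; the 16-byte binary buffer
+ * and the text buffer are the caller's responsibility, as in the proc
+ * handlers above):
+ *
+ *	char ip[16], str[64];
+ *	scan_ip6("fec0::1", ip);   parses the text form into ip[]
+ *	fmt_ip6(str, ip);          formats it back, compressing zero runs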
+ * Should be candidate for net/ipv4/utils.c
+ * --ro
+ */
+
+static unsigned int scan_ip6(const char *s,char ip[16])
+{
+	unsigned int i;
+	unsigned int len=0;
+	unsigned long u;
+	char suffix[16];
+	unsigned int prefixlen=0;
+	unsigned int suffixlen=0;
+	__u32 tmp;
+
+	for (i=0; i<16; i++) ip[i]=0;
+
+	for (;;) {
+		if (*s == ':') {
+			len++;
+			if (s[1] == ':') {	/* Found "::", skip to part 2 */
+				s+=2;
+				len++;
+				break;
+			}
+			s++;
+		}
+		{
+			char *tmp;
+			u=simple_strtoul(s,&tmp,16);
+			i=tmp-s;
+		}
+
+		if (!i) return 0;
+		if (prefixlen==12 && s[i]=='.') {
+
+			/* the last 4 bytes may be written as IPv4 address */
+
+			tmp = in_aton(s);
+			memcpy((struct in_addr*)(ip+12), &tmp, sizeof(tmp));
+			return i+len;
+		}
+		ip[prefixlen++] = (u >> 8);
+		ip[prefixlen++] = (u & 255);
+		s += i; len += i;
+		if (prefixlen==16)
+			return len;
+	}
+
+/* part 2, after "::" */
+	for (;;) {
+		if (*s == ':') {
+			if (suffixlen==0)
+				break;
+			s++;
+			len++;
+		} else if (suffixlen!=0)
+			break;
+		{
+			char *tmp;
+			u=simple_strtol(s,&tmp,16);
+			i=tmp-s;
+		}
+		if (!i) {
+			if (*s) len--;
+			break;
+		}
+		if (suffixlen+prefixlen<=12 && s[i]=='.') {
+			tmp = in_aton(s);
+			memcpy((struct in_addr*)(suffix+suffixlen), &tmp, sizeof(tmp));
+			suffixlen+=4;
+			len+=strlen(s);
+			break;
+		}
+		suffix[suffixlen++] = (u >> 8);
+		suffix[suffixlen++] = (u & 255);
+		s += i; len += i;
+		if (prefixlen+suffixlen==16)
+			break;
+	}
+	for (i=0; i<suffixlen; i++)
+		ip[16-suffixlen+i] = suffix[i];
+	return len;
+}
+
+static char tohex(char hexdigit) {
+	return hexdigit>9?hexdigit+'a'-10:hexdigit+'0';
+}
+
+static int fmt_xlong(char* s,unsigned int i) {
+	char* bak=s;
+	*s=tohex((i>>12)&0xf); if (s!=bak || *s!='0') ++s;
+	*s=tohex((i>>8)&0xf); if (s!=bak || *s!='0') ++s;
+	*s=tohex((i>>4)&0xf); if (s!=bak || *s!='0') ++s;
+	*s=tohex(i&0xf);
+	return s-bak+1;
+}
+
+static unsigned int fmt_ip6(char *s,const char ip[16]) {
+	unsigned int len;
+	unsigned int i;
+	unsigned int temp;
+	unsigned int compressing;
+	int j;
+
+	len = 0; compressing = 0;
+	for (j=0; j<16; j+=2) {
+
+#ifdef V4MAPPEDPREFIX
+		if (j==12 && !memcmp(ip,V4mappedprefix,12)) {
+			inet_ntoa_r(*(struct in_addr*)(ip+12),s);
+			temp=strlen(s);
+			return len+temp;
+		}
+#endif
+		temp = ((unsigned long) (unsigned char) ip[j] << 8) +
+			(unsigned long) (unsigned char) ip[j+1];
+		if (temp == 0) {
+			if (!compressing) {
+				compressing=1;
+				if (j==0) {
+					*s++=':'; ++len;
+				}
+			}
+		} else {
+			if (compressing) {
+				compressing=0;
+				*s++=':'; ++len;
+			}
+			i = fmt_xlong(s,temp); len += i; s += i;
+			if (j<14) {
+				*s++ = ':';
+				++len;
+			}
+		}
+	}
+	if (compressing) {
+		*s++=':'; ++len;
+	}
+	*s=0;
+	return len;
+}
+
+static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
+					struct pktgen_dev *pkt_dev)
+{
+	struct sk_buff *skb = NULL;
+	__u8 *eth;
+	struct udphdr *udph;
+	int datalen;
+	struct ipv6hdr *iph;
+	struct pktgen_hdr *pgh = NULL;
+
+	skb = alloc_skb(pkt_dev->cur_pkt_size + 64 + 16, GFP_ATOMIC);
+	if (!skb) {
+		sprintf(pkt_dev->result, "No memory");
+		return NULL;
+	}
+
+	skb_reserve(skb, 16);
+
+	/* Reserve for ethernet and IP header */
+	eth = (__u8 *) skb_push(skb, 14);
+	iph = (struct ipv6hdr *)skb_put(skb, sizeof(struct ipv6hdr));
+	udph = (struct udphdr *)skb_put(skb, sizeof(struct udphdr));
+
+
+	/* Update any of the values, used when we're incrementing various
+	 * fields.
+ */ + mod_cur_headers(pkt_dev); + + + memcpy(eth, pkt_dev->hh, 12); + *(u16*)ð[12] = __constant_htons(ETH_P_IPV6); + + + datalen = pkt_dev->cur_pkt_size-14- + sizeof(struct ipv6hdr)-sizeof(struct udphdr); /* Eth + IPh + UDPh */ + + if (datalen < sizeof(struct pktgen_hdr)) { + datalen = sizeof(struct pktgen_hdr); + if (net_ratelimit()) + printk(KERN_INFO "pktgen: increased datalen to %d\n", datalen); + } + + udph->source = htons(pkt_dev->cur_udp_src); + udph->dest = htons(pkt_dev->cur_udp_dst); + udph->len = htons(datalen + sizeof(struct udphdr)); + udph->check = 0; /* No checksum */ + + *(u32*)iph = __constant_htonl(0x60000000); /* Version + flow */ + + iph->hop_limit = 32; + + iph->payload_len = htons(sizeof(struct udphdr) + datalen); + iph->nexthdr = IPPROTO_UDP; + + ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr); + ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr); + + skb->mac.raw = ((u8 *)iph) - 14; + skb->protocol = __constant_htons(ETH_P_IPV6); + skb->dev = odev; + skb->pkt_type = PACKET_HOST; + + if (pkt_dev->nfrags <= 0) + pgh = (struct pktgen_hdr *)skb_put(skb, datalen); + else { + int frags = pkt_dev->nfrags; + int i; + + pgh = (struct pktgen_hdr*)(((char*)(udph)) + 8); + + if (frags > MAX_SKB_FRAGS) + frags = MAX_SKB_FRAGS; + if (datalen > frags*PAGE_SIZE) { + skb_put(skb, datalen-frags*PAGE_SIZE); + datalen = frags*PAGE_SIZE; + } + + i = 0; + while (datalen > 0) { + struct page *page = alloc_pages(GFP_KERNEL, 0); + skb_shinfo(skb)->frags[i].page = page; + skb_shinfo(skb)->frags[i].page_offset = 0; + skb_shinfo(skb)->frags[i].size = + (datalen < PAGE_SIZE ? datalen : PAGE_SIZE); + datalen -= skb_shinfo(skb)->frags[i].size; + skb->len += skb_shinfo(skb)->frags[i].size; + skb->data_len += skb_shinfo(skb)->frags[i].size; + i++; + skb_shinfo(skb)->nr_frags = i; + } + + while (i < frags) { + int rem; + + if (i == 0) + break; + + rem = skb_shinfo(skb)->frags[i - 1].size / 2; + if (rem == 0) + break; + + skb_shinfo(skb)->frags[i - 1].size -= rem; + + skb_shinfo(skb)->frags[i] = skb_shinfo(skb)->frags[i - 1]; + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->frags[i].page = skb_shinfo(skb)->frags[i - 1].page; + skb_shinfo(skb)->frags[i].page_offset += skb_shinfo(skb)->frags[i - 1].size; + skb_shinfo(skb)->frags[i].size = rem; + i++; + skb_shinfo(skb)->nr_frags = i; + } + } + + /* Stamp the time, and sequence number, convert them to network byte order */ + /* should we update cloned packets too ? */ + if (pgh) { + struct timeval timestamp; + + pgh->pgh_magic = htonl(PKTGEN_MAGIC); + pgh->seq_num = htonl(pkt_dev->seq_num); + + do_gettimeofday(×tamp); + pgh->tv_sec = htonl(timestamp.tv_sec); + pgh->tv_usec = htonl(timestamp.tv_usec); + } + pkt_dev->seq_num++; + + return skb; +} + +static inline struct sk_buff *fill_packet(struct net_device *odev, + struct pktgen_dev *pkt_dev) +{ + if(pkt_dev->flags & F_IPV6) + return fill_packet_ipv6(odev, pkt_dev); + else + return fill_packet_ipv4(odev, pkt_dev); +} + +static void pktgen_clear_counters(struct pktgen_dev *pkt_dev) +{ + pkt_dev->seq_num = 1; + pkt_dev->idle_acc = 0; + pkt_dev->sofar = 0; + pkt_dev->tx_bytes = 0; + pkt_dev->errors = 0; +} + +/* Set up structure for sending pkts, clear counters */ + +static void pktgen_run(struct pktgen_thread *t) +{ + struct pktgen_dev *pkt_dev = NULL; + int started = 0; + + PG_DEBUG(printk("pktgen: entering pktgen_run. %p\n", t)); + + if_lock(t); + for (pkt_dev = t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { + + /* + * setup odev and create initial packet. 
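+ *
+ * This runs in pktgen_thread_worker() context once T_RUN has been
+ * set on the thread (see pktgen_run_all_threads() below); every
+ * device with a usable odev gets its counters cleared and an
+ * immediate next_tx time.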
+ */ + pktgen_setup_inject(pkt_dev); + + if(pkt_dev->odev) { + pktgen_clear_counters(pkt_dev); + pkt_dev->running = 1; /* Cranke yeself! */ + pkt_dev->skb = NULL; + pkt_dev->started_at = getCurUs(); + pkt_dev->next_tx_us = getCurUs(); /* Transmit immediately */ + pkt_dev->next_tx_ns = 0; + + strcpy(pkt_dev->result, "Starting"); + started++; + } + else + strcpy(pkt_dev->result, "Error starting"); + } + if_unlock(t); + if(started) t->control &= ~(T_STOP); +} + +static void pktgen_stop_all_threads_ifs(void) +{ + struct pktgen_thread *t = pktgen_threads; + + PG_DEBUG(printk("pktgen: entering pktgen_stop_all_threads.\n")); + + thread_lock(); + while(t) { + pktgen_stop(t); + t = t->next; + } + thread_unlock(); +} + +static int thread_is_running(struct pktgen_thread *t ) +{ + struct pktgen_dev *next; + int res = 0; + + for(next=t->if_list; next; next=next->next) { + if(next->running) { + res = 1; + break; + } + } + return res; +} + +static int pktgen_wait_thread_run(struct pktgen_thread *t ) +{ + if_lock(t); + + while(thread_is_running(t)) { + + if_unlock(t); + + msleep_interruptible(100); + + if (signal_pending(current)) + goto signal; + if_lock(t); + } + if_unlock(t); + return 1; + signal: + return 0; +} + +static int pktgen_wait_all_threads_run(void) +{ + struct pktgen_thread *t = pktgen_threads; + int sig = 1; + + while (t) { + sig = pktgen_wait_thread_run(t); + if( sig == 0 ) break; + thread_lock(); + t=t->next; + thread_unlock(); + } + if(sig == 0) { + thread_lock(); + while (t) { + t->control |= (T_STOP); + t=t->next; + } + thread_unlock(); + } + return sig; +} + +static void pktgen_run_all_threads(void) +{ + struct pktgen_thread *t = pktgen_threads; + + PG_DEBUG(printk("pktgen: entering pktgen_run_all_threads.\n")); + + thread_lock(); + + while(t) { + t->control |= (T_RUN); + t = t->next; + } + thread_unlock(); + + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(HZ/8); /* Propagate thread->control */ + + pktgen_wait_all_threads_run(); +} + + +static void show_results(struct pktgen_dev *pkt_dev, int nr_frags) +{ + __u64 total_us, bps, mbps, pps, idle; + char *p = pkt_dev->result; + + total_us = pkt_dev->stopped_at - pkt_dev->started_at; + + idle = pkt_dev->idle_acc; + + p += sprintf(p, "OK: %llu(c%llu+d%llu) usec, %llu (%dbyte,%dfrags)\n", + (unsigned long long) total_us, + (unsigned long long)(total_us - idle), + (unsigned long long) idle, + (unsigned long long) pkt_dev->sofar, + pkt_dev->cur_pkt_size, nr_frags); + + pps = pkt_dev->sofar * USEC_PER_SEC; + + while ((total_us >> 32) != 0) { + pps >>= 1; + total_us >>= 1; + } + + do_div(pps, total_us); + + bps = pps * 8 * pkt_dev->cur_pkt_size; + + mbps = bps; + do_div(mbps, 1000000); + p += sprintf(p, " %llupps %lluMb/sec (%llubps) errors: %llu", + (unsigned long long) pps, + (unsigned long long) mbps, + (unsigned long long) bps, + (unsigned long long) pkt_dev->errors); +} + + +/* Set stopped-at timer, remove from running list, do counters & statistics */ + +static int pktgen_stop_device(struct pktgen_dev *pkt_dev) +{ + + if (!pkt_dev->running) { + printk("pktgen: interface: %s is already stopped\n", pkt_dev->ifname); + return -EINVAL; + } + + pkt_dev->stopped_at = getCurUs(); + pkt_dev->running = 0; + + show_results(pkt_dev, skb_shinfo(pkt_dev->skb)->nr_frags); + + if (pkt_dev->skb) + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = NULL; + + return 0; +} + +static struct pktgen_dev *next_to_run(struct pktgen_thread *t ) +{ + struct pktgen_dev *next, *best = NULL; + + if_lock(t); + + for(next=t->if_list; next ; next=next->next) { + 
if(!next->running) continue; + if(best == NULL) best=next; + else if ( next->next_tx_us < best->next_tx_us) + best = next; + } + if_unlock(t); + return best; +} + +static void pktgen_stop(struct pktgen_thread *t) { + struct pktgen_dev *next = NULL; + + PG_DEBUG(printk("pktgen: entering pktgen_stop.\n")); + + if_lock(t); + + for(next=t->if_list; next; next=next->next) + pktgen_stop_device(next); + + if_unlock(t); +} + +static void pktgen_rem_all_ifs(struct pktgen_thread *t) +{ + struct pktgen_dev *cur, *next = NULL; + + /* Remove all devices, free mem */ + + if_lock(t); + + for(cur=t->if_list; cur; cur=next) { + next = cur->next; + pktgen_remove_device(t, cur); + } + + if_unlock(t); +} + +static void pktgen_rem_thread(struct pktgen_thread *t) +{ + /* Remove from the thread list */ + + struct pktgen_thread *tmp = pktgen_threads; + + if (strlen(t->fname)) + remove_proc_entry(t->fname, NULL); + + thread_lock(); + + if (tmp == t) + pktgen_threads = tmp->next; + else { + while (tmp) { + if (tmp->next == t) { + tmp->next = t->next; + t->next = NULL; + break; + } + tmp = tmp->next; + } + } + thread_unlock(); +} + +static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) +{ + struct net_device *odev = NULL; + __u64 idle_start = 0; + int ret; + + odev = pkt_dev->odev; + + if (pkt_dev->delay_us || pkt_dev->delay_ns) { + u64 now; + + now = getCurUs(); + if (now < pkt_dev->next_tx_us) + spin(pkt_dev, pkt_dev->next_tx_us); + + /* This is max DELAY, this has special meaning of + * "never transmit" + */ + if (pkt_dev->delay_us == 0x7FFFFFFF) { + pkt_dev->next_tx_us = getCurUs() + pkt_dev->delay_us; + pkt_dev->next_tx_ns = pkt_dev->delay_ns; + goto out; + } + } + + if (netif_queue_stopped(odev) || need_resched()) { + idle_start = getCurUs(); + + if (!netif_running(odev)) { + pktgen_stop_device(pkt_dev); + goto out; + } + if (need_resched()) + schedule(); + + pkt_dev->idle_acc += getCurUs() - idle_start; + + if (netif_queue_stopped(odev)) { + pkt_dev->next_tx_us = getCurUs(); /* TODO */ + pkt_dev->next_tx_ns = 0; + goto out; /* Try the next interface */ + } + } + + if (pkt_dev->last_ok || !pkt_dev->skb) { + if ((++pkt_dev->clone_count >= pkt_dev->clone_skb ) || (!pkt_dev->skb)) { + /* build a new pkt */ + if (pkt_dev->skb) + kfree_skb(pkt_dev->skb); + + pkt_dev->skb = fill_packet(odev, pkt_dev); + if (pkt_dev->skb == NULL) { + printk("pktgen: ERROR: couldn't allocate skb in fill_packet.\n"); + schedule(); + pkt_dev->clone_count--; /* back out increment, OOM */ + goto out; + } + pkt_dev->allocated_skbs++; + pkt_dev->clone_count = 0; /* reset counter */ + } + } + + spin_lock_bh(&odev->xmit_lock); + if (!netif_queue_stopped(odev)) { + + atomic_inc(&(pkt_dev->skb->users)); +retry_now: + ret = odev->hard_start_xmit(pkt_dev->skb, odev); + if (likely(ret == NETDEV_TX_OK)) { + pkt_dev->last_ok = 1; + pkt_dev->sofar++; + pkt_dev->seq_num++; + pkt_dev->tx_bytes += pkt_dev->cur_pkt_size; + + } else if (ret == NETDEV_TX_LOCKED + && (odev->features & NETIF_F_LLTX)) { + cpu_relax(); + goto retry_now; + } else { /* Retry it next time */ + + atomic_dec(&(pkt_dev->skb->users)); + + if (debug && net_ratelimit()) + printk(KERN_INFO "pktgen: Hard xmit error\n"); + + pkt_dev->errors++; + pkt_dev->last_ok = 0; + } + + pkt_dev->next_tx_us = getCurUs(); + pkt_dev->next_tx_ns = 0; + + pkt_dev->next_tx_us += pkt_dev->delay_us; + pkt_dev->next_tx_ns += pkt_dev->delay_ns; + + if (pkt_dev->next_tx_ns > 1000) { + pkt_dev->next_tx_us++; + pkt_dev->next_tx_ns -= 1000; + } + } + + else { /* Retry it next time */ + pkt_dev->last_ok = 
0; + pkt_dev->next_tx_us = getCurUs(); /* TODO */ + pkt_dev->next_tx_ns = 0; + } + + spin_unlock_bh(&odev->xmit_lock); + + /* If pkt_dev->count is zero, then run forever */ + if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { + if (atomic_read(&(pkt_dev->skb->users)) != 1) { + idle_start = getCurUs(); + while (atomic_read(&(pkt_dev->skb->users)) != 1) { + if (signal_pending(current)) { + break; + } + schedule(); + } + pkt_dev->idle_acc += getCurUs() - idle_start; + } + + /* Done with this */ + pktgen_stop_device(pkt_dev); + } + out:; + } + +/* + * Main loop of the thread goes here + */ + +static void pktgen_thread_worker(struct pktgen_thread *t) +{ + DEFINE_WAIT(wait); + struct pktgen_dev *pkt_dev = NULL; + int cpu = t->cpu; + sigset_t tmpsig; + u32 max_before_softirq; + u32 tx_since_softirq = 0; + + daemonize("pktgen/%d", cpu); + + /* Block all signals except SIGKILL, SIGSTOP and SIGTERM */ + + spin_lock_irq(¤t->sighand->siglock); + tmpsig = current->blocked; + siginitsetinv(¤t->blocked, + sigmask(SIGKILL) | + sigmask(SIGSTOP)| + sigmask(SIGTERM)); + + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + /* Migrate to the right CPU */ + set_cpus_allowed(current, cpumask_of_cpu(cpu)); + if (smp_processor_id() != cpu) + BUG(); + + init_waitqueue_head(&t->queue); + + t->control &= ~(T_TERMINATE); + t->control &= ~(T_RUN); + t->control &= ~(T_STOP); + t->control &= ~(T_REMDEV); + + t->pid = current->pid; + + PG_DEBUG(printk("pktgen: starting pktgen/%d: pid=%d\n", cpu, current->pid)); + + max_before_softirq = t->max_before_softirq; + + __set_current_state(TASK_INTERRUPTIBLE); + mb(); + + while (1) { + + __set_current_state(TASK_RUNNING); + + /* + * Get next dev to xmit -- if any. + */ + + pkt_dev = next_to_run(t); + + if (pkt_dev) { + + pktgen_xmit(pkt_dev); + + /* + * We like to stay RUNNING but must also give + * others fair share. + */ + + tx_since_softirq += pkt_dev->last_ok; + + if (tx_since_softirq > max_before_softirq) { + if (local_softirq_pending()) + do_softirq(); + tx_since_softirq = 0; + } + } else { + prepare_to_wait(&(t->queue), &wait, TASK_INTERRUPTIBLE); + schedule_timeout(HZ/10); + finish_wait(&(t->queue), &wait); + } + + /* + * Back from sleep, either due to the timeout or signal. + * We check if we have any "posted" work for us. + */ + + if (t->control & T_TERMINATE || signal_pending(current)) + /* we received a request to terminate ourself */ + break; + + + if(t->control & T_STOP) { + pktgen_stop(t); + t->control &= ~(T_STOP); + } + + if(t->control & T_RUN) { + pktgen_run(t); + t->control &= ~(T_RUN); + } + + if(t->control & T_REMDEV) { + pktgen_rem_all_ifs(t); + t->control &= ~(T_REMDEV); + } + + if (need_resched()) + schedule(); + } + + PG_DEBUG(printk("pktgen: %s stopping all device\n", t->name)); + pktgen_stop(t); + + PG_DEBUG(printk("pktgen: %s removing all device\n", t->name)); + pktgen_rem_all_ifs(t); + + PG_DEBUG(printk("pktgen: %s removing thread.\n", t->name)); + pktgen_rem_thread(t); +} + +static struct pktgen_dev *pktgen_find_dev(struct pktgen_thread *t, const char* ifname) +{ + struct pktgen_dev *pkt_dev = NULL; + if_lock(t); + + for(pkt_dev=t->if_list; pkt_dev; pkt_dev = pkt_dev->next ) { + if (strcmp(pkt_dev->ifname, ifname) == 0) { + break; + } + } + + if_unlock(t); + PG_DEBUG(printk("pktgen: find_dev(%s) returning %p\n", ifname,pkt_dev)); + return pkt_dev; +} + +/* + * Adds a dev at front of if_list. 
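+ *
+ * Called from pktgen_add_device() with the thread lock held; returns
+ * -EBUSY if the device is already bound to a pktgen thread.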
+ */ + +static int add_dev_to_thread(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) +{ + int rv = 0; + + if_lock(t); + + if (pkt_dev->pg_thread) { + printk("pktgen: ERROR: already assigned to a thread.\n"); + rv = -EBUSY; + goto out; + } + pkt_dev->next =t->if_list; t->if_list=pkt_dev; + pkt_dev->pg_thread = t; + pkt_dev->running = 0; + + out: + if_unlock(t); + return rv; +} + +/* Called under thread lock */ + +static int pktgen_add_device(struct pktgen_thread *t, const char* ifname) +{ + struct pktgen_dev *pkt_dev; + + /* We don't allow a device to be on several threads */ + + if( (pkt_dev = __pktgen_NN_threads(ifname, FIND)) == NULL) { + + pkt_dev = kmalloc(sizeof(struct pktgen_dev), GFP_KERNEL); + if (!pkt_dev) + return -ENOMEM; + + memset(pkt_dev, 0, sizeof(struct pktgen_dev)); + + pkt_dev->flows = vmalloc(MAX_CFLOWS*sizeof(struct flow_state)); + if (pkt_dev->flows == NULL) { + kfree(pkt_dev); + return -ENOMEM; + } + memset(pkt_dev->flows, 0, MAX_CFLOWS*sizeof(struct flow_state)); + + pkt_dev->min_pkt_size = ETH_ZLEN; + pkt_dev->max_pkt_size = ETH_ZLEN; + pkt_dev->nfrags = 0; + pkt_dev->clone_skb = pg_clone_skb_d; + pkt_dev->delay_us = pg_delay_d / 1000; + pkt_dev->delay_ns = pg_delay_d % 1000; + pkt_dev->count = pg_count_d; + pkt_dev->sofar = 0; + pkt_dev->udp_src_min = 9; /* sink port */ + pkt_dev->udp_src_max = 9; + pkt_dev->udp_dst_min = 9; + pkt_dev->udp_dst_max = 9; + + strncpy(pkt_dev->ifname, ifname, 31); + sprintf(pkt_dev->fname, "net/%s/%s", PG_PROC_DIR, ifname); + + if (! pktgen_setup_dev(pkt_dev)) { + printk("pktgen: ERROR: pktgen_setup_dev failed.\n"); + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return -ENODEV; + } + + pkt_dev->proc_ent = create_proc_entry(pkt_dev->fname, 0600, NULL); + if (!pkt_dev->proc_ent) { + printk("pktgen: cannot create %s procfs entry.\n", pkt_dev->fname); + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return -EINVAL; + } + pkt_dev->proc_ent->read_proc = proc_if_read; + pkt_dev->proc_ent->write_proc = proc_if_write; + pkt_dev->proc_ent->data = (void*)(pkt_dev); + pkt_dev->proc_ent->owner = THIS_MODULE; + + return add_dev_to_thread(t, pkt_dev); + } + else { + printk("pktgen: ERROR: interface already used.\n"); + return -EBUSY; + } +} + +static struct pktgen_thread *pktgen_find_thread(const char* name) +{ + struct pktgen_thread *t = NULL; + + thread_lock(); + + t = pktgen_threads; + while (t) { + if (strcmp(t->name, name) == 0) + break; + + t = t->next; + } + thread_unlock(); + return t; +} + +static int pktgen_create_thread(const char* name, int cpu) +{ + struct pktgen_thread *t = NULL; + + if (strlen(name) > 31) { + printk("pktgen: ERROR: Thread name cannot be more than 31 characters.\n"); + return -EINVAL; + } + + if (pktgen_find_thread(name)) { + printk("pktgen: ERROR: thread: %s already exists\n", name); + return -EINVAL; + } + + t = (struct pktgen_thread*)(kmalloc(sizeof(struct pktgen_thread), GFP_KERNEL)); + if (!t) { + printk("pktgen: ERROR: out of memory, can't create new thread.\n"); + return -ENOMEM; + } + + memset(t, 0, sizeof(struct pktgen_thread)); + strcpy(t->name, name); + spin_lock_init(&t->if_lock); + t->cpu = cpu; + + sprintf(t->fname, "net/%s/%s", PG_PROC_DIR, t->name); + t->proc_ent = create_proc_entry(t->fname, 0600, NULL); + if (!t->proc_ent) { + printk("pktgen: cannot create %s procfs entry.\n", t->fname); + kfree(t); + return -EINVAL; + } + t->proc_ent->read_proc = proc_thread_read; + t->proc_ent->write_proc = proc_thread_write; + t->proc_ent->data = (void*)(t); + 
t->proc_ent->owner = THIS_MODULE; + + t->next = pktgen_threads; + pktgen_threads = t; + + if (kernel_thread((void *) pktgen_thread_worker, (void *) t, + CLONE_FS | CLONE_FILES | CLONE_SIGHAND) < 0) + printk("pktgen: kernel_thread() failed for cpu %d\n", t->cpu); + + return 0; +} + +/* + * Removes a device from the thread if_list. + */ +static void _rem_dev_from_if_list(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) +{ + struct pktgen_dev *i, *prev = NULL; + + i = t->if_list; + + while(i) { + if(i == pkt_dev) { + if(prev) prev->next = i->next; + else t->if_list = NULL; + break; + } + prev = i; + i=i->next; + } +} + +static int pktgen_remove_device(struct pktgen_thread *t, struct pktgen_dev *pkt_dev) +{ + + PG_DEBUG(printk("pktgen: remove_device pkt_dev=%p\n", pkt_dev)); + + if (pkt_dev->running) { + printk("pktgen:WARNING: trying to remove a running interface, stopping it now.\n"); + pktgen_stop_device(pkt_dev); + } + + /* Dis-associate from the interface */ + + if (pkt_dev->odev) { + dev_put(pkt_dev->odev); + pkt_dev->odev = NULL; + } + + /* And update the thread if_list */ + + _rem_dev_from_if_list(t, pkt_dev); + + /* Clean up proc file system */ + + if (strlen(pkt_dev->fname)) + remove_proc_entry(pkt_dev->fname, NULL); + + if (pkt_dev->flows) + vfree(pkt_dev->flows); + kfree(pkt_dev); + return 0; +} + +static int __init pg_init(void) +{ + int cpu; + printk(version); + + module_fname[0] = 0; + + create_proc_dir(); + + sprintf(module_fname, "net/%s/pgctrl", PG_PROC_DIR); + module_proc_ent = create_proc_entry(module_fname, 0600, NULL); + if (!module_proc_ent) { + printk("pktgen: ERROR: cannot create %s procfs entry.\n", module_fname); + return -EINVAL; + } + + module_proc_ent->proc_fops = &pktgen_fops; + module_proc_ent->data = NULL; + + /* Register us to receive netdevice events */ + register_netdevice_notifier(&pktgen_notifier_block); + + for (cpu = 0; cpu < NR_CPUS ; cpu++) { + char buf[30]; + + if (!cpu_online(cpu)) + continue; + + sprintf(buf, "kpktgend_%i", cpu); + pktgen_create_thread(buf, cpu); + } + return 0; +} + +static void __exit pg_cleanup(void) +{ + wait_queue_head_t queue; + init_waitqueue_head(&queue); + + /* Stop all interfaces & threads */ + + while (pktgen_threads) { + struct pktgen_thread *t = pktgen_threads; + pktgen_threads->control |= (T_TERMINATE); + + wait_event_interruptible_timeout(queue, (t != pktgen_threads), HZ); + } + + /* Un-register us from receiving netdevice events */ + unregister_netdevice_notifier(&pktgen_notifier_block); + + /* Clean up proc file system */ + + remove_proc_entry(module_fname, NULL); + + remove_proc_dir(); +} + + +module_init(pg_init); +module_exit(pg_cleanup); + +MODULE_AUTHOR("Robert Olsson + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Vitaly E. Lavrov RTA_OK arithmetics was wrong. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DECLARE_MUTEX(rtnl_sem); + +void rtnl_lock(void) +{ + rtnl_shlock(); +} + +int rtnl_lock_interruptible(void) +{ + return down_interruptible(&rtnl_sem); +} + +void rtnl_unlock(void) +{ + rtnl_shunlock(); + + netdev_run_todo(); +} + +int rtattr_parse(struct rtattr *tb[], int maxattr, struct rtattr *rta, int len) +{ + memset(tb, 0, sizeof(struct rtattr*)*maxattr); + + while (RTA_OK(rta, len)) { + unsigned flavor = rta->rta_type; + if (flavor && flavor <= maxattr) + tb[flavor-1] = rta; + rta = RTA_NEXT(rta, len); + } + return 0; +} + +struct sock *rtnl; + +struct rtnetlink_link * rtnetlink_links[NPROTO]; + +static const int rtm_min[(RTM_MAX+1-RTM_BASE)/4] = +{ + NLMSG_LENGTH(sizeof(struct ifinfomsg)), + NLMSG_LENGTH(sizeof(struct ifaddrmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct ndmsg)), + NLMSG_LENGTH(sizeof(struct rtmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcmsg)), + NLMSG_LENGTH(sizeof(struct tcamsg)) +}; + +static const int rta_max[(RTM_MAX+1-RTM_BASE)/4] = +{ + IFLA_MAX, + IFA_MAX, + RTA_MAX, + NDA_MAX, + RTA_MAX, + TCA_MAX, + TCA_MAX, + TCA_MAX, + TCAA_MAX +}; + +void __rta_fill(struct sk_buff *skb, int attrtype, int attrlen, const void *data) +{ + struct rtattr *rta; + int size = RTA_LENGTH(attrlen); + + rta = (struct rtattr*)skb_put(skb, RTA_ALIGN(size)); + rta->rta_type = attrtype; + rta->rta_len = size; + memcpy(RTA_DATA(rta), data, attrlen); +} + +size_t rtattr_strlcpy(char *dest, const struct rtattr *rta, size_t size) +{ + size_t ret = RTA_PAYLOAD(rta); + char *src = RTA_DATA(rta); + + if (ret > 0 && src[ret - 1] == '\0') + ret--; + if (size > 0) { + size_t len = (ret >= size) ? 
size - 1 : ret;
+		memset(dest, 0, size);
+		memcpy(dest, src, len);
+	}
+	return ret;
+}
+
+int rtnetlink_send(struct sk_buff *skb, u32 pid, unsigned group, int echo)
+{
+	int err = 0;
+
+	NETLINK_CB(skb).dst_groups = group;
+	if (echo)
+		atomic_inc(&skb->users);
+	netlink_broadcast(rtnl, skb, pid, group, GFP_KERNEL);
+	if (echo)
+		err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT);
+	return err;
+}
+
+int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
+{
+	struct rtattr *mx = (struct rtattr*)skb->tail;
+	int i;
+
+	RTA_PUT(skb, RTA_METRICS, 0, NULL);
+	for (i=0; i<RTAX_MAX; i++) {
+		if (metrics[i])
+			RTA_PUT(skb, i+1, sizeof(u32), metrics+i);
+	}
+	mx->rta_len = skb->tail - (u8*)mx;
+	if (mx->rta_len == RTA_LENGTH(0))
+		skb_trim(skb, (u8*)mx - skb->data);
+	return 0;
+
+rtattr_failure:
+	skb_trim(skb, (u8*)mx - skb->data);
+	return -1;
+}
+
+
+static int rtnetlink_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
+				 int type, u32 pid, u32 seq, u32 change)
+{
+	struct ifinfomsg *r;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*r));
+	if (pid) nlh->nlmsg_flags |= NLM_F_MULTI;
+	r = NLMSG_DATA(nlh);
+	r->ifi_family = AF_UNSPEC;
+	r->ifi_type = dev->type;
+	r->ifi_index = dev->ifindex;
+	r->ifi_flags = dev_get_flags(dev);
+	r->ifi_change = change;
+
+	RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name);
+
+	if (1) {
+		u32 txqlen = dev->tx_queue_len;
+		RTA_PUT(skb, IFLA_TXQLEN, sizeof(txqlen), &txqlen);
+	}
+
+	if (1) {
+		u32 weight = dev->weight;
+		RTA_PUT(skb, IFLA_WEIGHT, sizeof(weight), &weight);
+	}
+
+	if (1) {
+		struct rtnl_link_ifmap map = {
+			.mem_start = dev->mem_start,
+			.mem_end = dev->mem_end,
+			.base_addr = dev->base_addr,
+			.irq = dev->irq,
+			.dma = dev->dma,
+			.port = dev->if_port,
+		};
+		RTA_PUT(skb, IFLA_MAP, sizeof(map), &map);
+	}
+
+	if (dev->addr_len) {
+		RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr);
+		RTA_PUT(skb, IFLA_BROADCAST, dev->addr_len, dev->broadcast);
+	}
+
+	if (1) {
+		u32 mtu = dev->mtu;
+		RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu);
+	}
+
+	if (dev->ifindex != dev->iflink) {
+		u32 iflink = dev->iflink;
+		RTA_PUT(skb, IFLA_LINK, sizeof(iflink), &iflink);
+	}
+
+	if (dev->qdisc_sleeping)
+		RTA_PUT(skb, IFLA_QDISC,
+			strlen(dev->qdisc_sleeping->ops->id) + 1,
+			dev->qdisc_sleeping->ops->id);
+
+	if (dev->master) {
+		u32 master = dev->master->ifindex;
+		RTA_PUT(skb, IFLA_MASTER, sizeof(master), &master);
+	}
+
+	if (dev->get_stats) {
+		unsigned long *stats = (unsigned long*)dev->get_stats(dev);
+		if (stats) {
+			struct rtattr *a;
+			__u32 *s;
+			int i;
+			int n = sizeof(struct rtnl_link_stats)/4;
+
+			a = __RTA_PUT(skb, IFLA_STATS, n*4);
+			s = RTA_DATA(a);
+			for (i=0; i<n; i++)
+				s[i] = stats[i];
+		}
+	}
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+rtattr_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int rtnetlink_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx;
+	int s_idx = cb->args[0];
+	struct net_device *dev;
+
+	read_lock(&dev_base_lock);
+	for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) {
+		if (idx < s_idx)
+			continue;
+		if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0)
+			break;
+	}
+	read_unlock(&dev_base_lock);
+	cb->args[0] = idx;
+
+	return skb->len;
+}
+
+static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
+{
+	struct ifinfomsg *ifm = NLMSG_DATA(nlh);
+	struct rtattr **ida = arg;
+	struct net_device *dev;
+	int err, send_addr_notify = 0;
+
+	if (ifm->ifi_index >= 0)
+		dev = dev_get_by_index(ifm->ifi_index);
+	else if (ida[IFLA_IFNAME - 1]) {
+		char ifname[IFNAMSIZ];
+
+		if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1],
+				   IFNAMSIZ) >= IFNAMSIZ)
+			return -EINVAL;
+		dev = dev_get_by_name(ifname);
+	} else
+		return -EINVAL;
+
+	if (!dev)
+		return -ENODEV;
+
+	err = -EINVAL;
+
+	if (ifm->ifi_flags)
+		dev_change_flags(dev, ifm->ifi_flags);
+
+	if (ida[IFLA_MAP - 1]) {
+		struct rtnl_link_ifmap *u_map;
+		struct ifmap k_map;
+
+		if (!dev->set_config) {
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+
+		if (!netif_device_present(dev)) {
+			err = -ENODEV;
+			goto out;
+		}
+
+		if (ida[IFLA_MAP - 1]->rta_len != RTA_LENGTH(sizeof(*u_map)))
+			goto out;
+
+		u_map = RTA_DATA(ida[IFLA_MAP - 1]);
+
+		k_map.mem_start = (unsigned long) u_map->mem_start;
+		k_map.mem_end = (unsigned long) u_map->mem_end;
+		k_map.base_addr = (unsigned short) u_map->base_addr;
+		k_map.irq = (unsigned char) u_map->irq;
+		k_map.dma = (unsigned char) u_map->dma;
+		k_map.port = (unsigned char) u_map->port;
+
+		err = dev->set_config(dev, &k_map);
+
+		if (err)
+			goto out;
+	}
+
+	if (ida[IFLA_ADDRESS - 1]) {
+		if (!dev->set_mac_address) {
+			err = -EOPNOTSUPP;
+			goto out;
+		}
+		if (!netif_device_present(dev)) {
+			err = -ENODEV;
+			goto out;
+		}
+		if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len))
+			goto out;
+
+		err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1]));
+		if (err)
+			goto out;
+		send_addr_notify = 1;
+	}
+
+	if (ida[IFLA_BROADCAST - 1]) {
+		if (ida[IFLA_BROADCAST - 1]->rta_len != RTA_LENGTH(dev->addr_len))
+			goto out;
+		memcpy(dev->broadcast, RTA_DATA(ida[IFLA_BROADCAST - 1]),
+		       dev->addr_len);
+		send_addr_notify = 1;
+	}
+
+	if (ida[IFLA_MTU - 1]) {
+		if (ida[IFLA_MTU - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
+			goto out;
+		err = dev_set_mtu(dev, *((u32 *) RTA_DATA(ida[IFLA_MTU - 1])));
+
+		if (err)
+			goto out;
+
+	}
+
+	if (ida[IFLA_TXQLEN - 1]) {
+		if (ida[IFLA_TXQLEN - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
+			goto out;
+
+		dev->tx_queue_len = *((u32 *) RTA_DATA(ida[IFLA_TXQLEN - 1]));
+	}
+
+	if (ida[IFLA_WEIGHT - 1]) {
+		if (ida[IFLA_WEIGHT - 1]->rta_len != RTA_LENGTH(sizeof(u32)))
+			goto out;
+
+		dev->weight = *((u32 *) RTA_DATA(ida[IFLA_WEIGHT - 1]));
+	}
+
+	if (ifm->ifi_index >= 0 && ida[IFLA_IFNAME - 1]) {
+		char ifname[IFNAMSIZ];
+
+		if (rtattr_strlcpy(ifname, ida[IFLA_IFNAME - 1],
+				   IFNAMSIZ) >= IFNAMSIZ)
+			goto out;
+		err = dev_change_name(dev, ifname);
+		if (err)
+			goto out;
+	}
+
+	err = 0;
+
+out:
+	if (send_addr_notify)
+		call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
+
+	dev_put(dev);
+	return err;
+}
+
+static int rtnetlink_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	int idx;
+	int s_idx = cb->family;
+
+	if (s_idx == 0)
+		s_idx = 1;
+	for (idx=1; idx<NPROTO; idx++) {
+		int type = cb->nlh->nlmsg_type-RTM_BASE;
+		if (idx < s_idx || idx == PF_PACKET)
+			continue;
+		if (rtnetlink_links[idx] == NULL ||
+		    rtnetlink_links[idx][type].dumpit == NULL)
+			continue;
+		if (idx > s_idx)
+			memset(&cb->args[0], 0, sizeof(cb->args));
+		if (rtnetlink_links[idx][type].dumpit(skb, cb))
+			break;
+	}
+	cb->family = idx;
+
+	return skb->len;
+}
+
+void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change)
+{
+	struct sk_buff *skb;
+	int size = NLMSG_SPACE(sizeof(struct ifinfomsg) +
+			       sizeof(struct rtnl_link_ifmap) +
+			       sizeof(struct rtnl_link_stats) + 128);
+
+	skb = alloc_skb(size, GFP_KERNEL);
+	if (!skb)
+		return;
+
+	if (rtnetlink_fill_ifinfo(skb, dev, type, 0, 0, change) < 0) {
+		kfree_skb(skb);
+		return;
+	}
+	NETLINK_CB(skb).dst_groups = RTMGRP_LINK;
+	netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_KERNEL);
+}
+
+static int rtnetlink_done(struct
netlink_callback *cb) +{ + return 0; +} + +/* Protected by RTNL sempahore. */ +static struct rtattr **rta_buf; +static int rtattr_max; + +/* Process one rtnetlink message. */ + +static __inline__ int +rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp) +{ + struct rtnetlink_link *link; + struct rtnetlink_link *link_tab; + int sz_idx, kind; + int min_len; + int family; + int type; + int err; + + /* Only requests are handled by kernel now */ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + type = nlh->nlmsg_type; + + /* A control message: ignore them */ + if (type < RTM_BASE) + return 0; + + /* Unknown message: reply with EINVAL */ + if (type > RTM_MAX) + goto err_inval; + + type -= RTM_BASE; + + /* All the messages must have at least 1 byte length */ + if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(struct rtgenmsg))) + return 0; + + family = ((struct rtgenmsg*)NLMSG_DATA(nlh))->rtgen_family; + if (family >= NPROTO) { + *errp = -EAFNOSUPPORT; + return -1; + } + + link_tab = rtnetlink_links[family]; + if (link_tab == NULL) + link_tab = rtnetlink_links[PF_UNSPEC]; + link = &link_tab[type]; + + sz_idx = type>>2; + kind = type&3; + + if (kind != 2 && security_netlink_recv(skb)) { + *errp = -EPERM; + return -1; + } + + if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { + u32 rlen; + + if (link->dumpit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + + if (link->dumpit == NULL) + goto err_inval; + + if ((*errp = netlink_dump_start(rtnl, skb, nlh, + link->dumpit, + rtnetlink_done)) != 0) { + return -1; + } + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + skb_pull(skb, rlen); + return -1; + } + + memset(rta_buf, 0, (rtattr_max * sizeof(struct rtattr *))); + + min_len = rtm_min[sz_idx]; + if (nlh->nlmsg_len < min_len) + goto err_inval; + + if (nlh->nlmsg_len > min_len) { + int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void*)nlh + NLMSG_ALIGN(min_len); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > rta_max[sz_idx]) + goto err_inval; + rta_buf[flavor-1] = attr; + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (link->doit == NULL) + link = &(rtnetlink_links[PF_UNSPEC][type]); + if (link->doit == NULL) + goto err_inval; + err = link->doit(skb, nlh, (void *)&rta_buf[0]); + + *errp = err; + return err; + +err_inval: + *errp = -EINVAL; + return -1; +} + +/* + * Process one packet of messages. + * Malformed skbs with wrong lengths of messages are discarded silently. + */ + +static inline int rtnetlink_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + while (skb->len >= NLMSG_SPACE(0)) { + u32 rlen; + + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return 0; + rlen = NLMSG_ALIGN(nlh->nlmsg_len); + if (rlen > skb->len) + rlen = skb->len; + if (rtnetlink_rcv_msg(skb, nlh, &err)) { + /* Not error, but we must interrupt processing here: + * Note, that in this case we do not pull message + * from skb, it will be processed later. + */ + if (err == 0) + return -1; + netlink_ack(skb, nlh, err); + } else if (nlh->nlmsg_flags&NLM_F_ACK) + netlink_ack(skb, nlh, 0); + skb_pull(skb, rlen); + } + + return 0; +} + +/* + * rtnetlink input queue processing routine: + * - try to acquire shared lock. If it is failed, defer processing. + * - feed skbs to rtnetlink_rcv_skb, until it refuse a message, + * that will occur, when a dump started and/or acquisition of + * exclusive lock failed. 
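+ *
+ * A refused skb that still holds data is pushed back to the head of
+ * sk_receive_queue and the inner loop stops; the outer loop then
+ * retries for as long as the queue is non-empty and the shared lock
+ * can still be taken.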
+ */ + +static void rtnetlink_rcv(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (rtnl_shlock_nowait()) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (rtnetlink_rcv_skb(skb)) { + if (skb->len) + skb_queue_head(&sk->sk_receive_queue, + skb); + else + kfree_skb(skb); + break; + } + kfree_skb(skb); + } + + up(&rtnl_sem); + + netdev_run_todo(); + } while (rtnl && rtnl->sk_receive_queue.qlen); +} + +static struct rtnetlink_link link_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + [RTM_GETLINK - RTM_BASE] = { .dumpit = rtnetlink_dump_ifinfo }, + [RTM_SETLINK - RTM_BASE] = { .doit = do_setlink }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, + [RTM_GETROUTE - RTM_BASE] = { .dumpit = rtnetlink_dump_all }, + [RTM_NEWNEIGH - RTM_BASE] = { .doit = neigh_add }, + [RTM_DELNEIGH - RTM_BASE] = { .doit = neigh_delete }, + [RTM_GETNEIGH - RTM_BASE] = { .dumpit = neigh_dump_info } +}; + +static int rtnetlink_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + switch (event) { + case NETDEV_UNREGISTER: + rtmsg_ifinfo(RTM_DELLINK, dev, ~0U); + break; + case NETDEV_REGISTER: + rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U); + break; + case NETDEV_UP: + case NETDEV_DOWN: + rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING); + break; + case NETDEV_CHANGE: + case NETDEV_GOING_DOWN: + break; + default: + rtmsg_ifinfo(RTM_NEWLINK, dev, 0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block rtnetlink_dev_notifier = { + .notifier_call = rtnetlink_event, +}; + +void __init rtnetlink_init(void) +{ + int i; + + rtattr_max = 0; + for (i = 0; i < ARRAY_SIZE(rta_max); i++) + if (rta_max[i] > rtattr_max) + rtattr_max = rta_max[i]; + rta_buf = kmalloc(rtattr_max * sizeof(struct rtattr *), GFP_KERNEL); + if (!rta_buf) + panic("rtnetlink_init: cannot allocate rta_buf\n"); + + rtnl = netlink_kernel_create(NETLINK_ROUTE, rtnetlink_rcv); + if (rtnl == NULL) + panic("rtnetlink_init: cannot initialize rtnetlink\n"); + netlink_set_nonroot(NETLINK_ROUTE, NL_NONROOT_RECV); + register_netdevice_notifier(&rtnetlink_dev_notifier); + rtnetlink_links[PF_UNSPEC] = link_rtnetlink_table; + rtnetlink_links[PF_PACKET] = link_rtnetlink_table; +} + +EXPORT_SYMBOL(__rta_fill); +EXPORT_SYMBOL(rtattr_strlcpy); +EXPORT_SYMBOL(rtattr_parse); +EXPORT_SYMBOL(rtnetlink_links); +EXPORT_SYMBOL(rtnetlink_put_metrics); +EXPORT_SYMBOL(rtnl); +EXPORT_SYMBOL(rtnl_lock); +EXPORT_SYMBOL(rtnl_lock_interruptible); +EXPORT_SYMBOL(rtnl_sem); +EXPORT_SYMBOL(rtnl_unlock); diff --git a/net/core/scm.c b/net/core/scm.c new file mode 100644 index 000000000000..a2ebf30f6aa8 --- /dev/null +++ b/net/core/scm.c @@ -0,0 +1,291 @@ +/* scm.c - Socket level control messages processing. + * + * Author: Alexey Kuznetsov, + * Alignment and value checking mods by Craig Metz + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + + +/* + * Only allow a user to send credentials, that they could set with + * setu(g)id. 
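[Editor's note, not part of the patch] The check described above is what gates explicit SCM_CREDENTIALS control messages coming from user space. A minimal sender might look like the sketch below; the receiving end would additionally need SO_PASSCRED enabled to have the credentials delivered. Illustrative only, error handling trimmed.

#define _GNU_SOURCE            /* struct ucred */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/uio.h>

int main(void)
{
	int sv[2];
	if (socketpair(AF_UNIX, SOCK_DGRAM, 0, sv) < 0)
		return 1;

	struct ucred creds = {
		.pid = getpid(),   /* must match the caller unless CAP_SYS_ADMIN */
		.uid = getuid(),   /* must be ruid/euid/suid unless CAP_SETUID  */
		.gid = getgid(),   /* must be rgid/egid/sgid unless CAP_SETGID  */
	};

	char dummy = 'x';
	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
	union {
		char buf[CMSG_SPACE(sizeof(creds))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov, .msg_iovlen = 1,
		.msg_control = u.buf, .msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type  = SCM_CREDENTIALS;
	cmsg->cmsg_len   = CMSG_LEN(sizeof(creds));
	memcpy(CMSG_DATA(cmsg), &creds, sizeof(creds));

	/* This sendmsg() is validated by __scm_send() -> scm_check_creds(). */
	if (sendmsg(sv[0], &msg, 0) < 0)
		perror("sendmsg");
	return 0;
}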
+ */ + +static __inline__ int scm_check_creds(struct ucred *creds) +{ + if ((creds->pid == current->tgid || capable(CAP_SYS_ADMIN)) && + ((creds->uid == current->uid || creds->uid == current->euid || + creds->uid == current->suid) || capable(CAP_SETUID)) && + ((creds->gid == current->gid || creds->gid == current->egid || + creds->gid == current->sgid) || capable(CAP_SETGID))) { + return 0; + } + return -EPERM; +} + +static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) +{ + int *fdp = (int*)CMSG_DATA(cmsg); + struct scm_fp_list *fpl = *fplp; + struct file **fpp; + int i, num; + + num = (cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)))/sizeof(int); + + if (num <= 0) + return 0; + + if (num > SCM_MAX_FD) + return -EINVAL; + + if (!fpl) + { + fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL); + if (!fpl) + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; + } + fpp = &fpl->fp[fpl->count]; + + if (fpl->count + num > SCM_MAX_FD) + return -EINVAL; + + /* + * Verify the descriptors and increment the usage count. + */ + + for (i=0; i< num; i++) + { + int fd = fdp[i]; + struct file *file; + + if (fd < 0 || !(file = fget(fd))) + return -EBADF; + *fpp++ = file; + fpl->count++; + } + return num; +} + +void __scm_destroy(struct scm_cookie *scm) +{ + struct scm_fp_list *fpl = scm->fp; + int i; + + if (fpl) { + scm->fp = NULL; + for (i=fpl->count-1; i>=0; i--) + fput(fpl->fp[i]); + kfree(fpl); + } +} + +int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p) +{ + struct cmsghdr *cmsg; + int err; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) + { + err = -EINVAL; + + /* Verify that cmsg_len is at least sizeof(struct cmsghdr) */ + /* The first check was omitted in <= 2.2.5. The reasoning was + that parser checks cmsg_len in any case, so that + additional check would be work duplication. + But if cmsg_level is not SOL_SOCKET, we do not check + for too short ancillary data object at all! Oops. + OK, let's add it... + */ + if (!CMSG_OK(msg, cmsg)) + goto error; + + if (cmsg->cmsg_level != SOL_SOCKET) + continue; + + switch (cmsg->cmsg_type) + { + case SCM_RIGHTS: + err=scm_fp_copy(cmsg, &p->fp); + if (err<0) + goto error; + break; + case SCM_CREDENTIALS: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct ucred))) + goto error; + memcpy(&p->creds, CMSG_DATA(cmsg), sizeof(struct ucred)); + err = scm_check_creds(&p->creds); + if (err) + goto error; + break; + default: + goto error; + } + } + + if (p->fp && !p->fp->count) + { + kfree(p->fp); + p->fp = NULL; + } + return 0; + +error: + scm_destroy(p); + return err; +} + +int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) +{ + struct cmsghdr __user *cm = (struct cmsghdr __user *)msg->msg_control; + struct cmsghdr cmhdr; + int cmlen = CMSG_LEN(len); + int err; + + if (MSG_CMSG_COMPAT & msg->msg_flags) + return put_cmsg_compat(msg, level, type, len, data); + + if (cm==NULL || msg->msg_controllen < sizeof(*cm)) { + msg->msg_flags |= MSG_CTRUNC; + return 0; /* XXX: return error? check spec. 
*/ + } + if (msg->msg_controllen < cmlen) { + msg->msg_flags |= MSG_CTRUNC; + cmlen = msg->msg_controllen; + } + cmhdr.cmsg_level = level; + cmhdr.cmsg_type = type; + cmhdr.cmsg_len = cmlen; + + err = -EFAULT; + if (copy_to_user(cm, &cmhdr, sizeof cmhdr)) + goto out; + if (copy_to_user(CMSG_DATA(cm), data, cmlen - sizeof(struct cmsghdr))) + goto out; + cmlen = CMSG_SPACE(len); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + err = 0; +out: + return err; +} + +void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm) +{ + struct cmsghdr __user *cm = (struct cmsghdr __user*)msg->msg_control; + + int fdmax = 0; + int fdnum = scm->fp->count; + struct file **fp = scm->fp->fp; + int __user *cmfptr; + int err = 0, i; + + if (MSG_CMSG_COMPAT & msg->msg_flags) { + scm_detach_fds_compat(msg, scm); + return; + } + + if (msg->msg_controllen > sizeof(struct cmsghdr)) + fdmax = ((msg->msg_controllen - sizeof(struct cmsghdr)) + / sizeof(int)); + + if (fdnum < fdmax) + fdmax = fdnum; + + for (i=0, cmfptr=(int __user *)CMSG_DATA(cm); i 0) + { + int cmlen = CMSG_LEN(i*sizeof(int)); + if (!err) + err = put_user(SOL_SOCKET, &cm->cmsg_level); + if (!err) + err = put_user(SCM_RIGHTS, &cm->cmsg_type); + if (!err) + err = put_user(cmlen, &cm->cmsg_len); + if (!err) { + cmlen = CMSG_SPACE(i*sizeof(int)); + msg->msg_control += cmlen; + msg->msg_controllen -= cmlen; + } + } + if (i < fdnum || (fdnum && fdmax <= 0)) + msg->msg_flags |= MSG_CTRUNC; + + /* + * All of the files that fit in the message have had their + * usage counts incremented, so we just free the list. + */ + __scm_destroy(scm); +} + +struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) +{ + struct scm_fp_list *new_fpl; + int i; + + if (!fpl) + return NULL; + + new_fpl = kmalloc(sizeof(*fpl), GFP_KERNEL); + if (new_fpl) { + for (i=fpl->count-1; i>=0; i--) + get_file(fpl->fp[i]); + memcpy(new_fpl, fpl, sizeof(*fpl)); + } + return new_fpl; +} + +EXPORT_SYMBOL(__scm_destroy); +EXPORT_SYMBOL(__scm_send); +EXPORT_SYMBOL(put_cmsg); +EXPORT_SYMBOL(scm_detach_fds); +EXPORT_SYMBOL(scm_fp_dup); diff --git a/net/core/skbuff.c b/net/core/skbuff.c new file mode 100644 index 000000000000..bf02ca9f80ac --- /dev/null +++ b/net/core/skbuff.c @@ -0,0 +1,1460 @@ +/* + * Routines having to do with the 'struct sk_buff' memory handlers. + * + * Authors: Alan Cox + * Florian La Roche + * + * Version: $Id: skbuff.c,v 1.90 2001/11/07 05:56:19 davem Exp $ + * + * Fixes: + * Alan Cox : Fixed the worst of the load + * balancer bugs. + * Dave Platt : Interrupt stacking fix. + * Richard Kooijman : Timestamp fixes. + * Alan Cox : Changed buffer format. + * Alan Cox : destructor hook for AF_UNIX etc. + * Linus Torvalds : Better skb_clone. + * Alan Cox : Added skb_copy. + * Alan Cox : Added all the changed routines Linus + * only put in the headers + * Ray VanTassle : Fixed --skb->lock in free + * Alan Cox : skb_copy copy arp field + * Andi Kleen : slabified it. + * Robert Olsson : Removed skb_head_pool + * + * NOTE: + * The __skb_ routines should be called with interrupts + * disabled, or you better be *real* sure that the operation is atomic + * with respect to whatever list is being frobbed (e.g. via lock_sock() + * or via disabling bottom half handlers, etc). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +/* + * The functions in this file will not compile correctly with gcc 2.4.x + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_NET_CLS_ACT +#include +#endif +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +static kmem_cache_t *skbuff_head_cache; + +/* + * Keep out-of-line to prevent kernel bloat. + * __builtin_return_address is not used because it is not always + * reliable. + */ + +/** + * skb_over_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_put(). Not user callable. + */ +void skb_over_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_INFO "skput:over: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : ""); + BUG(); +} + +/** + * skb_under_panic - private function + * @skb: buffer + * @sz: size + * @here: address + * + * Out of line support code for skb_push(). Not user callable. + */ + +void skb_under_panic(struct sk_buff *skb, int sz, void *here) +{ + printk(KERN_INFO "skput:under: %p:%d put:%d dev:%s", + here, skb->len, sz, skb->dev ? skb->dev->name : ""); + BUG(); +} + +/* Allocate a new skbuff. We do this ourselves so we can fill in a few + * 'private' fields and also do memory statistics to find all the + * [BEEP] leaks. + * + */ + +/** + * alloc_skb - allocate a network buffer + * @size: size to allocate + * @gfp_mask: allocation mask + * + * Allocate a new &sk_buff. The returned buffer has no headroom and a + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. + */ +struct sk_buff *alloc_skb(unsigned int size, int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, + gfp_mask & ~__GFP_DMA); + if (!skb) + goto out; + + /* Get the DATA. Size must match skb_add_mtu(). */ + size = SKB_DATA_ALIGN(size); + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; +out: + return skb; +nodata: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +} + +/** + * alloc_skb_from_cache - allocate a network buffer + * @cp: kmem_cache from which to allocate the data area + * (object size must be big enough for @size bytes + skb overheads) + * @size: size to allocate + * @gfp_mask: allocation mask + * + * Allocate a new &sk_buff. The returned buffer has no headroom and + * tail room of size bytes. The object has a reference count of one. + * The return is the buffer. On a failure the return is %NULL. + * + * Buffers may only be allocated from interrupts using a @gfp_mask of + * %GFP_ATOMIC. 
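[Editor's note, not part of the patch] A hedged usage sketch: since alloc_skb() returns a buffer with no headroom, a typical receive path reserves alignment headroom first and then copies the payload in with skb_put(). The helper name my_rx() and the sizes are invented for illustration, and 'data' is assumed to hold a complete Ethernet frame.

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/string.h>

static struct sk_buff *my_rx(struct net_device *dev, void *data, unsigned int len)
{
	struct sk_buff *skb;

	/* GFP_ATOMIC: receive paths commonly run in softirq context. */
	skb = alloc_skb(len + 2, GFP_ATOMIC);
	if (!skb)
		return NULL;

	skb_reserve(skb, 2);                    /* align the IP header */
	memcpy(skb_put(skb, len), data, len);   /* fill the reserved tail room */
	skb->dev = dev;
	skb->protocol = eth_type_trans(skb, dev);
	return skb;
}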
+ */ +struct sk_buff *alloc_skb_from_cache(kmem_cache_t *cp, + unsigned int size, int gfp_mask) +{ + struct sk_buff *skb; + u8 *data; + + /* Get the HEAD */ + skb = kmem_cache_alloc(skbuff_head_cache, + gfp_mask & ~__GFP_DMA); + if (!skb) + goto out; + + /* Get the DATA. */ + size = SKB_DATA_ALIGN(size); + data = kmem_cache_alloc(cp, gfp_mask); + if (!data) + goto nodata; + + memset(skb, 0, offsetof(struct sk_buff, truesize)); + skb->truesize = size + sizeof(struct sk_buff); + atomic_set(&skb->users, 1); + skb->head = data; + skb->data = data; + skb->tail = data; + skb->end = data + size; + + atomic_set(&(skb_shinfo(skb)->dataref), 1); + skb_shinfo(skb)->nr_frags = 0; + skb_shinfo(skb)->tso_size = 0; + skb_shinfo(skb)->tso_segs = 0; + skb_shinfo(skb)->frag_list = NULL; +out: + return skb; +nodata: + kmem_cache_free(skbuff_head_cache, skb); + skb = NULL; + goto out; +} + + +static void skb_drop_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + skb_shinfo(skb)->frag_list = NULL; + + do { + struct sk_buff *this = list; + list = list->next; + kfree_skb(this); + } while (list); +} + +static void skb_clone_fraglist(struct sk_buff *skb) +{ + struct sk_buff *list; + + for (list = skb_shinfo(skb)->frag_list; list; list = list->next) + skb_get(list); +} + +void skb_release_data(struct sk_buff *skb) +{ + if (!skb->cloned || + !atomic_sub_return(skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1, + &skb_shinfo(skb)->dataref)) { + if (skb_shinfo(skb)->nr_frags) { + int i; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + put_page(skb_shinfo(skb)->frags[i].page); + } + + if (skb_shinfo(skb)->frag_list) + skb_drop_fraglist(skb); + + kfree(skb->head); + } +} + +/* + * Free an skbuff by memory without cleaning the state. + */ +void kfree_skbmem(struct sk_buff *skb) +{ + skb_release_data(skb); + kmem_cache_free(skbuff_head_cache, skb); +} + +/** + * __kfree_skb - private function + * @skb: buffer + * + * Free an sk_buff. Release anything attached to the buffer. + * Clean the state. This is an internal helper function. Users should + * always call kfree_skb + */ + +void __kfree_skb(struct sk_buff *skb) +{ + if (skb->list) { + printk(KERN_WARNING "Warning: kfree_skb passed an skb still " + "on a list (from %p).\n", NET_CALLER(skb)); + BUG(); + } + + dst_release(skb->dst); +#ifdef CONFIG_XFRM + secpath_put(skb->sp); +#endif + if(skb->destructor) { + if (in_irq()) + printk(KERN_WARNING "Warning: kfree_skb on " + "hard IRQ %p\n", NET_CALLER(skb)); + skb->destructor(skb); + } +#ifdef CONFIG_NETFILTER + nf_conntrack_put(skb->nfct); +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(skb->nf_bridge); +#endif +#endif +/* XXX: IS this still necessary? - JHS */ +#ifdef CONFIG_NET_SCHED + skb->tc_index = 0; +#ifdef CONFIG_NET_CLS_ACT + skb->tc_verd = 0; + skb->tc_classid = 0; +#endif +#endif + + kfree_skbmem(skb); +} + +/** + * skb_clone - duplicate an sk_buff + * @skb: buffer to clone + * @gfp_mask: allocation priority + * + * Duplicate an &sk_buff. The new one is not owned by a socket. Both + * copies share the same packet data but not structure. The new + * buffer has a reference count of 1. If the allocation fails the + * function returns %NULL otherwise the new buffer is returned. + * + * If this function is called from an interrupt gfp_mask() must be + * %GFP_ATOMIC. 
+ */ + +struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask) +{ + struct sk_buff *n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); + + if (!n) + return NULL; + +#define C(x) n->x = skb->x + + n->next = n->prev = NULL; + n->list = NULL; + n->sk = NULL; + C(stamp); + C(dev); + C(real_dev); + C(h); + C(nh); + C(mac); + C(dst); + dst_clone(skb->dst); + C(sp); +#ifdef CONFIG_INET + secpath_get(skb->sp); +#endif + memcpy(n->cb, skb->cb, sizeof(skb->cb)); + C(len); + C(data_len); + C(csum); + C(local_df); + n->cloned = 1; + n->nohdr = 0; + C(pkt_type); + C(ip_summed); + C(priority); + C(protocol); + C(security); + n->destructor = NULL; +#ifdef CONFIG_NETFILTER + C(nfmark); + C(nfcache); + C(nfct); + nf_conntrack_get(skb->nfct); + C(nfctinfo); +#ifdef CONFIG_NETFILTER_DEBUG + C(nf_debug); +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + C(nf_bridge); + nf_bridge_get(skb->nf_bridge); +#endif +#endif /*CONFIG_NETFILTER*/ +#if defined(CONFIG_HIPPI) + C(private); +#endif +#ifdef CONFIG_NET_SCHED + C(tc_index); +#ifdef CONFIG_NET_CLS_ACT + n->tc_verd = SET_TC_VERD(skb->tc_verd,0); + n->tc_verd = CLR_TC_OK2MUNGE(skb->tc_verd); + n->tc_verd = CLR_TC_MUNGED(skb->tc_verd); + C(input_dev); + C(tc_classid); +#endif + +#endif + C(truesize); + atomic_set(&n->users, 1); + C(head); + C(data); + C(tail); + C(end); + + atomic_inc(&(skb_shinfo(skb)->dataref)); + skb->cloned = 1; + + return n; +} + +static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) +{ + /* + * Shift between the two data areas in bytes + */ + unsigned long offset = new->data - old->data; + + new->list = NULL; + new->sk = NULL; + new->dev = old->dev; + new->real_dev = old->real_dev; + new->priority = old->priority; + new->protocol = old->protocol; + new->dst = dst_clone(old->dst); +#ifdef CONFIG_INET + new->sp = secpath_get(old->sp); +#endif + new->h.raw = old->h.raw + offset; + new->nh.raw = old->nh.raw + offset; + new->mac.raw = old->mac.raw + offset; + memcpy(new->cb, old->cb, sizeof(old->cb)); + new->local_df = old->local_df; + new->pkt_type = old->pkt_type; + new->stamp = old->stamp; + new->destructor = NULL; + new->security = old->security; +#ifdef CONFIG_NETFILTER + new->nfmark = old->nfmark; + new->nfcache = old->nfcache; + new->nfct = old->nfct; + nf_conntrack_get(old->nfct); + new->nfctinfo = old->nfctinfo; +#ifdef CONFIG_NETFILTER_DEBUG + new->nf_debug = old->nf_debug; +#endif +#ifdef CONFIG_BRIDGE_NETFILTER + new->nf_bridge = old->nf_bridge; + nf_bridge_get(old->nf_bridge); +#endif +#endif +#ifdef CONFIG_NET_SCHED +#ifdef CONFIG_NET_CLS_ACT + new->tc_verd = old->tc_verd; +#endif + new->tc_index = old->tc_index; +#endif + atomic_set(&new->users, 1); + skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; + skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; +} + +/** + * skb_copy - create private copy of an sk_buff + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data. This is used when the + * caller wishes to modify the data and needs a private copy of the + * data to alter. Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * As by-product this function converts non-linear &sk_buff to linear + * one, so that &sk_buff becomes completely private and caller is allowed + * to modify all the data of returned buffer. This means that this + * function is not recommended for use in circumstances when only + * header is going to be modified. Use pskb_copy() instead. 
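[Editor's note, not part of the patch] A small illustrative helper making the choice the comment above recommends; the name private_copy() is invented, and the gfp flag is assumed to be GFP_ATOMIC for a softirq caller.

#include <linux/skbuff.h>

static struct sk_buff *private_copy(struct sk_buff *skb, int payload_will_change)
{
	if (payload_will_change)
		return skb_copy(skb, GFP_ATOMIC);  /* linearizes and copies all data */
	return pskb_copy(skb, GFP_ATOMIC);         /* header private, frags stay shared */
}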
+ */ + +struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask) +{ + int headerlen = skb->data - skb->head; + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(skb->end - skb->head + skb->data_len, + gfp_mask); + if (!n) + return NULL; + + /* Set the data pointer */ + skb_reserve(n, headerlen); + /* Set the tail pointer and length */ + skb_put(n, skb->len); + n->csum = skb->csum; + n->ip_summed = skb->ip_summed; + + if (skb_copy_bits(skb, -headerlen, n->head, headerlen + skb->len)) + BUG(); + + copy_skb_header(n, skb); + return n; +} + + +/** + * pskb_copy - create copy of an sk_buff with private head. + * @skb: buffer to copy + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and part of its data, located + * in header. Fragmented data remain shared. This is used when + * the caller wishes to modify only header of &sk_buff and needs + * private copy of the header to alter. Returns %NULL on failure + * or the pointer to the buffer on success. + * The returned buffer has a reference count of 1. + */ + +struct sk_buff *pskb_copy(struct sk_buff *skb, int gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(skb->end - skb->head, gfp_mask); + + if (!n) + goto out; + + /* Set the data pointer */ + skb_reserve(n, skb->data - skb->head); + /* Set the tail pointer and length */ + skb_put(n, skb_headlen(skb)); + /* Copy the bytes */ + memcpy(n->data, skb->data, n->len); + n->csum = skb->csum; + n->ip_summed = skb->ip_summed; + + n->data_len = skb->data_len; + n->len = skb->len; + + if (skb_shinfo(skb)->nr_frags) { + int i; + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i]; + get_page(skb_shinfo(n)->frags[i].page); + } + skb_shinfo(n)->nr_frags = i; + } + + if (skb_shinfo(skb)->frag_list) { + skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; + skb_clone_fraglist(n); + } + + copy_skb_header(n, skb); +out: + return n; +} + +/** + * pskb_expand_head - reallocate header of &sk_buff + * @skb: buffer to reallocate + * @nhead: room to add at head + * @ntail: room to add at tail + * @gfp_mask: allocation priority + * + * Expands (or creates identical copy, if &nhead and &ntail are zero) + * header of skb. &sk_buff itself is not changed. &sk_buff MUST have + * reference count of 1. Returns zero in the case of success or error, + * if expansion failed. In the last case, &sk_buff is not changed. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, int gfp_mask) +{ + int i; + u8 *data; + int size = nhead + (skb->end - skb->head) + ntail; + long off; + + if (skb_shared(skb)) + BUG(); + + size = SKB_DATA_ALIGN(size); + + data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); + if (!data) + goto nodata; + + /* Copy only real data... and, alas, header. This should be + * optimized for the cases when header is void. 
*/ + memcpy(data + nhead, skb->head, skb->tail - skb->head); + memcpy(data + size, skb->end, sizeof(struct skb_shared_info)); + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + + if (skb_shinfo(skb)->frag_list) + skb_clone_fraglist(skb); + + skb_release_data(skb); + + off = (data + nhead) - skb->head; + + skb->head = data; + skb->end = data + size; + skb->data += off; + skb->tail += off; + skb->mac.raw += off; + skb->h.raw += off; + skb->nh.raw += off; + skb->cloned = 0; + skb->nohdr = 0; + atomic_set(&skb_shinfo(skb)->dataref, 1); + return 0; + +nodata: + return -ENOMEM; +} + +/* Make private copy of skb with writable head and some headroom */ + +struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom) +{ + struct sk_buff *skb2; + int delta = headroom - skb_headroom(skb); + + if (delta <= 0) + skb2 = pskb_copy(skb, GFP_ATOMIC); + else { + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 && pskb_expand_head(skb2, SKB_DATA_ALIGN(delta), 0, + GFP_ATOMIC)) { + kfree_skb(skb2); + skb2 = NULL; + } + } + return skb2; +} + + +/** + * skb_copy_expand - copy and expand sk_buff + * @skb: buffer to copy + * @newheadroom: new free bytes at head + * @newtailroom: new free bytes at tail + * @gfp_mask: allocation priority + * + * Make a copy of both an &sk_buff and its data and while doing so + * allocate additional space. + * + * This is used when the caller wishes to modify the data and needs a + * private copy of the data to alter as well as more space for new fields. + * Returns %NULL on failure or the pointer to the buffer + * on success. The returned buffer has a reference count of 1. + * + * You must pass %GFP_ATOMIC as the allocation priority if this function + * is called from an interrupt. + * + * BUG ALERT: ip_summed is not copied. Why does this work? Is it used + * only by netfilter in the cases when checksum is recalculated? --ANK + */ +struct sk_buff *skb_copy_expand(const struct sk_buff *skb, + int newheadroom, int newtailroom, int gfp_mask) +{ + /* + * Allocate the copy buffer + */ + struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask); + int head_copy_len, head_copy_off; + + if (!n) + return NULL; + + skb_reserve(n, newheadroom); + + /* Set the tail pointer and length */ + skb_put(n, skb->len); + + head_copy_len = skb_headroom(skb); + head_copy_off = 0; + if (newheadroom <= head_copy_len) + head_copy_len = newheadroom; + else + head_copy_off = newheadroom - head_copy_len; + + /* Copy the linear header and data. */ + if (skb_copy_bits(skb, -head_copy_len, n->head + head_copy_off, + skb->len + head_copy_len)) + BUG(); + + copy_skb_header(n, skb); + + return n; +} + +/** + * skb_pad - zero pad the tail of an skb + * @skb: buffer to pad + * @pad: space to pad + * + * Ensure that a buffer is followed by a padding area that is zero + * filled. Used by network drivers which may DMA or transfer data + * beyond the buffer end onto the wire. + * + * May return NULL in out of memory cases. + */ + +struct sk_buff *skb_pad(struct sk_buff *skb, int pad) +{ + struct sk_buff *nskb; + + /* If the skbuff is non linear tailroom is always zero.. */ + if (skb_tailroom(skb) >= pad) { + memset(skb->data+skb->len, 0, pad); + return skb; + } + + nskb = skb_copy_expand(skb, skb_headroom(skb), skb_tailroom(skb) + pad, GFP_ATOMIC); + kfree_skb(skb); + if (nskb) + memset(nskb->data+nskb->len, 0, pad); + return nskb; +} + +/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. 
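[Editor's note, not part of the patch] A hedged sketch of how skb_realloc_headroom() above is commonly combined with headroom and clone checks before pushing an extra header; the helper name push_header_room() is invented for illustration.

#include <linux/skbuff.h>

static struct sk_buff *push_header_room(struct sk_buff *skb, unsigned int hlen)
{
	if (skb_headroom(skb) < hlen || skb_cloned(skb)) {
		struct sk_buff *nskb = skb_realloc_headroom(skb, hlen);

		kfree_skb(skb);   /* caller keeps only the reallocated copy */
		skb = nskb;       /* may be NULL on allocation failure */
	}
	return skb;
}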
+ * If realloc==0 and trimming is impossible without change of data, + * it is BUG(). + */ + +int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +{ + int offset = skb_headlen(skb); + int nfrags = skb_shinfo(skb)->nr_frags; + int i; + + for (i = 0; i < nfrags; i++) { + int end = offset + skb_shinfo(skb)->frags[i].size; + if (end > len) { + if (skb_cloned(skb)) { + if (!realloc) + BUG(); + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + } + if (len <= offset) { + put_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb)->nr_frags--; + } else { + skb_shinfo(skb)->frags[i].size = len - offset; + } + } + offset = end; + } + + if (offset < len) { + skb->data_len -= skb->len - len; + skb->len = len; + } else { + if (len <= skb_headlen(skb)) { + skb->len = len; + skb->data_len = 0; + skb->tail = skb->data + len; + if (skb_shinfo(skb)->frag_list && !skb_cloned(skb)) + skb_drop_fraglist(skb); + } else { + skb->data_len -= skb->len - len; + skb->len = len; + } + } + + return 0; +} + +/** + * __pskb_pull_tail - advance tail of skb header + * @skb: buffer to reallocate + * @delta: number of bytes to advance tail + * + * The function makes a sense only on a fragmented &sk_buff, + * it expands header moving its tail forward and copying necessary + * data from fragmented part. + * + * &sk_buff MUST have reference count of 1. + * + * Returns %NULL (and &sk_buff does not change) if pull failed + * or value of new tail of skb in the case of success. + * + * All the pointers pointing into skb header may change and must be + * reloaded after call to this function. + */ + +/* Moves tail of skb head forward, copying data from fragmented part, + * when it is necessary. + * 1. It may fail due to malloc failure. + * 2. It may change skb pointers. + * + * It is pretty complicated. Luckily, it is called only in exceptional cases. + */ +unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) +{ + /* If skb has not enough free space at tail, get new one + * plus 128 bytes for future expansions. If we have enough + * room at tail, reallocate without expansion only if skb is cloned. + */ + int i, k, eat = (skb->tail + delta) - skb->end; + + if (eat > 0 || skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0, + GFP_ATOMIC)) + return NULL; + } + + if (skb_copy_bits(skb, skb_headlen(skb), skb->tail, delta)) + BUG(); + + /* Optimization: no fragments, no reasons to preestimate + * size of pulled pages. Superb. + */ + if (!skb_shinfo(skb)->frag_list) + goto pull_pages; + + /* Estimate size of pulled pages. */ + eat = delta; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size >= eat) + goto pull_pages; + eat -= skb_shinfo(skb)->frags[i].size; + } + + /* If we need update frag list, we are in troubles. + * Certainly, it possible to add an offset to skb data, + * but taking into account that pulling is expected to + * be very rare operation, it is worth to fight against + * further bloating skb head and crucify ourselves here instead. + * Pure masohism, indeed. 8)8) + */ + if (eat) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + struct sk_buff *clone = NULL; + struct sk_buff *insp = NULL; + + do { + if (!list) + BUG(); + + if (list->len <= eat) { + /* Eaten as whole. */ + eat -= list->len; + list = list->next; + insp = list; + } else { + /* Eaten partially. */ + + if (skb_shared(list)) { + /* Sucks! We need to fork list. 
:-( */ + clone = skb_clone(list, GFP_ATOMIC); + if (!clone) + return NULL; + insp = list->next; + list = clone; + } else { + /* This may be pulled without + * problems. */ + insp = list; + } + if (!pskb_pull(list, eat)) { + if (clone) + kfree_skb(clone); + return NULL; + } + break; + } + } while (eat); + + /* Free pulled out fragments. */ + while ((list = skb_shinfo(skb)->frag_list) != insp) { + skb_shinfo(skb)->frag_list = list->next; + kfree_skb(list); + } + /* And insert new clone at head. */ + if (clone) { + clone->next = list; + skb_shinfo(skb)->frag_list = clone; + } + } + /* Success! Now we may commit changes to skb data. */ + +pull_pages: + eat = delta; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail += delta; + skb->data_len -= delta; + + return skb->tail; +} + +/* Copy some data bits from skb to kernel buffer. */ + +int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len) +{ + int i, copy; + int start = skb_headlen(skb); + + if (offset > (int)skb->len - len) + goto fault; + + /* Copy header. */ + if ((copy = start - offset) > 0) { + if (copy > len) + copy = len; + memcpy(to, skb->data + offset, copy); + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + u8 *vaddr; + + if (copy > len) + copy = len; + + vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); + memcpy(to, + vaddr + skb_shinfo(skb)->frags[i].page_offset+ + offset - start, copy); + kunmap_skb_frag(vaddr); + + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_bits(list, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + to += copy; + } + start = end; + } + } + if (!len) + return 0; + +fault: + return -EFAULT; +} + +/* Checksum skb data. */ + +unsigned int skb_checksum(const struct sk_buff *skb, int offset, + int len, unsigned int csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int pos = 0; + + /* Checksum header. 
*/ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial(skb->data + offset, copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + unsigned int csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial(vaddr + frag->page_offset + + offset - start, copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + unsigned int csum2; + if (copy > len) + copy = len; + csum2 = skb_checksum(list, offset - start, + copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + pos += copy; + } + start = end; + } + } + if (len) + BUG(); + + return csum; +} + +/* Both of above in one bottle. */ + +unsigned int skb_copy_and_csum_bits(const struct sk_buff *skb, int offset, + u8 *to, int len, unsigned int csum) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int pos = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + csum = csum_partial_copy_nocheck(skb->data + offset, to, + copy, csum); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos = copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + unsigned int csum2; + u8 *vaddr; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + vaddr = kmap_skb_frag(frag); + csum2 = csum_partial_copy_nocheck(vaddr + + frag->page_offset + + offset - start, to, + copy, 0); + kunmap_skb_frag(vaddr); + csum = csum_block_add(csum, csum2, pos); + if (!(len -= copy)) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + unsigned int csum2; + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + csum2 = skb_copy_and_csum_bits(list, + offset - start, + to, copy, 0); + csum = csum_block_add(csum, csum2, pos); + if ((len -= copy) == 0) + return csum; + offset += copy; + to += copy; + pos += copy; + } + start = end; + } + } + if (len) + BUG(); + return csum; +} + +void skb_copy_and_csum_dev(const struct sk_buff *skb, u8 *to) +{ + unsigned int csum; + long csstart; + + if (skb->ip_summed == CHECKSUM_HW) + csstart = skb->h.raw - skb->data; + else + csstart = skb_headlen(skb); + + if (csstart > skb_headlen(skb)) + BUG(); + + memcpy(to, skb->data, csstart); + + csum = 0; + if (csstart != skb->len) + csum = skb_copy_and_csum_bits(skb, csstart, to + csstart, + skb->len - csstart, 0); + + if (skb->ip_summed == CHECKSUM_HW) { + long csstuff = csstart + skb->csum; + + *((unsigned short *)(to + csstuff)) = csum_fold(csum); + } +} + +/** + * skb_dequeue - remove from 
the head of the queue + * @list: list to dequeue from + * + * Remove the head of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The head item is + * returned or %NULL if the list is empty. + */ + +struct sk_buff *skb_dequeue(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_dequeue_tail - remove from the tail of the queue + * @list: list to dequeue from + * + * Remove the tail of the list. The list lock is taken so the function + * may be used safely with other locking list functions. The tail item is + * returned or %NULL if the list is empty. + */ +struct sk_buff *skb_dequeue_tail(struct sk_buff_head *list) +{ + unsigned long flags; + struct sk_buff *result; + + spin_lock_irqsave(&list->lock, flags); + result = __skb_dequeue_tail(list); + spin_unlock_irqrestore(&list->lock, flags); + return result; +} + +/** + * skb_queue_purge - empty a list + * @list: list to empty + * + * Delete all buffers on an &sk_buff list. Each buffer is removed from + * the list and one reference dropped. This function takes the list + * lock and is atomic with respect to other list locking functions. + */ +void skb_queue_purge(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + kfree_skb(skb); +} + +/** + * skb_queue_head - queue a buffer at the list head + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the start of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_head(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_head(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} + +/** + * skb_queue_tail - queue a buffer at the list tail + * @list: list to use + * @newsk: buffer to queue + * + * Queue a buffer at the tail of the list. This function takes the + * list lock and can be used safely with other locking &sk_buff functions + * safely. + * + * A buffer cannot be placed on two lists at the same time. + */ +void skb_queue_tail(struct sk_buff_head *list, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + __skb_queue_tail(list, newsk); + spin_unlock_irqrestore(&list->lock, flags); +} +/** + * skb_unlink - remove a buffer from a list + * @skb: buffer to remove + * + * Place a packet after a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls + * + * Works even without knowing the list it is sitting on, which can be + * handy at times. It also means that THE LIST MUST EXIST when you + * unlink. Thus a list must have its contents unlinked before it is + * destroyed. + */ +void skb_unlink(struct sk_buff *skb) +{ + struct sk_buff_head *list = skb->list; + + if (list) { + unsigned long flags; + + spin_lock_irqsave(&list->lock, flags); + if (skb->list == list) + __skb_unlink(skb, skb->list); + spin_unlock_irqrestore(&list->lock, flags); + } +} + + +/** + * skb_append - append a buffer + * @old: buffer to insert after + * @newsk: buffer to insert + * + * Place a packet after a given packet in a list. 
The list locks are taken + * and this function is atomic with respect to other list locked calls. + * A buffer cannot be placed on two lists at the same time. + */ + +void skb_append(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_append(old, newsk); + spin_unlock_irqrestore(&old->list->lock, flags); +} + + +/** + * skb_insert - insert a buffer + * @old: buffer to insert before + * @newsk: buffer to insert + * + * Place a packet before a given packet in a list. The list locks are taken + * and this function is atomic with respect to other list locked calls + * A buffer cannot be placed on two lists at the same time. + */ + +void skb_insert(struct sk_buff *old, struct sk_buff *newsk) +{ + unsigned long flags; + + spin_lock_irqsave(&old->list->lock, flags); + __skb_insert(newsk, old->prev, old, old->list); + spin_unlock_irqrestore(&old->list->lock, flags); +} + +#if 0 +/* + * Tune the memory allocator for a new MTU size. + */ +void skb_add_mtu(int mtu) +{ + /* Must match allocation in alloc_skb */ + mtu = SKB_DATA_ALIGN(mtu) + sizeof(struct skb_shared_info); + + kmem_add_cache_size(mtu); +} +#endif + +static inline void skb_split_inside_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, const int pos) +{ + int i; + + memcpy(skb_put(skb1, pos - len), skb->data + len, pos - len); + + /* And move data appendix as is. */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i]; + + skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags; + skb_shinfo(skb)->nr_frags = 0; + skb1->data_len = skb->data_len; + skb1->len += skb1->data_len; + skb->data_len = 0; + skb->len = len; + skb->tail = skb->data + len; +} + +static inline void skb_split_no_header(struct sk_buff *skb, + struct sk_buff* skb1, + const u32 len, int pos) +{ + int i, k = 0; + const int nfrags = skb_shinfo(skb)->nr_frags; + + skb_shinfo(skb)->nr_frags = 0; + skb1->len = skb1->data_len = skb->len - len; + skb->len = len; + skb->data_len = len - pos; + + for (i = 0; i < nfrags; i++) { + int size = skb_shinfo(skb)->frags[i].size; + + if (pos + size > len) { + skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i]; + + if (pos < len) { + /* Split frag. + * We have two variants in this case: + * 1. Move all the frag to the second + * part, if it is possible. F.e. + * this approach is mandatory for TUX, + * where splitting is expensive. + * 2. Split is accurately. We make this. + */ + get_page(skb_shinfo(skb)->frags[i].page); + skb_shinfo(skb1)->frags[0].page_offset += len - pos; + skb_shinfo(skb1)->frags[0].size -= len - pos; + skb_shinfo(skb)->frags[i].size = len - pos; + skb_shinfo(skb)->nr_frags++; + } + k++; + } else + skb_shinfo(skb)->nr_frags++; + pos += size; + } + skb_shinfo(skb1)->nr_frags = k; +} + +/** + * skb_split - Split fragmented skb to two parts at length len. + * @skb: the buffer to split + * @skb1: the buffer to receive the second part + * @len: new length for skb + */ +void skb_split(struct sk_buff *skb, struct sk_buff *skb1, const u32 len) +{ + int pos = skb_headlen(skb); + + if (len < pos) /* Split line is inside header. */ + skb_split_inside_header(skb, skb1, len, pos); + else /* Second chunk has no header, nothing to copy. 
*/ + skb_split_no_header(skb, skb1, len, pos); +} + +void __init skb_init(void) +{ + skbuff_head_cache = kmem_cache_create("skbuff_head_cache", + sizeof(struct sk_buff), + 0, + SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!skbuff_head_cache) + panic("cannot create skbuff cache"); +} + +EXPORT_SYMBOL(___pskb_trim); +EXPORT_SYMBOL(__kfree_skb); +EXPORT_SYMBOL(__pskb_pull_tail); +EXPORT_SYMBOL(alloc_skb); +EXPORT_SYMBOL(pskb_copy); +EXPORT_SYMBOL(pskb_expand_head); +EXPORT_SYMBOL(skb_checksum); +EXPORT_SYMBOL(skb_clone); +EXPORT_SYMBOL(skb_clone_fraglist); +EXPORT_SYMBOL(skb_copy); +EXPORT_SYMBOL(skb_copy_and_csum_bits); +EXPORT_SYMBOL(skb_copy_and_csum_dev); +EXPORT_SYMBOL(skb_copy_bits); +EXPORT_SYMBOL(skb_copy_expand); +EXPORT_SYMBOL(skb_over_panic); +EXPORT_SYMBOL(skb_pad); +EXPORT_SYMBOL(skb_realloc_headroom); +EXPORT_SYMBOL(skb_under_panic); +EXPORT_SYMBOL(skb_dequeue); +EXPORT_SYMBOL(skb_dequeue_tail); +EXPORT_SYMBOL(skb_insert); +EXPORT_SYMBOL(skb_queue_purge); +EXPORT_SYMBOL(skb_queue_head); +EXPORT_SYMBOL(skb_queue_tail); +EXPORT_SYMBOL(skb_unlink); +EXPORT_SYMBOL(skb_append); +EXPORT_SYMBOL(skb_split); diff --git a/net/core/sock.c b/net/core/sock.c new file mode 100644 index 000000000000..629ab4a5b45b --- /dev/null +++ b/net/core/sock.c @@ -0,0 +1,1565 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Generic socket support routines. Memory allocators, socket lock/release + * handler for protocols to use and generic option handler. + * + * + * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Florian La Roche, + * Alan Cox, + * + * Fixes: + * Alan Cox : Numerous verify_area() problems + * Alan Cox : Connecting on a connecting socket + * now returns an error for tcp. + * Alan Cox : sock->protocol is set correctly. + * and is not sometimes left as 0. + * Alan Cox : connect handles icmp errors on a + * connect properly. Unfortunately there + * is a restart syscall nasty there. I + * can't match BSD without hacking the C + * library. Ideas urgently sought! + * Alan Cox : Disallow bind() to addresses that are + * not ours - especially broadcast ones!! + * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost) + * Alan Cox : sock_wfree/sock_rfree don't destroy sockets, + * instead they leave that for the DESTROY timer. + * Alan Cox : Clean up error flag in accept + * Alan Cox : TCP ack handling is buggy, the DESTROY timer + * was buggy. Put a remove_sock() in the handler + * for memory when we hit 0. Also altered the timer + * code. The ACK stuff can wait and needs major + * TCP layer surgery. + * Alan Cox : Fixed TCP ack bug, removed remove sock + * and fixed timer/inet_bh race. + * Alan Cox : Added zapped flag for TCP + * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code + * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb + * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources + * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing. + * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so... + * Rick Sladkey : Relaxed UDP rules for matching packets. + * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support + * Pauline Middelink : identd support + * Alan Cox : Fixed connect() taking signals I think. 
+ * Alan Cox : SO_LINGER supported + * Alan Cox : Error reporting fixes + * Anonymous : inet_create tidied up (sk->reuse setting) + * Alan Cox : inet sockets don't set sk->type! + * Alan Cox : Split socket option code + * Alan Cox : Callbacks + * Alan Cox : Nagle flag for Charles & Johannes stuff + * Alex : Removed restriction on inet fioctl + * Alan Cox : Splitting INET from NET core + * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt() + * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code + * Alan Cox : Split IP from generic code + * Alan Cox : New kfree_skbmem() + * Alan Cox : Make SO_DEBUG superuser only. + * Alan Cox : Allow anyone to clear SO_DEBUG + * (compatibility fix) + * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput. + * Alan Cox : Allocator for a socket is settable. + * Alan Cox : SO_ERROR includes soft errors. + * Alan Cox : Allow NULL arguments on some SO_ opts + * Alan Cox : Generic socket allocation to make hooks + * easier (suggested by Craig Metz). + * Michael Pall : SO_ERROR returns positive errno again + * Steve Whitehouse: Added default destructor to free + * protocol private data. + * Steve Whitehouse: Added various other default routines + * common to several socket families. + * Chris Evans : Call suser() check last on F_SETOWN + * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER. + * Andi Kleen : Add sock_kmalloc()/sock_kfree_s() + * Andi Kleen : Fix write_space callback + * Chris Evans : Security fixes - signedness again + * Arnaldo C. Melo : cleanups, use skb_queue_purge + * + * To Fix: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_INET +#include +#endif + +/* Take into consideration the size of the struct sk_buff overhead in the + * determination of these values, since that is non-constant across + * platforms. This makes socket queueing behavior and performance + * not depend upon such differences. + */ +#define _SK_MEM_PACKETS 256 +#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256) +#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) +#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) + +/* Run time adjustable parameters. 
*/ +__u32 sysctl_wmem_max = SK_WMEM_MAX; +__u32 sysctl_rmem_max = SK_RMEM_MAX; +__u32 sysctl_wmem_default = SK_WMEM_MAX; +__u32 sysctl_rmem_default = SK_RMEM_MAX; + +/* Maximal space eaten by iovec or ancilliary data plus some space */ +int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512); + +static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) +{ + struct timeval tv; + + if (optlen < sizeof(tv)) + return -EINVAL; + if (copy_from_user(&tv, optval, sizeof(tv))) + return -EFAULT; + + *timeo_p = MAX_SCHEDULE_TIMEOUT; + if (tv.tv_sec == 0 && tv.tv_usec == 0) + return 0; + if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1)) + *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ); + return 0; +} + +static void sock_warn_obsolete_bsdism(const char *name) +{ + static int warned; + static char warncomm[TASK_COMM_LEN]; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING "process `%s' is using obsolete " + "%s SO_BSDCOMPAT\n", warncomm, name); + warned++; + } +} + +static void sock_disable_timestamp(struct sock *sk) +{ + if (sock_flag(sk, SOCK_TIMESTAMP)) { + sock_reset_flag(sk, SOCK_TIMESTAMP); + net_disable_timestamp(); + } +} + + +/* + * This is meant for all protocols to use and covers goings on + * at the socket level. Everything here is generic. + */ + +int sock_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk=sock->sk; + struct sk_filter *filter; + int val; + int valbool; + struct linger ling; + int ret = 0; + + /* + * Options without arguments + */ + +#ifdef SO_DONTLINGER /* Compatibility item... */ + switch (optname) { + case SO_DONTLINGER: + sock_reset_flag(sk, SOCK_LINGER); + return 0; + } +#endif + + if(optlensk_reuse = valbool; + break; + case SO_TYPE: + case SO_ERROR: + ret = -ENOPROTOOPT; + break; + case SO_DONTROUTE: + if (valbool) + sock_set_flag(sk, SOCK_LOCALROUTE); + else + sock_reset_flag(sk, SOCK_LOCALROUTE); + break; + case SO_BROADCAST: + sock_valbool_flag(sk, SOCK_BROADCAST, valbool); + break; + case SO_SNDBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_wmem_max) + val = sysctl_wmem_max; + + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + if ((val * 2) < SOCK_MIN_SNDBUF) + sk->sk_sndbuf = SOCK_MIN_SNDBUF; + else + sk->sk_sndbuf = val * 2; + + /* + * Wake up sending tasks if we + * upped the value. + */ + sk->sk_write_space(sk); + break; + + case SO_RCVBUF: + /* Don't error on this BSD doesn't and if you think + about it this is right. Otherwise apps have to + play 'guess the biggest size' games. RCVBUF/SNDBUF + are treated in BSD as hints */ + + if (val > sysctl_rmem_max) + val = sysctl_rmem_max; + + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + /* FIXME: is this lower bound the right one? 
*/ + if ((val * 2) < SOCK_MIN_RCVBUF) + sk->sk_rcvbuf = SOCK_MIN_RCVBUF; + else + sk->sk_rcvbuf = val * 2; + break; + + case SO_KEEPALIVE: +#ifdef CONFIG_INET + if (sk->sk_protocol == IPPROTO_TCP) + tcp_set_keepalive(sk, valbool); +#endif + sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); + break; + + case SO_OOBINLINE: + sock_valbool_flag(sk, SOCK_URGINLINE, valbool); + break; + + case SO_NO_CHECK: + sk->sk_no_check = valbool; + break; + + case SO_PRIORITY: + if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) + sk->sk_priority = val; + else + ret = -EPERM; + break; + + case SO_LINGER: + if(optlen= MAX_SCHEDULE_TIMEOUT/HZ) + sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT; + else +#endif + sk->sk_lingertime = ling.l_linger * HZ; + sock_set_flag(sk, SOCK_LINGER); + } + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("setsockopt"); + break; + + case SO_PASSCRED: + if (valbool) + set_bit(SOCK_PASSCRED, &sock->flags); + else + clear_bit(SOCK_PASSCRED, &sock->flags); + break; + + case SO_TIMESTAMP: + if (valbool) { + sock_set_flag(sk, SOCK_RCVTSTAMP); + sock_enable_timestamp(sk); + } else + sock_reset_flag(sk, SOCK_RCVTSTAMP); + break; + + case SO_RCVLOWAT: + if (val < 0) + val = INT_MAX; + sk->sk_rcvlowat = val ? : 1; + break; + + case SO_RCVTIMEO: + ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen); + break; + + case SO_SNDTIMEO: + ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen); + break; + +#ifdef CONFIG_NETDEVICES + case SO_BINDTODEVICE: + { + char devname[IFNAMSIZ]; + + /* Sorry... */ + if (!capable(CAP_NET_RAW)) { + ret = -EPERM; + break; + } + + /* Bind this socket to a particular device like "eth0", + * as specified in the passed interface name. If the + * name is "" or the option length is zero the socket + * is not bound. + */ + + if (!valbool) { + sk->sk_bound_dev_if = 0; + } else { + if (optlen > IFNAMSIZ) + optlen = IFNAMSIZ; + if (copy_from_user(devname, optval, optlen)) { + ret = -EFAULT; + break; + } + + /* Remove any cached route for this socket. 
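[Editor's note, not part of the patch] An illustrative user-space check of the SO_SNDBUF/SO_RCVBUF handling above: the kernel stores roughly twice the requested value (to account for sk_buff overhead, capped by sysctl_wmem_max / sysctl_rmem_max), and that doubled value is what getsockopt() reports back.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int val = 64 * 1024;
	socklen_t len = sizeof(val);

	setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val));
	getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &val, &len);
	printf("SO_SNDBUF is now %d (about twice the request, capped by wmem_max)\n", val);
	close(fd);
	return 0;
}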
*/ + sk_dst_reset(sk); + + if (devname[0] == '\0') { + sk->sk_bound_dev_if = 0; + } else { + struct net_device *dev = dev_get_by_name(devname); + if (!dev) { + ret = -ENODEV; + break; + } + sk->sk_bound_dev_if = dev->ifindex; + dev_put(dev); + } + } + break; + } +#endif + + + case SO_ATTACH_FILTER: + ret = -EINVAL; + if (optlen == sizeof(struct sock_fprog)) { + struct sock_fprog fprog; + + ret = -EFAULT; + if (copy_from_user(&fprog, optval, sizeof(fprog))) + break; + + ret = sk_attach_filter(&fprog, sk); + } + break; + + case SO_DETACH_FILTER: + spin_lock_bh(&sk->sk_lock.slock); + filter = sk->sk_filter; + if (filter) { + sk->sk_filter = NULL; + spin_unlock_bh(&sk->sk_lock.slock); + sk_filter_release(sk, filter); + break; + } + spin_unlock_bh(&sk->sk_lock.slock); + ret = -ENONET; + break; + + /* We implement the SO_SNDLOWAT etc to + not be settable (1003.1g 5.3) */ + default: + ret = -ENOPROTOOPT; + break; + } + release_sock(sk); + return ret; +} + + +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + union + { + int val; + struct linger ling; + struct timeval tm; + } v; + + unsigned int lv = sizeof(int); + int len; + + if(get_user(len,optlen)) + return -EFAULT; + if(len < 0) + return -EINVAL; + + switch(optname) + { + case SO_DEBUG: + v.val = sock_flag(sk, SOCK_DBG); + break; + + case SO_DONTROUTE: + v.val = sock_flag(sk, SOCK_LOCALROUTE); + break; + + case SO_BROADCAST: + v.val = !!sock_flag(sk, SOCK_BROADCAST); + break; + + case SO_SNDBUF: + v.val = sk->sk_sndbuf; + break; + + case SO_RCVBUF: + v.val = sk->sk_rcvbuf; + break; + + case SO_REUSEADDR: + v.val = sk->sk_reuse; + break; + + case SO_KEEPALIVE: + v.val = !!sock_flag(sk, SOCK_KEEPOPEN); + break; + + case SO_TYPE: + v.val = sk->sk_type; + break; + + case SO_ERROR: + v.val = -sock_error(sk); + if(v.val==0) + v.val = xchg(&sk->sk_err_soft, 0); + break; + + case SO_OOBINLINE: + v.val = !!sock_flag(sk, SOCK_URGINLINE); + break; + + case SO_NO_CHECK: + v.val = sk->sk_no_check; + break; + + case SO_PRIORITY: + v.val = sk->sk_priority; + break; + + case SO_LINGER: + lv = sizeof(v.ling); + v.ling.l_onoff = !!sock_flag(sk, SOCK_LINGER); + v.ling.l_linger = sk->sk_lingertime / HZ; + break; + + case SO_BSDCOMPAT: + sock_warn_obsolete_bsdism("getsockopt"); + break; + + case SO_TIMESTAMP: + v.val = sock_flag(sk, SOCK_RCVTSTAMP); + break; + + case SO_RCVTIMEO: + lv=sizeof(struct timeval); + if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_rcvtimeo / HZ; + v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_SNDTIMEO: + lv=sizeof(struct timeval); + if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) { + v.tm.tv_sec = 0; + v.tm.tv_usec = 0; + } else { + v.tm.tv_sec = sk->sk_sndtimeo / HZ; + v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ; + } + break; + + case SO_RCVLOWAT: + v.val = sk->sk_rcvlowat; + break; + + case SO_SNDLOWAT: + v.val=1; + break; + + case SO_PASSCRED: + v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 
1 : 0; + break; + + case SO_PEERCRED: + if (len > sizeof(sk->sk_peercred)) + len = sizeof(sk->sk_peercred); + if (copy_to_user(optval, &sk->sk_peercred, len)) + return -EFAULT; + goto lenout; + + case SO_PEERNAME: + { + char address[128]; + + if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2)) + return -ENOTCONN; + if (lv < len) + return -EINVAL; + if (copy_to_user(optval, address, len)) + return -EFAULT; + goto lenout; + } + + /* Dubious BSD thing... Probably nobody even uses it, but + * the UNIX standard wants it for whatever reason... -DaveM + */ + case SO_ACCEPTCONN: + v.val = sk->sk_state == TCP_LISTEN; + break; + + case SO_PEERSEC: + return security_socket_getpeersec(sock, optval, optlen, len); + + default: + return(-ENOPROTOOPT); + } + if (len > lv) + len = lv; + if (copy_to_user(optval, &v, len)) + return -EFAULT; +lenout: + if (put_user(len, optlen)) + return -EFAULT; + return 0; +} + +/** + * sk_alloc - All socket objects are allocated here + * @family - protocol family + * @priority - for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * @prot - struct proto associated with this new sock instance + * @zero_it - if we should zero the newly allocated sock + */ +struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it) +{ + struct sock *sk = NULL; + kmem_cache_t *slab = prot->slab; + + if (slab != NULL) + sk = kmem_cache_alloc(slab, priority); + else + sk = kmalloc(prot->obj_size, priority); + + if (sk) { + if (zero_it) { + memset(sk, 0, prot->obj_size); + sk->sk_family = family; + sk->sk_prot = prot; + sock_lock_init(sk); + } + + if (security_sk_alloc(sk, family, priority)) { + kmem_cache_free(slab, sk); + sk = NULL; + } else + __module_get(prot->owner); + } + return sk; +} + +void sk_free(struct sock *sk) +{ + struct sk_filter *filter; + struct module *owner = sk->sk_prot->owner; + + if (sk->sk_destruct) + sk->sk_destruct(sk); + + filter = sk->sk_filter; + if (filter) { + sk_filter_release(sk, filter); + sk->sk_filter = NULL; + } + + sock_disable_timestamp(sk); + + if (atomic_read(&sk->sk_omem_alloc)) + printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n", + __FUNCTION__, atomic_read(&sk->sk_omem_alloc)); + + security_sk_free(sk); + if (sk->sk_prot->slab != NULL) + kmem_cache_free(sk->sk_prot->slab, sk); + else + kfree(sk); + module_put(owner); +} + +void __init sk_init(void) +{ + if (num_physpages <= 4096) { + sysctl_wmem_max = 32767; + sysctl_rmem_max = 32767; + sysctl_wmem_default = 32767; + sysctl_rmem_default = 32767; + } else if (num_physpages >= 131072) { + sysctl_wmem_max = 131071; + sysctl_rmem_max = 131071; + } +} + +/* + * Simple resource managers for sockets. + */ + + +/* + * Write buffer destructor automatically called from kfree_skb. + */ +void sock_wfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + /* In case it might be waiting for more memory. */ + atomic_sub(skb->truesize, &sk->sk_wmem_alloc); + if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) + sk->sk_write_space(sk); + sock_put(sk); +} + +/* + * Read buffer destructor automatically called from kfree_skb. + */ +void sock_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); +} + + +int sock_i_uid(struct sock *sk) +{ + int uid; + + read_lock(&sk->sk_callback_lock); + uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0; + read_unlock(&sk->sk_callback_lock); + return uid; +} + +unsigned long sock_i_ino(struct sock *sk) +{ + unsigned long ino; + + read_lock(&sk->sk_callback_lock); + ino = sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_ino : 0; + read_unlock(&sk->sk_callback_lock); + return ino; +} + +/* + * Allocate a skb from the socket's send buffer. + */ +struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + struct sk_buff * skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_w(skb, sk); + return skb; + } + } + return NULL; +} + +/* + * Allocate a skb from the socket's receive buffer. + */ +struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority) +{ + if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) { + struct sk_buff *skb = alloc_skb(size, priority); + if (skb) { + skb_set_owner_r(skb, sk); + return skb; + } + } + return NULL; +} + +/* + * Allocate a memory block from the socket's option memory buffer. + */ +void *sock_kmalloc(struct sock *sk, int size, int priority) +{ + if ((unsigned)size <= sysctl_optmem_max && + atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) { + void *mem; + /* First do the add, to avoid the race if kmalloc + * might sleep. + */ + atomic_add(size, &sk->sk_omem_alloc); + mem = kmalloc(size, priority); + if (mem) + return mem; + atomic_sub(size, &sk->sk_omem_alloc); + } + return NULL; +} + +/* + * Free an option memory block. + */ +void sock_kfree_s(struct sock *sk, void *mem, int size) +{ + kfree(mem); + atomic_sub(size, &sk->sk_omem_alloc); +} + +/* It is almost wait_for_tcp_memory minus release_sock/lock_sock. + I think, these locks should be removed for datagram sockets. + */ +static long sock_wait_for_wmem(struct sock * sk, long timeo) +{ + DEFINE_WAIT(wait); + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + for (;;) { + if (!timeo) + break; + if (signal_pending(current)) + break; + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) + break; + if (sk->sk_shutdown & SEND_SHUTDOWN) + break; + if (sk->sk_err) + break; + timeo = schedule_timeout(timeo); + } + finish_wait(sk->sk_sleep, &wait); + return timeo; +} + + +/* + * Generic send/receive buffer handlers + */ + +static struct sk_buff *sock_alloc_send_pskb(struct sock *sk, + unsigned long header_len, + unsigned long data_len, + int noblock, int *errcode) +{ + struct sk_buff *skb; + unsigned int gfp_mask; + long timeo; + int err; + + gfp_mask = sk->sk_allocation; + if (gfp_mask & __GFP_WAIT) + gfp_mask |= __GFP_REPEAT; + + timeo = sock_sndtimeo(sk, noblock); + while (1) { + err = sock_error(sk); + if (err != 0) + goto failure; + + err = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto failure; + + if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) { + skb = alloc_skb(header_len, sk->sk_allocation); + if (skb) { + int npages; + int i; + + /* No pages, we're done... */ + if (!data_len) + break; + + npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT; + skb->truesize += data_len; + skb_shinfo(skb)->nr_frags = npages; + for (i = 0; i < npages; i++) { + struct page *page; + skb_frag_t *frag; + + page = alloc_pages(sk->sk_allocation, 0); + if (!page) { + err = -ENOBUFS; + skb_shinfo(skb)->nr_frags = i; + kfree_skb(skb); + goto failure; + } + + frag = &skb_shinfo(skb)->frags[i]; + frag->page = page; + frag->page_offset = 0; + frag->size = (data_len >= PAGE_SIZE ? + PAGE_SIZE : + data_len); + data_len -= PAGE_SIZE; + } + + /* Full success... 
*/ + break; + } + err = -ENOBUFS; + goto failure; + } + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + err = -EAGAIN; + if (!timeo) + goto failure; + if (signal_pending(current)) + goto interrupted; + timeo = sock_wait_for_wmem(sk, timeo); + } + + skb_set_owner_w(skb, sk); + return skb; + +interrupted: + err = sock_intr_errno(timeo); +failure: + *errcode = err; + return NULL; +} + +struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, + int noblock, int *errcode) +{ + return sock_alloc_send_pskb(sk, size, 0, noblock, errcode); +} + +static void __lock_sock(struct sock *sk) +{ + DEFINE_WAIT(wait); + + for(;;) { + prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait, + TASK_UNINTERRUPTIBLE); + spin_unlock_bh(&sk->sk_lock.slock); + schedule(); + spin_lock_bh(&sk->sk_lock.slock); + if(!sock_owned_by_user(sk)) + break; + } + finish_wait(&sk->sk_lock.wq, &wait); +} + +static void __release_sock(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_backlog.head; + + do { + sk->sk_backlog.head = sk->sk_backlog.tail = NULL; + bh_unlock_sock(sk); + + do { + struct sk_buff *next = skb->next; + + skb->next = NULL; + sk->sk_backlog_rcv(sk, skb); + + /* + * We are in process context here with softirqs + * disabled, use cond_resched_softirq() to preempt. + * This is safe to do because we've taken the backlog + * queue private: + */ + cond_resched_softirq(); + + skb = next; + } while (skb != NULL); + + bh_lock_sock(sk); + } while((skb = sk->sk_backlog.head) != NULL); +} + +/** + * sk_wait_data - wait for data to arrive at sk_receive_queue + * sk - sock to wait on + * timeo - for how long + * + * Now socket state including sk->sk_err is changed only under lock, + * hence we may omit checks after joining wait queue. + * We check receive queue before schedule() only as optimization; + * it is very likely that release_sock() added new data. + */ +int sk_wait_data(struct sock *sk, long *timeo) +{ + int rc; + DEFINE_WAIT(wait); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue)); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + finish_wait(sk->sk_sleep, &wait); + return rc; +} + +EXPORT_SYMBOL(sk_wait_data); + +/* + * Set of default routines for initialising struct proto_ops when + * the protocol does not support a particular function. In certain + * cases where it makes no sense for a protocol to have a "do nothing" + * function, some default processing is provided. 
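The stubs that follow are meant to be dropped straight into a protocol's struct proto_ops. As a minimal sketch (the field names follow struct proto_ops of this kernel generation; the placeholder family, the ops name and the particular mix of real versus stub handlers are illustrative only):

    #include <linux/net.h>
    #include <linux/module.h>

    /*
     * Illustrative only: a connectionless protocol with no use for
     * listen/accept/mmap points those proto_ops slots at the sock_no_*()
     * helpers defined below, so user space gets a clean -EOPNOTSUPP
     * instead of a NULL dereference.
     */
    static struct proto_ops example_dgram_ops = {
    	.family		= PF_UNSPEC,		/* placeholder family */
    	.owner		= THIS_MODULE,
    	.listen		= sock_no_listen,
    	.accept		= sock_no_accept,
    	.socketpair	= sock_no_socketpair,
    	.shutdown	= sock_no_shutdown,
    	.mmap		= sock_no_mmap,
    	.sendpage	= sock_no_sendpage,
    	/* bind/connect/sendmsg/recvmsg would be real handlers here */
    };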
+ */ + +int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len) +{ + return -EOPNOTSUPP; +} + +int sock_no_connect(struct socket *sock, struct sockaddr *saddr, + int len, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_socketpair(struct socket *sock1, struct socket *sock2) +{ + return -EOPNOTSUPP; +} + +int sock_no_accept(struct socket *sock, struct socket *newsock, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_getname(struct socket *sock, struct sockaddr *saddr, + int *len, int peer) +{ + return -EOPNOTSUPP; +} + +unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt) +{ + return 0; +} + +int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + return -EOPNOTSUPP; +} + +int sock_no_listen(struct socket *sock, int backlog) +{ + return -EOPNOTSUPP; +} + +int sock_no_shutdown(struct socket *sock, int how) +{ + return -EOPNOTSUPP; +} + +int sock_no_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + return -EOPNOTSUPP; +} + +int sock_no_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return -EOPNOTSUPP; +} + +int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len) +{ + return -EOPNOTSUPP; +} + +int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m, + size_t len, int flags) +{ + return -EOPNOTSUPP; +} + +int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + /* Mirror missing mmap method error code */ + return -ENODEV; +} + +ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + ssize_t res; + struct msghdr msg = {.msg_flags = flags}; + struct kvec iov; + char *kaddr = kmap(page); + iov.iov_base = kaddr + offset; + iov.iov_len = size; + res = kernel_sendmsg(sock, &msg, &iov, 1, size); + kunmap(page); + return res; +} + +/* + * Default Socket Callbacks + */ + +static void sock_def_wakeup(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_all(sk->sk_sleep); + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_error_report(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk,0,POLL_ERR); + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_readable(struct sock *sk, int len) +{ + read_lock(&sk->sk_callback_lock); + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk,1,POLL_IN); + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_write_space(struct sock *sk) +{ + read_lock(&sk->sk_callback_lock); + + /* Do not wake up a writer until he can make "significant" + * progress. 
--DaveM + */ + if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + /* Should agree with poll, otherwise some programs break */ + if (sock_writeable(sk)) + sk_wake_async(sk, 2, POLL_OUT); + } + + read_unlock(&sk->sk_callback_lock); +} + +static void sock_def_destruct(struct sock *sk) +{ + if (sk->sk_protinfo) + kfree(sk->sk_protinfo); +} + +void sk_send_sigurg(struct sock *sk) +{ + if (sk->sk_socket && sk->sk_socket->file) + if (send_sigurg(&sk->sk_socket->file->f_owner)) + sk_wake_async(sk, 3, POLL_PRI); +} + +void sk_reset_timer(struct sock *sk, struct timer_list* timer, + unsigned long expires) +{ + if (!mod_timer(timer, expires)) + sock_hold(sk); +} + +EXPORT_SYMBOL(sk_reset_timer); + +void sk_stop_timer(struct sock *sk, struct timer_list* timer) +{ + if (timer_pending(timer) && del_timer(timer)) + __sock_put(sk); +} + +EXPORT_SYMBOL(sk_stop_timer); + +void sock_init_data(struct socket *sock, struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + sk->sk_send_head = NULL; + + init_timer(&sk->sk_timer); + + sk->sk_allocation = GFP_KERNEL; + sk->sk_rcvbuf = sysctl_rmem_default; + sk->sk_sndbuf = sysctl_wmem_default; + sk->sk_state = TCP_CLOSE; + sk->sk_socket = sock; + + sock_set_flag(sk, SOCK_ZAPPED); + + if(sock) + { + sk->sk_type = sock->type; + sk->sk_sleep = &sock->wait; + sock->sk = sk; + } else + sk->sk_sleep = NULL; + + rwlock_init(&sk->sk_dst_lock); + rwlock_init(&sk->sk_callback_lock); + + sk->sk_state_change = sock_def_wakeup; + sk->sk_data_ready = sock_def_readable; + sk->sk_write_space = sock_def_write_space; + sk->sk_error_report = sock_def_error_report; + sk->sk_destruct = sock_def_destruct; + + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + + sk->sk_peercred.pid = 0; + sk->sk_peercred.uid = -1; + sk->sk_peercred.gid = -1; + sk->sk_write_pending = 0; + sk->sk_rcvlowat = 1; + sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; + sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; + + sk->sk_stamp.tv_sec = -1L; + sk->sk_stamp.tv_usec = -1L; + + atomic_set(&sk->sk_refcnt, 1); +} + +void fastcall lock_sock(struct sock *sk) +{ + might_sleep(); + spin_lock_bh(&(sk->sk_lock.slock)); + if (sk->sk_lock.owner) + __lock_sock(sk); + sk->sk_lock.owner = (void *)1; + spin_unlock_bh(&(sk->sk_lock.slock)); +} + +EXPORT_SYMBOL(lock_sock); + +void fastcall release_sock(struct sock *sk) +{ + spin_lock_bh(&(sk->sk_lock.slock)); + if (sk->sk_backlog.tail) + __release_sock(sk); + sk->sk_lock.owner = NULL; + if (waitqueue_active(&(sk->sk_lock.wq))) + wake_up(&(sk->sk_lock.wq)); + spin_unlock_bh(&(sk->sk_lock.slock)); +} +EXPORT_SYMBOL(release_sock); + +int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp) +{ + if (!sock_flag(sk, SOCK_TIMESTAMP)) + sock_enable_timestamp(sk); + if (sk->sk_stamp.tv_sec == -1) + return -ENOENT; + if (sk->sk_stamp.tv_sec == 0) + do_gettimeofday(&sk->sk_stamp); + return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ? + -EFAULT : 0; +} +EXPORT_SYMBOL(sock_get_timestamp); + +void sock_enable_timestamp(struct sock *sk) +{ + if (!sock_flag(sk, SOCK_TIMESTAMP)) { + sock_set_flag(sk, SOCK_TIMESTAMP); + net_enable_timestamp(); + } +} +EXPORT_SYMBOL(sock_enable_timestamp); + +/* + * Get a socket option on an socket. + * + * FIX: POSIX 1003.1g is very ambiguous here. It states that + * asynchronous errors should be reported by getsockopt. 
We assume + * this means if you specify SO_ERROR (otherwise whats the point of it). + */ +int sock_common_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL(sock_common_getsockopt); + +int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int addr_len = 0; + int err; + + err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT, + flags & ~MSG_DONTWAIT, &addr_len); + if (err >= 0) + msg->msg_namelen = addr_len; + return err; +} + +EXPORT_SYMBOL(sock_common_recvmsg); + +/* + * Set socket options on an inet socket. + */ +int sock_common_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + + return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); +} + +EXPORT_SYMBOL(sock_common_setsockopt); + +void sk_common_release(struct sock *sk) +{ + if (sk->sk_prot->destroy) + sk->sk_prot->destroy(sk); + + /* + * Observation: when sock_common_release is called, processes have + * no access to socket. But net still has. + * Step one, detach it from networking: + * + * A. Remove from hash tables. + */ + + sk->sk_prot->unhash(sk); + + /* + * In this point socket cannot receive new packets, but it is possible + * that some packets are in flight because some CPU runs receiver and + * did hash table lookup before we unhashed socket. They will achieve + * receive queue and will be purged by socket destructor. + * + * Also we still have packets pending on receive queue and probably, + * our own packets waiting in device queues. sock_destroy will drain + * receive queue, but transmitted packets will delay socket destruction + * until the last reference will be released. + */ + + sock_orphan(sk); + + xfrm_sk_free_policy(sk); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n", + sk, atomic_read(&sk->sk_refcnt)); +#endif + sock_put(sk); +} + +EXPORT_SYMBOL(sk_common_release); + +static DEFINE_RWLOCK(proto_list_lock); +static LIST_HEAD(proto_list); + +int proto_register(struct proto *prot, int alloc_slab) +{ + int rc = -ENOBUFS; + + write_lock(&proto_list_lock); + + if (alloc_slab) { + prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + + if (prot->slab == NULL) { + printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n", + prot->name); + goto out_unlock; + } + } + + list_add(&prot->node, &proto_list); + rc = 0; +out_unlock: + write_unlock(&proto_list_lock); + return rc; +} + +EXPORT_SYMBOL(proto_register); + +void proto_unregister(struct proto *prot) +{ + write_lock(&proto_list_lock); + + if (prot->slab != NULL) { + kmem_cache_destroy(prot->slab); + prot->slab = NULL; + } + + list_del(&prot->node); + write_unlock(&proto_list_lock); +} + +EXPORT_SYMBOL(proto_unregister); + +#ifdef CONFIG_PROC_FS +static inline struct proto *__proto_head(void) +{ + return list_entry(proto_list.next, struct proto, node); +} + +static inline struct proto *proto_head(void) +{ + return list_empty(&proto_list) ? NULL : __proto_head(); +} + +static inline struct proto *proto_next(struct proto *proto) +{ + return proto->node.next == &proto_list ? 
NULL : + list_entry(proto->node.next, struct proto, node); +} + +static inline struct proto *proto_get_idx(loff_t pos) +{ + struct proto *proto; + loff_t i = 0; + + list_for_each_entry(proto, &proto_list, node) + if (i++ == pos) + goto out; + + proto = NULL; +out: + return proto; +} + +static void *proto_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&proto_list_lock); + return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? proto_head() : proto_next(v); +} + +static void proto_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&proto_list_lock); +} + +static char proto_method_implemented(const void *method) +{ + return method == NULL ? 'n' : 'y'; +} + +static void proto_seq_printf(struct seq_file *seq, struct proto *proto) +{ + seq_printf(seq, "%-9s %4u %6d %6d %-3s %6u %-3s %-10s " + "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n", + proto->name, + proto->obj_size, + proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1, + proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1, + proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI", + proto->max_header, + proto->slab == NULL ? "no" : "yes", + module_name(proto->owner), + proto_method_implemented(proto->close), + proto_method_implemented(proto->connect), + proto_method_implemented(proto->disconnect), + proto_method_implemented(proto->accept), + proto_method_implemented(proto->ioctl), + proto_method_implemented(proto->init), + proto_method_implemented(proto->destroy), + proto_method_implemented(proto->shutdown), + proto_method_implemented(proto->setsockopt), + proto_method_implemented(proto->getsockopt), + proto_method_implemented(proto->sendmsg), + proto_method_implemented(proto->recvmsg), + proto_method_implemented(proto->sendpage), + proto_method_implemented(proto->bind), + proto_method_implemented(proto->backlog_rcv), + proto_method_implemented(proto->hash), + proto_method_implemented(proto->unhash), + proto_method_implemented(proto->get_port), + proto_method_implemented(proto->enter_memory_pressure)); +} + +static int proto_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s", + "protocol", + "size", + "sockets", + "memory", + "press", + "maxhdr", + "slab", + "module", + "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n"); + else + proto_seq_printf(seq, v); + return 0; +} + +static struct seq_operations proto_seq_ops = { + .start = proto_seq_start, + .next = proto_seq_next, + .stop = proto_seq_stop, + .show = proto_seq_show, +}; + +static int proto_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &proto_seq_ops); +} + +static struct file_operations proto_seq_fops = { + .owner = THIS_MODULE, + .open = proto_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init proto_init(void) +{ + /* register /proc/net/protocols */ + return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? 
-ENOBUFS : 0; +} + +subsys_initcall(proto_init); + +#endif /* PROC_FS */ + +EXPORT_SYMBOL(sk_alloc); +EXPORT_SYMBOL(sk_free); +EXPORT_SYMBOL(sk_send_sigurg); +EXPORT_SYMBOL(sock_alloc_send_skb); +EXPORT_SYMBOL(sock_init_data); +EXPORT_SYMBOL(sock_kfree_s); +EXPORT_SYMBOL(sock_kmalloc); +EXPORT_SYMBOL(sock_no_accept); +EXPORT_SYMBOL(sock_no_bind); +EXPORT_SYMBOL(sock_no_connect); +EXPORT_SYMBOL(sock_no_getname); +EXPORT_SYMBOL(sock_no_getsockopt); +EXPORT_SYMBOL(sock_no_ioctl); +EXPORT_SYMBOL(sock_no_listen); +EXPORT_SYMBOL(sock_no_mmap); +EXPORT_SYMBOL(sock_no_poll); +EXPORT_SYMBOL(sock_no_recvmsg); +EXPORT_SYMBOL(sock_no_sendmsg); +EXPORT_SYMBOL(sock_no_sendpage); +EXPORT_SYMBOL(sock_no_setsockopt); +EXPORT_SYMBOL(sock_no_shutdown); +EXPORT_SYMBOL(sock_no_socketpair); +EXPORT_SYMBOL(sock_rfree); +EXPORT_SYMBOL(sock_setsockopt); +EXPORT_SYMBOL(sock_wfree); +EXPORT_SYMBOL(sock_wmalloc); +EXPORT_SYMBOL(sock_i_uid); +EXPORT_SYMBOL(sock_i_ino); +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_optmem_max); +EXPORT_SYMBOL(sysctl_rmem_max); +EXPORT_SYMBOL(sysctl_wmem_max); +#endif diff --git a/net/core/stream.c b/net/core/stream.c new file mode 100644 index 000000000000..1e27a57b5a97 --- /dev/null +++ b/net/core/stream.c @@ -0,0 +1,287 @@ +/* + * SUCS NET3: + * + * Generic stream handling routines. These are generic for most + * protocols. Even IP. Tonight 8-). + * This is used because TCP, LLC (others too) layer all have mostly + * identical sendmsg() and recvmsg() code. + * So we (will) share it here. + * + * Authors: Arnaldo Carvalho de Melo + * (from old tcp.c code) + * Alan Cox (Borrowed comments 8-)) + */ + +#include +#include +#include +#include +#include +#include + +/** + * sk_stream_write_space - stream socket write_space callback. + * sk - socket + * + * FIXME: write proper description + */ +void sk_stream_write_space(struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + if (sock->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, 2, POLL_OUT); + } +} + +EXPORT_SYMBOL(sk_stream_write_space); + +/** + * sk_stream_wait_connect - Wait for a socket to get into the connected state + * @sk - sock to wait on + * @timeo_p - for how long to wait + * + * Must be called with the socket locked. + */ +int sk_stream_wait_connect(struct sock *sk, long *timeo_p) +{ + struct task_struct *tsk = current; + DEFINE_WAIT(wait); + + while (1) { + if (sk->sk_err) + return sock_error(sk); + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) + return -EPIPE; + if (!*timeo_p) + return -EAGAIN; + if (signal_pending(tsk)) + return sock_intr_errno(*timeo_p); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + sk->sk_write_pending++; + if (sk_wait_event(sk, timeo_p, + !((1 << sk->sk_state) & + ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))) + break; + finish_wait(sk->sk_sleep, &wait); + sk->sk_write_pending--; + } + return 0; +} + +EXPORT_SYMBOL(sk_stream_wait_connect); + +/** + * sk_stream_closing - Return 1 if we still have things to send in our buffers. 
+ * @sk - socket to verify + */ +static inline int sk_stream_closing(struct sock *sk) +{ + return (1 << sk->sk_state) & + (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK); +} + +void sk_stream_wait_close(struct sock *sk, long timeout) +{ + if (timeout) { + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + if (sk_wait_event(sk, &timeout, !sk_stream_closing(sk))) + break; + } while (!signal_pending(current) && timeout); + + finish_wait(sk->sk_sleep, &wait); + } +} + +EXPORT_SYMBOL(sk_stream_wait_close); + +/** + * sk_stream_wait_memory - Wait for more memory for a socket + * @sk - socket to wait for memory + * @timeo_p - for how long + */ +int sk_stream_wait_memory(struct sock *sk, long *timeo_p) +{ + int err = 0; + long vm_wait = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + if (sk_stream_memory_free(sk)) + current_timeo = vm_wait = (net_random() % (HZ / 5)) + 2; + + while (1) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + if (!*timeo_p) + goto do_nonblock; + if (signal_pending(current)) + goto do_interrupted; + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (sk_stream_memory_free(sk) && !vm_wait) + break; + + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + sk->sk_write_pending++; + sk_wait_event(sk, &current_timeo, sk_stream_memory_free(sk) && + vm_wait); + sk->sk_write_pending--; + + if (vm_wait) { + vm_wait -= current_timeo; + current_timeo = *timeo_p; + if (current_timeo != MAX_SCHEDULE_TIMEOUT && + (current_timeo -= vm_wait) < 0) + current_timeo = 0; + vm_wait = 0; + } + *timeo_p = current_timeo; + } +out: + finish_wait(sk->sk_sleep, &wait); + return err; + +do_error: + err = -EPIPE; + goto out; +do_nonblock: + err = -EAGAIN; + goto out; +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; +} + +EXPORT_SYMBOL(sk_stream_wait_memory); + +void sk_stream_rfree(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + + atomic_sub(skb->truesize, &sk->sk_rmem_alloc); + sk->sk_forward_alloc += skb->truesize; +} + +EXPORT_SYMBOL(sk_stream_rfree); + +int sk_stream_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + +EXPORT_SYMBOL(sk_stream_error); + +void __sk_stream_mem_reclaim(struct sock *sk) +{ + if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) { + atomic_sub(sk->sk_forward_alloc / SK_STREAM_MEM_QUANTUM, + sk->sk_prot->memory_allocated); + sk->sk_forward_alloc &= SK_STREAM_MEM_QUANTUM - 1; + if (*sk->sk_prot->memory_pressure && + (atomic_read(sk->sk_prot->memory_allocated) < + sk->sk_prot->sysctl_mem[0])) + *sk->sk_prot->memory_pressure = 0; + } +} + +EXPORT_SYMBOL(__sk_stream_mem_reclaim); + +int sk_stream_mem_schedule(struct sock *sk, int size, int kind) +{ + int amt = sk_stream_pages(size); + + sk->sk_forward_alloc += amt * SK_STREAM_MEM_QUANTUM; + atomic_add(amt, sk->sk_prot->memory_allocated); + + /* Under limit. */ + if (atomic_read(sk->sk_prot->memory_allocated) < sk->sk_prot->sysctl_mem[0]) { + if (*sk->sk_prot->memory_pressure) + *sk->sk_prot->memory_pressure = 0; + return 1; + } + + /* Over hard limit. */ + if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[2]) { + sk->sk_prot->enter_memory_pressure(); + goto suppress_allocation; + } + + /* Under pressure.
*/ + if (atomic_read(sk->sk_prot->memory_allocated) > sk->sk_prot->sysctl_mem[1]) + sk->sk_prot->enter_memory_pressure(); + + if (kind) { + if (atomic_read(&sk->sk_rmem_alloc) < sk->sk_prot->sysctl_rmem[0]) + return 1; + } else if (sk->sk_wmem_queued < sk->sk_prot->sysctl_wmem[0]) + return 1; + + if (!*sk->sk_prot->memory_pressure || + sk->sk_prot->sysctl_mem[2] > atomic_read(sk->sk_prot->sockets_allocated) * + sk_stream_pages(sk->sk_wmem_queued + + atomic_read(&sk->sk_rmem_alloc) + + sk->sk_forward_alloc)) + return 1; + +suppress_allocation: + + if (!kind) { + sk_stream_moderate_sndbuf(sk); + + /* Fail only if socket is _under_ its sndbuf. + * In this case we cannot block, so that we have to fail. + */ + if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) + return 1; + } + + /* Alas. Undo changes. */ + sk->sk_forward_alloc -= amt * SK_STREAM_MEM_QUANTUM; + atomic_sub(amt, sk->sk_prot->memory_allocated); + return 0; +} + +EXPORT_SYMBOL(sk_stream_mem_schedule); + +void sk_stream_kill_queues(struct sock *sk) +{ + /* First the read buffer. */ + __skb_queue_purge(&sk->sk_receive_queue); + + /* Next, the error queue. */ + __skb_queue_purge(&sk->sk_error_queue); + + /* Next, the write queue. */ + BUG_TRAP(skb_queue_empty(&sk->sk_write_queue)); + + /* Account for returned memory. */ + sk_stream_mem_reclaim(sk); + + BUG_TRAP(!sk->sk_wmem_queued); + BUG_TRAP(!sk->sk_forward_alloc); + + /* It is _impossible_ for the backlog to contain anything + * when we get here. All user references to this socket + * have gone away, only the net layer knows can touch it. + */ +} + +EXPORT_SYMBOL(sk_stream_kill_queues); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c new file mode 100644 index 000000000000..c8be646cb191 --- /dev/null +++ b/net/core/sysctl_net_core.c @@ -0,0 +1,182 @@ +/* -*- linux-c -*- + * sysctl_net_core.c: sysctl interface to net core subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/core directory entry (empty =) ). [MS] + */ + +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL + +extern int netdev_max_backlog; +extern int weight_p; +extern int no_cong_thresh; +extern int no_cong; +extern int lo_cong; +extern int mod_cong; +extern int netdev_fastroute; +extern int net_msg_cost; +extern int net_msg_burst; + +extern __u32 sysctl_wmem_max; +extern __u32 sysctl_rmem_max; +extern __u32 sysctl_wmem_default; +extern __u32 sysctl_rmem_default; + +extern int sysctl_core_destroy_delay; +extern int sysctl_optmem_max; +extern int sysctl_somaxconn; + +#ifdef CONFIG_NET_DIVERT +extern char sysctl_divert_version[]; +#endif /* CONFIG_NET_DIVERT */ + +/* + * This strdup() is used for creating copies of network + * device names to be handed over to sysctl. 
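A minimal usage sketch, assuming a hypothetical caller that builds a per-device ctl_table entry and therefore needs its own copy of the interface name (the helper name and error handling are invented for illustration):

    #include <linux/sysctl.h>
    #include <linux/errno.h>

    /* Hypothetical caller: the ctl_table outlives the caller's buffer,
     * so the entry must own its own copy of the device name.
     */
    static int example_name_sysctl_entry(struct ctl_table *ent, const char *ifname)
    {
    	char *name = net_sysctl_strdup(ifname);	/* kmalloc + strcpy */

    	if (name == NULL)
    		return -ENOMEM;
    	ent->procname = name;
    	return 0;
    }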
+ */ + +char *net_sysctl_strdup(const char *s) +{ + char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); + if (rv) + strcpy(rv, s); + return rv; +} + +ctl_table core_table[] = { +#ifdef CONFIG_NET + { + .ctl_name = NET_CORE_WMEM_MAX, + .procname = "wmem_max", + .data = &sysctl_wmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_RMEM_MAX, + .procname = "rmem_max", + .data = &sysctl_rmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_WMEM_DEFAULT, + .procname = "wmem_default", + .data = &sysctl_wmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_RMEM_DEFAULT, + .procname = "rmem_default", + .data = &sysctl_rmem_default, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_DEV_WEIGHT, + .procname = "dev_weight", + .data = &weight_p, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_MAX_BACKLOG, + .procname = "netdev_max_backlog", + .data = &netdev_max_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_NO_CONG_THRESH, + .procname = "no_cong_thresh", + .data = &no_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_NO_CONG, + .procname = "no_cong", + .data = &no_cong, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_LO_CONG, + .procname = "lo_cong", + .data = &lo_cong, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_MOD_CONG, + .procname = "mod_cong", + .data = &mod_cong, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_CORE_MSG_COST, + .procname = "message_cost", + .data = &net_msg_cost, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_CORE_MSG_BURST, + .procname = "message_burst", + .data = &net_msg_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_CORE_OPTMEM_MAX, + .procname = "optmem_max", + .data = &sysctl_optmem_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#ifdef CONFIG_NET_DIVERT + { + .ctl_name = NET_CORE_DIVERT_VERSION, + .procname = "divert_version", + .data = (void *)sysctl_divert_version, + .maxlen = 32, + .mode = 0444, + .proc_handler = &proc_dostring + }, +#endif /* CONFIG_NET_DIVERT */ +#endif /* CONFIG_NET */ + { + .ctl_name = NET_CORE_SOMAXCONN, + .procname = "somaxconn", + .data = &sysctl_somaxconn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +EXPORT_SYMBOL(net_sysctl_strdup); + +#endif diff --git a/net/core/utils.c b/net/core/utils.c new file mode 100644 index 000000000000..e11a8654f363 --- /dev/null +++ b/net/core/utils.c @@ -0,0 +1,155 @@ +/* + * Generic address resultion entity + * + * Authors: + * net_random Alan Cox + * net_ratelimit Andy Kleen + * + * Created by Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +/* + This is a maximally equidistributed combined Tausworthe generator + based on code from GNU Scientific Library 1.5 (30 Jun 2004) + + x_n = (s1_n ^ s2_n ^ s3_n) + + s1_{n+1} = (((s1_n & 4294967294) <<12) ^ (((s1_n <<13) ^ s1_n) >>19)) + s2_{n+1} = (((s2_n & 4294967288) << 4) ^ (((s2_n << 2) ^ s2_n) >>25)) + s3_{n+1} = (((s3_n & 4294967280) <<17) ^ (((s3_n << 3) ^ s3_n) >>11)) + + The period of this generator is about 2^88. + + From: P. L'Ecuyer, "Maximally Equidistributed Combined Tausworthe + Generators", Mathematics of Computation, 65, 213 (1996), 203--213. + + This is available on the net from L'Ecuyer's home page, + + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme.ps + ftp://ftp.iro.umontreal.ca/pub/simulation/lecuyer/papers/tausme.ps + + There is an erratum in the paper "Tables of Maximally + Equidistributed Combined LFSR Generators", Mathematics of + Computation, 68, 225 (1999), 261--269: + http://www.iro.umontreal.ca/~lecuyer/myftp/papers/tausme2.ps + + ... the k_j most significant bits of z_j must be non- + zero, for each j. (Note: this restriction also applies to the + computer code given in [4], but was mistakenly not mentioned in + that paper.) + + This affects the seeding procedure by imposing the requirement + s1 > 1, s2 > 7, s3 > 15. + +*/ +struct nrnd_state { + u32 s1, s2, s3; +}; + +static DEFINE_PER_CPU(struct nrnd_state, net_rand_state); + +static u32 __net_random(struct nrnd_state *state) +{ +#define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b) + + state->s1 = TAUSWORTHE(state->s1, 13, 19, 4294967294UL, 12); + state->s2 = TAUSWORTHE(state->s2, 2, 25, 4294967288UL, 4); + state->s3 = TAUSWORTHE(state->s3, 3, 11, 4294967280UL, 17); + + return (state->s1 ^ state->s2 ^ state->s3); +} + +static void __net_srandom(struct nrnd_state *state, unsigned long s) +{ + if (s == 0) + s = 1; /* default seed is 1 */ + +#define LCG(n) (69069 * n) + state->s1 = LCG(s); + state->s2 = LCG(state->s1); + state->s3 = LCG(state->s2); + + /* "warm it up" */ + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); + __net_random(state); +} + + +unsigned long net_random(void) +{ + unsigned long r; + struct nrnd_state *state = &get_cpu_var(net_rand_state); + r = __net_random(state); + put_cpu_var(state); + return r; +} + + +void net_srandom(unsigned long entropy) +{ + struct nrnd_state *state = &get_cpu_var(net_rand_state); + __net_srandom(state, state->s1^entropy); + put_cpu_var(state); +} + +void __init net_random_init(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + struct nrnd_state *state = &per_cpu(net_rand_state,i); + __net_srandom(state, i+jiffies); + } +} + +static int net_random_reseed(void) +{ + int i; + unsigned long seed[NR_CPUS]; + + get_random_bytes(seed, sizeof(seed)); + for (i = 0; i < NR_CPUS; i++) { + struct nrnd_state *state = &per_cpu(net_rand_state,i); + __net_srandom(state, seed[i]); + } + return 0; +} +late_initcall(net_random_reseed); + +int net_msg_cost = 5*HZ; +int net_msg_burst = 10; + +/* + * All net warning printk()s should be guarded by this function.
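Two typical call sites for the helpers this file exports, sketched with invented message text, timer maths and include list:

    #include <linux/kernel.h>
    #include <linux/jiffies.h>
    #include <linux/net.h>

    /* Rate-limit a warning that could otherwise flood the log. */
    static void example_warn_bad_header(void)
    {
    	if (net_ratelimit())
    		printk(KERN_WARNING "example: dropping packet with bad header\n");
    }

    /* Add random jitter to a timeout so that peers do not synchronise. */
    static unsigned long example_jittered_timeout(void)
    {
    	return jiffies + HZ + (net_random() % (HZ / 2));
    }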
+ */ +int net_ratelimit(void) +{ + return __printk_ratelimit(net_msg_cost, net_msg_burst); +} + +EXPORT_SYMBOL(net_random); +EXPORT_SYMBOL(net_ratelimit); +EXPORT_SYMBOL(net_srandom); diff --git a/net/core/wireless.c b/net/core/wireless.c new file mode 100644 index 000000000000..750cc5daeb03 --- /dev/null +++ b/net/core/wireless.c @@ -0,0 +1,1459 @@ +/* + * This file implement the Wireless Extensions APIs. + * + * Authors : Jean Tourrilhes - HPL - + * Copyright (c) 1997-2004 Jean Tourrilhes, All Rights Reserved. + * + * (As all part of the Linux kernel, this file is GPL) + */ + +/************************** DOCUMENTATION **************************/ +/* + * API definition : + * -------------- + * See for details of the APIs and the rest. + * + * History : + * ------- + * + * v1 - 5.12.01 - Jean II + * o Created this file. + * + * v2 - 13.12.01 - Jean II + * o Move /proc/net/wireless stuff from net/core/dev.c to here + * o Make Wireless Extension IOCTLs go through here + * o Added iw_handler handling ;-) + * o Added standard ioctl description + * o Initial dumb commit strategy based on orinoco.c + * + * v3 - 19.12.01 - Jean II + * o Make sure we don't go out of standard_ioctl[] in ioctl_standard_call + * o Add event dispatcher function + * o Add event description + * o Propagate events as rtnetlink IFLA_WIRELESS option + * o Generate event on selected SET requests + * + * v4 - 18.04.02 - Jean II + * o Fix stupid off by one in iw_ioctl_description : IW_ESSID_MAX_SIZE + 1 + * + * v5 - 21.06.02 - Jean II + * o Add IW_PRIV_TYPE_ADDR in priv_type_size (+cleanup) + * o Reshuffle IW_HEADER_TYPE_XXX to map IW_PRIV_TYPE_XXX changes + * o Add IWEVCUSTOM for driver specific event/scanning token + * o Turn on WE_STRICT_WRITE by default + kernel warning + * o Fix WE_STRICT_WRITE in ioctl_export_private() (32 => iw_num) + * o Fix off-by-one in test (extra_size <= IFNAMSIZ) + * + * v6 - 9.01.03 - Jean II + * o Add common spy support : iw_handler_set_spy(), wireless_spy_update() + * o Add enhanced spy support : iw_handler_set_thrspy() and event. + * o Add WIRELESS_EXT version display in /proc/net/wireless + * + * v6 - 18.06.04 - Jean II + * o Change get_spydata() method for added safety + * o Remove spy #ifdef, they are always on -> cleaner code + * o Allow any size GET request if user specifies length > max + * and if request has IW_DESCR_FLAG_NOMAX flag or is SIOCGIWPRIV + * o Start migrating get_wireless_stats to struct iw_handler_def + * o Add wmb() in iw_handler_set_spy() for non-coherent archs/cpus + * Based on patch from Pavel Roskin : + * o Fix kernel data leak to user space in private handler handling + */ + +/***************************** INCLUDES *****************************/ + +#include /* Not needed ??? 
*/ +#include +#include /* off_t */ +#include /* struct ifreq, dev_get_by_name() */ +#include +#include /* rtnetlink stuff */ +#include +#include /* for __init */ +#include /* ARPHRD_ETHER */ + +#include /* Pretty obvious */ +#include /* New driver API */ + +#include /* copy_to_user() */ + +/**************************** CONSTANTS ****************************/ + +/* Debugging stuff */ +#undef WE_IOCTL_DEBUG /* Debug IOCTL API */ +#undef WE_EVENT_DEBUG /* Debug Event dispatcher */ +#undef WE_SPY_DEBUG /* Debug enhanced spy support */ + +/* Options */ +#define WE_EVENT_NETLINK /* Propagate events using rtnetlink */ +#define WE_SET_EVENT /* Generate an event on some set commands */ + +/************************* GLOBAL VARIABLES *************************/ +/* + * You should not use global variables, because of re-entrancy. + * On our case, it's only const, so it's OK... + */ +/* + * Meta-data about all the standard Wireless Extension request we + * know about. + */ +static const struct iw_ioctl_description standard_ioctl[] = { + [SIOCSIWCOMMIT - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWNAME - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_CHAR, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWNWID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWNWID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWFREQ - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_FREQ, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWFREQ - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_FREQ, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWMODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_UINT, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWMODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_UINT, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWSENS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWSENS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWRANGE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWRANGE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = sizeof(struct iw_range), + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWPRIV - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWPRIV - SIOCIWFIRST] = { /* (handled directly by us) */ + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCSIWSTATS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_NULL, + }, + [SIOCGIWSTATS - SIOCIWFIRST] = { /* (handled directly by us) */ + .header_type = IW_HEADER_TYPE_NULL, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr), + .max_tokens = IW_MAX_SPY, + }, + [SIOCGIWSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr) + + sizeof(struct iw_quality), + .max_tokens = IW_MAX_SPY, + }, + [SIOCSIWTHRSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_thrspy), + .min_tokens = 1, + .max_tokens = 1, + }, + [SIOCGIWTHRSPY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct iw_thrspy), + .min_tokens = 1, + .max_tokens = 1, + }, + [SIOCSIWAP - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [SIOCGIWAP - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCGIWAPLIST - SIOCIWFIRST] = { + 
.header_type = IW_HEADER_TYPE_POINT, + .token_size = sizeof(struct sockaddr) + + sizeof(struct iw_quality), + .max_tokens = IW_MAX_AP, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [SIOCSIWSCAN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWSCAN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_SCAN_MAX_DATA, + .flags = IW_DESCR_FLAG_NOMAX, + }, + [SIOCSIWESSID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + .flags = IW_DESCR_FLAG_EVENT, + }, + [SIOCGIWESSID - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + .flags = IW_DESCR_FLAG_DUMP, + }, + [SIOCSIWNICKN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + }, + [SIOCGIWNICKN - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ESSID_MAX_SIZE + 1, + }, + [SIOCSIWRATE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWRATE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWRTS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWRTS - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWFRAG - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWFRAG - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWTXPOW - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWTXPOW - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWRETRY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWRETRY - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCSIWENCODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ENCODING_TOKEN_MAX, + .flags = IW_DESCR_FLAG_EVENT | IW_DESCR_FLAG_RESTRICT, + }, + [SIOCGIWENCODE - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_ENCODING_TOKEN_MAX, + .flags = IW_DESCR_FLAG_DUMP | IW_DESCR_FLAG_RESTRICT, + }, + [SIOCSIWPOWER - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, + [SIOCGIWPOWER - SIOCIWFIRST] = { + .header_type = IW_HEADER_TYPE_PARAM, + }, +}; +static const int standard_ioctl_num = (sizeof(standard_ioctl) / + sizeof(struct iw_ioctl_description)); + +/* + * Meta-data about all the additional standard Wireless Extension events + * we know about. 
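For orientation, a sketch of the driver side of this event machinery: the driver fills a union iwreq_data and hands it to wireless_send_event() (the dispatcher used further down in this file), and the IWEVCUSTOM entry in the table below tells the dispatcher how to size the payload. The function name, headers and message here are illustrative:

    #include <linux/string.h>
    #include <linux/wireless.h>
    #include <net/iw_handler.h>

    /* Illustrative driver-side report of a custom event. */
    static void example_report_custom_event(struct net_device *dev, char *msg)
    {
    	union iwreq_data wrqu;

    	memset(&wrqu, 0, sizeof(wrqu));
    	wrqu.data.length = strlen(msg);
    	wireless_send_event(dev, IWEVCUSTOM, &wrqu, msg);
    }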
+ */ +static const struct iw_ioctl_description standard_event[] = { + [IWEVTXDROP - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IWEVQUAL - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_QUAL, + }, + [IWEVCUSTOM - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_POINT, + .token_size = 1, + .max_tokens = IW_CUSTOM_MAX, + }, + [IWEVREGISTERED - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, + [IWEVEXPIRED - IWEVFIRST] = { + .header_type = IW_HEADER_TYPE_ADDR, + }, +}; +static const int standard_event_num = (sizeof(standard_event) / + sizeof(struct iw_ioctl_description)); + +/* Size (in bytes) of the various private data types */ +static const char iw_priv_type_size[] = { + 0, /* IW_PRIV_TYPE_NONE */ + 1, /* IW_PRIV_TYPE_BYTE */ + 1, /* IW_PRIV_TYPE_CHAR */ + 0, /* Not defined */ + sizeof(__u32), /* IW_PRIV_TYPE_INT */ + sizeof(struct iw_freq), /* IW_PRIV_TYPE_FLOAT */ + sizeof(struct sockaddr), /* IW_PRIV_TYPE_ADDR */ + 0, /* Not defined */ +}; + +/* Size (in bytes) of various events */ +static const int event_type_size[] = { + IW_EV_LCP_LEN, /* IW_HEADER_TYPE_NULL */ + 0, + IW_EV_CHAR_LEN, /* IW_HEADER_TYPE_CHAR */ + 0, + IW_EV_UINT_LEN, /* IW_HEADER_TYPE_UINT */ + IW_EV_FREQ_LEN, /* IW_HEADER_TYPE_FREQ */ + IW_EV_ADDR_LEN, /* IW_HEADER_TYPE_ADDR */ + 0, + IW_EV_POINT_LEN, /* Without variable payload */ + IW_EV_PARAM_LEN, /* IW_HEADER_TYPE_PARAM */ + IW_EV_QUAL_LEN, /* IW_HEADER_TYPE_QUAL */ +}; + +/************************ COMMON SUBROUTINES ************************/ +/* + * Stuff that may be used in various place or doesn't fit in one + * of the section below. + */ + +/* ---------------------------------------------------------------- */ +/* + * Return the driver handler associated with a specific Wireless Extension. + * Called from various place, so make sure it remains efficient. + */ +static inline iw_handler get_handler(struct net_device *dev, + unsigned int cmd) +{ + /* Don't "optimise" the following variable, it will crash */ + unsigned int index; /* *MUST* be unsigned */ + + /* Check if we have some wireless handlers defined */ + if(dev->wireless_handlers == NULL) + return NULL; + + /* Try as a standard command */ + index = cmd - SIOCIWFIRST; + if(index < dev->wireless_handlers->num_standard) + return dev->wireless_handlers->standard[index]; + + /* Try as a private command */ + index = cmd - SIOCIWFIRSTPRIV; + if(index < dev->wireless_handlers->num_private) + return dev->wireless_handlers->private[index]; + + /* Not found */ + return NULL; +} + +/* ---------------------------------------------------------------- */ +/* + * Get statistics out of the driver + */ +static inline struct iw_statistics *get_wireless_stats(struct net_device *dev) +{ + /* New location */ + if((dev->wireless_handlers != NULL) && + (dev->wireless_handlers->get_wireless_stats != NULL)) + return dev->wireless_handlers->get_wireless_stats(dev); + + /* Old location, will be phased out in next WE */ + return (dev->get_wireless_stats ? + dev->get_wireless_stats(dev) : + (struct iw_statistics *) NULL); +} + +/* ---------------------------------------------------------------- */ +/* + * Call the commit handler in the driver + * (if exist and if conditions are right) + * + * Note : our current commit strategy is currently pretty dumb, + * but we will be able to improve on that... + * The goal is to try to agreagate as many changes as possible + * before doing the commit. 
Drivers that will define a commit handler + * are usually those that need a reset after changing parameters, so + * we want to minimise the number of reset. + * A cool idea is to use a timer : at each "set" command, we re-set the + * timer, when the timer eventually fires, we call the driver. + * Hopefully, more on that later. + * + * Also, I'm waiting to see how many people will complain about the + * netif_running(dev) test. I'm open on that one... + * Hopefully, the driver will remember to do a commit in "open()" ;-) + */ +static inline int call_commit_handler(struct net_device * dev) +{ + if((netif_running(dev)) && + (dev->wireless_handlers->standard[0] != NULL)) { + /* Call the commit handler on the driver */ + return dev->wireless_handlers->standard[0](dev, NULL, + NULL, NULL); + } else + return 0; /* Command completed successfully */ +} + +/* ---------------------------------------------------------------- */ +/* + * Calculate size of private arguments + */ +static inline int get_priv_size(__u16 args) +{ + int num = args & IW_PRIV_SIZE_MASK; + int type = (args & IW_PRIV_TYPE_MASK) >> 12; + + return num * iw_priv_type_size[type]; +} + +/* ---------------------------------------------------------------- */ +/* + * Re-calculate the size of private arguments + */ +static inline int adjust_priv_size(__u16 args, + union iwreq_data * wrqu) +{ + int num = wrqu->data.length; + int max = args & IW_PRIV_SIZE_MASK; + int type = (args & IW_PRIV_TYPE_MASK) >> 12; + + /* Make sure the driver doesn't goof up */ + if (max < num) + num = max; + + return num * iw_priv_type_size[type]; +} + + +/******************** /proc/net/wireless SUPPORT ********************/ +/* + * The /proc/net/wireless file is a human readable user-space interface + * exporting various wireless specific statistics from the wireless devices. + * This is the most popular part of the Wireless Extensions ;-) + * + * This interface is a pure clone of /proc/net/dev (in net/core/dev.c). + * The content of the file is basically the content of "struct iw_statistics". + */ + +#ifdef CONFIG_PROC_FS + +/* ---------------------------------------------------------------- */ +/* + * Print one entry (line) of /proc/net/wireless + */ +static __inline__ void wireless_seq_printf_stats(struct seq_file *seq, + struct net_device *dev) +{ + /* Get stats from the driver */ + struct iw_statistics *stats = get_wireless_stats(dev); + + if (stats) { + seq_printf(seq, "%6s: %04x %3d%c %3d%c %3d%c %6d %6d %6d " + "%6d %6d %6d\n", + dev->name, stats->status, stats->qual.qual, + stats->qual.updated & IW_QUAL_QUAL_UPDATED + ? '.' : ' ', + ((__u8) stats->qual.level), + stats->qual.updated & IW_QUAL_LEVEL_UPDATED + ? '.' : ' ', + ((__u8) stats->qual.noise), + stats->qual.updated & IW_QUAL_NOISE_UPDATED + ? '.' 
: ' ', + stats->discard.nwid, stats->discard.code, + stats->discard.fragment, stats->discard.retries, + stats->discard.misc, stats->miss.beacon); + stats->qual.updated = 0; + } +} + +/* ---------------------------------------------------------------- */ +/* + * Print info for /proc/net/wireless (print all entries) + */ +static int wireless_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "Inter-| sta-| Quality | Discarded " + "packets | Missed | WE\n" + " face | tus | link level noise | nwid " + "crypt frag retry misc | beacon | %d\n", + WIRELESS_EXT); + else + wireless_seq_printf_stats(seq, v); + return 0; +} + +extern void *dev_seq_start(struct seq_file *seq, loff_t *pos); +extern void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos); +extern void dev_seq_stop(struct seq_file *seq, void *v); + +static struct seq_operations wireless_seq_ops = { + .start = dev_seq_start, + .next = dev_seq_next, + .stop = dev_seq_stop, + .show = wireless_seq_show, +}; + +static int wireless_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &wireless_seq_ops); +} + +static struct file_operations wireless_seq_fops = { + .owner = THIS_MODULE, + .open = wireless_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +int __init wireless_proc_init(void) +{ + if (!proc_net_fops_create("wireless", S_IRUGO, &wireless_seq_fops)) + return -ENOMEM; + + return 0; +} +#endif /* CONFIG_PROC_FS */ + +/************************** IOCTL SUPPORT **************************/ +/* + * The original user space API to configure all those Wireless Extensions + * is through IOCTLs. + * In there, we check if we need to call the new driver API (iw_handler) + * or just call the driver ioctl handler. + */ + +/* ---------------------------------------------------------------- */ +/* + * Allow programatic access to /proc/net/wireless even if /proc + * doesn't exist... Also more efficient... + */ +static inline int dev_iwstats(struct net_device *dev, struct ifreq *ifr) +{ + /* Get stats from the driver */ + struct iw_statistics *stats; + + stats = get_wireless_stats(dev); + if (stats != (struct iw_statistics *) NULL) { + struct iwreq * wrq = (struct iwreq *)ifr; + + /* Copy statistics to the user buffer */ + if(copy_to_user(wrq->u.data.pointer, stats, + sizeof(struct iw_statistics))) + return -EFAULT; + + /* Check if we need to clear the update flag */ + if(wrq->u.data.flags != 0) + stats->qual.updated = 0; + return 0; + } else + return -EOPNOTSUPP; +} + +/* ---------------------------------------------------------------- */ +/* + * Export the driver private handler definition + * They will be picked up by tools like iwpriv... + */ +static inline int ioctl_export_private(struct net_device * dev, + struct ifreq * ifr) +{ + struct iwreq * iwr = (struct iwreq *) ifr; + + /* Check if the driver has something to export */ + if((dev->wireless_handlers->num_private_args == 0) || + (dev->wireless_handlers->private_args == NULL)) + return -EOPNOTSUPP; + + /* Check NULL pointer */ + if(iwr->u.data.pointer == NULL) + return -EFAULT; + + /* Check if there is enough buffer up there */ + if(iwr->u.data.length < dev->wireless_handlers->num_private_args) { + /* User space can't know in advance how large the buffer + * needs to be. Give it a hint, so that we can support + * any size buffer we want somewhat efficiently... */ + iwr->u.data.length = dev->wireless_handlers->num_private_args; + return -E2BIG; + } + + /* Set the number of available ioctls. 
*/ + iwr->u.data.length = dev->wireless_handlers->num_private_args; + + /* Copy structure to the user buffer. */ + if (copy_to_user(iwr->u.data.pointer, + dev->wireless_handlers->private_args, + sizeof(struct iw_priv_args) * iwr->u.data.length)) + return -EFAULT; + + return 0; +} + +/* ---------------------------------------------------------------- */ +/* + * Wrapper to call a standard Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + */ +static inline int ioctl_standard_call(struct net_device * dev, + struct ifreq * ifr, + unsigned int cmd, + iw_handler handler) +{ + struct iwreq * iwr = (struct iwreq *) ifr; + const struct iw_ioctl_description * descr; + struct iw_request_info info; + int ret = -EINVAL; + + /* Get the description of the IOCTL */ + if((cmd - SIOCIWFIRST) >= standard_ioctl_num) + return -EOPNOTSUPP; + descr = &(standard_ioctl[cmd - SIOCIWFIRST]); + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Found standard handler for 0x%04X\n", + ifr->ifr_name, cmd); + printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); +#endif /* WE_IOCTL_DEBUG */ + + /* Prepare the call */ + info.cmd = cmd; + info.flags = 0; + + /* Check if we have a pointer to user space data or not */ + if(descr->header_type != IW_HEADER_TYPE_POINT) { + + /* No extra arguments. Trivial to handle */ + ret = handler(dev, &info, &(iwr->u), NULL); + +#ifdef WE_SET_EVENT + /* Generate an event to notify listeners of the change */ + if((descr->flags & IW_DESCR_FLAG_EVENT) && + ((ret == 0) || (ret == -EIWCOMMIT))) + wireless_send_event(dev, cmd, &(iwr->u), NULL); +#endif /* WE_SET_EVENT */ + } else { + char * extra; + int extra_size; + int user_length = 0; + int err; + + /* Calculate space needed by arguments. Always allocate + * for max space. Easier, and won't last long... */ + extra_size = descr->max_tokens * descr->token_size; + + /* Check what user space is giving us */ + if(IW_IS_SET(cmd)) { + /* Check NULL pointer */ + if((iwr->u.data.pointer == NULL) && + (iwr->u.data.length != 0)) + return -EFAULT; + /* Check if number of token fits within bounds */ + if(iwr->u.data.length > descr->max_tokens) + return -E2BIG; + if(iwr->u.data.length < descr->min_tokens) + return -EINVAL; + } else { + /* Check NULL pointer */ + if(iwr->u.data.pointer == NULL) + return -EFAULT; + /* Save user space buffer size for checking */ + user_length = iwr->u.data.length; + + /* Don't check if user_length > max to allow forward + * compatibility. The test user_length < min is + * implied by the test at the end. */ + + /* Support for very large requests */ + if((descr->flags & IW_DESCR_FLAG_NOMAX) && + (user_length > descr->max_tokens)) { + /* Allow userspace to GET more than max so + * we can support any size GET requests. + * There is still a limit : -ENOMEM. */ + extra_size = user_length * descr->token_size; + /* Note : user_length is originally a __u16, + * and token_size is controlled by us, + * so extra_size won't get negative and + * won't overflow... 
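+ * (For illustration : even a worst case of 0xffff tokens of,
+ * say, 32 bytes each comes to only about 2 MB, far below
+ * INT_MAX, so the kmalloc() size below is always well defined.)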
*/ + } + } + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", + dev->name, extra_size); +#endif /* WE_IOCTL_DEBUG */ + + /* Create the kernel buffer */ + extra = kmalloc(extra_size, GFP_KERNEL); + if (extra == NULL) { + return -ENOMEM; + } + + /* If it is a SET, get all the extra data in here */ + if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { + err = copy_from_user(extra, iwr->u.data.pointer, + iwr->u.data.length * + descr->token_size); + if (err) { + kfree(extra); + return -EFAULT; + } +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Got %d bytes\n", + dev->name, + iwr->u.data.length * descr->token_size); +#endif /* WE_IOCTL_DEBUG */ + } + + /* Call the handler */ + ret = handler(dev, &info, &(iwr->u), extra); + + /* If we have something to return to the user */ + if (!ret && IW_IS_GET(cmd)) { + /* Check if there is enough buffer up there */ + if(user_length < iwr->u.data.length) { + kfree(extra); + return -E2BIG; + } + + err = copy_to_user(iwr->u.data.pointer, extra, + iwr->u.data.length * + descr->token_size); + if (err) + ret = -EFAULT; +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Wrote %d bytes\n", + dev->name, + iwr->u.data.length * descr->token_size); +#endif /* WE_IOCTL_DEBUG */ + } + +#ifdef WE_SET_EVENT + /* Generate an event to notify listeners of the change */ + if((descr->flags & IW_DESCR_FLAG_EVENT) && + ((ret == 0) || (ret == -EIWCOMMIT))) { + if(descr->flags & IW_DESCR_FLAG_RESTRICT) + /* If the event is restricted, don't + * export the payload */ + wireless_send_event(dev, cmd, &(iwr->u), NULL); + else + wireless_send_event(dev, cmd, &(iwr->u), + extra); + } +#endif /* WE_SET_EVENT */ + + /* Cleanup - I told you it wasn't that long ;-) */ + kfree(extra); + } + + /* Call commit handler if needed and defined */ + if(ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + /* Here, we will generate the appropriate event if needed */ + + return ret; +} + +/* ---------------------------------------------------------------- */ +/* + * Wrapper to call a private Wireless Extension handler. + * We do various checks and also take care of moving data between + * user space and kernel space. + * It's not as nice and slimline as the standard wrapper. The cause + * is struct iw_priv_args, which was not really designed for the + * job we are going here. + * + * IMPORTANT : This function prevent to set and get data on the same + * IOCTL and enforce the SET/GET convention. Not doing it would be + * far too hairy... + * If you need to set and get data at the same time, please don't use + * a iw_handler but process it in your ioctl handler (i.e. use the + * old driver API). 
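+ * As a purely hypothetical illustration (the ioctl name below is
+ * invented), a SET-only private ioctl taking one fixed integer
+ * argument would be described in the driver's iw_priv_args table as :
+ *	{ .cmd = SIOCIWFIRSTPRIV + 0,
+ *	  .set_args = IW_PRIV_TYPE_INT | IW_PRIV_SIZE_FIXED | 1,
+ *	  .get_args = 0,
+ *	  .name = "set_debug" },
+ * and would then be dispatched through the wrapper below.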
+ */ +static inline int ioctl_private_call(struct net_device * dev, + struct ifreq * ifr, + unsigned int cmd, + iw_handler handler) +{ + struct iwreq * iwr = (struct iwreq *) ifr; + const struct iw_priv_args * descr = NULL; + struct iw_request_info info; + int extra_size = 0; + int i; + int ret = -EINVAL; + + /* Get the description of the IOCTL */ + for(i = 0; i < dev->wireless_handlers->num_private_args; i++) + if(cmd == dev->wireless_handlers->private_args[i].cmd) { + descr = &(dev->wireless_handlers->private_args[i]); + break; + } + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Found private handler for 0x%04X\n", + ifr->ifr_name, cmd); + if(descr) { + printk(KERN_DEBUG "%s (WE) : Name %s, set %X, get %X\n", + dev->name, descr->name, + descr->set_args, descr->get_args); + } +#endif /* WE_IOCTL_DEBUG */ + + /* Compute the size of the set/get arguments */ + if(descr != NULL) { + if(IW_IS_SET(cmd)) { + int offset = 0; /* For sub-ioctls */ + /* Check for sub-ioctl handler */ + if(descr->name[0] == '\0') + /* Reserve one int for sub-ioctl index */ + offset = sizeof(__u32); + + /* Size of set arguments */ + extra_size = get_priv_size(descr->set_args); + + /* Does it fits in iwr ? */ + if((descr->set_args & IW_PRIV_SIZE_FIXED) && + ((extra_size + offset) <= IFNAMSIZ)) + extra_size = 0; + } else { + /* Size of get arguments */ + extra_size = get_priv_size(descr->get_args); + + /* Does it fits in iwr ? */ + if((descr->get_args & IW_PRIV_SIZE_FIXED) && + (extra_size <= IFNAMSIZ)) + extra_size = 0; + } + } + + /* Prepare the call */ + info.cmd = cmd; + info.flags = 0; + + /* Check if we have a pointer to user space data or not. */ + if(extra_size == 0) { + /* No extra arguments. Trivial to handle */ + ret = handler(dev, &info, &(iwr->u), (char *) &(iwr->u)); + } else { + char * extra; + int err; + + /* Check what user space is giving us */ + if(IW_IS_SET(cmd)) { + /* Check NULL pointer */ + if((iwr->u.data.pointer == NULL) && + (iwr->u.data.length != 0)) + return -EFAULT; + + /* Does it fits within bounds ? */ + if(iwr->u.data.length > (descr->set_args & + IW_PRIV_SIZE_MASK)) + return -E2BIG; + } else { + /* Check NULL pointer */ + if(iwr->u.data.pointer == NULL) + return -EFAULT; + } + +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Malloc %d bytes\n", + dev->name, extra_size); +#endif /* WE_IOCTL_DEBUG */ + + /* Always allocate for max space. Easier, and won't last + * long... */ + extra = kmalloc(extra_size, GFP_KERNEL); + if (extra == NULL) { + return -ENOMEM; + } + + /* If it is a SET, get all the extra data in here */ + if(IW_IS_SET(cmd) && (iwr->u.data.length != 0)) { + err = copy_from_user(extra, iwr->u.data.pointer, + extra_size); + if (err) { + kfree(extra); + return -EFAULT; + } +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Got %d elem\n", + dev->name, iwr->u.data.length); +#endif /* WE_IOCTL_DEBUG */ + } + + /* Call the handler */ + ret = handler(dev, &info, &(iwr->u), extra); + + /* If we have something to return to the user */ + if (!ret && IW_IS_GET(cmd)) { + + /* Adjust for the actual length if it's variable, + * avoid leaking kernel bits outside. 
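+			 * (For instance, a GET handler that returns only 3
+			 * of a possible 16 elements will have the
+			 * copy_to_user() below trimmed to those 3 elements
+			 * instead of the full array.)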
*/ + if (!(descr->get_args & IW_PRIV_SIZE_FIXED)) { + extra_size = adjust_priv_size(descr->get_args, + &(iwr->u)); + } + + err = copy_to_user(iwr->u.data.pointer, extra, + extra_size); + if (err) + ret = -EFAULT; +#ifdef WE_IOCTL_DEBUG + printk(KERN_DEBUG "%s (WE) : Wrote %d elem\n", + dev->name, iwr->u.data.length); +#endif /* WE_IOCTL_DEBUG */ + } + + /* Cleanup - I told you it wasn't that long ;-) */ + kfree(extra); + } + + + /* Call commit handler if needed and defined */ + if(ret == -EIWCOMMIT) + ret = call_commit_handler(dev); + + return ret; +} + +/* ---------------------------------------------------------------- */ +/* + * Main IOCTl dispatcher. Called from the main networking code + * (dev_ioctl() in net/core/dev.c). + * Check the type of IOCTL and call the appropriate wrapper... + */ +int wireless_process_ioctl(struct ifreq *ifr, unsigned int cmd) +{ + struct net_device *dev; + iw_handler handler; + + /* Permissions are already checked in dev_ioctl() before calling us. + * The copy_to/from_user() of ifr is also dealt with in there */ + + /* Make sure the device exist */ + if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) + return -ENODEV; + + /* A bunch of special cases, then the generic case... + * Note that 'cmd' is already filtered in dev_ioctl() with + * (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) */ + switch(cmd) + { + case SIOCGIWSTATS: + /* Get Wireless Stats */ + return dev_iwstats(dev, ifr); + + case SIOCGIWPRIV: + /* Check if we have some wireless handlers defined */ + if(dev->wireless_handlers != NULL) { + /* We export to user space the definition of + * the private handler ourselves */ + return ioctl_export_private(dev, ifr); + } + // ## Fall-through for old API ## + default: + /* Generic IOCTL */ + /* Basic check */ + if (!netif_device_present(dev)) + return -ENODEV; + /* New driver API : try to find the handler */ + handler = get_handler(dev, cmd); + if(handler != NULL) { + /* Standard and private are not the same */ + if(cmd < SIOCIWFIRSTPRIV) + return ioctl_standard_call(dev, + ifr, + cmd, + handler); + else + return ioctl_private_call(dev, + ifr, + cmd, + handler); + } + /* Old driver API : call driver ioctl handler */ + if (dev->do_ioctl) { + return dev->do_ioctl(dev, ifr, cmd); + } + return -EOPNOTSUPP; + } + /* Not reached */ + return -EINVAL; +} + +/************************* EVENT PROCESSING *************************/ +/* + * Process events generated by the wireless layer or the driver. + * Most often, the event will be propagated through rtnetlink + */ + +#ifdef WE_EVENT_NETLINK +/* "rtnl" is defined in net/core/rtnetlink.c, but we need it here. + * It is declared in */ + +/* ---------------------------------------------------------------- */ +/* + * Fill a rtnetlink message with our event data. + * Note that we propage only the specified event and don't dump the + * current wireless config. Dumping the wireless config is far too + * expensive (for each parameter, the driver need to query the hardware). 
+ */ +static inline int rtnetlink_fill_iwinfo(struct sk_buff * skb, + struct net_device * dev, + int type, + char * event, + int event_len) +{ + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, 0, 0, type, sizeof(*r)); + r = NLMSG_DATA(nlh); + r->ifi_family = AF_UNSPEC; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev->flags; + r->ifi_change = 0; /* Wireless changes don't affect those flags */ + + /* Add the wireless events in the netlink packet */ + RTA_PUT(skb, IFLA_WIRELESS, + event_len, event); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +/* ---------------------------------------------------------------- */ +/* + * Create and broadcast and send it on the standard rtnetlink socket + * This is a pure clone rtmsg_ifinfo() in net/core/rtnetlink.c + * Andrzej Krzysztofowicz mandated that I used a IFLA_XXX field + * within a RTM_NEWLINK event. + */ +static inline void rtmsg_iwinfo(struct net_device * dev, + char * event, + int event_len) +{ + struct sk_buff *skb; + int size = NLMSG_GOODSIZE; + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + return; + + if (rtnetlink_fill_iwinfo(skb, dev, RTM_NEWLINK, + event, event_len) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTMGRP_LINK, GFP_ATOMIC); +} +#endif /* WE_EVENT_NETLINK */ + +/* ---------------------------------------------------------------- */ +/* + * Main event dispatcher. Called from other parts and drivers. + * Send the event on the appropriate channels. + * May be called from interrupt context. + */ +void wireless_send_event(struct net_device * dev, + unsigned int cmd, + union iwreq_data * wrqu, + char * extra) +{ + const struct iw_ioctl_description * descr = NULL; + int extra_len = 0; + struct iw_event *event; /* Mallocated whole event */ + int event_len; /* Its size */ + int hdr_len; /* Size of the event header */ + /* Don't "optimise" the following variable, it will crash */ + unsigned cmd_index; /* *MUST* be unsigned */ + + /* Get the description of the IOCTL */ + if(cmd <= SIOCIWLAST) { + cmd_index = cmd - SIOCIWFIRST; + if(cmd_index < standard_ioctl_num) + descr = &(standard_ioctl[cmd_index]); + } else { + cmd_index = cmd - IWEVFIRST; + if(cmd_index < standard_event_num) + descr = &(standard_event[cmd_index]); + } + /* Don't accept unknown events */ + if(descr == NULL) { + /* Note : we don't return an error to the driver, because + * the driver would not know what to do about it. It can't + * return an error to the user, because the event is not + * initiated by a user request. + * The best the driver could do is to log an error message. + * We will do it ourselves instead... 
+ */ + printk(KERN_ERR "%s (WE) : Invalid/Unknown Wireless Event (0x%04X)\n", + dev->name, cmd); + return; + } +#ifdef WE_EVENT_DEBUG + printk(KERN_DEBUG "%s (WE) : Got event 0x%04X\n", + dev->name, cmd); + printk(KERN_DEBUG "%s (WE) : Header type : %d, Token type : %d, size : %d, token : %d\n", dev->name, descr->header_type, descr->token_type, descr->token_size, descr->max_tokens); +#endif /* WE_EVENT_DEBUG */ + + /* Check extra parameters and set extra_len */ + if(descr->header_type == IW_HEADER_TYPE_POINT) { + /* Check if number of token fits within bounds */ + if(wrqu->data.length > descr->max_tokens) { + printk(KERN_ERR "%s (WE) : Wireless Event too big (%d)\n", dev->name, wrqu->data.length); + return; + } + if(wrqu->data.length < descr->min_tokens) { + printk(KERN_ERR "%s (WE) : Wireless Event too small (%d)\n", dev->name, wrqu->data.length); + return; + } + /* Calculate extra_len - extra is NULL for restricted events */ + if(extra != NULL) + extra_len = wrqu->data.length * descr->token_size; +#ifdef WE_EVENT_DEBUG + printk(KERN_DEBUG "%s (WE) : Event 0x%04X, tokens %d, extra_len %d\n", dev->name, cmd, wrqu->data.length, extra_len); +#endif /* WE_EVENT_DEBUG */ + } + + /* Total length of the event */ + hdr_len = event_type_size[descr->header_type]; + event_len = hdr_len + extra_len; + +#ifdef WE_EVENT_DEBUG + printk(KERN_DEBUG "%s (WE) : Event 0x%04X, hdr_len %d, event_len %d\n", dev->name, cmd, hdr_len, event_len); +#endif /* WE_EVENT_DEBUG */ + + /* Create temporary buffer to hold the event */ + event = kmalloc(event_len, GFP_ATOMIC); + if(event == NULL) + return; + + /* Fill event */ + event->len = event_len; + event->cmd = cmd; + memcpy(&event->u, wrqu, hdr_len - IW_EV_LCP_LEN); + if(extra != NULL) + memcpy(((char *) event) + hdr_len, extra, extra_len); + +#ifdef WE_EVENT_NETLINK + /* rtnetlink event channel */ + rtmsg_iwinfo(dev, (char *) event, event_len); +#endif /* WE_EVENT_NETLINK */ + + /* Cleanup */ + kfree(event); + + return; /* Always success, I guess ;-) */ +} + +/********************** ENHANCED IWSPY SUPPORT **********************/ +/* + * In the old days, the driver was handling spy support all by itself. + * Now, the driver can delegate this task to Wireless Extensions. + * It needs to use those standard spy iw_handler in struct iw_handler_def, + * push data to us via wireless_spy_update() and include struct iw_spy_data + * in its private part (and advertise it in iw_handler_def->spy_offset). + * One of the main advantage of centralising spy support here is that + * it becomes much easier to improve and extend it without having to touch + * the drivers. One example is the addition of the Spy-Threshold events. + */ + +/* ---------------------------------------------------------------- */ +/* + * Return the pointer to the spy data in the driver. + * Because this is called on the Rx path via wireless_spy_update(), + * we want it to be efficient... + */ +static inline struct iw_spy_data * get_spydata(struct net_device *dev) +{ + /* This is the new way */ + if(dev->wireless_data) + return(dev->wireless_data->spy_data); + + /* This is the old way. Doesn't work for multi-headed drivers. + * It will be removed in the next version of WE. 
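+ * (With the new way, the driver just wires things up at init time,
+ * for example (the private structure field names are hypothetical) :
+ *	priv->wireless_data.spy_data = &priv->spy_data;
+ *	dev->wireless_data = &priv->wireless_data;
+ * so the test above reduces to a simple pointer dereference.)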
*/ + return (dev->priv + dev->wireless_handlers->spy_offset); +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set Spy List + */ +int iw_handler_set_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + + if(!dev->wireless_data) + /* Help user know that driver needs updating */ + printk(KERN_DEBUG "%s (WE) : Driver using old/buggy spy support, please fix driver !\n", + dev->name); + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + /* Disable spy collection while we copy the addresses. + * While we copy addresses, any call to wireless_spy_update() + * will NOP. This is OK, as anyway the addresses are changing. */ + spydata->spy_number = 0; + + /* We want to operate without locking, because wireless_spy_update() + * most likely will happen in the interrupt handler, and therefore + * have its own locking constraints and needs performance. + * The rtnl_lock() make sure we don't race with the other iw_handlers. + * This make sure wireless_spy_update() "see" that the spy list + * is temporarily disabled. */ + wmb(); + + /* Are there are addresses to copy? */ + if(wrqu->data.length > 0) { + int i; + + /* Copy addresses */ + for(i = 0; i < wrqu->data.length; i++) + memcpy(spydata->spy_address[i], address[i].sa_data, + ETH_ALEN); + /* Reset stats */ + memset(spydata->spy_stat, 0, + sizeof(struct iw_quality) * IW_MAX_SPY); + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "iw_handler_set_spy() : offset %ld, spydata %p, num %d\n", dev->wireless_handlers->spy_offset, spydata, wrqu->data.length); + for (i = 0; i < wrqu->data.length; i++) + printk(KERN_DEBUG + "%02X:%02X:%02X:%02X:%02X:%02X \n", + spydata->spy_address[i][0], + spydata->spy_address[i][1], + spydata->spy_address[i][2], + spydata->spy_address[i][3], + spydata->spy_address[i][4], + spydata->spy_address[i][5]); +#endif /* WE_SPY_DEBUG */ + } + + /* Make sure above is updated before re-enabling */ + wmb(); + + /* Enable addresses */ + spydata->spy_number = wrqu->data.length; + + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get Spy List + */ +int iw_handler_get_spy(struct net_device * dev, + struct iw_request_info * info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct sockaddr * address = (struct sockaddr *) extra; + int i; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + wrqu->data.length = spydata->spy_number; + + /* Copy addresses. */ + for(i = 0; i < spydata->spy_number; i++) { + memcpy(address[i].sa_data, spydata->spy_address[i], ETH_ALEN); + address[i].sa_family = AF_UNIX; + } + /* Copy stats to the user buffer (just after). */ + if(spydata->spy_number > 0) + memcpy(extra + (sizeof(struct sockaddr) *spydata->spy_number), + spydata->spy_stat, + sizeof(struct iw_quality) * spydata->spy_number); + /* Reset updated flags. 
*/ + for(i = 0; i < spydata->spy_number; i++) + spydata->spy_stat[i].updated = 0; + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : set spy threshold + */ +int iw_handler_set_thrspy(struct net_device * dev, + struct iw_request_info *info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + memcpy(&(spydata->spy_thr_low), &(threshold->low), + 2 * sizeof(struct iw_quality)); + + /* Clear flag */ + memset(spydata->spy_thr_under, '\0', sizeof(spydata->spy_thr_under)); + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "iw_handler_set_thrspy() : low %d ; high %d\n", spydata->spy_thr_low.level, spydata->spy_thr_high.level); +#endif /* WE_SPY_DEBUG */ + + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Standard Wireless Handler : get spy threshold + */ +int iw_handler_get_thrspy(struct net_device * dev, + struct iw_request_info *info, + union iwreq_data * wrqu, + char * extra) +{ + struct iw_spy_data * spydata = get_spydata(dev); + struct iw_thrspy * threshold = (struct iw_thrspy *) extra; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return -EOPNOTSUPP; + + /* Just do it */ + memcpy(&(threshold->low), &(spydata->spy_thr_low), + 2 * sizeof(struct iw_quality)); + + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Prepare and send a Spy Threshold event + */ +static void iw_send_thrspy_event(struct net_device * dev, + struct iw_spy_data * spydata, + unsigned char * address, + struct iw_quality * wstats) +{ + union iwreq_data wrqu; + struct iw_thrspy threshold; + + /* Init */ + wrqu.data.length = 1; + wrqu.data.flags = 0; + /* Copy address */ + memcpy(threshold.addr.sa_data, address, ETH_ALEN); + threshold.addr.sa_family = ARPHRD_ETHER; + /* Copy stats */ + memcpy(&(threshold.qual), wstats, sizeof(struct iw_quality)); + /* Copy also thresholds */ + memcpy(&(threshold.low), &(spydata->spy_thr_low), + 2 * sizeof(struct iw_quality)); + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "iw_send_thrspy_event() : address %02X:%02X:%02X:%02X:%02X:%02X, level %d, up = %d\n", + threshold.addr.sa_data[0], + threshold.addr.sa_data[1], + threshold.addr.sa_data[2], + threshold.addr.sa_data[3], + threshold.addr.sa_data[4], + threshold.addr.sa_data[5], threshold.qual.level); +#endif /* WE_SPY_DEBUG */ + + /* Send event to user space */ + wireless_send_event(dev, SIOCGIWTHRSPY, &wrqu, (char *) &threshold); +} + +/* ---------------------------------------------------------------- */ +/* + * Call for the driver to update the spy data. + * For now, the spy data is a simple array. As the size of the array is + * small, this is good enough. If we wanted to support larger number of + * spy addresses, we should use something more efficient... 
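+ * A driver typically calls it from its receive path, for example
+ * (hypothetical driver code; rssi, noise and sender are whatever the
+ * hardware reported for the received frame) :
+ *	struct iw_quality wstats;
+ *	wstats.level = rssi;
+ *	wstats.noise = noise;
+ *	wstats.qual = rssi - noise;
+ *	wstats.updated = IW_QUAL_QUAL_UPDATED | IW_QUAL_LEVEL_UPDATED |
+ *			 IW_QUAL_NOISE_UPDATED;
+ *	wireless_spy_update(dev, sender, &wstats);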
+ */ +void wireless_spy_update(struct net_device * dev, + unsigned char * address, + struct iw_quality * wstats) +{ + struct iw_spy_data * spydata = get_spydata(dev); + int i; + int match = -1; + + /* Make sure driver is not buggy or using the old API */ + if(!spydata) + return; + +#ifdef WE_SPY_DEBUG + printk(KERN_DEBUG "wireless_spy_update() : offset %ld, spydata %p, address %02X:%02X:%02X:%02X:%02X:%02X\n", dev->wireless_handlers->spy_offset, spydata, address[0], address[1], address[2], address[3], address[4], address[5]); +#endif /* WE_SPY_DEBUG */ + + /* Update all records that match */ + for(i = 0; i < spydata->spy_number; i++) + if(!memcmp(address, spydata->spy_address[i], ETH_ALEN)) { + memcpy(&(spydata->spy_stat[i]), wstats, + sizeof(struct iw_quality)); + match = i; + } + + /* Generate an event if we cross the spy threshold. + * To avoid event storms, we have a simple hysteresis : we generate + * event only when we go under the low threshold or above the + * high threshold. */ + if(match >= 0) { + if(spydata->spy_thr_under[match]) { + if(wstats->level > spydata->spy_thr_high.level) { + spydata->spy_thr_under[match] = 0; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } else { + if(wstats->level < spydata->spy_thr_low.level) { + spydata->spy_thr_under[match] = 1; + iw_send_thrspy_event(dev, spydata, + address, wstats); + } + } + } +} + +EXPORT_SYMBOL(iw_handler_get_spy); +EXPORT_SYMBOL(iw_handler_get_thrspy); +EXPORT_SYMBOL(iw_handler_set_spy); +EXPORT_SYMBOL(iw_handler_set_thrspy); +EXPORT_SYMBOL(wireless_send_event); +EXPORT_SYMBOL(wireless_spy_update); diff --git a/net/decnet/Kconfig b/net/decnet/Kconfig new file mode 100644 index 000000000000..2101da542ba8 --- /dev/null +++ b/net/decnet/Kconfig @@ -0,0 +1,27 @@ +# +# DECnet configuration +# +config DECNET_ROUTER + bool "DECnet: router support (EXPERIMENTAL)" + depends on DECNET && EXPERIMENTAL + ---help--- + Add support for turning your DECnet Endnode into a level 1 or 2 + router. This is an experimental, but functional option. If you + do say Y here, then make sure that you also say Y to "Kernel/User + network link driver", "Routing messages" and "Network packet + filtering". The first two are required to allow configuration via + rtnetlink (you will need Alexey Kuznetsov's iproute2 package + from ). The "Network packet + filtering" option will be required for the forthcoming routing daemon + to work. + + See for more information. + +config DECNET_ROUTE_FWMARK + bool "DECnet: use FWMARK value as routing key (EXPERIMENTAL)" + depends on DECNET_ROUTER && NETFILTER + help + If you say Y here, you will be able to specify different routes for + packets with different FWMARK ("firewalling mark") values + (see ipchains(8), "-m" argument). 
+ diff --git a/net/decnet/Makefile b/net/decnet/Makefile new file mode 100644 index 000000000000..e44003af71f6 --- /dev/null +++ b/net/decnet/Makefile @@ -0,0 +1,10 @@ + +obj-$(CONFIG_DECNET) += decnet.o + +decnet-y := af_decnet.o dn_nsp_in.o dn_nsp_out.o \ + dn_route.o dn_dev.o dn_neigh.o dn_timer.o +decnet-$(CONFIG_DECNET_ROUTER) += dn_fib.o dn_rules.o dn_table.o +decnet-y += sysctl_net_decnet.o + +obj-$(CONFIG_NETFILTER) += netfilter/ + diff --git a/net/decnet/README b/net/decnet/README new file mode 100644 index 000000000000..60e7ec88c81f --- /dev/null +++ b/net/decnet/README @@ -0,0 +1,8 @@ + Linux DECnet Project + ====================== + +The documentation for this kernel subsystem is available in the +Documentation/networking subdirectory of this distribution and also +on line at http://www.chygwyn.com/DECnet/ + +Steve Whitehouse diff --git a/net/decnet/TODO b/net/decnet/TODO new file mode 100644 index 000000000000..ebb5ac69d128 --- /dev/null +++ b/net/decnet/TODO @@ -0,0 +1,41 @@ +Steve's quick list of things that need finishing off: +[they are in no particular order and range from the trivial to the long winded] + + o Proper timeouts on each neighbour (in routing mode) rather than + just the 60 second On-Ethernet cache value. + + o Support for X.25 linklayer + + o Support for DDCMP link layer + + o The DDCMP device itself + + o PPP support (rfc1762) + + o Lots of testing with real applications + + o Verify errors etc. against POSIX 1003.1g (draft) + + o Using send/recvmsg() to get at connect/disconnect data (POSIX 1003.1g) + [maybe this should be done at socket level... the control data in the + send/recvmsg() calls should simply be a vector of set/getsockopt() + calls] + + o check MSG_CTRUNC is set where it should be. + + o Find all the commonality between DECnet and IPv4 routing code and extract + it into a small library of routines. [probably a project for 2.7.xx] + + o Add perfect socket hashing - an idea suggested by Paul Koning. Currently + we have a half-way house scheme which seems to work reasonably well, but + the full scheme is still worth implementing, its not not top of my list + right now. + + o Add session control message flow control + + o Add NSP message flow control + + o DECnet sendpages() function + + o AIO for DECnet + diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c new file mode 100644 index 000000000000..29bb3cd21965 --- /dev/null +++ b/net/decnet/af_decnet.c @@ -0,0 +1,2405 @@ + +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Socket Layer Interface + * + * Authors: Eduardo Marcelo Serrat + * Patrick Caulfield + * + * Changes: + * Steve Whitehouse: Copied from Eduardo Serrat and Patrick Caulfield's + * version of the code. Original copyright preserved + * below. + * Steve Whitehouse: Some bug fixes, cleaning up some code to make it + * compatible with my routing layer. + * Steve Whitehouse: Merging changes from Eduardo Serrat and Patrick + * Caulfield. + * Steve Whitehouse: Further bug fixes, checking module code still works + * with new routing layer. + * Steve Whitehouse: Additional set/get_sockopt() calls. + * Steve Whitehouse: Fixed TIOCINQ ioctl to be same as Eduardo's new + * code. + * Steve Whitehouse: recvmsg() changed to try and behave in a POSIX like + * way. Didn't manage it entirely, but its better. + * Steve Whitehouse: ditto for sendmsg(). 
+ * Steve Whitehouse: A selection of bug fixes to various things. + * Steve Whitehouse: Added TIOCOUTQ ioctl. + * Steve Whitehouse: Fixes to username2sockaddr & sockaddr2username. + * Steve Whitehouse: Fixes to connect() error returns. + * Patrick Caulfield: Fixes to delayed acceptance logic. + * David S. Miller: New socket locking + * Steve Whitehouse: Socket list hashing/locking + * Arnaldo C. Melo: use capable, not suser + * Steve Whitehouse: Removed unused code. Fix to use sk->allocation + * when required. + * Patrick Caulfield: /proc/net/decnet now has object name/number + * Steve Whitehouse: Fixed local port allocation, hashed sk list + * Matthew Wilcox: Fixes for dn_ioctl() + * Steve Whitehouse: New connect/accept logic to allow timeouts and + * prepare for sendpage etc. + */ + + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + +HISTORY: + +Version Kernel Date Author/Comments +------- ------ ---- --------------- +Version 0.0.1 2.0.30 01-dic-97 Eduardo Marcelo Serrat + (emserrat@geocities.com) + + First Development of DECnet Socket La- + yer for Linux. Only supports outgoing + connections. + +Version 0.0.2 2.1.105 20-jun-98 Patrick J. Caulfield + (patrick@pandh.demon.co.uk) + + Port to new kernel development version. + +Version 0.0.3 2.1.106 25-jun-98 Eduardo Marcelo Serrat + (emserrat@geocities.com) + _ + Added support for incoming connections + so we can start developing server apps + on Linux. + - + Module Support +Version 0.0.4 2.1.109 21-jul-98 Eduardo Marcelo Serrat + (emserrat@geocities.com) + _ + Added support for X11R6.4. Now we can + use DECnet transport for X on Linux!!! + - +Version 0.0.5 2.1.110 01-aug-98 Eduardo Marcelo Serrat + (emserrat@geocities.com) + Removed bugs on flow control + Removed bugs on incoming accessdata + order + - +Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat + dn_recvmsg fixes + + Patrick J. 
Caulfield + dn_bind fixes +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_sock { + struct sock sk; + struct dn_scp scp; +}; + +static void dn_keepalive(struct sock *sk); + +#define DN_SK_HASH_SHIFT 8 +#define DN_SK_HASH_SIZE (1 << DN_SK_HASH_SHIFT) +#define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) + + +static struct proto_ops dn_proto_ops; +static DEFINE_RWLOCK(dn_hash_lock); +static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; +static struct hlist_head dn_wild_sk; + +static int __dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen, int flags); +static int __dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen, int flags); + +static struct hlist_head *dn_find_list(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->addr.sdn_flags & SDF_WILD) + return hlist_empty(&dn_wild_sk) ? &dn_wild_sk : NULL; + + return &dn_sk_hash[scp->addrloc & DN_SK_HASH_MASK]; +} + +/* + * Valid ports are those greater than zero and not already in use. + */ +static int check_port(unsigned short port) +{ + struct sock *sk; + struct hlist_node *node; + + if (port == 0) + return -1; + + sk_for_each(sk, node, &dn_sk_hash[port & DN_SK_HASH_MASK]) { + struct dn_scp *scp = DN_SK(sk); + if (scp->addrloc == port) + return -1; + } + return 0; +} + +static unsigned short port_alloc(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); +static unsigned short port = 0x2000; + unsigned short i_port = port; + + while(check_port(++port) != 0) { + if (port == i_port) + return 0; + } + + scp->addrloc = port; + + return 1; +} + +/* + * Since this is only ever called from user + * level, we don't need a write_lock() version + * of this. + */ +static int dn_hash_sock(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + struct hlist_head *list; + int rv = -EUSERS; + + BUG_ON(sk_hashed(sk)); + + write_lock_bh(&dn_hash_lock); + + if (!scp->addrloc && !port_alloc(sk)) + goto out; + + rv = -EADDRINUSE; + if ((list = dn_find_list(sk)) == NULL) + goto out; + + sk_add_node(sk, list); + rv = 0; +out: + write_unlock_bh(&dn_hash_lock); + return rv; +} + +static void dn_unhash_sock(struct sock *sk) +{ + write_lock(&dn_hash_lock); + sk_del_node_init(sk); + write_unlock(&dn_hash_lock); +} + +static void dn_unhash_sock_bh(struct sock *sk) +{ + write_lock_bh(&dn_hash_lock); + sk_del_node_init(sk); + write_unlock_bh(&dn_hash_lock); +} + +static struct hlist_head *listen_hash(struct sockaddr_dn *addr) +{ + int i; + unsigned hash = addr->sdn_objnum; + + if (hash == 0) { + hash = addr->sdn_objnamel; + for(i = 0; i < dn_ntohs(addr->sdn_objnamel); i++) { + hash ^= addr->sdn_objname[i]; + hash ^= (hash << 3); + } + } + + return &dn_sk_hash[hash & DN_SK_HASH_MASK]; +} + +/* + * Called to transform a socket from bound (i.e. with a local address) + * into a listening socket (doesn't need a local port number) and rehashes + * based upon the object name/number. 
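+ * (For example, a listener bound to object number 42 always ends up in
+ * bucket (42 & DN_SK_HASH_MASK), while a listener bound by name gets its
+ * bucket derived from the name bytes by listen_hash() above.)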
+ */ +static void dn_rehash_sock(struct sock *sk) +{ + struct hlist_head *list; + struct dn_scp *scp = DN_SK(sk); + + if (scp->addr.sdn_flags & SDF_WILD) + return; + + write_lock_bh(&dn_hash_lock); + sk_del_node_init(sk); + DN_SK(sk)->addrloc = 0; + list = listen_hash(&DN_SK(sk)->addr); + sk_add_node(sk, list); + write_unlock_bh(&dn_hash_lock); +} + +int dn_sockaddr2username(struct sockaddr_dn *sdn, unsigned char *buf, unsigned char type) +{ + int len = 2; + + *buf++ = type; + + switch(type) { + case 0: + *buf++ = sdn->sdn_objnum; + break; + case 1: + *buf++ = 0; + *buf++ = dn_ntohs(sdn->sdn_objnamel); + memcpy(buf, sdn->sdn_objname, dn_ntohs(sdn->sdn_objnamel)); + len = 3 + dn_ntohs(sdn->sdn_objnamel); + break; + case 2: + memset(buf, 0, 5); + buf += 5; + *buf++ = dn_ntohs(sdn->sdn_objnamel); + memcpy(buf, sdn->sdn_objname, dn_ntohs(sdn->sdn_objnamel)); + len = 7 + dn_ntohs(sdn->sdn_objnamel); + break; + } + + return len; +} + +/* + * On reception of usernames, we handle types 1 and 0 for destination + * addresses only. Types 2 and 4 are used for source addresses, but the + * UIC, GIC are ignored and they are both treated the same way. Type 3 + * is never used as I've no idea what its purpose might be or what its + * format is. + */ +int dn_username2sockaddr(unsigned char *data, int len, struct sockaddr_dn *sdn, unsigned char *fmt) +{ + unsigned char type; + int size = len; + int namel = 12; + + sdn->sdn_objnum = 0; + sdn->sdn_objnamel = dn_htons(0); + memset(sdn->sdn_objname, 0, DN_MAXOBJL); + + if (len < 2) + return -1; + + len -= 2; + *fmt = *data++; + type = *data++; + + switch(*fmt) { + case 0: + sdn->sdn_objnum = type; + return 2; + case 1: + namel = 16; + break; + case 2: + len -= 4; + data += 4; + break; + case 4: + len -= 8; + data += 8; + break; + default: + return -1; + } + + len -= 1; + + if (len < 0) + return -1; + + sdn->sdn_objnamel = dn_htons(*data++); + len -= dn_ntohs(sdn->sdn_objnamel); + + if ((len < 0) || (dn_ntohs(sdn->sdn_objnamel) > namel)) + return -1; + + memcpy(sdn->sdn_objname, data, dn_ntohs(sdn->sdn_objnamel)); + + return size - len; +} + +struct sock *dn_sklist_find_listener(struct sockaddr_dn *addr) +{ + struct hlist_head *list = listen_hash(addr); + struct hlist_node *node; + struct sock *sk; + + read_lock(&dn_hash_lock); + sk_for_each(sk, node, list) { + struct dn_scp *scp = DN_SK(sk); + if (sk->sk_state != TCP_LISTEN) + continue; + if (scp->addr.sdn_objnum) { + if (scp->addr.sdn_objnum != addr->sdn_objnum) + continue; + } else { + if (addr->sdn_objnum) + continue; + if (scp->addr.sdn_objnamel != addr->sdn_objnamel) + continue; + if (memcmp(scp->addr.sdn_objname, addr->sdn_objname, dn_ntohs(addr->sdn_objnamel)) != 0) + continue; + } + sock_hold(sk); + read_unlock(&dn_hash_lock); + return sk; + } + + sk = sk_head(&dn_wild_sk); + if (sk) { + if (sk->sk_state == TCP_LISTEN) + sock_hold(sk); + else + sk = NULL; + } + + read_unlock(&dn_hash_lock); + return sk; +} + +struct sock *dn_find_by_skb(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct sock *sk; + struct hlist_node *node; + struct dn_scp *scp; + + read_lock(&dn_hash_lock); + sk_for_each(sk, node, &dn_sk_hash[cb->dst_port & DN_SK_HASH_MASK]) { + scp = DN_SK(sk); + if (cb->src != dn_saddr2dn(&scp->peer)) + continue; + if (cb->dst_port != scp->addrloc) + continue; + if (scp->addrrem && (cb->src_port != scp->addrrem)) + continue; + sock_hold(sk); + goto found; + } + sk = NULL; +found: + read_unlock(&dn_hash_lock); + return sk; +} + + + +static void dn_destruct(struct sock *sk) +{ + 
struct dn_scp *scp = DN_SK(sk); + + skb_queue_purge(&scp->data_xmit_queue); + skb_queue_purge(&scp->other_xmit_queue); + skb_queue_purge(&scp->other_receive_queue); + + dst_release(xchg(&sk->sk_dst_cache, NULL)); +} + +static struct proto dn_proto = { + .name = "DECNET", + .owner = THIS_MODULE, + .obj_size = sizeof(struct dn_sock), +}; + +static struct sock *dn_alloc_sock(struct socket *sock, int gfp) +{ + struct dn_scp *scp; + struct sock *sk = sk_alloc(PF_DECnet, gfp, &dn_proto, 1); + + if (!sk) + goto out; + + if (sock) + sock->ops = &dn_proto_ops; + sock_init_data(sock, sk); + + sk->sk_backlog_rcv = dn_nsp_backlog_rcv; + sk->sk_destruct = dn_destruct; + sk->sk_no_check = 1; + sk->sk_family = PF_DECnet; + sk->sk_protocol = 0; + sk->sk_allocation = gfp; + + /* Initialization of DECnet Session Control Port */ + scp = DN_SK(sk); + scp->state = DN_O; /* Open */ + scp->numdat = 1; /* Next data seg to tx */ + scp->numoth = 1; /* Next oth data to tx */ + scp->ackxmt_dat = 0; /* Last data seg ack'ed */ + scp->ackxmt_oth = 0; /* Last oth data ack'ed */ + scp->ackrcv_dat = 0; /* Highest data ack recv*/ + scp->ackrcv_oth = 0; /* Last oth data ack rec*/ + scp->flowrem_sw = DN_SEND; + scp->flowloc_sw = DN_SEND; + scp->flowrem_dat = 0; + scp->flowrem_oth = 1; + scp->flowloc_dat = 0; + scp->flowloc_oth = 1; + scp->services_rem = 0; + scp->services_loc = 1 | NSP_FC_NONE; + scp->info_rem = 0; + scp->info_loc = 0x03; /* NSP version 4.1 */ + scp->segsize_rem = 230 - DN_MAX_NSP_DATA_HEADER; /* Default: Updated by remote segsize */ + scp->nonagle = 0; + scp->multi_ireq = 1; + scp->accept_mode = ACC_IMMED; + scp->addr.sdn_family = AF_DECnet; + scp->peer.sdn_family = AF_DECnet; + scp->accessdata.acc_accl = 5; + memcpy(scp->accessdata.acc_acc, "LINUX", 5); + + scp->max_window = NSP_MAX_WINDOW; + scp->snd_window = NSP_MIN_WINDOW; + scp->nsp_srtt = NSP_INITIAL_SRTT; + scp->nsp_rttvar = NSP_INITIAL_RTTVAR; + scp->nsp_rxtshift = 0; + + skb_queue_head_init(&scp->data_xmit_queue); + skb_queue_head_init(&scp->other_xmit_queue); + skb_queue_head_init(&scp->other_receive_queue); + + scp->persist = 0; + scp->persist_fxn = NULL; + scp->keepalive = 10 * HZ; + scp->keepalive_fxn = dn_keepalive; + + init_timer(&scp->delack_timer); + scp->delack_pending = 0; + scp->delack_fxn = dn_nsp_delayed_ack; + + dn_start_slow_timer(sk); +out: + return sk; +} + +/* + * Keepalive timer. + * FIXME: Should respond to SO_KEEPALIVE etc. + */ +static void dn_keepalive(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + /* + * By checking the other_data transmit queue is empty + * we are double checking that we are not sending too + * many of these keepalive frames. + */ + if (skb_queue_len(&scp->other_xmit_queue) == 0) + dn_nsp_send_link(sk, DN_NOCHANGE, 0); +} + + +/* + * Timer for shutdown/destroyed sockets. + * When socket is dead & no packets have been sent for a + * certain amount of time, they are removed by this + * routine. Also takes care of sending out DI & DC + * frames at correct times. 
+ */ +int dn_destroy_timer(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + scp->persist = dn_nsp_persist(sk); + + switch(scp->state) { + case DN_DI: + dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); + if (scp->nsp_rxtshift >= decnet_di_count) + scp->state = DN_CN; + return 0; + + case DN_DR: + dn_nsp_send_disc(sk, NSP_DISCINIT, 0, GFP_ATOMIC); + if (scp->nsp_rxtshift >= decnet_dr_count) + scp->state = DN_DRC; + return 0; + + case DN_DN: + if (scp->nsp_rxtshift < decnet_dn_count) { + /* printk(KERN_DEBUG "dn_destroy_timer: DN\n"); */ + dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC); + return 0; + } + } + + scp->persist = (HZ * decnet_time_wait); + + if (sk->sk_socket) + return 0; + + if ((jiffies - scp->stamp) >= (HZ * decnet_time_wait)) { + dn_unhash_sock(sk); + sock_put(sk); + return 1; + } + + return 0; +} + +static void dn_destroy_sock(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + scp->nsp_rxtshift = 0; /* reset back off */ + + if (sk->sk_socket) { + if (sk->sk_socket->state != SS_UNCONNECTED) + sk->sk_socket->state = SS_DISCONNECTING; + } + + sk->sk_state = TCP_CLOSE; + + switch(scp->state) { + case DN_DN: + dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, + sk->sk_allocation); + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + break; + case DN_CR: + scp->state = DN_DR; + goto disc_reject; + case DN_RUN: + scp->state = DN_DI; + case DN_DI: + case DN_DR: +disc_reject: + dn_nsp_send_disc(sk, NSP_DISCINIT, 0, sk->sk_allocation); + case DN_NC: + case DN_NR: + case DN_RJ: + case DN_DIC: + case DN_CN: + case DN_DRC: + case DN_CI: + case DN_CD: + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + break; + default: + printk(KERN_DEBUG "DECnet: dn_destroy_sock passed socket in invalid state\n"); + case DN_O: + dn_stop_slow_timer(sk); + + dn_unhash_sock_bh(sk); + sock_put(sk); + + break; + } +} + +char *dn_addr2asc(dn_address addr, char *buf) +{ + unsigned short node, area; + + node = addr & 0x03ff; + area = addr >> 10; + sprintf(buf, "%hd.%hd", area, node); + + return buf; +} + + + +static int dn_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + switch(sock->type) { + case SOCK_SEQPACKET: + if (protocol != DNPROTO_NSP) + return -EPROTONOSUPPORT; + break; + case SOCK_STREAM: + break; + default: + return -ESOCKTNOSUPPORT; + } + + + if ((sk = dn_alloc_sock(sock, GFP_KERNEL)) == NULL) + return -ENOBUFS; + + sk->sk_protocol = protocol; + + return 0; +} + + +static int +dn_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + sock_orphan(sk); + sock_hold(sk); + lock_sock(sk); + dn_destroy_sock(sk); + release_sock(sk); + sock_put(sk); + } + + return 0; +} + +static int dn_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + struct sockaddr_dn *saddr = (struct sockaddr_dn *)uaddr; + struct net_device *dev; + int rv; + + if (addr_len != sizeof(struct sockaddr_dn)) + return -EINVAL; + + if (saddr->sdn_family != AF_DECnet) + return -EINVAL; + + if (dn_ntohs(saddr->sdn_nodeaddrl) && (dn_ntohs(saddr->sdn_nodeaddrl) != 2)) + return -EINVAL; + + if (dn_ntohs(saddr->sdn_objnamel) > DN_MAXOBJL) + return -EINVAL; + + if (saddr->sdn_flags & ~SDF_WILD) + return -EINVAL; + +#if 1 + if (!capable(CAP_NET_BIND_SERVICE) && (saddr->sdn_objnum || + (saddr->sdn_flags & SDF_WILD))) + return -EACCES; +#else + /* + * Maybe put the default actions in the default security ops for + * dn_prot_sock ? 
Would be nice if the capable call would go there + * too. + */ + if (security_dn_prot_sock(saddr) && + !capable(CAP_NET_BIND_SERVICE) || + saddr->sdn_objnum || (saddr->sdn_flags & SDF_WILD)) + return -EACCES; +#endif + + + if (!(saddr->sdn_flags & SDF_WILD)) { + if (dn_ntohs(saddr->sdn_nodeaddrl)) { + read_lock(&dev_base_lock); + for(dev = dev_base; dev; dev = dev->next) { + if (!dev->dn_ptr) + continue; + if (dn_dev_islocal(dev, dn_saddr2dn(saddr))) + break; + } + read_unlock(&dev_base_lock); + if (dev == NULL) + return -EADDRNOTAVAIL; + } + } + + rv = -EINVAL; + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) { + memcpy(&scp->addr, saddr, addr_len); + sock_reset_flag(sk, SOCK_ZAPPED); + + rv = dn_hash_sock(sk); + if (rv) + sock_set_flag(sk, SOCK_ZAPPED); + } + release_sock(sk); + + return rv; +} + + +static int dn_auto_bind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int rv; + + sock_reset_flag(sk, SOCK_ZAPPED); + + scp->addr.sdn_flags = 0; + scp->addr.sdn_objnum = 0; + + /* + * This stuff is to keep compatibility with Eduardo's + * patch. I hope I can dispense with it shortly... + */ + if ((scp->accessdata.acc_accl != 0) && + (scp->accessdata.acc_accl <= 12)) { + + scp->addr.sdn_objnamel = dn_htons(scp->accessdata.acc_accl); + memcpy(scp->addr.sdn_objname, scp->accessdata.acc_acc, dn_ntohs(scp->addr.sdn_objnamel)); + + scp->accessdata.acc_accl = 0; + memset(scp->accessdata.acc_acc, 0, 40); + } + /* End of compatibility stuff */ + + scp->addr.sdn_add.a_len = dn_htons(2); + rv = dn_dev_bind_default((dn_address *)scp->addr.sdn_add.a_addr); + if (rv == 0) { + rv = dn_hash_sock(sk); + if (rv) + sock_set_flag(sk, SOCK_ZAPPED); + } + + return rv; +} + +static int dn_confirm_accept(struct sock *sk, long *timeo, int allocation) +{ + struct dn_scp *scp = DN_SK(sk); + DEFINE_WAIT(wait); + int err; + + if (scp->state != DN_CR) + return -EINVAL; + + scp->state = DN_CC; + scp->segsize_loc = dst_metric(__sk_dst_get(sk), RTAX_ADVMSS); + dn_send_conn_conf(sk, allocation); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + for(;;) { + release_sock(sk); + if (scp->state == DN_CC) + *timeo = schedule_timeout(*timeo); + lock_sock(sk); + err = 0; + if (scp->state == DN_RUN) + break; + err = sock_error(sk); + if (err) + break; + err = sock_intr_errno(*timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!*timeo) + break; + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + if (err == 0) { + sk->sk_socket->state = SS_CONNECTED; + } else if (scp->state != DN_CC) { + sk->sk_socket->state = SS_UNCONNECTED; + } + return err; +} + +static int dn_wait_run(struct sock *sk, long *timeo) +{ + struct dn_scp *scp = DN_SK(sk); + DEFINE_WAIT(wait); + int err = 0; + + if (scp->state == DN_RUN) + goto out; + + if (!*timeo) + return -EALREADY; + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + for(;;) { + release_sock(sk); + if (scp->state == DN_CI || scp->state == DN_CC) + *timeo = schedule_timeout(*timeo); + lock_sock(sk); + err = 0; + if (scp->state == DN_RUN) + break; + err = sock_error(sk); + if (err) + break; + err = sock_intr_errno(*timeo); + if (signal_pending(current)) + break; + err = -ETIMEDOUT; + if (!*timeo) + break; + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); +out: + if (err == 0) { + sk->sk_socket->state = SS_CONNECTED; + } else if (scp->state != DN_CI && scp->state != DN_CC) { + sk->sk_socket->state = SS_UNCONNECTED; 
+ } + return err; +} + +static int __dn_connect(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) +{ + struct socket *sock = sk->sk_socket; + struct dn_scp *scp = DN_SK(sk); + int err = -EISCONN; + struct flowi fl; + + if (sock->state == SS_CONNECTED) + goto out; + + if (sock->state == SS_CONNECTING) { + err = 0; + if (scp->state == DN_RUN) { + sock->state = SS_CONNECTED; + goto out; + } + err = -ECONNREFUSED; + if (scp->state != DN_CI && scp->state != DN_CC) { + sock->state = SS_UNCONNECTED; + goto out; + } + return dn_wait_run(sk, timeo); + } + + err = -EINVAL; + if (scp->state != DN_O) + goto out; + + if (addr == NULL || addrlen != sizeof(struct sockaddr_dn)) + goto out; + if (addr->sdn_family != AF_DECnet) + goto out; + if (addr->sdn_flags & SDF_WILD) + goto out; + + if (sock_flag(sk, SOCK_ZAPPED)) { + err = dn_auto_bind(sk->sk_socket); + if (err) + goto out; + } + + memcpy(&scp->peer, addr, sizeof(struct sockaddr_dn)); + + err = -EHOSTUNREACH; + memset(&fl, 0, sizeof(fl)); + fl.oif = sk->sk_bound_dev_if; + fl.fld_dst = dn_saddr2dn(&scp->peer); + fl.fld_src = dn_saddr2dn(&scp->addr); + dn_sk_ports_copy(&fl, scp); + fl.proto = DNPROTO_NSP; + if (dn_route_output_sock(&sk->sk_dst_cache, &fl, sk, flags) < 0) + goto out; + sk->sk_route_caps = sk->sk_dst_cache->dev->features; + sock->state = SS_CONNECTING; + scp->state = DN_CI; + scp->segsize_loc = dst_metric(sk->sk_dst_cache, RTAX_ADVMSS); + + dn_nsp_send_conninit(sk, NSP_CI); + err = -EINPROGRESS; + if (*timeo) { + err = dn_wait_run(sk, timeo); + } +out: + return err; +} + +static int dn_connect(struct socket *sock, struct sockaddr *uaddr, int addrlen, int flags) +{ + struct sockaddr_dn *addr = (struct sockaddr_dn *)uaddr; + struct sock *sk = sock->sk; + int err; + long timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + lock_sock(sk); + err = __dn_connect(sk, addr, addrlen, &timeo, 0); + release_sock(sk); + + return err; +} + +static inline int dn_check_state(struct sock *sk, struct sockaddr_dn *addr, int addrlen, long *timeo, int flags) +{ + struct dn_scp *scp = DN_SK(sk); + + switch(scp->state) { + case DN_RUN: + return 0; + case DN_CR: + return dn_confirm_accept(sk, timeo, sk->sk_allocation); + case DN_CI: + case DN_CC: + return dn_wait_run(sk, timeo); + case DN_O: + return __dn_connect(sk, addr, addrlen, timeo, flags); + } + + return -EINVAL; +} + + +static void dn_access_copy(struct sk_buff *skb, struct accessdata_dn *acc) +{ + unsigned char *ptr = skb->data; + + acc->acc_userl = *ptr++; + memcpy(&acc->acc_user, ptr, acc->acc_userl); + ptr += acc->acc_userl; + + acc->acc_passl = *ptr++; + memcpy(&acc->acc_pass, ptr, acc->acc_passl); + ptr += acc->acc_passl; + + acc->acc_accl = *ptr++; + memcpy(&acc->acc_acc, ptr, acc->acc_accl); + + skb_pull(skb, acc->acc_accl + acc->acc_passl + acc->acc_userl + 3); + +} + +static void dn_user_copy(struct sk_buff *skb, struct optdata_dn *opt) +{ + unsigned char *ptr = skb->data; + + opt->opt_optl = *ptr++; + opt->opt_status = 0; + memcpy(opt->opt_data, ptr, opt->opt_optl); + skb_pull(skb, opt->opt_optl + 1); + +} + +static struct sk_buff *dn_wait_for_connect(struct sock *sk, long *timeo) +{ + DEFINE_WAIT(wait); + struct sk_buff *skb = NULL; + int err = 0; + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + for(;;) { + release_sock(sk); + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) { + *timeo = schedule_timeout(*timeo); + skb = skb_dequeue(&sk->sk_receive_queue); + } + lock_sock(sk); + if (skb != NULL) + break; + err = -EINVAL; + if 
(sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(*timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!*timeo) + break; + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + + return skb == NULL ? ERR_PTR(err) : skb; +} + +static int dn_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk, *newsk; + struct sk_buff *skb = NULL; + struct dn_skb_cb *cb; + unsigned char menuver; + int err = 0; + unsigned char type; + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + lock_sock(sk); + + if (sk->sk_state != TCP_LISTEN || DN_SK(sk)->state != DN_O) { + release_sock(sk); + return -EINVAL; + } + + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) { + skb = dn_wait_for_connect(sk, &timeo); + if (IS_ERR(skb)) { + release_sock(sk); + return PTR_ERR(skb); + } + } + + cb = DN_SKB_CB(skb); + sk->sk_ack_backlog--; + newsk = dn_alloc_sock(newsock, sk->sk_allocation); + if (newsk == NULL) { + release_sock(sk); + kfree_skb(skb); + return -ENOBUFS; + } + release_sock(sk); + + dst_release(xchg(&newsk->sk_dst_cache, skb->dst)); + skb->dst = NULL; + + DN_SK(newsk)->state = DN_CR; + DN_SK(newsk)->addrrem = cb->src_port; + DN_SK(newsk)->services_rem = cb->services; + DN_SK(newsk)->info_rem = cb->info; + DN_SK(newsk)->segsize_rem = cb->segsize; + DN_SK(newsk)->accept_mode = DN_SK(sk)->accept_mode; + + if (DN_SK(newsk)->segsize_rem < 230) + DN_SK(newsk)->segsize_rem = 230; + + if ((DN_SK(newsk)->services_rem & NSP_FC_MASK) == NSP_FC_NONE) + DN_SK(newsk)->max_window = decnet_no_fc_max_cwnd; + + newsk->sk_state = TCP_LISTEN; + memcpy(&(DN_SK(newsk)->addr), &(DN_SK(sk)->addr), sizeof(struct sockaddr_dn)); + + /* + * If we are listening on a wild socket, we don't want + * the newly created socket on the wrong hash queue. + */ + DN_SK(newsk)->addr.sdn_flags &= ~SDF_WILD; + + skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->addr), &type)); + skb_pull(skb, dn_username2sockaddr(skb->data, skb->len, &(DN_SK(newsk)->peer), &type)); + *(dn_address *)(DN_SK(newsk)->peer.sdn_add.a_addr) = cb->src; + *(dn_address *)(DN_SK(newsk)->addr.sdn_add.a_addr) = cb->dst; + + menuver = *skb->data; + skb_pull(skb, 1); + + if (menuver & DN_MENUVER_ACC) + dn_access_copy(skb, &(DN_SK(newsk)->accessdata)); + + if (menuver & DN_MENUVER_USR) + dn_user_copy(skb, &(DN_SK(newsk)->conndata_in)); + + if (menuver & DN_MENUVER_PRX) + DN_SK(newsk)->peer.sdn_flags |= SDF_PROXY; + + if (menuver & DN_MENUVER_UIC) + DN_SK(newsk)->peer.sdn_flags |= SDF_UICPROXY; + + kfree_skb(skb); + + memcpy(&(DN_SK(newsk)->conndata_out), &(DN_SK(sk)->conndata_out), + sizeof(struct optdata_dn)); + memcpy(&(DN_SK(newsk)->discdata_out), &(DN_SK(sk)->discdata_out), + sizeof(struct optdata_dn)); + + lock_sock(newsk); + err = dn_hash_sock(newsk); + if (err == 0) { + sock_reset_flag(newsk, SOCK_ZAPPED); + dn_send_conn_ack(newsk); + + /* + * Here we use sk->sk_allocation since although the conn conf is + * for the newsk, the context is the old socket. 
+ */ + if (DN_SK(newsk)->accept_mode == ACC_IMMED) + err = dn_confirm_accept(newsk, &timeo, + sk->sk_allocation); + } + release_sock(newsk); + return err; +} + + +static int dn_getname(struct socket *sock, struct sockaddr *uaddr,int *uaddr_len,int peer) +{ + struct sockaddr_dn *sa = (struct sockaddr_dn *)uaddr; + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + + *uaddr_len = sizeof(struct sockaddr_dn); + + lock_sock(sk); + + if (peer) { + if ((sock->state != SS_CONNECTED && + sock->state != SS_CONNECTING) && + scp->accept_mode == ACC_IMMED) + return -ENOTCONN; + + memcpy(sa, &scp->peer, sizeof(struct sockaddr_dn)); + } else { + memcpy(sa, &scp->addr, sizeof(struct sockaddr_dn)); + } + + release_sock(sk); + + return 0; +} + + +static unsigned int dn_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int mask = datagram_poll(file, sock, wait); + + if (skb_queue_len(&scp->other_receive_queue)) + mask |= POLLRDBAND; + + return mask; +} + +static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int err = -EOPNOTSUPP; + long amount = 0; + struct sk_buff *skb; + int val; + + switch(cmd) + { + case SIOCGIFADDR: + case SIOCSIFADDR: + return dn_dev_ioctl(cmd, (void __user *)arg); + + case SIOCATMARK: + lock_sock(sk); + val = (skb_queue_len(&scp->other_receive_queue) != 0); + if (scp->state != DN_RUN) + val = -ENOTCONN; + release_sock(sk); + return val; + + case TIOCOUTQ: + amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + err = put_user(amount, (int __user *)arg); + break; + + case TIOCINQ: + lock_sock(sk); + if ((skb = skb_peek(&scp->other_receive_queue)) != NULL) { + amount = skb->len; + } else { + struct sk_buff *skb = sk->sk_receive_queue.next; + for(;;) { + if (skb == + (struct sk_buff *)&sk->sk_receive_queue) + break; + amount += skb->len; + skb = skb->next; + } + } + release_sock(sk); + err = put_user(amount, (int __user *)arg); + break; + + default: + err = dev_ioctl(cmd, (void __user *)arg); + break; + } + + return err; +} + +static int dn_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int err = -EINVAL; + + lock_sock(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + + if ((DN_SK(sk)->state != DN_O) || (sk->sk_state == TCP_LISTEN)) + goto out; + + sk->sk_max_ack_backlog = backlog; + sk->sk_ack_backlog = 0; + sk->sk_state = TCP_LISTEN; + err = 0; + dn_rehash_sock(sk); + +out: + release_sock(sk); + + return err; +} + + +static int dn_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + int err = -ENOTCONN; + + lock_sock(sk); + + if (sock->state == SS_UNCONNECTED) + goto out; + + err = 0; + if (sock->state == SS_DISCONNECTING) + goto out; + + err = -EINVAL; + if (scp->state == DN_O) + goto out; + + if (how != SHUTDOWN_MASK) + goto out; + + sk->sk_shutdown = how; + dn_destroy_sock(sk); + err = 0; + +out: + release_sock(sk); + + return err; +} + +static int dn_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + err = __dn_setsockopt(sock, level, optname, optval, optlen, 0); + release_sock(sk); + + return err; +} + +static int __dn_setsockopt(struct socket *sock, int level,int optname, char __user *optval, int optlen, int flags) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = 
DN_SK(sk); + long timeo; + union { + struct optdata_dn opt; + struct accessdata_dn acc; + int mode; + unsigned long win; + int val; + unsigned char services; + unsigned char info; + } u; + int err; + + if (optlen && !optval) + return -EINVAL; + + if (optlen > sizeof(u)) + return -EINVAL; + + if (copy_from_user(&u, optval, optlen)) + return -EFAULT; + + switch(optname) { + case DSO_CONDATA: + if (sock->state == SS_CONNECTED) + return -EISCONN; + if ((scp->state != DN_O) && (scp->state != DN_CR)) + return -EINVAL; + + if (optlen != sizeof(struct optdata_dn)) + return -EINVAL; + + if (u.opt.opt_optl > 16) + return -EINVAL; + + memcpy(&scp->conndata_out, &u.opt, optlen); + break; + + case DSO_DISDATA: + if (sock->state != SS_CONNECTED && scp->accept_mode == ACC_IMMED) + return -ENOTCONN; + + if (optlen != sizeof(struct optdata_dn)) + return -EINVAL; + + if (u.opt.opt_optl > 16) + return -EINVAL; + + memcpy(&scp->discdata_out, &u.opt, optlen); + break; + + case DSO_CONACCESS: + if (sock->state == SS_CONNECTED) + return -EISCONN; + if (scp->state != DN_O) + return -EINVAL; + + if (optlen != sizeof(struct accessdata_dn)) + return -EINVAL; + + if ((u.acc.acc_accl > DN_MAXACCL) || + (u.acc.acc_passl > DN_MAXACCL) || + (u.acc.acc_userl > DN_MAXACCL)) + return -EINVAL; + + memcpy(&scp->accessdata, &u.acc, optlen); + break; + + case DSO_ACCEPTMODE: + if (sock->state == SS_CONNECTED) + return -EISCONN; + if (scp->state != DN_O) + return -EINVAL; + + if (optlen != sizeof(int)) + return -EINVAL; + + if ((u.mode != ACC_IMMED) && (u.mode != ACC_DEFER)) + return -EINVAL; + + scp->accept_mode = (unsigned char)u.mode; + break; + + case DSO_CONACCEPT: + + if (scp->state != DN_CR) + return -EINVAL; + timeo = sock_rcvtimeo(sk, 0); + err = dn_confirm_accept(sk, &timeo, sk->sk_allocation); + return err; + + case DSO_CONREJECT: + + if (scp->state != DN_CR) + return -EINVAL; + + scp->state = DN_DR; + sk->sk_shutdown = SHUTDOWN_MASK; + dn_nsp_send_disc(sk, 0x38, 0, sk->sk_allocation); + break; + + default: +#ifdef CONFIG_NETFILTER + return nf_setsockopt(sk, PF_DECnet, optname, optval, optlen); +#endif + case DSO_LINKINFO: + case DSO_STREAM: + case DSO_SEQPACKET: + return -ENOPROTOOPT; + + case DSO_MAXWINDOW: + if (optlen != sizeof(unsigned long)) + return -EINVAL; + if (u.win > NSP_MAX_WINDOW) + u.win = NSP_MAX_WINDOW; + if (u.win == 0) + return -EINVAL; + scp->max_window = u.win; + if (scp->snd_window > u.win) + scp->snd_window = u.win; + break; + + case DSO_NODELAY: + if (optlen != sizeof(int)) + return -EINVAL; + if (scp->nonagle == 2) + return -EINVAL; + scp->nonagle = (u.val == 0) ? 0 : 1; + /* if (scp->nonagle == 1) { Push pending frames } */ + break; + + case DSO_CORK: + if (optlen != sizeof(int)) + return -EINVAL; + if (scp->nonagle == 1) + return -EINVAL; + scp->nonagle = (u.val == 0) ? 
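/* nonagle: 0 = neither set, 1 = nodelay, 2 = cork */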
0 : 2; + /* if (scp->nonagle == 0) { Push pending frames } */ + break; + + case DSO_SERVICES: + if (optlen != sizeof(unsigned char)) + return -EINVAL; + if ((u.services & ~NSP_FC_MASK) != 0x01) + return -EINVAL; + if ((u.services & NSP_FC_MASK) == NSP_FC_MASK) + return -EINVAL; + scp->services_loc = u.services; + break; + + case DSO_INFO: + if (optlen != sizeof(unsigned char)) + return -EINVAL; + if (u.info & 0xfc) + return -EINVAL; + scp->info_loc = u.info; + break; + } + + return 0; +} + +static int dn_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int err; + + lock_sock(sk); + err = __dn_getsockopt(sock, level, optname, optval, optlen, 0); + release_sock(sk); + + return err; +} + +static int __dn_getsockopt(struct socket *sock, int level,int optname, char __user *optval,int __user *optlen, int flags) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + struct linkinfo_dn link; + unsigned int r_len; + void *r_data = NULL; + unsigned int val; + + if(get_user(r_len , optlen)) + return -EFAULT; + + switch(optname) { + case DSO_CONDATA: + if (r_len > sizeof(struct optdata_dn)) + r_len = sizeof(struct optdata_dn); + r_data = &scp->conndata_in; + break; + + case DSO_DISDATA: + if (r_len > sizeof(struct optdata_dn)) + r_len = sizeof(struct optdata_dn); + r_data = &scp->discdata_in; + break; + + case DSO_CONACCESS: + if (r_len > sizeof(struct accessdata_dn)) + r_len = sizeof(struct accessdata_dn); + r_data = &scp->accessdata; + break; + + case DSO_ACCEPTMODE: + if (r_len > sizeof(unsigned char)) + r_len = sizeof(unsigned char); + r_data = &scp->accept_mode; + break; + + case DSO_LINKINFO: + if (r_len > sizeof(struct linkinfo_dn)) + r_len = sizeof(struct linkinfo_dn); + + switch(sock->state) { + case SS_CONNECTING: + link.idn_linkstate = LL_CONNECTING; + break; + case SS_DISCONNECTING: + link.idn_linkstate = LL_DISCONNECTING; + break; + case SS_CONNECTED: + link.idn_linkstate = LL_RUNNING; + break; + default: + link.idn_linkstate = LL_INACTIVE; + } + + link.idn_segsize = scp->segsize_rem; + r_data = &link; + break; + + default: +#ifdef CONFIG_NETFILTER + { + int val, len; + + if(get_user(len, optlen)) + return -EFAULT; + + val = nf_getsockopt(sk, PF_DECnet, optname, + optval, &len); + if (val >= 0) + val = put_user(len, optlen); + return val; + } +#endif + case DSO_STREAM: + case DSO_SEQPACKET: + case DSO_CONACCEPT: + case DSO_CONREJECT: + return -ENOPROTOOPT; + + case DSO_MAXWINDOW: + if (r_len > sizeof(unsigned long)) + r_len = sizeof(unsigned long); + r_data = &scp->max_window; + break; + + case DSO_NODELAY: + if (r_len > sizeof(int)) + r_len = sizeof(int); + val = (scp->nonagle == 1); + r_data = &val; + break; + + case DSO_CORK: + if (r_len > sizeof(int)) + r_len = sizeof(int); + val = (scp->nonagle == 2); + r_data = &val; + break; + + case DSO_SERVICES: + if (r_len > sizeof(unsigned char)) + r_len = sizeof(unsigned char); + r_data = &scp->services_rem; + break; + + case DSO_INFO: + if (r_len > sizeof(unsigned char)) + r_len = sizeof(unsigned char); + r_data = &scp->info_rem; + break; + } + + if (r_data) { + if (copy_to_user(optval, r_data, r_len)) + return -EFAULT; + if (put_user(r_len, optlen)) + return -EFAULT; + } + + return 0; +} + + +static int dn_data_ready(struct sock *sk, struct sk_buff_head *q, int flags, int target) +{ + struct sk_buff *skb = q->next; + int len = 0; + + if (flags & MSG_OOB) + return skb_queue_len(q) ? 
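/* with MSG_OOB set, readability simply means the queue is non-empty */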
1 : 0; + + while(skb != (struct sk_buff *)q) { + struct dn_skb_cb *cb = DN_SKB_CB(skb); + len += skb->len; + + if (cb->nsp_flags & 0x40) { + /* SOCK_SEQPACKET reads to EOM */ + if (sk->sk_type == SOCK_SEQPACKET) + return 1; + /* so does SOCK_STREAM unless WAITALL is specified */ + if (!(flags & MSG_WAITALL)) + return 1; + } + + /* minimum data length for read exceeded */ + if (len >= target) + return 1; + + skb = skb->next; + } + + return 0; +} + + +static int dn_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + struct sk_buff_head *queue = &sk->sk_receive_queue; + size_t target = size > 1 ? 1 : 0; + size_t copied = 0; + int rv = 0; + struct sk_buff *skb, *nskb; + struct dn_skb_cb *cb = NULL; + unsigned char eor = 0; + long timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + lock_sock(sk); + + if (sock_flag(sk, SOCK_ZAPPED)) { + rv = -EADDRNOTAVAIL; + goto out; + } + + rv = dn_check_state(sk, NULL, 0, &timeo, flags); + if (rv) + goto out; + + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (!(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + rv = -EPIPE; + goto out; + } + + if (flags & ~(MSG_PEEK|MSG_OOB|MSG_WAITALL|MSG_DONTWAIT|MSG_NOSIGNAL)) { + rv = -EOPNOTSUPP; + goto out; + } + + if (flags & MSG_OOB) + queue = &scp->other_receive_queue; + + if (flags & MSG_WAITALL) + target = size; + + + /* + * See if there is data ready to read, sleep if there isn't + */ + for(;;) { + if (sk->sk_err) + goto out; + + if (skb_queue_len(&scp->other_receive_queue)) { + if (!(flags & MSG_OOB)) { + msg->msg_flags |= MSG_OOB; + if (!scp->other_report) { + scp->other_report = 1; + goto out; + } + } + } + + if (scp->state != DN_RUN) + goto out; + + if (signal_pending(current)) { + rv = sock_intr_errno(timeo); + goto out; + } + + if (dn_data_ready(sk, queue, flags, target)) + break; + + if (flags & MSG_DONTWAIT) { + rv = -EWOULDBLOCK; + goto out; + } + + set_bit(SOCK_ASYNC_WAITDATA, &sock->flags); + SOCK_SLEEP_PRE(sk) + + if (!dn_data_ready(sk, queue, flags, target)) + schedule(); + + SOCK_SLEEP_POST(sk) + clear_bit(SOCK_ASYNC_WAITDATA, &sock->flags); + } + + for(skb = queue->next; skb != (struct sk_buff *)queue; skb = nskb) { + unsigned int chunk = skb->len; + cb = DN_SKB_CB(skb); + + if ((chunk + copied) > size) + chunk = size - copied; + + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + rv = -EFAULT; + break; + } + copied += chunk; + + if (!(flags & MSG_PEEK)) + skb_pull(skb, chunk); + + eor = cb->nsp_flags & 0x40; + nskb = skb->next; + + if (skb->len == 0) { + skb_unlink(skb); + kfree_skb(skb); + /* + * N.B. Don't refer to skb or cb after this point + * in loop. + */ + if ((scp->flowloc_sw == DN_DONTSEND) && !dn_congested(sk)) { + scp->flowloc_sw = DN_SEND; + dn_nsp_send_link(sk, DN_SEND, 0); + } + } + + if (eor) { + if (sk->sk_type == SOCK_SEQPACKET) + break; + if (!(flags & MSG_WAITALL)) + break; + } + + if (flags & MSG_OOB) + break; + + if (copied >= target) + break; + } + + rv = copied; + + + if (eor && (sk->sk_type == SOCK_SEQPACKET)) + msg->msg_flags |= MSG_EOR; + +out: + if (rv == 0) + rv = (flags & MSG_PEEK) ? 
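/* MSG_PEEK reports a pending error without clearing it; sock_error() consumes sk_err */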
-sk->sk_err : sock_error(sk); + + if ((rv >= 0) && msg->msg_name) { + memcpy(msg->msg_name, &scp->peer, sizeof(struct sockaddr_dn)); + msg->msg_namelen = sizeof(struct sockaddr_dn); + } + + release_sock(sk); + + return rv; +} + + +static inline int dn_queue_too_long(struct dn_scp *scp, struct sk_buff_head *queue, int flags) +{ + unsigned char fctype = scp->services_rem & NSP_FC_MASK; + if (skb_queue_len(queue) >= scp->snd_window) + return 1; + if (fctype != NSP_FC_NONE) { + if (flags & MSG_OOB) { + if (scp->flowrem_oth == 0) + return 1; + } else { + if (scp->flowrem_dat == 0) + return 1; + } + } + return 0; +} + +/* + * The DECnet spec requires the the "routing layer" accepts packets which + * are at least 230 bytes in size. This excludes any headers which the NSP + * layer might add, so we always assume that we'll be using the maximal + * length header on data packets. The variation in length is due to the + * inclusion (or not) of the two 16 bit acknowledgement fields so it doesn't + * make much practical difference. + */ +unsigned dn_mss_from_pmtu(struct net_device *dev, int mtu) +{ + unsigned mss = 230 - DN_MAX_NSP_DATA_HEADER; + if (dev) { + struct dn_dev *dn_db = dev->dn_ptr; + mtu -= LL_RESERVED_SPACE(dev); + if (dn_db->use_long) + mtu -= 21; + else + mtu -= 6; + mtu -= DN_MAX_NSP_DATA_HEADER; + } else { + /* + * 21 = long header, 16 = guess at MAC header length + */ + mtu -= (21 + DN_MAX_NSP_DATA_HEADER + 16); + } + if (mtu > mss) + mss = mtu; + return mss; +} + +static inline unsigned int dn_current_mss(struct sock *sk, int flags) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct dn_scp *scp = DN_SK(sk); + int mss_now = min_t(int, scp->segsize_loc, scp->segsize_rem); + + /* Other data messages are limited to 16 bytes per packet */ + if (flags & MSG_OOB) + return 16; + + /* This works out the maximum size of segment we can send out */ + if (dst) { + u32 mtu = dst_mtu(dst); + mss_now = min_t(int, dn_mss_from_pmtu(dst->dev, mtu), mss_now); + } + + return mss_now; +} + +static int dn_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + +static int dn_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock *sk = sock->sk; + struct dn_scp *scp = DN_SK(sk); + size_t mss; + struct sk_buff_head *queue = &scp->data_xmit_queue; + int flags = msg->msg_flags; + int err = 0; + size_t sent = 0; + int addr_len = msg->msg_namelen; + struct sockaddr_dn *addr = (struct sockaddr_dn *)msg->msg_name; + struct sk_buff *skb = NULL; + struct dn_skb_cb *cb; + size_t len; + unsigned char fctype; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + if (flags & ~(MSG_TRYHARD|MSG_OOB|MSG_DONTWAIT|MSG_EOR|MSG_NOSIGNAL|MSG_MORE|MSG_CMSG_COMPAT)) + return -EOPNOTSUPP; + + if (addr_len && (addr_len != sizeof(struct sockaddr_dn))) + return -EINVAL; + + /* + * The only difference between stream sockets and sequenced packet + * sockets is that the stream sockets always behave as if MSG_EOR + * has been set. 
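+ * Hence an explicit MSG_EOR from userspace is rejected for stream sockets, and the flag is then forced on below.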
+ */ + if (sock->type == SOCK_STREAM) { + if (flags & MSG_EOR) + return -EINVAL; + flags |= MSG_EOR; + } + + lock_sock(sk); + + err = dn_check_state(sk, addr, addr_len, &timeo, flags); + if (err) + goto out_err; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + err = -EPIPE; + goto out_err; + } + + if ((flags & MSG_TRYHARD) && sk->sk_dst_cache) + dst_negative_advice(&sk->sk_dst_cache); + + mss = scp->segsize_rem; + fctype = scp->services_rem & NSP_FC_MASK; + + mss = dn_current_mss(sk, flags); + + if (flags & MSG_OOB) { + queue = &scp->other_xmit_queue; + if (size > mss) { + err = -EMSGSIZE; + goto out; + } + } + + scp->persist_fxn = dn_nsp_xmit_timeout; + + while(sent < size) { + err = sock_error(sk); + if (err) + goto out; + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + + /* + * Calculate size that we wish to send. + */ + len = size - sent; + + if (len > mss) + len = mss; + + /* + * Wait for queue size to go down below the window + * size. + */ + if (dn_queue_too_long(scp, queue, flags)) { + if (flags & MSG_DONTWAIT) { + err = -EWOULDBLOCK; + goto out; + } + + SOCK_SLEEP_PRE(sk) + + if (dn_queue_too_long(scp, queue, flags)) + schedule(); + + SOCK_SLEEP_POST(sk) + + continue; + } + + /* + * Get a suitably sized skb. + */ + skb = dn_alloc_send_skb(sk, &len, flags & MSG_DONTWAIT, timeo, &err); + + if (err) + break; + + if (!skb) + continue; + + cb = DN_SKB_CB(skb); + + skb_reserve(skb, DN_MAX_NSP_DATA_HEADER); + + if (memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len)) { + err = -EFAULT; + goto out; + } + + if (flags & MSG_OOB) { + cb->nsp_flags = 0x30; + if (fctype != NSP_FC_NONE) + scp->flowrem_oth--; + } else { + cb->nsp_flags = 0x00; + if (scp->seg_total == 0) + cb->nsp_flags |= 0x20; + + scp->seg_total += len; + + if (((sent + len) == size) && (flags & MSG_EOR)) { + cb->nsp_flags |= 0x40; + scp->seg_total = 0; + if (fctype == NSP_FC_SCMC) + scp->flowrem_dat--; + } + if (fctype == NSP_FC_SRC) + scp->flowrem_dat--; + } + + sent += len; + dn_nsp_queue_xmit(sk, skb, sk->sk_allocation, flags & MSG_OOB); + skb = NULL; + + scp->persist = dn_nsp_persist(sk); + + } +out: + + if (skb) + kfree_skb(skb); + + release_sock(sk); + + return sent ? 
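/* report partial progress in preference to a late error */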
sent : err; + +out_err: + err = dn_error(sk, flags, err); + release_sock(sk); + return err; +} + +static int dn_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + switch(event) { + case NETDEV_UP: + dn_dev_up(dev); + break; + case NETDEV_DOWN: + dn_dev_down(dev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block dn_dev_notifier = { + .notifier_call = dn_device_event, +}; + +extern int dn_route_rcv(struct sk_buff *, struct net_device *, struct packet_type *); + +static struct packet_type dn_dix_packet_type = { + .type = __constant_htons(ETH_P_DNA_RT), + .dev = NULL, /* All devices */ + .func = dn_route_rcv, +}; + +#ifdef CONFIG_PROC_FS +struct dn_iter_state { + int bucket; +}; + +static struct sock *dn_socket_get_first(struct seq_file *seq) +{ + struct dn_iter_state *state = seq->private; + struct sock *n = NULL; + + for(state->bucket = 0; + state->bucket < DN_SK_HASH_SIZE; + ++state->bucket) { + n = sk_head(&dn_sk_hash[state->bucket]); + if (n) + break; + } + + return n; +} + +static struct sock *dn_socket_get_next(struct seq_file *seq, + struct sock *n) +{ + struct dn_iter_state *state = seq->private; + + n = sk_next(n); +try_again: + if (n) + goto out; + if (++state->bucket >= DN_SK_HASH_SIZE) + goto out; + n = sk_head(&dn_sk_hash[state->bucket]); + goto try_again; +out: + return n; +} + +static struct sock *socket_get_idx(struct seq_file *seq, loff_t *pos) +{ + struct sock *sk = dn_socket_get_first(seq); + + if (sk) { + while(*pos && (sk = dn_socket_get_next(seq, sk))) + --*pos; + } + return *pos ? NULL : sk; +} + +static void *dn_socket_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc; + read_lock_bh(&dn_hash_lock); + rc = socket_get_idx(seq, &pos); + if (!rc) { + read_unlock_bh(&dn_hash_lock); + } + return rc; +} + +static void *dn_socket_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? 
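/* position 0 is the header line */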
dn_socket_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *dn_socket_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *rc; + + if (v == SEQ_START_TOKEN) { + rc = dn_socket_get_idx(seq, 0); + goto out; + } + + rc = dn_socket_get_next(seq, v); + if (rc) + goto out; + read_unlock_bh(&dn_hash_lock); +out: + ++*pos; + return rc; +} + +static void dn_socket_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + read_unlock_bh(&dn_hash_lock); +} + +#define IS_NOT_PRINTABLE(x) ((x) < 32 || (x) > 126) + +static void dn_printable_object(struct sockaddr_dn *dn, unsigned char *buf) +{ + int i; + + switch (dn_ntohs(dn->sdn_objnamel)) { + case 0: + sprintf(buf, "%d", dn->sdn_objnum); + break; + default: + for (i = 0; i < dn_ntohs(dn->sdn_objnamel); i++) { + buf[i] = dn->sdn_objname[i]; + if (IS_NOT_PRINTABLE(buf[i])) + buf[i] = '.'; + } + buf[i] = 0; + } +} + +static char *dn_state2asc(unsigned char state) +{ + switch(state) { + case DN_O: + return "OPEN"; + case DN_CR: + return " CR"; + case DN_DR: + return " DR"; + case DN_DRC: + return " DRC"; + case DN_CC: + return " CC"; + case DN_CI: + return " CI"; + case DN_NR: + return " NR"; + case DN_NC: + return " NC"; + case DN_CD: + return " CD"; + case DN_RJ: + return " RJ"; + case DN_RUN: + return " RUN"; + case DN_DI: + return " DI"; + case DN_DIC: + return " DIC"; + case DN_DN: + return " DN"; + case DN_CL: + return " CL"; + case DN_CN: + return " CN"; + } + + return "????"; +} + +static inline void dn_socket_format_entry(struct seq_file *seq, struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + char buf1[DN_ASCBUF_LEN]; + char buf2[DN_ASCBUF_LEN]; + char local_object[DN_MAXOBJL+3]; + char remote_object[DN_MAXOBJL+3]; + + dn_printable_object(&scp->addr, local_object); + dn_printable_object(&scp->peer, remote_object); + + seq_printf(seq, + "%6s/%04X %04d:%04d %04d:%04d %01d %-16s " + "%6s/%04X %04d:%04d %04d:%04d %01d %-16s %4s %s\n", + dn_addr2asc(dn_ntohs(dn_saddr2dn(&scp->addr)), buf1), + scp->addrloc, + scp->numdat, + scp->numoth, + scp->ackxmt_dat, + scp->ackxmt_oth, + scp->flowloc_sw, + local_object, + dn_addr2asc(dn_ntohs(dn_saddr2dn(&scp->peer)), buf2), + scp->addrrem, + scp->numdat_rcv, + scp->numoth_rcv, + scp->ackrcv_dat, + scp->ackrcv_oth, + scp->flowrem_sw, + remote_object, + dn_state2asc(scp->state), + ((scp->accept_mode == ACC_IMMED) ? 
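/* final column: accept mode */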
"IMMED" : "DEFER")); +} + +static int dn_socket_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Local Remote\n"); + } else { + dn_socket_format_entry(seq, v); + } + return 0; +} + +static struct seq_operations dn_socket_seq_ops = { + .start = dn_socket_seq_start, + .next = dn_socket_seq_next, + .stop = dn_socket_seq_stop, + .show = dn_socket_seq_show, +}; + +static int dn_socket_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct dn_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &dn_socket_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations dn_socket_seq_fops = { + .owner = THIS_MODULE, + .open = dn_socket_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static struct net_proto_family dn_family_ops = { + .family = AF_DECnet, + .create = dn_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops dn_proto_ops = { + .family = AF_DECnet, + .owner = THIS_MODULE, + .release = dn_release, + .bind = dn_bind, + .connect = dn_connect, + .socketpair = sock_no_socketpair, + .accept = dn_accept, + .getname = dn_getname, + .poll = dn_poll, + .ioctl = dn_ioctl, + .listen = dn_listen, + .shutdown = dn_shutdown, + .setsockopt = dn_setsockopt, + .getsockopt = dn_getsockopt, + .sendmsg = dn_sendmsg, + .recvmsg = dn_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +void dn_register_sysctl(void); +void dn_unregister_sysctl(void); + +MODULE_DESCRIPTION("The Linux DECnet Network Protocol"); +MODULE_AUTHOR("Linux DECnet Project Team"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_DECnet); + +static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; + +static int __init decnet_init(void) +{ + int rc; + + printk(banner); + + rc = proto_register(&dn_proto, 1); + if (rc != 0) + goto out; + + dn_neigh_init(); + dn_dev_init(); + dn_route_init(); + dn_fib_init(); + + sock_register(&dn_family_ops); + dev_add_pack(&dn_dix_packet_type); + register_netdevice_notifier(&dn_dev_notifier); + + proc_net_fops_create("decnet", S_IRUGO, &dn_socket_seq_fops); + dn_register_sysctl(); +out: + return rc; + +} +module_init(decnet_init); + +/* + * Prevent DECnet module unloading until its fixed properly. + * Requires an audit of the code to check for memory leaks and + * initialisation problems etc. + */ +#if 0 +static void __exit decnet_exit(void) +{ + sock_unregister(AF_DECnet); + dev_remove_pack(&dn_dix_packet_type); + + dn_unregister_sysctl(); + + unregister_netdevice_notifier(&dn_dev_notifier); + + dn_route_cleanup(); + dn_dev_cleanup(); + dn_neigh_cleanup(); + dn_fib_cleanup(); + + proc_net_remove("decnet"); + + proto_unregister(&dn_proto); +} +module_exit(decnet_exit); +#endif diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c new file mode 100644 index 000000000000..c2a0346f423b --- /dev/null +++ b/net/decnet/dn_dev.c @@ -0,0 +1,1481 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. 
+ * + * DECnet Device Layer + * + * Authors: Steve Whitehouse + * Eduardo Marcelo Serrat + * + * Changes: + * Steve Whitehouse : Devices now see incoming frames so they + * can mark on who it came from. + * Steve Whitehouse : Fixed bug in creating neighbours. Each neighbour + * can now have a device specific setup func. + * Steve Whitehouse : Added /proc/sys/net/decnet/conf// + * Steve Whitehouse : Fixed bug which sometimes killed timer + * Steve Whitehouse : Multiple ifaddr support + * Steve Whitehouse : SIOCGIFCONF is now a compile time option + * Steve Whitehouse : /proc/sys/net/decnet/conf//forwarding + * Steve Whitehouse : Removed timer1 - it's a user space issue now + * Patrick Caulfield : Fixed router hello message format + * Steve Whitehouse : Got rid of constant sizes for blksize for + * devices. All mtu based now. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DN_IFREQ_SIZE (sizeof(struct ifreq) - sizeof(struct sockaddr) + sizeof(struct sockaddr_dn)) + +static char dn_rt_all_end_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x04,0x00,0x00}; +static char dn_rt_all_rt_mcast[ETH_ALEN] = {0xAB,0x00,0x00,0x03,0x00,0x00}; +static char dn_hiord[ETH_ALEN] = {0xAA,0x00,0x04,0x00,0x00,0x00}; +static unsigned char dn_eco_version[3] = {0x02,0x00,0x00}; + +extern struct neigh_table dn_neigh_table; + +/* + * decnet_address is kept in network order. + */ +dn_address decnet_address = 0; + +static DEFINE_RWLOCK(dndev_lock); +static struct net_device *decnet_default_device; +static struct notifier_block *dnaddr_chain; + +static struct dn_dev *dn_dev_create(struct net_device *dev, int *err); +static void dn_dev_delete(struct net_device *dev); +static void rtmsg_ifa(int event, struct dn_ifaddr *ifa); + +static int dn_eth_up(struct net_device *); +static void dn_eth_down(struct net_device *); +static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa); +static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa); + +static struct dn_dev_parms dn_dev_list[] = { +{ + .type = ARPHRD_ETHER, /* Ethernet */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "ethernet", + .ctl_name = NET_DECNET_CONF_ETHER, + .up = dn_eth_up, + .down = dn_eth_down, + .timer3 = dn_send_brd_hello, +}, +{ + .type = ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "ipgre", + .ctl_name = NET_DECNET_CONF_GRE, + .timer3 = dn_send_brd_hello, +}, +#if 0 +{ + .type = ARPHRD_X25, /* Bog standard X.25 */ + .mode = DN_DEV_UCAST, + .state = DN_DEV_S_DS, + .t2 = 1, + .t3 = 120, + .name = "x25", + .ctl_name = NET_DECNET_CONF_X25, + .timer3 = dn_send_ptp_hello, +}, +#endif +#if 0 +{ + .type = ARPHRD_PPP, /* DECnet over PPP */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "ppp", + .ctl_name = NET_DECNET_CONF_PPP, + .timer3 = dn_send_brd_hello, +}, +#endif +{ + .type = ARPHRD_DDCMP, /* DECnet over DDCMP */ + .mode = DN_DEV_UCAST, + .state = DN_DEV_S_DS, + .t2 = 1, + .t3 = 120, + .name = "ddcmp", + .ctl_name = NET_DECNET_CONF_DDCMP, + .timer3 = dn_send_ptp_hello, +}, +{ + .type = ARPHRD_LOOPBACK, /* Loopback interface - always last */ + .mode = DN_DEV_BCAST, + .state = DN_DEV_S_RU, + .t2 = 1, + .t3 = 10, + .name = "loopback", + .ctl_name = 
NET_DECNET_CONF_LOOPBACK, + .timer3 = dn_send_brd_hello, +} +}; + +#define DN_DEV_LIST_SIZE (sizeof(dn_dev_list)/sizeof(struct dn_dev_parms)) + +#define DN_DEV_PARMS_OFFSET(x) ((int) ((char *) &((struct dn_dev_parms *)0)->x)) + +#ifdef CONFIG_SYSCTL + +static int min_t2[] = { 1 }; +static int max_t2[] = { 60 }; /* No max specified, but this seems sensible */ +static int min_t3[] = { 1 }; +static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MULT or T3MULT */ + +static int min_priority[1]; +static int max_priority[] = { 127 }; /* From DECnet spec */ + +static int dn_forwarding_proc(ctl_table *, int, struct file *, + void __user *, size_t *, loff_t *); +static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context); + +static struct dn_dev_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table dn_dev_vars[5]; + ctl_table dn_dev_dev[2]; + ctl_table dn_dev_conf_dir[2]; + ctl_table dn_dev_proto_dir[2]; + ctl_table dn_dev_root_dir[2]; +} dn_dev_sysctl = { + NULL, + { + { + .ctl_name = NET_DECNET_CONF_DEV_FORWARDING, + .procname = "forwarding", + .data = (void *)DN_DEV_PARMS_OFFSET(forwarding), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = dn_forwarding_proc, + .strategy = dn_forwarding_sysctl, + }, + { + .ctl_name = NET_DECNET_CONF_DEV_PRIORITY, + .procname = "priority", + .data = (void *)DN_DEV_PARMS_OFFSET(priority), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = &min_priority, + .extra2 = &max_priority + }, + { + .ctl_name = NET_DECNET_CONF_DEV_T2, + .procname = "t2", + .data = (void *)DN_DEV_PARMS_OFFSET(t2), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = &min_t2, + .extra2 = &max_t2 + }, + { + .ctl_name = NET_DECNET_CONF_DEV_T3, + .procname = "t3", + .data = (void *)DN_DEV_PARMS_OFFSET(t3), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = &min_t3, + .extra2 = &max_t3 + }, + {0} + }, + {{ + .ctl_name = 0, + .procname = "", + .mode = 0555, + .child = dn_dev_sysctl.dn_dev_vars + }, {0}}, + {{ + .ctl_name = NET_DECNET_CONF, + .procname = "conf", + .mode = 0555, + .child = dn_dev_sysctl.dn_dev_dev + }, {0}}, + {{ + .ctl_name = NET_DECNET, + .procname = "decnet", + .mode = 0555, + .child = dn_dev_sysctl.dn_dev_conf_dir + }, {0}}, + {{ + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = dn_dev_sysctl.dn_dev_proto_dir + }, {0}} +}; + +static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) +{ + struct dn_dev_sysctl_table *t; + int i; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + + memcpy(t, &dn_dev_sysctl, sizeof(*t)); + + for(i = 0; i < ARRAY_SIZE(t->dn_dev_vars) - 1; i++) { + long offset = (long)t->dn_dev_vars[i].data; + t->dn_dev_vars[i].data = ((char *)parms) + offset; + t->dn_dev_vars[i].de = NULL; + } + + if (dev) { + t->dn_dev_dev[0].procname = dev->name; + t->dn_dev_dev[0].ctl_name = dev->ifindex; + } else { + t->dn_dev_dev[0].procname = parms->name; + t->dn_dev_dev[0].ctl_name = parms->ctl_name; + } + + t->dn_dev_dev[0].child = t->dn_dev_vars; + t->dn_dev_dev[0].de = NULL; + t->dn_dev_conf_dir[0].child = t->dn_dev_dev; + t->dn_dev_conf_dir[0].de = NULL; + t->dn_dev_proto_dir[0].child = t->dn_dev_conf_dir; + 
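/* chain the copied tables as net -> decnet -> conf -> <dev> -> vars before registering the tree */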
t->dn_dev_proto_dir[0].de = NULL; + t->dn_dev_root_dir[0].child = t->dn_dev_proto_dir; + t->dn_dev_root_dir[0].de = NULL; + t->dn_dev_vars[0].extra1 = (void *)dev; + + t->sysctl_header = register_sysctl_table(t->dn_dev_root_dir, 0); + if (t->sysctl_header == NULL) + kfree(t); + else + parms->sysctl = t; +} + +static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) +{ + if (parms->sysctl) { + struct dn_dev_sysctl_table *t = parms->sysctl; + parms->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t); + } +} + +static int dn_forwarding_proc(ctl_table *table, int write, + struct file *filep, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ +#ifdef CONFIG_DECNET_ROUTER + struct net_device *dev = table->extra1; + struct dn_dev *dn_db; + int err; + int tmp, old; + + if (table->extra1 == NULL) + return -EINVAL; + + dn_db = dev->dn_ptr; + old = dn_db->parms.forwarding; + + err = proc_dointvec(table, write, filep, buffer, lenp, ppos); + + if ((err >= 0) && write) { + if (dn_db->parms.forwarding < 0) + dn_db->parms.forwarding = 0; + if (dn_db->parms.forwarding > 2) + dn_db->parms.forwarding = 2; + /* + * What an ugly hack this is... its works, just. It + * would be nice if sysctl/proc were just that little + * bit more flexible so I don't have to write a special + * routine, or suffer hacks like this - SJW + */ + tmp = dn_db->parms.forwarding; + dn_db->parms.forwarding = old; + if (dn_db->parms.down) + dn_db->parms.down(dev); + dn_db->parms.forwarding = tmp; + if (dn_db->parms.up) + dn_db->parms.up(dev); + } + + return err; +#else + return -EINVAL; +#endif +} + +static int dn_forwarding_sysctl(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ +#ifdef CONFIG_DECNET_ROUTER + struct net_device *dev = table->extra1; + struct dn_dev *dn_db; + int value; + + if (table->extra1 == NULL) + return -EINVAL; + + dn_db = dev->dn_ptr; + + if (newval && newlen) { + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(value, (int __user *)newval)) + return -EFAULT; + if (value < 0) + return -EINVAL; + if (value > 2) + return -EINVAL; + + if (dn_db->parms.down) + dn_db->parms.down(dev); + dn_db->parms.forwarding = value; + if (dn_db->parms.up) + dn_db->parms.up(dev); + } + + return 0; +#else + return -EINVAL; +#endif +} + +#else /* CONFIG_SYSCTL */ +static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) +{ +} +static void dn_dev_sysctl_register(struct net_device *dev, struct dn_dev_parms *parms) +{ +} + +#endif /* CONFIG_SYSCTL */ + +static inline __u16 mtu2blksize(struct net_device *dev) +{ + u32 blksize = dev->mtu; + if (blksize > 0xffff) + blksize = 0xffff; + + if (dev->type == ARPHRD_ETHER || + dev->type == ARPHRD_PPP || + dev->type == ARPHRD_IPGRE || + dev->type == ARPHRD_LOOPBACK) + blksize -= 2; + + return (__u16)blksize; +} + +static struct dn_ifaddr *dn_dev_alloc_ifa(void) +{ + struct dn_ifaddr *ifa; + + ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + + if (ifa) { + memset(ifa, 0, sizeof(*ifa)); + } + + return ifa; +} + +static __inline__ void dn_dev_free_ifa(struct dn_ifaddr *ifa) +{ + kfree(ifa); +} + +static void dn_dev_del_ifa(struct dn_dev *dn_db, struct dn_ifaddr **ifap, int destroy) +{ + struct dn_ifaddr *ifa1 = *ifap; + unsigned char mac_addr[6]; + struct net_device *dev = dn_db->dev; + + ASSERT_RTNL(); + + *ifap = ifa1->ifa_next; + + if (dn_db->dev->type == ARPHRD_ETHER) { + if (ifa1->ifa_local != dn_htons(dn_eth2dn(dev->dev_addr))) { + 
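/* this address is not the one derived from the device MAC, so drop the AA-00-04-00-xx-xx entry that dn_dev_insert_ifa() added */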
dn_dn2eth(mac_addr, ifa1->ifa_local); + dev_mc_delete(dev, mac_addr, ETH_ALEN, 0); + } + } + + rtmsg_ifa(RTM_DELADDR, ifa1); + notifier_call_chain(&dnaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + dn_dev_free_ifa(ifa1); + + if (dn_db->ifa_list == NULL) + dn_dev_delete(dn_db->dev); + } +} + +static int dn_dev_insert_ifa(struct dn_dev *dn_db, struct dn_ifaddr *ifa) +{ + struct net_device *dev = dn_db->dev; + struct dn_ifaddr *ifa1; + unsigned char mac_addr[6]; + + ASSERT_RTNL(); + + /* Check for duplicates */ + for(ifa1 = dn_db->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa1->ifa_local == ifa->ifa_local) + return -EEXIST; + } + + if (dev->type == ARPHRD_ETHER) { + if (ifa->ifa_local != dn_htons(dn_eth2dn(dev->dev_addr))) { + dn_dn2eth(mac_addr, ifa->ifa_local); + dev_mc_add(dev, mac_addr, ETH_ALEN, 0); + dev_mc_upload(dev); + } + } + + ifa->ifa_next = dn_db->ifa_list; + dn_db->ifa_list = ifa; + + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&dnaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int dn_dev_set_ifa(struct net_device *dev, struct dn_ifaddr *ifa) +{ + struct dn_dev *dn_db = dev->dn_ptr; + int rv; + + if (dn_db == NULL) { + int err; + dn_db = dn_dev_create(dev, &err); + if (dn_db == NULL) + return err; + } + + ifa->ifa_dev = dn_db; + + if (dev->flags & IFF_LOOPBACK) + ifa->ifa_scope = RT_SCOPE_HOST; + + rv = dn_dev_insert_ifa(dn_db, ifa); + if (rv) + dn_dev_free_ifa(ifa); + return rv; +} + + +int dn_dev_ioctl(unsigned int cmd, void __user *arg) +{ + char buffer[DN_IFREQ_SIZE]; + struct ifreq *ifr = (struct ifreq *)buffer; + struct sockaddr_dn *sdn = (struct sockaddr_dn *)&ifr->ifr_addr; + struct dn_dev *dn_db; + struct net_device *dev; + struct dn_ifaddr *ifa = NULL, **ifap = NULL; + int ret = 0; + + if (copy_from_user(ifr, arg, DN_IFREQ_SIZE)) + return -EFAULT; + ifr->ifr_name[IFNAMSIZ-1] = 0; + +#ifdef CONFIG_KMOD + dev_load(ifr->ifr_name); +#endif + + switch(cmd) { + case SIOCGIFADDR: + break; + case SIOCSIFADDR: + if (!capable(CAP_NET_ADMIN)) + return -EACCES; + if (sdn->sdn_family != AF_DECnet) + return -EINVAL; + break; + default: + return -EINVAL; + } + + rtnl_lock(); + + if ((dev = __dev_get_by_name(ifr->ifr_name)) == NULL) { + ret = -ENODEV; + goto done; + } + + if ((dn_db = dev->dn_ptr) != NULL) { + for (ifap = &dn_db->ifa_list; (ifa=*ifap) != NULL; ifap = &ifa->ifa_next) + if (strcmp(ifr->ifr_name, ifa->ifa_label) == 0) + break; + } + + if (ifa == NULL && cmd != SIOCSIFADDR) { + ret = -EADDRNOTAVAIL; + goto done; + } + + switch(cmd) { + case SIOCGIFADDR: + *((dn_address *)sdn->sdn_nodeaddr) = ifa->ifa_local; + goto rarok; + + case SIOCSIFADDR: + if (!ifa) { + if ((ifa = dn_dev_alloc_ifa()) == NULL) { + ret = -ENOBUFS; + break; + } + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + } else { + if (ifa->ifa_local == dn_saddr2dn(sdn)) + break; + dn_dev_del_ifa(dn_db, ifap, 0); + } + + ifa->ifa_local = ifa->ifa_address = dn_saddr2dn(sdn); + + ret = dn_dev_set_ifa(dev, ifa); + } +done: + rtnl_unlock(); + + return ret; +rarok: + if (copy_to_user(arg, ifr, DN_IFREQ_SIZE)) + ret = -EFAULT; + goto done; +} + +struct net_device *dn_dev_get_default(void) +{ + struct net_device *dev; + read_lock(&dndev_lock); + dev = decnet_default_device; + if (dev) { + if (dev->dn_ptr) + dev_hold(dev); + else + dev = NULL; + } + read_unlock(&dndev_lock); + return dev; +} + +int dn_dev_set_default(struct net_device *dev, int force) +{ + struct net_device *old = NULL; + int rv = -EBUSY; + if (!dev->dn_ptr) + return -ENODEV; + write_lock(&dndev_lock); + if (force || 
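/* only replace an existing default device when forced */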
decnet_default_device == NULL) { + old = decnet_default_device; + decnet_default_device = dev; + rv = 0; + } + write_unlock(&dndev_lock); + if (old) + dev_put(dev); + return rv; +} + +static void dn_dev_check_default(struct net_device *dev) +{ + write_lock(&dndev_lock); + if (dev == decnet_default_device) { + decnet_default_device = NULL; + } else { + dev = NULL; + } + write_unlock(&dndev_lock); + if (dev) + dev_put(dev); +} + +static struct dn_dev *dn_dev_by_index(int ifindex) +{ + struct net_device *dev; + struct dn_dev *dn_dev = NULL; + dev = dev_get_by_index(ifindex); + if (dev) { + dn_dev = dev->dn_ptr; + dev_put(dev); + } + + return dn_dev; +} + +static int dn_dev_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct dn_dev *dn_db; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct dn_ifaddr *ifa, **ifap; + + if ((dn_db = dn_dev_by_index(ifm->ifa_index)) == NULL) + return -EADDRNOTAVAIL; + + for(ifap = &dn_db->ifa_list; (ifa=*ifap) != NULL; ifap = &ifa->ifa_next) { + void *tmp = rta[IFA_LOCAL-1]; + if ((tmp && memcmp(RTA_DATA(tmp), &ifa->ifa_local, 2)) || + (rta[IFA_LABEL-1] && rtattr_strcmp(rta[IFA_LABEL-1], ifa->ifa_label))) + continue; + + dn_dev_del_ifa(dn_db, ifap, 1); + return 0; + } + + return -EADDRNOTAVAIL; +} + +static int dn_dev_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct net_device *dev; + struct dn_dev *dn_db; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct dn_ifaddr *ifa; + int rv; + + if (rta[IFA_LOCAL-1] == NULL) + return -EINVAL; + + if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL) + return -ENODEV; + + if ((dn_db = dev->dn_ptr) == NULL) { + int err; + dn_db = dn_dev_create(dev, &err); + if (!dn_db) + return err; + } + + if ((ifa = dn_dev_alloc_ifa()) == NULL) + return -ENOBUFS; + + if (!rta[IFA_ADDRESS - 1]) + rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1]; + memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL-1]), 2); + memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS-1]), 2); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + ifa->ifa_dev = dn_db; + if (rta[IFA_LABEL-1]) + rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL-1], IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + rv = dn_dev_insert_ifa(dn_db, ifa); + if (rv) + dn_dev_free_ifa(ifa); + return rv; +} + +static int dn_dev_fill_ifaddr(struct sk_buff *skb, struct dn_ifaddr *ifa, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + ifm = NLMSG_DATA(nlh); + + ifm->ifa_family = AF_DECnet; + ifm->ifa_prefixlen = 16; + ifm->ifa_flags = ifa->ifa_flags | IFA_F_PERMANENT; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (ifa->ifa_address) + RTA_PUT(skb, IFA_ADDRESS, 2, &ifa->ifa_address); + if (ifa->ifa_local) + RTA_PUT(skb, IFA_LOCAL, 2, &ifa->ifa_local); + if (ifa->ifa_label[0]) + RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void rtmsg_ifa(int event, struct dn_ifaddr *ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, ENOBUFS); + return; + } + if (dn_dev_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + 
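/* could not build the message; flag the error to RTMGRP_DECnet_IFADDR listeners instead */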
netlink_set_err(rtnl, 0, RTMGRP_DECnet_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_DECnet_IFADDR, GFP_KERNEL); +} + +static int dn_dev_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, dn_idx; + int s_idx, s_dn_idx; + struct net_device *dev; + struct dn_dev *dn_db; + struct dn_ifaddr *ifa; + + s_idx = cb->args[0]; + s_dn_idx = dn_idx = cb->args[1]; + read_lock(&dev_base_lock); + for(dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_dn_idx = 0; + if ((dn_db = dev->dn_ptr) == NULL) + continue; + + for(ifa = dn_db->ifa_list, dn_idx = 0; ifa; ifa = ifa->ifa_next, dn_idx++) { + if (dn_idx < s_dn_idx) + continue; + + if (dn_dev_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR) <= 0) + goto done; + } + } +done: + read_unlock(&dev_base_lock); + cb->args[0] = idx; + cb->args[1] = dn_idx; + + return skb->len; +} + +static int dn_dev_get_first(struct net_device *dev, dn_address *addr) +{ + struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr; + struct dn_ifaddr *ifa; + int rv = -ENODEV; + if (dn_db == NULL) + goto out; + ifa = dn_db->ifa_list; + if (ifa != NULL) { + *addr = ifa->ifa_local; + rv = 0; + } +out: + return rv; +} + +/* + * Find a default address to bind to. + * + * This is one of those areas where the initial VMS concepts don't really + * map onto the Linux concepts, and since we introduced multiple addresses + * per interface we have to cope with slightly odd ways of finding out what + * "our address" really is. Mostly it's not a problem; for this we just guess + * a sensible default. Eventually the routing code will take care of all the + * nasties for us I hope. + */ +int dn_dev_bind_default(dn_address *addr) +{ + struct net_device *dev; + int rv; + dev = dn_dev_get_default(); +last_chance: + if (dev) { + read_lock(&dev_base_lock); + rv = dn_dev_get_first(dev, addr); + read_unlock(&dev_base_lock); + dev_put(dev); + if (rv == 0 || dev == &loopback_dev) + return rv; + } + dev = &loopback_dev; + dev_hold(dev); + goto last_chance; +} + +static void dn_send_endnode_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + struct endnode_hello_message *msg; + struct sk_buff *skb = NULL; + unsigned short int *pktlen; + struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr; + + if ((skb = dn_alloc_skb(NULL, sizeof(*msg), GFP_ATOMIC)) == NULL) + return; + + skb->dev = dev; + + msg = (struct endnode_hello_message *)skb_put(skb,sizeof(*msg)); + + msg->msgflg = 0x0D; + memcpy(msg->tiver, dn_eco_version, 3); + dn_dn2eth(msg->id, ifa->ifa_local); + msg->iinfo = DN_RT_INFO_ENDN; + msg->blksize = dn_htons(mtu2blksize(dev)); + msg->area = 0x00; + memset(msg->seed, 0, 8); + memcpy(msg->neighbor, dn_hiord, ETH_ALEN); + + if (dn_db->router) { + struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + dn_dn2eth(msg->neighbor, dn->addr); + } + + msg->timer = dn_htons((unsigned short)dn_db->parms.t3); + msg->mpd = 0x00; + msg->datalen = 0x02; + memset(msg->data, 0xAA, 2); + + pktlen = (unsigned short *)skb_push(skb,2); + *pktlen = dn_htons(skb->len - 2); + + skb->nh.raw = skb->data; + + dn_rt_finish_output(skb, dn_rt_all_rt_mcast, msg->id); +} + + +#define DRDELAY (5 * HZ) + +static int dn_am_i_a_router(struct dn_neigh *dn, struct dn_dev *dn_db, struct dn_ifaddr *ifa) +{ + /* First check time since device went up */ + if ((jiffies - dn_db->uptime) < DRDELAY) + return 0; + + /* If there is no router, then yes... 
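we volunteer. Otherwise we win only on higher priority, or on a higher node address when priorities are equal.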
*/ + if (!dn_db->router) + return 1; + + /* otherwise only if we have a higher priority or.. */ + if (dn->priority < dn_db->parms.priority) + return 1; + + /* if we have equal priority and a higher node number */ + if (dn->priority != dn_db->parms.priority) + return 0; + + if (dn_ntohs(dn->addr) < dn_ntohs(ifa->ifa_local)) + return 1; + + return 0; +} + +static void dn_send_router_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + int n; + struct dn_dev *dn_db = dev->dn_ptr; + struct dn_neigh *dn = (struct dn_neigh *)dn_db->router; + struct sk_buff *skb; + size_t size; + unsigned char *ptr; + unsigned char *i1, *i2; + unsigned short *pktlen; + char *src; + + if (mtu2blksize(dev) < (26 + 7)) + return; + + n = mtu2blksize(dev) - 26; + n /= 7; + + if (n > 32) + n = 32; + + size = 2 + 26 + 7 * n; + + if ((skb = dn_alloc_skb(NULL, size, GFP_ATOMIC)) == NULL) + return; + + skb->dev = dev; + ptr = skb_put(skb, size); + + *ptr++ = DN_RT_PKT_CNTL | DN_RT_PKT_ERTH; + *ptr++ = 2; /* ECO */ + *ptr++ = 0; + *ptr++ = 0; + dn_dn2eth(ptr, ifa->ifa_local); + src = ptr; + ptr += ETH_ALEN; + *ptr++ = dn_db->parms.forwarding == 1 ? + DN_RT_INFO_L1RT : DN_RT_INFO_L2RT; + *((unsigned short *)ptr) = dn_htons(mtu2blksize(dev)); + ptr += 2; + *ptr++ = dn_db->parms.priority; /* Priority */ + *ptr++ = 0; /* Area: Reserved */ + *((unsigned short *)ptr) = dn_htons((unsigned short)dn_db->parms.t3); + ptr += 2; + *ptr++ = 0; /* MPD: Reserved */ + i1 = ptr++; + memset(ptr, 0, 7); /* Name: Reserved */ + ptr += 7; + i2 = ptr++; + + n = dn_neigh_elist(dev, ptr, n); + + *i2 = 7 * n; + *i1 = 8 + *i2; + + skb_trim(skb, (27 + *i2)); + + pktlen = (unsigned short *)skb_push(skb, 2); + *pktlen = dn_htons(skb->len - 2); + + skb->nh.raw = skb->data; + + if (dn_am_i_a_router(dn, dn_db, ifa)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); + if (skb2) { + dn_rt_finish_output(skb2, dn_rt_all_end_mcast, src); + } + } + + dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); +} + +static void dn_send_brd_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr; + + if (dn_db->parms.forwarding == 0) + dn_send_endnode_hello(dev, ifa); + else + dn_send_router_hello(dev, ifa); +} + +static void dn_send_ptp_hello(struct net_device *dev, struct dn_ifaddr *ifa) +{ + int tdlen = 16; + int size = dev->hard_header_len + 2 + 4 + tdlen; + struct sk_buff *skb = dn_alloc_skb(NULL, size, GFP_ATOMIC); + int i; + unsigned char *ptr; + char src[ETH_ALEN]; + + if (skb == NULL) + return ; + + skb->dev = dev; + skb_push(skb, dev->hard_header_len); + ptr = skb_put(skb, 2 + 4 + tdlen); + + *ptr++ = DN_RT_PKT_HELO; + *((dn_address *)ptr) = ifa->ifa_local; + ptr += 2; + *ptr++ = tdlen; + + for(i = 0; i < tdlen; i++) + *ptr++ = 0252; + + dn_dn2eth(src, ifa->ifa_local); + dn_rt_finish_output(skb, dn_rt_all_rt_mcast, src); +} + +static int dn_eth_up(struct net_device *dev) +{ + struct dn_dev *dn_db = dev->dn_ptr; + + if (dn_db->parms.forwarding == 0) + dev_mc_add(dev, dn_rt_all_end_mcast, ETH_ALEN, 0); + else + dev_mc_add(dev, dn_rt_all_rt_mcast, ETH_ALEN, 0); + + dev_mc_upload(dev); + + dn_db->use_long = 1; + + return 0; +} + +static void dn_eth_down(struct net_device *dev) +{ + struct dn_dev *dn_db = dev->dn_ptr; + + if (dn_db->parms.forwarding == 0) + dev_mc_delete(dev, dn_rt_all_end_mcast, ETH_ALEN, 0); + else + dev_mc_delete(dev, dn_rt_all_rt_mcast, ETH_ALEN, 0); +} + +static void dn_dev_set_timer(struct net_device *dev); + +static void dn_dev_timer_func(unsigned long arg) +{ + struct net_device 
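/* runs every t2 seconds; when the t3 countdown expires, a hello is sent on each primary address */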
*dev = (struct net_device *)arg; + struct dn_dev *dn_db = dev->dn_ptr; + struct dn_ifaddr *ifa; + + if (dn_db->t3 <= dn_db->parms.t2) { + if (dn_db->parms.timer3) { + for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) + dn_db->parms.timer3(dev, ifa); + } + } + dn_db->t3 = dn_db->parms.t3; + } else { + dn_db->t3 -= dn_db->parms.t2; + } + + dn_dev_set_timer(dev); +} + +static void dn_dev_set_timer(struct net_device *dev) +{ + struct dn_dev *dn_db = dev->dn_ptr; + + if (dn_db->parms.t2 > dn_db->parms.t3) + dn_db->parms.t2 = dn_db->parms.t3; + + dn_db->timer.data = (unsigned long)dev; + dn_db->timer.function = dn_dev_timer_func; + dn_db->timer.expires = jiffies + (dn_db->parms.t2 * HZ); + + add_timer(&dn_db->timer); +} + +struct dn_dev *dn_dev_create(struct net_device *dev, int *err) +{ + int i; + struct dn_dev_parms *p = dn_dev_list; + struct dn_dev *dn_db; + + for(i = 0; i < DN_DEV_LIST_SIZE; i++, p++) { + if (p->type == dev->type) + break; + } + + *err = -ENODEV; + if (i == DN_DEV_LIST_SIZE) + return NULL; + + *err = -ENOBUFS; + if ((dn_db = kmalloc(sizeof(struct dn_dev), GFP_ATOMIC)) == NULL) + return NULL; + + memset(dn_db, 0, sizeof(struct dn_dev)); + memcpy(&dn_db->parms, p, sizeof(struct dn_dev_parms)); + smp_wmb(); + dev->dn_ptr = dn_db; + dn_db->dev = dev; + init_timer(&dn_db->timer); + + dn_db->uptime = jiffies; + if (dn_db->parms.up) { + if (dn_db->parms.up(dev) < 0) { + dev->dn_ptr = NULL; + kfree(dn_db); + return NULL; + } + } + + dn_db->neigh_parms = neigh_parms_alloc(dev, &dn_neigh_table); + + dn_dev_sysctl_register(dev, &dn_db->parms); + + dn_dev_set_timer(dev); + + *err = 0; + return dn_db; +} + + +/* + * This processes a device up event. We only start up + * the loopback device & ethernet devices with correct + * MAC addreses automatically. Others must be started + * specifically. + * + * FIXME: How should we configure the loopback address ? If we could dispense + * with using decnet_address here and for autobind, it will be one less thing + * for users to worry about setting up. + */ + +void dn_dev_up(struct net_device *dev) +{ + struct dn_ifaddr *ifa; + dn_address addr = decnet_address; + int maybe_default = 0; + struct dn_dev *dn_db = (struct dn_dev *)dev->dn_ptr; + + if ((dev->type != ARPHRD_ETHER) && (dev->type != ARPHRD_LOOPBACK)) + return; + + /* + * Need to ensure that loopback device has a dn_db attached to it + * to allow creation of neighbours against it, even though it might + * not have a local address of its own. Might as well do the same for + * all autoconfigured interfaces. + */ + if (dn_db == NULL) { + int err; + dn_db = dn_dev_create(dev, &err); + if (dn_db == NULL) + return; + } + + if (dev->type == ARPHRD_ETHER) { + if (memcmp(dev->dev_addr, dn_hiord, 4) != 0) + return; + addr = dn_htons(dn_eth2dn(dev->dev_addr)); + maybe_default = 1; + } + + if (addr == 0) + return; + + if ((ifa = dn_dev_alloc_ifa()) == NULL) + return; + + ifa->ifa_local = ifa->ifa_address = addr; + ifa->ifa_flags = 0; + ifa->ifa_scope = RT_SCOPE_UNIVERSE; + strcpy(ifa->ifa_label, dev->name); + + dn_dev_set_ifa(dev, ifa); + + /* + * Automagically set the default device to the first automatically + * configured ethernet card in the system. 
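+ * The dev_hold() below is dropped again if dn_dev_set_default() declines the device.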
+ */ + if (maybe_default) { + dev_hold(dev); + if (dn_dev_set_default(dev, 0)) + dev_put(dev); + } +} + +static void dn_dev_delete(struct net_device *dev) +{ + struct dn_dev *dn_db = dev->dn_ptr; + + if (dn_db == NULL) + return; + + del_timer_sync(&dn_db->timer); + dn_dev_sysctl_unregister(&dn_db->parms); + dn_dev_check_default(dev); + neigh_ifdown(&dn_neigh_table, dev); + + if (dn_db->parms.down) + dn_db->parms.down(dev); + + dev->dn_ptr = NULL; + + neigh_parms_release(&dn_neigh_table, dn_db->neigh_parms); + neigh_ifdown(&dn_neigh_table, dev); + + if (dn_db->router) + neigh_release(dn_db->router); + if (dn_db->peer) + neigh_release(dn_db->peer); + + kfree(dn_db); +} + +void dn_dev_down(struct net_device *dev) +{ + struct dn_dev *dn_db = dev->dn_ptr; + struct dn_ifaddr *ifa; + + if (dn_db == NULL) + return; + + while((ifa = dn_db->ifa_list) != NULL) { + dn_dev_del_ifa(dn_db, &dn_db->ifa_list, 0); + dn_dev_free_ifa(ifa); + } + + dn_dev_delete(dev); +} + +void dn_dev_init_pkt(struct sk_buff *skb) +{ + return; +} + +void dn_dev_veri_pkt(struct sk_buff *skb) +{ + return; +} + +void dn_dev_hello(struct sk_buff *skb) +{ + return; +} + +void dn_dev_devices_off(void) +{ + struct net_device *dev; + + rtnl_lock(); + for(dev = dev_base; dev; dev = dev->next) + dn_dev_down(dev); + rtnl_unlock(); + +} + +void dn_dev_devices_on(void) +{ + struct net_device *dev; + + rtnl_lock(); + for(dev = dev_base; dev; dev = dev->next) { + if (dev->flags & IFF_UP) + dn_dev_up(dev); + } + rtnl_unlock(); +} + +int register_dnaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&dnaddr_chain, nb); +} + +int unregister_dnaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&dnaddr_chain, nb); +} + +#ifdef CONFIG_PROC_FS +static inline struct net_device *dn_dev_get_next(struct seq_file *seq, struct net_device *dev) +{ + do { + dev = dev->next; + } while(dev && !dev->dn_ptr); + + return dev; +} + +static struct net_device *dn_dev_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net_device *dev; + + dev = dev_base; + if (dev && !dev->dn_ptr) + dev = dn_dev_get_next(seq, dev); + if (pos) { + while(dev && (dev = dn_dev_get_next(seq, dev))) + --pos; + } + return dev; +} + +static void *dn_dev_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos) { + struct net_device *dev; + read_lock(&dev_base_lock); + dev = dn_dev_get_idx(seq, *pos - 1); + if (dev == NULL) + read_unlock(&dev_base_lock); + return dev; + } + return SEQ_START_TOKEN; +} + +static void *dn_dev_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct net_device *dev = v; + loff_t one = 1; + + if (v == SEQ_START_TOKEN) { + dev = dn_dev_seq_start(seq, &one); + } else { + dev = dn_dev_get_next(seq, dev); + if (dev == NULL) + read_unlock(&dev_base_lock); + } + ++*pos; + return dev; +} + +static void dn_dev_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + read_unlock(&dev_base_lock); +} + +static char *dn_type2asc(char type) +{ + switch(type) { + case DN_DEV_BCAST: + return "B"; + case DN_DEV_UCAST: + return "U"; + case DN_DEV_MPOINT: + return "M"; + } + + return "?"; +} + +static int dn_dev_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Name Flags T1 Timer1 T3 Timer3 BlkSize Pri State DevType Router Peer\n"); + else { + struct net_device *dev = v; + char peer_buf[DN_ASCBUF_LEN]; + char router_buf[DN_ASCBUF_LEN]; + struct dn_dev *dn_db = dev->dn_ptr; + + seq_printf(seq, "%-8s %1s %04u %04u %04lu %04lu" + " %04hu %03d %02x %-10s %-7s 
%-7s\n", + dev->name ? dev->name : "???", + dn_type2asc(dn_db->parms.mode), + 0, 0, + dn_db->t3, dn_db->parms.t3, + mtu2blksize(dev), + dn_db->parms.priority, + dn_db->parms.state, dn_db->parms.name, + dn_db->router ? dn_addr2asc(dn_ntohs(*(dn_address *)dn_db->router->primary_key), router_buf) : "", + dn_db->peer ? dn_addr2asc(dn_ntohs(*(dn_address *)dn_db->peer->primary_key), peer_buf) : ""); + } + return 0; +} + +static struct seq_operations dn_dev_seq_ops = { + .start = dn_dev_seq_start, + .next = dn_dev_seq_next, + .stop = dn_dev_seq_stop, + .show = dn_dev_seq_show, +}; + +static int dn_dev_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &dn_dev_seq_ops); +} + +static struct file_operations dn_dev_seq_fops = { + .owner = THIS_MODULE, + .open = dn_dev_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static struct rtnetlink_link dnet_rtnetlink_table[RTM_MAX-RTM_BASE+1] = +{ + [4] = { .doit = dn_dev_rtm_newaddr, }, + [5] = { .doit = dn_dev_rtm_deladdr, }, + [6] = { .dumpit = dn_dev_dump_ifaddr, }, + +#ifdef CONFIG_DECNET_ROUTER + [8] = { .doit = dn_fib_rtm_newroute, }, + [9] = { .doit = dn_fib_rtm_delroute, }, + [10] = { .doit = dn_cache_getroute, .dumpit = dn_fib_dump, }, + [16] = { .doit = dn_fib_rtm_newrule, }, + [17] = { .doit = dn_fib_rtm_delrule, }, + [18] = { .dumpit = dn_fib_dump_rules, }, +#else + [10] = { .doit = dn_cache_getroute, .dumpit = dn_cache_dump, }, +#endif + +}; + +static int __initdata addr[2]; +module_param_array(addr, int, NULL, 0444); +MODULE_PARM_DESC(addr, "The DECnet address of this machine: area,node"); + +void __init dn_dev_init(void) +{ + if (addr[0] > 63 || addr[0] < 0) { + printk(KERN_ERR "DECnet: Area must be between 0 and 63"); + return; + } + + if (addr[1] > 1023 || addr[1] < 0) { + printk(KERN_ERR "DECnet: Node must be between 0 and 1023"); + return; + } + + decnet_address = dn_htons((addr[0] << 10) | addr[1]); + + dn_dev_devices_on(); + + rtnetlink_links[PF_DECnet] = dnet_rtnetlink_table; + + proc_net_fops_create("decnet_dev", S_IRUGO, &dn_dev_seq_fops); + +#ifdef CONFIG_SYSCTL + { + int i; + for(i = 0; i < DN_DEV_LIST_SIZE; i++) + dn_dev_sysctl_register(NULL, &dn_dev_list[i]); + } +#endif /* CONFIG_SYSCTL */ +} + +void __exit dn_dev_cleanup(void) +{ + rtnetlink_links[PF_DECnet] = NULL; + +#ifdef CONFIG_SYSCTL + { + int i; + for(i = 0; i < DN_DEV_LIST_SIZE; i++) + dn_dev_sysctl_unregister(&dn_dev_list[i]); + } +#endif /* CONFIG_SYSCTL */ + + proc_net_remove("decnet_dev"); + + dn_dev_devices_off(); +} diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c new file mode 100644 index 000000000000..9934b25720e4 --- /dev/null +++ b/net/decnet/dn_fib.c @@ -0,0 +1,802 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Routing Forwarding Information Base (Glue/Info List) + * + * Author: Steve Whitehouse + * + * + * Changes: + * Alexey Kuznetsov : SMP locking changes + * Steve Whitehouse : Rewrote it... Well to be more correct, I + * copied most of it from the ipv4 fib code. + * Steve Whitehouse : Updated it in style and fixed a few bugs + * which were fixed in the ipv4 code since + * this code was copied from it. 
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RT_MIN_TABLE 1 + +#define for_fib_info() { struct dn_fib_info *fi;\ + for(fi = dn_fib_info_list; fi; fi = fi->fib_next) +#define endfor_fib_info() } + +#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ + for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define change_nexthops(fi) { int nhsel; struct dn_fib_nh *nh;\ + for(nhsel = 0, nh = (struct dn_fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define endfor_nexthops(fi) } + +extern int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb); + +static DEFINE_SPINLOCK(dn_fib_multipath_lock); +static struct dn_fib_info *dn_fib_info_list; +static DEFINE_RWLOCK(dn_fib_info_lock); + +static struct +{ + int error; + u8 scope; +} dn_fib_props[RTA_MAX+1] = { + [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, + [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE }, + [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST }, + [RTN_BROADCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, + [RTN_ANYCAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, + [RTN_MULTICAST] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, + [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE }, + [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE }, + [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE }, + [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE }, + [RTN_NAT] = { .error = 0, .scope = RT_SCOPE_NOWHERE }, + [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE }, +}; + +void dn_fib_free_info(struct dn_fib_info *fi) +{ + if (fi->fib_dead == 0) { + printk(KERN_DEBUG "DECnet: BUG! 
Attempt to free alive dn_fib_info\n"); + return; + } + + change_nexthops(fi) { + if (nh->nh_dev) + dev_put(nh->nh_dev); + nh->nh_dev = NULL; + } endfor_nexthops(fi); + kfree(fi); +} + +void dn_fib_release_info(struct dn_fib_info *fi) +{ + write_lock(&dn_fib_info_lock); + if (fi && --fi->fib_treeref == 0) { + if (fi->fib_next) + fi->fib_next->fib_prev = fi->fib_prev; + if (fi->fib_prev) + fi->fib_prev->fib_next = fi->fib_next; + if (fi == dn_fib_info_list) + dn_fib_info_list = fi->fib_next; + fi->fib_dead = 1; + dn_fib_info_put(fi); + } + write_unlock(&dn_fib_info_lock); +} + +static inline int dn_fib_nh_comp(const struct dn_fib_info *fi, const struct dn_fib_info *ofi) +{ + const struct dn_fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) { + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || + nh->nh_weight != onh->nh_weight || + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +static inline struct dn_fib_info *dn_fib_find_info(const struct dn_fib_info *nfi) +{ + for_fib_info() { + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && + memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || dn_fib_nh_comp(fi, nfi) == 0)) + return fi; + } endfor_fib_info(); + return NULL; +} + +u16 dn_fib_get_attr16(struct rtattr *attr, int attrlen, int type) +{ + while(RTA_OK(attr,attrlen)) { + if (attr->rta_type == type) + return *(u16*)RTA_DATA(attr); + attr = RTA_NEXT(attr, attrlen); + } + + return 0; +} + +static int dn_fib_count_nhs(struct rtattr *rta) +{ + int nhs = 0; + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + while(nhlen >= (int)sizeof(struct rtnexthop)) { + if ((nhlen -= nhp->rtnh_len) < 0) + return 0; + nhs++; + nhp = RTNH_NEXT(nhp); + } + + return nhs; +} + +static int dn_fib_get_nhs(struct dn_fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + change_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + + nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; + nh->nh_oif = nhp->rtnh_ifindex; + nh->nh_weight = nhp->rtnh_hops + 1; + + if (attrlen) { + nh->nh_gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + + return 0; +} + + +static int dn_fib_check_nh(const struct rtmsg *r, struct dn_fib_info *fi, struct dn_fib_nh *nh) +{ + int err; + + if (nh->nh_gw) { + struct flowi fl; + struct dn_fib_res res; + + memset(&fl, 0, sizeof(fl)); + + if (nh->nh_flags&RTNH_F_ONLINK) { + struct net_device *dev; + + if (r->rtm_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (dnet_addr_type(nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(dev); + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + + memset(&fl, 0, sizeof(fl)); + fl.fld_dst = nh->nh_gw; + fl.oif = nh->nh_oif; + fl.fld_scope = r->rtm_scope + 1; + + if (fl.fld_scope < RT_SCOPE_LINK) + fl.fld_scope = RT_SCOPE_LINK; + + if ((err = dn_fib_lookup(&fl, &res)) != 0) + return err; + + err = -EINVAL; + if (res.type != RTN_UNICAST 
&& res.type != RTN_LOCAL) + goto out; + nh->nh_scope = res.scope; + nh->nh_oif = DN_FIB_RES_OIF(res); + nh->nh_dev = DN_FIB_RES_DEV(res); + if (nh->nh_dev == NULL) + goto out; + dev_hold(nh->nh_dev); + err = -ENETDOWN; + if (!(nh->nh_dev->flags & IFF_UP)) + goto out; + err = 0; +out: + dn_fib_res_put(&res); + return err; + } else { + struct net_device *dev; + + if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + return -EINVAL; + + dev = __dev_get_by_index(nh->nh_oif); + if (dev == NULL || dev->dn_ptr == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(nh->nh_dev); + nh->nh_scope = RT_SCOPE_HOST; + } + + return 0; +} + + +struct dn_fib_info *dn_fib_create_info(const struct rtmsg *r, struct dn_kern_rta *rta, const struct nlmsghdr *nlh, int *errp) +{ + int err; + struct dn_fib_info *fi = NULL; + struct dn_fib_info *ofi; + int nhs = 1; + + if (dn_fib_props[r->rtm_type].scope > r->rtm_scope) + goto err_inval; + + if (rta->rta_mp) { + nhs = dn_fib_count_nhs(rta->rta_mp); + if (nhs == 0) + goto err_inval; + } + + fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct dn_fib_nh), GFP_KERNEL); + err = -ENOBUFS; + if (fi == NULL) + goto failure; + memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct dn_fib_nh)); + + fi->fib_protocol = r->rtm_protocol; + fi->fib_nhs = nhs; + fi->fib_flags = r->rtm_flags; + if (rta->rta_priority) + fi->fib_priority = *rta->rta_priority; + if (rta->rta_mx) { + int attrlen = RTA_PAYLOAD(rta->rta_mx); + struct rtattr *attr = RTA_DATA(rta->rta_mx); + + while(RTA_OK(attr, attrlen)) { + unsigned flavour = attr->rta_type; + if (flavour) { + if (flavour > RTAX_MAX) + goto err_inval; + fi->fib_metrics[flavour-1] = *(unsigned*)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + if (rta->rta_prefsrc) + memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 2); + + if (rta->rta_mp) { + if ((err = dn_fib_get_nhs(fi, rta->rta_mp, r)) != 0) + goto failure; + if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + goto err_inval; + if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 2)) + goto err_inval; + } else { + struct dn_fib_nh *nh = fi->fib_nh; + if (rta->rta_oif) + nh->nh_oif = *rta->rta_oif; + if (rta->rta_gw) + memcpy(&nh->nh_gw, rta->rta_gw, 2); + nh->nh_flags = r->rtm_flags; + nh->nh_weight = 1; + } + + if (r->rtm_type == RTN_NAT) { + if (rta->rta_gw == NULL || nhs != 1 || rta->rta_oif) + goto err_inval; + memcpy(&fi->fib_nh->nh_gw, rta->rta_gw, 2); + goto link_it; + } + + if (dn_fib_props[r->rtm_type].error) { + if (rta->rta_gw || rta->rta_oif || rta->rta_mp) + goto err_inval; + goto link_it; + } + + if (r->rtm_scope > RT_SCOPE_HOST) + goto err_inval; + + if (r->rtm_scope == RT_SCOPE_HOST) { + struct dn_fib_nh *nh = fi->fib_nh; + + /* Local address is added */ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + if ((err = dn_fib_check_nh(r, fi, nh)) != 0) + goto failure; + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || + memcmp(&fi->fib_prefsrc, rta->rta_dst, 2)) + if (dnet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + goto err_inval; + } + +link_it: + if ((ofi = dn_fib_find_info(fi)) != NULL) { + fi->fib_dead = 1; + dn_fib_free_info(fi); + ofi->fib_treeref++; + return ofi; + } + + fi->fib_treeref++; + atomic_inc(&fi->fib_clntref); + write_lock(&dn_fib_info_lock); + fi->fib_next = 
dn_fib_info_list; + fi->fib_prev = NULL; + if (dn_fib_info_list) + dn_fib_info_list->fib_prev = fi; + dn_fib_info_list = fi; + write_unlock(&dn_fib_info_lock); + return fi; + +err_inval: + err = -EINVAL; + +failure: + *errp = err; + if (fi) { + fi->fib_dead = 1; + dn_fib_free_info(fi); + } + + return NULL; +} + +int dn_fib_semantic_match(int type, struct dn_fib_info *fi, const struct flowi *fl, struct dn_fib_res *res) +{ + int err = dn_fib_props[type].error; + + if (err == 0) { + if (fi->fib_flags & RTNH_F_DEAD) + return 1; + + res->fi = fi; + + switch(type) { + case RTN_NAT: + DN_FIB_RES_RESET(*res); + atomic_inc(&fi->fib_clntref); + return 0; + case RTN_UNICAST: + case RTN_LOCAL: + for_nexthops(fi) { + if (nh->nh_flags & RTNH_F_DEAD) + continue; + if (!fl->oif || fl->oif == nh->nh_oif) + break; + } + if (nhsel < fi->fib_nhs) { + res->nh_sel = nhsel; + atomic_inc(&fi->fib_clntref); + return 0; + } + endfor_nexthops(fi); + res->fi = NULL; + return 1; + default: + if (net_ratelimit()) + printk("DECnet: impossible routing event : dn_fib_semantic_match type=%d\n", type); + res->fi = NULL; + return -EINVAL; + } + } + return err; +} + +void dn_fib_select_multipath(const struct flowi *fl, struct dn_fib_res *res) +{ + struct dn_fib_info *fi = res->fi; + int w; + + spin_lock_bh(&dn_fib_multipath_lock); + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; + if (power < 0) { + spin_unlock_bh(&dn_fib_multipath_lock); + res->nh_sel = 0; + return; + } + } + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + spin_unlock_bh(&dn_fib_multipath_lock); + return; + } + } + } endfor_nexthops(fi); + res->nh_sel = 0; + spin_unlock_bh(&dn_fib_multipath_lock); +} + + +static int dn_fib_check_attr(struct rtmsg *r, struct rtattr **rta) +{ + int i; + + for(i = 1; i <= RTA_MAX; i++) { + struct rtattr *attr = rta[i-1]; + if (attr) { + if (RTA_PAYLOAD(attr) < 4 && RTA_PAYLOAD(attr) != 2) + return -EINVAL; + if (i != RTA_MULTIPATH && i != RTA_METRICS) + rta[i-1] = (struct rtattr *)RTA_DATA(attr); + } + } + + return 0; +} + +int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct dn_fib_table *tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (dn_fib_check_attr(r, rta)) + return -EINVAL; + + tb = dn_fib_get_table(r->rtm_table, 0); + if (tb) + return tb->delete(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb)); + + return -ESRCH; +} + +int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct dn_fib_table *tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (dn_fib_check_attr(r, rta)) + return -EINVAL; + + tb = dn_fib_get_table(r->rtm_table, 1); + if (tb) + return tb->insert(tb, r, (struct dn_kern_rta *)rta, nlh, &NETLINK_CB(skb)); + + return -ENOBUFS; +} + + +int dn_fib_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct dn_fib_table *tb; + + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + ((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return dn_cache_dump(skb, cb); + + s_t = cb->args[0]; + if (s_t == 0) + s_t = cb->args[0] = RT_MIN_TABLE; + + for(t = s_t; t <= RT_TABLE_MAX; t++) { + if (t < s_t) + continue; + if (t > s_t) 
+ memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(int)); + tb = dn_fib_get_table(t, 0); + if (tb == NULL) + continue; + if (tb->dump(tb, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +static void fib_magic(int cmd, int type, __u16 dst, int dst_len, struct dn_ifaddr *ifa) +{ + struct dn_fib_table *tb; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct dn_kern_rta rta; + + memset(&req.rtm, 0, sizeof(req.rtm)); + memset(&rta, 0, sizeof(rta)); + + if (type == RTN_UNICAST) + tb = dn_fib_get_table(RT_MIN_TABLE, 1); + else + tb = dn_fib_get_table(RT_TABLE_LOCAL, 1); + + if (tb == NULL) + return; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = cmd; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 0; + + req.rtm.rtm_dst_len = dst_len; + req.rtm.rtm_table = tb->n; + req.rtm.rtm_protocol = RTPROT_KERNEL; + req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); + req.rtm.rtm_type = type; + + rta.rta_dst = &dst; + rta.rta_prefsrc = &ifa->ifa_local; + rta.rta_oif = &ifa->ifa_dev->dev->ifindex; + + if (cmd == RTM_NEWROUTE) + tb->insert(tb, &req.rtm, &rta, &req.nlh, NULL); + else + tb->delete(tb, &req.rtm, &rta, &req.nlh, NULL); +} + +static void dn_fib_add_ifaddr(struct dn_ifaddr *ifa) +{ + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); + +#if 0 + if (!(dev->flags&IFF_UP)) + return; + /* In the future, we will want to add default routes here */ + +#endif +} + +static void dn_fib_del_ifaddr(struct dn_ifaddr *ifa) +{ + int found_it = 0; + struct net_device *dev; + struct dn_dev *dn_db; + struct dn_ifaddr *ifa2; + + ASSERT_RTNL(); + + /* Scan device list */ + read_lock(&dev_base_lock); + for(dev = dev_base; dev; dev = dev->next) { + dn_db = dev->dn_ptr; + if (dn_db == NULL) + continue; + for(ifa2 = dn_db->ifa_list; ifa2; ifa2 = ifa2->ifa_next) { + if (ifa2->ifa_local == ifa->ifa_local) { + found_it = 1; + break; + } + } + } + read_unlock(&dev_base_lock); + + if (found_it == 0) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 16, ifa); + + if (dnet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + if (dn_fib_sync_down(ifa->ifa_local, NULL, 0)) + dn_fib_flush(); + } + } +} + +static void dn_fib_disable_addr(struct net_device *dev, int force) +{ + if (dn_fib_sync_down(0, dev, force)) + dn_fib_flush(); + dn_rt_cache_flush(0); + neigh_ifdown(&dn_neigh_table, dev); +} + +static int dn_fib_dnaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct dn_ifaddr *ifa = (struct dn_ifaddr *)ptr; + + switch(event) { + case NETDEV_UP: + dn_fib_add_ifaddr(ifa); + dn_fib_sync_up(ifa->ifa_dev->dev); + dn_rt_cache_flush(-1); + break; + case NETDEV_DOWN: + dn_fib_del_ifaddr(ifa); + if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + dn_fib_disable_addr(ifa->ifa_dev->dev, 1); + } else { + dn_rt_cache_flush(-1); + } + break; + } + return NOTIFY_DONE; +} + +int dn_fib_sync_down(dn_address local, struct net_device *dev, int force) +{ + int ret = 0; + int scope = RT_SCOPE_NOWHERE; + + if (force) + scope = -1; + + for_fib_info() { + /* + * This makes no sense for DECnet.... we will almost + * certainly have more than one local address the same + * over all our interfaces. It needs thinking about + * some more. 
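dn_fib_select_multipath() above implements a weighted round-robin: every live next hop is credited with "power" equal to its weight, a value derived from jiffies picks a hop, and the chosen hop's credit is decremented so that hops are used in proportion to their weights before the credits are replenished. The standalone sketch below mirrors that logic outside the kernel; the structure and function names are illustrative, and a pseudo-random input stands in for jiffies.

#include <stdio.h>

struct nh { int weight; int power; int dead; };

static int select_nexthop(struct nh *nh, int nhs, int *fib_power, unsigned rnd)
{
	int w, i;

	if (*fib_power <= 0) {
		int power = 0;
		for (i = 0; i < nhs; i++) {
			if (!nh[i].dead) {
				power += nh[i].weight;
				nh[i].power = nh[i].weight;
			}
		}
		*fib_power = power;
		if (power <= 0)
			return 0;	/* nothing alive: fall back to hop 0 */
	}

	w = rnd % *fib_power;
	for (i = 0; i < nhs; i++) {
		if (!nh[i].dead && nh[i].power) {
			if ((w -= nh[i].power) <= 0) {
				nh[i].power--;	/* spend one credit on this hop */
				(*fib_power)--;
				return i;
			}
		}
	}
	return 0;
}

int main(void)
{
	struct nh nh[2] = { { 3, 0, 0 }, { 1, 0, 0 } };
	int power = 0, hits[2] = { 0, 0 }, i;

	for (i = 0; i < 4000; i++)
		hits[select_nexthop(nh, 2, &power, (unsigned)i * 2654435761u)]++;
	printf("hop0=%d hop1=%d (expect roughly 3:1)\n", hits[0], hits[1]);
	return 0;
}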
+ */ + if (local && fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } else if (dev && fi->fib_nhs) { + int dead = 0; + + change_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + dead++; + else if (nh->nh_dev == dev && + nh->nh_scope != scope) { + spin_lock_bh(&dn_fib_multipath_lock); + nh->nh_flags |= RTNH_F_DEAD; + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; + spin_unlock_bh(&dn_fib_multipath_lock); + dead++; + } + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } endfor_fib_info(); + return ret; +} + + +int dn_fib_sync_up(struct net_device *dev) +{ + int ret = 0; + + if (!(dev->flags&IFF_UP)) + return 0; + + for_fib_info() { + int alive = 0; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + alive++; + continue; + } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || dev->dn_ptr == NULL) + continue; + alive++; + spin_lock_bh(&dn_fib_multipath_lock); + nh->nh_power = 0; + nh->nh_flags &= ~RTNH_F_DEAD; + spin_unlock_bh(&dn_fib_multipath_lock); + } endfor_nexthops(fi); + + if (alive > 0) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } endfor_fib_info(); + return ret; +} + +void dn_fib_flush(void) +{ + int flushed = 0; + struct dn_fib_table *tb; + int id; + + for(id = RT_TABLE_MAX; id > 0; id--) { + if ((tb = dn_fib_get_table(id, 0)) == NULL) + continue; + flushed += tb->flush(tb); + } + + if (flushed) + dn_rt_cache_flush(-1); +} + +static struct notifier_block dn_fib_dnaddr_notifier = { + .notifier_call = dn_fib_dnaddr_event, +}; + +void __exit dn_fib_cleanup(void) +{ + dn_fib_table_cleanup(); + dn_fib_rules_cleanup(); + + unregister_dnaddr_notifier(&dn_fib_dnaddr_notifier); +} + + +void __init dn_fib_init(void) +{ + + dn_fib_table_init(); + dn_fib_rules_init(); + + register_dnaddr_notifier(&dn_fib_dnaddr_notifier); +} + + diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c new file mode 100644 index 000000000000..f6dfe96f45b7 --- /dev/null +++ b/net/decnet/dn_neigh.c @@ -0,0 +1,627 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Neighbour Functions (Adjacency Database and + * On-Ethernet Cache) + * + * Author: Steve Whitehouse + * + * + * Changes: + * Steve Whitehouse : Fixed router listing routine + * Steve Whitehouse : Added error_report functions + * Steve Whitehouse : Added default router detection + * Steve Whitehouse : Hop counts in outgoing messages + * Steve Whitehouse : Fixed src/dst in outgoing messages so + * forwarding now stands a good chance of + * working. + * Steve Whitehouse : Fixed neighbour states (for now anyway). + * Steve Whitehouse : Made error_report functions dummies. This + * is not the right place to return skbs. 
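The for_nexthops()/change_nexthops() macros defined near the top of dn_fib.c open a brace and a for loop, which is why every user such as dn_fib_sync_down() and dn_fib_sync_up() pairs them with endfor_nexthops() to close the scope again. The self-contained example below reproduces the macro shape with simplified stand-in structures, just to make the hidden block visible.

#include <stdio.h>

/* Simplified stand-ins for the kernel structures. */
struct dn_fib_nh { int nh_weight; };
struct dn_fib_info { int fib_nhs; struct dn_fib_nh fib_nh[2]; };

#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh; \
	for (nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
#define endfor_nexthops(fi) }

int main(void)
{
	struct dn_fib_info info = { 2, { { 1 }, { 3 } } };
	struct dn_fib_info *fi = &info;

	/* The macro opens a block and a for loop; endfor_nexthops() closes
	 * the block, so nh and nhsel only exist between the two. */
	for_nexthops(fi) {
		printf("nexthop %d has weight %d\n", nhsel, nh->nh_weight);
	} endfor_nexthops(fi);
	return 0;
}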
+ * Steve Whitehouse : Convert to seq_file + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev); +static int dn_neigh_construct(struct neighbour *); +static void dn_long_error_report(struct neighbour *, struct sk_buff *); +static void dn_short_error_report(struct neighbour *, struct sk_buff *); +static int dn_long_output(struct sk_buff *); +static int dn_short_output(struct sk_buff *); +static int dn_phase3_output(struct sk_buff *); + + +/* + * For talking to broadcast devices: Ethernet & PPP + */ +static struct neigh_ops dn_long_ops = { + .family = AF_DECnet, + .error_report = dn_long_error_report, + .output = dn_long_output, + .connected_output = dn_long_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +/* + * For talking to pointopoint and multidrop devices: DDCMP and X.25 + */ +static struct neigh_ops dn_short_ops = { + .family = AF_DECnet, + .error_report = dn_short_error_report, + .output = dn_short_output, + .connected_output = dn_short_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +/* + * For talking to DECnet phase III nodes + */ +static struct neigh_ops dn_phase3_ops = { + .family = AF_DECnet, + .error_report = dn_short_error_report, /* Can use short version here */ + .output = dn_phase3_output, + .connected_output = dn_phase3_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit +}; + +struct neigh_table dn_neigh_table = { + .family = PF_DECnet, + .entry_size = sizeof(struct dn_neigh), + .key_len = sizeof(dn_address), + .hash = dn_neigh_hash, + .constructor = dn_neigh_construct, + .id = "dn_neigh_cache", + .parms ={ + .tbl = &dn_neigh_table, + .entries = 0, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 0, + .app_probes = 0, + .mcast_probes = 0, + .anycast_delay = 0, + .proxy_delay = 0, + .proxy_qlen = 0, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev) +{ + return jhash_2words(*(dn_address *)pkey, 0, dn_neigh_table.hash_rnd); +} + +static int dn_neigh_construct(struct neighbour *neigh) +{ + struct net_device *dev = neigh->dev; + struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_dev *dn_db; + struct neigh_parms *parms; + + rcu_read_lock(); + dn_db = rcu_dereference(dev->dn_ptr); + if (dn_db == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = dn_db->neigh_parms; + if (!parms) { + rcu_read_unlock(); + return -EINVAL; + } + + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + if (dn_db->use_long) + neigh->ops = &dn_long_ops; + else + neigh->ops = &dn_short_ops; + + if (dn->flags & DN_NDFLAG_P3) + neigh->ops = &dn_phase3_ops; + + neigh->nud_state = NUD_NOARP; + neigh->output = neigh->ops->connected_output; + + if ((dev->type == ARPHRD_IPGRE) || (dev->flags & IFF_POINTOPOINT)) + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + else if ((dev->type == ARPHRD_ETHER) || (dev->type == ARPHRD_LOOPBACK)) + dn_dn2eth(neigh->ha, dn->addr); + else { + if (net_ratelimit()) + printk(KERN_DEBUG "Trying to create neigh 
for hw %d\n", dev->type); + return -EINVAL; + } + + /* + * Make an estimate of the remote block size by assuming that its + * two less then the device mtu, which it true for ethernet (and + * other things which support long format headers) since there is + * an extra length field (of 16 bits) which isn't part of the + * ethernet headers and which the DECnet specs won't admit is part + * of the DECnet routing headers either. + * + * If we over estimate here its no big deal, the NSP negotiations + * will prevent us from sending packets which are too large for the + * remote node to handle. In any case this figure is normally updated + * by a hello message in most cases. + */ + dn->blksize = dev->mtu - 2; + + return 0; +} + +static void dn_long_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + printk(KERN_DEBUG "dn_long_error_report: called\n"); + kfree_skb(skb); +} + + +static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + printk(KERN_DEBUG "dn_short_error_report: called\n"); + kfree_skb(skb); +} + +static int dn_neigh_output_packet(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct dn_route *rt = (struct dn_route *)dst; + struct neighbour *neigh = dst->neighbour; + struct net_device *dev = neigh->dev; + char mac_addr[ETH_ALEN]; + + dn_dn2eth(mac_addr, rt->rt_local_src); + if (!dev->hard_header || dev->hard_header(skb, dev, ntohs(skb->protocol), neigh->ha, mac_addr, skb->len) >= 0) + return neigh->ops->queue_xmit(skb); + + if (net_ratelimit()) + printk(KERN_DEBUG "dn_neigh_output_packet: oops, can't send packet\n"); + + kfree_skb(skb); + return -EINVAL; +} + +static int dn_long_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct net_device *dev = neigh->dev; + int headroom = dev->hard_header_len + sizeof(struct dn_long_packet) + 3; + unsigned char *data; + struct dn_long_packet *lp; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + + if (skb_headroom(skb) < headroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); + if (skb2 == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "dn_long_output: no memory\n"); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + if (net_ratelimit()) + printk(KERN_INFO "dn_long_output: Increasing headroom\n"); + } + + data = skb_push(skb, sizeof(struct dn_long_packet) + 3); + lp = (struct dn_long_packet *)(data+3); + + *((unsigned short *)data) = dn_htons(skb->len - 2); + *(data + 2) = 1 | DN_RT_F_PF; /* Padding */ + + lp->msgflg = DN_RT_PKT_LONG|(cb->rt_flags&(DN_RT_F_IE|DN_RT_F_RQR|DN_RT_F_RTS)); + lp->d_area = lp->d_subarea = 0; + dn_dn2eth(lp->d_id, dn_ntohs(cb->dst)); + lp->s_area = lp->s_subarea = 0; + dn_dn2eth(lp->s_id, dn_ntohs(cb->src)); + lp->nl2 = 0; + lp->visit_ct = cb->hops & 0x3f; + lp->s_class = 0; + lp->pt = 0; + + skb->nh.raw = skb->data; + + return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet); +} + +static int dn_short_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct net_device *dev = neigh->dev; + int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; + struct dn_short_packet *sp; + unsigned char *data; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + + if (skb_headroom(skb) < headroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); + if (skb2 == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "dn_short_output: no memory\n"); + 
kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + if (net_ratelimit()) + printk(KERN_INFO "dn_short_output: Increasing headroom\n"); + } + + data = skb_push(skb, sizeof(struct dn_short_packet) + 2); + *((unsigned short *)data) = dn_htons(skb->len - 2); + sp = (struct dn_short_packet *)(data+2); + + sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); + sp->dstnode = cb->dst; + sp->srcnode = cb->src; + sp->forward = cb->hops & 0x3f; + + skb->nh.raw = skb->data; + + return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet); +} + +/* + * Phase 3 output is the same is short output, execpt that + * it clears the area bits before transmission. + */ +static int dn_phase3_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct neighbour *neigh = dst->neighbour; + struct net_device *dev = neigh->dev; + int headroom = dev->hard_header_len + sizeof(struct dn_short_packet) + 2; + struct dn_short_packet *sp; + unsigned char *data; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + if (skb_headroom(skb) < headroom) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, headroom); + if (skb2 == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "dn_phase3_output: no memory\n"); + kfree_skb(skb); + return -ENOBUFS; + } + kfree_skb(skb); + skb = skb2; + if (net_ratelimit()) + printk(KERN_INFO "dn_phase3_output: Increasing headroom\n"); + } + + data = skb_push(skb, sizeof(struct dn_short_packet) + 2); + *((unsigned short *)data) = dn_htons(skb->len - 2); + sp = (struct dn_short_packet *)(data + 2); + + sp->msgflg = DN_RT_PKT_SHORT|(cb->rt_flags&(DN_RT_F_RQR|DN_RT_F_RTS)); + sp->dstnode = cb->dst & dn_htons(0x03ff); + sp->srcnode = cb->src & dn_htons(0x03ff); + sp->forward = cb->hops & 0x3f; + + skb->nh.raw = skb->data; + + return NF_HOOK(PF_DECnet, NF_DN_POST_ROUTING, skb, NULL, neigh->dev, dn_neigh_output_packet); +} + +/* + * Unfortunately, the neighbour code uses the device in its hash + * function, so we don't get any advantage from it. This function + * basically does a neigh_lookup(), but without comparing the device + * field. 
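dn_short_output(), dn_long_output() and dn_phase3_output() above all build the routing-layer framing by hand: a 16-bit length word covering everything after itself (skb->len - 2, stored via dn_htons), followed by the short or long routing header, with the long format inserting a padding byte (1 | DN_RT_F_PF) ahead of its header. The sketch below frames a payload the short way in userspace; the DN_RT_PKT_SHORT value and the little-endian byte order are assumptions of this sketch, and the node addresses are kept in host order rather than wire order.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define DN_RT_PKT_SHORT 0x02	/* illustrative value only */

struct dn_short_hdr {
	uint8_t  msgflg;	/* packet type and routing flags */
	uint16_t dstnode;
	uint16_t srcnode;
	uint8_t  forward;	/* visit/hop count */
} __attribute__((packed));

static size_t dn_frame_short(uint8_t *buf, size_t buflen,
			     uint16_t dst, uint16_t src,
			     const uint8_t *payload, size_t plen)
{
	size_t total = 2 + sizeof(struct dn_short_hdr) + plen;
	struct dn_short_hdr h = { DN_RT_PKT_SHORT, dst, src, 0 };

	if (total > buflen)
		return 0;
	/* Length word counts everything after itself (skb->len - 2). */
	buf[0] = (total - 2) & 0xff;
	buf[1] = (total - 2) >> 8;
	memcpy(buf + 2, &h, sizeof(h));
	memcpy(buf + 2 + sizeof(h), payload, plen);
	return total;
}

int main(void)
{
	uint8_t frame[64];
	const uint8_t nsp[] = { 0x04, 0x00 };	/* arbitrary NSP payload */
	size_t n = dn_frame_short(frame, sizeof(frame), 0x0464, 0x0465,
				  nsp, sizeof(nsp));

	printf("framed %zu bytes, length word = %u\n",
	       n, (unsigned)(frame[0] | (frame[1] << 8)));
	return 0;
}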
This is required for the On-Ethernet cache + */ + +/* + * Pointopoint link receives a hello message + */ +void dn_neigh_pointopoint_hello(struct sk_buff *skb) +{ + kfree_skb(skb); +} + +/* + * Ethernet router hello message received + */ +int dn_neigh_router_hello(struct sk_buff *skb) +{ + struct rtnode_hello_message *msg = (struct rtnode_hello_message *)skb->data; + + struct neighbour *neigh; + struct dn_neigh *dn; + struct dn_dev *dn_db; + dn_address src; + + src = dn_htons(dn_eth2dn(msg->id)); + + neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); + + dn = (struct dn_neigh *)neigh; + + if (neigh) { + write_lock(&neigh->lock); + + neigh->used = jiffies; + dn_db = (struct dn_dev *)neigh->dev->dn_ptr; + + if (!(neigh->nud_state & NUD_PERMANENT)) { + neigh->updated = jiffies; + + if (neigh->dev->type == ARPHRD_ETHER) + memcpy(neigh->ha, ð_hdr(skb)->h_source, ETH_ALEN); + + dn->blksize = dn_ntohs(msg->blksize); + dn->priority = msg->priority; + + dn->flags &= ~DN_NDFLAG_P3; + + switch(msg->iinfo & DN_RT_INFO_TYPE) { + case DN_RT_INFO_L1RT: + dn->flags &=~DN_NDFLAG_R2; + dn->flags |= DN_NDFLAG_R1; + break; + case DN_RT_INFO_L2RT: + dn->flags |= DN_NDFLAG_R2; + } + } + + if (!dn_db->router) { + dn_db->router = neigh_clone(neigh); + } else { + if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + } + write_unlock(&neigh->lock); + neigh_release(neigh); + } + + kfree_skb(skb); + return 0; +} + +/* + * Endnode hello message received + */ +int dn_neigh_endnode_hello(struct sk_buff *skb) +{ + struct endnode_hello_message *msg = (struct endnode_hello_message *)skb->data; + struct neighbour *neigh; + struct dn_neigh *dn; + dn_address src; + + src = dn_htons(dn_eth2dn(msg->id)); + + neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); + + dn = (struct dn_neigh *)neigh; + + if (neigh) { + write_lock(&neigh->lock); + + neigh->used = jiffies; + + if (!(neigh->nud_state & NUD_PERMANENT)) { + neigh->updated = jiffies; + + if (neigh->dev->type == ARPHRD_ETHER) + memcpy(neigh->ha, ð_hdr(skb)->h_source, ETH_ALEN); + dn->flags &= ~(DN_NDFLAG_R1 | DN_NDFLAG_R2); + dn->blksize = dn_ntohs(msg->blksize); + dn->priority = 0; + } + + write_unlock(&neigh->lock); + neigh_release(neigh); + } + + kfree_skb(skb); + return 0; +} + +static char *dn_find_slot(char *base, int max, int priority) +{ + int i; + unsigned char *min = NULL; + + base += 6; /* skip first id */ + + for(i = 0; i < max; i++) { + if (!min || (*base < *min)) + min = base; + base += 7; /* find next priority */ + } + + if (!min) + return NULL; + + return (*min < priority) ? (min - 6) : NULL; +} + +struct elist_cb_state { + struct net_device *dev; + unsigned char *ptr; + unsigned char *rs; + int t, n; +}; + +static void neigh_elist_cb(struct neighbour *neigh, void *_info) +{ + struct elist_cb_state *s = _info; + struct dn_dev *dn_db; + struct dn_neigh *dn; + + if (neigh->dev != s->dev) + return; + + dn = (struct dn_neigh *) neigh; + if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) + return; + + dn_db = (struct dn_dev *) s->dev->dn_ptr; + if (dn_db->parms.forwarding == 1 && (dn->flags & DN_NDFLAG_R2)) + return; + + if (s->t == s->n) + s->rs = dn_find_slot(s->ptr, s->n, dn->priority); + else + s->t++; + if (s->rs == NULL) + return; + + dn_dn2eth(s->rs, dn->addr); + s->rs += 6; + *(s->rs) = neigh->nud_state & NUD_CONNECTED ? 
0x80 : 0x0; + *(s->rs) |= dn->priority; + s->rs++; +} + +int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n) +{ + struct elist_cb_state state; + + state.dev = dev; + state.t = 0; + state.n = n; + state.ptr = ptr; + state.rs = ptr; + + neigh_for_each(&dn_neigh_table, neigh_elist_cb, &state); + + return state.t; +} + + +#ifdef CONFIG_PROC_FS + +static inline void dn_neigh_format_entry(struct seq_file *seq, + struct neighbour *n) +{ + struct dn_neigh *dn = (struct dn_neigh *) n; + char buf[DN_ASCBUF_LEN]; + + read_lock(&n->lock); + seq_printf(seq, "%-7s %s%s%s %02x %02d %07ld %-8s\n", + dn_addr2asc(dn_ntohs(dn->addr), buf), + (dn->flags&DN_NDFLAG_R1) ? "1" : "-", + (dn->flags&DN_NDFLAG_R2) ? "2" : "-", + (dn->flags&DN_NDFLAG_P3) ? "3" : "-", + dn->n.nud_state, + atomic_read(&dn->n.refcnt), + dn->blksize, + (dn->n.dev) ? dn->n.dev->name : "?"); + read_unlock(&n->lock); +} + +static int dn_neigh_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Addr Flags State Use Blksize Dev\n"); + } else { + dn_neigh_format_entry(seq, v); + } + + return 0; +} + +static void *dn_neigh_seq_start(struct seq_file *seq, loff_t *pos) +{ + return neigh_seq_start(seq, pos, &dn_neigh_table, + NEIGH_SEQ_NEIGH_ONLY); +} + +static struct seq_operations dn_neigh_seq_ops = { + .start = dn_neigh_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = dn_neigh_seq_show, +}; + +static int dn_neigh_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + memset(s, 0, sizeof(*s)); + rc = seq_open(file, &dn_neigh_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations dn_neigh_seq_fops = { + .owner = THIS_MODULE, + .open = dn_neigh_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +void __init dn_neigh_init(void) +{ + neigh_table_init(&dn_neigh_table); + proc_net_fops_create("decnet_neigh", S_IRUGO, &dn_neigh_seq_fops); +} + +void __exit dn_neigh_cleanup(void) +{ + proc_net_remove("decnet_neigh"); + neigh_table_clear(&dn_neigh_table); +} diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c new file mode 100644 index 000000000000..202dbde9850d --- /dev/null +++ b/net/decnet/dn_nsp_in.c @@ -0,0 +1,934 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Network Services Protocol (Input) + * + * Author: Eduardo Marcelo Serrat + * + * Changes: + * + * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from + * original dn_nsp.c. + * Steve Whitehouse: Updated to work with my new routing architecture. + * Steve Whitehouse: Add changes from Eduardo Serrat's patches. + * Steve Whitehouse: Put all ack handling code in a common routine. + * Steve Whitehouse: Put other common bits into dn_nsp_rx() + * Steve Whitehouse: More checks on skb->len to catch bogus packets + * Fixed various race conditions and possible nasties. + * Steve Whitehouse: Now handles returned conninit frames. + * David S. Miller: New socket locking + * Steve Whitehouse: Fixed lockup when socket filtering was enabled. 
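dn_find_slot() and neigh_elist_cb() above maintain the router list sent in hello messages as packed 7-byte entries: a 6-byte node address written by dn_dn2eth(), then one byte carrying the router priority with 0x80 or'd in for connected neighbours. The standalone sketch below rebuilds that layout and the lowest-priority eviction search; the struct and function names are illustrative.

#include <stdio.h>
#include <stdint.h>

struct router_slot {
	uint8_t id[6];		/* node address, as produced by dn_dn2eth() */
	uint8_t prio_state;	/* bit 7: connected, low bits: priority     */
} __attribute__((packed));

static struct router_slot *find_slot(struct router_slot *list, int max,
				     uint8_t new_priority)
{
	struct router_slot *min = NULL;
	int i;

	for (i = 0; i < max; i++)
		if (!min || list[i].prio_state < min->prio_state)
			min = &list[i];

	/* As in dn_find_slot(), the comparison uses the whole byte, so the
	 * connected bit also weighs into which entry gets replaced. */
	return (min && min->prio_state < new_priority) ? min : NULL;
}

int main(void)
{
	struct router_slot list[3] = {
		{ { 0xaa, 0, 4, 0, 1, 4 }, 0x80 | 64 },
		{ { 0xaa, 0, 4, 0, 2, 4 }, 0x00 | 32 },
		{ { 0xaa, 0, 4, 0, 3, 4 }, 0x80 | 96 },
	};
	struct router_slot *slot = find_slot(list, 3, 80);

	printf("replace entry with priority %d\n",
	       slot ? slot->prio_state & 0x7f : -1);
	return 0;
}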
+ * Paul Koning: Fix to push CC sockets into RUN when acks are + * received. + * Steve Whitehouse: + * Patrick Caulfield: Checking conninits for correctness & sending of error + * responses. + * Steve Whitehouse: Added backlog congestion level return codes. + * Patrick Caulfield: + * Steve Whitehouse: Added flow control support (outbound) + * Steve Whitehouse: Prepare for nonlinear skbs + */ + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int decnet_log_martians; + +static void dn_log_martian(struct sk_buff *skb, const char *msg) +{ + if (decnet_log_martians && net_ratelimit()) { + char *devname = skb->dev ? skb->dev->name : "???"; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + printk(KERN_INFO "DECnet: Martian packet (%s) dev=%s src=0x%04hx dst=0x%04hx srcport=0x%04hx dstport=0x%04hx\n", msg, devname, cb->src, cb->dst, cb->src_port, cb->dst_port); + } +} + +/* + * For this function we've flipped the cross-subchannel bit + * if the message is an otherdata or linkservice message. Thus + * we can use it to work out what to update. + */ +static void dn_ack(struct sock *sk, struct sk_buff *skb, unsigned short ack) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short type = ((ack >> 12) & 0x0003); + int wakeup = 0; + + switch(type) { + case 0: /* ACK - Data */ + if (dn_after(ack, scp->ackrcv_dat)) { + scp->ackrcv_dat = ack & 0x0fff; + wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->data_xmit_queue, ack); + } + break; + case 1: /* NAK - Data */ + break; + case 2: /* ACK - OtherData */ + if (dn_after(ack, scp->ackrcv_oth)) { + scp->ackrcv_oth = ack & 0x0fff; + wakeup |= dn_nsp_check_xmit_queue(sk, skb, &scp->other_xmit_queue, ack); + } + break; + case 3: /* NAK - OtherData */ + break; + } + + if (wakeup && !sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +} + +/* + * This function is a universal ack processor. + */ +static int dn_process_ack(struct sock *sk, struct sk_buff *skb, int oth) +{ + unsigned short *ptr = (unsigned short *)skb->data; + int len = 0; + unsigned short ack; + + if (skb->len < 2) + return len; + + if ((ack = dn_ntohs(*ptr)) & 0x8000) { + skb_pull(skb, 2); + ptr++; + len += 2; + if ((ack & 0x4000) == 0) { + if (oth) + ack ^= 0x2000; + dn_ack(sk, skb, ack); + } + } + + if (skb->len < 2) + return len; + + if ((ack = dn_ntohs(*ptr)) & 0x8000) { + skb_pull(skb, 2); + len += 2; + if ((ack & 0x4000) == 0) { + if (oth) + ack ^= 0x2000; + dn_ack(sk, skb, ack); + } + } + + return len; +} + + +/** + * dn_check_idf - Check an image data field format is correct. 
+ * @pptr: Pointer to pointer to image data + * @len: Pointer to length of image data + * @max: The maximum allowed length of the data in the image data field + * @follow_on: Check that this many bytes exist beyond the end of the image data + * + * Returns: 0 if ok, -1 on error + */ +static inline int dn_check_idf(unsigned char **pptr, int *len, unsigned char max, unsigned char follow_on) +{ + unsigned char *ptr = *pptr; + unsigned char flen = *ptr++; + + (*len)--; + if (flen > max) + return -1; + if ((flen + follow_on) > *len) + return -1; + + *len -= flen; + *pptr = ptr + flen; + return 0; +} + +/* + * Table of reason codes to pass back to node which sent us a badly + * formed message, plus text messages for the log. A zero entry in + * the reason field means "don't reply" otherwise a disc init is sent with + * the specified reason code. + */ +static struct { + unsigned short reason; + const char *text; +} ci_err_table[] = { + { 0, "CI: Truncated message" }, + { NSP_REASON_ID, "CI: Destination username error" }, + { NSP_REASON_ID, "CI: Destination username type" }, + { NSP_REASON_US, "CI: Source username error" }, + { 0, "CI: Truncated at menuver" }, + { 0, "CI: Truncated before access or user data" }, + { NSP_REASON_IO, "CI: Access data format error" }, + { NSP_REASON_IO, "CI: User data format error" } +}; + +/* + * This function uses a slightly different lookup method + * to find its sockets, since it searches on object name/number + * rather than port numbers. Various tests are done to ensure that + * the incoming data is in the correct format before it is queued to + * a socket. + */ +static struct sock *dn_find_listener(struct sk_buff *skb, unsigned short *reason) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct nsp_conn_init_msg *msg = (struct nsp_conn_init_msg *)skb->data; + struct sockaddr_dn dstaddr; + struct sockaddr_dn srcaddr; + unsigned char type = 0; + int dstlen; + int srclen; + unsigned char *ptr; + int len; + int err = 0; + unsigned char menuver; + + memset(&dstaddr, 0, sizeof(struct sockaddr_dn)); + memset(&srcaddr, 0, sizeof(struct sockaddr_dn)); + + /* + * 1. Decode & remove message header + */ + cb->src_port = msg->srcaddr; + cb->dst_port = msg->dstaddr; + cb->services = msg->services; + cb->info = msg->info; + cb->segsize = dn_ntohs(msg->segsize); + + if (!pskb_may_pull(skb, sizeof(*msg))) + goto err_out; + + skb_pull(skb, sizeof(*msg)); + + len = skb->len; + ptr = skb->data; + + /* + * 2. Check destination end username format + */ + dstlen = dn_username2sockaddr(ptr, len, &dstaddr, &type); + err++; + if (dstlen < 0) + goto err_out; + + err++; + if (type > 1) + goto err_out; + + len -= dstlen; + ptr += dstlen; + + /* + * 3. Check source end username format + */ + srclen = dn_username2sockaddr(ptr, len, &srcaddr, &type); + err++; + if (srclen < 0) + goto err_out; + + len -= srclen; + ptr += srclen; + err++; + if (len < 1) + goto err_out; + + menuver = *ptr; + ptr++; + len--; + + /* + * 4. Check that optional data actually exists if menuver says it does + */ + err++; + if ((menuver & (DN_MENUVER_ACC | DN_MENUVER_USR)) && (len < 1)) + goto err_out; + + /* + * 5. Check optional access data format + */ + err++; + if (menuver & DN_MENUVER_ACC) { + if (dn_check_idf(&ptr, &len, 39, 1)) + goto err_out; + if (dn_check_idf(&ptr, &len, 39, 1)) + goto err_out; + if (dn_check_idf(&ptr, &len, 39, (menuver & DN_MENUVER_USR) ? 1 : 0)) + goto err_out; + } + + /* + * 6. 
Check optional user data format + */ + err++; + if (menuver & DN_MENUVER_USR) { + if (dn_check_idf(&ptr, &len, 16, 0)) + goto err_out; + } + + /* + * 7. Look up socket based on destination end username + */ + return dn_sklist_find_listener(&dstaddr); +err_out: + dn_log_martian(skb, ci_err_table[err].text); + *reason = ci_err_table[err].reason; + return NULL; +} + + +static void dn_nsp_conn_init(struct sock *sk, struct sk_buff *skb) +{ + if (sk_acceptq_is_full(sk)) { + kfree_skb(skb); + return; + } + + sk->sk_ack_backlog++; + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_state_change(sk); +} + +static void dn_nsp_conn_conf(struct sock *sk, struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dn_scp *scp = DN_SK(sk); + unsigned char *ptr; + + if (skb->len < 4) + goto out; + + ptr = skb->data; + cb->services = *ptr++; + cb->info = *ptr++; + cb->segsize = dn_ntohs(*(__u16 *)ptr); + + if ((scp->state == DN_CI) || (scp->state == DN_CD)) { + scp->persist = 0; + scp->addrrem = cb->src_port; + sk->sk_state = TCP_ESTABLISHED; + scp->state = DN_RUN; + scp->services_rem = cb->services; + scp->info_rem = cb->info; + scp->segsize_rem = cb->segsize; + + if ((scp->services_rem & NSP_FC_MASK) == NSP_FC_NONE) + scp->max_window = decnet_no_fc_max_cwnd; + + if (skb->len > 0) { + unsigned char dlen = *skb->data; + if ((dlen <= 16) && (dlen <= skb->len)) { + scp->conndata_in.opt_optl = dlen; + memcpy(scp->conndata_in.opt_data, skb->data + 1, dlen); + } + } + dn_nsp_send_link(sk, DN_NOCHANGE, 0); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + } + +out: + kfree_skb(skb); +} + +static void dn_nsp_conn_ack(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CI) { + scp->state = DN_CD; + scp->persist = 0; + } + + kfree_skb(skb); +} + +static void dn_nsp_disc_init(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned short reason; + + if (skb->len < 2) + goto out; + + reason = dn_ntohs(*(__u16 *)skb->data); + skb_pull(skb, 2); + + scp->discdata_in.opt_status = reason; + scp->discdata_in.opt_optl = 0; + memset(scp->discdata_in.opt_data, 0, 16); + + if (skb->len > 0) { + unsigned char dlen = *skb->data; + if ((dlen <= 16) && (dlen <= skb->len)) { + scp->discdata_in.opt_optl = dlen; + memcpy(scp->discdata_in.opt_data, skb->data + 1, dlen); + } + } + + scp->addrrem = cb->src_port; + sk->sk_state = TCP_CLOSE; + + switch(scp->state) { + case DN_CI: + case DN_CD: + scp->state = DN_RJ; + sk->sk_err = ECONNREFUSED; + break; + case DN_RUN: + sk->sk_shutdown |= SHUTDOWN_MASK; + scp->state = DN_DN; + break; + case DN_DI: + scp->state = DN_DIC; + break; + } + + if (!sock_flag(sk, SOCK_DEAD)) { + if (sk->sk_socket->state != SS_UNCONNECTED) + sk->sk_socket->state = SS_DISCONNECTING; + sk->sk_state_change(sk); + } + + /* + * It appears that its possible for remote machines to send disc + * init messages with no port identifier if we are in the CI and + * possibly also the CD state. Obviously we shouldn't reply with + * a message if we don't know what the end point is. + */ + if (scp->addrrem) { + dn_nsp_send_disc(sk, NSP_DISCCONF, NSP_REASON_DC, GFP_ATOMIC); + } + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + +out: + kfree_skb(skb); +} + +/* + * disc_conf messages are also called no_resources or no_link + * messages depending upon the "reason" field. 
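The access and user data checked by dn_find_listener() above are "image data fields": one count byte followed by that many bytes of data. dn_check_idf() validates the count against a field-specific maximum and against what is actually left in the buffer (plus any bytes that must still follow), then steps over the field. Below is a self-contained userspace rendering of the same check:

#include <stdio.h>

static int check_idf(const unsigned char **pptr, int *len,
		     unsigned char max, unsigned char follow_on)
{
	const unsigned char *ptr = *pptr;
	unsigned char flen = *ptr++;	/* leading count byte */

	(*len)--;
	if (flen > max)
		return -1;
	if ((flen + follow_on) > *len)
		return -1;

	*len -= flen;
	*pptr = ptr + flen;		/* step over the field */
	return 0;
}

int main(void)
{
	/* 3-byte access field "abc" followed by a 1-byte user data field. */
	const unsigned char buf[] = { 3, 'a', 'b', 'c', 1, 'x' };
	const unsigned char *p = buf;
	int len = sizeof(buf);

	if (check_idf(&p, &len, 39, 1) == 0 && check_idf(&p, &len, 16, 0) == 0)
		printf("fields ok, %d bytes left\n", len);
	return 0;
}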
+ */ +static void dn_nsp_disc_conf(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short reason; + + if (skb->len != 2) + goto out; + + reason = dn_ntohs(*(__u16 *)skb->data); + + sk->sk_state = TCP_CLOSE; + + switch(scp->state) { + case DN_CI: + scp->state = DN_NR; + break; + case DN_DR: + if (reason == NSP_REASON_DC) + scp->state = DN_DRC; + if (reason == NSP_REASON_NL) + scp->state = DN_CN; + break; + case DN_DI: + scp->state = DN_DIC; + break; + case DN_RUN: + sk->sk_shutdown |= SHUTDOWN_MASK; + case DN_CC: + scp->state = DN_CN; + } + + if (!sock_flag(sk, SOCK_DEAD)) { + if (sk->sk_socket->state != SS_UNCONNECTED) + sk->sk_socket->state = SS_DISCONNECTING; + sk->sk_state_change(sk); + } + + scp->persist_fxn = dn_destroy_timer; + scp->persist = dn_nsp_persist(sk); + +out: + kfree_skb(skb); +} + +static void dn_nsp_linkservice(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short segnum; + unsigned char lsflags; + signed char fcval; + int wake_up = 0; + char *ptr = skb->data; + unsigned char fctype = scp->services_rem & NSP_FC_MASK; + + if (skb->len != 4) + goto out; + + segnum = dn_ntohs(*(__u16 *)ptr); + ptr += 2; + lsflags = *(unsigned char *)ptr++; + fcval = *ptr; + + /* + * Here we ignore erronous packets which should really + * should cause a connection abort. It is not critical + * for now though. + */ + if (lsflags & 0xf8) + goto out; + + if (seq_next(scp->numoth_rcv, segnum)) { + seq_add(&scp->numoth_rcv, 1); + switch(lsflags & 0x04) { /* FCVAL INT */ + case 0x00: /* Normal Request */ + switch(lsflags & 0x03) { /* FCVAL MOD */ + case 0x00: /* Request count */ + if (fcval < 0) { + unsigned char p_fcval = -fcval; + if ((scp->flowrem_dat > p_fcval) && + (fctype == NSP_FC_SCMC)) { + scp->flowrem_dat -= p_fcval; + } + } else if (fcval > 0) { + scp->flowrem_dat += fcval; + wake_up = 1; + } + break; + case 0x01: /* Stop outgoing data */ + scp->flowrem_sw = DN_DONTSEND; + break; + case 0x02: /* Ok to start again */ + scp->flowrem_sw = DN_SEND; + dn_nsp_output(sk); + wake_up = 1; + } + break; + case 0x04: /* Interrupt Request */ + if (fcval > 0) { + scp->flowrem_oth += fcval; + wake_up = 1; + } + break; + } + if (wake_up && !sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + } + + dn_nsp_send_oth_ack(sk); + +out: + kfree_skb(skb); +} + +/* + * Copy of sock_queue_rcv_skb (from sock.h) without + * bh_lock_sock() (its already held when this is called) which + * also allows data and other data to be queued to a socket. + */ +static __inline__ int dn_queue_skb(struct sock *sk, struct sk_buff *skb, int sig, struct sk_buff_head *queue) +{ + int err; + + /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces + number of warnings when compiling with -W --ANK + */ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) { + err = -ENOMEM; + goto out; + } + + err = sk_filter(sk, skb, 0); + if (err) + goto out; + + skb_set_owner_r(skb, sk); + skb_queue_tail(queue, skb); + + /* This code only runs from BH or BH protected context. + * Therefore the plain read_lock is ok here. -DaveM + */ + read_lock(&sk->sk_callback_lock); + if (!sock_flag(sk, SOCK_DEAD)) { + struct socket *sock = sk->sk_socket; + wake_up_interruptible(sk->sk_sleep); + if (sock && sock->fasync_list && + !test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + __kill_fasync(sock->fasync_list, sig, + (sig == SIGURG) ? 
POLL_PRI : POLL_IN); + } + read_unlock(&sk->sk_callback_lock); +out: + return err; +} + +static void dn_nsp_otherdata(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short segnum; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int queued = 0; + + if (skb->len < 2) + goto out; + + cb->segnum = segnum = dn_ntohs(*(__u16 *)skb->data); + skb_pull(skb, 2); + + if (seq_next(scp->numoth_rcv, segnum)) { + + if (dn_queue_skb(sk, skb, SIGURG, &scp->other_receive_queue) == 0) { + seq_add(&scp->numoth_rcv, 1); + scp->other_report = 0; + queued = 1; + } + } + + dn_nsp_send_oth_ack(sk); +out: + if (!queued) + kfree_skb(skb); +} + +static void dn_nsp_data(struct sock *sk, struct sk_buff *skb) +{ + int queued = 0; + unsigned short segnum; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dn_scp *scp = DN_SK(sk); + + if (skb->len < 2) + goto out; + + cb->segnum = segnum = dn_ntohs(*(__u16 *)skb->data); + skb_pull(skb, 2); + + if (seq_next(scp->numdat_rcv, segnum)) { + if (dn_queue_skb(sk, skb, SIGIO, &sk->sk_receive_queue) == 0) { + seq_add(&scp->numdat_rcv, 1); + queued = 1; + } + + if ((scp->flowloc_sw == DN_SEND) && dn_congested(sk)) { + scp->flowloc_sw = DN_DONTSEND; + dn_nsp_send_link(sk, DN_DONTSEND, 0); + } + } + + dn_nsp_send_data_ack(sk); +out: + if (!queued) + kfree_skb(skb); +} + +/* + * If one of our conninit messages is returned, this function + * deals with it. It puts the socket into the NO_COMMUNICATION + * state. + */ +static void dn_returned_conn_init(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CI) { + scp->state = DN_NC; + sk->sk_state = TCP_CLOSE; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + } + + kfree_skb(skb); +} + +static int dn_nsp_no_socket(struct sk_buff *skb, unsigned short reason) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int ret = NET_RX_DROP; + + /* Must not reply to returned packets */ + if (cb->rt_flags & DN_RT_F_RTS) + goto out; + + if ((reason != NSP_REASON_OK) && ((cb->nsp_flags & 0x0c) == 0x08)) { + switch(cb->nsp_flags & 0x70) { + case 0x10: + case 0x60: /* (Retransmitted) Connect Init */ + dn_nsp_return_disc(skb, NSP_DISCINIT, reason); + ret = NET_RX_SUCCESS; + break; + case 0x20: /* Connect Confirm */ + dn_nsp_return_disc(skb, NSP_DISCCONF, reason); + ret = NET_RX_SUCCESS; + break; + } + } + +out: + kfree_skb(skb); + return ret; +} + +static int dn_nsp_rx_packet(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct sock *sk = NULL; + unsigned char *ptr = (unsigned char *)skb->data; + unsigned short reason = NSP_REASON_NL; + + if (!pskb_may_pull(skb, 2)) + goto free_out; + + skb->h.raw = skb->data; + cb->nsp_flags = *ptr++; + + if (decnet_debug_level & 2) + printk(KERN_DEBUG "dn_nsp_rx: Message type 0x%02x\n", (int)cb->nsp_flags); + + if (cb->nsp_flags & 0x83) + goto free_out; + + /* + * Filter out conninits and useless packet types + */ + if ((cb->nsp_flags & 0x0c) == 0x08) { + switch(cb->nsp_flags & 0x70) { + case 0x00: /* NOP */ + case 0x70: /* Reserved */ + case 0x50: /* Reserved, Phase II node init */ + goto free_out; + case 0x10: + case 0x60: + if (unlikely(cb->rt_flags & DN_RT_F_RTS)) + goto free_out; + sk = dn_find_listener(skb, &reason); + goto got_it; + } + } + + if (!pskb_may_pull(skb, 3)) + goto free_out; + + /* + * Grab the destination address. + */ + cb->dst_port = *(unsigned short *)ptr; + cb->src_port = 0; + ptr += 2; + + /* + * If not a connack, grab the source address too. 
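dn_ack() and dn_process_ack() earlier in this file slice each 16-bit ack word as follows: bit 15 marks the word as an ack, a set bit 14 makes the code skip it, bit 13 selects the other-data subchannel (and is flipped when the carrying message is itself other-data), bit 12 distinguishes NAK from ACK, and the low 12 bits hold the acknowledged sequence number. A compact standalone decoder of that layout:

#include <stdio.h>
#include <stdint.h>

static void decode_ack(uint16_t ack, int oth)
{
	if (!(ack & 0x8000)) {
		printf("no ack field present\n");
		return;
	}
	if (ack & 0x4000) {
		printf("field skipped (bit 14 set), as in dn_process_ack()\n");
		return;
	}
	if (oth)
		ack ^= 0x2000;	/* cross-subchannel flip for other-data messages */

	switch ((ack >> 12) & 0x0003) {
	case 0: printf("ACK data      segment %u\n", ack & 0x0fff); break;
	case 1: printf("NAK data      segment %u\n", ack & 0x0fff); break;
	case 2: printf("ACK otherdata segment %u\n", ack & 0x0fff); break;
	case 3: printf("NAK otherdata segment %u\n", ack & 0x0fff); break;
	}
}

int main(void)
{
	decode_ack(0x8005, 0);	/* ack of data segment 5                 */
	decode_ack(0xa007, 0);	/* ack on the other-data queue           */
	decode_ack(0x8003, 1);	/* same, carried in an other-data packet */
	return 0;
}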
+ */ + if (pskb_may_pull(skb, 5)) { + cb->src_port = *(unsigned short *)ptr; + ptr += 2; + skb_pull(skb, 5); + } + + /* + * Returned packets... + * Swap src & dst and look up in the normal way. + */ + if (unlikely(cb->rt_flags & DN_RT_F_RTS)) { + unsigned short tmp = cb->dst_port; + cb->dst_port = cb->src_port; + cb->src_port = tmp; + tmp = cb->dst; + cb->dst = cb->src; + cb->src = tmp; + } + + /* + * Find the socket to which this skb is destined. + */ + sk = dn_find_by_skb(skb); +got_it: + if (sk != NULL) { + struct dn_scp *scp = DN_SK(sk); + int ret; + + /* Reset backoff */ + scp->nsp_rxtshift = 0; + + /* + * We linearize everything except data segments here. + */ + if (cb->nsp_flags & ~0x60) { + if (unlikely(skb_is_nonlinear(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) + goto free_out; + } + + bh_lock_sock(sk); + ret = NET_RX_SUCCESS; + if (decnet_debug_level & 8) + printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n", + (int)cb->rt_flags, (int)cb->nsp_flags, + (int)cb->src_port, (int)cb->dst_port, + !!sock_owned_by_user(sk)); + if (!sock_owned_by_user(sk)) + ret = dn_nsp_backlog_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + sock_put(sk); + + return ret; + } + + return dn_nsp_no_socket(skb, reason); + +free_out: + kfree_skb(skb); + return NET_RX_DROP; +} + +int dn_nsp_rx(struct sk_buff *skb) +{ + return NF_HOOK(PF_DECnet, NF_DN_LOCAL_IN, skb, skb->dev, NULL, dn_nsp_rx_packet); +} + +/* + * This is the main receive routine for sockets. It is called + * from the above when the socket is not busy, and also from + * sock_release() when there is a backlog queued up. + */ +int dn_nsp_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + if (cb->rt_flags & DN_RT_F_RTS) { + if (cb->nsp_flags == 0x18 || cb->nsp_flags == 0x68) + dn_returned_conn_init(sk, skb); + else + kfree_skb(skb); + return NET_RX_SUCCESS; + } + + /* + * Control packet. + */ + if ((cb->nsp_flags & 0x0c) == 0x08) { + switch(cb->nsp_flags & 0x70) { + case 0x10: + case 0x60: + dn_nsp_conn_init(sk, skb); + break; + case 0x20: + dn_nsp_conn_conf(sk, skb); + break; + case 0x30: + dn_nsp_disc_init(sk, skb); + break; + case 0x40: + dn_nsp_disc_conf(sk, skb); + break; + } + + } else if (cb->nsp_flags == 0x24) { + /* + * Special for connacks, 'cos they don't have + * ack data or ack otherdata info. + */ + dn_nsp_conn_ack(sk, skb); + } else { + int other = 1; + + /* both data and ack frames can kick a CC socket into RUN */ + if ((scp->state == DN_CC) && !sock_flag(sk, SOCK_DEAD)) { + scp->state = DN_RUN; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_state_change(sk); + } + + if ((cb->nsp_flags & 0x1c) == 0) + other = 0; + if (cb->nsp_flags == 0x04) + other = 0; + + /* + * Read out ack data here, this applies equally + * to data, other data, link serivce and both + * ack data and ack otherdata. + */ + dn_process_ack(sk, skb, other); + + /* + * If we've some sort of data here then call a + * suitable routine for dealing with it, otherwise + * the packet is an ack and can be discarded. 
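The receive path in dn_nsp_rx_packet() and dn_nsp_backlog_rcv() classifies every NSP message from its first byte alone: values with the 0x83 bits set are dropped, (flags & 0x0c) == 0x08 marks the control messages (connect/disconnect init and confirm), 0x24 is the connect ack, and the remaining data-bearing values split into link service, other data and plain data segments. The helper below restates that dispatch in one place; the label strings are descriptive only.

#include <stdio.h>

static const char *nsp_msg_type(unsigned char flags)
{
	if (flags & 0x83)
		return "dropped: reserved bits set";

	if ((flags & 0x0c) == 0x08) {		/* control messages */
		switch (flags & 0x70) {
		case 0x10:
		case 0x60: return "connect init (0x60 = retransmitted)";
		case 0x20: return "connect confirm";
		case 0x30: return "disconnect init";
		case 0x40: return "disconnect confirm";
		default:   return "dropped control message";
		}
	}

	if (flags == 0x24)
		return "connect ack";

	if ((flags & 0x0c) == 0) {		/* data-bearing messages */
		if (flags == 0x10)
			return "link service";
		if (flags == 0x30)
			return "other data";
		return "data segment";
	}

	return "ack only";
}

int main(void)
{
	unsigned char samples[] = { 0x18, 0x68, 0x28, 0x24, 0x10, 0x30, 0x60, 0x04 };
	unsigned i;

	for (i = 0; i < sizeof(samples); i++)
		printf("0x%02x -> %s\n", samples[i], nsp_msg_type(samples[i]));
	return 0;
}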
+ */ + if ((cb->nsp_flags & 0x0c) == 0) { + + if (scp->state != DN_RUN) + goto free_out; + + switch(cb->nsp_flags) { + case 0x10: /* LS */ + dn_nsp_linkservice(sk, skb); + break; + case 0x30: /* OD */ + dn_nsp_otherdata(sk, skb); + break; + default: + dn_nsp_data(sk, skb); + } + + } else { /* Ack, chuck it out here */ +free_out: + kfree_skb(skb); + } + } + + return NET_RX_SUCCESS; +} + diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c new file mode 100644 index 000000000000..42abbf3f524f --- /dev/null +++ b/net/decnet/dn_nsp_out.c @@ -0,0 +1,782 @@ + +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Network Services Protocol (Output) + * + * Author: Eduardo Marcelo Serrat + * + * Changes: + * + * Steve Whitehouse: Split into dn_nsp_in.c and dn_nsp_out.c from + * original dn_nsp.c. + * Steve Whitehouse: Updated to work with my new routing architecture. + * Steve Whitehouse: Added changes from Eduardo Serrat's patches. + * Steve Whitehouse: Now conninits have the "return" bit set. + * Steve Whitehouse: Fixes to check alloc'd skbs are non NULL! + * Moved output state machine into one function + * Steve Whitehouse: New output state machine + * Paul Koning: Connect Confirm message fix. + * Eduardo Serrat: Fix to stop dn_nsp_do_disc() sending malformed packets. + * Steve Whitehouse: dn_nsp_output() and friends needed a spring clean + * Steve Whitehouse: Moved dn_nsp_send() in here from route.h + */ + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +static int nsp_backoff[NSP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 }; + +static void dn_nsp_send(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct dn_scp *scp = DN_SK(sk); + struct dst_entry *dst; + struct flowi fl; + + skb->h.raw = skb->data; + scp->stamp = jiffies; + + dst = sk_dst_check(sk, 0); + if (dst) { +try_again: + skb->dst = dst; + dst_output(skb); + return; + } + + memset(&fl, 0, sizeof(fl)); + fl.oif = sk->sk_bound_dev_if; + fl.fld_src = dn_saddr2dn(&scp->addr); + fl.fld_dst = dn_saddr2dn(&scp->peer); + dn_sk_ports_copy(&fl, scp); + fl.proto = DNPROTO_NSP; + if (dn_route_output_sock(&sk->sk_dst_cache, &fl, sk, 0) == 0) { + dst = sk_dst_get(sk); + sk->sk_route_caps = dst->dev->features; + goto try_again; + } + + sk->sk_err = EHOSTUNREACH; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +} + + +/* + * If sk == NULL, then we assume that we are supposed to be making + * a routing layer skb. If sk != NULL, then we are supposed to be + * creating an skb for the NSP layer. + * + * The eventual aim is for each socket to have a cached header size + * for its outgoing packets, and to set hdr from this when sk != NULL. + */ +struct sk_buff *dn_alloc_skb(struct sock *sk, int size, int pri) +{ + struct sk_buff *skb; + int hdr = 64; + + if ((skb = alloc_skb(size + hdr, pri)) == NULL) + return NULL; + + skb->protocol = __constant_htons(ETH_P_DNA_RT); + skb->pkt_type = PACKET_OUTGOING; + + if (sk) + skb_set_owner_w(skb, sk); + + skb_reserve(skb, hdr); + + return skb; +} + +/* + * Wrapper for the above, for allocs of data skbs. We try and get the + * whole size thats been asked for (plus 11 bytes of header). If this + * fails, then we try for any size over 16 bytes for SOCK_STREAMS. + */ +struct sk_buff *dn_alloc_send_skb(struct sock *sk, size_t *size, int noblock, long timeo, int *err) +{ + int space; + int len; + struct sk_buff *skb = NULL; + + *err = 0; + + while(skb == NULL) { + if (signal_pending(current)) { + *err = sock_intr_errno(timeo); + break; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + *err = EINVAL; + break; + } + + if (sk->sk_err) + break; + + len = *size + 11; + space = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + + if (space < len) { + if ((sk->sk_socket->type == SOCK_STREAM) && + (space >= (16 + 11))) + len = space; + } + + if (space < len) { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + if (noblock) { + *err = EWOULDBLOCK; + break; + } + + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + SOCK_SLEEP_PRE(sk) + + if ((sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc)) < + len) + schedule(); + + SOCK_SLEEP_POST(sk) + continue; + } + + if ((skb = dn_alloc_skb(sk, len, sk->sk_allocation)) == NULL) + continue; + + *size = len - 11; + } + + return skb; +} + +/* + * Calculate persist timer based upon the smoothed round + * trip time and the variance. Backoff according to the + * nsp_backoff[] array. 
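+ * The base interval is ((nsp_srtt >> 2) + nsp_rttvar) >> 1 jiffies,
+ * scaled by the current backoff factor and clamped to [HZ, 600*HZ].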
+ */ +unsigned long dn_nsp_persist(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; + + t *= nsp_backoff[scp->nsp_rxtshift]; + + if (t < HZ) t = HZ; + if (t > (600*HZ)) t = (600*HZ); + + if (scp->nsp_rxtshift < NSP_MAXRXTSHIFT) + scp->nsp_rxtshift++; + + /* printk(KERN_DEBUG "rxtshift %lu, t=%lu\n", scp->nsp_rxtshift, t); */ + + return t; +} + +/* + * This is called each time we get an estimate for the rtt + * on the link. + */ +static void dn_nsp_rtt(struct sock *sk, long rtt) +{ + struct dn_scp *scp = DN_SK(sk); + long srtt = (long)scp->nsp_srtt; + long rttvar = (long)scp->nsp_rttvar; + long delta; + + /* + * If the jiffies clock flips over in the middle of timestamp + * gathering this value might turn out negative, so we make sure + * that is it always positive here. + */ + if (rtt < 0) + rtt = -rtt; + /* + * Add new rtt to smoothed average + */ + delta = ((rtt << 3) - srtt); + srtt += (delta >> 3); + if (srtt >= 1) + scp->nsp_srtt = (unsigned long)srtt; + else + scp->nsp_srtt = 1; + + /* + * Add new rtt varience to smoothed varience + */ + delta >>= 1; + rttvar += ((((delta>0)?(delta):(-delta)) - rttvar) >> 2); + if (rttvar >= 1) + scp->nsp_rttvar = (unsigned long)rttvar; + else + scp->nsp_rttvar = 1; + + /* printk(KERN_DEBUG "srtt=%lu rttvar=%lu\n", scp->nsp_srtt, scp->nsp_rttvar); */ +} + +/** + * dn_nsp_clone_and_send - Send a data packet by cloning it + * @skb: The packet to clone and transmit + * @gfp: memory allocation flag + * + * Clone a queued data or other data packet and transmit it. + * + * Returns: The number of times the packet has been sent previously + */ +static inline unsigned dn_nsp_clone_and_send(struct sk_buff *skb, int gfp) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct sk_buff *skb2; + int ret = 0; + + if ((skb2 = skb_clone(skb, gfp)) != NULL) { + ret = cb->xmit_count; + cb->xmit_count++; + cb->stamp = jiffies; + skb2->sk = skb->sk; + dn_nsp_send(skb2); + } + + return ret; +} + +/** + * dn_nsp_output - Try and send something from socket queues + * @sk: The socket whose queues are to be investigated + * @gfp: The memory allocation flags + * + * Try and send the packet on the end of the data and other data queues. + * Other data gets priority over data, and if we retransmit a packet we + * reduce the window by dividing it in two. + * + */ +void dn_nsp_output(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb; + unsigned reduce_win = 0; + + /* + * First we check for otherdata/linkservice messages + */ + if ((skb = skb_peek(&scp->other_xmit_queue)) != NULL) + reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); + + /* + * If we may not send any data, we don't. + * If we are still trying to get some other data down the + * channel, we don't try and send any data. + */ + if (reduce_win || (scp->flowrem_sw != DN_SEND)) + goto recalc_window; + + if ((skb = skb_peek(&scp->data_xmit_queue)) != NULL) + reduce_win = dn_nsp_clone_and_send(skb, GFP_ATOMIC); + + /* + * If we've sent any frame more than once, we cut the + * send window size in half. There is always a minimum + * window size of one available. 
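+ * (NSP_MIN_WINDOW provides that lower bound below.)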
+ */ +recalc_window: + if (reduce_win) { + scp->snd_window >>= 1; + if (scp->snd_window < NSP_MIN_WINDOW) + scp->snd_window = NSP_MIN_WINDOW; + } +} + +int dn_nsp_xmit_timeout(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + dn_nsp_output(sk); + + if (skb_queue_len(&scp->data_xmit_queue) || skb_queue_len(&scp->other_xmit_queue)) + scp->persist = dn_nsp_persist(sk); + + return 0; +} + +static inline unsigned char *dn_mk_common_header(struct dn_scp *scp, struct sk_buff *skb, unsigned char msgflag, int len) +{ + unsigned char *ptr = skb_push(skb, len); + + BUG_ON(len < 5); + + *ptr++ = msgflag; + *((unsigned short *)ptr) = scp->addrrem; + ptr += 2; + *((unsigned short *)ptr) = scp->addrloc; + ptr += 2; + return ptr; +} + +static unsigned short *dn_mk_ack_header(struct sock *sk, struct sk_buff *skb, unsigned char msgflag, int hlen, int other) +{ + struct dn_scp *scp = DN_SK(sk); + unsigned short acknum = scp->numdat_rcv & 0x0FFF; + unsigned short ackcrs = scp->numoth_rcv & 0x0FFF; + unsigned short *ptr; + + BUG_ON(hlen < 9); + + scp->ackxmt_dat = acknum; + scp->ackxmt_oth = ackcrs; + acknum |= 0x8000; + ackcrs |= 0x8000; + + /* If this is an "other data/ack" message, swap acknum and ackcrs */ + if (other) { + unsigned short tmp = acknum; + acknum = ackcrs; + ackcrs = tmp; + } + + /* Set "cross subchannel" bit in ackcrs */ + ackcrs |= 0x2000; + + ptr = (unsigned short *)dn_mk_common_header(scp, skb, msgflag, hlen); + + *ptr++ = dn_htons(acknum); + *ptr++ = dn_htons(ackcrs); + + return ptr; +} + +static unsigned short *dn_nsp_mk_data_header(struct sock *sk, struct sk_buff *skb, int oth) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned short *ptr = dn_mk_ack_header(sk, skb, cb->nsp_flags, 11, oth); + + if (unlikely(oth)) { + cb->segnum = scp->numoth; + seq_add(&scp->numoth, 1); + } else { + cb->segnum = scp->numdat; + seq_add(&scp->numdat, 1); + } + *(ptr++) = dn_htons(cb->segnum); + + return ptr; +} + +void dn_nsp_queue_xmit(struct sock *sk, struct sk_buff *skb, int gfp, int oth) +{ + struct dn_scp *scp = DN_SK(sk); + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned long t = ((scp->nsp_srtt >> 2) + scp->nsp_rttvar) >> 1; + + cb->xmit_count = 0; + dn_nsp_mk_data_header(sk, skb, oth); + + /* + * Slow start: If we have been idle for more than + * one RTT, then reset window to min size. + */ + if ((jiffies - scp->stamp) > t) + scp->snd_window = NSP_MIN_WINDOW; + + if (oth) + skb_queue_tail(&scp->other_xmit_queue, skb); + else + skb_queue_tail(&scp->data_xmit_queue, skb); + + if (scp->flowrem_sw != DN_SEND) + return; + + dn_nsp_clone_and_send(skb, gfp); +} + + +int dn_nsp_check_xmit_queue(struct sock *sk, struct sk_buff *skb, struct sk_buff_head *q, unsigned short acknum) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb2, *list, *ack = NULL; + int wakeup = 0; + int try_retrans = 0; + unsigned long reftime = cb->stamp; + unsigned long pkttime; + unsigned short xmit_count; + unsigned short segnum; + + skb2 = q->next; + list = (struct sk_buff *)q; + while(list != skb2) { + struct dn_skb_cb *cb2 = DN_SKB_CB(skb2); + + if (dn_before_or_equal(cb2->segnum, acknum)) + ack = skb2; + + /* printk(KERN_DEBUG "ack: %s %04x %04x\n", ack ? 
"ACK" : "SKIP", (int)cb2->segnum, (int)acknum); */ + + skb2 = skb2->next; + + if (ack == NULL) + continue; + + /* printk(KERN_DEBUG "check_xmit_queue: %04x, %d\n", acknum, cb2->xmit_count); */ + + /* Does _last_ packet acked have xmit_count > 1 */ + try_retrans = 0; + /* Remember to wake up the sending process */ + wakeup = 1; + /* Keep various statistics */ + pkttime = cb2->stamp; + xmit_count = cb2->xmit_count; + segnum = cb2->segnum; + /* Remove and drop ack'ed packet */ + skb_unlink(ack); + kfree_skb(ack); + ack = NULL; + + /* + * We don't expect to see acknowledgements for packets we + * haven't sent yet. + */ + WARN_ON(xmit_count == 0); + + /* + * If the packet has only been sent once, we can use it + * to calculate the RTT and also open the window a little + * further. + */ + if (xmit_count == 1) { + if (dn_equal(segnum, acknum)) + dn_nsp_rtt(sk, (long)(pkttime - reftime)); + + if (scp->snd_window < scp->max_window) + scp->snd_window++; + } + + /* + * Packet has been sent more than once. If this is the last + * packet to be acknowledged then we want to send the next + * packet in the send queue again (assumes the remote host does + * go-back-N error control). + */ + if (xmit_count > 1) + try_retrans = 1; + } + + if (try_retrans) + dn_nsp_output(sk); + + return wakeup; +} + +void dn_nsp_send_data_ack(struct sock *sk) +{ + struct sk_buff *skb = NULL; + + if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, 9); + dn_mk_ack_header(sk, skb, 0x04, 9, 0); + dn_nsp_send(skb); +} + +void dn_nsp_send_oth_ack(struct sock *sk) +{ + struct sk_buff *skb = NULL; + + if ((skb = dn_alloc_skb(sk, 9, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, 9); + dn_mk_ack_header(sk, skb, 0x14, 9, 1); + dn_nsp_send(skb); +} + + +void dn_send_conn_ack (struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb = NULL; + struct nsp_conn_ack_msg *msg; + + if ((skb = dn_alloc_skb(sk, 3, sk->sk_allocation)) == NULL) + return; + + msg = (struct nsp_conn_ack_msg *)skb_put(skb, 3); + msg->msgflg = 0x24; + msg->dstaddr = scp->addrrem; + + dn_nsp_send(skb); +} + +void dn_nsp_delayed_ack(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->ackxmt_oth != scp->numoth_rcv) + dn_nsp_send_oth_ack(sk); + + if (scp->ackxmt_dat != scp->numdat_rcv) + dn_nsp_send_data_ack(sk); +} + +static int dn_nsp_retrans_conn_conf(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CC) + dn_send_conn_conf(sk, GFP_ATOMIC); + + return 0; +} + +void dn_send_conn_conf(struct sock *sk, int gfp) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb = NULL; + struct nsp_conn_init_msg *msg; + unsigned char len = scp->conndata_out.opt_optl; + + if ((skb = dn_alloc_skb(sk, 50 + scp->conndata_out.opt_optl, gfp)) == NULL) + return; + + msg = (struct nsp_conn_init_msg *)skb_put(skb, sizeof(*msg)); + msg->msgflg = 0x28; + msg->dstaddr = scp->addrrem; + msg->srcaddr = scp->addrloc; + msg->services = scp->services_loc; + msg->info = scp->info_loc; + msg->segsize = dn_htons(scp->segsize_loc); + + *skb_put(skb,1) = len; + + if (len > 0) + memcpy(skb_put(skb, len), scp->conndata_out.opt_data, len); + + + dn_nsp_send(skb); + + scp->persist = dn_nsp_persist(sk); + scp->persist_fxn = dn_nsp_retrans_conn_conf; +} + + +static __inline__ void dn_nsp_do_disc(struct sock *sk, unsigned char msgflg, + unsigned short reason, int gfp, struct dst_entry *dst, + int ddl, unsigned char *dd, __u16 rem, __u16 loc) +{ + struct sk_buff *skb = NULL; + int size = 7 + ddl + ((msgflg == 
NSP_DISCINIT) ? 1 : 0); + unsigned char *msg; + + if ((dst == NULL) || (rem == 0)) { + if (net_ratelimit()) + printk(KERN_DEBUG "DECnet: dn_nsp_do_disc: BUG! Please report this to SteveW@ACM.org rem=%u dst=%p\n", (unsigned)rem, dst); + return; + } + + if ((skb = dn_alloc_skb(sk, size, gfp)) == NULL) + return; + + msg = skb_put(skb, size); + *msg++ = msgflg; + *(__u16 *)msg = rem; + msg += 2; + *(__u16 *)msg = loc; + msg += 2; + *(__u16 *)msg = dn_htons(reason); + msg += 2; + if (msgflg == NSP_DISCINIT) + *msg++ = ddl; + + if (ddl) { + memcpy(msg, dd, ddl); + } + + /* + * This doesn't go via the dn_nsp_send() function since we need + * to be able to send disc packets out which have no socket + * associations. + */ + skb->dst = dst_clone(dst); + dst_output(skb); +} + + +void dn_nsp_send_disc(struct sock *sk, unsigned char msgflg, + unsigned short reason, int gfp) +{ + struct dn_scp *scp = DN_SK(sk); + int ddl = 0; + + if (msgflg == NSP_DISCINIT) + ddl = scp->discdata_out.opt_optl; + + if (reason == 0) + reason = scp->discdata_out.opt_status; + + dn_nsp_do_disc(sk, msgflg, reason, gfp, sk->sk_dst_cache, ddl, + scp->discdata_out.opt_data, scp->addrrem, scp->addrloc); +} + + +void dn_nsp_return_disc(struct sk_buff *skb, unsigned char msgflg, + unsigned short reason) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int ddl = 0; + int gfp = GFP_ATOMIC; + + dn_nsp_do_disc(NULL, msgflg, reason, gfp, skb->dst, ddl, + NULL, cb->src_port, cb->dst_port); +} + + +void dn_nsp_send_link(struct sock *sk, unsigned char lsflags, char fcval) +{ + struct dn_scp *scp = DN_SK(sk); + struct sk_buff *skb; + unsigned char *ptr; + int gfp = GFP_ATOMIC; + + if ((skb = dn_alloc_skb(sk, DN_MAX_NSP_DATA_HEADER + 2, gfp)) == NULL) + return; + + skb_reserve(skb, DN_MAX_NSP_DATA_HEADER); + ptr = skb_put(skb, 2); + DN_SKB_CB(skb)->nsp_flags = 0x10; + *ptr++ = lsflags; + *ptr = fcval; + + dn_nsp_queue_xmit(sk, skb, gfp, 1); + + scp->persist = dn_nsp_persist(sk); + scp->persist_fxn = dn_nsp_xmit_timeout; +} + +static int dn_nsp_retrans_conninit(struct sock *sk) +{ + struct dn_scp *scp = DN_SK(sk); + + if (scp->state == DN_CI) + dn_nsp_send_conninit(sk, NSP_RCI); + + return 0; +} + +void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) +{ + struct dn_scp *scp = DN_SK(sk); + struct nsp_conn_init_msg *msg; + unsigned char aux; + unsigned char menuver; + struct dn_skb_cb *cb; + unsigned char type = 1; + int allocation = (msgflg == NSP_CI) ? 
sk->sk_allocation : GFP_ATOMIC; + struct sk_buff *skb = dn_alloc_skb(sk, 200, allocation); + + if (!skb) + return; + + cb = DN_SKB_CB(skb); + msg = (struct nsp_conn_init_msg *)skb_put(skb,sizeof(*msg)); + + msg->msgflg = msgflg; + msg->dstaddr = 0x0000; /* Remote Node will assign it*/ + + msg->srcaddr = scp->addrloc; + msg->services = scp->services_loc; /* Requested flow control */ + msg->info = scp->info_loc; /* Version Number */ + msg->segsize = dn_htons(scp->segsize_loc); /* Max segment size */ + + if (scp->peer.sdn_objnum) + type = 0; + + skb_put(skb, dn_sockaddr2username(&scp->peer, skb->tail, type)); + skb_put(skb, dn_sockaddr2username(&scp->addr, skb->tail, 2)); + + menuver = DN_MENUVER_ACC | DN_MENUVER_USR; + if (scp->peer.sdn_flags & SDF_PROXY) + menuver |= DN_MENUVER_PRX; + if (scp->peer.sdn_flags & SDF_UICPROXY) + menuver |= DN_MENUVER_UIC; + + *skb_put(skb, 1) = menuver; /* Menu Version */ + + aux = scp->accessdata.acc_userl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); + + aux = scp->accessdata.acc_passl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); + + aux = scp->accessdata.acc_accl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); + + aux = scp->conndata_out.opt_optl; + *skb_put(skb, 1) = aux; + if (aux > 0) + memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux); + + scp->persist = dn_nsp_persist(sk); + scp->persist_fxn = dn_nsp_retrans_conninit; + + cb->rt_flags = DN_RT_F_RQR; + + dn_nsp_send(skb); +} + diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c new file mode 100644 index 000000000000..1e7b5c3ea215 --- /dev/null +++ b/net/decnet/dn_route.c @@ -0,0 +1,1840 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Routing Functions (Endnode and Router) + * + * Authors: Steve Whitehouse + * Eduardo Marcelo Serrat + * + * Changes: + * Steve Whitehouse : Fixes to allow "intra-ethernet" and + * "return-to-sender" bits on outgoing + * packets. + * Steve Whitehouse : Timeouts for cached routes. + * Steve Whitehouse : Use dst cache for input routes too. + * Steve Whitehouse : Fixed error values in dn_send_skb. + * Steve Whitehouse : Rework routing functions to better fit + * DECnet routing design + * Alexey Kuznetsov : New SMP locking + * Steve Whitehouse : More SMP locking changes & dn_cache_dump() + * Steve Whitehouse : Prerouting NF hook, now really is prerouting. + * Fixed possible skb leak in rtnetlink funcs. + * Steve Whitehouse : Dave Miller's dynamic hash table sizing and + * Alexey Kuznetsov's finer grained locking + * from ipv4/route.c. + * Steve Whitehouse : Routing is now starting to look like a + * sensible set of code now, mainly due to + * my copying the IPv4 routing code. The + * hooks here are modified and will continue + * to evolve for a while. + * Steve Whitehouse : Real SMP at last :-) Also new netfilter + * stuff. Look out raw sockets your days + * are numbered! + * Steve Whitehouse : Added return-to-sender functions. Added + * backlog congestion level return codes. + * Steve Whitehouse : Fixed bug where routes were set up with + * no ref count on net devices. 
+ * Steve Whitehouse : RCU for the route cache + * Steve Whitehouse : Preparations for the flow cache + * Steve Whitehouse : Prepare for nonlinear skbs + */ + +/****************************************************************************** + (c) 1995-1998 E.M. Serrat emserrat@geocities.com + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. +*******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_rt_hash_bucket +{ + struct dn_route *chain; + spinlock_t lock; +} __attribute__((__aligned__(8))); + +extern struct neigh_table dn_neigh_table; + + +static unsigned char dn_hiord_addr[6] = {0xAA,0x00,0x04,0x00,0x00,0x00}; + +static const int dn_rt_min_delay = 2 * HZ; +static const int dn_rt_max_delay = 10 * HZ; +static const int dn_rt_mtu_expires = 10 * 60 * HZ; + +static unsigned long dn_rt_deadline; + +static int dn_dst_gc(void); +static struct dst_entry *dn_dst_check(struct dst_entry *, __u32); +static struct dst_entry *dn_dst_negative_advice(struct dst_entry *); +static void dn_dst_link_failure(struct sk_buff *); +static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu); +static int dn_route_input(struct sk_buff *); +static void dn_run_flush(unsigned long dummy); + +static struct dn_rt_hash_bucket *dn_rt_hash_table; +static unsigned dn_rt_hash_mask; + +static struct timer_list dn_route_timer; +static struct timer_list dn_rt_flush_timer = + TIMER_INITIALIZER(dn_run_flush, 0, 0); +int decnet_dst_gc_interval = 2; + +static struct dst_ops dn_dst_ops = { + .family = PF_DECnet, + .protocol = __constant_htons(ETH_P_DNA_RT), + .gc_thresh = 128, + .gc = dn_dst_gc, + .check = dn_dst_check, + .negative_advice = dn_dst_negative_advice, + .link_failure = dn_dst_link_failure, + .update_pmtu = dn_dst_update_pmtu, + .entry_size = sizeof(struct dn_route), + .entries = ATOMIC_INIT(0), +}; + +static __inline__ unsigned dn_hash(unsigned short src, unsigned short dst) +{ + unsigned short tmp = src ^ dst; + tmp ^= (tmp >> 3); + tmp ^= (tmp >> 5); + tmp ^= (tmp >> 10); + return dn_rt_hash_mask & (unsigned)tmp; +} + +static inline void dnrt_free(struct dn_route *rt) +{ + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static inline void dnrt_drop(struct dn_route *rt) +{ + if (rt) + dst_release(&rt->u.dst); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static void dn_dst_check_expire(unsigned long dummy) +{ + int i; + struct dn_route *rt, **rtp; + unsigned long now = jiffies; + unsigned long expire = 120 * HZ; + + for(i = 0; i <= dn_rt_hash_mask; i++) { + rtp = &dn_rt_hash_table[i].chain; + + spin_lock(&dn_rt_hash_table[i].lock); + while((rt=*rtp) != NULL) { + if (atomic_read(&rt->u.dst.__refcnt) || + (now - rt->u.dst.lastuse) < expire) { + rtp = &rt->u.rt_next; + continue; + } + *rtp = rt->u.rt_next; + rt->u.rt_next = NULL; + 
dnrt_free(rt); + } + spin_unlock(&dn_rt_hash_table[i].lock); + + if ((jiffies - now) > 0) + break; + } + + mod_timer(&dn_route_timer, now + decnet_dst_gc_interval * HZ); +} + +static int dn_dst_gc(void) +{ + struct dn_route *rt, **rtp; + int i; + unsigned long now = jiffies; + unsigned long expire = 10 * HZ; + + for(i = 0; i <= dn_rt_hash_mask; i++) { + + spin_lock_bh(&dn_rt_hash_table[i].lock); + rtp = &dn_rt_hash_table[i].chain; + + while((rt=*rtp) != NULL) { + if (atomic_read(&rt->u.dst.__refcnt) || + (now - rt->u.dst.lastuse) < expire) { + rtp = &rt->u.rt_next; + continue; + } + *rtp = rt->u.rt_next; + rt->u.rt_next = NULL; + dnrt_drop(rt); + break; + } + spin_unlock_bh(&dn_rt_hash_table[i].lock); + } + + return 0; +} + +/* + * The decnet standards don't impose a particular minimum mtu, what they + * do insist on is that the routing layer accepts a datagram of at least + * 230 bytes long. Here we have to subtract the routing header length from + * 230 to get the minimum acceptable mtu. If there is no neighbour, then we + * assume the worst and use a long header size. + * + * We update both the mtu and the advertised mss (i.e. the segment size we + * advertise to the other end). + */ +static void dn_dst_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + u32 min_mtu = 230; + struct dn_dev *dn = dst->neighbour ? + (struct dn_dev *)dst->neighbour->dev->dn_ptr : NULL; + + if (dn && dn->use_long == 0) + min_mtu -= 6; + else + min_mtu -= 21; + + if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= min_mtu) { + if (!(dst_metric_locked(dst, RTAX_MTU))) { + dst->metrics[RTAX_MTU-1] = mtu; + dst_set_expires(dst, dn_rt_mtu_expires); + } + if (!(dst_metric_locked(dst, RTAX_ADVMSS))) { + u32 mss = mtu - DN_MAX_NSP_DATA_HEADER; + if (dst->metrics[RTAX_ADVMSS-1] > mss) + dst->metrics[RTAX_ADVMSS-1] = mss; + } + } +} + +/* + * When a route has been marked obsolete. (e.g. 
routing cache flush) + */ +static struct dst_entry *dn_dst_check(struct dst_entry *dst, __u32 cookie) +{ + return NULL; +} + +static struct dst_entry *dn_dst_negative_advice(struct dst_entry *dst) +{ + dst_release(dst); + return NULL; +} + +static void dn_dst_link_failure(struct sk_buff *skb) +{ + return; +} + +static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +{ + return memcmp(&fl1->nl_u.dn_u, &fl2->nl_u.dn_u, sizeof(fl1->nl_u.dn_u)) == 0 && + fl1->oif == fl2->oif && + fl1->iif == fl2->iif; +} + +static int dn_insert_route(struct dn_route *rt, unsigned hash, struct dn_route **rp) +{ + struct dn_route *rth, **rthp; + unsigned long now = jiffies; + + rthp = &dn_rt_hash_table[hash].chain; + + spin_lock_bh(&dn_rt_hash_table[hash].lock); + while((rth = *rthp) != NULL) { + if (compare_keys(&rth->fl, &rt->fl)) { + /* Put it first */ + *rthp = rth->u.rt_next; + rcu_assign_pointer(rth->u.rt_next, + dn_rt_hash_table[hash].chain); + rcu_assign_pointer(dn_rt_hash_table[hash].chain, rth); + + rth->u.dst.__use++; + dst_hold(&rth->u.dst); + rth->u.dst.lastuse = now; + spin_unlock_bh(&dn_rt_hash_table[hash].lock); + + dnrt_drop(rt); + *rp = rth; + return 0; + } + rthp = &rth->u.rt_next; + } + + rcu_assign_pointer(rt->u.rt_next, dn_rt_hash_table[hash].chain); + rcu_assign_pointer(dn_rt_hash_table[hash].chain, rt); + + dst_hold(&rt->u.dst); + rt->u.dst.__use++; + rt->u.dst.lastuse = now; + spin_unlock_bh(&dn_rt_hash_table[hash].lock); + *rp = rt; + return 0; +} + +void dn_run_flush(unsigned long dummy) +{ + int i; + struct dn_route *rt, *next; + + for(i = 0; i < dn_rt_hash_mask; i++) { + spin_lock_bh(&dn_rt_hash_table[i].lock); + + if ((rt = xchg(&dn_rt_hash_table[i].chain, NULL)) == NULL) + goto nothing_to_declare; + + for(; rt; rt=next) { + next = rt->u.rt_next; + rt->u.rt_next = NULL; + dst_free((struct dst_entry *)rt); + } + +nothing_to_declare: + spin_unlock_bh(&dn_rt_hash_table[i].lock); + } +} + +static DEFINE_SPINLOCK(dn_rt_flush_lock); + +void dn_rt_cache_flush(int delay) +{ + unsigned long now = jiffies; + int user_mode = !in_interrupt(); + + if (delay < 0) + delay = dn_rt_min_delay; + + spin_lock_bh(&dn_rt_flush_lock); + + if (del_timer(&dn_rt_flush_timer) && delay > 0 && dn_rt_deadline) { + long tmo = (long)(dn_rt_deadline - now); + + if (user_mode && tmo < dn_rt_max_delay - dn_rt_min_delay) + tmo = 0; + + if (delay > tmo) + delay = tmo; + } + + if (delay <= 0) { + spin_unlock_bh(&dn_rt_flush_lock); + dn_run_flush(0); + return; + } + + if (dn_rt_deadline == 0) + dn_rt_deadline = now + dn_rt_max_delay; + + dn_rt_flush_timer.expires = now + delay; + add_timer(&dn_rt_flush_timer); + spin_unlock_bh(&dn_rt_flush_lock); +} + +/** + * dn_return_short - Return a short packet to its sender + * @skb: The packet to return + * + */ +static int dn_return_short(struct sk_buff *skb) +{ + struct dn_skb_cb *cb; + unsigned char *ptr; + dn_address *src; + dn_address *dst; + dn_address tmp; + + /* Add back headers */ + skb_push(skb, skb->data - skb->nh.raw); + + if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + cb = DN_SKB_CB(skb); + /* Skip packet length and point to flags */ + ptr = skb->data + 2; + *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; + + dst = (dn_address *)ptr; + ptr += 2; + src = (dn_address *)ptr; + ptr += 2; + *ptr = 0; /* Zero hop count */ + + /* Swap source and destination */ + tmp = *src; + *src = *dst; + *dst = tmp; + + skb->pkt_type = PACKET_OUTGOING; + dn_rt_finish_output(skb, NULL, NULL); + return NET_RX_SUCCESS; +} + +/** + * 
dn_return_long - Return a long packet to its sender + * @skb: The long format packet to return + * + */ +static int dn_return_long(struct sk_buff *skb) +{ + struct dn_skb_cb *cb; + unsigned char *ptr; + unsigned char *src_addr, *dst_addr; + unsigned char tmp[ETH_ALEN]; + + /* Add back all headers */ + skb_push(skb, skb->data - skb->nh.raw); + + if ((skb = skb_unshare(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + cb = DN_SKB_CB(skb); + /* Ignore packet length and point to flags */ + ptr = skb->data + 2; + + /* Skip padding */ + if (*ptr & DN_RT_F_PF) { + char padlen = (*ptr & ~DN_RT_F_PF); + ptr += padlen; + } + + *ptr++ = (cb->rt_flags & ~DN_RT_F_RQR) | DN_RT_F_RTS; + ptr += 2; + dst_addr = ptr; + ptr += 8; + src_addr = ptr; + ptr += 6; + *ptr = 0; /* Zero hop count */ + + /* Swap source and destination */ + memcpy(tmp, src_addr, ETH_ALEN); + memcpy(src_addr, dst_addr, ETH_ALEN); + memcpy(dst_addr, tmp, ETH_ALEN); + + skb->pkt_type = PACKET_OUTGOING; + dn_rt_finish_output(skb, dst_addr, src_addr); + return NET_RX_SUCCESS; +} + +/** + * dn_route_rx_packet - Try and find a route for an incoming packet + * @skb: The packet to find a route for + * + * Returns: result of input function if route is found, error code otherwise + */ +static int dn_route_rx_packet(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + int err; + + if ((err = dn_route_input(skb)) == 0) + return dst_input(skb); + + if (decnet_debug_level & 4) { + char *devname = skb->dev ? skb->dev->name : "???"; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + printk(KERN_DEBUG + "DECnet: dn_route_rx_packet: rt_flags=0x%02x dev=%s len=%d src=0x%04hx dst=0x%04hx err=%d type=%d\n", + (int)cb->rt_flags, devname, skb->len, cb->src, cb->dst, + err, skb->pkt_type); + } + + if ((skb->pkt_type == PACKET_HOST) && (cb->rt_flags & DN_RT_F_RQR)) { + switch(cb->rt_flags & DN_RT_PKT_MSK) { + case DN_RT_PKT_SHORT: + return dn_return_short(skb); + case DN_RT_PKT_LONG: + return dn_return_long(skb); + } + } + + kfree_skb(skb); + return NET_RX_DROP; +} + +static int dn_route_rx_long(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned char *ptr = skb->data; + + if (!pskb_may_pull(skb, 21)) /* 20 for long header, 1 for shortest nsp */ + goto drop_it; + + skb_pull(skb, 20); + skb->h.raw = skb->data; + + /* Destination info */ + ptr += 2; + cb->dst = dn_htons(dn_eth2dn(ptr)); + if (memcmp(ptr, dn_hiord_addr, 4) != 0) + goto drop_it; + ptr += 6; + + + /* Source info */ + ptr += 2; + cb->src = dn_htons(dn_eth2dn(ptr)); + if (memcmp(ptr, dn_hiord_addr, 4) != 0) + goto drop_it; + ptr += 6; + /* Other junk */ + ptr++; + cb->hops = *ptr++; /* Visit Count */ + + return NF_HOOK(PF_DECnet, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, dn_route_rx_packet); + +drop_it: + kfree_skb(skb); + return NET_RX_DROP; +} + + + +static int dn_route_rx_short(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned char *ptr = skb->data; + + if (!pskb_may_pull(skb, 6)) /* 5 for short header + 1 for shortest nsp */ + goto drop_it; + + skb_pull(skb, 5); + skb->h.raw = skb->data; + + cb->dst = *(dn_address *)ptr; + ptr += 2; + cb->src = *(dn_address *)ptr; + ptr += 2; + cb->hops = *ptr & 0x3f; + + return NF_HOOK(PF_DECnet, NF_DN_PRE_ROUTING, skb, skb->dev, NULL, dn_route_rx_packet); + +drop_it: + kfree_skb(skb); + return NET_RX_DROP; +} + +static int dn_route_discard(struct sk_buff *skb) +{ + /* + * I know we drop the packet here, but thats considered success in + * this case + */ + kfree_skb(skb); + return NET_RX_SUCCESS; +} + 
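+/*
+ * Point-to-point hello packets are handed to both the device layer
+ * and the neighbour layer for processing.
+ */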
+static int dn_route_ptp_hello(struct sk_buff *skb) +{ + dn_dev_hello(skb); + dn_neigh_pointopoint_hello(skb); + return NET_RX_SUCCESS; +} + +int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct dn_skb_cb *cb; + unsigned char flags = 0; + __u16 len = dn_ntohs(*(__u16 *)skb->data); + struct dn_dev *dn = (struct dn_dev *)dev->dn_ptr; + unsigned char padlen = 0; + + if (dn == NULL) + goto dump_it; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out; + + if (!pskb_may_pull(skb, 3)) + goto dump_it; + + skb_pull(skb, 2); + + if (len > skb->len) + goto dump_it; + + skb_trim(skb, len); + + flags = *skb->data; + + cb = DN_SKB_CB(skb); + cb->stamp = jiffies; + cb->iif = dev->ifindex; + + /* + * If we have padding, remove it. + */ + if (flags & DN_RT_F_PF) { + padlen = flags & ~DN_RT_F_PF; + if (!pskb_may_pull(skb, padlen + 1)) + goto dump_it; + skb_pull(skb, padlen); + flags = *skb->data; + } + + skb->nh.raw = skb->data; + + /* + * Weed out future version DECnet + */ + if (flags & DN_RT_F_VER) + goto dump_it; + + cb->rt_flags = flags; + + if (decnet_debug_level & 1) + printk(KERN_DEBUG + "dn_route_rcv: got 0x%02x from %s [%d %d %d]\n", + (int)flags, (dev) ? dev->name : "???", len, skb->len, + padlen); + + if (flags & DN_RT_PKT_CNTL) { + if (unlikely(skb_is_nonlinear(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) + goto dump_it; + + switch(flags & DN_RT_CNTL_MSK) { + case DN_RT_PKT_INIT: + dn_dev_init_pkt(skb); + break; + case DN_RT_PKT_VERI: + dn_dev_veri_pkt(skb); + break; + } + + if (dn->parms.state != DN_DEV_S_RU) + goto dump_it; + + switch(flags & DN_RT_CNTL_MSK) { + case DN_RT_PKT_HELO: + return NF_HOOK(PF_DECnet, NF_DN_HELLO, skb, skb->dev, NULL, dn_route_ptp_hello); + + case DN_RT_PKT_L1RT: + case DN_RT_PKT_L2RT: + return NF_HOOK(PF_DECnet, NF_DN_ROUTE, skb, skb->dev, NULL, dn_route_discard); + case DN_RT_PKT_ERTH: + return NF_HOOK(PF_DECnet, NF_DN_HELLO, skb, skb->dev, NULL, dn_neigh_router_hello); + + case DN_RT_PKT_EEDH: + return NF_HOOK(PF_DECnet, NF_DN_HELLO, skb, skb->dev, NULL, dn_neigh_endnode_hello); + } + } else { + if (dn->parms.state != DN_DEV_S_RU) + goto dump_it; + + skb_pull(skb, 1); /* Pull flags */ + + switch(flags & DN_RT_PKT_MSK) { + case DN_RT_PKT_LONG: + return dn_route_rx_long(skb); + case DN_RT_PKT_SHORT: + return dn_route_rx_short(skb); + } + } + +dump_it: + kfree_skb(skb); +out: + return NET_RX_DROP; +} + +static int dn_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct dn_route *rt = (struct dn_route *)dst; + struct net_device *dev = dst->dev; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct neighbour *neigh; + + int err = -EINVAL; + + if ((neigh = dst->neighbour) == NULL) + goto error; + + skb->dev = dev; + + cb->src = rt->rt_saddr; + cb->dst = rt->rt_daddr; + + /* + * Always set the Intra-Ethernet bit on all outgoing packets + * originated on this node. Only valid flag from upper layers + * is return-to-sender-requested. Set hop count to 0 too. 
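+ * This is the dst output routine for packets originated or answered
+ * locally; forwarded traffic goes through dn_forward() instead.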
+ */ + cb->rt_flags &= ~DN_RT_F_RQR; + cb->rt_flags |= DN_RT_F_IE; + cb->hops = 0; + + return NF_HOOK(PF_DECnet, NF_DN_LOCAL_OUT, skb, NULL, dev, neigh->output); + +error: + if (net_ratelimit()) + printk(KERN_DEBUG "dn_output: This should not happen\n"); + + kfree_skb(skb); + + return err; +} + +static int dn_forward(struct sk_buff *skb) +{ + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct dst_entry *dst = skb->dst; + struct dn_dev *dn_db = dst->dev->dn_ptr; + struct dn_route *rt; + struct neighbour *neigh = dst->neighbour; + int header_len; +#ifdef CONFIG_NETFILTER + struct net_device *dev = skb->dev; +#endif + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + /* Ensure that we have enough space for headers */ + rt = (struct dn_route *)skb->dst; + header_len = dn_db->use_long ? 21 : 6; + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+header_len)) + goto drop; + + /* + * Hop count exceeded. + */ + if (++cb->hops > 30) + goto drop; + + skb->dev = rt->u.dst.dev; + + /* + * If packet goes out same interface it came in on, then set + * the Intra-Ethernet bit. This has no effect for short + * packets, so we don't need to test for them here. + */ + cb->rt_flags &= ~DN_RT_F_IE; + if (rt->rt_flags & RTCF_DOREDIRECT) + cb->rt_flags |= DN_RT_F_IE; + + return NF_HOOK(PF_DECnet, NF_DN_FORWARD, skb, dev, skb->dev, neigh->output); + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Drop packet. This is used for endnodes and for + * when we should not be forwarding packets from + * this dest. + */ +static int dn_blackhole(struct sk_buff *skb) +{ + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Used to catch bugs. This should never normally get + * called. + */ +static int dn_rt_bug(struct sk_buff *skb) +{ + if (net_ratelimit()) { + struct dn_skb_cb *cb = DN_SKB_CB(skb); + + printk(KERN_DEBUG "dn_rt_bug: skb from:%04x to:%04x\n", + cb->src, cb->dst); + } + + kfree_skb(skb); + + return NET_RX_BAD; +} + +static int dn_rt_set_next_hop(struct dn_route *rt, struct dn_fib_res *res) +{ + struct dn_fib_info *fi = res->fi; + struct net_device *dev = rt->u.dst.dev; + struct neighbour *n; + unsigned mss; + + if (fi) { + if (DN_FIB_RES_GW(*res) && + DN_FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = DN_FIB_RES_GW(*res); + memcpy(rt->u.dst.metrics, fi->fib_metrics, + sizeof(rt->u.dst.metrics)); + } + rt->rt_type = res->type; + + if (dev != NULL && rt->u.dst.neighbour == NULL) { + n = __neigh_lookup_errno(&dn_neigh_table, &rt->rt_gateway, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + rt->u.dst.neighbour = n; + } + + if (rt->u.dst.metrics[RTAX_MTU-1] == 0 || + rt->u.dst.metrics[RTAX_MTU-1] > rt->u.dst.dev->mtu) + rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; + mss = dn_mss_from_pmtu(dev, dst_mtu(&rt->u.dst)); + if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0 || + rt->u.dst.metrics[RTAX_ADVMSS-1] > mss) + rt->u.dst.metrics[RTAX_ADVMSS-1] = mss; + return 0; +} + +static inline int dn_match_addr(__u16 addr1, __u16 addr2) +{ + __u16 tmp = dn_ntohs(addr1) ^ dn_ntohs(addr2); + int match = 16; + while(tmp) { + tmp >>= 1; + match--; + } + return match; +} + +static __u16 dnet_select_source(const struct net_device *dev, __u16 daddr, int scope) +{ + __u16 saddr = 0; + struct dn_dev *dn_db = dev->dn_ptr; + struct dn_ifaddr *ifa; + int best_match = 0; + int ret; + + read_lock(&dev_base_lock); + for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_scope > scope) + continue; + if (!daddr) { + saddr = ifa->ifa_local; + break; + } + ret = dn_match_addr(daddr, ifa->ifa_local); + if 
(ret > best_match) + saddr = ifa->ifa_local; + if (best_match == 0) + saddr = ifa->ifa_local; + } + read_unlock(&dev_base_lock); + + return saddr; +} + +static inline __u16 __dn_fib_res_prefsrc(struct dn_fib_res *res) +{ + return dnet_select_source(DN_FIB_RES_DEV(*res), DN_FIB_RES_GW(*res), res->scope); +} + +static inline __u16 dn_fib_rules_map_destination(__u16 daddr, struct dn_fib_res *res) +{ + __u16 mask = dnet_make_mask(res->prefixlen); + return (daddr&~mask)|res->fi->fib_nh->nh_gw; +} + +static int dn_route_output_slow(struct dst_entry **pprt, const struct flowi *oldflp, int try_hard) +{ + struct flowi fl = { .nl_u = { .dn_u = + { .daddr = oldflp->fld_dst, + .saddr = oldflp->fld_src, + .scope = RT_SCOPE_UNIVERSE, +#ifdef CONFIG_DECNET_ROUTE_FWMARK + .fwmark = oldflp->fld_fwmark +#endif + } }, + .iif = loopback_dev.ifindex, + .oif = oldflp->oif }; + struct dn_route *rt = NULL; + struct net_device *dev_out = NULL; + struct neighbour *neigh = NULL; + unsigned hash; + unsigned flags = 0; + struct dn_fib_res res = { .fi = NULL, .type = RTN_UNICAST }; + int err; + int free_res = 0; + __u16 gateway = 0; + + if (decnet_debug_level & 16) + printk(KERN_DEBUG + "dn_route_output_slow: dst=%04x src=%04x mark=%d" + " iif=%d oif=%d\n", oldflp->fld_dst, oldflp->fld_src, + oldflp->fld_fwmark, loopback_dev.ifindex, oldflp->oif); + + /* If we have an output interface, verify its a DECnet device */ + if (oldflp->oif) { + dev_out = dev_get_by_index(oldflp->oif); + err = -ENODEV; + if (dev_out && dev_out->dn_ptr == NULL) { + dev_put(dev_out); + dev_out = NULL; + } + if (dev_out == NULL) + goto out; + } + + /* If we have a source address, verify that its a local address */ + if (oldflp->fld_src) { + err = -EADDRNOTAVAIL; + + if (dev_out) { + if (dn_dev_islocal(dev_out, oldflp->fld_src)) + goto source_ok; + dev_put(dev_out); + goto out; + } + read_lock(&dev_base_lock); + for(dev_out = dev_base; dev_out; dev_out = dev_out->next) { + if (!dev_out->dn_ptr) + continue; + if (dn_dev_islocal(dev_out, oldflp->fld_src)) + break; + } + read_unlock(&dev_base_lock); + if (dev_out == NULL) + goto out; + dev_hold(dev_out); +source_ok: + ; + } + + /* No destination? Assume its local */ + if (!fl.fld_dst) { + fl.fld_dst = fl.fld_src; + + err = -EADDRNOTAVAIL; + if (dev_out) + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + if (!fl.fld_dst) { + fl.fld_dst = + fl.fld_src = dnet_select_source(dev_out, 0, + RT_SCOPE_HOST); + if (!fl.fld_dst) + goto out; + } + fl.oif = loopback_dev.ifindex; + res.type = RTN_LOCAL; + goto make_route; + } + + if (decnet_debug_level & 16) + printk(KERN_DEBUG + "dn_route_output_slow: initial checks complete." + " dst=%o4x src=%04x oif=%d try_hard=%d\n", fl.fld_dst, + fl.fld_src, fl.oif, try_hard); + + /* + * N.B. If the kernel is compiled without router support then + * dn_fib_lookup() will evaluate to non-zero so this if () block + * will always be executed. + */ + err = -ESRCH; + if (try_hard || (err = dn_fib_lookup(&fl, &res)) != 0) { + struct dn_dev *dn_db; + if (err != -ESRCH) + goto out; + /* + * Here the fallback is basically the standard algorithm for + * routing in endnodes which is described in the DECnet routing + * docs + * + * If we are not trying hard, look in neighbour cache. 
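+ * (neigh_lookup_nodev() keys the lookup on destination address only.)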
+ * The result is tested to ensure that if a specific output + * device/source address was requested, then we honour that + * here + */ + if (!try_hard) { + neigh = neigh_lookup_nodev(&dn_neigh_table, &fl.fld_dst); + if (neigh) { + if ((oldflp->oif && + (neigh->dev->ifindex != oldflp->oif)) || + (oldflp->fld_src && + (!dn_dev_islocal(neigh->dev, + oldflp->fld_src)))) { + neigh_release(neigh); + neigh = NULL; + } else { + if (dev_out) + dev_put(dev_out); + if (dn_dev_islocal(neigh->dev, fl.fld_dst)) { + dev_out = &loopback_dev; + res.type = RTN_LOCAL; + } else { + dev_out = neigh->dev; + } + dev_hold(dev_out); + goto select_source; + } + } + } + + /* Not there? Perhaps its a local address */ + if (dev_out == NULL) + dev_out = dn_dev_get_default(); + err = -ENODEV; + if (dev_out == NULL) + goto out; + dn_db = dev_out->dn_ptr; + /* Possible improvement - check all devices for local addr */ + if (dn_dev_islocal(dev_out, fl.fld_dst)) { + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + res.type = RTN_LOCAL; + goto select_source; + } + /* Not local either.... try sending it to the default router */ + neigh = neigh_clone(dn_db->router); + BUG_ON(neigh && neigh->dev != dev_out); + + /* Ok then, we assume its directly connected and move on */ +select_source: + if (neigh) + gateway = ((struct dn_neigh *)neigh)->addr; + if (gateway == 0) + gateway = fl.fld_dst; + if (fl.fld_src == 0) { + fl.fld_src = dnet_select_source(dev_out, gateway, + res.type == RTN_LOCAL ? + RT_SCOPE_HOST : + RT_SCOPE_LINK); + if (fl.fld_src == 0 && res.type != RTN_LOCAL) + goto e_addr; + } + fl.oif = dev_out->ifindex; + goto make_route; + } + free_res = 1; + + if (res.type == RTN_NAT) + goto e_inval; + + if (res.type == RTN_LOCAL) { + if (!fl.fld_src) + fl.fld_src = fl.fld_dst; + if (dev_out) + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + if (res.fi) + dn_fib_info_put(res.fi); + res.fi = NULL; + goto make_route; + } + + if (res.fi->fib_nhs > 1 && fl.oif == 0) + dn_fib_select_multipath(&fl, &res); + + /* + * We could add some logic to deal with default routes here and + * get rid of some of the special casing above. + */ + + if (!fl.fld_src) + fl.fld_src = DN_FIB_RES_PREFSRC(res); + + if (dev_out) + dev_put(dev_out); + dev_out = DN_FIB_RES_DEV(res); + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + gateway = DN_FIB_RES_GW(res); + +make_route: + if (dev_out->flags & IFF_LOOPBACK) + flags |= RTCF_LOCAL; + + rt = dst_alloc(&dn_dst_ops); + if (rt == NULL) + goto e_nobufs; + + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.flags = DST_HOST; + + rt->fl.fld_src = oldflp->fld_src; + rt->fl.fld_dst = oldflp->fld_dst; + rt->fl.oif = oldflp->oif; + rt->fl.iif = 0; +#ifdef CONFIG_DECNET_ROUTE_FWMARK + rt->fl.fld_fwmark = oldflp->fld_fwmark; +#endif + + rt->rt_saddr = fl.fld_src; + rt->rt_daddr = fl.fld_dst; + rt->rt_gateway = gateway ? 
gateway : fl.fld_dst; + rt->rt_local_src = fl.fld_src; + + rt->rt_dst_map = fl.fld_dst; + rt->rt_src_map = fl.fld_src; + + rt->u.dst.dev = dev_out; + dev_hold(dev_out); + rt->u.dst.neighbour = neigh; + neigh = NULL; + + rt->u.dst.lastuse = jiffies; + rt->u.dst.output = dn_output; + rt->u.dst.input = dn_rt_bug; + rt->rt_flags = flags; + if (flags & RTCF_LOCAL) + rt->u.dst.input = dn_nsp_rx; + + err = dn_rt_set_next_hop(rt, &res); + if (err) + goto e_neighbour; + + hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst); + dn_insert_route(rt, hash, (struct dn_route **)pprt); + +done: + if (neigh) + neigh_release(neigh); + if (free_res) + dn_fib_res_put(&res); + if (dev_out) + dev_put(dev_out); +out: + return err; + +e_addr: + err = -EADDRNOTAVAIL; + goto done; +e_inval: + err = -EINVAL; + goto done; +e_nobufs: + err = -ENOBUFS; + goto done; +e_neighbour: + dst_free(&rt->u.dst); + goto e_nobufs; +} + + +/* + * N.B. The flags may be moved into the flowi at some future stage. + */ +static int __dn_route_output_key(struct dst_entry **pprt, const struct flowi *flp, int flags) +{ + unsigned hash = dn_hash(flp->fld_src, flp->fld_dst); + struct dn_route *rt = NULL; + + if (!(flags & MSG_TRYHARD)) { + rcu_read_lock_bh(); + for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt; + rt = rcu_dereference(rt->u.rt_next)) { + if ((flp->fld_dst == rt->fl.fld_dst) && + (flp->fld_src == rt->fl.fld_src) && +#ifdef CONFIG_DECNET_ROUTE_FWMARK + (flp->fld_fwmark == rt->fl.fld_fwmark) && +#endif + (rt->fl.iif == 0) && + (rt->fl.oif == flp->oif)) { + rt->u.dst.lastuse = jiffies; + dst_hold(&rt->u.dst); + rt->u.dst.__use++; + rcu_read_unlock_bh(); + *pprt = &rt->u.dst; + return 0; + } + } + rcu_read_unlock_bh(); + } + + return dn_route_output_slow(pprt, flp, flags); +} + +static int dn_route_output_key(struct dst_entry **pprt, struct flowi *flp, int flags) +{ + int err; + + err = __dn_route_output_key(pprt, flp, flags); + if (err == 0 && flp->proto) { + err = xfrm_lookup(pprt, flp, NULL, 0); + } + return err; +} + +int dn_route_output_sock(struct dst_entry **pprt, struct flowi *fl, struct sock *sk, int flags) +{ + int err; + + err = __dn_route_output_key(pprt, fl, flags & MSG_TRYHARD); + if (err == 0 && fl->proto) { + err = xfrm_lookup(pprt, fl, sk, !(flags & MSG_DONTWAIT)); + } + return err; +} + +static int dn_route_input_slow(struct sk_buff *skb) +{ + struct dn_route *rt = NULL; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + struct net_device *in_dev = skb->dev; + struct net_device *out_dev = NULL; + struct dn_dev *dn_db; + struct neighbour *neigh = NULL; + unsigned hash; + int flags = 0; + __u16 gateway = 0; + __u16 local_src = 0; + struct flowi fl = { .nl_u = { .dn_u = + { .daddr = cb->dst, + .saddr = cb->src, + .scope = RT_SCOPE_UNIVERSE, +#ifdef CONFIG_DECNET_ROUTE_FWMARK + .fwmark = skb->nfmark +#endif + } }, + .iif = skb->dev->ifindex }; + struct dn_fib_res res = { .fi = NULL, .type = RTN_UNREACHABLE }; + int err = -EINVAL; + int free_res = 0; + + dev_hold(in_dev); + + if ((dn_db = in_dev->dn_ptr) == NULL) + goto out; + + /* Zero source addresses are not allowed */ + if (fl.fld_src == 0) + goto out; + + /* + * In this case we've just received a packet from a source + * outside ourselves pretending to come from us. We don't + * allow it any further to prevent routing loops, spoofing and + * other nasties. Loopback packets already have the dst attached + * so this only affects packets which have originated elsewhere. 
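+ * Such packets are rejected with -ENOTUNIQ.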
+ */ + err = -ENOTUNIQ; + if (dn_dev_islocal(in_dev, cb->src)) + goto out; + + err = dn_fib_lookup(&fl, &res); + if (err) { + if (err != -ESRCH) + goto out; + /* + * Is the destination us ? + */ + if (!dn_dev_islocal(in_dev, cb->dst)) + goto e_inval; + + res.type = RTN_LOCAL; + flags |= RTCF_DIRECTSRC; + } else { + __u16 src_map = fl.fld_src; + free_res = 1; + + out_dev = DN_FIB_RES_DEV(res); + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in dn_route_input_slow() " + "No output device\n"); + goto e_inval; + } + dev_hold(out_dev); + + if (res.r) + src_map = dn_fib_rules_policy(fl.fld_src, &res, &flags); + + gateway = DN_FIB_RES_GW(res); + if (res.type == RTN_NAT) { + fl.fld_dst = dn_fib_rules_map_destination(fl.fld_dst, &res); + dn_fib_res_put(&res); + free_res = 0; + if (dn_fib_lookup(&fl, &res)) + goto e_inval; + free_res = 1; + if (res.type != RTN_UNICAST) + goto e_inval; + flags |= RTCF_DNAT; + gateway = fl.fld_dst; + } + fl.fld_src = src_map; + } + + switch(res.type) { + case RTN_UNICAST: + /* + * Forwarding check here, we only check for forwarding + * being turned off, if you want to only forward intra + * area, its up to you to set the routing tables up + * correctly. + */ + if (dn_db->parms.forwarding == 0) + goto e_inval; + + if (res.fi->fib_nhs > 1 && fl.oif == 0) + dn_fib_select_multipath(&fl, &res); + + /* + * Check for out_dev == in_dev. We use the RTCF_DOREDIRECT + * flag as a hint to set the intra-ethernet bit when + * forwarding. If we've got NAT in operation, we don't do + * this optimisation. + */ + if (out_dev == in_dev && !(flags & RTCF_NAT)) + flags |= RTCF_DOREDIRECT; + + local_src = DN_FIB_RES_PREFSRC(res); + + case RTN_BLACKHOLE: + case RTN_UNREACHABLE: + break; + case RTN_LOCAL: + flags |= RTCF_LOCAL; + fl.fld_src = cb->dst; + fl.fld_dst = cb->src; + + /* Routing tables gave us a gateway */ + if (gateway) + goto make_route; + + /* Packet was intra-ethernet, so we know its on-link */ + if (cb->rt_flags | DN_RT_F_IE) { + gateway = cb->src; + flags |= RTCF_DIRECTSRC; + goto make_route; + } + + /* Use the default router if there is one */ + neigh = neigh_clone(dn_db->router); + if (neigh) { + gateway = ((struct dn_neigh *)neigh)->addr; + goto make_route; + } + + /* Close eyes and pray */ + gateway = cb->src; + flags |= RTCF_DIRECTSRC; + goto make_route; + default: + goto e_inval; + } + +make_route: + rt = dst_alloc(&dn_dst_ops); + if (rt == NULL) + goto e_nobufs; + + rt->rt_saddr = fl.fld_src; + rt->rt_daddr = fl.fld_dst; + rt->rt_gateway = fl.fld_dst; + if (gateway) + rt->rt_gateway = gateway; + rt->rt_local_src = local_src ? 
local_src : rt->rt_saddr; + + rt->rt_dst_map = fl.fld_dst; + rt->rt_src_map = fl.fld_src; + + rt->fl.fld_src = cb->src; + rt->fl.fld_dst = cb->dst; + rt->fl.oif = 0; + rt->fl.iif = in_dev->ifindex; + rt->fl.fld_fwmark = fl.fld_fwmark; + + rt->u.dst.flags = DST_HOST; + rt->u.dst.neighbour = neigh; + rt->u.dst.dev = out_dev; + rt->u.dst.lastuse = jiffies; + rt->u.dst.output = dn_rt_bug; + switch(res.type) { + case RTN_UNICAST: + rt->u.dst.input = dn_forward; + break; + case RTN_LOCAL: + rt->u.dst.output = dn_output; + rt->u.dst.input = dn_nsp_rx; + rt->u.dst.dev = in_dev; + flags |= RTCF_LOCAL; + break; + default: + case RTN_UNREACHABLE: + case RTN_BLACKHOLE: + rt->u.dst.input = dn_blackhole; + } + rt->rt_flags = flags; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + + err = dn_rt_set_next_hop(rt, &res); + if (err) + goto e_neighbour; + + hash = dn_hash(rt->fl.fld_src, rt->fl.fld_dst); + dn_insert_route(rt, hash, (struct dn_route **)&skb->dst); + +done: + if (neigh) + neigh_release(neigh); + if (free_res) + dn_fib_res_put(&res); + dev_put(in_dev); + if (out_dev) + dev_put(out_dev); +out: + return err; + +e_inval: + err = -EINVAL; + goto done; + +e_nobufs: + err = -ENOBUFS; + goto done; + +e_neighbour: + dst_free(&rt->u.dst); + goto done; +} + +int dn_route_input(struct sk_buff *skb) +{ + struct dn_route *rt; + struct dn_skb_cb *cb = DN_SKB_CB(skb); + unsigned hash = dn_hash(cb->src, cb->dst); + + if (skb->dst) + return 0; + + rcu_read_lock(); + for(rt = rcu_dereference(dn_rt_hash_table[hash].chain); rt != NULL; + rt = rcu_dereference(rt->u.rt_next)) { + if ((rt->fl.fld_src == cb->src) && + (rt->fl.fld_dst == cb->dst) && + (rt->fl.oif == 0) && +#ifdef CONFIG_DECNET_ROUTE_FWMARK + (rt->fl.fld_fwmark == skb->nfmark) && +#endif + (rt->fl.iif == cb->iif)) { + rt->u.dst.lastuse = jiffies; + dst_hold(&rt->u.dst); + rt->u.dst.__use++; + rcu_read_unlock(); + skb->dst = (struct dst_entry *)rt; + return 0; + } + } + rcu_read_unlock(); + + return dn_route_input_slow(skb); +} + +static int dn_rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait) +{ + struct dn_route *rt = (struct dn_route *)skb->dst; + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + r = NLMSG_DATA(nlh); + nlh->nlmsg_flags = (nowait && pid) ? NLM_F_MULTI : 0; + r->rtm_family = AF_DECnet; + r->rtm_dst_len = 16; + r->rtm_src_len = 0; + r->rtm_tos = 0; + r->rtm_table = RT_TABLE_MAIN; + r->rtm_type = rt->rt_type; + r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + if (rt->rt_flags & RTCF_NOTIFY) + r->rtm_flags |= RTM_F_NOTIFY; + RTA_PUT(skb, RTA_DST, 2, &rt->rt_daddr); + if (rt->fl.fld_src) { + r->rtm_src_len = 16; + RTA_PUT(skb, RTA_SRC, 2, &rt->fl.fld_src); + } + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); + /* + * Note to self - change this if input routes reverse direction when + * they deal only with inputs and not with replies like they do + * currently. 
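+ * (RTA_PREFSRC below is filled from rt_local_src, presumably the
+ * source address a locally generated reply would use.)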
+ */ + RTA_PUT(skb, RTA_PREFSRC, 2, &rt->rt_local_src); + if (rt->rt_daddr != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 2, &rt->rt_gateway); + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + goto rtattr_failure; + ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); + ci.rta_used = rt->u.dst.__use; + ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); + if (rt->u.dst.expires) + ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies); + else + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; + ci.rta_id = ci.rta_ts = ci.rta_tsage = 0; + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); + if (rt->fl.iif) + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +/* + * This is called by both endnodes and routers now. + */ +int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct dn_route *rt = NULL; + struct dn_skb_cb *cb; + int err; + struct sk_buff *skb; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = DNPROTO_NSP; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + return -ENOBUFS; + skb->mac.raw = skb->data; + cb = DN_SKB_CB(skb); + + if (rta[RTA_SRC-1]) + memcpy(&fl.fld_src, RTA_DATA(rta[RTA_SRC-1]), 2); + if (rta[RTA_DST-1]) + memcpy(&fl.fld_dst, RTA_DATA(rta[RTA_DST-1]), 2); + if (rta[RTA_IIF-1]) + memcpy(&fl.iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); + + if (fl.iif) { + struct net_device *dev; + if ((dev = dev_get_by_index(fl.iif)) == NULL) { + kfree_skb(skb); + return -ENODEV; + } + if (!dev->dn_ptr) { + dev_put(dev); + kfree_skb(skb); + return -ENODEV; + } + skb->protocol = __constant_htons(ETH_P_DNA_RT); + skb->dev = dev; + cb->src = fl.fld_src; + cb->dst = fl.fld_dst; + local_bh_disable(); + err = dn_route_input(skb); + local_bh_enable(); + memset(cb, 0, sizeof(struct dn_skb_cb)); + rt = (struct dn_route *)skb->dst; + if (!err && -rt->u.dst.error) + err = rt->u.dst.error; + } else { + int oif = 0; + if (rta[RTA_OIF - 1]) + memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); + fl.oif = oif; + err = dn_route_output_key((struct dst_entry **)&rt, &fl, 0); + } + + if (skb->dev) + dev_put(skb->dev); + skb->dev = NULL; + if (err) + goto out_free; + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + + err = dn_rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0); + + if (err == 0) + goto out_free; + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + + return err; + +out_free: + kfree_skb(skb); + return err; +} + +/* + * For routers, this is called from dn_fib_dump, but for endnodes its + * called directly from the rtnetlink dispatch table. 
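+ * It walks the route cache hash table, dumping each entry via
+ * dn_rt_fill_info() and saving its position in cb->args[] so that
+ * the dump can be resumed.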
+ */ +int dn_cache_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct dn_route *rt; + int h, s_h; + int idx, s_idx; + + if (NLMSG_PAYLOAD(cb->nlh, 0) < sizeof(struct rtmsg)) + return -EINVAL; + if (!(((struct rtmsg *)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED)) + return 0; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for(h = 0; h <= dn_rt_hash_mask; h++) { + if (h < s_h) + continue; + if (h > s_h) + s_idx = 0; + rcu_read_lock_bh(); + for(rt = rcu_dereference(dn_rt_hash_table[h].chain), idx = 0; + rt; + rt = rcu_dereference(rt->u.rt_next), idx++) { + if (idx < s_idx) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (dn_rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1) <= 0) { + dst_release(xchg(&skb->dst, NULL)); + rcu_read_unlock_bh(); + goto done; + } + dst_release(xchg(&skb->dst, NULL)); + } + rcu_read_unlock_bh(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; +} + +#ifdef CONFIG_PROC_FS +struct dn_rt_cache_iter_state { + int bucket; +}; + +static struct dn_route *dn_rt_cache_get_first(struct seq_file *seq) +{ + struct dn_route *rt = NULL; + struct dn_rt_cache_iter_state *s = seq->private; + + for(s->bucket = dn_rt_hash_mask; s->bucket >= 0; --s->bucket) { + rcu_read_lock_bh(); + rt = dn_rt_hash_table[s->bucket].chain; + if (rt) + break; + rcu_read_unlock_bh(); + } + return rt; +} + +static struct dn_route *dn_rt_cache_get_next(struct seq_file *seq, struct dn_route *rt) +{ + struct dn_rt_cache_iter_state *s = rcu_dereference(seq->private); + + rt = rt->u.rt_next; + while(!rt) { + rcu_read_unlock_bh(); + if (--s->bucket < 0) + break; + rcu_read_lock_bh(); + rt = dn_rt_hash_table[s->bucket].chain; + } + return rt; +} + +static void *dn_rt_cache_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct dn_route *rt = dn_rt_cache_get_first(seq); + + if (rt) { + while(*pos && (rt = dn_rt_cache_get_next(seq, rt))) + --*pos; + } + return *pos ? NULL : rt; +} + +static void *dn_rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct dn_route *rt = dn_rt_cache_get_next(seq, v); + ++*pos; + return rt; +} + +static void dn_rt_cache_seq_stop(struct seq_file *seq, void *v) +{ + if (v) + rcu_read_unlock_bh(); +} + +static int dn_rt_cache_seq_show(struct seq_file *seq, void *v) +{ + struct dn_route *rt = v; + char buf1[DN_ASCBUF_LEN], buf2[DN_ASCBUF_LEN]; + + seq_printf(seq, "%-8s %-7s %-7s %04d %04d %04d\n", + rt->u.dst.dev ? 
rt->u.dst.dev->name : "*", + dn_addr2asc(dn_ntohs(rt->rt_daddr), buf1), + dn_addr2asc(dn_ntohs(rt->rt_saddr), buf2), + atomic_read(&rt->u.dst.__refcnt), + rt->u.dst.__use, + (int) dst_metric(&rt->u.dst, RTAX_RTT)); + return 0; +} + +static struct seq_operations dn_rt_cache_seq_ops = { + .start = dn_rt_cache_seq_start, + .next = dn_rt_cache_seq_next, + .stop = dn_rt_cache_seq_stop, + .show = dn_rt_cache_seq_show, +}; + +static int dn_rt_cache_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct dn_rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &dn_rt_cache_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations dn_rt_cache_seq_fops = { + .owner = THIS_MODULE, + .open = dn_rt_cache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* CONFIG_PROC_FS */ + +void __init dn_route_init(void) +{ + int i, goal, order; + + dn_dst_ops.kmem_cachep = kmem_cache_create("dn_dst_cache", + sizeof(struct dn_route), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!dn_dst_ops.kmem_cachep) + panic("DECnet: Failed to allocate dn_dst_cache\n"); + + init_timer(&dn_route_timer); + dn_route_timer.function = dn_dst_check_expire; + dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; + add_timer(&dn_route_timer); + + goal = num_physpages >> (26 - PAGE_SHIFT); + + for(order = 0; (1UL << order) < goal; order++) + /* NOTHING */; + + /* + * Only want 1024 entries max, since the table is very, very unlikely + * to be larger than that. + */ + while(order && ((((1UL << order) * PAGE_SIZE) / + sizeof(struct dn_rt_hash_bucket)) >= 2048)) + order--; + + do { + dn_rt_hash_mask = (1UL << order) * PAGE_SIZE / + sizeof(struct dn_rt_hash_bucket); + while(dn_rt_hash_mask & (dn_rt_hash_mask - 1)) + dn_rt_hash_mask--; + dn_rt_hash_table = (struct dn_rt_hash_bucket *) + __get_free_pages(GFP_ATOMIC, order); + } while (dn_rt_hash_table == NULL && --order > 0); + + if (!dn_rt_hash_table) + panic("Failed to allocate DECnet route cache hash table\n"); + + printk(KERN_INFO + "DECnet: Routing cache hash table of %u buckets, %ldKbytes\n", + dn_rt_hash_mask, + (long)(dn_rt_hash_mask*sizeof(struct dn_rt_hash_bucket))/1024); + + dn_rt_hash_mask--; + for(i = 0; i <= dn_rt_hash_mask; i++) { + spin_lock_init(&dn_rt_hash_table[i].lock); + dn_rt_hash_table[i].chain = NULL; + } + + dn_dst_ops.gc_thresh = (dn_rt_hash_mask + 1); + + proc_net_fops_create("decnet_cache", S_IRUGO, &dn_rt_cache_seq_fops); +} + +void __exit dn_route_cleanup(void) +{ + del_timer(&dn_route_timer); + dn_run_flush(0); + + proc_net_remove("decnet_cache"); +} + diff --git a/net/decnet/dn_rules.c b/net/decnet/dn_rules.c new file mode 100644 index 000000000000..597587d170d8 --- /dev/null +++ b/net/decnet/dn_rules.c @@ -0,0 +1,416 @@ + +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. 
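As an editorial aside: the sizing loop in dn_route_init() above picks an allocation order from the amount of physical memory (one page of buckets per 64MB), caps the table (the comment aims for about 1024 buckets while the test allows up to 2048), and rounds the bucket count down to a power of two so that count-1 can serve as an index mask. A standalone sketch of the same arithmetic follows; the 4K page size and 32-byte bucket size are assumed stand-ins.

/* Editorial sketch of the bucket-count arithmetic in dn_route_init().
 * PAGE_SIZE and the bucket size are assumed values for illustration. */
#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define BUCKET_SIZE	32UL	/* stand-in for sizeof(struct dn_rt_hash_bucket) */

int main(void)
{
	unsigned long num_physpages = 32768;	/* e.g. 128MB of 4K pages */
	unsigned long goal, hash_size;
	int order;

	/* One page of buckets per 64MB (2^26 bytes) of memory. */
	goal = num_physpages >> (26 - PAGE_SHIFT);

	for (order = 0; (1UL << order) < goal; order++)
		/* NOTHING */;

	/* Cap the table size: no point in thousands of buckets. */
	while (order && ((1UL << order) * PAGE_SIZE / BUCKET_SIZE) >= 2048)
		order--;

	/* Round the bucket count down to a power of two... */
	hash_size = (1UL << order) * PAGE_SIZE / BUCKET_SIZE;
	while (hash_size & (hash_size - 1))
		hash_size--;

	/* ...so that (hash_size - 1) works as an index mask. */
	printf("order %d, %lu buckets, mask 0x%lx\n",
	       order, hash_size, hash_size - 1);
	return 0;
}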
+ * + * DECnet Routing Forwarding Information Base (Rules) + * + * Author: Steve Whitehouse + * Mostly copied from Alexey Kuznetsov's ipv4/fib_rules.c + * + * + * Changes: + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_fib_rule +{ + struct dn_fib_rule *r_next; + atomic_t r_clntref; + u32 r_preference; + unsigned char r_table; + unsigned char r_action; + unsigned char r_dst_len; + unsigned char r_src_len; + dn_address r_src; + dn_address r_srcmask; + dn_address r_dst; + dn_address r_dstmask; + dn_address r_srcmap; + u8 r_flags; +#ifdef CONFIG_DECNET_ROUTE_FWMARK + u32 r_fwmark; +#endif + int r_ifindex; + char r_ifname[IFNAMSIZ]; + int r_dead; +}; + +static struct dn_fib_rule default_rule = { + .r_clntref = ATOMIC_INIT(2), + .r_preference = 0x7fff, + .r_table = RT_TABLE_MAIN, + .r_action = RTN_UNICAST +}; + +static struct dn_fib_rule *dn_fib_rules = &default_rule; +static DEFINE_RWLOCK(dn_fib_rules_lock); + + +int dn_fib_rtm_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct dn_fib_rule *r, **rp; + int err = -ESRCH; + + for(rp=&dn_fib_rules; (r=*rp) != NULL; rp = &r->r_next) { + if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 2) == 0) && + rtm->rtm_src_len == r->r_src_len && + rtm->rtm_dst_len == r->r_dst_len && + (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 2) == 0) && +#ifdef CONFIG_DECNET_ROUTE_FWMARK + (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) && +#endif + (!rtm->rtm_type || rtm->rtm_type == r->r_action) && + (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && + (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) && + (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + + err = -EPERM; + if (r == &default_rule) + break; + + write_lock_bh(&dn_fib_rules_lock); + *rp = r->r_next; + r->r_dead = 1; + write_unlock_bh(&dn_fib_rules_lock); + dn_fib_rule_put(r); + err = 0; + break; + } + } + + return err; +} + +void dn_fib_rule_put(struct dn_fib_rule *r) +{ + if (atomic_dec_and_test(&r->r_clntref)) { + if (r->r_dead) + kfree(r); + else + printk(KERN_DEBUG "Attempt to free alive dn_fib_rule\n"); + } +} + + +int dn_fib_rtm_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct dn_fib_rule *r, *new_r, **rp; + unsigned char table_id; + + if (rtm->rtm_src_len > 16 || rtm->rtm_dst_len > 16) + return -EINVAL; + + if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ) + return -EINVAL; + + if (rtm->rtm_type == RTN_NAT) + return -EINVAL; + + table_id = rtm->rtm_table; + if (table_id == RT_TABLE_UNSPEC) { + struct dn_fib_table *tb; + if (rtm->rtm_type == RTN_UNICAST) { + if ((tb = dn_fib_empty_table()) == NULL) + return -ENOBUFS; + table_id = tb->n; + } + } + + new_r = kmalloc(sizeof(*new_r), GFP_KERNEL); + if (!new_r) + return -ENOMEM; + memset(new_r, 0, sizeof(*new_r)); + if (rta[RTA_SRC-1]) + memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 2); + if (rta[RTA_DST-1]) + memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 2); + if (rta[RTA_GATEWAY-1]) + memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 2); + new_r->r_src_len = rtm->rtm_src_len; + new_r->r_dst_len = 
rtm->rtm_dst_len; + new_r->r_srcmask = dnet_make_mask(rtm->rtm_src_len); + new_r->r_dstmask = dnet_make_mask(rtm->rtm_dst_len); +#ifdef CONFIG_DECNET_ROUTE_FWMARK + if (rta[RTA_PROTOINFO-1]) + memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4); +#endif + new_r->r_action = rtm->rtm_type; + new_r->r_flags = rtm->rtm_flags; + if (rta[RTA_PRIORITY-1]) + memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + new_r->r_table = table_id; + if (rta[RTA_IIF-1]) { + struct net_device *dev; + rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ); + new_r->r_ifindex = -1; + dev = dev_get_by_name(new_r->r_ifname); + if (dev) { + new_r->r_ifindex = dev->ifindex; + dev_put(dev); + } + } + + rp = &dn_fib_rules; + if (!new_r->r_preference) { + r = dn_fib_rules; + if (r && (r = r->r_next) != NULL) { + rp = &dn_fib_rules->r_next; + if (r->r_preference) + new_r->r_preference = r->r_preference - 1; + } + } + + while((r=*rp) != NULL) { + if (r->r_preference > new_r->r_preference) + break; + rp = &r->r_next; + } + + new_r->r_next = r; + atomic_inc(&new_r->r_clntref); + write_lock_bh(&dn_fib_rules_lock); + *rp = new_r; + write_unlock_bh(&dn_fib_rules_lock); + return 0; +} + + +int dn_fib_lookup(const struct flowi *flp, struct dn_fib_res *res) +{ + struct dn_fib_rule *r, *policy; + struct dn_fib_table *tb; + dn_address saddr = flp->fld_src; + dn_address daddr = flp->fld_dst; + int err; + + read_lock(&dn_fib_rules_lock); + for(r = dn_fib_rules; r; r = r->r_next) { + if (((saddr^r->r_src) & r->r_srcmask) || + ((daddr^r->r_dst) & r->r_dstmask) || +#ifdef CONFIG_DECNET_ROUTE_FWMARK + (r->r_fwmark && r->r_fwmark != flp->fld_fwmark) || +#endif + (r->r_ifindex && r->r_ifindex != flp->iif)) + continue; + + switch(r->r_action) { + case RTN_UNICAST: + case RTN_NAT: + policy = r; + break; + case RTN_UNREACHABLE: + read_unlock(&dn_fib_rules_lock); + return -ENETUNREACH; + default: + case RTN_BLACKHOLE: + read_unlock(&dn_fib_rules_lock); + return -EINVAL; + case RTN_PROHIBIT: + read_unlock(&dn_fib_rules_lock); + return -EACCES; + } + + if ((tb = dn_fib_get_table(r->r_table, 0)) == NULL) + continue; + err = tb->lookup(tb, flp, res); + if (err == 0) { + res->r = policy; + if (policy) + atomic_inc(&policy->r_clntref); + read_unlock(&dn_fib_rules_lock); + return 0; + } + if (err < 0 && err != -EAGAIN) { + read_unlock(&dn_fib_rules_lock); + return err; + } + } + + read_unlock(&dn_fib_rules_lock); + return -ESRCH; +} + +unsigned dnet_addr_type(__u16 addr) +{ + struct flowi fl = { .nl_u = { .dn_u = { .daddr = addr } } }; + struct dn_fib_res res; + unsigned ret = RTN_UNICAST; + struct dn_fib_table *tb = dn_fib_tables[RT_TABLE_LOCAL]; + + res.r = NULL; + + if (tb) { + if (!tb->lookup(tb, &fl, &res)) { + ret = res.type; + dn_fib_res_put(&res); + } + } + return ret; +} + +__u16 dn_fib_rules_policy(__u16 saddr, struct dn_fib_res *res, unsigned *flags) +{ + struct dn_fib_rule *r = res->r; + + if (r->r_action == RTN_NAT) { + int addrtype = dnet_addr_type(r->r_srcmap); + + if (addrtype == RTN_NAT) { + saddr = (saddr&~r->r_srcmask)|r->r_srcmap; + *flags |= RTCF_SNAT; + } else if (addrtype == RTN_LOCAL || r->r_srcmap == 0) { + saddr = r->r_srcmap; + *flags |= RTCF_MASQ; + } + } + return saddr; +} + +static void dn_fib_rules_detach(struct net_device *dev) +{ + struct dn_fib_rule *r; + + for(r = dn_fib_rules; r; r = r->r_next) { + if (r->r_ifindex == dev->ifindex) { + write_lock_bh(&dn_fib_rules_lock); + r->r_ifindex = -1; + write_unlock_bh(&dn_fib_rules_lock); + } + } +} + +static void dn_fib_rules_attach(struct 
net_device *dev) +{ + struct dn_fib_rule *r; + + for(r = dn_fib_rules; r; r = r->r_next) { + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) { + write_lock_bh(&dn_fib_rules_lock); + r->r_ifindex = dev->ifindex; + write_unlock_bh(&dn_fib_rules_lock); + } + } +} + +static int dn_fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch(event) { + case NETDEV_UNREGISTER: + dn_fib_rules_detach(dev); + dn_fib_sync_down(0, dev, 1); + case NETDEV_REGISTER: + dn_fib_rules_attach(dev); + dn_fib_sync_up(dev); + } + + return NOTIFY_DONE; +} + + +static struct notifier_block dn_fib_rules_notifier = { + .notifier_call = dn_fib_rules_event, +}; + +static int dn_fib_fill_rule(struct sk_buff *skb, struct dn_fib_rule *r, struct netlink_callback *cb) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + + nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_DECnet; + rtm->rtm_dst_len = r->r_dst_len; + rtm->rtm_src_len = r->r_src_len; + rtm->rtm_tos = 0; +#ifdef CONFIG_DECNET_ROUTE_FWMARK + if (r->r_fwmark) + RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark); +#endif + rtm->rtm_table = r->r_table; + rtm->rtm_protocol = 0; + rtm->rtm_scope = 0; + rtm->rtm_type = r->r_action; + rtm->rtm_flags = r->r_flags; + + if (r->r_dst_len) + RTA_PUT(skb, RTA_DST, 2, &r->r_dst); + if (r->r_src_len) + RTA_PUT(skb, RTA_SRC, 2, &r->r_src); + if (r->r_ifname[0]) + RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname); + if (r->r_preference) + RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); + if (r->r_srcmap) + RTA_PUT(skb, RTA_GATEWAY, 2, &r->r_srcmap); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int dn_fib_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct dn_fib_rule *r; + + read_lock(&dn_fib_rules_lock); + for(r = dn_fib_rules, idx = 0; r; r = r->r_next, idx++) { + if (idx < s_idx) + continue; + if (dn_fib_fill_rule(skb, r, cb) < 0) + break; + } + read_unlock(&dn_fib_rules_lock); + cb->args[0] = idx; + + return skb->len; +} + +void __init dn_fib_rules_init(void) +{ + register_netdevice_notifier(&dn_fib_rules_notifier); +} + +void __exit dn_fib_rules_cleanup(void) +{ + unregister_netdevice_notifier(&dn_fib_rules_notifier); +} + + diff --git a/net/decnet/dn_table.c b/net/decnet/dn_table.c new file mode 100644 index 000000000000..dad5603912be --- /dev/null +++ b/net/decnet/dn_table.c @@ -0,0 +1,825 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. 
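Editorial illustration of the rule matching in dn_fib_lookup() above: a rule applies when XOR-ing the flow's address with the rule's address and masking by the rule's prefix mask leaves zero. The sketch below reproduces that test on host-order 16-bit addresses; make_mask() is a host-order stand-in for the kernel's dnet_make_mask() helper, and the sample addresses and prefix lengths are made up.

/* Editorial sketch of the rule-match test used in dn_fib_lookup():
 * an address matches when ((addr ^ rule_addr) & rule_mask) == 0. */
#include <stdio.h>

static unsigned short make_mask(int len)
{
	return len ? (unsigned short)~((1u << (16 - len)) - 1) : 0;
}

static int rule_matches(unsigned short addr,
			unsigned short rule_addr, int rule_len)
{
	return ((addr ^ rule_addr) & make_mask(rule_len)) == 0;
}

int main(void)
{
	/* Rule covering DECnet area 1 (the top 6 bits), i.e. 1.* */
	unsigned short rule_addr = 1 << 10;
	unsigned short a = (1 << 10) | 42;	/* 1.42 - should match  */
	unsigned short b = (2 << 10) | 42;	/* 2.42 - should not    */

	printf("1.42 matches: %d\n", rule_matches(a, rule_addr, 6));
	printf("2.42 matches: %d\n", rule_matches(b, rule_addr, 6));
	return 0;
}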
+ * + * DECnet Routing Forwarding Information Base (Routing Tables) + * + * Author: Steve Whitehouse + * Mostly copied from the IPv4 routing code + * + * + * Changes: + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* RTF_xxx */ +#include +#include +#include +#include +#include +#include +#include +#include + +struct dn_zone +{ + struct dn_zone *dz_next; + struct dn_fib_node **dz_hash; + int dz_nent; + int dz_divisor; + u32 dz_hashmask; +#define DZ_HASHMASK(dz) ((dz)->dz_hashmask) + int dz_order; + u16 dz_mask; +#define DZ_MASK(dz) ((dz)->dz_mask) +}; + +struct dn_hash +{ + struct dn_zone *dh_zones[17]; + struct dn_zone *dh_zone_list; +}; + +#define dz_key_0(key) ((key).datum = 0) +#define dz_prefix(key,dz) ((key).datum) + +#define for_nexthops(fi) { int nhsel; const struct dn_fib_nh *nh;\ + for(nhsel = 0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define endfor_nexthops(fi) } + +#define DN_MAX_DIVISOR 1024 +#define DN_S_ZOMBIE 1 +#define DN_S_ACCESSED 2 + +#define DN_FIB_SCAN(f, fp) \ +for( ; ((f) = *(fp)) != NULL; (fp) = &(f)->fn_next) + +#define DN_FIB_SCAN_KEY(f, fp, key) \ +for( ; ((f) = *(fp)) != NULL && dn_key_eq((f)->fn_key, (key)); (fp) = &(f)->fn_next) + +#define RT_TABLE_MIN 1 + +static DEFINE_RWLOCK(dn_fib_tables_lock); +struct dn_fib_table *dn_fib_tables[RT_TABLE_MAX + 1]; + +static kmem_cache_t *dn_hash_kmem; +static int dn_fib_hash_zombies; + +static inline dn_fib_idx_t dn_hash(dn_fib_key_t key, struct dn_zone *dz) +{ + u16 h = ntohs(key.datum)>>(16 - dz->dz_order); + h ^= (h >> 10); + h ^= (h >> 6); + h &= DZ_HASHMASK(dz); + return *(dn_fib_idx_t *)&h; +} + +static inline dn_fib_key_t dz_key(u16 dst, struct dn_zone *dz) +{ + dn_fib_key_t k; + k.datum = dst & DZ_MASK(dz); + return k; +} + +static inline struct dn_fib_node **dn_chain_p(dn_fib_key_t key, struct dn_zone *dz) +{ + return &dz->dz_hash[dn_hash(key, dz).datum]; +} + +static inline struct dn_fib_node *dz_chain(dn_fib_key_t key, struct dn_zone *dz) +{ + return dz->dz_hash[dn_hash(key, dz).datum]; +} + +static inline int dn_key_eq(dn_fib_key_t a, dn_fib_key_t b) +{ + return a.datum == b.datum; +} + +static inline int dn_key_leq(dn_fib_key_t a, dn_fib_key_t b) +{ + return a.datum <= b.datum; +} + +static inline void dn_rebuild_zone(struct dn_zone *dz, + struct dn_fib_node **old_ht, + int old_divisor) +{ + int i; + struct dn_fib_node *f, **fp, *next; + + for(i = 0; i < old_divisor; i++) { + for(f = old_ht[i]; f; f = f->fn_next) { + next = f->fn_next; + for(fp = dn_chain_p(f->fn_key, dz); + *fp && dn_key_leq((*fp)->fn_key, f->fn_key); + fp = &(*fp)->fn_next) + /* NOTHING */; + f->fn_next = *fp; + *fp = f; + } + } +} + +static void dn_rehash_zone(struct dn_zone *dz) +{ + struct dn_fib_node **ht, **old_ht; + int old_divisor, new_divisor; + u32 new_hashmask; + + old_divisor = dz->dz_divisor; + + switch(old_divisor) { + case 16: + new_divisor = 256; + new_hashmask = 0xFF; + break; + default: + printk(KERN_DEBUG "DECnet: dn_rehash_zone: BUG! 
%d\n", old_divisor); + case 256: + new_divisor = 1024; + new_hashmask = 0x3FF; + break; + } + + ht = kmalloc(new_divisor*sizeof(struct dn_fib_node*), GFP_KERNEL); + + if (ht == NULL) + return; + + memset(ht, 0, new_divisor*sizeof(struct dn_fib_node *)); + write_lock_bh(&dn_fib_tables_lock); + old_ht = dz->dz_hash; + dz->dz_hash = ht; + dz->dz_hashmask = new_hashmask; + dz->dz_divisor = new_divisor; + dn_rebuild_zone(dz, old_ht, old_divisor); + write_unlock_bh(&dn_fib_tables_lock); + kfree(old_ht); +} + +static void dn_free_node(struct dn_fib_node *f) +{ + dn_fib_release_info(DN_FIB_INFO(f)); + kmem_cache_free(dn_hash_kmem, f); +} + + +static struct dn_zone *dn_new_zone(struct dn_hash *table, int z) +{ + int i; + struct dn_zone *dz = kmalloc(sizeof(struct dn_zone), GFP_KERNEL); + if (!dz) + return NULL; + + memset(dz, 0, sizeof(struct dn_zone)); + if (z) { + dz->dz_divisor = 16; + dz->dz_hashmask = 0x0F; + } else { + dz->dz_divisor = 1; + dz->dz_hashmask = 0; + } + + dz->dz_hash = kmalloc(dz->dz_divisor*sizeof(struct dn_fib_node *), GFP_KERNEL); + + if (!dz->dz_hash) { + kfree(dz); + return NULL; + } + + memset(dz->dz_hash, 0, dz->dz_divisor*sizeof(struct dn_fib_node*)); + dz->dz_order = z; + dz->dz_mask = dnet_make_mask(z); + + for(i = z + 1; i <= 16; i++) + if (table->dh_zones[i]) + break; + + write_lock_bh(&dn_fib_tables_lock); + if (i>16) { + dz->dz_next = table->dh_zone_list; + table->dh_zone_list = dz; + } else { + dz->dz_next = table->dh_zones[i]->dz_next; + table->dh_zones[i]->dz_next = dz; + } + table->dh_zones[z] = dz; + write_unlock_bh(&dn_fib_tables_lock); + return dz; +} + + +static int dn_fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct dn_kern_rta *rta, struct dn_fib_info *fi) +{ + struct rtnexthop *nhp; + int nhlen; + + if (rta->rta_priority && *rta->rta_priority != fi->fib_priority) + return 1; + + if (rta->rta_oif || rta->rta_gw) { + if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && + (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 2) == 0)) + return 0; + return 1; + } + + if (rta->rta_mp == NULL) + return 0; + + nhp = RTA_DATA(rta->rta_mp); + nhlen = RTA_PAYLOAD(rta->rta_mp); + + for_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + dn_address gw; + + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) + return 1; + if (attrlen) { + gw = dn_fib_get_attr16(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + + if (gw && gw != nh->nh_gw) + return 1; + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + + return 0; +} + +static int dn_fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, + struct dn_fib_info *fi) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_DECnet; + rtm->rtm_dst_len = dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = 0; + rtm->rtm_table = tb_id; + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = scope; + rtm->rtm_type = type; + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 2, dst); + rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_priority) + RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority); + if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + goto rtattr_failure; + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 2, &fi->fib_nh->nh_gw); + if (fi->fib_nh->nh_oif) + RTA_PUT(skb, RTA_OIF, sizeof(int), 
&fi->fib_nh->nh_oif); + } + if (fi->fib_nhs > 1) { + struct rtnexthop *nhp; + struct rtattr *mp_head; + if (skb_tailroom(skb) <= RTA_SPACE(0)) + goto rtattr_failure; + mp_head = (struct rtattr *)skb_put(skb, RTA_SPACE(0)); + + for_nexthops(fi) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = nh->nh_flags & 0xFF; + nhp->rtnh_hops = nh->nh_weight - 1; + nhp->rtnh_ifindex = nh->nh_oif; + if (nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 2, &nh->nh_gw); + nhp->rtnh_len = skb->tail - (unsigned char *)nhp; + } endfor_nexthops(fi); + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + + +static void dn_rtmsg_fib(int event, struct dn_fib_node *f, int z, int tb_id, + struct nlmsghdr *nlh, struct netlink_skb_parms *req) +{ + struct sk_buff *skb; + u32 pid = req ? req->pid : 0; + int size = NLMSG_SPACE(sizeof(struct rtmsg) + 256); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (dn_fib_dump_info(skb, pid, nlh->nlmsg_seq, event, tb_id, + f->fn_type, f->fn_scope, &f->fn_key, z, + DN_FIB_INFO(f)) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_DECnet_ROUTE; + if (nlh->nlmsg_flags & NLM_F_ECHO) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, RTMGRP_DECnet_ROUTE, GFP_KERNEL); + if (nlh->nlmsg_flags & NLM_F_ECHO) + netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); +} + +static __inline__ int dn_hash_dump_bucket(struct sk_buff *skb, + struct netlink_callback *cb, + struct dn_fib_table *tb, + struct dn_zone *dz, + struct dn_fib_node *f) +{ + int i, s_i; + + s_i = cb->args[3]; + for(i = 0; f; i++, f = f->fn_next) { + if (i < s_i) + continue; + if (f->fn_state & DN_S_ZOMBIE) + continue; + if (dn_fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->n, + (f->fn_state & DN_S_ZOMBIE) ? 
0 : f->fn_type, + f->fn_scope, &f->fn_key, dz->dz_order, + f->fn_info) < 0) { + cb->args[3] = i; + return -1; + } + } + cb->args[3] = i; + return skb->len; +} + +static __inline__ int dn_hash_dump_zone(struct sk_buff *skb, + struct netlink_callback *cb, + struct dn_fib_table *tb, + struct dn_zone *dz) +{ + int h, s_h; + + s_h = cb->args[2]; + for(h = 0; h < dz->dz_divisor; h++) { + if (h < s_h) + continue; + if (h > s_h) + memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); + if (dz->dz_hash == NULL || dz->dz_hash[h] == NULL) + continue; + if (dn_hash_dump_bucket(skb, cb, tb, dz, dz->dz_hash[h]) < 0) { + cb->args[2] = h; + return -1; + } + } + cb->args[2] = h; + return skb->len; +} + +static int dn_fib_table_dump(struct dn_fib_table *tb, struct sk_buff *skb, + struct netlink_callback *cb) +{ + int m, s_m; + struct dn_zone *dz; + struct dn_hash *table = (struct dn_hash *)tb->data; + + s_m = cb->args[1]; + read_lock(&dn_fib_tables_lock); + for(dz = table->dh_zone_list, m = 0; dz; dz = dz->dz_next, m++) { + if (m < s_m) + continue; + if (m > s_m) + memset(&cb->args[2], 0, sizeof(cb->args) - 2*sizeof(cb->args[0])); + + if (dn_hash_dump_zone(skb, cb, tb, dz) < 0) { + cb->args[1] = m; + read_unlock(&dn_fib_tables_lock); + return -1; + } + } + read_unlock(&dn_fib_tables_lock); + cb->args[1] = m; + + return skb->len; +} + +static int dn_fib_table_insert(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct dn_hash *table = (struct dn_hash *)tb->data; + struct dn_fib_node *new_f, *f, **fp, **del_fp; + struct dn_zone *dz; + struct dn_fib_info *fi; + int z = r->rtm_dst_len; + int type = r->rtm_type; + dn_fib_key_t key; + int err; + + if (z > 16) + return -EINVAL; + + dz = table->dh_zones[z]; + if (!dz && !(dz = dn_new_zone(table, z))) + return -ENOBUFS; + + dz_key_0(key); + if (rta->rta_dst) { + dn_address dst; + memcpy(&dst, rta->rta_dst, 2); + if (dst & ~DZ_MASK(dz)) + return -EINVAL; + key = dz_key(dst, dz); + } + + if ((fi = dn_fib_create_info(r, rta, n, &err)) == NULL) + return err; + + if (dz->dz_nent > (dz->dz_divisor << 2) && + dz->dz_divisor > DN_MAX_DIVISOR && + (z==16 || (1< dz->dz_divisor)) + dn_rehash_zone(dz); + + fp = dn_chain_p(key, dz); + + DN_FIB_SCAN(f, fp) { + if (dn_key_leq(key, f->fn_key)) + break; + } + + del_fp = NULL; + + if (f && (f->fn_state & DN_S_ZOMBIE) && + dn_key_eq(f->fn_key, key)) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto create; + } + + DN_FIB_SCAN_KEY(f, fp, key) { + if (fi->fib_priority <= DN_FIB_INFO(f)->fib_priority) + break; + } + + if (f && dn_key_eq(f->fn_key, key) && + fi->fib_priority == DN_FIB_INFO(f)->fib_priority) { + struct dn_fib_node **ins_fp; + + err = -EEXIST; + if (n->nlmsg_flags & NLM_F_EXCL) + goto out; + + if (n->nlmsg_flags & NLM_F_REPLACE) { + del_fp = fp; + fp = &f->fn_next; + f = *fp; + goto replace; + } + + ins_fp = fp; + err = -EEXIST; + + DN_FIB_SCAN_KEY(f, fp, key) { + if (fi->fib_priority != DN_FIB_INFO(f)->fib_priority) + break; + if (f->fn_type == type && f->fn_scope == r->rtm_scope + && DN_FIB_INFO(f) == fi) + goto out; + } + + if (!(n->nlmsg_flags & NLM_F_APPEND)) { + fp = ins_fp; + f = *fp; + } + } + +create: + err = -ENOENT; + if (!(n->nlmsg_flags & NLM_F_CREATE)) + goto out; + +replace: + err = -ENOBUFS; + new_f = kmem_cache_alloc(dn_hash_kmem, SLAB_KERNEL); + if (new_f == NULL) + goto out; + + memset(new_f, 0, sizeof(struct dn_fib_node)); + + new_f->fn_key = key; + new_f->fn_type = type; + new_f->fn_scope = r->rtm_scope; + 
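Editorial illustration of the keying used in dn_fib_table_insert() above: each zone holds routes of one prefix length, dz_key() masks the destination down to that prefix, and dn_hash() folds the significant bits into a bucket index. The sketch below mirrors that logic on host-order values for readability; the kernel keeps the key in wire order and applies ntohs() first, and the prefix length and hash mask chosen here are arbitrary.

/* Editorial sketch of dz_key()/dn_hash(): mask the destination to the
 * zone's prefix, then fold the significant bits into a bucket index. */
#include <stdio.h>

static unsigned short zone_mask(int order)
{
	return order ? (unsigned short)~((1u << (16 - order)) - 1) : 0;
}

static unsigned int zone_hash(unsigned short key, int order,
			      unsigned int hashmask)
{
	unsigned short h = key >> (16 - order);	/* keep only the prefix bits */

	h ^= (h >> 10);
	h ^= (h >> 6);
	return h & hashmask;			/* e.g. 0x0F for a 16-bucket zone */
}

int main(void)
{
	int order = 10;				/* a /10 prefix zone */
	unsigned short dst = (3 << 10) | 7;	/* DECnet 3.7 */
	unsigned short key = dst & zone_mask(order);

	printf("key 0x%04x -> bucket %u\n",
	       key, zone_hash(key, order, 0x0F));
	return 0;
}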
DN_FIB_INFO(new_f) = fi; + + new_f->fn_next = f; + write_lock_bh(&dn_fib_tables_lock); + *fp = new_f; + write_unlock_bh(&dn_fib_tables_lock); + dz->dz_nent++; + + if (del_fp) { + f = *del_fp; + write_lock_bh(&dn_fib_tables_lock); + *del_fp = f->fn_next; + write_unlock_bh(&dn_fib_tables_lock); + + if (!(f->fn_state & DN_S_ZOMBIE)) + dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); + if (f->fn_state & DN_S_ACCESSED) + dn_rt_cache_flush(-1); + dn_free_node(f); + dz->dz_nent--; + } else { + dn_rt_cache_flush(-1); + } + + dn_rtmsg_fib(RTM_NEWROUTE, new_f, z, tb->n, n, req); + + return 0; +out: + dn_fib_release_info(fi); + return err; +} + + +static int dn_fib_table_delete(struct dn_fib_table *tb, struct rtmsg *r, struct dn_kern_rta *rta, struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct dn_hash *table = (struct dn_hash*)tb->data; + struct dn_fib_node **fp, **del_fp, *f; + int z = r->rtm_dst_len; + struct dn_zone *dz; + dn_fib_key_t key; + int matched; + + + if (z > 16) + return -EINVAL; + + if ((dz = table->dh_zones[z]) == NULL) + return -ESRCH; + + dz_key_0(key); + if (rta->rta_dst) { + dn_address dst; + memcpy(&dst, rta->rta_dst, 2); + if (dst & ~DZ_MASK(dz)) + return -EINVAL; + key = dz_key(dst, dz); + } + + fp = dn_chain_p(key, dz); + + DN_FIB_SCAN(f, fp) { + if (dn_key_eq(f->fn_key, key)) + break; + if (dn_key_leq(key, f->fn_key)) + return -ESRCH; + } + + matched = 0; + del_fp = NULL; + DN_FIB_SCAN_KEY(f, fp, key) { + struct dn_fib_info *fi = DN_FIB_INFO(f); + + if (f->fn_state & DN_S_ZOMBIE) + return -ESRCH; + + matched++; + + if (del_fp == NULL && + (!r->rtm_type || f->fn_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || f->fn_scope == r->rtm_scope) && + (!r->rtm_protocol || + fi->fib_protocol == r->rtm_protocol) && + dn_fib_nh_match(r, n, rta, fi) == 0) + del_fp = fp; + } + + if (del_fp) { + f = *del_fp; + dn_rtmsg_fib(RTM_DELROUTE, f, z, tb->n, n, req); + + if (matched != 1) { + write_lock_bh(&dn_fib_tables_lock); + *del_fp = f->fn_next; + write_unlock_bh(&dn_fib_tables_lock); + + if (f->fn_state & DN_S_ACCESSED) + dn_rt_cache_flush(-1); + dn_free_node(f); + dz->dz_nent--; + } else { + f->fn_state |= DN_S_ZOMBIE; + if (f->fn_state & DN_S_ACCESSED) { + f->fn_state &= ~DN_S_ACCESSED; + dn_rt_cache_flush(-1); + } + if (++dn_fib_hash_zombies > 128) + dn_fib_flush(); + } + + return 0; + } + + return -ESRCH; +} + +static inline int dn_flush_list(struct dn_fib_node **fp, int z, struct dn_hash *table) +{ + int found = 0; + struct dn_fib_node *f; + + while((f = *fp) != NULL) { + struct dn_fib_info *fi = DN_FIB_INFO(f); + + if (fi && ((f->fn_state & DN_S_ZOMBIE) || (fi->fib_flags & RTNH_F_DEAD))) { + write_lock_bh(&dn_fib_tables_lock); + *fp = f->fn_next; + write_unlock_bh(&dn_fib_tables_lock); + + dn_free_node(f); + found++; + continue; + } + fp = &f->fn_next; + } + + return found; +} + +static int dn_fib_table_flush(struct dn_fib_table *tb) +{ + struct dn_hash *table = (struct dn_hash *)tb->data; + struct dn_zone *dz; + int found = 0; + + dn_fib_hash_zombies = 0; + for(dz = table->dh_zone_list; dz; dz = dz->dz_next) { + int i; + int tmp = 0; + for(i = dz->dz_divisor-1; i >= 0; i--) + tmp += dn_flush_list(&dz->dz_hash[i], dz->dz_order, table); + dz->dz_nent -= tmp; + found += tmp; + } + + return found; +} + +static int dn_fib_table_lookup(struct dn_fib_table *tb, const struct flowi *flp, struct dn_fib_res *res) +{ + int err; + struct dn_zone *dz; + struct dn_hash *t = (struct dn_hash *)tb->data; + + read_lock(&dn_fib_tables_lock); + for(dz = t->dh_zone_list; dz; dz = 
dz->dz_next) { + struct dn_fib_node *f; + dn_fib_key_t k = dz_key(flp->fld_dst, dz); + + for(f = dz_chain(k, dz); f; f = f->fn_next) { + if (!dn_key_eq(k, f->fn_key)) { + if (dn_key_leq(k, f->fn_key)) + break; + else + continue; + } + + f->fn_state |= DN_S_ACCESSED; + + if (f->fn_state&DN_S_ZOMBIE) + continue; + + if (f->fn_scope < flp->fld_scope) + continue; + + err = dn_fib_semantic_match(f->fn_type, DN_FIB_INFO(f), flp, res); + + if (err == 0) { + res->type = f->fn_type; + res->scope = f->fn_scope; + res->prefixlen = dz->dz_order; + goto out; + } + if (err < 0) + goto out; + } + } + err = 1; +out: + read_unlock(&dn_fib_tables_lock); + return err; +} + + +struct dn_fib_table *dn_fib_get_table(int n, int create) +{ + struct dn_fib_table *t; + + if (n < RT_TABLE_MIN) + return NULL; + + if (n > RT_TABLE_MAX) + return NULL; + + if (dn_fib_tables[n]) + return dn_fib_tables[n]; + + if (!create) + return NULL; + + if (in_interrupt() && net_ratelimit()) { + printk(KERN_DEBUG "DECnet: BUG! Attempt to create routing table from interrupt\n"); + return NULL; + } + if ((t = kmalloc(sizeof(struct dn_fib_table) + sizeof(struct dn_hash), GFP_KERNEL)) == NULL) + return NULL; + + memset(t, 0, sizeof(struct dn_fib_table)); + + t->n = n; + t->insert = dn_fib_table_insert; + t->delete = dn_fib_table_delete; + t->lookup = dn_fib_table_lookup; + t->flush = dn_fib_table_flush; + t->dump = dn_fib_table_dump; + memset(t->data, 0, sizeof(struct dn_hash)); + dn_fib_tables[n] = t; + + return t; +} + +static void dn_fib_del_tree(int n) +{ + struct dn_fib_table *t; + + write_lock(&dn_fib_tables_lock); + t = dn_fib_tables[n]; + dn_fib_tables[n] = NULL; + write_unlock(&dn_fib_tables_lock); + + if (t) { + kfree(t); + } +} + +struct dn_fib_table *dn_fib_empty_table(void) +{ + int id; + + for(id = RT_TABLE_MIN; id <= RT_TABLE_MAX; id++) + if (dn_fib_tables[id] == NULL) + return dn_fib_get_table(id, 1); + return NULL; +} + +void __init dn_fib_table_init(void) +{ + dn_hash_kmem = kmem_cache_create("dn_fib_info_cache", + sizeof(struct dn_fib_info), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); +} + +void __exit dn_fib_table_cleanup(void) +{ + int i; + + for (i = RT_TABLE_MIN; i <= RT_TABLE_MAX; ++i) + dn_fib_del_tree(i); + + return; +} diff --git a/net/decnet/dn_timer.c b/net/decnet/dn_timer.c new file mode 100644 index 000000000000..09825711d58a --- /dev/null +++ b/net/decnet/dn_timer.c @@ -0,0 +1,109 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Socket Timer Functions + * + * Author: Steve Whitehouse + * + * + * Changes: + * Steve Whitehouse : Made keepalive timer part of the same + * timer idea. + * Steve Whitehouse : Added checks for sk->sock_readers + * David S. Miller : New socket locking + * Steve Whitehouse : Timer grabs socket ref. 
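Editorial illustration of the lookup order above: dn_fib_table_lookup() walks dh_zone_list, which dn_new_zone() keeps ordered with longer prefixes first, so the first zone that yields a semantic match is also the most specific one, i.e. ordinary longest-prefix matching. The sketch below reproduces that behaviour over the same 16-bit address space; the example routes are invented.

/* Editorial sketch of longest-prefix matching over per-prefix "zones",
 * ordered most specific first as dn_new_zone() arranges dh_zone_list. */
#include <stdio.h>

struct route {
	unsigned short prefix;	/* host-order 16-bit DECnet address */
	int len;		/* prefix length, 0..16 */
	const char *via;
};

static unsigned short make_mask(int len)
{
	return len ? (unsigned short)~((1u << (16 - len)) - 1) : 0;
}

/* Routes sorted by decreasing prefix length, like the zone list. */
static const struct route table[] = {
	{ (1 << 10) | 5, 16, "host route to 1.5"    },
	{ (1 << 10),      6, "area 1 via router A"  },
	{ 0,              0, "default via router B" },
};

int main(void)
{
	unsigned short dst = (1 << 10) | 9;	/* DECnet 1.9 */
	unsigned int i;

	for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
		if (((dst ^ table[i].prefix) & make_mask(table[i].len)) == 0) {
			printf("1.9 -> %s\n", table[i].via);
			break;
		}
	}
	return 0;
}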
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Slow timer is for everything else (n * 500mS) + */ + +#define SLOW_INTERVAL (HZ/2) + +static void dn_slow_timer(unsigned long arg); + +void dn_start_slow_timer(struct sock *sk) +{ + sk->sk_timer.expires = jiffies + SLOW_INTERVAL; + sk->sk_timer.function = dn_slow_timer; + sk->sk_timer.data = (unsigned long)sk; + + add_timer(&sk->sk_timer); +} + +void dn_stop_slow_timer(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +static void dn_slow_timer(unsigned long arg) +{ + struct sock *sk = (struct sock *)arg; + struct dn_scp *scp = DN_SK(sk); + + sock_hold(sk); + bh_lock_sock(sk); + + if (sock_owned_by_user(sk)) { + sk->sk_timer.expires = jiffies + HZ / 10; + add_timer(&sk->sk_timer); + goto out; + } + + /* + * The persist timer is the standard slow timer used for retransmits + * in both connection establishment and disconnection as well as + * in the RUN state. The different states are catered for by changing + * the function pointer in the socket. Setting the timer to a value + * of zero turns it off. We allow the persist_fxn to turn the + * timer off in a permant way by returning non-zero, so that + * timer based routines may remove sockets. This is why we have a + * sock_hold()/sock_put() around the timer to prevent the socket + * going away in the middle. + */ + if (scp->persist && scp->persist_fxn) { + if (scp->persist <= SLOW_INTERVAL) { + scp->persist = 0; + + if (scp->persist_fxn(sk)) + goto out; + } else { + scp->persist -= SLOW_INTERVAL; + } + } + + /* + * Check for keepalive timeout. After the other timer 'cos if + * the previous timer caused a retransmit, we don't need to + * do this. scp->stamp is the last time that we sent a packet. + * The keepalive function sends a link service packet to the + * other end. If it remains unacknowledged, the standard + * socket timers will eventually shut the socket down. Each + * time we do this, scp->stamp will be updated, thus + * we won't try and send another until scp->keepalive has passed + * since the last successful transmission. + */ + if (scp->keepalive && scp->keepalive_fxn && (scp->state == DN_RUN)) { + if ((jiffies - scp->stamp) >= scp->keepalive) + scp->keepalive_fxn(sk); + } + + sk->sk_timer.expires = jiffies + SLOW_INTERVAL; + + add_timer(&sk->sk_timer); +out: + bh_unlock_sock(sk); + sock_put(sk); +} diff --git a/net/decnet/netfilter/Kconfig b/net/decnet/netfilter/Kconfig new file mode 100644 index 000000000000..ecdb3f9f14ca --- /dev/null +++ b/net/decnet/netfilter/Kconfig @@ -0,0 +1,15 @@ +# +# DECnet netfilter configuration +# + +menu "DECnet: Netfilter Configuration" + depends on DECNET && NETFILTER && EXPERIMENTAL + +config DECNET_NF_GRABULATOR + tristate "Routing message grabulator (for userland routing daemon)" + help + Enable this module if you want to use the userland DECnet routing + daemon. You will also need to enable routing support for DECnet + unless you just want to monitor routing messages from other nodes. 
+ +endmenu diff --git a/net/decnet/netfilter/Makefile b/net/decnet/netfilter/Makefile new file mode 100644 index 000000000000..255c1ae9daeb --- /dev/null +++ b/net/decnet/netfilter/Makefile @@ -0,0 +1,6 @@ +# +# Makefile for DECnet netfilter modules +# + +obj-$(CONFIG_DECNET_NF_GRABULATOR) += dn_rtmsg.o + diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c new file mode 100644 index 000000000000..f86a6259fd12 --- /dev/null +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -0,0 +1,167 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet Routing Message Grabulator + * + * (C) 2000 ChyGwyn Limited - http://www.chygwyn.com/ + * This code may be copied under the GPL v.2 or at your option + * any later version. + * + * Author: Steven Whitehouse + * + */ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +static struct sock *dnrmg = NULL; + + +static struct sk_buff *dnrmg_build_message(struct sk_buff *rt_skb, int *errp) +{ + struct sk_buff *skb = NULL; + size_t size; + unsigned char *old_tail; + struct nlmsghdr *nlh; + unsigned char *ptr; + struct nf_dn_rtmsg *rtm; + + size = NLMSG_SPACE(rt_skb->len); + size += NLMSG_ALIGN(sizeof(struct nf_dn_rtmsg)); + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + old_tail = skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, 0, size - sizeof(*nlh)); + rtm = (struct nf_dn_rtmsg *)NLMSG_DATA(nlh); + rtm->nfdn_ifindex = rt_skb->dev->ifindex; + ptr = NFDN_RTMSG(rtm); + memcpy(ptr, rt_skb->data, rt_skb->len); + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -ENOMEM; + if (net_ratelimit()) + printk(KERN_ERR "dn_rtmsg: error creating netlink message\n"); + return NULL; +} + +static void dnrmg_send_peer(struct sk_buff *skb) +{ + struct sk_buff *skb2; + int status = 0; + int group = 0; + unsigned char flags = *skb->data; + + switch(flags & DN_RT_CNTL_MSK) { + case DN_RT_PKT_L1RT: + group = DNRMG_L1_GROUP; + break; + case DN_RT_PKT_L2RT: + group = DNRMG_L2_GROUP; + break; + default: + return; + } + + skb2 = dnrmg_build_message(skb, &status); + if (skb2 == NULL) + return; + NETLINK_CB(skb2).dst_groups = group; + netlink_broadcast(dnrmg, skb2, 0, group, GFP_ATOMIC); +} + + +static unsigned int dnrmg_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + dnrmg_send_peer(*pskb); + return NF_ACCEPT; +} + + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void dnrmg_receive_user_skb(struct sk_buff *skb) +{ + struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data; + + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + + if (!cap_raised(NETLINK_CB(skb).eff_cap, CAP_NET_ADMIN)) + RCV_SKB_FAIL(-EPERM); + + /* Eventually we might send routing messages too */ + + RCV_SKB_FAIL(-EINVAL); +} + +static void dnrmg_receive_user_sk(struct sock *sk, int len) +{ + struct sk_buff *skb; + + while((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + dnrmg_receive_user_skb(skb); + kfree_skb(skb); + } +} + +static struct nf_hook_ops dnrmg_ops = { + .hook = dnrmg_hook, + .pf = PF_DECnet, + .hooknum = NF_DN_ROUTE, + .priority = NF_DN_PRI_DNRTMSG, +}; + +static int __init init(void) +{ + int rv = 0; 
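For context (editorial, not part of this patch): the "userland routing daemon" mentioned in the Kconfig help would receive the grabulated routing messages by joining the L1 or L2 group on the NETLINK_DNRTMSG socket created just below. A minimal listener sketch follows; it assumes the old bitmask-style nl_groups subscription of this kernel era, that DNRMG_L1_GROUP is exported by linux/netfilter_decnet.h, and it does no nlmsg parsing.

/* Editorial sketch of a userland listener for level 1 routing messages
 * broadcast by dnrmg_send_peer().  Error handling is trimmed. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/netfilter_decnet.h>

int main(void)
{
	struct sockaddr_nl snl;
	char buf[8192];
	ssize_t len;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_DNRTMSG);

	if (fd < 0)
		return 1;

	memset(&snl, 0, sizeof(snl));
	snl.nl_family = AF_NETLINK;
	snl.nl_groups = DNRMG_L1_GROUP;		/* subscribe to L1 messages */

	if (bind(fd, (struct sockaddr *)&snl, sizeof(snl)) < 0)
		return 1;

	while ((len = recv(fd, buf, sizeof(buf), 0)) > 0)
		printf("routing message, %zd bytes (nlmsghdr + nf_dn_rtmsg)\n", len);

	close(fd);
	return 0;
}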
+ + dnrmg = netlink_kernel_create(NETLINK_DNRTMSG, dnrmg_receive_user_sk); + if (dnrmg == NULL) { + printk(KERN_ERR "dn_rtmsg: Cannot create netlink socket"); + return -ENOMEM; + } + + rv = nf_register_hook(&dnrmg_ops); + if (rv) { + sock_release(dnrmg->sk_socket); + } + + return rv; +} + +static void __exit fini(void) +{ + nf_unregister_hook(&dnrmg_ops); + sock_release(dnrmg->sk_socket); +} + + +MODULE_DESCRIPTION("DECnet Routing Message Grabulator"); +MODULE_AUTHOR("Steven Whitehouse "); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); + diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c new file mode 100644 index 000000000000..02bca49cb508 --- /dev/null +++ b/net/decnet/sysctl_net_decnet.c @@ -0,0 +1,480 @@ +/* + * DECnet An implementation of the DECnet protocol suite for the LINUX + * operating system. DECnet is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * DECnet sysctl support functions + * + * Author: Steve Whitehouse + * + * + * Changes: + * Steve Whitehouse - C99 changes and default device handling + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + + +int decnet_debug_level; +int decnet_time_wait = 30; +int decnet_dn_count = 1; +int decnet_di_count = 3; +int decnet_dr_count = 3; +int decnet_log_martians = 1; +int decnet_no_fc_max_cwnd = NSP_MIN_WINDOW; + +#ifdef CONFIG_SYSCTL +extern int decnet_dst_gc_interval; +static int min_decnet_time_wait[] = { 5 }; +static int max_decnet_time_wait[] = { 600 }; +static int min_state_count[] = { 1 }; +static int max_state_count[] = { NSP_MAXRXTSHIFT }; +static int min_decnet_dst_gc_interval[] = { 1 }; +static int max_decnet_dst_gc_interval[] = { 60 }; +static int min_decnet_no_fc_max_cwnd[] = { NSP_MIN_WINDOW }; +static int max_decnet_no_fc_max_cwnd[] = { NSP_MAX_WINDOW }; +static char node_name[7] = "???"; + +static struct ctl_table_header *dn_table_header = NULL; + +/* + * ctype.h :-) + */ +#define ISNUM(x) (((x) >= '0') && ((x) <= '9')) +#define ISLOWER(x) (((x) >= 'a') && ((x) <= 'z')) +#define ISUPPER(x) (((x) >= 'A') && ((x) <= 'Z')) +#define ISALPHA(x) (ISLOWER(x) || ISUPPER(x)) +#define INVALID_END_CHAR(x) (ISNUM(x) || ISALPHA(x)) + +static void strip_it(char *str) +{ + for(;;) { + switch(*str) { + case ' ': + case '\n': + case '\r': + case ':': + *str = 0; + case 0: + return; + } + str++; + } +} + +/* + * Simple routine to parse an ascii DECnet address + * into a network order address. 
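Editorial illustration of the address format handled by parse_addr() below: the usual "area.node" notation, with area limited to 63 and node to 1023, packed as (area << 10) | node. The sketch keeps everything in host order, whereas the kernel converts the packed value with dn_htons().

/* Editorial sketch of the area.node <-> 16-bit packing behind
 * parse_addr()/dn_addr2asc(): area in the top 6 bits, node in the
 * low 10 bits.  Host order only. */
#include <stdio.h>

static unsigned short dn_pack(unsigned int area, unsigned int node)
{
	return (unsigned short)((area << 10) | node);	/* area <= 63, node <= 1023 */
}

int main(void)
{
	unsigned short addr = dn_pack(1, 2);		/* "1.2" */

	printf("1.2 packs to 0x%04x\n", addr);
	printf("unpacks to %u.%u\n", addr >> 10, addr & 0x3ff);
	return 0;
}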
+ */ +static int parse_addr(dn_address *addr, char *str) +{ + dn_address area, node; + + while(*str && !ISNUM(*str)) str++; + + if (*str == 0) + return -1; + + area = (*str++ - '0'); + if (ISNUM(*str)) { + area *= 10; + area += (*str++ - '0'); + } + + if (*str++ != '.') + return -1; + + if (!ISNUM(*str)) + return -1; + + node = *str++ - '0'; + if (ISNUM(*str)) { + node *= 10; + node += (*str++ - '0'); + } + if (ISNUM(*str)) { + node *= 10; + node += (*str++ - '0'); + } + if (ISNUM(*str)) { + node *= 10; + node += (*str++ - '0'); + } + + if ((node > 1023) || (area > 63)) + return -1; + + if (INVALID_END_CHAR(*str)) + return -1; + + *addr = dn_htons((area << 10) | node); + + return 0; +} + + +static int dn_node_address_strategy(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + size_t len; + dn_address addr; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len != sizeof(unsigned short)) + return -EINVAL; + if (put_user(decnet_address, (unsigned short __user *)oldval)) + return -EFAULT; + } + } + if (newval && newlen) { + if (newlen != sizeof(unsigned short)) + return -EINVAL; + if (get_user(addr, (unsigned short __user *)newval)) + return -EFAULT; + + dn_dev_devices_off(); + + decnet_address = addr; + + dn_dev_devices_on(); + } + return 0; +} + +static int dn_node_address_handler(ctl_table *table, int write, + struct file *filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + char addr[DN_ASCBUF_LEN]; + size_t len; + dn_address dnaddr; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; + return 0; + } + + if (write) { + int len = (*lenp < DN_ASCBUF_LEN) ? *lenp : (DN_ASCBUF_LEN-1); + + if (copy_from_user(addr, buffer, len)) + return -EFAULT; + + addr[len] = 0; + strip_it(addr); + + if (parse_addr(&dnaddr, addr)) + return -EINVAL; + + dn_dev_devices_off(); + + decnet_address = dnaddr; + + dn_dev_devices_on(); + + *ppos += len; + + return 0; + } + + dn_addr2asc(dn_ntohs(decnet_address), addr); + len = strlen(addr); + addr[len++] = '\n'; + + if (len > *lenp) len = *lenp; + + if (copy_to_user(buffer, addr, len)) + return -EFAULT; + + *lenp = len; + *ppos += len; + + return 0; +} + + +static int dn_def_dev_strategy(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + size_t len; + struct net_device *dev; + char devname[17]; + size_t namel; + int rv = 0; + + devname[0] = 0; + + if (oldval && oldlenp) { + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + dev = dn_dev_get_default(); + if (dev) { + strcpy(devname, dev->name); + dev_put(dev); + } + + namel = strlen(devname) + 1; + if (len > namel) len = namel; + + if (copy_to_user(oldval, devname, len)) + return -EFAULT; + + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (newval && newlen) { + if (newlen > 16) + return -E2BIG; + + if (copy_from_user(devname, newval, newlen)) + return -EFAULT; + + devname[newlen] = 0; + + dev = dev_get_by_name(devname); + if (dev == NULL) + return -ENODEV; + + rv = -ENODEV; + if (dev->dn_ptr != NULL) { + rv = dn_dev_set_default(dev, 1); + if (rv) + dev_put(dev); + } + } + + return rv; +} + + +static int dn_def_dev_handler(ctl_table *table, int write, + struct file * filp, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + size_t len; + struct net_device *dev; + char devname[17]; + + if (!*lenp || (*ppos && !write)) { + *lenp = 0; 
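Editorial note: the proc handlers in this file behave like ordinary text files under /proc/sys/net/decnet - a write is parsed (with trailing newline and junk removed by strip_it()), and a read returns the current value with a trailing newline. A small userspace sketch, assuming DECnet is configured so the files exist and that the caller has privileges for the write:

/* Editorial sketch: exercise the sysctl text interface registered by
 * this file (/proc/sys/net/decnet/...).  Requires CONFIG_DECNET and,
 * for the write, root privileges. */
#include <stdio.h>

int main(void)
{
	char buf[32];
	FILE *f = fopen("/proc/sys/net/decnet/node_address", "r");

	if (f && fgets(buf, sizeof(buf), f)) {
		printf("node_address is %s", buf);	/* already newline-terminated */
		fclose(f);
	}

	f = fopen("/proc/sys/net/decnet/node_address", "w");
	if (f) {
		fputs("1.2\n", f);	/* parsed by parse_addr(), newline stripped */
		fclose(f);
	}
	return 0;
}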
+ return 0; + } + + if (write) { + if (*lenp > 16) + return -E2BIG; + + if (copy_from_user(devname, buffer, *lenp)) + return -EFAULT; + + devname[*lenp] = 0; + strip_it(devname); + + dev = dev_get_by_name(devname); + if (dev == NULL) + return -ENODEV; + + if (dev->dn_ptr == NULL) { + dev_put(dev); + return -ENODEV; + } + + if (dn_dev_set_default(dev, 1)) { + dev_put(dev); + return -ENODEV; + } + *ppos += *lenp; + + return 0; + } + + dev = dn_dev_get_default(); + if (dev == NULL) { + *lenp = 0; + return 0; + } + + strcpy(devname, dev->name); + dev_put(dev); + len = strlen(devname); + devname[len++] = '\n'; + + if (len > *lenp) len = *lenp; + + if (copy_to_user(buffer, devname, len)) + return -EFAULT; + + *lenp = len; + *ppos += len; + + return 0; +} + +static ctl_table dn_table[] = { + { + .ctl_name = NET_DECNET_NODE_ADDRESS, + .procname = "node_address", + .maxlen = 7, + .mode = 0644, + .proc_handler = dn_node_address_handler, + .strategy = dn_node_address_strategy, + }, + { + .ctl_name = NET_DECNET_NODE_NAME, + .procname = "node_name", + .data = node_name, + .maxlen = 7, + .mode = 0644, + .proc_handler = &proc_dostring, + .strategy = &sysctl_string, + }, + { + .ctl_name = NET_DECNET_DEFAULT_DEVICE, + .procname = "default_device", + .maxlen = 16, + .mode = 0644, + .proc_handler = dn_def_dev_handler, + .strategy = dn_def_dev_strategy, + }, + { + .ctl_name = NET_DECNET_TIME_WAIT, + .procname = "time_wait", + .data = &decnet_time_wait, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_decnet_time_wait, + .extra2 = &max_decnet_time_wait + }, + { + .ctl_name = NET_DECNET_DN_COUNT, + .procname = "dn_count", + .data = &decnet_dn_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_state_count, + .extra2 = &max_state_count + }, + { + .ctl_name = NET_DECNET_DI_COUNT, + .procname = "di_count", + .data = &decnet_di_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_state_count, + .extra2 = &max_state_count + }, + { + .ctl_name = NET_DECNET_DR_COUNT, + .procname = "dr_count", + .data = &decnet_dr_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_state_count, + .extra2 = &max_state_count + }, + { + .ctl_name = NET_DECNET_DST_GC_INTERVAL, + .procname = "dst_gc_interval", + .data = &decnet_dst_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_decnet_dst_gc_interval, + .extra2 = &max_decnet_dst_gc_interval + }, + { + .ctl_name = NET_DECNET_NO_FC_MAX_CWND, + .procname = "no_fc_max_cwnd", + .data = &decnet_no_fc_max_cwnd, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_decnet_no_fc_max_cwnd, + .extra2 = &max_decnet_no_fc_max_cwnd + }, + { + .ctl_name = NET_DECNET_DEBUG_LEVEL, + .procname = "debug", + .data = &decnet_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + .strategy = &sysctl_intvec, + }, + {0} +}; + +static ctl_table dn_dir_table[] = { + { + .ctl_name = NET_DECNET, + .procname = "decnet", + .mode = 0555, + .child = dn_table}, + {0} +}; + +static ctl_table dn_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = dn_dir_table 
+ }, + {0} +}; + +void dn_register_sysctl(void) +{ + dn_table_header = register_sysctl_table(dn_root_table, 1); +} + +void dn_unregister_sysctl(void) +{ + unregister_sysctl_table(dn_table_header); +} + +#else /* CONFIG_SYSCTL */ +void dn_unregister_sysctl(void) +{ +} +void dn_register_sysctl(void) +{ +} + +#endif diff --git a/net/econet/Makefile b/net/econet/Makefile new file mode 100644 index 000000000000..39f0a77abdbd --- /dev/null +++ b/net/econet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for Econet support code. +# + +obj-$(CONFIG_ECONET) += econet.o + +econet-objs := af_econet.o diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c new file mode 100644 index 000000000000..de691e119e17 --- /dev/null +++ b/net/econet/af_econet.c @@ -0,0 +1,1129 @@ +/* + * An implementation of the Acorn Econet and AUN protocols. + * Philip Blundell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static struct proto_ops econet_ops; +static struct hlist_head econet_sklist; +static DEFINE_RWLOCK(econet_lock); + +/* Since there are only 256 possible network numbers (or fewer, depends + how you count) it makes sense to use a simple lookup table. */ +static struct net_device *net2dev_map[256]; + +#define EC_PORT_IP 0xd2 + +#ifdef CONFIG_ECONET_AUNUDP +static spinlock_t aun_queue_lock; +static struct socket *udpsock; +#define AUN_PORT 0x8000 + + +struct aunhdr +{ + unsigned char code; /* AUN magic protocol byte */ + unsigned char port; + unsigned char cb; + unsigned char pad; + unsigned long handle; +}; + +static unsigned long aun_seq; + +/* Queue of packets waiting to be transmitted. */ +static struct sk_buff_head aun_queue; +static struct timer_list ab_cleanup_timer; + +#endif /* CONFIG_ECONET_AUNUDP */ + +/* Per-packet information */ +struct ec_cb +{ + struct sockaddr_ec sec; + unsigned long cookie; /* Supplied by user. */ +#ifdef CONFIG_ECONET_AUNUDP + int done; + unsigned long seq; /* Sequencing */ + unsigned long timeout; /* Timeout */ + unsigned long start; /* jiffies */ +#endif +#ifdef CONFIG_ECONET_NATIVE + void (*sent)(struct sk_buff *, int result); +#endif +}; + +static void econet_remove_socket(struct hlist_head *list, struct sock *sk) +{ + write_lock_bh(&econet_lock); + sk_del_node_init(sk); + write_unlock_bh(&econet_lock); +} + +static void econet_insert_socket(struct hlist_head *list, struct sock *sk) +{ + write_lock_bh(&econet_lock); + sk_add_node(sk, list); + write_unlock_bh(&econet_lock); +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +static int econet_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + size_t copied; + int err; + + msg->msg_namelen = sizeof(struct sockaddr_ec); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. 
+ * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if(skb==NULL) + goto out; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if (copied > len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free; + sk->sk_stamp = skb->stamp; + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +/* + * Bind an Econet socket. + */ + +static int econet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + struct sock *sk=sock->sk; + struct econet_sock *eo = ec_sk(sk); + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ec) || + sec->sec_family != AF_ECONET) + return -EINVAL; + + eo->cb = sec->cb; + eo->port = sec->port; + eo->station = sec->addr.station; + eo->net = sec->addr.net; + + return 0; +} + +#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE) +/* + * Queue a transmit result for the user to be told about. + */ + +static void tx_result(struct sock *sk, unsigned long cookie, int result) +{ + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + struct ec_cb *eb; + struct sockaddr_ec *sec; + + if (skb == NULL) + { + printk(KERN_DEBUG "ec: memory squeeze, transmit result dropped.\n"); + return; + } + + eb = (struct ec_cb *)&skb->cb; + sec = (struct sockaddr_ec *)&eb->sec; + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->cookie = cookie; + sec->type = ECTYPE_TRANSMIT_STATUS | result; + sec->sec_family = AF_ECONET; + + if (sock_queue_rcv_skb(sk, skb) < 0) + kfree_skb(skb); +} +#endif + +#ifdef CONFIG_ECONET_NATIVE +/* + * Called by the Econet hardware driver when a packet transmit + * has completed. Tell the user. + */ + +static void ec_tx_done(struct sk_buff *skb, int result) +{ + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + tx_result(skb->sk, eb->cookie, result); +} +#endif + +/* + * Send a packet. We have to work out which device it's going out on + * and hence whether to use real Econet or the UDP emulation. + */ + +static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sockaddr_ec *saddr=(struct sockaddr_ec *)msg->msg_name; + struct net_device *dev; + struct ec_addr addr; + int err; + unsigned char port, cb; +#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE) + struct sk_buff *skb; + struct ec_cb *eb; +#endif +#ifdef CONFIG_ECONET_AUNUDP + struct msghdr udpmsg; + struct iovec iov[msg->msg_iovlen+1]; + struct aunhdr ah; + struct sockaddr_in udpdest; + __kernel_size_t size; + int i; + mm_segment_t oldfs; +#endif + + /* + * Check the flags. + */ + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + /* + * Get and verify the address. 
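Editorial illustration of econet_bind() above: it simply records the port, control byte and station address from the supplied sockaddr_ec, with no conflict checking. A userspace sketch of creating and binding such a socket follows; the header name, the SOCK_DGRAM/0 arguments and the sample port are assumptions, while the field names follow the sockaddr_ec usage in this file.

/* Editorial sketch: create an AF_ECONET datagram socket and bind it.
 * Header name assumed; field names match econet_bind() above. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/if_ec.h>

int main(void)
{
	struct sockaddr_ec sec;
	int fd = socket(AF_ECONET, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&sec, 0, sizeof(sec));
	sec.sec_family = AF_ECONET;
	sec.port = 0x99;		/* sample port number */
	sec.cb = 0x80;
	sec.addr.station = 254;
	sec.addr.net = 1;

	if (bind(fd, (struct sockaddr *)&sec, sizeof(sec)) < 0) {
		perror("bind");
		close(fd);
		return 1;
	}

	printf("bound Econet socket on net %d station %d port 0x%02x\n",
	       sec.addr.net, sec.addr.station, sec.port);
	close(fd);
	return 0;
}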
+ */ + + if (saddr == NULL) { + struct econet_sock *eo = ec_sk(sk); + + addr.station = eo->station; + addr.net = eo->net; + port = eo->port; + cb = eo->cb; + } else { + if (msg->msg_namelen < sizeof(struct sockaddr_ec)) + return -EINVAL; + addr.station = saddr->addr.station; + addr.net = saddr->addr.net; + port = saddr->port; + cb = saddr->cb; + } + + /* Look for a device with the right network number. */ + dev = net2dev_map[addr.net]; + + /* If not directly reachable, use some default */ + if (dev == NULL) + { + dev = net2dev_map[0]; + /* No interfaces at all? */ + if (dev == NULL) + return -ENETDOWN; + } + + if (len + 15 > dev->mtu) + return -EMSGSIZE; + + if (dev->type == ARPHRD_ECONET) + { + /* Real hardware Econet. We're not worthy etc. */ +#ifdef CONFIG_ECONET_NATIVE + unsigned short proto = 0; + + dev_hold(dev); + + skb = sock_alloc_send_skb(sk, len+LL_RESERVED_SPACE(dev), + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb==NULL) + goto out_unlock; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->nh.raw = skb->data; + + eb = (struct ec_cb *)&skb->cb; + + /* BUG: saddr may be NULL */ + eb->cookie = saddr->cookie; + eb->sec = *saddr; + eb->sent = ec_tx_done; + + if (dev->hard_header) { + int res; + struct ec_framehdr *fh; + err = -EINVAL; + res = dev->hard_header(skb, dev, ntohs(proto), + &addr, NULL, len); + /* Poke in our control byte and + port number. Hack, hack. */ + fh = (struct ec_framehdr *)(skb->data); + fh->cb = cb; + fh->port = port; + if (sock->type != SOCK_DGRAM) { + skb->tail = skb->data; + skb->len = 0; + } else if (res < 0) + goto out_free; + } + + /* Copy the data. Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + /* + * Now send it + */ + + dev_queue_xmit(skb); + dev_put(dev); + return(len); + + out_free: + kfree_skb(skb); + out_unlock: + if (dev) + dev_put(dev); +#else + err = -EPROTOTYPE; +#endif + return err; + } + +#ifdef CONFIG_ECONET_AUNUDP + /* AUN virtual Econet. */ + + if (udpsock == NULL) + return -ENETDOWN; /* No socket - can't send */ + + /* Make up a UDP datagram and hand it off to some higher intellect. */ + + memset(&udpdest, 0, sizeof(udpdest)); + udpdest.sin_family = AF_INET; + udpdest.sin_port = htons(AUN_PORT); + + /* At the moment we use the stupid Acorn scheme of Econet address + y.x maps to IP a.b.c.x. This should be replaced with something + more flexible and more aware of subnet masks. */ + { + struct in_device *idev; + unsigned long network = 0; + + rcu_read_lock(); + idev = __in_dev_get(dev); + if (idev) { + if (idev->ifa_list) + network = ntohl(idev->ifa_list->ifa_address) & + 0xffffff00; /* !!! */ + } + rcu_read_unlock(); + udpdest.sin_addr.s_addr = htonl(network | addr.station); + } + + ah.port = port; + ah.cb = cb & 0x7f; + ah.code = 2; /* magic */ + ah.pad = 0; + + /* tack our header on the front of the iovec */ + size = sizeof(struct aunhdr); + /* + * XXX: that is b0rken. We can't mix userland and kernel pointers + * in iovec, since on a lot of platforms copy_from_user() will + * *not* work with the kernel and userland ones at the same time, + * regardless of what we do with set_fs(). And we are talking about + * econet-over-ethernet here, so "it's only ARM anyway" doesn't + * apply. Any suggestions on fixing that code? 
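Editorial illustration of the AUN fallback above: the peer's IP address is derived purely from the local interface address, keeping the top 24 bits and substituting the Econet station number as the final octet (the "stupid Acorn scheme" the comment apologises for), with the datagram sent to UDP port 0x8000. The sketch below reproduces that mapping; the sample interface address and station number are invented.

/* Editorial sketch of the AUN address mapping: Econet station x on the
 * local net maps to <local /24 network>.x, UDP port 0x8000. */
#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	struct in_addr local, peer;
	unsigned char station = 42;		/* Econet station number */
	unsigned long network;

	inet_aton("192.168.1.7", &local);	/* invented local address */

	network = ntohl(local.s_addr) & 0xffffff00UL;	/* keep a.b.c     */
	peer.s_addr = htonl(network | station);		/* append station */

	printf("station %u -> %s, udp port %u\n",
	       (unsigned)station, inet_ntoa(peer), 0x8000u);
	return 0;
}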
-- AV + */ + iov[0].iov_base = (void *)&ah; + iov[0].iov_len = size; + for (i = 0; i < msg->msg_iovlen; i++) { + void __user *base = msg->msg_iov[i].iov_base; + size_t len = msg->msg_iov[i].iov_len; + /* Check it now since we switch to KERNEL_DS later. */ + if (!access_ok(VERIFY_READ, base, len)) + return -EFAULT; + iov[i+1].iov_base = base; + iov[i+1].iov_len = len; + size += len; + } + + /* Get a skbuff (no data, just holds our cb information) */ + if ((skb = sock_alloc_send_skb(sk, 0, + msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + return err; + + eb = (struct ec_cb *)&skb->cb; + + eb->cookie = saddr->cookie; + eb->timeout = (5*HZ); + eb->start = jiffies; + ah.handle = aun_seq; + eb->seq = (aun_seq++); + eb->sec = *saddr; + + skb_queue_tail(&aun_queue, skb); + + udpmsg.msg_name = (void *)&udpdest; + udpmsg.msg_namelen = sizeof(udpdest); + udpmsg.msg_iov = &iov[0]; + udpmsg.msg_iovlen = msg->msg_iovlen + 1; + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + oldfs = get_fs(); set_fs(KERNEL_DS); /* More privs :-) */ + err = sock_sendmsg(udpsock, &udpmsg, size); + set_fs(oldfs); +#else + err = -EPROTOTYPE; +#endif + return err; +} + +/* + * Look up the address of a socket. + */ + +static int econet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct econet_sock *eo = ec_sk(sk); + struct sockaddr_ec *sec = (struct sockaddr_ec *)uaddr; + + if (peer) + return -EOPNOTSUPP; + + sec->sec_family = AF_ECONET; + sec->port = eo->port; + sec->addr.station = eo->station; + sec->addr.net = eo->net; + + *uaddr_len = sizeof(*sec); + return 0; +} + +static void econet_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + + if (!atomic_read(&sk->sk_wmem_alloc) && + !atomic_read(&sk->sk_rmem_alloc)) { + sk_free(sk); + return; + } + + sk->sk_timer.expires = jiffies + 10 * HZ; + add_timer(&sk->sk_timer); + printk(KERN_DEBUG "econet socket destroy delayed\n"); +} + +/* + * Close an econet socket. + */ + +static int econet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + econet_remove_socket(&econet_sklist, sk); + + /* + * Now the socket is dead. No more input will appear. + */ + + sk->sk_state_change(sk); /* It is useless. Just for sanity. */ + + sock->sk = NULL; + sk->sk_socket = NULL; + sock_set_flag(sk, SOCK_DEAD); + + /* Purge queues */ + + skb_queue_purge(&sk->sk_receive_queue); + + if (atomic_read(&sk->sk_rmem_alloc) || + atomic_read(&sk->sk_wmem_alloc)) { + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.expires = jiffies + HZ; + sk->sk_timer.function = econet_destroy_timer; + add_timer(&sk->sk_timer); + return 0; + } + + sk_free(sk); + return 0; +} + +static struct proto econet_proto = { + .name = "ECONET", + .owner = THIS_MODULE, + .obj_size = sizeof(struct econet_sock), +}; + +/* + * Create an Econet socket + */ + +static int econet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct econet_sock *eo; + int err; + + /* Econet only provides datagram services. 
*/ + if (sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + + err = -ENOBUFS; + sk = sk_alloc(PF_ECONET, GFP_KERNEL, &econet_proto, 1); + if (sk == NULL) + goto out; + + sk->sk_reuse = 1; + sock->ops = &econet_ops; + sock_init_data(sock, sk); + + eo = ec_sk(sk); + sock_reset_flag(sk, SOCK_ZAPPED); + sk->sk_family = PF_ECONET; + eo->num = protocol; + + econet_insert_socket(&econet_sklist, sk); + return(0); +out: + return err; +} + +/* + * Handle Econet specific ioctls + */ + +static int ec_dev_ioctl(struct socket *sock, unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + struct ec_device *edev; + struct net_device *dev; + struct sockaddr_ec *sec; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + return -EFAULT; + + if ((dev = dev_get_by_name(ifr.ifr_name)) == NULL) + return -ENODEV; + + sec = (struct sockaddr_ec *)&ifr.ifr_addr; + + switch (cmd) + { + case SIOCSIFADDR: + edev = dev->ec_ptr; + if (edev == NULL) + { + /* Magic up a new one. */ + edev = kmalloc(sizeof(struct ec_device), GFP_KERNEL); + if (edev == NULL) { + printk("af_ec: memory squeeze.\n"); + dev_put(dev); + return -ENOMEM; + } + memset(edev, 0, sizeof(struct ec_device)); + dev->ec_ptr = edev; + } + else + net2dev_map[edev->net] = NULL; + edev->station = sec->addr.station; + edev->net = sec->addr.net; + net2dev_map[sec->addr.net] = dev; + if (!net2dev_map[0]) + net2dev_map[0] = dev; + dev_put(dev); + return 0; + + case SIOCGIFADDR: + edev = dev->ec_ptr; + if (edev == NULL) + { + dev_put(dev); + return -ENODEV; + } + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->addr.station = edev->station; + sec->addr.net = edev->net; + sec->sec_family = AF_ECONET; + dev_put(dev); + if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) + return -EFAULT; + return 0; + } + + dev_put(dev); + return -EINVAL; +} + +/* + * Handle generic ioctls + */ + +static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + + switch(cmd) { + case SIOCGSTAMP: + return sock_get_timestamp(sk, argp); + + case SIOCSIFADDR: + case SIOCGIFADDR: + return ec_dev_ioctl(sock, cmd, argp); + break; + + default: + return dev_ioctl(cmd, argp); + } + /*NOTREACHED*/ + return 0; +} + +static struct net_proto_family econet_family_ops = { + .family = PF_ECONET, + .create = econet_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { + .family = PF_ECONET, + .owner = THIS_MODULE, + .release = econet_release, + .bind = econet_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = econet_getname, + .poll = datagram_poll, + .ioctl = econet_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = econet_sendmsg, + .recvmsg = econet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#include +SOCKOPS_WRAP(econet, PF_ECONET); + +#if defined(CONFIG_ECONET_AUNUDP) || defined(CONFIG_ECONET_NATIVE) +/* + * Find the listening socket, if any, for the given data. 
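/*
 * ec_dev_ioctl() above answers SIOCGIFADDR/SIOCSIFADDR through the usual
 * struct ifreq interface, with the Econet address overlaid on ifr_addr.
 * A hedged userspace sketch of reading an interface's Econet address; it
 * assumes fd was created with socket(AF_ECONET, SOCK_DGRAM, 0), that the
 * interface name is valid, and that struct sockaddr_ec is visible to
 * userspace.
 */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

static int get_econet_address(int fd, const char *ifname)
{
	struct ifreq ifr;
	struct sockaddr_ec *sec = (struct sockaddr_ec *)&ifr.ifr_addr;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFADDR, &ifr) < 0)
		return -1;

	printf("%s: station %u, net %u\n", ifname,
	       (unsigned)sec->addr.station, (unsigned)sec->addr.net);
	return 0;
}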
+ */ + +static struct sock *ec_listening_socket(unsigned char port, unsigned char + station, unsigned char net) +{ + struct sock *sk; + struct hlist_node *node; + + sk_for_each(sk, node, &econet_sklist) { + struct econet_sock *opt = ec_sk(sk); + if ((opt->port == port || opt->port == 0) && + (opt->station == station || opt->station == 0) && + (opt->net == net || opt->net == 0)) + goto found; + } + sk = NULL; +found: + return sk; +} + +/* + * Queue a received packet for a socket. + */ + +static int ec_queue_packet(struct sock *sk, struct sk_buff *skb, + unsigned char stn, unsigned char net, + unsigned char cb, unsigned char port) +{ + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + struct sockaddr_ec *sec = (struct sockaddr_ec *)&eb->sec; + + memset(sec, 0, sizeof(struct sockaddr_ec)); + sec->sec_family = AF_ECONET; + sec->type = ECTYPE_PACKET_RECEIVED; + sec->port = port; + sec->cb = cb; + sec->addr.net = net; + sec->addr.station = stn; + + return sock_queue_rcv_skb(sk, skb); +} +#endif + +#ifdef CONFIG_ECONET_AUNUDP +/* + * Send an AUN protocol response. + */ + +static void aun_send_response(__u32 addr, unsigned long seq, int code, int cb) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_port = htons(AUN_PORT), + .sin_addr = {.s_addr = addr} + }; + struct aunhdr ah = {.code = code, .cb = cb, .handle = seq}; + struct kvec iov = {.iov_base = (void *)&ah, .iov_len = sizeof(ah)}; + struct msghdr udpmsg; + + udpmsg.msg_name = (void *)&sin; + udpmsg.msg_namelen = sizeof(sin); + udpmsg.msg_control = NULL; + udpmsg.msg_controllen = 0; + udpmsg.msg_flags=0; + + kernel_sendmsg(udpsock, &udpmsg, &iov, 1, sizeof(ah)); +} + + +/* + * Handle incoming AUN packets. Work out if anybody wants them, + * and send positive or negative acknowledgements as appropriate. + */ + +static void aun_incoming(struct sk_buff *skb, struct aunhdr *ah, size_t len) +{ + struct iphdr *ip = skb->nh.iph; + unsigned char stn = ntohl(ip->saddr) & 0xff; + struct sock *sk; + struct sk_buff *newskb; + struct ec_device *edev = skb->dev->ec_ptr; + + if (! edev) + goto bad; + + if ((sk = ec_listening_socket(ah->port, stn, edev->net)) == NULL) + goto bad; /* Nobody wants it */ + + newskb = alloc_skb((len - sizeof(struct aunhdr) + 15) & ~15, + GFP_ATOMIC); + if (newskb == NULL) + { + printk(KERN_DEBUG "AUN: memory squeeze, dropping packet.\n"); + /* Send nack and hope sender tries again */ + goto bad; + } + + memcpy(skb_put(newskb, len - sizeof(struct aunhdr)), (void *)(ah+1), + len - sizeof(struct aunhdr)); + + if (ec_queue_packet(sk, newskb, stn, edev->net, ah->cb, ah->port)) + { + /* Socket is bankrupt. */ + kfree_skb(newskb); + goto bad; + } + + aun_send_response(ip->saddr, ah->handle, 3, 0); + return; + +bad: + aun_send_response(ip->saddr, ah->handle, 4, 0); +} + +/* + * Handle incoming AUN transmit acknowledgements. If the sequence + * number matches something in our backlog then kill it and tell + * the user. If the remote took too long to reply then we may have + * dropped the packet already. 
+ */ + +static void aun_tx_ack(unsigned long seq, int result) +{ + struct sk_buff *skb; + unsigned long flags; + struct ec_cb *eb; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb = skb_peek(&aun_queue); + while (skb && skb != (struct sk_buff *)&aun_queue) + { + struct sk_buff *newskb = skb->next; + eb = (struct ec_cb *)&skb->cb; + if (eb->seq == seq) + goto foundit; + + skb = newskb; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + printk(KERN_DEBUG "AUN: unknown sequence %ld\n", seq); + return; + +foundit: + tx_result(skb->sk, eb->cookie, result); + skb_unlink(skb); + spin_unlock_irqrestore(&aun_queue_lock, flags); + kfree_skb(skb); +} + +/* + * Deal with received AUN frames - sort out what type of thing it is + * and hand it to the right function. + */ + +static void aun_data_available(struct sock *sk, int slen) +{ + int err; + struct sk_buff *skb; + unsigned char *data; + struct aunhdr *ah; + struct iphdr *ip; + size_t len; + + while ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) { + if (err == -EAGAIN) { + printk(KERN_ERR "AUN: no data available?!"); + return; + } + printk(KERN_DEBUG "AUN: recvfrom() error %d\n", -err); + } + + data = skb->h.raw + sizeof(struct udphdr); + ah = (struct aunhdr *)data; + len = skb->len - sizeof(struct udphdr); + ip = skb->nh.iph; + + switch (ah->code) + { + case 2: + aun_incoming(skb, ah, len); + break; + case 3: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_OK); + break; + case 4: + aun_tx_ack(ah->handle, ECTYPE_TRANSMIT_NOT_LISTENING); + break; +#if 0 + /* This isn't quite right yet. */ + case 5: + aun_send_response(ip->saddr, ah->handle, 6, ah->cb); + break; +#endif + default: + printk(KERN_DEBUG "unknown AUN packet (type %d)\n", data[0]); + } + + skb_free_datagram(sk, skb); +} + +/* + * Called by the timer to manage the AUN transmit queue. If a packet + * was sent to a dead or nonexistent host then we will never get an + * acknowledgement back. After a few seconds we need to spot this and + * drop the packet. + */ + +static void ab_cleanup(unsigned long h) +{ + struct sk_buff *skb; + unsigned long flags; + + spin_lock_irqsave(&aun_queue_lock, flags); + skb = skb_peek(&aun_queue); + while (skb && skb != (struct sk_buff *)&aun_queue) + { + struct sk_buff *newskb = skb->next; + struct ec_cb *eb = (struct ec_cb *)&skb->cb; + if ((jiffies - eb->start) > eb->timeout) + { + tx_result(skb->sk, eb->cookie, + ECTYPE_TRANSMIT_NOT_PRESENT); + skb_unlink(skb); + kfree_skb(skb); + } + skb = newskb; + } + spin_unlock_irqrestore(&aun_queue_lock, flags); + + mod_timer(&ab_cleanup_timer, jiffies + (HZ*2)); +} + +static int __init aun_udp_initialise(void) +{ + int error; + struct sockaddr_in sin; + + skb_queue_head_init(&aun_queue); + spin_lock_init(&aun_queue_lock); + init_timer(&ab_cleanup_timer); + ab_cleanup_timer.expires = jiffies + (HZ*2); + ab_cleanup_timer.function = ab_cleanup; + add_timer(&ab_cleanup_timer); + + memset(&sin, 0, sizeof(sin)); + sin.sin_port = htons(AUN_PORT); + + /* We can count ourselves lucky Acorn machines are too dim to + speak IPv6. 
:-) */ + if ((error = sock_create_kern(PF_INET, SOCK_DGRAM, 0, &udpsock)) < 0) + { + printk("AUN: socket error %d\n", -error); + return error; + } + + udpsock->sk->sk_reuse = 1; + udpsock->sk->sk_allocation = GFP_ATOMIC; /* we're going to call it + from interrupts */ + + error = udpsock->ops->bind(udpsock, (struct sockaddr *)&sin, + sizeof(sin)); + if (error < 0) + { + printk("AUN: bind error %d\n", -error); + goto release; + } + + udpsock->sk->sk_data_ready = aun_data_available; + + return 0; + +release: + sock_release(udpsock); + udpsock = NULL; + return error; +} +#endif + +#ifdef CONFIG_ECONET_NATIVE + +/* + * Receive an Econet frame from a device. + */ + +static int econet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct ec_framehdr *hdr; + struct sock *sk; + struct ec_device *edev = dev->ec_ptr; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if (!edev) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, sizeof(struct ec_framehdr))) + goto drop; + + hdr = (struct ec_framehdr *) skb->data; + + /* First check for encapsulated IP */ + if (hdr->port == EC_PORT_IP) { + skb->protocol = htons(ETH_P_IP); + skb_pull(skb, sizeof(struct ec_framehdr)); + netif_rx(skb); + return 0; + } + + sk = ec_listening_socket(hdr->port, hdr->src_stn, hdr->src_net); + if (!sk) + goto drop; + + if (ec_queue_packet(sk, skb, edev->net, hdr->src_stn, hdr->cb, + hdr->port)) + goto drop; + + return 0; + +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static struct packet_type econet_packet_type = { + .type = __constant_htons(ETH_P_ECONET), + .func = econet_rcv, +}; + +static void econet_hw_initialise(void) +{ + dev_add_pack(&econet_packet_type); +} + +#endif + +static int econet_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct net_device *dev = (struct net_device *)data; + struct ec_device *edev; + + switch (msg) { + case NETDEV_UNREGISTER: + /* A device has gone down - kill any data we hold for it. */ + edev = dev->ec_ptr; + if (edev) + { + if (net2dev_map[0] == dev) + net2dev_map[0] = NULL; + net2dev_map[edev->net] = NULL; + kfree(edev); + dev->ec_ptr = NULL; + } + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block econet_netdev_notifier = { + .notifier_call =econet_notifier, +}; + +static void __exit econet_proto_exit(void) +{ +#ifdef CONFIG_ECONET_AUNUDP + del_timer(&ab_cleanup_timer); + if (udpsock) + sock_release(udpsock); +#endif + unregister_netdevice_notifier(&econet_netdev_notifier); + sock_unregister(econet_family_ops.family); + proto_unregister(&econet_proto); +} + +static int __init econet_proto_init(void) +{ + int err = proto_register(&econet_proto, 0); + + if (err != 0) + goto out; + sock_register(&econet_family_ops); +#ifdef CONFIG_ECONET_AUNUDP + spin_lock_init(&aun_queue_lock); + aun_udp_initialise(); +#endif +#ifdef CONFIG_ECONET_NATIVE + econet_hw_initialise(); +#endif + register_netdevice_notifier(&econet_netdev_notifier); +out: + return err; +} + +module_init(econet_proto_init); +module_exit(econet_proto_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_ECONET); diff --git a/net/ethernet/Makefile b/net/ethernet/Makefile new file mode 100644 index 000000000000..69b74a9a0fc3 --- /dev/null +++ b/net/ethernet/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux Ethernet layer. 
+# + +obj-y += eth.o +obj-$(CONFIG_SYSCTL) += sysctl_net_ether.o +obj-$(subst m,y,$(CONFIG_IPX)) += pe2.o +obj-$(subst m,y,$(CONFIG_ATALK)) += pe2.o diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c new file mode 100644 index 000000000000..16c4234cbe12 --- /dev/null +++ b/net/ethernet/eth.c @@ -0,0 +1,308 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Ethernet-type device handling. + * + * Version: @(#)eth.c 1.0.7 05/25/93 + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Florian La Roche, + * Alan Cox, + * + * Fixes: + * Mr Linux : Arp problems + * Alan Cox : Generic queue tidyup (very tiny here) + * Alan Cox : eth_header ntohs should be htons + * Alan Cox : eth_rebuild_header missing an htons and + * minor other things. + * Tegge : Arp bug fixes. + * Florian : Removed many unnecessary functions, code cleanup + * and changes for new arp and skbuff. + * Alan Cox : Redid header building to reflect new format. + * Alan Cox : ARP only when compiled with CONFIG_INET + * Greg Page : 802.2 and SNAP stuff. + * Alan Cox : MAC layer pointers/new format. + * Paul Gortmaker : eth_copy_and_sum shouldn't csum padding. + * Alan Cox : Protect against forwarding explosions with + * older network drivers and IFF_ALLMULTI. + * Christer Weinigel : Better rebuild header message. + * Andrew Morton : 26Feb01: kill ether_setup() - use netdev_boot_setup(). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern int __init netdev_boot_setup(char *str); + +__setup("ether=", netdev_boot_setup); + +/* + * Create the Ethernet MAC header for an arbitrary protocol layer + * + * saddr=NULL means use device source address + * daddr=NULL means leave destination address (eg unresolved arp) + */ + +int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ethhdr *eth = (struct ethhdr *)skb_push(skb,ETH_HLEN); + + /* + * Set the protocol type. For a packet of type ETH_P_802_3 we put the length + * in here instead. It is up to the 802.2 layer to carry protocol information. + */ + + if(type!=ETH_P_802_3) + eth->h_proto = htons(type); + else + eth->h_proto = htons(len); + + /* + * Set the source hardware address. + */ + + if(saddr) + memcpy(eth->h_source,saddr,dev->addr_len); + else + memcpy(eth->h_source,dev->dev_addr,dev->addr_len); + + /* + * Anyway, the loopback-device should never use this function... + */ + + if (dev->flags & (IFF_LOOPBACK|IFF_NOARP)) + { + memset(eth->h_dest, 0, dev->addr_len); + return ETH_HLEN; + } + + if(daddr) + { + memcpy(eth->h_dest,daddr,dev->addr_len); + return ETH_HLEN; + } + + return -ETH_HLEN; +} + + +/* + * Rebuild the Ethernet MAC header. This is called after an ARP + * (or in future other address resolution) has completed on this + * sk_buff. We now let ARP fill in the other fields. + * + * This routine CANNOT use cached dst->neigh! 
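/*
 * eth_header() above writes the 14-byte Ethernet header and stores either
 * a protocol type (Ethernet II) or the payload length (802.3) in the last
 * two bytes.  A stand-alone userspace sketch of the same layout, using the
 * definitions from linux/if_ether.h; the addresses are supplied by the
 * caller.
 */
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>	/* struct ethhdr, ETH_HLEN, ETH_ALEN, ETH_P_* */

static size_t build_eth_header(uint8_t *frame,
			       const uint8_t dest[ETH_ALEN],
			       const uint8_t src[ETH_ALEN],
			       uint16_t type, uint16_t len)
{
	struct ethhdr *eth = (struct ethhdr *)frame;

	memcpy(eth->h_dest, dest, ETH_ALEN);
	memcpy(eth->h_source, src, ETH_ALEN);
	/* Ethernet II carries a type, 802.3 carries the frame length. */
	eth->h_proto = (type != ETH_P_802_3) ? htons(type) : htons(len);
	return ETH_HLEN;
}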
+ * Really, it is used only when dst->neigh is wrong. + */ + +int eth_rebuild_header(struct sk_buff *skb) +{ + struct ethhdr *eth = (struct ethhdr *)skb->data; + struct net_device *dev = skb->dev; + + switch (eth->h_proto) + { +#ifdef CONFIG_INET + case __constant_htons(ETH_P_IP): + return arp_find(eth->h_dest, skb); +#endif + default: + printk(KERN_DEBUG + "%s: unable to resolve type %X addresses.\n", + dev->name, (int)eth->h_proto); + + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + break; + } + + return 0; +} + + +/* + * Determine the packet's protocol ID. The rule here is that we + * assume 802.3 if the type field is short enough to be a length. + * This is normal practice and works for any 'now in use' protocol. + */ + +unsigned short eth_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + struct ethhdr *eth; + unsigned char *rawp; + + skb->mac.raw=skb->data; + skb_pull(skb,ETH_HLEN); + eth = eth_hdr(skb); + skb->input_dev = dev; + + if(*eth->h_dest&1) + { + if(memcmp(eth->h_dest,dev->broadcast, ETH_ALEN)==0) + skb->pkt_type=PACKET_BROADCAST; + else + skb->pkt_type=PACKET_MULTICAST; + } + + /* + * This ALLMULTI check should be redundant by 1.4 + * so don't forget to remove it. + * + * Seems, you forgot to remove it. All silly devices + * seems to set IFF_PROMISC. + */ + + else if(1 /*dev->flags&IFF_PROMISC*/) + { + if(memcmp(eth->h_dest,dev->dev_addr, ETH_ALEN)) + skb->pkt_type=PACKET_OTHERHOST; + } + + if (ntohs(eth->h_proto) >= 1536) + return eth->h_proto; + + rawp = skb->data; + + /* + * This is a magic hack to spot IPX packets. Older Novell breaks + * the protocol design and runs IPX over 802.3 without an 802.2 LLC + * layer. We look for FFFF which isn't a used 802.2 SSAP/DSAP. This + * won't work for fault tolerant netware but does for the rest. + */ + if (*(unsigned short *)rawp == 0xFFFF) + return htons(ETH_P_802_3); + + /* + * Real 802.2 LLC + */ + return htons(ETH_P_802_2); +} + +static int eth_header_parse(struct sk_buff *skb, unsigned char *haddr) +{ + struct ethhdr *eth = eth_hdr(skb); + memcpy(haddr, eth->h_source, ETH_ALEN); + return ETH_ALEN; +} + +int eth_header_cache(struct neighbour *neigh, struct hh_cache *hh) +{ + unsigned short type = hh->hh_type; + struct ethhdr *eth; + struct net_device *dev = neigh->dev; + + eth = (struct ethhdr*) + (((u8*)hh->hh_data) + (HH_DATA_OFF(sizeof(*eth)))); + + if (type == __constant_htons(ETH_P_802_3)) + return -1; + + eth->h_proto = type; + memcpy(eth->h_source, dev->dev_addr, dev->addr_len); + memcpy(eth->h_dest, neigh->ha, dev->addr_len); + hh->hh_len = ETH_HLEN; + return 0; +} + +/* + * Called by Address Resolution module to notify changes in address. + */ + +void eth_header_cache_update(struct hh_cache *hh, struct net_device *dev, unsigned char * haddr) +{ + memcpy(((u8*)hh->hh_data) + HH_DATA_OFF(sizeof(struct ethhdr)), + haddr, dev->addr_len); +} + +EXPORT_SYMBOL(eth_type_trans); + +static int eth_mac_addr(struct net_device *dev, void *p) +{ + struct sockaddr *addr=p; + if (netif_running(dev)) + return -EBUSY; + memcpy(dev->dev_addr, addr->sa_data,dev->addr_len); + return 0; +} + +static int eth_change_mtu(struct net_device *dev, int new_mtu) +{ + if ((new_mtu < 68) || (new_mtu > 1500)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +/* + * Fill in the fields of the device structure with ethernet-generic values. 
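/*
 * eth_type_trans() above disambiguates the 16-bit field after the MAC
 * addresses: values >= 1536 are Ethernet II protocol types, smaller values
 * are 802.3 lengths, and a leading 0xFFFF in the payload marks Novell's
 * raw IPX-over-802.3.  The same decision as a small stand-alone helper:
 */
#include <stdint.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>	/* ETH_P_802_3, ETH_P_802_2 */

/* h_proto is in network byte order, exactly as it appears on the wire. */
static uint16_t classify_frame(uint16_t h_proto, const uint8_t *payload)
{
	if (ntohs(h_proto) >= 1536)
		return h_proto;			/* Ethernet II type     */
	if (payload[0] == 0xFF && payload[1] == 0xFF)
		return htons(ETH_P_802_3);	/* raw Novell IPX frame */
	return htons(ETH_P_802_2);		/* genuine 802.2 LLC    */
}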
+ */ +void ether_setup(struct net_device *dev) +{ + dev->change_mtu = eth_change_mtu; + dev->hard_header = eth_header; + dev->rebuild_header = eth_rebuild_header; + dev->set_mac_address = eth_mac_addr; + dev->hard_header_cache = eth_header_cache; + dev->header_cache_update= eth_header_cache_update; + dev->hard_header_parse = eth_header_parse; + + dev->type = ARPHRD_ETHER; + dev->hard_header_len = ETH_HLEN; + dev->mtu = 1500; /* eth_mtu */ + dev->addr_len = ETH_ALEN; + dev->tx_queue_len = 1000; /* Ethernet wants good queues */ + dev->flags = IFF_BROADCAST|IFF_MULTICAST; + + memset(dev->broadcast,0xFF, ETH_ALEN); + +} +EXPORT_SYMBOL(ether_setup); + +/** + * alloc_etherdev - Allocates and sets up an ethernet device + * @sizeof_priv: Size of additional driver-private structure to be allocated + * for this ethernet device + * + * Fill in the fields of the device structure with ethernet-generic + * values. Basically does everything except registering the device. + * + * Constructs a new net device, complete with a private data area of + * size @sizeof_priv. A 32-byte (not bit) alignment is enforced for + * this private data area. + */ + +struct net_device *alloc_etherdev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "eth%d", ether_setup); +} +EXPORT_SYMBOL(alloc_etherdev); diff --git a/net/ethernet/pe2.c b/net/ethernet/pe2.c new file mode 100644 index 000000000000..98a494be6039 --- /dev/null +++ b/net/ethernet/pe2.c @@ -0,0 +1,40 @@ +#include +#include +#include +#include +#include + +#include + +static int pEII_request(struct datalink_proto *dl, + struct sk_buff *skb, unsigned char *dest_node) +{ + struct net_device *dev = skb->dev; + + skb->protocol = htons(ETH_P_IPX); + if (dev->hard_header) + dev->hard_header(skb, dev, ETH_P_IPX, + dest_node, NULL, skb->len); + return dev_queue_xmit(skb); +} + +struct datalink_proto *make_EII_client(void) +{ + struct datalink_proto *proto = kmalloc(sizeof(*proto), GFP_ATOMIC); + + if (proto) { + proto->header_length = 0; + proto->request = pEII_request; + } + + return proto; +} + +void destroy_EII_client(struct datalink_proto *dl) +{ + if (dl) + kfree(dl); +} + +EXPORT_SYMBOL(destroy_EII_client); +EXPORT_SYMBOL(make_EII_client); diff --git a/net/ethernet/sysctl_net_ether.c b/net/ethernet/sysctl_net_ether.c new file mode 100644 index 000000000000..b81a6d532342 --- /dev/null +++ b/net/ethernet/sysctl_net_ether.c @@ -0,0 +1,13 @@ +/* -*- linux-c -*- + * sysctl_net_ether.c: sysctl interface to net Ethernet subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ether directory entry (empty =) ). [MS] + */ + +#include +#include + +ctl_table ether_table[] = { + {0} +}; diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig new file mode 100644 index 000000000000..6d3e8b1bd1f2 --- /dev/null +++ b/net/ipv4/Kconfig @@ -0,0 +1,411 @@ +# +# IP configuration +# +config IP_MULTICAST + bool "IP: multicasting" + depends on INET + help + This is code for addressing several networked computers at once, + enlarging your kernel by about 2 KB. You need multicasting if you + intend to participate in the MBONE, a high bandwidth network on top + of the Internet which carries audio and video broadcasts. More + information about the MBONE is on the WWW at + . Information about the multicast + capabilities of the various network cards is contained in + . For most people, it's + safe to say N. + +config IP_ADVANCED_ROUTER + bool "IP: advanced router" + depends on INET + ---help--- + If you intend to run your Linux box mostly as a router, i.e. 
as a + computer that forwards and redistributes network packets, say Y; you + will then be presented with several options that allow more precise + control about the routing process. + + The answer to this question won't directly affect the kernel: + answering N will just cause the configurator to skip all the + questions about advanced routing. + + Note that your box can only act as a router if you enable IP + forwarding in your kernel; you can do that by saying Y to "/proc + file system support" and "Sysctl support" below and executing the + line + + echo "1" > /proc/sys/net/ipv4/ip_forward + + at boot time after the /proc file system has been mounted. + + If you turn on IP forwarding, you will also get the rp_filter, which + automatically rejects incoming packets if the routing table entry + for their source address doesn't match the network interface they're + arriving on. This has security advantages because it prevents the + so-called IP spoofing, however it can pose problems if you use + asymmetric routing (packets from you to a host take a different path + than packets from that host to you) or if you operate a non-routing + host which has several IP addresses on different interfaces. To turn + rp_filter off use: + + echo 0 > /proc/sys/net/ipv4/conf//rp_filter + or + echo 0 > /proc/sys/net/ipv4/conf/all/rp_filter + + If unsure, say N here. + +config IP_MULTIPLE_TABLES + bool "IP: policy routing" + depends on IP_ADVANCED_ROUTER + ---help--- + Normally, a router decides what to do with a received packet based + solely on the packet's final destination address. If you say Y here, + the Linux router will also be able to take the packet's source + address into account. Furthermore, the TOS (Type-Of-Service) field + of the packet can be used for routing decisions as well. + + If you are interested in this, please see the preliminary + documentation at + and . + You will need supporting software from + . + + If unsure, say N. + +config IP_ROUTE_FWMARK + bool "IP: use netfilter MARK value as routing key" + depends on IP_MULTIPLE_TABLES && NETFILTER + help + If you say Y here, you will be able to specify different routes for + packets with different mark values (see iptables(8), MARK target). + +config IP_ROUTE_MULTIPATH + bool "IP: equal cost multipath" + depends on IP_ADVANCED_ROUTER + help + Normally, the routing tables specify a single action to be taken in + a deterministic manner for a given packet. If you say Y here + however, it becomes possible to attach several actions to a packet + pattern, in effect specifying several alternative paths to travel + for those packets. The router considers all these paths to be of + equal "cost" and chooses one of them in a non-deterministic fashion + if a matching packet arrives. + +config IP_ROUTE_MULTIPATH_CACHED + bool "IP: equal cost multipath with caching support (EXPERIMENTAL)" + depends on: IP_ROUTE_MULTIPATH + help + Normally, equal cost multipath routing is not supported by the + routing cache. If you say Y here, alternative routes are cached + and on cache lookup a route is chosen in a configurable fashion. + + If unsure, say N. + +config IP_ROUTE_MULTIPATH_RR + tristate "MULTIPATH: round robin algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Mulitpath routes are chosen according to Round Robin + +config IP_ROUTE_MULTIPATH_RANDOM + tristate "MULTIPATH: random algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Multipath routes are chosen in a random fashion. Actually, + there is no weight for a route. 
The advantage of this policy + is that it is implemented stateless and therefore introduces only + a very small delay. + +config IP_ROUTE_MULTIPATH_WRANDOM + tristate "MULTIPATH: weighted random algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Multipath routes are chosen in a weighted random fashion. + The per route weights are the weights visible via ip route 2. As the + corresponding state management introduces some overhead routing delay + is increased. + +config IP_ROUTE_MULTIPATH_DRR + tristate "MULTIPATH: interface round robin algorithm" + depends on IP_ROUTE_MULTIPATH_CACHED + help + Connections are distributed in a round robin fashion over the + available interfaces. This policy makes sense if the connections + should be primarily distributed on interfaces and not on routes. + +config IP_ROUTE_VERBOSE + bool "IP: verbose route monitoring" + depends on IP_ADVANCED_ROUTER + help + If you say Y here, which is recommended, then the kernel will print + verbose messages regarding the routing, for example warnings about + received packets which look strange and could be evidence of an + attack or a misconfigured system somewhere. The information is + handled by the klogd daemon which is responsible for kernel messages + ("man klogd"). + +config IP_PNP + bool "IP: kernel level autoconfiguration" + depends on INET + help + This enables automatic configuration of IP addresses of devices and + of the routing table during kernel boot, based on either information + supplied on the kernel command line or by BOOTP or RARP protocols. + You need to say Y only for diskless machines requiring network + access to boot (in which case you want to say Y to "Root file system + on NFS" as well), because all other machines configure the network + in their startup scripts. + +config IP_PNP_DHCP + bool "IP: DHCP support" + depends on IP_PNP + ---help--- + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the DHCP protocol (a + special protocol designed for doing this job), say Y here. In case + the boot ROM of your network card was designed for booting Linux and + does DHCP itself, providing all necessary information on the kernel + command line, you can say N here. + + If unsure, say Y. Note that if you want to use DHCP, a DHCP server + must be operating on your network. Read + for details. + +config IP_PNP_BOOTP + bool "IP: BOOTP support" + depends on IP_PNP + ---help--- + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the BOOTP protocol (a + special protocol designed for doing this job), say Y here. In case + the boot ROM of your network card was designed for booting Linux and + does BOOTP itself, providing all necessary information on the kernel + command line, you can say N here. If unsure, say Y. Note that if you + want to use BOOTP, a BOOTP server must be operating on your network. + Read for details. 
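The random and weighted-random policies above differ only in how the next hop is drawn: the weighted variant picks each cached route with probability proportional to its configured weight. A minimal stand-alone sketch of that selection rule (the route table, the weights and the use of rand() are illustrative, not the kernel's data structures):

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

struct route { const char *gw; unsigned int weight; };

/* Pick index i with probability weight[i] / sum(weights). */
static int pick_weighted(const struct route *tbl, int n)
{
	unsigned int total = 0, r;
	int i;

	for (i = 0; i < n; i++)
		total += tbl[i].weight;
	r = (unsigned int)(rand() % total);
	for (i = 0; i < n; i++) {
		if (r < tbl[i].weight)
			return i;
		r -= tbl[i].weight;
	}
	return n - 1;	/* not reached */
}

int main(void)
{
	struct route tbl[] = { { "10.0.0.1", 3 }, { "10.0.0.2", 1 } };

	srand((unsigned int)time(NULL));
	printf("next hop: %s\n", tbl[pick_weighted(tbl, 2)].gw);
	return 0;
}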
+ +config IP_PNP_RARP + bool "IP: RARP support" + depends on IP_PNP + help + If you want your Linux box to mount its whole root file system (the + one containing the directory /) from some other computer over the + net via NFS and you want the IP address of your computer to be + discovered automatically at boot time using the RARP protocol (an + older protocol which is being obsoleted by BOOTP and DHCP), say Y + here. Note that if you want to use RARP, a RARP server must be + operating on your network. Read for + details. + +# not yet ready.. +# bool ' IP: ARP support' CONFIG_IP_PNP_ARP +config NET_IPIP + tristate "IP: tunneling" + depends on INET + select INET_TUNNEL + ---help--- + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + encapsulation of IP within IP, which sounds kind of pointless, but + can be useful if you want to make your (or some other) machine + appear on a different network than it physically is, or to use + mobile-IP facilities (allowing laptops to seamlessly move between + networks without changing their IP addresses). + + Saying Y to this option will produce two modules ( = code which can + be inserted in and removed from the running kernel whenever you + want). Most people won't need this and can say N. + +config NET_IPGRE + tristate "IP: GRE tunnels over IP" + depends on INET + select XFRM + help + Tunneling means encapsulating data of one protocol type within + another protocol and sending it over a channel that understands the + encapsulating protocol. This particular tunneling driver implements + GRE (Generic Routing Encapsulation) and at this time allows + encapsulating of IPv4 or IPv6 over existing IPv4 infrastructure. + This driver is useful if the other endpoint is a Cisco router: Cisco + likes GRE much better than the other Linux tunneling driver ("IP + tunneling" above). In addition, GRE allows multicast redistribution + through the tunnel. + +config NET_IPGRE_BROADCAST + bool "IP: broadcast GRE over IP" + depends on IP_MULTICAST && NET_IPGRE + help + One application of GRE/IP is to construct a broadcast WAN (Wide Area + Network), which looks like a normal Ethernet LAN (Local Area + Network), but can be distributed all over the Internet. If you want + to do that, say Y here and to "IP multicast routing" below. + +config IP_MROUTE + bool "IP: multicast routing" + depends on IP_MULTICAST + help + This is used if you want your machine to act as a router for IP + packets that have several destination addresses. It is needed on the + MBONE, a high bandwidth network on top of the Internet which carries + audio and video broadcasts. In order to do that, you would most + likely run the program mrouted. Information about the multicast + capabilities of the various network cards is contained in + . If you haven't heard + about it, you don't need it. + +config IP_PIMSM_V1 + bool "IP: PIM-SM version 1 support" + depends on IP_MROUTE + help + Kernel side support for Sparse Mode PIM (Protocol Independent + Multicast) version 1. This multicast routing protocol is used widely + because Cisco supports it. You need special software to use it + (pimd-v1). Please see for more + information about PIM. + + Say Y if you want to use PIM-SM v1. Note that you can say N here if + you just want to use Dense Mode PIM. 
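The NET_IPIP help above describes the simplest form of tunneling: the original datagram becomes the payload of an outer IPv4 header whose protocol field is IPPROTO_IPIP (4). A minimal userspace sketch of building such an outer header (the addresses come from the caller; options, fragmentation and checksumming are deliberately left out):

#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>
#include <netinet/ip.h>		/* struct iphdr, IPPROTO_IPIP */

/* Wrap an already-built IPv4 packet in an outer IP-in-IP header. */
static size_t ipip_encapsulate(uint8_t *out,
			       const uint8_t *inner, size_t inner_len,
			       uint32_t tunnel_src, uint32_t tunnel_dst)
{
	struct iphdr *outer = (struct iphdr *)out;

	memset(outer, 0, sizeof(*outer));
	outer->version  = 4;
	outer->ihl      = 5;			/* 20-byte header, no options */
	outer->tot_len  = htons((uint16_t)(sizeof(*outer) + inner_len));
	outer->ttl      = 64;
	outer->protocol = IPPROTO_IPIP;		/* "IP within IP" */
	outer->saddr    = tunnel_src;		/* network byte order */
	outer->daddr    = tunnel_dst;
	/* header checksum left for the caller or the sending stack */

	memcpy(out + sizeof(*outer), inner, inner_len);
	return sizeof(*outer) + inner_len;
}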
+ +config IP_PIMSM_V2 + bool "IP: PIM-SM version 2 support" + depends on IP_MROUTE + help + Kernel side support for Sparse Mode PIM version 2. In order to use + this, you need an experimental routing daemon supporting it (pimd or + gated-5). This routing protocol is not used widely, so say N unless + you want to play with it. + +config ARPD + bool "IP: ARP daemon support (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + ---help--- + Normally, the kernel maintains an internal cache which maps IP + addresses to hardware addresses on the local network, so that + Ethernet/Token Ring/ etc. frames are sent to the proper address on + the physical networking layer. For small networks having a few + hundred directly connected hosts or less, keeping this address + resolution (ARP) cache inside the kernel works well. However, + maintaining an internal ARP cache does not work well for very large + switched networks, and will use a lot of kernel memory if TCP/IP + connections are made to many machines on the network. + + If you say Y here, the kernel's internal ARP cache will never grow + to more than 256 entries (the oldest entries are expired in a LIFO + manner) and communication will be attempted with the user space ARP + daemon arpd. Arpd then answers the address resolution request either + from its own cache or by asking the net. + + This code is experimental and also obsolete. If you want to use it, + you need to find a version of the daemon arpd on the net somewhere, + and you should also say Y to "Kernel/User network link driver", + below. If unsure, say N. + +config SYN_COOKIES + bool "IP: TCP syncookie support (disabled per default)" + depends on INET + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote + users from being able to connect to your computer during an ongoing + attack and requires very little work from the attacker, who can + operate from anywhere on the Internet. + + SYN cookies provide protection against this type of attack. If you + say Y here, the TCP/IP stack will use a cryptographic challenge + protocol known as "SYN cookies" to enable legitimate users to + continue to connect, even when your machine is under attack. There + is no need for the legitimate users to change their TCP/IP software; + SYN cookies work transparently to them. For technical information + about SYN cookies, check out . + + If you are SYN flooded, the source address reported by the kernel is + likely to have been forged by the attacker; it is only reported as + an aid in tracing the packets to their actual source and should not + be taken as absolute truth. + + SYN cookies may prevent correct error reporting on clients when the + server is really overloaded. If this happens frequently better turn + them off. + + If you say Y here, note that SYN cookies aren't enabled by default; + you can enable them by saying Y to "/proc file system support" and + "Sysctl support" below and executing the command + + echo 1 >/proc/sys/net/ipv4/tcp_syncookies + + at boot time after the /proc file system has been mounted. + + If unsure, say N. + +config INET_AH + tristate "IP: AH transformation" + depends on INET + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. 
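As the SYN_COOKIES help above notes, the feature stays disabled until the sysctl is set. The echo command it quotes, expressed as a small C helper using the same /proc path:

#include <stdio.h>

/* Equivalent of: echo 1 > /proc/sys/net/ipv4/tcp_syncookies */
static int enable_syncookies(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/tcp_syncookies", "w");

	if (!f)
		return -1;
	fputs("1\n", f);
	return fclose(f);
}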
+ +config INET_ESP + tristate "IP: ESP transformation" + depends on INET + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET_IPCOMP + tristate "IP: IPComp transformation" + depends on INET + select XFRM + select INET_TUNNEL + select CRYPTO + select CRYPTO_DEFLATE + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config INET_TUNNEL + tristate "IP: tunnel transformation" + depends on INET + select XFRM + ---help--- + Support for generic IP tunnel transformation, which is required by + the IP tunneling module as well as tunnel mode IPComp. + + If unsure, say Y. + +config IP_TCPDIAG + tristate "IP: TCP socket monitoring interface" + depends on INET + default y + ---help--- + Support for TCP socket monitoring interface used by native Linux + tools such as ss. ss is included in iproute2, currently downloadable + at . If you want IPv6 support + and have selected IPv6 as a module, you need to build this as a + module too. + + If unsure, say Y. + +config IP_TCPDIAG_IPV6 + def_bool (IP_TCPDIAG=y && IPV6=y) || (IP_TCPDIAG=m && IPV6) + +source "net/ipv4/ipvs/Kconfig" + diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile new file mode 100644 index 000000000000..8b379627ebb6 --- /dev/null +++ b/net/ipv4/Makefile @@ -0,0 +1,33 @@ +# +# Makefile for the Linux TCP/IP (INET) layer. +# + +obj-y := utils.o route.o inetpeer.o protocol.o \ + ip_input.o ip_fragment.o ip_forward.o ip_options.o \ + ip_output.o ip_sockglue.o \ + tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o tcp_minisocks.o \ + datagram.o raw.o udp.o arp.o icmp.o devinet.o af_inet.o igmp.o \ + sysctl_net_ipv4.o fib_frontend.o fib_semantics.o fib_hash.o + +obj-$(CONFIG_PROC_FS) += proc.o +obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o +obj-$(CONFIG_IP_MROUTE) += ipmr.o +obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE) += ip_gre.o +obj-$(CONFIG_SYN_COOKIES) += syncookies.o +obj-$(CONFIG_INET_AH) += ah4.o +obj-$(CONFIG_INET_ESP) += esp4.o +obj-$(CONFIG_INET_IPCOMP) += ipcomp.o +obj-$(CONFIG_INET_TUNNEL) += xfrm4_tunnel.o +obj-$(CONFIG_IP_PNP) += ipconfig.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_WRANDOM) += multipath_wrandom.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_DRR) += multipath_drr.o +obj-$(CONFIG_NETFILTER) += netfilter/ +obj-$(CONFIG_IP_VS) += ipvs/ +obj-$(CONFIG_IP_TCPDIAG) += tcp_diag.o +obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o + +obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ + xfrm4_output.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c new file mode 100644 index 000000000000..c34dab67e461 --- /dev/null +++ b/net/ipv4/af_inet.c @@ -0,0 +1,1188 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET protocol family socket handler. + * + * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Florian La Roche, + * Alan Cox, + * + * Changes (see also sock.c) + * + * piggy, + * Karl Knutson : Socket protocol table + * A.N.Kuznetsov : Socket death error in accept(). 
+ * John Richardson : Fix non blocking error in connect() + * so sockets that fail to connect + * don't return -EINPROGRESS. + * Alan Cox : Asynchronous I/O support + * Alan Cox : Keep correct socket pointer on sock + * structures + * when accept() ed + * Alan Cox : Semantics of SO_LINGER aren't state + * moved to close when you look carefully. + * With this fixed and the accept bug fixed + * some RPC stuff seems happier. + * Niibe Yutaka : 4.4BSD style write async I/O + * Alan Cox, + * Tony Gale : Fixed reuse semantics. + * Alan Cox : bind() shouldn't abort existing but dead + * sockets. Stops FTP netin:.. I hope. + * Alan Cox : bind() works correctly for RAW sockets. + * Note that FreeBSD at least was broken + * in this respect so be careful with + * compatibility tests... + * Alan Cox : routing cache support + * Alan Cox : memzero the socket structure for + * compactness. + * Matt Day : nonblock connect error handler + * Alan Cox : Allow large numbers of pending sockets + * (eg for big web sites), but only if + * specifically application requested. + * Alan Cox : New buffering throughout IP. Used + * dumbly. + * Alan Cox : New buffering now used smartly. + * Alan Cox : BSD rather than common sense + * interpretation of listen. + * Germano Caronni : Assorted small races. + * Alan Cox : sendmsg/recvmsg basic support. + * Alan Cox : Only sendmsg/recvmsg now supported. + * Alan Cox : Locked down bind (see security list). + * Alan Cox : Loosened bind a little. + * Mike McLagan : ADD/DEL DLCI Ioctls + * Willy Konynenberg : Transparent proxying support. + * David S. Miller : New socket lookup architecture. + * Some other random speedups. + * Cyrus Durgin : Cleaned up file for kmod hacks. + * Andi Kleen : Fix inet_stream_connect TCP race. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IP_MROUTE +#include +#endif + +DEFINE_SNMP_STAT(struct linux_mib, net_statistics); + +#ifdef INET_REFCNT_DEBUG +atomic_t inet_sock_nr; +#endif + +extern void ip_mc_drop_socket(struct sock *sk); + +/* The inetsw table contains everything that inet_create needs to + * build a new socket. 
+ */ +static struct list_head inetsw[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw_lock); + +/* New destruction routine */ + +void inet_sock_destruct(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + + __skb_queue_purge(&sk->sk_receive_queue); + __skb_queue_purge(&sk->sk_error_queue); + + if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) { + printk("Attempt to release TCP socket in state %d %p\n", + sk->sk_state, sk); + return; + } + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive inet socket %p\n", sk); + return; + } + + BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + BUG_TRAP(!sk->sk_wmem_queued); + BUG_TRAP(!sk->sk_forward_alloc); + + if (inet->opt) + kfree(inet->opt); + dst_release(sk->sk_dst_cache); +#ifdef INET_REFCNT_DEBUG + atomic_dec(&inet_sock_nr); + printk(KERN_DEBUG "INET socket %p released, %d are still alive\n", + sk, atomic_read(&inet_sock_nr)); +#endif +} + +/* + * The routines beyond this point handle the behaviour of an AF_INET + * socket object. Mostly it punts to the subprotocols of IP to do + * the work. + */ + +/* + * Automatically bind an unbound socket. + */ + +static int inet_autobind(struct sock *sk) +{ + struct inet_sock *inet; + /* We may need to bind the socket. */ + lock_sock(sk); + inet = inet_sk(sk); + if (!inet->num) { + if (sk->sk_prot->get_port(sk, 0)) { + release_sock(sk); + return -EAGAIN; + } + inet->sport = htons(inet->num); + } + release_sock(sk); + return 0; +} + +/* + * Move a socket into listening state. + */ +int inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + unsigned char old_state; + int err; + + lock_sock(sk); + + err = -EINVAL; + if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM) + goto out; + + old_state = sk->sk_state; + if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN))) + goto out; + + /* Really, if the socket is already in listen state + * we can only allow the backlog to be adjusted. + */ + if (old_state != TCP_LISTEN) { + err = tcp_listen_start(sk); + if (err) + goto out; + } + sk->sk_max_ack_backlog = backlog; + err = 0; + +out: + release_sock(sk); + return err; +} + +/* + * Create an inet socket. + */ + +static int inet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct list_head *p; + struct inet_protosw *answer; + struct inet_sock *inet; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int err; + + sock->state = SS_UNCONNECTED; + + /* Look for the requested type/protocol pair. */ + answer = NULL; + rcu_read_lock(); + list_for_each_rcu(p, &inetsw[sock->type]) { + answer = list_entry(p, struct inet_protosw, list); + + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. 
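/*
 * The lookup above treats IPPROTO_IP (0) as a wildcard, so from userspace
 * the two calls below resolve to the same TCP entry in inetsw[].  Minimal
 * sketch:
 */
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

static void create_equivalent_sockets(void)
{
	/* protocol 0 lets the stack pick the default for SOCK_STREAM */
	int a = socket(AF_INET, SOCK_STREAM, 0);
	/* naming IPPROTO_TCP explicitly matches the same entry */
	int b = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

	if (a >= 0)
		close(a);
	if (b >= 0)
		close(b);
}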
*/ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + answer = NULL; + } + + err = -ESOCKTNOSUPPORT; + if (!answer) + goto out_rcu_unlock; + err = -EPERM; + if (answer->capability > 0 && !capable(answer->capability)) + goto out_rcu_unlock; + err = -EPROTONOSUPPORT; + if (!protocol) + goto out_rcu_unlock; + + sock->ops = answer->ops; + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + BUG_TRAP(answer_prot->slab != NULL); + + err = -ENOBUFS; + sk = sk_alloc(PF_INET, GFP_KERNEL, answer_prot, 1); + if (sk == NULL) + goto out; + + err = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + + if (SOCK_RAW == sock->type) { + inet->num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + + inet->id = 0; + + sock_init_data(sock, sk); + + sk->sk_destruct = inet_sock_destruct; + sk->sk_family = PF_INET; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + + inet->uc_ttl = -1; + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet_sock_nr); +#endif + + if (inet->num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically + * shares. + */ + inet->sport = htons(inet->num); + /* Add to protocol hash chains. */ + sk->sk_prot->hash(sk); + } + + if (sk->sk_prot->init) { + err = sk->sk_prot->init(sk); + if (err) + sk_common_release(sk); + } +out: + return err; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* + * The peer socket should always be NULL (or else). When we call this + * function we are destroying the object and from then on nobody + * should refer to it. + */ +int inet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk) { + long timeout; + + /* Applications forget to leave groups before exiting */ + ip_mc_drop_socket(sk); + + /* If linger is set, we don't return until the close + * is complete. Otherwise we return immediately. The + * actually closing is done the same either way. + * + * If the close is due to the process exiting, we never + * linger.. + */ + timeout = 0; + if (sock_flag(sk, SOCK_LINGER) && + !(current->flags & PF_EXITING)) + timeout = sk->sk_lingertime; + sock->sk = NULL; + sk->sk_prot->close(sk, timeout); + } + return 0; +} + +/* It is off by default, see below. */ +int sysctl_ip_nonlocal_bind; + +int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in *addr = (struct sockaddr_in *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + unsigned short snum; + int chk_addr_ret; + int err; + + /* If the socket has its own bind function then use it. (RAW) */ + if (sk->sk_prot->bind) { + err = sk->sk_prot->bind(sk, uaddr, addr_len); + goto out; + } + err = -EINVAL; + if (addr_len < sizeof(struct sockaddr_in)) + goto out; + + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + + /* Not specified by any standard per-se, however it breaks too + * many applications when removed. It is unfortunate since + * allowing applications to make a non-local bind solves + * several problems with systems using dynamic addressing. + * (ie. 
your servers still start up even if your ISDN link + * is temporarily down) + */ + err = -EADDRNOTAVAIL; + if (!sysctl_ip_nonlocal_bind && + !inet->freebind && + addr->sin_addr.s_addr != INADDR_ANY && + chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && + chk_addr_ret != RTN_BROADCAST) + goto out; + + snum = ntohs(addr->sin_port); + err = -EACCES; + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + goto out; + + /* We keep a pair of addresses. rcv_saddr is the one + * used by hash lookups, and saddr is used for transmit. + * + * In the BSD API these are the same except where it + * would be illegal to use them (multicast/broadcast) in + * which case the sending device address is used. + */ + lock_sock(sk); + + /* Check these errors (active socket, double bind). */ + err = -EINVAL; + if (sk->sk_state != TCP_CLOSE || inet->num) + goto out_release_sock; + + inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->saddr = 0; /* Use device */ + + /* Make sure we are allowed to bind here. */ + if (sk->sk_prot->get_port(sk, snum)) { + inet->saddr = inet->rcv_saddr = 0; + err = -EADDRINUSE; + goto out_release_sock; + } + + if (inet->rcv_saddr) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->sport = htons(inet->num); + inet->daddr = 0; + inet->dport = 0; + sk_dst_reset(sk); + err = 0; +out_release_sock: + release_sock(sk); +out: + return err; +} + +int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + + if (uaddr->sa_family == AF_UNSPEC) + return sk->sk_prot->disconnect(sk, flags); + + if (!inet_sk(sk)->num && inet_autobind(sk)) + return -EAGAIN; + return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len); +} + +static long inet_wait_for_connect(struct sock *sk, long timeo) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + /* Basic assumption: if someone sets sk->sk_err, he _must_ + * change state of the socket from TCP_SYN_*. + * Connect() does not allow to get error notifications + * without closing the socket. + */ + while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + if (signal_pending(current) || !timeo) + break; + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + } + finish_wait(sk->sk_sleep, &wait); + return timeo; +} + +/* + * Connect to a remote host. There is regrettably still a little + * TCP 'magic' in here. + */ +int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + int err; + long timeo; + + lock_sock(sk); + + if (uaddr->sa_family == AF_UNSPEC) { + err = sk->sk_prot->disconnect(sk, flags); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + goto out; + } + + switch (sock->state) { + default: + err = -EINVAL; + goto out; + case SS_CONNECTED: + err = -EISCONN; + goto out; + case SS_CONNECTING: + err = -EALREADY; + /* Fall out of switch with err, set for this state */ + break; + case SS_UNCONNECTED: + err = -EISCONN; + if (sk->sk_state != TCP_CLOSE) + goto out; + + err = sk->sk_prot->connect(sk, uaddr, addr_len); + if (err < 0) + goto out; + + sock->state = SS_CONNECTING; + + /* Just entered SS_CONNECTING state; the only + * difference is that return value in non-blocking + * case is EINPROGRESS, rather than EALREADY. 
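/*
 * As the comment above says, a non-blocking connect() reports EINPROGRESS
 * and the caller later learns the outcome by waiting for writability and
 * reading SO_ERROR.  A userspace sketch of that pattern (the 5 second
 * timeout is an arbitrary choice):
 */
#include <errno.h>
#include <poll.h>
#include <sys/socket.h>

static int finish_nonblocking_connect(int fd, const struct sockaddr *sa,
				      socklen_t salen)
{
	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
	int err = 0;
	socklen_t len = sizeof(err);

	if (connect(fd, sa, salen) == 0)
		return 0;			/* connected immediately */
	if (errno != EINPROGRESS)
		return -1;			/* immediate failure */

	if (poll(&pfd, 1, 5000) <= 0)
		return -1;			/* timeout or poll error */
	if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0 || err)
		return -1;			/* connect failed asynchronously */
	return 0;
}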
+ */ + err = -EINPROGRESS; + break; + } + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Error code is set above */ + if (!timeo || !inet_wait_for_connect(sk, timeo)) + goto out; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + } + + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + */ + if (sk->sk_state == TCP_CLOSE) + goto sock_error; + + /* sk->sk_err may be not zero now, if RECVERR was ordered by user + * and error was received after socket entered established state. + * Hence, it is handled normally after connect() return successfully. + */ + + sock->state = SS_CONNECTED; + err = 0; +out: + release_sock(sk); + return err; + +sock_error: + err = sock_error(sk) ? : -ECONNABORTED; + sock->state = SS_UNCONNECTED; + if (sk->sk_prot->disconnect(sk, flags)) + sock->state = SS_DISCONNECTING; + goto out; +} + +/* + * Accept a pending connection. The TCP layer now gives BSD semantics. + */ + +int inet_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk1 = sock->sk; + int err = -EINVAL; + struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err); + + if (!sk2) + goto do_err; + + lock_sock(sk2); + + BUG_TRAP((1 << sk2->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)); + + sock_graft(sk2, newsock); + + newsock->state = SS_CONNECTED; + err = 0; + release_sock(sk2); +do_err: + return err; +} + + +/* + * This does both peername and sockname. + */ +int inet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + + sin->sin_family = AF_INET; + if (peer) { + if (!inet->dport || + (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1)) + return -ENOTCONN; + sin->sin_port = inet->dport; + sin->sin_addr.s_addr = inet->daddr; + } else { + __u32 addr = inet->rcv_saddr; + if (!addr) + addr = inet->saddr; + sin->sin_port = inet->sport; + sin->sin_addr.s_addr = addr; + } + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + *uaddr_len = sizeof(*sin); + return 0; +} + +int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, + size_t size) +{ + struct sock *sk = sock->sk; + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && inet_autobind(sk)) + return -EAGAIN; + + return sk->sk_prot->sendmsg(iocb, sk, msg, size); +} + + +static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) +{ + struct sock *sk = sock->sk; + + /* We may need to bind the socket. */ + if (!inet_sk(sk)->num && inet_autobind(sk)) + return -EAGAIN; + + if (sk->sk_prot->sendpage) + return sk->sk_prot->sendpage(sk, page, offset, size, flags); + return sock_no_sendpage(sock, page, offset, size, flags); +} + + +int inet_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int err = 0; + + /* This should really check to make sure + * the socket is a TCP socket. (WHY AC...) + */ + how++; /* maps 0->1 has the advantage of making bit 1 rcvs and + 1->2 bit 2 snds. 
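/*
 * The increment above turns the userspace constants SHUT_RD (0),
 * SHUT_WR (1) and SHUT_RDWR (2) into the bitmasks 1 (receive side shut
 * down), 2 (send side shut down) and 3 (both) that sk_shutdown stores,
 * which is what the RCV_SHUTDOWN test further down relies on.  Spelled
 * out:
 */
#include <sys/socket.h>	/* SHUT_RD, SHUT_WR, SHUT_RDWR */

/* SHUT_RD (0) -> 0x1, SHUT_WR (1) -> 0x2, SHUT_RDWR (2) -> 0x3 */
static unsigned int shutdown_mask(int how)
{
	return (unsigned int)how + 1;
}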
+ 2->3 */ + if ((how & ~SHUTDOWN_MASK) || !how) /* MAXINT->0 */ + return -EINVAL; + + lock_sock(sk); + if (sock->state == SS_CONNECTING) { + if ((1 << sk->sk_state) & + (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_CLOSE)) + sock->state = SS_DISCONNECTING; + else + sock->state = SS_CONNECTED; + } + + switch (sk->sk_state) { + case TCP_CLOSE: + err = -ENOTCONN; + /* Hack to wake up other listeners, who can poll for + POLLHUP, even on eg. unconnected UDP sockets -- RR */ + default: + sk->sk_shutdown |= how; + if (sk->sk_prot->shutdown) + sk->sk_prot->shutdown(sk, how); + break; + + /* Remaining two branches are temporary solution for missing + * close() in multithreaded environment. It is _not_ a good idea, + * but we have no choice until close() is repaired at VFS level. + */ + case TCP_LISTEN: + if (!(how & RCV_SHUTDOWN)) + break; + /* Fall through */ + case TCP_SYN_SENT: + err = sk->sk_prot->disconnect(sk, O_NONBLOCK); + sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; + break; + } + + /* Wake up anyone sleeping in poll. */ + sk->sk_state_change(sk); + release_sock(sk); + return err; +} + +/* + * ioctl() calls you can issue on an INET socket. Most of these are + * device configuration and stuff and very rarely used. Some ioctls + * pass on to the socket itself. + * + * NOTE: I like the idea of a module for the config stuff. ie ifconfig + * loads the devconfigure module does its configuring and unloads it. + * There's a good 20K of config code hanging around the kernel. + */ + +int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = 0; + + switch (cmd) { + case SIOCGSTAMP: + err = sock_get_timestamp(sk, (struct timeval __user *)arg); + break; + case SIOCADDRT: + case SIOCDELRT: + case SIOCRTMSG: + err = ip_rt_ioctl(cmd, (void __user *)arg); + break; + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + err = arp_ioctl(cmd, (void __user *)arg); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFPFLAGS: + case SIOCGIFPFLAGS: + case SIOCSIFFLAGS: + err = devinet_ioctl(cmd, (void __user *)arg); + break; + default: + if (!sk->sk_prot->ioctl || + (err = sk->sk_prot->ioctl(sk, cmd, arg)) == + -ENOIOCTLCMD) + err = dev_ioctl(cmd, (void __user *)arg); + break; + } + return err; +} + +struct proto_ops inet_stream_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, + .poll = tcp_poll, + .ioctl = inet_ioctl, + .listen = inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = tcp_sendpage +}; + +struct proto_ops inet_dgram_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = inet_getname, + .poll = udp_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +}; + +/* + * For SOCK_RAW sockets; 
should be the same as inet_dgram_ops but without + * udp_poll + */ +static struct proto_ops inet_sockraw_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = inet_getname, + .poll = datagram_poll, + .ioctl = inet_ioctl, + .listen = sock_no_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = inet_sendpage, +}; + +static struct net_proto_family inet_family_ops = { + .family = PF_INET, + .create = inet_create, + .owner = THIS_MODULE, +}; + + +extern void tcp_init(void); +extern void tcp_v4_init(struct net_proto_family *); + +/* Upon startup we insert all the elements in inetsw_array[] into + * the linked list inetsw. + */ +static struct inet_protosw inetsw_array[] = +{ + { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcp_prot, + .ops = &inet_stream_ops, + .capability = -1, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT, + }, + + { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &udp_prot, + .ops = &inet_dgram_ops, + .capability = -1, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, + }, + + + { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &raw_prot, + .ops = &inet_sockraw_ops, + .capability = CAP_NET_RAW, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, + } +}; + +#define INETSW_ARRAY_LEN (sizeof(inetsw_array) / sizeof(struct inet_protosw)) + +void inet_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + int protocol = p->protocol; + struct list_head *last_perm; + + spin_lock_bh(&inetsw_lock); + + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + last_perm = &inetsw[p->type]; + list_for_each(lh, &inetsw[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. 
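+ * As an illustrative example (IPPROTO_MYPROTO and my_prot are
+ * hypothetical names, not real kernel symbols): with the permanent
+ * SOCK_STREAM/IPPROTO_TCP entry from inetsw_array already on the
+ * list, a module doing
+ *
+ *	static struct inet_protosw my_protosw = {
+ *		.type       = SOCK_STREAM,
+ *		.protocol   = IPPROTO_MYPROTO,
+ *		.prot       = &my_prot,
+ *		.ops        = &inet_stream_ops,
+ *		.capability = -1,
+ *	};
+ *	inet_register_protosw(&my_protosw);
+ *
+ * gets linked after that permanent entry, so inet_create() still
+ * resolves IPPROTO_TCP and wild-card requests to TCP first, and a
+ * later inet_unregister_protosw(&my_protosw) restores the old list.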
+ */ + list_add_rcu(&p->list, last_perm); +out: + spin_unlock_bh(&inetsw_lock); + + synchronize_net(); + + return; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} + +void inet_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw_lock); + + synchronize_net(); + } +} + +#ifdef CONFIG_IP_MULTICAST +static struct net_protocol igmp_protocol = { + .handler = igmp_rcv, +}; +#endif + +static struct net_protocol tcp_protocol = { + .handler = tcp_v4_rcv, + .err_handler = tcp_v4_err, + .no_policy = 1, +}; + +static struct net_protocol udp_protocol = { + .handler = udp_rcv, + .err_handler = udp_err, + .no_policy = 1, +}; + +static struct net_protocol icmp_protocol = { + .handler = icmp_rcv, +}; + +static int __init init_ipv4_mibs(void) +{ + net_statistics[0] = alloc_percpu(struct linux_mib); + net_statistics[1] = alloc_percpu(struct linux_mib); + ip_statistics[0] = alloc_percpu(struct ipstats_mib); + ip_statistics[1] = alloc_percpu(struct ipstats_mib); + icmp_statistics[0] = alloc_percpu(struct icmp_mib); + icmp_statistics[1] = alloc_percpu(struct icmp_mib); + tcp_statistics[0] = alloc_percpu(struct tcp_mib); + tcp_statistics[1] = alloc_percpu(struct tcp_mib); + udp_statistics[0] = alloc_percpu(struct udp_mib); + udp_statistics[1] = alloc_percpu(struct udp_mib); + if (! + (net_statistics[0] && net_statistics[1] && ip_statistics[0] + && ip_statistics[1] && tcp_statistics[0] && tcp_statistics[1] + && udp_statistics[0] && udp_statistics[1])) + return -ENOMEM; + + (void) tcp_mib_init(); + + return 0; +} + +static int ipv4_proc_init(void); +extern void ipfrag_init(void); + +static int __init inet_init(void) +{ + struct sk_buff *dummy_skb; + struct inet_protosw *q; + struct list_head *r; + int rc = -EINVAL; + + if (sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb)) { + printk(KERN_CRIT "%s: panic\n", __FUNCTION__); + goto out; + } + + rc = proto_register(&tcp_prot, 1); + if (rc) + goto out; + + rc = proto_register(&udp_prot, 1); + if (rc) + goto out_unregister_tcp_proto; + + rc = proto_register(&raw_prot, 1); + if (rc) + goto out_unregister_udp_proto; + + /* + * Tell SOCKET that we are alive... + */ + + (void)sock_register(&inet_family_ops); + + /* + * Add all the base protocols. + */ + + if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add ICMP protocol\n"); + if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0) + printk(KERN_CRIT "inet_init: Cannot add UDP protocol\n"); + if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0) + printk(KERN_CRIT "inet_init: Cannot add TCP protocol\n"); +#ifdef CONFIG_IP_MULTICAST + if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0) + printk(KERN_CRIT "inet_init: Cannot add IGMP protocol\n"); +#endif + + /* Register the socket-side information for inet_create. */ + for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q) + inet_register_protosw(q); + + /* + * Set the ARP module up + */ + + arp_init(); + + /* + * Set the IP module up + */ + + ip_init(); + + tcp_v4_init(&inet_family_ops); + + /* Setup TCP slab cache for open requests. 
*/ + tcp_init(); + + + /* + * Set the ICMP layer up + */ + + icmp_init(&inet_family_ops); + + /* + * Initialise the multicast router + */ +#if defined(CONFIG_IP_MROUTE) + ip_mr_init(); +#endif + /* + * Initialise per-cpu ipv4 mibs + */ + + if(init_ipv4_mibs()) + printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); + + ipv4_proc_init(); + + ipfrag_init(); + + rc = 0; +out: + return rc; +out_unregister_udp_proto: + proto_unregister(&udp_prot); +out_unregister_tcp_proto: + proto_unregister(&tcp_prot); + goto out; +} + +module_init(inet_init); + +/* ------------------------------------------------------------------------ */ + +#ifdef CONFIG_PROC_FS +extern int fib_proc_init(void); +extern void fib_proc_exit(void); +extern int ip_misc_proc_init(void); +extern int raw_proc_init(void); +extern void raw_proc_exit(void); +extern int tcp4_proc_init(void); +extern void tcp4_proc_exit(void); +extern int udp4_proc_init(void); +extern void udp4_proc_exit(void); + +static int __init ipv4_proc_init(void) +{ + int rc = 0; + + if (raw_proc_init()) + goto out_raw; + if (tcp4_proc_init()) + goto out_tcp; + if (udp4_proc_init()) + goto out_udp; + if (fib_proc_init()) + goto out_fib; + if (ip_misc_proc_init()) + goto out_misc; +out: + return rc; +out_misc: + fib_proc_exit(); +out_fib: + udp4_proc_exit(); +out_udp: + tcp4_proc_exit(); +out_tcp: + raw_proc_exit(); +out_raw: + rc = -ENOMEM; + goto out; +} + +#else /* CONFIG_PROC_FS */ +static int __init ipv4_proc_init(void) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +MODULE_ALIAS_NETPROTO(PF_INET); + +EXPORT_SYMBOL(inet_accept); +EXPORT_SYMBOL(inet_bind); +EXPORT_SYMBOL(inet_dgram_connect); +EXPORT_SYMBOL(inet_dgram_ops); +EXPORT_SYMBOL(inet_getname); +EXPORT_SYMBOL(inet_ioctl); +EXPORT_SYMBOL(inet_listen); +EXPORT_SYMBOL(inet_register_protosw); +EXPORT_SYMBOL(inet_release); +EXPORT_SYMBOL(inet_sendmsg); +EXPORT_SYMBOL(inet_shutdown); +EXPORT_SYMBOL(inet_sock_destruct); +EXPORT_SYMBOL(inet_stream_connect); +EXPORT_SYMBOL(inet_stream_ops); +EXPORT_SYMBOL(inet_unregister_protosw); +EXPORT_SYMBOL(net_statistics); + +#ifdef INET_REFCNT_DEBUG +EXPORT_SYMBOL(inet_sock_nr); +#endif diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c new file mode 100644 index 000000000000..0e98f2235b6e --- /dev/null +++ b/net/ipv4/ah4.c @@ -0,0 +1,335 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Clear mutable options and find final destination to substitute + * into IP header for icv calculation. Options are already checked + * for validity, so paranoia is not required. */ + +static int ip_clear_mutable_options(struct iphdr *iph, u32 *daddr) +{ + unsigned char * optptr = (unsigned char*)(iph+1); + int l = iph->ihl*4 - sizeof(struct iphdr); + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return 0; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return -EINVAL; + switch (*optptr) { + case IPOPT_SEC: + case 0x85: /* Some "Extended Security" crap. */ + case 0x86: /* Another "Commercial Security" crap. 
*/ + case IPOPT_RA: + case 0x80|21: /* RFC1770 */ + break; + case IPOPT_LSRR: + case IPOPT_SSRR: + if (optlen < 6) + return -EINVAL; + memcpy(daddr, optptr+optlen-4, 4); + /* Fall through */ + default: + memset(optptr+2, 0, optlen-2); + } + l -= optlen; + optptr += optlen; + } + return 0; +} + +static int ah_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct iphdr *iph, *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + top_iph = skb->nh.iph; + iph = &tmp_iph.iph; + + iph->tos = top_iph->tos; + iph->ttl = top_iph->ttl; + iph->frag_off = top_iph->frag_off; + + if (top_iph->ihl != 5) { + iph->daddr = top_iph->daddr; + memcpy(iph+1, top_iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + err = ip_clear_mutable_options(top_iph, &top_iph->daddr); + if (err) + goto error; + } + + ah = (struct ip_auth_hdr *)((char *)top_iph+top_iph->ihl*4); + ah->nexthdr = top_iph->protocol; + + top_iph->tos = 0; + top_iph->tot_len = htons(skb->len); + top_iph->frag_off = 0; + top_iph->ttl = 0; + top_iph->protocol = IPPROTO_AH; + top_iph->check = 0; + + ahp = x->data; + ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(++x->replay.oseq); + ahp->icv(ahp, skb, ah->auth_data); + + top_iph->tos = iph->tos; + top_iph->ttl = iph->ttl; + top_iph->frag_off = iph->frag_off; + if (top_iph->ihl != 5) { + top_iph->daddr = iph->daddr; + memcpy(top_iph+1, iph+1, top_iph->ihl*4 - sizeof(struct iphdr)); + } + + ip_send_check(top_iph); + + err = 0; + +error: + return err; +} + +static int ah_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + int ah_hlen; + struct iphdr *iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + char work_buf[60]; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + ah = (struct ip_auth_hdr*)skb->data; + ahp = x->data; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. 
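+ * In outline, the verification below follows RFC 2402 (informal
+ * summary, not a normative restatement): keep a private copy of the
+ * IP header, zero the mutable fields (ttl, tos, frag_off, checksum
+ * and any mutable options), recompute the keyed digest over the
+ * packet with the ICV field treated as zero, and compare the first
+ * icv_trunc_len bytes against ah->auth_data before stripping the AH
+ * header and restoring the inner protocol. For the usual
+ * HMAC-SHA1-96 transform the truncated check above accepts only
+ * ah_hlen == XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + 12) == 24
+ * bytes, i.e. ah->hdrlen == 4.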
*/ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + ah = (struct ip_auth_hdr*)skb->data; + iph = skb->nh.iph; + + memcpy(work_buf, iph, iph->ihl*4); + + iph->ttl = 0; + iph->tos = 0; + iph->frag_off = 0; + iph->check = 0; + if (iph->ihl != 5) { + u32 dummy; + if (ip_clear_mutable_options(iph, &dummy)) + goto out; + } + { + u8 auth_data[MAX_AH_AUTH_LEN]; + + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + skb_push(skb, skb->data - skb->nh.raw); + ahp->icv(ahp, skb, ah->auth_data); + if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { + x->stats.integrity_failed++; + goto out; + } + } + ((struct iphdr*)work_buf)->protocol = ah->nexthdr; + skb->nh.raw = skb_pull(skb, ah_hlen); + memcpy(skb->nh.raw, work_buf, iph->ihl*4); + skb->nh.iph->tot_len = htons(skb->len); + skb_pull(skb, skb->nh.iph->ihl*4); + skb->h.raw = skb->data; + + return 0; + +out: + return -EINVAL; +} + +static void ah4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/%08x\n", + ntohl(ah->spi), ntohl(iph->daddr)); + xfrm_state_put(x); +} + +static int ah_init_state(struct xfrm_state *x, void *args) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + + if (!x->aalg) + goto error; + + /* null auth can use a zero length key */ + if (x->aalg->alg_key_len > 512) + goto error; + + if (x->encap) + goto error; + + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + memset(ahp, 0, sizeof(*ahp)); + + ahp->key = x->aalg->alg_key; + ahp->key_len = (x->aalg->alg_key_len+7)/8; + ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (!ahp->tfm) + goto error; + ahp->icv = ah_hmac_digest; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_tfm(). 
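+ * As a worked example (purely illustrative, using the common
+ * HMAC-SHA1-96 parameters): alg_name "sha1" gives icv_fullbits = 160
+ * and icv_truncbits = 96, so icv_full_len = 20, icv_trunc_len = 12,
+ * and x->props.header_len below becomes
+ * XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + 12) = XFRM_ALIGN8(24) = 24
+ * bytes, plus another sizeof(struct iphdr) = 20 bytes in tunnel mode.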
+ */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(ahp->tfm)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); + if (!ahp->work_icv) + goto error; + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ip_auth_hdr) + ahp->icv_trunc_len); + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + if (ahp->work_icv) + kfree(ahp->work_icv); + if (ahp->tfm) + crypto_free_tfm(ahp->tfm); + kfree(ahp); + } + return -EINVAL; +} + +static void ah_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + if (ahp->work_icv) { + kfree(ahp->work_icv); + ahp->work_icv = NULL; + } + if (ahp->tfm) { + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; + } + kfree(ahp); +} + + +static struct xfrm_type ah_type = +{ + .description = "AH4", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .init_state = ah_init_state, + .destructor = ah_destroy, + .input = ah_input, + .output = ah_output +}; + +static struct net_protocol ah4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ah4_err, + .no_policy = 1, +}; + +static int __init ah4_init(void) +{ + if (xfrm_register_type(&ah_type, AF_INET) < 0) { + printk(KERN_INFO "ip ah init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ah4_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ip ah init: can't add protocol\n"); + xfrm_unregister_type(&ah_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ah4_fini(void) +{ + if (inet_del_protocol(&ah4_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ip ah close: can't remove protocol\n"); + if (xfrm_unregister_type(&ah_type, AF_INET) < 0) + printk(KERN_INFO "ip ah close: can't remove xfrm type\n"); +} + +module_init(ah4_init); +module_exit(ah4_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c new file mode 100644 index 000000000000..a642fd612853 --- /dev/null +++ b/net/ipv4/arp.c @@ -0,0 +1,1425 @@ +/* linux/net/inet/arp.c + * + * Version: $Id: arp.c,v 1.99 2001/08/30 22:55:42 davem Exp $ + * + * Copyright (C) 1994 by Florian La Roche + * + * This module implements the Address Resolution Protocol ARP (RFC 826), + * which is used to convert IP addresses (or in the future maybe other + * high-level addresses) into a low-level hardware address (like an Ethernet + * address). + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Alan Cox : Removed the Ethernet assumptions in + * Florian's code + * Alan Cox : Fixed some small errors in the ARP + * logic + * Alan Cox : Allow >4K in /proc + * Alan Cox : Make ARP add its own protocol entry + * Ross Martin : Rewrote arp_rcv() and arp_get_info() + * Stephen Henson : Add AX25 support to arp_get_info() + * Alan Cox : Drop data when a device is downed. + * Alan Cox : Use init_timer(). + * Alan Cox : Double lock fixes. 
+ * Martin Seine : Move the arphdr structure + * to if_arp.h for compatibility. + * with BSD based programs. + * Andrew Tridgell : Added ARP netmask code and + * re-arranged proxy handling. + * Alan Cox : Changed to use notifiers. + * Niibe Yutaka : Reply for this device or proxies only. + * Alan Cox : Don't proxy across hardware types! + * Jonathan Naylor : Added support for NET/ROM. + * Mike Shaver : RFC1122 checks. + * Jonathan Naylor : Only lookup the hardware address for + * the correct hardware type. + * Germano Caronni : Assorted subtle races. + * Craig Schlenter : Don't modify permanent entry + * during arp_rcv. + * Russ Nelson : Tidied up a few bits. + * Alexey Kuznetsov: Major changes to caching and behaviour, + * eg intelligent arp probing and + * generation + * of host down events. + * Alan Cox : Missing unlock in device events. + * Eckes : ARP ioctl control errors. + * Alexey Kuznetsov: Arp free fix. + * Manuel Rodriguez: Gratuitous ARP. + * Jonathan Layes : Added arpd support through kerneld + * message queue (960314) + * Mike Shaver : /proc/sys/net/ipv4/arp_* support + * Mike McLagan : Routing by source + * Stuart Cheshire : Metricom and grat arp fixes + * *** FOR 2.1 clean this up *** + * Lawrence V. Stefani: (08/12/96) Added FDDI support. + * Alan Cox : Took the AP1000 nasty FDDI hack and + * folded into the mainstream FDDI code. + * Ack spit, Linus how did you allow that + * one in... + * Jes Sorensen : Make FDDI work again in 2.1.x and + * clean up the APFDDI & gen. FDDI bits. + * Alexey Kuznetsov: new arp state machine; + * now it is in net/core/neighbour.c. + * Krzysztof Halasa: Added Frame Relay ARP support. + * Arnaldo C. Melo : convert /proc/net/arp to seq_file + * Shmulik Hen: Split arp_send to arp_create and + * arp_xmit so intermediate drivers like + * bonding can change the skb before + * sending (e.g. insert 8021q tag). + * Harald Welte : convert to make use of jenkins hash + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) +#include +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) +#include +#endif +#endif +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) +#include +struct neigh_table *clip_tbl_hook; +#endif + +#include +#include + +#include + +/* + * Interface to generic neighbour cache. 
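+ * Informally, the glue works roughly like this (summary only):
+ *
+ *	entry not yet NUD_VALID:  ->output == neigh_resolve_output(),
+ *	                          which queues the skb and calls
+ *	                          ->solicit() == arp_solicit() to send
+ *	                          an ARPOP_REQUEST;
+ *	entry reachable:          ->connected_output() or, once a
+ *	                          hardware header is cached, ->hh_output
+ *	                          == dev_queue_xmit() directly;
+ *	resolution failed:        ->error_report() == arp_error_report(),
+ *	                          which drops the queued skb and signals
+ *	                          a link failure on its dst.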
+ */ +static u32 arp_hash(const void *pkey, const struct net_device *dev); +static int arp_constructor(struct neighbour *neigh); +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); +static void parp_redo(struct sk_buff *skb); + +static struct neigh_ops arp_generic_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static struct neigh_ops arp_hh_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static struct neigh_ops arp_direct_ops = { + .family = AF_INET, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_ops arp_broken_ops = { + .family = AF_INET, + .solicit = arp_solicit, + .error_report = arp_error_report, + .output = neigh_compat_output, + .connected_output = neigh_compat_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_table arp_tbl = { + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +int arp_mc_map(u32 addr, u8 *haddr, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802: + ip_eth_mc_map(addr, haddr); + return 0; + case ARPHRD_IEEE802_TR: + ip_tr_mc_map(addr, haddr); + return 0; + case ARPHRD_INFINIBAND: + ip_ib_mc_map(addr, haddr); + return 0; + default: + if (dir) { + memcpy(haddr, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + + +static u32 arp_hash(const void *pkey, const struct net_device *dev) +{ + return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); +} + +static int arp_constructor(struct neighbour *neigh) +{ + u32 addr = *(u32*)neigh->primary_key; + struct net_device *dev = neigh->dev; + struct in_device *in_dev; + struct neigh_parms *parms; + + neigh->type = inet_addr_type(addr); + + rcu_read_lock(); + in_dev = rcu_dereference(__in_dev_get(dev)); + if (in_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in_dev->arp_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &arp_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + /* Good devices (checked by reading texts, but only Ethernet is + tested) + + ARPHRD_ETHER: (ethernet, apfddi) + ARPHRD_FDDI: (fddi) + ARPHRD_IEEE802: (tr) + ARPHRD_METRICOM: (strip) + ARPHRD_ARCNET: + etc. etc. etc. + + ARPHRD_IPDDP will also work, if author repairs it. 
+ I did not it, because this driver does not work even + in old paradigm. + */ + +#if 1 + /* So... these "amateur" devices are hopeless. + The only thing, that I can say now: + It is very sad that we need to keep ugly obsolete + code to make them happy. + + They should be moved to more reasonable state, now + they use rebuild_header INSTEAD OF hard_start_xmit!!! + Besides that, they are sort of out of date + (a lot of redundant clones/copies, useless in 2.1), + I wonder why people believe that they work. + */ + switch (dev->type) { + default: + break; + case ARPHRD_ROSE: +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: +#endif + neigh->ops = &arp_broken_ops; + neigh->output = neigh->ops->output; + return 0; +#endif + ;} +#endif + if (neigh->type == RTN_MULTICAST) { + neigh->nud_state = NUD_NOARP; + arp_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->hard_header_cache) + neigh->ops = &arp_hh_ops; + else + neigh->ops = &arp_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + return 0; +} + +static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + dst_link_failure(skb); + kfree_skb(skb); +} + +static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + u32 saddr = 0; + u8 *dst_ha = NULL; + struct net_device *dev = neigh->dev; + u32 target = *(u32*)neigh->primary_key; + int probes = atomic_read(&neigh->probes); + struct in_device *in_dev = in_dev_get(dev); + + if (!in_dev) + return; + + switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { + default: + case 0: /* By default announce any local IP */ + if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) + saddr = skb->nh.iph->saddr; + break; + case 1: /* Restrict announcements of saddr in same subnet */ + if (!skb) + break; + saddr = skb->nh.iph->saddr; + if (inet_addr_type(saddr) == RTN_LOCAL) { + /* saddr should be known to target */ + if (inet_addr_onlink(in_dev, target, saddr)) + break; + } + saddr = 0; + break; + case 2: /* Avoid secondary IPs, get a primary/preferred one */ + break; + } + + if (in_dev) + in_dev_put(in_dev); + if (!saddr) + saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state&NUD_VALID)) + printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + dst_ha = neigh->ha; + read_lock_bh(&neigh->lock); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + return; + } + + arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, + dst_ha, dev->dev_addr, NULL); + if (dst_ha) + read_unlock_bh(&neigh->lock); +} + +static int arp_ignore(struct in_device *in_dev, struct net_device *dev, + u32 sip, u32 tip) +{ + int scope; + + switch (IN_DEV_ARP_IGNORE(in_dev)) { + case 0: /* Reply, the tip is already validated */ + return 0; + case 1: /* Reply only if tip is configured on the incoming interface */ + sip = 0; + scope = RT_SCOPE_HOST; + break; + case 2: /* + * Reply only if tip is configured on the incoming interface + * and is in same subnet as sip + */ + scope = RT_SCOPE_HOST; + 
break; + case 3: /* Do not reply for scope host addresses */ + sip = 0; + scope = RT_SCOPE_LINK; + dev = NULL; + break; + case 4: /* Reserved */ + case 5: + case 6: + case 7: + return 0; + case 8: /* Do not reply */ + return 1; + default: + return 0; + } + return !inet_confirm_addr(dev, sip, tip, scope); +} + +static int arp_filter(__u32 sip, __u32 tip, struct net_device *dev) +{ + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = sip, + .saddr = tip } } }; + struct rtable *rt; + int flag = 0; + /*unsigned long now; */ + + if (ip_route_output_key(&rt, &fl) < 0) + return 1; + if (rt->u.dst.dev != dev) { + NET_INC_STATS_BH(LINUX_MIB_ARPFILTER); + flag = 1; + } + ip_rt_put(rt); + return flag; +} + +/* OBSOLETE FUNCTIONS */ + +/* + * Find an arp mapping in the cache. If not found, post a request. + * + * It is very UGLY routine: it DOES NOT use skb->dst->neighbour, + * even if it exists. It is supposed that skb->dev was mangled + * by a virtual device (eql, shaper). Nobody but broken devices + * is allowed to use this function, it is scheduled to be removed. --ANK + */ + +static int arp_set_predefined(int addr_hint, unsigned char * haddr, u32 paddr, struct net_device * dev) +{ + switch (addr_hint) { + case RTN_LOCAL: + printk(KERN_DEBUG "ARP: arp called for own IP address\n"); + memcpy(haddr, dev->dev_addr, dev->addr_len); + return 1; + case RTN_MULTICAST: + arp_mc_map(paddr, haddr, dev, 1); + return 1; + case RTN_BROADCAST: + memcpy(haddr, dev->broadcast, dev->addr_len); + return 1; + } + return 0; +} + + +int arp_find(unsigned char *haddr, struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + u32 paddr; + struct neighbour *n; + + if (!skb->dst) { + printk(KERN_DEBUG "arp_find is called with dst==NULL\n"); + kfree_skb(skb); + return 1; + } + + paddr = ((struct rtable*)skb->dst)->rt_gateway; + + if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) + return 0; + + n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); + + if (n) { + n->used = jiffies; + if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { + read_lock_bh(&n->lock); + memcpy(haddr, n->ha, dev->addr_len); + read_unlock_bh(&n->lock); + neigh_release(n); + return 0; + } + neigh_release(n); + } else + kfree_skb(skb); + return 1; +} + +/* END OF OBSOLETE FUNCTIONS */ + +int arp_bind_neighbour(struct dst_entry *dst) +{ + struct net_device *dev = dst->dev; + struct neighbour *n = dst->neighbour; + + if (dev == NULL) + return -EINVAL; + if (n == NULL) { + u32 nexthop = ((struct rtable*)dst)->rt_gateway; + if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + nexthop = 0; + n = __neigh_lookup_errno( +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) + dev->type == ARPHRD_ATM ? clip_tbl_hook : +#endif + &arp_tbl, &nexthop, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + dst->neighbour = n; + } + return 0; +} + +/* + * Check if we can use proxy ARP for this path + */ + +static inline int arp_fwd_proxy(struct in_device *in_dev, struct rtable *rt) +{ + struct in_device *out_dev; + int imi, omi = -1; + + if (!IN_DEV_PROXY_ARP(in_dev)) + return 0; + + if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) + return 1; + if (imi == -1) + return 0; + + /* place to check for proxy_arp for routes */ + + if ((out_dev = in_dev_get(rt->u.dst.dev)) != NULL) { + omi = IN_DEV_MEDIUM_ID(out_dev); + in_dev_put(out_dev); + } + return (omi != imi && omi != -1); +} + +/* + * Interface to link layer: send routine and receive handler. + */ + +/* + * Create an arp packet. If (dest_hw == NULL), we create a broadcast + * message. 
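+ * For the common Ethernet/IPv4 case the payload built here is the
+ * 28-byte RFC 826 layout (shown only as an illustration):
+ *
+ *	ar_hrd = htons(ARPHRD_ETHER)          2 bytes
+ *	ar_pro = htons(ETH_P_IP)              2 bytes
+ *	ar_hln = 6, ar_pln = 4                2 bytes
+ *	ar_op  = htons(type)                  2 bytes
+ *	sender hardware address               6 bytes
+ *	sender IP address                     4 bytes
+ *	target hardware address               6 bytes
+ *	target IP address                     4 bytes
+ *
+ * which is exactly the skb_put() size used below:
+ * sizeof(struct arphdr) + 2 * (dev->addr_len + 4) = 8 + 20 = 28.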
+ */ +struct sk_buff *arp_create(int type, int ptype, u32 dest_ip, + struct net_device *dev, u32 src_ip, + unsigned char *dest_hw, unsigned char *src_hw, + unsigned char *target_hw) +{ + struct sk_buff *skb; + struct arphdr *arp; + unsigned char *arp_ptr; + + /* + * Allocate a buffer + */ + + skb = alloc_skb(sizeof(struct arphdr)+ 2*(dev->addr_len+4) + + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) + return NULL; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->nh.raw = skb->data; + arp = (struct arphdr *) skb_put(skb,sizeof(struct arphdr) + 2*(dev->addr_len+4)); + skb->dev = dev; + skb->protocol = htons(ETH_P_ARP); + if (src_hw == NULL) + src_hw = dev->dev_addr; + if (dest_hw == NULL) + dest_hw = dev->broadcast; + + /* + * Fill the device header for the ARP frame + */ + if (dev->hard_header && + dev->hard_header(skb,dev,ptype,dest_hw,src_hw,skb->len) < 0) + goto out; + + /* + * Fill out the arp protocol part. + * + * The arp hardware type should match the device type, except for FDDI, + * which (according to RFC 1390) should always equal 1 (Ethernet). + */ + /* + * Exceptions everywhere. AX.25 uses the AX.25 PID value not the + * DIX code for the protocol. Make these device structure fields. + */ + switch (dev->type) { + default: + arp->ar_hrd = htons(dev->type); + arp->ar_pro = htons(ETH_P_IP); + break; + +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + arp->ar_hrd = htons(ARPHRD_AX25); + arp->ar_pro = htons(AX25_P_IP); + break; + +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + arp->ar_hrd = htons(ARPHRD_NETROM); + arp->ar_pro = htons(AX25_P_IP); + break; +#endif +#endif + +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + arp->ar_hrd = htons(ARPHRD_ETHER); + arp->ar_pro = htons(ETH_P_IP); + break; +#endif +#ifdef CONFIG_TR + case ARPHRD_IEEE802_TR: + arp->ar_hrd = htons(ARPHRD_IEEE802); + arp->ar_pro = htons(ETH_P_IP); + break; +#endif + } + + arp->ar_hln = dev->addr_len; + arp->ar_pln = 4; + arp->ar_op = htons(type); + + arp_ptr=(unsigned char *)(arp+1); + + memcpy(arp_ptr, src_hw, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &src_ip,4); + arp_ptr+=4; + if (target_hw != NULL) + memcpy(arp_ptr, target_hw, dev->addr_len); + else + memset(arp_ptr, 0, dev->addr_len); + arp_ptr+=dev->addr_len; + memcpy(arp_ptr, &dest_ip, 4); + + return skb; + +out: + kfree_skb(skb); + return NULL; +} + +/* + * Send an arp packet. + */ +void arp_xmit(struct sk_buff *skb) +{ + /* Send it off, maybe filter it using firewalling first. */ + NF_HOOK(NF_ARP, NF_ARP_OUT, skb, NULL, skb->dev, dev_queue_xmit); +} + +/* + * Create and send an arp packet. + */ +void arp_send(int type, int ptype, u32 dest_ip, + struct net_device *dev, u32 src_ip, + unsigned char *dest_hw, unsigned char *src_hw, + unsigned char *target_hw) +{ + struct sk_buff *skb; + + /* + * No arp on this interface. + */ + + if (dev->flags&IFF_NOARP) + return; + + skb = arp_create(type, ptype, dest_ip, dev, src_ip, + dest_hw, src_hw, target_hw); + if (skb == NULL) { + return; + } + + arp_xmit(skb); +} + +static void parp_redo(struct sk_buff *skb) +{ + nf_reset(skb); + arp_rcv(skb, skb->dev, NULL); +} + +/* + * Process an arp request. 
+ */ + +static int arp_process(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct in_device *in_dev = in_dev_get(dev); + struct arphdr *arp; + unsigned char *arp_ptr; + struct rtable *rt; + unsigned char *sha, *tha; + u32 sip, tip; + u16 dev_type = dev->type; + int addr_type; + struct neighbour *n; + + /* arp_rcv below verifies the ARP header and verifies the device + * is ARP'able. + */ + + if (in_dev == NULL) + goto out; + + arp = skb->nh.arph; + + switch (dev_type) { + default: + if (arp->ar_pro != htons(ETH_P_IP) || + htons(dev_type) != arp->ar_hrd) + goto out; + break; +#ifdef CONFIG_NET_ETHERNET + case ARPHRD_ETHER: +#endif +#ifdef CONFIG_TR + case ARPHRD_IEEE802_TR: +#endif +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: +#endif +#ifdef CONFIG_NET_FC + case ARPHRD_IEEE802: +#endif +#if defined(CONFIG_NET_ETHERNET) || defined(CONFIG_TR) || \ + defined(CONFIG_FDDI) || defined(CONFIG_NET_FC) + /* + * ETHERNET, Token Ring and Fibre Channel (which are IEEE 802 + * devices, according to RFC 2625) devices will accept ARP + * hardware types of either 1 (Ethernet) or 6 (IEEE 802.2). + * This is the case also of FDDI, where the RFC 1390 says that + * FDDI devices should accept ARP hardware of (1) Ethernet, + * however, to be more robust, we'll accept both 1 (Ethernet) + * or 6 (IEEE 802.2) + */ + if ((arp->ar_hrd != htons(ARPHRD_ETHER) && + arp->ar_hrd != htons(ARPHRD_IEEE802)) || + arp->ar_pro != htons(ETH_P_IP)) + goto out; + break; +#endif +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + case ARPHRD_AX25: + if (arp->ar_pro != htons(AX25_P_IP) || + arp->ar_hrd != htons(ARPHRD_AX25)) + goto out; + break; +#if defined(CONFIG_NETROM) || defined(CONFIG_NETROM_MODULE) + case ARPHRD_NETROM: + if (arp->ar_pro != htons(AX25_P_IP) || + arp->ar_hrd != htons(ARPHRD_NETROM)) + goto out; + break; +#endif +#endif + } + + /* Understand only these message types */ + + if (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST)) + goto out; + +/* + * Extract fields + */ + arp_ptr= (unsigned char *)(arp+1); + sha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&sip, arp_ptr, 4); + arp_ptr += 4; + tha = arp_ptr; + arp_ptr += dev->addr_len; + memcpy(&tip, arp_ptr, 4); +/* + * Check for bad requests for 127.x.x.x and requests for multicast + * addresses. If this is one such, delete it. + */ + if (LOOPBACK(tip) || MULTICAST(tip)) + goto out; + +/* + * Special case: We must set Frame Relay source Q.922 address + */ + if (dev_type == ARPHRD_DLCI) + sha = dev->broadcast; + +/* + * Process entry. The idea here is we want to send a reply if it is a + * request for us or if it is a request for someone else that we hold + * a proxy for. We want to add an entry to our cache if it is a reply + * to us or if it is a request for our address. + * (The assumption for this last is that if someone is requesting our + * address, they are probably intending to talk to us, so it saves time + * if we cache their address. Their address is also probably not in + * our cache, since ours is not in their cache.) + * + * Putting this another way, we only care about replies if they are to + * us, in which case we add them to the cache. For requests, we care + * about those for us and those for our proxies. We reply to both, + * and in the case of requests for us we add the requester to the arp + * cache. 
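+ * Condensed into a table (an informal restatement of the above, not
+ * additional policy):
+ *
+ *	REQUEST for one of our addresses:      reply with dev->dev_addr,
+ *	                                       cache the sender;
+ *	REQUEST for a host we proxy or
+ *	forward for (proxy_arp/pneigh match):  reply, possibly delayed by
+ *	                                       proxy_delay, cache sender;
+ *	REPLY addressed to us:                 update/create the cache
+ *	                                       entry only;
+ *	anything else:                         drop silently.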
+ */ + + /* Special case: IPv4 duplicate address detection packet (RFC2131) */ + if (sip == 0) { + if (arp->ar_op == htons(ARPOP_REQUEST) && + inet_addr_type(tip) == RTN_LOCAL && + !arp_ignore(in_dev,dev,sip,tip)) + arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr); + goto out; + } + + if (arp->ar_op == htons(ARPOP_REQUEST) && + ip_route_input(skb, tip, sip, 0, dev) == 0) { + + rt = (struct rtable*)skb->dst; + addr_type = rt->rt_type; + + if (addr_type == RTN_LOCAL) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) { + int dont_send = 0; + + if (!dont_send) + dont_send |= arp_ignore(in_dev,dev,sip,tip); + if (!dont_send && IN_DEV_ARPFILTER(in_dev)) + dont_send |= arp_filter(sip,tip,dev); + if (!dont_send) + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + + neigh_release(n); + } + goto out; + } else if (IN_DEV_FORWARD(in_dev)) { + if ((rt->rt_flags&RTCF_DNAT) || + (addr_type == RTN_UNICAST && rt->u.dst.dev != dev && + (arp_fwd_proxy(in_dev, rt) || pneigh_lookup(&arp_tbl, &tip, dev, 0)))) { + n = neigh_event_ns(&arp_tbl, sha, &sip, dev); + if (n) + neigh_release(n); + + if (skb->stamp.tv_sec == LOCALLY_ENQUEUED || + skb->pkt_type == PACKET_HOST || + in_dev->arp_parms->proxy_delay == 0) { + arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + } else { + pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + in_dev_put(in_dev); + return 0; + } + goto out; + } + } + } + + /* Update our ARP tables */ + + n = __neigh_lookup(&arp_tbl, &sip, dev, 0); + +#ifdef CONFIG_IP_ACCEPT_UNSOLICITED_ARP + /* Unsolicited ARP is not accepted by default. + It is possible, that this option should be enabled for some + devices (strip is candidate) + */ + if (n == NULL && + arp->ar_op == htons(ARPOP_REPLY) && + inet_addr_type(sip) == RTN_UNICAST) + n = __neigh_lookup(&arp_tbl, &sip, dev, -1); +#endif + + if (n) { + int state = NUD_REACHABLE; + int override; + + /* If several different ARP replies follows back-to-back, + use the FIRST one. It is possible, if several proxy + agents are active. Taking the first reply prevents + arp trashing and chooses the fastest router. + */ + override = time_after(jiffies, n->updated + n->parms->locktime); + + /* Broadcast replies and request packets + do not assert neighbour reachability. + */ + if (arp->ar_op != htons(ARPOP_REPLY) || + skb->pkt_type != PACKET_HOST) + state = NUD_STALE; + neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_release(n); + } + +out: + if (in_dev) + in_dev_put(in_dev); + kfree_skb(skb); + return 0; +} + + +/* + * Receive an arp request from the device layer. + */ + +int arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct arphdr *arp; + + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ + if (!pskb_may_pull(skb, (sizeof(struct arphdr) + + (2 * dev->addr_len) + + (2 * sizeof(u32))))) + goto freeskb; + + arp = skb->nh.arph; + if (arp->ar_hln != dev->addr_len || + dev->flags & IFF_NOARP || + skb->pkt_type == PACKET_OTHERHOST || + skb->pkt_type == PACKET_LOOPBACK || + arp->ar_pln != 4) + goto freeskb; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out_of_mem; + + return NF_HOOK(NF_ARP, NF_ARP_IN, skb, dev, NULL, arp_process); + +freeskb: + kfree_skb(skb); +out_of_mem: + return 0; +} + +/* + * User level interface (ioctl) + */ + +/* + * Set (create) an ARP cache entry. 
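+ * A minimal user-space sketch of reaching this path (illustrative
+ * only; "eth0", the IP and the MAC are made-up values, and
+ * <net/if_arp.h>, <sys/ioctl.h>, <arpa/inet.h> are assumed):
+ *
+ *	struct arpreq r;
+ *	struct sockaddr_in *sin = (struct sockaddr_in *)&r.arp_pa;
+ *	int fd = socket(AF_INET, SOCK_DGRAM, 0);
+ *
+ *	memset(&r, 0, sizeof(r));
+ *	sin->sin_family = AF_INET;
+ *	inet_aton("192.168.0.42", &sin->sin_addr);
+ *	r.arp_ha.sa_family = ARPHRD_ETHER;
+ *	memcpy(r.arp_ha.sa_data, "\x00\x11\x22\x33\x44\x55", 6);
+ *	r.arp_flags = ATF_PERM | ATF_COM;
+ *	strncpy(r.arp_dev, "eth0", sizeof(r.arp_dev));
+ *	ioctl(fd, SIOCSARP, &r);
+ *
+ * (SIOCSARP and SIOCDARP require CAP_NET_ADMIN, as checked in
+ * arp_ioctl() above.)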
+ */ + +static int arp_req_set(struct arpreq *r, struct net_device * dev) +{ + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err; + + if (r->arp_flags&ATF_PUBL) { + u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; + if (mask && mask != 0xFFFFFFFF) + return -EINVAL; + if (!dev && (r->arp_flags & ATF_COM)) { + dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); + if (!dev) + return -ENODEV; + } + if (mask) { + if (pneigh_lookup(&arp_tbl, &ip, dev, 1) == NULL) + return -ENOBUFS; + return 0; + } + if (dev == NULL) { + ipv4_devconf.proxy_arp = 1; + return 0; + } + if (__in_dev_get(dev)) { + __in_dev_get(dev)->cnf.proxy_arp = 1; + return 0; + } + return -ENXIO; + } + + if (r->arp_flags & ATF_PERM) + r->arp_flags |= ATF_COM; + if (dev == NULL) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } } }; + struct rtable * rt; + if ((err = ip_route_output_key(&rt, &fl)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + switch (dev->type) { +#ifdef CONFIG_FDDI + case ARPHRD_FDDI: + /* + * According to RFC 1390, FDDI devices should accept ARP + * hardware types of 1 (Ethernet). However, to be more + * robust, we'll accept hardware types of either 1 (Ethernet) + * or 6 (IEEE 802.2). + */ + if (r->arp_ha.sa_family != ARPHRD_FDDI && + r->arp_ha.sa_family != ARPHRD_ETHER && + r->arp_ha.sa_family != ARPHRD_IEEE802) + return -EINVAL; + break; +#endif + default: + if (r->arp_ha.sa_family != dev->type) + return -EINVAL; + break; + } + + neigh = __neigh_lookup_errno(&arp_tbl, &ip, dev); + err = PTR_ERR(neigh); + if (!IS_ERR(neigh)) { + unsigned state = NUD_STALE; + if (r->arp_flags & ATF_PERM) + state = NUD_PERMANENT; + err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + r->arp_ha.sa_data : NULL, state, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + } + return err; +} + +static unsigned arp_state_to_flags(struct neighbour *neigh) +{ + unsigned flags = 0; + if (neigh->nud_state&NUD_PERMANENT) + flags = ATF_PERM|ATF_COM; + else if (neigh->nud_state&NUD_VALID) + flags = ATF_COM; + return flags; +} + +/* + * Get an ARP cache entry. 
+ */ + +static int arp_req_get(struct arpreq *r, struct net_device *dev) +{ + u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + int err = -ENXIO; + + neigh = neigh_lookup(&arp_tbl, &ip, dev); + if (neigh) { + read_lock_bh(&neigh->lock); + memcpy(r->arp_ha.sa_data, neigh->ha, dev->addr_len); + r->arp_flags = arp_state_to_flags(neigh); + read_unlock_bh(&neigh->lock); + r->arp_ha.sa_family = dev->type; + strlcpy(r->arp_dev, dev->name, sizeof(r->arp_dev)); + neigh_release(neigh); + err = 0; + } + return err; +} + +static int arp_req_delete(struct arpreq *r, struct net_device * dev) +{ + int err; + u32 ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; + struct neighbour *neigh; + + if (r->arp_flags & ATF_PUBL) { + u32 mask = + ((struct sockaddr_in *)&r->arp_netmask)->sin_addr.s_addr; + if (mask == 0xFFFFFFFF) + return pneigh_delete(&arp_tbl, &ip, dev); + if (mask == 0) { + if (dev == NULL) { + ipv4_devconf.proxy_arp = 0; + return 0; + } + if (__in_dev_get(dev)) { + __in_dev_get(dev)->cnf.proxy_arp = 0; + return 0; + } + return -ENXIO; + } + return -EINVAL; + } + + if (dev == NULL) { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } } }; + struct rtable * rt; + if ((err = ip_route_output_key(&rt, &fl)) != 0) + return err; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (!dev) + return -EINVAL; + } + err = -ENXIO; + neigh = neigh_lookup(&arp_tbl, &ip, dev); + if (neigh) { + if (neigh->nud_state&~NUD_NOARP) + err = neigh_update(neigh, NULL, NUD_FAILED, + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_ADMIN); + neigh_release(neigh); + } + return err; +} + +/* + * Handle an ARP layer I/O control request. + */ + +int arp_ioctl(unsigned int cmd, void __user *arg) +{ + int err; + struct arpreq r; + struct net_device *dev = NULL; + + switch (cmd) { + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; + } + + if (r.arp_pa.sa_family != AF_INET) + return -EPFNOSUPPORT; + + if (!(r.arp_flags & ATF_PUBL) && + (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + return -EINVAL; + if (!(r.arp_flags & ATF_NETMASK)) + ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = + htonl(0xFFFFFFFFUL); + rtnl_lock(); + if (r.arp_dev[0]) { + err = -ENODEV; + if ((dev = __dev_get_by_name(r.arp_dev)) == NULL) + goto out; + + /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ + if (!r.arp_ha.sa_family) + r.arp_ha.sa_family = dev->type; + err = -EINVAL; + if ((r.arp_flags & ATF_COM) && r.arp_ha.sa_family != dev->type) + goto out; + } else if (cmd == SIOCGARP) { + err = -ENODEV; + goto out; + } + + switch(cmd) { + case SIOCDARP: + err = arp_req_delete(&r, dev); + break; + case SIOCSARP: + err = arp_req_set(&r, dev); + break; + case SIOCGARP: + err = arp_req_get(&r, dev); + if (!err && copy_to_user(arg, &r, sizeof(r))) + err = -EFAULT; + break; + } +out: + rtnl_unlock(); + return err; +} + +static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&arp_tbl, dev); + rt_cache_flush(0); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block arp_netdev_notifier = { + .notifier_call = arp_netdev_event, +}; + +/* Note, that it is not on notifier chain. + It is necessary, that this routine was called after route cache will be + flushed. 
+ */ +void arp_ifdown(struct net_device *dev) +{ + neigh_ifdown(&arp_tbl, dev); +} + + +/* + * Called once on startup. + */ + +static struct packet_type arp_packet_type = { + .type = __constant_htons(ETH_P_ARP), + .func = arp_rcv, +}; + +static int arp_proc_init(void); + +void __init arp_init(void) +{ + neigh_table_init(&arp_tbl); + + dev_add_pack(&arp_packet_type); + arp_proc_init(); +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &arp_tbl.parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); +#endif + register_netdevice_notifier(&arp_netdev_notifier); +} + +#ifdef CONFIG_PROC_FS +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + +/* ------------------------------------------------------------------------ */ +/* + * ax25 -> ASCII conversion + */ +static char *ax2asc2(ax25_address *a, char *buf) +{ + char c, *s; + int n; + + for (n = 0, s = buf; n < 6; n++) { + c = (a->ax25_call[n] >> 1) & 0x7F; + + if (c != ' ') *s++ = c; + } + + *s++ = '-'; + + if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + *s++ = '1'; + n -= 10; + } + + *s++ = n + '0'; + *s++ = '\0'; + + if (*buf == '\0' || *buf == '-') + return "*"; + + return buf; + +} +#endif /* CONFIG_AX25 */ + +#define HBUFFERLEN 30 + +static void arp_format_neigh_entry(struct seq_file *seq, + struct neighbour *n) +{ + char hbuffer[HBUFFERLEN]; + const char hexbuf[] = "0123456789ABCDEF"; + int k, j; + char tbuf[16]; + struct net_device *dev = n->dev; + int hatype = dev->type; + + read_lock(&n->lock); + /* Convert hardware address to XX:XX:XX:XX ... form. */ +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + if (hatype == ARPHRD_AX25 || hatype == ARPHRD_NETROM) + ax2asc2((ax25_address *)n->ha, hbuffer); + else { +#endif + for (k = 0, j = 0; k < HBUFFERLEN - 3 && j < dev->addr_len; j++) { + hbuffer[k++] = hexbuf[(n->ha[j] >> 4) & 15]; + hbuffer[k++] = hexbuf[n->ha[j] & 15]; + hbuffer[k++] = ':'; + } + hbuffer[--k] = 0; +#if defined(CONFIG_AX25) || defined(CONFIG_AX25_MODULE) + } +#endif + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->primary_key)); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, arp_state_to_flags(n), hbuffer, dev->name); + read_unlock(&n->lock); +} + +static void arp_format_pneigh_entry(struct seq_file *seq, + struct pneigh_entry *n) +{ + struct net_device *dev = n->dev; + int hatype = dev ? dev->type : 0; + char tbuf[16]; + + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(*(u32*)n->key)); + seq_printf(seq, "%-16s 0x%-10x0x%-10x%s * %s\n", + tbuf, hatype, ATF_PUBL | ATF_PERM, "00:00:00:00:00:00", + dev ? dev->name : "*"); +} + +static int arp_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "IP address HW type Flags " + "HW address Mask Device\n"); + } else { + struct neigh_seq_state *state = seq->private; + + if (state->flags & NEIGH_SEQ_IS_PNEIGH) + arp_format_pneigh_entry(seq, v); + else + arp_format_neigh_entry(seq, v); + } + + return 0; +} + +static void *arp_seq_start(struct seq_file *seq, loff_t *pos) +{ + /* Don't want to confuse "arp -a" w/ magic entries, + * so we tell the generic iterator to skip NUD_NOARP. 
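+ * For reference, the seq_file output produced above looks roughly
+ * like this (spacing approximate, values made up):
+ *
+ *	IP address       HW type     Flags       HW address            Mask   Device
+ *	192.168.0.1      0x1         0x2         00:50:56:c0:00:08     *      eth0
+ *
+ * where HW type 0x1 is ARPHRD_ETHER and Flags 0x2 is ATF_COM as
+ * returned by arp_state_to_flags().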
+ */ + return neigh_seq_start(seq, pos, &arp_tbl, NEIGH_SEQ_SKIP_NOARP); +} + +/* ------------------------------------------------------------------------ */ + +static struct seq_operations arp_seq_ops = { + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, +}; + +static int arp_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct neigh_seq_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + memset(s, 0, sizeof(*s)); + rc = seq_open(file, &arp_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations arp_seq_fops = { + .owner = THIS_MODULE, + .open = arp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +static int __init arp_proc_init(void) +{ + if (!proc_net_fops_create("arp", S_IRUGO, &arp_seq_fops)) + return -ENOMEM; + return 0; +} + +#else /* CONFIG_PROC_FS */ + +static int __init arp_proc_init(void) +{ + return 0; +} + +#endif /* CONFIG_PROC_FS */ + +EXPORT_SYMBOL(arp_broken_ops); +EXPORT_SYMBOL(arp_find); +EXPORT_SYMBOL(arp_rcv); +EXPORT_SYMBOL(arp_create); +EXPORT_SYMBOL(arp_xmit); +EXPORT_SYMBOL(arp_send); +EXPORT_SYMBOL(arp_tbl); + +#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) +EXPORT_SYMBOL(clip_tbl_hook); +#endif diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c new file mode 100644 index 000000000000..b1db561f2542 --- /dev/null +++ b/net/ipv4/datagram.c @@ -0,0 +1,73 @@ +/* + * common UDP/RAW code + * Linux INET implementation + * + * Authors: + * Hideaki YOSHIFUJI + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *usin = (struct sockaddr_in *) uaddr; + struct rtable *rt; + u32 saddr; + int oif; + int err; + + + if (addr_len < sizeof(*usin)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + sk_dst_reset(sk); + + oif = sk->sk_bound_dev_if; + saddr = inet->saddr; + if (MULTICAST(usin->sin_addr.s_addr)) { + if (!oif) + oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + err = ip_route_connect(&rt, usin->sin_addr.s_addr, saddr, + RT_CONN_FLAGS(sk), oif, + sk->sk_protocol, + inet->sport, usin->sin_port, sk); + if (err) + return err; + if ((rt->rt_flags & RTCF_BROADCAST) && !sock_flag(sk, SOCK_BROADCAST)) { + ip_rt_put(rt); + return -EACCES; + } + if (!inet->saddr) + inet->saddr = rt->rt_src; /* Update source address */ + if (!inet->rcv_saddr) + inet->rcv_saddr = rt->rt_src; + inet->daddr = rt->rt_dst; + inet->dport = usin->sin_port; + sk->sk_state = TCP_ESTABLISHED; + inet->id = jiffies; + + sk_dst_set(sk, &rt->u.dst); + return(0); +} + +EXPORT_SYMBOL(ip4_datagram_connect); + diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c new file mode 100644 index 000000000000..eea7ef010776 --- /dev/null +++ b/net/ipv4/devinet.c @@ -0,0 +1,1508 @@ +/* + * NET3 IP device support routines. 
+ * + * Version: $Id: devinet.c,v 1.44 2001/10/31 21:55:54 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Derived from the IP parts of dev.c 1.0.19 + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * + * Additional Authors: + * Alan Cox, + * Alexey Kuznetsov, + * + * Changes: + * Alexey Kuznetsov: pa_* fields are replaced with ifaddr + * lists. + * Cyrus Durgin: updated for kmod + * Matthias Andree: in devinet_ioctl, compare label and + * address (4.4BSD alias style support), + * fall back to comparing just the label + * if no match found. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include + +#include +#include +#include + +struct ipv4_devconf ipv4_devconf = { + .accept_redirects = 1, + .send_redirects = 1, + .secure_redirects = 1, + .shared_media = 1, +}; + +static struct ipv4_devconf ipv4_devconf_dflt = { + .accept_redirects = 1, + .send_redirects = 1, + .secure_redirects = 1, + .shared_media = 1, + .accept_source_route = 1, +}; + +static void rtmsg_ifa(int event, struct in_ifaddr *); + +static struct notifier_block *inetaddr_chain; +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy); +#ifdef CONFIG_SYSCTL +static void devinet_sysctl_register(struct in_device *in_dev, + struct ipv4_devconf *p); +static void devinet_sysctl_unregister(struct ipv4_devconf *p); +#endif + +/* Locks all the inet devices. */ + +static struct in_ifaddr *inet_alloc_ifa(void) +{ + struct in_ifaddr *ifa = kmalloc(sizeof(*ifa), GFP_KERNEL); + + if (ifa) { + memset(ifa, 0, sizeof(*ifa)); + INIT_RCU_HEAD(&ifa->rcu_head); + } + + return ifa; +} + +static void inet_rcu_free_ifa(struct rcu_head *head) +{ + struct in_ifaddr *ifa = container_of(head, struct in_ifaddr, rcu_head); + if (ifa->ifa_dev) + in_dev_put(ifa->ifa_dev); + kfree(ifa); +} + +static inline void inet_free_ifa(struct in_ifaddr *ifa) +{ + call_rcu(&ifa->rcu_head, inet_rcu_free_ifa); +} + +void in_dev_finish_destroy(struct in_device *idev) +{ + struct net_device *dev = idev->dev; + + BUG_TRAP(!idev->ifa_list); + BUG_TRAP(!idev->mc_list); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in_dev_finish_destroy: %p=%s\n", + idev, dev ? 
dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) + printk("Freeing alive in_device %p\n", idev); + else { + kfree(idev); + } +} + +struct in_device *inetdev_init(struct net_device *dev) +{ + struct in_device *in_dev; + + ASSERT_RTNL(); + + in_dev = kmalloc(sizeof(*in_dev), GFP_KERNEL); + if (!in_dev) + goto out; + memset(in_dev, 0, sizeof(*in_dev)); + INIT_RCU_HEAD(&in_dev->rcu_head); + memcpy(&in_dev->cnf, &ipv4_devconf_dflt, sizeof(in_dev->cnf)); + in_dev->cnf.sysctl = NULL; + in_dev->dev = dev; + if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL) + goto out_kfree; + /* Reference in_dev->dev */ + dev_hold(dev); +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); +#endif + + /* Account for reference dev->ip_ptr */ + in_dev_hold(in_dev); + rcu_assign_pointer(dev->ip_ptr, in_dev); + +#ifdef CONFIG_SYSCTL + devinet_sysctl_register(in_dev, &in_dev->cnf); +#endif + ip_mc_init_dev(in_dev); + if (dev->flags & IFF_UP) + ip_mc_up(in_dev); +out: + return in_dev; +out_kfree: + kfree(in_dev); + in_dev = NULL; + goto out; +} + +static void in_dev_rcu_put(struct rcu_head *head) +{ + struct in_device *idev = container_of(head, struct in_device, rcu_head); + in_dev_put(idev); +} + +static void inetdev_destroy(struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + struct net_device *dev; + + ASSERT_RTNL(); + + dev = in_dev->dev; + if (dev == &loopback_dev) + return; + + in_dev->dead = 1; + + ip_mc_destroy_dev(in_dev); + + while ((ifa = in_dev->ifa_list) != NULL) { + inet_del_ifa(in_dev, &in_dev->ifa_list, 0); + inet_free_ifa(ifa); + } + +#ifdef CONFIG_SYSCTL + devinet_sysctl_unregister(&in_dev->cnf); +#endif + + dev->ip_ptr = NULL; + +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(in_dev->arp_parms); +#endif + neigh_parms_release(&arp_tbl, in_dev->arp_parms); + arp_ifdown(dev); + + call_rcu(&in_dev->rcu_head, in_dev_rcu_put); +} + +int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b) +{ + rcu_read_lock(); + for_primary_ifa(in_dev) { + if (inet_ifa_match(a, ifa)) { + if (!b || inet_ifa_match(b, ifa)) { + rcu_read_unlock(); + return 1; + } + } + } endfor_ifa(in_dev); + rcu_read_unlock(); + return 0; +} + +static void inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap, + int destroy) +{ + struct in_ifaddr *ifa1 = *ifap; + + ASSERT_RTNL(); + + /* 1. Deleting primary ifaddr forces deletion all secondaries */ + + if (!(ifa1->ifa_flags & IFA_F_SECONDARY)) { + struct in_ifaddr *ifa; + struct in_ifaddr **ifap1 = &ifa1->ifa_next; + + while ((ifa = *ifap1) != NULL) { + if (!(ifa->ifa_flags & IFA_F_SECONDARY) || + ifa1->ifa_mask != ifa->ifa_mask || + !inet_ifa_match(ifa1->ifa_address, ifa)) { + ifap1 = &ifa->ifa_next; + continue; + } + + *ifap1 = ifa->ifa_next; + + rtmsg_ifa(RTM_DELADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa); + inet_free_ifa(ifa); + } + } + + /* 2. Unlink it */ + + *ifap = ifa1->ifa_next; + + /* 3. Announce address deletion */ + + /* Send message first, then call notifier. + At first sight, FIB update triggered by notifier + will refer to already deleted ifaddr, that could confuse + netlink listeners. It is not true: look, gated sees + that route deleted and if it still thinks that ifaddr + is valid, it will try to restore deleted routes... Grr. + So that, this order is correct. 
+ */ + rtmsg_ifa(RTM_DELADDR, ifa1); + notifier_call_chain(&inetaddr_chain, NETDEV_DOWN, ifa1); + if (destroy) { + inet_free_ifa(ifa1); + + if (!in_dev->ifa_list) + inetdev_destroy(in_dev); + } +} + +static int inet_insert_ifa(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct in_ifaddr *ifa1, **ifap, **last_primary; + + ASSERT_RTNL(); + + if (!ifa->ifa_local) { + inet_free_ifa(ifa); + return 0; + } + + ifa->ifa_flags &= ~IFA_F_SECONDARY; + last_primary = &in_dev->ifa_list; + + for (ifap = &in_dev->ifa_list; (ifa1 = *ifap) != NULL; + ifap = &ifa1->ifa_next) { + if (!(ifa1->ifa_flags & IFA_F_SECONDARY) && + ifa->ifa_scope <= ifa1->ifa_scope) + last_primary = &ifa1->ifa_next; + if (ifa1->ifa_mask == ifa->ifa_mask && + inet_ifa_match(ifa1->ifa_address, ifa)) { + if (ifa1->ifa_local == ifa->ifa_local) { + inet_free_ifa(ifa); + return -EEXIST; + } + if (ifa1->ifa_scope != ifa->ifa_scope) { + inet_free_ifa(ifa); + return -EINVAL; + } + ifa->ifa_flags |= IFA_F_SECONDARY; + } + } + + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) { + net_srandom(ifa->ifa_local); + ifap = last_primary; + } + + ifa->ifa_next = *ifap; + *ifap = ifa; + + /* Send message first, then call notifier. + Notifier will trigger FIB update, so that + listeners of netlink will know about new ifaddr */ + rtmsg_ifa(RTM_NEWADDR, ifa); + notifier_call_chain(&inetaddr_chain, NETDEV_UP, ifa); + + return 0; +} + +static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) +{ + struct in_device *in_dev = __in_dev_get(dev); + + ASSERT_RTNL(); + + if (!in_dev) { + in_dev = inetdev_init(dev); + if (!in_dev) { + inet_free_ifa(ifa); + return -ENOBUFS; + } + } + if (ifa->ifa_dev != in_dev) { + BUG_TRAP(!ifa->ifa_dev); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + } + if (LOOPBACK(ifa->ifa_local)) + ifa->ifa_scope = RT_SCOPE_HOST; + return inet_insert_ifa(ifa); +} + +struct in_device *inetdev_by_index(int ifindex) +{ + struct net_device *dev; + struct in_device *in_dev = NULL; + read_lock(&dev_base_lock); + dev = __dev_get_by_index(ifindex); + if (dev) + in_dev = in_dev_get(dev); + read_unlock(&dev_base_lock); + return in_dev; +} + +/* Called only from RTNL semaphored context. No locks. 
*/ + +struct in_ifaddr *inet_ifa_byprefix(struct in_device *in_dev, u32 prefix, + u32 mask) +{ + ASSERT_RTNL(); + + for_primary_ifa(in_dev) { + if (ifa->ifa_mask == mask && inet_ifa_match(prefix, ifa)) + return ifa; + } endfor_ifa(in_dev); + return NULL; +} + +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa, **ifap; + + ASSERT_RTNL(); + + if ((in_dev = inetdev_by_index(ifm->ifa_index)) == NULL) + goto out; + __in_dev_put(in_dev); + + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if ((rta[IFA_LOCAL - 1] && + memcmp(RTA_DATA(rta[IFA_LOCAL - 1]), + &ifa->ifa_local, 4)) || + (rta[IFA_LABEL - 1] && + rtattr_strcmp(rta[IFA_LABEL - 1], ifa->ifa_label)) || + (rta[IFA_ADDRESS - 1] && + (ifm->ifa_prefixlen != ifa->ifa_prefixlen || + !inet_ifa_match(*(u32*)RTA_DATA(rta[IFA_ADDRESS - 1]), + ifa)))) + continue; + inet_del_ifa(in_dev, ifap, 1); + return 0; + } +out: + return -EADDRNOTAVAIL; +} + +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct net_device *dev; + struct in_device *in_dev; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in_ifaddr *ifa; + int rc = -EINVAL; + + ASSERT_RTNL(); + + if (ifm->ifa_prefixlen > 32 || !rta[IFA_LOCAL - 1]) + goto out; + + rc = -ENODEV; + if ((dev = __dev_get_by_index(ifm->ifa_index)) == NULL) + goto out; + + rc = -ENOBUFS; + if ((in_dev = __in_dev_get(dev)) == NULL) { + in_dev = inetdev_init(dev); + if (!in_dev) + goto out; + } + + if ((ifa = inet_alloc_ifa()) == NULL) + goto out; + + if (!rta[IFA_ADDRESS - 1]) + rta[IFA_ADDRESS - 1] = rta[IFA_LOCAL - 1]; + memcpy(&ifa->ifa_local, RTA_DATA(rta[IFA_LOCAL - 1]), 4); + memcpy(&ifa->ifa_address, RTA_DATA(rta[IFA_ADDRESS - 1]), 4); + ifa->ifa_prefixlen = ifm->ifa_prefixlen; + ifa->ifa_mask = inet_make_mask(ifm->ifa_prefixlen); + if (rta[IFA_BROADCAST - 1]) + memcpy(&ifa->ifa_broadcast, + RTA_DATA(rta[IFA_BROADCAST - 1]), 4); + if (rta[IFA_ANYCAST - 1]) + memcpy(&ifa->ifa_anycast, RTA_DATA(rta[IFA_ANYCAST - 1]), 4); + ifa->ifa_flags = ifm->ifa_flags; + ifa->ifa_scope = ifm->ifa_scope; + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + if (rta[IFA_LABEL - 1]) + rtattr_strlcpy(ifa->ifa_label, rta[IFA_LABEL - 1], IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + + rc = inet_insert_ifa(ifa); +out: + return rc; +} + +/* + * Determine a default network mask, based on the IP address. + */ + +static __inline__ int inet_abc_len(u32 addr) +{ + int rc = -1; /* Something else, probably a multicast. 
*/ + + if (ZERONET(addr)) + rc = 0; + else { + addr = ntohl(addr); + + if (IN_CLASSA(addr)) + rc = 8; + else if (IN_CLASSB(addr)) + rc = 16; + else if (IN_CLASSC(addr)) + rc = 24; + } + + return rc; +} + + +int devinet_ioctl(unsigned int cmd, void __user *arg) +{ + struct ifreq ifr; + struct sockaddr_in sin_orig; + struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr; + struct in_device *in_dev; + struct in_ifaddr **ifap = NULL; + struct in_ifaddr *ifa = NULL; + struct net_device *dev; + char *colon; + int ret = -EFAULT; + int tryaddrmatch = 0; + + /* + * Fetch the caller's info block into kernel space + */ + + if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) + goto out; + ifr.ifr_name[IFNAMSIZ - 1] = 0; + + /* save original address for comparison */ + memcpy(&sin_orig, sin, sizeof(*sin)); + + colon = strchr(ifr.ifr_name, ':'); + if (colon) + *colon = 0; + +#ifdef CONFIG_KMOD + dev_load(ifr.ifr_name); +#endif + + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + case SIOCGIFBRDADDR: /* Get the broadcast address */ + case SIOCGIFDSTADDR: /* Get the destination address */ + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + /* Note that these ioctls will not sleep, + so that we do not impose a lock. + One day we will be forced to put shlock here (I mean SMP) + */ + tryaddrmatch = (sin_orig.sin_family == AF_INET); + memset(sin, 0, sizeof(*sin)); + sin->sin_family = AF_INET; + break; + + case SIOCSIFFLAGS: + ret = -EACCES; + if (!capable(CAP_NET_ADMIN)) + goto out; + break; + case SIOCSIFADDR: /* Set interface address (and family) */ + case SIOCSIFBRDADDR: /* Set the broadcast address */ + case SIOCSIFDSTADDR: /* Set the destination address */ + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + ret = -EACCES; + if (!capable(CAP_NET_ADMIN)) + goto out; + ret = -EINVAL; + if (sin->sin_family != AF_INET) + goto out; + break; + default: + ret = -EINVAL; + goto out; + } + + rtnl_lock(); + + ret = -ENODEV; + if ((dev = __dev_get_by_name(ifr.ifr_name)) == NULL) + goto done; + + if (colon) + *colon = ':'; + + if ((in_dev = __in_dev_get(dev)) != NULL) { + if (tryaddrmatch) { + /* Matthias Andree */ + /* compare label and address (4.4BSD style) */ + /* note: we only do this for a limited set of ioctls + and only if the original address family was AF_INET. + This is checked above. 
*/ + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) { + if (!strcmp(ifr.ifr_name, ifa->ifa_label) && + sin_orig.sin_addr.s_addr == + ifa->ifa_address) { + break; /* found */ + } + } + } + /* we didn't get a match, maybe the application is + 4.3BSD-style and passed in junk so we fall back to + comparing just the label */ + if (!ifa) { + for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; + ifap = &ifa->ifa_next) + if (!strcmp(ifr.ifr_name, ifa->ifa_label)) + break; + } + } + + ret = -EADDRNOTAVAIL; + if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) + goto done; + + switch(cmd) { + case SIOCGIFADDR: /* Get interface address */ + sin->sin_addr.s_addr = ifa->ifa_local; + goto rarok; + + case SIOCGIFBRDADDR: /* Get the broadcast address */ + sin->sin_addr.s_addr = ifa->ifa_broadcast; + goto rarok; + + case SIOCGIFDSTADDR: /* Get the destination address */ + sin->sin_addr.s_addr = ifa->ifa_address; + goto rarok; + + case SIOCGIFNETMASK: /* Get the netmask for the interface */ + sin->sin_addr.s_addr = ifa->ifa_mask; + goto rarok; + + case SIOCSIFFLAGS: + if (colon) { + ret = -EADDRNOTAVAIL; + if (!ifa) + break; + ret = 0; + if (!(ifr.ifr_flags & IFF_UP)) + inet_del_ifa(in_dev, ifap, 1); + break; + } + ret = dev_change_flags(dev, ifr.ifr_flags); + break; + + case SIOCSIFADDR: /* Set interface address (and family) */ + ret = -EINVAL; + if (inet_abc_len(sin->sin_addr.s_addr) < 0) + break; + + if (!ifa) { + ret = -ENOBUFS; + if ((ifa = inet_alloc_ifa()) == NULL) + break; + if (colon) + memcpy(ifa->ifa_label, ifr.ifr_name, IFNAMSIZ); + else + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + } else { + ret = 0; + if (ifa->ifa_local == sin->sin_addr.s_addr) + break; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = 0; + ifa->ifa_anycast = 0; + } + + ifa->ifa_address = ifa->ifa_local = sin->sin_addr.s_addr; + + if (!(dev->flags & IFF_POINTOPOINT)) { + ifa->ifa_prefixlen = inet_abc_len(ifa->ifa_address); + ifa->ifa_mask = inet_make_mask(ifa->ifa_prefixlen); + if ((dev->flags & IFF_BROADCAST) && + ifa->ifa_prefixlen < 31) + ifa->ifa_broadcast = ifa->ifa_address | + ~ifa->ifa_mask; + } else { + ifa->ifa_prefixlen = 32; + ifa->ifa_mask = inet_make_mask(32); + } + ret = inet_set_ifa(dev, ifa); + break; + + case SIOCSIFBRDADDR: /* Set the broadcast address */ + ret = 0; + if (ifa->ifa_broadcast != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_broadcast = sin->sin_addr.s_addr; + inet_insert_ifa(ifa); + } + break; + + case SIOCSIFDSTADDR: /* Set the destination address */ + ret = 0; + if (ifa->ifa_address == sin->sin_addr.s_addr) + break; + ret = -EINVAL; + if (inet_abc_len(sin->sin_addr.s_addr) < 0) + break; + ret = 0; + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_address = sin->sin_addr.s_addr; + inet_insert_ifa(ifa); + break; + + case SIOCSIFNETMASK: /* Set the netmask for the interface */ + + /* + * The mask we set must be legal. + */ + ret = -EINVAL; + if (bad_mask(sin->sin_addr.s_addr, 0)) + break; + ret = 0; + if (ifa->ifa_mask != sin->sin_addr.s_addr) { + inet_del_ifa(in_dev, ifap, 0); + ifa->ifa_mask = sin->sin_addr.s_addr; + ifa->ifa_prefixlen = inet_mask_len(ifa->ifa_mask); + + /* See if current broadcast address matches + * with current netmask, then recalculate + * the broadcast address. Otherwise it's a + * funny address, so don't touch it since + * the user seems to know what (s)he's doing... 
+ */ + if ((dev->flags & IFF_BROADCAST) && + (ifa->ifa_prefixlen < 31) && + (ifa->ifa_broadcast == + (ifa->ifa_local|~ifa->ifa_mask))) { + ifa->ifa_broadcast = (ifa->ifa_local | + ~sin->sin_addr.s_addr); + } + inet_insert_ifa(ifa); + } + break; + } +done: + rtnl_unlock(); +out: + return ret; +rarok: + rtnl_unlock(); + ret = copy_to_user(arg, &ifr, sizeof(struct ifreq)) ? -EFAULT : 0; + goto out; +} + +static int inet_gifconf(struct net_device *dev, char __user *buf, int len) +{ + struct in_device *in_dev = __in_dev_get(dev); + struct in_ifaddr *ifa; + struct ifreq ifr; + int done = 0; + + if (!in_dev || (ifa = in_dev->ifa_list) == NULL) + goto out; + + for (; ifa; ifa = ifa->ifa_next) { + if (!buf) { + done += sizeof(ifr); + continue; + } + if (len < (int) sizeof(ifr)) + break; + memset(&ifr, 0, sizeof(struct ifreq)); + if (ifa->ifa_label) + strcpy(ifr.ifr_name, ifa->ifa_label); + else + strcpy(ifr.ifr_name, dev->name); + + (*(struct sockaddr_in *)&ifr.ifr_addr).sin_family = AF_INET; + (*(struct sockaddr_in *)&ifr.ifr_addr).sin_addr.s_addr = + ifa->ifa_local; + + if (copy_to_user(buf, &ifr, sizeof(struct ifreq))) { + done = -EFAULT; + break; + } + buf += sizeof(struct ifreq); + len -= sizeof(struct ifreq); + done += sizeof(struct ifreq); + } +out: + return done; +} + +u32 inet_select_addr(const struct net_device *dev, u32 dst, int scope) +{ + u32 addr = 0; + struct in_device *in_dev; + + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (!in_dev) + goto no_in_dev; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope > scope) + continue; + if (!dst || inet_ifa_match(dst, ifa)) { + addr = ifa->ifa_local; + break; + } + if (!addr) + addr = ifa->ifa_local; + } endfor_ifa(in_dev); +no_in_dev: + rcu_read_unlock(); + + if (addr) + goto out; + + /* Not loopback addresses on loopback should be preferred + in this case. It is importnat that lo is the first interface + in dev_base list. + */ + read_lock(&dev_base_lock); + rcu_read_lock(); + for (dev = dev_base; dev; dev = dev->next) { + if ((in_dev = __in_dev_get(dev)) == NULL) + continue; + + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + goto out_unlock_both; + } + } endfor_ifa(in_dev); + } +out_unlock_both: + read_unlock(&dev_base_lock); + rcu_read_unlock(); +out: + return addr; +} + +static u32 confirm_addr_indev(struct in_device *in_dev, u32 dst, + u32 local, int scope) +{ + int same = 0; + u32 addr = 0; + + for_ifa(in_dev) { + if (!addr && + (local == ifa->ifa_local || !local) && + ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + if (same) + break; + } + if (!same) { + same = (!local || inet_ifa_match(local, ifa)) && + (!dst || inet_ifa_match(dst, ifa)); + if (same && addr) { + if (local || !dst) + break; + /* Is the selected addr into dst subnet? */ + if (inet_ifa_match(addr, ifa)) + break; + /* No, then can we use new local src? */ + if (ifa->ifa_scope <= scope) { + addr = ifa->ifa_local; + break; + } + /* search for large dst subnet for addr */ + same = 0; + } + } + } endfor_ifa(in_dev); + + return same? 
addr : 0; +} + +/* + * Confirm that local IP address exists using wildcards: + * - dev: only on this interface, 0=any interface + * - dst: only in the same subnet as dst, 0=any dst + * - local: address, 0=autoselect the local address + * - scope: maximum allowed scope value for the local address + */ +u32 inet_confirm_addr(const struct net_device *dev, u32 dst, u32 local, int scope) +{ + u32 addr = 0; + struct in_device *in_dev; + + if (dev) { + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev))) + addr = confirm_addr_indev(in_dev, dst, local, scope); + rcu_read_unlock(); + + return addr; + } + + read_lock(&dev_base_lock); + rcu_read_lock(); + for (dev = dev_base; dev; dev = dev->next) { + if ((in_dev = __in_dev_get(dev))) { + addr = confirm_addr_indev(in_dev, dst, local, scope); + if (addr) + break; + } + } + rcu_read_unlock(); + read_unlock(&dev_base_lock); + + return addr; +} + +/* + * Device notifier + */ + +int register_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inetaddr_chain, nb); +} + +int unregister_inetaddr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inetaddr_chain, nb); +} + +/* Rename ifa_labels for a device name change. Make some effort to preserve existing + * alias numbering and to create unique labels if possible. +*/ +static void inetdev_changename(struct net_device *dev, struct in_device *in_dev) +{ + struct in_ifaddr *ifa; + int named = 0; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + char old[IFNAMSIZ], *dot; + + memcpy(old, ifa->ifa_label, IFNAMSIZ); + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + if (named++ == 0) + continue; + dot = strchr(ifa->ifa_label, ':'); + if (dot == NULL) { + sprintf(old, ":%d", named); + dot = old; + } + if (strlen(dot) + strlen(dev->name) < IFNAMSIZ) { + strcat(ifa->ifa_label, dot); + } else { + strcpy(ifa->ifa_label + (IFNAMSIZ - strlen(dot) - 1), dot); + } + } +} + +/* Called only under RTNL semaphore */ + +static int inetdev_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct in_device *in_dev = __in_dev_get(dev); + + ASSERT_RTNL(); + + if (!in_dev) { + if (event == NETDEV_REGISTER && dev == &loopback_dev) { + in_dev = inetdev_init(dev); + if (!in_dev) + panic("devinet: Failed to create loopback\n"); + in_dev->cnf.no_xfrm = 1; + in_dev->cnf.no_policy = 1; + } + goto out; + } + + switch (event) { + case NETDEV_REGISTER: + printk(KERN_DEBUG "inetdev_event: bug\n"); + dev->ip_ptr = NULL; + break; + case NETDEV_UP: + if (dev->mtu < 68) + break; + if (dev == &loopback_dev) { + struct in_ifaddr *ifa; + if ((ifa = inet_alloc_ifa()) != NULL) { + ifa->ifa_local = + ifa->ifa_address = htonl(INADDR_LOOPBACK); + ifa->ifa_prefixlen = 8; + ifa->ifa_mask = inet_make_mask(8); + in_dev_hold(in_dev); + ifa->ifa_dev = in_dev; + ifa->ifa_scope = RT_SCOPE_HOST; + memcpy(ifa->ifa_label, dev->name, IFNAMSIZ); + inet_insert_ifa(ifa); + } + } + ip_mc_up(in_dev); + break; + case NETDEV_DOWN: + ip_mc_down(in_dev); + break; + case NETDEV_CHANGEMTU: + if (dev->mtu >= 68) + break; + /* MTU falled under 68, disable IP */ + case NETDEV_UNREGISTER: + inetdev_destroy(in_dev); + break; + case NETDEV_CHANGENAME: + /* Do not notify about label change, this event is + * not interesting to applications using netlink. 
+ */ + inetdev_changename(dev, in_dev); + +#ifdef CONFIG_SYSCTL + devinet_sysctl_unregister(&in_dev->cnf); + neigh_sysctl_unregister(in_dev->arp_parms); + neigh_sysctl_register(dev, in_dev->arp_parms, NET_IPV4, + NET_IPV4_NEIGH, "ipv4", NULL, NULL); + devinet_sysctl_register(in_dev, &in_dev->cnf); +#endif + break; + } +out: + return NOTIFY_DONE; +} + +static struct notifier_block ip_netdev_notifier = { + .notifier_call =inetdev_event, +}; + +static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET; + ifm->ifa_prefixlen = ifa->ifa_prefixlen; + ifm->ifa_flags = ifa->ifa_flags|IFA_F_PERMANENT; + ifm->ifa_scope = ifa->ifa_scope; + ifm->ifa_index = ifa->ifa_dev->dev->ifindex; + if (ifa->ifa_address) + RTA_PUT(skb, IFA_ADDRESS, 4, &ifa->ifa_address); + if (ifa->ifa_local) + RTA_PUT(skb, IFA_LOCAL, 4, &ifa->ifa_local); + if (ifa->ifa_broadcast) + RTA_PUT(skb, IFA_BROADCAST, 4, &ifa->ifa_broadcast); + if (ifa->ifa_anycast) + RTA_PUT(skb, IFA_ANYCAST, 4, &ifa->ifa_anycast); + if (ifa->ifa_label[0]) + RTA_PUT(skb, IFA_LABEL, IFNAMSIZ, &ifa->ifa_label); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, ip_idx; + struct net_device *dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + int s_ip_idx, s_idx = cb->args[0]; + + s_ip_idx = ip_idx = cb->args[1]; + read_lock(&dev_base_lock); + for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev)) == NULL) { + rcu_read_unlock(); + continue; + } + + for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; + ifa = ifa->ifa_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWADDR) <= 0) { + rcu_read_unlock(); + goto done; + } + } + rcu_read_unlock(); + } + +done: + read_unlock(&dev_base_lock); + cb->args[0] = idx; + cb->args[1] = ip_idx; + + return skb->len; +} + +static void rtmsg_ifa(int event, struct in_ifaddr* ifa) +{ + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + 128); + struct sk_buff *skb = alloc_skb(size, GFP_KERNEL); + + if (!skb) + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, ENOBUFS); + else if (inet_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV4_IFADDR, EINVAL); + } else { + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV4_IFADDR, GFP_KERNEL); + } +} + +static struct rtnetlink_link inet_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = { + [RTM_NEWADDR - RTM_BASE] = { .doit = inet_rtm_newaddr, }, + [RTM_DELADDR - RTM_BASE] = { .doit = inet_rtm_deladdr, }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = inet_dump_ifaddr, }, + [RTM_NEWROUTE - RTM_BASE] = { .doit = inet_rtm_newroute, }, + [RTM_DELROUTE - RTM_BASE] = { .doit = inet_rtm_delroute, }, + [RTM_GETROUTE - RTM_BASE] = { .doit = inet_rtm_getroute, .dumpit = inet_dump_fib, }, +#ifdef CONFIG_IP_MULTIPLE_TABLES + [RTM_NEWRULE - RTM_BASE] = { .doit = inet_rtm_newrule, }, + [RTM_DELRULE - RTM_BASE] = { .doit = inet_rtm_delrule, }, + [RTM_GETRULE - RTM_BASE] = { .dumpit = inet_dump_rules, }, +#endif +}; + +#ifdef CONFIG_SYSCTL + +void inet_forward_change(void) +{ + struct net_device *dev; + int on = ipv4_devconf.forwarding; + + 
ipv4_devconf.accept_redirects = !on; + ipv4_devconf_dflt.forwarding = on; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { + struct in_device *in_dev; + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (in_dev) + in_dev->cnf.forwarding = on; + rcu_read_unlock(); + } + read_unlock(&dev_base_lock); + + rt_cache_flush(0); +} + +static int devinet_sysctl_forward(ctl_table *ctl, int write, + struct file* filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *valp != val) { + if (valp == &ipv4_devconf.forwarding) + inet_forward_change(); + else if (valp != &ipv4_devconf_dflt.forwarding) + rt_cache_flush(0); + } + + return ret; +} + +int ipv4_doint_and_flush(ctl_table *ctl, int write, + struct file* filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && *valp != val) + rt_cache_flush(0); + + return ret; +} + +int ipv4_doint_and_flush_strategy(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(new, (int __user *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + *valp = new; + rt_cache_flush(0); + return 1; +} + + +static struct devinet_sysctl_table { + struct ctl_table_header *sysctl_header; + ctl_table devinet_vars[__NET_IPV4_CONF_MAX]; + ctl_table devinet_dev[2]; + ctl_table devinet_conf_dir[2]; + ctl_table devinet_proto_dir[2]; + ctl_table devinet_root_dir[2]; +} devinet_sysctl = { + .devinet_vars = { + { + .ctl_name = NET_IPV4_CONF_FORWARDING, + .procname = "forwarding", + .data = &ipv4_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &devinet_sysctl_forward, + }, + { + .ctl_name = NET_IPV4_CONF_MC_FORWARDING, + .procname = "mc_forwarding", + .data = &ipv4_devconf.mc_forwarding, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ACCEPT_REDIRECTS, + .procname = "accept_redirects", + .data = &ipv4_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_SECURE_REDIRECTS, + .procname = "secure_redirects", + .data = &ipv4_devconf.secure_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_SHARED_MEDIA, + .procname = "shared_media", + .data = &ipv4_devconf.shared_media, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_RP_FILTER, + .procname = "rp_filter", + .data = &ipv4_devconf.rp_filter, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_SEND_REDIRECTS, + .procname = "send_redirects", + .data = &ipv4_devconf.send_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = 
NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE, + .procname = "accept_source_route", + .data = &ipv4_devconf.accept_source_route, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_PROXY_ARP, + .procname = "proxy_arp", + .data = &ipv4_devconf.proxy_arp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_MEDIUM_ID, + .procname = "medium_id", + .data = &ipv4_devconf.medium_id, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_BOOTP_RELAY, + .procname = "bootp_relay", + .data = &ipv4_devconf.bootp_relay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_LOG_MARTIANS, + .procname = "log_martians", + .data = &ipv4_devconf.log_martians, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_TAG, + .procname = "tag", + .data = &ipv4_devconf.tag, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ARPFILTER, + .procname = "arp_filter", + .data = &ipv4_devconf.arp_filter, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ARP_ANNOUNCE, + .procname = "arp_announce", + .data = &ipv4_devconf.arp_announce, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_ARP_IGNORE, + .procname = "arp_ignore", + .data = &ipv4_devconf.arp_ignore, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_CONF_NOXFRM, + .procname = "disable_xfrm", + .data = &ipv4_devconf.no_xfrm, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + { + .ctl_name = NET_IPV4_CONF_NOPOLICY, + .procname = "disable_policy", + .data = &ipv4_devconf.no_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + { + .ctl_name = NET_IPV4_CONF_FORCE_IGMP_VERSION, + .procname = "force_igmp_version", + .data = &ipv4_devconf.force_igmp_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + }, + .devinet_dev = { + { + .ctl_name = NET_PROTO_CONF_ALL, + .procname = "all", + .mode = 0555, + .child = devinet_sysctl.devinet_vars, + }, + }, + .devinet_conf_dir = { + { + .ctl_name = NET_IPV4_CONF, + .procname = "conf", + .mode = 0555, + .child = devinet_sysctl.devinet_dev, + }, + }, + .devinet_proto_dir = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = devinet_sysctl.devinet_conf_dir, + }, + }, + .devinet_root_dir = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = devinet_sysctl.devinet_proto_dir, + }, + }, +}; + +static void devinet_sysctl_register(struct in_device *in_dev, + struct ipv4_devconf *p) +{ + int i; + struct net_device *dev = in_dev ? 
in_dev->dev : NULL; + struct devinet_sysctl_table *t = kmalloc(sizeof(*t), GFP_KERNEL); + char *dev_name = NULL; + + if (!t) + return; + memcpy(t, &devinet_sysctl, sizeof(*t)); + for (i = 0; i < ARRAY_SIZE(t->devinet_vars) - 1; i++) { + t->devinet_vars[i].data += (char *)p - (char *)&ipv4_devconf; + t->devinet_vars[i].de = NULL; + } + + if (dev) { + dev_name = dev->name; + t->devinet_dev[0].ctl_name = dev->ifindex; + } else { + dev_name = "default"; + t->devinet_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). + */ + dev_name = net_sysctl_strdup(dev_name); + if (!dev_name) + goto free; + + t->devinet_dev[0].procname = dev_name; + t->devinet_dev[0].child = t->devinet_vars; + t->devinet_dev[0].de = NULL; + t->devinet_conf_dir[0].child = t->devinet_dev; + t->devinet_conf_dir[0].de = NULL; + t->devinet_proto_dir[0].child = t->devinet_conf_dir; + t->devinet_proto_dir[0].de = NULL; + t->devinet_root_dir[0].child = t->devinet_proto_dir; + t->devinet_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->devinet_root_dir, 0); + if (!t->sysctl_header) + goto free_procname; + + p->sysctl = t; + return; + + /* error path */ + free_procname: + kfree(dev_name); + free: + kfree(t); + return; +} + +static void devinet_sysctl_unregister(struct ipv4_devconf *p) +{ + if (p->sysctl) { + struct devinet_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->devinet_dev[0].procname); + kfree(t); + } +} +#endif + +void __init devinet_init(void) +{ + register_gifconf(PF_INET, inet_gifconf); + register_netdevice_notifier(&ip_netdev_notifier); + rtnetlink_links[PF_INET] = inet_rtnetlink_table; +#ifdef CONFIG_SYSCTL + devinet_sysctl.sysctl_header = + register_sysctl_table(devinet_sysctl.devinet_root_dir, 0); + devinet_sysctl_register(NULL, &ipv4_devconf_dflt); +#endif +} + +EXPORT_SYMBOL(devinet_ioctl); +EXPORT_SYMBOL(in_dev_finish_destroy); +EXPORT_SYMBOL(inet_select_addr); +EXPORT_SYMBOL(inetdev_by_index); +EXPORT_SYMBOL(register_inetaddr_notifier); +EXPORT_SYMBOL(unregister_inetaddr_notifier); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c new file mode 100644 index 000000000000..053a883247ba --- /dev/null +++ b/net/ipv4/esp4.c @@ -0,0 +1,510 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* decapsulation data for use when post-processing */ +struct esp_decap_data { + xfrm_address_t saddr; + __u16 sport; + __u8 proto; +}; + +static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct iphdr *top_iph; + struct ip_esp_hdr *esph; + struct crypto_tfm *tfm; + struct esp_data *esp; + struct sk_buff *trailer; + int blksize; + int clen; + int alen; + int nfrags; + + /* Strip IP+ESP header. */ + __skb_pull(skb, skb->h.raw - skb->data); + /* Now skb is pure payload to encrypt */ + + err = -ENOMEM; + + /* Round to block size */ + clen = skb->len; + + esp = x->data; + alen = esp->auth.icv_trunc_len; + tfm = esp->conf.tfm; + blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3; + clen = (clen + 2 + blksize-1)&~(blksize-1); + if (esp->conf.padlen) + clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) + goto error; + + /* Fill padding... 
 */ + do { + int i; + for (i=0; i<clen-skb->len - 2; i++) + *(u8*)(trailer->tail + i) = i+1; + } while (0); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); + + __skb_push(skb, skb->data - skb->nh.raw); + top_iph = skb->nh.iph; + esph = (struct ip_esp_hdr *)(skb->nh.raw + top_iph->ihl*4); + top_iph->tot_len = htons(skb->len + alen); + *(u8*)(trailer->tail - 1) = top_iph->protocol; + + /* this is non-NULL only with UDP Encapsulation */ + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + struct udphdr *uh; + u32 *udpdata32; + + uh = (struct udphdr *)esph; + uh->source = encap->encap_sport; + uh->dest = encap->encap_dport; + uh->len = htons(skb->len + alen - top_iph->ihl*4); + uh->check = 0; + + switch (encap->encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + esph = (struct ip_esp_hdr *)(uh + 1); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + udpdata32 = (u32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + esph = (struct ip_esp_hdr *)(udpdata32 + 2); + break; + } + + top_iph->protocol = IPPROTO_UDP; + } else + top_iph->protocol = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(++x->replay.oseq); + + if (esp->conf.ivlen) + crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + + do { + struct scatterlist *sg = &esp->sgbuf[0]; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto error; + } + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); + crypto_cipher_encrypt(tfm, sg, sg, clen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + } while (0); + + if (esp->conf.ivlen) { + memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + } + + if (esp->auth.icv_full_len) { + esp->auth.icv(esp, skb, (u8*)esph-skb->data, + sizeof(struct ip_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); + pskb_put(skb, trailer, alen); + } + + ip_send_check(top_iph); + + err = 0; + +error: + return err; +} + +/* + * Note: detecting truncated vs. non-truncated authentication data is very + * expensive, so we only support truncated data, which is the recommended + * and common case. + */ +static int esp_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_esp_hdr *esph; + struct esp_data *esp = x->data; + struct sk_buff *trailer; + int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + int alen = esp->auth.icv_trunc_len; + int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; + int nfrags; + int encap_len = 0; + + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) + goto out; + + if (elen <= 0 || (elen & (blksize-1))) + goto out; + + /* If integrity check is required, do this. */ + if (esp->auth.icv_full_len) { + u8 sum[esp->auth.icv_full_len]; + u8 sum1[alen]; + + esp->auth.icv(esp, skb, 0, skb->len-alen, sum); + + if (skb_copy_bits(skb, skb->len-alen, sum1, alen)) + BUG(); + + if (unlikely(memcmp(sum, sum1, alen))) { + x->stats.integrity_failed++; + goto out; + } + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) + goto out; + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ip_esp_hdr*)skb->data; + iph = skb->nh.iph; + + /* Get ivec. This can be wrong, check against another impls. 
*/ + if (esp->conf.ivlen) + crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm)); + + { + u8 nexthdr[2]; + struct scatterlist *sg = &esp->sgbuf[0]; + u8 workbuf[60]; + int padlen; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto out; + } + skb_to_sgvec(skb, sg, sizeof(struct ip_esp_hdr) + esp->conf.ivlen, elen); + crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + padlen = nexthdr[0]; + if (padlen+2 >= elen) + goto out; + + /* ... check padding bits here. Silly. :-) */ + + if (x->encap && decap && decap->decap_type) { + struct esp_decap_data *encap_data; + struct udphdr *uh = (struct udphdr *) (iph+1); + + encap_data = (struct esp_decap_data *) (decap->decap_data); + encap_data->proto = 0; + + switch (decap->decap_type) { + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + encap_data->proto = AF_INET; + encap_data->saddr.a4 = iph->saddr; + encap_data->sport = uh->source; + encap_len = (void*)esph - (void*)uh; + break; + + default: + goto out; + } + } + + iph->protocol = nexthdr[1]; + pskb_trim(skb, skb->len - alen - padlen - 2); + memcpy(workbuf, skb->nh.raw, iph->ihl*4); + skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); + skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; + memcpy(skb->nh.raw, workbuf, iph->ihl*4); + skb->nh.iph->tot_len = htons(skb->len); + } + + return 0; + +out: + return -EINVAL; +} + +static int esp_post_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + + if (x->encap) { + struct xfrm_encap_tmpl *encap; + struct esp_decap_data *decap_data; + + encap = x->encap; + decap_data = (struct esp_decap_data *)(decap->decap_data); + + /* first, make sure that the decap type == the encap type */ + if (encap->encap_type != decap->decap_type) + return -EINVAL; + + switch (encap->encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* + * 1) if the NAT-T peer's IP or port changed then + * advertize the change to the keying daemon. + * This is an inbound SA, so just compare + * SRC ports. + */ + if (decap_data->proto == AF_INET && + (decap_data->saddr.a4 != x->props.saddr.a4 || + decap_data->sport != encap->encap_sport)) { + xfrm_address_t ipaddr; + + ipaddr.a4 = decap_data->saddr.a4; + km_new_mapping(x, &ipaddr, decap_data->sport); + + /* XXX: perhaps add an extra + * policy check here, to see + * if we should allow or + * reject a packet from a + * different source + * address/port. + */ + } + + /* + * 2) ignore UDP/TCP checksums in case + * of NAT-T in Transport Mode, or + * perform other post-processing fixes + * as per * draft-ietf-ipsec-udp-encaps-06, + * section 3.1.2 + */ + if (!x->props.mode) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + break; + } + } + return 0; +} + +static u32 esp4_get_max_size(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + + if (x->props.mode) { + mtu = (mtu + 2 + blksize-1)&~(blksize-1); + } else { + /* The worst case. 
*/ + mtu += 2 + blksize; + } + if (esp->conf.padlen) + mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + return mtu + x->props.header_len + esp->auth.icv_trunc_len; +} + +static void esp4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr*)skb->data; + struct ip_esp_hdr *esph = (struct ip_esp_hdr*)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET); + if (!x) + return; + NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/%08x\n", + ntohl(esph->spi), ntohl(iph->daddr))); + xfrm_state_put(x); +} + +static void esp_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + if (esp->conf.tfm) { + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + } + if (esp->conf.ivec) { + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + } + if (esp->auth.tfm) { + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + } + if (esp->auth.work_icv) { + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; + } + kfree(esp); +} + +static int esp_init_state(struct xfrm_state *x, void *args) +{ + struct esp_data *esp = NULL; + + /* null auth and encryption can have zero length keys */ + if (x->aalg) { + if (x->aalg->alg_key_len > 512) + goto error; + } + if (x->ealg == NULL) + goto error; + + esp = kmalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + memset(esp, 0, sizeof(*esp)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + esp->auth.key = x->aalg->alg_key; + esp->auth.key_len = (x->aalg->alg_key_len+7)/8; + esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (esp->auth.tfm == NULL) + goto error; + esp->auth.icv = esp_hmac_digest; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(esp->auth.tfm)) { + NETDEBUG(printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8)); + goto error; + } + + esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL); + if (!esp->auth.work_icv) + goto error; + } + esp->conf.key = x->ealg->alg_key; + esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + if (x->props.ealgo == SADB_EALG_NULL) + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB); + else + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC); + if (esp->conf.tfm == NULL) + goto error; + esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm); + esp->conf.padlen = 0; + if (esp->conf.ivlen) { + esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL); + if (unlikely(esp->conf.ivec == NULL)) + goto error; + get_random_bytes(esp->conf.ivec, esp->conf.ivlen); + } + if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len)) + goto error; + x->props.header_len = sizeof(struct ip_esp_hdr) + esp->conf.ivlen; + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + if (x->encap) { + struct xfrm_encap_tmpl *encap = x->encap; + + switch (encap->encap_type) { + default: + goto error; + case UDP_ENCAP_ESPINUDP: + x->props.header_len += sizeof(struct udphdr); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + 
x->props.header_len += sizeof(struct udphdr) + 2 * sizeof(u32); + break; + } + } + x->data = esp; + x->props.trailer_len = esp4_get_max_size(x, 0) - x->props.header_len; + return 0; + +error: + x->data = esp; + esp_destroy(x); + x->data = NULL; + return -EINVAL; +} + +static struct xfrm_type esp_type = +{ + .description = "ESP4", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .init_state = esp_init_state, + .destructor = esp_destroy, + .get_max_size = esp4_get_max_size, + .input = esp_input, + .post_input = esp_post_input, + .output = esp_output +}; + +static struct net_protocol esp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = esp4_err, + .no_policy = 1, +}; + +static int __init esp4_init(void) +{ + struct xfrm_decap_state decap; + + if (sizeof(struct esp_decap_data) < + sizeof(decap.decap_data)) { + extern void decap_data_too_small(void); + + decap_data_too_small(); + } + + if (xfrm_register_type(&esp_type, AF_INET) < 0) { + printk(KERN_INFO "ip esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&esp4_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ip esp init: can't add protocol\n"); + xfrm_unregister_type(&esp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit esp4_fini(void) +{ + if (inet_del_protocol(&esp4_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ip esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp_type, AF_INET) < 0) + printk(KERN_INFO "ip esp close: can't remove xfrm type\n"); +} + +module_init(esp4_init); +module_exit(esp4_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c new file mode 100644 index 000000000000..563e7d612706 --- /dev/null +++ b/net/ipv4/fib_frontend.c @@ -0,0 +1,611 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: FIB frontend. + * + * Version: $Id: fib_frontend.c,v 1.26 2001/10/31 21:55:54 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#define FFprint(a...) 
printk(KERN_DEBUG a) + +#ifndef CONFIG_IP_MULTIPLE_TABLES + +#define RT_TABLE_MIN RT_TABLE_MAIN + +struct fib_table *ip_fib_local_table; +struct fib_table *ip_fib_main_table; + +#else + +#define RT_TABLE_MIN 1 + +struct fib_table *fib_tables[RT_TABLE_MAX+1]; + +struct fib_table *__fib_new_table(int id) +{ + struct fib_table *tb; + + tb = fib_hash_init(id); + if (!tb) + return NULL; + fib_tables[id] = tb; + return tb; +} + + +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + +static void fib_flush(void) +{ + int flushed = 0; +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_table *tb; + int id; + + for (id = RT_TABLE_MAX; id>0; id--) { + if ((tb = fib_get_table(id))==NULL) + continue; + flushed += tb->tb_flush(tb); + } +#else /* CONFIG_IP_MULTIPLE_TABLES */ + flushed += ip_fib_main_table->tb_flush(ip_fib_main_table); + flushed += ip_fib_local_table->tb_flush(ip_fib_local_table); +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + + if (flushed) + rt_cache_flush(-1); +} + +/* + * Find the first device with a given source address. + */ + +struct net_device * ip_dev_find(u32 addr) +{ + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct fib_result res; + struct net_device *dev = NULL; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (!ip_fib_local_table || + ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res)) + return NULL; + if (res.type != RTN_LOCAL) + goto out; + dev = FIB_RES_DEV(res); + + if (dev) + dev_hold(dev); +out: + fib_res_put(&res); + return dev; +} + +unsigned inet_addr_type(u32 addr) +{ + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct fib_result res; + unsigned ret = RTN_BROADCAST; + + if (ZERONET(addr) || BADCLASS(addr)) + return RTN_BROADCAST; + if (MULTICAST(addr)) + return RTN_MULTICAST; + +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (ip_fib_local_table) { + ret = RTN_UNICAST; + if (!ip_fib_local_table->tb_lookup(ip_fib_local_table, + &fl, &res)) { + ret = res.type; + fib_res_put(&res); + } + } + return ret; +} + +/* Given (packet source, input interface) and optional (dst, oif, tos): + - (main) check, that source is valid i.e. not broadcast or our local + address. + - figure out what "logical" interface this packet arrived + and calculate "specific destination" address. + - check, that packet arrived from expected physical interface. 
+ */ + +int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, + struct net_device *dev, u32 *spec_dst, u32 *itag) +{ + struct in_device *in_dev; + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = src, + .saddr = dst, + .tos = tos } }, + .iif = oif }; + struct fib_result res; + int no_addr, rpf; + int ret; + + no_addr = rpf = 0; + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (in_dev) { + no_addr = in_dev->ifa_list == NULL; + rpf = IN_DEV_RPFILTER(in_dev); + } + rcu_read_unlock(); + + if (in_dev == NULL) + goto e_inval; + + if (fib_lookup(&fl, &res)) + goto last_resort; + if (res.type != RTN_UNICAST) + goto e_inval_res; + *spec_dst = FIB_RES_PREFSRC(res); + fib_combine_itag(itag, &res); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) +#else + if (FIB_RES_DEV(res) == dev) +#endif + { + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + fib_res_put(&res); + return ret; + } + fib_res_put(&res); + if (no_addr) + goto last_resort; + if (rpf) + goto e_inval; + fl.oif = dev->ifindex; + + ret = 0; + if (fib_lookup(&fl, &res) == 0) { + if (res.type == RTN_UNICAST) { + *spec_dst = FIB_RES_PREFSRC(res); + ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; + } + fib_res_put(&res); + } + return ret; + +last_resort: + if (rpf) + goto e_inval; + *spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + *itag = 0; + return 0; + +e_inval_res: + fib_res_put(&res); +e_inval: + return -EINVAL; +} + +#ifndef CONFIG_IP_NOSIOCRT + +/* + * Handle IP routing ioctl calls. These are used to manipulate the routing tables + */ + +int ip_rt_ioctl(unsigned int cmd, void __user *arg) +{ + int err; + struct kern_rta rta; + struct rtentry r; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + + switch (cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (copy_from_user(&r, arg, sizeof(struct rtentry))) + return -EFAULT; + rtnl_lock(); + err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r); + if (err == 0) { + if (cmd == SIOCDELRT) { + struct fib_table *tb = fib_get_table(req.rtm.rtm_table); + err = -ESRCH; + if (tb) + err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); + } else { + struct fib_table *tb = fib_new_table(req.rtm.rtm_table); + err = -ENOBUFS; + if (tb) + err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + } + if (rta.rta_mx) + kfree(rta.rta_mx); + } + rtnl_unlock(); + return err; + } + return -EINVAL; +} + +#else + +int ip_rt_ioctl(unsigned int cmd, void *arg) +{ + return -EINVAL; +} + +#endif + +static int inet_check_attr(struct rtmsg *r, struct rtattr **rta) +{ + int i; + + for (i=1; i<=RTA_MAX; i++) { + struct rtattr *attr = rta[i-1]; + if (attr) { + if (RTA_PAYLOAD(attr) < 4) + return -EINVAL; + if (i != RTA_MULTIPATH && i != RTA_METRICS) + rta[i-1] = (struct rtattr*)RTA_DATA(attr); + } + } + return 0; +} + +int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (inet_check_attr(r, rta)) + return -EINVAL; + + tb = fib_get_table(r->rtm_table); + if (tb) + return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); + return -ESRCH; +} + +int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct fib_table * tb; + struct rtattr **rta = arg; + struct rtmsg *r = NLMSG_DATA(nlh); + + if (inet_check_attr(r, rta)) + return -EINVAL; + + tb = fib_new_table(r->rtm_table); + if (tb) + 
return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); + return -ENOBUFS; +} + +int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct fib_table *tb; + + if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && + ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) + return ip_rt_dump(skb, cb); + + s_t = cb->args[0]; + if (s_t == 0) + s_t = cb->args[0] = RT_TABLE_MIN; + + for (t=s_t; t<=RT_TABLE_MAX; t++) { + if (t < s_t) continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if ((tb = fib_get_table(t))==NULL) + continue; + if (tb->tb_dump(tb, skb, cb) < 0) + break; + } + + cb->args[0] = t; + + return skb->len; +} + +/* Prepare and feed intra-kernel routing request. + Really, it should be netlink message, but :-( netlink + can be not configured, so that we feed it directly + to fib engine. It is legal, because all events occur + only when netlink is already locked. + */ + +static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa) +{ + struct fib_table * tb; + struct { + struct nlmsghdr nlh; + struct rtmsg rtm; + } req; + struct kern_rta rta; + + memset(&req.rtm, 0, sizeof(req.rtm)); + memset(&rta, 0, sizeof(rta)); + + if (type == RTN_UNICAST) + tb = fib_new_table(RT_TABLE_MAIN); + else + tb = fib_new_table(RT_TABLE_LOCAL); + + if (tb == NULL) + return; + + req.nlh.nlmsg_len = sizeof(req); + req.nlh.nlmsg_type = cmd; + req.nlh.nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE|NLM_F_APPEND; + req.nlh.nlmsg_pid = 0; + req.nlh.nlmsg_seq = 0; + + req.rtm.rtm_dst_len = dst_len; + req.rtm.rtm_table = tb->tb_id; + req.rtm.rtm_protocol = RTPROT_KERNEL; + req.rtm.rtm_scope = (type != RTN_LOCAL ? RT_SCOPE_LINK : RT_SCOPE_HOST); + req.rtm.rtm_type = type; + + rta.rta_dst = &dst; + rta.rta_prefsrc = &ifa->ifa_local; + rta.rta_oif = &ifa->ifa_dev->dev->ifindex; + + if (cmd == RTM_NEWROUTE) + tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); + else + tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); +} + +static void fib_add_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + struct in_ifaddr *prim = ifa; + u32 mask = ifa->ifa_mask; + u32 addr = ifa->ifa_local; + u32 prefix = ifa->ifa_address&mask; + + if (ifa->ifa_flags&IFA_F_SECONDARY) { + prim = inet_ifa_byprefix(in_dev, prefix, mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_add_ifaddr: bug: prim == NULL\n"); + return; + } + } + + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + + if (!(dev->flags&IFF_UP)) + return; + + /* Add broadcast address, if it is explicitly assigned. */ + if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + + if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + (prefix != addr || ifa->ifa_prefixlen < 32)) { + fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? 
RTN_LOCAL : + RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + + /* Add network specific broadcasts, when it takes a sense */ + if (ifa->ifa_prefixlen < 31) { + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + } + } +} + +static void fib_del_ifaddr(struct in_ifaddr *ifa) +{ + struct in_device *in_dev = ifa->ifa_dev; + struct net_device *dev = in_dev->dev; + struct in_ifaddr *ifa1; + struct in_ifaddr *prim = ifa; + u32 brd = ifa->ifa_address|~ifa->ifa_mask; + u32 any = ifa->ifa_address&ifa->ifa_mask; +#define LOCAL_OK 1 +#define BRD_OK 2 +#define BRD0_OK 4 +#define BRD1_OK 8 + unsigned ok = 0; + + if (!(ifa->ifa_flags&IFA_F_SECONDARY)) + fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : + RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + else { + prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); + if (prim == NULL) { + printk(KERN_DEBUG "fib_del_ifaddr: bug: prim == NULL\n"); + return; + } + } + + /* Deletion is more complicated than add. + We should take care of not to delete too much :-) + + Scan address list to be sure that addresses are really gone. + */ + + for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { + if (ifa->ifa_local == ifa1->ifa_local) + ok |= LOCAL_OK; + if (ifa->ifa_broadcast == ifa1->ifa_broadcast) + ok |= BRD_OK; + if (brd == ifa1->ifa_broadcast) + ok |= BRD1_OK; + if (any == ifa1->ifa_broadcast) + ok |= BRD0_OK; + } + + if (!(ok&BRD_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + if (!(ok&BRD1_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + if (!(ok&BRD0_OK)) + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + if (!(ok&LOCAL_OK)) { + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + + /* Check, that this local address finally disappeared. */ + if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + /* And the last, but not the least thing. + We must flush stray FIB entries. + + First of all, we scan fib_info list searching + for stray nexthop entries, then ignite fib_flush. + */ + if (fib_sync_down(ifa->ifa_local, NULL, 0)) + fib_flush(); + } + } +#undef LOCAL_OK +#undef BRD_OK +#undef BRD0_OK +#undef BRD1_OK +} + +static void fib_disable_ip(struct net_device *dev, int force) +{ + if (fib_sync_down(0, dev, force)) + fib_flush(); + rt_cache_flush(0); + arp_ifdown(dev); +} + +static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + + switch (event) { + case NETDEV_UP: + fib_add_ifaddr(ifa); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(ifa->ifa_dev->dev); +#endif + rt_cache_flush(-1); + break; + case NETDEV_DOWN: + fib_del_ifaddr(ifa); + if (ifa->ifa_dev && ifa->ifa_dev->ifa_list == NULL) { + /* Last address was deleted from this interface. + Disable IP. 
+ */ + fib_disable_ip(ifa->ifa_dev->dev, 1); + } else { + rt_cache_flush(-1); + } + break; + } + return NOTIFY_DONE; +} + +static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct in_device *in_dev = __in_dev_get(dev); + + if (event == NETDEV_UNREGISTER) { + fib_disable_ip(dev, 2); + return NOTIFY_DONE; + } + + if (!in_dev) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + for_ifa(in_dev) { + fib_add_ifaddr(ifa); + } endfor_ifa(in_dev); +#ifdef CONFIG_IP_ROUTE_MULTIPATH + fib_sync_up(dev); +#endif + rt_cache_flush(-1); + break; + case NETDEV_DOWN: + fib_disable_ip(dev, 0); + break; + case NETDEV_CHANGEMTU: + case NETDEV_CHANGE: + rt_cache_flush(0); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block fib_inetaddr_notifier = { + .notifier_call =fib_inetaddr_event, +}; + +static struct notifier_block fib_netdev_notifier = { + .notifier_call =fib_netdev_event, +}; + +void __init ip_fib_init(void) +{ +#ifndef CONFIG_IP_MULTIPLE_TABLES + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); +#else + fib_rules_init(); +#endif + + register_netdevice_notifier(&fib_netdev_notifier); + register_inetaddr_notifier(&fib_inetaddr_notifier); +} + +EXPORT_SYMBOL(inet_addr_type); +EXPORT_SYMBOL(ip_dev_find); +EXPORT_SYMBOL(ip_rt_ioctl); diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c new file mode 100644 index 000000000000..6506dcc01b46 --- /dev/null +++ b/net/ipv4/fib_hash.c @@ -0,0 +1,1086 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 FIB: lookup engine and maintenance routines. + * + * Version: $Id: fib_hash.c,v 1.13 2001/10/31 21:55:54 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "fib_lookup.h" + +static kmem_cache_t *fn_hash_kmem; +static kmem_cache_t *fn_alias_kmem; + +struct fib_node { + struct hlist_node fn_hash; + struct list_head fn_alias; + u32 fn_key; +}; + +struct fn_zone { + struct fn_zone *fz_next; /* Next not empty zone */ + struct hlist_head *fz_hash; /* Hash table pointer */ + int fz_nent; /* Number of entries */ + + int fz_divisor; /* Hash divisor */ + u32 fz_hashmask; /* (fz_divisor - 1) */ +#define FZ_HASHMASK(fz) ((fz)->fz_hashmask) + + int fz_order; /* Zone order */ + u32 fz_mask; +#define FZ_MASK(fz) ((fz)->fz_mask) +}; + +/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask + * can be cheaper than memory lookup, so that FZ_* macros are used. 
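+ *
+ * Illustrative example, for a /24 zone as fn_new_zone() below sets it up:
+ *	fz_order    = 24
+ *	fz_mask     = inet_make_mask(24)	(the /24 netmask, network order)
+ *	fz_hashmask = fz_divisor - 1		(15 for the initial 16 buckets)
+ * so fz_key(dst, fz) keeps only the network part of dst and fn_hash()
+ * folds ntohl(key) >> (32 - 24) down to a bucket index below fz_divisor.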
+ */ + +struct fn_hash { + struct fn_zone *fn_zones[33]; + struct fn_zone *fn_zone_list; +}; + +static inline u32 fn_hash(u32 key, struct fn_zone *fz) +{ + u32 h = ntohl(key)>>(32 - fz->fz_order); + h ^= (h>>20); + h ^= (h>>10); + h ^= (h>>5); + h &= FZ_HASHMASK(fz); + return h; +} + +static inline u32 fz_key(u32 dst, struct fn_zone *fz) +{ + return dst & FZ_MASK(fz); +} + +static DEFINE_RWLOCK(fib_hash_lock); +static unsigned int fib_hash_genid; + +#define FZ_MAX_DIVISOR ((PAGE_SIZE<fn_hash); + + new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + hlist_add_head(&f->fn_hash, new_head); + } + } +} + +static void fz_hash_free(struct hlist_head *hash, int divisor) +{ + unsigned long size = divisor * sizeof(struct hlist_head); + + if (size <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long)hash, get_order(size)); +} + +static void fn_rehash_zone(struct fn_zone *fz) +{ + struct hlist_head *ht, *old_ht; + int old_divisor, new_divisor; + u32 new_hashmask; + + old_divisor = fz->fz_divisor; + + switch (old_divisor) { + case 16: + new_divisor = 256; + break; + case 256: + new_divisor = 1024; + break; + default: + if ((old_divisor << 1) > FZ_MAX_DIVISOR) { + printk(KERN_CRIT "route.c: bad divisor %d!\n", old_divisor); + return; + } + new_divisor = (old_divisor << 1); + break; + } + + new_hashmask = (new_divisor - 1); + +#if RT_CACHE_DEBUG >= 2 + printk("fn_rehash_zone: hash for zone %d grows from %d\n", fz->fz_order, old_divisor); +#endif + + ht = fz_hash_alloc(new_divisor); + + if (ht) { + memset(ht, 0, new_divisor * sizeof(struct hlist_head)); + + write_lock_bh(&fib_hash_lock); + old_ht = fz->fz_hash; + fz->fz_hash = ht; + fz->fz_hashmask = new_hashmask; + fz->fz_divisor = new_divisor; + fn_rebuild_zone(fz, old_ht, old_divisor); + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fz_hash_free(old_ht, old_divisor); + } +} + +static inline void fn_free_node(struct fib_node * f) +{ + kmem_cache_free(fn_hash_kmem, f); +} + +static inline void fn_free_alias(struct fib_alias *fa) +{ + fib_release_info(fa->fa_info); + kmem_cache_free(fn_alias_kmem, fa); +} + +static struct fn_zone * +fn_new_zone(struct fn_hash *table, int z) +{ + int i; + struct fn_zone *fz = kmalloc(sizeof(struct fn_zone), GFP_KERNEL); + if (!fz) + return NULL; + + memset(fz, 0, sizeof(struct fn_zone)); + if (z) { + fz->fz_divisor = 16; + } else { + fz->fz_divisor = 1; + } + fz->fz_hashmask = (fz->fz_divisor - 1); + fz->fz_hash = fz_hash_alloc(fz->fz_divisor); + if (!fz->fz_hash) { + kfree(fz); + return NULL; + } + memset(fz->fz_hash, 0, fz->fz_divisor * sizeof(struct hlist_head *)); + fz->fz_order = z; + fz->fz_mask = inet_make_mask(z); + + /* Find the first not empty zone with more specific mask */ + for (i=z+1; i<=32; i++) + if (table->fn_zones[i]) + break; + write_lock_bh(&fib_hash_lock); + if (i>32) { + /* No more specific masks, we are the first. 
*/ + fz->fz_next = table->fn_zone_list; + table->fn_zone_list = fz; + } else { + fz->fz_next = table->fn_zones[i]->fz_next; + table->fn_zones[i]->fz_next = fz; + } + table->fn_zones[z] = fz; + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + return fz; +} + +static int +fn_hash_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +{ + int err; + struct fn_zone *fz; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + + read_lock(&fib_hash_lock); + for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + struct hlist_head *head; + struct hlist_node *node; + struct fib_node *f; + u32 k = fz_key(flp->fl4_dst, fz); + + head = &fz->fz_hash[fn_hash(k, fz)]; + hlist_for_each_entry(f, node, head, fn_hash) { + if (f->fn_key != k) + continue; + + err = fib_semantic_match(&f->fn_alias, + flp, res, + f->fn_key, fz->fz_mask, + fz->fz_order); + if (err <= 0) + goto out; + } + } + err = 1; +out: + read_unlock(&fib_hash_lock); + return err; +} + +static int fn_hash_last_dflt=-1; + +static void +fn_hash_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res) +{ + int order, last_idx; + struct hlist_node *node; + struct fib_node *f; + struct fib_info *fi = NULL; + struct fib_info *last_resort; + struct fn_hash *t = (struct fn_hash*)tb->tb_data; + struct fn_zone *fz = t->fn_zones[0]; + + if (fz == NULL) + return; + + last_idx = -1; + last_resort = NULL; + order = -1; + + read_lock(&fib_hash_lock); + hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { + struct fib_alias *fa; + + list_for_each_entry(fa, &f->fn_alias, fa_list) { + struct fib_info *next_fi = fa->fa_info; + + if (fa->fa_scope != res->scope || + fa->fa_type != RTN_UNICAST) + continue; + + if (next_fi->fib_priority > res->fi->fib_priority) + break; + if (!next_fi->fib_nh[0].nh_gw || + next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) + continue; + fa->fa_state |= FA_S_ACCESSED; + + if (fi == NULL) { + if (next_fi != res->fi) + break; + } else if (!fib_detect_death(fi, order, &last_resort, + &last_idx, &fn_hash_last_dflt)) { + if (res->fi) + fib_info_put(res->fi); + res->fi = fi; + atomic_inc(&fi->fib_clntref); + fn_hash_last_dflt = order; + goto out; + } + fi = next_fi; + order++; + } + } + + if (order <= 0 || fi == NULL) { + fn_hash_last_dflt = -1; + goto out; + } + + if (!fib_detect_death(fi, order, &last_resort, &last_idx, &fn_hash_last_dflt)) { + if (res->fi) + fib_info_put(res->fi); + res->fi = fi; + atomic_inc(&fi->fib_clntref); + fn_hash_last_dflt = order; + goto out; + } + + if (last_idx >= 0) { + if (res->fi) + fib_info_put(res->fi); + res->fi = last_resort; + if (last_resort) + atomic_inc(&last_resort->fib_clntref); + } + fn_hash_last_dflt = last_idx; +out: + read_unlock(&fib_hash_lock); +} + +/* Insert node F to FZ. */ +static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) +{ + struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; + + hlist_add_head(&f->fn_hash, head); +} + +/* Return the node in FZ matching KEY. 
*/ +static struct fib_node *fib_find_node(struct fn_zone *fz, u32 key) +{ + struct hlist_head *head = &fz->fz_hash[fn_hash(key, fz)]; + struct hlist_node *node; + struct fib_node *f; + + hlist_for_each_entry(f, node, head, fn_hash) { + if (f->fn_key == key) + return f; + } + + return NULL; +} + +static int +fn_hash_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fib_node *new_f, *f; + struct fib_alias *fa, *new_fa; + struct fn_zone *fz; + struct fib_info *fi; + int z = r->rtm_dst_len; + int type = r->rtm_type; + u8 tos = r->rtm_tos; + u32 key; + int err; + + if (z > 32) + return -EINVAL; + fz = table->fn_zones[z]; + if (!fz && !(fz = fn_new_zone(table, z))) + return -ENOBUFS; + + key = 0; + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + if ((fi = fib_create_info(r, rta, n, &err)) == NULL) + return err; + + if (fz->fz_nent > (fz->fz_divisor<<1) && + fz->fz_divisor < FZ_MAX_DIVISOR && + (z==32 || (1< fz->fz_divisor)) + fn_rehash_zone(fz); + + f = fib_find_node(fz, key); + + if (!f) + fa = NULL; + else + fa = fib_find_alias(&f->fn_alias, tos, fi->fib_priority); + + /* Now fa, if non-NULL, points to the first fib alias + * with the same keys [prefix,tos,priority], if such key already + * exists or to the node before which we will insert new one. + * + * If fa is NULL, we will need to allocate a new one and + * insert to the head of f. + * + * If f is NULL, no fib node matched the destination key + * and we need to allocate a new one of those as well. + */ + + if (fa && fa->fa_tos == tos && + fa->fa_info->fib_priority == fi->fib_priority) { + struct fib_alias *fa_orig; + + err = -EEXIST; + if (n->nlmsg_flags & NLM_F_EXCL) + goto out; + + if (n->nlmsg_flags & NLM_F_REPLACE) { + struct fib_info *fi_drop; + u8 state; + + write_lock_bh(&fib_hash_lock); + fi_drop = fa->fa_info; + fa->fa_info = fi; + fa->fa_type = type; + fa->fa_scope = r->rtm_scope; + state = fa->fa_state; + fa->fa_state &= ~FA_S_ACCESSED; + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fib_release_info(fi_drop); + if (state & FA_S_ACCESSED) + rt_cache_flush(-1); + return 0; + } + + /* Error if we find a perfect match which + * uses the same scope, type, and nexthop + * information. + */ + fa_orig = fa; + fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); + list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { + if (fa->fa_tos != tos) + break; + if (fa->fa_info->fib_priority != fi->fib_priority) + break; + if (fa->fa_type == type && + fa->fa_scope == r->rtm_scope && + fa->fa_info == fi) + goto out; + } + if (!(n->nlmsg_flags & NLM_F_APPEND)) + fa = fa_orig; + } + + err = -ENOENT; + if (!(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + + err = -ENOBUFS; + new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL); + if (new_fa == NULL) + goto out; + + new_f = NULL; + if (!f) { + new_f = kmem_cache_alloc(fn_hash_kmem, SLAB_KERNEL); + if (new_f == NULL) + goto out_free_new_fa; + + INIT_HLIST_NODE(&new_f->fn_hash); + INIT_LIST_HEAD(&new_f->fn_alias); + new_f->fn_key = key; + f = new_f; + } + + new_fa->fa_info = fi; + new_fa->fa_tos = tos; + new_fa->fa_type = type; + new_fa->fa_scope = r->rtm_scope; + new_fa->fa_state = 0; + + /* + * Insert new entry to the list. + */ + + write_lock_bh(&fib_hash_lock); + if (new_f) + fib_insert_node(fz, new_f); + list_add_tail(&new_fa->fa_list, + (fa ? 
&fa->fa_list : &f->fn_alias)); + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + if (new_f) + fz->fz_nent++; + rt_cache_flush(-1); + + rtmsg_fib(RTM_NEWROUTE, key, new_fa, z, tb->tb_id, n, req); + return 0; + +out_free_new_fa: + kmem_cache_free(fn_alias_kmem, new_fa); +out: + fib_release_info(fi); + return err; +} + + +static int +fn_hash_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + struct fib_node *f; + struct fib_alias *fa, *fa_to_delete; + int z = r->rtm_dst_len; + struct fn_zone *fz; + u32 key; + u8 tos = r->rtm_tos; + + if (z > 32) + return -EINVAL; + if ((fz = table->fn_zones[z]) == NULL) + return -ESRCH; + + key = 0; + if (rta->rta_dst) { + u32 dst; + memcpy(&dst, rta->rta_dst, 4); + if (dst & ~FZ_MASK(fz)) + return -EINVAL; + key = fz_key(dst, fz); + } + + f = fib_find_node(fz, key); + + if (!f) + fa = NULL; + else + fa = fib_find_alias(&f->fn_alias, tos, 0); + if (!fa) + return -ESRCH; + + fa_to_delete = NULL; + fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list); + list_for_each_entry_continue(fa, &f->fn_alias, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fa->fa_tos != tos) + break; + + if ((!r->rtm_type || + fa->fa_type == r->rtm_type) && + (r->rtm_scope == RT_SCOPE_NOWHERE || + fa->fa_scope == r->rtm_scope) && + (!r->rtm_protocol || + fi->fib_protocol == r->rtm_protocol) && + fib_nh_match(r, n, rta, fi) == 0) { + fa_to_delete = fa; + break; + } + } + + if (fa_to_delete) { + int kill_fn; + + fa = fa_to_delete; + rtmsg_fib(RTM_DELROUTE, key, fa, z, tb->tb_id, n, req); + + kill_fn = 0; + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + if (list_empty(&f->fn_alias)) { + hlist_del(&f->fn_hash); + kill_fn = 1; + } + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + if (fa->fa_state & FA_S_ACCESSED) + rt_cache_flush(-1); + fn_free_alias(fa); + if (kill_fn) { + fn_free_node(f); + fz->fz_nent--; + } + + return 0; + } + return -ESRCH; +} + +static int fn_flush_list(struct fn_zone *fz, int idx) +{ + struct hlist_head *head = &fz->fz_hash[idx]; + struct hlist_node *node, *n; + struct fib_node *f; + int found = 0; + + hlist_for_each_entry_safe(f, node, n, head, fn_hash) { + struct fib_alias *fa, *fa_node; + int kill_f; + + kill_f = 0; + list_for_each_entry_safe(fa, fa_node, &f->fn_alias, fa_list) { + struct fib_info *fi = fa->fa_info; + + if (fi && (fi->fib_flags&RTNH_F_DEAD)) { + write_lock_bh(&fib_hash_lock); + list_del(&fa->fa_list); + if (list_empty(&f->fn_alias)) { + hlist_del(&f->fn_hash); + kill_f = 1; + } + fib_hash_genid++; + write_unlock_bh(&fib_hash_lock); + + fn_free_alias(fa); + found++; + } + } + if (kill_f) { + fn_free_node(f); + fz->fz_nent--; + } + } + return found; +} + +static int fn_hash_flush(struct fib_table *tb) +{ + struct fn_hash *table = (struct fn_hash *) tb->tb_data; + struct fn_zone *fz; + int found = 0; + + for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + int i; + + for (i = fz->fz_divisor - 1; i >= 0; i--) + found += fn_flush_list(fz, i); + } + return found; +} + + +static inline int +fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz, + struct hlist_head *head) +{ + struct hlist_node *node; + struct fib_node *f; + int i, s_i; + + s_i = cb->args[3]; + i = 0; + hlist_for_each_entry(f, node, head, fn_hash) { + struct fib_alias *fa; + + list_for_each_entry(fa, &f->fn_alias, fa_list) { + if (i < s_i) + goto next; 
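+			/* cb->args[3] holds the number of aliases of this
+			 * bucket that have already been dumped, so a netlink
+			 * dump that ran out of skb space can be resumed here
+			 * and skip the entries it has sent before.
+			 */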
+ + if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, + tb->tb_id, + fa->fa_type, + fa->fa_scope, + &f->fn_key, + fz->fz_order, + fa->fa_tos, + fa->fa_info) < 0) { + cb->args[3] = i; + return -1; + } + next: + i++; + } + } + cb->args[3] = i; + return skb->len; +} + +static inline int +fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, + struct fib_table *tb, + struct fn_zone *fz) +{ + int h, s_h; + + s_h = cb->args[2]; + for (h=0; h < fz->fz_divisor; h++) { + if (h < s_h) continue; + if (h > s_h) + memset(&cb->args[3], 0, + sizeof(cb->args) - 3*sizeof(cb->args[0])); + if (fz->fz_hash == NULL || + hlist_empty(&fz->fz_hash[h])) + continue; + if (fn_hash_dump_bucket(skb, cb, tb, fz, &fz->fz_hash[h])<0) { + cb->args[2] = h; + return -1; + } + } + cb->args[2] = h; + return skb->len; +} + +static int fn_hash_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) +{ + int m, s_m; + struct fn_zone *fz; + struct fn_hash *table = (struct fn_hash*)tb->tb_data; + + s_m = cb->args[1]; + read_lock(&fib_hash_lock); + for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { + if (m < s_m) continue; + if (m > s_m) + memset(&cb->args[2], 0, + sizeof(cb->args) - 2*sizeof(cb->args[0])); + if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { + cb->args[1] = m; + read_unlock(&fib_hash_lock); + return -1; + } + } + read_unlock(&fib_hash_lock); + cb->args[1] = m; + return skb->len; +} + +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_table * fib_hash_init(int id) +#else +struct fib_table * __init fib_hash_init(int id) +#endif +{ + struct fib_table *tb; + + if (fn_hash_kmem == NULL) + fn_hash_kmem = kmem_cache_create("ip_fib_hash", + sizeof(struct fib_node), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (fn_alias_kmem == NULL) + fn_alias_kmem = kmem_cache_create("ip_fib_alias", + sizeof(struct fib_alias), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + tb = kmalloc(sizeof(struct fib_table) + sizeof(struct fn_hash), + GFP_KERNEL); + if (tb == NULL) + return NULL; + + tb->tb_id = id; + tb->tb_lookup = fn_hash_lookup; + tb->tb_insert = fn_hash_insert; + tb->tb_delete = fn_hash_delete; + tb->tb_flush = fn_hash_flush; + tb->tb_select_default = fn_hash_select_default; + tb->tb_dump = fn_hash_dump; + memset(tb->tb_data, 0, sizeof(struct fn_hash)); + return tb; +} + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +struct fib_iter_state { + struct fn_zone *zone; + int bucket; + struct hlist_head *hash_head; + struct fib_node *fn; + struct fib_alias *fa; + loff_t pos; + unsigned int genid; + int valid; +}; + +static struct fib_alias *fib_get_first(struct seq_file *seq) +{ + struct fib_iter_state *iter = seq->private; + struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data; + + iter->bucket = 0; + iter->hash_head = NULL; + iter->fn = NULL; + iter->fa = NULL; + iter->pos = 0; + iter->genid = fib_hash_genid; + iter->valid = 1; + + for (iter->zone = table->fn_zone_list; iter->zone; + iter->zone = iter->zone->fz_next) { + int maxslot; + + if (!iter->zone->fz_nent) + continue; + + iter->hash_head = iter->zone->fz_hash; + maxslot = iter->zone->fz_divisor; + + for (iter->bucket = 0; iter->bucket < maxslot; + ++iter->bucket, ++iter->hash_head) { + struct hlist_node *node; + struct fib_node *fn; + + hlist_for_each_entry(fn,node,iter->hash_head,fn_hash) { + struct fib_alias *fa; + + list_for_each_entry(fa,&fn->fn_alias,fa_list) { + iter->fn = fn; + iter->fa = fa; + goto out; + } + } + } + } 
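+
+	/* Every zone was scanned without finding an entry: iter->fn and
+	 * iter->fa are still NULL and the iteration starts out empty.
+	 */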
+out: + return iter->fa; +} + +static struct fib_alias *fib_get_next(struct seq_file *seq) +{ + struct fib_iter_state *iter = seq->private; + struct fib_node *fn; + struct fib_alias *fa; + + /* Advance FA, if any. */ + fn = iter->fn; + fa = iter->fa; + if (fa) { + BUG_ON(!fn); + list_for_each_entry_continue(fa, &fn->fn_alias, fa_list) { + iter->fa = fa; + goto out; + } + } + + fa = iter->fa = NULL; + + /* Advance FN. */ + if (fn) { + struct hlist_node *node = &fn->fn_hash; + hlist_for_each_entry_continue(fn, node, fn_hash) { + iter->fn = fn; + + list_for_each_entry(fa, &fn->fn_alias, fa_list) { + iter->fa = fa; + goto out; + } + } + } + + fn = iter->fn = NULL; + + /* Advance hash chain. */ + if (!iter->zone) + goto out; + + for (;;) { + struct hlist_node *node; + int maxslot; + + maxslot = iter->zone->fz_divisor; + + while (++iter->bucket < maxslot) { + iter->hash_head++; + + hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { + list_for_each_entry(fa, &fn->fn_alias, fa_list) { + iter->fn = fn; + iter->fa = fa; + goto out; + } + } + } + + iter->zone = iter->zone->fz_next; + + if (!iter->zone) + goto out; + + iter->bucket = 0; + iter->hash_head = iter->zone->fz_hash; + + hlist_for_each_entry(fn, node, iter->hash_head, fn_hash) { + list_for_each_entry(fa, &fn->fn_alias, fa_list) { + iter->fn = fn; + iter->fa = fa; + goto out; + } + } + } +out: + iter->pos++; + return fa; +} + +static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) +{ + struct fib_iter_state *iter = seq->private; + struct fib_alias *fa; + + if (iter->valid && pos >= iter->pos && iter->genid == fib_hash_genid) { + fa = iter->fa; + pos -= iter->pos; + } else + fa = fib_get_first(seq); + + if (fa) + while (pos && (fa = fib_get_next(seq))) + --pos; + return pos ? NULL : fa; +} + +static void *fib_seq_start(struct seq_file *seq, loff_t *pos) +{ + void *v = NULL; + + read_lock(&fib_hash_lock); + if (ip_fib_main_table) + v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; + return v; +} + +static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return v == SEQ_START_TOKEN ? fib_get_first(seq) : fib_get_next(seq); +} + +static void fib_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&fib_hash_lock); +} + +static unsigned fib_flag_trans(int type, u32 mask, struct fib_info *fi) +{ + static unsigned type2flags[RTN_MAX + 1] = { + [7] = RTF_REJECT, [8] = RTF_REJECT, + }; + unsigned flags = type2flags[type]; + + if (fi && fi->fib_nh->nh_gw) + flags |= RTF_GATEWAY; + if (mask == 0xFFFFFFFF) + flags |= RTF_HOST; + flags |= RTF_UP; + return flags; +} + +/* + * This outputs /proc/net/route. + * + * It always works in backward compatibility mode. + * The format of the file is not supposed to be changed. + */ +static int fib_seq_show(struct seq_file *seq, void *v) +{ + struct fib_iter_state *iter; + char bf[128]; + u32 prefix, mask; + unsigned flags; + struct fib_node *f; + struct fib_alias *fa; + struct fib_info *fi; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway " + "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU" + "\tWindow\tIRTT"); + goto out; + } + + iter = seq->private; + f = iter->fn; + fa = iter->fa; + fi = fa->fa_info; + prefix = f->fn_key; + mask = FZ_MASK(iter->zone); + flags = fib_flag_trans(fa->fa_type, mask, fi); + if (fi) + snprintf(bf, sizeof(bf), + "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + fi->fib_dev ? 
fi->fib_dev->name : "*", prefix, + fi->fib_nh->nh_gw, flags, 0, 0, fi->fib_priority, + mask, (fi->fib_advmss ? fi->fib_advmss + 40 : 0), + fi->fib_window, + fi->fib_rtt >> 3); + else + snprintf(bf, sizeof(bf), + "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", + prefix, 0, flags, 0, 0, 0, mask, 0, 0, 0); + seq_printf(seq, "%-127s\n", bf); +out: + return 0; +} + +static struct seq_operations fib_seq_ops = { + .start = fib_seq_start, + .next = fib_seq_next, + .stop = fib_seq_stop, + .show = fib_seq_show, +}; + +static int fib_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct fib_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &fib_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations fib_seq_fops = { + .owner = THIS_MODULE, + .open = fib_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init fib_proc_init(void) +{ + if (!proc_net_fops_create("route", S_IRUGO, &fib_seq_fops)) + return -ENOMEM; + return 0; +} + +void __init fib_proc_exit(void) +{ + proc_net_remove("route"); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h new file mode 100644 index 000000000000..ac4485f75e97 --- /dev/null +++ b/net/ipv4/fib_lookup.h @@ -0,0 +1,43 @@ +#ifndef _FIB_LOOKUP_H +#define _FIB_LOOKUP_H + +#include +#include +#include + +struct fib_alias { + struct list_head fa_list; + struct fib_info *fa_info; + u8 fa_tos; + u8 fa_type; + u8 fa_scope; + u8 fa_state; +}; + +#define FA_S_ACCESSED 0x01 + +/* Exported by fib_semantics.c */ +extern int fib_semantic_match(struct list_head *head, + const struct flowi *flp, + struct fib_result *res, __u32 zone, __u32 mask, + int prefixlen); +extern void fib_release_info(struct fib_info *); +extern struct fib_info *fib_create_info(const struct rtmsg *r, + struct kern_rta *rta, + const struct nlmsghdr *, + int *err); +extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, + struct kern_rta *rta, struct fib_info *fi); +extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, + int dst_len, u8 tos, struct fib_info *fi); +extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, + int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req); +extern struct fib_alias *fib_find_alias(struct list_head *fah, + u8 tos, u32 prio); +extern int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, + int *last_idx, int *dflt); + +#endif /* _FIB_LOOKUP_H */ diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c new file mode 100644 index 000000000000..39d0aadb9a2a --- /dev/null +++ b/net/ipv4/fib_rules.c @@ -0,0 +1,437 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: policy rules. + * + * Version: $Id: fib_rules.c,v 1.17 2001/10/31 21:55:54 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
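+ *
+ * A lookup walks these rules in preference order: local_rule, main_rule
+ * and default_rule are built in below, RTM_NEWRULE inserts further ones
+ * sorted by r_preference, and the first rule that matches the flow
+ * selects the fib_table that is actually queried.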
+ * + * Fixes: + * Rani Assaf : local_rule cannot be deleted + * Marc Boucher : routing by fwmark + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define FRprintk(a...) + +struct fib_rule +{ + struct fib_rule *r_next; + atomic_t r_clntref; + u32 r_preference; + unsigned char r_table; + unsigned char r_action; + unsigned char r_dst_len; + unsigned char r_src_len; + u32 r_src; + u32 r_srcmask; + u32 r_dst; + u32 r_dstmask; + u32 r_srcmap; + u8 r_flags; + u8 r_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + u32 r_fwmark; +#endif + int r_ifindex; +#ifdef CONFIG_NET_CLS_ROUTE + __u32 r_tclassid; +#endif + char r_ifname[IFNAMSIZ]; + int r_dead; +}; + +static struct fib_rule default_rule = { + .r_clntref = ATOMIC_INIT(2), + .r_preference = 0x7FFF, + .r_table = RT_TABLE_DEFAULT, + .r_action = RTN_UNICAST, +}; + +static struct fib_rule main_rule = { + .r_next = &default_rule, + .r_clntref = ATOMIC_INIT(2), + .r_preference = 0x7FFE, + .r_table = RT_TABLE_MAIN, + .r_action = RTN_UNICAST, +}; + +static struct fib_rule local_rule = { + .r_next = &main_rule, + .r_clntref = ATOMIC_INIT(2), + .r_table = RT_TABLE_LOCAL, + .r_action = RTN_UNICAST, +}; + +static struct fib_rule *fib_rules = &local_rule; +static DEFINE_RWLOCK(fib_rules_lock); + +int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, **rp; + int err = -ESRCH; + + for (rp=&fib_rules; (r=*rp) != NULL; rp=&r->r_next) { + if ((!rta[RTA_SRC-1] || memcmp(RTA_DATA(rta[RTA_SRC-1]), &r->r_src, 4) == 0) && + rtm->rtm_src_len == r->r_src_len && + rtm->rtm_dst_len == r->r_dst_len && + (!rta[RTA_DST-1] || memcmp(RTA_DATA(rta[RTA_DST-1]), &r->r_dst, 4) == 0) && + rtm->rtm_tos == r->r_tos && +#ifdef CONFIG_IP_ROUTE_FWMARK + (!rta[RTA_PROTOINFO-1] || memcmp(RTA_DATA(rta[RTA_PROTOINFO-1]), &r->r_fwmark, 4) == 0) && +#endif + (!rtm->rtm_type || rtm->rtm_type == r->r_action) && + (!rta[RTA_PRIORITY-1] || memcmp(RTA_DATA(rta[RTA_PRIORITY-1]), &r->r_preference, 4) == 0) && + (!rta[RTA_IIF-1] || rtattr_strcmp(rta[RTA_IIF-1], r->r_ifname) == 0) && + (!rtm->rtm_table || (r && rtm->rtm_table == r->r_table))) { + err = -EPERM; + if (r == &local_rule) + break; + + write_lock_bh(&fib_rules_lock); + *rp = r->r_next; + r->r_dead = 1; + write_unlock_bh(&fib_rules_lock); + fib_rule_put(r); + err = 0; + break; + } + } + return err; +} + +/* Allocate new unique table id */ + +static struct fib_table *fib_empty_table(void) +{ + int id; + + for (id = 1; id <= RT_TABLE_MAX; id++) + if (fib_tables[id] == NULL) + return __fib_new_table(id); + return NULL; +} + +void fib_rule_put(struct fib_rule *r) +{ + if (atomic_dec_and_test(&r->r_clntref)) { + if (r->r_dead) + kfree(r); + else + printk("Freeing alive rule %p\n", r); + } +} + +int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct fib_rule *r, *new_r, **rp; + unsigned char table_id; + + if (rtm->rtm_src_len > 32 || rtm->rtm_dst_len > 32 || + (rtm->rtm_tos & ~IPTOS_TOS_MASK)) + return -EINVAL; + + if (rta[RTA_IIF-1] && RTA_PAYLOAD(rta[RTA_IIF-1]) > IFNAMSIZ) + return -EINVAL; + + table_id = rtm->rtm_table; + if (table_id == RT_TABLE_UNSPEC) { + struct fib_table *table; + if (rtm->rtm_type == RTN_UNICAST) { + if 
((table = fib_empty_table()) == NULL) + return -ENOBUFS; + table_id = table->tb_id; + } + } + + new_r = kmalloc(sizeof(*new_r), GFP_KERNEL); + if (!new_r) + return -ENOMEM; + memset(new_r, 0, sizeof(*new_r)); + if (rta[RTA_SRC-1]) + memcpy(&new_r->r_src, RTA_DATA(rta[RTA_SRC-1]), 4); + if (rta[RTA_DST-1]) + memcpy(&new_r->r_dst, RTA_DATA(rta[RTA_DST-1]), 4); + if (rta[RTA_GATEWAY-1]) + memcpy(&new_r->r_srcmap, RTA_DATA(rta[RTA_GATEWAY-1]), 4); + new_r->r_src_len = rtm->rtm_src_len; + new_r->r_dst_len = rtm->rtm_dst_len; + new_r->r_srcmask = inet_make_mask(rtm->rtm_src_len); + new_r->r_dstmask = inet_make_mask(rtm->rtm_dst_len); + new_r->r_tos = rtm->rtm_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + if (rta[RTA_PROTOINFO-1]) + memcpy(&new_r->r_fwmark, RTA_DATA(rta[RTA_PROTOINFO-1]), 4); +#endif + new_r->r_action = rtm->rtm_type; + new_r->r_flags = rtm->rtm_flags; + if (rta[RTA_PRIORITY-1]) + memcpy(&new_r->r_preference, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + new_r->r_table = table_id; + if (rta[RTA_IIF-1]) { + struct net_device *dev; + rtattr_strlcpy(new_r->r_ifname, rta[RTA_IIF-1], IFNAMSIZ); + new_r->r_ifindex = -1; + dev = __dev_get_by_name(new_r->r_ifname); + if (dev) + new_r->r_ifindex = dev->ifindex; + } +#ifdef CONFIG_NET_CLS_ROUTE + if (rta[RTA_FLOW-1]) + memcpy(&new_r->r_tclassid, RTA_DATA(rta[RTA_FLOW-1]), 4); +#endif + + rp = &fib_rules; + if (!new_r->r_preference) { + r = fib_rules; + if (r && (r = r->r_next) != NULL) { + rp = &fib_rules->r_next; + if (r->r_preference) + new_r->r_preference = r->r_preference - 1; + } + } + + while ( (r = *rp) != NULL ) { + if (r->r_preference > new_r->r_preference) + break; + rp = &r->r_next; + } + + new_r->r_next = r; + atomic_inc(&new_r->r_clntref); + write_lock_bh(&fib_rules_lock); + *rp = new_r; + write_unlock_bh(&fib_rules_lock); + return 0; +} + +#ifdef CONFIG_NET_CLS_ROUTE +u32 fib_rules_tclass(struct fib_result *res) +{ + if (res->r) + return res->r->r_tclassid; + return 0; +} +#endif + + +static void fib_rules_detach(struct net_device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == dev->ifindex) { + write_lock_bh(&fib_rules_lock); + r->r_ifindex = -1; + write_unlock_bh(&fib_rules_lock); + } + } +} + +static void fib_rules_attach(struct net_device *dev) +{ + struct fib_rule *r; + + for (r=fib_rules; r; r=r->r_next) { + if (r->r_ifindex == -1 && strcmp(dev->name, r->r_ifname) == 0) { + write_lock_bh(&fib_rules_lock); + r->r_ifindex = dev->ifindex; + write_unlock_bh(&fib_rules_lock); + } + } +} + +int fib_lookup(const struct flowi *flp, struct fib_result *res) +{ + int err; + struct fib_rule *r, *policy; + struct fib_table *tb; + + u32 daddr = flp->fl4_dst; + u32 saddr = flp->fl4_src; + +FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ", + NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src)); + read_lock(&fib_rules_lock); + for (r = fib_rules; r; r=r->r_next) { + if (((saddr^r->r_src) & r->r_srcmask) || + ((daddr^r->r_dst) & r->r_dstmask) || + (r->r_tos && r->r_tos != flp->fl4_tos) || +#ifdef CONFIG_IP_ROUTE_FWMARK + (r->r_fwmark && r->r_fwmark != flp->fl4_fwmark) || +#endif + (r->r_ifindex && r->r_ifindex != flp->iif)) + continue; + +FRprintk("tb %d r %d ", r->r_table, r->r_action); + switch (r->r_action) { + case RTN_UNICAST: + policy = r; + break; + case RTN_UNREACHABLE: + read_unlock(&fib_rules_lock); + return -ENETUNREACH; + default: + case RTN_BLACKHOLE: + read_unlock(&fib_rules_lock); + return -EINVAL; + case RTN_PROHIBIT: + read_unlock(&fib_rules_lock); + return -EACCES; + } + + if ((tb = 
fib_get_table(r->r_table)) == NULL) + continue; + err = tb->tb_lookup(tb, flp, res); + if (err == 0) { + res->r = policy; + if (policy) + atomic_inc(&policy->r_clntref); + read_unlock(&fib_rules_lock); + return 0; + } + if (err < 0 && err != -EAGAIN) { + read_unlock(&fib_rules_lock); + return err; + } + } +FRprintk("FAILURE\n"); + read_unlock(&fib_rules_lock); + return -ENETUNREACH; +} + +void fib_select_default(const struct flowi *flp, struct fib_result *res) +{ + if (res->r && res->r->r_action == RTN_UNICAST && + FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { + struct fib_table *tb; + if ((tb = fib_get_table(res->r->r_table)) != NULL) + tb->tb_select_default(tb, flp, res); + } +} + +static int fib_rules_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + if (event == NETDEV_UNREGISTER) + fib_rules_detach(dev); + else if (event == NETDEV_REGISTER) + fib_rules_attach(dev); + return NOTIFY_DONE; +} + + +static struct notifier_block fib_rules_notifier = { + .notifier_call =fib_rules_event, +}; + +static __inline__ int inet_fill_rule(struct sk_buff *skb, + struct fib_rule *r, + struct netlink_callback *cb) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, NETLINK_CREDS(cb->skb)->pid, cb->nlh->nlmsg_seq, RTM_NEWRULE, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = r->r_dst_len; + rtm->rtm_src_len = r->r_src_len; + rtm->rtm_tos = r->r_tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + if (r->r_fwmark) + RTA_PUT(skb, RTA_PROTOINFO, 4, &r->r_fwmark); +#endif + rtm->rtm_table = r->r_table; + rtm->rtm_protocol = 0; + rtm->rtm_scope = 0; + rtm->rtm_type = r->r_action; + rtm->rtm_flags = r->r_flags; + + if (r->r_dst_len) + RTA_PUT(skb, RTA_DST, 4, &r->r_dst); + if (r->r_src_len) + RTA_PUT(skb, RTA_SRC, 4, &r->r_src); + if (r->r_ifname[0]) + RTA_PUT(skb, RTA_IIF, IFNAMSIZ, &r->r_ifname); + if (r->r_preference) + RTA_PUT(skb, RTA_PRIORITY, 4, &r->r_preference); + if (r->r_srcmap) + RTA_PUT(skb, RTA_GATEWAY, 4, &r->r_srcmap); +#ifdef CONFIG_NET_CLS_ROUTE + if (r->r_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &r->r_tclassid); +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx; + int s_idx = cb->args[0]; + struct fib_rule *r; + + read_lock(&fib_rules_lock); + for (r=fib_rules, idx=0; r; r = r->r_next, idx++) { + if (idx < s_idx) + continue; + if (inet_fill_rule(skb, r, cb) < 0) + break; + } + read_unlock(&fib_rules_lock); + cb->args[0] = idx; + + return skb->len; +} + +void __init fib_rules_init(void) +{ + register_netdevice_notifier(&fib_rules_notifier); +} diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c new file mode 100644 index 000000000000..029362d66135 --- /dev/null +++ b/net/ipv4/fib_semantics.c @@ -0,0 +1,1332 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IPv4 Forwarding Information Base: semantics. 
+ * + * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "fib_lookup.h" + +#define FSprintk(a...) + +static DEFINE_RWLOCK(fib_info_lock); +static struct hlist_head *fib_info_hash; +static struct hlist_head *fib_info_laddrhash; +static unsigned int fib_hash_size; +static unsigned int fib_info_cnt; + +#define DEVINDEX_HASHBITS 8 +#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS) +static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static DEFINE_SPINLOCK(fib_multipath_lock); + +#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ +for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#define change_nexthops(fi) { int nhsel; struct fib_nh * nh; \ +for (nhsel=0, nh = (struct fib_nh*)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nh++, nhsel++) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +/* Hope, that gcc will optimize it to get rid of dummy loop */ + +#define for_nexthops(fi) { int nhsel=0; const struct fib_nh * nh = (fi)->fib_nh; \ +for (nhsel=0; nhsel < 1; nhsel++) + +#define change_nexthops(fi) { int nhsel=0; struct fib_nh * nh = (struct fib_nh*)((fi)->fib_nh); \ +for (nhsel=0; nhsel < 1; nhsel++) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define endfor_nexthops(fi) } + + +static struct +{ + int error; + u8 scope; +} fib_props[RTA_MAX + 1] = { + { + .error = 0, + .scope = RT_SCOPE_NOWHERE, + }, /* RTN_UNSPEC */ + { + .error = 0, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_UNICAST */ + { + .error = 0, + .scope = RT_SCOPE_HOST, + }, /* RTN_LOCAL */ + { + .error = 0, + .scope = RT_SCOPE_LINK, + }, /* RTN_BROADCAST */ + { + .error = 0, + .scope = RT_SCOPE_LINK, + }, /* RTN_ANYCAST */ + { + .error = 0, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_MULTICAST */ + { + .error = -EINVAL, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_BLACKHOLE */ + { + .error = -EHOSTUNREACH, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_UNREACHABLE */ + { + .error = -EACCES, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_PROHIBIT */ + { + .error = -EAGAIN, + .scope = RT_SCOPE_UNIVERSE, + }, /* RTN_THROW */ + { + .error = -EINVAL, + .scope = RT_SCOPE_NOWHERE, + }, /* RTN_NAT */ + { + .error = -EINVAL, + .scope = RT_SCOPE_NOWHERE, + }, /* RTN_XRESOLVE */ +}; + + +/* Release a nexthop info record */ + +void free_fib_info(struct fib_info *fi) +{ + if (fi->fib_dead == 0) { + printk("Freeing alive fib_info %p\n", fi); + return; + } + change_nexthops(fi) { + if (nh->nh_dev) + dev_put(nh->nh_dev); + nh->nh_dev = NULL; + } endfor_nexthops(fi); + fib_info_cnt--; + kfree(fi); +} + +void fib_release_info(struct fib_info *fi) +{ + write_lock(&fib_info_lock); + if (fi && --fi->fib_treeref == 0) { + hlist_del(&fi->fib_hash); + if (fi->fib_prefsrc) + hlist_del(&fi->fib_lhash); + change_nexthops(fi) { + if (!nh->nh_dev) + continue; + hlist_del(&nh->nh_hash); + } endfor_nexthops(fi) + fi->fib_dead = 1; + fib_info_put(fi); + } + write_unlock(&fib_info_lock); +} + +static __inline__ 
int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +{ + const struct fib_nh *onh = ofi->fib_nh; + + for_nexthops(fi) { + if (nh->nh_oif != onh->nh_oif || + nh->nh_gw != onh->nh_gw || + nh->nh_scope != onh->nh_scope || +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight != onh->nh_weight || +#endif +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid != onh->nh_tclassid || +#endif + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + return -1; + onh++; + } endfor_nexthops(fi); + return 0; +} + +static inline unsigned int fib_info_hashfn(const struct fib_info *fi) +{ + unsigned int mask = (fib_hash_size - 1); + unsigned int val = fi->fib_nhs; + + val ^= fi->fib_protocol; + val ^= fi->fib_prefsrc; + val ^= fi->fib_priority; + + return (val ^ (val >> 7) ^ (val >> 12)) & mask; +} + +static struct fib_info *fib_find_info(const struct fib_info *nfi) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_info *fi; + unsigned int hash; + + hash = fib_info_hashfn(nfi); + head = &fib_info_hash[hash]; + + hlist_for_each_entry(fi, node, head, fib_hash) { + if (fi->fib_nhs != nfi->fib_nhs) + continue; + if (nfi->fib_protocol == fi->fib_protocol && + nfi->fib_prefsrc == fi->fib_prefsrc && + nfi->fib_priority == fi->fib_priority && + memcmp(nfi->fib_metrics, fi->fib_metrics, + sizeof(fi->fib_metrics)) == 0 && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) + return fi; + } + + return NULL; +} + +static inline unsigned int fib_devindex_hashfn(unsigned int val) +{ + unsigned int mask = DEVINDEX_HASHSIZE - 1; + + return (val ^ + (val >> DEVINDEX_HASHBITS) ^ + (val >> (DEVINDEX_HASHBITS * 2))) & mask; +} + +/* Check, that the gateway is already configured. + Used only by redirect accept routine. + */ + +int ip_fib_check_default(u32 gw, struct net_device *dev) +{ + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + unsigned int hash; + + read_lock(&fib_info_lock); + + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_for_each_entry(nh, node, head, nh_hash) { + if (nh->nh_dev == dev && + nh->nh_gw == gw && + !(nh->nh_flags&RTNH_F_DEAD)) { + read_unlock(&fib_info_lock); + return 0; + } + } + + read_unlock(&fib_info_lock); + + return -1; +} + +void rtmsg_fib(int event, u32 key, struct fib_alias *fa, + int z, int tb_id, + struct nlmsghdr *n, struct netlink_skb_parms *req) +{ + struct sk_buff *skb; + u32 pid = req ? req->pid : 0; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) + return; + + if (fib_dump_info(skb, pid, n->nlmsg_seq, event, tb_id, + fa->fa_type, fa->fa_scope, &key, z, + fa->fa_tos, + fa->fa_info) < 0) { + kfree_skb(skb); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV4_ROUTE; + if (n->nlmsg_flags&NLM_F_ECHO) + atomic_inc(&skb->users); + netlink_broadcast(rtnl, skb, pid, RTMGRP_IPV4_ROUTE, GFP_KERNEL); + if (n->nlmsg_flags&NLM_F_ECHO) + netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); +} + +/* Return the first fib alias matching TOS with + * priority less than or equal to PRIO. 
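+ * (In fn_hash_insert() this doubles as the insertion point: the alias
+ * list is kept ordered by falling fa_tos and, within one tos, by rising
+ * fib_priority, and a new alias is linked in just in front of the entry
+ * returned here.)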
+ */ +struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio) +{ + if (fah) { + struct fib_alias *fa; + list_for_each_entry(fa, fah, fa_list) { + if (fa->fa_tos > tos) + continue; + if (fa->fa_info->fib_priority >= prio || + fa->fa_tos < tos) + return fa; + } + } + return NULL; +} + +int fib_detect_death(struct fib_info *fi, int order, + struct fib_info **last_resort, int *last_idx, int *dflt) +{ + struct neighbour *n; + int state = NUD_NONE; + + n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state==NUD_REACHABLE) + return 0; + if ((state&NUD_VALID) && order != *dflt) + return 0; + if ((state&NUD_VALID) || + (*last_idx<0 && order > *dflt)) { + *last_resort = fi; + *last_idx = order; + } + return 1; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) +{ + while (RTA_OK(attr,attrlen)) { + if (attr->rta_type == type) + return *(u32*)RTA_DATA(attr); + attr = RTA_NEXT(attr, attrlen); + } + return 0; +} + +static int +fib_count_nexthops(struct rtattr *rta) +{ + int nhs = 0; + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + while (nhlen >= (int)sizeof(struct rtnexthop)) { + if ((nhlen -= nhp->rtnh_len) < 0) + return 0; + nhs++; + nhp = RTNH_NEXT(nhp); + }; + return nhs; +} + +static int +fib_get_nhs(struct fib_info *fi, const struct rtattr *rta, const struct rtmsg *r) +{ + struct rtnexthop *nhp = RTA_DATA(rta); + int nhlen = RTA_PAYLOAD(rta); + + change_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + nh->nh_flags = (r->rtm_flags&~0xFF) | nhp->rtnh_flags; + nh->nh_oif = nhp->rtnh_ifindex; + nh->nh_weight = nhp->rtnh_hops + 1; + if (attrlen) { + nh->nh_gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); +#ifdef CONFIG_NET_CLS_ROUTE + nh->nh_tclassid = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); +#endif + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); + return 0; +} + +#endif + +int fib_nh_match(struct rtmsg *r, struct nlmsghdr *nlh, struct kern_rta *rta, + struct fib_info *fi) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH + struct rtnexthop *nhp; + int nhlen; +#endif + + if (rta->rta_priority && + *rta->rta_priority != fi->fib_priority) + return 1; + + if (rta->rta_oif || rta->rta_gw) { + if ((!rta->rta_oif || *rta->rta_oif == fi->fib_nh->nh_oif) && + (!rta->rta_gw || memcmp(rta->rta_gw, &fi->fib_nh->nh_gw, 4) == 0)) + return 0; + return 1; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (rta->rta_mp == NULL) + return 0; + nhp = RTA_DATA(rta->rta_mp); + nhlen = RTA_PAYLOAD(rta->rta_mp); + + for_nexthops(fi) { + int attrlen = nhlen - sizeof(struct rtnexthop); + u32 gw; + + if (attrlen < 0 || (nhlen -= nhp->rtnh_len) < 0) + return -EINVAL; + if (nhp->rtnh_ifindex && nhp->rtnh_ifindex != nh->nh_oif) + return 1; + if (attrlen) { + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_GATEWAY); + if (gw && gw != nh->nh_gw) + return 1; +#ifdef CONFIG_NET_CLS_ROUTE + gw = fib_get_attr32(RTNH_DATA(nhp), attrlen, RTA_FLOW); + if (gw && gw != nh->nh_tclassid) + return 1; +#endif + } + nhp = RTNH_NEXT(nhp); + } endfor_nexthops(fi); +#endif + return 0; +} + + +/* + Picture + ------- + + Semantics of nexthop is very messy by historical reasons. + We have to take into account, that: + a) gateway can be actually local interface address, + so that gatewayed route is direct. 
+ b) gateway must be on-link address, possibly + described not by an ifaddr, but also by a direct route. + c) If both gateway and interface are specified, they should not + contradict. + d) If we use tunnel routes, gateway could be not on-link. + + Attempt to reconcile all of these (alas, self-contradictory) conditions + results in pretty ugly and hairy code with obscure logic. + + I chose to generalized it instead, so that the size + of code does not increase practically, but it becomes + much more general. + Every prefix is assigned a "scope" value: "host" is local address, + "link" is direct route, + [ ... "site" ... "interior" ... ] + and "universe" is true gateway route with global meaning. + + Every prefix refers to a set of "nexthop"s (gw, oif), + where gw must have narrower scope. This recursion stops + when gw has LOCAL scope or if "nexthop" is declared ONLINK, + which means that gw is forced to be on link. + + Code is still hairy, but now it is apparently logically + consistent and very flexible. F.e. as by-product it allows + to co-exists in peace independent exterior and interior + routing processes. + + Normally it looks as following. + + {universe prefix} -> (gw, oif) [scope link] + | + |-> {link prefix} -> (gw, oif) [scope local] + | + |-> {local prefix} (terminal node) + */ + +static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh) +{ + int err; + + if (nh->nh_gw) { + struct fib_result res; + +#ifdef CONFIG_IP_ROUTE_PERVASIVE + if (nh->nh_flags&RTNH_F_PERVASIVE) + return 0; +#endif + if (nh->nh_flags&RTNH_F_ONLINK) { + struct net_device *dev; + + if (r->rtm_scope >= RT_SCOPE_LINK) + return -EINVAL; + if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + return -EINVAL; + if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL) + return -ENODEV; + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + nh->nh_dev = dev; + dev_hold(dev); + nh->nh_scope = RT_SCOPE_LINK; + return 0; + } + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = nh->nh_gw, + .scope = r->rtm_scope + 1 } }, + .oif = nh->nh_oif }; + + /* It is not necessary, but requires a bit of thinking */ + if (fl.fl4_scope < RT_SCOPE_LINK) + fl.fl4_scope = RT_SCOPE_LINK; + if ((err = fib_lookup(&fl, &res)) != 0) + return err; + } + err = -EINVAL; + if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) + goto out; + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) + goto out; + dev_hold(nh->nh_dev); + err = -ENETDOWN; + if (!(nh->nh_dev->flags & IFF_UP)) + goto out; + err = 0; +out: + fib_res_put(&res); + return err; + } else { + struct in_device *in_dev; + + if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + return -EINVAL; + + in_dev = inetdev_by_index(nh->nh_oif); + if (in_dev == NULL) + return -ENODEV; + if (!(in_dev->dev->flags&IFF_UP)) { + in_dev_put(in_dev); + return -ENETDOWN; + } + nh->nh_dev = in_dev->dev; + dev_hold(nh->nh_dev); + nh->nh_scope = RT_SCOPE_HOST; + in_dev_put(in_dev); + } + return 0; +} + +static inline unsigned int fib_laddr_hashfn(u32 val) +{ + unsigned int mask = (fib_hash_size - 1); + + return (val ^ (val >> 7) ^ (val >> 14)) & mask; +} + +static struct hlist_head *fib_hash_alloc(int bytes) +{ + if (bytes <= PAGE_SIZE) + return kmalloc(bytes, GFP_KERNEL); + else + return (struct hlist_head *) + __get_free_pages(GFP_KERNEL, get_order(bytes)); +} + +static void fib_hash_free(struct hlist_head *hash, int bytes) +{ + if (!hash) + return; + + if (bytes <= PAGE_SIZE) + kfree(hash); + else + free_pages((unsigned long) 
hash, get_order(bytes)); +} + +static void fib_hash_move(struct hlist_head *new_info_hash, + struct hlist_head *new_laddrhash, + unsigned int new_size) +{ + unsigned int old_size = fib_hash_size; + unsigned int i; + + write_lock(&fib_info_lock); + fib_hash_size = new_size; + + for (i = 0; i < old_size; i++) { + struct hlist_head *head = &fib_info_hash[i]; + struct hlist_node *node, *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, node, n, head, fib_hash) { + struct hlist_head *dest; + unsigned int new_hash; + + hlist_del(&fi->fib_hash); + + new_hash = fib_info_hashfn(fi); + dest = &new_info_hash[new_hash]; + hlist_add_head(&fi->fib_hash, dest); + } + } + fib_info_hash = new_info_hash; + + for (i = 0; i < old_size; i++) { + struct hlist_head *lhead = &fib_info_laddrhash[i]; + struct hlist_node *node, *n; + struct fib_info *fi; + + hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) { + struct hlist_head *ldest; + unsigned int new_hash; + + hlist_del(&fi->fib_lhash); + + new_hash = fib_laddr_hashfn(fi->fib_prefsrc); + ldest = &new_laddrhash[new_hash]; + hlist_add_head(&fi->fib_lhash, ldest); + } + } + fib_info_laddrhash = new_laddrhash; + + write_unlock(&fib_info_lock); +} + +struct fib_info * +fib_create_info(const struct rtmsg *r, struct kern_rta *rta, + const struct nlmsghdr *nlh, int *errp) +{ + int err; + struct fib_info *fi = NULL; + struct fib_info *ofi; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int nhs = 1; +#else + const int nhs = 1; +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + u32 mp_alg = IP_MP_ALG_NONE; +#endif + + /* Fast check to catch the most weird cases */ + if (fib_props[r->rtm_type].scope > r->rtm_scope) + goto err_inval; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (rta->rta_mp) { + nhs = fib_count_nexthops(rta->rta_mp); + if (nhs == 0) + goto err_inval; + } +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (rta->rta_mp_alg) { + mp_alg = *rta->rta_mp_alg; + + if (mp_alg < IP_MP_ALG_NONE || + mp_alg > IP_MP_ALG_MAX) + goto err_inval; + } +#endif + + err = -ENOBUFS; + if (fib_info_cnt >= fib_hash_size) { + unsigned int new_size = fib_hash_size << 1; + struct hlist_head *new_info_hash; + struct hlist_head *new_laddrhash; + unsigned int bytes; + + if (!new_size) + new_size = 1; + bytes = new_size * sizeof(struct hlist_head *); + new_info_hash = fib_hash_alloc(bytes); + new_laddrhash = fib_hash_alloc(bytes); + if (!new_info_hash || !new_laddrhash) { + fib_hash_free(new_info_hash, bytes); + fib_hash_free(new_laddrhash, bytes); + } else { + memset(new_info_hash, 0, bytes); + memset(new_laddrhash, 0, bytes); + + fib_hash_move(new_info_hash, new_laddrhash, new_size); + } + + if (!fib_hash_size) + goto failure; + } + + fi = kmalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); + if (fi == NULL) + goto failure; + fib_info_cnt++; + memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh)); + + fi->fib_protocol = r->rtm_protocol; + + fi->fib_nhs = nhs; + change_nexthops(fi) { + nh->nh_parent = fi; + } endfor_nexthops(fi) + + fi->fib_flags = r->rtm_flags; + if (rta->rta_priority) + fi->fib_priority = *rta->rta_priority; + if (rta->rta_mx) { + int attrlen = RTA_PAYLOAD(rta->rta_mx); + struct rtattr *attr = RTA_DATA(rta->rta_mx); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > RTAX_MAX) + goto err_inval; + fi->fib_metrics[flavor-1] = *(unsigned*)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + if (rta->rta_prefsrc) + memcpy(&fi->fib_prefsrc, rta->rta_prefsrc, 4); + + if (rta->rta_mp) { +#ifdef 
CONFIG_IP_ROUTE_MULTIPATH + if ((err = fib_get_nhs(fi, rta->rta_mp, r)) != 0) + goto failure; + if (rta->rta_oif && fi->fib_nh->nh_oif != *rta->rta_oif) + goto err_inval; + if (rta->rta_gw && memcmp(&fi->fib_nh->nh_gw, rta->rta_gw, 4)) + goto err_inval; +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow && memcmp(&fi->fib_nh->nh_tclassid, rta->rta_flow, 4)) + goto err_inval; +#endif +#else + goto err_inval; +#endif + } else { + struct fib_nh *nh = fi->fib_nh; + if (rta->rta_oif) + nh->nh_oif = *rta->rta_oif; + if (rta->rta_gw) + memcpy(&nh->nh_gw, rta->rta_gw, 4); +#ifdef CONFIG_NET_CLS_ROUTE + if (rta->rta_flow) + memcpy(&nh->nh_tclassid, rta->rta_flow, 4); +#endif + nh->nh_flags = r->rtm_flags; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + nh->nh_weight = 1; +#endif + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + fi->fib_mp_alg = mp_alg; +#endif + + if (fib_props[r->rtm_type].error) { + if (rta->rta_gw || rta->rta_oif || rta->rta_mp) + goto err_inval; + goto link_it; + } + + if (r->rtm_scope > RT_SCOPE_HOST) + goto err_inval; + + if (r->rtm_scope == RT_SCOPE_HOST) { + struct fib_nh *nh = fi->fib_nh; + + /* Local address is added. */ + if (nhs != 1 || nh->nh_gw) + goto err_inval; + nh->nh_scope = RT_SCOPE_NOWHERE; + nh->nh_dev = dev_get_by_index(fi->fib_nh->nh_oif); + err = -ENODEV; + if (nh->nh_dev == NULL) + goto failure; + } else { + change_nexthops(fi) { + if ((err = fib_check_nh(r, fi, nh)) != 0) + goto failure; + } endfor_nexthops(fi) + } + + if (fi->fib_prefsrc) { + if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || + memcmp(&fi->fib_prefsrc, rta->rta_dst, 4)) + if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + goto err_inval; + } + +link_it: + if ((ofi = fib_find_info(fi)) != NULL) { + fi->fib_dead = 1; + free_fib_info(fi); + ofi->fib_treeref++; + return ofi; + } + + fi->fib_treeref++; + atomic_inc(&fi->fib_clntref); + write_lock(&fib_info_lock); + hlist_add_head(&fi->fib_hash, + &fib_info_hash[fib_info_hashfn(fi)]); + if (fi->fib_prefsrc) { + struct hlist_head *head; + + head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)]; + hlist_add_head(&fi->fib_lhash, head); + } + change_nexthops(fi) { + struct hlist_head *head; + unsigned int hash; + + if (!nh->nh_dev) + continue; + hash = fib_devindex_hashfn(nh->nh_dev->ifindex); + head = &fib_info_devhash[hash]; + hlist_add_head(&nh->nh_hash, head); + } endfor_nexthops(fi) + write_unlock(&fib_info_lock); + return fi; + +err_inval: + err = -EINVAL; + +failure: + *errp = err; + if (fi) { + fi->fib_dead = 1; + free_fib_info(fi); + } + return NULL; +} + +int fib_semantic_match(struct list_head *head, const struct flowi *flp, + struct fib_result *res, __u32 zone, __u32 mask, + int prefixlen) +{ + struct fib_alias *fa; + int nh_sel = 0; + + list_for_each_entry(fa, head, fa_list) { + int err; + + if (fa->fa_tos && + fa->fa_tos != flp->fl4_tos) + continue; + + if (fa->fa_scope < flp->fl4_scope) + continue; + + fa->fa_state |= FA_S_ACCESSED; + + err = fib_props[fa->fa_type].error; + if (err == 0) { + struct fib_info *fi = fa->fa_info; + + if (fi->fib_flags & RTNH_F_DEAD) + continue; + + switch (fa->fa_type) { + case RTN_UNICAST: + case RTN_LOCAL: + case RTN_BROADCAST: + case RTN_ANYCAST: + case RTN_MULTICAST: + for_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + continue; + if (!flp->oif || flp->oif == nh->nh_oif) + break; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (nhsel < fi->fib_nhs) { + nh_sel = nhsel; + goto out_fill_res; + } +#else + if (nhsel < 1) { + goto out_fill_res; + } +#endif + endfor_nexthops(fi); + continue; + + default: + 
printk(KERN_DEBUG "impossible 102\n"); + return -EINVAL; + }; + } + return err; + } + return 1; + +out_fill_res: + res->prefixlen = prefixlen; + res->nh_sel = nh_sel; + res->type = fa->fa_type; + res->scope = fa->fa_scope; + res->fi = fa->fa_info; +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + res->netmask = mask; + res->network = zone & + (0xFFFFFFFF >> (32 - prefixlen)); +#endif + atomic_inc(&res->fi->fib_clntref); + return 0; +} + +/* Find appropriate source address to this destination */ + +u32 __fib_res_prefsrc(struct fib_result *res) +{ + return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope); +} + +int +fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, + struct fib_info *fi) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET; + rtm->rtm_dst_len = dst_len; + rtm->rtm_src_len = 0; + rtm->rtm_tos = tos; + rtm->rtm_table = tb_id; + rtm->rtm_type = type; + rtm->rtm_flags = fi->fib_flags; + rtm->rtm_scope = scope; + if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 4, dst); + rtm->rtm_protocol = fi->fib_protocol; + if (fi->fib_priority) + RTA_PUT(skb, RTA_PRIORITY, 4, &fi->fib_priority); +#ifdef CONFIG_NET_CLS_ROUTE + if (fi->fib_nh[0].nh_tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &fi->fib_nh[0].nh_tclassid); +#endif + if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0) + goto rtattr_failure; + if (fi->fib_prefsrc) + RTA_PUT(skb, RTA_PREFSRC, 4, &fi->fib_prefsrc); + if (fi->fib_nhs == 1) { + if (fi->fib_nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &fi->fib_nh->nh_gw); + if (fi->fib_nh->nh_oif) + RTA_PUT(skb, RTA_OIF, sizeof(int), &fi->fib_nh->nh_oif); + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (fi->fib_nhs > 1) { + struct rtnexthop *nhp; + struct rtattr *mp_head; + if (skb_tailroom(skb) <= RTA_SPACE(0)) + goto rtattr_failure; + mp_head = (struct rtattr*)skb_put(skb, RTA_SPACE(0)); + + for_nexthops(fi) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = nh->nh_flags & 0xFF; + nhp->rtnh_hops = nh->nh_weight-1; + nhp->rtnh_ifindex = nh->nh_oif; + if (nh->nh_gw) + RTA_PUT(skb, RTA_GATEWAY, 4, &nh->nh_gw); + nhp->rtnh_len = skb->tail - (unsigned char*)nhp; + } endfor_nexthops(fi); + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifndef CONFIG_IP_NOSIOCRT + +int +fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, + struct kern_rta *rta, struct rtentry *r) +{ + int plen; + u32 *ptr; + + memset(rtm, 0, sizeof(*rtm)); + memset(rta, 0, sizeof(*rta)); + + if (r->rt_dst.sa_family != AF_INET) + return -EAFNOSUPPORT; + + /* Check mask for validity: + a) it must be contiguous. + b) destination must have all host bits clear. + c) if application forgot to set correct family (AF_INET), + reject request unless it is absolutely clear i.e. + both family and mask are zero. 
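The two structural checks named above come down to a little bit-twiddling. A rough sketch in plain C, using illustrative helper names rather than the kernel's own bad_mask()/inet_mask_len():

#include <arpa/inet.h>	// ntohl()
#include <stdint.h>

// A netmask is contiguous iff its host-order complement looks like
// 000...0111...1, i.e. (~m & (~m + 1)) == 0.
static int mask_is_contiguous(uint32_t mask_be)
{
	uint32_t inv = ~ntohl(mask_be);
	return (inv & (inv + 1)) == 0;
}

// The destination has all host bits clear iff dst & ~mask == 0
// (byte order does not matter for a pure bitwise test).
static int host_bits_clear(uint32_t dst_be, uint32_t mask_be)
{
	return (dst_be & ~mask_be) == 0;
}

// Prefix length of a contiguous mask: count leading one bits,
// e.g. 255.255.255.0 -> 24.
static int mask_prefix_len(uint32_t mask_be)
{
	uint32_t m = ntohl(mask_be);
	int len = 0;

	while (m & 0x80000000u) {
		len++;
		m <<= 1;
	}
	return len;
}

A host route (RTF_HOST) skips the mask handling entirely and keeps plen = 32, which is exactly what the code below does.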
+ */ + plen = 32; + ptr = &((struct sockaddr_in*)&r->rt_dst)->sin_addr.s_addr; + if (!(r->rt_flags&RTF_HOST)) { + u32 mask = ((struct sockaddr_in*)&r->rt_genmask)->sin_addr.s_addr; + if (r->rt_genmask.sa_family != AF_INET) { + if (mask || r->rt_genmask.sa_family) + return -EAFNOSUPPORT; + } + if (bad_mask(mask, *ptr)) + return -EINVAL; + plen = inet_mask_len(mask); + } + + nl->nlmsg_flags = NLM_F_REQUEST; + nl->nlmsg_pid = 0; + nl->nlmsg_seq = 0; + nl->nlmsg_len = NLMSG_LENGTH(sizeof(*rtm)); + if (cmd == SIOCDELRT) { + nl->nlmsg_type = RTM_DELROUTE; + nl->nlmsg_flags = 0; + } else { + nl->nlmsg_type = RTM_NEWROUTE; + nl->nlmsg_flags = NLM_F_REQUEST|NLM_F_CREATE; + rtm->rtm_protocol = RTPROT_BOOT; + } + + rtm->rtm_dst_len = plen; + rta->rta_dst = ptr; + + if (r->rt_metric) { + *(u32*)&r->rt_pad3 = r->rt_metric - 1; + rta->rta_priority = (u32*)&r->rt_pad3; + } + if (r->rt_flags&RTF_REJECT) { + rtm->rtm_scope = RT_SCOPE_HOST; + rtm->rtm_type = RTN_UNREACHABLE; + return 0; + } + rtm->rtm_scope = RT_SCOPE_NOWHERE; + rtm->rtm_type = RTN_UNICAST; + + if (r->rt_dev) { + char *colon; + struct net_device *dev; + char devname[IFNAMSIZ]; + + if (copy_from_user(devname, r->rt_dev, IFNAMSIZ-1)) + return -EFAULT; + devname[IFNAMSIZ-1] = 0; + colon = strchr(devname, ':'); + if (colon) + *colon = 0; + dev = __dev_get_by_name(devname); + if (!dev) + return -ENODEV; + rta->rta_oif = &dev->ifindex; + if (colon) { + struct in_ifaddr *ifa; + struct in_device *in_dev = __in_dev_get(dev); + if (!in_dev) + return -ENODEV; + *colon = ':'; + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) + if (strcmp(ifa->ifa_label, devname) == 0) + break; + if (ifa == NULL) + return -ENODEV; + rta->rta_prefsrc = &ifa->ifa_local; + } + } + + ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; + if (r->rt_gateway.sa_family == AF_INET && *ptr) { + rta->rta_gw = ptr; + if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST) + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + } + + if (cmd == SIOCDELRT) + return 0; + + if (r->rt_flags&RTF_GATEWAY && rta->rta_gw == NULL) + return -EINVAL; + + if (rtm->rtm_scope == RT_SCOPE_NOWHERE) + rtm->rtm_scope = RT_SCOPE_LINK; + + if (r->rt_flags&(RTF_MTU|RTF_WINDOW|RTF_IRTT)) { + struct rtattr *rec; + struct rtattr *mx = kmalloc(RTA_LENGTH(3*RTA_LENGTH(4)), GFP_KERNEL); + if (mx == NULL) + return -ENOMEM; + rta->rta_mx = mx; + mx->rta_type = RTA_METRICS; + mx->rta_len = RTA_LENGTH(0); + if (r->rt_flags&RTF_MTU) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_ADVMSS; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_mtu - 40; + } + if (r->rt_flags&RTF_WINDOW) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_WINDOW; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_window; + } + if (r->rt_flags&RTF_IRTT) { + rec = (void*)((char*)mx + RTA_ALIGN(mx->rta_len)); + rec->rta_type = RTAX_RTT; + rec->rta_len = RTA_LENGTH(4); + mx->rta_len += RTA_LENGTH(4); + *(u32*)RTA_DATA(rec) = r->rt_irtt<<3; + } + } + return 0; +} + +#endif + +/* + Update FIB if: + - local address disappeared -> we must delete all the entries + referring to it. + - device went down -> we must shutdown all nexthops going via it. 
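fib_sync_down() below handles both cases by flagging entries RTNH_F_DEAD rather than freeing them. For the device case, the per-route rule it applies can be sketched as follows; nh_dead[] and n_nexthops are hypothetical stand-ins for the per-nexthop RTNH_F_DEAD flags and fi->fib_nhs:

// Illustrative only: a multipath route stays usable while at least one
// nexthop is alive; the whole route is flagged dead only when every
// nexthop is dead.
static int route_is_dead(const int nh_dead[], int n_nexthops)
{
	int dead = 0, i;

	for (i = 0; i < n_nexthops; i++)
		if (nh_dead[i])
			dead++;
	return dead == n_nexthops;
}

The force > 1 case below short-circuits this by treating every nexthop on the vanishing device as dead at once.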
+ */ + +int fib_sync_down(u32 local, struct net_device *dev, int force) +{ + int ret = 0; + int scope = RT_SCOPE_NOWHERE; + + if (force) + scope = -1; + + if (local && fib_info_laddrhash) { + unsigned int hash = fib_laddr_hashfn(local); + struct hlist_head *head = &fib_info_laddrhash[hash]; + struct hlist_node *node; + struct fib_info *fi; + + hlist_for_each_entry(fi, node, head, fib_lhash) { + if (fi->fib_prefsrc == local) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } + + if (dev) { + struct fib_info *prev_fi = NULL; + unsigned int hash = fib_devindex_hashfn(dev->ifindex); + struct hlist_head *head = &fib_info_devhash[hash]; + struct hlist_node *node; + struct fib_nh *nh; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + int dead; + + BUG_ON(!fi->fib_nhs); + if (nh->nh_dev != dev || fi == prev_fi) + continue; + prev_fi = fi; + dead = 0; + change_nexthops(fi) { + if (nh->nh_flags&RTNH_F_DEAD) + dead++; + else if (nh->nh_dev == dev && + nh->nh_scope != scope) { + nh->nh_flags |= RTNH_F_DEAD; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + spin_lock_bh(&fib_multipath_lock); + fi->fib_power -= nh->nh_power; + nh->nh_power = 0; + spin_unlock_bh(&fib_multipath_lock); +#endif + dead++; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (force > 1 && nh->nh_dev == dev) { + dead = fi->fib_nhs; + break; + } +#endif + } endfor_nexthops(fi) + if (dead == fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + ret++; + } + } + } + + return ret; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +/* + Dead device goes up. We wake up dead nexthops. + It takes sense only on multipath routes. + */ + +int fib_sync_up(struct net_device *dev) +{ + struct fib_info *prev_fi; + unsigned int hash; + struct hlist_head *head; + struct hlist_node *node; + struct fib_nh *nh; + int ret; + + if (!(dev->flags&IFF_UP)) + return 0; + + prev_fi = NULL; + hash = fib_devindex_hashfn(dev->ifindex); + head = &fib_info_devhash[hash]; + ret = 0; + + hlist_for_each_entry(nh, node, head, nh_hash) { + struct fib_info *fi = nh->nh_parent; + int alive; + + BUG_ON(!fi->fib_nhs); + if (nh->nh_dev != dev || fi == prev_fi) + continue; + + prev_fi = fi; + alive = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + alive++; + continue; + } + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + continue; + if (nh->nh_dev != dev || __in_dev_get(dev) == NULL) + continue; + alive++; + spin_lock_bh(&fib_multipath_lock); + nh->nh_power = 0; + nh->nh_flags &= ~RTNH_F_DEAD; + spin_unlock_bh(&fib_multipath_lock); + } endfor_nexthops(fi) + + if (alive > 0) { + fi->fib_flags &= ~RTNH_F_DEAD; + ret++; + } + } + + return ret; +} + +/* + The algorithm is suboptimal, but it provides really + fair weighted route distribution. + */ + +void fib_select_multipath(const struct flowi *flp, struct fib_result *res) +{ + struct fib_info *fi = res->fi; + int w; + + spin_lock_bh(&fib_multipath_lock); + if (fi->fib_power <= 0) { + int power = 0; + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + power += nh->nh_weight; + nh->nh_power = nh->nh_weight; + } + } endfor_nexthops(fi); + fi->fib_power = power; + if (power <= 0) { + spin_unlock_bh(&fib_multipath_lock); + /* Race condition: route has just become dead. */ + res->nh_sel = 0; + return; + } + } + + + /* w should be random number [0..fi->fib_power-1], + it is pretty bad approximation. 
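Put differently, fib_power is a pool of per-nexthop tokens: each nexthop is refilled to nh_weight when the pool runs dry, w picks a token from the pool, and every selection consumes one token. A stand-alone sketch of the same scheme, with illustrative names and rand() standing in for the jiffies approximation used below:

#include <stdlib.h>

#define NHS 3

static const int weight[NHS] = { 3, 2, 1 };	// nh_weight per nexthop
static int power[NHS];				// nh_power: remaining tokens
static int total;				// fib_power: sum of power[]

static int select_nexthop(void)
{
	int w, i;

	if (total <= 0) {			// pool empty: refill everyone
		total = 0;
		for (i = 0; i < NHS; i++) {
			power[i] = weight[i];
			total += weight[i];
		}
	}
	w = rand() % total;			// kernel: w = jiffies % fi->fib_power
	for (i = 0; i < NHS; i++) {
		if (power[i] && (w -= power[i]) <= 0) {
			power[i]--;		// consume one token from this nexthop
			total--;		// and one from the shared pool
			return i;
		}
	}
	return 0;				// not reached while total == sum(power[])
}

Between two refills this hands nexthop i exactly weight[i] selections, which is the "really fair weighted route distribution" promised above; the only approximation is drawing w from jiffies instead of a real random source.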
+ */ + + w = jiffies % fi->fib_power; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if ((w -= nh->nh_power) <= 0) { + nh->nh_power--; + fi->fib_power--; + res->nh_sel = nhsel; + spin_unlock_bh(&fib_multipath_lock); + return; + } + } + } endfor_nexthops(fi); + + /* Race condition: route has just become dead. */ + res->nh_sel = 0; + spin_unlock_bh(&fib_multipath_lock); +} +#endif diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c new file mode 100644 index 000000000000..85bf0d3e294b --- /dev/null +++ b/net/ipv4/icmp.c @@ -0,0 +1,1143 @@ +/* + * NET3: Implementation of the ICMP protocol layer. + * + * Alan Cox, + * + * Version: $Id: icmp.c,v 1.85 2002/02/01 22:01:03 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Some of the function names and the icmp unreach table for this + * module were derived from [icmp.c 1.0.11 06/02/93] by + * Ross Biro, Fred N. van Kempen, Mark Evans, Alan Cox, Gerhard Koerting. + * Other than that this module is a complete rewrite. + * + * Fixes: + * Clemens Fruhwirth : introduce global icmp rate limiting + * with icmp type masking ability instead + * of broken per type icmp timeouts. + * Mike Shaver : RFC1122 checks. + * Alan Cox : Multicast ping reply as self. + * Alan Cox : Fix atomicity lockup in ip_build_xmit + * call. + * Alan Cox : Added 216,128 byte paths to the MTU + * code. + * Martin Mares : RFC1812 checks. + * Martin Mares : Can be configured to follow redirects + * if acting as a router _without_ a + * routing protocol (RFC 1812). + * Martin Mares : Echo requests may be configured to + * be ignored (RFC 1812). + * Martin Mares : Limitation of ICMP error message + * transmit rate (RFC 1812). + * Martin Mares : TOS and Precedence set correctly + * (RFC 1812). + * Martin Mares : Now copying as much data from the + * original packet as we can without + * exceeding 576 bytes (RFC 1812). + * Willy Konynenberg : Transparent proxying support. + * Keith Owens : RFC1191 correction for 4.2BSD based + * path MTU bug. + * Thomas Quinot : ICMP Dest Unreach codes up to 15 are + * valid (RFC 1812). + * Andi Kleen : Check all packet lengths properly + * and moved all kfree_skb() up to + * icmp_rcv. + * Andi Kleen : Move the rate limit bookkeeping + * into the dest entry and use a token + * bucket filter (thanks to ANK). Make + * the rates sysctl configurable. + * Yu Tianli : Fixed two ugly bugs in icmp_send + * - IP option length was accounted wrongly + * - ICMP header length was not accounted + * at all. + * Tristan Greaves : Added sysctl option to ignore bogus + * broadcast responses from broken routers. + * + * To Fix: + * + * - Should use skb_pull() instead of all the manual checking. + * This would also greatly simply some upper layer error handlers. 
--AK + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Build xmit assembly blocks + */ + +struct icmp_bxm { + struct sk_buff *skb; + int offset; + int data_len; + + struct { + struct icmphdr icmph; + __u32 times[3]; + } data; + int head_len; + struct ip_options replyopts; + unsigned char optbuf[40]; +}; + +/* + * Statistics + */ +DEFINE_SNMP_STAT(struct icmp_mib, icmp_statistics); + +/* An array of errno for error messages from dest unreach. */ +/* RFC 1122: 3.2.2.1 States that NET_UNREACH, HOST_UNREACH and SR_FAILED MUST be considered 'transient errs'. */ + +struct icmp_err icmp_err_convert[] = { + { + .errno = ENETUNREACH, /* ICMP_NET_UNREACH */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_UNREACH */ + .fatal = 0, + }, + { + .errno = ENOPROTOOPT /* ICMP_PROT_UNREACH */, + .fatal = 1, + }, + { + .errno = ECONNREFUSED, /* ICMP_PORT_UNREACH */ + .fatal = 1, + }, + { + .errno = EMSGSIZE, /* ICMP_FRAG_NEEDED */ + .fatal = 0, + }, + { + .errno = EOPNOTSUPP, /* ICMP_SR_FAILED */ + .fatal = 0, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_UNKNOWN */ + .fatal = 1, + }, + { + .errno = EHOSTDOWN, /* ICMP_HOST_UNKNOWN */ + .fatal = 1, + }, + { + .errno = ENONET, /* ICMP_HOST_ISOLATED */ + .fatal = 1, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_ANO */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_ANO */ + .fatal = 1, + }, + { + .errno = ENETUNREACH, /* ICMP_NET_UNR_TOS */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_HOST_UNR_TOS */ + .fatal = 0, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PKT_FILTERED */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PREC_VIOLATION */ + .fatal = 1, + }, + { + .errno = EHOSTUNREACH, /* ICMP_PREC_CUTOFF */ + .fatal = 1, + }, +}; + +/* Control parameters for ECHO replies. */ +int sysctl_icmp_echo_ignore_all; +int sysctl_icmp_echo_ignore_broadcasts; + +/* Control parameter - ignore bogus broadcast responses? */ +int sysctl_icmp_ignore_bogus_error_responses; + +/* + * Configurable global rate limit. + * + * ratelimit defines tokens/packet consumed for dst->rate_token bucket + * ratemask defines which icmp types are ratelimited by setting + * it's bit position. + * + * default: + * dest unreachable (3), source quench (4), + * time exceeded (11), parameter problem (12) + */ + +int sysctl_icmp_ratelimit = 1 * HZ; +int sysctl_icmp_ratemask = 0x1818; + +/* + * ICMP control array. This specifies what to do with each ICMP. + */ + +struct icmp_control { + int output_entry; /* Field for increment on output */ + int input_entry; /* Field for increment on input */ + void (*handler)(struct sk_buff *skb); + short error; /* This ICMP is classed as an error message */ +}; + +static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1]; + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. 
+ */ +static DEFINE_PER_CPU(struct socket *, __icmp_socket) = NULL; +#define icmp_socket __get_cpu_var(__icmp_socket) + +static __inline__ int icmp_xmit_lock(void) +{ + local_bh_disable(); + + if (unlikely(!spin_trylock(&icmp_socket->sk->sk_lock.slock))) { + /* This can happen if the output path signals a + * dst_link_failure() for an outgoing ICMP packet. + */ + local_bh_enable(); + return 1; + } + return 0; +} + +static void icmp_xmit_unlock(void) +{ + spin_unlock_bh(&icmp_socket->sk->sk_lock.slock); +} + +/* + * Send an ICMP frame. + */ + +/* + * Check transmit rate limitation for given message. + * The rate information is held in the destination cache now. + * This function is generic and could be used for other purposes + * too. It uses a Token bucket filter as suggested by Alexey Kuznetsov. + * + * Note that the same dst_entry fields are modified by functions in + * route.c too, but these work for packet destinations while xrlim_allow + * works for icmp destinations. This means the rate limiting information + * for one "ip object" is shared - and these ICMPs are twice limited: + * by source and by destination. + * + * RFC 1812: 4.3.2.8 SHOULD be able to limit error message rate + * SHOULD allow setting of rate limits + * + * Shared between ICMPv4 and ICMPv6. + */ +#define XRLIM_BURST_FACTOR 6 +int xrlim_allow(struct dst_entry *dst, int timeout) +{ + unsigned long now; + int rc = 0; + + now = jiffies; + dst->rate_tokens += now - dst->rate_last; + dst->rate_last = now; + if (dst->rate_tokens > XRLIM_BURST_FACTOR * timeout) + dst->rate_tokens = XRLIM_BURST_FACTOR * timeout; + if (dst->rate_tokens >= timeout) { + dst->rate_tokens -= timeout; + rc = 1; + } + return rc; +} + +static inline int icmpv4_xrlim_allow(struct rtable *rt, int type, int code) +{ + struct dst_entry *dst = &rt->u.dst; + int rc = 1; + + if (type > NR_ICMP_TYPES) + goto out; + + /* Don't limit PMTU discovery. */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) + goto out; + + /* No rate limit on loopback */ + if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) + goto out; + + /* Limit if icmp type is enabled in ratemask. */ + if ((1 << type) & sysctl_icmp_ratemask) + rc = xrlim_allow(dst, sysctl_icmp_ratelimit); +out: + return rc; +} + +/* + * Maintain the counters used in the SNMP statistics for outgoing ICMP + */ +static void icmp_out_count(int type) +{ + if (type <= NR_ICMP_TYPES) { + ICMP_INC_STATS(icmp_pointers[type].output_entry); + ICMP_INC_STATS(ICMP_MIB_OUTMSGS); + } +} + +/* + * Checksum each fragment, and on the first include the headers and final + * checksum. 
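This works piecemeal because the ICMP checksum is a 16-bit one's-complement sum: partial sums over individual fragments can be accumulated and folded once at the end, which is essentially what csum_block_add() and csum_fold() do (plus handling of odd offsets that the simplified sketch below ignores):

#include <stddef.h>
#include <stdint.h>

// Accumulate a buffer into a 32-bit running sum of 16-bit big-endian words.
static uint32_t csum_accumulate(const uint8_t *data, size_t len, uint32_t sum)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += ((uint32_t)data[i] << 8) | data[i + 1];
	if (len & 1)
		sum += (uint32_t)data[len - 1] << 8;	// pad the trailing odd byte
	return sum;
}

// Fold the carries back in and complement: the value that goes into
// icmph->checksum.
static uint16_t csum_fold16(uint32_t sum)
{
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

Feeding the fragments through csum_accumulate() one after another gives the same folded result as summing the whole datagram in one pass, provided each fragment starts on a 16-bit boundary; that associativity is what lets icmp_push_reply() below add the per-skb sums together and checksum the ICMP header last.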
+ */ +static int icmp_glue_bits(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct icmp_bxm *icmp_param = (struct icmp_bxm *)from; + unsigned int csum; + + csum = skb_copy_and_csum_bits(icmp_param->skb, + icmp_param->offset + offset, + to, len, 0); + + skb->csum = csum_block_add(skb->csum, csum, odd); + if (icmp_pointers[icmp_param->data.icmph.type].error) + nf_ct_attach(skb, icmp_param->skb); + return 0; +} + +static void icmp_push_reply(struct icmp_bxm *icmp_param, + struct ipcm_cookie *ipc, struct rtable *rt) +{ + struct sk_buff *skb; + + ip_append_data(icmp_socket->sk, icmp_glue_bits, icmp_param, + icmp_param->data_len+icmp_param->head_len, + icmp_param->head_len, + ipc, rt, MSG_DONTWAIT); + + if ((skb = skb_peek(&icmp_socket->sk->sk_write_queue)) != NULL) { + struct icmphdr *icmph = skb->h.icmph; + unsigned int csum = 0; + struct sk_buff *skb1; + + skb_queue_walk(&icmp_socket->sk->sk_write_queue, skb1) { + csum = csum_add(csum, skb1->csum); + } + csum = csum_partial_copy_nocheck((void *)&icmp_param->data, + (char *)icmph, + icmp_param->head_len, csum); + icmph->checksum = csum_fold(csum); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(icmp_socket->sk); + } +} + +/* + * Driving logic for building and sending ICMP messages. + */ + +static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) +{ + struct sock *sk = icmp_socket->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct rtable *rt = (struct rtable *)skb->dst; + u32 daddr; + + if (ip_options_echo(&icmp_param->replyopts, skb)) + goto out; + + if (icmp_xmit_lock()) + return; + + icmp_param->data.icmph.checksum = 0; + icmp_out_count(icmp_param->data.icmph.type); + + inet->tos = skb->nh.iph->tos; + daddr = ipc.addr = rt->rt_src; + ipc.opt = NULL; + if (icmp_param->replyopts.optlen) { + ipc.opt = &icmp_param->replyopts; + if (ipc.opt->srr) + daddr = icmp_param->replyopts.faddr; + } + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = rt->rt_spec_dst, + .tos = RT_TOS(skb->nh.iph->tos) } }, + .proto = IPPROTO_ICMP }; + if (ip_route_output_key(&rt, &fl)) + goto out_unlock; + } + if (icmpv4_xrlim_allow(rt, icmp_param->data.icmph.type, + icmp_param->data.icmph.code)) + icmp_push_reply(icmp_param, &ipc, rt); + ip_rt_put(rt); +out_unlock: + icmp_xmit_unlock(); +out:; +} + + +/* + * Send an ICMP message in response to a situation + * + * RFC 1122: 3.2.2 MUST send at least the IP header and 8 bytes of header. + * MAY send more (we do). + * MUST NOT change this header information. + * MUST NOT reply to a multicast/broadcast IP address. + * MUST NOT reply to a multicast/broadcast MAC address. + * MUST reply to only the first fragment. + */ + +void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info) +{ + struct iphdr *iph; + int room; + struct icmp_bxm icmp_param; + struct rtable *rt = (struct rtable *)skb_in->dst; + struct ipcm_cookie ipc; + u32 saddr; + u8 tos; + + if (!rt) + goto out; + + /* + * Find the original header. It is expected to be valid, of course. + * Check this, icmp_send is called from the most obscure devices + * sometimes. + */ + iph = skb_in->nh.iph; + + if ((u8 *)iph < skb_in->head || (u8 *)(iph + 1) > skb_in->tail) + goto out; + + /* + * No replies to physical multicast/broadcast + */ + if (skb_in->pkt_type != PACKET_HOST) + goto out; + + /* + * Now check at the protocol level + */ + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) + goto out; + + /* + * Only reply to fragment 0. 
We byte re-order the constant + * mask for efficiency. + */ + if (iph->frag_off & htons(IP_OFFSET)) + goto out; + + /* + * If we send an ICMP error to an ICMP error a mess would result.. + */ + if (icmp_pointers[type].error) { + /* + * We are an error, check if we are replying to an + * ICMP error + */ + if (iph->protocol == IPPROTO_ICMP) { + u8 _inner_type, *itp; + + itp = skb_header_pointer(skb_in, + skb_in->nh.raw + + (iph->ihl << 2) + + offsetof(struct icmphdr, + type) - + skb_in->data, + sizeof(_inner_type), + &_inner_type); + if (itp == NULL) + goto out; + + /* + * Assume any unknown ICMP type is an error. This + * isn't specified by the RFC, but think about it.. + */ + if (*itp > NR_ICMP_TYPES || + icmp_pointers[*itp].error) + goto out; + } + } + + if (icmp_xmit_lock()) + return; + + /* + * Construct source address and options. + */ + + saddr = iph->daddr; + if (!(rt->rt_flags & RTCF_LOCAL)) + saddr = 0; + + tos = icmp_pointers[type].error ? ((iph->tos & IPTOS_TOS_MASK) | + IPTOS_PREC_INTERNETCONTROL) : + iph->tos; + + if (ip_options_echo(&icmp_param.replyopts, skb_in)) + goto ende; + + + /* + * Prepare data for ICMP header. + */ + + icmp_param.data.icmph.type = type; + icmp_param.data.icmph.code = code; + icmp_param.data.icmph.un.gateway = info; + icmp_param.data.icmph.checksum = 0; + icmp_param.skb = skb_in; + icmp_param.offset = skb_in->nh.raw - skb_in->data; + icmp_out_count(icmp_param.data.icmph.type); + inet_sk(icmp_socket->sk)->tos = tos; + ipc.addr = iph->saddr; + ipc.opt = &icmp_param.replyopts; + + { + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = icmp_param.replyopts.srr ? + icmp_param.replyopts.faddr : + iph->saddr, + .saddr = saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_ICMP, + .uli_u = { + .icmpt = { + .type = type, + .code = code + } + } + }; + if (ip_route_output_key(&rt, &fl)) + goto out_unlock; + } + + if (!icmpv4_xrlim_allow(rt, type, code)) + goto ende; + + /* RFC says return as much as we can without exceeding 576 bytes. */ + + room = dst_mtu(&rt->u.dst); + if (room > 576) + room = 576; + room -= sizeof(struct iphdr) + icmp_param.replyopts.optlen; + room -= sizeof(struct icmphdr); + + icmp_param.data_len = skb_in->len - icmp_param.offset; + if (icmp_param.data_len > room) + icmp_param.data_len = room; + icmp_param.head_len = sizeof(struct icmphdr); + + icmp_push_reply(&icmp_param, &ipc, rt); +ende: + ip_rt_put(rt); +out_unlock: + icmp_xmit_unlock(); +out:; +} + + +/* + * Handle ICMP_DEST_UNREACH, ICMP_TIME_EXCEED, and ICMP_QUENCH. + */ + +static void icmp_unreach(struct sk_buff *skb) +{ + struct iphdr *iph; + struct icmphdr *icmph; + int hash, protocol; + struct net_protocol *ipprot; + struct sock *raw_sk; + u32 info = 0; + + /* + * Incomplete header ? + * Only checks for the IP header, there should be an + * additional check for longer headers in upper levels. + */ + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out_err; + + icmph = skb->h.icmph; + iph = (struct iphdr *)skb->data; + + if (iph->ihl < 5) /* Mangled header, drop. 
*/ + goto out_err; + + if (icmph->type == ICMP_DEST_UNREACH) { + switch (icmph->code & 15) { + case ICMP_NET_UNREACH: + case ICMP_HOST_UNREACH: + case ICMP_PROT_UNREACH: + case ICMP_PORT_UNREACH: + break; + case ICMP_FRAG_NEEDED: + if (ipv4_config.no_pmtu_disc) { + LIMIT_NETDEBUG( + printk(KERN_INFO "ICMP: %u.%u.%u.%u: " + "fragmentation needed " + "and DF set.\n", + NIPQUAD(iph->daddr))); + } else { + info = ip_rt_frag_needed(iph, + ntohs(icmph->un.frag.mtu)); + if (!info) + goto out; + } + break; + case ICMP_SR_FAILED: + LIMIT_NETDEBUG( + printk(KERN_INFO "ICMP: %u.%u.%u.%u: Source " + "Route Failed.\n", + NIPQUAD(iph->daddr))); + break; + default: + break; + } + if (icmph->code > NR_ICMP_UNREACH) + goto out; + } else if (icmph->type == ICMP_PARAMETERPROB) + info = ntohl(icmph->un.gateway) >> 24; + + /* + * Throw it at our lower layers + * + * RFC 1122: 3.2.2 MUST extract the protocol ID from the passed + * header. + * RFC 1122: 3.2.2.1 MUST pass ICMP unreach messages to the + * transport layer. + * RFC 1122: 3.2.2.2 MUST pass ICMP time expired messages to + * transport layer. + */ + + /* + * Check the other end isnt violating RFC 1122. Some routers send + * bogus responses to broadcast frames. If you see this message + * first check your netmask matches at both ends, if it does then + * get the other vendor to fix their kit. + */ + + if (!sysctl_icmp_ignore_bogus_error_responses && + inet_addr_type(iph->daddr) == RTN_BROADCAST) { + if (net_ratelimit()) + printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " + "type %u, code %u " + "error to a broadcast: %u.%u.%u.%u on %s\n", + NIPQUAD(skb->nh.iph->saddr), + icmph->type, icmph->code, + NIPQUAD(iph->daddr), + skb->dev->name); + goto out; + } + + /* Checkin full IP header plus 8 bytes of protocol to + * avoid additional coding at protocol handlers. + */ + if (!pskb_may_pull(skb, iph->ihl * 4 + 8)) + goto out; + + iph = (struct iphdr *)skb->data; + protocol = iph->protocol; + + /* + * Deliver ICMP message to raw sockets. Pretty useless feature? + */ + + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + hash = protocol & (MAX_INET_PROTOS - 1); + read_lock(&raw_v4_lock); + if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) { + while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, + iph->saddr, + skb->dev->ifindex)) != NULL) { + raw_err(raw_sk, skb, info); + raw_sk = sk_next(raw_sk); + iph = (struct iphdr *)skb->data; + } + } + read_unlock(&raw_v4_lock); + + rcu_read_lock(); + ipprot = rcu_dereference(inet_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, info); + rcu_read_unlock(); + +out: + return; +out_err: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; +} + + +/* + * Handle ICMP_REDIRECT. + */ + +static void icmp_redirect(struct sk_buff *skb) +{ + struct iphdr *iph; + unsigned long ip; + + if (skb->len < sizeof(struct iphdr)) + goto out_err; + + /* + * Get the copied header of the packet that caused the redirect + */ + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + iph = (struct iphdr *)skb->data; + ip = iph->daddr; + + switch (skb->h.icmph->code & 7) { + case ICMP_REDIR_NET: + case ICMP_REDIR_NETTOS: + /* + * As per RFC recommendations now handle it as a host redirect. 
+ */ + case ICMP_REDIR_HOST: + case ICMP_REDIR_HOSTTOS: + ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, + iph->saddr, iph->tos, skb->dev); + break; + } +out: + return; +out_err: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; +} + +/* + * Handle ICMP_ECHO ("ping") requests. + * + * RFC 1122: 3.2.2.6 MUST have an echo server that answers ICMP echo + * requests. + * RFC 1122: 3.2.2.6 Data received in the ICMP_ECHO request MUST be + * included in the reply. + * RFC 1812: 4.3.3.6 SHOULD have a config option for silently ignoring + * echo requests, MUST have default=NOT. + * See also WRT handling of options once they are done and working. + */ + +static void icmp_echo(struct sk_buff *skb) +{ + if (!sysctl_icmp_echo_ignore_all) { + struct icmp_bxm icmp_param; + + icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph.type = ICMP_ECHOREPLY; + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = skb->len; + icmp_param.head_len = sizeof(struct icmphdr); + icmp_reply(&icmp_param, skb); + } +} + +/* + * Handle ICMP Timestamp requests. + * RFC 1122: 3.2.2.8 MAY implement ICMP timestamp requests. + * SHOULD be in the kernel for minimum random latency. + * MUST be accurate to a few minutes. + * MUST be updated at least at 15Hz. + */ +static void icmp_timestamp(struct sk_buff *skb) +{ + struct timeval tv; + struct icmp_bxm icmp_param; + /* + * Too short. + */ + if (skb->len < 4) + goto out_err; + + /* + * Fill in the current time as ms since midnight UT: + */ + do_gettimeofday(&tv); + icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * 1000 + + tv.tv_usec / 1000); + icmp_param.data.times[2] = icmp_param.data.times[1]; + if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4)) + BUG(); + icmp_param.data.icmph = *skb->h.icmph; + icmp_param.data.icmph.type = ICMP_TIMESTAMPREPLY; + icmp_param.data.icmph.code = 0; + icmp_param.skb = skb; + icmp_param.offset = 0; + icmp_param.data_len = 0; + icmp_param.head_len = sizeof(struct icmphdr) + 12; + icmp_reply(&icmp_param, skb); +out: + return; +out_err: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; +} + + +/* + * Handle ICMP_ADDRESS_MASK requests. (RFC950) + * + * RFC1122 (3.2.2.9). A host MUST only send replies to + * ADDRESS_MASK requests if it's been configured as an address mask + * agent. Receiving a request doesn't constitute implicit permission to + * act as one. Of course, implementing this correctly requires (SHOULD) + * a way to turn the functionality on and off. Another one for sysctl(), + * I guess. -- MS + * + * RFC1812 (4.3.3.9). A router MUST implement it. + * A router SHOULD have switch turning it on/off. + * This switch MUST be ON by default. + * + * Gratuitous replies, zero-source replies are not implemented, + * that complies with RFC. DO NOT implement them!!! All the idea + * of broadcast addrmask replies as specified in RFC950 is broken. + * The problem is that it is not uncommon to have several prefixes + * on one physical interface. Moreover, addrmask agent can even be + * not aware of existing another prefixes. + * If source is zero, addrmask agent cannot choose correct prefix. + * Gratuitous mask announcements suffer from the same problem. + * RFC1812 explains it, but still allows to use ADDRMASK, + * that is pretty silly. --ANK + * + * All these rules are so bizarre, that I removed kernel addrmask + * support at all. It is wrong, it is obsolete, nobody uses it in + * any case. --ANK + * + * Furthermore you can do it with a usermode address agent program + * anyway... 
+ */ + +static void icmp_address(struct sk_buff *skb) +{ +#if 0 + if (net_ratelimit()) + printk(KERN_DEBUG "a guy asks for address mask. Who is it?\n"); +#endif +} + +/* + * RFC1812 (4.3.3.9). A router SHOULD listen all replies, and complain + * loudly if an inconsistency is found. + */ + +static void icmp_address_reply(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable *)skb->dst; + struct net_device *dev = skb->dev; + struct in_device *in_dev; + struct in_ifaddr *ifa; + + if (skb->len < 4 || !(rt->rt_flags&RTCF_DIRECTSRC)) + goto out; + + in_dev = in_dev_get(dev); + if (!in_dev) + goto out; + rcu_read_lock(); + if (in_dev->ifa_list && + IN_DEV_LOG_MARTIANS(in_dev) && + IN_DEV_FORWARD(in_dev)) { + u32 _mask, *mp; + + mp = skb_header_pointer(skb, 0, sizeof(_mask), &_mask); + if (mp == NULL) + BUG(); + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + if (*mp == ifa->ifa_mask && + inet_ifa_match(rt->rt_src, ifa)) + break; + } + if (!ifa && net_ratelimit()) { + printk(KERN_INFO "Wrong address mask %u.%u.%u.%u from " + "%s/%u.%u.%u.%u\n", + NIPQUAD(*mp), dev->name, NIPQUAD(rt->rt_src)); + } + } + rcu_read_unlock(); + in_dev_put(in_dev); +out:; +} + +static void icmp_discard(struct sk_buff *skb) +{ +} + +/* + * Deal with incoming ICMP packets. + */ +int icmp_rcv(struct sk_buff *skb) +{ + struct icmphdr *icmph; + struct rtable *rt = (struct rtable *)skb->dst; + + ICMP_INC_STATS_BH(ICMP_MIB_INMSGS); + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "icmp v4 hw csum failure\n")); + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) + goto error; + default:; + } + + if (!pskb_pull(skb, sizeof(struct icmphdr))) + goto error; + + icmph = skb->h.icmph; + + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) + goto error; + + + /* + * Parse the ICMP message + */ + + if (rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + /* + * RFC 1122: 3.2.2.6 An ICMP_ECHO to broadcast MAY be + * silently ignored (we let user decide with a sysctl). + * RFC 1122: 3.2.2.8 An ICMP_TIMESTAMP MAY be silently + * discarded if to broadcast/multicast. + */ + if (icmph->type == ICMP_ECHO && + sysctl_icmp_echo_ignore_broadcasts) { + goto error; + } + if (icmph->type != ICMP_ECHO && + icmph->type != ICMP_TIMESTAMP && + icmph->type != ICMP_ADDRESS && + icmph->type != ICMP_ADDRESSREPLY) { + goto error; + } + } + + ICMP_INC_STATS_BH(icmp_pointers[icmph->type].input_entry); + icmp_pointers[icmph->type].handler(skb); + +drop: + kfree_skb(skb); + return 0; +error: + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto drop; +} + +/* + * This table is the definition of how we handle ICMP. 
+ */ +static struct icmp_control icmp_pointers[NR_ICMP_TYPES + 1] = { + [ICMP_ECHOREPLY] = { + .output_entry = ICMP_MIB_OUTECHOREPS, + .input_entry = ICMP_MIB_INECHOREPS, + .handler = icmp_discard, + }, + [1] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [2] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [ICMP_DEST_UNREACH] = { + .output_entry = ICMP_MIB_OUTDESTUNREACHS, + .input_entry = ICMP_MIB_INDESTUNREACHS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_SOURCE_QUENCH] = { + .output_entry = ICMP_MIB_OUTSRCQUENCHS, + .input_entry = ICMP_MIB_INSRCQUENCHS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_REDIRECT] = { + .output_entry = ICMP_MIB_OUTREDIRECTS, + .input_entry = ICMP_MIB_INREDIRECTS, + .handler = icmp_redirect, + .error = 1, + }, + [6] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [7] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [ICMP_ECHO] = { + .output_entry = ICMP_MIB_OUTECHOS, + .input_entry = ICMP_MIB_INECHOS, + .handler = icmp_echo, + }, + [9] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [10] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_INERRORS, + .handler = icmp_discard, + .error = 1, + }, + [ICMP_TIME_EXCEEDED] = { + .output_entry = ICMP_MIB_OUTTIMEEXCDS, + .input_entry = ICMP_MIB_INTIMEEXCDS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_PARAMETERPROB] = { + .output_entry = ICMP_MIB_OUTPARMPROBS, + .input_entry = ICMP_MIB_INPARMPROBS, + .handler = icmp_unreach, + .error = 1, + }, + [ICMP_TIMESTAMP] = { + .output_entry = ICMP_MIB_OUTTIMESTAMPS, + .input_entry = ICMP_MIB_INTIMESTAMPS, + .handler = icmp_timestamp, + }, + [ICMP_TIMESTAMPREPLY] = { + .output_entry = ICMP_MIB_OUTTIMESTAMPREPS, + .input_entry = ICMP_MIB_INTIMESTAMPREPS, + .handler = icmp_discard, + }, + [ICMP_INFO_REQUEST] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_DUMMY, + .handler = icmp_discard, + }, + [ICMP_INFO_REPLY] = { + .output_entry = ICMP_MIB_DUMMY, + .input_entry = ICMP_MIB_DUMMY, + .handler = icmp_discard, + }, + [ICMP_ADDRESS] = { + .output_entry = ICMP_MIB_OUTADDRMASKS, + .input_entry = ICMP_MIB_INADDRMASKS, + .handler = icmp_address, + }, + [ICMP_ADDRESSREPLY] = { + .output_entry = ICMP_MIB_OUTADDRMASKREPS, + .input_entry = ICMP_MIB_INADDRMASKREPS, + .handler = icmp_address_reply, + }, +}; + +void __init icmp_init(struct net_proto_family *ops) +{ + struct inet_sock *inet; + int i; + + for (i = 0; i < NR_CPUS; i++) { + int err; + + if (!cpu_possible(i)) + continue; + + err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_ICMP, + &per_cpu(__icmp_socket, i)); + + if (err < 0) + panic("Failed to create the ICMP control socket.\n"); + + per_cpu(__icmp_socket, i)->sk->sk_allocation = GFP_ATOMIC; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. + */ + per_cpu(__icmp_socket, i)->sk->sk_sndbuf = + (2 * ((64 * 1024) + sizeof(struct sk_buff))); + + inet = inet_sk(per_cpu(__icmp_socket, i)->sk); + inet->uc_ttl = -1; + inet->pmtudisc = IP_PMTUDISC_DONT; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. 
+ */ + per_cpu(__icmp_socket, i)->sk->sk_prot->unhash(per_cpu(__icmp_socket, i)->sk); + } +} + +EXPORT_SYMBOL(icmp_err_convert); +EXPORT_SYMBOL(icmp_send); +EXPORT_SYMBOL(icmp_statistics); +EXPORT_SYMBOL(xrlim_allow); diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c new file mode 100644 index 000000000000..1f3183168a90 --- /dev/null +++ b/net/ipv4/igmp.c @@ -0,0 +1,2473 @@ +/* + * Linux NET3: Internet Group Management Protocol [IGMP] + * + * This code implements the IGMP protocol as defined in RFC1112. There has + * been a further revision of this protocol since which is now supported. + * + * If you have trouble with this module be careful what gcc you have used, + * the older version didn't come out right using gcc 2.5.8, the newer one + * seems to fall out with gcc 2.6.2. + * + * Version: $Id: igmp.c,v 1.47 2002/02/01 22:01:03 davem Exp $ + * + * Authors: + * Alan Cox + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * + * Alan Cox : Added lots of __inline__ to optimise + * the memory usage of all the tiny little + * functions. + * Alan Cox : Dumped the header building experiment. + * Alan Cox : Minor tweaks ready for multicast routing + * and extended IGMP protocol. + * Alan Cox : Removed a load of inline directives. Gcc 2.5.8 + * writes utterly bogus code otherwise (sigh) + * fixed IGMP loopback to behave in the manner + * desired by mrouted, fixed the fact it has been + * broken since 1.3.6 and cleaned up a few minor + * points. + * + * Chih-Jen Chang : Tried to revise IGMP to Version 2 + * Tsu-Sheng Tsao E-mail: chihjenc@scf.usc.edu and tsusheng@scf.usc.edu + * The enhancements are mainly based on Steve Deering's + * ipmulti-3.5 source code. + * Chih-Jen Chang : Added the igmp_get_mrouter_info and + * Tsu-Sheng Tsao igmp_set_mrouter_info to keep track of + * the mrouted version on that device. + * Chih-Jen Chang : Added the max_resp_time parameter to + * Tsu-Sheng Tsao igmp_heard_query(). Using this parameter + * to identify the multicast router version + * and do what the IGMP version 2 specified. + * Chih-Jen Chang : Added a timer to revert to IGMP V2 router + * Tsu-Sheng Tsao if the specified time expired. + * Alan Cox : Stop IGMP from 0.0.0.0 being accepted. + * Alan Cox : Use GFP_ATOMIC in the right places. + * Christian Daudt : igmp timer wasn't set for local group + * memberships but was being deleted, + * which caused a "del_timer() called + * from %p with timer not initialized\n" + * message (960131). + * Christian Daudt : removed del_timer from + * igmp_timer_expire function (960205). + * Christian Daudt : igmp_heard_report now only calls + * igmp_timer_expire if tm->running is + * true (960216). + * Malcolm Beattie : ttl comparison wrong in igmp_rcv made + * igmp_heard_query never trigger. Expiry + * miscalculation fixed in igmp_heard_query + * and random() made to return unsigned to + * prevent negative expiry times. + * Alexey Kuznetsov: Wrong group leaving behaviour, backport + * fix from pending 2.1.x patches. + * Alan Cox: Forget to enable FDDI support earlier. + * Alexey Kuznetsov: Fixed leaving groups on device down. + * Alexey Kuznetsov: Accordance to igmp-v2-06 draft. 
+ * David L Stevens: IGMPv3 support, with help from + * Vinay Kulkarni + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IP_MROUTE +#include +#endif +#ifdef CONFIG_PROC_FS +#include +#include +#endif + +#define IP_MAX_MEMBERSHIPS 20 +#define IP_MAX_MSF 10 + +#ifdef CONFIG_IP_MULTICAST +/* Parameter names and values are taken from igmp-v2-06 draft */ + +#define IGMP_V1_Router_Present_Timeout (400*HZ) +#define IGMP_V2_Router_Present_Timeout (400*HZ) +#define IGMP_Unsolicited_Report_Interval (10*HZ) +#define IGMP_Query_Response_Interval (10*HZ) +#define IGMP_Unsolicited_Report_Count 2 + + +#define IGMP_Initial_Report_Delay (1) + +/* IGMP_Initial_Report_Delay is not from IGMP specs! + * IGMP specs require to report membership immediately after + * joining a group, but we delay the first report by a + * small interval. It seems more natural and still does not + * contradict to specs provided this delay is small enough. + */ + +#define IGMP_V1_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 1 || \ + (in_dev)->cnf.force_igmp_version == 1 || \ + ((in_dev)->mr_v1_seen && \ + time_before(jiffies, (in_dev)->mr_v1_seen))) +#define IGMP_V2_SEEN(in_dev) (ipv4_devconf.force_igmp_version == 2 || \ + (in_dev)->cnf.force_igmp_version == 2 || \ + ((in_dev)->mr_v2_seen && \ + time_before(jiffies, (in_dev)->mr_v2_seen))) + +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im); +static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr); +static void igmpv3_clear_delrec(struct in_device *in_dev); +static int sf_setstate(struct ip_mc_list *pmc); +static void sf_markstate(struct ip_mc_list *pmc); +#endif +static void ip_mc_clear_src(struct ip_mc_list *pmc); +static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode, + int sfcount, __u32 *psfsrc, int delta); + +static void ip_ma_put(struct ip_mc_list *im) +{ + if (atomic_dec_and_test(&im->refcnt)) { + in_dev_put(im->interface); + kfree(im); + } +} + +#ifdef CONFIG_IP_MULTICAST + +/* + * Timer management + */ + +static __inline__ void igmp_stop_timer(struct ip_mc_list *im) +{ + spin_lock_bh(&im->lock); + if (del_timer(&im->timer)) + atomic_dec(&im->refcnt); + im->tm_running=0; + im->reporter = 0; + im->unsolicit_count = 0; + spin_unlock_bh(&im->lock); +} + +/* It must be called with locked im->lock */ +static void igmp_start_timer(struct ip_mc_list *im, int max_delay) +{ + int tv=net_random() % max_delay; + + im->tm_running=1; + if (!mod_timer(&im->timer, jiffies+tv+2)) + atomic_inc(&im->refcnt); +} + +static void igmp_gq_start_timer(struct in_device *in_dev) +{ + int tv = net_random() % in_dev->mr_maxdelay; + + in_dev->mr_gq_running = 1; + if (!mod_timer(&in_dev->mr_gq_timer, jiffies+tv+2)) + in_dev_hold(in_dev); +} + +static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) +{ + int tv = net_random() % delay; + + if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) + in_dev_hold(in_dev); +} + +static void igmp_mod_timer(struct ip_mc_list *im, int max_delay) +{ + spin_lock_bh(&im->lock); + im->unsolicit_count = 0; + if (del_timer(&im->timer)) { + if ((long)(im->timer.expires-jiffies) < max_delay) { + add_timer(&im->timer); + im->tm_running=1; + spin_unlock_bh(&im->lock); + return; + } + atomic_dec(&im->refcnt); + } + igmp_start_timer(im, max_delay); + 
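/*
 * Note on reference counting: one reference on 'im' is held for every
 * pending timer.  On this path any previously pending timer has already
 * been removed by the del_timer() branch above (which dropped its
 * reference), so igmp_start_timer() re-arms the timer and takes a fresh
 * reference through its !mod_timer() test.  That reference is released
 * by ip_ma_put() in igmp_timer_expire() when the timer fires, or dropped
 * again by igmp_stop_timer()/igmp_mod_timer() if the timer is cancelled
 * or re-armed before firing.
 */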
spin_unlock_bh(&im->lock); +} + + +/* + * Send an IGMP report. + */ + +#define IGMP_SIZE (sizeof(struct igmphdr)+sizeof(struct iphdr)+4) + + +static int is_in(struct ip_mc_list *pmc, struct ip_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case IGMPV3_MODE_IS_INCLUDE: + case IGMPV3_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + return !(pmc->gsquery && !psf->sf_gsresp); + case IGMPV3_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case IGMPV3_CHANGE_TO_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (pmc->sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case IGMPV3_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->sfmode == MCAST_INCLUDE) ^ sdeleted; + case IGMPV3_BLOCK_OLD_SOURCES: + if (pmc->sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +igmp_scount(struct ip_mc_list *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip_sf_list *psf; + int scount = 0; + + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +static struct sk_buff *igmpv3_newpack(struct net_device *dev, int size) +{ + struct sk_buff *skb; + struct rtable *rt; + struct iphdr *pip; + struct igmpv3_report *pig; + + skb = alloc_skb(size + LL_RESERVED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) + return NULL; + + { + struct flowi fl = { .oif = dev->ifindex, + .nl_u = { .ip4_u = { + .daddr = IGMPV3_ALL_MCR } }, + .proto = IPPROTO_IGMP }; + if (ip_route_output_key(&rt, &fl)) { + kfree_skb(skb); + return NULL; + } + } + if (rt->rt_src == 0) { + kfree_skb(skb); + ip_rt_put(rt); + return NULL; + } + + skb->dst = &rt->u.dst; + skb->dev = dev; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb->nh.iph = pip =(struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + + pip->version = 4; + pip->ihl = (sizeof(struct iphdr)+4)>>2; + pip->tos = 0xc0; + pip->frag_off = htons(IP_DF); + pip->ttl = 1; + pip->daddr = rt->rt_dst; + pip->saddr = rt->rt_src; + pip->protocol = IPPROTO_IGMP; + pip->tot_len = 0; /* filled in later */ + ip_select_ident(pip, &rt->u.dst, NULL); + ((u8*)&pip[1])[0] = IPOPT_RA; + ((u8*)&pip[1])[1] = 4; + ((u8*)&pip[1])[2] = 0; + ((u8*)&pip[1])[3] = 0; + + pig =(struct igmpv3_report *)skb_put(skb, sizeof(*pig)); + skb->h.igmph = (struct igmphdr *)pig; + pig->type = IGMPV3_HOST_MEMBERSHIP_REPORT; + pig->resv1 = 0; + pig->csum = 0; + pig->resv2 = 0; + pig->ngrec = 0; + return skb; +} + +static int igmpv3_sendpack(struct sk_buff *skb) +{ + struct iphdr *pip = skb->nh.iph; + struct igmphdr *pig = skb->h.igmph; + int iplen, igmplen; + + iplen = skb->tail - (unsigned char *)skb->nh.iph; + pip->tot_len = htons(iplen); + ip_send_check(pip); + + igmplen = skb->tail - (unsigned char *)skb->h.igmph; + pig->csum = ip_compute_csum((void *)skb->h.igmph, igmplen); + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, skb->dev, + dst_output); +} + +static int grec_size(struct ip_mc_list *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct igmpv3_grec) + 4*igmp_scount(pmc,type,gdel,sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc, + int type, struct igmpv3_grec **ppgr) +{ + struct net_device *dev = pmc->interface->dev; + struct igmpv3_report 
*pih; + struct igmpv3_grec *pgr; + + if (!skb) + skb = igmpv3_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct igmpv3_grec *)skb_put(skb, sizeof(struct igmpv3_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->multiaddr; + pih = (struct igmpv3_report *)skb->h.igmph; + pih->ngrec = htons(ntohs(pih->ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->interface->dev; + struct igmpv3_report *pih; + struct igmpv3_grec *pgr = NULL; + struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, first, isquery, truncate; + + if (pmc->multiaddr == IGMP_ALL_HOSTS) + return skb; + + isquery = type == IGMPV3_MODE_IS_INCLUDE || + type == IGMPV3_MODE_IS_EXCLUDE; + truncate = type == IGMPV3_MODE_IS_EXCLUDE || + type == IGMPV3_CHANGE_TO_EXCLUDE; + + psf_list = sdeleted ? &pmc->tomb : &pmc->sources; + + if (!*psf_list) { + if (type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) + return skb; + if (pmc->crcount || isquery) { + /* make sure we have room for group header and at + * least one source. + */ + if (skb && AVAILABLE(skb) < sizeof(struct igmpv3_grec)+ + sizeof(__u32)) { + igmpv3_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + return skb; + } + pih = skb ? (struct igmpv3_report *)skb->h.igmph : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pih && pih->ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + igmpv3_sendpack(skb); + skb = igmpv3_newpack(dev, dev->mtu); + } + } + first = 1; + scount = 0; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + u32 *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(u32) + + first*sizeof(struct igmpv3_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + igmpv3_sendpack(skb); + skb = igmpv3_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + psrc = (u32 *)skb_put(skb, sizeof(u32)); + *psrc = psf->sf_inaddr; + scount++; + if ((type == IGMPV3_ALLOW_NEW_SOURCES || + type == IGMPV3_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + if (pgr) + pgr->grec_nsrcs = htons(scount); + + if (isquery) + pmc->gsquery = 0; /* clear query state on report */ + return skb; +} + +static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + if (pmc->multiaddr == IGMP_ALL_HOSTS) + continue; + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) + type = IGMPV3_MODE_IS_EXCLUDE; + else + type = IGMPV3_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + 
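/*
 * Note: add_grec() appends this group's record to the report being
 * built.  When the current packet runs out of room it flushes it with
 * igmpv3_sendpack() and opens a new one via igmpv3_newpack(), so the
 * skb returned here may differ from the one passed in; whatever is
 * still pending after the loop is sent by the igmpv3_sendpack() call
 * at the end of this function.
 */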
spin_unlock_bh(&pmc->lock); + } + read_unlock(&in_dev->mc_list_lock); + } else { + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) + type = IGMPV3_MODE_IS_EXCLUDE; + else + type = IGMPV3_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->lock); + } + if (!skb) + return 0; + return igmpv3_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void igmpv3_clear_zeros(struct ip_sf_list **ppsf) +{ + struct ip_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void igmpv3_send_cr(struct in_device *in_dev) +{ + struct ip_mc_list *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + read_lock(&in_dev->mc_list_lock); + spin_lock_bh(&in_dev->mc_tomb_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=in_dev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->sfmode == MCAST_INCLUDE) { + type = IGMPV3_BLOCK_OLD_SOURCES; + dtype = IGMPV3_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->crcount) { + pmc->crcount--; + if (pmc->sfmode == MCAST_EXCLUDE) { + type = IGMPV3_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 1, 0); + } + if (pmc->crcount == 0) { + igmpv3_clear_zeros(&pmc->tomb); + igmpv3_clear_zeros(&pmc->sources); + } + } + if (pmc->crcount == 0 && !pmc->tomb && !pmc->sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + in_dev->mc_tomb = pmc_next; + in_dev_put(pmc->interface); + kfree(pmc); + } else + pmc_prev = pmc; + } + spin_unlock_bh(&in_dev->mc_tomb_lock); + + /* change recs */ + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->lock); + if (pmc->sfcount[MCAST_EXCLUDE]) { + type = IGMPV3_BLOCK_OLD_SOURCES; + dtype = IGMPV3_ALLOW_NEW_SOURCES; + } else { + type = IGMPV3_ALLOW_NEW_SOURCES; + dtype = IGMPV3_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->crcount) { + pmc->crcount--; + if (pmc->sfmode == MCAST_EXCLUDE) + type = IGMPV3_CHANGE_TO_EXCLUDE; + else + type = IGMPV3_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + } + spin_unlock_bh(&pmc->lock); + } + read_unlock(&in_dev->mc_list_lock); + + if (!skb) + return; + (void) igmpv3_sendpack(skb); +} + +static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc, + int type) +{ + struct sk_buff *skb; + struct iphdr *iph; + struct igmphdr *ih; + struct rtable *rt; + struct net_device *dev = in_dev->dev; + u32 group = pmc ? 
pmc->multiaddr : 0; + u32 dst; + + if (type == IGMPV3_HOST_MEMBERSHIP_REPORT) + return igmpv3_send_report(in_dev, pmc); + else if (type == IGMP_HOST_LEAVE_MESSAGE) + dst = IGMP_ALL_ROUTER; + else + dst = group; + + { + struct flowi fl = { .oif = dev->ifindex, + .nl_u = { .ip4_u = { .daddr = dst } }, + .proto = IPPROTO_IGMP }; + if (ip_route_output_key(&rt, &fl)) + return -1; + } + if (rt->rt_src == 0) { + ip_rt_put(rt); + return -1; + } + + skb=alloc_skb(IGMP_SIZE+LL_RESERVED_SPACE(dev), GFP_ATOMIC); + if (skb == NULL) { + ip_rt_put(rt); + return -1; + } + + skb->dst = &rt->u.dst; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, sizeof(struct iphdr)+4); + + iph->version = 4; + iph->ihl = (sizeof(struct iphdr)+4)>>2; + iph->tos = 0xc0; + iph->frag_off = htons(IP_DF); + iph->ttl = 1; + iph->daddr = dst; + iph->saddr = rt->rt_src; + iph->protocol = IPPROTO_IGMP; + iph->tot_len = htons(IGMP_SIZE); + ip_select_ident(iph, &rt->u.dst, NULL); + ((u8*)&iph[1])[0] = IPOPT_RA; + ((u8*)&iph[1])[1] = 4; + ((u8*)&iph[1])[2] = 0; + ((u8*)&iph[1])[3] = 0; + ip_send_check(iph); + + ih = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + ih->type=type; + ih->code=0; + ih->csum=0; + ih->group=group; + ih->csum=ip_compute_csum((void *)ih, sizeof(struct igmphdr)); + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); +} + +static void igmp_gq_timer_expire(unsigned long data) +{ + struct in_device *in_dev = (struct in_device *)data; + + in_dev->mr_gq_running = 0; + igmpv3_send_report(in_dev, NULL); + __in_dev_put(in_dev); +} + +static void igmp_ifc_timer_expire(unsigned long data) +{ + struct in_device *in_dev = (struct in_device *)data; + + igmpv3_send_cr(in_dev); + if (in_dev->mr_ifc_count) { + in_dev->mr_ifc_count--; + igmp_ifc_start_timer(in_dev, IGMP_Unsolicited_Report_Interval); + } + __in_dev_put(in_dev); +} + +static void igmp_ifc_event(struct in_device *in_dev) +{ + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) + return; + in_dev->mr_ifc_count = in_dev->mr_qrv ? 
in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + igmp_ifc_start_timer(in_dev, 1); +} + + +static void igmp_timer_expire(unsigned long data) +{ + struct ip_mc_list *im=(struct ip_mc_list *)data; + struct in_device *in_dev = im->interface; + + spin_lock(&im->lock); + im->tm_running=0; + + if (im->unsolicit_count) { + im->unsolicit_count--; + igmp_start_timer(im, IGMP_Unsolicited_Report_Interval); + } + im->reporter = 1; + spin_unlock(&im->lock); + + if (IGMP_V1_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); + else if (IGMP_V2_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); + else + igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); + + ip_ma_put(im); +} + +static void igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __u32 *srcs) +{ + struct ip_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; isf_inaddr) { + psf->sf_gsresp = 1; + scount++; + break; + } + } +} + +static void igmp_heard_report(struct in_device *in_dev, u32 group) +{ + struct ip_mc_list *im; + + /* Timers are only set for non-local groups */ + + if (group == IGMP_ALL_HOSTS) + return; + + read_lock(&in_dev->mc_list_lock); + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (im->multiaddr == group) { + igmp_stop_timer(im); + break; + } + } + read_unlock(&in_dev->mc_list_lock); +} + +static void igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb, + int len) +{ + struct igmphdr *ih = skb->h.igmph; + struct igmpv3_query *ih3 = (struct igmpv3_query *)ih; + struct ip_mc_list *im; + u32 group = ih->group; + int max_delay; + int mark = 0; + + + if (len == 8) { + if (ih->code == 0) { + /* Alas, old v1 router presents here. */ + + max_delay = IGMP_Query_Response_Interval; + in_dev->mr_v1_seen = jiffies + + IGMP_V1_Router_Present_Timeout; + group = 0; + } else { + /* v2 router present */ + max_delay = ih->code*(HZ/IGMP_TIMER_SCALE); + in_dev->mr_v2_seen = jiffies + + IGMP_V2_Router_Present_Timeout; + } + /* cancel the interface change timer */ + in_dev->mr_ifc_count = 0; + if (del_timer(&in_dev->mr_ifc_timer)) + __in_dev_put(in_dev); + /* clear deleted report items */ + igmpv3_clear_delrec(in_dev); + } else if (len < 12) { + return; /* ignore bogus packet; freed by caller */ + } else { /* v3 */ + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query))) + return; + + ih3 = (struct igmpv3_query *) skb->h.raw; + if (ih3->nsrcs) { + if (!pskb_may_pull(skb, sizeof(struct igmpv3_query) + + ntohs(ih3->nsrcs)*sizeof(__u32))) + return; + ih3 = (struct igmpv3_query *) skb->h.raw; + } + + max_delay = IGMPV3_MRC(ih3->code)*(HZ/IGMP_TIMER_SCALE); + if (!max_delay) + max_delay = 1; /* can't mod w/ 0 */ + in_dev->mr_maxdelay = max_delay; + if (ih3->qrv) + in_dev->mr_qrv = ih3->qrv; + if (!group) { /* general query */ + if (ih3->nsrcs) + return; /* no sources allowed */ + igmp_gq_start_timer(in_dev); + return; + } + /* mark sources to include, if group & source-specific */ + mark = ih3->nsrcs != 0; + } + + /* + * - Start the timers in all of our membership records + * that the query applies to for the interface on + * which the query arrived excl. those that belong + * to a "local" group (224.0.0.X) + * - For timers already running check if they need to + * be reset. 
+ * - Use the igmp->igmp_code field as the maximum + * delay possible + */ + read_lock(&in_dev->mc_list_lock); + for (im=in_dev->mc_list; im!=NULL; im=im->next) { + if (group && group != im->multiaddr) + continue; + if (im->multiaddr == IGMP_ALL_HOSTS) + continue; + spin_lock_bh(&im->lock); + if (im->tm_running) + im->gsquery = im->gsquery && mark; + else + im->gsquery = mark; + if (im->gsquery) + igmp_marksources(im, ntohs(ih3->nsrcs), ih3->srcs); + spin_unlock_bh(&im->lock); + igmp_mod_timer(im, max_delay); + } + read_unlock(&in_dev->mc_list_lock); +} + +int igmp_rcv(struct sk_buff *skb) +{ + /* This basically follows the spec line by line -- see RFC1112 */ + struct igmphdr *ih; + struct in_device *in_dev = in_dev_get(skb->dev); + int len = skb->len; + + if (in_dev==NULL) { + kfree_skb(skb); + return 0; + } + + if (!pskb_may_pull(skb, sizeof(struct igmphdr)) || + (u16)csum_fold(skb_checksum(skb, 0, len, 0))) { + in_dev_put(in_dev); + kfree_skb(skb); + return 0; + } + + ih = skb->h.igmph; + switch (ih->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + igmp_heard_query(in_dev, skb, len); + break; + case IGMP_HOST_MEMBERSHIP_REPORT: + case IGMPV2_HOST_MEMBERSHIP_REPORT: + case IGMPV3_HOST_MEMBERSHIP_REPORT: + /* Is it our report looped back? */ + if (((struct rtable*)skb->dst)->fl.iif == 0) + break; + igmp_heard_report(in_dev, ih->group); + break; + case IGMP_PIM: +#ifdef CONFIG_IP_PIMSM_V1 + in_dev_put(in_dev); + return pim_rcv_v1(skb); +#endif + case IGMP_DVMRP: + case IGMP_TRACE: + case IGMP_HOST_LEAVE_MESSAGE: + case IGMP_MTRACE: + case IGMP_MTRACE_RESP: + break; + default: + NETDEBUG(printk(KERN_DEBUG "New IGMP type=%d, why we do not know about it?\n", ih->type)); + } + in_dev_put(in_dev); + kfree_skb(skb); + return 0; +} + +#endif + + +/* + * Add a filter to a device + */ + +static void ip_mc_filter_add(struct in_device *in_dev, u32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct net_device *dev = in_dev->dev; + + /* Checking for IFF_MULTICAST here is WRONG-WRONG-WRONG. + We will get multicast token leakage, when IFF_MULTICAST + is changed. This check should be done in dev->set_multicast_list + routine. Something sort of: + if (dev->mc_list && dev->flags&IFF_MULTICAST) { do it; } + --ANK + */ + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_add(dev,buf,dev->addr_len,0); +} + +/* + * Remove a filter from a device + */ + +static void ip_mc_filter_del(struct in_device *in_dev, u32 addr) +{ + char buf[MAX_ADDR_LEN]; + struct net_device *dev = in_dev->dev; + + if (arp_mc_map(addr, buf, dev, 0) == 0) + dev_mc_delete(dev,buf,dev->addr_len,0); +} + +#ifdef CONFIG_IP_MULTICAST +/* + * deleted ip_mc_list manipulation + */ +static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im) +{ + struct ip_mc_list *pmc; + + /* this is an "ip_mc_list" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's. + */ + pmc = (struct ip_mc_list *)kmalloc(sizeof(*pmc), GFP_KERNEL); + if (!pmc) + return; + memset(pmc, 0, sizeof(*pmc)); + spin_lock_bh(&im->lock); + pmc->interface = im->interface; + in_dev_hold(in_dev); + pmc->multiaddr = im->multiaddr; + pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + pmc->sfmode = im->sfmode; + if (pmc->sfmode == MCAST_INCLUDE) { + struct ip_sf_list *psf; + + pmc->tomb = im->tomb; + pmc->sources = im->sources; + im->tomb = im->sources = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->crcount; + } + spin_unlock_bh(&im->lock); + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc->next = in_dev->mc_tomb; + in_dev->mc_tomb = pmc; + spin_unlock_bh(&in_dev->mc_tomb_lock); +} + +static void igmpv3_del_delrec(struct in_device *in_dev, __u32 multiaddr) +{ + struct ip_mc_list *pmc, *pmc_prev; + struct ip_sf_list *psf, *psf_next; + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc_prev = NULL; + for (pmc=in_dev->mc_tomb; pmc; pmc=pmc->next) { + if (pmc->multiaddr == multiaddr) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + in_dev->mc_tomb = pmc->next; + } + spin_unlock_bh(&in_dev->mc_tomb_lock); + if (pmc) { + for (psf=pmc->tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in_dev_put(pmc->interface); + kfree(pmc); + } +} + +static void igmpv3_clear_delrec(struct in_device *in_dev) +{ + struct ip_mc_list *pmc, *nextpmc; + + spin_lock_bh(&in_dev->mc_tomb_lock); + pmc = in_dev->mc_tomb; + in_dev->mc_tomb = NULL; + spin_unlock_bh(&in_dev->mc_tomb_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip_mc_clear_src(pmc); + in_dev_put(pmc->interface); + kfree(pmc); + } + /* clear dead sources, too */ + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + struct ip_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->lock); + psf = pmc->tomb; + pmc->tomb = NULL; + spin_unlock_bh(&pmc->lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + read_unlock(&in_dev->mc_list_lock); +} +#endif + +static void igmp_group_dropped(struct ip_mc_list *im) +{ + struct in_device *in_dev = im->interface; +#ifdef CONFIG_IP_MULTICAST + int reporter; +#endif + + if (im->loaded) { + im->loaded = 0; + ip_mc_filter_del(in_dev, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + reporter = im->reporter; + igmp_stop_timer(im); + + if (!in_dev->dead) { + if (IGMP_V1_SEEN(in_dev)) + goto done; + if (IGMP_V2_SEEN(in_dev)) { + if (reporter) + igmp_send_report(in_dev, im, IGMP_HOST_LEAVE_MESSAGE); + goto done; + } + /* IGMPv3 */ + igmpv3_add_delrec(in_dev, im); + + igmp_ifc_event(in_dev); + } +done: +#endif + ip_mc_clear_src(im); +} + +static void igmp_group_added(struct ip_mc_list *im) +{ + struct in_device *in_dev = im->interface; + + if (im->loaded == 0) { + im->loaded = 1; + ip_mc_filter_add(in_dev, im->multiaddr); + } + +#ifdef CONFIG_IP_MULTICAST + if (im->multiaddr == IGMP_ALL_HOSTS) + return; + + if (in_dev->dead) + return; + if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { + spin_lock_bh(&im->lock); + igmp_start_timer(im, IGMP_Initial_Report_Delay); + spin_unlock_bh(&im->lock); + return; + } + /* else, v3 */ + + im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + igmp_ifc_event(in_dev); +#endif +} + + +/* + * Multicast list managers + */ + + +/* + * A socket has joined a multicast group on device dev. 
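igmpv3_add_delrec() and igmp_group_added() above both seed a per-record retransmission counter (crcount) from the querier's robustness variable, falling back to IGMP_Unsolicited_Report_Count; the interface-change timer then decrements it once per report, so a state change is announced several times in case one report is lost on the wire. A minimal user-space sketch of that bookkeeping (the names are invented and the default of 2 only stands in for the kernel constant):

#include <stdio.h>

#define TOY_UNSOL_REPORT_COUNT 2   /* stands in for IGMP_Unsolicited_Report_Count */

struct toy_group {
    unsigned int crcount;          /* state-change reports still owed */
};

/* Seed the counter the way the code above does: use the querier's
 * robustness variable when one has been learned, else the default. */
static void toy_state_changed(struct toy_group *g, unsigned int qrv)
{
    g->crcount = qrv ? qrv : TOY_UNSOL_REPORT_COUNT;
}

/* One tick of the interface-change timer: returns 1 while another
 * state-change record should still be transmitted. */
static int toy_report_due(struct toy_group *g)
{
    if (g->crcount == 0)
        return 0;
    g->crcount--;
    return 1;
}

int main(void)
{
    struct toy_group g;

    toy_state_changed(&g, 0);       /* no querier robustness seen yet */
    while (toy_report_due(&g))
        printf("send state-change record\n");
    return 0;
}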
+ */ + +void ip_mc_inc_group(struct in_device *in_dev, u32 addr) +{ + struct ip_mc_list *im; + + ASSERT_RTNL(); + + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == addr) { + im->users++; + ip_mc_add_src(in_dev, &addr, MCAST_EXCLUDE, 0, NULL, 0); + goto out; + } + } + + im = (struct ip_mc_list *)kmalloc(sizeof(*im), GFP_KERNEL); + if (!im) + goto out; + + im->users=1; + im->interface=in_dev; + in_dev_hold(in_dev); + im->multiaddr=addr; + /* initial mode is (EX, empty) */ + im->sfmode = MCAST_EXCLUDE; + im->sfcount[MCAST_INCLUDE] = 0; + im->sfcount[MCAST_EXCLUDE] = 1; + im->sources = NULL; + im->tomb = NULL; + im->crcount = 0; + atomic_set(&im->refcnt, 1); + spin_lock_init(&im->lock); +#ifdef CONFIG_IP_MULTICAST + im->tm_running=0; + init_timer(&im->timer); + im->timer.data=(unsigned long)im; + im->timer.function=&igmp_timer_expire; + im->unsolicit_count = IGMP_Unsolicited_Report_Count; + im->reporter = 0; + im->gsquery = 0; +#endif + im->loaded = 0; + write_lock_bh(&in_dev->mc_list_lock); + im->next=in_dev->mc_list; + in_dev->mc_list=im; + write_unlock_bh(&in_dev->mc_list_lock); +#ifdef CONFIG_IP_MULTICAST + igmpv3_del_delrec(in_dev, im->multiaddr); +#endif + igmp_group_added(im); + if (!in_dev->dead) + ip_rt_multicast_event(in_dev); +out: + return; +} + +/* + * A socket has left a multicast group on device dev + */ + +void ip_mc_dec_group(struct in_device *in_dev, u32 addr) +{ + struct ip_mc_list *i, **ip; + + ASSERT_RTNL(); + + for (ip=&in_dev->mc_list; (i=*ip)!=NULL; ip=&i->next) { + if (i->multiaddr==addr) { + if (--i->users == 0) { + write_lock_bh(&in_dev->mc_list_lock); + *ip = i->next; + write_unlock_bh(&in_dev->mc_list_lock); + igmp_group_dropped(i); + + if (!in_dev->dead) + ip_rt_multicast_event(in_dev); + + ip_ma_put(i); + return; + } + break; + } + } +} + +/* Device going down */ + +void ip_mc_down(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_dropped(i); + +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_ifc_count = 0; + if (del_timer(&in_dev->mr_ifc_timer)) + __in_dev_put(in_dev); + in_dev->mr_gq_running = 0; + if (del_timer(&in_dev->mr_gq_timer)) + __in_dev_put(in_dev); + igmpv3_clear_delrec(in_dev); +#endif + + ip_mc_dec_group(in_dev, IGMP_ALL_HOSTS); +} + +void ip_mc_init_dev(struct in_device *in_dev) +{ + ASSERT_RTNL(); + + in_dev->mc_tomb = NULL; +#ifdef CONFIG_IP_MULTICAST + in_dev->mr_gq_running = 0; + init_timer(&in_dev->mr_gq_timer); + in_dev->mr_gq_timer.data=(unsigned long) in_dev; + in_dev->mr_gq_timer.function=&igmp_gq_timer_expire; + in_dev->mr_ifc_count = 0; + init_timer(&in_dev->mr_ifc_timer); + in_dev->mr_ifc_timer.data=(unsigned long) in_dev; + in_dev->mr_ifc_timer.function=&igmp_ifc_timer_expire; + in_dev->mr_qrv = IGMP_Unsolicited_Report_Count; +#endif + + rwlock_init(&in_dev->mc_list_lock); + spin_lock_init(&in_dev->mc_tomb_lock); +} + +/* Device going up */ + +void ip_mc_up(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS); + + for (i=in_dev->mc_list; i; i=i->next) + igmp_group_added(i); +} + +/* + * Device is about to be destroyed: clean up. 
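ip_mc_inc_group() and ip_mc_dec_group() above keep one ip_mc_list entry per group and device, counting joiners in im->users; the entry is unlinked and the group reported as dropped only when the last user leaves. A simplified user-space model of that per-interface list, with hypothetical names and no locking or source filtering:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_mc {
    uint32_t addr;           /* multicast group address */
    unsigned int users;      /* how many sockets joined it */
    struct toy_mc *next;
};

/* Join: bump the counter if the group is already known, else prepend. */
static void toy_inc_group(struct toy_mc **list, uint32_t addr)
{
    struct toy_mc *im;

    for (im = *list; im; im = im->next)
        if (im->addr == addr) {
            im->users++;
            return;
        }
    im = malloc(sizeof(*im));
    if (!im)
        return;
    im->addr = addr;
    im->users = 1;
    im->next = *list;
    *list = im;
}

/* Leave: only the last leaver actually removes the entry. */
static void toy_dec_group(struct toy_mc **list, uint32_t addr)
{
    struct toy_mc *i, **ip;

    for (ip = list; (i = *ip) != NULL; ip = &i->next)
        if (i->addr == addr) {
            if (--i->users == 0) {
                *ip = i->next;
                free(i);
            }
            return;
        }
}

int main(void)
{
    struct toy_mc *list = NULL;

    toy_inc_group(&list, 0xe0000001);   /* 224.0.0.1 */
    toy_inc_group(&list, 0xe0000001);   /* second joiner */
    toy_dec_group(&list, 0xe0000001);   /* entry survives */
    toy_dec_group(&list, 0xe0000001);   /* now removed */
    printf("list %s\n", list ? "non-empty" : "empty");
    return 0;
}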
+ */ + +void ip_mc_destroy_dev(struct in_device *in_dev) +{ + struct ip_mc_list *i; + + ASSERT_RTNL(); + + /* Deactivate timers */ + ip_mc_down(in_dev); + + write_lock_bh(&in_dev->mc_list_lock); + while ((i = in_dev->mc_list) != NULL) { + in_dev->mc_list = i->next; + write_unlock_bh(&in_dev->mc_list_lock); + + igmp_group_dropped(i); + ip_ma_put(i); + + write_lock_bh(&in_dev->mc_list_lock); + } + write_unlock_bh(&in_dev->mc_list_lock); +} + +static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +{ + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = imr->imr_multiaddr.s_addr } } }; + struct rtable *rt; + struct net_device *dev = NULL; + struct in_device *idev = NULL; + + if (imr->imr_ifindex) { + idev = inetdev_by_index(imr->imr_ifindex); + if (idev) + __in_dev_put(idev); + return idev; + } + if (imr->imr_address.s_addr) { + dev = ip_dev_find(imr->imr_address.s_addr); + if (!dev) + return NULL; + __dev_put(dev); + } + + if (!dev && !ip_route_output_key(&rt, &fl)) { + dev = rt->u.dst.dev; + ip_rt_put(rt); + } + if (dev) { + imr->imr_ifindex = dev->ifindex; + idev = __in_dev_get(dev); + } + return idev; +} + +/* + * Join a socket to a group + */ +int sysctl_igmp_max_memberships = IP_MAX_MEMBERSHIPS; +int sysctl_igmp_max_msf = IP_MAX_MSF; + + +static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode, + __u32 *psfsrc) +{ + struct ip_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == *psfsrc) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return -ESRCH; + } + psf->sf_count[sfmode]--; + if (psf->sf_count[sfmode] == 0) { + ip_rt_multicast_event(pmc->interface); + } + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { +#ifdef CONFIG_IP_MULTICAST + struct in_device *in_dev = pmc->interface; +#endif + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->sources = psf->sf_next; +#ifdef CONFIG_IP_MULTICAST + if (psf->sf_oldin && + !IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) { + psf->sf_crcount = in_dev->mr_qrv ? in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + psf->sf_next = pmc->tomb; + pmc->tomb = psf; + rv = 1; + } else +#endif + kfree(psf); + } + return rv; +} + +#ifndef CONFIG_IP_MULTICAST +#define igmp_ifc_event(x) do { } while (0) +#endif + +static int ip_mc_del_src(struct in_device *in_dev, __u32 *pmca, int sfmode, + int sfcount, __u32 *psfsrc, int delta) +{ + struct ip_mc_list *pmc; + int changerec = 0; + int i, err; + + if (!in_dev) + return -ENODEV; + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + if (*pmca == pmc->multiaddr) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock(&in_dev->mc_list_lock); + return -ESRCH; + } + spin_lock_bh(&pmc->lock); + read_unlock(&in_dev->mc_list_lock); +#ifdef CONFIG_IP_MULTICAST + sf_markstate(pmc); +#endif + if (!delta) { + err = -EINVAL; + if (!pmc->sfcount[sfmode]) + goto out_unlock; + pmc->sfcount[sfmode]--; + } + err = 0; + for (i=0; i 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->sfmode == MCAST_EXCLUDE && + pmc->sfcount[MCAST_EXCLUDE] == 0 && + pmc->sfcount[MCAST_INCLUDE]) { +#ifdef CONFIG_IP_MULTICAST + struct ip_sf_list *psf; +#endif + + /* filter mode change */ + pmc->sfmode = MCAST_INCLUDE; +#ifdef CONFIG_IP_MULTICAST + pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = pmc->crcount; + for (psf=pmc->sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + igmp_ifc_event(pmc->interface); + } else if (sf_setstate(pmc) || changerec) { + igmp_ifc_event(pmc->interface); +#endif + } +out_unlock: + spin_unlock_bh(&pmc->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static int ip_mc_add1_src(struct ip_mc_list *pmc, int sfmode, + __u32 *psfsrc, int delta) +{ + struct ip_sf_list *psf, *psf_prev; + + psf_prev = NULL; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == *psfsrc) + break; + psf_prev = psf; + } + if (!psf) { + psf = (struct ip_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC); + if (!psf) + return -ENOBUFS; + memset(psf, 0, sizeof(*psf)); + psf->sf_inaddr = *psfsrc; + if (psf_prev) { + psf_prev->sf_next = psf; + } else + pmc->sources = psf; + } + psf->sf_count[sfmode]++; + if (psf->sf_count[sfmode] == 1) { + ip_rt_multicast_event(pmc->interface); + } + return 0; +} + +#ifdef CONFIG_IP_MULTICAST +static void sf_markstate(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf; + int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; + + for (psf=pmc->sources; psf; psf=psf->sf_next) + if (pmc->sfcount[MCAST_EXCLUDE]) { + psf->sf_oldin = mca_xcount == + psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; +} + +static int sf_setstate(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf; + int mca_xcount = pmc->sfcount[MCAST_EXCLUDE]; + int qrv = pmc->interface->mr_qrv; + int new_in, rv; + + rv = 0; + for (psf=pmc->sources; psf; psf=psf->sf_next) { + if (pmc->sfcount[MCAST_EXCLUDE]) { + new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + new_in = psf->sf_count[MCAST_INCLUDE] != 0; + if (new_in != psf->sf_oldin) { + psf->sf_crcount = qrv; + rv++; + } + } + return rv; +} +#endif + +/* + * Add multicast source filter list to the interface list + */ +static int ip_mc_add_src(struct in_device *in_dev, __u32 *pmca, int sfmode, + int sfcount, __u32 *psfsrc, int delta) +{ + struct ip_mc_list *pmc; + int isexclude; + int i, err; + + if (!in_dev) + return -ENODEV; + read_lock(&in_dev->mc_list_lock); + for (pmc=in_dev->mc_list; pmc; pmc=pmc->next) { + if (*pmca == pmc->multiaddr) + break; + } + if (!pmc) { + /* MCA not found?? bug */ + read_unlock(&in_dev->mc_list_lock); + return -ESRCH; + } + spin_lock_bh(&pmc->lock); + read_unlock(&in_dev->mc_list_lock); + +#ifdef CONFIG_IP_MULTICAST + sf_markstate(pmc); +#endif + isexclude = pmc->sfmode == MCAST_EXCLUDE; + if (!delta) + pmc->sfcount[sfmode]++; + err = 0; + for (i=0; isfcount[sfmode]--; + for (j=0; jsfcount[MCAST_EXCLUDE] != 0)) { +#ifdef CONFIG_IP_MULTICAST + struct in_device *in_dev = pmc->interface; + struct ip_sf_list *psf; +#endif + + /* filter mode change */ + if (pmc->sfcount[MCAST_EXCLUDE]) + pmc->sfmode = MCAST_EXCLUDE; + else if (pmc->sfcount[MCAST_INCLUDE]) + pmc->sfmode = MCAST_INCLUDE; +#ifdef CONFIG_IP_MULTICAST + /* else no filters; keep old mode for reports */ + + pmc->crcount = in_dev->mr_qrv ? 
in_dev->mr_qrv : + IGMP_Unsolicited_Report_Count; + in_dev->mr_ifc_count = pmc->crcount; + for (psf=pmc->sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + igmp_ifc_event(in_dev); + } else if (sf_setstate(pmc)) { + igmp_ifc_event(in_dev); +#endif + } + spin_unlock_bh(&pmc->lock); + return err; +} + +static void ip_mc_clear_src(struct ip_mc_list *pmc) +{ + struct ip_sf_list *psf, *nextpsf; + + for (psf=pmc->tomb; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->tomb = NULL; + for (psf=pmc->sources; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->sources = NULL; + pmc->sfmode = MCAST_EXCLUDE; + pmc->sfcount[MCAST_EXCLUDE] = 0; + pmc->sfcount[MCAST_EXCLUDE] = 1; +} + + +/* + * Join a multicast group + */ +int ip_mc_join_group(struct sock *sk , struct ip_mreqn *imr) +{ + int err; + u32 addr = imr->imr_multiaddr.s_addr; + struct ip_mc_socklist *iml, *i; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + int count = 0; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + in_dev = ip_mc_find_dev(imr); + + if (!in_dev) { + iml = NULL; + err = -ENODEV; + goto done; + } + + iml = (struct ip_mc_socklist *)sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL); + + err = -EADDRINUSE; + for (i = inet->mc_list; i; i = i->next) { + if (memcmp(&i->multi, imr, sizeof(*imr)) == 0) { + /* New style additions are reference counted */ + if (imr->imr_address.s_addr == 0) { + i->count++; + err = 0; + } + goto done; + } + count++; + } + err = -ENOBUFS; + if (iml == NULL || count >= sysctl_igmp_max_memberships) + goto done; + memcpy(&iml->multi, imr, sizeof(*imr)); + iml->next = inet->mc_list; + iml->count = 1; + iml->sflist = NULL; + iml->sfmode = MCAST_EXCLUDE; + inet->mc_list = iml; + ip_mc_inc_group(in_dev, addr); + iml = NULL; + err = 0; + +done: + rtnl_shunlock(); + if (iml) + sock_kfree_s(sk, iml, sizeof(*iml)); + return err; +} + +static int ip_mc_leave_src(struct sock *sk, struct ip_mc_socklist *iml, + struct in_device *in_dev) +{ + int err; + + if (iml->sflist == 0) { + /* any-source empty exclude case */ + return ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, + iml->sfmode, 0, NULL, 0); + } + err = ip_mc_del_src(in_dev, &iml->multi.imr_multiaddr.s_addr, + iml->sfmode, iml->sflist->sl_count, + iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; + return err; +} + +/* + * Ask a socket to leave a group. 
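ip_mc_join_group() above is the kernel side of setsockopt(IP_ADD_MEMBERSHIP): it resolves the interface, enforces sysctl_igmp_max_memberships, and links an ip_mc_socklist onto the socket before calling ip_mc_inc_group(). For orientation, a typical user-space caller looks roughly like this (the group address and the choice of interface are arbitrary examples):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    struct ip_mreqn mreq;
    int fd = socket(AF_INET, SOCK_DGRAM, 0);

    if (fd < 0) {
        perror("socket");
        return 1;
    }

    memset(&mreq, 0, sizeof(mreq));
    inet_pton(AF_INET, "239.1.2.3", &mreq.imr_multiaddr); /* example group */
    mreq.imr_address.s_addr = htonl(INADDR_ANY);          /* no fixed local address */
    mreq.imr_ifindex = 0;                                 /* 0 = resolve by route */

    /* This request is serviced by ip_mc_join_group() in the kernel. */
    if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
        perror("IP_ADD_MEMBERSHIP");
    else
        printf("joined 239.1.2.3\n");

    close(fd);
    return 0;
}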
+ */ + +int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml, **imlp; + + rtnl_lock(); + for (imlp = &inet->mc_list; (iml = *imlp) != NULL; imlp = &iml->next) { + if (iml->multi.imr_multiaddr.s_addr==imr->imr_multiaddr.s_addr && + iml->multi.imr_address.s_addr==imr->imr_address.s_addr && + (!imr->imr_ifindex || iml->multi.imr_ifindex==imr->imr_ifindex)) { + struct in_device *in_dev; + + in_dev = inetdev_by_index(iml->multi.imr_ifindex); + if (in_dev) + (void) ip_mc_leave_src(sk, iml, in_dev); + if (--iml->count) { + rtnl_unlock(); + if (in_dev) + in_dev_put(in_dev); + return 0; + } + + *imlp = iml->next; + + if (in_dev) { + ip_mc_dec_group(in_dev, imr->imr_multiaddr.s_addr); + in_dev_put(in_dev); + } + rtnl_unlock(); + sock_kfree_s(sk, iml, sizeof(*iml)); + return 0; + } + } + rtnl_unlock(); + return -EADDRNOTAVAIL; +} + +int ip_mc_source(int add, int omode, struct sock *sk, struct + ip_mreq_source *mreqs, int ifindex) +{ + int err; + struct ip_mreqn imr; + u32 addr = mreqs->imr_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev = NULL; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + int i, j, rv; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; + imr.imr_address.s_addr = mreqs->imr_interface; + imr.imr_ifindex = ifindex; + in_dev = ip_mc_find_dev(&imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (memcmp(&pmc->multi, mreqs, 2*sizeof(__u32)) == 0) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) + goto done; + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 0, NULL, 0); + ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, pmc->sfmode, 0, + NULL, 0); + pmc->sfmode = omode; + } + + psl = pmc->sflist; + if (!add) { + if (!psl) + goto done; + rv = !0; + for (i=0; isl_count; i++) { + rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, + sizeof(__u32)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; + + /* update the interface filter */ + ip_mc_del_src(in_dev, &mreqs->imr_multiaddr, omode, 1, + &mreqs->imr_sourceaddr, 1); + + for (j=i+1; jsl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip_sf_socklist *newpsl; + int count = IP_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, + IP_SFLSIZE(count), GFP_KERNEL); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = count; + newpsl->sl_count = count - IP_SFBLOCK; + if (psl) { + for (i=0; isl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + } + pmc->sflist = psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; isl_count; i++) { + rv = memcmp(&psl->sl_addr[i], &mreqs->imr_sourceaddr, + sizeof(__u32)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; j>=i; j--) + 
psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = mreqs->imr_sourceaddr; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip_mc_add_src(in_dev, &mreqs->imr_multiaddr, omode, 1, + &mreqs->imr_sourceaddr, 1); +done: + rtnl_shunlock(); + return err; +} + +int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf, int ifindex) +{ + int err; + struct ip_mreqn imr; + u32 addr = msf->imsf_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *newpsl, *psl; + + if (!MULTICAST(addr)) + return -EINVAL; + if (msf->imsf_fmode != MCAST_INCLUDE && + msf->imsf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + rtnl_shlock(); + + imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; + imr.imr_address.s_addr = msf->imsf_interface; + imr.imr_ifindex = ifindex; + in_dev = ip_mc_find_dev(&imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && + pmc->multi.imr_ifindex == imr.imr_ifindex) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + if (msf->imsf_numsrc) { + newpsl = (struct ip_sf_socklist *)sock_kmalloc(sk, + IP_SFLSIZE(msf->imsf_numsrc), GFP_KERNEL); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = msf->imsf_numsrc; + memcpy(newpsl->sl_addr, msf->imsf_slist, + msf->imsf_numsrc * sizeof(msf->imsf_slist[0])); + err = ip_mc_add_src(in_dev, &msf->imsf_multiaddr, + msf->imsf_fmode, newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else + newpsl = NULL; + psl = pmc->sflist; + if (psl) { + (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + sock_kfree_s(sk, psl, IP_SFLSIZE(psl->sl_max)); + } else + (void) ip_mc_del_src(in_dev, &msf->imsf_multiaddr, pmc->sfmode, + 0, NULL, 0); + pmc->sflist = newpsl; + pmc->sfmode = msf->imsf_fmode; +done: + rtnl_shunlock(); + return err; +} + +int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, + struct ip_msfilter __user *optval, int __user *optlen) +{ + int err, len, count, copycount; + struct ip_mreqn imr; + u32 addr = msf->imsf_multiaddr; + struct ip_mc_socklist *pmc; + struct in_device *in_dev; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; + imr.imr_address.s_addr = msf->imsf_interface; + imr.imr_ifindex = 0; + in_dev = ip_mc_find_dev(&imr); + + if (!in_dev) { + err = -ENODEV; + goto done; + } + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == msf->imsf_multiaddr && + pmc->multi.imr_ifindex == imr.imr_ifindex) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + msf->imsf_fmode = pmc->sfmode; + psl = pmc->sflist; + rtnl_shunlock(); + if (!psl) { + len = 0; + count = 0; + } else { + count = psl->sl_count; + } + copycount = count < msf->imsf_numsrc ? 
count : msf->imsf_numsrc; + len = copycount * sizeof(psl->sl_addr[0]); + msf->imsf_numsrc = count; + if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || + copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { + return -EFAULT; + } + if (len && + copy_to_user(&optval->imsf_slist[0], psl->sl_addr, len)) + return -EFAULT; + return 0; +done: + rtnl_shunlock(); + return err; +} + +int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + struct sockaddr_in *psin; + u32 addr; + struct ip_mc_socklist *pmc; + struct inet_sock *inet = inet_sk(sk); + struct ip_sf_socklist *psl; + + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) + return -EINVAL; + addr = psin->sin_addr.s_addr; + if (!MULTICAST(addr)) + return -EINVAL; + + rtnl_shlock(); + + err = -EADDRNOTAVAIL; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == addr && + pmc->multi.imr_ifindex == gsf->gf_interface) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = pmc->sflist; + rtnl_shunlock(); + count = psl ? psl->sl_count : 0; + copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + for (i=0; isin_family = AF_INET; + psin->sin_addr.s_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + rtnl_shunlock(); + return err; +} + +/* + * check if a multicast source filter allows delivery for a given + */ +int ip_mc_sf_allow(struct sock *sk, u32 loc_addr, u32 rmt_addr, int dif) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *pmc; + struct ip_sf_socklist *psl; + int i; + + if (!MULTICAST(loc_addr)) + return 1; + + for (pmc=inet->mc_list; pmc; pmc=pmc->next) { + if (pmc->multi.imr_multiaddr.s_addr == loc_addr && + pmc->multi.imr_ifindex == dif) + break; + } + if (!pmc) + return 1; + psl = pmc->sflist; + if (!psl) + return pmc->sfmode == MCAST_EXCLUDE; + + for (i=0; isl_count; i++) { + if (psl->sl_addr[i] == rmt_addr) + break; + } + if (pmc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + return 0; + if (pmc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + return 0; + return 1; +} + +/* + * A socket is closing. 
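ip_mc_sf_allow() above applies the IGMPv3 source-filter delivery rule: under an INCLUDE filter a source must be listed to be accepted, under an EXCLUDE filter a listed source is rejected, and a socket without a source list accepts traffic only in EXCLUDE mode. The same decision, restated as a standalone user-space function with illustrative names:

#include <stdint.h>
#include <stdio.h>

enum toy_fmode { TOY_INCLUDE, TOY_EXCLUDE };

/* Return 1 if a packet from 'src' passes a filter of 'count' source
 * addresses in 'list', operating in mode 'mode'. */
static int toy_sf_allow(enum toy_fmode mode, const uint32_t *list,
                        int count, uint32_t src)
{
    int i;

    for (i = 0; i < count; i++)
        if (list[i] == src)
            break;

    if (mode == TOY_INCLUDE)
        return i < count;        /* must be listed */
    return i >= count;           /* must not be listed */
}

int main(void)
{
    uint32_t sources[] = { 0x0a000001, 0x0a000002 };   /* 10.0.0.1, 10.0.0.2 */

    printf("%d\n", toy_sf_allow(TOY_INCLUDE, sources, 2, 0x0a000001));  /* 1 */
    printf("%d\n", toy_sf_allow(TOY_EXCLUDE, sources, 2, 0x0a000001));  /* 0 */
    printf("%d\n", toy_sf_allow(TOY_EXCLUDE, sources, 2, 0x0a000003));  /* 1 */
    return 0;
}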
+ */ + +void ip_mc_drop_socket(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_mc_socklist *iml; + + if (inet->mc_list == NULL) + return; + + rtnl_lock(); + while ((iml = inet->mc_list) != NULL) { + struct in_device *in_dev; + inet->mc_list = iml->next; + + if ((in_dev = inetdev_by_index(iml->multi.imr_ifindex)) != NULL) { + (void) ip_mc_leave_src(sk, iml, in_dev); + ip_mc_dec_group(in_dev, iml->multi.imr_multiaddr.s_addr); + in_dev_put(in_dev); + } + sock_kfree_s(sk, iml, sizeof(*iml)); + + } + rtnl_unlock(); +} + +int ip_check_mc(struct in_device *in_dev, u32 mc_addr, u32 src_addr, u16 proto) +{ + struct ip_mc_list *im; + struct ip_sf_list *psf; + int rv = 0; + + read_lock(&in_dev->mc_list_lock); + for (im=in_dev->mc_list; im; im=im->next) { + if (im->multiaddr == mc_addr) + break; + } + if (im && proto == IPPROTO_IGMP) { + rv = 1; + } else if (im) { + if (src_addr) { + for (psf=im->sources; psf; psf=psf->sf_next) { + if (psf->sf_inaddr == src_addr) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + im->sfcount[MCAST_EXCLUDE]; + else + rv = im->sfcount[MCAST_EXCLUDE] != 0; + } else + rv = 1; /* unspecified source; tentatively allow */ + } + read_unlock(&in_dev->mc_list_lock); + return rv; +} + +#if defined(CONFIG_PROC_FS) +struct igmp_mc_iter_state { + struct net_device *dev; + struct in_device *in_dev; +}; + +#define igmp_mc_seq_private(seq) ((struct igmp_mc_iter_state *)(seq)->private) + +static inline struct ip_mc_list *igmp_mc_get_first(struct seq_file *seq) +{ + struct ip_mc_list *im = NULL; + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + + for (state->dev = dev_base, state->in_dev = NULL; + state->dev; + state->dev = state->dev->next) { + struct in_device *in_dev; + in_dev = in_dev_get(state->dev); + if (!in_dev) + continue; + read_lock(&in_dev->mc_list_lock); + im = in_dev->mc_list; + if (im) { + state->in_dev = in_dev; + break; + } + read_unlock(&in_dev->mc_list_lock); + in_dev_put(in_dev); + } + return im; +} + +static struct ip_mc_list *igmp_mc_get_next(struct seq_file *seq, struct ip_mc_list *im) +{ + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + im = im->next; + while (!im) { + if (likely(state->in_dev != NULL)) { + read_unlock(&state->in_dev->mc_list_lock); + in_dev_put(state->in_dev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->in_dev = NULL; + break; + } + state->in_dev = in_dev_get(state->dev); + if (!state->in_dev) + continue; + read_lock(&state->in_dev->mc_list_lock); + im = state->in_dev->mc_list; + } + return im; +} + +static struct ip_mc_list *igmp_mc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip_mc_list *im = igmp_mc_get_first(seq); + if (im) + while (pos && (im = igmp_mc_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *igmp_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? 
igmp_mc_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_mc_list *im; + if (v == SEQ_START_TOKEN) + im = igmp_mc_get_first(seq); + else + im = igmp_mc_get_next(seq, v); + ++*pos; + return im; +} + +static void igmp_mc_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + if (likely(state->in_dev != NULL)) { + read_unlock(&state->in_dev->mc_list_lock); + in_dev_put(state->in_dev); + state->in_dev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp_mc_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Idx\tDevice : Count Querier\tGroup Users Timer\tReporter\n"); + else { + struct ip_mc_list *im = (struct ip_mc_list *)v; + struct igmp_mc_iter_state *state = igmp_mc_seq_private(seq); + char *querier; +#ifdef CONFIG_IP_MULTICAST + querier = IGMP_V1_SEEN(state->in_dev) ? "V1" : + IGMP_V2_SEEN(state->in_dev) ? "V2" : + "V3"; +#else + querier = "NONE"; +#endif + + if (state->in_dev->mc_list == im) { + seq_printf(seq, "%d\t%-10s: %5d %7s\n", + state->dev->ifindex, state->dev->name, state->dev->mc_count, querier); + } + + seq_printf(seq, + "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + im->multiaddr, im->users, + im->tm_running, im->tm_running ? + jiffies_to_clock_t(im->timer.expires-jiffies) : 0, + im->reporter); + } + return 0; +} + +static struct seq_operations igmp_mc_seq_ops = { + .start = igmp_mc_seq_start, + .next = igmp_mc_seq_next, + .stop = igmp_mc_seq_stop, + .show = igmp_mc_seq_show, +}; + +static int igmp_mc_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &igmp_mc_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp_mc_seq_fops = { + .owner = THIS_MODULE, + .open = igmp_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +struct igmp_mcf_iter_state { + struct net_device *dev; + struct in_device *idev; + struct ip_mc_list *im; +}; + +#define igmp_mcf_seq_private(seq) ((struct igmp_mcf_iter_state *)(seq)->private) + +static inline struct ip_sf_list *igmp_mcf_get_first(struct seq_file *seq) +{ + struct ip_sf_list *psf = NULL; + struct ip_mc_list *im = NULL; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL, state->im = NULL; + state->dev; + state->dev = state->dev->next) { + struct in_device *idev; + idev = in_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; + read_lock(&idev->mc_list_lock); + im = idev->mc_list; + if (likely(im != NULL)) { + spin_lock_bh(&im->lock); + psf = im->sources; + if (likely(psf != NULL)) { + state->im = im; + state->idev = idev; + break; + } + spin_unlock_bh(&im->lock); + } + read_unlock(&idev->mc_list_lock); + in_dev_put(idev); + } + return psf; +} + +static struct ip_sf_list *igmp_mcf_get_next(struct seq_file *seq, struct ip_sf_list *psf) +{ + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + psf = psf->sf_next; + while (!psf) { + spin_unlock_bh(&state->im->lock); + state->im = state->im->next; + while (!state->im) { + if (likely(state->idev != NULL)) { + read_unlock(&state->idev->mc_list_lock); + 
in_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + goto out; + } + state->idev = in_dev_get(state->dev); + if (!state->idev) + continue; + read_lock(&state->idev->mc_list_lock); + state->im = state->idev->mc_list; + } + if (!state->im) + break; + spin_lock_bh(&state->im->lock); + psf = state->im->sources; + } +out: + return psf; +} + +static struct ip_sf_list *igmp_mcf_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip_sf_list *psf = igmp_mcf_get_first(seq); + if (psf) + while (pos && (psf = igmp_mcf_get_next(seq, psf)) != NULL) + --pos; + return pos ? NULL : psf; +} + +static void *igmp_mcf_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? igmp_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_sf_list *psf; + if (v == SEQ_START_TOKEN) + psf = igmp_mcf_get_first(seq); + else + psf = igmp_mcf_get_next(seq, v); + ++*pos; + return psf; +} + +static void igmp_mcf_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + if (likely(state->im != NULL)) { + spin_unlock_bh(&state->im->lock); + state->im = NULL; + } + if (likely(state->idev != NULL)) { + read_unlock(&state->idev->mc_list_lock); + in_dev_put(state->idev); + state->idev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp_mcf_seq_show(struct seq_file *seq, void *v) +{ + struct ip_sf_list *psf = (struct ip_sf_list *)v; + struct igmp_mcf_iter_state *state = igmp_mcf_seq_private(seq); + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%3s %6s " + "%10s %10s %6s %6s\n", "Idx", + "Device", "MCA", + "SRC", "INC", "EXC"); + } else { + seq_printf(seq, + "%3d %6.6s 0x%08x " + "0x%08x %6lu %6lu\n", + state->dev->ifindex, state->dev->name, + ntohl(state->im->multiaddr), + ntohl(psf->sf_inaddr), + psf->sf_count[MCAST_INCLUDE], + psf->sf_count[MCAST_EXCLUDE]); + } + return 0; +} + +static struct seq_operations igmp_mcf_seq_ops = { + .start = igmp_mcf_seq_start, + .next = igmp_mcf_seq_next, + .stop = igmp_mcf_seq_stop, + .show = igmp_mcf_seq_show, +}; + +static int igmp_mcf_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &igmp_mcf_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp_mcf_seq_fops = { + .owner = THIS_MODULE, + .open = igmp_mcf_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init igmp_mc_proc_init(void) +{ + proc_net_fops_create("igmp", S_IRUGO, &igmp_mc_seq_fops); + proc_net_fops_create("mcfilter", S_IRUGO, &igmp_mcf_seq_fops); + return 0; +} +#endif + +EXPORT_SYMBOL(ip_mc_dec_group); +EXPORT_SYMBOL(ip_mc_inc_group); +EXPORT_SYMBOL(ip_mc_join_group); diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c new file mode 100644 index 000000000000..95473953c406 --- /dev/null +++ b/net/ipv4/inetpeer.c @@ -0,0 +1,460 @@ +/* + * INETPEER - A storage for permanent information about peers + * + * This source is covered by the GNU GPL, the same as all kernel sources. + * + * Version: $Id: inetpeer.c,v 1.7 2001/09/20 21:22:50 davem Exp $ + * + * Authors: Andrey V. 
Savochkin + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Theory of operations. + * We keep one entry for each peer IP address. The nodes contains long-living + * information about the peer which doesn't depend on routes. + * At this moment this information consists only of ID field for the next + * outgoing IP packet. This field is incremented with each packet as encoded + * in inet_getid() function (include/net/inetpeer.h). + * At the moment of writing this notes identifier of IP packets is generated + * to be unpredictable using this code only for packets subjected + * (actually or potentially) to defragmentation. I.e. DF packets less than + * PMTU in size uses a constant ID and do not use this code (see + * ip_select_ident() in include/net/ip.h). + * + * Route cache entries hold references to our nodes. + * New cache entries get references via lookup by destination IP address in + * the avl tree. The reference is grabbed only when it's needed i.e. only + * when we try to output IP packet which needs an unpredictable ID (see + * __ip_select_ident() in net/ipv4/route.c). + * Nodes are removed only when reference counter goes to 0. + * When it's happened the node may be removed when a sufficient amount of + * time has been passed since its last use. The less-recently-used entry can + * also be removed if the pool is overloaded i.e. if the total amount of + * entries is greater-or-equal than the threshold. + * + * Node pool is organised as an AVL tree. + * Such an implementation has been chosen not just for fun. It's a way to + * prevent easy and efficient DoS attacks by creating hash collisions. A huge + * amount of long living nodes in a single hash slot would significantly delay + * lookups performed with disabled BHs. + * + * Serialisation issues. + * 1. Nodes may appear in the tree only with the pool write lock held. + * 2. Nodes may disappear from the tree only with the pool write lock held + * AND reference count being 0. + * 3. Nodes appears and disappears from unused node list only under + * "inet_peer_unused_lock". + * 4. Global variable peer_total is modified under the pool lock. + * 5. struct inet_peer fields modification: + * avl_left, avl_right, avl_parent, avl_height: pool lock + * unused_next, unused_prevp: unused node list lock + * refcnt: atomically against modifications on other CPU; + * usually under some other lock to prevent node disappearing + * dtime: unused node list lock + * v4daddr: unchangeable + * ip_id_count: idlock + */ + +/* Exported for inet_getid inline function. */ +DEFINE_SPINLOCK(inet_peer_idlock); + +static kmem_cache_t *peer_cachep; + +#define node_height(x) x->avl_height +static struct inet_peer peer_fake_node = { + .avl_left = &peer_fake_node, + .avl_right = &peer_fake_node, + .avl_height = 0 +}; +#define peer_avl_empty (&peer_fake_node) +static struct inet_peer *peer_root = peer_avl_empty; +static DEFINE_RWLOCK(peer_pool_lock); +#define PEER_MAXDEPTH 40 /* sufficient for about 2^27 nodes */ + +static volatile int peer_total; +/* Exported for sysctl_net_ipv4. */ +int inet_peer_threshold = 65536 + 128; /* start to throw entries more + * aggressively at this stage */ +int inet_peer_minttl = 120 * HZ; /* TTL under high load: 120 sec */ +int inet_peer_maxttl = 10 * 60 * HZ; /* usual time to live: 10 min */ + +static struct inet_peer *inet_peer_unused_head; +/* Exported for inet_putpeer inline function. 
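The theory-of-operations comment above notes that the only long-lived per-peer state at this point is the counter from which IP identification values are drawn, advanced per packet by inet_getid(). A stripped-down, single-threaded user-space model of that idea (so nothing corresponding to inet_peer_idlock):

#include <stdint.h>
#include <stdio.h>

struct toy_peer {
    uint32_t daddr;      /* destination address this state belongs to */
    uint16_t ip_id;      /* next IP identification value for that peer */
};

/* Hand out the current ID and advance it: one independent sequence per
 * destination, instead of a single global counter. */
static uint16_t toy_getid(struct toy_peer *p)
{
    return p->ip_id++;
}

int main(void)
{
    struct toy_peer p = { .daddr = 0x0a000001u, .ip_id = 0x1234 };

    printf("%u\n", (unsigned)toy_getid(&p));    /* 4660 */
    printf("%u\n", (unsigned)toy_getid(&p));    /* 4661 */
    return 0;
}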
*/ +struct inet_peer **inet_peer_unused_tailp = &inet_peer_unused_head; +DEFINE_SPINLOCK(inet_peer_unused_lock); +#define PEER_MAX_CLEANUP_WORK 30 + +static void peer_check_expire(unsigned long dummy); +static struct timer_list peer_periodic_timer = + TIMER_INITIALIZER(peer_check_expire, 0, 0); + +/* Exported for sysctl_net_ipv4. */ +int inet_peer_gc_mintime = 10 * HZ, + inet_peer_gc_maxtime = 120 * HZ; + +/* Called from ip_output.c:ip_init */ +void __init inet_initpeers(void) +{ + struct sysinfo si; + + /* Use the straight interface to information about memory. */ + si_meminfo(&si); + /* The values below were suggested by Alexey Kuznetsov + * . I don't have any opinion about the values + * myself. --SAW + */ + if (si.totalram <= (32768*1024)/PAGE_SIZE) + inet_peer_threshold >>= 1; /* max pool size about 1MB on IA32 */ + if (si.totalram <= (16384*1024)/PAGE_SIZE) + inet_peer_threshold >>= 1; /* about 512KB */ + if (si.totalram <= (8192*1024)/PAGE_SIZE) + inet_peer_threshold >>= 2; /* about 128KB */ + + peer_cachep = kmem_cache_create("inet_peer_cache", + sizeof(struct inet_peer), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!peer_cachep) + panic("cannot create inet_peer_cache"); + + /* All the timers, started at system startup tend + to synchronize. Perturb it a bit. + */ + peer_periodic_timer.expires = jiffies + + net_random() % inet_peer_gc_maxtime + + inet_peer_gc_maxtime; + add_timer(&peer_periodic_timer); +} + +/* Called with or without local BH being disabled. */ +static void unlink_from_unused(struct inet_peer *p) +{ + spin_lock_bh(&inet_peer_unused_lock); + if (p->unused_prevp != NULL) { + /* On unused list. */ + *p->unused_prevp = p->unused_next; + if (p->unused_next != NULL) + p->unused_next->unused_prevp = p->unused_prevp; + else + inet_peer_unused_tailp = p->unused_prevp; + p->unused_prevp = NULL; /* mark it as removed */ + } + spin_unlock_bh(&inet_peer_unused_lock); +} + +/* Called with local BH disabled and the pool lock held. */ +#define lookup(daddr) \ +({ \ + struct inet_peer *u, **v; \ + stackptr = stack; \ + *stackptr++ = &peer_root; \ + for (u = peer_root; u != peer_avl_empty; ) { \ + if (daddr == u->v4daddr) \ + break; \ + if (daddr < u->v4daddr) \ + v = &u->avl_left; \ + else \ + v = &u->avl_right; \ + *stackptr++ = v; \ + u = *v; \ + } \ + u; \ +}) + +/* Called with local BH disabled and the pool write lock held. */ +#define lookup_rightempty(start) \ +({ \ + struct inet_peer *u, **v; \ + *stackptr++ = &start->avl_left; \ + v = &start->avl_left; \ + for (u = *v; u->avl_right != peer_avl_empty; ) { \ + v = &u->avl_right; \ + *stackptr++ = v; \ + u = *v; \ + } \ + u; \ +}) + +/* Called with local BH disabled and the pool write lock held. + * Variable names are the proof of operation correctness. + * Look into mm/map_avl.c for more detail description of the ideas. 
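unlink_from_unused() above removes a peer from the singly linked unused list in O(1): each node stores the address of the pointer that points at it (unused_prevp), and a tail pointer keeps appends cheap. The same technique in isolation, with hypothetical names:

#include <stdio.h>

struct toy_node {
    int val;
    struct toy_node *next;
    struct toy_node **prevp;    /* address of the pointer that points at us */
};

static struct toy_node *head;
static struct toy_node **tailp = &head;

static void toy_append(struct toy_node *n)
{
    n->next = NULL;
    n->prevp = tailp;
    *tailp = n;
    tailp = &n->next;
}

/* O(1) unlink without walking the list: fix the predecessor's pointer,
 * fix the successor's prevp, and pull the tail back if needed. */
static void toy_unlink(struct toy_node *n)
{
    if (!n->prevp)
        return;                  /* not on the list */
    *n->prevp = n->next;
    if (n->next)
        n->next->prevp = n->prevp;
    else
        tailp = n->prevp;
    n->prevp = NULL;             /* mark as removed, as the kernel code does */
}

int main(void)
{
    struct toy_node a = { .val = 1 }, b = { .val = 2 }, c = { .val = 3 };
    struct toy_node *it;

    toy_append(&a);
    toy_append(&b);
    toy_append(&c);
    toy_unlink(&b);
    for (it = head; it; it = it->next)
        printf("%d\n", it->val);    /* prints 1 then 3 */
    return 0;
}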
*/ +static void peer_avl_rebalance(struct inet_peer **stack[], + struct inet_peer ***stackend) +{ + struct inet_peer **nodep, *node, *l, *r; + int lh, rh; + + while (stackend > stack) { + nodep = *--stackend; + node = *nodep; + l = node->avl_left; + r = node->avl_right; + lh = node_height(l); + rh = node_height(r); + if (lh > rh + 1) { /* l: RH+2 */ + struct inet_peer *ll, *lr, *lrl, *lrr; + int lrh; + ll = l->avl_left; + lr = l->avl_right; + lrh = node_height(lr); + if (lrh <= node_height(ll)) { /* ll: RH+1 */ + node->avl_left = lr; /* lr: RH or RH+1 */ + node->avl_right = r; /* r: RH */ + node->avl_height = lrh + 1; /* RH+1 or RH+2 */ + l->avl_left = ll; /* ll: RH+1 */ + l->avl_right = node; /* node: RH+1 or RH+2 */ + l->avl_height = node->avl_height + 1; + *nodep = l; + } else { /* ll: RH, lr: RH+1 */ + lrl = lr->avl_left; /* lrl: RH or RH-1 */ + lrr = lr->avl_right; /* lrr: RH or RH-1 */ + node->avl_left = lrr; /* lrr: RH or RH-1 */ + node->avl_right = r; /* r: RH */ + node->avl_height = rh + 1; /* node: RH+1 */ + l->avl_left = ll; /* ll: RH */ + l->avl_right = lrl; /* lrl: RH or RH-1 */ + l->avl_height = rh + 1; /* l: RH+1 */ + lr->avl_left = l; /* l: RH+1 */ + lr->avl_right = node; /* node: RH+1 */ + lr->avl_height = rh + 2; + *nodep = lr; + } + } else if (rh > lh + 1) { /* r: LH+2 */ + struct inet_peer *rr, *rl, *rlr, *rll; + int rlh; + rr = r->avl_right; + rl = r->avl_left; + rlh = node_height(rl); + if (rlh <= node_height(rr)) { /* rr: LH+1 */ + node->avl_right = rl; /* rl: LH or LH+1 */ + node->avl_left = l; /* l: LH */ + node->avl_height = rlh + 1; /* LH+1 or LH+2 */ + r->avl_right = rr; /* rr: LH+1 */ + r->avl_left = node; /* node: LH+1 or LH+2 */ + r->avl_height = node->avl_height + 1; + *nodep = r; + } else { /* rr: RH, rl: RH+1 */ + rlr = rl->avl_right; /* rlr: LH or LH-1 */ + rll = rl->avl_left; /* rll: LH or LH-1 */ + node->avl_right = rll; /* rll: LH or LH-1 */ + node->avl_left = l; /* l: LH */ + node->avl_height = lh + 1; /* node: LH+1 */ + r->avl_right = rr; /* rr: LH */ + r->avl_left = rlr; /* rlr: LH or LH-1 */ + r->avl_height = lh + 1; /* r: LH+1 */ + rl->avl_right = r; /* r: LH+1 */ + rl->avl_left = node; /* node: LH+1 */ + rl->avl_height = lh + 2; + *nodep = rl; + } + } else { + node->avl_height = (lh > rh ? lh : rh) + 1; + } + } +} + +/* Called with local BH disabled and the pool write lock held. */ +#define link_to_pool(n) \ +do { \ + n->avl_height = 1; \ + n->avl_left = peer_avl_empty; \ + n->avl_right = peer_avl_empty; \ + **--stackptr = n; \ + peer_avl_rebalance(stack, stackptr); \ +} while(0) + +/* May be called with local BH enabled. */ +static void unlink_from_pool(struct inet_peer *p) +{ + int do_free; + + do_free = 0; + + write_lock_bh(&peer_pool_lock); + /* Check the reference counter. It was artificially incremented by 1 + * in cleanup() function to prevent sudden disappearing. If the + * reference count is still 1 then the node is referenced only as `p' + * here and from the pool. So under the exclusive pool lock it's safe + * to remove the node and free it later. 
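peer_avl_rebalance() above walks back up the recorded search path and applies the usual AVL rotations so the tree stays balanced even under adversarial address patterns. A compact user-space illustration of just the single-rotation case (the kernel code also handles the double-rotation cases inline):

#include <stdio.h>

struct toy_avl {
    int height;                  /* empty subtree treated as height 0 */
    struct toy_avl *left, *right;
};

static int toy_height(struct toy_avl *n)
{
    return n ? n->height : 0;
}

static void toy_fix_height(struct toy_avl *n)
{
    int lh = toy_height(n->left), rh = toy_height(n->right);

    n->height = (lh > rh ? lh : rh) + 1;
}

/* Single right rotation: used when the left subtree is two levels taller
 * and its own left child is at least as tall as its right child. */
static struct toy_avl *toy_rotate_right(struct toy_avl *node)
{
    struct toy_avl *l = node->left;

    node->left = l->right;
    l->right = node;
    toy_fix_height(node);
    toy_fix_height(l);
    return l;                    /* new subtree root */
}

int main(void)
{
    /* A left-leaning chain c -> b -> a with heights 3, 2, 1. */
    struct toy_avl a = { 1, NULL, NULL };
    struct toy_avl b = { 2, &a, NULL };
    struct toy_avl c = { 3, &b, NULL };
    struct toy_avl *root = toy_rotate_right(&c);

    printf("new root height %d (was 3)\n", root->height);   /* prints 2 */
    return 0;
}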
*/ + if (atomic_read(&p->refcnt) == 1) { + struct inet_peer **stack[PEER_MAXDEPTH]; + struct inet_peer ***stackptr, ***delp; + if (lookup(p->v4daddr) != p) + BUG(); + delp = stackptr - 1; /* *delp[0] == p */ + if (p->avl_left == peer_avl_empty) { + *delp[0] = p->avl_right; + --stackptr; + } else { + /* look for a node to insert instead of p */ + struct inet_peer *t; + t = lookup_rightempty(p); + if (*stackptr[-1] != t) + BUG(); + **--stackptr = t->avl_left; + /* t is removed, t->v4daddr > x->v4daddr for any + * x in p->avl_left subtree. + * Put t in the old place of p. */ + *delp[0] = t; + t->avl_left = p->avl_left; + t->avl_right = p->avl_right; + t->avl_height = p->avl_height; + if (delp[1] != &p->avl_left) + BUG(); + delp[1] = &t->avl_left; /* was &p->avl_left */ + } + peer_avl_rebalance(stack, stackptr); + peer_total--; + do_free = 1; + } + write_unlock_bh(&peer_pool_lock); + + if (do_free) + kmem_cache_free(peer_cachep, p); + else + /* The node is used again. Decrease the reference counter + * back. The loop "cleanup -> unlink_from_unused + * -> unlink_from_pool -> putpeer -> link_to_unused + * -> cleanup (for the same node)" + * doesn't really exist because the entry will have a + * recent deletion time and will not be cleaned again soon. */ + inet_putpeer(p); +} + +/* May be called with local BH enabled. */ +static int cleanup_once(unsigned long ttl) +{ + struct inet_peer *p; + + /* Remove the first entry from the list of unused nodes. */ + spin_lock_bh(&inet_peer_unused_lock); + p = inet_peer_unused_head; + if (p != NULL) { + if (time_after(p->dtime + ttl, jiffies)) { + /* Do not prune fresh entries. */ + spin_unlock_bh(&inet_peer_unused_lock); + return -1; + } + inet_peer_unused_head = p->unused_next; + if (p->unused_next != NULL) + p->unused_next->unused_prevp = p->unused_prevp; + else + inet_peer_unused_tailp = p->unused_prevp; + p->unused_prevp = NULL; /* mark as not on the list */ + /* Grab an extra reference to prevent node disappearing + * before unlink_from_pool() call. */ + atomic_inc(&p->refcnt); + } + spin_unlock_bh(&inet_peer_unused_lock); + + if (p == NULL) + /* It means that the total number of USED entries has + * grown over inet_peer_threshold. It shouldn't really + * happen because of entry limits in route cache. */ + return -1; + + unlink_from_pool(p); + return 0; +} + +/* Called with or without local BH being disabled. */ +struct inet_peer *inet_getpeer(__u32 daddr, int create) +{ + struct inet_peer *p, *n; + struct inet_peer **stack[PEER_MAXDEPTH], ***stackptr; + + /* Look up for the address quickly. */ + read_lock_bh(&peer_pool_lock); + p = lookup(daddr); + if (p != peer_avl_empty) + atomic_inc(&p->refcnt); + read_unlock_bh(&peer_pool_lock); + + if (p != peer_avl_empty) { + /* The existing node has been found. */ + /* Remove the entry from unused list if it was there. */ + unlink_from_unused(p); + return p; + } + + if (!create) + return NULL; + + /* Allocate the space outside the locked region. */ + n = kmem_cache_alloc(peer_cachep, GFP_ATOMIC); + if (n == NULL) + return NULL; + n->v4daddr = daddr; + atomic_set(&n->refcnt, 1); + n->ip_id_count = secure_ip_id(daddr); + n->tcp_ts_stamp = 0; + + write_lock_bh(&peer_pool_lock); + /* Check if an entry has suddenly appeared. */ + p = lookup(daddr); + if (p != peer_avl_empty) + goto out_free; + + /* Link the node. */ + link_to_pool(n); + n->unused_prevp = NULL; /* not on the list */ + peer_total++; + write_unlock_bh(&peer_pool_lock); + + if (peer_total >= inet_peer_threshold) + /* Remove one less-recently-used entry. 
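cleanup_once() above declines to prune an entry whose dtime plus the ttl still lies in the future, using the kernel's wraparound-safe time_after() comparison rather than a plain less-than. A user-space sketch of that comparison, assuming a free-running 32-bit tick counter:

#include <stdint.h>
#include <stdio.h>

/* True if 'a' is after 'b', even if the 32-bit counter has wrapped in
 * between (the same signed-difference trick as the kernel's time_after()). */
static int toy_time_after(uint32_t a, uint32_t b)
{
    return (int32_t)(b - a) < 0;
}

/* An entry is still "fresh" if dtime + ttl lies in the future. */
static int toy_entry_fresh(uint32_t dtime, uint32_t ttl, uint32_t now)
{
    return toy_time_after(dtime + ttl, now);
}

int main(void)
{
    printf("%d\n", toy_entry_fresh(100, 50, 120));          /* 1: keep  */
    printf("%d\n", toy_entry_fresh(100, 50, 200));          /* 0: prune */
    /* dtime just before the counter wraps, 'now' just after: still correct. */
    printf("%d\n", toy_entry_fresh(0xfffffff0u, 50, 10));   /* 1: keep  */
    return 0;
}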
*/ + cleanup_once(0); + + return n; + +out_free: + /* The appropriate node is already in the pool. */ + atomic_inc(&p->refcnt); + write_unlock_bh(&peer_pool_lock); + /* Remove the entry from unused list if it was there. */ + unlink_from_unused(p); + /* Free preallocated the preallocated node. */ + kmem_cache_free(peer_cachep, n); + return p; +} + +/* Called with local BH disabled. */ +static void peer_check_expire(unsigned long dummy) +{ + int i; + int ttl; + + if (peer_total >= inet_peer_threshold) + ttl = inet_peer_minttl; + else + ttl = inet_peer_maxttl + - (inet_peer_maxttl - inet_peer_minttl) / HZ * + peer_total / inet_peer_threshold * HZ; + for (i = 0; i < PEER_MAX_CLEANUP_WORK && !cleanup_once(ttl); i++); + + /* Trigger the timer after inet_peer_gc_mintime .. inet_peer_gc_maxtime + * interval depending on the total number of entries (more entries, + * less interval). */ + peer_periodic_timer.expires = jiffies + + inet_peer_gc_maxtime + - (inet_peer_gc_maxtime - inet_peer_gc_mintime) / HZ * + peer_total / inet_peer_threshold * HZ; + add_timer(&peer_periodic_timer); +} + +EXPORT_SYMBOL(inet_peer_idlock); diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c new file mode 100644 index 000000000000..77094aac6c28 --- /dev/null +++ b/net/ipv4/ip_forward.c @@ -0,0 +1,127 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP forwarding functionality. + * + * Version: $Id: ip_forward.c,v 1.48 2000/12/13 18:31:48 davem Exp $ + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip_input.c for + * history. + * Dave Gregorich : NULL ip_rt_put fix for multicast + * routing. + * Jos Vos : Add call_out_firewall before sending, + * use output device for accounting. + * Jos Vos : Call forward firewall after routing + * (always use output device). + * Mike McLagan : Routing by source + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline int ip_forward_finish(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + + if (unlikely(opt->optlen)) + ip_forward_options(skb); + + return dst_output(skb); +} + +int ip_forward(struct sk_buff *skb) +{ + struct iphdr *iph; /* Our header */ + struct rtable *rt; /* Route we use */ + struct ip_options * opt = &(IPCB(skb)->opt); + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_FWD, skb)) + goto drop; + + if (IPCB(skb)->opt.router_alert && ip_call_ra_chain(skb)) + return NET_RX_SUCCESS; + + if (skb->pkt_type != PACKET_HOST) + goto drop; + + skb->ip_summed = CHECKSUM_NONE; + + /* + * According to the RFC, we must first decrease the TTL field. If + * that reaches zero, we must reply an ICMP control message telling + * that the packet's lifetime expired. + */ + + iph = skb->nh.iph; + + if (iph->ttl <= 1) + goto too_many_hops; + + if (!xfrm4_route_forward(skb)) + goto drop; + + iph = skb->nh.iph; + rt = (struct rtable*)skb->dst; + + if (opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto sr_failed; + + /* We are about to mangle packet. Copy it! 
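ip_forward() here refuses packets whose TTL is already 1 or less (they are answered with ICMP Time Exceeded at the too_many_hops label) and decrements the TTL only after skb_cow() has produced a private copy. The hop-limit rule itself, condensed into a user-space sketch with invented names:

#include <stdio.h>

enum toy_verdict { TOY_FORWARD, TOY_ICMP_TIME_EXCEEDED };

/* Decide what a router does with a packet of the given TTL; on forward,
 * report the TTL the outgoing copy would carry. */
static enum toy_verdict toy_forward(unsigned int ttl, unsigned int *out_ttl)
{
    if (ttl <= 1)
        return TOY_ICMP_TIME_EXCEEDED;   /* tell the sender its packet died */
    *out_ttl = ttl - 1;                  /* decrement only on the copy we own */
    return TOY_FORWARD;
}

int main(void)
{
    unsigned int out;

    if (toy_forward(64, &out) == TOY_FORWARD)
        printf("forwarded with ttl %u\n", out);      /* 63 */
    if (toy_forward(1, &out) == TOY_ICMP_TIME_EXCEEDED)
        printf("ICMP time exceeded\n");
    return 0;
}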
*/ + if (skb_cow(skb, LL_RESERVED_SPACE(rt->u.dst.dev)+rt->u.dst.header_len)) + goto drop; + iph = skb->nh.iph; + + /* Decrease ttl after skb cow done */ + ip_decrease_ttl(iph); + + /* + * We now generate an ICMP HOST REDIRECT giving the route + * we calculated. + */ + if (rt->rt_flags&RTCF_DOREDIRECT && !opt->srr) + ip_rt_send_redirect(skb); + + skb->priority = rt_tos2priority(iph->tos); + + return NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, rt->u.dst.dev, + ip_forward_finish); + +sr_failed: + /* + * Strict routing permits no gatewaying + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_SR_FAILED, 0); + goto drop; + +too_many_hops: + /* Tell the sender its packet died... */ + icmp_send(skb, ICMP_TIME_EXCEEDED, ICMP_EXC_TTL, 0); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c new file mode 100644 index 000000000000..7f68e27eb4ea --- /dev/null +++ b/net/ipv4/ip_fragment.c @@ -0,0 +1,691 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP fragmentation functionality. + * + * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ + * + * Authors: Fred N. van Kempen + * Alan Cox + * + * Fixes: + * Alan Cox : Split from ip.c , see ip_input.c for history. + * David S. Miller : Begin massive cleanup... + * Andi Kleen : Add sysctls. + * xxxx : Overlapfrag bug. + * Ultima : ip_expire() kernel panic. + * Bill Hawes : Frag accounting and evictor fixes. + * John McDonald : 0 length frag bug. + * Alexey Kuznetsov: SMP races, threading, cleanup. + * Patrick McHardy : LRU queue of frag heads for evictor. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6 + * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c + * as well. Or notify me, at least. --ANK + */ + +/* Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to measurably + * harm machine performance. + */ +int sysctl_ipfrag_high_thresh = 256*1024; +int sysctl_ipfrag_low_thresh = 192*1024; + +/* Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. + */ +int sysctl_ipfrag_time = IP_FRAG_TIME; + +struct ipfrag_skb_cb +{ + struct inet_skb_parm h; + int offset; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) + +/* Describe an entry in the "incomplete datagrams" queue. */ +struct ipq { + struct ipq *next; /* linked list pointers */ + struct list_head lru_list; /* lru list member */ + u32 user; + u32 saddr; + u32 daddr; + u16 id; + u8 protocol; + u8 last_in; +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + + struct sk_buff *fragments; /* linked list of received fragments */ + int len; /* total length of original datagram */ + int meat; + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* when will this queue expire? */ + struct ipq **pprev; + int iif; + struct timeval stamp; +}; + +/* Hash table. */ + +#define IPQ_HASHSZ 64 + +/* Per-bucket lock is easy to add now. 
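/*
 * Illustrative aside (editorial, not part of the original patch).
 * The last_in bits defined above track reassembly progress: FIRST_IN is
 * set when the offset-0 fragment arrives, LAST_IN when a fragment
 * without IP_MF arrives (which also fixes qp->len), and COMPLETE marks
 * a queue that has been killed or already glued together.  ip_defrag()
 * later treats the datagram as ready exactly when both edges have been
 * seen and "meat" (bytes accumulated) equals the expected length.
 * Restated as a hypothetical helper:
 */
static inline int toy_ipq_ready(const struct ipq *qp)
{
        return qp->last_in == (FIRST_IN | LAST_IN) && qp->meat == qp->len;
}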
*/ +static struct ipq *ipq_hash[IPQ_HASHSZ]; +static DEFINE_RWLOCK(ipfrag_lock); +static u32 ipfrag_hash_rnd; +static LIST_HEAD(ipq_lru_list); +int ip_frag_nqueues = 0; + +static __inline__ void __ipq_unlink(struct ipq *qp) +{ + if(qp->next) + qp->next->pprev = qp->pprev; + *qp->pprev = qp->next; + list_del(&qp->lru_list); + ip_frag_nqueues--; +} + +static __inline__ void ipq_unlink(struct ipq *ipq) +{ + write_lock(&ipfrag_lock); + __ipq_unlink(ipq); + write_unlock(&ipfrag_lock); +} + +static unsigned int ipqhashfn(u16 id, u32 saddr, u32 daddr, u8 prot) +{ + return jhash_3words((u32)id << 16 | prot, saddr, daddr, + ipfrag_hash_rnd) & (IPQ_HASHSZ - 1); +} + +static struct timer_list ipfrag_secret_timer; +int sysctl_ipfrag_secret_interval = 10 * 60 * HZ; + +static void ipfrag_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&ipfrag_lock); + get_random_bytes(&ipfrag_hash_rnd, sizeof(u32)); + for (i = 0; i < IPQ_HASHSZ; i++) { + struct ipq *q; + + q = ipq_hash[i]; + while (q) { + struct ipq *next = q->next; + unsigned int hval = ipqhashfn(q->id, q->saddr, + q->daddr, q->protocol); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = ipq_hash[hval]) != NULL) + q->next->pprev = &q->next; + ipq_hash[hval] = q; + q->pprev = &ipq_hash[hval]; + } + + q = next; + } + } + write_unlock(&ipfrag_lock); + + mod_timer(&ipfrag_secret_timer, now + sysctl_ipfrag_secret_interval); +} + +atomic_t ip_frag_mem = ATOMIC_INIT(0); /* Memory used for fragments */ + +/* Memory Tracking Functions. */ +static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &ip_frag_mem); + kfree_skb(skb); +} + +static __inline__ void frag_free_queue(struct ipq *qp, int *work) +{ + if (work) + *work -= sizeof(struct ipq); + atomic_sub(sizeof(struct ipq), &ip_frag_mem); + kfree(qp); +} + +static __inline__ struct ipq *frag_alloc_queue(void) +{ + struct ipq *qp = kmalloc(sizeof(struct ipq), GFP_ATOMIC); + + if(!qp) + return NULL; + atomic_add(sizeof(struct ipq), &ip_frag_mem); + return qp; +} + + +/* Destruction primitives. */ + +/* Complete destruction of ipq. */ +static void ip_frag_destroy(struct ipq *qp, int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(qp->last_in&COMPLETE); + BUG_TRAP(del_timer(&qp->timer) == 0); + + /* Release all fragment data. */ + fp = qp->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + /* Finally, release the queue descriptor itself. */ + frag_free_queue(qp, work); +} + +static __inline__ void ipq_put(struct ipq *ipq, int *work) +{ + if (atomic_dec_and_test(&ipq->refcnt)) + ip_frag_destroy(ipq, work); +} + +/* Kill ipq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static void ipq_kill(struct ipq *ipq) +{ + if (del_timer(&ipq->timer)) + atomic_dec(&ipq->refcnt); + + if (!(ipq->last_in & COMPLETE)) { + ipq_unlink(ipq); + atomic_dec(&ipq->refcnt); + ipq->last_in |= COMPLETE; + } +} + +/* Memory limiting on fragments. Evictor trashes the oldest + * fragment queue until we are back under the threshold. 
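/*
 * Illustrative aside (editorial, not part of the original patch).
 * Eviction works with hysteresis between the two sysctls above:
 * ip_defrag() only invokes the evictor once ip_frag_mem exceeds
 * sysctl_ipfrag_high_thresh (256 KiB), and the evictor's work budget is
 * ip_frag_mem - sysctl_ipfrag_low_thresh, so it keeps dropping queues
 * from the LRU tail until usage is back around 192 KiB.  For example,
 * at 260 KiB of fragment memory the budget is 260 - 192 = 68 KiB worth
 * of skbs and queue heads to release before the loop stops.
 */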
+ */ +static void ip_evictor(void) +{ + struct ipq *qp; + struct list_head *tmp; + int work; + + work = atomic_read(&ip_frag_mem) - sysctl_ipfrag_low_thresh; + if (work <= 0) + return; + + while (work > 0) { + read_lock(&ipfrag_lock); + if (list_empty(&ipq_lru_list)) { + read_unlock(&ipfrag_lock); + return; + } + tmp = ipq_lru_list.next; + qp = list_entry(tmp, struct ipq, lru_list); + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + + spin_lock(&qp->lock); + if (!(qp->last_in&COMPLETE)) + ipq_kill(qp); + spin_unlock(&qp->lock); + + ipq_put(qp, &work); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } +} + +/* + * Oops, a fragment queue timed out. Kill it and send an ICMP reply. + */ +static void ip_expire(unsigned long arg) +{ + struct ipq *qp = (struct ipq *) arg; + + spin_lock(&qp->lock); + + if (qp->last_in & COMPLETE) + goto out; + + ipq_kill(qp); + + IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + + if ((qp->last_in&FIRST_IN) && qp->fragments != NULL) { + struct sk_buff *head = qp->fragments; + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + if ((head->dev = dev_get_by_index(qp->iif)) != NULL) { + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); + dev_put(head->dev); + } + } +out: + spin_unlock(&qp->lock); + ipq_put(qp, NULL); +} + +/* Creation primitives. */ + +static struct ipq *ip_frag_intern(unsigned int hash, struct ipq *qp_in) +{ + struct ipq *qp; + + write_lock(&ipfrag_lock); +#ifdef CONFIG_SMP + /* With SMP race we have to recheck hash table, because + * such entry could be created on other cpu, while we + * promoted read lock to write lock. + */ + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == qp_in->id && + qp->saddr == qp_in->saddr && + qp->daddr == qp_in->daddr && + qp->protocol == qp_in->protocol && + qp->user == qp_in->user) { + atomic_inc(&qp->refcnt); + write_unlock(&ipfrag_lock); + qp_in->last_in |= COMPLETE; + ipq_put(qp_in, NULL); + return qp; + } + } +#endif + qp = qp_in; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) + atomic_inc(&qp->refcnt); + + atomic_inc(&qp->refcnt); + if((qp->next = ipq_hash[hash]) != NULL) + qp->next->pprev = &qp->next; + ipq_hash[hash] = qp; + qp->pprev = &ipq_hash[hash]; + INIT_LIST_HEAD(&qp->lru_list); + list_add_tail(&qp->lru_list, &ipq_lru_list); + ip_frag_nqueues++; + write_unlock(&ipfrag_lock); + return qp; +} + +/* Add an entry to the 'ipq' queue for a newly received IP datagram. */ +static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) +{ + struct ipq *qp; + + if ((qp = frag_alloc_queue()) == NULL) + goto out_nomem; + + qp->protocol = iph->protocol; + qp->last_in = 0; + qp->id = iph->id; + qp->saddr = iph->saddr; + qp->daddr = iph->daddr; + qp->user = user; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + /* Initialize a timer for this entry. */ + init_timer(&qp->timer); + qp->timer.data = (unsigned long) qp; /* pointer to queue */ + qp->timer.function = ip_expire; /* expire function */ + spin_lock_init(&qp->lock); + atomic_set(&qp->refcnt, 1); + + return ip_frag_intern(hash, qp); + +out_nomem: + NETDEBUG(if (net_ratelimit()) printk(KERN_ERR "ip_frag_create: no memory left !\n")); + return NULL; +} + +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. 
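/*
 * Illustrative aside (editorial, not part of the original patch).
 * ip_find() below keys a fragment queue on the classic 4-tuple from the
 * IP header -- (id, saddr, daddr, protocol) -- plus the "user" value
 * passed to ip_defrag().  The extra user field keeps independent
 * reassembly contexts apart (e.g. IP_DEFRAG_LOCAL_DELIVER versus
 * IP_DEFRAG_CALL_RA_CHAIN later in this patch), so different callers do
 * not end up sharing the same partially assembled datagram.
 */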
+ */ +static inline struct ipq *ip_find(struct iphdr *iph, u32 user) +{ + __u16 id = iph->id; + __u32 saddr = iph->saddr; + __u32 daddr = iph->daddr; + __u8 protocol = iph->protocol; + unsigned int hash = ipqhashfn(id, saddr, daddr, protocol); + struct ipq *qp; + + read_lock(&ipfrag_lock); + for(qp = ipq_hash[hash]; qp; qp = qp->next) { + if(qp->id == id && + qp->saddr == saddr && + qp->daddr == daddr && + qp->protocol == protocol && + qp->user == user) { + atomic_inc(&qp->refcnt); + read_unlock(&ipfrag_lock); + return qp; + } + } + read_unlock(&ipfrag_lock); + + return ip_frag_create(hash, iph, user); +} + +/* Add new segment to existing queue. */ +static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) +{ + struct sk_buff *prev, *next; + int flags, offset; + int ihl, end; + + if (qp->last_in & COMPLETE) + goto err; + + offset = ntohs(skb->nh.iph->frag_off); + flags = offset & ~IP_OFFSET; + offset &= IP_OFFSET; + offset <<= 3; /* offset is in 8-byte chunks */ + ihl = skb->nh.iph->ihl * 4; + + /* Determine the position of this fragment. */ + end = offset + skb->len - ihl; + + /* Is this the final fragment? */ + if ((flags & IP_MF) == 0) { + /* If we already have some bits beyond end + * or have different end, the segment is corrrupted. + */ + if (end < qp->len || + ((qp->last_in & LAST_IN) && end != qp->len)) + goto err; + qp->last_in |= LAST_IN; + qp->len = end; + } else { + if (end&7) { + end &= ~7; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + if (end > qp->len) { + /* Some bits beyond end -> corruption. */ + if (qp->last_in & LAST_IN) + goto err; + qp->len = end; + } + } + if (end == offset) + goto err; + + if (pskb_pull(skb, ihl) == NULL) + goto err; + if (pskb_trim(skb, end-offset)) + goto err; + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = qp->fragments; next != NULL; next = next->next) { + if (FRAG_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + if (!pskb_pull(skb, i)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + while (next && FRAG_CB(next)->offset < end) { + int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + if (!pskb_pull(next, i)) + goto err; + FRAG_CB(next)->offset += i; + qp->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragmnet is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + qp->fragments = next; + + qp->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + FRAG_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. 
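/*
 * Illustrative sketch (editorial aside, not part of the original patch).
 * The decoding at the top of ip_frag_queue() above splits frag_off into
 * flags (IP_MF and friends) and a 13-bit offset counted in 8-byte
 * units.  Worked example: for the second fragment of a datagram sent
 * over a 1500-byte MTU, frag_off carries IP_MF | 185, so the byte
 * offset is 185 << 3 = 1480.  As a standalone helper with hypothetical
 * toy_ naming:
 */
#include <stdint.h>
#include <arpa/inet.h>          /* ntohs() */

#define TOY_IP_OFFSET   0x1FFF  /* low 13 bits: offset in 8-byte units */
#define TOY_IP_MF       0x2000  /* "more fragments" flag */

static void toy_decode_frag_off(uint16_t frag_off_net, int *mf, int *byte_off)
{
        int v = ntohs(frag_off_net);

        *mf = (v & TOY_IP_MF) != 0;
        *byte_off = (v & TOY_IP_OFFSET) << 3;
}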
*/ + skb->next = next; + if (prev) + prev->next = skb; + else + qp->fragments = skb; + + if (skb->dev) + qp->iif = skb->dev->ifindex; + skb->dev = NULL; + qp->stamp = skb->stamp; + qp->meat += skb->len; + atomic_add(skb->truesize, &ip_frag_mem); + if (offset == 0) + qp->last_in |= FIRST_IN; + + write_lock(&ipfrag_lock); + list_move_tail(&qp->lru_list, &ipq_lru_list); + write_unlock(&ipfrag_lock); + + return; + +err: + kfree_skb(skb); +} + + +/* Build a new IP datagram from all its fragments. */ + +static struct sk_buff *ip_frag_reasm(struct ipq *qp, struct net_device *dev) +{ + struct iphdr *iph; + struct sk_buff *fp, *head = qp->fragments; + int len; + int ihlen; + + ipq_kill(qp); + + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG_CB(head)->offset == 0); + + /* Allocate a new buffer for the datagram. */ + ihlen = head->nh.iph->ihl*4; + len = ihlen + qp->len; + + if(len > 65535) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_nomem; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_nomem; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; inr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &ip_frag_mem); + } + + skb_shinfo(head)->frag_list = head->next; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &ip_frag_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &ip_frag_mem); + } + + head->next = NULL; + head->dev = dev; + head->stamp = qp->stamp; + + iph = head->nh.iph; + iph->frag_off = 0; + iph->tot_len = htons(len); + IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + qp->fragments = NULL; + return head; + +out_nomem: + NETDEBUG(if (net_ratelimit()) + printk(KERN_ERR + "IP: queue_glue: no memory for gluing queue %p\n", + qp)); + goto out_fail; +out_oversize: + if (net_ratelimit()) + printk(KERN_INFO + "Oversized IP packet from %d.%d.%d.%d.\n", + NIPQUAD(qp->saddr)); +out_fail: + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return NULL; +} + +/* Process an incoming IP datagram fragment. */ +struct sk_buff *ip_defrag(struct sk_buff *skb, u32 user) +{ + struct iphdr *iph = skb->nh.iph; + struct ipq *qp; + struct net_device *dev; + + IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + + /* Start by cleaning up the memory. 
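/*
 * Illustrative aside (editorial, not part of the original patch).
 * ip_defrag() consumes the skb it is given: it either returns the fully
 * reassembled datagram (once both edge fragments are in and meat == len)
 * or returns NULL after queueing or dropping the fragment.  The caller
 * pattern used elsewhere in this patch (see ip_local_deliver() in
 * net/ipv4/ip_input.c below) is simply:
 *
 *      if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) {
 *              skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER);
 *              if (!skb)
 *                      return 0;
 *      }
 */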
*/ + if (atomic_read(&ip_frag_mem) > sysctl_ipfrag_high_thresh) + ip_evictor(); + + dev = skb->dev; + + /* Lookup (or create) queue header */ + if ((qp = ip_find(iph, user)) != NULL) { + struct sk_buff *ret = NULL; + + spin_lock(&qp->lock); + + ip_frag_queue(qp, skb); + + if (qp->last_in == (FIRST_IN|LAST_IN) && + qp->meat == qp->len) + ret = ip_frag_reasm(qp, dev); + + spin_unlock(&qp->lock); + ipq_put(qp, NULL); + return ret; + } + + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return NULL; +} + +void ipfrag_init(void) +{ + ipfrag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&ipfrag_secret_timer); + ipfrag_secret_timer.function = ipfrag_secret_rebuild; + ipfrag_secret_timer.expires = jiffies + sysctl_ipfrag_secret_interval; + add_timer(&ipfrag_secret_timer); +} + +EXPORT_SYMBOL(ip_defrag); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c new file mode 100644 index 000000000000..884835522224 --- /dev/null +++ b/net/ipv4/ip_gre.c @@ -0,0 +1,1290 @@ +/* + * Linux NET3: GRE over IP protocol decoder. + * + * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPV6 +#include +#include +#include +#endif + +/* + Problems & solutions + -------------------- + + 1. The most important issue is detecting local dead loops. + They would cause complete host lockup in transmit, which + would be "resolved" by stack overflow or, if queueing is enabled, + with infinite looping in net_bh. + + We cannot track such dead loops during route installation, + it is infeasible task. The most general solutions would be + to keep skb->encapsulation counter (sort of local ttl), + and silently drop packet when it expires. It is the best + solution, but it supposes maintaing new variable in ALL + skb, even if no tunneling is used. + + Current solution: t->recursion lock breaks dead loops. It looks + like dev->tbusy flag, but I preferred new variable, because + the semantics is different. One day, when hard_start_xmit + will be multithreaded we will have to use skb->encapsulation. + + + + 2. Networking dead loops would not kill routers, but would really + kill network. IP hop limit plays role of "t->recursion" in this case, + if we copy it from packet being encapsulated to upper header. + It is very good solution, but it introduces two problems: + + - Routing protocols, using packets with ttl=1 (OSPF, RIP2), + do not work over tunnels. + - traceroute does not work. I planned to relay ICMP from tunnel, + so that this problem would be solved and traceroute output + would even more informative. This idea appeared to be wrong: + only Linux complies to rfc1812 now (yes, guys, Linux is the only + true router now :-)), all routers (at least, in neighbourhood of mine) + return only 8 bytes of payload. It is the end. + + Hence, if we want that OSPF worked or traceroute said something reasonable, + we should search for another solution. 
+ + One of them is to parse packet trying to detect inner encapsulation + made by our node. It is difficult or even impossible, especially, + taking into account fragmentation. TO be short, tt is not solution at all. + + Current solution: The solution was UNEXPECTEDLY SIMPLE. + We force DF flag on tunnels with preconfigured hop limit, + that is ALL. :-) Well, it does not remove the problem completely, + but exponential growth of network traffic is changed to linear + (branches, that exceed pmtu are pruned) and tunnel mtu + fastly degrades to value <68, where looping stops. + Yes, it is not good if there exists a router in the loop, + which does not force DF, even when encapsulating packets have DF set. + But it is not our problem! Nobody could accuse us, we made + all that we could make. Even if it is your gated who injected + fatal route to network, even if it were you who configured + fatal static route: you are innocent. :-) + + + + 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain + practically identical code. It would be good to glue them + together, but it is not very evident, how to make them modular. + sit is integral part of IPv6, ipip and gre are naturally modular. + We could extract common parts (hash table, ioctl etc) + to a separate module (ip_tunnel.c). + + Alexey Kuznetsov. + */ + +static int ipgre_tunnel_init(struct net_device *dev); +static void ipgre_tunnel_setup(struct net_device *dev); + +/* Fallback tunnel: no source, no destination, no key, no options */ + +static int ipgre_fb_tunnel_init(struct net_device *dev); + +static struct net_device *ipgre_fb_tunnel_dev; + +/* Tunnel hash table */ + +/* + 4 hash tables: + + 3: (remote,local) + 2: (remote,*) + 1: (*,local) + 0: (*,*) + + We require exact key match i.e. if a key is present in packet + it will match only tunnel with the same key; if it is not present, + it will match only keyless tunnel. + + All keysless packets, if not matched configured keyless tunnels + will match fallback tunnel. + */ + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static struct ip_tunnel *tunnels[4][HASH_SIZE]; + +#define tunnels_r_l (tunnels[3]) +#define tunnels_r (tunnels[2]) +#define tunnels_l (tunnels[1]) +#define tunnels_wc (tunnels[0]) + +static DEFINE_RWLOCK(ipgre_lock); + +/* Given src, dst and key, find appropriate for input tunnel. 
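/*
 * Illustrative aside (editorial, not part of the original patch).
 * The lookup below walks the four tables from most to least specific:
 * (remote,local) first, then (remote,*), then (*,local) -- which also
 * matches multicast tunnels whose configured destination is the group
 * the packet was sent to -- and finally (*,*), before falling back to
 * the gre0 device.  ipgre_bucket() encodes the same idea when linking:
 * prio = (local ? 1 : 0) | (unicast remote ? 2 : 0), so each tunnel
 * lands in exactly the table its configured endpoints select, and a
 * keyed packet can only ever match a tunnel carrying the same key.
 */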
*/ + +static struct ip_tunnel * ipgre_tunnel_lookup(u32 remote, u32 local, u32 key) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(key); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_r[h0^h1]; t; t = t->next) { + if (remote == t->parms.iph.daddr) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr || + (local == t->parms.iph.daddr && MULTICAST(local))) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + } + for (t = tunnels_wc[h1]; t; t = t->next) { + if (t->parms.i_key == key && (t->dev->flags&IFF_UP)) + return t; + } + + if (ipgre_fb_tunnel_dev->flags&IFF_UP) + return ipgre_fb_tunnel_dev->priv; + return NULL; +} + +static struct ip_tunnel **ipgre_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + u32 key = t->parms.i_key; + unsigned h = HASH(key); + int prio = 0; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + + return &tunnels[prio][h]; +} + +static void ipgre_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipgre_bucket(t); + + t->next = *tp; + write_lock_bh(&ipgre_lock); + *tp = t; + write_unlock_bh(&ipgre_lock); +} + +static void ipgre_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipgre_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ipgre_lock); + *tp = t->next; + write_unlock_bh(&ipgre_lock); + break; + } + } +} + +static struct ip_tunnel * ipgre_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + u32 key = parms->i_key; + struct ip_tunnel *t, **tp, *nt; + struct net_device *dev; + unsigned h = HASH(key); + int prio = 0; + char name[IFNAMSIZ]; + + if (local) + prio |= 1; + if (remote && !MULTICAST(remote)) { + prio |= 2; + h ^= HASH(remote); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) { + if (key == t->parms.i_key) + return t; + } + } + if (!create) + return NULL; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + int i; + for (i=1; i<100; i++) { + sprintf(name, "gre%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i==100) + goto failed; + } + + dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); + if (!dev) + return NULL; + + dev->init = ipgre_tunnel_init; + nt = dev->priv; + nt->parms = *parms; + + if (register_netdevice(dev) < 0) { + free_netdev(dev); + goto failed; + } + + nt = dev->priv; + nt->parms = *parms; + + dev_hold(dev); + ipgre_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + return NULL; +} + +static void ipgre_tunnel_uninit(struct net_device *dev) +{ + ipgre_tunnel_unlink((struct ip_tunnel*)dev->priv); + dev_put(dev); +} + + +static void ipgre_err(struct sk_buff *skb, u32 info) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + + Moreover, Cisco "wise men" put GRE key to the third word + in GRE header. 
It makes impossible maintaining even soft state for keyed + GRE tunnels with enabled checksum. Tell them "thank you". + + Well, I wonder, rfc1812 was written by Cisco employee, + what the hell these idiots break standrads established + by themself??? + */ + + struct iphdr *iph = (struct iphdr*)skb->data; + u16 *p = (u16*)(skb->data+(iph->ihl<<2)); + int grehlen = (iph->ihl<<2) + 4; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + u16 flags; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_KEY) { + grehlen += 4; + if (flags&GRE_CSUM) + grehlen += 4; + } + } + + /* If only 8 bytes returned, keyed message will be dropped here */ + if (skb_headlen(skb) < grehlen) + return; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + read_lock(&ipgre_lock); + t = ipgre_tunnel_lookup(iph->daddr, iph->saddr, (flags&GRE_KEY) ? *(((u32*)p) + (grehlen>>2) - 1) : 0); + if (t == NULL || t->parms.iph.daddr == 0 || MULTICAST(t->parms.iph.daddr)) + goto out; + + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + read_unlock(&ipgre_lock); + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + struct iphdr *eiph; + u16 *p = (u16*)(dp+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + u16 flags; + int grehlen = (iph->ihl<<2) + 4; + struct sk_buff *skb2; + struct flowi fl; + struct rtable *rt; + + if (p[1] != htons(ETH_P_IP)) + return; + + flags = p[0]; + if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) { + if (flags&(GRE_VERSION|GRE_ROUTING)) + return; + if (flags&GRE_CSUM) + grehlen += 4; + if (flags&GRE_KEY) + grehlen += 4; + if (flags&GRE_SEQ) + grehlen += 4; + } + if (len < grehlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + grehlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < (iph->ihl<<2)) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - grehlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necessary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < grehlen+68) + return; + rel_info -= grehlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + memset(&fl, 0, sizeof(fl)); + fl.fl4_dst = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_GRE; + if (ip_route_output_key(&rt, &fl)) { + kfree_skb(skb2); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + fl.fl4_dst = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || + rt->u.dst.dev->type != ARPHRD_IPGRE) { + ip_rt_put(rt); + kfree_skb(skb2); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_IPGRE) { + kfree_skb(skb2); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb2->dst)) { + kfree_skb(skb2); + return; + } + skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2); +#endif +} + +static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) { + if (skb->protocol == htons(ETH_P_IP)) { + IP_ECN_set_ce(skb->nh.iph); + } else if (skb->protocol == htons(ETH_P_IPV6)) { + IP6_ECN_set_ce(skb->nh.ipv6h); + } + } +} + +static inline u8 +ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb) +{ + u8 inner = 0; + if (skb->protocol == htons(ETH_P_IP)) + inner = old_iph->tos; + else if (skb->protocol == htons(ETH_P_IPV6)) + inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph); + return INET_ECN_encapsulate(tos, inner); +} + +static int ipgre_rcv(struct sk_buff *skb) +{ + struct iphdr *iph; + u8 *h; + u16 flags; + u16 csum = 0; + u32 key = 0; + u32 seqno = 0; + struct ip_tunnel *tunnel; + int offset = 4; + + if (!pskb_may_pull(skb, 16)) + goto drop_nolock; + + iph = skb->nh.iph; + h = skb->data; + flags = *(u16*)h; + + if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) { + /* - Version must be 0. + - We do not support routing headers. + */ + if (flags&(GRE_VERSION|GRE_ROUTING)) + goto drop_nolock; + + if (flags&GRE_CSUM) { + if (skb->ip_summed == CHECKSUM_HW) { + csum = (u16)csum_fold(skb->csum); + if (csum) + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->ip_summed == CHECKSUM_NONE) { + skb->csum = skb_checksum(skb, 0, skb->len, 0); + skb->ip_summed = CHECKSUM_HW; + csum = (u16)csum_fold(skb->csum); + } + offset += 4; + } + if (flags&GRE_KEY) { + key = *(u32*)(h + offset); + offset += 4; + } + if (flags&GRE_SEQ) { + seqno = ntohl(*(u32*)(h + offset)); + offset += 4; + } + } + + read_lock(&ipgre_lock); + if ((tunnel = ipgre_tunnel_lookup(iph->saddr, iph->daddr, key)) != NULL) { + secpath_reset(skb); + + skb->protocol = *(u16*)(h + 2); + /* WCCP version 1 and 2 protocol decoding. 
+ * - Change protocol to IP + * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header + */ + if (flags == 0 && + skb->protocol == __constant_htons(ETH_P_WCCP)) { + skb->protocol = __constant_htons(ETH_P_IP); + if ((*(h + offset) & 0xF0) != 0x40) + offset += 4; + } + + skb->mac.raw = skb->nh.raw; + skb->nh.raw = __pskb_pull(skb, offset); + skb_postpull_rcsum(skb, skb->mac.raw, offset); + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->pkt_type = PACKET_HOST; +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + /* Looped back packet, drop it! */ + if (((struct rtable*)skb->dst)->fl.iif == 0) + goto drop; + tunnel->stat.multicast++; + skb->pkt_type = PACKET_BROADCAST; + } +#endif + + if (((flags&GRE_CSUM) && csum) || + (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { + tunnel->stat.rx_crc_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + if (tunnel->parms.i_flags&GRE_SEQ) { + if (!(flags&GRE_SEQ) || + (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { + tunnel->stat.rx_fifo_errors++; + tunnel->stat.rx_errors++; + goto drop; + } + tunnel->i_seqno = seqno + 1; + } + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); + ipgre_ecn_decapsulate(iph, skb); + netif_rx(skb); + read_unlock(&ipgre_lock); + return(0); + } + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + +drop: + read_unlock(&ipgre_lock); +drop_nolock: + kfree_skb(skb); + return(0); +} + +static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *tiph; + u8 tos; + u16 df; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int gre_hlen; + u32 dst; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (dev->hard_header) { + gre_hlen = 0; + tiph = (struct iphdr*)skb->data; + } else { + gre_hlen = tunnel->hlen; + tiph = &tunnel->parms.iph; + } + + if ((dst = tiph->daddr) == 0) { + /* NBMA tunnel */ + + if (skb->dst == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + + if (skb->protocol == htons(ETH_P_IP)) { + rt = (struct rtable*)skb->dst; + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *addr6; + int addr_type; + struct neighbour *neigh = skb->dst->neighbour; + + if (neigh == NULL) + goto tx_error; + + addr6 = (struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &skb->nh.ipv6h->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } +#endif + else + goto tx_error; + } + + tos = tiph->tos; + if (tos&1) { + if (skb->protocol == htons(ETH_P_IP)) + tos = old_iph->tos; + tos &= ~1; + } + + { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_GRE }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error; + } + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + 
tunnel->stat.collisions++; + goto tx_error; + } + + df = tiph->frag_off; + if (df) + mtu = dst_mtu(&rt->u.dst) - tunnel->hlen; + else + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (skb->protocol == htons(ETH_P_IP)) { + df |= (old_iph->frag_off&htons(IP_DF)); + + if ((old_iph->frag_off&htons(IP_DF)) && + mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + } +#ifdef CONFIG_IPV6 + else if (skb->protocol == htons(ETH_P_IPV6)) { + struct rt6_info *rt6 = (struct rt6_info*)skb->dst; + + if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) { + if ((tunnel->parms.iph.daddr && !MULTICAST(tunnel->parms.iph.daddr)) || + rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + skb->dst->metrics[RTAX_MTU-1] = mtu; + } + } + + if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } + } +#endif + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen; + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->h.raw = skb->nh.raw; + skb->nh.raw = skb_push(skb, gre_hlen); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
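/*
 * Illustrative aside (editorial, not part of the original patch).
 * The code below builds the outer packet in place:
 *
 *   outer IP header | GRE flags (2) | protocol (2) | [csum] [key] [seq] | payload
 *
 * The optional words are filled back-to-front: ptr starts at the last
 * 4-byte slot (tunnel->hlen - 4 bytes past the start of the outer IP
 * header) and walks toward the fixed part, writing the sequence number
 * first, then the key, then the checksum, so they end up in the correct
 * wire order when several options are enabled.  The GRE checksum is
 * computed over the GRE header and payload only; the outer IP header
 * keeps its own header checksum.
 */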
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr) >> 2; + iph->frag_off = df; + iph->protocol = IPPROTO_GRE; + iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) { + if (skb->protocol == htons(ETH_P_IP)) + iph->ttl = old_iph->ttl; +#ifdef CONFIG_IPV6 + else if (skb->protocol == htons(ETH_P_IPV6)) + iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit; +#endif + else + iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT); + } + + ((u16*)(iph+1))[0] = tunnel->parms.o_flags; + ((u16*)(iph+1))[1] = skb->protocol; + + if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) { + u32 *ptr = (u32*)(((u8*)iph) + tunnel->hlen - 4); + + if (tunnel->parms.o_flags&GRE_SEQ) { + ++tunnel->o_seqno; + *ptr = htonl(tunnel->o_seqno); + ptr--; + } + if (tunnel->parms.o_flags&GRE_KEY) { + *ptr = tunnel->parms.o_key; + ptr--; + } + if (tunnel->parms.o_flags&GRE_CSUM) { + *ptr = 0; + *(__u16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr)); + } + } + + nf_reset(skb); + + IPTUNNEL_XMIT(); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); + +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipgre_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipgre_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) || + ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + if (!(p.i_flags&GRE_KEY)) + p.i_key = 0; + if (!(p.o_flags&GRE_KEY)) + p.o_key = 0; + + t = ipgre_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != ipgre_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + unsigned nflags=0; + + t = (struct ip_tunnel*)dev->priv; + + if (MULTICAST(p.iph.daddr)) + nflags = IFF_BROADCAST; + else if (p.iph.daddr) + nflags = IFF_POINTOPOINT; + + if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) { + err = -EINVAL; + break; + } + ipgre_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + t->parms.i_key = p.i_key; + t->parms.o_key = p.o_key; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipgre_tunnel_link(t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipgre_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipgre_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == ipgre_fb_tunnel_dev->priv) + goto done; + dev = t->dev; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +#ifdef CONFIG_NET_IPGRE_BROADCAST +/* Nice toy. Unfortunately, useless in real life :-) + It allows to construct virtual multiprotocol broadcast "LAN" + over the Internet, provided multicast routing is tuned. + + + I have no idea was this bicycle invented before me, + so that I had to set ARPHRD_IPGRE to a random value. + I have an impression, that Cisco could make something similar, + but this feature is apparently missing in IOS<=11.2(8). + + I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks + with broadcast 224.66.66.66. If you have access to mbone, play with me :-) + + ping -t 255 224.66.66.66 + + If nobody answers, mbone does not work. + + ip tunnel add Universe mode gre remote 224.66.66.66 local ttl 255 + ip addr add 10.66.66./24 dev Universe + ifconfig Universe up + ifconfig Universe add fe80::/10 + ifconfig Universe add fec0:6666:6666::/96 + ftp 10.66.66.66 + ... + ftp fec0:6666:6666::193.233.7.65 + ... + + */ + +static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); + u16 *p = (u16*)(iph+1); + + memcpy(iph, &t->parms.iph, sizeof(struct iphdr)); + p[0] = t->parms.o_flags; + p[1] = htons(type); + + /* + * Set the source hardware address. 
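/*
 * Editorial aside (hedged, not part of the original patch): note the
 * asymmetric return values at the end of ipgre_header() below.  When
 * the destination is known (an explicit daddr, or a configured unicast
 * endpoint) it returns t->hlen, the number of header bytes pushed;
 * otherwise it returns -t->hlen, which appears to follow the usual
 * dev->hard_header convention that a negative length means "header
 * built except for the destination", to be completed once neighbour
 * resolution supplies the address.
 */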
+ */ + + if (saddr) + memcpy(&iph->saddr, saddr, 4); + + if (daddr) { + memcpy(&iph->daddr, daddr, 4); + return t->hlen; + } + if (iph->daddr && !MULTICAST(iph->daddr)) + return t->hlen; + + return -t->hlen; +} + +static int ipgre_open(struct net_device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + + if (MULTICAST(t->parms.iph.daddr)) { + struct flowi fl = { .oif = t->parms.link, + .nl_u = { .ip4_u = + { .daddr = t->parms.iph.daddr, + .saddr = t->parms.iph.saddr, + .tos = RT_TOS(t->parms.iph.tos) } }, + .proto = IPPROTO_GRE }; + struct rtable *rt; + if (ip_route_output_key(&rt, &fl)) + return -EADDRNOTAVAIL; + dev = rt->u.dst.dev; + ip_rt_put(rt); + if (__in_dev_get(dev) == NULL) + return -EADDRNOTAVAIL; + t->mlink = dev->ifindex; + ip_mc_inc_group(__in_dev_get(dev), t->parms.iph.daddr); + } + return 0; +} + +static int ipgre_close(struct net_device *dev) +{ + struct ip_tunnel *t = (struct ip_tunnel*)dev->priv; + if (MULTICAST(t->parms.iph.daddr) && t->mlink) { + struct in_device *in_dev = inetdev_by_index(t->mlink); + if (in_dev) { + ip_mc_dec_group(in_dev, t->parms.iph.daddr); + in_dev_put(in_dev); + } + } + return 0; +} + +#endif + +static void ipgre_tunnel_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ipgre_tunnel_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ipgre_tunnel_xmit; + dev->get_stats = ipgre_tunnel_get_stats; + dev->do_ioctl = ipgre_tunnel_ioctl; + dev->change_mtu = ipgre_tunnel_change_mtu; + + dev->type = ARPHRD_IPGRE; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr) + 4; + dev->mtu = 1500 - sizeof(struct iphdr) - 4; + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; +} + +static int ipgre_tunnel_init(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + int hlen = LL_MAX_HEADER; + int mtu = 1500; + int addend = sizeof(struct iphdr) + 4; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + /* Guess output device to choose reasonable mtu and hard_header_len */ + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_GRE }; + struct rtable *rt; + if (!ip_route_output_key(&rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + + dev->flags |= IFF_POINTOPOINT; + +#ifdef CONFIG_NET_IPGRE_BROADCAST + if (MULTICAST(iph->daddr)) { + if (!iph->saddr) + return -EINVAL; + dev->flags = IFF_BROADCAST; + dev->hard_header = ipgre_header; + dev->open = ipgre_open; + dev->stop = ipgre_close; + } +#endif + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->parms.link); + + if (tdev) { + hlen = tdev->hard_header_len; + mtu = tdev->mtu; + } + dev->iflink = tunnel->parms.link; + + /* Precalculate GRE options length */ + if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) { + if (tunnel->parms.o_flags&GRE_CSUM) + addend += 4; + if (tunnel->parms.o_flags&GRE_KEY) + addend += 4; + if (tunnel->parms.o_flags&GRE_SEQ) + addend += 4; + } + dev->hard_header_len = hlen + addend; + dev->mtu = mtu - addend; + tunnel->hlen = addend; + return 0; +} + +int __init ipgre_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct iphdr *iph = 
&tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_GRE; + iph->ihl = 5; + tunnel->hlen = sizeof(struct iphdr) + 4; + + dev_hold(dev); + tunnels_wc[0] = tunnel; + return 0; +} + + +static struct net_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, +}; + + +/* + * And now the modules code and kernel interface. + */ + +static int __init ipgre_init(void) +{ + int err; + + printk(KERN_INFO "GRE over IPv4 tunneling driver\n"); + + if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) { + printk(KERN_INFO "ipgre init: can't add protocol\n"); + return -EAGAIN; + } + + ipgre_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0", + ipgre_tunnel_setup); + if (!ipgre_fb_tunnel_dev) { + err = -ENOMEM; + goto err1; + } + + ipgre_fb_tunnel_dev->init = ipgre_fb_tunnel_init; + + if ((err = register_netdev(ipgre_fb_tunnel_dev))) + goto err2; +out: + return err; +err2: + free_netdev(ipgre_fb_tunnel_dev); +err1: + inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + goto out; +} + +static void ipgre_fini(void) +{ + if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + printk(KERN_INFO "ipgre close: can't remove protocol\n"); + + unregister_netdev(ipgre_fb_tunnel_dev); +} + +module_init(ipgre_init); +module_exit(ipgre_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c new file mode 100644 index 000000000000..a0d0833034be --- /dev/null +++ b/net/ipv4/ip_input.c @@ -0,0 +1,431 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) module. + * + * Version: $Id: ip_input.c,v 1.55 2002/01/12 07:39:45 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Donald Becker, + * Alan Cox, + * Richard Underwood + * Stefan Becker, + * Jorge Cwik, + * Arnt Gulbrandsen, + * + * + * Fixes: + * Alan Cox : Commented a couple of minor bits of surplus code + * Alan Cox : Undefining IP_FORWARD doesn't include the code + * (just stops a compiler warning). + * Alan Cox : Frames with >=MAX_ROUTE record routes, strict routes or loose routes + * are junked rather than corrupting things. + * Alan Cox : Frames to bad broadcast subnets are dumped + * We used to process them non broadcast and + * boy could that cause havoc. + * Alan Cox : ip_forward sets the free flag on the + * new frame it queues. Still crap because + * it copies the frame but at least it + * doesn't eat memory too. + * Alan Cox : Generic queue code and memory fixes. + * Fred Van Kempen : IP fragment support (borrowed from NET2E) + * Gerhard Koerting: Forward fragmented frames correctly. + * Gerhard Koerting: Fixes to my fix of the above 8-). + * Gerhard Koerting: IP interface addressing fix. + * Linus Torvalds : More robustness checks + * Alan Cox : Even more checks: Still not as robust as it ought to be + * Alan Cox : Save IP header pointer for later + * Alan Cox : ip option setting + * Alan Cox : Use ip_tos/ip_ttl settings + * Alan Cox : Fragmentation bogosity removed + * (Thanks to Mark.Bush@prg.ox.ac.uk) + * Dmitry Gorodchanin : Send of a raw packet crash fix. + * Alan Cox : Silly ip bug when an overlength + * fragment turns up. Now frees the + * queue. + * Linus Torvalds/ : Memory leakage on fragmentation + * Alan Cox : handling. 
+ * Gerhard Koerting: Forwarding uses IP priority hints + * Teemu Rantanen : Fragment problems. + * Alan Cox : General cleanup, comments and reformat + * Alan Cox : SNMP statistics + * Alan Cox : BSD address rule semantics. Also see + * UDP as there is a nasty checksum issue + * if you do things the wrong way. + * Alan Cox : Always defrag, moved IP_FORWARD to the config.in file + * Alan Cox : IP options adjust sk->priority. + * Pedro Roque : Fix mtu/length error in ip_forward. + * Alan Cox : Avoid ip_chk_addr when possible. + * Richard Underwood : IP multicasting. + * Alan Cox : Cleaned up multicast handlers. + * Alan Cox : RAW sockets demultiplex in the BSD style. + * Gunther Mayer : Fix the SNMP reporting typo + * Alan Cox : Always in group 224.0.0.1 + * Pauline Middelink : Fast ip_checksum update when forwarding + * Masquerading support. + * Alan Cox : Multicast loopback error for 224.0.0.1 + * Alan Cox : IP_MULTICAST_LOOP option. + * Alan Cox : Use notifiers. + * Bjorn Ekwall : Removed ip_csum (from slhc.c too) + * Bjorn Ekwall : Moved ip_fast_csum to ip.h (inline!) + * Stefan Becker : Send out ICMP HOST REDIRECT + * Arnt Gulbrandsen : ip_build_xmit + * Alan Cox : Per socket routing cache + * Alan Cox : Fixed routing cache, added header cache. + * Alan Cox : Loopback didn't work right in original ip_build_xmit - fixed it. + * Alan Cox : Only send ICMP_REDIRECT if src/dest are the same net. + * Alan Cox : Incoming IP option handling. + * Alan Cox : Set saddr on raw output frames as per BSD. + * Alan Cox : Stopped broadcast source route explosions. + * Alan Cox : Can disable source routing + * Takeshi Sone : Masquerading didn't work. + * Dave Bonn,Alan Cox : Faster IP forwarding whenever possible. + * Alan Cox : Memory leaks, tramples, misc debugging. + * Alan Cox : Fixed multicast (by popular demand 8)) + * Alan Cox : Fixed forwarding (by even more popular demand 8)) + * Alan Cox : Fixed SNMP statistics [I think] + * Gerhard Koerting : IP fragmentation forwarding fix + * Alan Cox : Device lock against page fault. + * Alan Cox : IP_HDRINCL facility. + * Werner Almesberger : Zero fragment bug + * Alan Cox : RAW IP frame length bug + * Alan Cox : Outgoing firewall on build_xmit + * A.N.Kuznetsov : IP_OPTIONS support throughout the kernel + * Alan Cox : Multicast routing hooks + * Jos Vos : Do accounting *before* call_in_firewall + * Willy Konynenberg : Transparent proxying support + * + * + * + * To Fix: + * IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient + * and could be made very efficient with the addition of some virtual memory hacks to permit + * the allocation of a buffer that can then be 'grown' by twiddling page tables. + * Output fragmentation wants updating along with the buffer management to use a single + * interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet + * output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause + * fragmentation anyway. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * SNMP management statistics + */ + +DEFINE_SNMP_STAT(struct ipstats_mib, ip_statistics); + +/* + * Process Router Attention IP option + */ +int ip_call_ra_chain(struct sk_buff *skb) +{ + struct ip_ra_chain *ra; + u8 protocol = skb->nh.iph->protocol; + struct sock *last = NULL; + + read_lock(&ip_ra_lock); + for (ra = ip_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + + /* If socket is bound to an interface, only report + * the packet if it came from that interface. + */ + if (sk && inet_sk(sk)->num == protocol && + (!sk->sk_bound_dev_if || + sk->sk_bound_dev_if == skb->dev->ifindex)) { + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb, IP_DEFRAG_CALL_RA_CHAIN); + if (skb == NULL) { + read_unlock(&ip_ra_lock); + return 1; + } + } + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + raw_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + raw_rcv(last, skb); + read_unlock(&ip_ra_lock); + return 1; + } + read_unlock(&ip_ra_lock); + return 0; +} + +static inline int ip_local_deliver_finish(struct sk_buff *skb) +{ + int ihl = skb->nh.iph->ihl*4; + +#ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_local_deliver(skb); +#endif /*CONFIG_NETFILTER_DEBUG*/ + + __skb_pull(skb, ihl); + + /* Free reference early: we don't need it any more, and it may + hold ip_conntrack module loaded indefinitely. */ + nf_reset(skb); + + /* Point into the IP datagram, just past the header. */ + skb->h.raw = skb->data; + + rcu_read_lock(); + { + /* Note: See raw.c and net/raw.h, RAWV4_HTABLE_SIZE==MAX_INET_PROTOS */ + int protocol = skb->nh.iph->protocol; + int hash; + struct sock *raw_sk; + struct net_protocol *ipprot; + + resubmit: + hash = protocol & (MAX_INET_PROTOS - 1); + raw_sk = sk_head(&raw_v4_htable[hash]); + + /* If there maybe a raw socket we must check - if not we + * don't care less + */ + if (raw_sk) + raw_v4_input(skb, skb->nh.iph, hash); + + if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) { + int ret; + + if (!ipprot->no_policy && + !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + goto out; + } + ret = ipprot->handler(skb); + if (ret < 0) { + protocol = -ret; + goto resubmit; + } + IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + } else { + if (!raw_sk) { + if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + icmp_send(skb, ICMP_DEST_UNREACH, + ICMP_PROT_UNREACH, 0); + } + } else + IP_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + } + out: + rcu_read_unlock(); + + return 0; +} + +/* + * Deliver IP Packets to the higher protocol layers. + */ +int ip_local_deliver(struct sk_buff *skb) +{ + /* + * Reassemble IP fragments. + */ + + if (skb->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + skb = ip_defrag(skb, IP_DEFRAG_LOCAL_DELIVER); + if (!skb) + return 0; + } + + return NF_HOOK(PF_INET, NF_IP_LOCAL_IN, skb, skb->dev, NULL, + ip_local_deliver_finish); +} + +static inline int ip_rcv_finish(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct iphdr *iph = skb->nh.iph; + + /* + * Initialise the virtual path cache for the packet. It describes + * how the packet travels inside Linux networking. 
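/*
 * Illustrative aside (editorial, not part of the original patch).
 * This is where the receive path forks: ip_route_input() below attaches
 * a dst entry whose input hook is either ip_local_deliver() (packets
 * addressed to this host) or ip_forward() (seen earlier in this patch),
 * and the dst_input(skb) call at the end of ip_rcv_finish() simply
 * indirects through that hook.  The local-versus-forward decision is
 * therefore made by the routing lookup, not by explicit address
 * comparisons in this function.
 */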
+ */ + if (skb->dst == NULL) { + if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev)) + goto drop; + } + +#ifdef CONFIG_NET_CLS_ROUTE + if (skb->dst->tclassid) { + struct ip_rt_acct *st = ip_rt_acct + 256*smp_processor_id(); + u32 idx = skb->dst->tclassid; + st[idx&0xFF].o_packets++; + st[idx&0xFF].o_bytes+=skb->len; + st[(idx>>16)&0xFF].i_packets++; + st[(idx>>16)&0xFF].i_bytes+=skb->len; + } +#endif + + if (iph->ihl > 5) { + struct ip_options *opt; + + /* It looks as overkill, because not all + IP options require packet mangling. + But it is the easiest for now, especially taking + into account that combination of IP options + and running sniffer is extremely rare condition. + --ANK (980813) + */ + + if (skb_cow(skb, skb_headroom(skb))) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + iph = skb->nh.iph; + + if (ip_options_compile(NULL, skb)) + goto inhdr_error; + + opt = &(IPCB(skb)->opt); + if (opt->srr) { + struct in_device *in_dev = in_dev_get(dev); + if (in_dev) { + if (!IN_DEV_SOURCE_ROUTE(in_dev)) { + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "source route option %u.%u.%u.%u -> %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + in_dev_put(in_dev); + goto drop; + } + in_dev_put(in_dev); + } + if (ip_options_rcv_srr(skb)) + goto drop; + } + } + + return dst_input(skb); + +inhdr_error: + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +/* + * Main IP Receive routine. + */ +int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct iphdr *iph; + + /* When the interface is in promisc. mode, drop all the crap + * that it receives, do not try to analyse it. + */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + IP_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto out; + } + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto inhdr_error; + + iph = skb->nh.iph; + + /* + * RFC1122: 3.1.2.2 MUST silently discard any IP frame that fails the checksum. + * + * Is the datagram acceptable? + * + * 1. Length at least the size of an ip header + * 2. Version of 4 + * 3. Checksums correctly. [Speed optimisation for later, skip loopback checksums] + * 4. Doesn't have a bogus length + */ + + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = skb->nh.iph; + + if (ip_fast_csum((u8 *)iph, iph->ihl) != 0) + goto inhdr_error; + + { + __u32 len = ntohs(iph->tot_len); + if (skb->len < len || len < (iph->ihl<<2)) + goto inhdr_error; + + /* Our transport medium may have padded the buffer out. Now we know it + * is IP we can trim to the true length of the frame. + * Note this now means skb->len holds ntohs(iph->tot_len). + */ + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto drop; + } + } + + return NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, dev, NULL, + ip_rcv_finish); + +inhdr_error: + IP_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); +out: + return NET_RX_DROP; +} + +EXPORT_SYMBOL(ip_rcv); +EXPORT_SYMBOL(ip_statistics); diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c new file mode 100644 index 000000000000..6d89f3f3e701 --- /dev/null +++ b/net/ipv4/ip_options.c @@ -0,0 +1,625 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. 
INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The options processing module for ip.c + * + * Version: $Id: ip_options.c,v 1.21 2001/09/01 00:31:50 davem Exp $ + * + * Authors: A.N.Kuznetsov + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Write options to IP header, record destination address to + * source route option, address of outgoing interface + * (we should already know it, so that this function is allowed be + * called only after routing decision) and timestamp, + * if we originate this datagram. + * + * daddr is real destination address, next hop is recorded in IP header. + * saddr is address of outgoing interface. + */ + +void ip_options_build(struct sk_buff * skb, struct ip_options * opt, + u32 daddr, struct rtable *rt, int is_frag) +{ + unsigned char * iph = skb->nh.raw; + + memcpy(&(IPCB(skb)->opt), opt, sizeof(struct ip_options)); + memcpy(iph+sizeof(struct iphdr), opt->__data, opt->optlen); + opt = &(IPCB(skb)->opt); + opt->is_data = 0; + + if (opt->srr) + memcpy(iph+opt->srr+iph[opt->srr+1]-4, &daddr, 4); + + if (!is_frag) { + if (opt->rr_needaddr) + ip_rt_get_source(iph+opt->rr+iph[opt->rr+2]-5, rt); + if (opt->ts_needaddr) + ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, rt); + if (opt->ts_needtime) { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4); + } + return; + } + if (opt->rr) { + memset(iph+opt->rr, IPOPT_NOP, iph[opt->rr+1]); + opt->rr = 0; + opt->rr_needaddr = 0; + } + if (opt->ts) { + memset(iph+opt->ts, IPOPT_NOP, iph[opt->ts+1]); + opt->ts = 0; + opt->ts_needaddr = opt->ts_needtime = 0; + } +} + +/* + * Provided (sopt, skb) points to received options, + * build in dopt compiled option set appropriate for answering. + * i.e. invert SRR option, copy anothers, + * and grab room in RR/TS options. + * + * NOTE: dopt cannot point to skb. 
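+ * (In practice this is what lets a reply retrace the sender's path: a
+ * recorded source route is reversed, so the first hop of the answer is
+ * roughly the last router that handled the request, and RR/TS options
+ * are echoed with room reserved so they keep recording on the way back.)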
+ */ + +int ip_options_echo(struct ip_options * dopt, struct sk_buff * skb) +{ + struct ip_options *sopt; + unsigned char *sptr, *dptr; + int soffset, doffset; + int optlen; + u32 daddr; + + memset(dopt, 0, sizeof(struct ip_options)); + + dopt->is_data = 1; + + sopt = &(IPCB(skb)->opt); + + if (sopt->optlen == 0) { + dopt->optlen = 0; + return 0; + } + + sptr = skb->nh.raw; + dptr = dopt->__data; + + if (skb->dst) + daddr = ((struct rtable*)skb->dst)->rt_spec_dst; + else + daddr = skb->nh.iph->daddr; + + if (sopt->rr) { + optlen = sptr[sopt->rr+1]; + soffset = sptr[sopt->rr+2]; + dopt->rr = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->rr, optlen); + if (sopt->rr_needaddr && soffset <= optlen) { + if (soffset + 3 > optlen) + return -EINVAL; + dptr[2] = soffset + 4; + dopt->rr_needaddr = 1; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->ts) { + optlen = sptr[sopt->ts+1]; + soffset = sptr[sopt->ts+2]; + dopt->ts = dopt->optlen + sizeof(struct iphdr); + memcpy(dptr, sptr+sopt->ts, optlen); + if (soffset <= optlen) { + if (sopt->ts_needaddr) { + if (soffset + 3 > optlen) + return -EINVAL; + dopt->ts_needaddr = 1; + soffset += 4; + } + if (sopt->ts_needtime) { + if (soffset + 3 > optlen) + return -EINVAL; + if ((dptr[3]&0xF) != IPOPT_TS_PRESPEC) { + dopt->ts_needtime = 1; + soffset += 4; + } else { + dopt->ts_needtime = 0; + + if (soffset + 8 <= optlen) { + __u32 addr; + + memcpy(&addr, sptr+soffset-1, 4); + if (inet_addr_type(addr) != RTN_LOCAL) { + dopt->ts_needtime = 1; + soffset += 8; + } + } + } + } + dptr[2] = soffset; + } + dptr += optlen; + dopt->optlen += optlen; + } + if (sopt->srr) { + unsigned char * start = sptr+sopt->srr; + u32 faddr; + + optlen = start[1]; + soffset = start[2]; + doffset = 0; + if (soffset > optlen) + soffset = optlen + 1; + soffset -= 4; + if (soffset > 3) { + memcpy(&faddr, &start[soffset-1], 4); + for (soffset-=4, doffset=4; soffset > 3; soffset-=4, doffset+=4) + memcpy(&dptr[doffset-1], &start[soffset-1], 4); + /* + * RFC1812 requires to fix illegal source routes. + */ + if (memcmp(&skb->nh.iph->saddr, &start[soffset+3], 4) == 0) + doffset -= 4; + } + if (doffset > 3) { + memcpy(&start[doffset-1], &daddr, 4); + dopt->faddr = faddr; + dptr[0] = start[0]; + dptr[1] = doffset+3; + dptr[2] = 4; + dptr += doffset+3; + dopt->srr = dopt->optlen + sizeof(struct iphdr); + dopt->optlen += doffset+3; + dopt->is_strictroute = sopt->is_strictroute; + } + } + while (dopt->optlen & 3) { + *dptr++ = IPOPT_END; + dopt->optlen++; + } + return 0; +} + +/* + * Options "fragmenting", just fill options not + * allowed in fragments with NOOPs. + * Simple and stupid 8), but the most efficient way. + */ + +void ip_options_fragment(struct sk_buff * skb) +{ + unsigned char * optptr = skb->nh.raw; + struct ip_options * opt = &(IPCB(skb)->opt); + int l = opt->optlen; + int optlen; + + while (l > 0) { + switch (*optptr) { + case IPOPT_END: + return; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) + return; + if (!IPOPT_COPIED(*optptr)) + memset(optptr, IPOPT_NOOP, optlen); + l -= optlen; + optptr += optlen; + } + opt->ts = 0; + opt->rr = 0; + opt->rr_needaddr = 0; + opt->ts_needaddr = 0; + opt->ts_needtime = 0; + return; +} + +/* + * Verify options and fill pointers in struct options. + * Caller should clear *opt, and set opt->data. + * If opt == NULL, then skb->data should point to IP header. 
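+ * As a rough standalone illustration (not part of this file), the same
+ * type/length walk over the options area could be written as:
+ *     void walk_ip_options(const unsigned char *opt, int len)
+ *     {
+ *             int i = 0;
+ *             while (i < len) {
+ *                     if (opt[i] == IPOPT_END)
+ *                             break;
+ *                     if (opt[i] == IPOPT_NOOP) {
+ *                             i++;
+ *                             continue;
+ *                     }
+ *                     if (i + 1 >= len || opt[i+1] < 2 || opt[i+1] > len - i)
+ *                             break;
+ *                     i += opt[i+1];
+ *             }
+ *     }
+ * Every multi-byte option is laid out as type, total length, data;
+ * IPOPT_END and IPOPT_NOOP are the only single-byte ones, which is the
+ * invariant the loop below checks before trusting an option's length.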
+ */ + +int ip_options_compile(struct ip_options * opt, struct sk_buff * skb) +{ + int l; + unsigned char * iph; + unsigned char * optptr; + int optlen; + unsigned char * pp_ptr = NULL; + struct rtable *rt = skb ? (struct rtable*)skb->dst : NULL; + + if (!opt) { + opt = &(IPCB(skb)->opt); + memset(opt, 0, sizeof(struct ip_options)); + iph = skb->nh.raw; + opt->optlen = ((struct iphdr *)iph)->ihl*4 - sizeof(struct iphdr); + optptr = iph + sizeof(struct iphdr); + opt->is_data = 0; + } else { + optptr = opt->is_data ? opt->__data : (unsigned char*)&(skb->nh.iph[1]); + iph = optptr - sizeof(struct iphdr); + } + + for (l = opt->optlen; l > 0; ) { + switch (*optptr) { + case IPOPT_END: + for (optptr++, l--; l>0; optptr++, l--) { + if (*optptr != IPOPT_END) { + *optptr = IPOPT_END; + opt->is_changed = 1; + } + } + goto eol; + case IPOPT_NOOP: + l--; + optptr++; + continue; + } + optlen = optptr[1]; + if (optlen<2 || optlen>l) { + pp_ptr = optptr; + goto error; + } + switch (*optptr) { + case IPOPT_SSRR: + case IPOPT_LSRR: + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + /* NB: cf RFC-1812 5.2.4.1 */ + if (opt->srr) { + pp_ptr = optptr; + goto error; + } + if (!skb) { + if (optptr[2] != 4 || optlen < 7 || ((optlen-3) & 3)) { + pp_ptr = optptr + 1; + goto error; + } + memcpy(&opt->faddr, &optptr[3], 4); + if (optlen > 7) + memmove(&optptr[3], &optptr[7], optlen-7); + } + opt->is_strictroute = (optptr[0] == IPOPT_SSRR); + opt->srr = optptr - iph; + break; + case IPOPT_RR: + if (opt->rr) { + pp_ptr = optptr; + goto error; + } + if (optlen < 3) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 4) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + if (optptr[2]+3 > optlen) { + pp_ptr = optptr + 2; + goto error; + } + if (skb) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + opt->is_changed = 1; + } + optptr[2] += 4; + opt->rr_needaddr = 1; + } + opt->rr = optptr - iph; + break; + case IPOPT_TIMESTAMP: + if (opt->ts) { + pp_ptr = optptr; + goto error; + } + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] < 5) { + pp_ptr = optptr + 2; + goto error; + } + if (optptr[2] <= optlen) { + __u32 * timeptr = NULL; + if (optptr[2]+3 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + switch (optptr[3]&0xF) { + case IPOPT_TS_TSONLY: + opt->ts = optptr - iph; + if (skb) + timeptr = (__u32*)&optptr[optptr[2]-1]; + opt->ts_needtime = 1; + optptr[2] += 4; + break; + case IPOPT_TS_TSANDADDR: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + memcpy(&optptr[optptr[2]-1], &rt->rt_spec_dst, 4); + timeptr = (__u32*)&optptr[optptr[2]+3]; + } + opt->ts_needaddr = 1; + opt->ts_needtime = 1; + optptr[2] += 8; + break; + case IPOPT_TS_PRESPEC: + if (optptr[2]+7 > optptr[1]) { + pp_ptr = optptr + 2; + goto error; + } + opt->ts = optptr - iph; + { + u32 addr; + memcpy(&addr, &optptr[optptr[2]-1], 4); + if (inet_addr_type(addr) == RTN_UNICAST) + break; + if (skb) + timeptr = (__u32*)&optptr[optptr[2]+3]; + } + opt->ts_needtime = 1; + optptr[2] += 8; + break; + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr + 3; + goto error; + } + break; + } + if (timeptr) { + struct timeval tv; + __u32 midtime; + do_gettimeofday(&tv); + midtime = htonl((tv.tv_sec % 86400) * 1000 + tv.tv_usec / 1000); + memcpy(timeptr, &midtime, sizeof(__u32)); + opt->is_changed = 1; + } + } else { + unsigned overflow = 
optptr[3]>>4; + if (overflow == 15) { + pp_ptr = optptr + 3; + goto error; + } + opt->ts = optptr - iph; + if (skb) { + optptr[3] = (optptr[3]&0xF)|((overflow+1)<<4); + opt->is_changed = 1; + } + } + break; + case IPOPT_RA: + if (optlen < 4) { + pp_ptr = optptr + 1; + goto error; + } + if (optptr[2] == 0 && optptr[3] == 0) + opt->router_alert = optptr - iph; + break; + case IPOPT_SEC: + case IPOPT_SID: + default: + if (!skb && !capable(CAP_NET_RAW)) { + pp_ptr = optptr; + goto error; + } + break; + } + l -= optlen; + optptr += optlen; + } + +eol: + if (!pp_ptr) + return 0; + +error: + if (skb) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((pp_ptr-iph)<<24)); + } + return -EINVAL; +} + + +/* + * Undo all the changes done by ip_options_compile(). + */ + +void ip_options_undo(struct ip_options * opt) +{ + if (opt->srr) { + unsigned char * optptr = opt->__data+opt->srr-sizeof(struct iphdr); + memmove(optptr+7, optptr+3, optptr[1]-7); + memcpy(optptr+3, &opt->faddr, 4); + } + if (opt->rr_needaddr) { + unsigned char * optptr = opt->__data+opt->rr-sizeof(struct iphdr); + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + if (opt->ts) { + unsigned char * optptr = opt->__data+opt->ts-sizeof(struct iphdr); + if (opt->ts_needtime) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + if ((optptr[3]&0xF) == IPOPT_TS_PRESPEC) + optptr[2] -= 4; + } + if (opt->ts_needaddr) { + optptr[2] -= 4; + memset(&optptr[optptr[2]-1], 0, 4); + } + } +} + +int ip_options_get(struct ip_options **optp, unsigned char *data, int optlen, int user) +{ + struct ip_options *opt; + + opt = kmalloc(sizeof(struct ip_options)+((optlen+3)&~3), GFP_KERNEL); + if (!opt) + return -ENOMEM; + memset(opt, 0, sizeof(struct ip_options)); + if (optlen) { + if (user) { + if (copy_from_user(opt->__data, data, optlen)) { + kfree(opt); + return -EFAULT; + } + } else + memcpy(opt->__data, data, optlen); + } + while (optlen & 3) + opt->__data[optlen++] = IPOPT_END; + opt->optlen = optlen; + opt->is_data = 1; + opt->is_setbyuser = 1; + if (optlen && ip_options_compile(opt, NULL)) { + kfree(opt); + return -EINVAL; + } + if (*optp) + kfree(*optp); + *optp = opt; + return 0; +} + +void ip_forward_options(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + unsigned char * optptr; + struct rtable *rt = (struct rtable*)skb->dst; + unsigned char *raw = skb->nh.raw; + + if (opt->rr_needaddr) { + optptr = (unsigned char *)raw + opt->rr; + ip_rt_get_source(&optptr[optptr[2]-5], rt); + opt->is_changed = 1; + } + if (opt->srr_is_hit) { + int srrptr, srrspace; + + optptr = raw + opt->srr; + + for ( srrptr=optptr[2], srrspace = optptr[1]; + srrptr <= srrspace; + srrptr += 4 + ) { + if (srrptr + 3 > srrspace) + break; + if (memcmp(&rt->rt_dst, &optptr[srrptr-1], 4) == 0) + break; + } + if (srrptr + 3 <= srrspace) { + opt->is_changed = 1; + ip_rt_get_source(&optptr[srrptr-1], rt); + skb->nh.iph->daddr = rt->rt_dst; + optptr[2] = srrptr+4; + } else if (net_ratelimit()) + printk(KERN_CRIT "ip_forward(): Argh! 
Destination lost!\n"); + if (opt->ts_needaddr) { + optptr = raw + opt->ts; + ip_rt_get_source(&optptr[optptr[2]-9], rt); + opt->is_changed = 1; + } + } + if (opt->is_changed) { + opt->is_changed = 0; + ip_send_check(skb->nh.iph); + } +} + +int ip_options_rcv_srr(struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + int srrspace, srrptr; + u32 nexthop; + struct iphdr *iph = skb->nh.iph; + unsigned char * optptr = skb->nh.raw + opt->srr; + struct rtable *rt = (struct rtable*)skb->dst; + struct rtable *rt2; + int err; + + if (!opt->srr) + return 0; + + if (skb->pkt_type != PACKET_HOST) + return -EINVAL; + if (rt->rt_type == RTN_UNICAST) { + if (!opt->is_strictroute) + return 0; + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl(16<<24)); + return -EINVAL; + } + if (rt->rt_type != RTN_LOCAL) + return -EINVAL; + + for (srrptr=optptr[2], srrspace = optptr[1]; srrptr <= srrspace; srrptr += 4) { + if (srrptr + 3 > srrspace) { + icmp_send(skb, ICMP_PARAMETERPROB, 0, htonl((opt->srr+2)<<24)); + return -EINVAL; + } + memcpy(&nexthop, &optptr[srrptr-1], 4); + + rt = (struct rtable*)skb->dst; + skb->dst = NULL; + err = ip_route_input(skb, nexthop, iph->saddr, iph->tos, skb->dev); + rt2 = (struct rtable*)skb->dst; + if (err || (rt2->rt_type != RTN_UNICAST && rt2->rt_type != RTN_LOCAL)) { + ip_rt_put(rt2); + skb->dst = &rt->u.dst; + return -EINVAL; + } + ip_rt_put(rt); + if (rt2->rt_type != RTN_LOCAL) + break; + /* Superfast 8) loopback forward */ + memcpy(&iph->daddr, &optptr[srrptr-1], 4); + opt->is_changed = 1; + } + if (srrptr <= srrspace) { + opt->srr_is_hit = 1; + opt->is_changed = 1; + } + return 0; +} + +EXPORT_SYMBOL(ip_options_compile); +EXPORT_SYMBOL(ip_options_undo); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c new file mode 100644 index 000000000000..30ab7b6ab761 --- /dev/null +++ b/net/ipv4/ip_output.c @@ -0,0 +1,1359 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The Internet Protocol (IP) output module. + * + * Version: $Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Donald Becker, + * Alan Cox, + * Richard Underwood + * Stefan Becker, + * Jorge Cwik, + * Arnt Gulbrandsen, + * Hirokazu Takahashi, + * + * See ip_input.c for original log + * + * Fixes: + * Alan Cox : Missing nonblock feature in ip_build_xmit. + * Mike Kilburn : htons() missing in ip_build_xmit. + * Bradford Johnson: Fix faulty handling of some frames when + * no route is found. + * Alexander Demenshin: Missing sk/skb free in ip_queue_xmit + * (in case if packet not accepted by + * output firewall rules) + * Mike McLagan : Routing by source + * Alexey Kuznetsov: use new route cache + * Andi Kleen: Fix broken PMTU recovery and remove + * some redundant tests. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Andi Kleen : Replace ip_reply with ip_send_reply. + * Andi Kleen : Split fast and slow ip_build_xmit path + * for decreased register pressure on x86 + * and more readibility. + * Marc Boucher : When call_out_firewall returns FW_QUEUE, + * silently drop skb instead of failing with -EPERM. + * Detlev Wengorz : Copy protocol for fragments. + * Hirokazu Takahashi: HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi: sendfile() on UDP works now. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Shall we try to damage output packets if routing dev changes? + */ + +int sysctl_ip_dynaddr; +int sysctl_ip_default_ttl = IPDEFTTL; + +/* Generate a checksum for an outgoing IP datagram. */ +__inline__ void ip_send_check(struct iphdr *iph) +{ + iph->check = 0; + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); +} + +/* dev_loopback_xmit for use with netfilter. */ +static int ip_dev_loopback_xmit(struct sk_buff *newskb) +{ + newskb->mac.raw = newskb->data; + __skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + BUG_TRAP(newskb->dst); + +#ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_loopback_xmit(newskb); +#endif + netif_rx(newskb); + return 0; +} + +static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst) +{ + int ttl = inet->uc_ttl; + + if (ttl < 0) + ttl = dst_metric(dst, RTAX_HOPLIMIT); + return ttl; +} + +/* + * Add an ip header to a skbuff and send it out. + * + */ +int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk, + u32 saddr, u32 daddr, struct ip_options *opt) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)skb->dst; + struct iphdr *iph; + + /* Build the IP header. */ + if (opt) + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr) + opt->optlen); + else + iph=(struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->ihl = 5; + iph->tos = inet->tos; + if (ip_dont_fragment(sk, &rt->u.dst)) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->protocol = sk->sk_protocol; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, &rt->u.dst, sk); + skb->nh.iph = iph; + + if (opt && opt->optlen) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, daddr, rt, 0); + } + ip_send_check(iph); + + skb->priority = sk->sk_priority; + + /* Send it out. */ + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); +} + +static inline int ip_finish_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct hh_cache *hh = dst->hh; + struct net_device *dev = dst->dev; + int hh_len = LL_RESERVED_SPACE(dev); + + /* Be paranoid, rather than too clever. 
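+ * (Aside: ip_send_check() above stores the 16-bit ones' complement sum
+ * of the header words.  A rough portable equivalent, assuming iph points
+ * at the header and iph->check has already been zeroed:
+ *     unsigned long sum = 0;
+ *     const unsigned short *w = (const unsigned short *)iph;
+ *     int n = iph->ihl * 2;
+ *     while (n--)
+ *             sum += *w++;
+ *     while (sum >> 16)
+ *             sum = (sum & 0xffff) + (sum >> 16);
+ *     iph->check = ~sum;
+ * ip_rcv() rejects any header for which the same sum, taken over the
+ * received header including the check field, does not come out all ones.)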
*/ + if (unlikely(skb_headroom(skb) < hh_len && dev->hard_header)) { + struct sk_buff *skb2; + + skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev)); + if (skb2 == NULL) { + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + kfree_skb(skb); + skb = skb2; + } + +#ifdef CONFIG_NETFILTER_DEBUG + nf_debug_ip_finish_output2(skb); +#endif /*CONFIG_NETFILTER_DEBUG*/ + + if (hh) { + int hh_alen; + + read_lock_bh(&hh->hh_lock); + hh_alen = HH_DATA_ALIGN(hh->hh_len); + memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); + return hh->hh_output(skb); + } else if (dst->neighbour) + return dst->neighbour->output(skb); + + if (net_ratelimit()) + printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n"); + kfree_skb(skb); + return -EINVAL; +} + +int ip_finish_output(struct sk_buff *skb) +{ + struct net_device *dev = skb->dst->dev; + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + return NF_HOOK(PF_INET, NF_IP_POST_ROUTING, skb, NULL, dev, + ip_finish_output2); +} + +int ip_mc_output(struct sk_buff *skb) +{ + struct sock *sk = skb->sk; + struct rtable *rt = (struct rtable*)skb->dst; + struct net_device *dev = rt->u.dst.dev; + + /* + * If the indicated interface is up and running, send the packet. + */ + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + + /* + * Multicasts are looped back for other local users + */ + + if (rt->rt_flags&RTCF_MULTICAST) { + if ((!sk || inet_sk(sk)->mc_loop) +#ifdef CONFIG_IP_MROUTE + /* Small optimization: do not loopback not local frames, + which returned after forwarding; they will be dropped + by ip_mr_input in any case. + Note, that local frames are looped back to be delivered + to local recipients. + + This check is duplicated in ip_mr_input at the moment. + */ + && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED)) +#endif + ) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + if (newskb) + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + newskb->dev, + ip_dev_loopback_xmit); + } + + /* Multicasts with ttl 0 must not go beyond the host */ + + if (skb->nh.iph->ttl == 0) { + kfree_skb(skb); + return 0; + } + } + + if (rt->rt_flags&RTCF_BROADCAST) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + if (newskb) + NF_HOOK(PF_INET, NF_IP_POST_ROUTING, newskb, NULL, + newskb->dev, ip_dev_loopback_xmit); + } + + if (skb->len > dst_mtu(&rt->u.dst)) + return ip_fragment(skb, ip_finish_output); + else + return ip_finish_output(skb); +} + +int ip_output(struct sk_buff *skb) +{ + IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + + if (skb->len > dst_mtu(skb->dst) && !skb_shinfo(skb)->tso_size) + return ip_fragment(skb, ip_finish_output); + else + return ip_finish_output(skb); +} + +int ip_queue_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ip_options *opt = inet->opt; + struct rtable *rt; + struct iphdr *iph; + + /* Skip all of this if the packet is already routed, + * f.e. by something like SCTP. + */ + rt = (struct rtable *) skb->dst; + if (rt != NULL) + goto packet_routed; + + /* Make sure we can route this packet. */ + rt = (struct rtable *)__sk_dst_check(sk, 0); + if (rt == NULL) { + u32 daddr; + + /* Use correct destination address if we have options. 
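+ * (That is, when the socket has a source route set via IP_OPTIONS, the
+ * datagram must first be routed to opt->faddr, the first listed hop,
+ * rather than to the final destination in inet->daddr.)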
*/ + daddr = inet->daddr; + if(opt && opt->srr) + daddr = opt->faddr; + + { + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = sk->sk_protocol, + .uli_u = { .ports = + { .sport = inet->sport, + .dport = inet->dport } } }; + + /* If this fails, retransmit mechanism of transport layer will + * keep trying until route appears or the connection times + * itself out. + */ + if (ip_route_output_flow(&rt, &fl, sk, 0)) + goto no_route; + } + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + } + skb->dst = dst_clone(&rt->u.dst); + +packet_routed: + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto no_route; + + /* OK, we know where to send it, allocate and build IP header. */ + iph = (struct iphdr *) skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0)); + *((__u16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff)); + iph->tot_len = htons(skb->len); + if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + iph->ttl = ip_select_ttl(inet, &rt->u.dst); + iph->protocol = sk->sk_protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + skb->nh.iph = iph; + /* Transport layer set skb->h.foo itself. */ + + if (opt && opt->optlen) { + iph->ihl += opt->optlen >> 2; + ip_options_build(skb, opt, inet->daddr, rt, 0); + } + + ip_select_ident_more(iph, &rt->u.dst, sk, skb_shinfo(skb)->tso_segs); + + /* Add an IP checksum. */ + ip_send_check(iph); + + skb->priority = sk->sk_priority; + + return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + +no_route: + IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EHOSTUNREACH; +} + + +static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + to->security = from->security; + dst_release(to->dst); + to->dst = dst_clone(from->dst); + to->dev = from->dev; + + /* Copy the flags to each fragment. */ + IPCB(to)->flags = IPCB(from)->flags; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif +#ifdef CONFIG_NETFILTER + to->nfmark = from->nfmark; + to->nfcache = from->nfcache; + /* Connection association is same as pre-frag packet */ + nf_conntrack_put(to->nfct); + to->nfct = from->nfct; + nf_conntrack_get(to->nfct); + to->nfctinfo = from->nfctinfo; +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(to->nf_bridge); + to->nf_bridge = from->nf_bridge; + nf_bridge_get(to->nf_bridge); +#endif +#ifdef CONFIG_NETFILTER_DEBUG + to->nf_debug = from->nf_debug; +#endif +#endif +} + +/* + * This IP datagram is too large to be sent in one piece. Break it up into + * smaller pieces (each of size equal to IP header plus + * a block of the data of the original IP data part) that will yet fit in a + * single device frame, and queue such a frame for sending. + */ + +int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) +{ + struct iphdr *iph; + int raw = 0; + int ptr; + struct net_device *dev; + struct sk_buff *skb2; + unsigned int mtu, hlen, left, len, ll_rs; + int offset; + int not_last_frag; + struct rtable *rt = (struct rtable*)skb->dst; + int err = 0; + + dev = rt->u.dst.dev; + + /* + * Point into the IP datagram header. 
+ */ + + iph = skb->nh.iph; + + if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, + htonl(dst_mtu(&rt->u.dst))); + kfree_skb(skb); + return -EMSGSIZE; + } + + /* + * Setup starting values. + */ + + hlen = iph->ihl * 4; + mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + + /* When frag_list is given, use it. First, check its validity: + * some transformers could create wrong frag_list or break existing + * one, it is not prohibited. In this case fall back to copying. + * + * LATER: this step can be merged to real generation of fragments, + * we can switch to copy when see the first bad fragment. + */ + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *frag; + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + (iph->frag_off & htons(IP_MF|IP_OFFSET)) || + skb_cloned(skb)) + goto slow_path; + + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path; + } + + /* Everything is OK. Generate! */ + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = NULL; + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + iph->tot_len = htons(first_len); + iph->frag_off = htons(IP_MF); + ip_send_check(iph); + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + frag->h.raw = frag->data; + frag->nh.raw = __skb_push(frag, hlen); + memcpy(frag->nh.raw, iph, hlen); + iph = frag->nh.iph; + iph->tot_len = htons(frag->len); + ip_copy_metadata(frag, skb); + if (offset == 0) + ip_options_fragment(frag); + offset += skb->len - hlen; + iph->frag_off = htons(offset>>3); + if (frag->next != NULL) + iph->frag_off |= htons(IP_MF); + /* Ready, complete checksum */ + ip_send_check(iph); + } + + err = output(skb); + + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (err == 0) { + IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = raw + hlen; /* Where to start from */ + +#ifdef CONFIG_BRIDGE_NETFILTER + /* for bridged IP traffic encapsulated inside f.e. a vlan header, + * we need to make room for the encapsulating header */ + ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, nf_bridge_pad(skb)); + mtu -= nf_bridge_pad(skb); +#else + ll_rs = LL_RESERVED_SPACE(rt->u.dst.dev); +#endif + /* + * Fragment the datagram. + */ + + offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3; + not_last_frag = iph->frag_off & htons(IP_MF); + + /* + * Keep copying data until we run out. + */ + + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. 
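+ * (The arithmetic above fixes the fragment geometry: every fragment but
+ * the last must carry a multiple of 8 data bytes, because frag_off counts
+ * 8-byte units.  For example, with a 1500 byte MTU and a 20 byte header,
+ * a 4000 byte payload leaves as pieces of 1480, 1480 and 1040 bytes at
+ * offsets 0, 185 and 370 units, with IP_MF set on all but the last.)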
+ */ + + if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) { + NETDEBUG(printk(KERN_INFO "IP: frag: no memory for new fragment!\n")); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip_copy_metadata(skb2, skb); + skb_reserve(skb2, ll_rs); + skb_put(skb2, len + hlen); + skb2->nh.raw = skb2->data; + skb2->h.raw = skb2->data + hlen; + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + + if (skb->sk) + skb_set_owner_w(skb2, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + + memcpy(skb2->nh.raw, skb->data, hlen); + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, skb2->h.raw, len)) + BUG(); + left -= len; + + /* + * Fill in the new header fields. + */ + iph = skb2->nh.iph; + iph->frag_off = htons((offset >> 3)); + + /* ANK: dirty, but effective trick. Upgrade options only if + * the segment to be fragmented was THE FIRST (otherwise, + * options are already fixed) and make it ONCE + * on the initial skb, so that all the following fragments + * will inherit fixed options. + */ + if (offset == 0) + ip_options_fragment(skb); + + /* + * Added AC : If we are fragmenting a fragment that's not the + * last fragment then keep MF on each bit + */ + if (left > 0 || not_last_frag) + iph->frag_off |= htons(IP_MF); + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + IP_INC_STATS(IPSTATS_MIB_FRAGCREATES); + + iph->tot_len = htons(len + hlen); + + ip_send_check(iph); + + err = output(skb2); + if (err) + goto fail; + } + kfree_skb(skb); + IP_INC_STATS(IPSTATS_MIB_FRAGOKS); + return err; + +fail: + kfree_skb(skb); + IP_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; +} + +int +ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct iovec *iov = from; + + if (skb->ip_summed == CHECKSUM_HW) { + if (memcpy_fromiovecend(to, iov, offset, len) < 0) + return -EFAULT; + } else { + unsigned int csum = 0; + if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0) + return -EFAULT; + skb->csum = csum_block_add(skb->csum, csum, odd); + } + return 0; +} + +static inline unsigned int +csum_page(struct page *page, int offset, int copy) +{ + char *kaddr; + unsigned int csum; + kaddr = kmap(page); + csum = csum_partial(kaddr + offset, copy, 0); + kunmap(page); + return csum; +} + +/* + * ip_append_data() and ip_append_page() can make one large IP datagram + * from many pieces of data. Each pieces will be holded on the socket + * until ip_push_pending_frames() is called. Each piece can be a page + * or non-page data. + * + * Not only UDP, other transport protocols - e.g. raw sockets - can use + * this interface potentially. + * + * LATER: length must be adjusted by pad at tail, when it is required. + */ +int ip_append_data(struct sock *sk, + int getfrag(void *from, char *to, int offset, int len, + int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + struct ipcm_cookie *ipc, struct rtable *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + + struct ip_options *opt = NULL; + int hh_len; + int exthdrlen; + int mtu; + int copy; + int err; + int offset = 0; + unsigned int maxfraglen, fragheaderlen; + int csummode = CHECKSUM_NONE; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) { + /* + * setup for corking. 
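+ * Corking is what lets several writes end up in a single datagram; e.g.
+ * from userspace, assuming a connected Linux UDP socket (sketch, not
+ * part of this patch):
+ *     send(fd, part1, len1, MSG_MORE);   (queued only, nothing sent yet)
+ *     send(fd, part2, len2, 0);          (both parts leave as one datagram)
+ * The first call lands in this empty-queue branch; the final one is what
+ * ends with ip_push_pending_frames() sending the assembled datagram.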
+ */ + opt = ipc->opt; + if (opt) { + if (inet->cork.opt == NULL) { + inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation); + if (unlikely(inet->cork.opt == NULL)) + return -ENOBUFS; + } + memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen); + inet->cork.flags |= IPCORK_OPT; + inet->cork.addr = ipc->addr; + } + dst_hold(&rt->u.dst); + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); + inet->cork.rt = rt; + inet->cork.length = 0; + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + if ((exthdrlen = rt->u.dst.header_len) != 0) { + length += exthdrlen; + transhdrlen += exthdrlen; + } + } else { + rt = inet->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + transhdrlen = 0; + exthdrlen = 0; + mtu = inet->cork.fragsize; + } + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + + if (inet->cork.length + length > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); + return -EMSGSIZE; + } + + /* + * transhdrlen > 0 means that this is the first fragment and we wish + * it won't be fragmented in the future. + */ + if (transhdrlen && + length + fragheaderlen <= mtu && + rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + !exthdrlen) + csummode = CHECKSUM_HW; + + inet->cork.length += length; + + /* So, what's going on in the loop below? + * + * We use calculated fragment length to generate chained skb, + * each of segments is IP fragment ready for sending to network after + * adding appropriate IP header. + */ + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. */ + copy = mtu - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; + struct sk_buff *skb_prev; +alloc_new_skb: + skb_prev = skb; + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + if (datalen > mtu - fragheaderlen) + datalen = maxfraglen - fragheaderlen; + fraglen = datalen + fragheaderlen; + + if ((flags & MSG_MORE) && + !(rt->u.dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = datalen + fragheaderlen; + + /* The last fragment gets additional space at tail. + * Note, with MSG_MORE we overallocate on fragments, + * because we have no idea what fragment will be + * the last. + */ + if (datalen == length) + alloclen += rt->u.dst.trailer_len; + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len + 15, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len + 15, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + } + if (skb == NULL) + goto error; + + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. 
+ */ + data = skb_put(skb, fraglen); + skb->nh.raw = data + exthdrlen; + data += fragheaderlen; + skb->h.raw = data + exthdrlen; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + skb_trim(skb_prev, maxfraglen); + } + + copy = datalen - transhdrlen - fraggap; + if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue. + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = sk->sk_sndmsg_page; + int off = sk->sk_sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if (i < MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + } + offset += copy; + length -= copy; + } + + return 0; + +error: + inet->cork.length -= length; + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +ssize_t ip_append_page(struct sock *sk, struct page *page, + int offset, size_t size, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + struct rtable *rt; + struct ip_options *opt = NULL; + int hh_len; + int mtu; + int len; + int err; + unsigned int maxfraglen, fragheaderlen, fraggap; + + if (inet->hdrincl) + return -EPERM; + + if (flags&MSG_PROBE) + return 0; + + if (skb_queue_empty(&sk->sk_write_queue)) + return -EINVAL; + + rt = inet->cork.rt; + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) + return -EOPNOTSUPP; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + mtu = inet->cork.fragsize; + + fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen; + + if (inet->cork.length + size > 0xFFFF - fragheaderlen) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); + return -EMSGSIZE; + } + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + return -EINVAL; + + inet->cork.length += size; + + while (size > 0) { + int i; + + /* Check if the remaining data fits into current packet. 
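+ * (Beyond this point pages are attached to the pending skb as fragments
+ * rather than copied, which is the zero-copy path behind sendfile() on a
+ * UDP socket noted in the changelog, and the reason a scatter-gather
+ * capable device, NETIF_F_SG, was required above.)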
*/ + len = mtu - skb->len; + if (len < size) + len = maxfraglen - skb->len; + if (len <= 0) { + struct sk_buff *skb_prev; + char *data; + struct iphdr *iph; + int alloclen; + + skb_prev = skb; + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + alloclen = fragheaderlen + hh_len + fraggap + 15; + skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); + if (unlikely(!skb)) { + err = -ENOBUFS; + goto error; + } + + /* + * Fill in the control structures + */ + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + skb_reserve(skb, hh_len); + + /* + * Find where to start putting bytes. + */ + data = skb_put(skb, fragheaderlen + fraggap); + skb->nh.iph = iph = (struct iphdr *)data; + data += fragheaderlen; + skb->h.raw = data; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + skb_trim(skb_prev, maxfraglen); + } + + /* + * Put the packet on the pending queue. + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + i = skb_shinfo(skb)->nr_frags; + if (len > size) + len = size; + if (skb_can_coalesce(skb, i, page, offset)) { + skb_shinfo(skb)->frags[i-1].size += len; + } else if (i < MAX_SKB_FRAGS) { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, len); + } else { + err = -EMSGSIZE; + goto error; + } + + if (skb->ip_summed == CHECKSUM_NONE) { + unsigned int csum; + csum = csum_page(page, offset, len); + skb->csum = csum_block_add(skb->csum, csum, skb->len); + } + + skb->len += len; + skb->data_len += len; + offset += len; + size -= len; + } + return 0; + +error: + inet->cork.length -= size; + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +/* + * Combined all pending IP fragments on the socket as one IP datagram + * and push them out. + */ +int ip_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct inet_sock *inet = inet_sk(sk); + struct ip_options *opt = NULL; + struct rtable *rt = inet->cork.rt; + struct iphdr *iph; + int df = 0; + __u8 ttl; + int err = 0; + + if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb->nh.raw) + __skb_pull(skb, skb->nh.raw - skb->data); + while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; + skb->truesize += tmp_skb->truesize; + __sock_put(tmp_skb->sk); + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; + } + + /* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow + * to fragment the frame generated here. No matter, what transforms + * how transforms change size of the packet, it will come out. + */ + if (inet->pmtudisc != IP_PMTUDISC_DO) + skb->local_df = 1; + + /* DF bit is set when we want to see DF on outgoing frames. + * If local_df is set too, we still allow to fragment this frame + * locally. 
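+ * From the socket API this is selected with IP_MTU_DISCOVER, e.g.
+ * (userspace sketch, not part of this file):
+ *     int val = IP_PMTUDISC_DO;
+ *     setsockopt(fd, IPPROTO_IP, IP_MTU_DISCOVER, &val, sizeof(val));
+ * IP_PMTUDISC_DO forces DF, so oversized datagrams typically fail with
+ * EMSGSIZE, while IP_PMTUDISC_DONT clears it and lets ip_fragment()
+ * above split them.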
*/ + if (inet->pmtudisc == IP_PMTUDISC_DO || + (skb->len <= dst_mtu(&rt->u.dst) && + ip_dont_fragment(sk, &rt->u.dst))) + df = htons(IP_DF); + + if (inet->cork.flags & IPCORK_OPT) + opt = inet->cork.opt; + + if (rt->rt_type == RTN_MULTICAST) + ttl = inet->mc_ttl; + else + ttl = ip_select_ttl(inet, &rt->u.dst); + + iph = (struct iphdr *)skb->data; + iph->version = 4; + iph->ihl = 5; + if (opt) { + iph->ihl += opt->optlen>>2; + ip_options_build(skb, opt, inet->cork.addr, rt, 0); + } + iph->tos = inet->tos; + iph->tot_len = htons(skb->len); + iph->frag_off = df; + if (!df) { + __ip_select_ident(iph, &rt->u.dst, 0); + } else { + iph->id = htons(inet->id++); + } + iph->ttl = ttl; + iph->protocol = sk->sk_protocol; + iph->saddr = rt->rt_src; + iph->daddr = rt->rt_dst; + ip_send_check(iph); + + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); + + /* Netfilter gets whole the not fragmented skb. */ + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, + skb->dst->dev, dst_output); + if (err) { + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; + } + +out: + inet->cork.flags &= ~IPCORK_OPT; + if (inet->cork.opt) { + kfree(inet->cork.opt); + inet->cork.opt = NULL; + } + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } + return err; + +error: + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + goto out; +} + +/* + * Throw away all pending data on the socket. + */ +void ip_flush_pending_frames(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) + kfree_skb(skb); + + inet->cork.flags &= ~IPCORK_OPT; + if (inet->cork.opt) { + kfree(inet->cork.opt); + inet->cork.opt = NULL; + } + if (inet->cork.rt) { + ip_rt_put(inet->cork.rt); + inet->cork.rt = NULL; + } +} + + +/* + * Fetch data from kernel space and fill in checksum if needed. + */ +static int ip_reply_glue_bits(void *dptr, char *to, int offset, + int len, int odd, struct sk_buff *skb) +{ + unsigned int csum; + + csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; +} + +/* + * Generic function to send a packet as reply to another packet. + * Used to send TCP resets so far. ICMP should use this function too. + * + * Should run single threaded per socket because it uses the sock + * structure to pass arguments. + * + * LATER: switch from ip_build_xmit to ip_append_* + */ +void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg, + unsigned int len) +{ + struct inet_sock *inet = inet_sk(sk); + struct { + struct ip_options opt; + char data[40]; + } replyopts; + struct ipcm_cookie ipc; + u32 daddr; + struct rtable *rt = (struct rtable*)skb->dst; + + if (ip_options_echo(&replyopts.opt, skb)) + return; + + daddr = ipc.addr = rt->rt_src; + ipc.opt = NULL; + + if (replyopts.opt.optlen) { + ipc.opt = &replyopts.opt; + + if (ipc.opt->srr) + daddr = replyopts.opt.faddr; + } + + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = rt->rt_spec_dst, + .tos = RT_TOS(skb->nh.iph->tos) } }, + /* Not quite clean, but right. */ + .uli_u = { .ports = + { .sport = skb->h.th->dest, + .dport = skb->h.th->source } }, + .proto = sk->sk_protocol }; + if (ip_route_output_key(&rt, &fl)) + return; + } + + /* And let IP do all the hard work. + + This chunk is not reenterable, hence spinlock. 
+ Note that it uses the fact, that this function is called + with locally disabled BH and that sk cannot be already spinlocked. + */ + bh_lock_sock(sk); + inet->tos = skb->nh.iph->tos; + sk->sk_priority = skb->priority; + sk->sk_protocol = skb->nh.iph->protocol; + ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0, + &ipc, rt, MSG_DONTWAIT); + if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) { + if (arg->csumoffset >= 0) + *((u16 *)skb->h.raw + arg->csumoffset) = csum_fold(csum_add(skb->csum, arg->csum)); + skb->ip_summed = CHECKSUM_NONE; + ip_push_pending_frames(sk); + } + + bh_unlock_sock(sk); + + ip_rt_put(rt); +} + +/* + * IP protocol layer initialiser + */ + +static struct packet_type ip_packet_type = { + .type = __constant_htons(ETH_P_IP), + .func = ip_rcv, +}; + +/* + * IP registers the packet type and then calls the subprotocol initialisers + */ + +void __init ip_init(void) +{ + dev_add_pack(&ip_packet_type); + + ip_rt_init(); + inet_initpeers(); + +#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS) + igmp_mc_proc_init(); +#endif +} + +EXPORT_SYMBOL(ip_finish_output); +EXPORT_SYMBOL(ip_fragment); +EXPORT_SYMBOL(ip_generic_getfrag); +EXPORT_SYMBOL(ip_queue_xmit); +EXPORT_SYMBOL(ip_send_check); + +#ifdef CONFIG_SYSCTL +EXPORT_SYMBOL(sysctl_ip_default_ttl); +#endif diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c new file mode 100644 index 000000000000..47012b93cad2 --- /dev/null +++ b/net/ipv4/ip_sockglue.c @@ -0,0 +1,1093 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The IP to API glue. + * + * Version: $Id: ip_sockglue.c,v 1.62 2002/02/01 22:01:04 davem Exp $ + * + * Authors: see ip.c + * + * Fixes: + * Many : Split from ip.c , see ip.c for history. + * Martin Mares : TOS setting fixed. + * Alan Cox : Fixed a couple of oopses in Martin's + * TOS tweaks. + * Mike McLagan : Routing by source + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#endif + +#include +#include + +#define IP_CMSG_PKTINFO 1 +#define IP_CMSG_TTL 2 +#define IP_CMSG_TOS 4 +#define IP_CMSG_RECVOPTS 8 +#define IP_CMSG_RETOPTS 16 + +/* + * SOL_IP control messages. 
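+ * For example, a receiver that wants the incoming interface and the
+ * addresses of each datagram enables IP_PKTINFO and reads the control
+ * message back roughly like this (userspace sketch, not part of this
+ * file):
+ *     int on = 1;
+ *     char data[2048], cbuf[CMSG_SPACE(sizeof(struct in_pktinfo))];
+ *     struct iovec iov = { data, sizeof(data) };
+ *     struct msghdr msg = { 0 };
+ *     struct cmsghdr *cm;
+ *     setsockopt(fd, IPPROTO_IP, IP_PKTINFO, &on, sizeof(on));
+ *     msg.msg_iov = &iov;  msg.msg_iovlen = 1;
+ *     msg.msg_control = cbuf;  msg.msg_controllen = sizeof(cbuf);
+ *     recvmsg(fd, &msg, 0);
+ *     for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
+ *             if (cm->cmsg_level == SOL_IP && cm->cmsg_type == IP_PKTINFO) {
+ *                     struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(cm);
+ *                     ... pi->ipi_ifindex, pi->ipi_addr, pi->ipi_spec_dst ...
+ *             }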
+ */ + +static void ip_cmsg_recv_pktinfo(struct msghdr *msg, struct sk_buff *skb) +{ + struct in_pktinfo info; + struct rtable *rt = (struct rtable *)skb->dst; + + info.ipi_addr.s_addr = skb->nh.iph->daddr; + if (rt) { + info.ipi_ifindex = rt->rt_iif; + info.ipi_spec_dst.s_addr = rt->rt_spec_dst; + } else { + info.ipi_ifindex = 0; + info.ipi_spec_dst.s_addr = 0; + } + + put_cmsg(msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); +} + +static void ip_cmsg_recv_ttl(struct msghdr *msg, struct sk_buff *skb) +{ + int ttl = skb->nh.iph->ttl; + put_cmsg(msg, SOL_IP, IP_TTL, sizeof(int), &ttl); +} + +static void ip_cmsg_recv_tos(struct msghdr *msg, struct sk_buff *skb) +{ + put_cmsg(msg, SOL_IP, IP_TOS, 1, &skb->nh.iph->tos); +} + +static void ip_cmsg_recv_opts(struct msghdr *msg, struct sk_buff *skb) +{ + if (IPCB(skb)->opt.optlen == 0) + return; + + put_cmsg(msg, SOL_IP, IP_RECVOPTS, IPCB(skb)->opt.optlen, skb->nh.iph+1); +} + + +static void ip_cmsg_recv_retopts(struct msghdr *msg, struct sk_buff *skb) +{ + unsigned char optbuf[sizeof(struct ip_options) + 40]; + struct ip_options * opt = (struct ip_options*)optbuf; + + if (IPCB(skb)->opt.optlen == 0) + return; + + if (ip_options_echo(opt, skb)) { + msg->msg_flags |= MSG_CTRUNC; + return; + } + ip_options_undo(opt); + + put_cmsg(msg, SOL_IP, IP_RETOPTS, opt->optlen, opt->__data); +} + + +void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(skb->sk); + unsigned flags = inet->cmsg_flags; + + /* Ordered by supposed usage frequency */ + if (flags & 1) + ip_cmsg_recv_pktinfo(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_ttl(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_tos(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_opts(msg, skb); + if ((flags>>=1) == 0) + return; + + if (flags & 1) + ip_cmsg_recv_retopts(msg, skb); +} + +int ip_cmsg_send(struct msghdr *msg, struct ipcm_cookie *ipc) +{ + int err; + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + if (cmsg->cmsg_level != SOL_IP) + continue; + switch (cmsg->cmsg_type) { + case IP_RETOPTS: + err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr)); + err = ip_options_get(&ipc->opt, CMSG_DATA(cmsg), err < 40 ? err : 40, 0); + if (err) + return err; + break; + case IP_PKTINFO: + { + struct in_pktinfo *info; + if (cmsg->cmsg_len != CMSG_LEN(sizeof(struct in_pktinfo))) + return -EINVAL; + info = (struct in_pktinfo *)CMSG_DATA(cmsg); + ipc->oif = info->ipi_ifindex; + ipc->addr = info->ipi_spec_dst.s_addr; + break; + } + default: + return -EINVAL; + } + } + return 0; +} + + +/* Special input handler for packets caught by router alert option. + They are selected only by protocol field, and then processed likely + local ones; but only if someone wants them! Otherwise, router + not running rsvpd will kill RSVP. + + It is user level problem, what it will make with them. + I have no idea, how it will masquearde or NAT them (it is joke, joke :-)), + but receiver should be enough clever f.e. to forward mtrace requests, + sent to multicast group to reach destination designated router. 
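+ * For example, an RSVP daemon would register for such packets roughly
+ * like this (userspace sketch, not part of this file):
+ *     int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RSVP);
+ *     int on = 1;
+ *     setsockopt(fd, IPPROTO_IP, IP_ROUTER_ALERT, &on, sizeof(on));
+ * After that, packets carrying the Router Alert option whose protocol
+ * matches the socket are delivered to it by ip_call_ra_chain().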
+ */ +struct ip_ra_chain *ip_ra_chain; +DEFINE_RWLOCK(ip_ra_lock); + +int ip_ra_control(struct sock *sk, unsigned char on, void (*destructor)(struct sock *)) +{ + struct ip_ra_chain *ra, *new_ra, **rap; + + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num == IPPROTO_RAW) + return -EINVAL; + + new_ra = on ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + write_lock_bh(&ip_ra_lock); + for (rap = &ip_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (on) { + write_unlock_bh(&ip_ra_lock); + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + *rap = ra->next; + write_unlock_bh(&ip_ra_lock); + + if (ra->destructor) + ra->destructor(sk); + sock_put(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) { + write_unlock_bh(&ip_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->destructor = destructor; + + new_ra->next = ra; + *rap = new_ra; + sock_hold(sk); + write_unlock_bh(&ip_ra_lock); + + return 0; +} + +void ip_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct inet_sock *inet = inet_sk(sk); + struct sock_exterr_skb *serr; + + if (!inet->recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP; + serr->ee.ee_type = skb->h.icmph->type; + serr->ee.ee_code = skb->h.icmph->code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct iphdr*)(skb->h.icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + if (!skb_pull(skb, payload - skb->data) || + sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ip_local_error(struct sock *sk, int err, u32 daddr, u16 port, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + struct sock_exterr_skb *serr; + struct iphdr *iph; + struct sk_buff *skb; + + if (!inet->recverr) + return; + + skb = alloc_skb(sizeof(struct iphdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct iphdr*)skb_put(skb, sizeof(struct iphdr)); + skb->nh.iph = iph; + iph->daddr = daddr; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = port; + + skb->h.raw = skb->tail; + __skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ip_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in *)msg->msg_name; + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = *(u32*)(skb->nh.raw + serr->addr_offset); + sin->sin_port = serr->port; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + 
sin->sin_family = AF_UNSPEC; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP) { + struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + sin->sin_port = 0; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + + put_cmsg(msg, SOL_IP, IP_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_irq(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_irq(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else + spin_unlock_irq(&sk->sk_error_queue.lock); + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + +/* + * Socket option code for IP. This is the end of the line after any TCP,UDP etc options on + * an IP socket. + */ + +int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val=0,err; + + if (level != SOL_IP) + return -ENOPROTOOPT; + + if (((1<= sizeof(int)) { + if (get_user(val, (int __user *) optval)) + return -EFAULT; + } else if (optlen >= sizeof(char)) { + unsigned char ucval; + + if (get_user(ucval, (unsigned char __user *) optval)) + return -EFAULT; + val = (int) ucval; + } + } + + /* If optlen==0, it is equivalent to val == 0 */ + +#ifdef CONFIG_IP_MROUTE + if (optname >= MRT_BASE && optname <= (MRT_BASE + 10)) + return ip_mroute_setsockopt(sk,optname,optval,optlen); +#endif + + err = 0; + lock_sock(sk); + + switch (optname) { + case IP_OPTIONS: + { + struct ip_options * opt = NULL; + if (optlen > 40 || optlen < 0) + goto e_inval; + err = ip_options_get(&opt, optval, optlen, 1); + if (err) + break; + if (sk->sk_type == SOCK_STREAM) { + struct tcp_sock *tp = tcp_sk(sk); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (sk->sk_family == PF_INET || + (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) && + inet->daddr != LOOPBACK4_IPV6)) { +#endif + if (inet->opt) + tp->ext_header_len -= inet->opt->optlen; + if (opt) + tp->ext_header_len += opt->optlen; + tcp_sync_mss(sk, tp->pmtu_cookie); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + } +#endif + } + opt = xchg(&inet->opt, opt); + if (opt) + kfree(opt); + break; + } + case IP_PKTINFO: + if (val) + inet->cmsg_flags |= IP_CMSG_PKTINFO; + else + inet->cmsg_flags &= ~IP_CMSG_PKTINFO; + break; + case IP_RECVTTL: + if (val) + inet->cmsg_flags |= IP_CMSG_TTL; + else + inet->cmsg_flags &= ~IP_CMSG_TTL; + break; + case IP_RECVTOS: + if (val) + inet->cmsg_flags |= IP_CMSG_TOS; + else + inet->cmsg_flags &= ~IP_CMSG_TOS; + break; + case IP_RECVOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RECVOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RECVOPTS; + break; + case IP_RETOPTS: + if (val) + inet->cmsg_flags |= IP_CMSG_RETOPTS; + else + inet->cmsg_flags &= ~IP_CMSG_RETOPTS; + break; + case IP_TOS: /* This sets both TOS and Precedence */ + if (sk->sk_type == SOCK_STREAM) { + val &= ~3; + val |= inet->tos & 3; + } + if (IPTOS_PREC(val) >= IPTOS_PREC_CRITIC_ECP && + !capable(CAP_NET_ADMIN)) { + err = -EPERM; + break; + } + if (inet->tos != val) { + inet->tos = val; + sk->sk_priority = rt_tos2priority(val); + sk_dst_reset(sk); + } + break; + case IP_TTL: + if (optlen<1) + goto e_inval; + if (val != -1 && (val < 1 || val>255)) + goto e_inval; + inet->uc_ttl = val; 
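/*
 * A minimal userspace sketch of the receive side served by the cmsg code
 * above: with IP_PKTINFO and IP_RECVTTL enabled, each recvmsg() carries the
 * arrival interface/address and the TTL as SOL_IP control messages.
 * Port 5000 is an arbitrary illustrative choice.
 */
#define _GNU_SOURCE			/* struct in_pktinfo in glibc headers */
#include <stdio.h>
#include <string.h>
#include <sys/uio.h>
#include <sys/socket.h>
#include <netinet/in.h>

int main(void)
{
	int on = 1, fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_addr.s_addr = htonl(INADDR_ANY),
				    .sin_port = htons(5000) };
	char payload[1500], cbuf[256];
	struct iovec iov = { payload, sizeof(payload) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
	struct cmsghdr *cm;

	setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on));
	setsockopt(fd, SOL_IP, IP_RECVTTL, &on, sizeof(on));
	bind(fd, (struct sockaddr *)&addr, sizeof(addr));

	if (recvmsg(fd, &msg, 0) < 0)
		return 1;

	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm)) {
		if (cm->cmsg_level != SOL_IP)
			continue;
		if (cm->cmsg_type == IP_PKTINFO) {
			struct in_pktinfo *pi = (struct in_pktinfo *)CMSG_DATA(cm);

			printf("arrived on ifindex %d\n", pi->ipi_ifindex);
		} else if (cm->cmsg_type == IP_TTL) {
			int ttl;

			memcpy(&ttl, CMSG_DATA(cm), sizeof(ttl));
			printf("ttl=%d\n", ttl);
		}
	}
	return 0;
}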
+ break; + case IP_HDRINCL: + if (sk->sk_type != SOCK_RAW) { + err = -ENOPROTOOPT; + break; + } + inet->hdrincl = val ? 1 : 0; + break; + case IP_MTU_DISCOVER: + if (val<0 || val>2) + goto e_inval; + inet->pmtudisc = val; + break; + case IP_RECVERR: + inet->recverr = !!val; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + break; + case IP_MULTICAST_TTL: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (optlen<1) + goto e_inval; + if (val==-1) + val = 1; + if (val < 0 || val > 255) + goto e_inval; + inet->mc_ttl = val; + break; + case IP_MULTICAST_LOOP: + if (optlen<1) + goto e_inval; + inet->mc_loop = !!val; + break; + case IP_MULTICAST_IF: + { + struct ip_mreqn mreq; + struct net_device *dev = NULL; + + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + /* + * Check the arguments are allowable + */ + + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if (copy_from_user(&mreq,optval,sizeof(mreq))) + break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (optlen >= sizeof(struct in_addr) && + copy_from_user(&mreq.imr_address,optval,sizeof(struct in_addr))) + break; + } + + if (!mreq.imr_ifindex) { + if (mreq.imr_address.s_addr == INADDR_ANY) { + inet->mc_index = 0; + inet->mc_addr = 0; + err = 0; + break; + } + dev = ip_dev_find(mreq.imr_address.s_addr); + if (dev) { + mreq.imr_ifindex = dev->ifindex; + dev_put(dev); + } + } else + dev = __dev_get_by_index(mreq.imr_ifindex); + + + err = -EADDRNOTAVAIL; + if (!dev) + break; + + err = -EINVAL; + if (sk->sk_bound_dev_if && + mreq.imr_ifindex != sk->sk_bound_dev_if) + break; + + inet->mc_index = mreq.imr_ifindex; + inet->mc_addr = mreq.imr_address.s_addr; + err = 0; + break; + } + + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + { + struct ip_mreqn mreq; + + if (optlen < sizeof(struct ip_mreq)) + goto e_inval; + err = -EFAULT; + if (optlen >= sizeof(struct ip_mreqn)) { + if(copy_from_user(&mreq,optval,sizeof(mreq))) + break; + } else { + memset(&mreq, 0, sizeof(mreq)); + if (copy_from_user(&mreq,optval,sizeof(struct ip_mreq))) + break; + } + + if (optname == IP_ADD_MEMBERSHIP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case IP_MSFILTER: + { + extern int sysctl_optmem_max; + extern int sysctl_igmp_max_msf; + struct ip_msfilter *msf; + + if (optlen < IP_MSFILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; + break; + } + msf = (struct ip_msfilter *)kmalloc(optlen, GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(msf, optval, optlen)) { + kfree(msf); + break; + } + /* numsrc >= (1G-4) overflow in 32 bits */ + if (msf->imsf_numsrc >= 0x3ffffffcU || + msf->imsf_numsrc > sysctl_igmp_max_msf) { + kfree(msf); + err = -ENOBUFS; + break; + } + if (IP_MSFILTER_SIZE(msf->imsf_numsrc) > optlen) { + kfree(msf); + err = -EINVAL; + break; + } + err = ip_mc_msfilter(sk, msf, 0); + kfree(msf); + break; + } + case IP_BLOCK_SOURCE: + case IP_UNBLOCK_SOURCE: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + { + struct ip_mreq_source mreqs; + int omode, add; + + if (optlen != sizeof(struct ip_mreq_source)) + goto e_inval; + if (copy_from_user(&mreqs, optval, sizeof(mreqs))) { + err = -EFAULT; + break; + } + if (optname == IP_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == IP_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == IP_ADD_SOURCE_MEMBERSHIP) { + struct ip_mreqn mreq; + + mreq.imr_multiaddr.s_addr = mreqs.imr_multiaddr; 
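/*
 * An illustrative userspace counterpart of the membership handling above:
 * joining a group with the Linux-specific struct ip_mreqn, which lets the
 * caller pin the interface by index. The group 239.1.1.1 and the device
 * name "eth0" are made-up example values.
 */
#define _GNU_SOURCE			/* struct ip_mreqn in glibc headers */
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct ip_mreqn mreq;

	memset(&mreq, 0, sizeof(mreq));
	inet_pton(AF_INET, "239.1.1.1", &mreq.imr_multiaddr);
	mreq.imr_ifindex = if_nametoindex("eth0");	/* 0 = let routing decide */

	if (setsockopt(fd, SOL_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0)
		perror("IP_ADD_MEMBERSHIP");

	/* ... receive multicast traffic, then leave with the same argument */
	setsockopt(fd, SOL_IP, IP_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
	return 0;
}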
+ mreq.imr_address.s_addr = mreqs.imr_interface; + mreq.imr_ifindex = 0; + err = ip_mc_join_group(sk, &mreq); + if (err) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, 0); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in *psin; + struct ip_mreqn mreq; + + if (optlen < sizeof(struct group_req)) + goto e_inval; + err = -EFAULT; + if(copy_from_user(&greq, optval, sizeof(greq))) + break; + psin = (struct sockaddr_in *)&greq.gr_group; + if (psin->sin_family != AF_INET) + goto e_inval; + memset(&mreq, 0, sizeof(mreq)); + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_ifindex = greq.gr_interface; + + if (optname == MCAST_JOIN_GROUP) + err = ip_mc_join_group(sk, &mreq); + else + err = ip_mc_leave_group(sk, &mreq); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + struct ip_mreq_source mreqs; + struct sockaddr_in *psin; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + err = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET || + greqs.gsr_source.ss_family != AF_INET) { + err = -EADDRNOTAVAIL; + break; + } + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreqs.imr_multiaddr = psin->sin_addr.s_addr; + psin = (struct sockaddr_in *)&greqs.gsr_source; + mreqs.imr_sourceaddr = psin->sin_addr.s_addr; + mreqs.imr_interface = 0; /* use index for mc_source */ + + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + } else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct ip_mreqn mreq; + + psin = (struct sockaddr_in *)&greqs.gsr_group; + mreq.imr_multiaddr = psin->sin_addr; + mreq.imr_address.s_addr = 0; + mreq.imr_ifindex = greqs.gsr_interface; + err = ip_mc_join_group(sk, &mreq); + if (err) + break; + greqs.gsr_interface = mreq.imr_ifindex; + omode = MCAST_INCLUDE; + add = 1; + } else /* MCAST_LEAVE_SOURCE_GROUP */ { + omode = MCAST_INCLUDE; + add = 0; + } + err = ip_mc_source(add, omode, sk, &mreqs, + greqs.gsr_interface); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_optmem_max; + extern int sysctl_igmp_max_msf; + struct sockaddr_in *psin; + struct ip_msfilter *msf = NULL; + struct group_filter *gsf = NULL; + int msize, i, ifindex; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + err = -ENOBUFS; + break; + } + gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL); + if (gsf == 0) { + err = -ENOBUFS; + break; + } + err = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + goto mc_msf_out; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffff || + gsf->gf_numsrc > sysctl_igmp_max_msf) { + err = -ENOBUFS; + goto mc_msf_out; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + err = -EINVAL; + goto mc_msf_out; + } + msize = IP_MSFILTER_SIZE(gsf->gf_numsrc); + msf = (struct ip_msfilter *)kmalloc(msize,GFP_KERNEL); + if (msf == 0) { + err = -ENOBUFS; + goto mc_msf_out; + } + ifindex = gsf->gf_interface; + psin = (struct sockaddr_in *)&gsf->gf_group; + if (psin->sin_family != AF_INET) { + err = -EADDRNOTAVAIL; + goto mc_msf_out; + } + msf->imsf_multiaddr = psin->sin_addr.s_addr; + 
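/*
 * A sketch of the protocol-independent source-specific join handled above
 * through struct group_source_req. The group 232.1.1.1, source
 * 198.51.100.7 and device "eth0" are illustrative assumptions only.
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <net/if.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

static void fill(struct sockaddr_storage *ss, const char *ip)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)ss;

	sin->sin_family = AF_INET;
	inet_pton(AF_INET, ip, &sin->sin_addr);
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct group_source_req gsr;

	memset(&gsr, 0, sizeof(gsr));
	gsr.gsr_interface = if_nametoindex("eth0");
	fill(&gsr.gsr_group, "232.1.1.1");		/* SSM range 232/8 */
	fill(&gsr.gsr_source, "198.51.100.7");

	if (setsockopt(fd, SOL_IP, MCAST_JOIN_SOURCE_GROUP, &gsr, sizeof(gsr)) < 0)
		perror("MCAST_JOIN_SOURCE_GROUP");
	return 0;
}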
msf->imsf_interface = 0; + msf->imsf_fmode = gsf->gf_fmode; + msf->imsf_numsrc = gsf->gf_numsrc; + err = -EADDRNOTAVAIL; + for (i=0; igf_numsrc; ++i) { + psin = (struct sockaddr_in *)&gsf->gf_slist[i]; + + if (psin->sin_family != AF_INET) + goto mc_msf_out; + msf->imsf_slist[i] = psin->sin_addr.s_addr; + } + kfree(gsf); + gsf = NULL; + + err = ip_mc_msfilter(sk, msf, ifindex); +mc_msf_out: + if (msf) + kfree(msf); + if (gsf) + kfree(gsf); + break; + } + case IP_ROUTER_ALERT: + err = ip_ra_control(sk, val ? 1 : 0, NULL); + break; + + case IP_FREEBIND: + if (optlen<1) + goto e_inval; + inet->freebind = !!val; + break; + + case IP_IPSEC_POLICY: + case IP_XFRM_POLICY: + err = xfrm_user_policy(sk, optname, optval, optlen); + break; + + default: +#ifdef CONFIG_NETFILTER + err = nf_setsockopt(sk, PF_INET, optname, optval, + optlen); +#else + err = -ENOPROTOOPT; +#endif + break; + } + release_sock(sk); + return err; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +/* + * Get the options. Note for future reference. The GET of IP options gets the + * _received_ ones. The set sets the _sent_ ones. + */ + +int ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) +{ + struct inet_sock *inet = inet_sk(sk); + int val; + int len; + + if(level!=SOL_IP) + return -EOPNOTSUPP; + +#ifdef CONFIG_IP_MROUTE + if(optname>=MRT_BASE && optname <=MRT_BASE+10) + { + return ip_mroute_getsockopt(sk,optname,optval,optlen); + } +#endif + + if(get_user(len,optlen)) + return -EFAULT; + if(len < 0) + return -EINVAL; + + lock_sock(sk); + + switch(optname) { + case IP_OPTIONS: + { + unsigned char optbuf[sizeof(struct ip_options)+40]; + struct ip_options * opt = (struct ip_options*)optbuf; + opt->optlen = 0; + if (inet->opt) + memcpy(optbuf, inet->opt, + sizeof(struct ip_options)+ + inet->opt->optlen); + release_sock(sk); + + if (opt->optlen == 0) + return put_user(0, optlen); + + ip_options_undo(opt); + + len = min_t(unsigned int, len, opt->optlen); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, opt->__data, len)) + return -EFAULT; + return 0; + } + case IP_PKTINFO: + val = (inet->cmsg_flags & IP_CMSG_PKTINFO) != 0; + break; + case IP_RECVTTL: + val = (inet->cmsg_flags & IP_CMSG_TTL) != 0; + break; + case IP_RECVTOS: + val = (inet->cmsg_flags & IP_CMSG_TOS) != 0; + break; + case IP_RECVOPTS: + val = (inet->cmsg_flags & IP_CMSG_RECVOPTS) != 0; + break; + case IP_RETOPTS: + val = (inet->cmsg_flags & IP_CMSG_RETOPTS) != 0; + break; + case IP_TOS: + val = inet->tos; + break; + case IP_TTL: + val = (inet->uc_ttl == -1 ? 
+ sysctl_ip_default_ttl : + inet->uc_ttl); + break; + case IP_HDRINCL: + val = inet->hdrincl; + break; + case IP_MTU_DISCOVER: + val = inet->pmtudisc; + break; + case IP_MTU: + { + struct dst_entry *dst; + val = 0; + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); + } + if (!val) { + release_sock(sk); + return -ENOTCONN; + } + break; + } + case IP_RECVERR: + val = inet->recverr; + break; + case IP_MULTICAST_TTL: + val = inet->mc_ttl; + break; + case IP_MULTICAST_LOOP: + val = inet->mc_loop; + break; + case IP_MULTICAST_IF: + { + struct in_addr addr; + len = min_t(unsigned int, len, sizeof(struct in_addr)); + addr.s_addr = inet->mc_addr; + release_sock(sk); + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &addr, len)) + return -EFAULT; + return 0; + } + case IP_MSFILTER: + { + struct ip_msfilter msf; + int err; + + if (len < IP_MSFILTER_SIZE(0)) { + release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_msfget(sk, &msf, + (struct ip_msfilter __user *)optval, optlen); + release_sock(sk); + return err; + } + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) { + release_sock(sk); + return -EINVAL; + } + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { + release_sock(sk); + return -EFAULT; + } + err = ip_mc_gsfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + case IP_PKTOPTIONS: + { + struct msghdr msg; + + release_sock(sk); + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + + if (inet->cmsg_flags & IP_CMSG_PKTINFO) { + struct in_pktinfo info; + + info.ipi_addr.s_addr = inet->rcv_saddr; + info.ipi_spec_dst.s_addr = inet->rcv_saddr; + info.ipi_ifindex = inet->mc_index; + put_cmsg(&msg, SOL_IP, IP_PKTINFO, sizeof(info), &info); + } + if (inet->cmsg_flags & IP_CMSG_TTL) { + int hlim = inet->mc_ttl; + put_cmsg(&msg, SOL_IP, IP_TTL, sizeof(hlim), &hlim); + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IP_FREEBIND: + val = inet->freebind; + break; + default: +#ifdef CONFIG_NETFILTER + val = nf_getsockopt(sk, PF_INET, optname, optval, + &len); + release_sock(sk); + if (val >= 0) + val = put_user(len, optlen); + return val; +#else + release_sock(sk); + return -ENOPROTOOPT; +#endif + } + release_sock(sk); + + if (len < sizeof(int) && len > 0 && val>=0 && val<255) { + unsigned char ucval = (unsigned char)val; + len = 1; + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&ucval,1)) + return -EFAULT; + } else { + len = min_t(unsigned int, sizeof(int), len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + } + return 0; +} + +EXPORT_SYMBOL(ip_cmsg_recv); + +#ifdef CONFIG_IP_SCTP_MODULE +EXPORT_SYMBOL(ip_getsockopt); +EXPORT_SYMBOL(ip_setsockopt); +#endif diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c new file mode 100644 index 000000000000..1a23c5263b99 --- /dev/null +++ b/net/ipv4/ipcomp.c @@ -0,0 +1,524 @@ +/* + * IP Payload Compression Protocol (IPComp) - RFC3173. 
+ * + * Copyright (c) 2003 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * Todo: + * - Tunable compression parameters. + * - Compression stats. + * - Adaptive compression. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ipcomp_tfms { + struct list_head list; + struct crypto_tfm **tfms; + int users; +}; + +static DECLARE_MUTEX(ipcomp_resource_sem); +static void **ipcomp_scratches; +static int ipcomp_scratch_users; +static LIST_HEAD(ipcomp_tfms_list); + +static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) +{ + int err, plen, dlen; + struct iphdr *iph; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + plen = skb->len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); + if (err) + goto out; + + if (dlen < (plen + sizeof(struct ip_comp_hdr))) { + err = -EINVAL; + goto out; + } + + err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); + if (err) + goto out; + + skb_put(skb, dlen - plen); + memcpy(skb->data, scratch, dlen); + iph = skb->nh.iph; + iph->tot_len = htons(dlen + iph->ihl * 4); +out: + put_cpu(); + return err; +} + +static int ipcomp_input(struct xfrm_state *x, + struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + u8 nexthdr; + int err = 0; + struct iphdr *iph; + union { + struct iphdr iph; + char buf[60]; + } tmp_iph; + + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + iph = skb->nh.iph; + memcpy(&tmp_iph, iph, iph->ihl * 4); + nexthdr = *(u8 *)skb->data; + skb_pull(skb, sizeof(struct ip_comp_hdr)); + skb->nh.raw += sizeof(struct ip_comp_hdr); + memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); + iph = skb->nh.iph; + iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); + iph->protocol = nexthdr; + skb->h.raw = skb->data; + err = ipcomp_decompress(x, skb); + +out: + return err; +} + +static int ipcomp_compress(struct xfrm_state *x, struct sk_buff *skb) +{ + int err, plen, dlen, ihlen; + struct iphdr *iph = skb->nh.iph; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + ihlen = iph->ihl * 4; + plen = skb->len - ihlen; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data + ihlen; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); + if (err) + goto out; + + if ((dlen + sizeof(struct ip_comp_hdr)) >= plen) { + err = -EMSGSIZE; + goto out; + } + + memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); + put_cpu(); + + pskb_trim(skb, ihlen + dlen + sizeof(struct ip_comp_hdr)); + return 0; + +out: + put_cpu(); + return err; +} + +static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct iphdr *iph; + struct ip_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + int hdr_len = 0; + + iph = 
skb->nh.iph; + iph->tot_len = htons(skb->len); + hdr_len = iph->ihl * 4; + if ((skb->len - hdr_len) < ipcd->threshold) { + /* Don't bother compressing */ + goto out_ok; + } + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + goto out_ok; + } + + err = ipcomp_compress(x, skb); + iph = skb->nh.iph; + + if (err) { + goto out_ok; + } + + /* Install ipcomp header, convert into ipcomp datagram. */ + iph->tot_len = htons(skb->len); + ipch = (struct ip_comp_hdr *)((char *)iph + iph->ihl * 4); + ipch->nexthdr = iph->protocol; + ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + iph->protocol = IPPROTO_COMP; + ip_send_check(iph); + return 0; + +out_ok: + if (x->props.mode) + ip_send_check(iph); + return 0; +} + +static void ipcomp4_err(struct sk_buff *skb, u32 info) +{ + u32 spi; + struct iphdr *iph = (struct iphdr *)skb->data; + struct ip_comp_hdr *ipch = (struct ip_comp_hdr *)(skb->data+(iph->ihl<<2)); + struct xfrm_state *x; + + if (skb->h.icmph->type != ICMP_DEST_UNREACH || + skb->h.icmph->code != ICMP_FRAG_NEEDED) + return; + + spi = ntohl(ntohs(ipch->cpi)); + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, + spi, IPPROTO_COMP, AF_INET); + if (!x) + return; + NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/%u.%u.%u.%u\n", + spi, NIPQUAD(iph->daddr))); + xfrm_state_put(x); +} + +/* We always hold one tunnel user reference to indicate a tunnel */ +static struct xfrm_state *ipcomp_tunnel_create(struct xfrm_state *x) +{ + struct xfrm_state *t; + + t = xfrm_state_alloc(); + if (t == NULL) + goto out; + + t->id.proto = IPPROTO_IPIP; + t->id.spi = x->props.saddr.a4; + t->id.daddr.a4 = x->id.daddr.a4; + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET; + t->props.mode = 1; + t->props.saddr.a4 = x->props.saddr.a4; + t->props.flags = x->props.flags; + + t->type = xfrm_get_type(IPPROTO_IPIP, t->props.family); + if (t->type == NULL) + goto error; + + if (t->type->init_state(t, NULL)) + goto error; + + t->km.state = XFRM_STATE_VALID; + atomic_set(&t->tunnel_users, 1); +out: + return t; + +error: + t->km.state = XFRM_STATE_DEAD; + xfrm_state_put(t); + t = NULL; + goto out; +} + +/* + * Must be protected by xfrm_cfg_sem. State and tunnel user references are + * always incremented on success. 
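/*
 * A small standalone sketch of the RFC 3173 header that the output path
 * above prepends, and of the CPI<->SPI round trip used there (the 16-bit
 * CPI rides in the low bits of the 32-bit xfrm SPI). The SPI value is
 * made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>			/* htons/ntohs/htonl/ntohl */

struct ipcomp_hdr {			/* mirrors struct ip_comp_hdr */
	uint8_t  nexthdr;		/* protocol of the compressed payload */
	uint8_t  flags;			/* must be zero */
	uint16_t cpi;			/* compression parameter index, net order */
};

int main(void)
{
	uint32_t spi = htonl(0x0000a5a5);		/* as kept in x->id.spi */
	struct ipcomp_hdr h = {
		.nexthdr = 6,				/* e.g. TCP */
		.flags   = 0,
		.cpi     = htons((uint16_t)ntohl(spi)),	/* output path */
	};
	uint32_t back = ntohl(ntohs(h.cpi));		/* err handler's lookup key */

	printf("cpi=0x%04x, round trip matches stored spi: %s\n",
	       ntohs(h.cpi), back == spi ? "yes" : "no");
	return 0;
}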
+ */ +static int ipcomp_tunnel_attach(struct xfrm_state *x) +{ + int err = 0; + struct xfrm_state *t; + + t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr.a4, + x->props.saddr.a4, IPPROTO_IPIP, AF_INET); + if (!t) { + t = ipcomp_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); +out: + return err; +} + +static void ipcomp_free_scratches(void) +{ + int i; + void **scratches; + + if (--ipcomp_scratch_users) + return; + + scratches = ipcomp_scratches; + if (!scratches) + return; + + for_each_cpu(i) { + void *scratch = *per_cpu_ptr(scratches, i); + if (scratch) + vfree(scratch); + } + + free_percpu(scratches); +} + +static void **ipcomp_alloc_scratches(void) +{ + int i; + void **scratches; + + if (ipcomp_scratch_users++) + return ipcomp_scratches; + + scratches = alloc_percpu(void *); + if (!scratches) + return NULL; + + ipcomp_scratches = scratches; + + for_each_cpu(i) { + void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); + if (!scratch) + return NULL; + *per_cpu_ptr(scratches, i) = scratch; + } + + return scratches; +} + +static void ipcomp_free_tfms(struct crypto_tfm **tfms) +{ + struct ipcomp_tfms *pos; + int cpu; + + list_for_each_entry(pos, &ipcomp_tfms_list, list) { + if (pos->tfms == tfms) + break; + } + + BUG_TRAP(pos); + + if (--pos->users) + return; + + list_del(&pos->list); + kfree(pos); + + if (!tfms) + return; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); + if (tfm) + crypto_free_tfm(tfm); + } + free_percpu(tfms); +} + +static struct crypto_tfm **ipcomp_alloc_tfms(const char *alg_name) +{ + struct ipcomp_tfms *pos; + struct crypto_tfm **tfms; + int cpu; + + /* This can be any valid CPU ID so we don't need locking. 
*/ + cpu = smp_processor_id(); + + list_for_each_entry(pos, &ipcomp_tfms_list, list) { + struct crypto_tfm *tfm; + + tfms = pos->tfms; + tfm = *per_cpu_ptr(tfms, cpu); + + if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) { + pos->users++; + return tfms; + } + } + + pos = kmalloc(sizeof(*pos), GFP_KERNEL); + if (!pos) + return NULL; + + pos->users = 1; + INIT_LIST_HEAD(&pos->list); + list_add(&pos->list, &ipcomp_tfms_list); + + pos->tfms = tfms = alloc_percpu(struct crypto_tfm *); + if (!tfms) + goto error; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0); + if (!tfm) + goto error; + *per_cpu_ptr(tfms, cpu) = tfm; + } + + return tfms; + +error: + ipcomp_free_tfms(tfms); + return NULL; +} + +static void ipcomp_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfms) + ipcomp_free_tfms(ipcd->tfms); + ipcomp_free_scratches(); +} + +static void ipcomp_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + xfrm_state_delete_tunnel(x); + down(&ipcomp_resource_sem); + ipcomp_free_data(ipcd); + up(&ipcomp_resource_sem); + kfree(ipcd); +} + +static int ipcomp_init_state(struct xfrm_state *x, void *args) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + if (x->encap) + goto out; + + err = -ENOMEM; + ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto out; + + memset(ipcd, 0, sizeof(*ipcd)); + x->props.header_len = 0; + if (x->props.mode) + x->props.header_len += sizeof(struct iphdr); + + down(&ipcomp_resource_sem); + if (!ipcomp_alloc_scratches()) + goto error; + + ipcd->tfms = ipcomp_alloc_tfms(x->calg->alg_name); + if (!ipcd->tfms) + goto error; + up(&ipcomp_resource_sem); + + if (x->props.mode) { + err = ipcomp_tunnel_attach(x); + if (err) + goto error_tunnel; + } + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + err = 0; +out: + return err; + +error_tunnel: + down(&ipcomp_resource_sem); +error: + ipcomp_free_data(ipcd); + up(&ipcomp_resource_sem); + kfree(ipcd); + goto out; +} + +static struct xfrm_type ipcomp_type = { + .description = "IPCOMP4", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp_init_state, + .destructor = ipcomp_destroy, + .input = ipcomp_input, + .output = ipcomp_output +}; + +static struct net_protocol ipcomp4_protocol = { + .handler = xfrm4_rcv, + .err_handler = ipcomp4_err, + .no_policy = 1, +}; + +static int __init ipcomp4_init(void) +{ + if (xfrm_register_type(&ipcomp_type, AF_INET) < 0) { + printk(KERN_INFO "ipcomp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp4_fini(void) +{ + if (inet_del_protocol(&ipcomp4_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp_type, AF_INET) < 0) + printk(KERN_INFO "ip ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp4_init); +module_exit(ipcomp4_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) - RFC3173"); +MODULE_AUTHOR("James Morris "); + diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c new file mode 100644 index 000000000000..f2509034ce72 --- /dev/null +++ 
b/net/ipv4/ipconfig.c @@ -0,0 +1,1507 @@ +/* + * $Id: ipconfig.c,v 1.46 2002/02/01 22:01:04 davem Exp $ + * + * Automatic Configuration of IP -- use DHCP, BOOTP, RARP, or + * user-supplied information to configure own IP address and routes. + * + * Copyright (C) 1996-1998 Martin Mares + * + * Derived from network configuration code in fs/nfs/nfsroot.c, + * originally Copyright (C) 1995, 1996 Gero Kuhlmann and me. + * + * BOOTP rewritten to construct and analyse packets itself instead + * of misusing the IP layer. num_bugs_causing_wrong_arp_replies--; + * -- MJ, December 1998 + * + * Fixed ip_auto_config_setup calling at startup in the new "Linker Magic" + * initialization scheme. + * - Arnaldo Carvalho de Melo , 08/11/1999 + * + * DHCP support added. To users this looks like a whole separate + * protocol, but we know it's just a bag on the side of BOOTP. + * -- Chip Salzenberg , May 2000 + * + * Ported DHCP support from 2.2.16 to 2.4.0-test4 + * -- Eric Biederman , 30 Aug 2000 + * + * Merged changes from 2.2.19 into 2.4.3 + * -- Eric Biederman , 22 April Aug 2001 + * + * Multiple Nameservers in /proc/net/pnp + * -- Josef Siemes , Aug 2002 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Define this to allow debugging output */ +#undef IPCONFIG_DEBUG + +#ifdef IPCONFIG_DEBUG +#define DBG(x) printk x +#else +#define DBG(x) do { } while(0) +#endif + +#if defined(CONFIG_IP_PNP_DHCP) +#define IPCONFIG_DHCP +#endif +#if defined(CONFIG_IP_PNP_BOOTP) || defined(CONFIG_IP_PNP_DHCP) +#define IPCONFIG_BOOTP +#endif +#if defined(CONFIG_IP_PNP_RARP) +#define IPCONFIG_RARP +#endif +#if defined(IPCONFIG_BOOTP) || defined(IPCONFIG_RARP) +#define IPCONFIG_DYNAMIC +#endif + +/* Define the friendly delay before and after opening net devices */ +#define CONF_PRE_OPEN 500 /* Before opening: 1/2 second */ +#define CONF_POST_OPEN 1 /* After opening: 1 second */ + +/* Define the timeout for waiting for a DHCP/BOOTP/RARP reply */ +#define CONF_OPEN_RETRIES 2 /* (Re)open devices twice */ +#define CONF_SEND_RETRIES 6 /* Send six requests per open */ +#define CONF_INTER_TIMEOUT (HZ/2) /* Inter-device timeout: 1/2 second */ +#define CONF_BASE_TIMEOUT (HZ*2) /* Initial timeout: 2 seconds */ +#define CONF_TIMEOUT_RANDOM (HZ) /* Maximum amount of randomization */ +#define CONF_TIMEOUT_MULT *7/4 /* Rate of timeout growth */ +#define CONF_TIMEOUT_MAX (HZ*30) /* Maximum allowed timeout */ +#define CONF_NAMESERVERS_MAX 3 /* Maximum number of nameservers + - '3' from resolv.h */ + + +/* + * Public IP configuration + */ + +/* This is used by platforms which might be able to set the ipconfig + * variables using firmware environment vars. If this is set, it will + * ignore such firmware variables. + */ +int ic_set_manually __initdata = 0; /* IPconfig parameters set manually */ + +static int ic_enable __initdata = 0; /* IP config enabled? */ + +/* Protocol choice */ +int ic_proto_enabled __initdata = 0 +#ifdef IPCONFIG_BOOTP + | IC_BOOTP +#endif +#ifdef CONFIG_IP_PNP_DHCP + | IC_USE_DHCP +#endif +#ifdef IPCONFIG_RARP + | IC_RARP +#endif + ; + +static int ic_host_name_set __initdata = 0; /* Host name set by us? 
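/*
 * A back-of-the-envelope sketch of the retry schedule implied by the
 * timeout constants defined above: start around 2 seconds (plus up to 1s
 * of random jitter, omitted here), grow by 7/4 per pass over the device
 * list, clamp at 30 seconds, and give up after CONF_SEND_RETRIES passes.
 * HZ cancels out once the arithmetic is done in seconds.
 */
#include <stdio.h>

int main(void)
{
	double timeout = 2.0;			/* CONF_BASE_TIMEOUT, no jitter */
	int pass;

	for (pass = 1; pass <= 6; pass++) {	/* CONF_SEND_RETRIES */
		printf("pass %d: wait %.2fs\n", pass, timeout);
		timeout = timeout * 7 / 4;	/* CONF_TIMEOUT_MULT */
		if (timeout > 30.0)		/* CONF_TIMEOUT_MAX */
			timeout = 30.0;
	}
	return 0;
}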
*/ + +u32 ic_myaddr = INADDR_NONE; /* My IP address */ +static u32 ic_netmask = INADDR_NONE; /* Netmask for local subnet */ +u32 ic_gateway = INADDR_NONE; /* Gateway IP address */ + +u32 ic_servaddr = INADDR_NONE; /* Boot server IP address */ + +u32 root_server_addr = INADDR_NONE; /* Address of NFS server */ +u8 root_server_path[256] = { 0, }; /* Path to mount as root */ + +/* Persistent data: */ + +static int ic_proto_used; /* Protocol used, if any */ +static u32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */ +static u8 ic_domain[64]; /* DNS (not NIS) domain name */ + +/* + * Private state. + */ + +/* Name of user-selected boot device */ +static char user_dev_name[IFNAMSIZ] __initdata = { 0, }; + +/* Protocols supported by available interfaces */ +static int ic_proto_have_if __initdata = 0; + +#ifdef IPCONFIG_DYNAMIC +static DEFINE_SPINLOCK(ic_recv_lock); +static volatile int ic_got_reply __initdata = 0; /* Proto(s) that replied */ +#endif +#ifdef IPCONFIG_DHCP +static int ic_dhcp_msgtype __initdata = 0; /* DHCP msg type received */ +#endif + + +/* + * Network devices + */ + +struct ic_device { + struct ic_device *next; + struct net_device *dev; + unsigned short flags; + short able; + u32 xid; +}; + +static struct ic_device *ic_first_dev __initdata = NULL;/* List of open device */ +static struct net_device *ic_dev __initdata = NULL; /* Selected device */ + +static int __init ic_open_devs(void) +{ + struct ic_device *d, **last; + struct net_device *dev; + unsigned short oflags; + + last = &ic_first_dev; + rtnl_shlock(); + + /* bring loopback device up first */ + if (dev_change_flags(&loopback_dev, loopback_dev.flags | IFF_UP) < 0) + printk(KERN_ERR "IP-Config: Failed to open %s\n", loopback_dev.name); + + for (dev = dev_base; dev; dev = dev->next) { + if (dev == &loopback_dev) + continue; + if (user_dev_name[0] ? !strcmp(dev->name, user_dev_name) : + (!(dev->flags & IFF_LOOPBACK) && + (dev->flags & (IFF_POINTOPOINT|IFF_BROADCAST)) && + strncmp(dev->name, "dummy", 5))) { + int able = 0; + if (dev->mtu >= 364) + able |= IC_BOOTP; + else + printk(KERN_WARNING "DHCP/BOOTP: Ignoring device %s, MTU %d too small", dev->name, dev->mtu); + if (!(dev->flags & IFF_NOARP)) + able |= IC_RARP; + able &= ic_proto_enabled; + if (ic_proto_enabled && !able) + continue; + oflags = dev->flags; + if (dev_change_flags(dev, oflags | IFF_UP) < 0) { + printk(KERN_ERR "IP-Config: Failed to open %s\n", dev->name); + continue; + } + if (!(d = kmalloc(sizeof(struct ic_device), GFP_KERNEL))) { + rtnl_shunlock(); + return -1; + } + d->dev = dev; + *last = d; + last = &d->next; + d->flags = oflags; + d->able = able; + if (able & IC_BOOTP) + get_random_bytes(&d->xid, sizeof(u32)); + else + d->xid = 0; + ic_proto_have_if |= able; + DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n", + dev->name, able, d->xid)); + } + } + rtnl_shunlock(); + + *last = NULL; + + if (!ic_first_dev) { + if (user_dev_name[0]) + printk(KERN_ERR "IP-Config: Device `%s' not found.\n", user_dev_name); + else + printk(KERN_ERR "IP-Config: No network devices available.\n"); + return -1; + } + return 0; +} + +static void __init ic_close_devs(void) +{ + struct ic_device *d, *next; + struct net_device *dev; + + rtnl_shlock(); + next = ic_first_dev; + while ((d = next)) { + next = d->next; + dev = d->dev; + if (dev != ic_dev) { + DBG(("IP-Config: Downing %s\n", dev->name)); + dev_change_flags(dev, d->flags); + } + kfree(d); + } + rtnl_shunlock(); +} + +/* + * Interface to various network functions. 
+ */ + +static inline void +set_sockaddr(struct sockaddr_in *sin, u32 addr, u16 port) +{ + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = addr; + sin->sin_port = port; +} + +static int __init ic_dev_ioctl(unsigned int cmd, struct ifreq *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = devinet_ioctl(cmd, (struct ifreq __user *) arg); + set_fs(oldfs); + return res; +} + +static int __init ic_route_ioctl(unsigned int cmd, struct rtentry *arg) +{ + int res; + + mm_segment_t oldfs = get_fs(); + set_fs(get_ds()); + res = ip_rt_ioctl(cmd, (void __user *) arg); + set_fs(oldfs); + return res; +} + +/* + * Set up interface addresses and routes. + */ + +static int __init ic_setup_if(void) +{ + struct ifreq ir; + struct sockaddr_in *sin = (void *) &ir.ifr_ifru.ifru_addr; + int err; + + memset(&ir, 0, sizeof(ir)); + strcpy(ir.ifr_ifrn.ifrn_name, ic_dev->name); + set_sockaddr(sin, ic_myaddr, 0); + if ((err = ic_dev_ioctl(SIOCSIFADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface address (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFNETMASK, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface netmask (%d).\n", err); + return -1; + } + set_sockaddr(sin, ic_myaddr | ~ic_netmask, 0); + if ((err = ic_dev_ioctl(SIOCSIFBRDADDR, &ir)) < 0) { + printk(KERN_ERR "IP-Config: Unable to set interface broadcast address (%d).\n", err); + return -1; + } + return 0; +} + +static int __init ic_setup_routes(void) +{ + /* No need to setup device routes, only the default route... */ + + if (ic_gateway != INADDR_NONE) { + struct rtentry rm; + int err; + + memset(&rm, 0, sizeof(rm)); + if ((ic_gateway ^ ic_myaddr) & ic_netmask) { + printk(KERN_ERR "IP-Config: Gateway not on directly connected network.\n"); + return -1; + } + set_sockaddr((struct sockaddr_in *) &rm.rt_dst, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_genmask, 0, 0); + set_sockaddr((struct sockaddr_in *) &rm.rt_gateway, ic_gateway, 0); + rm.rt_flags = RTF_UP | RTF_GATEWAY; + if ((err = ic_route_ioctl(SIOCADDRT, &rm)) < 0) { + printk(KERN_ERR "IP-Config: Cannot add default route (%d).\n", err); + return -1; + } + } + + return 0; +} + +/* + * Fill in default values for all missing parameters. + */ + +static int __init ic_defaults(void) +{ + /* + * At this point we have no userspace running so need not + * claim locks on system_utsname + */ + + if (!ic_host_name_set) + sprintf(system_utsname.nodename, "%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + + if (root_server_addr == INADDR_NONE) + root_server_addr = ic_servaddr; + + if (ic_netmask == INADDR_NONE) { + if (IN_CLASSA(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSA_NET); + else if (IN_CLASSB(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSB_NET); + else if (IN_CLASSC(ntohl(ic_myaddr))) + ic_netmask = htonl(IN_CLASSC_NET); + else { + printk(KERN_ERR "IP-Config: Unable to guess netmask for address %u.%u.%u.%u\n", + NIPQUAD(ic_myaddr)); + return -1; + } + printk("IP-Config: Guessing netmask %u.%u.%u.%u\n", NIPQUAD(ic_netmask)); + } + + return 0; +} + +/* + * RARP support. 
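/*
 * A portable userspace rendition of the classful netmask guess that
 * ic_defaults() above performs with IN_CLASSA/IN_CLASSB/IN_CLASSC, run
 * over a few sample addresses (the samples are arbitrary).
 */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

static uint32_t guess_netmask(uint32_t a)	/* address in host order */
{
	if ((a & 0x80000000UL) == 0)			/* class A */
		return 0xff000000UL;
	if ((a & 0xc0000000UL) == 0x80000000UL)		/* class B */
		return 0xffff0000UL;
	if ((a & 0xe0000000UL) == 0xc0000000UL)		/* class C */
		return 0xffffff00UL;
	return 0;					/* class D/E: no guess */
}

int main(void)
{
	const char *samples[] = { "10.1.2.3", "172.16.9.9", "192.0.2.1", "224.0.0.1" };
	struct in_addr a, m;
	unsigned i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		inet_pton(AF_INET, samples[i], &a);
		m.s_addr = htonl(guess_netmask(ntohl(a.s_addr)));
		printf("%-12s -> %s\n", samples[i], inet_ntoa(m));
	}
	return 0;
}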
+ */ + +#ifdef IPCONFIG_RARP + +static int ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + +static struct packet_type rarp_packet_type __initdata = { + .type = __constant_htons(ETH_P_RARP), + .func = ic_rarp_recv, +}; + +static inline void ic_rarp_init(void) +{ + dev_add_pack(&rarp_packet_type); +} + +static inline void ic_rarp_cleanup(void) +{ + dev_remove_pack(&rarp_packet_type); +} + +/* + * Process received RARP packet. + */ +static int __init +ic_rarp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct arphdr *rarp; + unsigned char *rarp_ptr; + unsigned long sip, tip; + unsigned char *sha, *tha; /* s for "source", t for "target" */ + struct ic_device *d; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, sizeof(struct arphdr))) + goto drop; + + /* Basic sanity checks can be done without the lock. */ + rarp = (struct arphdr *)skb->h.raw; + + /* If this test doesn't pass, it's not IP, or we should + * ignore it anyway. + */ + if (rarp->ar_hln != dev->addr_len || dev->type != ntohs(rarp->ar_hrd)) + goto drop; + + /* If it's not a RARP reply, delete it. */ + if (rarp->ar_op != htons(ARPOP_RREPLY)) + goto drop; + + /* If it's not Ethernet, delete it. */ + if (rarp->ar_pro != htons(ETH_P_IP)) + goto drop; + + if (!pskb_may_pull(skb, + sizeof(struct arphdr) + + (2 * dev->addr_len) + + (2 * 4))) + goto drop; + + /* OK, it is all there and looks valid, process... */ + rarp = (struct arphdr *)skb->h.raw; + rarp_ptr = (unsigned char *) (rarp + 1); + + /* One reply at a time, please. */ + spin_lock(&ic_recv_lock); + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop_unlock; + + /* Find the ic_device that the packet arrived on */ + d = ic_first_dev; + while (d && d->dev != dev) + d = d->next; + if (!d) + goto drop_unlock; /* should never happen */ + + /* Extract variable-width fields */ + sha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&sip, rarp_ptr, 4); + rarp_ptr += 4; + tha = rarp_ptr; + rarp_ptr += dev->addr_len; + memcpy(&tip, rarp_ptr, 4); + + /* Discard packets which are not meant for us. */ + if (memcmp(tha, dev->dev_addr, dev->addr_len)) + goto drop_unlock; + + /* Discard packets which are not from specified server. */ + if (ic_servaddr != INADDR_NONE && ic_servaddr != sip) + goto drop_unlock; + + /* We have a winner! */ + ic_dev = dev; + if (ic_myaddr == INADDR_NONE) + ic_myaddr = tip; + ic_servaddr = sip; + ic_got_reply = IC_RARP; + +drop_unlock: + /* Show's over. Nothing to see here. */ + spin_unlock(&ic_recv_lock); + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + return 0; +} + + +/* + * Send RARP request packet over a single interface. + */ +static void __init ic_rarp_send_if(struct ic_device *d) +{ + struct net_device *dev = d->dev; + arp_send(ARPOP_RREQUEST, ETH_P_RARP, 0, dev, 0, NULL, + dev->dev_addr, dev->dev_addr); +} +#endif + +/* + * DHCP/BOOTP support. + */ + +#ifdef IPCONFIG_BOOTP + +struct bootp_pkt { /* BOOTP packet format */ + struct iphdr iph; /* IP header */ + struct udphdr udph; /* UDP header */ + u8 op; /* 1=request, 2=reply */ + u8 htype; /* HW address type */ + u8 hlen; /* HW address length */ + u8 hops; /* Used only by gateways */ + u32 xid; /* Transaction ID */ + u16 secs; /* Seconds since we started */ + u16 flags; /* Just what it says */ + u32 client_ip; /* Client's IP address if known */ + u32 your_ip; /* Assigned IP address */ + u32 server_ip; /* (Next, e.g. 
NFS) Server's IP address */ + u32 relay_ip; /* IP address of BOOTP relay */ + u8 hw_addr[16]; /* Client's HW address */ + u8 serv_name[64]; /* Server host name */ + u8 boot_file[128]; /* Name of boot file */ + u8 exten[312]; /* DHCP options / BOOTP vendor extensions */ +}; + +/* packet ops */ +#define BOOTP_REQUEST 1 +#define BOOTP_REPLY 2 + +/* DHCP message types */ +#define DHCPDISCOVER 1 +#define DHCPOFFER 2 +#define DHCPREQUEST 3 +#define DHCPDECLINE 4 +#define DHCPACK 5 +#define DHCPNAK 6 +#define DHCPRELEASE 7 +#define DHCPINFORM 8 + +static int ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt); + +static struct packet_type bootp_packet_type __initdata = { + .type = __constant_htons(ETH_P_IP), + .func = ic_bootp_recv, +}; + + +/* + * Initialize DHCP/BOOTP extension fields in the request. + */ + +static const u8 ic_bootp_cookie[4] = { 99, 130, 83, 99 }; + +#ifdef IPCONFIG_DHCP + +static void __init +ic_dhcp_init_options(u8 *options) +{ + u8 mt = ((ic_servaddr == INADDR_NONE) + ? DHCPDISCOVER : DHCPREQUEST); + u8 *e = options; + +#ifdef IPCONFIG_DEBUG + printk("DHCP: Sending message type %d\n", mt); +#endif + + memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ + e += 4; + + *e++ = 53; /* DHCP message type */ + *e++ = 1; + *e++ = mt; + + if (mt == DHCPREQUEST) { + *e++ = 54; /* Server ID (IP address) */ + *e++ = 4; + memcpy(e, &ic_servaddr, 4); + e += 4; + + *e++ = 50; /* Requested IP address */ + *e++ = 4; + memcpy(e, &ic_myaddr, 4); + e += 4; + } + + /* always? */ + { + static const u8 ic_req_params[] = { + 1, /* Subnet mask */ + 3, /* Default gateway */ + 6, /* DNS server */ + 12, /* Host name */ + 15, /* Domain name */ + 17, /* Boot path */ + 40, /* NIS domain name */ + }; + + *e++ = 55; /* Parameter request list */ + *e++ = sizeof(ic_req_params); + memcpy(e, ic_req_params, sizeof(ic_req_params)); + e += sizeof(ic_req_params); + } + + *e++ = 255; /* End of the list */ +} + +#endif /* IPCONFIG_DHCP */ + +static void __init ic_bootp_init_ext(u8 *e) +{ + memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */ + e += 4; + *e++ = 1; /* Subnet mask request */ + *e++ = 4; + e += 4; + *e++ = 3; /* Default gateway request */ + *e++ = 4; + e += 4; + *e++ = 5; /* Name server request */ + *e++ = 8; + e += 8; + *e++ = 12; /* Host name request */ + *e++ = 32; + e += 32; + *e++ = 40; /* NIS Domain name request */ + *e++ = 32; + e += 32; + *e++ = 17; /* Boot path */ + *e++ = 40; + e += 40; + + *e++ = 57; /* set extension buffer size for reply */ + *e++ = 2; + *e++ = 1; /* 128+236+8+20+14, see dhcpd sources */ + *e++ = 150; + + *e++ = 255; /* End of the list */ +} + + +/* + * Initialize the DHCP/BOOTP mechanism. + */ +static inline void ic_bootp_init(void) +{ + int i; + + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) + ic_nameservers[i] = INADDR_NONE; + + dev_add_pack(&bootp_packet_type); +} + + +/* + * DHCP/BOOTP cleanup. + */ +static inline void ic_bootp_cleanup(void) +{ + dev_remove_pack(&bootp_packet_type); +} + + +/* + * Send DHCP/BOOTP request to single interface. 
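/*
 * A self-contained sketch of the option encoding that ic_dhcp_init_options()
 * above emits: the 99.130.83.99 magic cookie followed by type/length/value
 * triples, terminated by 255. The option contents chosen here (a DISCOVER
 * asking for subnet mask, router and DNS) are illustrative.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	static const uint8_t cookie[4] = { 99, 130, 83, 99 };
	uint8_t opts[64], *e = opts;
	const uint8_t *p;

	memcpy(e, cookie, 4);  e += 4;
	*e++ = 53; *e++ = 1; *e++ = 1;		/* DHCP message type: DISCOVER */
	*e++ = 55; *e++ = 3;			/* parameter request list */
	*e++ = 1; *e++ = 3; *e++ = 6;		/* subnet mask, router, DNS */
	*e++ = 255;				/* end of options */

	/* walk it back the same way a reply parser would */
	for (p = opts + 4; *p != 255; p += 2 + p[1])
		printf("option %u, %u byte(s)\n", p[0], p[1]);
	return 0;
}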
+ */ +static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_diff) +{ + struct net_device *dev = d->dev; + struct sk_buff *skb; + struct bootp_pkt *b; + int hh_len = LL_RESERVED_SPACE(dev); + struct iphdr *h; + + /* Allocate packet */ + skb = alloc_skb(sizeof(struct bootp_pkt) + hh_len + 15, GFP_KERNEL); + if (!skb) + return; + skb_reserve(skb, hh_len); + b = (struct bootp_pkt *) skb_put(skb, sizeof(struct bootp_pkt)); + memset(b, 0, sizeof(struct bootp_pkt)); + + /* Construct IP header */ + skb->nh.iph = h = &b->iph; + h->version = 4; + h->ihl = 5; + h->tot_len = htons(sizeof(struct bootp_pkt)); + h->frag_off = htons(IP_DF); + h->ttl = 64; + h->protocol = IPPROTO_UDP; + h->daddr = INADDR_BROADCAST; + h->check = ip_fast_csum((unsigned char *) h, h->ihl); + + /* Construct UDP header */ + b->udph.source = htons(68); + b->udph.dest = htons(67); + b->udph.len = htons(sizeof(struct bootp_pkt) - sizeof(struct iphdr)); + /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ + + /* Construct DHCP/BOOTP header */ + b->op = BOOTP_REQUEST; + if (dev->type < 256) /* check for false types */ + b->htype = dev->type; + else if (dev->type == ARPHRD_IEEE802_TR) /* fix for token ring */ + b->htype = ARPHRD_IEEE802; + else if (dev->type == ARPHRD_FDDI) + b->htype = ARPHRD_ETHER; + else { + printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name); + b->htype = dev->type; /* can cause undefined behavior */ + } + b->hlen = dev->addr_len; + b->your_ip = INADDR_NONE; + b->server_ip = INADDR_NONE; + memcpy(b->hw_addr, dev->dev_addr, dev->addr_len); + b->secs = htons(jiffies_diff / HZ); + b->xid = d->xid; + + /* add DHCP options or BOOTP extensions */ +#ifdef IPCONFIG_DHCP + if (ic_proto_enabled & IC_USE_DHCP) + ic_dhcp_init_options(b->exten); + else +#endif + ic_bootp_init_ext(b->exten); + + /* Chain packet down the line... */ + skb->dev = dev; + skb->protocol = htons(ETH_P_IP); + if ((dev->hard_header && + dev->hard_header(skb, dev, ntohs(skb->protocol), dev->broadcast, dev->dev_addr, skb->len) < 0) || + dev_queue_xmit(skb) < 0) + printk("E"); +} + + +/* + * Copy BOOTP-supplied string if not already set. + */ +static int __init ic_bootp_string(char *dest, char *src, int len, int max) +{ + if (!len) + return 0; + if (len > max-1) + len = max-1; + memcpy(dest, src, len); + dest[len] = '\0'; + return 1; +} + + +/* + * Process BOOTP extensions. + */ +static void __init ic_do_bootp_ext(u8 *ext) +{ + u8 servers; + int i; + +#ifdef IPCONFIG_DEBUG + u8 *c; + + printk("DHCP/BOOTP: Got extension %d:",*ext); + for(c=ext+2; c CONF_NAMESERVERS_MAX) + servers = CONF_NAMESERVERS_MAX; + for (i = 0; i < servers; i++) { + if (ic_nameservers[i] == INADDR_NONE) + memcpy(&ic_nameservers[i], ext+1+4*i, 4); + } + break; + case 12: /* Host name */ + ic_bootp_string(system_utsname.nodename, ext+1, *ext, __NEW_UTS_LEN); + ic_host_name_set = 1; + break; + case 15: /* Domain name (DNS) */ + ic_bootp_string(ic_domain, ext+1, *ext, sizeof(ic_domain)); + break; + case 17: /* Root path */ + if (!root_server_path[0]) + ic_bootp_string(root_server_path, ext+1, *ext, sizeof(root_server_path)); + break; + case 40: /* NIS Domain name (_not_ DNS) */ + ic_bootp_string(system_utsname.domainname, ext+1, *ext, __NEW_UTS_LEN); + break; + } +} + + +/* + * Receive BOOTP reply. 
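/*
 * ip_fast_csum() used by the send path above is an arch-optimised form of
 * the standard RFC 1071 Internet checksum; this is a plain, portable sketch
 * of the same sum applied to a dummy 20-byte IP header (field values are
 * arbitrary).
 */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

static uint16_t inet_csum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];	/* 16-bit big-endian words */
		p += 2;
		len -= 2;
	}
	if (len)
		sum += (uint32_t)p[0] << 8;		/* odd trailing byte */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);	/* fold the carries */
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t iph[20] = { 0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
			    0x40, 0x11, 0x00, 0x00,	/* checksum field zeroed */
			    192, 0, 2, 1, 255, 255, 255, 255 };
	uint16_t c = inet_csum(iph, sizeof(iph));

	printf("checksum = 0x%04x\n", c);
	iph[10] = c >> 8;				/* store big-endian */
	iph[11] = c & 0xff;
	printf("verify (should be 0): 0x%04x\n", inet_csum(iph, sizeof(iph)));
	return 0;
}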
+ */ +static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct bootp_pkt *b; + struct iphdr *h; + struct ic_device *d; + int len, ext_len; + + /* Perform verifications before taking the lock. */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + return NET_RX_DROP; + + if (!pskb_may_pull(skb, + sizeof(struct iphdr) + + sizeof(struct udphdr))) + goto drop; + + b = (struct bootp_pkt *) skb->nh.iph; + h = &b->iph; + + if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) + goto drop; + + /* Fragments are not supported */ + if (h->frag_off & htons(IP_OFFSET | IP_MF)) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Ignoring fragmented " + "reply.\n"); + goto drop; + } + + if (skb->len < ntohs(h->tot_len)) + goto drop; + + if (ip_fast_csum((char *) h, h->ihl)) + goto drop; + + if (b->udph.source != htons(67) || b->udph.dest != htons(68)) + goto drop; + + if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) + goto drop; + + len = ntohs(b->udph.len) - sizeof(struct udphdr); + ext_len = len - (sizeof(*b) - + sizeof(struct iphdr) - + sizeof(struct udphdr) - + sizeof(b->exten)); + if (ext_len < 0) + goto drop; + + /* Ok the front looks good, make sure we can get at the rest. */ + if (!pskb_may_pull(skb, skb->len)) + goto drop; + + b = (struct bootp_pkt *) skb->nh.iph; + h = &b->iph; + + /* One reply at a time, please. */ + spin_lock(&ic_recv_lock); + + /* If we already have a reply, just drop the packet */ + if (ic_got_reply) + goto drop_unlock; + + /* Find the ic_device that the packet arrived on */ + d = ic_first_dev; + while (d && d->dev != dev) + d = d->next; + if (!d) + goto drop_unlock; /* should never happen */ + + /* Is it a reply to our BOOTP request? */ + if (b->op != BOOTP_REPLY || + b->xid != d->xid) { + if (net_ratelimit()) + printk(KERN_ERR "DHCP/BOOTP: Reply not for us, " + "op[%x] xid[%x]\n", + b->op, b->xid); + goto drop_unlock; + } + + /* Parse extensions */ + if (ext_len >= 4 && + !memcmp(b->exten, ic_bootp_cookie, 4)) { /* Check magic cookie */ + u8 *end = (u8 *) b + ntohs(b->iph.tot_len); + u8 *ext; + +#ifdef IPCONFIG_DHCP + if (ic_proto_enabled & IC_USE_DHCP) { + u32 server_id = INADDR_NONE; + int mt = 0; + + ext = &b->exten[4]; + while (ext < end && *ext != 0xff) { + u8 *opt = ext++; + if (*opt == 0) /* Padding */ + continue; + ext += *ext + 1; + if (ext >= end) + break; + switch (*opt) { + case 53: /* Message type */ + if (opt[1]) + mt = opt[2]; + break; + case 54: /* Server ID (IP address) */ + if (opt[1] >= 4) + memcpy(&server_id, opt + 2, 4); + break; + }; + } + +#ifdef IPCONFIG_DEBUG + printk("DHCP: Got message type %d\n", mt); +#endif + + switch (mt) { + case DHCPOFFER: + /* While in the process of accepting one offer, + * ignore all others. + */ + if (ic_myaddr != INADDR_NONE) + goto drop_unlock; + + /* Let's accept that offer. */ + ic_myaddr = b->your_ip; + ic_servaddr = server_id; +#ifdef IPCONFIG_DEBUG + printk("DHCP: Offered address %u.%u.%u.%u", + NIPQUAD(ic_myaddr)); + printk(" by server %u.%u.%u.%u\n", + NIPQUAD(ic_servaddr)); +#endif + /* The DHCP indicated server address takes + * precedence over the bootp header one if + * they are different. + */ + if ((server_id != INADDR_NONE) && + (b->server_ip != server_id)) + b->server_ip = ic_servaddr; + break; + + case DHCPACK: + if (memcmp(dev->dev_addr, b->hw_addr, dev->addr_len) != 0) + goto drop_unlock; + + /* Yeah! */ + break; + + default: + /* Urque. 
Forget it*/ + ic_myaddr = INADDR_NONE; + ic_servaddr = INADDR_NONE; + goto drop_unlock; + }; + + ic_dhcp_msgtype = mt; + + } +#endif /* IPCONFIG_DHCP */ + + ext = &b->exten[4]; + while (ext < end && *ext != 0xff) { + u8 *opt = ext++; + if (*opt == 0) /* Padding */ + continue; + ext += *ext + 1; + if (ext < end) + ic_do_bootp_ext(opt); + } + } + + /* We have a winner! */ + ic_dev = dev; + ic_myaddr = b->your_ip; + ic_servaddr = b->server_ip; + if (ic_gateway == INADDR_NONE && b->relay_ip) + ic_gateway = b->relay_ip; + if (ic_nameservers[0] == INADDR_NONE) + ic_nameservers[0] = ic_servaddr; + ic_got_reply = IC_BOOTP; + +drop_unlock: + /* Show's over. Nothing to see here. */ + spin_unlock(&ic_recv_lock); + +drop: + /* Throw the packet out. */ + kfree_skb(skb); + + return 0; +} + + +#endif + + +/* + * Dynamic IP configuration -- DHCP, BOOTP, RARP. + */ + +#ifdef IPCONFIG_DYNAMIC + +static int __init ic_dynamic(void) +{ + int retries; + struct ic_device *d; + unsigned long start_jiffies, timeout, jiff; + int do_bootp = ic_proto_have_if & IC_BOOTP; + int do_rarp = ic_proto_have_if & IC_RARP; + + /* + * If none of DHCP/BOOTP/RARP was selected, return with an error. + * This routine gets only called when some pieces of information + * are missing, and without DHCP/BOOTP/RARP we are unable to get it. + */ + if (!ic_proto_enabled) { + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + return -1; + } + +#ifdef IPCONFIG_BOOTP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_BOOTP) + printk(KERN_ERR "DHCP/BOOTP: No suitable device found.\n"); +#endif +#ifdef IPCONFIG_RARP + if ((ic_proto_enabled ^ ic_proto_have_if) & IC_RARP) + printk(KERN_ERR "RARP: No suitable device found.\n"); +#endif + + if (!ic_proto_have_if) + /* Error message already printed */ + return -1; + + /* + * Setup protocols + */ +#ifdef IPCONFIG_BOOTP + if (do_bootp) + ic_bootp_init(); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp) + ic_rarp_init(); +#endif + + /* + * Send requests and wait, until we get an answer. This loop + * seems to be a terrible waste of CPU time, but actually there is + * only one process running at all, so we don't need to use any + * scheduler functions. + * [Actually we could now, but the nothing else running note still + * applies.. - AC] + */ + printk(KERN_NOTICE "Sending %s%s%s requests .", + do_bootp + ? ((ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP") : "", + (do_bootp && do_rarp) ? " and " : "", + do_rarp ? "RARP" : ""); + + start_jiffies = jiffies; + d = ic_first_dev; + retries = CONF_SEND_RETRIES; + get_random_bytes(&timeout, sizeof(timeout)); + timeout = CONF_BASE_TIMEOUT + (timeout % (unsigned) CONF_TIMEOUT_RANDOM); + for(;;) { +#ifdef IPCONFIG_BOOTP + if (do_bootp && (d->able & IC_BOOTP)) + ic_bootp_send_if(d, jiffies - start_jiffies); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp && (d->able & IC_RARP)) + ic_rarp_send_if(d); +#endif + + jiff = jiffies + (d->next ? CONF_INTER_TIMEOUT : timeout); + while (time_before(jiffies, jiff) && !ic_got_reply) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(1); + } +#ifdef IPCONFIG_DHCP + /* DHCP isn't done until we get a DHCPACK. */ + if ((ic_got_reply & IC_BOOTP) + && (ic_proto_enabled & IC_USE_DHCP) + && ic_dhcp_msgtype != DHCPACK) + { + ic_got_reply = 0; + printk(","); + continue; + } +#endif /* IPCONFIG_DHCP */ + + if (ic_got_reply) { + printk(" OK\n"); + break; + } + + if ((d = d->next)) + continue; + + if (! 
--retries) { + printk(" timed out!\n"); + break; + } + + d = ic_first_dev; + + timeout = timeout CONF_TIMEOUT_MULT; + if (timeout > CONF_TIMEOUT_MAX) + timeout = CONF_TIMEOUT_MAX; + + printk("."); + } + +#ifdef IPCONFIG_BOOTP + if (do_bootp) + ic_bootp_cleanup(); +#endif +#ifdef IPCONFIG_RARP + if (do_rarp) + ic_rarp_cleanup(); +#endif + + if (!ic_got_reply) + return -1; + + printk("IP-Config: Got %s answer from %u.%u.%u.%u, ", + ((ic_got_reply & IC_RARP) ? "RARP" + : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"), + NIPQUAD(ic_servaddr)); + printk("my address is %u.%u.%u.%u\n", NIPQUAD(ic_myaddr)); + + return 0; +} + +#endif /* IPCONFIG_DYNAMIC */ + +#ifdef CONFIG_PROC_FS + +static int pnp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + if (ic_proto_used & IC_PROTO) + seq_printf(seq, "#PROTO: %s\n", + (ic_proto_used & IC_RARP) ? "RARP" + : (ic_proto_used & IC_USE_DHCP) ? "DHCP" : "BOOTP"); + else + seq_puts(seq, "#MANUAL\n"); + + if (ic_domain[0]) + seq_printf(seq, + "domain %s\n", ic_domain); + for (i = 0; i < CONF_NAMESERVERS_MAX; i++) { + if (ic_nameservers[i] != INADDR_NONE) + seq_printf(seq, + "nameserver %u.%u.%u.%u\n", + NIPQUAD(ic_nameservers[i])); + } + if (ic_servaddr != INADDR_NONE) + seq_printf(seq, + "bootserver %u.%u.%u.%u\n", + NIPQUAD(ic_servaddr)); + return 0; +} + +static int pnp_seq_open(struct inode *indoe, struct file *file) +{ + return single_open(file, pnp_seq_show, NULL); +} + +static struct file_operations pnp_seq_fops = { + .owner = THIS_MODULE, + .open = pnp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +/* + * Extract IP address from the parameter string if needed. Note that we + * need to have root_server_addr set _before_ IPConfig gets called as it + * can override it. + */ +u32 __init root_nfs_parse_addr(char *name) +{ + u32 addr; + int octets = 0; + char *cp, *cq; + + cp = cq = name; + while (octets < 4) { + while (*cp >= '0' && *cp <= '9') + cp++; + if (cp == cq || cp - cq > 3) + break; + if (*cp == '.' || octets == 3) + octets++; + if (octets < 4) + cp++; + cq = cp; + } + if (octets == 4 && (*cp == ':' || *cp == '\0')) { + if (*cp == ':') + *cp++ = '\0'; + addr = in_aton(name); + memmove(name, cp, strlen(cp) + 1); + } else + addr = INADDR_NONE; + + return addr; +} + +/* + * IP Autoconfig dispatcher. + */ + +static int __init ip_auto_config(void) +{ + u32 addr; + +#ifdef CONFIG_PROC_FS + proc_net_fops_create("pnp", S_IRUGO, &pnp_seq_fops); +#endif /* CONFIG_PROC_FS */ + + if (!ic_enable) + return 0; + + DBG(("IP-Config: Entered.\n")); +#ifdef IPCONFIG_DYNAMIC + try_try_again: +#endif + /* Give hardware a chance to settle */ + msleep(CONF_PRE_OPEN); + + /* Setup all network devices */ + if (ic_open_devs() < 0) + return -1; + + /* Give drivers a chance to settle */ + ssleep(CONF_POST_OPEN); + + /* + * If the config information is insufficient (e.g., our IP address or + * IP address of the boot server is missing or we have multiple network + * interfaces and no default was set), use BOOTP or RARP to get the + * missing values. + */ + if (ic_myaddr == INADDR_NONE || +#ifdef CONFIG_ROOT_NFS + (MAJOR(ROOT_DEV) == UNNAMED_MAJOR + && root_server_addr == INADDR_NONE + && ic_servaddr == INADDR_NONE) || +#endif + ic_first_dev->next) { +#ifdef IPCONFIG_DYNAMIC + + int retries = CONF_OPEN_RETRIES; + + if (ic_dynamic() < 0) { + ic_close_devs(); + + /* + * I don't know why, but sometimes the + * eepro100 driver (at least) gets upset and + * doesn't work the first time it's opened. 
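/*
 * pnp_seq_show() above exports the autoconfig results in a resolv.conf-like
 * format; a minimal consumer only needs to scan /proc/net/pnp for the
 * "nameserver" lines. Purely illustrative: the file exists only when IP
 * autoconfiguration actually ran.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/pnp", "r");

	if (!f) {
		perror("/proc/net/pnp");
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "nameserver ", 11))
			printf("DNS server: %s", line + 11);
	}
	fclose(f);
	return 0;
}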
+ * But then if you close it and reopen it, it + * works just fine. So we need to try that at + * least once before giving up. + * + * Also, if the root will be NFS-mounted, we + * have nowhere to go if DHCP fails. So we + * just have to keep trying forever. + * + * -- Chip + */ +#ifdef CONFIG_ROOT_NFS + if (ROOT_DEV == Root_NFS) { + printk(KERN_ERR + "IP-Config: Retrying forever (NFS root)...\n"); + goto try_try_again; + } +#endif + + if (--retries) { + printk(KERN_ERR + "IP-Config: Reopening network devices...\n"); + goto try_try_again; + } + + /* Oh, well. At least we tried. */ + printk(KERN_ERR "IP-Config: Auto-configuration of network failed.\n"); + return -1; + } +#else /* !DYNAMIC */ + printk(KERN_ERR "IP-Config: Incomplete network configuration information.\n"); + ic_close_devs(); + return -1; +#endif /* IPCONFIG_DYNAMIC */ + } else { + /* Device selected manually or only one device -> use it */ + ic_dev = ic_first_dev->dev; + } + + addr = root_nfs_parse_addr(root_server_path); + if (root_server_addr == INADDR_NONE) + root_server_addr = addr; + + /* + * Use defaults whereever applicable. + */ + if (ic_defaults() < 0) + return -1; + + /* + * Close all network devices except the device we've + * autoconfigured and set up routes. + */ + ic_close_devs(); + if (ic_setup_if() < 0 || ic_setup_routes() < 0) + return -1; + + /* + * Record which protocol was actually used. + */ +#ifdef IPCONFIG_DYNAMIC + ic_proto_used = ic_got_reply | (ic_proto_enabled & IC_USE_DHCP); +#endif + +#ifndef IPCONFIG_SILENT + /* + * Clue in the operator. + */ + printk("IP-Config: Complete:"); + printk("\n device=%s", ic_dev->name); + printk(", addr=%u.%u.%u.%u", NIPQUAD(ic_myaddr)); + printk(", mask=%u.%u.%u.%u", NIPQUAD(ic_netmask)); + printk(", gw=%u.%u.%u.%u", NIPQUAD(ic_gateway)); + printk(",\n host=%s, domain=%s, nis-domain=%s", + system_utsname.nodename, ic_domain, system_utsname.domainname); + printk(",\n bootserver=%u.%u.%u.%u", NIPQUAD(ic_servaddr)); + printk(", rootserver=%u.%u.%u.%u", NIPQUAD(root_server_addr)); + printk(", rootpath=%s", root_server_path); + printk("\n"); +#endif /* !SILENT */ + + return 0; +} + +late_initcall(ip_auto_config); + + +/* + * Decode any IP configuration options in the "ip=" or "nfsaddrs=" kernel + * command line parameter. 
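+ * (For a concrete, purely illustrative example with made-up addresses, a
+ * fully specified value would be
+ *
+ *	ip=10.0.0.2:10.0.0.1:10.0.0.254:255.255.255.0:client:eth0:off
+ *
+ * i.e. client address, boot server, gateway, netmask, host name, device
+ * and autoconfiguration method, in that order.)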
It consists of option fields separated by colons in + * the following order: + * + * :::::: + * + * Any of the fields can be empty which means to use a default value: + * - address given by BOOTP or RARP + * - address of host returning BOOTP or RARP packet + * - none, or the address returned by BOOTP + * - automatically determined from , or the + * one returned by BOOTP + * - in ASCII notation, or the name returned + * by BOOTP + * - use all available devices + * : + * off|none - don't do autoconfig at all (DEFAULT) + * on|any - use any configured protocol + * dhcp|bootp|rarp - use only the specified protocol + * both - use both BOOTP and RARP (not DHCP) + */ +static int __init ic_proto_name(char *name) +{ + if (!strcmp(name, "on") || !strcmp(name, "any")) { + return 1; + } +#ifdef CONFIG_IP_PNP_DHCP + else if (!strcmp(name, "dhcp")) { + ic_proto_enabled &= ~IC_RARP; + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_BOOTP + else if (!strcmp(name, "bootp")) { + ic_proto_enabled &= ~(IC_RARP | IC_USE_DHCP); + return 1; + } +#endif +#ifdef CONFIG_IP_PNP_RARP + else if (!strcmp(name, "rarp")) { + ic_proto_enabled &= ~(IC_BOOTP | IC_USE_DHCP); + return 1; + } +#endif +#ifdef IPCONFIG_DYNAMIC + else if (!strcmp(name, "both")) { + ic_proto_enabled &= ~IC_USE_DHCP; /* backward compat :-( */ + return 1; + } +#endif + return 0; +} + +static int __init ip_auto_config_setup(char *addrs) +{ + char *cp, *ip, *dp; + int num = 0; + + ic_set_manually = 1; + + ic_enable = (*addrs && + (strcmp(addrs, "off") != 0) && + (strcmp(addrs, "none") != 0)); + if (!ic_enable) + return 1; + + if (ic_proto_name(addrs)) + return 1; + + /* Parse the whole string */ + ip = addrs; + while (ip && *ip) { + if ((cp = strchr(ip, ':'))) + *cp++ = '\0'; + if (strlen(ip) > 0) { + DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip)); + switch (num) { + case 0: + if ((ic_myaddr = in_aton(ip)) == INADDR_ANY) + ic_myaddr = INADDR_NONE; + break; + case 1: + if ((ic_servaddr = in_aton(ip)) == INADDR_ANY) + ic_servaddr = INADDR_NONE; + break; + case 2: + if ((ic_gateway = in_aton(ip)) == INADDR_ANY) + ic_gateway = INADDR_NONE; + break; + case 3: + if ((ic_netmask = in_aton(ip)) == INADDR_ANY) + ic_netmask = INADDR_NONE; + break; + case 4: + if ((dp = strchr(ip, '.'))) { + *dp++ = '\0'; + strlcpy(system_utsname.domainname, dp, + sizeof(system_utsname.domainname)); + } + strlcpy(system_utsname.nodename, ip, + sizeof(system_utsname.nodename)); + ic_host_name_set = 1; + break; + case 5: + strlcpy(user_dev_name, ip, sizeof(user_dev_name)); + break; + case 6: + ic_proto_name(ip); + break; + } + } + ip = cp; + num++; + } + + return 1; +} + +static int __init nfsaddrs_config_setup(char *addrs) +{ + return ip_auto_config_setup(addrs); +} + +__setup("ip=", ip_auto_config_setup); +__setup("nfsaddrs=", nfsaddrs_config_setup); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c new file mode 100644 index 000000000000..68a78731f722 --- /dev/null +++ b/net/ipv4/ipip.c @@ -0,0 +1,905 @@ +/* + * Linux NET3: IP/IP protocol decoder. + * + * Version: $Id: ipip.c,v 1.50 2001/10/02 02:22:36 davem Exp $ + * + * Authors: + * Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + * + * Fixes: + * Alan Cox : Merged and made usable non modular (its so tiny its silly as + * a module taking up 2 pages). + * Alan Cox : Fixed bug with 1.3.18 and IPIP not working (now needs to set skb->h.iph) + * to keep ip_forward happy. + * Alan Cox : More fixes for 1.3.21, and firewall fix. Maybe this will work soon 8). 
+ * Kai Schulte : Fixed #defines for IP_FIREWALL->FIREWALL + * David Woodhouse : Perform some basic ICMP handling. + * IPIP Routing without decapsulation. + * Carlos Picoto : GRE over IP support + * Alexey Kuznetsov: Reworked. Really, now it is truncated version of ipv4/ip_gre.c. + * I do not want to merge them together. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +/* tunnel.c: an IP tunnel driver + + The purpose of this driver is to provide an IP tunnel through + which you can tunnel network traffic transparently across subnets. + + This was written by looking at Nick Holloway's dummy driver + Thanks for the great code! + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/01/95 + + Minor tweaks: + Cleaned up the code a little and added some pre-1.3.0 tweaks. + dev->hard_header/hard_header_len changed to use no headers. + Comments/bracketing tweaked. + Made the tunnels use dev->name not tunnel: when error reporting. + Added tx_dropped stat + + -Alan Cox (Alan.Cox@linux.org) 21 March 95 + + Reworked: + Changed to tunnel to destination gateway in addition to the + tunnel's pointopoint address + Almost completely rewritten + Note: There is currently no firewall or ICMP handling done. + + -Sam Lantinga (slouken@cs.ucdavis.edu) 02/13/96 + +*/ + +/* Things I wish I had known when writing the tunnel driver: + + When the tunnel_xmit() function is called, the skb contains the + packet to be sent (plus a great deal of extra info), and dev + contains the tunnel device that _we_ are. + + When we are passed a packet, we are expected to fill in the + source address with our source IP address. + + What is the proper way to allocate, copy and free a buffer? + After you allocate it, it is a "0 length" chunk of memory + starting at zero. If you want to add headers to the buffer + later, you'll have to call "skb_reserve(skb, amount)" with + the amount of memory you want reserved. Then, you call + "skb_put(skb, amount)" with the amount of space you want in + the buffer. skb_put() returns a pointer to the top (#0) of + that buffer. skb->len is set to the amount of space you have + "allocated" with skb_put(). You can then write up to skb->len + bytes to that buffer. If you need more, you can call skb_put() + again with the additional amount of space you need. You can + find out how much more space you can allocate by calling + "skb_tailroom(skb)". + Now, to add header space, call "skb_push(skb, header_len)". + This creates space at the beginning of the buffer and returns + a pointer to this new space. If later you need to strip a + header from a buffer, call "skb_pull(skb, header_len)". + skb_headroom() will return how much space is left at the top + of the buffer (before the main data). Remember, this headroom + space must be reserved before the skb_put() function is called. 
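+
+   To make that concrete, a minimal and purely illustrative allocation of a
+   buffer carrying "len" bytes of payload that later gains an IP header
+   could look like this; alloc_skb(), skb_reserve(), skb_put() and
+   skb_push() are the real helpers, "len" and "data" are placeholders:
+
+	skb = alloc_skb(sizeof(struct iphdr) + len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+	skb_reserve(skb, sizeof(struct iphdr));	  reserve headroom first
+	memcpy(skb_put(skb, len), data, len);	  payload grows the tail
+	iph = (struct iphdr *)skb_push(skb, sizeof(struct iphdr));
+						  the header fills the headroom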
+ */ + +/* + This version of net/ipv4/ipip.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip_fb_tunnel_init(struct net_device *dev); +static int ipip_tunnel_init(struct net_device *dev); +static void ipip_tunnel_setup(struct net_device *dev); + +static struct net_device *ipip_fb_tunnel_dev; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static DEFINE_RWLOCK(ipip_lock); + +static struct ip_tunnel * ipip_tunnel_lookup(u32 remote, u32 local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel **ipip_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &tunnels[prio][h]; +} + + +static void ipip_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipip_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ipip_lock); + *tp = t->next; + write_unlock_bh(&ipip_lock); + break; + } + } +} + +static void ipip_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipip_bucket(t); + + t->next = *tp; + write_lock_bh(&ipip_lock); + *tp = t; + write_unlock_bh(&ipip_lock); +} + +static struct ip_tunnel * ipip_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct net_device *dev; + unsigned h = 0; + int prio = 0; + char name[IFNAMSIZ]; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + return NULL; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + int i; + for (i=1; i<100; i++) { + sprintf(name, "tunl%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i==100) + goto failed; + } + + dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); + if (dev == NULL) + return NULL; + + nt = dev->priv; + SET_MODULE_OWNER(dev); + dev->init = ipip_tunnel_init; + nt->parms = *parms; + + if (register_netdevice(dev) < 0) { + free_netdev(dev); + goto failed; + } + + dev_hold(dev); + ipip_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. 
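+	   For reference, the bucket chosen above (and in ipip_bucket()) is
+	   purely a function of which endpoints are set; in shorthand,
+	   illustrative only:
+
+		prio = (remote ? 2 : 0) | (local ? 1 : 0);
+		h    = (remote ? HASH(remote) : 0) ^ (local ? HASH(local) : 0);
+		head = tunnels[prio][h];	0=wildcard, 1=local, 2=remote, 3=both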
*/ + return nt; + +failed: + return NULL; +} + +static void ipip_tunnel_uninit(struct net_device *dev) +{ + if (dev == ipip_fb_tunnel_dev) { + write_lock_bh(&ipip_lock); + tunnels_wc[0] = NULL; + write_unlock_bh(&ipip_lock); + } else + ipip_tunnel_unlink((struct ip_tunnel*)dev->priv); + dev_put(dev); +} + +static void ipip_err(struct sk_buff *skb, void *__unused) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)skb->data; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + read_lock(&ipip_lock); + t = ipip_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + read_unlock(&ipip_lock); + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct iphdr *eiph; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct flowi fl; + struct rtable *rt; + + if (len < hlen + sizeof(struct iphdr)) + return; + eiph = (struct iphdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMP_PARAMETERPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* And it is the only really necessary thing :-) */ + rel_info = ntohs(skb->h.icmph->un.frag.mtu); + if (rel_info < hlen+68) + return; + rel_info -= hlen; + /* BSD 4.2 MORE DOES NOT EXIST IN NATURE. */ + if (rel_info > ntohs(eiph->tot_len)) + return; + break; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + /* Prepare fake skb to feed it to icmp_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)eiph); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + memset(&fl, 0, sizeof(fl)); + fl.fl4_daddr = eiph->saddr; + fl.fl4_tos = RT_TOS(eiph->tos); + fl.proto = IPPROTO_IPIP; + if (ip_route_output_key(&rt, &key)) { + kfree_skb(skb2); + return; + } + skb2->dev = rt->u.dst.dev; + + /* route "incoming" packet */ + if (rt->rt_flags&RTCF_LOCAL) { + ip_rt_put(rt); + rt = NULL; + fl.fl4_daddr = eiph->daddr; + fl.fl4_src = eiph->saddr; + fl.fl4_tos = eiph->tos; + if (ip_route_output_key(&rt, &fl) || + rt->u.dst.dev->type != ARPHRD_TUNNEL) { + ip_rt_put(rt); + kfree_skb(skb2); + return; + } + } else { + ip_rt_put(rt); + if (ip_route_input(skb2, eiph->daddr, eiph->saddr, eiph->tos, skb2->dev) || + skb2->dst->dev->type != ARPHRD_TUNNEL) { + kfree_skb(skb2); + return; + } + } + + /* change mtu on this route */ + if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) { + if (rel_info > dst_mtu(skb2->dst)) { + kfree_skb(skb2); + return; + } + skb2->dst->ops->update_pmtu(skb2->dst, rel_info); + rel_info = htonl(rel_info); + } else if (type == ICMP_TIME_EXCEEDED) { + struct ip_tunnel *t = (struct ip_tunnel*)skb2->dev->priv; + if (t->parms.iph.ttl) { + rel_type = ICMP_DEST_UNREACH; + rel_code = ICMP_HOST_UNREACH; + } + } + + icmp_send(skb2, rel_type, rel_code, rel_info); + kfree_skb(skb2); + return; +#endif +} + +static inline void ipip_ecn_decapsulate(struct iphdr *outer_iph, struct sk_buff *skb) +{ + struct iphdr *inner_iph = skb->nh.iph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int ipip_rcv(struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + iph = skb->nh.iph; + + read_lock(&ipip_lock); + if ((tunnel = ipip_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + read_unlock(&ipip_lock); + kfree_skb(skb); + return 0; + } + + secpath_reset(skb); + + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->pkt_type = PACKET_HOST; + + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); + ipip_ecn_decapsulate(iph, skb); + netif_rx(skb); + read_unlock(&ipip_lock); + return 0; + } + read_unlock(&ipip_lock); + +out: + return -1; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
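+ *
+ *	The tunnel costs one extra IPv4 header per packet, so the usable MTU
+ *	is the underlying path MTU minus sizeof(struct iphdr).  A worked
+ *	example with the common Ethernet value:
+ *
+ *		path MTU			1500
+ *		- sizeof(struct iphdr)		  20
+ *		= inner packet limit		1480	(also the default dev->mtu
+ *							 set in ipip_tunnel_setup())
+ *
+ *	Inner packets with DF set that exceed this limit are bounced back
+ *	with ICMP_FRAG_NEEDED carrying the reduced MTU, as the code below does.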
+ */ + +static int ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + u8 tos = tunnel->parms.iph.tos; + u16 df = tiph->frag_off; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != htons(ETH_P_IP)) + goto tx_error; + + if (tos&1) + tos = old_iph->tos; + + if (!dst) { + /* NBMA tunnel */ + if ((rt = (struct rtable*)skb->dst) == NULL) { + tunnel->stat.tx_fifo_errors++; + goto tx_error; + } + if ((dst = rt->rt_gateway) == 0) + goto tx_error_icmp; + } + + { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + if (tiph->frag_off) + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + else + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off&htons(IP_DF)); + + if ((old_iph->frag_off&htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = (LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr)); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->h.raw = skb->nh.raw; + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = INET_ECN_encapsulate(tos, old_iph->tos); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = old_iph->ttl; + + nf_reset(skb); + + IPTUNNEL_XMIT(); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipip_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipip_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPIP || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != ipip_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = (struct ip_tunnel*)dev->priv; + ipip_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip_tunnel_link(t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + t->parms.iph.frag_off = p.iph.frag_off; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipip_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t->dev == ipip_fb_tunnel_dev) + goto done; + dev = t->dev; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static struct net_device_stats *ipip_tunnel_get_stats(struct net_device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < 68 || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip_tunnel_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ipip_tunnel_uninit; + dev->hard_start_xmit = ipip_tunnel_xmit; + dev->get_stats = ipip_tunnel_get_stats; + dev->do_ioctl = ipip_tunnel_ioctl; + dev->change_mtu = ipip_tunnel_change_mtu; + dev->destructor = free_netdev; + + dev->type = ARPHRD_TUNNEL; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; +} + +static int ipip_tunnel_init(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + if (iph->daddr) { + struct flowi fl = { .oif = tunnel->parms.link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + struct rtable *rt; + if (!ip_route_output_key(&rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +static int __init ipip_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = dev->priv; + struct iphdr *iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + + dev_hold(dev); + tunnels_wc[0] = tunnel; + return 0; +} + +static struct xfrm_tunnel ipip_handler = { + .handler = ipip_rcv, + .err_handler = ipip_err, +}; + +static char banner[] __initdata = + KERN_INFO "IPv4 over IPv4 tunneling driver\n"; + +static int __init ipip_init(void) +{ + int err; + + printk(banner); + + if (xfrm4_tunnel_register(&ipip_handler) < 0) { + printk(KERN_INFO "ipip init: can't register tunnel\n"); + return -EAGAIN; + } + + ipip_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), + "tunl0", + ipip_tunnel_setup); + if (!ipip_fb_tunnel_dev) { + err = -ENOMEM; + goto err1; + } + + ipip_fb_tunnel_dev->init = ipip_fb_tunnel_init; + + if ((err = register_netdev(ipip_fb_tunnel_dev))) + goto err2; + out: + return err; + err2: + free_netdev(ipip_fb_tunnel_dev); + err1: + xfrm4_tunnel_deregister(&ipip_handler); + goto out; +} + +static void __exit ipip_fini(void) +{ + 
if (xfrm4_tunnel_deregister(&ipip_handler) < 0) + printk(KERN_INFO "ipip close: can't deregister tunnel\n"); + + unregister_netdev(ipip_fb_tunnel_dev); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c new file mode 100644 index 000000000000..e21c049ec62a --- /dev/null +++ b/net/ipv4/ipmr.c @@ -0,0 +1,1900 @@ +/* + * IP multicast routing support for mrouted 3.6/3.8 + * + * (c) 1995 Alan Cox, + * Linux Consultancy and Custom Driver Development + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $ + * + * Fixes: + * Michael Chastain : Incorrect size of copying. + * Alan Cox : Added the cache manager code + * Alan Cox : Fixed the clone/copy bug and device race. + * Mike McLagan : Routing by source + * Malcolm Beattie : Buffer handling fixes. + * Alexey Kuznetsov : Double buffer free and other fixes. + * SVR Anand : Fixed several multicast bugs and problems. + * Alexey Kuznetsov : Status, optimisations and more. + * Brad Parker : Better behaviour on mrouted upcall + * overflow. + * Carlos Picoto : PIMv1 Support + * Pavlin Ivanov Radoslavov: PIMv2 Registers must checksum only PIM header + * Relax this requrement to work with older peers. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2) +#define CONFIG_IP_PIMSM 1 +#endif + +static struct sock *mroute_socket; + + +/* Big lock, protecting vif table, mrt cache and mroute socket state. + Note that the changes are semaphored via rtnl_lock. + */ + +static DEFINE_RWLOCK(mrt_lock); + +/* + * Multicast router control variables + */ + +static struct vif_device vif_table[MAXVIFS]; /* Devices */ +static int maxvif; + +#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL) + +static int mroute_do_assert; /* Set in PIM assert */ +static int mroute_do_pim; + +static struct mfc_cache *mfc_cache_array[MFC_LINES]; /* Forwarding cache */ + +static struct mfc_cache *mfc_unres_queue; /* Queue of unresolved entries */ +static atomic_t cache_resolve_queue_len; /* Size of unresolved */ + +/* Special spinlock for queue of unresolved entries */ +static DEFINE_SPINLOCK(mfc_unres_lock); + +/* We return to original Alan's scheme. Hash table of resolved + entries is changed only in process context and protected + with weak lock mrt_lock. Queue of unresolved entries is protected + with strong spinlock mfc_unres_lock. + + In this case data path is free of exclusive locks at all. 
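+
+   In other words, an illustrative sketch of the three patterns used by the
+   functions below:
+
+	read_lock(&mrt_lock);			data path, may run in softirq
+	c = ipmr_cache_find(origin, mcastgrp);
+	... forward using c ...
+	read_unlock(&mrt_lock);
+
+	write_lock_bh(&mrt_lock);		process context: table updates
+	... insert or delete a resolved entry ...
+	write_unlock_bh(&mrt_lock);
+
+	spin_lock_bh(&mfc_unres_lock);		unresolved queue, either context
+	... queue, resolve or expire an unresolved entry ...
+	spin_unlock_bh(&mfc_unres_lock);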
+ */ + +static kmem_cache_t *mrt_cachep; + +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local); +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert); +static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm); + +#ifdef CONFIG_IP_PIMSM_V2 +static struct net_protocol pim_protocol; +#endif + +static struct timer_list ipmr_expire_timer; + +/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */ + +static +struct net_device *ipmr_new_tunnel(struct vifctl *v) +{ + struct net_device *dev; + + dev = __dev_get_by_name("tunl0"); + + if (dev) { + int err; + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + struct in_device *in_dev; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = v->vifc_rmt_addr.s_addr; + p.iph.saddr = v->vifc_lcl_addr.s_addr; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPIP; + sprintf(p.name, "dvmrp%d", v->vifc_vifi); + ifr.ifr_ifru.ifru_data = (void*)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + dev = NULL; + + if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) { + dev->flags |= IFF_MULTICAST; + + in_dev = __in_dev_get(dev); + if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL) + goto failure; + in_dev->cnf.rp_filter = 0; + + if (dev_open(dev)) + goto failure; + } + } + return dev; + +failure: + /* allow the register to be completed before unregistering. */ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} + +#ifdef CONFIG_IP_PIMSM + +static int reg_vif_num = -1; + +static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev) +{ + read_lock(&mrt_lock); + ((struct net_device_stats*)dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)dev->priv)->tx_packets++; + ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT); + read_unlock(&mrt_lock); + kfree_skb(skb); + return 0; +} + +static struct net_device_stats *reg_vif_get_stats(struct net_device *dev) +{ + return (struct net_device_stats*)dev->priv; +} + +static void reg_vif_setup(struct net_device *dev) +{ + dev->type = ARPHRD_PIMREG; + dev->mtu = 1500 - sizeof(struct iphdr) - 8; + dev->flags = IFF_NOARP; + dev->hard_start_xmit = reg_vif_xmit; + dev->get_stats = reg_vif_get_stats; + dev->destructor = free_netdev; +} + +static struct net_device *ipmr_reg_vif(void) +{ + struct net_device *dev; + struct in_device *in_dev; + + dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg", + reg_vif_setup); + + if (dev == NULL) + return NULL; + + if (register_netdevice(dev)) { + free_netdev(dev); + return NULL; + } + dev->iflink = 0; + + if ((in_dev = inetdev_init(dev)) == NULL) + goto failure; + + in_dev->cnf.rp_filter = 0; + + if (dev_open(dev)) + goto failure; + + return dev; + +failure: + /* allow the register to be completed before unregistering. 
*/ + rtnl_unlock(); + rtnl_lock(); + + unregister_netdevice(dev); + return NULL; +} +#endif + +/* + * Delete a VIF entry + */ + +static int vif_delete(int vifi) +{ + struct vif_device *v; + struct net_device *dev; + struct in_device *in_dev; + + if (vifi < 0 || vifi >= maxvif) + return -EADDRNOTAVAIL; + + v = &vif_table[vifi]; + + write_lock_bh(&mrt_lock); + dev = v->dev; + v->dev = NULL; + + if (!dev) { + write_unlock_bh(&mrt_lock); + return -EADDRNOTAVAIL; + } + +#ifdef CONFIG_IP_PIMSM + if (vifi == reg_vif_num) + reg_vif_num = -1; +#endif + + if (vifi+1 == maxvif) { + int tmp; + for (tmp=vifi-1; tmp>=0; tmp--) { + if (VIF_EXISTS(tmp)) + break; + } + maxvif = tmp+1; + } + + write_unlock_bh(&mrt_lock); + + dev_set_allmulti(dev, -1); + + if ((in_dev = __in_dev_get(dev)) != NULL) { + in_dev->cnf.mc_forwarding--; + ip_rt_multicast_event(in_dev); + } + + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + unregister_netdevice(dev); + + dev_put(dev); + return 0; +} + +/* Destroy an unresolved cache entry, killing queued skbs + and reporting error to netlink readers. + */ + +static void ipmr_destroy_unres(struct mfc_cache *c) +{ + struct sk_buff *skb; + + atomic_dec(&cache_resolve_queue_len); + + while((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) { + if (skb->nh.iph->version == 0) { + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -ETIMEDOUT; + netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); + } else + kfree_skb(skb); + } + + kmem_cache_free(mrt_cachep, c); +} + + +/* Single timer process for all the unresolved queue. */ + +static void ipmr_expire_process(unsigned long dummy) +{ + unsigned long now; + unsigned long expires; + struct mfc_cache *c, **cp; + + if (!spin_trylock(&mfc_unres_lock)) { + mod_timer(&ipmr_expire_timer, jiffies+HZ/10); + return; + } + + if (atomic_read(&cache_resolve_queue_len) == 0) + goto out; + + now = jiffies; + expires = 10*HZ; + cp = &mfc_unres_queue; + + while ((c=*cp) != NULL) { + if (time_after(c->mfc_un.unres.expires, now)) { + unsigned long interval = c->mfc_un.unres.expires - now; + if (interval < expires) + expires = interval; + cp = &c->next; + continue; + } + + *cp = c->next; + + ipmr_destroy_unres(c); + } + + if (atomic_read(&cache_resolve_queue_len)) + mod_timer(&ipmr_expire_timer, jiffies + expires); + +out: + spin_unlock(&mfc_unres_lock); +} + +/* Fill oifs list. It is called under write locked mrt_lock. */ + +static void ipmr_update_threshoulds(struct mfc_cache *cache, unsigned char *ttls) +{ + int vifi; + + cache->mfc_un.res.minvif = MAXVIFS; + cache->mfc_un.res.maxvif = 0; + memset(cache->mfc_un.res.ttls, 255, MAXVIFS); + + for (vifi=0; vifimfc_un.res.ttls[vifi] = ttls[vifi]; + if (cache->mfc_un.res.minvif > vifi) + cache->mfc_un.res.minvif = vifi; + if (cache->mfc_un.res.maxvif <= vifi) + cache->mfc_un.res.maxvif = vifi + 1; + } + } +} + +static int vif_add(struct vifctl *vifc, int mrtsock) +{ + int vifi = vifc->vifc_vifi; + struct vif_device *v = &vif_table[vifi]; + struct net_device *dev; + struct in_device *in_dev; + + /* Is vif busy ? 
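+	   (A slot is normally claimed from user space; a purely illustrative
+	   call, with a made-up local address and "fd" being the raw IGMP
+	   socket that issued MRT_INIT, would be:
+
+		struct vifctl vc;
+		memset(&vc, 0, sizeof(vc));
+		vc.vifc_vifi = 0;
+		vc.vifc_threshold = 1;
+		vc.vifc_lcl_addr.s_addr = inet_addr("10.0.0.2");
+		setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
+
+	   which ends up in the flags switch below.)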
*/ + if (VIF_EXISTS(vifi)) + return -EADDRINUSE; + + switch (vifc->vifc_flags) { +#ifdef CONFIG_IP_PIMSM + case VIFF_REGISTER: + /* + * Special Purpose VIF in PIM + * All the packets will be sent to the daemon + */ + if (reg_vif_num >= 0) + return -EADDRINUSE; + dev = ipmr_reg_vif(); + if (!dev) + return -ENOBUFS; + break; +#endif + case VIFF_TUNNEL: + dev = ipmr_new_tunnel(vifc); + if (!dev) + return -ENOBUFS; + break; + case 0: + dev=ip_dev_find(vifc->vifc_lcl_addr.s_addr); + if (!dev) + return -EADDRNOTAVAIL; + __dev_put(dev); + break; + default: + return -EINVAL; + } + + if ((in_dev = __in_dev_get(dev)) == NULL) + return -EADDRNOTAVAIL; + in_dev->cnf.mc_forwarding++; + dev_set_allmulti(dev, +1); + ip_rt_multicast_event(in_dev); + + /* + * Fill in the VIF structures + */ + v->rate_limit=vifc->vifc_rate_limit; + v->local=vifc->vifc_lcl_addr.s_addr; + v->remote=vifc->vifc_rmt_addr.s_addr; + v->flags=vifc->vifc_flags; + if (!mrtsock) + v->flags |= VIFF_STATIC; + v->threshold=vifc->vifc_threshold; + v->bytes_in = 0; + v->bytes_out = 0; + v->pkt_in = 0; + v->pkt_out = 0; + v->link = dev->ifindex; + if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + v->link = dev->iflink; + + /* And finish update writing critical data */ + write_lock_bh(&mrt_lock); + dev_hold(dev); + v->dev=dev; +#ifdef CONFIG_IP_PIMSM + if (v->flags&VIFF_REGISTER) + reg_vif_num = vifi; +#endif + if (vifi+1 > maxvif) + maxvif = vifi+1; + write_unlock_bh(&mrt_lock); + return 0; +} + +static struct mfc_cache *ipmr_cache_find(__u32 origin, __u32 mcastgrp) +{ + int line=MFC_HASH(mcastgrp,origin); + struct mfc_cache *c; + + for (c=mfc_cache_array[line]; c; c = c->next) { + if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp) + break; + } + return c; +} + +/* + * Allocate a multicast cache entry + */ +static struct mfc_cache *ipmr_cache_alloc(void) +{ + struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_KERNEL); + if(c==NULL) + return NULL; + memset(c, 0, sizeof(*c)); + c->mfc_un.res.minvif = MAXVIFS; + return c; +} + +static struct mfc_cache *ipmr_cache_alloc_unres(void) +{ + struct mfc_cache *c=kmem_cache_alloc(mrt_cachep, GFP_ATOMIC); + if(c==NULL) + return NULL; + memset(c, 0, sizeof(*c)); + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10*HZ; + return c; +} + +/* + * A cache entry has gone into a resolved state from queued + */ + +static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c) +{ + struct sk_buff *skb; + + /* + * Play the pending entries through our router + */ + + while((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) { + if (skb->nh.iph->version == 0) { + int err; + struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); + + if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) { + nlh->nlmsg_len = skb->tail - (u8*)nlh; + } else { + nlh->nlmsg_type = NLMSG_ERROR; + nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); + skb_trim(skb, nlh->nlmsg_len); + ((struct nlmsgerr*)NLMSG_DATA(nlh))->error = -EMSGSIZE; + } + err = netlink_unicast(rtnl, skb, NETLINK_CB(skb).dst_pid, MSG_DONTWAIT); + } else + ip_mr_forward(skb, c, 0); + } +} + +/* + * Bounce a cache query up to mrouted. We could use netlink for this but mrouted + * expects the following bizarre scheme. + * + * Called under mrt_lock. 
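+ *
+ *	On the mrouted side the upcall simply arrives as a datagram on the
+ *	same raw IGMP socket that issued MRT_INIT; a minimal, illustrative
+ *	consumer would be:
+ *
+ *		n = read(fd, buf, sizeof(buf));
+ *		msg = (struct igmpmsg *)buf;
+ *		if (n > 0 && msg->im_msgtype == IGMPMSG_NOCACHE)
+ *			... resolve (msg->im_src, msg->im_dst), then MRT_ADD_MFC ...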
+ */ + +static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert) +{ + struct sk_buff *skb; + int ihl = pkt->nh.iph->ihl<<2; + struct igmphdr *igmp; + struct igmpmsg *msg; + int ret; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) + skb = skb_realloc_headroom(pkt, sizeof(struct iphdr)); + else +#endif + skb = alloc_skb(128, GFP_ATOMIC); + + if(!skb) + return -ENOBUFS; + +#ifdef CONFIG_IP_PIMSM + if (assert == IGMPMSG_WHOLEPKT) { + /* Ugly, but we have no choice with this interface. + Duplicate old header, fix ihl, length etc. + And all this only to mangle msg->im_msgtype and + to set msg->im_mbz to "mbz" :-) + */ + msg = (struct igmpmsg*)skb_push(skb, sizeof(struct iphdr)); + skb->nh.raw = skb->h.raw = (u8*)msg; + memcpy(msg, pkt->nh.raw, sizeof(struct iphdr)); + msg->im_msgtype = IGMPMSG_WHOLEPKT; + msg->im_mbz = 0; + msg->im_vif = reg_vif_num; + skb->nh.iph->ihl = sizeof(struct iphdr) >> 2; + skb->nh.iph->tot_len = htons(ntohs(pkt->nh.iph->tot_len) + sizeof(struct iphdr)); + } else +#endif + { + + /* + * Copy the IP header + */ + + skb->nh.iph = (struct iphdr *)skb_put(skb, ihl); + memcpy(skb->data,pkt->data,ihl); + skb->nh.iph->protocol = 0; /* Flag to the kernel this is a route add */ + msg = (struct igmpmsg*)skb->nh.iph; + msg->im_vif = vifi; + skb->dst = dst_clone(pkt->dst); + + /* + * Add our header + */ + + igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr)); + igmp->type = + msg->im_msgtype = assert; + igmp->code = 0; + skb->nh.iph->tot_len=htons(skb->len); /* Fix the length */ + skb->h.raw = skb->nh.raw; + } + + if (mroute_socket == NULL) { + kfree_skb(skb); + return -EINVAL; + } + + /* + * Deliver to mrouted + */ + if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) { + if (net_ratelimit()) + printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); + kfree_skb(skb); + } + + return ret; +} + +/* + * Queue a packet for resolution. It gets locked cache entry! + */ + +static int +ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb) +{ + int err; + struct mfc_cache *c; + + spin_lock_bh(&mfc_unres_lock); + for (c=mfc_unres_queue; c; c=c->next) { + if (c->mfc_mcastgrp == skb->nh.iph->daddr && + c->mfc_origin == skb->nh.iph->saddr) + break; + } + + if (c == NULL) { + /* + * Create a new entry if allowable + */ + + if (atomic_read(&cache_resolve_queue_len)>=10 || + (c=ipmr_cache_alloc_unres())==NULL) { + spin_unlock_bh(&mfc_unres_lock); + + kfree_skb(skb); + return -ENOBUFS; + } + + /* + * Fill in the new cache entry + */ + c->mfc_parent=-1; + c->mfc_origin=skb->nh.iph->saddr; + c->mfc_mcastgrp=skb->nh.iph->daddr; + + /* + * Reflect first query at mrouted. 
+ */ + if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) { + /* If the report failed throw the cache entry + out - Brad Parker + */ + spin_unlock_bh(&mfc_unres_lock); + + kmem_cache_free(mrt_cachep, c); + kfree_skb(skb); + return err; + } + + atomic_inc(&cache_resolve_queue_len); + c->next = mfc_unres_queue; + mfc_unres_queue = c; + + mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires); + } + + /* + * See if we can append the packet + */ + if (c->mfc_un.unres.unresolved.qlen>3) { + kfree_skb(skb); + err = -ENOBUFS; + } else { + skb_queue_tail(&c->mfc_un.unres.unresolved,skb); + err = 0; + } + + spin_unlock_bh(&mfc_unres_lock); + return err; +} + +/* + * MFC cache manipulation by user space mroute daemon + */ + +static int ipmr_mfc_delete(struct mfcctl *mfc) +{ + int line; + struct mfc_cache *c, **cp; + + line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + + for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + if (c->mfc_origin == mfc->mfcc_origin.s_addr && + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { + write_lock_bh(&mrt_lock); + *cp = c->next; + write_unlock_bh(&mrt_lock); + + kmem_cache_free(mrt_cachep, c); + return 0; + } + } + return -ENOENT; +} + +static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock) +{ + int line; + struct mfc_cache *uc, *c, **cp; + + line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr); + + for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) { + if (c->mfc_origin == mfc->mfcc_origin.s_addr && + c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) + break; + } + + if (c != NULL) { + write_lock_bh(&mrt_lock); + c->mfc_parent = mfc->mfcc_parent; + ipmr_update_threshoulds(c, mfc->mfcc_ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + write_unlock_bh(&mrt_lock); + return 0; + } + + if(!MULTICAST(mfc->mfcc_mcastgrp.s_addr)) + return -EINVAL; + + c=ipmr_cache_alloc(); + if (c==NULL) + return -ENOMEM; + + c->mfc_origin=mfc->mfcc_origin.s_addr; + c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr; + c->mfc_parent=mfc->mfcc_parent; + ipmr_update_threshoulds(c, mfc->mfcc_ttls); + if (!mrtsock) + c->mfc_flags |= MFC_STATIC; + + write_lock_bh(&mrt_lock); + c->next = mfc_cache_array[line]; + mfc_cache_array[line] = c; + write_unlock_bh(&mrt_lock); + + /* + * Check to see if we resolved a queued list. If so we + * need to send on the frames and tidy up. 
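+ *
+ *	(The entries themselves come from mrouted/pimd via MRT_ADD_MFC; an
+ *	illustrative call with made-up addresses would be:
+ *
+ *		struct mfcctl mc;
+ *		memset(&mc, 0, sizeof(mc));
+ *		mc.mfcc_origin.s_addr   = inet_addr("10.0.0.2");
+ *		mc.mfcc_mcastgrp.s_addr = inet_addr("239.1.1.1");
+ *		mc.mfcc_parent  = 0;		incoming vif
+ *		mc.mfcc_ttls[1] = 1;		forward on vif 1, ttl threshold 1
+ *		setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
+ *
+ *	and it is exactly such a call that triggers the queue walk below.)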
+ */ + spin_lock_bh(&mfc_unres_lock); + for (cp = &mfc_unres_queue; (uc=*cp) != NULL; + cp = &uc->next) { + if (uc->mfc_origin == c->mfc_origin && + uc->mfc_mcastgrp == c->mfc_mcastgrp) { + *cp = uc->next; + if (atomic_dec_and_test(&cache_resolve_queue_len)) + del_timer(&ipmr_expire_timer); + break; + } + } + spin_unlock_bh(&mfc_unres_lock); + + if (uc) { + ipmr_cache_resolve(uc, c); + kmem_cache_free(mrt_cachep, uc); + } + return 0; +} + +/* + * Close the multicast socket, and clear the vif tables etc + */ + +static void mroute_clean_tables(struct sock *sk) +{ + int i; + + /* + * Shut down all active vif entries + */ + for(i=0; imfc_flags&MFC_STATIC) { + cp = &c->next; + continue; + } + write_lock_bh(&mrt_lock); + *cp = c->next; + write_unlock_bh(&mrt_lock); + + kmem_cache_free(mrt_cachep, c); + } + } + + if (atomic_read(&cache_resolve_queue_len) != 0) { + struct mfc_cache *c; + + spin_lock_bh(&mfc_unres_lock); + while (mfc_unres_queue != NULL) { + c = mfc_unres_queue; + mfc_unres_queue = c->next; + spin_unlock_bh(&mfc_unres_lock); + + ipmr_destroy_unres(c); + + spin_lock_bh(&mfc_unres_lock); + } + spin_unlock_bh(&mfc_unres_lock); + } +} + +static void mrtsock_destruct(struct sock *sk) +{ + rtnl_lock(); + if (sk == mroute_socket) { + ipv4_devconf.mc_forwarding--; + + write_lock_bh(&mrt_lock); + mroute_socket=NULL; + write_unlock_bh(&mrt_lock); + + mroute_clean_tables(sk); + } + rtnl_unlock(); +} + +/* + * Socket options and virtual interface manipulation. The whole + * virtual interface system is a complete heap, but unfortunately + * that's how BSD mrouted happens to think. Maybe one day with a proper + * MOSPF/PIM router set up we can clean this up. + */ + +int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen) +{ + int ret; + struct vifctl vif; + struct mfcctl mfc; + + if(optname!=MRT_INIT) + { + if(sk!=mroute_socket && !capable(CAP_NET_ADMIN)) + return -EACCES; + } + + switch(optname) + { + case MRT_INIT: + if (sk->sk_type != SOCK_RAW || + inet_sk(sk)->num != IPPROTO_IGMP) + return -EOPNOTSUPP; + if(optlen!=sizeof(int)) + return -ENOPROTOOPT; + + rtnl_lock(); + if (mroute_socket) { + rtnl_unlock(); + return -EADDRINUSE; + } + + ret = ip_ra_control(sk, 1, mrtsock_destruct); + if (ret == 0) { + write_lock_bh(&mrt_lock); + mroute_socket=sk; + write_unlock_bh(&mrt_lock); + + ipv4_devconf.mc_forwarding++; + } + rtnl_unlock(); + return ret; + case MRT_DONE: + if (sk!=mroute_socket) + return -EACCES; + return ip_ra_control(sk, 0, NULL); + case MRT_ADD_VIF: + case MRT_DEL_VIF: + if(optlen!=sizeof(vif)) + return -EINVAL; + if (copy_from_user(&vif,optval,sizeof(vif))) + return -EFAULT; + if(vif.vifc_vifi >= MAXVIFS) + return -ENFILE; + rtnl_lock(); + if (optname==MRT_ADD_VIF) { + ret = vif_add(&vif, sk==mroute_socket); + } else { + ret = vif_delete(vif.vifc_vifi); + } + rtnl_unlock(); + return ret; + + /* + * Manipulate the forwarding caches. These live + * in a sort of kernel/user symbiosis. + */ + case MRT_ADD_MFC: + case MRT_DEL_MFC: + if(optlen!=sizeof(mfc)) + return -EINVAL; + if (copy_from_user(&mfc,optval, sizeof(mfc))) + return -EFAULT; + rtnl_lock(); + if (optname==MRT_DEL_MFC) + ret = ipmr_mfc_delete(&mfc); + else + ret = ipmr_mfc_add(&mfc, sk==mroute_socket); + rtnl_unlock(); + return ret; + /* + * Control PIM assert. 
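+ *	   Toggled from user space with, for example (illustrative only,
+ *	   "fd" is the MRT_INIT'ed socket):
+ *
+ *		int v = 1;
+ *		setsockopt(fd, IPPROTO_IP, MRT_ASSERT, &v, sizeof(v));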
+ */ + case MRT_ASSERT: + { + int v; + if(get_user(v,(int __user *)optval)) + return -EFAULT; + mroute_do_assert=(v)?1:0; + return 0; + } +#ifdef CONFIG_IP_PIMSM + case MRT_PIM: + { + int v, ret; + if(get_user(v,(int __user *)optval)) + return -EFAULT; + v = (v)?1:0; + rtnl_lock(); + ret = 0; + if (v != mroute_do_pim) { + mroute_do_pim = v; + mroute_do_assert = v; +#ifdef CONFIG_IP_PIMSM_V2 + if (mroute_do_pim) + ret = inet_add_protocol(&pim_protocol, + IPPROTO_PIM); + else + ret = inet_del_protocol(&pim_protocol, + IPPROTO_PIM); + if (ret < 0) + ret = -EAGAIN; +#endif + } + rtnl_unlock(); + return ret; + } +#endif + /* + * Spurious command, or MRT_VERSION which you cannot + * set. + */ + default: + return -ENOPROTOOPT; + } +} + +/* + * Getsock opt support for the multicast routing system. + */ + +int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen) +{ + int olr; + int val; + + if(optname!=MRT_VERSION && +#ifdef CONFIG_IP_PIMSM + optname!=MRT_PIM && +#endif + optname!=MRT_ASSERT) + return -ENOPROTOOPT; + + if (get_user(olr, optlen)) + return -EFAULT; + + olr = min_t(unsigned int, olr, sizeof(int)); + if (olr < 0) + return -EINVAL; + + if(put_user(olr,optlen)) + return -EFAULT; + if(optname==MRT_VERSION) + val=0x0305; +#ifdef CONFIG_IP_PIMSM + else if(optname==MRT_PIM) + val=mroute_do_pim; +#endif + else + val=mroute_do_assert; + if(copy_to_user(optval,&val,olr)) + return -EFAULT; + return 0; +} + +/* + * The IP multicast ioctl support routines. + */ + +int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) +{ + struct sioc_sg_req sr; + struct sioc_vif_req vr; + struct vif_device *vif; + struct mfc_cache *c; + + switch(cmd) + { + case SIOCGETVIFCNT: + if (copy_from_user(&vr,arg,sizeof(vr))) + return -EFAULT; + if(vr.vifi>=maxvif) + return -EINVAL; + read_lock(&mrt_lock); + vif=&vif_table[vr.vifi]; + if(VIF_EXISTS(vr.vifi)) { + vr.icount=vif->pkt_in; + vr.ocount=vif->pkt_out; + vr.ibytes=vif->bytes_in; + vr.obytes=vif->bytes_out; + read_unlock(&mrt_lock); + + if (copy_to_user(arg,&vr,sizeof(vr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + case SIOCGETSGCNT: + if (copy_from_user(&sr,arg,sizeof(sr))) + return -EFAULT; + + read_lock(&mrt_lock); + c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr); + if (c) { + sr.pktcnt = c->mfc_un.res.pkt; + sr.bytecnt = c->mfc_un.res.bytes; + sr.wrong_if = c->mfc_un.res.wrong_if; + read_unlock(&mrt_lock); + + if (copy_to_user(arg,&sr,sizeof(sr))) + return -EFAULT; + return 0; + } + read_unlock(&mrt_lock); + return -EADDRNOTAVAIL; + default: + return -ENOIOCTLCMD; + } +} + + +static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct vif_device *v; + int ct; + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + v=&vif_table[0]; + for(ct=0;ctdev==ptr) + vif_delete(ct); + } + return NOTIFY_DONE; +} + + +static struct notifier_block ip_mr_notifier={ + .notifier_call = ipmr_device_event, +}; + +/* + * Encapsulate a packet by attaching a valid IPIP header to it. + * This avoids tunnel drivers and other mess and gives us the speed so + * important for multicast video. 
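+ *
+ *	The packet that leaves here is simply
+ *
+ *	[ outer iphdr, 20 bytes, protocol = IPPROTO_IPIP ][ original iphdr ][ payload ]
+ *
+ *	with the outer ttl and tos copied from the inner header, as the code
+ *	below shows.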
+ */ + +static void ip_encap(struct sk_buff *skb, u32 saddr, u32 daddr) +{ + struct iphdr *iph = (struct iphdr *)skb_push(skb,sizeof(struct iphdr)); + + iph->version = 4; + iph->tos = skb->nh.iph->tos; + iph->ttl = skb->nh.iph->ttl; + iph->frag_off = 0; + iph->daddr = daddr; + iph->saddr = saddr; + iph->protocol = IPPROTO_IPIP; + iph->ihl = 5; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, skb->dst, NULL); + ip_send_check(iph); + + skb->h.ipiph = skb->nh.iph; + skb->nh.iph = iph; + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + nf_reset(skb); +} + +static inline int ipmr_forward_finish(struct sk_buff *skb) +{ + struct ip_options * opt = &(IPCB(skb)->opt); + + IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + + if (unlikely(opt->optlen)) + ip_forward_options(skb); + + return dst_output(skb); +} + +/* + * Processing handlers for ipmr_forward + */ + +static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi) +{ + struct iphdr *iph = skb->nh.iph; + struct vif_device *vif = &vif_table[vifi]; + struct net_device *dev; + struct rtable *rt; + int encap = 0; + + if (vif->dev == NULL) + goto out_free; + +#ifdef CONFIG_IP_PIMSM + if (vif->flags & VIFF_REGISTER) { + vif->pkt_out++; + vif->bytes_out+=skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_bytes += skb->len; + ((struct net_device_stats*)vif->dev->priv)->tx_packets++; + ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT); + kfree_skb(skb); + return; + } +#endif + + if (vif->flags&VIFF_TUNNEL) { + struct flowi fl = { .oif = vif->link, + .nl_u = { .ip4_u = + { .daddr = vif->remote, + .saddr = vif->local, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) + goto out_free; + encap = sizeof(struct iphdr); + } else { + struct flowi fl = { .oif = vif->link, + .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .tos = RT_TOS(iph->tos) } }, + .proto = IPPROTO_IPIP }; + if (ip_route_output_key(&rt, &fl)) + goto out_free; + } + + dev = rt->u.dst.dev; + + if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) { + /* Do not fragment multicasts. Alas, IPv4 does not + allow to send ICMP, so that packets will disappear + to blackhole. + */ + + IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + ip_rt_put(rt); + goto out_free; + } + + encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len; + + if (skb_cow(skb, encap)) { + ip_rt_put(rt); + goto out_free; + } + + vif->pkt_out++; + vif->bytes_out+=skb->len; + + dst_release(skb->dst); + skb->dst = &rt->u.dst; + iph = skb->nh.iph; + ip_decrease_ttl(iph); + + /* FIXME: forward and output firewalls used to be called here. + * What do we do with netfilter? -- RR */ + if (vif->flags & VIFF_TUNNEL) { + ip_encap(skb, vif->local, vif->remote); + /* FIXME: extra output firewall step used to be here. --RR */ + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_packets++; + ((struct ip_tunnel *)vif->dev->priv)->stat.tx_bytes+=skb->len; + } + + IPCB(skb)->flags |= IPSKB_FORWARDED; + + /* + * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally + * not only before forwarding, but after forwarding on all output + * interfaces. It is clear, if mrouter runs a multicasting + * program, it should receive packets not depending to what interface + * program is joined. + * If we will not make it, the program will have to join on all + * interfaces. On the other hand, multihoming host (or router, but + * not mrouter) cannot join to more than one interface - it will + * result in receiving multiple packets. 
+ */ + NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev, + ipmr_forward_finish); + return; + +out_free: + kfree_skb(skb); + return; +} + +static int ipmr_find_vif(struct net_device *dev) +{ + int ct; + for (ct=maxvif-1; ct>=0; ct--) { + if (vif_table[ct].dev == dev) + break; + } + return ct; +} + +/* "local" means that we should preserve one skb (for local delivery) */ + +static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local) +{ + int psend = -1; + int vif, ct; + + vif = cache->mfc_parent; + cache->mfc_un.res.pkt++; + cache->mfc_un.res.bytes += skb->len; + + /* + * Wrong interface: drop packet and (maybe) send PIM assert. + */ + if (vif_table[vif].dev != skb->dev) { + int true_vifi; + + if (((struct rtable*)skb->dst)->fl.iif == 0) { + /* It is our own packet, looped back. + Very complicated situation... + + The best workaround until routing daemons will be + fixed is not to redistribute packet, if it was + send through wrong interface. It means, that + multicast applications WILL NOT work for + (S,G), which have default multicast route pointing + to wrong oif. In any case, it is not a good + idea to use multicasting applications on router. + */ + goto dont_forward; + } + + cache->mfc_un.res.wrong_if++; + true_vifi = ipmr_find_vif(skb->dev); + + if (true_vifi >= 0 && mroute_do_assert && + /* pimsm uses asserts, when switching from RPT to SPT, + so that we cannot check that packet arrived on an oif. + It is bad, but otherwise we would need to move pretty + large chunk of pimd to kernel. Ough... --ANK + */ + (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && + time_after(jiffies, + cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) { + cache->mfc_un.res.last_assert = jiffies; + ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF); + } + goto dont_forward; + } + + vif_table[vif].pkt_in++; + vif_table[vif].bytes_in+=skb->len; + + /* + * Forward the frame + */ + for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { + if (skb->nh.iph->ttl > cache->mfc_un.res.ttls[ct]) { + if (psend != -1) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_xmit(skb2, cache, psend); + } + psend=ct; + } + } + if (psend != -1) { + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + ipmr_queue_xmit(skb2, cache, psend); + } else { + ipmr_queue_xmit(skb, cache, psend); + return 0; + } + } + +dont_forward: + if (!local) + kfree_skb(skb); + return 0; +} + + +/* + * Multicast packets for forwarding arrive here + */ + +int ip_mr_input(struct sk_buff *skb) +{ + struct mfc_cache *cache; + int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL; + + /* Packet is looped back after forward, it should not be + forwarded second time, but still can be delivered locally. + */ + if (IPCB(skb)->flags&IPSKB_FORWARDED) + goto dont_forward; + + if (!local) { + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (skb->nh.iph->protocol == IPPROTO_IGMP){ + /* IGMPv1 (and broken IGMPv2 implementations sort of + Cisco IOS <= 11.2(8)) do not put router alert + option to IGMP packets destined to routable + groups. It is very bad, because it means + that we can forward NO IGMP messages. 
+ */ + read_lock(&mrt_lock); + if (mroute_socket) { + raw_rcv(mroute_socket, skb); + read_unlock(&mrt_lock); + return 0; + } + read_unlock(&mrt_lock); + } + } + + read_lock(&mrt_lock); + cache = ipmr_cache_find(skb->nh.iph->saddr, skb->nh.iph->daddr); + + /* + * No usable cache entry + */ + if (cache==NULL) { + int vif; + + if (local) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + ip_local_deliver(skb); + if (skb2 == NULL) { + read_unlock(&mrt_lock); + return -ENOBUFS; + } + skb = skb2; + } + + vif = ipmr_find_vif(skb->dev); + if (vif >= 0) { + int err = ipmr_cache_unresolved(vif, skb); + read_unlock(&mrt_lock); + + return err; + } + read_unlock(&mrt_lock); + kfree_skb(skb); + return -ENODEV; + } + + ip_mr_forward(skb, cache, local); + + read_unlock(&mrt_lock); + + if (local) + return ip_local_deliver(skb); + + return 0; + +dont_forward: + if (local) + return ip_local_deliver(skb); + kfree_skb(skb); + return 0; +} + +#ifdef CONFIG_IP_PIMSM_V1 +/* + * Handle IGMP messages of PIMv1 + */ + +int pim_rcv_v1(struct sk_buff * skb) +{ + struct igmphdr *pim; + struct iphdr *encap; + struct net_device *reg_dev = NULL; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct igmphdr*)skb->h.raw; + + if (!mroute_do_pim || + skb->len < sizeof(*pim) + sizeof(*encap) || + pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER) + goto drop; + + encap = (struct iphdr*)(skb->h.raw + sizeof(struct igmphdr)); + /* + Check that: + a. packet is really destinted to a multicast group + b. packet is not a NULL-REGISTER + c. packet is not truncated + */ + if (!MULTICAST(encap->daddr) || + encap->tot_len == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > skb->len) + goto drop; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = vif_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac.raw = skb->nh.raw; + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + skb->dst = NULL; + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + nf_reset(skb); + netif_rx(skb); + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +static int pim_rcv(struct sk_buff * skb) +{ + struct pimreghdr *pim; + struct iphdr *encap; + struct net_device *reg_dev = NULL; + + if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap))) + goto drop; + + pim = (struct pimreghdr*)skb->h.raw; + if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || + (pim->flags&PIM_NULL_REGISTER) || + (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + (u16)csum_fold(skb_checksum(skb, 0, skb->len, 0)))) + goto drop; + + /* check if the inner packet is destined to mcast group */ + encap = (struct iphdr*)(skb->h.raw + sizeof(struct pimreghdr)); + if (!MULTICAST(encap->daddr) || + encap->tot_len == 0 || + ntohs(encap->tot_len) + sizeof(*pim) > skb->len) + goto drop; + + read_lock(&mrt_lock); + if (reg_vif_num >= 0) + reg_dev = vif_table[reg_vif_num].dev; + if (reg_dev) + dev_hold(reg_dev); + read_unlock(&mrt_lock); + + if (reg_dev == NULL) + goto drop; + + skb->mac.raw = skb->nh.raw; + skb_pull(skb, (u8*)encap - skb->data); + skb->nh.iph = (struct iphdr *)skb->data; + skb->dev = reg_dev; + 
memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IP); + skb->ip_summed = 0; + skb->pkt_type = PACKET_HOST; + dst_release(skb->dst); + ((struct net_device_stats*)reg_dev->priv)->rx_bytes += skb->len; + ((struct net_device_stats*)reg_dev->priv)->rx_packets++; + skb->dst = NULL; + nf_reset(skb); + netif_rx(skb); + dev_put(reg_dev); + return 0; + drop: + kfree_skb(skb); + return 0; +} +#endif + +static int +ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm) +{ + int ct; + struct rtnexthop *nhp; + struct net_device *dev = vif_table[c->mfc_parent].dev; + u8 *b = skb->tail; + struct rtattr *mp_head; + + if (dev) + RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex); + + mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0)); + + for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) { + if (c->mfc_un.res.ttls[ct] < 255) { + if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4)) + goto rtattr_failure; + nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp))); + nhp->rtnh_flags = 0; + nhp->rtnh_hops = c->mfc_un.res.ttls[ct]; + nhp->rtnh_ifindex = vif_table[ct].dev->ifindex; + nhp->rtnh_len = sizeof(*nhp); + } + } + mp_head->rta_type = RTA_MULTIPATH; + mp_head->rta_len = skb->tail - (u8*)mp_head; + rtm->rtm_type = RTN_MULTICAST; + return 1; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -EMSGSIZE; +} + +int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait) +{ + int err; + struct mfc_cache *cache; + struct rtable *rt = (struct rtable*)skb->dst; + + read_lock(&mrt_lock); + cache = ipmr_cache_find(rt->rt_src, rt->rt_dst); + + if (cache==NULL) { + struct net_device *dev; + int vif; + + if (nowait) { + read_unlock(&mrt_lock); + return -EAGAIN; + } + + dev = skb->dev; + if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) { + read_unlock(&mrt_lock); + return -ENODEV; + } + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + skb->nh.iph->ihl = sizeof(struct iphdr)>>2; + skb->nh.iph->saddr = rt->rt_src; + skb->nh.iph->daddr = rt->rt_dst; + skb->nh.iph->version = 0; + err = ipmr_cache_unresolved(vif, skb); + read_unlock(&mrt_lock); + return err; + } + + if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + cache->mfc_flags |= MFC_NOTIFY; + err = ipmr_fill_mroute(skb, cache, rtm); + read_unlock(&mrt_lock); + return err; +} + +#ifdef CONFIG_PROC_FS +/* + * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + */ +struct ipmr_vif_iter { + int ct; +}; + +static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter, + loff_t pos) +{ + for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) { + if(!VIF_EXISTS(iter->ct)) + continue; + if (pos-- == 0) + return &vif_table[iter->ct]; + } + return NULL; +} + +static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&mrt_lock); + return *pos ? 
ipmr_vif_seq_idx(seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipmr_vif_iter *iter = seq->private; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ipmr_vif_seq_idx(iter, 0); + + while (++iter->ct < maxvif) { + if(!VIF_EXISTS(iter->ct)) + continue; + return &vif_table[iter->ct]; + } + return NULL; +} + +static void ipmr_vif_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&mrt_lock); +} + +static int ipmr_vif_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Interface BytesIn PktsIn BytesOut PktsOut Flags Local Remote\n"); + } else { + const struct vif_device *vif = v; + const char *name = vif->dev ? vif->dev->name : "none"; + + seq_printf(seq, + "%2Zd %-10s %8ld %7ld %8ld %7ld %05X %08X %08X\n", + vif - vif_table, + name, vif->bytes_in, vif->pkt_in, + vif->bytes_out, vif->pkt_out, + vif->flags, vif->local, vif->remote); + } + return 0; +} + +static struct seq_operations ipmr_vif_seq_ops = { + .start = ipmr_vif_seq_start, + .next = ipmr_vif_seq_next, + .stop = ipmr_vif_seq_stop, + .show = ipmr_vif_seq_show, +}; + +static int ipmr_vif_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ipmr_vif_seq_ops); + if (rc) + goto out_kfree; + + s->ct = 0; + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; + +} + +static struct file_operations ipmr_vif_fops = { + .owner = THIS_MODULE, + .open = ipmr_vif_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +struct ipmr_mfc_iter { + struct mfc_cache **cache; + int ct; +}; + + +static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos) +{ + struct mfc_cache *mfc; + + it->cache = mfc_cache_array; + read_lock(&mrt_lock); + for (it->ct = 0; it->ct < MFC_LINES; it->ct++) + for(mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next) + if (pos-- == 0) + return mfc; + read_unlock(&mrt_lock); + + it->cache = &mfc_unres_queue; + spin_lock_bh(&mfc_unres_lock); + for(mfc = mfc_unres_queue; mfc; mfc = mfc->next) + if (pos-- == 0) + return mfc; + spin_unlock_bh(&mfc_unres_lock); + + it->cache = NULL; + return NULL; +} + + +static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ipmr_mfc_iter *it = seq->private; + it->cache = NULL; + it->ct = 0; + return *pos ? 
ipmr_mfc_seq_idx(seq->private, *pos - 1) + : SEQ_START_TOKEN; +} + +static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct mfc_cache *mfc = v; + struct ipmr_mfc_iter *it = seq->private; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return ipmr_mfc_seq_idx(seq->private, 0); + + if (mfc->next) + return mfc->next; + + if (it->cache == &mfc_unres_queue) + goto end_of_list; + + BUG_ON(it->cache != mfc_cache_array); + + while (++it->ct < MFC_LINES) { + mfc = mfc_cache_array[it->ct]; + if (mfc) + return mfc; + } + + /* exhausted cache_array, show unresolved */ + read_unlock(&mrt_lock); + it->cache = &mfc_unres_queue; + it->ct = 0; + + spin_lock_bh(&mfc_unres_lock); + mfc = mfc_unres_queue; + if (mfc) + return mfc; + + end_of_list: + spin_unlock_bh(&mfc_unres_lock); + it->cache = NULL; + + return NULL; +} + +static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) +{ + struct ipmr_mfc_iter *it = seq->private; + + if (it->cache == &mfc_unres_queue) + spin_unlock_bh(&mfc_unres_lock); + else if (it->cache == mfc_cache_array) + read_unlock(&mrt_lock); +} + +static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) +{ + int n; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + "Group Origin Iif Pkts Bytes Wrong Oifs\n"); + } else { + const struct mfc_cache *mfc = v; + const struct ipmr_mfc_iter *it = seq->private; + + seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld", + (unsigned long) mfc->mfc_mcastgrp, + (unsigned long) mfc->mfc_origin, + mfc->mfc_parent, + mfc->mfc_un.res.pkt, + mfc->mfc_un.res.bytes, + mfc->mfc_un.res.wrong_if); + + if (it->cache != &mfc_unres_queue) { + for(n = mfc->mfc_un.res.minvif; + n < mfc->mfc_un.res.maxvif; n++ ) { + if(VIF_EXISTS(n) + && mfc->mfc_un.res.ttls[n] < 255) + seq_printf(seq, + " %2d:%-3d", + n, mfc->mfc_un.res.ttls[n]); + } + } + seq_putc(seq, '\n'); + } + return 0; +} + +static struct seq_operations ipmr_mfc_seq_ops = { + .start = ipmr_mfc_seq_start, + .next = ipmr_mfc_seq_next, + .stop = ipmr_mfc_seq_stop, + .show = ipmr_mfc_seq_show, +}; + +static int ipmr_mfc_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ipmr_mfc_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; + +} + +static struct file_operations ipmr_mfc_fops = { + .owner = THIS_MODULE, + .open = ipmr_mfc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +#ifdef CONFIG_IP_PIMSM_V2 +static struct net_protocol pim_protocol = { + .handler = pim_rcv, +}; +#endif + + +/* + * Setup for IP multicast routing + */ + +void __init ip_mr_init(void) +{ + mrt_cachep = kmem_cache_create("ip_mrt_cache", + sizeof(struct mfc_cache), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!mrt_cachep) + panic("cannot allocate ip_mrt_cache"); + + init_timer(&ipmr_expire_timer); + ipmr_expire_timer.function=ipmr_expire_process; + register_netdevice_notifier(&ip_mr_notifier); +#ifdef CONFIG_PROC_FS + proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops); + proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops); +#endif +} diff --git a/net/ipv4/ipvs/Kconfig b/net/ipv4/ipvs/Kconfig new file mode 100644 index 000000000000..63a82b4b64bb --- /dev/null +++ b/net/ipv4/ipvs/Kconfig @@ -0,0 +1,244 @@ +# +# IP Virtual Server configuration +# +menu "IP: Virtual Server Configuration" + depends on INET && NETFILTER + +config 
IP_VS + tristate "IP virtual server support (EXPERIMENTAL)" + depends on INET && NETFILTER + ---help--- + IP Virtual Server support will let you build a high-performance + virtual server based on a cluster of two or more real servers. This + option must be enabled for at least one of the clustered computers + that will take care of intercepting incoming connections to a + single IP address and scheduling them to real servers. + + Three request dispatching techniques are implemented: virtual + server via NAT, virtual server via tunneling and virtual server + via direct routing. Several scheduling algorithms can be used to + choose which server a connection is directed to, so that load can + be balanced among the servers. For more information and its + administration program, please visit the following URL: . + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_DEBUG + bool "IP virtual server debugging" + depends on IP_VS + ---help--- + Say Y here if you want to get additional messages useful in + debugging the IP virtual server code. You can change the debug + level in /proc/sys/net/ipv4/vs/debug_level. + +config IP_VS_TAB_BITS + int "IPVS connection table size (the Nth power of 2)" + depends on IP_VS + default "12" + ---help--- + The IPVS connection hash table uses the chaining scheme to handle + hash collisions. Using a big IPVS connection hash table will greatly + reduce conflicts when there are hundreds of thousands of connections + in the hash table. + + Note that the table size must be a power of 2. The table size will + be 2 raised to the number you enter. The number to choose is from 8 + to 20; the default is 12, which means a table size of 4096. Don't + choose a number that is too small, otherwise you will lose + performance. You can adapt the table size yourself, according to + your virtual server application. It is good to set the table size + not far below the number of connections per second multiplied by + the average time a connection stays in the table. For example, if + your virtual server gets 200 connections per second and a connection + stays in the connection table for 200 seconds on average, the table + size should be not far below 200x200; 32768 (2**15) is a good + choice. + + Note also that each connection effectively occupies 128 bytes and + each hash entry uses 8 bytes, so you can estimate how much memory + your box needs. + +comment "IPVS transport protocol load balancing support" + depends on IP_VS + +config IP_VS_PROTO_TCP + bool "TCP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing the TCP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_UDP + bool "UDP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing the UDP transport + protocol. Say Y if unsure. + +config IP_VS_PROTO_ESP + bool "ESP load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing the ESP (Encapsulation + Security Payload) transport protocol. Say Y if unsure. + +config IP_VS_PROTO_AH + bool "AH load balancing support" + depends on IP_VS + ---help--- + This option enables support for load balancing the AH (Authentication + Header) transport protocol. Say Y if unsure.
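To make the IP_VS_TAB_BITS sizing rule above concrete, here is a minimal stand-alone C sketch. It is purely illustrative and is not part of the patch: the helper name and the sample figures are invented for the example, and the real option is just an integer chosen at configure time.

#include <stdio.h>

/*
 * Suggest a table-bits value from an expected connection rate and the
 * average time a connection stays in the table, following the rule of
 * thumb in the IP_VS_TAB_BITS help text: pick a power of two that is
 * "not far below" rate * lifetime, clamped to the allowed 8..20 range.
 */
static unsigned int ipvs_tab_bits_estimate(unsigned long conns_per_sec,
                                           unsigned long avg_lifetime_sec)
{
	unsigned long target = conns_per_sec * avg_lifetime_sec;
	unsigned int bits = 8;			/* Kconfig minimum */

	/* largest power of two that does not exceed the target */
	while (bits < 20 && (1UL << (bits + 1)) <= target)
		bits++;
	return bits;
}

int main(void)
{
	/* the help text's example: 200 conn/s, each lasting ~200 seconds */
	unsigned int bits = ipvs_tab_bits_estimate(200, 200);

	printf("target %lu entries -> IP_VS_TAB_BITS=%u (table size %lu)\n",
	       200UL * 200UL, bits, 1UL << bits);
	return 0;
}

Run on that example it suggests a table size of 32768 (2**15), matching the value the help text recommends.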
+ +comment "IPVS scheduler" + depends on IP_VS + +config IP_VS_RR + tristate "round-robin scheduling" + depends on IP_VS + ---help--- + The round-robin scheduling algorithm simply directs network + connections to different real servers in a round-robin manner. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WRR + tristate "weighted round-robin scheduling" + depends on IP_VS + ---help--- + The weighted round-robin scheduling algorithm directs network + connections to different real servers based on server weights + in a round-robin manner. Servers with higher weights receive + new connections before those with lower weights and get more + connections than those with lower weights; servers with equal + weights get an equal share of connections. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LC + tristate "least-connection scheduling" + depends on IP_VS + ---help--- + The least-connection scheduling algorithm directs network + connections to the server with the least number of active + connections. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_WLC + tristate "weighted least-connection scheduling" + depends on IP_VS + ---help--- + The weighted least-connection scheduling algorithm directs network + connections to the server with the least active connections + normalized by the server weight. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLC + tristate "locality-based least-connection scheduling" + depends on IP_VS + ---help--- + The locality-based least-connection scheduling algorithm is for + destination IP load balancing. It is usually used in cache clusters. + This algorithm usually directs packets destined for an IP address to + their assigned server if the server is alive and not overloaded. If + the server is overloaded (its active connection count is larger than + its weight) and there is a server at half of its load, then the + weighted least-connection server is allocated to this IP address. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_LBLCR + tristate "locality-based least-connection with replication scheduling" + depends on IP_VS + ---help--- + The locality-based least-connection with replication scheduling + algorithm is also for destination IP load balancing. It is + usually used in cache clusters. It differs from the LBLC scheduling + as follows: the load balancer maintains mappings from a target + to a set of server nodes that can serve the target. Requests for + a target are assigned to the least-connection node in the target's + server set. If all the nodes in the server set are overloaded, + it picks up a least-connection node in the cluster and adds it + to the server set for the target. If the server set has not been + modified for the specified time, the most loaded node is removed + from the server set in order to avoid a high degree of replication. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N.
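The (weighted) least-connection schedulers described above boil down to picking the usable destination with the smallest connections-to-weight ratio. The stand-alone C sketch below shows only that selection principle; the type, names and cost comparison are invented for illustration and are not what the ip_vs_wlc module added by this patch actually uses (it works over the kernel's own ip_vs_dest structures with its own cost formula).

#include <stddef.h>
#include <stdio.h>

/* Invented demo type: just the two fields the selection rule needs. */
struct demo_dest {
	unsigned int conns;	/* current connection count */
	unsigned int weight;	/* configured weight; 0 means "do not use" */
};

/*
 * Pick the destination with the smallest conns/weight ratio.  The test
 * conns_a/weight_a < conns_b/weight_b is done by cross multiplication,
 * so no division (or floating point) is needed.
 */
static struct demo_dest *demo_wlc_schedule(struct demo_dest *tab, size_t n)
{
	struct demo_dest *least = NULL;
	size_t i;

	for (i = 0; i < n; i++) {
		if (tab[i].weight == 0)
			continue;
		if (!least ||
		    (unsigned long long)tab[i].conns * least->weight <
		    (unsigned long long)least->conns * tab[i].weight)
			least = &tab[i];
	}
	return least;		/* NULL if no destination is usable */
}

int main(void)
{
	struct demo_dest pool[] = {
		{ .conns = 40, .weight = 1 },	/* ratio 40 */
		{ .conns = 60, .weight = 3 },	/* ratio 20, the winner */
		{ .conns = 25, .weight = 1 },	/* ratio 25 */
	};
	struct demo_dest *pick = demo_wlc_schedule(pool, 3);

	if (pick)
		printf("picked server %d (conns=%u, weight=%u)\n",
		       (int)(pick - pool), pick->conns, pick->weight);
	return 0;
}

With plain least-connection scheduling (IP_VS_LC) all weights are effectively equal, so the same comparison degenerates to picking the smallest connection count.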
+ +config IP_VS_DH + tristate "destination hashing scheduling" + depends on IP_VS + ---help--- + The destination hashing scheduling algorithm assigns network + connections to the servers by looking up a statically assigned + hash table keyed on their destination IP addresses. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SH + tristate "source hashing scheduling" + depends on IP_VS + ---help--- + The source hashing scheduling algorithm assigns network + connections to the servers by looking up a statically assigned + hash table keyed on their source IP addresses. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_SED + tristate "shortest expected delay scheduling" + depends on IP_VS + ---help--- + The shortest expected delay scheduling algorithm assigns network + connections to the server with the shortest expected delay. The + expected delay that the job will experience is (Ci + 1) / Ui if + sent to the ith server, in which Ci is the number of connections + on the ith server and Ui is the fixed service rate (weight) + of the ith server. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +config IP_VS_NQ + tristate "never queue scheduling" + depends on IP_VS + ---help--- + The never queue scheduling algorithm adopts a two-speed model. + When there is an idle server available, the job will be sent to + the idle server, instead of waiting for a fast one. When there + is no idle server available, the job will be sent to the server + that minimizes its expected delay (the Shortest Expected Delay + scheduling algorithm). + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +comment "IPVS application helper" + depends on IP_VS + +config IP_VS_FTP + tristate "FTP protocol helper" + depends on IP_VS && IP_VS_PROTO_TCP + ---help--- + FTP is a protocol that transfers IP addresses and/or port numbers in + the payload. In virtual server via Network Address Translation, + the IP address and port number of real servers cannot be sent to + clients in FTP connections directly, so an FTP protocol helper is + required for tracking the connection and mangling it back to that of + the virtual service. + + If you want to compile it into the kernel, say Y. To compile it as a + module, choose M here. If unsure, say N. + +endmenu diff --git a/net/ipv4/ipvs/Makefile b/net/ipv4/ipvs/Makefile new file mode 100644 index 000000000000..a788461a40c9 --- /dev/null +++ b/net/ipv4/ipvs/Makefile @@ -0,0 +1,34 @@ +# +# Makefile for the IPVS modules on top of IPv4.
+# + +# IPVS transport protocol load balancing support +ip_vs_proto-objs-y := +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_TCP) += ip_vs_proto_tcp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_ESP) += ip_vs_proto_esp.o +ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH) += ip_vs_proto_ah.o + +ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ + ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_proto_icmp.o \ + $(ip_vs_proto-objs-y) + + +# IPVS core +obj-$(CONFIG_IP_VS) += ip_vs.o + +# IPVS schedulers +obj-$(CONFIG_IP_VS_RR) += ip_vs_rr.o +obj-$(CONFIG_IP_VS_WRR) += ip_vs_wrr.o +obj-$(CONFIG_IP_VS_LC) += ip_vs_lc.o +obj-$(CONFIG_IP_VS_WLC) += ip_vs_wlc.o +obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o +obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o +obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o +obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o +obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o +obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o + +# IPVS application helpers +obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c new file mode 100644 index 000000000000..d9212addd193 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -0,0 +1,658 @@ +/* + * ip_vs_app.c: Application module support for IPVS + * + * Version: $Id: ip_vs_app.c,v 1.17 2003/03/22 06:31:21 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_app.c in kernel 2.2. The difference + * is that ip_vs_app module handles the reverse direction (incoming requests + * and outgoing responses). + * + * IP_MASQ_APP application masquerading module + * + * Author: Juan Jose Ciarlante, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +EXPORT_SYMBOL(register_ip_vs_app); +EXPORT_SYMBOL(unregister_ip_vs_app); +EXPORT_SYMBOL(register_ip_vs_app_inc); + +/* ipvs application list head */ +static LIST_HEAD(ip_vs_app_list); +static DECLARE_MUTEX(__ip_vs_app_mutex); + + +/* + * Get an ip_vs_app object + */ +static inline int ip_vs_app_get(struct ip_vs_app *app) +{ + /* test and get the module atomically */ + if (app->module) + return try_module_get(app->module); + else + return 1; +} + + +static inline void ip_vs_app_put(struct ip_vs_app *app) +{ + if (app->module) + module_put(app->module); +} + + +/* + * Allocate/initialize app incarnation and register it in proto apps. 
+ */ +static int +ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + struct ip_vs_protocol *pp; + struct ip_vs_app *inc; + int ret; + + if (!(pp = ip_vs_proto_get(proto))) + return -EPROTONOSUPPORT; + + if (!pp->unregister_app) + return -EOPNOTSUPP; + + inc = kmalloc(sizeof(struct ip_vs_app), GFP_KERNEL); + if (!inc) + return -ENOMEM; + memcpy(inc, app, sizeof(*inc)); + INIT_LIST_HEAD(&inc->p_list); + INIT_LIST_HEAD(&inc->incs_list); + inc->app = app; + inc->port = htons(port); + atomic_set(&inc->usecnt, 0); + + if (app->timeouts) { + inc->timeout_table = + ip_vs_create_timeout_table(app->timeouts, + app->timeouts_size); + if (!inc->timeout_table) { + ret = -ENOMEM; + goto out; + } + } + + ret = pp->register_app(inc); + if (ret) + goto out; + + list_add(&inc->a_list, &app->incs_list); + IP_VS_DBG(9, "%s application %s:%u registered\n", + pp->name, inc->name, inc->port); + + return 0; + + out: + if (inc->timeout_table) + kfree(inc->timeout_table); + kfree(inc); + return ret; +} + + +/* + * Release app incarnation + */ +static void +ip_vs_app_inc_release(struct ip_vs_app *inc) +{ + struct ip_vs_protocol *pp; + + if (!(pp = ip_vs_proto_get(inc->protocol))) + return; + + if (pp->unregister_app) + pp->unregister_app(inc); + + IP_VS_DBG(9, "%s App %s:%u unregistered\n", + pp->name, inc->name, inc->port); + + list_del(&inc->a_list); + + if (inc->timeout_table != NULL) + kfree(inc->timeout_table); + kfree(inc); +} + + +/* + * Get reference to app inc (only called from softirq) + * + */ +int ip_vs_app_inc_get(struct ip_vs_app *inc) +{ + int result; + + atomic_inc(&inc->usecnt); + if (unlikely((result = ip_vs_app_get(inc->app)) != 1)) + atomic_dec(&inc->usecnt); + return result; +} + + +/* + * Put the app inc (only called from timer or net softirq) + */ +void ip_vs_app_inc_put(struct ip_vs_app *inc) +{ + ip_vs_app_put(inc->app); + atomic_dec(&inc->usecnt); +} + + +/* + * Register an application incarnation in protocol applications + */ +int +register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port) +{ + int result; + + down(&__ip_vs_app_mutex); + + result = ip_vs_app_inc_new(app, proto, port); + + up(&__ip_vs_app_mutex); + + return result; +} + + +/* + * ip_vs_app registration routine + */ +int register_ip_vs_app(struct ip_vs_app *app) +{ + /* increase the module use count */ + ip_vs_use_count_inc(); + + down(&__ip_vs_app_mutex); + + list_add(&app->a_list, &ip_vs_app_list); + + up(&__ip_vs_app_mutex); + + return 0; +} + + +/* + * ip_vs_app unregistration routine + * We are sure there are no app incarnations attached to services + */ +void unregister_ip_vs_app(struct ip_vs_app *app) +{ + struct ip_vs_app *inc, *nxt; + + down(&__ip_vs_app_mutex); + + list_for_each_entry_safe(inc, nxt, &app->incs_list, a_list) { + ip_vs_app_inc_release(inc); + } + + list_del(&app->a_list); + + up(&__ip_vs_app_mutex); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + + +#if 0000 +/* + * Get reference to app by name (called from user context) + */ +struct ip_vs_app *ip_vs_app_get_by_name(char *appname) +{ + struct ip_vs_app *app, *a = NULL; + + down(&__ip_vs_app_mutex); + + list_for_each_entry(ent, &ip_vs_app_list, a_list) { + if (strcmp(app->name, appname)) + continue; + + /* softirq may call ip_vs_app_get too, so the caller + must disable softirq on the current CPU */ + if (ip_vs_app_get(app)) + a = app; + break; + } + + up(&__ip_vs_app_mutex); + + return a; +} +#endif + + +/* + * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) + */ +int 
ip_vs_bind_app(struct ip_vs_conn *cp, struct ip_vs_protocol *pp) +{ + return pp->app_conn_bind(cp); +} + + +/* + * Unbind cp from application incarnation (called by cp destructor) + */ +void ip_vs_unbind_app(struct ip_vs_conn *cp) +{ + struct ip_vs_app *inc = cp->app; + + if (!inc) + return; + + if (inc->unbind_conn) + inc->unbind_conn(inc, cp); + if (inc->done_conn) + inc->done_conn(inc, cp); + ip_vs_app_inc_put(inc); + cp->app = NULL; +} + + +/* + * Fixes th->seq based on ip_vs_seq info. + */ +static inline void vs_fix_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 seq = ntohl(th->seq); + + /* + * Adjust seq with delta-offset for all packets after + * the most recent resized pkt seq and with previous_delta offset + * for all packets before most recent resized pkt seq. + */ + if (vseq->delta || vseq->previous_delta) { + if(after(seq, vseq->init_seq)) { + th->seq = htonl(seq + vseq->delta); + IP_VS_DBG(9, "vs_fix_seq(): added delta (%d) to seq\n", + vseq->delta); + } else { + th->seq = htonl(seq + vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_seq(): added previous_delta " + "(%d) to seq\n", vseq->previous_delta); + } + } +} + + +/* + * Fixes th->ack_seq based on ip_vs_seq info. + */ +static inline void +vs_fix_ack_seq(const struct ip_vs_seq *vseq, struct tcphdr *th) +{ + __u32 ack_seq = ntohl(th->ack_seq); + + /* + * Adjust ack_seq with delta-offset for + * the packets AFTER most recent resized pkt has caused a shift + * for packets before most recent resized pkt, use previous_delta + */ + if (vseq->delta || vseq->previous_delta) { + /* since ack_seq is the number of octet that is expected + to receive next, so compare it with init_seq+delta */ + if(after(ack_seq, vseq->init_seq+vseq->delta)) { + th->ack_seq = htonl(ack_seq - vseq->delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted delta " + "(%d) from ack_seq\n", vseq->delta); + + } else { + th->ack_seq = htonl(ack_seq - vseq->previous_delta); + IP_VS_DBG(9, "vs_fix_ack_seq(): subtracted " + "previous_delta (%d) from ack_seq\n", + vseq->previous_delta); + } + } +} + + +/* + * Updates ip_vs_seq if pkt has been resized + * Assumes already checked proto==IPPROTO_TCP and diff!=0. + */ +static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq, + unsigned flag, __u32 seq, int diff) +{ + /* spinlock is to keep updating cp->flags atomic */ + spin_lock(&cp->lock); + if (!(cp->flags & flag) || after(seq, vseq->init_seq)) { + vseq->previous_delta = vseq->delta; + vseq->delta += diff; + vseq->init_seq = seq; + cp->flags |= flag; + } + spin_unlock(&cp->lock); +} + +static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb, + struct ip_vs_app *app) +{ + int diff; + unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + struct tcphdr *th; + __u32 seq; + + if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_seq(&cp->out_seq, th); + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_ack_seq(&cp->in_seq, th); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + if (!app->pkt_out(app, cp, pskb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->out_seq, + IP_VS_CONN_F_OUT_SEQ, seq, diff); + + return 1; +} + +/* + * Output pkt hook. 
Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL + * returns false if it can't handle packet (oom) + */ +int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_out(cp, pskb, app); + + /* + * Call private output hook function + */ + if (app->pkt_out == NULL) + return 1; + + return app->pkt_out(app, cp, pskb, NULL); +} + + +static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb, + struct ip_vs_app *app) +{ + int diff; + unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4; + struct tcphdr *th; + __u32 seq; + + if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th))) + return 0; + + th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset); + + /* + * Remember seq number in case this pkt gets resized + */ + seq = ntohl(th->seq); + + /* + * Fix seq stuff if flagged as so. + */ + if (cp->flags & IP_VS_CONN_F_IN_SEQ) + vs_fix_seq(&cp->in_seq, th); + if (cp->flags & IP_VS_CONN_F_OUT_SEQ) + vs_fix_ack_seq(&cp->out_seq, th); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + if (!app->pkt_in(app, cp, pskb, &diff)) + return 0; + + /* + * Update ip_vs seq stuff if len has changed. + */ + if (diff != 0) + vs_seq_update(cp, &cp->in_seq, + IP_VS_CONN_F_IN_SEQ, seq, diff); + + return 1; +} + +/* + * Input pkt hook. Will call bound ip_vs_app specific function + * called by ipvs packet handler, assumes previously checked cp!=NULL. + * returns false if can't handle packet (oom). + */ +int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb) +{ + struct ip_vs_app *app; + + /* + * check if application module is bound to + * this ip_vs_conn. + */ + if ((app = cp->app) == NULL) + return 1; + + /* TCP is complicated */ + if (cp->protocol == IPPROTO_TCP) + return app_tcp_pkt_in(cp, pskb, app); + + /* + * Call private input hook function + */ + if (app->pkt_in == NULL) + return 1; + + return app->pkt_in(app, cp, pskb, NULL); +} + + +#ifdef CONFIG_PROC_FS +/* + * /proc/net/ip_vs_app entry function + */ + +static struct ip_vs_app *ip_vs_app_idx(loff_t pos) +{ + struct ip_vs_app *app, *inc; + + list_for_each_entry(app, &ip_vs_app_list, a_list) { + list_for_each_entry(inc, &app->incs_list, a_list) { + if (pos-- == 0) + return inc; + } + } + return NULL; + +} + +static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) +{ + down(&__ip_vs_app_mutex); + + return *pos ? 
ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_app *inc, *app; + struct list_head *e; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_app_idx(0); + + inc = v; + app = inc->app; + + if ((e = inc->a_list.next) != &app->incs_list) + return list_entry(e, struct ip_vs_app, a_list); + + /* go on to next application */ + for (e = app->a_list.next; e != &ip_vs_app_list; e = e->next) { + app = list_entry(e, struct ip_vs_app, a_list); + list_for_each_entry(inc, &app->incs_list, a_list) { + return inc; + } + } + return NULL; +} + +static void ip_vs_app_seq_stop(struct seq_file *seq, void *v) +{ + up(&__ip_vs_app_mutex); +} + +static int ip_vs_app_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "prot port usecnt name\n"); + else { + const struct ip_vs_app *inc = v; + + seq_printf(seq, "%-3s %-7u %-6d %-17s\n", + ip_vs_proto_name(inc->protocol), + ntohs(inc->port), + atomic_read(&inc->usecnt), + inc->name); + } + return 0; +} + +static struct seq_operations ip_vs_app_seq_ops = { + .start = ip_vs_app_seq_start, + .next = ip_vs_app_seq_next, + .stop = ip_vs_app_seq_stop, + .show = ip_vs_app_seq_show, +}; + +static int ip_vs_app_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_app_seq_ops); +} + +static struct file_operations ip_vs_app_fops = { + .owner = THIS_MODULE, + .open = ip_vs_app_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Replace a segment of data with a new segment + */ +int ip_vs_skb_replace(struct sk_buff *skb, int pri, + char *o_buf, int o_len, char *n_buf, int n_len) +{ + struct iphdr *iph; + int diff; + int o_offset; + int o_left; + + EnterFunction(9); + + diff = n_len - o_len; + o_offset = o_buf - (char *)skb->data; + /* The length of left data after o_buf+o_len in the skb data */ + o_left = skb->len - (o_offset + o_len); + + if (diff <= 0) { + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + skb_trim(skb, skb->len + diff); + } else if (diff <= skb_tailroom(skb)) { + skb_put(skb, diff); + memmove(o_buf + n_len, o_buf + o_len, o_left); + memcpy(o_buf, n_buf, n_len); + } else { + if (pskb_expand_head(skb, skb_headroom(skb), diff, pri)) + return -ENOMEM; + skb_put(skb, diff); + memmove(skb->data + o_offset + n_len, + skb->data + o_offset + o_len, o_left); + memcpy(skb->data + o_offset, n_buf, n_len); + } + + /* must update the iph total length here */ + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + + LeaveFunction(9); + return 0; +} + + +int ip_vs_app_init(void) +{ + /* we will replace it with proc_net_ipvs_create() soon */ + proc_net_fops_create("ip_vs_app", 0, &ip_vs_app_fops); + return 0; +} + + +void ip_vs_app_cleanup(void) +{ + proc_net_remove("ip_vs_app"); +} diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c new file mode 100644 index 000000000000..fd6feb5499fe --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -0,0 +1,920 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
+ * + * Version: $Id: ip_vs_conn.c,v 1.31 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. Many code here is taken from IP MASQ code of kernel 2.2. + * + * Changes: + * + */ + +#include +#include +#include /* for proc_net_* */ +#include +#include +#include + +#include + + +/* + * Connection hash table: for input and output packets lookups of IPVS + */ +static struct list_head *ip_vs_conn_tab; + +/* SLAB cache for IPVS connections */ +static kmem_cache_t *ip_vs_conn_cachep; + +/* counter for current IPVS connections */ +static atomic_t ip_vs_conn_count = ATOMIC_INIT(0); + +/* counter for no client port connections */ +static atomic_t ip_vs_conn_no_cport_cnt = ATOMIC_INIT(0); + +/* random value for IPVS connection hash */ +static unsigned int ip_vs_conn_rnd; + +/* + * Fine locking granularity for big connection hash table + */ +#define CT_LOCKARRAY_BITS 4 +#define CT_LOCKARRAY_SIZE (1<protocol, cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (!(cp->flags & IP_VS_CONN_F_HASHED)) { + list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + cp->flags |= IP_VS_CONN_F_HASHED; + atomic_inc(&cp->refcnt); + ret = 1; + } else { + IP_VS_ERR("ip_vs_conn_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + ret = 0; + } + + ct_write_unlock(hash); + + return ret; +} + + +/* + * UNhashes ip_vs_conn from ip_vs_conn_tab. + * returns bool success. + */ +static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) +{ + unsigned hash; + int ret; + + /* unhash it and decrease its reference counter */ + hash = ip_vs_conn_hashkey(cp->protocol, cp->caddr, cp->cport); + + ct_write_lock(hash); + + if (cp->flags & IP_VS_CONN_F_HASHED) { + list_del(&cp->c_list); + cp->flags &= ~IP_VS_CONN_F_HASHED; + atomic_dec(&cp->refcnt); + ret = 1; + } else + ret = 0; + + ct_write_unlock(hash); + + return ret; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from OUTside-to-INside. 
+ * s_addr, s_port: pkt source address (foreign host) + * d_addr, d_port: pkt dest address (load balancer) + */ +static inline struct ip_vs_conn *__ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp; + + hash = ip_vs_conn_hashkey(protocol, s_addr, s_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (s_addr==cp->caddr && s_port==cp->cport && + d_port==cp->vport && d_addr==cp->vaddr && + protocol==cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ct_read_unlock(hash); + return cp; + } + } + + ct_read_unlock(hash); + + return NULL; +} + +struct ip_vs_conn *ip_vs_conn_in_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + struct ip_vs_conn *cp; + + cp = __ip_vs_conn_in_get(protocol, s_addr, s_port, d_addr, d_port); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) + cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); + + IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + cp?"hit":"not hit"); + + return cp; +} + + +/* + * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * s_addr, s_port: pkt source address (inside host) + * d_addr, d_port: pkt dest address (foreign host) + */ +struct ip_vs_conn *ip_vs_conn_out_get +(int protocol, __u32 s_addr, __u16 s_port, __u32 d_addr, __u16 d_port) +{ + unsigned hash; + struct ip_vs_conn *cp, *ret=NULL; + + /* + * Check for "full" addressed entries + */ + hash = ip_vs_conn_hashkey(protocol, d_addr, d_port); + + ct_read_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (d_addr == cp->caddr && d_port == cp->cport && + s_port == cp->dport && s_addr == cp->daddr && + protocol == cp->protocol) { + /* HIT */ + atomic_inc(&cp->refcnt); + ret = cp; + break; + } + } + + ct_read_unlock(hash); + + IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + ip_vs_proto_name(protocol), + NIPQUAD(s_addr), ntohs(s_port), + NIPQUAD(d_addr), ntohs(d_port), + ret?"hit":"not hit"); + + return ret; +} + + +/* + * Put back the conn and restart its timer with its timeout + */ +void ip_vs_conn_put(struct ip_vs_conn *cp) +{ + /* reset it expire in its timeout */ + mod_timer(&cp->timer, jiffies+cp->timeout); + + __ip_vs_conn_put(cp); +} + + +/* + * Fill a no_client_port connection with a client port number + */ +void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __u16 cport) +{ + if (ip_vs_conn_unhash(cp)) { + spin_lock(&cp->lock); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) { + atomic_dec(&ip_vs_conn_no_cport_cnt); + cp->flags &= ~IP_VS_CONN_F_NO_CPORT; + cp->cport = cport; + } + spin_unlock(&cp->lock); + + /* hash on new dport */ + ip_vs_conn_hash(cp); + } +} + + +/* + * Bind a connection entry with the corresponding packet_xmit. + * Called by ip_vs_conn_new. 
+ */ +static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp) +{ + switch (IP_VS_FWD_METHOD(cp)) { + case IP_VS_CONN_F_MASQ: + cp->packet_xmit = ip_vs_nat_xmit; + break; + + case IP_VS_CONN_F_TUNNEL: + cp->packet_xmit = ip_vs_tunnel_xmit; + break; + + case IP_VS_CONN_F_DROUTE: + cp->packet_xmit = ip_vs_dr_xmit; + break; + + case IP_VS_CONN_F_LOCALNODE: + cp->packet_xmit = ip_vs_null_xmit; + break; + + case IP_VS_CONN_F_BYPASS: + cp->packet_xmit = ip_vs_bypass_xmit; + break; + } +} + + +static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) +{ + return atomic_read(&dest->activeconns) + + atomic_read(&dest->inactconns); +} + +/* + * Bind a connection entry with a virtual service destination + * Called just after a new connection entry is created. + */ +static inline void +ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) +{ + /* if dest is NULL, then return directly */ + if (!dest) + return; + + /* Increase the refcnt counter of the dest */ + atomic_inc(&dest->refcnt); + + /* Bind with the destination and its corresponding transmitter */ + cp->flags |= atomic_read(&dest->conn_flags); + cp->dest = dest; + + IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + /* It is a normal connection, so increase the inactive + connection counter because it is in TCP SYNRECV + state (inactive) or other protocol inacive state */ + atomic_inc(&dest->inactconns); + } else { + /* It is a persistent connection/template, so increase + the peristent connection counter */ + atomic_inc(&dest->persistconns); + } + + if (dest->u_threshold != 0 && + ip_vs_dest_totalconns(dest) >= dest->u_threshold) + dest->flags |= IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Unbind a connection entry with its VS destination + * Called by the ip_vs_conn_expire function. 
+ */ +static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) +{ + struct ip_vs_dest *dest = cp->dest; + + if (!dest) + return; + + IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + ip_vs_proto_name(cp->protocol), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + ip_vs_fwd_tag(cp), cp->state, + cp->flags, atomic_read(&cp->refcnt), + atomic_read(&dest->refcnt)); + + /* Update the connection counters */ + if (cp->cport || (cp->flags & IP_VS_CONN_F_NO_CPORT)) { + /* It is a normal connection, so decrease the inactconns + or activeconns counter */ + if (cp->flags & IP_VS_CONN_F_INACTIVE) { + atomic_dec(&dest->inactconns); + } else { + atomic_dec(&dest->activeconns); + } + } else { + /* It is a persistent connection/template, so decrease + the peristent connection counter */ + atomic_dec(&dest->persistconns); + } + + if (dest->l_threshold != 0) { + if (ip_vs_dest_totalconns(dest) < dest->l_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else if (dest->u_threshold != 0) { + if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } else { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + } + + /* + * Simply decrease the refcnt of the dest, because the + * dest will be either in service's destination list + * or in the trash. + */ + atomic_dec(&dest->refcnt); +} + + +/* + * Checking if the destination of a connection template is available. + * If available, return 1, otherwise invalidate this connection + * template and return 0. + */ +int ip_vs_check_template(struct ip_vs_conn *ct) +{ + struct ip_vs_dest *dest = ct->dest; + + /* + * Checking the dest server status. + */ + if ((dest == NULL) || + !(dest->flags & IP_VS_DEST_F_AVAILABLE) || + (sysctl_ip_vs_expire_quiescent_template && + (atomic_read(&dest->weight) == 0))) { + IP_VS_DBG(9, "check_template: dest not available for " + "protocol %s s:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "-> d:%u.%u.%u.%u:%d\n", + ip_vs_proto_name(ct->protocol), + NIPQUAD(ct->caddr), ntohs(ct->cport), + NIPQUAD(ct->vaddr), ntohs(ct->vport), + NIPQUAD(ct->daddr), ntohs(ct->dport)); + + /* + * Invalidate the connection template + */ + if (ct->cport) { + if (ip_vs_conn_unhash(ct)) { + ct->dport = 65535; + ct->vport = 65535; + ct->cport = 0; + ip_vs_conn_hash(ct); + } + } + + /* + * Simply decrease the refcnt of the template, + * don't restart its timer. + */ + atomic_dec(&ct->refcnt); + return 0; + } + return 1; +} + +static void ip_vs_conn_expire(unsigned long data) +{ + struct ip_vs_conn *cp = (struct ip_vs_conn *)data; + + cp->timeout = 60*HZ; + + /* + * hey, I'm using it + */ + atomic_inc(&cp->refcnt); + + /* + * do I control anybody? + */ + if (atomic_read(&cp->n_control)) + goto expire_later; + + /* + * unhash it if it is hashed in the conn table + */ + if (!ip_vs_conn_unhash(cp)) + goto expire_later; + + /* + * refcnt==1 implies I'm the only one referrer + */ + if (likely(atomic_read(&cp->refcnt) == 1)) { + /* delete the timer if it is activated by other users */ + if (timer_pending(&cp->timer)) + del_timer(&cp->timer); + + /* does anybody control me? 
*/ + if (cp->control) + ip_vs_control_del(cp); + + if (unlikely(cp->app != NULL)) + ip_vs_unbind_app(cp); + ip_vs_unbind_dest(cp); + if (cp->flags & IP_VS_CONN_F_NO_CPORT) + atomic_dec(&ip_vs_conn_no_cport_cnt); + atomic_dec(&ip_vs_conn_count); + + kmem_cache_free(ip_vs_conn_cachep, cp); + return; + } + + /* hash it back to the table */ + ip_vs_conn_hash(cp); + + expire_later: + IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + atomic_read(&cp->refcnt)-1, + atomic_read(&cp->n_control)); + + ip_vs_conn_put(cp); +} + + +void ip_vs_conn_expire_now(struct ip_vs_conn *cp) +{ + if (del_timer(&cp->timer)) + mod_timer(&cp->timer, jiffies); + __ip_vs_conn_put(cp); +} + + +/* + * Create a new connection entry and hash it into the ip_vs_conn_tab + */ +struct ip_vs_conn * +ip_vs_conn_new(int proto, __u32 caddr, __u16 cport, __u32 vaddr, __u16 vport, + __u32 daddr, __u16 dport, unsigned flags, + struct ip_vs_dest *dest) +{ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC); + if (cp == NULL) { + IP_VS_ERR_RL("ip_vs_conn_new: no memory available.\n"); + return NULL; + } + + memset(cp, 0, sizeof(*cp)); + INIT_LIST_HEAD(&cp->c_list); + init_timer(&cp->timer); + cp->timer.data = (unsigned long)cp; + cp->timer.function = ip_vs_conn_expire; + cp->protocol = proto; + cp->caddr = caddr; + cp->cport = cport; + cp->vaddr = vaddr; + cp->vport = vport; + cp->daddr = daddr; + cp->dport = dport; + cp->flags = flags; + spin_lock_init(&cp->lock); + + /* + * Set the entry is referenced by the current thread before hashing + * it in the table, so that other thread run ip_vs_random_dropentry + * but cannot drop this entry. + */ + atomic_set(&cp->refcnt, 1); + + atomic_set(&cp->n_control, 0); + atomic_set(&cp->in_pkts, 0); + + atomic_inc(&ip_vs_conn_count); + if (flags & IP_VS_CONN_F_NO_CPORT) + atomic_inc(&ip_vs_conn_no_cport_cnt); + + /* Bind the connection with a destination server */ + ip_vs_bind_dest(cp, dest); + + /* Set its state and timeout */ + cp->state = 0; + cp->timeout = 3*HZ; + + /* Bind its packet transmitter */ + ip_vs_bind_xmit(cp); + + if (unlikely(pp && atomic_read(&pp->appcnt))) + ip_vs_bind_app(cp, pp); + + /* Hash it in the ip_vs_conn_tab finally */ + ip_vs_conn_hash(cp); + + return cp; +} + + +/* + * /proc/net/ip_vs_conn entries + */ +#ifdef CONFIG_PROC_FS + +static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) +{ + int idx; + struct ip_vs_conn *cp; + + for(idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + if (pos-- == 0) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + } + ct_read_unlock_bh(idx); + } + + return NULL; +} + +static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos) +{ + seq->private = NULL; + return *pos ? ip_vs_conn_array(seq, *pos - 1) :SEQ_START_TOKEN; +} + +static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip_vs_conn *cp = v; + struct list_head *e, *l = seq->private; + int idx; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_conn_array(seq, 0); + + /* more on same hash chain? 
*/ + if ((e = cp->c_list.next) != l) + return list_entry(e, struct ip_vs_conn, c_list); + + idx = l - ip_vs_conn_tab; + ct_read_unlock_bh(idx); + + while (++idx < IP_VS_CONN_TAB_SIZE) { + ct_read_lock_bh(idx); + list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + seq->private = &ip_vs_conn_tab[idx]; + return cp; + } + ct_read_unlock_bh(idx); + } + seq->private = NULL; + return NULL; +} + +static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) +{ + struct list_head *l = seq->private; + + if (l) + ct_read_unlock_bh(l - ip_vs_conn_tab); +} + +static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) +{ + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + else { + const struct ip_vs_conn *cp = v; + + seq_printf(seq, + "%-3s %08X %04X %08X %04X %08X %04X %-11s %7lu\n", + ip_vs_proto_name(cp->protocol), + ntohl(cp->caddr), ntohs(cp->cport), + ntohl(cp->vaddr), ntohs(cp->vport), + ntohl(cp->daddr), ntohs(cp->dport), + ip_vs_state_name(cp->protocol, cp->state), + (cp->timer.expires-jiffies)/HZ); + } + return 0; +} + +static struct seq_operations ip_vs_conn_seq_ops = { + .start = ip_vs_conn_seq_start, + .next = ip_vs_conn_seq_next, + .stop = ip_vs_conn_seq_stop, + .show = ip_vs_conn_seq_show, +}; + +static int ip_vs_conn_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ip_vs_conn_seq_ops); +} + +static struct file_operations ip_vs_conn_fops = { + .owner = THIS_MODULE, + .open = ip_vs_conn_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + + +/* + * Randomly drop connection entries before running out of memory + */ +static inline int todrop_entry(struct ip_vs_conn *cp) +{ + /* + * The drop rate array needs tuning for real environments. + * Called from timer bh only => no locking + */ + static char todrop_rate[9] = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + static char todrop_counter[9] = {0}; + int i; + + /* if the conn entry hasn't lasted for 60 seconds, don't drop it. + This will leave enough time for normal connection to get + through. */ + if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ)) + return 0; + + /* Don't drop the entry if its number of incoming packets is not + located in [0, 8] */ + i = atomic_read(&cp->in_pkts); + if (i > 8 || i < 0) return 0; + + if (!todrop_rate[i]) return 0; + if (--todrop_counter[i] > 0) return 0; + + todrop_counter[i] = todrop_rate[i]; + return 1; +} + + +void ip_vs_random_dropentry(void) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_conn *ct; + + /* + * Randomly scan 1/32 of the whole table every second + */ + for (idx = 0; idx < (IP_VS_CONN_TAB_SIZE>>5); idx++) { + unsigned hash = net_random() & IP_VS_CONN_TAB_MASK; + + /* + * Lock is actually needed in this loop. 
+ */ + ct_write_lock(hash); + + list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (!cp->cport && !(cp->flags & IP_VS_CONN_F_NO_CPORT)) + /* connection template */ + continue; + + if (cp->protocol == IPPROTO_TCP) { + switch(cp->state) { + case IP_VS_TCP_S_SYN_RECV: + case IP_VS_TCP_S_SYNACK: + break; + + case IP_VS_TCP_S_ESTABLISHED: + if (todrop_entry(cp)) + break; + continue; + + default: + continue; + } + } else { + if (!todrop_entry(cp)) + continue; + } + + /* + * Drop the entry, and drop its ct if not referenced + */ + atomic_inc(&cp->refcnt); + ct_write_unlock(hash); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(hash); + } + ct_write_unlock(hash); + } +} + + +/* + * Flush all the connection entries in the ip_vs_conn_tab + */ +static void ip_vs_conn_flush(void) +{ + int idx; + struct ip_vs_conn *cp; + struct ip_vs_conn *ct; + + flush_again: + for (idx=0; idxrefcnt); + ct_write_unlock(idx); + + if ((ct = cp->control)) + atomic_inc(&ct->refcnt); + IP_VS_DBG(4, "del connection\n"); + ip_vs_conn_expire_now(cp); + if (ct) { + IP_VS_DBG(4, "del conn template\n"); + ip_vs_conn_expire_now(ct); + } + ct_write_lock(idx); + } + ct_write_unlock_bh(idx); + } + + /* the counter may be not NULL, because maybe some conn entries + are run by slow timer handler or unhashed but still referred */ + if (atomic_read(&ip_vs_conn_count) != 0) { + schedule(); + goto flush_again; + } +} + + +int ip_vs_conn_init(void) +{ + int idx; + + /* + * Allocate the connection hash table and initialize its list heads + */ + ip_vs_conn_tab = vmalloc(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head)); + if (!ip_vs_conn_tab) + return -ENOMEM; + + /* Allocate ip_vs_conn slab cache */ + ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn", + sizeof(struct ip_vs_conn), 0, + SLAB_HWCACHE_ALIGN, NULL, NULL); + if (!ip_vs_conn_cachep) { + vfree(ip_vs_conn_tab); + return -ENOMEM; + } + + IP_VS_INFO("Connection hash table configured " + "(size=%d, memory=%ldKbytes)\n", + IP_VS_CONN_TAB_SIZE, + (long)(IP_VS_CONN_TAB_SIZE*sizeof(struct list_head))/1024); + IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", + sizeof(struct ip_vs_conn)); + + for (idx = 0; idx < IP_VS_CONN_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); + } + + for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { + rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); + } + + proc_net_fops_create("ip_vs_conn", 0, &ip_vs_conn_fops); + + /* calculate the random value for connection hash */ + get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); + + return 0; +} + + +void ip_vs_conn_cleanup(void) +{ + /* flush all the connection entries first */ + ip_vs_conn_flush(); + + /* Release the empty cache */ + kmem_cache_destroy(ip_vs_conn_cachep); + proc_net_remove("ip_vs_conn"); + vfree(ip_vs_conn_tab); +} diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c new file mode 100644 index 000000000000..5fb257dd07cb --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -0,0 +1,1191 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
+ * + * Version: $Id: ip_vs_core.c,v 1.34 2003/05/10 03:05:23 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese, + * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms + * and others. + * + * Changes: + * Paul `Rusty' Russell properly handle non-linear skbs + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include /* for icmp_send */ +#include + +#include +#include + +#include + + +EXPORT_SYMBOL(register_ip_vs_scheduler); +EXPORT_SYMBOL(unregister_ip_vs_scheduler); +EXPORT_SYMBOL(ip_vs_skb_replace); +EXPORT_SYMBOL(ip_vs_proto_name); +EXPORT_SYMBOL(ip_vs_conn_new); +EXPORT_SYMBOL(ip_vs_conn_in_get); +EXPORT_SYMBOL(ip_vs_conn_out_get); +#ifdef CONFIG_IP_VS_PROTO_TCP +EXPORT_SYMBOL(ip_vs_tcp_conn_listen); +#endif +EXPORT_SYMBOL(ip_vs_conn_put); +#ifdef CONFIG_IP_VS_DEBUG +EXPORT_SYMBOL(ip_vs_get_debug_level); +#endif +EXPORT_SYMBOL(ip_vs_make_skb_writable); + + +/* ID used in ICMP lookups */ +#define icmp_id(icmph) (((icmph)->un).echo.id) + +const char *ip_vs_proto_name(unsigned proto) +{ + static char buf[20]; + + switch (proto) { + case IPPROTO_IP: + return "IP"; + case IPPROTO_UDP: + return "UDP"; + case IPPROTO_TCP: + return "TCP"; + case IPPROTO_ICMP: + return "ICMP"; + default: + sprintf(buf, "IP_%d", proto); + return buf; + } +} + +void ip_vs_init_hash_table(struct list_head *table, int rows) +{ + while (--rows >= 0) + INIT_LIST_HEAD(&table[rows]); +} + +static inline void +ip_vs_in_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.inpkts++; + dest->stats.inbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.inpkts++; + dest->svc->stats.inbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.inpkts++; + ip_vs_stats.inbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_out_stats(struct ip_vs_conn *cp, struct sk_buff *skb) +{ + struct ip_vs_dest *dest = cp->dest; + if (dest && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + spin_lock(&dest->stats.lock); + dest->stats.outpkts++; + dest->stats.outbytes += skb->len; + spin_unlock(&dest->stats.lock); + + spin_lock(&dest->svc->stats.lock); + dest->svc->stats.outpkts++; + dest->svc->stats.outbytes += skb->len; + spin_unlock(&dest->svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.outpkts++; + ip_vs_stats.outbytes += skb->len; + spin_unlock(&ip_vs_stats.lock); + } +} + + +static inline void +ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc) +{ + spin_lock(&cp->dest->stats.lock); + cp->dest->stats.conns++; + spin_unlock(&cp->dest->stats.lock); + + spin_lock(&svc->stats.lock); + svc->stats.conns++; + spin_unlock(&svc->stats.lock); + + spin_lock(&ip_vs_stats.lock); + ip_vs_stats.conns++; + spin_unlock(&ip_vs_stats.lock); +} + + +static inline int +ip_vs_set_state(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + if (unlikely(!pp->state_transition)) + return 
0; + return pp->state_transition(cp, direction, skb, pp); +} + + +int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len) +{ + struct sk_buff *skb = *pskb; + + /* skb is already used, better copy skb and its payload */ + if (unlikely(skb_shared(skb) || skb->sk)) + goto copy_skb; + + /* skb data is already used, copy it */ + if (unlikely(skb_cloned(skb))) + goto copy_data; + + return pskb_may_pull(skb, writable_len); + + copy_data: + if (unlikely(writable_len > skb->len)) + return 0; + return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC); + + copy_skb: + if (unlikely(writable_len > skb->len)) + return 0; + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + return 0; + BUG_ON(skb_is_nonlinear(skb)); + + /* Rest of kernel will get very unhappy if we pass it a + suddenly-orphaned skbuff */ + if ((*pskb)->sk) + skb_set_owner_w(skb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = skb; + return 1; +} + +/* + * IPVS persistent scheduling function + * It creates a connection entry according to its template if exists, + * or selects a server and creates a connection entry plus a template. + * Locking: we are svc user (svc->refcnt), so we hold all dests too + * Protocols supported: TCP, UDP + */ +static struct ip_vs_conn * +ip_vs_sched_persist(struct ip_vs_service *svc, + const struct sk_buff *skb, + __u16 ports[2]) +{ + struct ip_vs_conn *cp = NULL; + struct iphdr *iph = skb->nh.iph; + struct ip_vs_dest *dest; + struct ip_vs_conn *ct; + __u16 dport; /* destination port to forward */ + __u32 snet; /* source network of the client, after masking */ + + /* Mask saddr with the netmask to adjust template granularity */ + snet = iph->saddr & svc->netmask; + + IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " + "mnet %u.%u.%u.%u\n", + NIPQUAD(iph->saddr), ntohs(ports[0]), + NIPQUAD(iph->daddr), ntohs(ports[1]), + NIPQUAD(snet)); + + /* + * As far as we know, FTP is a very complicated network protocol, and + * it uses control connection and data connections. For active FTP, + * FTP server initialize data connection to the client, its source port + * is often 20. For passive FTP, FTP server tells the clients the port + * that it passively listens to, and the client issues the data + * connection. In the tunneling or direct routing mode, the load + * balancer is on the client-to-server half of connection, the port + * number is unknown to the load balancer. So, a conn template like + * is created for persistent FTP + * service, and a template like + * is created for other persistent services. + */ + if (ports[1] == svc->port) { + /* Check if a template already exists */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, ports[1]); + else + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * No template found or the dest of the connection + * template is not available. + */ + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template like for non-ftp service, + * and + * for ftp service. 
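The snet masking above is what sets the granularity of persistence: every client that falls inside the service netmask maps to the same template source. A small userspace sketch with illustrative addresses and a /24 netmask:

#include <stdio.h>
#include <netinet/in.h>
#include <arpa/inet.h>

int main(void)
{
        in_addr_t c1   = inet_addr("192.168.1.7");
        in_addr_t c2   = inet_addr("192.168.1.99");
        in_addr_t mask = inet_addr("255.255.255.0");    /* svc->netmask */
        struct in_addr a = { c1 & mask };
        struct in_addr b = { c2 & mask };

        printf("client1 -> snet %s\n", inet_ntoa(a));
        printf("client2 -> snet %s\n", inet_ntoa(b));   /* both 192.168.1.0 */
        return 0;
}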
+ */ + if (svc->port != FTPPORT) + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, + ports[1], + dest->addr, dest->port, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = dest->port; + } else { + /* + * Note: persistent fwmark-based services and persistent + * port zero service are handled here. + * fwmark template: + * port zero template: + */ + if (svc->fwmark) + ct = ip_vs_conn_in_get(IPPROTO_IP, snet, 0, + htonl(svc->fwmark), 0); + else + ct = ip_vs_conn_in_get(iph->protocol, snet, 0, + iph->daddr, 0); + + if (!ct || !ip_vs_check_template(ct)) { + /* + * If it is not persistent port zero, return NULL, + * otherwise create a connection template. + */ + if (svc->port) + return NULL; + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a template according to the service + */ + if (svc->fwmark) + ct = ip_vs_conn_new(IPPROTO_IP, + snet, 0, + htonl(svc->fwmark), 0, + dest->addr, 0, + 0, + dest); + else + ct = ip_vs_conn_new(iph->protocol, + snet, 0, + iph->daddr, 0, + dest->addr, 0, + 0, + dest); + if (ct == NULL) + return NULL; + + ct->timeout = svc->timeout; + } else { + /* set destination with the found template */ + dest = ct->dest; + } + dport = ports[1]; + } + + /* + * Create a new connection according to the template + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, ports[0], + iph->daddr, ports[1], + dest->addr, dport, + 0, + dest); + if (cp == NULL) { + ip_vs_conn_put(ct); + return NULL; + } + + /* + * Add its control + */ + ip_vs_control_add(cp, ct); + ip_vs_conn_put(ct); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * IPVS main scheduling function + * It selects a server according to the virtual service, and + * creates a connection entry. + * Protocols supported: TCP, UDP + */ +struct ip_vs_conn * +ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_conn *cp = NULL; + struct iphdr *iph = skb->nh.iph; + struct ip_vs_dest *dest; + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + /* + * Persistent service + */ + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + return ip_vs_sched_persist(svc, skb, pptr); + + /* + * Non-persistent service + */ + if (!svc->fwmark && pptr[1] != svc->port) { + if (!svc->port) + IP_VS_ERR("Schedule: port zero only supported " + "in persistent services, " + "check your ipvs configuration\n"); + return NULL; + } + + dest = svc->scheduler->schedule(svc, skb); + if (dest == NULL) { + IP_VS_DBG(1, "Schedule: no dest found.\n"); + return NULL; + } + + /* + * Create a connection entry. + */ + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1], + dest->addr, dest->port?dest->port:pptr[1], + 0, + dest); + if (cp == NULL) + return NULL; + + IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " + "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + ip_vs_fwd_tag(cp), + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + NIPQUAD(cp->daddr), ntohs(cp->dport), + cp->flags, atomic_read(&cp->refcnt)); + + ip_vs_conn_stats(cp, svc); + return cp; +} + + +/* + * Pass or drop the packet. 
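ip_vs_schedule() above pulls the port pair with skb_header_pointer() because, for both TCP and UDP, the first two 16-bit fields after the IP header are the source and destination ports. A userspace sketch of the same trick on a raw buffer (the packet contents are made up):

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
        uint8_t  pkt[24] = { 0x45 };            /* IPv4, ihl = 5 (20 bytes) */
        uint16_t sport = htons(12345), dport = htons(80);
        uint16_t ports[2];
        int      ihl;

        memcpy(pkt + 20, &sport, 2);            /* fake transport header */
        memcpy(pkt + 22, &dport, 2);

        ihl = (pkt[0] & 0x0f) * 4;              /* header length in bytes */
        memcpy(ports, pkt + ihl, sizeof(ports)); /* like skb_header_pointer() */

        printf("sport=%u dport=%u\n", ntohs(ports[0]), ntohs(ports[1]));
        return 0;
}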
+ * Called by ip_vs_in, when the virtual service is available but + * no destination is available for a new connection. + */ +int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + __u16 _ports[2], *pptr; + struct iphdr *iph = skb->nh.iph; + + pptr = skb_header_pointer(skb, iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) { + ip_vs_service_put(svc); + return NF_DROP; + } + + /* if it is fwmark-based service, the cache_bypass sysctl is up + and the destination is RTN_UNICAST (and not local), then create + a cache_bypass connection entry */ + if (sysctl_ip_vs_cache_bypass && svc->fwmark + && (inet_addr_type(iph->daddr) == RTN_UNICAST)) { + int ret, cs; + struct ip_vs_conn *cp; + + ip_vs_service_put(svc); + + /* create a new connection entry */ + IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); + cp = ip_vs_conn_new(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1], + 0, 0, + IP_VS_CONN_F_BYPASS, + NULL); + if (cp == NULL) + return NF_DROP; + + /* statistics */ + ip_vs_in_stats(cp, skb); + + /* set state */ + cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + + /* transmit the first SYN packet */ + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + + atomic_inc(&cp->in_pkts); + ip_vs_conn_put(cp); + return ret; + } + + /* + * When the virtual ftp service is presented, packets destined + * for other services on the VIP may get here (except services + * listed in the ipvs table), pass the packets, because it is + * not ipvs job to decide to drop the packets. + */ + if ((svc->port == FTPPORT) && (pptr[1] != FTPPORT)) { + ip_vs_service_put(svc); + return NF_ACCEPT; + } + + ip_vs_service_put(svc); + + /* + * Notify the client that the destination is unreachable, and + * release the socket buffer. + * Since it is in IP layer, the TCP socket is not actually + * created, the TCP RST packet cannot be sent, instead that + * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ + */ + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + return NF_DROP; +} + + +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING + * chain, and is used for VS/NAT. + * It detects packets for VS/NAT connections and sends the packets + * immediately. This can avoid that iptable_nat mangles the packets + * for VS/NAT. 
+ */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY)) + return NF_ACCEPT; + + /* The packet was sent from IPVS, exit this chain */ + (*okfn)(*pskb); + + return NF_STOLEN; +} + +u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) +{ + return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); +} + +static inline struct sk_buff * +ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + skb = ip_defrag(skb, user); + if (skb) + ip_send_check(skb->nh.iph); + return skb; +} + +/* + * Packet has been made sufficiently writable in caller + * - inout: 1=in->out, 0=out->in + */ +void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, + struct ip_vs_conn *cp, int inout) +{ + struct iphdr *iph = skb->nh.iph; + unsigned int icmp_offset = iph->ihl*4; + struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset); + struct iphdr *ciph = (struct iphdr *)(icmph + 1); + + if (inout) { + iph->saddr = cp->vaddr; + ip_send_check(iph); + ciph->daddr = cp->vaddr; + ip_send_check(ciph); + } else { + iph->daddr = cp->daddr; + ip_send_check(iph); + ciph->saddr = cp->daddr; + ip_send_check(ciph); + } + + /* the TCP/UDP port */ + if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) { + __u16 *ports = (void *)ciph + ciph->ihl*4; + + if (inout) + ports[1] = cp->vport; + else + ports[0] = cp->dport; + } + + /* And finally the ICMP checksum */ + icmph->checksum = 0; + icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset); + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (inout) + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMP"); + else + IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMP"); +} + +/* + * Handle ICMP messages in the inside-to-outside direction (outgoing). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + * (Only used in VS/NAT) + */ +static int ip_vs_out_icmp(struct sk_buff **pskb, int *related) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); + if (!skb) + return NF_STOLEN; + *pskb = skb; + } + + iph = skb->nh.iph; + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. 
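ip_vs_nat_icmp() above has to recompute the ICMP checksum after rewriting the embedded addresses, and ip_vs_checksum_complete() does that by folding a 32-bit ones' complement sum down to 16 bits. A plain userspace version of the same arithmetic (RFC 1071 style), for illustration only; the kernel path goes through skb_checksum() and csum_fold():

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

static uint16_t csum(const uint8_t *data, size_t len)
{
        uint32_t sum = 0;
        size_t i;

        for (i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)data[i] << 8 | data[i + 1];
        if (len & 1)
                sum += (uint32_t)data[len - 1] << 8;    /* pad odd byte */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);     /* fold carries */
        return (uint16_t)~sum;
}

int main(void)
{
        uint8_t icmp[8] = { 8, 0, 0, 0, 0x12, 0x34, 0x00, 0x01 };

        printf("checksum=0x%04x\n", csum(icmp, sizeof(icmp)));
        return 0;
}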
+ */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + + offset += cih->ihl * 4; + + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_out_get(skb, pp, cih, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + if (IP_VS_FWD_METHOD(cp) != 0) { + IP_VS_ERR("shouldn't reach here, because the box is on the" + "half connection in the tun/dr module.\n"); + } + + /* Ensure the checksum is correct */ + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + if (!ip_vs_make_skb_writable(pskb, offset)) + goto out; + skb = *pskb; + + ip_vs_nat_icmp(skb, pp, cp, 1); + + /* do the statistics and put it back */ + ip_vs_out_stats(cp, skb); + + skb->nfcache |= NFC_IPVS_PROPERTY; + verdict = NF_ACCEPT; + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +static inline int is_tcp_reset(const struct sk_buff *skb) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + return th->rst; +} + +/* + * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. + * Check if outgoing packet belongs to the established ip_vs_conn, + * rewrite addresses of the packet and send it on its way... 
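is_tcp_reset() above only needs the RST flag, which sits in the TCP flags byte at offset 13 of the TCP header. A trivial userspace equivalent on a raw header buffer:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint8_t tcp_hdr[20] = { 0 };

        tcp_hdr[13] = 0x04;                         /* set the RST bit */
        printf("rst=%d\n", (tcp_hdr[13] >> 2) & 1); /* prints rst=1 */
        return 0;
}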
+ */ +static unsigned int +ip_vs_out(unsigned int hooknum, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ihl; + + EnterFunction(11); + + if (skb->nfcache & NFC_IPVS_PROPERTY) + return NF_ACCEPT; + + iph = skb->nh.iph; + if (unlikely(iph->protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_out_icmp(pskb, &related); + + if (related) + return verdict; + skb = *pskb; + iph = skb->nh.iph; + } + + pp = ip_vs_proto_get(iph->protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + /* reassemble IP fragments */ + if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && + !pp->dont_defrag)) { + skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT); + if (!skb) + return NF_STOLEN; + iph = skb->nh.iph; + *pskb = skb; + } + + ihl = iph->ihl << 2; + + /* + * Check if the packet belongs to an existing entry + */ + cp = pp->conn_out_get(skb, pp, iph, ihl, 0); + + if (unlikely(!cp)) { + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP)) { + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, ihl, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(iph->protocol, + iph->saddr, pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if (iph->protocol != IPPROTO_TCP + || !is_tcp_reset(skb)) { + icmp_send(skb,ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; + } + } + } + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + + if (!ip_vs_make_skb_writable(pskb, ihl)) + goto drop; + + /* mangle the packet */ + if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp)) + goto drop; + skb = *pskb; + skb->nh.iph->saddr = cp->vaddr; + ip_send_check(skb->nh.iph); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + + ip_vs_out_stats(cp, skb); + ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + ip_vs_conn_put(cp); + + skb->nfcache |= NFC_IPVS_PROPERTY; + + LeaveFunction(11); + return NF_ACCEPT; + + drop: + ip_vs_conn_put(cp); + kfree_skb(*pskb); + return NF_STOLEN; +} + + +/* + * Handle ICMP messages in the outside-to-inside direction (incoming). + * Find any that might be relevant, check against existing connections, + * forward to the right destination host if relevant. + * Currently handles error types - unreachable, quench, ttl exceeded. + */ +static int +ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct icmphdr _icmph, *ic; + struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */ + struct ip_vs_conn *cp; + struct ip_vs_protocol *pp; + unsigned int offset, ihl, verdict; + + *related = 1; + + /* reassemble IP fragments */ + if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { + skb = ip_vs_gather_frags(skb, + hooknum == NF_IP_LOCAL_IN ? 
+ IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD); + if (!skb) + return NF_STOLEN; + *pskb = skb; + } + + iph = skb->nh.iph; + offset = ihl = iph->ihl * 4; + ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph); + if (ic == NULL) + return NF_DROP; + + IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", + ic->type, ntohs(icmp_id(ic)), + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + + /* + * Work through seeing if this is for us. + * These checks are supposed to be in an order that means easy + * things are checked first to speed up processing.... however + * this means that some packets will manage to get a long way + * down this stack and then be rejected, but that's life. + */ + if ((ic->type != ICMP_DEST_UNREACH) && + (ic->type != ICMP_SOURCE_QUENCH) && + (ic->type != ICMP_TIME_EXCEEDED)) { + *related = 0; + return NF_ACCEPT; + } + + /* Now find the contained IP header */ + offset += sizeof(_icmph); + cih = skb_header_pointer(skb, offset, sizeof(_ciph), &_ciph); + if (cih == NULL) + return NF_ACCEPT; /* The packet looks wrong, ignore */ + + pp = ip_vs_proto_get(cih->protocol); + if (!pp) + return NF_ACCEPT; + + /* Is the embedded protocol header present? */ + if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) && + pp->dont_defrag)) + return NF_ACCEPT; + + IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + + offset += cih->ihl * 4; + + /* The embedded headers contain source and dest in reverse order */ + cp = pp->conn_in_get(skb, pp, cih, offset, 1); + if (!cp) + return NF_ACCEPT; + + verdict = NF_DROP; + + /* Ensure the checksum is correct */ + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + ip_vs_checksum_complete(skb, ihl)) { + /* Failed checksum! */ + IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n", + NIPQUAD(iph->saddr)); + goto out; + } + + /* do the statistics and put it back */ + ip_vs_in_stats(cp, skb); + if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) + offset += 2 * sizeof(__u16); + verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); + /* do not touch skb anymore */ + + out: + __ip_vs_conn_put(cp); + + return verdict; +} + +/* + * Check if it's for virtual services, look it up, + * and send it on its way... + */ +static unsigned int +ip_vs_in(unsigned int hooknum, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *skb = *pskb; + struct iphdr *iph; + struct ip_vs_protocol *pp; + struct ip_vs_conn *cp; + int ret, restart; + int ihl; + + /* + * Big tappo: only PACKET_HOST (neither loopback nor mcasts) + * ... don't know why 1st test DOES NOT include 2nd (?) + */ + if (unlikely(skb->pkt_type != PACKET_HOST + || skb->dev == &loopback_dev || skb->sk)) { + IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", + skb->pkt_type, + skb->nh.iph->protocol, + NIPQUAD(skb->nh.iph->daddr)); + return NF_ACCEPT; + } + + iph = skb->nh.iph; + if (unlikely(iph->protocol == IPPROTO_ICMP)) { + int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum); + + if (related) + return verdict; + skb = *pskb; + iph = skb->nh.iph; + } + + /* Protocol supported? 
*/ + pp = ip_vs_proto_get(iph->protocol); + if (unlikely(!pp)) + return NF_ACCEPT; + + ihl = iph->ihl << 2; + + /* + * Check if the packet belongs to an existing connection entry + */ + cp = pp->conn_in_get(skb, pp, iph, ihl, 0); + + if (unlikely(!cp)) { + int v; + + if (!pp->conn_schedule(skb, pp, &v, &cp)) + return v; + } + + if (unlikely(!cp)) { + /* sorry, all this trouble for a no-hit :) */ + IP_VS_DBG_PKT(12, pp, skb, 0, + "packet continues traversal as normal"); + return NF_ACCEPT; + } + + IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + + /* Check the server status */ + if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { + /* the destination server is not available */ + + if (sysctl_ip_vs_expire_nodest_conn) { + /* try to expire the connection immediately */ + ip_vs_conn_expire_now(cp); + } else { + /* don't restart its timer, and silently + drop the packet. */ + __ip_vs_conn_put(cp); + } + return NF_DROP; + } + + ip_vs_in_stats(cp, skb); + restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp); + if (cp->packet_xmit) + ret = cp->packet_xmit(skb, cp, pp); + /* do not touch skb anymore */ + else { + IP_VS_DBG_RL("warning: packet_xmit is null"); + ret = NF_ACCEPT; + } + + /* increase its packet counter and check if it is needed + to be synchronized */ + atomic_inc(&cp->in_pkts); + if ((ip_vs_sync_state & IP_VS_STATE_MASTER) && + (cp->protocol != IPPROTO_TCP || + cp->state == IP_VS_TCP_S_ESTABLISHED) && + (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1] + == sysctl_ip_vs_sync_threshold[0])) + ip_vs_sync_conn(cp); + + ip_vs_conn_put(cp); + return ret; +} + + +/* + * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP + * related packets destined for 0.0.0.0/0. + * When fwmark-based virtual service is used, such as transparent + * cache cluster, TCP packets can be marked and routed to ip_vs_in, + * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and + * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain + * and send them to ip_vs_in_icmp. + */ +static unsigned int +ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + int r; + + if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP) + return NF_ACCEPT; + + return ip_vs_in_icmp(pskb, &r, hooknum); +} + + +/* After packet filtering, forward packet through VS/DR, VS/TUN, + or VS/NAT(change destination), so that filtering rules can be + applied to IPVS. 
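The sync check in ip_vs_in() above fires when in_pkts % threshold[1] == threshold[0]; with the defaults of {3, 50} set in ip_vs_ctl.c further down, an established connection is announced to the sync daemon on its 3rd packet and then every 50 packets. A small sketch of which packet counts match:

#include <stdio.h>

int main(void)
{
        int threshold[2] = { 3, 50 };   /* default sync_threshold */
        int pkts;

        for (pkts = 1; pkts <= 120; pkts++)
                if (pkts % threshold[1] == threshold[0])
                        printf("sync at packet %d\n", pkts); /* 3, 53, 103 */
        return 0;
}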
*/ +static struct nf_hook_ops ip_vs_in_ops = { + .hook = ip_vs_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = 100, +}; + +/* After packet filtering, change source only for VS/NAT */ +static struct nf_hook_ops ip_vs_out_ops = { + .hook = ip_vs_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = 100, +}; + +/* After packet filtering (but before ip_vs_out_icmp), catch icmp + destined for 0.0.0.0/0, which is for incoming IPVS connections */ +static struct nf_hook_ops ip_vs_forward_icmp_ops = { + .hook = ip_vs_forward_icmp, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = 99, +}; + +/* Before the netfilter connection tracking, exit from POST_ROUTING */ +static struct nf_hook_ops ip_vs_post_routing_ops = { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, +}; + + +/* + * Initialize IP Virtual Server + */ +static int __init ip_vs_init(void) +{ + int ret; + + ret = ip_vs_control_init(); + if (ret < 0) { + IP_VS_ERR("can't setup control.\n"); + goto cleanup_nothing; + } + + ip_vs_protocol_init(); + + ret = ip_vs_app_init(); + if (ret < 0) { + IP_VS_ERR("can't setup application helper.\n"); + goto cleanup_protocol; + } + + ret = ip_vs_conn_init(); + if (ret < 0) { + IP_VS_ERR("can't setup connection table.\n"); + goto cleanup_app; + } + + ret = nf_register_hook(&ip_vs_in_ops); + if (ret < 0) { + IP_VS_ERR("can't register in hook.\n"); + goto cleanup_conn; + } + + ret = nf_register_hook(&ip_vs_out_ops); + if (ret < 0) { + IP_VS_ERR("can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_vs_post_routing_ops); + if (ret < 0) { + IP_VS_ERR("can't register post_routing hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_vs_forward_icmp_ops); + if (ret < 0) { + IP_VS_ERR("can't register forward_icmp hook.\n"); + goto cleanup_postroutingops; + } + + IP_VS_INFO("ipvs loaded.\n"); + return ret; + + cleanup_postroutingops: + nf_unregister_hook(&ip_vs_post_routing_ops); + cleanup_outops: + nf_unregister_hook(&ip_vs_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_vs_in_ops); + cleanup_conn: + ip_vs_conn_cleanup(); + cleanup_app: + ip_vs_app_cleanup(); + cleanup_protocol: + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + cleanup_nothing: + return ret; +} + +static void __exit ip_vs_cleanup(void) +{ + nf_unregister_hook(&ip_vs_forward_icmp_ops); + nf_unregister_hook(&ip_vs_post_routing_ops); + nf_unregister_hook(&ip_vs_out_ops); + nf_unregister_hook(&ip_vs_in_ops); + ip_vs_conn_cleanup(); + ip_vs_app_cleanup(); + ip_vs_protocol_cleanup(); + ip_vs_control_cleanup(); + IP_VS_INFO("ipvs unloaded.\n"); +} + +module_init(ip_vs_init); +module_exit(ip_vs_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c new file mode 100644 index 000000000000..218d9701036e --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -0,0 +1,2391 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. 
+ * + * Version: $Id: ip_vs_ctl.c,v 1.36 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +#include + +/* semaphore for IPVS sockopts. And, [gs]etsockopt may sleep. */ +static DECLARE_MUTEX(__ip_vs_mutex); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_svc_lock); + +/* lock for table with the real services */ +static DEFINE_RWLOCK(__ip_vs_rs_lock); + +/* lock for state and timeout tables */ +static DEFINE_RWLOCK(__ip_vs_securetcp_lock); + +/* lock for drop entry handling */ +static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); + +/* lock for drop packet handling */ +static DEFINE_SPINLOCK(__ip_vs_droppacket_lock); + +/* 1/rate drop and drop-entry variables */ +int ip_vs_drop_rate = 0; +int ip_vs_drop_counter = 0; +static atomic_t ip_vs_dropentry = ATOMIC_INIT(0); + +/* number of virtual services */ +static int ip_vs_num_services = 0; + +/* sysctl variables */ +static int sysctl_ip_vs_drop_entry = 0; +static int sysctl_ip_vs_drop_packet = 0; +static int sysctl_ip_vs_secure_tcp = 0; +static int sysctl_ip_vs_amemthresh = 1024; +static int sysctl_ip_vs_am_droprate = 10; +int sysctl_ip_vs_cache_bypass = 0; +int sysctl_ip_vs_expire_nodest_conn = 0; +int sysctl_ip_vs_expire_quiescent_template = 0; +int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; +int sysctl_ip_vs_nat_icmp_send = 0; + + +#ifdef CONFIG_IP_VS_DEBUG +static int sysctl_ip_vs_debug_level = 0; + +int ip_vs_get_debug_level(void) +{ + return sysctl_ip_vs_debug_level; +} +#endif + +/* + * update_defense_level is called from keventd and from sysctl. 
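The drop_packet defense in update_defense_level() below derives a 1-in-N drop rate from how far free plus buffered memory (in pages) has fallen below amemthresh, whose default of 1024 is set above; the closer available memory is to the threshold, the larger N and the gentler the dropping. A small sketch of that formula with illustrative memory values:

#include <stdio.h>

int main(void)
{
        int amemthresh = 1024;          /* default sysctl_ip_vs_amemthresh */
        int availmem;

        /* the kernel only computes this when availmem < amemthresh */
        for (availmem = 896; availmem >= 128; availmem -= 256)
                printf("availmem=%4d -> drop 1 of every %d packets\n",
                       availmem, amemthresh / (amemthresh - availmem));
        return 0;
}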
+ */ +static void update_defense_level(void) +{ + struct sysinfo i; + static int old_secure_tcp = 0; + int availmem; + int nomem; + int to_change = -1; + + /* we only count free and buffered memory (in pages) */ + si_meminfo(&i); + availmem = i.freeram + i.bufferram; + /* however in linux 2.5 the i.bufferram is total page cache size, + we need adjust it */ + /* si_swapinfo(&i); */ + /* availmem = availmem - (i.totalswap - i.freeswap); */ + + nomem = (availmem < sysctl_ip_vs_amemthresh); + + /* drop_entry */ + spin_lock(&__ip_vs_dropentry_lock); + switch (sysctl_ip_vs_drop_entry) { + case 0: + atomic_set(&ip_vs_dropentry, 0); + break; + case 1: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + sysctl_ip_vs_drop_entry = 2; + } else { + atomic_set(&ip_vs_dropentry, 0); + } + break; + case 2: + if (nomem) { + atomic_set(&ip_vs_dropentry, 1); + } else { + atomic_set(&ip_vs_dropentry, 0); + sysctl_ip_vs_drop_entry = 1; + }; + break; + case 3: + atomic_set(&ip_vs_dropentry, 1); + break; + } + spin_unlock(&__ip_vs_dropentry_lock); + + /* drop_packet */ + spin_lock(&__ip_vs_droppacket_lock); + switch (sysctl_ip_vs_drop_packet) { + case 0: + ip_vs_drop_rate = 0; + break; + case 1: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + sysctl_ip_vs_drop_packet = 2; + } else { + ip_vs_drop_rate = 0; + } + break; + case 2: + if (nomem) { + ip_vs_drop_rate = ip_vs_drop_counter + = sysctl_ip_vs_amemthresh / + (sysctl_ip_vs_amemthresh-availmem); + } else { + ip_vs_drop_rate = 0; + sysctl_ip_vs_drop_packet = 1; + } + break; + case 3: + ip_vs_drop_rate = sysctl_ip_vs_am_droprate; + break; + } + spin_unlock(&__ip_vs_droppacket_lock); + + /* secure_tcp */ + write_lock(&__ip_vs_securetcp_lock); + switch (sysctl_ip_vs_secure_tcp) { + case 0: + if (old_secure_tcp >= 2) + to_change = 0; + break; + case 1: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + sysctl_ip_vs_secure_tcp = 2; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + } + break; + case 2: + if (nomem) { + if (old_secure_tcp < 2) + to_change = 1; + } else { + if (old_secure_tcp >= 2) + to_change = 0; + sysctl_ip_vs_secure_tcp = 1; + } + break; + case 3: + if (old_secure_tcp < 2) + to_change = 1; + break; + } + old_secure_tcp = sysctl_ip_vs_secure_tcp; + if (to_change >= 0) + ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); + write_unlock(&__ip_vs_securetcp_lock); +} + + +/* + * Timer for checking the defense + */ +#define DEFENSE_TIMER_PERIOD 1*HZ +static void defense_work_handler(void *data); +static DECLARE_WORK(defense_work, defense_work_handler, NULL); + +static void defense_work_handler(void *data) +{ + update_defense_level(); + if (atomic_read(&ip_vs_dropentry)) + ip_vs_random_dropentry(); + + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); +} + +int +ip_vs_use_count_inc(void) +{ + return try_module_get(THIS_MODULE); +} + +void +ip_vs_use_count_dec(void) +{ + module_put(THIS_MODULE); +} + + +/* + * Hash table: for virtual service lookups + */ +#define IP_VS_SVC_TAB_BITS 8 +#define IP_VS_SVC_TAB_SIZE (1 << IP_VS_SVC_TAB_BITS) +#define IP_VS_SVC_TAB_MASK (IP_VS_SVC_TAB_SIZE - 1) + +/* the service table hashed by */ +static struct list_head ip_vs_svc_table[IP_VS_SVC_TAB_SIZE]; +/* the service table hashed by fwmark */ +static struct list_head ip_vs_svc_fwm_table[IP_VS_SVC_TAB_SIZE]; + +/* + * Hash table: for real service lookups + */ +#define IP_VS_RTAB_BITS 4 +#define IP_VS_RTAB_SIZE (1 << IP_VS_RTAB_BITS) +#define IP_VS_RTAB_MASK 
(IP_VS_RTAB_SIZE - 1) + +static struct list_head ip_vs_rtable[IP_VS_RTAB_SIZE]; + +/* + * Trash for destinations + */ +static LIST_HEAD(ip_vs_dest_trash); + +/* + * FTP & NULL virtual service counters + */ +static atomic_t ip_vs_ftpsvc_counter = ATOMIC_INIT(0); +static atomic_t ip_vs_nullsvc_counter = ATOMIC_INIT(0); + + +/* + * Returns hash value for virtual service + */ +static __inline__ unsigned +ip_vs_svc_hashkey(unsigned proto, __u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (proto^ntohl(addr)^(porth>>IP_VS_SVC_TAB_BITS)^porth) + & IP_VS_SVC_TAB_MASK; +} + +/* + * Returns hash value of fwmark for virtual service lookup + */ +static __inline__ unsigned ip_vs_svc_fwm_hashkey(__u32 fwmark) +{ + return fwmark & IP_VS_SVC_TAB_MASK; +} + +/* + * Hashes a service in the ip_vs_svc_table by + * or in the ip_vs_svc_fwm_table by fwmark. + * Should be called with locked tables. + */ +static int ip_vs_svc_hash(struct ip_vs_service *svc) +{ + unsigned hash; + + if (svc->flags & IP_VS_SVC_F_HASHED) { + IP_VS_ERR("ip_vs_svc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* + * Hash it by in ip_vs_svc_table + */ + hash = ip_vs_svc_hashkey(svc->protocol, svc->addr, svc->port); + list_add(&svc->s_list, &ip_vs_svc_table[hash]); + } else { + /* + * Hash it by fwmark in ip_vs_svc_fwm_table + */ + hash = ip_vs_svc_fwm_hashkey(svc->fwmark); + list_add(&svc->f_list, &ip_vs_svc_fwm_table[hash]); + } + + svc->flags |= IP_VS_SVC_F_HASHED; + /* increase its refcnt because it is referenced by the svc table */ + atomic_inc(&svc->refcnt); + return 1; +} + + +/* + * Unhashes a service from ip_vs_svc_table/ip_vs_svc_fwm_table. + * Should be called with locked tables. + */ +static int ip_vs_svc_unhash(struct ip_vs_service *svc) +{ + if (!(svc->flags & IP_VS_SVC_F_HASHED)) { + IP_VS_ERR("ip_vs_svc_unhash(): request for unhash flagged, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + if (svc->fwmark == 0) { + /* Remove it from the ip_vs_svc_table table */ + list_del(&svc->s_list); + } else { + /* Remove it from the ip_vs_svc_fwm_table table */ + list_del(&svc->f_list); + } + + svc->flags &= ~IP_VS_SVC_F_HASHED; + atomic_dec(&svc->refcnt); + return 1; +} + + +/* + * Get service by {proto,addr,port} in the service table. + */ +static __inline__ struct ip_vs_service * +__ip_vs_service_get(__u16 protocol, __u32 vaddr, __u16 vport) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for "full" addressed entries */ + hash = ip_vs_svc_hashkey(protocol, vaddr, vport); + + list_for_each_entry(svc, &ip_vs_svc_table[hash], s_list){ + if ((svc->addr == vaddr) + && (svc->port == vport) + && (svc->protocol == protocol)) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + + +/* + * Get service by {fwmark} in the service table. 
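ip_vs_svc_hashkey() above folds protocol, address and port into one of 256 buckets (IP_VS_SVC_TAB_BITS is 8). A standalone sketch of the same hash for an illustrative TCP virtual service:

#include <stdio.h>
#include <stdint.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define TAB_BITS 8
#define TAB_MASK ((1 << TAB_BITS) - 1)

static unsigned svc_hashkey(unsigned proto, uint32_t addr_be, uint16_t port_be)
{
        unsigned porth = ntohs(port_be);

        return (proto ^ ntohl(addr_be) ^ (porth >> TAB_BITS) ^ porth) & TAB_MASK;
}

int main(void)
{
        uint32_t vip = inet_addr("192.168.10.50");      /* example VIP */

        printf("bucket %u of %u\n",
               svc_hashkey(IPPROTO_TCP, vip, htons(80)),
               (unsigned)(TAB_MASK + 1));
        return 0;
}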
+ */ +static __inline__ struct ip_vs_service *__ip_vs_svc_fwm_get(__u32 fwmark) +{ + unsigned hash; + struct ip_vs_service *svc; + + /* Check for fwmark addressed entries */ + hash = ip_vs_svc_fwm_hashkey(fwmark); + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { + if (svc->fwmark == fwmark) { + /* HIT */ + atomic_inc(&svc->usecnt); + return svc; + } + } + + return NULL; +} + +struct ip_vs_service * +ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) +{ + struct ip_vs_service *svc; + + read_lock(&__ip_vs_svc_lock); + + /* + * Check the table hashed by fwmark first + */ + if (fwmark && (svc = __ip_vs_svc_fwm_get(fwmark))) + goto out; + + /* + * Check the table hashed by + * for "full" addressed entries + */ + svc = __ip_vs_service_get(protocol, vaddr, vport); + + if (svc == NULL + && protocol == IPPROTO_TCP + && atomic_read(&ip_vs_ftpsvc_counter) + && (vport == FTPDATA || ntohs(vport) >= PROT_SOCK)) { + /* + * Check if ftp service entry exists, the packet + * might belong to FTP data connections. + */ + svc = __ip_vs_service_get(protocol, vaddr, FTPPORT); + } + + if (svc == NULL + && atomic_read(&ip_vs_nullsvc_counter)) { + /* + * Check if the catch-all port (port zero) exists + */ + svc = __ip_vs_service_get(protocol, vaddr, 0); + } + + out: + read_unlock(&__ip_vs_svc_lock); + + IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + fwmark, ip_vs_proto_name(protocol), + NIPQUAD(vaddr), ntohs(vport), + svc?"hit":"not hit"); + + return svc; +} + + +static inline void +__ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + atomic_inc(&svc->refcnt); + dest->svc = svc; +} + +static inline void +__ip_vs_unbind_svc(struct ip_vs_dest *dest) +{ + struct ip_vs_service *svc = dest->svc; + + dest->svc = NULL; + if (atomic_dec_and_test(&svc->refcnt)) + kfree(svc); +} + + +/* + * Returns hash value for real service + */ +static __inline__ unsigned ip_vs_rs_hashkey(__u32 addr, __u16 port) +{ + register unsigned porth = ntohs(port); + + return (ntohl(addr)^(porth>>IP_VS_RTAB_BITS)^porth) + & IP_VS_RTAB_MASK; +} + +/* + * Hashes ip_vs_dest in ip_vs_rtable by . + * should be called with locked tables. + */ +static int ip_vs_rs_hash(struct ip_vs_dest *dest) +{ + unsigned hash; + + if (!list_empty(&dest->d_list)) { + return 0; + } + + /* + * Hash by proto,addr,port, + * which are the parameters of the real service. + */ + hash = ip_vs_rs_hashkey(dest->addr, dest->port); + list_add(&dest->d_list, &ip_vs_rtable[hash]); + + return 1; +} + +/* + * UNhashes ip_vs_dest from ip_vs_rtable. + * should be called with locked tables. + */ +static int ip_vs_rs_unhash(struct ip_vs_dest *dest) +{ + /* + * Remove it from the ip_vs_rtable table. + */ + if (!list_empty(&dest->d_list)) { + list_del(&dest->d_list); + INIT_LIST_HEAD(&dest->d_list); + } + + return 1; +} + +/* + * Lookup real service by in the real service table. 
+ */ +struct ip_vs_dest * +ip_vs_lookup_real_service(__u16 protocol, __u32 daddr, __u16 dport) +{ + unsigned hash; + struct ip_vs_dest *dest; + + /* + * Check for "full" addressed entries + * Return the first found entry + */ + hash = ip_vs_rs_hashkey(daddr, dport); + + read_lock(&__ip_vs_rs_lock); + list_for_each_entry(dest, &ip_vs_rtable[hash], d_list) { + if ((dest->addr == daddr) + && (dest->port == dport) + && ((dest->protocol == protocol) || + dest->vfwmark)) { + /* HIT */ + read_unlock(&__ip_vs_rs_lock); + return dest; + } + } + read_unlock(&__ip_vs_rs_lock); + + return NULL; +} + +/* + * Lookup destination by {addr,port} in the given service + */ +static struct ip_vs_dest * +ip_vs_lookup_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest; + + /* + * Find the destination for the given service + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->addr == daddr) && (dest->port == dport)) { + /* HIT */ + return dest; + } + } + + return NULL; +} + + +/* + * Lookup dest by {svc,addr,port} in the destination trash. + * The destination trash is used to hold the destinations that are removed + * from the service table but are still referenced by some conn entries. + * The reason to add the destination trash is when the dest is temporary + * down (either by administrator or by monitor program), the dest can be + * picked back from the trash, the remaining connections to the dest can + * continue, and the counting information of the dest is also useful for + * scheduling. + */ +static struct ip_vs_dest * +ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) +{ + struct ip_vs_dest *dest, *nxt; + + /* + * Find the destination in trash + */ + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " + "refcnt=%d\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + if (dest->addr == daddr && + dest->port == dport && + dest->vfwmark == svc->fwmark && + dest->protocol == svc->protocol && + (svc->fwmark || + (dest->vaddr == svc->addr && + dest->vport == svc->port))) { + /* HIT */ + return dest; + } + + /* + * Try to purge the destination from trash if not referenced + */ + if (atomic_read(&dest->refcnt) == 1) { + IP_VS_DBG(3, "Removing destination %u/%u.%u.%u.%u:%u " + "from trash\n", + dest->vfwmark, + NIPQUAD(dest->addr), ntohs(dest->port)); + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } + } + + return NULL; +} + + +/* + * Clean up all the destinations in the trash + * Called by the ip_vs_control_cleanup() + * + * When the ip_vs_control_clearup is activated by ipvs module exit, + * the service tables must have been flushed and all the connections + * are expired, and the refcnt of each destination in the trash must + * be 1, so we simply release them here. 
+ */ +static void ip_vs_trash_cleanup(void) +{ + struct ip_vs_dest *dest, *nxt; + + list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { + list_del(&dest->n_list); + ip_vs_dst_reset(dest); + __ip_vs_unbind_svc(dest); + kfree(dest); + } +} + + +static void +ip_vs_zero_stats(struct ip_vs_stats *stats) +{ + spin_lock_bh(&stats->lock); + memset(stats, 0, (char *)&stats->lock - (char *)stats); + spin_unlock_bh(&stats->lock); + ip_vs_zero_estimator(stats); +} + +/* + * Update a destination in the given service + */ +static void +__ip_vs_update_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, struct ip_vs_dest_user *udest) +{ + int conn_flags; + + /* set the weight and the flags */ + atomic_set(&dest->weight, udest->weight); + conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + + /* check if local node and update the flags */ + if (inet_addr_type(udest->addr) == RTN_LOCAL) { + conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) + | IP_VS_CONN_F_LOCALNODE; + } + + /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + conn_flags |= IP_VS_CONN_F_NOOUTPUT; + } else { + /* + * Put the real service in ip_vs_rtable if not present. + * For now only for NAT! + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_hash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + } + atomic_set(&dest->conn_flags, conn_flags); + + /* bind the service */ + if (!dest->svc) { + __ip_vs_bind_svc(dest, svc); + } else { + if (dest->svc != svc) { + __ip_vs_unbind_svc(dest); + ip_vs_zero_stats(&dest->stats); + __ip_vs_bind_svc(dest, svc); + } + } + + /* set the dest status flags */ + dest->flags |= IP_VS_DEST_F_AVAILABLE; + + if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold) + dest->flags &= ~IP_VS_DEST_F_OVERLOAD; + dest->u_threshold = udest->u_threshold; + dest->l_threshold = udest->l_threshold; +} + + +/* + * Create a destination for the given service + */ +static int +ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest, + struct ip_vs_dest **dest_p) +{ + struct ip_vs_dest *dest; + unsigned atype; + + EnterFunction(2); + + atype = inet_addr_type(udest->addr); + if (atype != RTN_LOCAL && atype != RTN_UNICAST) + return -EINVAL; + + dest = kmalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + if (dest == NULL) { + IP_VS_ERR("ip_vs_new_dest: kmalloc failed.\n"); + return -ENOMEM; + } + memset(dest, 0, sizeof(struct ip_vs_dest)); + + dest->protocol = svc->protocol; + dest->vaddr = svc->addr; + dest->vport = svc->port; + dest->vfwmark = svc->fwmark; + dest->addr = udest->addr; + dest->port = udest->port; + + atomic_set(&dest->activeconns, 0); + atomic_set(&dest->inactconns, 0); + atomic_set(&dest->persistconns, 0); + atomic_set(&dest->refcnt, 0); + + INIT_LIST_HEAD(&dest->d_list); + spin_lock_init(&dest->dst_lock); + spin_lock_init(&dest->stats.lock); + __ip_vs_update_dest(svc, dest, udest); + ip_vs_new_estimator(&dest->stats); + + *dest_p = dest; + + LeaveFunction(2); + return 0; +} + + +/* + * Add a destination into an existing service + */ +static int +ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + int ret; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_add_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_add_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; 
+ } + + /* + * Check if the dest already exists in the list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(1, "ip_vs_add_dest(): dest already exists\n"); + return -EEXIST; + } + + /* + * Check if the dest already exists in the trash and + * is from the same service + */ + dest = ip_vs_trash_get_dest(svc, daddr, dport); + if (dest != NULL) { + IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " + "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + NIPQUAD(daddr), ntohs(dport), + atomic_read(&dest->refcnt), + dest->vfwmark, + NIPQUAD(dest->vaddr), + ntohs(dest->vport)); + __ip_vs_update_dest(svc, dest, udest); + + /* + * Get the destination from the trash + */ + list_del(&dest->n_list); + + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + return 0; + } + + /* + * Allocate and initialize the dest structure + */ + ret = ip_vs_new_dest(svc, udest, &dest); + if (ret) { + return ret; + } + + /* + * Add the dest entry into the list + */ + atomic_inc(&dest->refcnt); + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + + /* call the update_service function of its scheduler */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Edit a destination in the given service + */ +static int +ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + + EnterFunction(2); + + if (udest->weight < 0) { + IP_VS_ERR("ip_vs_edit_dest(): server weight less than zero\n"); + return -ERANGE; + } + + if (udest->l_threshold > udest->u_threshold) { + IP_VS_ERR("ip_vs_edit_dest(): lower threshold is higher than " + "upper threshold\n"); + return -ERANGE; + } + + /* + * Lookup the destination list + */ + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_edit_dest(): dest doesn't exist\n"); + return -ENOENT; + } + + __ip_vs_update_dest(svc, dest, udest); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + while (atomic_read(&svc->usecnt) > 1) {}; + + /* call the update_service, because server weight may be changed */ + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + LeaveFunction(2); + + return 0; +} + + +/* + * Delete a destination (must be already unlinked from the service) + */ +static void __ip_vs_del_dest(struct ip_vs_dest *dest) +{ + ip_vs_kill_estimator(&dest->stats); + + /* + * Remove it from the d-linked list with the real services. + */ + write_lock_bh(&__ip_vs_rs_lock); + ip_vs_rs_unhash(dest); + write_unlock_bh(&__ip_vs_rs_lock); + + /* + * Decrease the refcnt of the dest, and free the dest + * if nobody refers to it (refcnt=0). Otherwise, throw + * the destination into the trash. + */ + if (atomic_dec_and_test(&dest->refcnt)) { + ip_vs_dst_reset(dest); + /* simply decrease svc->refcnt here, let the caller check + and release the service if nobody refers to it. 
+ Only user context can release destination and service, + and only one user context can update virtual service at a + time, so the operation here is OK */ + atomic_dec(&dest->svc->refcnt); + kfree(dest); + } else { + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->refcnt)); + list_add(&dest->n_list, &ip_vs_dest_trash); + atomic_inc(&dest->refcnt); + } +} + + +/* + * Unlink a destination from the given service + */ +static void __ip_vs_unlink_dest(struct ip_vs_service *svc, + struct ip_vs_dest *dest, + int svcupd) +{ + dest->flags &= ~IP_VS_DEST_F_AVAILABLE; + + /* + * Remove it from the d-linked destination list. + */ + list_del(&dest->n_list); + svc->num_dests--; + if (svcupd) { + /* + * Call the update_service function of its scheduler + */ + svc->scheduler->update_service(svc); + } +} + + +/* + * Delete a destination server in the given service + */ +static int +ip_vs_del_dest(struct ip_vs_service *svc,struct ip_vs_dest_user *udest) +{ + struct ip_vs_dest *dest; + __u32 daddr = udest->addr; + __u16 dport = udest->port; + + EnterFunction(2); + + dest = ip_vs_lookup_dest(svc, daddr, dport); + if (dest == NULL) { + IP_VS_DBG(1, "ip_vs_del_dest(): destination not found!\n"); + return -ENOENT; + } + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Unlink dest from the service + */ + __ip_vs_unlink_dest(svc, dest, 1); + + write_unlock_bh(&__ip_vs_svc_lock); + + /* + * Delete the destination + */ + __ip_vs_del_dest(dest); + + LeaveFunction(2); + + return 0; +} + + +/* + * Add a service into the service hash table + */ +static int +ip_vs_add_service(struct ip_vs_service_user *u, struct ip_vs_service **svc_p) +{ + int ret = 0; + struct ip_vs_scheduler *sched = NULL; + struct ip_vs_service *svc = NULL; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* Lookup the scheduler by 'u->sched_name' */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + ret = -ENOENT; + goto out_mod_dec; + } + + svc = (struct ip_vs_service *) + kmalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + if (svc == NULL) { + IP_VS_DBG(1, "ip_vs_add_service: kmalloc failed.\n"); + ret = -ENOMEM; + goto out_err; + } + memset(svc, 0, sizeof(struct ip_vs_service)); + + /* I'm the first user of the service */ + atomic_set(&svc->usecnt, 1); + atomic_set(&svc->refcnt, 0); + + svc->protocol = u->protocol; + svc->addr = u->addr; + svc->port = u->port; + svc->fwmark = u->fwmark; + svc->flags = u->flags; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + INIT_LIST_HEAD(&svc->destinations); + rwlock_init(&svc->sched_lock); + spin_lock_init(&svc->stats.lock); + + /* Bind the scheduler */ + ret = ip_vs_bind_scheduler(svc, sched); + if (ret) + goto out_err; + sched = NULL; + + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ip_vs_nullsvc_counter); + + ip_vs_new_estimator(&svc->stats); + ip_vs_num_services++; + + /* Hash the service into the service table */ + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_hash(svc); + write_unlock_bh(&__ip_vs_svc_lock); + + *svc_p = svc; + return 0; + + out_err: + if (svc != NULL) { + if (svc->scheduler) + ip_vs_unbind_scheduler(svc); + if (svc->inc) { + local_bh_disable(); + 
ip_vs_app_inc_put(svc->inc); + local_bh_enable(); + } + kfree(svc); + } + ip_vs_scheduler_put(sched); + + out_mod_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +/* + * Edit a service and bind it with a new scheduler + */ +static int +ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user *u) +{ + struct ip_vs_scheduler *sched, *old_sched; + int ret = 0; + + /* + * Lookup the scheduler, by 'u->sched_name' + */ + sched = ip_vs_scheduler_get(u->sched_name); + if (sched == NULL) { + IP_VS_INFO("Scheduler module ip_vs_%s not found\n", + u->sched_name); + return -ENOENT; + } + old_sched = sched; + + write_lock_bh(&__ip_vs_svc_lock); + + /* + * Wait until all other svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + /* + * Set the flags and timeout value + */ + svc->flags = u->flags | IP_VS_SVC_F_HASHED; + svc->timeout = u->timeout * HZ; + svc->netmask = u->netmask; + + old_sched = svc->scheduler; + if (sched != old_sched) { + /* + * Unbind the old scheduler + */ + if ((ret = ip_vs_unbind_scheduler(svc))) { + old_sched = sched; + goto out; + } + + /* + * Bind the new scheduler + */ + if ((ret = ip_vs_bind_scheduler(svc, sched))) { + /* + * If ip_vs_bind_scheduler fails, restore the old + * scheduler. + * The main reason of failure is out of memory. + * + * The question is if the old scheduler can be + * restored all the time. TODO: if it cannot be + * restored some time, we must delete the service, + * otherwise the system may crash. + */ + ip_vs_bind_scheduler(svc, old_sched); + old_sched = sched; + goto out; + } + } + + out: + write_unlock_bh(&__ip_vs_svc_lock); + + if (old_sched) + ip_vs_scheduler_put(old_sched); + + return ret; +} + + +/* + * Delete a service from the service list + * - The service must be unlinked, unlocked and not referenced! + * - We are called under _bh lock + */ +static void __ip_vs_del_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest, *nxt; + struct ip_vs_scheduler *old_sched; + + ip_vs_num_services--; + ip_vs_kill_estimator(&svc->stats); + + /* Unbind scheduler */ + old_sched = svc->scheduler; + ip_vs_unbind_scheduler(svc); + if (old_sched) + ip_vs_scheduler_put(old_sched); + + /* Unbind app inc */ + if (svc->inc) { + ip_vs_app_inc_put(svc->inc); + svc->inc = NULL; + } + + /* + * Unlink the whole destination list + */ + list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) { + __ip_vs_unlink_dest(svc, dest, 0); + __ip_vs_del_dest(dest); + } + + /* + * Update the virtual service counters + */ + if (svc->port == FTPPORT) + atomic_dec(&ip_vs_ftpsvc_counter); + else if (svc->port == 0) + atomic_dec(&ip_vs_nullsvc_counter); + + /* + * Free the service if nobody refers to it + */ + if (atomic_read(&svc->refcnt) == 0) + kfree(svc); + + /* decrease the module use count */ + ip_vs_use_count_dec(); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + + /* + * Unhash it from the service table + */ + write_lock_bh(&__ip_vs_svc_lock); + + ip_vs_svc_unhash(svc); + + /* + * Wait until all the svc users go away. 
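A minimal sketch of the refcount decision made in __ip_vs_del_dest() above: if the caller held the last reference the destination is freed, otherwise it is parked on the trash list so existing connections keep working (plain ints stand in for atomic_t; the names are illustrative):

#include <stdio.h>

struct fake_dest { int refcnt; };

static void del_dest(struct fake_dest *d)
{
        if (--d->refcnt == 0)
                printf("refcnt hit 0: free the dest\n");
        else
                printf("still %d reference(s): move dest to trash\n", d->refcnt);
}

int main(void)
{
        struct fake_dest idle = { .refcnt = 1 };  /* only the service holds it */
        struct fake_dest busy = { .refcnt = 3 };  /* connections still bound   */

        del_dest(&idle);
        del_dest(&busy);
        return 0;
}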
+ */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + + __ip_vs_del_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); + + return 0; +} + + +/* + * Flush all the virtual services + */ +static int ip_vs_flush(void) +{ + int idx; + struct ip_vs_service *svc, *nxt; + + /* + * Flush the service table hashed by + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + /* + * Flush the service table hashed by fwmark + */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry_safe(svc, nxt, + &ip_vs_svc_fwm_table[idx], f_list) { + write_lock_bh(&__ip_vs_svc_lock); + ip_vs_svc_unhash(svc); + /* + * Wait until all the svc users go away. + */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + __ip_vs_del_service(svc); + write_unlock_bh(&__ip_vs_svc_lock); + } + } + + return 0; +} + + +/* + * Zero counters in a service or all services + */ +static int ip_vs_zero_service(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + + write_lock_bh(&__ip_vs_svc_lock); + list_for_each_entry(dest, &svc->destinations, n_list) { + ip_vs_zero_stats(&dest->stats); + } + ip_vs_zero_stats(&svc->stats); + write_unlock_bh(&__ip_vs_svc_lock); + return 0; +} + +static int ip_vs_zero_all(void) +{ + int idx; + struct ip_vs_service *svc; + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + ip_vs_zero_service(svc); + } + } + + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + ip_vs_zero_service(svc); + } + } + + ip_vs_zero_stats(&ip_vs_stats); + return 0; +} + + +static int +proc_do_defense_mode(ctl_table *table, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val = *valp; + int rc; + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (*valp != val)) { + if ((*valp < 0) || (*valp > 3)) { + /* Restore the correct value */ + *valp = val; + } else { + local_bh_disable(); + update_defense_level(); + local_bh_enable(); + } + } + return rc; +} + + +static int +proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = table->data; + int val[2]; + int rc; + + /* backup the value first */ + memcpy(val, valp, sizeof(val)); + + rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { + /* Restore the correct value */ + memcpy(valp, val, sizeof(val)); + } + return rc; +} + + +/* + * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/) + */ + +static struct ctl_table vs_vars[] = { + { + .ctl_name = NET_IPV4_VS_AMEMTHRESH, + .procname = "amemthresh", + .data = &sysctl_ip_vs_amemthresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IP_VS_DEBUG + { + .ctl_name = NET_IPV4_VS_DEBUG_LEVEL, + .procname = "debug_level", + .data = &sysctl_ip_vs_debug_level, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = NET_IPV4_VS_AMDROPRATE, + .procname = "am_droprate", + .data = &sysctl_ip_vs_am_droprate, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_DROP_ENTRY, + .procname = "drop_entry", + .data = &sysctl_ip_vs_drop_entry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .ctl_name = NET_IPV4_VS_DROP_PACKET, + .procname = "drop_packet", + .data = &sysctl_ip_vs_drop_packet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, + { + .ctl_name = NET_IPV4_VS_SECURE_TCP, + .procname = "secure_tcp", + .data = &sysctl_ip_vs_secure_tcp, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_do_defense_mode, + }, +#if 0 + { + .ctl_name = NET_IPV4_VS_TO_ES, + .procname = "timeout_established", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ESTABLISHED], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_SS, + .procname = "timeout_synsent", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_SENT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_SR, + .procname = "timeout_synrecv", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYN_RECV], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_FW, + .procname = "timeout_finwait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_FIN_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_TW, + .procname = "timeout_timewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_TIME_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_CL, + .procname = "timeout_close", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_CW, + .procname = "timeout_closewait", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_CLOSE_WAIT], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_LA, + .procname = "timeout_lastack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LAST_ACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_LI, + .procname = "timeout_listen", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_LISTEN], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_SA, + .procname = "timeout_synack", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_SYNACK], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_UDP, + .procname = "timeout_udp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_UDP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_VS_TO_ICMP, + .procname = "timeout_icmp", + .data = &vs_timeout_table_dos.timeout[IP_VS_S_ICMP], + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, +#endif + { + .ctl_name = NET_IPV4_VS_CACHE_BYPASS, + .procname = "cache_bypass", + .data = &sysctl_ip_vs_cache_bypass, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_EXPIRE_NODEST_CONN, + .procname = "expire_nodest_conn", + .data = &sysctl_ip_vs_expire_nodest_conn, + 
.maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, + .procname = "expire_quiescent_template", + .data = &sysctl_ip_vs_expire_quiescent_template, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_VS_SYNC_THRESHOLD, + .procname = "sync_threshold", + .data = &sysctl_ip_vs_sync_threshold, + .maxlen = sizeof(sysctl_ip_vs_sync_threshold), + .mode = 0644, + .proc_handler = &proc_do_sync_threshold, + }, + { + .ctl_name = NET_IPV4_VS_NAT_ICMP_SEND, + .procname = "nat_icmp_send", + .data = &sysctl_ip_vs_nat_icmp_send, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_table[] = { + { + .ctl_name = NET_IPV4_VS, + .procname = "vs", + .mode = 0555, + .child = vs_vars + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = vs_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv4_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +#ifdef CONFIG_PROC_FS + +struct ip_vs_iter { + struct list_head *table; + int bucket; +}; + +/* + * Write the contents of the VS rule table to a PROCfs file. + * (It is kept just for backward compatibility) + */ +static inline const char *ip_vs_fwd_name(unsigned flags) +{ + switch (flags & IP_VS_CONN_F_FWD_MASK) { + case IP_VS_CONN_F_LOCALNODE: + return "Local"; + case IP_VS_CONN_F_TUNNEL: + return "Tunnel"; + case IP_VS_CONN_F_DROUTE: + return "Route"; + default: + return "Masq"; + } +} + + +/* Get the Nth entry in the two lists */ +static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos) +{ + struct ip_vs_iter *iter = seq->private; + int idx; + struct ip_vs_service *svc; + + /* look in hash by protocol */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (pos-- == 0){ + iter->table = ip_vs_svc_table; + iter->bucket = idx; + return svc; + } + } + } + + /* keep looking in fwmark */ + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (pos-- == 0) { + iter->table = ip_vs_svc_fwm_table; + iter->bucket = idx; + return svc; + } + } + } + + return NULL; +} + +static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos) +{ + + read_lock_bh(&__ip_vs_svc_lock); + return *pos ? 
ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN; +} + + +static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *e; + struct ip_vs_iter *iter; + struct ip_vs_service *svc; + + ++*pos; + if (v == SEQ_START_TOKEN) + return ip_vs_info_array(seq,0); + + svc = v; + iter = seq->private; + + if (iter->table == ip_vs_svc_table) { + /* next service in table hashed by protocol */ + if ((e = svc->s_list.next) != &ip_vs_svc_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, s_list); + + + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc,&ip_vs_svc_table[iter->bucket], + s_list) { + return svc; + } + } + + iter->table = ip_vs_svc_fwm_table; + iter->bucket = -1; + goto scan_fwmark; + } + + /* next service in hashed by fwmark */ + if ((e = svc->f_list.next) != &ip_vs_svc_fwm_table[iter->bucket]) + return list_entry(e, struct ip_vs_service, f_list); + + scan_fwmark: + while (++iter->bucket < IP_VS_SVC_TAB_SIZE) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[iter->bucket], + f_list) + return svc; + } + + return NULL; +} + +static void ip_vs_info_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&__ip_vs_svc_lock); +} + + +static int ip_vs_info_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "IP Virtual Server version %d.%d.%d (size=%d)\n", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + seq_puts(seq, + "Prot LocalAddress:Port Scheduler Flags\n"); + seq_puts(seq, + " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n"); + } else { + const struct ip_vs_service *svc = v; + const struct ip_vs_iter *iter = seq->private; + const struct ip_vs_dest *dest; + + if (iter->table == ip_vs_svc_table) + seq_printf(seq, "%s %08X:%04X %s ", + ip_vs_proto_name(svc->protocol), + ntohl(svc->addr), + ntohs(svc->port), + svc->scheduler->name); + else + seq_printf(seq, "FWM %08X %s ", + svc->fwmark, svc->scheduler->name); + + if (svc->flags & IP_VS_SVC_F_PERSISTENT) + seq_printf(seq, "persistent %d %08X\n", + svc->timeout, + ntohl(svc->netmask)); + else + seq_putc(seq, '\n'); + + list_for_each_entry(dest, &svc->destinations, n_list) { + seq_printf(seq, + " -> %08X:%04X %-7s %-6d %-10d %-10d\n", + ntohl(dest->addr), ntohs(dest->port), + ip_vs_fwd_name(atomic_read(&dest->conn_flags)), + atomic_read(&dest->weight), + atomic_read(&dest->activeconns), + atomic_read(&dest->inactconns)); + } + } + return 0; +} + +static struct seq_operations ip_vs_info_seq_ops = { + .start = ip_vs_info_seq_start, + .next = ip_vs_info_seq_next, + .stop = ip_vs_info_seq_stop, + .show = ip_vs_info_seq_show, +}; + +static int ip_vs_info_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ip_vs_iter *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ip_vs_info_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations ip_vs_info_fops = { + .owner = THIS_MODULE, + .open = ip_vs_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +struct ip_vs_stats ip_vs_stats; + +#ifdef CONFIG_PROC_FS +static int ip_vs_stats_show(struct seq_file *seq, void *v) +{ + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Total Incoming Outgoing Incoming Outgoing\n"); + seq_printf(seq, + " Conns Packets 
Packets Bytes Bytes\n"); + + spin_lock_bh(&ip_vs_stats.lock); + seq_printf(seq, "%8X %8X %8X %16LX %16LX\n\n", ip_vs_stats.conns, + ip_vs_stats.inpkts, ip_vs_stats.outpkts, + (unsigned long long) ip_vs_stats.inbytes, + (unsigned long long) ip_vs_stats.outbytes); + +/* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ + seq_puts(seq, + " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n"); + seq_printf(seq,"%8X %8X %8X %16X %16X\n", + ip_vs_stats.cps, + ip_vs_stats.inpps, + ip_vs_stats.outpps, + ip_vs_stats.inbps, + ip_vs_stats.outbps); + spin_unlock_bh(&ip_vs_stats.lock); + + return 0; +} + +static int ip_vs_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, ip_vs_stats_show, NULL); +} + +static struct file_operations ip_vs_stats_fops = { + .owner = THIS_MODULE, + .open = ip_vs_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif + +/* + * Set timeout values for tcp tcpfin udp in the timeout_table. + */ +static int ip_vs_set_timeout(struct ip_vs_timeout_user *u) +{ + IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n", + u->tcp_timeout, + u->tcp_fin_timeout, + u->udp_timeout); + +#ifdef CONFIG_IP_VS_PROTO_TCP + if (u->tcp_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] + = u->tcp_timeout * HZ; + } + + if (u->tcp_fin_timeout) { + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] + = u->tcp_fin_timeout * HZ; + } +#endif + +#ifdef CONFIG_IP_VS_PROTO_UDP + if (u->udp_timeout) { + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] + = u->udp_timeout * HZ; + } +#endif + return 0; +} + + +#define SET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define SERVICE_ARG_LEN (sizeof(struct ip_vs_service_user)) +#define SVCDEST_ARG_LEN (sizeof(struct ip_vs_service_user) + \ + sizeof(struct ip_vs_dest_user)) +#define TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user)) +#define MAX_ARG_LEN SVCDEST_ARG_LEN + +static unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { + [SET_CMDID(IP_VS_SO_SET_ADD)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDIT)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DEL)] = SERVICE_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_FLUSH)] = 0, + [SET_CMDID(IP_VS_SO_SET_ADDDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_DELDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_EDITDEST)] = SVCDEST_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_TIMEOUT)] = TIMEOUT_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STARTDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_STOPDAEMON)] = DAEMON_ARG_LEN, + [SET_CMDID(IP_VS_SO_SET_ZERO)] = SERVICE_ARG_LEN, +}; + +static int +do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + unsigned char arg[MAX_ARG_LEN]; + struct ip_vs_service_user *usvc; + struct ip_vs_service *svc; + struct ip_vs_dest_user *udest; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (len != set_arglen[SET_CMDID(cmd)]) { + IP_VS_ERR("set_ctl: len %u != %u\n", + len, set_arglen[SET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, len) != 0) + return -EFAULT; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (down_interruptible(&__ip_vs_mutex)) { + ret = -ERESTARTSYS; + goto out_dec; + } + + if (cmd == IP_VS_SO_SET_FLUSH) { + /* Flush the virtual service */ + ret = ip_vs_flush(); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_TIMEOUT) { + /* Set timeout values for (tcp tcpfin udp) */ + ret = ip_vs_set_timeout((struct ip_vs_timeout_user 
*)arg); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STARTDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = start_sync_thread(dm->state, dm->mcast_ifn, dm->syncid); + goto out_unlock; + } else if (cmd == IP_VS_SO_SET_STOPDAEMON) { + struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg; + ret = stop_sync_thread(dm->state); + goto out_unlock; + } + + usvc = (struct ip_vs_service_user *)arg; + udest = (struct ip_vs_dest_user *)(usvc + 1); + + if (cmd == IP_VS_SO_SET_ZERO) { + /* if no service address is set, zero counters in all */ + if (!usvc->fwmark && !usvc->addr && !usvc->port) { + ret = ip_vs_zero_all(); + goto out_unlock; + } + } + + /* Check for valid protocol: TCP or UDP, even for fwmark!=0 */ + if (usvc->protocol!=IPPROTO_TCP && usvc->protocol!=IPPROTO_UDP) { + IP_VS_ERR("set_ctl: invalid protocol: %d %d.%d.%d.%d:%d %s\n", + usvc->protocol, NIPQUAD(usvc->addr), + ntohs(usvc->port), usvc->sched_name); + ret = -EFAULT; + goto out_unlock; + } + + /* Lookup the exact service by or fwmark */ + if (usvc->fwmark == 0) + svc = __ip_vs_service_get(usvc->protocol, + usvc->addr, usvc->port); + else + svc = __ip_vs_svc_fwm_get(usvc->fwmark); + + if (cmd != IP_VS_SO_SET_ADD + && (svc == NULL || svc->protocol != usvc->protocol)) { + ret = -ESRCH; + goto out_unlock; + } + + switch (cmd) { + case IP_VS_SO_SET_ADD: + if (svc != NULL) + ret = -EEXIST; + else + ret = ip_vs_add_service(usvc, &svc); + break; + case IP_VS_SO_SET_EDIT: + ret = ip_vs_edit_service(svc, usvc); + break; + case IP_VS_SO_SET_DEL: + ret = ip_vs_del_service(svc); + if (!ret) + goto out_unlock; + break; + case IP_VS_SO_SET_ZERO: + ret = ip_vs_zero_service(svc); + break; + case IP_VS_SO_SET_ADDDEST: + ret = ip_vs_add_dest(svc, udest); + break; + case IP_VS_SO_SET_EDITDEST: + ret = ip_vs_edit_dest(svc, udest); + break; + case IP_VS_SO_SET_DELDEST: + ret = ip_vs_del_dest(svc, udest); + break; + default: + ret = -EINVAL; + } + + if (svc) + ip_vs_service_put(svc); + + out_unlock: + up(&__ip_vs_mutex); + out_dec: + /* decrease the module use count */ + ip_vs_use_count_dec(); + + return ret; +} + + +static void +ip_vs_copy_stats(struct ip_vs_stats_user *dst, struct ip_vs_stats *src) +{ + spin_lock_bh(&src->lock); + memcpy(dst, src, (char*)&src->lock - (char*)src); + spin_unlock_bh(&src->lock); +} + +static void +ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) +{ + dst->protocol = src->protocol; + dst->addr = src->addr; + dst->port = src->port; + dst->fwmark = src->fwmark; + strcpy(dst->sched_name, src->scheduler->name); + dst->flags = src->flags; + dst->timeout = src->timeout / HZ; + dst->netmask = src->netmask; + dst->num_dests = src->num_dests; + ip_vs_copy_stats(&dst->stats, &src->stats); +} + +static inline int +__ip_vs_get_service_entries(const struct ip_vs_get_services *get, + struct ip_vs_get_services __user *uptr) +{ + int idx, count=0; + struct ip_vs_service *svc; + struct ip_vs_service_entry entry; + int ret = 0; + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (count >= get->num_services) + goto out; + ip_vs_copy_service(&entry, svc); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (count >= get->num_services) + goto out; + ip_vs_copy_service(&entry, svc); + if 
(copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + goto out; + } + count++; + } + } + out: + return ret; +} + +static inline int +__ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, + struct ip_vs_get_dests __user *uptr) +{ + struct ip_vs_service *svc; + int ret = 0; + + if (get->fwmark) + svc = __ip_vs_svc_fwm_get(get->fwmark); + else + svc = __ip_vs_service_get(get->protocol, + get->addr, get->port); + if (svc) { + int count = 0; + struct ip_vs_dest *dest; + struct ip_vs_dest_entry entry; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (count >= get->num_dests) + break; + + entry.addr = dest->addr; + entry.port = dest->port; + entry.conn_flags = atomic_read(&dest->conn_flags); + entry.weight = atomic_read(&dest->weight); + entry.u_threshold = dest->u_threshold; + entry.l_threshold = dest->l_threshold; + entry.activeconns = atomic_read(&dest->activeconns); + entry.inactconns = atomic_read(&dest->inactconns); + entry.persistconns = atomic_read(&dest->persistconns); + ip_vs_copy_stats(&entry.stats, &dest->stats); + if (copy_to_user(&uptr->entrytable[count], + &entry, sizeof(entry))) { + ret = -EFAULT; + break; + } + count++; + } + ip_vs_service_put(svc); + } else + ret = -ESRCH; + return ret; +} + +static inline void +__ip_vs_get_timeouts(struct ip_vs_timeout_user *u) +{ +#ifdef CONFIG_IP_VS_PROTO_TCP + u->tcp_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ; + u->tcp_fin_timeout = + ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ; +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + u->udp_timeout = + ip_vs_protocol_udp.timeout_table[IP_VS_UDP_S_NORMAL] / HZ; +#endif +} + + +#define GET_CMDID(cmd) (cmd - IP_VS_BASE_CTL) +#define GET_INFO_ARG_LEN (sizeof(struct ip_vs_getinfo)) +#define GET_SERVICES_ARG_LEN (sizeof(struct ip_vs_get_services)) +#define GET_SERVICE_ARG_LEN (sizeof(struct ip_vs_service_entry)) +#define GET_DESTS_ARG_LEN (sizeof(struct ip_vs_get_dests)) +#define GET_TIMEOUT_ARG_LEN (sizeof(struct ip_vs_timeout_user)) +#define GET_DAEMON_ARG_LEN (sizeof(struct ip_vs_daemon_user) * 2) + +static unsigned char get_arglen[GET_CMDID(IP_VS_SO_GET_MAX)+1] = { + [GET_CMDID(IP_VS_SO_GET_VERSION)] = 64, + [GET_CMDID(IP_VS_SO_GET_INFO)] = GET_INFO_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICES)] = GET_SERVICES_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_SERVICE)] = GET_SERVICE_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DESTS)] = GET_DESTS_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_TIMEOUT)] = GET_TIMEOUT_ARG_LEN, + [GET_CMDID(IP_VS_SO_GET_DAEMON)] = GET_DAEMON_ARG_LEN, +}; + +static int +do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + unsigned char arg[128]; + int ret = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (*len < get_arglen[GET_CMDID(cmd)]) { + IP_VS_ERR("get_ctl: len %u < %u\n", + *len, get_arglen[GET_CMDID(cmd)]); + return -EINVAL; + } + + if (copy_from_user(arg, user, get_arglen[GET_CMDID(cmd)]) != 0) + return -EFAULT; + + if (down_interruptible(&__ip_vs_mutex)) + return -ERESTARTSYS; + + switch (cmd) { + case IP_VS_SO_GET_VERSION: + { + char buf[64]; + + sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)", + NVERSION(IP_VS_VERSION_CODE), IP_VS_CONN_TAB_SIZE); + if (copy_to_user(user, buf, strlen(buf)+1) != 0) { + ret = -EFAULT; + goto out; + } + *len = strlen(buf)+1; + } + break; + + case IP_VS_SO_GET_INFO: + { + struct ip_vs_getinfo info; + info.version = IP_VS_VERSION_CODE; + info.size = IP_VS_CONN_TAB_SIZE; + info.num_services = ip_vs_num_services; + if 
(copy_to_user(user, &info, sizeof(info)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_SERVICES: + { + struct ip_vs_get_services *get; + int size; + + get = (struct ip_vs_get_services *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_service_entry) * get->num_services; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_service_entries(get, user); + } + break; + + case IP_VS_SO_GET_SERVICE: + { + struct ip_vs_service_entry *entry; + struct ip_vs_service *svc; + + entry = (struct ip_vs_service_entry *)arg; + if (entry->fwmark) + svc = __ip_vs_svc_fwm_get(entry->fwmark); + else + svc = __ip_vs_service_get(entry->protocol, + entry->addr, entry->port); + if (svc) { + ip_vs_copy_service(entry, svc); + if (copy_to_user(user, entry, sizeof(*entry)) != 0) + ret = -EFAULT; + ip_vs_service_put(svc); + } else + ret = -ESRCH; + } + break; + + case IP_VS_SO_GET_DESTS: + { + struct ip_vs_get_dests *get; + int size; + + get = (struct ip_vs_get_dests *)arg; + size = sizeof(*get) + + sizeof(struct ip_vs_dest_entry) * get->num_dests; + if (*len != size) { + IP_VS_ERR("length: %u != %u\n", *len, size); + ret = -EINVAL; + goto out; + } + ret = __ip_vs_get_dest_entries(get, user); + } + break; + + case IP_VS_SO_GET_TIMEOUT: + { + struct ip_vs_timeout_user t; + + __ip_vs_get_timeouts(&t); + if (copy_to_user(user, &t, sizeof(t)) != 0) + ret = -EFAULT; + } + break; + + case IP_VS_SO_GET_DAEMON: + { + struct ip_vs_daemon_user d[2]; + + memset(&d, 0, sizeof(d)); + if (ip_vs_sync_state & IP_VS_STATE_MASTER) { + d[0].state = IP_VS_STATE_MASTER; + strcpy(d[0].mcast_ifn, ip_vs_master_mcast_ifn); + d[0].syncid = ip_vs_master_syncid; + } + if (ip_vs_sync_state & IP_VS_STATE_BACKUP) { + d[1].state = IP_VS_STATE_BACKUP; + strcpy(d[1].mcast_ifn, ip_vs_backup_mcast_ifn); + d[1].syncid = ip_vs_backup_syncid; + } + if (copy_to_user(user, &d, sizeof(d)) != 0) + ret = -EFAULT; + } + break; + + default: + ret = -EINVAL; + } + + out: + up(&__ip_vs_mutex); + return ret; +} + + +static struct nf_sockopt_ops ip_vs_sockopts = { + .pf = PF_INET, + .set_optmin = IP_VS_BASE_CTL, + .set_optmax = IP_VS_SO_SET_MAX+1, + .set = do_ip_vs_set_ctl, + .get_optmin = IP_VS_BASE_CTL, + .get_optmax = IP_VS_SO_GET_MAX+1, + .get = do_ip_vs_get_ctl, +}; + + +int ip_vs_control_init(void) +{ + int ret; + int idx; + + EnterFunction(2); + + ret = nf_register_sockopt(&ip_vs_sockopts); + if (ret) { + IP_VS_ERR("cannot register sockopt.\n"); + return ret; + } + + proc_net_fops_create("ip_vs", 0, &ip_vs_info_fops); + proc_net_fops_create("ip_vs_stats",0, &ip_vs_stats_fops); + + sysctl_header = register_sysctl_table(vs_root_table, 0); + + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + + memset(&ip_vs_stats, 0, sizeof(ip_vs_stats)); + spin_lock_init(&ip_vs_stats.lock); + ip_vs_new_estimator(&ip_vs_stats); + + /* Hook the defense timer */ + schedule_delayed_work(&defense_work, DEFENSE_TIMER_PERIOD); + + LeaveFunction(2); + return 0; +} + + +void ip_vs_control_cleanup(void) +{ + EnterFunction(2); + ip_vs_trash_cleanup(); + cancel_rearming_delayed_work(&defense_work); + ip_vs_kill_estimator(&ip_vs_stats); + unregister_sysctl_table(sysctl_header); + proc_net_remove("ip_vs_stats"); + proc_net_remove("ip_vs"); + 
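Both do_ip_vs_set_ctl() and do_ip_vs_get_ctl() above validate the user-supplied buffer length against a per-command table before copying anything in. A self-contained sketch of that table-driven length check; the command names and the argument struct are invented for illustration and do not match the real sockopt layout.

#include <stdio.h>
#include <stddef.h>

enum { CMD_BASE = 100, CMD_ADD = 100, CMD_DEL, CMD_FLUSH, CMD_MAX };
#define CMDID(c) ((c) - CMD_BASE)

struct svc_arg { unsigned int addr, port; };

/* expected argument length, indexed by command id */
static const size_t arglen[CMDID(CMD_MAX) + 1] = {
        [CMDID(CMD_ADD)]   = sizeof(struct svc_arg),
        [CMDID(CMD_DEL)]   = sizeof(struct svc_arg),
        [CMDID(CMD_FLUSH)] = 0,
};

static int check_len(int cmd, size_t len)
{
        if (cmd < CMD_BASE || cmd >= CMD_MAX)
                return -1;
        return len == arglen[CMDID(cmd)] ? 0 : -1;
}

int main(void)
{
        printf("ADD with the right size: %d\n",
               check_len(CMD_ADD, sizeof(struct svc_arg)));
        printf("FLUSH with a stray payload: %d\n", check_len(CMD_FLUSH, 4));
        return 0;
}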
nf_unregister_sockopt(&ip_vs_sockopts); + LeaveFunction(2); +} diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c new file mode 100644 index 000000000000..f3bc320dce93 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -0,0 +1,258 @@ +/* + * IPVS: Destination Hashing scheduling module + * + * Version: $Id: ip_vs_dh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * Inspired by the consistent hashing scheduler patch from + * Thomas Proell + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The dh algorithm is to select server by the hash key of destination IP + * address. The pseudo code is as follows: + * + * n <- servernode[dest_ip]; + * if (n is dead) OR + * (n is overloaded) OR (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet destination IP address to the current server + * array. If the dh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include + +#include + + +/* + * IPVS DH bucket + */ +struct ip_vs_dh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS DH entry hash table + */ +#ifndef CONFIG_IP_VS_DH_TAB_BITS +#define CONFIG_IP_VS_DH_TAB_BITS 8 +#endif +#define IP_VS_DH_TAB_BITS CONFIG_IP_VS_DH_TAB_BITS +#define IP_VS_DH_TAB_SIZE (1 << IP_VS_DH_TAB_BITS) +#define IP_VS_DH_TAB_MASK (IP_VS_DH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS DH entry + */ +static inline unsigned ip_vs_dh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_DH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_dh_get(struct ip_vs_dh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_dh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_dh_assign(struct ip_vs_dh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_dh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; idest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table. 
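The DH scheduler above hashes the packet's destination address into one of 256 buckets with a multiplicative hash (2654435761, the golden-ratio constant) and fills the buckets by cycling through the service's destination list. A userspace sketch of that mapping with the same table size; the server names and the client address are made up.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define DH_TAB_BITS 8
#define DH_TAB_SIZE (1 << DH_TAB_BITS)
#define DH_TAB_MASK (DH_TAB_SIZE - 1)

/* Same multiplicative hash as ip_vs_dh_hashkey(), taking a network-order address. */
static unsigned dh_hashkey(uint32_t addr)
{
        return (ntohl(addr) * 2654435761UL) & DH_TAB_MASK;
}

int main(void)
{
        const char *servers[] = { "rs1", "rs2", "rs3" };
        const char *table[DH_TAB_SIZE];
        unsigned i, h;

        /* fill the buckets by cycling through the destination list */
        for (i = 0; i < DH_TAB_SIZE; i++)
                table[i] = servers[i % 3];

        h = dh_hashkey(inet_addr("192.0.2.10"));
        printf("192.0.2.10 -> bucket %u -> %s\n", h, table[h]);
        return 0;
}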
+ */ +static void ip_vs_dh_flush(struct ip_vs_dh_bucket *tbl) +{ + int i; + struct ip_vs_dh_bucket *b; + + b = tbl; + for (i=0; idest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_dh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl; + + /* allocate the DH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_dh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_dh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "DH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_dh_bucket)*IP_VS_DH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_dh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_dh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_dh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_dh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Destination hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_dh_bucket *tbl; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_dh_bucket *)svc->sched_data; + dest = ip_vs_dh_get(tbl, iph->daddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "DH: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->daddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS DH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_dh_scheduler = +{ + .name = "dh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_dh_init_svc, + .done_service = ip_vs_dh_done_svc, + .update_service = ip_vs_dh_update_svc, + .schedule = ip_vs_dh_schedule, +}; + + +static int __init ip_vs_dh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_dh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +static void __exit ip_vs_dh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_dh_scheduler); +} + + +module_init(ip_vs_dh_init); +module_exit(ip_vs_dh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c new file mode 100644 index 000000000000..67b3e2fc1fa1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -0,0 +1,200 @@ +/* + * ip_vs_est.c: simple rate estimator for IPVS + * + * Version: $Id: ip_vs_est.c,v 1.4 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published 
by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ +#include +#include + +#include + +/* + This code is to estimate rate in a shorter interval (such as 8 + seconds) for virtual services and real servers. For measure rate in a + long interval, it is easy to implement a user level daemon which + periodically reads those statistical counters and measure rate. + + Currently, the measurement is activated by slow timer handler. Hope + this measurement will not introduce too much load. + + We measure rate during the last 8 seconds every 2 seconds: + + avgrate = avgrate*(1-W) + rate*W + + where W = 2^(-2) + + NOTES. + + * The stored value for average bps is scaled by 2^5, so that maximal + rate is ~2.15Gbits/s, average pps and cps are scaled by 2^10. + + * A lot code is taken from net/sched/estimator.c + */ + + +struct ip_vs_estimator +{ + struct ip_vs_estimator *next; + struct ip_vs_stats *stats; + + u32 last_conns; + u32 last_inpkts; + u32 last_outpkts; + u64 last_inbytes; + u64 last_outbytes; + + u32 cps; + u32 inpps; + u32 outpps; + u32 inbps; + u32 outbps; +}; + + +static struct ip_vs_estimator *est_list = NULL; +static DEFINE_RWLOCK(est_lock); +static struct timer_list est_timer; + +static void estimation_timer(unsigned long arg) +{ + struct ip_vs_estimator *e; + struct ip_vs_stats *s; + u32 n_conns; + u32 n_inpkts, n_outpkts; + u64 n_inbytes, n_outbytes; + u32 rate; + + read_lock(&est_lock); + for (e = est_list; e; e = e->next) { + s = e->stats; + + spin_lock(&s->lock); + n_conns = s->conns; + n_inpkts = s->inpkts; + n_outpkts = s->outpkts; + n_inbytes = s->inbytes; + n_outbytes = s->outbytes; + + /* scaled by 2^10, but divided 2 seconds */ + rate = (n_conns - e->last_conns)<<9; + e->last_conns = n_conns; + e->cps += ((long)rate - (long)e->cps)>>2; + s->cps = (e->cps+0x1FF)>>10; + + rate = (n_inpkts - e->last_inpkts)<<9; + e->last_inpkts = n_inpkts; + e->inpps += ((long)rate - (long)e->inpps)>>2; + s->inpps = (e->inpps+0x1FF)>>10; + + rate = (n_outpkts - e->last_outpkts)<<9; + e->last_outpkts = n_outpkts; + e->outpps += ((long)rate - (long)e->outpps)>>2; + s->outpps = (e->outpps+0x1FF)>>10; + + rate = (n_inbytes - e->last_inbytes)<<4; + e->last_inbytes = n_inbytes; + e->inbps += ((long)rate - (long)e->inbps)>>2; + s->inbps = (e->inbps+0xF)>>5; + + rate = (n_outbytes - e->last_outbytes)<<4; + e->last_outbytes = n_outbytes; + e->outbps += ((long)rate - (long)e->outbps)>>2; + s->outbps = (e->outbps+0xF)>>5; + spin_unlock(&s->lock); + } + read_unlock(&est_lock); + mod_timer(&est_timer, jiffies + 2*HZ); +} + +int ip_vs_new_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *est; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOMEM; + + memset(est, 0, sizeof(*est)); + est->stats = stats; + est->last_conns = stats->conns; + est->cps = stats->cps<<10; + + est->last_inpkts = stats->inpkts; + est->inpps = stats->inpps<<10; + + est->last_outpkts = stats->outpkts; + est->outpps = stats->outpps<<10; + + est->last_inbytes = stats->inbytes; + est->inbps = stats->inbps<<5; + + est->last_outbytes = stats->outbytes; + est->outbps = stats->outbps<<5; + + write_lock_bh(&est_lock); + est->next = est_list; + if (est->next == NULL) { + init_timer(&est_timer); + est_timer.expires = jiffies + 2*HZ; + est_timer.function = estimation_timer; + add_timer(&est_timer); + } + est_list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void ip_vs_kill_estimator(struct ip_vs_stats *stats) +{ + 
struct ip_vs_estimator *est, **pest; + int killed = 0; + + write_lock_bh(&est_lock); + pest = &est_list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + *pest = est->next; + kfree(est); + killed++; + } + if (killed && est_list == NULL) + del_timer_sync(&est_timer); + write_unlock_bh(&est_lock); +} + +void ip_vs_zero_estimator(struct ip_vs_stats *stats) +{ + struct ip_vs_estimator *e; + + write_lock_bh(&est_lock); + for (e = est_list; e; e = e->next) { + if (e->stats != stats) + continue; + + /* set counters zero */ + e->last_conns = 0; + e->last_inpkts = 0; + e->last_outpkts = 0; + e->last_inbytes = 0; + e->last_outbytes = 0; + e->cps = 0; + e->inpps = 0; + e->outpps = 0; + e->inbps = 0; + e->outbps = 0; + } + write_unlock_bh(&est_lock); +} diff --git a/net/ipv4/ipvs/ip_vs_ftp.c b/net/ipv4/ipvs/ip_vs_ftp.c new file mode 100644 index 000000000000..a19a33ceb811 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_ftp.c @@ -0,0 +1,400 @@ +/* + * ip_vs_ftp.c: IPVS ftp application module + * + * Version: $Id: ip_vs_ftp.c,v 1.13 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * Changes: + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Most code here is taken from ip_masq_ftp.c in kernel 2.2. The difference + * is that ip_vs_ftp module handles the reverse direction to ip_masq_ftp. + * + * IP_MASQ_FTP ftp masquerading module + * + * Version: @(#)ip_masq_ftp.c 0.04 02/05/96 + * + * Author: Wouter Gadeyne + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +#define SERVER_STRING "227 Entering Passive Mode (" +#define CLIENT_STRING "PORT " + + +/* + * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper + * First port is set to the default port. + */ +static int ports[IP_VS_APP_MAX_PORTS] = {21, 0}; +module_param_array(ports, int, NULL, 0); + +/* + * Debug level + */ +#ifdef CONFIG_IP_VS_DEBUG +static int debug=0; +module_param(debug, int, 0); +#endif + + +/* Dummy variable */ +static int ip_vs_ftp_pasv; + + +static int +ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +static int +ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) +{ + return 0; +} + + +/* + * Get from the string "xxx.xxx.xxx.xxx,ppp,ppp", started + * with the "pattern" and terminated with the "term" character. + * is in network order. 
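The estimator above computes avgrate = avgrate*(1-W) + rate*W with W = 2^-2 every 2 seconds, keeping the averages in fixed point (connection and packet rates scaled by 2^10, byte rates by 2^5). A userspace sketch of the same update for a single counter; the 100-events-per-second driver loop is only there to show convergence.

#include <stdio.h>
#include <stdint.h>

struct est {
        uint32_t last;  /* counter value at the previous 2-second sample */
        uint32_t avg;   /* average rate, fixed point, scaled by 2^10 */
};

/* avg += (rate - avg) * W with W = 2^-2; returns the rounded per-second rate. */
static unsigned est_sample(struct est *e, uint32_t counter)
{
        uint32_t rate = (counter - e->last) << 9;  /* <<10 scaling, /2 for the 2s interval */

        e->last = counter;
        e->avg += ((int32_t)rate - (int32_t)e->avg) >> 2;
        return (e->avg + 0x1FF) >> 10;
}

int main(void)
{
        struct est e = { 0, 0 };
        uint32_t counter = 0;
        unsigned t;

        for (t = 1; t <= 10; t++) {
                counter += 200;                    /* 100 events per second */
                printf("t=%2us  avg=%u/s\n", t * 2, est_sample(&e, counter));
        }
        return 0;
}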
+ */ +static int ip_vs_ftp_get_addrport(char *data, char *data_limit, + const char *pattern, size_t plen, char term, + __u32 *addr, __u16 *port, + char **start, char **end) +{ + unsigned char p[6]; + int i = 0; + + if (data_limit - data < plen) { + /* check if there is partial match */ + if (strnicmp(data, pattern, data_limit - data) == 0) + return -1; + else + return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { + return 0; + } + *start = data + plen; + + for (data = *start; *data != term; data++) { + if (data == data_limit) + return -1; + } + *end = data; + + memset(p, 0, sizeof(p)); + for (data = *start; data != *end; data++) { + if (*data >= '0' && *data <= '9') { + p[i] = p[i]*10 + *data - '0'; + } else if (*data == ',' && i < 5) { + i++; + } else { + /* unexpected character */ + return -1; + } + } + + if (i != 5) + return -1; + + *addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0]; + *port = (p[5]<<8) | p[4]; + return 1; +} + + +/* + * Look at outgoing ftp packets to catch the response to a PASV command + * from the server (inside-to-outside). + * When we see one, we build a connection entry with the client address, + * client port 0 (unknown at the moment), the server address and the + * server port. Mark the current connection entry as a control channel + * of the new entry. All this work is just to make the data connection + * can be scheduled to the right server later. + * + * The outgoing packet should be something like + * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". + * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. + */ +static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff **pskb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_limit; + char *start, *end; + __u32 from; + __u16 port; + struct ip_vs_conn *n_cp; + char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ + unsigned buf_len; + int ret; + + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. 
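ip_vs_ftp_get_addrport() above pulls an address/port pair out of FTP payload text of the form "h1,h2,h3,h4,p1,p2", where the port is p1*256 + p2. A simplified userspace parser for that format; it skips the pattern/terminator matching and the network-byte-order packing that the kernel helper performs.

#include <stdio.h>
#include <ctype.h>

/* Parse "h1,h2,h3,h4,p1,p2"; returns 1 on success and fills dotted quad + port. */
static int parse_ftp_addrport(const char *s, char ip[16], unsigned *port)
{
        unsigned v[6] = { 0 };
        int i = 0;

        for (; *s && *s != ')' && *s != '\r'; s++) {
                if (isdigit((unsigned char)*s))
                        v[i] = v[i] * 10 + (unsigned)(*s - '0');
                else if (*s == ',' && i < 5)
                        i++;
                else
                        return 0;               /* unexpected character */
        }
        if (i != 5 || v[0] > 255 || v[1] > 255 || v[2] > 255 || v[3] > 255)
                return 0;
        snprintf(ip, 16, "%u.%u.%u.%u", v[0], v[1], v[2], v[3]);
        *port = v[4] * 256 + v[5];
        return 1;
}

int main(void)
{
        char ip[16];
        unsigned port;

        if (parse_ftp_addrport("192,0,2,1,78,32", ip, &port))
                printf("%s:%u\n", ip, port);    /* prints 192.0.2.1:20000 */
        return 0;
}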
*/ + if (!ip_vs_make_skb_writable(pskb, (*pskb)->len)) + return 0; + + if (cp->app_data == &ip_vs_ftp_pasv) { + iph = (*pskb)->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + data = (char *)th + (th->doff << 2); + data_limit = (*pskb)->tail; + + if (ip_vs_ftp_get_addrport(data, data_limit, + SERVER_STRING, + sizeof(SERVER_STRING)-1, ')', + &from, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> " + "%u.%u.%u.%u:%d detected\n", + NIPQUAD(from), ntohs(port), NIPQUAD(cp->caddr), 0); + + /* + * Now update or create an connection entry for it + */ + n_cp = ip_vs_conn_out_get(iph->protocol, from, port, + cp->caddr, 0); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + cp->caddr, 0, + cp->vaddr, port, + from, port, + IP_VS_CONN_F_NO_CPORT, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Replace the old passive address with the new one + */ + from = n_cp->vaddr; + port = n_cp->vport; + sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), + port&255, (port>>8)&255); + buf_len = strlen(buf); + + /* + * Calculate required delta-offset to keep TCP happy + */ + *diff = buf_len - (end-start); + + if (*diff == 0) { + /* simply replace it with new passive address */ + memcpy(start, buf, buf_len); + ret = 1; + } else { + ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start, + end-start, buf, buf_len); + } + + cp->app_data = NULL; + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + return ret; + } + return 1; +} + + +/* + * Look at incoming ftp packets to catch the PASV/PORT command + * (outside-to-inside). + * + * The incoming packet having the PORT command should be something like + * "PORT xxx,xxx,xxx,xxx,ppp,ppp\n". + * xxx,xxx,xxx,xxx is the client address, ppp,ppp is the client port number. + * In this case, we create a connection entry using the client address and + * port, so that the active ftp data connection from the server can reach + * the client. + */ +static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, + struct sk_buff **pskb, int *diff) +{ + struct iphdr *iph; + struct tcphdr *th; + char *data, *data_start, *data_limit; + char *start, *end; + __u32 to; + __u16 port; + struct ip_vs_conn *n_cp; + + /* no diff required for incoming packets */ + *diff = 0; + + /* Only useful for established sessions */ + if (cp->state != IP_VS_TCP_S_ESTABLISHED) + return 1; + + /* Linear packets are much easier to deal with. */ + if (!ip_vs_make_skb_writable(pskb, (*pskb)->len)) + return 0; + + /* + * Detecting whether it is passive + */ + iph = (*pskb)->nh.iph; + th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); + + /* Since there may be OPTIONS in the TCP packet and the HLEN is + the length of the header in 32-bit multiples, it is accurate + to calculate data address by th+HLEN*4 */ + data = data_start = (char *)th + (th->doff << 2); + data_limit = (*pskb)->tail; + + while (data <= data_limit - 6) { + if (strnicmp(data, "PASV\r\n", 6) == 0) { + /* Passive mode on */ + IP_VS_DBG(1-debug, "got PASV at %zd of %zd\n", + data - data_start, + data_limit - data_start); + cp->app_data = &ip_vs_ftp_pasv; + return 1; + } + data++; + } + + /* + * To support virtual FTP server, the scenerio is as follows: + * FTP client ----> Load Balancer ----> FTP server + * First detect the port number in the application data, + * then create a new connection entry for the coming data + * connection. 
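ip_vs_ftp_out() above rewrites the server address inside the PASV reply with the virtual address and, because the textual replacement can change length, reports the byte delta so the TCP sequence space can be fixed up. A small sketch of building the replacement text and computing that delta; the addresses, port and helper name are examples, and host byte order is assumed throughout.

#include <stdio.h>
#include <string.h>

/* Render a host-order address/port pair as "h1,h2,h3,h4,p1,p2". */
static int format_ftp_addr(char *buf, size_t len, unsigned addr, unsigned port)
{
        return snprintf(buf, len, "%u,%u,%u,%u,%u,%u",
                        (addr >> 24) & 255, (addr >> 16) & 255,
                        (addr >> 8) & 255, addr & 255,
                        (port >> 8) & 255, port & 255);
}

int main(void)
{
        const char *old = "192,168,10,20,39,16";  /* address text found in the reply */
        char buf[24];                             /* xxx,xxx,xxx,xxx,ppp,ppp\0 */
        int new_len;

        /* substitute the virtual address 192.0.2.1:20000 */
        new_len = format_ftp_addr(buf, sizeof(buf), 0xC0000201u, 20000);

        /* the rest of the payload must shift by this many bytes */
        printf("replace \"%s\" with \"%s\", delta=%d\n",
               old, buf, new_len - (int)strlen(old));
        return 0;
}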
+ */ + if (ip_vs_ftp_get_addrport(data_start, data_limit, + CLIENT_STRING, sizeof(CLIENT_STRING)-1, + '\r', &to, &port, + &start, &end) != 1) + return 1; + + IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n", + NIPQUAD(to), ntohs(port)); + + /* Passive mode off */ + cp->app_data = NULL; + + /* + * Now update or create a connection entry for it + */ + IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", + ip_vs_proto_name(iph->protocol), + NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0); + + n_cp = ip_vs_conn_in_get(iph->protocol, + to, port, + cp->vaddr, htons(ntohs(cp->vport)-1)); + if (!n_cp) { + n_cp = ip_vs_conn_new(IPPROTO_TCP, + to, port, + cp->vaddr, htons(ntohs(cp->vport)-1), + cp->daddr, htons(ntohs(cp->dport)-1), + 0, + cp->dest); + if (!n_cp) + return 0; + + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } + + /* + * Move tunnel to listen state + */ + ip_vs_tcp_conn_listen(n_cp); + ip_vs_conn_put(n_cp); + + return 1; +} + + +static struct ip_vs_app ip_vs_ftp = { + .name = "ftp", + .type = IP_VS_APP_TYPE_FTP, + .protocol = IPPROTO_TCP, + .module = THIS_MODULE, + .incs_list = LIST_HEAD_INIT(ip_vs_ftp.incs_list), + .init_conn = ip_vs_ftp_init_conn, + .done_conn = ip_vs_ftp_done_conn, + .bind_conn = NULL, + .unbind_conn = NULL, + .pkt_out = ip_vs_ftp_out, + .pkt_in = ip_vs_ftp_in, +}; + + +/* + * ip_vs_ftp initialization + */ +static int __init ip_vs_ftp_init(void) +{ + int i, ret; + struct ip_vs_app *app = &ip_vs_ftp; + + ret = register_ip_vs_app(app); + if (ret) + return ret; + + for (i=0; iprotocol, ports[i]); + if (ret) + break; + IP_VS_DBG(1-debug, "%s: loaded support on port[%d] = %d\n", + app->name, i, ports[i]); + } + + if (ret) + unregister_ip_vs_app(app); + + return ret; +} + + +/* + * ip_vs_ftp finish. + */ +static void __exit ip_vs_ftp_exit(void) +{ + unregister_ip_vs_app(&ip_vs_ftp); +} + + +module_init(ip_vs_ftp_init); +module_exit(ip_vs_ftp_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c new file mode 100644 index 000000000000..c035838b780a --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -0,0 +1,624 @@ +/* + * IPVS: Locality-Based Least-Connection scheduling module + * + * Version: $Id: ip_vs_lblc.c,v 1.10 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Martin Hamilton : fixed the terrible locking bugs + * *lock(tbl->lock) ==> *lock(&tbl->lock) + * Wensong Zhang : fixed the uninitilized tbl->lock bug + * Wensong Zhang : added doing full expiration check to + * collect stale entries of 24+ hours when + * no partial expire check in a half hour + * Julian Anastasov : replaced del_timer call with del_timer_sync + * to avoid the possible race between timer + * handler and del_timer thread in SMP + * + */ + +/* + * The lblc algorithm is as follows (pseudo code): + * + * if cachenode[dest_ip] is null then + * n, cachenode[dest_ip] <- {weighted least-conn node}; + * else + * n <- cachenode[dest_ip]; + * if (n is dead) OR + * (n.conns>n.weight AND + * there is a node m with m.conns +#include + +/* for sysctl */ +#include +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblc entries, + * when the table is full. 
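For an incoming PORT command the code above prepares the active data connection on the port just below the control port (ftp-data 20 next to ftp 21), round-tripping through host order for the arithmetic. A tiny sketch of that conversion; the port values are examples.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

int main(void)
{
        uint16_t vport = htons(21);                 /* control port, network order */
        uint16_t dport = htons(ntohs(vport) - 1);   /* ftp-data port 20, network order */

        printf("control %u -> data %u\n",
               (unsigned)ntohs(vport), (unsigned)ntohs(dport));
        return 0;
}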
+ */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. + */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblc_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblc entry hash table + */ +#ifndef CONFIG_IP_VS_LBLC_TAB_BITS +#define CONFIG_IP_VS_LBLC_TAB_BITS 10 +#endif +#define IP_VS_LBLC_TAB_BITS CONFIG_IP_VS_LBLC_TAB_BITS +#define IP_VS_LBLC_TAB_SIZE (1 << IP_VS_LBLC_TAB_BITS) +#define IP_VS_LBLC_TAB_MASK (IP_VS_LBLC_TAB_SIZE - 1) + + +/* + * IPVS lblc entry represents an association between destination + * IP address and its destination server + */ +struct ip_vs_lblc_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest *dest; /* real server (cache) */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblc hash table + */ +struct ip_vs_lblc_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLC_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLC sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .ctl_name = NET_IPV4_VS_LBLC_EXPIRE, + .procname = "lblc_expiration", + .data = &sysctl_ip_vs_lblc_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_table[] = { + { + .ctl_name = NET_IPV4_VS, + .procname = "vs", + .mode = 0555, + .child = vs_vars_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = vs_table + }, + { .ctl_name = 0 } +}; + +static ctl_table lblc_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv4_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +/* + * new/free a ip_vs_lblc_entry, which is a mapping of a destionation + * IP address to a server. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_new(__u32 daddr, struct ip_vs_dest *dest) +{ + struct ip_vs_lblc_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblc_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblc_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + atomic_inc(&dest->refcnt); + en->dest = dest; + + return en; +} + + +static inline void ip_vs_lblc_free(struct ip_vs_lblc_entry *en) +{ + list_del(&en->list); + /* + * We don't kfree dest because it is refered either by its service + * or the trash dest list. + */ + atomic_dec(&en->dest->refcnt); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLC entry + */ +static inline unsigned ip_vs_lblc_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLC_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblc_table. + * returns bool success. 
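The LBLC table above caches, per destination IP, the real server last chosen for it, so later packets to the same destination stick to that server until the entry expires. A minimal lookup-or-insert sketch of such a cache using a single unsorted list; the locking, hashing and expiration of the kernel table are omitted and the names are illustrative.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

struct lblc_entry {
        struct lblc_entry *next;
        uint32_t addr;            /* destination IP, the lookup key */
        const char *server;       /* real server cached for it */
        time_t lastuse;
};

static struct lblc_entry *cache;

static const char *lookup_or_assign(uint32_t addr, const char *pick)
{
        struct lblc_entry *e;

        for (e = cache; e; e = e->next)
                if (e->addr == addr) {
                        e->lastuse = time(NULL);
                        return e->server;        /* hit: keep using the cached server */
                }

        e = calloc(1, sizeof(*e));
        if (!e)
                return pick;                     /* out of memory: just don't cache */
        e->addr = addr;
        e->server = pick;                        /* miss: remember this choice */
        e->lastuse = time(NULL);
        e->next = cache;
        cache = e;
        return e->server;
}

int main(void)
{
        printf("%s\n", lookup_or_assign(0x0a000001, "rs1"));   /* assigns rs1 */
        printf("%s\n", lookup_or_assign(0x0a000001, "rs2"));   /* still rs1 */
        return 0;
}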
+ */ +static int +ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblc_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. + * returns bool success. + */ +static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, + struct ip_vs_lblc_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblc_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblc_entry * +ip_vs_lblc_get(struct ip_vs_lblc_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblc_entry *en; + + hash = ip_vs_lblc_hashkey(addr); + + read_lock(&tbl->lock); + + list_for_each_entry(en, &tbl->bucket[hash], list) { + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. + */ +static void ip_vs_lblc_flush(struct ip_vs_lblc_table *tbl) +{ + int i; + struct ip_vs_lblc_entry *en, *nxt; + + for (i=0; ilock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblc_full_check(struct ip_vs_lblc_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + for (i=0, j=tbl->rover; ilock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, + en->lastuse + sysctl_ip_vs_lblc_expiration)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblc table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. 
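The timer handler above mixes two expiry policies: a partial pass that drops entries idle longer than ENTRY_TIMEOUT once the table outgrows max_size, and a full pass every COUNT_FOR_FULL_EXPIRATION ticks that drops anything idle longer than the lblc_expiration sysctl. A compact sketch of that two-tier test on plain timestamps, with the thresholds expressed in seconds.

#include <stdio.h>
#include <time.h>

#define ENTRY_TIMEOUT_SEC     (6 * 60)          /* partial pass: 6 minutes idle */
#define FULL_EXPIRATION_SEC   (24 * 60 * 60)    /* full pass: untouched for a day */

/* Returns 1 if the entry should be dropped under the given policy. */
static int expired(time_t lastuse, time_t now, int full_check)
{
        time_t limit = full_check ? FULL_EXPIRATION_SEC : ENTRY_TIMEOUT_SEC;

        return (now - lastuse) >= limit;
}

int main(void)
{
        time_t now = time(NULL);
        time_t idle_10min = now - 10 * 60;

        printf("partial pass drops 10-min idle entry: %d\n", expired(idle_10min, now, 0));
        printf("full pass drops 10-min idle entry:    %d\n", expired(idle_10min, now, 1));
        return 0;
}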
+ */ +static void ip_vs_lblc_check_expire(unsigned long data) +{ + struct ip_vs_lblc_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblc_entry *en, *nxt; + + tbl = (struct ip_vs_lblc_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblc_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; ilock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse + ENTRY_TIMEOUT)) + continue; + + ip_vs_lblc_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +static int ip_vs_lblc_init_svc(struct ip_vs_service *svc) +{ + int i; + struct ip_vs_lblc_table *tbl; + + /* + * Allocate the ip_vs_lblc_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblc_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblc_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblc_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + rwlock_init(&tbl->lock); + tbl->max_size = IP_VS_LBLC_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblc_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + + return 0; +} + + +static int ip_vs_lblc_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblc_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblc_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLC hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_lblc_table)); + + return 0; +} + + +static int ip_vs_lblc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. 
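__ip_vs_wlc_schedule() above estimates overhead as activeconns*50 + inactconns and compares loads by cross-multiplying with the weights, so no floating point is needed and weight 0 quiesces a server. A self-contained sketch of that selection over a static array; the connection counts and weights are made up.

#include <stdio.h>

struct dest {
        const char *name;
        int activeconns, inactconns, weight;
};

static int overhead(const struct dest *d)
{
        return d->activeconns * 50 + d->inactconns;
}

/* Pick the server minimizing overhead/weight, using cross-multiplication. */
static const struct dest *wlc_pick(const struct dest *d, int n)
{
        const struct dest *least = NULL;
        int i, loh = 0, doh;

        for (i = 0; i < n; i++) {
                if (d[i].weight <= 0)           /* weight 0 means quiesced */
                        continue;
                doh = overhead(&d[i]);
                if (!least || loh * d[i].weight > doh * least->weight) {
                        least = &d[i];
                        loh = doh;
                }
        }
        return least;
}

int main(void)
{
        struct dest pool[] = {
                { "rs1", 10, 5, 1 },    /* overhead 505, load 505 */
                { "rs2", 30, 0, 4 },    /* overhead 1500, load 375 */
                { "rs3",  0, 0, 0 },    /* quiesced */
        };

        printf("picked %s\n", wlc_pick(pool, 3)->name);   /* rs2 */
        return 0;
}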
+ */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLC: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. + */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblc_table *tbl; + struct ip_vs_lblc_entry *en; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblc_table *)svc->sched_data; + en = ip_vs_lblc_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblc_new(iph->daddr, dest); + if (en == NULL) { + return NULL; + } + ip_vs_lblc_hash(tbl, en); + } else { + dest = en->dest; + if (!(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + atomic_dec(&en->dest->refcnt); + atomic_inc(&dest->refcnt); + en->dest = dest; + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLC: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLC Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblc_scheduler = +{ + .name = "lblc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lblc_init_svc, + .done_service = ip_vs_lblc_done_svc, + .update_service = ip_vs_lblc_update_svc, + .schedule = ip_vs_lblc_schedule, +}; + + +static int __init ip_vs_lblc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblc_scheduler.n_list); + sysctl_header = register_sysctl_table(lblc_root_table, 0); + return register_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +static void __exit ip_vs_lblc_cleanup(void) +{ + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblc_scheduler); +} + + +module_init(ip_vs_lblc_init); +module_exit(ip_vs_lblc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c 
b/net/ipv4/ipvs/ip_vs_lblcr.c new file mode 100644 index 000000000000..22b5dd55d271 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -0,0 +1,888 @@ +/* + * IPVS: Locality-Based Least-Connection with Replication scheduler + * + * Version: $Id: ip_vs_lblcr.c,v 1.11 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Julian Anastasov : Added the missing (dest->weight>0) + * condition in the ip_vs_dest_set_max. + * + */ + +/* + * The lblc/r algorithm is as follows (pseudo code): + * + * if serverSet[dest_ip] is null then + * n, serverSet[dest_ip] <- {weighted least-conn node}; + * else + * n <- {least-conn (alive) node in serverSet[dest_ip]}; + * if (n is null) OR + * (n.conns>n.weight AND + * there is a node m with m.conns 1 AND + * now - serverSet[dest_ip].lastMod > T then + * m <- {most conn node in serverSet[dest_ip]}; + * remove m from serverSet[dest_ip]; + * if serverSet[dest_ip] changed then + * serverSet[dest_ip].lastMod <- now; + * + * return n; + * + */ + +#include +#include + +/* for sysctl */ +#include +#include +/* for proc_net_create/proc_net_remove */ +#include + +#include + + +/* + * It is for garbage collection of stale IPVS lblcr entries, + * when the table is full. + */ +#define CHECK_EXPIRE_INTERVAL (60*HZ) +#define ENTRY_TIMEOUT (6*60*HZ) + +/* + * It is for full expiration check. + * When there is no partial expiration check (garbage collection) + * in a half hour, do a full expiration check to collect stale + * entries that haven't been touched for a day. 
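+ *
+ * With CHECK_EXPIRE_INTERVAL at 60*HZ, COUNT_FOR_FULL_EXPIRATION
+ * of 30 gives 30*60 seconds, i.e. the half hour mentioned above,
+ * and the default sysctl_ip_vs_lblcr_expiration of 24*60*60*HZ is
+ * the one-day staleness threshold.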
+ */ +#define COUNT_FOR_FULL_EXPIRATION 30 +static int sysctl_ip_vs_lblcr_expiration = 24*60*60*HZ; + + +/* + * for IPVS lblcr entry hash table + */ +#ifndef CONFIG_IP_VS_LBLCR_TAB_BITS +#define CONFIG_IP_VS_LBLCR_TAB_BITS 10 +#endif +#define IP_VS_LBLCR_TAB_BITS CONFIG_IP_VS_LBLCR_TAB_BITS +#define IP_VS_LBLCR_TAB_SIZE (1 << IP_VS_LBLCR_TAB_BITS) +#define IP_VS_LBLCR_TAB_MASK (IP_VS_LBLCR_TAB_SIZE - 1) + + +/* + * IPVS destination set structure and operations + */ +struct ip_vs_dest_list { + struct ip_vs_dest_list *next; /* list link */ + struct ip_vs_dest *dest; /* destination server */ +}; + +struct ip_vs_dest_set { + atomic_t size; /* set size */ + unsigned long lastmod; /* last modified time */ + struct ip_vs_dest_list *list; /* destination list */ + rwlock_t lock; /* lock for this list */ +}; + + +static struct ip_vs_dest_list * +ip_vs_dest_set_insert(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e; + + for (e=set->list; e!=NULL; e=e->next) { + if (e->dest == dest) + /* already existed */ + return NULL; + } + + e = kmalloc(sizeof(struct ip_vs_dest_list), GFP_ATOMIC); + if (e == NULL) { + IP_VS_ERR("ip_vs_dest_set_insert(): no memory\n"); + return NULL; + } + + atomic_inc(&dest->refcnt); + e->dest = dest; + + /* link it to the list */ + write_lock(&set->lock); + e->next = set->list; + set->list = e; + atomic_inc(&set->size); + write_unlock(&set->lock); + + set->lastmod = jiffies; + return e; +} + +static void +ip_vs_dest_set_erase(struct ip_vs_dest_set *set, struct ip_vs_dest *dest) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + if (e->dest == dest) { + /* HIT */ + *ep = e->next; + atomic_dec(&set->size); + set->lastmod = jiffies; + atomic_dec(&e->dest->refcnt); + kfree(e); + break; + } + ep = &e->next; + } + write_unlock(&set->lock); +} + +static void ip_vs_dest_set_eraseall(struct ip_vs_dest_set *set) +{ + struct ip_vs_dest_list *e, **ep; + + write_lock(&set->lock); + for (ep=&set->list, e=*ep; e!=NULL; e=*ep) { + *ep = e->next; + /* + * We don't kfree dest because it is refered either + * by its service or by the trash dest list. 
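+ * Only the reference taken in ip_vs_dest_set_insert() is dropped
+ * here, and the list element itself is freed.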
+ */ + atomic_dec(&e->dest->refcnt); + kfree(e); + } + write_unlock(&set->lock); +} + +/* get weighted least-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *least; + int loh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + least = e->dest; + if (least->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if ((atomic_read(&least->weight) > 0) + && (least->flags & IP_VS_DEST_F_AVAILABLE)) { + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted least load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if ((loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) + && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { + least = dest; + loh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_min: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + return least; +} + + +/* get weighted most-connection node in the destination set */ +static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) +{ + register struct ip_vs_dest_list *e; + struct ip_vs_dest *dest, *most; + int moh, doh; + + if (set == NULL) + return NULL; + + read_lock(&set->lock); + /* select the first destination server, whose weight > 0 */ + for (e=set->list; e!=NULL; e=e->next) { + most = e->dest; + if (atomic_read(&most->weight) > 0) { + moh = atomic_read(&most->activeconns) * 50 + + atomic_read(&most->inactconns); + goto nextstage; + } + } + read_unlock(&set->lock); + return NULL; + + /* find the destination with the weighted most load */ + nextstage: + for (e=e->next; e!=NULL; e=e->next) { + dest = e->dest; + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ + if ((moh * atomic_read(&dest->weight) < + doh * atomic_read(&most->weight)) + && (atomic_read(&dest->weight) > 0)) { + most = dest; + moh = doh; + } + } + read_unlock(&set->lock); + + IP_VS_DBG(6, "ip_vs_dest_set_max: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(most->addr), ntohs(most->port), + atomic_read(&most->activeconns), + atomic_read(&most->refcnt), + atomic_read(&most->weight), moh); + return most; +} + + +/* + * IPVS lblcr entry represents an association between destination + * IP address and its destination server set + */ +struct ip_vs_lblcr_entry { + struct list_head list; + __u32 addr; /* destination IP address */ + struct ip_vs_dest_set set; /* destination server set */ + unsigned long lastuse; /* last used time */ +}; + + +/* + * IPVS lblcr hash table + */ +struct ip_vs_lblcr_table { + rwlock_t lock; /* lock for this table */ + struct list_head bucket[IP_VS_LBLCR_TAB_SIZE]; /* hash bucket */ + atomic_t entries; /* number of entries */ + int max_size; /* maximum size of entries */ + struct timer_list periodic_timer; /* collect stale entries */ + 
int rover; /* rover for expire check */ + int counter; /* counter for no expire */ +}; + + +/* + * IPVS LBLCR sysctl table + */ + +static ctl_table vs_vars_table[] = { + { + .ctl_name = NET_IPV4_VS_LBLCR_EXPIRE, + .procname = "lblcr_expiration", + .data = &sysctl_ip_vs_lblcr_expiration, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table vs_table[] = { + { + .ctl_name = NET_IPV4_VS, + .procname = "vs", + .mode = 0555, + .child = vs_vars_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = vs_table + }, + { .ctl_name = 0 } +}; + +static ctl_table lblcr_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv4_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sysctl_header; + +/* + * new/free a ip_vs_lblcr_entry, which is a mapping of a destination + * IP address to a server. + */ +static inline struct ip_vs_lblcr_entry *ip_vs_lblcr_new(__u32 daddr) +{ + struct ip_vs_lblcr_entry *en; + + en = kmalloc(sizeof(struct ip_vs_lblcr_entry), GFP_ATOMIC); + if (en == NULL) { + IP_VS_ERR("ip_vs_lblcr_new(): no memory\n"); + return NULL; + } + + INIT_LIST_HEAD(&en->list); + en->addr = daddr; + + /* initilize its dest set */ + atomic_set(&(en->set.size), 0); + en->set.list = NULL; + rwlock_init(&en->set.lock); + + return en; +} + + +static inline void ip_vs_lblcr_free(struct ip_vs_lblcr_entry *en) +{ + list_del(&en->list); + ip_vs_dest_set_eraseall(&en->set); + kfree(en); +} + + +/* + * Returns hash value for IPVS LBLCR entry + */ +static inline unsigned ip_vs_lblcr_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_LBLCR_TAB_MASK; +} + + +/* + * Hash an entry in the ip_vs_lblcr_table. + * returns bool success. + */ +static int +ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) +{ + unsigned hash; + + if (!list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_hash(): request for already hashed, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Hash by destination IP address + */ + hash = ip_vs_lblcr_hashkey(en->addr); + + write_lock(&tbl->lock); + list_add(&en->list, &tbl->bucket[hash]); + atomic_inc(&tbl->entries); + write_unlock(&tbl->lock); + + return 1; +} + + +#if 0000 +/* + * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. + * returns bool success. + */ +static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, + struct ip_vs_lblcr_entry *en) +{ + if (list_empty(&en->list)) { + IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " + "called from %p\n", __builtin_return_address(0)); + return 0; + } + + /* + * Remove it from the table + */ + write_lock(&tbl->lock); + list_del(&en->list); + INIT_LIST_HEAD(&en->list); + write_unlock(&tbl->lock); + + return 1; +} +#endif + + +/* + * Get ip_vs_lblcr_entry associated with supplied parameters. + */ +static inline struct ip_vs_lblcr_entry * +ip_vs_lblcr_get(struct ip_vs_lblcr_table *tbl, __u32 addr) +{ + unsigned hash; + struct ip_vs_lblcr_entry *en; + + hash = ip_vs_lblcr_hashkey(addr); + + read_lock(&tbl->lock); + + list_for_each_entry(en, &tbl->bucket[hash], list) { + if (en->addr == addr) { + /* HIT */ + read_unlock(&tbl->lock); + return en; + } + } + + read_unlock(&tbl->lock); + + return NULL; +} + + +/* + * Flush all the entries of the specified table. 
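+ * Called from ip_vs_lblcr_done_svc() below, after the periodic
+ * timer has been stopped with del_timer_sync().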
+ */ +static void ip_vs_lblcr_flush(struct ip_vs_lblcr_table *tbl) +{ + int i; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0; ilock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[i], list) { + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } +} + + +static inline void ip_vs_lblcr_full_check(struct ip_vs_lblcr_table *tbl) +{ + unsigned long now = jiffies; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + for (i=0, j=tbl->rover; ilock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_after(en->lastuse+sysctl_ip_vs_lblcr_expiration, + now)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + } + write_unlock(&tbl->lock); + } + tbl->rover = j; +} + + +/* + * Periodical timer handler for IPVS lblcr table + * It is used to collect stale entries when the number of entries + * exceeds the maximum size of the table. + * + * Fixme: we probably need more complicated algorithm to collect + * entries that have not been used for a long time even + * if the number of entries doesn't exceed the maximum size + * of the table. + * The full expiration check is for this purpose now. + */ +static void ip_vs_lblcr_check_expire(unsigned long data) +{ + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int goal; + int i, j; + struct ip_vs_lblcr_entry *en, *nxt; + + tbl = (struct ip_vs_lblcr_table *)data; + + if ((tbl->counter % COUNT_FOR_FULL_EXPIRATION) == 0) { + /* do full expiration check */ + ip_vs_lblcr_full_check(tbl); + tbl->counter = 1; + goto out; + } + + if (atomic_read(&tbl->entries) <= tbl->max_size) { + tbl->counter++; + goto out; + } + + goal = (atomic_read(&tbl->entries) - tbl->max_size)*4/3; + if (goal > tbl->max_size/2) + goal = tbl->max_size/2; + + for (i=0, j=tbl->rover; ilock); + list_for_each_entry_safe(en, nxt, &tbl->bucket[j], list) { + if (time_before(now, en->lastuse+ENTRY_TIMEOUT)) + continue; + + ip_vs_lblcr_free(en); + atomic_dec(&tbl->entries); + goal--; + } + write_unlock(&tbl->lock); + if (goal <= 0) + break; + } + tbl->rover = j; + + out: + mod_timer(&tbl->periodic_timer, jiffies+CHECK_EXPIRE_INTERVAL); +} + + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG +static struct ip_vs_lblcr_table *lblcr_table_list; + +/* + * /proc/net/ip_vs_lblcr to display the mappings of + * destination IP address <==> its serverSet + */ +static int +ip_vs_lblcr_getinfo(char *buffer, char **start, off_t offset, int length) +{ + off_t pos=0, begin; + int len=0, size; + struct ip_vs_lblcr_table *tbl; + unsigned long now = jiffies; + int i; + struct ip_vs_lblcr_entry *en; + + tbl = lblcr_table_list; + + size = sprintf(buffer, "LastTime Dest IP address Server set\n"); + pos += size; + len += size; + + for (i=0; ilock); + list_for_each_entry(en, &tbl->bucket[i], list) { + char tbuf[16]; + struct ip_vs_dest_list *d; + + sprintf(tbuf, "%u.%u.%u.%u", NIPQUAD(en->addr)); + size = sprintf(buffer+len, "%8lu %-16s ", + now-en->lastuse, tbuf); + + read_lock(&en->set.lock); + for (d=en->set.list; d!=NULL; d=d->next) { + size += sprintf(buffer+len+size, + "%u.%u.%u.%u ", + NIPQUAD(d->dest->addr)); + } + read_unlock(&en->set.lock); + size += sprintf(buffer+len+size, "\n"); + len += size; + pos += size; + if (pos <= offset) + len=0; + if (pos >= offset+length) { + read_unlock_bh(&tbl->lock); + goto done; + } + } + read_unlock_bh(&tbl->lock); + } + + done: + begin = len - (pos - offset); + *start = buffer + begin; + len -= begin; + if(len>length) + len = length; + return len; +} +#endif + + +static int ip_vs_lblcr_init_svc(struct 
ip_vs_service *svc) +{ + int i; + struct ip_vs_lblcr_table *tbl; + + /* + * Allocate the ip_vs_lblcr_table for this service + */ + tbl = kmalloc(sizeof(struct ip_vs_lblcr_table), GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_lblcr_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_lblcr_table)); + + /* + * Initialize the hash buckets + */ + for (i=0; ibucket[i]); + } + rwlock_init(&tbl->lock); + tbl->max_size = IP_VS_LBLCR_TAB_SIZE*16; + tbl->rover = 0; + tbl->counter = 1; + + /* + * Hook periodic timer for garbage collection + */ + init_timer(&tbl->periodic_timer); + tbl->periodic_timer.data = (unsigned long)tbl; + tbl->periodic_timer.function = ip_vs_lblcr_check_expire; + tbl->periodic_timer.expires = jiffies+CHECK_EXPIRE_INTERVAL; + add_timer(&tbl->periodic_timer); + +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + lblcr_table_list = tbl; +#endif + return 0; +} + + +static int ip_vs_lblcr_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_lblcr_table *tbl = svc->sched_data; + + /* remove periodic timer */ + del_timer_sync(&tbl->periodic_timer); + + /* got to clean up table entries here */ + ip_vs_lblcr_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "LBLCR hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_lblcr_table)); + + return 0; +} + + +static int ip_vs_lblcr_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline struct ip_vs_dest * +__ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) +{ + struct ip_vs_dest *dest, *least; + int loh, doh; + + /* + * We think the overhead of processing active connections is fifty + * times higher than that of inactive connections in average. (This + * fifty times might not be accurate, we will change it later.) We + * use the following formula to estimate the overhead: + * dest->activeconns*50 + dest->inactconns + * and the load: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connection. + */ + list_for_each_entry(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + if (atomic_read(&dest->weight) > 0) { + least = dest; + loh = atomic_read(&least->activeconns) * 50 + + atomic_read(&least->inactconns); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + + doh = atomic_read(&dest->activeconns) * 50 + + atomic_read(&dest->inactconns); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "LBLCR: server %d.%d.%d.%d:%d " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +/* + * If this destination server is overloaded and there is a less loaded + * server, then return true. 
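+ *
+ * For example, a destination with weight 3 and 5 active connections
+ * is reported as overloaded only if some other destination d in the
+ * service satisfies d->activeconns*2 < d->weight, i.e. is nearly
+ * idle relative to its weight.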
+ */ +static inline int +is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) +{ + if (atomic_read(&dest->activeconns) > atomic_read(&dest->weight)) { + struct ip_vs_dest *d; + + list_for_each_entry(d, &svc->destinations, n_list) { + if (atomic_read(&d->activeconns)*2 + < atomic_read(&d->weight)) { + return 1; + } + } + } + return 0; +} + + +/* + * Locality-Based (weighted) Least-Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_lblcr_table *tbl; + struct ip_vs_lblcr_entry *en; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_lblcr_table *)svc->sched_data; + en = ip_vs_lblcr_get(tbl, iph->daddr); + if (en == NULL) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + en = ip_vs_lblcr_new(iph->daddr); + if (en == NULL) { + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + ip_vs_lblcr_hash(tbl, en); + } else { + dest = ip_vs_dest_set_min(&en->set); + if (!dest || is_overloaded(dest, svc)) { + dest = __ip_vs_wlc_schedule(svc, iph); + if (dest == NULL) { + IP_VS_DBG(1, "no destination available\n"); + return NULL; + } + ip_vs_dest_set_insert(&en->set, dest); + } + if (atomic_read(&en->set.size) > 1 && + jiffies-en->set.lastmod > sysctl_ip_vs_lblcr_expiration) { + struct ip_vs_dest *m; + m = ip_vs_dest_set_max(&en->set); + if (m) + ip_vs_dest_set_erase(&en->set, m); + } + } + en->lastuse = jiffies; + + IP_VS_DBG(6, "LBLCR: destination IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(en->addr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS LBLCR Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_lblcr_scheduler = +{ + .name = "lblcr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lblcr_init_svc, + .done_service = ip_vs_lblcr_done_svc, + .update_service = ip_vs_lblcr_update_svc, + .schedule = ip_vs_lblcr_schedule, +}; + + +static int __init ip_vs_lblcr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lblcr_scheduler.n_list); + sysctl_header = register_sysctl_table(lblcr_root_table, 0); +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_create("ip_vs_lblcr", 0, ip_vs_lblcr_getinfo); +#endif + return register_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +static void __exit ip_vs_lblcr_cleanup(void) +{ +#ifdef CONFIG_IP_VS_LBLCR_DEBUG + proc_net_remove("ip_vs_lblcr"); +#endif + unregister_sysctl_table(sysctl_header); + unregister_ip_vs_scheduler(&ip_vs_lblcr_scheduler); +} + + +module_init(ip_vs_lblcr_init); +module_exit(ip_vs_lblcr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_lc.c b/net/ipv4/ipvs/ip_vs_lc.c new file mode 100644 index 000000000000..d88fef90a641 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_lc.c @@ -0,0 +1,123 @@ +/* + * IPVS: Least-Connection Scheduling module + * + * Version: $Id: ip_vs_lc.c,v 1.10 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Wensong Zhang : added the ip_vs_lc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static int ip_vs_lc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_lc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. (This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_lc_schedule(): Scheduling...\n"); + + /* + * Simply select the server with the least number of + * (activeconns<<5) + inactconns + * Except whose weight is equal to zero. + * If the weight is equal to zero, it means that the server is + * quiesced, the existing connections to the server still get + * served, but no new connection is assigned to the server. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || + atomic_read(&dest->weight) == 0) + continue; + doh = ip_vs_lc_dest_overhead(dest); + if (!least || doh < loh) { + least = dest; + loh = doh; + } + } + + if (least) + IP_VS_DBG(6, "LC: server %u.%u.%u.%u:%u activeconns %d inactconns %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->inactconns)); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_lc_scheduler = { + .name = "lc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_lc_init_svc, + .done_service = ip_vs_lc_done_svc, + .update_service = ip_vs_lc_update_svc, + .schedule = ip_vs_lc_schedule, +}; + + +static int __init ip_vs_lc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_lc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_lc_scheduler) ; +} + +static void __exit ip_vs_lc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_lc_scheduler); +} + +module_init(ip_vs_lc_init); +module_exit(ip_vs_lc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_nq.c b/net/ipv4/ipvs/ip_vs_nq.c new file mode 100644 index 000000000000..bc2a9e5f2a7b --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_nq.c @@ -0,0 +1,161 @@ +/* + * IPVS: Never Queue scheduling module + * + * Version: $Id: ip_vs_nq.c,v 1.2 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The NQ algorithm adopts a two-speed model. When there is an idle server + * available, the job will be sent to the idle server, instead of waiting + * for a fast one. 
When there is no idle server available, the job will be + * sent to the server that minimize its expected delay (The Shortest + * Expected Delay scheduling algorithm). + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking NQ to me. + * + * The difference between NQ and SED is that NQ can improve overall + * system utilization. + * + */ + +#include +#include + +#include + + +static int +ip_vs_nq_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_nq_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least = NULL; + unsigned int loh = 0, doh; + + IP_VS_DBG(6, "ip_vs_nq_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + + if (dest->flags & IP_VS_DEST_F_OVERLOAD || + !atomic_read(&dest->weight)) + continue; + + doh = ip_vs_nq_dest_overhead(dest); + + /* return the server directly if it is idle */ + if (atomic_read(&dest->activeconns) == 0) { + least = dest; + loh = doh; + goto out; + } + + if (!least || + (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight))) { + least = dest; + loh = doh; + } + } + + if (!least) + return NULL; + + out: + IP_VS_DBG(6, "NQ: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_nq_scheduler = +{ + .name = "nq", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_nq_init_svc, + .done_service = ip_vs_nq_done_svc, + .update_service = ip_vs_nq_update_svc, + .schedule = ip_vs_nq_schedule, +}; + + +static int __init ip_vs_nq_init(void) +{ + INIT_LIST_HEAD(&ip_vs_nq_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +static void __exit ip_vs_nq_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_nq_scheduler); +} + +module_init(ip_vs_nq_init); +module_exit(ip_vs_nq_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_proto.c b/net/ipv4/ipvs/ip_vs_proto.c new file mode 100644 index 000000000000..253c46252bd5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto.c @@ -0,0 +1,244 @@ +/* + * ip_vs_proto.c: transport protocol load balancing support for IPVS + * + * Version: $Id: ip_vs_proto.c,v 1.2 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can 
redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +/* + * IPVS protocols can only be registered/unregistered when the ipvs + * module is loaded/unloaded, so no lock is needed in accessing the + * ipvs protocol table. + */ + +#define IP_VS_PROTO_TAB_SIZE 32 /* must be power of 2 */ +#define IP_VS_PROTO_HASH(proto) ((proto) & (IP_VS_PROTO_TAB_SIZE-1)) + +static struct ip_vs_protocol *ip_vs_proto_table[IP_VS_PROTO_TAB_SIZE]; + + +/* + * register an ipvs protocol + */ +static int register_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp->next = ip_vs_proto_table[hash]; + ip_vs_proto_table[hash] = pp; + + if (pp->init != NULL) + pp->init(pp); + + return 0; +} + + +/* + * unregister an ipvs protocol + */ +static int unregister_ip_vs_protocol(struct ip_vs_protocol *pp) +{ + struct ip_vs_protocol **pp_p; + unsigned hash = IP_VS_PROTO_HASH(pp->protocol); + + pp_p = &ip_vs_proto_table[hash]; + for (; *pp_p; pp_p = &(*pp_p)->next) { + if (*pp_p == pp) { + *pp_p = pp->next; + if (pp->exit != NULL) + pp->exit(pp); + return 0; + } + } + + return -ESRCH; +} + + +/* + * get ip_vs_protocol object by its proto. + */ +struct ip_vs_protocol * ip_vs_proto_get(unsigned short proto) +{ + struct ip_vs_protocol *pp; + unsigned hash = IP_VS_PROTO_HASH(proto); + + for (pp = ip_vs_proto_table[hash]; pp; pp = pp->next) { + if (pp->protocol == proto) + return pp; + } + + return NULL; +} + + +/* + * Propagate event for state change to all protocols + */ +void ip_vs_protocol_timeout_change(int flags) +{ + struct ip_vs_protocol *pp; + int i; + + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + for (pp = ip_vs_proto_table[i]; pp; pp = pp->next) { + if (pp->timeout_change) + pp->timeout_change(pp, flags); + } + } +} + + +int * +ip_vs_create_timeout_table(int *table, int size) +{ + int *t; + + t = kmalloc(size, GFP_ATOMIC); + if (t == NULL) + return NULL; + memcpy(t, table, size); + return t; +} + + +/* + * Set timeout value for state specified by name + */ +int +ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to) +{ + int i; + + if (!table || !name || !to) + return -EINVAL; + + for (i = 0; i < num; i++) { + if (strcmp(names[i], name)) + continue; + table[i] = to * HZ; + return 0; + } + return -ENOENT; +} + + +const char * ip_vs_state_name(__u16 proto, int state) +{ + struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + + if (pp == NULL || pp->state_name == NULL) + return "ERR!"; + return pp->state_name(state); +} + + +void +ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[128]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & __constant_htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + __u16 _ports[2], *pptr +; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, + NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else + sprintf(buf, "%s 
%u.%u.%u.%u:%u->%u.%u.%u.%u:%u", + pp->name, + NIPQUAD(ih->saddr), + ntohs(pptr[0]), + NIPQUAD(ih->daddr), + ntohs(pptr[1])); + } + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +int ip_vs_protocol_init(void) +{ + char protocols[64]; +#define REGISTER_PROTOCOL(p) \ + do { \ + register_ip_vs_protocol(p); \ + strcat(protocols, ", "); \ + strcat(protocols, (p)->name); \ + } while (0) + + protocols[0] = '\0'; + protocols[2] = '\0'; +#ifdef CONFIG_IP_VS_PROTO_TCP + REGISTER_PROTOCOL(&ip_vs_protocol_tcp); +#endif +#ifdef CONFIG_IP_VS_PROTO_UDP + REGISTER_PROTOCOL(&ip_vs_protocol_udp); +#endif +#ifdef CONFIG_IP_VS_PROTO_ICMP + REGISTER_PROTOCOL(&ip_vs_protocol_icmp); +#endif +#ifdef CONFIG_IP_VS_PROTO_AH + REGISTER_PROTOCOL(&ip_vs_protocol_ah); +#endif +#ifdef CONFIG_IP_VS_PROTO_ESP + REGISTER_PROTOCOL(&ip_vs_protocol_esp); +#endif + IP_VS_INFO("Registered protocols (%s)\n", &protocols[2]); + + return 0; +} + + +void ip_vs_protocol_cleanup(void) +{ + struct ip_vs_protocol *pp; + int i; + + /* unregister all the ipvs protocols */ + for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { + while ((pp = ip_vs_proto_table[i]) != NULL) + unregister_ip_vs_protocol(pp); + } +} diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c new file mode 100644 index 000000000000..453e94a0bbd7 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -0,0 +1,177 @@ +/* + * ip_vs_proto_ah.c: AH IPSec load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_ah.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +ah_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? 
"ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +ah_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * AH is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void ah_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void ah_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +struct ip_vs_protocol ip_vs_protocol_ah = { + .name = "AH", + .protocol = IPPROTO_AH, + .dont_defrag = 1, + .init = ah_init, + .exit = ah_exit, + .conn_schedule = ah_conn_schedule, + .conn_in_get = ah_conn_in_get, + .conn_out_get = ah_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = ah_debug_packet, + .timeout_change = NULL, /* ISAKMP */ + .set_state_timeout = NULL, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c new file mode 100644 index 000000000000..478e5c7c7e8e --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -0,0 +1,175 @@ +/* + * ip_vs_proto_esp.c: ESP IPSec load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_esp.c,v 1.1 2003/07/04 15:04:37 wensong Exp $ + * + * Authors: Julian Anastasov , February 2002 + * Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include + +#include + + +/* TODO: + +struct isakmp_hdr { + __u8 icookie[8]; + __u8 rcookie[8]; + __u8 np; + __u8 version; + __u8 xchgtype; + __u8 flags; + __u32 msgid; + __u32 length; +}; + +*/ + +#define PORT_ISAKMP 500 + + +static struct ip_vs_conn * +esp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_in_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + /* + * We are not sure if the packet is from our + * service, so our conn_schedule hook should return NF_ACCEPT + */ + IP_VS_DBG(12, "Unknown ISAKMP entry for outin packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? 
"ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static struct ip_vs_conn * +esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->saddr, + __constant_htons(PORT_ISAKMP), + iph->daddr, + __constant_htons(PORT_ISAKMP)); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, + __constant_htons(PORT_ISAKMP), + iph->saddr, + __constant_htons(PORT_ISAKMP)); + } + + if (!cp) { + IP_VS_DBG(12, "Unknown ISAKMP entry for inout packet " + "%s%s %u.%u.%u.%u->%u.%u.%u.%u\n", + inverse ? "ICMP+" : "", + pp->name, + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr)); + } + + return cp; +} + + +static int +esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + /* + * ESP is only related traffic. Pass the packet to IP stack. + */ + *verdict = NF_ACCEPT; + return 0; +} + + +static void +esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, + int offset, const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + + +static void esp_init(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +static void esp_exit(struct ip_vs_protocol *pp) +{ + /* nothing to do now */ +} + + +struct ip_vs_protocol ip_vs_protocol_esp = { + .name = "ESP", + .protocol = IPPROTO_ESP, + .dont_defrag = 1, + .init = esp_init, + .exit = esp_exit, + .conn_schedule = esp_conn_schedule, + .conn_in_get = esp_conn_in_get, + .conn_out_get = esp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = NULL, + .state_transition = NULL, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = esp_debug_packet, + .timeout_change = NULL, /* ISAKMP */ +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_icmp.c b/net/ipv4/ipvs/ip_vs_proto_icmp.c new file mode 100644 index 000000000000..191e94aa1c1f --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_icmp.c @@ -0,0 +1,182 @@ +/* + * ip_vs_proto_icmp.c: ICMP load balancing support for IP Virtual Server + * + * Authors: Julian Anastasov , March 2002 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation; + * + */ + +#include +#include +#include +#include +#include + +#include + + +static int icmp_timeouts[1] = { 1*60*HZ }; + +static char * icmp_state_name_table[1] = { "ICMP" }; + +static struct ip_vs_conn * +icmp_conn_in_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ +#if 0 + struct ip_vs_conn *cp; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(iph->protocol, + iph->saddr, 0, + iph->daddr, 0); + } else { + cp = ip_vs_conn_in_get(iph->protocol, + iph->daddr, 0, + iph->saddr, 0); + } + + return cp; + +#else + return NULL; +#endif +} + +static struct ip_vs_conn * +icmp_conn_out_get(const struct sk_buff *skb, + struct ip_vs_protocol *pp, + const struct iphdr *iph, + unsigned int proto_off, + int inverse) +{ +#if 0 + struct ip_vs_conn *cp; + + 
if (likely(!inverse)) { + cp = ip_vs_conn_out_get(iph->protocol, + iph->saddr, 0, + iph->daddr, 0); + } else { + cp = ip_vs_conn_out_get(IPPROTO_UDP, + iph->daddr, 0, + iph->saddr, 0); + } + + return cp; +#else + return NULL; +#endif +} + +static int +icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + *verdict = NF_ACCEPT; + return 0; +} + +static int +icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) { + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for"); + return 0; + } + } + } + return 1; +} + +static void +icmp_debug_packet(struct ip_vs_protocol *pp, + const struct sk_buff *skb, + int offset, + const char *msg) +{ + char buf[256]; + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); + if (ih == NULL) + sprintf(buf, "%s TRUNCATED", pp->name); + else if (ih->frag_off & __constant_htons(IP_OFFSET)) + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr)); + else { + struct icmphdr _icmph, *ic; + + ic = skb_header_pointer(skb, offset + ih->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) + sprintf(buf, "%s TRUNCATED to %u bytes\n", + pp->name, skb->len - offset); + else + sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", + pp->name, NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), + ic->type, ic->code); + } + printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); +} + +static int +icmp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL]; + return 1; +} + +static int +icmp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + int num; + char **names; + + num = IP_VS_ICMP_S_LAST; + names = icmp_state_name_table; + return ip_vs_set_state_timeout(pp->timeout_table, num, names, sname, to); +} + + +static void icmp_init(struct ip_vs_protocol *pp) +{ + pp->timeout_table = icmp_timeouts; +} + +static void icmp_exit(struct ip_vs_protocol *pp) +{ +} + +struct ip_vs_protocol ip_vs_protocol_icmp = { + .name = "ICMP", + .protocol = IPPROTO_ICMP, + .dont_defrag = 0, + .init = icmp_init, + .exit = icmp_exit, + .conn_schedule = icmp_conn_schedule, + .conn_in_get = icmp_conn_in_get, + .conn_out_get = icmp_conn_out_get, + .snat_handler = NULL, + .dnat_handler = NULL, + .csum_check = icmp_csum_check, + .state_transition = icmp_state_transition, + .register_app = NULL, + .unregister_app = NULL, + .app_conn_bind = NULL, + .debug_packet = icmp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = icmp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c new file mode 100644 index 000000000000..e65de675da74 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -0,0 +1,640 @@ +/* + * ip_vs_proto_tcp.c: TCP load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_tcp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + */ + +#include +#include +#include /* for tcphdr */ +#include +#include /* for csum_tcpudp_magic */ +#include + +#include + + +static struct ip_vs_conn * +tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_in_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + return ip_vs_conn_in_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } +} + +static struct ip_vs_conn * +tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + return ip_vs_conn_out_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + return ip_vs_conn_out_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } +} + + +static int +tcp_conn_schedule(struct sk_buff *skb, + struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + *verdict = NF_DROP; + return 0; + } + + if (th->syn && + (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol, + skb->nh.iph->daddr, th->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
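+ * If ip_vs_schedule() cannot create one, the verdict for this
+ * packet is decided by ip_vs_leave() instead.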
+ */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip, + u16 oldport, u16 newport) +{ + tcph->check = + ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, tcph->check)); +} + + +static int +tcp_snat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* Call application helper if needed */ + if (!ip_vs_app_pkt_out(cp, pskb)) + return 0; + } + + tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph->source = cp->vport; + + /* Adjust TCP checksums */ + if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr, + cp->dport, cp->vport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, tcphoff, + (*pskb)->len - tcphoff, 0); + tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + (*pskb)->len - tcphoff, + cp->protocol, + (*pskb)->csum); + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, tcph->check, + (char*)&(tcph->check) - (char*)tcph); + } + return 1; +} + + +static int +tcp_dnat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct tcphdr *tcph; + unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. 
+ * It will fix ip_vs_conn and iph ack_seq stuff + */ + if (!ip_vs_app_pkt_in(cp, pskb)) + return 0; + } + + tcph = (void *)(*pskb)->nh.iph + tcphoff; + tcph->dest = cp->dport; + + /* + * Adjust TCP checksums + */ + if (!cp->app) { + /* Only port and addr are changed, do fast csum update */ + tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr, + cp->vport, cp->dport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + tcph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, tcphoff, + (*pskb)->len - tcphoff, 0); + tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + (*pskb)->len - tcphoff, + cp->protocol, + (*pskb)->csum); + (*pskb)->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + unsigned int tcphoff = skb->nh.iph->ihl*4; + + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->len - tcphoff, + skb->nh.iph->protocol, skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + + return 1; +} + + +#define TCP_DIR_INPUT 0 +#define TCP_DIR_OUTPUT 4 +#define TCP_DIR_INPUT_ONLY 8 + +static int tcp_state_off[IP_VS_DIR_LAST] = { + [IP_VS_DIR_INPUT] = TCP_DIR_INPUT, + [IP_VS_DIR_OUTPUT] = TCP_DIR_OUTPUT, + [IP_VS_DIR_INPUT_ONLY] = TCP_DIR_INPUT_ONLY, +}; + +/* + * Timeout table[state] + */ +static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 15*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 2*60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 1*60*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 2*60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 120*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + + +#if 0 + +/* FIXME: This is going to die */ + +static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = 2*HZ, + [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, + [IP_VS_TCP_S_SYN_SENT] = 60*HZ, + [IP_VS_TCP_S_SYN_RECV] = 10*HZ, + [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, + [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, + [IP_VS_TCP_S_CLOSE] = 10*HZ, + [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, + [IP_VS_TCP_S_LAST_ACK] = 30*HZ, + [IP_VS_TCP_S_LISTEN] = 2*60*HZ, + [IP_VS_TCP_S_SYNACK] = 100*HZ, + [IP_VS_TCP_S_LAST] = 2*HZ, +}; + +#endif + +static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { + [IP_VS_TCP_S_NONE] = "NONE", + [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", + [IP_VS_TCP_S_SYN_SENT] = "SYN_SENT", + [IP_VS_TCP_S_SYN_RECV] = "SYN_RECV", + [IP_VS_TCP_S_FIN_WAIT] = "FIN_WAIT", + [IP_VS_TCP_S_TIME_WAIT] = "TIME_WAIT", + [IP_VS_TCP_S_CLOSE] = "CLOSE", + [IP_VS_TCP_S_CLOSE_WAIT] = "CLOSE_WAIT", + [IP_VS_TCP_S_LAST_ACK] = "LAST_ACK", + [IP_VS_TCP_S_LISTEN] = "LISTEN", + [IP_VS_TCP_S_SYNACK] = "SYNACK", + [IP_VS_TCP_S_LAST] = "BUG!", +}; + +#define sNO IP_VS_TCP_S_NONE +#define sES IP_VS_TCP_S_ESTABLISHED +#define sSS IP_VS_TCP_S_SYN_SENT +#define sSR IP_VS_TCP_S_SYN_RECV +#define sFW IP_VS_TCP_S_FIN_WAIT +#define sTW IP_VS_TCP_S_TIME_WAIT +#define sCL IP_VS_TCP_S_CLOSE +#define sCW IP_VS_TCP_S_CLOSE_WAIT +#define sLA IP_VS_TCP_S_LAST_ACK +#define sLI IP_VS_TCP_S_LISTEN +#define sSA IP_VS_TCP_S_SYNACK + +struct tcp_states_t { + int next_state[IP_VS_TCP_S_LAST]; +}; + +static 
const char * tcp_state_name(int state) +{ + if (state >= IP_VS_TCP_S_LAST) + return "ERR!"; + return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?"; +} + +static struct tcp_states_t tcp_states [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t tcp_states_dos [] = { +/* INPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }}, +/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }}, +/*ack*/ {{sCL, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, + +/* OUTPUT */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }}, +/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }}, +/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }}, +/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }}, + +/* INPUT-ONLY */ +/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */ +/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }}, +/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }}, +/*ack*/ {{sCL, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }}, +/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }}, +}; + +static struct tcp_states_t *tcp_state_table = tcp_states; + + +static void tcp_timeout_change(struct ip_vs_protocol *pp, int flags) +{ + int on = (flags & 1); /* secure_tcp */ + + /* + ** FIXME: change secure_tcp to independent sysctl var + ** or make it per-service or per-app because it is valid + ** for most if not for all of the applications. Something + ** like "capabilities" (flags) for each object. + */ + tcp_state_table = (on? 
tcp_states_dos : tcp_states); +} + +static int +tcp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_TCP_S_LAST, + tcp_state_name_table, sname, to); +} + +static inline int tcp_state_idx(struct tcphdr *th) +{ + if (th->rst) + return 3; + if (th->syn) + return 0; + if (th->fin) + return 1; + if (th->ack) + return 2; + return -1; +} + +static inline void +set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, + int direction, struct tcphdr *th) +{ + int state_idx; + int new_state = IP_VS_TCP_S_CLOSE; + int state_off = tcp_state_off[direction]; + + /* + * Update state offset to INPUT_ONLY if necessary + * or delete NO_OUTPUT flag if output packet detected + */ + if (cp->flags & IP_VS_CONN_F_NOOUTPUT) { + if (state_off == TCP_DIR_OUTPUT) + cp->flags &= ~IP_VS_CONN_F_NOOUTPUT; + else + state_off = TCP_DIR_INPUT_ONLY; + } + + if ((state_idx = tcp_state_idx(th)) < 0) { + IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx); + goto tcp_state_out; + } + + new_state = tcp_state_table[state_off+state_idx].next_state[cp->state]; + + tcp_state_out: + if (new_state != cp->state) { + struct ip_vs_dest *dest = cp->dest; + + IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" + "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + pp->name, + (state_off==TCP_DIR_OUTPUT)?"output ":"input ", + th->syn? 'S' : '.', + th->fin? 'F' : '.', + th->ack? 'A' : '.', + th->rst? 'R' : '.', + NIPQUAD(cp->daddr), ntohs(cp->dport), + NIPQUAD(cp->caddr), ntohs(cp->cport), + tcp_state_name(cp->state), + tcp_state_name(new_state), + atomic_read(&cp->refcnt)); + if (dest) { + if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state != IP_VS_TCP_S_ESTABLISHED)) { + atomic_dec(&dest->activeconns); + atomic_inc(&dest->inactconns); + cp->flags |= IP_VS_CONN_F_INACTIVE; + } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) && + (new_state == IP_VS_TCP_S_ESTABLISHED)) { + atomic_inc(&dest->activeconns); + atomic_dec(&dest->inactconns); + cp->flags &= ~IP_VS_CONN_F_INACTIVE; + } + } + } + + cp->timeout = pp->timeout_table[cp->state = new_state]; +} + + +/* + * Handle state transitions + */ +static int +tcp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return 0; + + spin_lock(&cp->lock); + set_tcp_state(pp, cp, direction, th); + spin_unlock(&cp->lock); + + return 1; +} + + +/* + * Hash table for TCP application incarnations + */ +#define TCP_APP_TAB_BITS 4 +#define TCP_APP_TAB_SIZE (1 << TCP_APP_TAB_BITS) +#define TCP_APP_TAB_MASK (TCP_APP_TAB_SIZE - 1) + +static struct list_head tcp_apps[TCP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(tcp_app_lock); + +static inline __u16 tcp_app_hashkey(__u16 port) +{ + return ((port >> TCP_APP_TAB_BITS) ^ port) & TCP_APP_TAB_MASK; +} + + +static int tcp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash, port = inc->port; + int ret = 0; + + hash = tcp_app_hashkey(port); + + spin_lock_bh(&tcp_app_lock); + list_for_each_entry(i, &tcp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &tcp_apps[hash]); + atomic_inc(&ip_vs_protocol_tcp.appcnt); + + out: + spin_unlock_bh(&tcp_app_lock); + return ret; +} + + +static void +tcp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&tcp_app_lock); + atomic_dec(&ip_vs_protocol_tcp.appcnt); + list_del(&inc->p_list); + 
spin_unlock_bh(&tcp_app_lock); +} + + +static int +tcp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = tcp_app_hashkey(cp->vport); + + spin_lock(&tcp_app_lock); + list_for_each_entry(inc, &tcp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&tcp_app_lock); + + IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" + "%u.%u.%u.%u:%u to app %s on port %u\n", + __FUNCTION__, + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&tcp_app_lock); + + out: + return result; +} + + +/* + * Set LISTEN timeout. (ip_vs_conn_put will setup timer) + */ +void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp) +{ + spin_lock(&cp->lock); + cp->state = IP_VS_TCP_S_LISTEN; + cp->timeout = ip_vs_protocol_tcp.timeout_table[IP_VS_TCP_S_LISTEN]; + spin_unlock(&cp->lock); +} + + +static void tcp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(tcp_apps); + pp->timeout_table = tcp_timeouts; +} + + +static void tcp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_tcp = { + .name = "TCP", + .protocol = IPPROTO_TCP, + .dont_defrag = 0, + .appcnt = ATOMIC_INIT(0), + .init = tcp_init, + .exit = tcp_exit, + .register_app = tcp_register_app, + .unregister_app = tcp_unregister_app, + .conn_schedule = tcp_conn_schedule, + .conn_in_get = tcp_conn_in_get, + .conn_out_get = tcp_conn_out_get, + .snat_handler = tcp_snat_handler, + .dnat_handler = tcp_dnat_handler, + .csum_check = tcp_csum_check, + .state_name = tcp_state_name, + .state_transition = tcp_state_transition, + .app_conn_bind = tcp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = tcp_timeout_change, + .set_state_timeout = tcp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c new file mode 100644 index 000000000000..8ae5f2e0aefa --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -0,0 +1,427 @@ +/* + * ip_vs_proto_udp.c: UDP load balancing support for IPVS + * + * Version: $Id: ip_vs_proto_udp.c,v 1.3 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
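The ip_vs_protocol_tcp descriptor a few hunks above is the point of the per-protocol split: connection lookup, the snat/dnat packet mangling, checksum validation, the state machine and the application-helper hooks are all reached through that table of function pointers, so the IPVS core never has to branch on the transport protocol itself. As a self-contained analogy of the dispatch pattern (plain user-space C with made-up names, not IPVS API):

#include <stdio.h>

/* Analogy only: a per-protocol table of handlers the "core" calls through,
 * so adding a protocol means filling in a descriptor, not patching the core. */
struct proto_ops {
	const char *name;
	int (*csum_check)(const unsigned char *pkt, unsigned int len);
	int (*state_transition)(int state, int event);
};

static int demo_csum_ok(const unsigned char *pkt, unsigned int len)
{
	(void)pkt; (void)len;
	return 1;			/* pretend the checksum verified */
}

static int demo_state(int state, int event)
{
	return state + event;		/* trivial stand-in state machine */
}

static struct proto_ops demo_proto = {
	.name			= "DEMO",
	.csum_check		= demo_csum_ok,
	.state_transition	= demo_state,
};

int main(void)
{
	printf("%s: csum=%d state=%d\n", demo_proto.name,
	       demo_proto.csum_check(NULL, 0),
	       demo_proto.state_transition(0, 2));
	return 0;
}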
+ * + * Changes: + * + */ + +#include +#include + +#include + + +static struct ip_vs_conn * +udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_in_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_in_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } + + return cp; +} + + +static struct ip_vs_conn * +udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp, + const struct iphdr *iph, unsigned int proto_off, int inverse) +{ + struct ip_vs_conn *cp; + __u16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), _ports); + if (pptr == NULL) + return NULL; + + if (likely(!inverse)) { + cp = ip_vs_conn_out_get(iph->protocol, + iph->saddr, pptr[0], + iph->daddr, pptr[1]); + } else { + cp = ip_vs_conn_out_get(iph->protocol, + iph->daddr, pptr[1], + iph->saddr, pptr[0]); + } + + return cp; +} + + +static int +udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, + int *verdict, struct ip_vs_conn **cpp) +{ + struct ip_vs_service *svc; + struct udphdr _udph, *uh; + + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + *verdict = NF_DROP; + return 0; + } + + if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol, + skb->nh.iph->daddr, uh->dest))) { + if (ip_vs_todrop()) { + /* + * It seems that we are very loaded. + * We have to drop this packet :( + */ + ip_vs_service_put(svc); + *verdict = NF_DROP; + return 0; + } + + /* + * Let the virtual server select a real server for the + * incoming connection, and create a connection entry. 
+ */ + *cpp = ip_vs_schedule(svc, skb); + if (!*cpp) { + *verdict = ip_vs_leave(svc, skb, pp); + return 0; + } + ip_vs_service_put(svc); + } + return 1; +} + + +static inline void +udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip, + u16 oldport, u16 newport) +{ + uhdr->check = + ip_vs_check_diff(~oldip, newip, + ip_vs_check_diff(oldport ^ 0xFFFF, + newport, uhdr->check)); + if (!uhdr->check) + uhdr->check = 0xFFFF; +} + +static int +udp_snat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* + * Call application helper if needed + */ + if (!ip_vs_app_pkt_out(cp, pskb)) + return 0; + } + + udph = (void *)(*pskb)->nh.iph + udphoff; + udph->source = cp->vport; + + /* + * Adjust UDP checksums + */ + if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(udph, cp->daddr, cp->vaddr, + cp->dport, cp->vport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, udphoff, + (*pskb)->len - udphoff, 0); + udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr, + (*pskb)->len - udphoff, + cp->protocol, + (*pskb)->csum); + if (udph->check == 0) + udph->check = 0xFFFF; + IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", + pp->name, udph->check, + (char*)&(udph->check) - (char*)udph); + } + return 1; +} + + +static int +udp_dnat_handler(struct sk_buff **pskb, + struct ip_vs_protocol *pp, struct ip_vs_conn *cp) +{ + struct udphdr *udph; + unsigned int udphoff = (*pskb)->nh.iph->ihl * 4; + + /* csum_check requires unshared skb */ + if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph))) + return 0; + + if (unlikely(cp->app != NULL)) { + /* Some checks before mangling */ + if (pp->csum_check && !pp->csum_check(*pskb, pp)) + return 0; + + /* + * Attempt ip_vs_app call. 
+ * It will fix ip_vs_conn + */ + if (!ip_vs_app_pkt_in(cp, pskb)) + return 0; + } + + udph = (void *)(*pskb)->nh.iph + udphoff; + udph->dest = cp->dport; + + /* + * Adjust UDP checksums + */ + if (!cp->app && (udph->check != 0)) { + /* Only port and addr are changed, do fast csum update */ + udp_fast_csum_update(udph, cp->vaddr, cp->daddr, + cp->vport, cp->dport); + if ((*pskb)->ip_summed == CHECKSUM_HW) + (*pskb)->ip_summed = CHECKSUM_NONE; + } else { + /* full checksum calculation */ + udph->check = 0; + (*pskb)->csum = skb_checksum(*pskb, udphoff, + (*pskb)->len - udphoff, 0); + udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr, + (*pskb)->len - udphoff, + cp->protocol, + (*pskb)->csum); + if (udph->check == 0) + udph->check = 0xFFFF; + (*pskb)->ip_summed = CHECKSUM_UNNECESSARY; + } + return 1; +} + + +static int +udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp) +{ + struct udphdr _udph, *uh; + unsigned int udphoff = skb->nh.iph->ihl*4; + + uh = skb_header_pointer(skb, udphoff, sizeof(_udph), &_udph); + if (uh == NULL) + return 0; + + if (uh->check != 0) { + switch (skb->ip_summed) { + case CHECKSUM_NONE: + skb->csum = skb_checksum(skb, udphoff, + skb->len - udphoff, 0); + case CHECKSUM_HW: + if (csum_tcpudp_magic(skb->nh.iph->saddr, + skb->nh.iph->daddr, + skb->len - udphoff, + skb->nh.iph->protocol, + skb->csum)) { + IP_VS_DBG_RL_PKT(0, pp, skb, 0, + "Failed checksum for"); + return 0; + } + break; + default: + /* CHECKSUM_UNNECESSARY */ + break; + } + } + return 1; +} + + +/* + * Note: the caller guarantees that only one of register_app, + * unregister_app or app_conn_bind is called each time. + */ + +#define UDP_APP_TAB_BITS 4 +#define UDP_APP_TAB_SIZE (1 << UDP_APP_TAB_BITS) +#define UDP_APP_TAB_MASK (UDP_APP_TAB_SIZE - 1) + +static struct list_head udp_apps[UDP_APP_TAB_SIZE]; +static DEFINE_SPINLOCK(udp_app_lock); + +static inline __u16 udp_app_hashkey(__u16 port) +{ + return ((port >> UDP_APP_TAB_BITS) ^ port) & UDP_APP_TAB_MASK; +} + + +static int udp_register_app(struct ip_vs_app *inc) +{ + struct ip_vs_app *i; + __u16 hash, port = inc->port; + int ret = 0; + + hash = udp_app_hashkey(port); + + + spin_lock_bh(&udp_app_lock); + list_for_each_entry(i, &udp_apps[hash], p_list) { + if (i->port == port) { + ret = -EEXIST; + goto out; + } + } + list_add(&inc->p_list, &udp_apps[hash]); + atomic_inc(&ip_vs_protocol_udp.appcnt); + + out: + spin_unlock_bh(&udp_app_lock); + return ret; +} + + +static void +udp_unregister_app(struct ip_vs_app *inc) +{ + spin_lock_bh(&udp_app_lock); + atomic_dec(&ip_vs_protocol_udp.appcnt); + list_del(&inc->p_list); + spin_unlock_bh(&udp_app_lock); +} + + +static int udp_app_conn_bind(struct ip_vs_conn *cp) +{ + int hash; + struct ip_vs_app *inc; + int result = 0; + + /* Default binding: bind app only for NAT */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return 0; + + /* Lookup application incarnations and bind the right one */ + hash = udp_app_hashkey(cp->vport); + + spin_lock(&udp_app_lock); + list_for_each_entry(inc, &udp_apps[hash], p_list) { + if (inc->port == cp->vport) { + if (unlikely(!ip_vs_app_inc_get(inc))) + break; + spin_unlock(&udp_app_lock); + + IP_VS_DBG(9, "%s: Binding conn %u.%u.%u.%u:%u->" + "%u.%u.%u.%u:%u to app %s on port %u\n", + __FUNCTION__, + NIPQUAD(cp->caddr), ntohs(cp->cport), + NIPQUAD(cp->vaddr), ntohs(cp->vport), + inc->name, ntohs(inc->port)); + cp->app = inc; + if (inc->init_conn) + result = inc->init_conn(inc, cp); + goto out; + } + } + spin_unlock(&udp_app_lock); + + out: + return result; +} + + 
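Both the TCP and the UDP application tables hash a 16-bit port into 1 << 4 = 16 buckets with the same fold-and-mask expression, so helper ports spread over a small table without a modulo. A quick user-space check of the arithmetic (hypothetical harness; the byte order the kernel stores the port in does not change the spreading property):

#include <stdio.h>

#define APP_TAB_BITS 4
#define APP_TAB_MASK ((1 << APP_TAB_BITS) - 1)

/* same folding as tcp_app_hashkey()/udp_app_hashkey() above */
static unsigned int app_hashkey(unsigned int port)
{
	return ((port >> APP_TAB_BITS) ^ port) & APP_TAB_MASK;
}

int main(void)
{
	/* e.g. port 21: (21 >> 4) ^ 21 = 1 ^ 21 = 20, and 20 & 15 = 4 */
	printf("%u\n", app_hashkey(21));
	return 0;
}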
+static int udp_timeouts[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = 5*60*HZ, + [IP_VS_UDP_S_LAST] = 2*HZ, +}; + +static char * udp_state_name_table[IP_VS_UDP_S_LAST+1] = { + [IP_VS_UDP_S_NORMAL] = "UDP", + [IP_VS_UDP_S_LAST] = "BUG!", +}; + + +static int +udp_set_state_timeout(struct ip_vs_protocol *pp, char *sname, int to) +{ + return ip_vs_set_state_timeout(pp->timeout_table, IP_VS_UDP_S_LAST, + udp_state_name_table, sname, to); +} + +static const char * udp_state_name(int state) +{ + if (state >= IP_VS_UDP_S_LAST) + return "ERR!"; + return udp_state_name_table[state] ? udp_state_name_table[state] : "?"; +} + +static int +udp_state_transition(struct ip_vs_conn *cp, int direction, + const struct sk_buff *skb, + struct ip_vs_protocol *pp) +{ + cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; + return 1; +} + +static void udp_init(struct ip_vs_protocol *pp) +{ + IP_VS_INIT_HASH_TABLE(udp_apps); + pp->timeout_table = udp_timeouts; +} + +static void udp_exit(struct ip_vs_protocol *pp) +{ +} + + +struct ip_vs_protocol ip_vs_protocol_udp = { + .name = "UDP", + .protocol = IPPROTO_UDP, + .dont_defrag = 0, + .init = udp_init, + .exit = udp_exit, + .conn_schedule = udp_conn_schedule, + .conn_in_get = udp_conn_in_get, + .conn_out_get = udp_conn_out_get, + .snat_handler = udp_snat_handler, + .dnat_handler = udp_dnat_handler, + .csum_check = udp_csum_check, + .state_transition = udp_state_transition, + .state_name = udp_state_name, + .register_app = udp_register_app, + .unregister_app = udp_unregister_app, + .app_conn_bind = udp_app_conn_bind, + .debug_packet = ip_vs_tcpudp_debug_packet, + .timeout_change = NULL, + .set_state_timeout = udp_set_state_timeout, +}; diff --git a/net/ipv4/ipvs/ip_vs_rr.c b/net/ipv4/ipvs/ip_vs_rr.c new file mode 100644 index 000000000000..b23bab231cab --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_rr.c @@ -0,0 +1,118 @@ +/* + * IPVS: Round-Robin Scheduling module + * + * Version: $Id: ip_vs_rr.c,v 1.9 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
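The UDP side needs almost none of the TCP machinery: there is a single NORMAL state whose 5*60*HZ table entry is five minutes expressed in jiffies, and udp_state_transition() above simply resets that one timeout each time it runs. The per-state timeout tables are also what the set_state_timeout hook tunes; a hypothetical sketch of the name-to-index lookup such a helper has to perform over the tables above (the real ip_vs_set_state_timeout() is outside this hunk, so the seconds-to-jiffies convention is an assumption):

#include <string.h>

/* Illustrative only: map a state name to its index in the name table,
 * then overwrite the matching jiffies slot in the timeout table. */
static int example_set_timeout(int *table, int num_states, char **names,
			       const char *sname, int timeout_seconds, int hz)
{
	int i;

	for (i = 0; i < num_states; i++) {
		if (names[i] && strcmp(names[i], sname) == 0) {
			table[i] = timeout_seconds * hz;	/* store jiffies */
			return 0;
		}
	}
	return -1;			/* unknown state name */
}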
+ * + * Fixes/Changes: + * Wensong Zhang : changed the ip_vs_rr_schedule to return dest + * Julian Anastasov : fixed the NULL pointer access bug in debugging + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_rr_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static int ip_vs_rr_init_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +static int ip_vs_rr_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int ip_vs_rr_update_svc(struct ip_vs_service *svc) +{ + svc->sched_data = &svc->destinations; + return 0; +} + + +/* + * Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct list_head *p, *q; + struct ip_vs_dest *dest; + + IP_VS_DBG(6, "ip_vs_rr_schedule(): Scheduling...\n"); + + write_lock(&svc->sched_lock); + p = (struct list_head *)svc->sched_data; + p = p->next; + q = p; + do { + /* skip list head */ + if (q == &svc->destinations) { + q = q->next; + continue; + } + + dest = list_entry(q, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) + /* HIT */ + goto out; + q = q->next; + } while (q != p); + write_unlock(&svc->sched_lock); + return NULL; + + out: + svc->sched_data = q; + write_unlock(&svc->sched_lock); + IP_VS_DBG(6, "RR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + + return dest; +} + + +static struct ip_vs_scheduler ip_vs_rr_scheduler = { + .name = "rr", /* name */ + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_rr_init_svc, + .done_service = ip_vs_rr_done_svc, + .update_service = ip_vs_rr_update_svc, + .schedule = ip_vs_rr_schedule, +}; + +static int __init ip_vs_rr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_rr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +static void __exit ip_vs_rr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_rr_scheduler); +} + +module_init(ip_vs_rr_init); +module_exit(ip_vs_rr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sched.c b/net/ipv4/ipvs/ip_vs_sched.c new file mode 100644 index 000000000000..0f7c56a225bd --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sched.c @@ -0,0 +1,251 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the Netfilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers. + * + * Version: $Id: ip_vs_sched.c,v 1.13 2003/05/10 03:05:23 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
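The round-robin scheduler in ip_vs_rr.c above keeps its cursor in svc->sched_data and, under sched_lock, advances it one destination per decision, skipping the list head, overloaded servers and quiesced servers (weight 0), and giving up after one full lap. Reduced to plain arrays, the same "resume where you left off" walk is (illustrative sketch only):

/* Pick the next usable server after index prev out of n servers;
 * return -1 when a whole lap finds nothing usable. */
static int rr_pick(int prev, int n, const int weight[], const int overloaded[])
{
	int i, idx;

	for (i = 1; i <= n; i++) {
		idx = (prev + i) % n;
		if (!overloaded[idx] && weight[idx] > 0)
			return idx;	/* becomes the new cursor */
	}
	return -1;
}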
+ * + * Changes: + * + */ + +#include +#include +#include +#include +#include + +#include + +/* + * IPVS scheduler list + */ +static LIST_HEAD(ip_vs_schedulers); + +/* lock for service table */ +static DEFINE_RWLOCK(__ip_vs_sched_lock); + + +/* + * Bind a service with a scheduler + */ +int ip_vs_bind_scheduler(struct ip_vs_service *svc, + struct ip_vs_scheduler *scheduler) +{ + int ret; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + if (scheduler == NULL) { + IP_VS_ERR("ip_vs_bind_scheduler(): scheduler arg NULL\n"); + return -EINVAL; + } + + svc->scheduler = scheduler; + + if (scheduler->init_service) { + ret = scheduler->init_service(svc); + if (ret) { + IP_VS_ERR("ip_vs_bind_scheduler(): init error\n"); + return ret; + } + } + + return 0; +} + + +/* + * Unbind a service with its scheduler + */ +int ip_vs_unbind_scheduler(struct ip_vs_service *svc) +{ + struct ip_vs_scheduler *sched; + + if (svc == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc arg NULL\n"); + return -EINVAL; + } + + sched = svc->scheduler; + if (sched == NULL) { + IP_VS_ERR("ip_vs_unbind_scheduler(): svc isn't bound\n"); + return -EINVAL; + } + + if (sched->done_service) { + if (sched->done_service(svc) != 0) { + IP_VS_ERR("ip_vs_unbind_scheduler(): done error\n"); + return -EINVAL; + } + } + + svc->scheduler = NULL; + return 0; +} + + +/* + * Get scheduler in the scheduler list by name + */ +static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + IP_VS_DBG(2, "ip_vs_sched_getbyname(): sched_name \"%s\"\n", + sched_name); + + read_lock_bh(&__ip_vs_sched_lock); + + list_for_each_entry(sched, &ip_vs_schedulers, n_list) { + /* + * Test and get the modules atomically + */ + if (sched->module && !try_module_get(sched->module)) { + /* + * This scheduler is just deleted + */ + continue; + } + if (strcmp(sched_name, sched->name)==0) { + /* HIT */ + read_unlock_bh(&__ip_vs_sched_lock); + return sched; + } + if (sched->module) + module_put(sched->module); + } + + read_unlock_bh(&__ip_vs_sched_lock); + return NULL; +} + + +/* + * Lookup scheduler and try to load it if it doesn't exist + */ +struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) +{ + struct ip_vs_scheduler *sched; + + /* + * Search for the scheduler by sched_name + */ + sched = ip_vs_sched_getbyname(sched_name); + + /* + * If scheduler not found, load the module and search again + */ + if (sched == NULL) { + request_module("ip_vs_%s", sched_name); + sched = ip_vs_sched_getbyname(sched_name); + } + + return sched; +} + +void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) +{ + if (scheduler->module) + module_put(scheduler->module); +} + + +/* + * Register a scheduler in the scheduler list + */ +int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + struct ip_vs_scheduler *sched; + + if (!scheduler) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + if (!scheduler->name) { + IP_VS_ERR("register_ip_vs_scheduler(): NULL scheduler_name\n"); + return -EINVAL; + } + + /* increase the module use count */ + ip_vs_use_count_inc(); + + /* + * Make sure that the scheduler with this name doesn't exist + * in the scheduler list. 
+ */ + sched = ip_vs_sched_getbyname(scheduler->name); + if (sched) { + ip_vs_scheduler_put(sched); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already existed in the system\n", scheduler->name); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + + if (scheduler->n_list.next != &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + ip_vs_use_count_dec(); + IP_VS_ERR("register_ip_vs_scheduler(): [%s] scheduler " + "already linked\n", scheduler->name); + return -EINVAL; + } + + /* + * Add it into the d-linked scheduler list + */ + list_add(&scheduler->n_list, &ip_vs_schedulers); + write_unlock_bh(&__ip_vs_sched_lock); + + IP_VS_INFO("[%s] scheduler registered.\n", scheduler->name); + + return 0; +} + + +/* + * Unregister a scheduler from the scheduler list + */ +int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) +{ + if (!scheduler) { + IP_VS_ERR( "unregister_ip_vs_scheduler(): NULL arg\n"); + return -EINVAL; + } + + write_lock_bh(&__ip_vs_sched_lock); + if (scheduler->n_list.next == &scheduler->n_list) { + write_unlock_bh(&__ip_vs_sched_lock); + IP_VS_ERR("unregister_ip_vs_scheduler(): [%s] scheduler " + "is not in the list. failed\n", scheduler->name); + return -EINVAL; + } + + /* + * Remove it from the d-linked scheduler list + */ + list_del(&scheduler->n_list); + write_unlock_bh(&__ip_vs_sched_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + IP_VS_INFO("[%s] scheduler unregistered.\n", scheduler->name); + + return 0; +} diff --git a/net/ipv4/ipvs/ip_vs_sed.c b/net/ipv4/ipvs/ip_vs_sed.c new file mode 100644 index 000000000000..ff366f7390d9 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sed.c @@ -0,0 +1,163 @@ +/* + * IPVS: Shortest Expected Delay scheduling module + * + * Version: $Id: ip_vs_sed.c,v 1.1 2003/05/10 03:06:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The SED algorithm attempts to minimize each job's expected delay until + * completion. The expected delay that the job will experience is + * (Ci + 1) / Ui if sent to the ith server, in which Ci is the number of + * jobs on the the ith server and Ui is the fixed service rate (weight) of + * the ith server. The SED algorithm adopts a greedy policy that each does + * what is in its own best interest, i.e. to join the queue which would + * minimize its expected delay of completion. + * + * See the following paper for more information: + * A. Weinrib and S. Shenker, Greed is not enough: Adaptive load sharing + * in large heterogeneous systems. In Proceedings IEEE INFOCOM'88, + * pages 986-994, 1988. + * + * Thanks must go to Marko Buuri for talking SED to me. + * + * The difference between SED and WLC is that SED includes the incoming + * job in the cost function (the increment of 1). SED may outperform + * WLC, while scheduling big jobs under larger heterogeneous systems + * (the server weight varies a lot). 
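Because kernel code avoids floating point, the per-server cost (Ci + 1) / Ui is never computed directly; ip_vs_sed_schedule() below compares two servers by cross-multiplying each overhead with the other server's weight, which is equivalent whenever the weights are positive. A worked example (illustrative):

/* (Ca + 1) / Wa  >  (Cb + 1) / Wb   <=>   (Ca + 1) * Wb  >  (Cb + 1) * Wa
 *
 * e.g. server A: 9 active connections, weight 2  -> delay (9+1)/2 = 5.0
 *      server B: 11 active connections, weight 3 -> delay (11+1)/3 = 4.0
 * cross-multiplied: 10 * 3 = 30 > 12 * 2 = 24, so B is preferred even
 * though it currently holds more connections. */
static int sed_prefers_b(unsigned int conns_a, int weight_a,
			 unsigned int conns_b, int weight_b)
{
	return (conns_a + 1) * weight_b > (conns_b + 1) * weight_a;
}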
+ * + */ + +#include +#include + +#include + + +static int +ip_vs_sed_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_sed_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We only use the active connection number in the cost + * calculation here. + */ + return atomic_read(&dest->activeconns) + 1; +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_sed_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (server expected overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_sed_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_sed_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "SED: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_sed_scheduler = +{ + .name = "sed", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sed_init_svc, + .done_service = ip_vs_sed_done_svc, + .update_service = ip_vs_sed_update_svc, + .schedule = ip_vs_sed_schedule, +}; + + +static int __init ip_vs_sed_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sed_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +static void __exit ip_vs_sed_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sed_scheduler); +} + +module_init(ip_vs_sed_init); +module_exit(ip_vs_sed_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c new file mode 100644 index 000000000000..6f7c50e44a39 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -0,0 +1,255 @@ +/* + * IPVS: Source Hashing scheduling module + * + * Version: $Id: ip_vs_sh.c,v 1.5 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * + */ + +/* + * The sh algorithm is to select server by the hash key of source IP + * address. 
The pseudo code is as follows: + * + * n <- servernode[src_ip]; + * if (n is dead) OR + * (n is overloaded) or (n.weight <= 0) then + * return NULL; + * + * return n; + * + * Notes that servernode is a 256-bucket hash table that maps the hash + * index derived from packet source IP address to the current server + * array. If the sh scheduler is used in cache cluster, it is good to + * combine it with cache_bypass feature. When the statically assigned + * server is dead or overloaded, the load balancer can bypass the cache + * server and send requests to the original server directly. + * + */ + +#include +#include + +#include + + +/* + * IPVS SH bucket + */ +struct ip_vs_sh_bucket { + struct ip_vs_dest *dest; /* real server (cache) */ +}; + +/* + * for IPVS SH entry hash table + */ +#ifndef CONFIG_IP_VS_SH_TAB_BITS +#define CONFIG_IP_VS_SH_TAB_BITS 8 +#endif +#define IP_VS_SH_TAB_BITS CONFIG_IP_VS_SH_TAB_BITS +#define IP_VS_SH_TAB_SIZE (1 << IP_VS_SH_TAB_BITS) +#define IP_VS_SH_TAB_MASK (IP_VS_SH_TAB_SIZE - 1) + + +/* + * Returns hash value for IPVS SH entry + */ +static inline unsigned ip_vs_sh_hashkey(__u32 addr) +{ + return (ntohl(addr)*2654435761UL) & IP_VS_SH_TAB_MASK; +} + + +/* + * Get ip_vs_dest associated with supplied parameters. + */ +static inline struct ip_vs_dest * +ip_vs_sh_get(struct ip_vs_sh_bucket *tbl, __u32 addr) +{ + return (tbl[ip_vs_sh_hashkey(addr)]).dest; +} + + +/* + * Assign all the hash buckets of the specified table with the service. + */ +static int +ip_vs_sh_assign(struct ip_vs_sh_bucket *tbl, struct ip_vs_service *svc) +{ + int i; + struct ip_vs_sh_bucket *b; + struct list_head *p; + struct ip_vs_dest *dest; + + b = tbl; + p = &svc->destinations; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (list_empty(&svc->destinations)) { + b->dest = NULL; + } else { + if (p == &svc->destinations) + p = p->next; + + dest = list_entry(p, struct ip_vs_dest, n_list); + atomic_inc(&dest->refcnt); + b->dest = dest; + + p = p->next; + } + b++; + } + return 0; +} + + +/* + * Flush all the hash buckets of the specified table.
+ */ +static void ip_vs_sh_flush(struct ip_vs_sh_bucket *tbl) +{ + int i; + struct ip_vs_sh_bucket *b; + + b = tbl; + for (i=0; i<IP_VS_SH_TAB_SIZE; i++) { + if (b->dest) { + atomic_dec(&b->dest->refcnt); + b->dest = NULL; + } + b++; + } +} + + +static int ip_vs_sh_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl; + + /* allocate the SH table for this service */ + tbl = kmalloc(sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE, + GFP_ATOMIC); + if (tbl == NULL) { + IP_VS_ERR("ip_vs_sh_init_svc(): no memory\n"); + return -ENOMEM; + } + svc->sched_data = tbl; + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) allocated for " + "current service\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +static int ip_vs_sh_done_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* release the table itself */ + kfree(svc->sched_data); + IP_VS_DBG(6, "SH hash table (memory=%Zdbytes) released\n", + sizeof(struct ip_vs_sh_bucket)*IP_VS_SH_TAB_SIZE); + + return 0; +} + + +static int ip_vs_sh_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_sh_bucket *tbl = svc->sched_data; + + /* got to clean up hash buckets here */ + ip_vs_sh_flush(tbl); + + /* assign the hash buckets with the updated service */ + ip_vs_sh_assign(tbl, svc); + + return 0; +} + + +/* + * If the dest flags is set with IP_VS_DEST_F_OVERLOAD, + * consider that the server is overloaded here. + */ +static inline int is_overloaded(struct ip_vs_dest *dest) +{ + return dest->flags & IP_VS_DEST_F_OVERLOAD; +} + + +/* + * Source Hashing scheduling + */ +static struct ip_vs_dest * +ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_sh_bucket *tbl; + struct iphdr *iph = skb->nh.iph; + + IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); + + tbl = (struct ip_vs_sh_bucket *)svc->sched_data; + dest = ip_vs_sh_get(tbl, iph->saddr); + if (!dest + || !(dest->flags & IP_VS_DEST_F_AVAILABLE) + || atomic_read(&dest->weight) <= 0 + || is_overloaded(dest)) { + return NULL; + } + + IP_VS_DBG(6, "SH: source IP address %u.%u.%u.%u " + "--> server %u.%u.%u.%u:%d\n", + NIPQUAD(iph->saddr), + NIPQUAD(dest->addr), + ntohs(dest->port)); + + return dest; +} + + +/* + * IPVS SH Scheduler structure + */ +static struct ip_vs_scheduler ip_vs_sh_scheduler = +{ + .name = "sh", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_sh_init_svc, + .done_service = ip_vs_sh_done_svc, + .update_service = ip_vs_sh_update_svc, + .schedule = ip_vs_sh_schedule, +}; + + +static int __init ip_vs_sh_init(void) +{ + INIT_LIST_HEAD(&ip_vs_sh_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +static void __exit ip_vs_sh_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_sh_scheduler); +} + + +module_init(ip_vs_sh_init); +module_exit(ip_vs_sh_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c new file mode 100644 index 000000000000..25c479550a32 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -0,0 +1,892 @@ +/* + * IPVS An implementation of the IP virtual server support for the + * LINUX operating system. IPVS is now implemented as a module + * over the NetFilter framework. IPVS can be used to build a + * high-performance and highly available server based on a + * cluster of servers.
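The 2654435761 constant in ip_vs_sh_hashkey() above is the 32-bit golden-ratio multiplier 0x9E3779B1: multiplying the host-order client address by it scrambles the bits before the low IP_VS_SH_TAB_BITS are masked off, so clients differing only in the last octet still land in different buckets. A stand-alone check of the bucket computation (user-space, illustrative):

#include <stdio.h>
#include <stdint.h>

#define SH_TAB_BITS 8
#define SH_TAB_MASK ((1u << SH_TAB_BITS) - 1)

/* same multiplicative hash as ip_vs_sh_hashkey(), with the address
 * already converted to host byte order */
static unsigned int sh_hashkey(uint32_t addr_host_order)
{
	return (addr_host_order * 2654435761u) & SH_TAB_MASK;
}

int main(void)
{
	/* 192.168.1.10 == 0xC0A8010A; only the low byte of the product
	 * survives the mask, and it depends only on the low bytes of the
	 * factors: 0x0A * 0xB1 = 0x6EA -> bucket 0xEA (234) */
	printf("%u\n", sh_hashkey(0xC0A8010Au));
	return 0;
}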
+ * + * Version: $Id: ip_vs_sync.c,v 1.13 2003/06/08 09:31:19 wensong Exp $ + * + * Authors: Wensong Zhang + * + * ip_vs_sync: sync connection info from master load balancer to backups + * through multicast + * + * Changes: + * Alexandre Cassen : Added master & backup support at a time. + * Alexandre Cassen : Added SyncID support for incoming sync + * messages filtering. + * Justin Ossevoort : Fix endian problem on sync message size. + */ + +#include +#include +#include +#include +#include +#include +#include +#include /* for ip_mc_join_group */ + +#include +#include +#include /* for get_fs and set_fs */ + +#include + +#define IP_VS_SYNC_GROUP 0xe0000051 /* multicast addr - 224.0.0.81 */ +#define IP_VS_SYNC_PORT 8848 /* multicast port */ + + +/* + * IPVS sync connection entry + */ +struct ip_vs_sync_conn { + __u8 reserved; + + /* Protocol, addresses and port numbers */ + __u8 protocol; /* Which protocol (TCP/UDP) */ + __u16 cport; + __u16 vport; + __u16 dport; + __u32 caddr; /* client address */ + __u32 vaddr; /* virtual address */ + __u32 daddr; /* destination address */ + + /* Flags and state transition */ + __u16 flags; /* status flags */ + __u16 state; /* state info */ + + /* The sequence options start here */ +}; + +struct ip_vs_sync_conn_options { + struct ip_vs_seq in_seq; /* incoming seq. struct */ + struct ip_vs_seq out_seq; /* outgoing seq. struct */ +}; + +#define IP_VS_SYNC_CONN_TIMEOUT (3*60*HZ) +#define SIMPLE_CONN_SIZE (sizeof(struct ip_vs_sync_conn)) +#define FULL_CONN_SIZE \ +(sizeof(struct ip_vs_sync_conn) + sizeof(struct ip_vs_sync_conn_options)) + + +/* + The master mulitcasts messages to the backup load balancers in the + following format. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Count Conns | SyncID | Size | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (1) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | . | + | . | + | . 
| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | IPVS Sync Connection (n) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +*/ + +#define SYNC_MESG_HEADER_LEN 4 + +struct ip_vs_sync_mesg { + __u8 nr_conns; + __u8 syncid; + __u16 size; + + /* ip_vs_sync_conn entries start here */ +}; + +/* the maximum length of sync (sending/receiving) message */ +static int sync_send_mesg_maxlen; +static int sync_recv_mesg_maxlen; + +struct ip_vs_sync_buff { + struct list_head list; + unsigned long firstuse; + + /* pointers for the message data */ + struct ip_vs_sync_mesg *mesg; + unsigned char *head; + unsigned char *end; +}; + + +/* the sync_buff list head and the lock */ +static LIST_HEAD(ip_vs_sync_queue); +static DEFINE_SPINLOCK(ip_vs_sync_lock); + +/* current sync_buff for accepting new conn entries */ +static struct ip_vs_sync_buff *curr_sb = NULL; +static DEFINE_SPINLOCK(curr_sb_lock); + +/* ipvs sync daemon state */ +volatile int ip_vs_sync_state = IP_VS_STATE_NONE; +volatile int ip_vs_master_syncid = 0; +volatile int ip_vs_backup_syncid = 0; + +/* multicast interface name */ +char ip_vs_master_mcast_ifn[IP_VS_IFNAME_MAXLEN]; +char ip_vs_backup_mcast_ifn[IP_VS_IFNAME_MAXLEN]; + +/* multicast addr */ +static struct sockaddr_in mcast_addr; + + +static inline void sb_queue_tail(struct ip_vs_sync_buff *sb) +{ + spin_lock(&ip_vs_sync_lock); + list_add_tail(&sb->list, &ip_vs_sync_queue); + spin_unlock(&ip_vs_sync_lock); +} + +static inline struct ip_vs_sync_buff * sb_dequeue(void) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&ip_vs_sync_lock); + if (list_empty(&ip_vs_sync_queue)) { + sb = NULL; + } else { + sb = list_entry(ip_vs_sync_queue.next, + struct ip_vs_sync_buff, + list); + list_del(&sb->list); + } + spin_unlock_bh(&ip_vs_sync_lock); + + return sb; +} + +static inline struct ip_vs_sync_buff * ip_vs_sync_buff_create(void) +{ + struct ip_vs_sync_buff *sb; + + if (!(sb=kmalloc(sizeof(struct ip_vs_sync_buff), GFP_ATOMIC))) + return NULL; + + if (!(sb->mesg=kmalloc(sync_send_mesg_maxlen, GFP_ATOMIC))) { + kfree(sb); + return NULL; + } + sb->mesg->nr_conns = 0; + sb->mesg->syncid = ip_vs_master_syncid; + sb->mesg->size = 4; + sb->head = (unsigned char *)sb->mesg + 4; + sb->end = (unsigned char *)sb->mesg + sync_send_mesg_maxlen; + sb->firstuse = jiffies; + return sb; +} + +static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb) +{ + kfree(sb->mesg); + kfree(sb); +} + +/* + * Get the current sync buffer if it has been created for more + * than the specified time or the specified time is zero. + */ +static inline struct ip_vs_sync_buff * +get_curr_sync_buff(unsigned long time) +{ + struct ip_vs_sync_buff *sb; + + spin_lock_bh(&curr_sb_lock); + if (curr_sb && (time == 0 || + time_before(jiffies - curr_sb->firstuse, time))) { + sb = curr_sb; + curr_sb = NULL; + } else + sb = NULL; + spin_unlock_bh(&curr_sb_lock); + return sb; +} + + +/* + * Add an ip_vs_conn information into the current sync_buff. + * Called by ip_vs_in. + */ +void ip_vs_sync_conn(struct ip_vs_conn *cp) +{ + struct ip_vs_sync_mesg *m; + struct ip_vs_sync_conn *s; + int len; + + spin_lock(&curr_sb_lock); + if (!curr_sb) { + if (!(curr_sb=ip_vs_sync_buff_create())) { + spin_unlock(&curr_sb_lock); + IP_VS_ERR("ip_vs_sync_buff_create failed.\n"); + return; + } + } + + len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? 
FULL_CONN_SIZE : + SIMPLE_CONN_SIZE; + m = curr_sb->mesg; + s = (struct ip_vs_sync_conn *)curr_sb->head; + + /* copy members */ + s->protocol = cp->protocol; + s->cport = cp->cport; + s->vport = cp->vport; + s->dport = cp->dport; + s->caddr = cp->caddr; + s->vaddr = cp->vaddr; + s->daddr = cp->daddr; + s->flags = htons(cp->flags & ~IP_VS_CONN_F_HASHED); + s->state = htons(cp->state); + if (cp->flags & IP_VS_CONN_F_SEQ_MASK) { + struct ip_vs_sync_conn_options *opt = + (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(opt, &cp->in_seq, sizeof(*opt)); + } + + m->nr_conns++; + m->size += len; + curr_sb->head += len; + + /* check if there is a space for next one */ + if (curr_sb->head+FULL_CONN_SIZE > curr_sb->end) { + sb_queue_tail(curr_sb); + curr_sb = NULL; + } + spin_unlock(&curr_sb_lock); + + /* synchronize its controller if it has */ + if (cp->control) + ip_vs_sync_conn(cp->control); +} + + +/* + * Process received multicast message and create the corresponding + * ip_vs_conn entries. + */ +static void ip_vs_process_message(const char *buffer, const size_t buflen) +{ + struct ip_vs_sync_mesg *m = (struct ip_vs_sync_mesg *)buffer; + struct ip_vs_sync_conn *s; + struct ip_vs_sync_conn_options *opt; + struct ip_vs_conn *cp; + char *p; + int i; + + /* Convert size back to host byte order */ + m->size = ntohs(m->size); + + if (buflen != m->size) { + IP_VS_ERR("bogus message\n"); + return; + } + + /* SyncID sanity check */ + if (ip_vs_backup_syncid != 0 && m->syncid != ip_vs_backup_syncid) { + IP_VS_DBG(7, "Ignoring incoming msg with syncid = %d\n", + m->syncid); + return; + } + + p = (char *)buffer + sizeof(struct ip_vs_sync_mesg); + for (i=0; inr_conns; i++) { + s = (struct ip_vs_sync_conn *)p; + cp = ip_vs_conn_in_get(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport); + if (!cp) { + cp = ip_vs_conn_new(s->protocol, + s->caddr, s->cport, + s->vaddr, s->vport, + s->daddr, s->dport, + ntohs(s->flags), NULL); + if (!cp) { + IP_VS_ERR("ip_vs_conn_new failed\n"); + return; + } + cp->state = ntohs(s->state); + } else if (!cp->dest) { + /* it is an entry created by the synchronization */ + cp->state = ntohs(s->state); + cp->flags = ntohs(s->flags) | IP_VS_CONN_F_HASHED; + } /* Note that we don't touch its state and flags + if it is a normal entry. */ + + if (ntohs(s->flags) & IP_VS_CONN_F_SEQ_MASK) { + opt = (struct ip_vs_sync_conn_options *)&s[1]; + memcpy(&cp->in_seq, opt, sizeof(*opt)); + p += FULL_CONN_SIZE; + } else + p += SIMPLE_CONN_SIZE; + + atomic_set(&cp->in_pkts, sysctl_ip_vs_sync_threshold[0]); + cp->timeout = IP_VS_SYNC_CONN_TIMEOUT; + ip_vs_conn_put(cp); + + if (p > buffer+buflen) { + IP_VS_ERR("bogus message\n"); + return; + } + } +} + + +/* + * Setup loopback of outgoing multicasts on a sending socket + */ +static void set_mcast_loop(struct sock *sk, u_char loop) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_LOOP, &loop, sizeof(loop)); */ + lock_sock(sk); + inet->mc_loop = loop ? 
1 : 0; + release_sock(sk); +} + +/* + * Specify TTL for outgoing multicasts on a sending socket + */ +static void set_mcast_ttl(struct sock *sk, u_char ttl) +{ + struct inet_sock *inet = inet_sk(sk); + + /* setsockopt(sock, SOL_IP, IP_MULTICAST_TTL, &ttl, sizeof(ttl)); */ + lock_sock(sk); + inet->mc_ttl = ttl; + release_sock(sk); +} + +/* + * Specifiy default interface for outgoing multicasts + */ +static int set_mcast_if(struct sock *sk, char *ifname) +{ + struct net_device *dev; + struct inet_sock *inet = inet_sk(sk); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + lock_sock(sk); + inet->mc_index = dev->ifindex; + /* inet->mc_addr = 0; */ + release_sock(sk); + + return 0; +} + + +/* + * Set the maximum length of sync message according to the + * specified interface's MTU. + */ +static int set_sync_mesg_maxlen(int sync_state) +{ + struct net_device *dev; + int num; + + if (sync_state == IP_VS_STATE_MASTER) { + if ((dev = __dev_get_by_name(ip_vs_master_mcast_ifn)) == NULL) + return -ENODEV; + + num = (dev->mtu - sizeof(struct iphdr) - + sizeof(struct udphdr) - + SYNC_MESG_HEADER_LEN - 20) / SIMPLE_CONN_SIZE; + sync_send_mesg_maxlen = + SYNC_MESG_HEADER_LEN + SIMPLE_CONN_SIZE * num; + IP_VS_DBG(7, "setting the maximum length of sync sending " + "message %d.\n", sync_send_mesg_maxlen); + } else if (sync_state == IP_VS_STATE_BACKUP) { + if ((dev = __dev_get_by_name(ip_vs_backup_mcast_ifn)) == NULL) + return -ENODEV; + + sync_recv_mesg_maxlen = dev->mtu - + sizeof(struct iphdr) - sizeof(struct udphdr); + IP_VS_DBG(7, "setting the maximum length of sync receiving " + "message %d.\n", sync_recv_mesg_maxlen); + } + + return 0; +} + + +/* + * Join a multicast group. + * the group is specified by a class D multicast address 224.0.0.0/8 + * in the in_addr structure passed in as a parameter. 
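set_sync_mesg_maxlen() above sizes the sync buffers so that one message fits a single UDP datagram on the chosen interface: the MTU less the IP and UDP headers, the 4-byte sync header and a 20-byte slack, divided by the size of one connection entry. Assuming struct ip_vs_sync_conn above packs to 24 bytes, a 1500-byte MTU works out as follows (illustrative arithmetic):

/* 1500 - 20 (iphdr) - 8 (udphdr) - 4 (sync header) - 20 (slack) = 1448
 * 1448 / 24 = 60 simple entries per message
 * maxlen    = 4 + 60 * 24 = 1444 bytes of send buffer */
static int example_send_maxlen(int mtu)
{
	const int entry = 24;	/* assumed sizeof(struct ip_vs_sync_conn) */
	int num = (mtu - 20 - 8 - 4 - 20) / entry;

	return 4 + entry * num;
}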
+ */ +static int +join_mcast_group(struct sock *sk, struct in_addr *addr, char *ifname) +{ + struct ip_mreqn mreq; + struct net_device *dev; + int ret; + + memset(&mreq, 0, sizeof(mreq)); + memcpy(&mreq.imr_multiaddr, addr, sizeof(struct in_addr)); + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + if (sk->sk_bound_dev_if && dev->ifindex != sk->sk_bound_dev_if) + return -EINVAL; + + mreq.imr_ifindex = dev->ifindex; + + lock_sock(sk); + ret = ip_mc_join_group(sk, &mreq); + release_sock(sk); + + return ret; +} + + +static int bind_mcastif_addr(struct socket *sock, char *ifname) +{ + struct net_device *dev; + u32 addr; + struct sockaddr_in sin; + + if ((dev = __dev_get_by_name(ifname)) == NULL) + return -ENODEV; + + addr = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + if (!addr) + IP_VS_ERR("You probably need to specify IP address on " + "multicast interface.\n"); + + IP_VS_DBG(7, "binding socket with (%s) %u.%u.%u.%u\n", + ifname, NIPQUAD(addr)); + + /* Now bind the socket with the address of multicast interface */ + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = addr; + sin.sin_port = 0; + + return sock->ops->bind(sock, (struct sockaddr*)&sin, sizeof(sin)); +} + +/* + * Set up sending multicast socket over UDP + */ +static struct socket * make_send_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + if (set_mcast_if(sock->sk, ip_vs_master_mcast_ifn) < 0) { + IP_VS_ERR("Error setting outbound mcast interface\n"); + goto error; + } + + set_mcast_loop(sock->sk, 0); + set_mcast_ttl(sock->sk, 1); + + if (bind_mcastif_addr(sock, ip_vs_master_mcast_ifn) < 0) { + IP_VS_ERR("Error binding address of the mcast interface\n"); + goto error; + } + + if (sock->ops->connect(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr), 0) < 0) { + IP_VS_ERR("Error connecting to the multicast addr\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +/* + * Set up receiving multicast socket over UDP + */ +static struct socket * make_receive_sock(void) +{ + struct socket *sock; + + /* First create a socket */ + if (sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock) < 0) { + IP_VS_ERR("Error during creation of socket; terminating\n"); + return NULL; + } + + /* it is equivalent to the REUSEADDR option in user-space */ + sock->sk->sk_reuse = 1; + + if (sock->ops->bind(sock, + (struct sockaddr*)&mcast_addr, + sizeof(struct sockaddr)) < 0) { + IP_VS_ERR("Error binding to the multicast addr\n"); + goto error; + } + + /* join the multicast group */ + if (join_mcast_group(sock->sk, + (struct in_addr*)&mcast_addr.sin_addr, + ip_vs_backup_mcast_ifn) < 0) { + IP_VS_ERR("Error joining to the multicast group\n"); + goto error; + } + + return sock; + + error: + sock_release(sock); + return NULL; +} + + +static int +ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length) +{ + struct msghdr msg = {.msg_flags = MSG_DONTWAIT|MSG_NOSIGNAL}; + struct kvec iov; + int len; + + EnterFunction(7); + iov.iov_base = (void *)buffer; + iov.iov_len = length; + + len = kernel_sendmsg(sock, &msg, &iov, 1, (size_t)(length)); + + LeaveFunction(7); + return len; +} + +static void +ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg) +{ + int msize; + + msize = msg->size; + + /* Put size in network byte order */ + msg->size = htons(msg->size); + + if 
(ip_vs_send_async(sock, (char *)msg, msize) != msize) + IP_VS_ERR("ip_vs_send_async error\n"); +} + +static int +ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen) +{ + struct msghdr msg = {NULL,}; + struct kvec iov; + int len; + + EnterFunction(7); + + /* Receive a packet */ + iov.iov_base = buffer; + iov.iov_len = (size_t)buflen; + + len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0); + + if (len < 0) + return -1; + + LeaveFunction(7); + return len; +} + + +static DECLARE_WAIT_QUEUE_HEAD(sync_wait); +static pid_t sync_master_pid = 0; +static pid_t sync_backup_pid = 0; + +static DECLARE_WAIT_QUEUE_HEAD(stop_sync_wait); +static int stop_master_sync = 0; +static int stop_backup_sync = 0; + +static void sync_master_loop(void) +{ + struct socket *sock; + struct ip_vs_sync_buff *sb; + + /* create the sending multicast socket */ + sock = make_send_sock(); + if (!sock) + return; + + IP_VS_INFO("sync thread started: state = MASTER, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_master_mcast_ifn, ip_vs_master_syncid); + + for (;;) { + while ((sb=sb_dequeue())) { + ip_vs_send_sync_msg(sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + /* check if entries stay in curr_sb for 2 seconds */ + if ((sb = get_curr_sync_buff(2*HZ))) { + ip_vs_send_sync_msg(sock, sb->mesg); + ip_vs_sync_buff_release(sb); + } + + if (stop_master_sync) + break; + + ssleep(1); + } + + /* clean up the sync_buff queue */ + while ((sb=sb_dequeue())) { + ip_vs_sync_buff_release(sb); + } + + /* clean up the current sync_buff */ + if ((sb = get_curr_sync_buff(0))) { + ip_vs_sync_buff_release(sb); + } + + /* release the sending multicast socket */ + sock_release(sock); +} + + +static void sync_backup_loop(void) +{ + struct socket *sock; + char *buf; + int len; + + if (!(buf = kmalloc(sync_recv_mesg_maxlen, GFP_ATOMIC))) { + IP_VS_ERR("sync_backup_loop: kmalloc error\n"); + return; + } + + /* create the receiving multicast socket */ + sock = make_receive_sock(); + if (!sock) + goto out; + + IP_VS_INFO("sync thread started: state = BACKUP, mcast_ifn = %s, " + "syncid = %d\n", + ip_vs_backup_mcast_ifn, ip_vs_backup_syncid); + + for (;;) { + /* do you have data now? 
*/ + while (!skb_queue_empty(&(sock->sk->sk_receive_queue))) { + if ((len = + ip_vs_receive(sock, buf, + sync_recv_mesg_maxlen)) <= 0) { + IP_VS_ERR("receiving message error\n"); + break; + } + /* disable bottom half, because it accessed the data + shared by softirq while getting/creating conns */ + local_bh_disable(); + ip_vs_process_message(buf, len); + local_bh_enable(); + } + + if (stop_backup_sync) + break; + + ssleep(1); + } + + /* release the sending multicast socket */ + sock_release(sock); + + out: + kfree(buf); +} + + +static void set_sync_pid(int sync_state, pid_t sync_pid) +{ + if (sync_state == IP_VS_STATE_MASTER) + sync_master_pid = sync_pid; + else if (sync_state == IP_VS_STATE_BACKUP) + sync_backup_pid = sync_pid; +} + +static void set_stop_sync(int sync_state, int set) +{ + if (sync_state == IP_VS_STATE_MASTER) + stop_master_sync = set; + else if (sync_state == IP_VS_STATE_BACKUP) + stop_backup_sync = set; + else { + stop_master_sync = set; + stop_backup_sync = set; + } +} + +static int sync_thread(void *startup) +{ + DECLARE_WAITQUEUE(wait, current); + mm_segment_t oldmm; + int state; + const char *name; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + if (ip_vs_sync_state & IP_VS_STATE_MASTER && !sync_master_pid) { + state = IP_VS_STATE_MASTER; + name = "ipvs_syncmaster"; + } else if (ip_vs_sync_state & IP_VS_STATE_BACKUP && !sync_backup_pid) { + state = IP_VS_STATE_BACKUP; + name = "ipvs_syncbackup"; + } else { + IP_VS_BUG(); + ip_vs_use_count_dec(); + return -EINVAL; + } + + daemonize(name); + + oldmm = get_fs(); + set_fs(KERNEL_DS); + + /* Block all signals */ + spin_lock_irq(¤t->sighand->siglock); + siginitsetinv(¤t->blocked, 0); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); + + /* set the maximum length of sync message */ + set_sync_mesg_maxlen(state); + + /* set up multicast address */ + mcast_addr.sin_family = AF_INET; + mcast_addr.sin_port = htons(IP_VS_SYNC_PORT); + mcast_addr.sin_addr.s_addr = htonl(IP_VS_SYNC_GROUP); + + add_wait_queue(&sync_wait, &wait); + + set_sync_pid(state, current->pid); + complete((struct completion *)startup); + + /* processing master/backup loop here */ + if (state == IP_VS_STATE_MASTER) + sync_master_loop(); + else if (state == IP_VS_STATE_BACKUP) + sync_backup_loop(); + else IP_VS_BUG(); + + remove_wait_queue(&sync_wait, &wait); + + /* thread exits */ + set_sync_pid(state, 0); + IP_VS_INFO("sync thread stopped!\n"); + + set_fs(oldmm); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + set_stop_sync(state, 0); + wake_up(&stop_sync_wait); + + return 0; +} + + +static int fork_sync_thread(void *startup) +{ + pid_t pid; + + /* fork the sync thread here, then the parent process of the + sync thread is the init process after this thread exits. */ + repeat: + if ((pid = kernel_thread(sync_thread, startup, 0)) < 0) { + IP_VS_ERR("could not create sync_thread due to %d... 
" + "retrying.\n", pid); + ssleep(1); + goto repeat; + } + + return 0; +} + + +int start_sync_thread(int state, char *mcast_ifn, __u8 syncid) +{ + DECLARE_COMPLETION(startup); + pid_t pid; + + if ((state == IP_VS_STATE_MASTER && sync_master_pid) || + (state == IP_VS_STATE_BACKUP && sync_backup_pid)) + return -EEXIST; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_DBG(7, "Each ip_vs_sync_conn entry need %Zd bytes\n", + sizeof(struct ip_vs_sync_conn)); + + ip_vs_sync_state |= state; + if (state == IP_VS_STATE_MASTER) { + strcpy(ip_vs_master_mcast_ifn, mcast_ifn); + ip_vs_master_syncid = syncid; + } else { + strcpy(ip_vs_backup_mcast_ifn, mcast_ifn); + ip_vs_backup_syncid = syncid; + } + + repeat: + if ((pid = kernel_thread(fork_sync_thread, &startup, 0)) < 0) { + IP_VS_ERR("could not create fork_sync_thread due to %d... " + "retrying.\n", pid); + ssleep(1); + goto repeat; + } + + wait_for_completion(&startup); + + return 0; +} + + +int stop_sync_thread(int state) +{ + DECLARE_WAITQUEUE(wait, current); + + if ((state == IP_VS_STATE_MASTER && !sync_master_pid) || + (state == IP_VS_STATE_BACKUP && !sync_backup_pid)) + return -ESRCH; + + IP_VS_DBG(7, "%s: pid %d\n", __FUNCTION__, current->pid); + IP_VS_INFO("stopping sync thread %d ...\n", + (state == IP_VS_STATE_MASTER) ? sync_master_pid : sync_backup_pid); + + __set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&stop_sync_wait, &wait); + set_stop_sync(state, 1); + ip_vs_sync_state -= state; + wake_up(&sync_wait); + schedule(); + __set_current_state(TASK_RUNNING); + remove_wait_queue(&stop_sync_wait, &wait); + + /* Note: no need to reap the sync thread, because its parent + process is the init process */ + + if ((state == IP_VS_STATE_MASTER && stop_master_sync) || + (state == IP_VS_STATE_BACKUP && stop_backup_sync)) + IP_VS_BUG(); + + return 0; +} diff --git a/net/ipv4/ipvs/ip_vs_wlc.c b/net/ipv4/ipvs/ip_vs_wlc.c new file mode 100644 index 000000000000..8a9d913261d8 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wlc.c @@ -0,0 +1,151 @@ +/* + * IPVS: Weighted Least-Connection Scheduling module + * + * Version: $Id: ip_vs_wlc.c,v 1.13 2003/04/18 09:03:16 wensong Exp $ + * + * Authors: Wensong Zhang + * Peter Kese + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Wensong Zhang : changed the ip_vs_wlc_schedule to return dest + * Wensong Zhang : changed to use the inactconns in scheduling + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wlc_update_svc + * Wensong Zhang : added any dest with weight=0 is quiesced + * + */ + +#include +#include + +#include + + +static int +ip_vs_wlc_init_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_wlc_done_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static int +ip_vs_wlc_update_svc(struct ip_vs_service *svc) +{ + return 0; +} + + +static inline unsigned int +ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. 
(This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + + +/* + * Weighted Least Connection scheduling + */ +static struct ip_vs_dest * +ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest, *least; + unsigned int loh, doh; + + IP_VS_DBG(6, "ip_vs_wlc_schedule(): Scheduling...\n"); + + /* + * We calculate the load of each dest server as follows: + * (dest overhead) / dest->weight + * + * Remember -- no floats in kernel mode!!! + * The comparison of h1*w2 > h2*w1 is equivalent to that of + * h1/w1 > h2/w2 + * if every weight is larger than zero. + * + * The server with weight=0 is quiesced and will not receive any + * new connections. + */ + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) > 0) { + least = dest; + loh = ip_vs_wlc_dest_overhead(least); + goto nextstage; + } + } + return NULL; + + /* + * Find the destination with the least load. + */ + nextstage: + list_for_each_entry_continue(dest, &svc->destinations, n_list) { + if (dest->flags & IP_VS_DEST_F_OVERLOAD) + continue; + doh = ip_vs_wlc_dest_overhead(dest); + if (loh * atomic_read(&dest->weight) > + doh * atomic_read(&least->weight)) { + least = dest; + loh = doh; + } + } + + IP_VS_DBG(6, "WLC: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d overhead %d\n", + NIPQUAD(least->addr), ntohs(least->port), + atomic_read(&least->activeconns), + atomic_read(&least->refcnt), + atomic_read(&least->weight), loh); + + return least; +} + + +static struct ip_vs_scheduler ip_vs_wlc_scheduler = +{ + .name = "wlc", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_wlc_init_svc, + .done_service = ip_vs_wlc_done_svc, + .update_service = ip_vs_wlc_update_svc, + .schedule = ip_vs_wlc_schedule, +}; + + +static int __init ip_vs_wlc_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wlc_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +static void __exit ip_vs_wlc_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wlc_scheduler); +} + +module_init(ip_vs_wlc_init); +module_exit(ip_vs_wlc_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_wrr.c b/net/ipv4/ipvs/ip_vs_wrr.c new file mode 100644 index 000000000000..749fa044eca5 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_wrr.c @@ -0,0 +1,235 @@ +/* + * IPVS: Weighted Round-Robin Scheduling module + * + * Version: $Id: ip_vs_wrr.c,v 1.12 2002/09/15 08:14:08 wensong Exp $ + * + * Authors: Wensong Zhang + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * Wensong Zhang : changed the ip_vs_wrr_schedule to return dest + * Wensong Zhang : changed some comestics things for debugging + * Wensong Zhang : changed for the d-linked destination list + * Wensong Zhang : added the ip_vs_wrr_update_svc + * Julian Anastasov : fixed the bug of returning destination + * with weight 0 when all weights are zero + * + */ + +#include +#include + +#include + +/* + * current destination pointer for weighted round-robin scheduling + */ +struct ip_vs_wrr_mark { + struct list_head *cl; /* current list head */ + int cw; /* current weight */ + int mw; /* maximum weight */ + int di; /* decreasing interval */ +}; + + +/* + * Get the gcd of server weights + */ +static int gcd(int a, int b) +{ + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int ip_vs_wrr_gcd_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight; + int g = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + weight = atomic_read(&dest->weight); + if (weight > 0) { + if (g > 0) + g = gcd(weight, g); + else + g = weight; + } + } + return g ? g : 1; +} + + +/* + * Get the maximum weight of the service destinations. + */ +static int ip_vs_wrr_max_weight(struct ip_vs_service *svc) +{ + struct ip_vs_dest *dest; + int weight = 0; + + list_for_each_entry(dest, &svc->destinations, n_list) { + if (atomic_read(&dest->weight) > weight) + weight = atomic_read(&dest->weight); + } + + return weight; +} + + +static int ip_vs_wrr_init_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark; + + /* + * Allocate the mark variable for WRR scheduling + */ + mark = kmalloc(sizeof(struct ip_vs_wrr_mark), GFP_ATOMIC); + if (mark == NULL) { + IP_VS_ERR("ip_vs_wrr_init_svc(): no memory\n"); + return -ENOMEM; + } + mark->cl = &svc->destinations; + mark->cw = 0; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + svc->sched_data = mark; + + return 0; +} + + +static int ip_vs_wrr_done_svc(struct ip_vs_service *svc) +{ + /* + * Release the mark variable + */ + kfree(svc->sched_data); + + return 0; +} + + +static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) +{ + struct ip_vs_wrr_mark *mark = svc->sched_data; + + mark->cl = &svc->destinations; + mark->mw = ip_vs_wrr_max_weight(svc); + mark->di = ip_vs_wrr_gcd_weight(svc); + if (mark->cw > mark->mw) + mark->cw = 0; + return 0; +} + + +/* + * Weighted Round-Robin Scheduling + */ +static struct ip_vs_dest * +ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +{ + struct ip_vs_dest *dest; + struct ip_vs_wrr_mark *mark = svc->sched_data; + struct list_head *p; + + IP_VS_DBG(6, "ip_vs_wrr_schedule(): Scheduling...\n"); + + /* + * This loop will always terminate, because mark->cw in (0, max_weight] + * and at least one server has its weight equal to max_weight. + */ + write_lock(&svc->sched_lock); + p = mark->cl; + while (1) { + if (mark->cl == &svc->destinations) { + /* it is at the head of the destination list */ + + if (mark->cl == mark->cl->next) { + /* no dest entry */ + dest = NULL; + goto out; + } + + mark->cl = svc->destinations.next; + mark->cw -= mark->di; + if (mark->cw <= 0) { + mark->cw = mark->mw; + /* + * Still zero, which means no available servers. 
+ */ + if (mark->cw == 0) { + mark->cl = &svc->destinations; + IP_VS_INFO("ip_vs_wrr_schedule(): " + "no available servers\n"); + dest = NULL; + goto out; + } + } + } else + mark->cl = mark->cl->next; + + if (mark->cl != &svc->destinations) { + /* not at the head of the list */ + dest = list_entry(mark->cl, struct ip_vs_dest, n_list); + if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && + atomic_read(&dest->weight) >= mark->cw) { + /* got it */ + break; + } + } + + if (mark->cl == p && mark->cw == mark->di) { + /* back to the start, and no dest is found. + It is only possible when all dests are OVERLOADED */ + dest = NULL; + goto out; + } + } + + IP_VS_DBG(6, "WRR: server %u.%u.%u.%u:%u " + "activeconns %d refcnt %d weight %d\n", + NIPQUAD(dest->addr), ntohs(dest->port), + atomic_read(&dest->activeconns), + atomic_read(&dest->refcnt), + atomic_read(&dest->weight)); + + out: + write_unlock(&svc->sched_lock); + return dest; +} + + +static struct ip_vs_scheduler ip_vs_wrr_scheduler = { + .name = "wrr", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .init_service = ip_vs_wrr_init_svc, + .done_service = ip_vs_wrr_done_svc, + .update_service = ip_vs_wrr_update_svc, + .schedule = ip_vs_wrr_schedule, +}; + +static int __init ip_vs_wrr_init(void) +{ + INIT_LIST_HEAD(&ip_vs_wrr_scheduler.n_list); + return register_ip_vs_scheduler(&ip_vs_wrr_scheduler) ; +} + +static void __exit ip_vs_wrr_cleanup(void) +{ + unregister_ip_vs_scheduler(&ip_vs_wrr_scheduler); +} + +module_init(ip_vs_wrr_init); +module_exit(ip_vs_wrr_cleanup); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/ipvs/ip_vs_xmit.c b/net/ipv4/ipvs/ip_vs_xmit.c new file mode 100644 index 000000000000..faa6176bbeb1 --- /dev/null +++ b/net/ipv4/ipvs/ip_vs_xmit.c @@ -0,0 +1,563 @@ +/* + * ip_vs_xmit.c: various packet transmitters for IPVS + * + * Version: $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $ + * + * Authors: Wensong Zhang + * Julian Anastasov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Changes: + * + */ + +#include +#include +#include /* for tcphdr */ +#include /* for csum_tcpudp_magic */ +#include +#include /* for icmp_send */ +#include /* for ip_route_output */ +#include +#include + +#include + + +/* + * Destination cache to speed up outgoing route lookup + */ +static inline void +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = dst; + dest->dst_rtos = rtos; + dst_release(old_dst); +} + +static inline struct dst_entry * +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +{ + struct dst_entry *dst = dest->dst_cache; + + if (!dst) + return NULL; + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, cookie) == NULL) { + dest->dst_cache = NULL; + dst_release(dst); + return NULL; + } + dst_hold(dst); + return dst; +} + +static inline struct rtable * +__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +{ + struct rtable *rt; /* Route to the other host */ + struct ip_vs_dest *dest = cp->dest; + + if (dest) { + spin_lock(&dest->dst_lock); + if (!(rt = (struct rtable *) + __ip_vs_dst_check(dest, rtos, 0))) { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = dest->addr, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&rt, &fl)) { + spin_unlock(&dest->dst_lock); + IP_VS_DBG_RL("ip_route_output error, " + "dest: %u.%u.%u.%u\n", + NIPQUAD(dest->addr)); + return NULL; + } + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst)); + IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n", + NIPQUAD(dest->addr), + atomic_read(&rt->u.dst.__refcnt), rtos); + } + spin_unlock(&dest->dst_lock); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = cp->daddr, + .saddr = 0, + .tos = rtos, } }, + }; + + if (ip_route_output_key(&rt, &fl)) { + IP_VS_DBG_RL("ip_route_output error, dest: " + "%u.%u.%u.%u\n", NIPQUAD(cp->daddr)); + return NULL; + } + } + + return rt; +} + + +/* + * Release dest->dst_cache before a dest is removed + */ +void +ip_vs_dst_reset(struct ip_vs_dest *dest) +{ + struct dst_entry *old_dst; + + old_dst = dest->dst_cache; + dest->dst_cache = NULL; + dst_release(old_dst); +} + +#define IP_VS_XMIT(skb, rt) \ +do { \ + nf_reset_debug(skb); \ + (skb)->nfcache |= NFC_IPVS_PROPERTY; \ + (skb)->ip_summed = CHECKSUM_NONE; \ + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \ + (rt)->u.dst.dev, dst_output); \ +} while (0) + + +/* + * NULL transmitter (do nothing except return NF_ACCEPT) + */ +int +ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + /* we do not touch skb and do not need pskb ptr */ + return NF_ACCEPT; +} + + +/* + * Bypass transmitter + * Let packets bypass the destination when the destination is not + * available, it may be only used in transparent cache cluster. 
+ */ +int +ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + u8 tos = iph->tos; + int mtu; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = 0, + .tos = RT_TOS(tos), } }, + }; + + EnterFunction(10); + + if (ip_route_output_key(&rt, &fl)) { + IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, " + "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr)); + goto tx_error_icmp; + } + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(skb->nh.iph); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + + +/* + * NAT transmitter (only for outside-to-inside nat forwarding) + * Not used for related ICMP + */ +int +ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + struct iphdr *iph = skb->nh.iph; + + EnterFunction(10); + + /* check if it is a connection of no-client-port */ + if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { + __u16 _pt, *p; + p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt); + if (p == NULL) + goto tx_error; + ip_vs_conn_fill_cport(cp, *p); + IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr))) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* mangle the packet */ + if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp)) + goto tx_error; + skb->nh.iph->daddr = cp->daddr; + ip_send_check(skb->nh.iph); + + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + + /* FIXME: when application helper enlarges the packet and the length + is larger than the MTU of outgoing device, there will be still + MTU problem. */ + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + LeaveFunction(10); + kfree_skb(skb); + return NF_STOLEN; + tx_error_put: + ip_rt_put(rt); + goto tx_error; +} + + +/* + * IP Tunneling transmitter + * + * This function encapsulates the packet in a new IP packet, its + * destination will be set to cp->daddr. 
Most code of this function + * is taken from ipip.c. + * + * It is used in VS/TUN cluster. The load balancer selects a real + * server from a cluster based on a scheduling algorithm, + * encapsulates the request packet and forwards it to the selected + * server. For example, all real servers are configured with + * "ifconfig tunl0 up". When the server receives + * the encapsulated packet, it will decapsulate the packet, processe + * the request and return the response packets directly to the client + * without passing the load balancer. This can greatly increase the + * scalability of virtual server. + * + * Used for ANY protocol + */ +int +ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *old_iph = skb->nh.iph; + u8 tos = old_iph->tos; + u16 df = old_iph->frag_off; + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + int mtu; + + EnterFunction(10); + + if (skb->protocol != __constant_htons(ETH_P_IP)) { + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, " + "ETH_P_IP: %d, skb protocol: %d\n", + __constant_htons(ETH_P_IP), skb->protocol); + goto tx_error; + } + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + goto tx_error_icmp; + + tdev = rt->u.dst.dev; + + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + if (mtu < 68) { + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n"); + goto tx_error; + } + if (skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + df |= (old_iph->frag_off&__constant_htons(IP_DF)); + + if ((old_iph->frag_off&__constant_htons(IP_DF)) + && mtu < ntohs(old_iph->tot_len)) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom + || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + kfree_skb(skb); + IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); + return NF_STOLEN; + } + kfree_skb(skb); + skb = new_skb; + old_iph = skb->nh.iph; + } + + skb->h.raw = (void *) old_iph; + + /* fix old IP header checksum */ + ip_send_check(old_iph); + + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + iph->frag_off = df; + iph->protocol = IPPROTO_IPIP; + iph->tos = tos; + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + iph->ttl = old_iph->ttl; + iph->tot_len = htons(skb->len); + ip_select_ident(iph, &rt->u.dst, NULL); + ip_send_check(iph); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + + +/* + * Direct Routing transmitter + * Used for ANY protocol + */ +int +ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp) +{ + struct rtable *rt; /* Route to the other host */ + struct iphdr *iph = skb->nh.iph; + int mtu; + + EnterFunction(10); + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((iph->frag_off&__constant_htons(IP_DF)) && skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); + ip_rt_put(rt); + IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n"); + goto tx_error; + } + + /* + * Call ip_send_check because we are not sure it is called + * after ip_defrag. Is copy-on-write needed? + */ + if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) { + ip_rt_put(rt); + return NF_STOLEN; + } + ip_send_check(skb->nh.iph); + + /* drop old route */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + LeaveFunction(10); + return NF_STOLEN; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + kfree_skb(skb); + LeaveFunction(10); + return NF_STOLEN; +} + + +/* + * ICMP packet transmitter + * called by the ip_vs_in_icmp + */ +int +ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, + struct ip_vs_protocol *pp, int offset) +{ + struct rtable *rt; /* Route to the other host */ + int mtu; + int rc; + + EnterFunction(10); + + /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be + forwarded directly here, because there is no need to + translate address/port back */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) { + if (cp->packet_xmit) + rc = cp->packet_xmit(skb, cp, pp); + else + rc = NF_ACCEPT; + /* do not touch skb anymore */ + atomic_inc(&cp->in_pkts); + __ip_vs_conn_put(cp); + goto out; + } + + /* + * mangle and send the packet here (only for VS/NAT) + */ + + if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos)))) + goto tx_error_icmp; + + /* MTU checking */ + mtu = dst_mtu(&rt->u.dst); + if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) { + ip_rt_put(rt); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); + goto tx_error; + } + + /* copy-on-write the packet before mangling it */ + if (!ip_vs_make_skb_writable(&skb, offset)) + goto tx_error_put; + + if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) + goto tx_error_put; + + /* drop the old route when skb is not shared */ + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + ip_vs_nat_icmp(skb, pp, cp, 0); + + /* Another hack: avoid icmp_send in ip_fragment */ + skb->local_df = 1; + + IP_VS_XMIT(skb, rt); + + rc = NF_STOLEN; + goto out; + + tx_error_icmp: + dst_link_failure(skb); + tx_error: + dev_kfree_skb(skb); + rc = NF_STOLEN; + out: + LeaveFunction(10); + return rc; + 
tx_error_put: + ip_rt_put(rt); + goto tx_error; +} diff --git a/net/ipv4/multipath.c b/net/ipv4/multipath.c new file mode 100644 index 000000000000..4e9ca7c76407 --- /dev/null +++ b/net/ipv4/multipath.c @@ -0,0 +1,55 @@ +/* multipath.c: IPV4 multipath algorithm support. + * + * Copyright (C) 2004, 2005 Einar Lueck + * Copyright (C) 2005 David S. Miller + */ + +#include +#include +#include +#include + +#include + +static DEFINE_SPINLOCK(alg_table_lock); +struct ip_mp_alg_ops *ip_mp_alg_table[IP_MP_ALG_MAX + 1]; + +int multipath_alg_register(struct ip_mp_alg_ops *ops, enum ip_mp_alg n) +{ + struct ip_mp_alg_ops **slot; + int err; + + if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX || + !ops->mp_alg_select_route) + return -EINVAL; + + spin_lock(&alg_table_lock); + slot = &ip_mp_alg_table[n]; + if (*slot != NULL) { + err = -EBUSY; + } else { + *slot = ops; + err = 0; + } + spin_unlock(&alg_table_lock); + + return err; +} +EXPORT_SYMBOL(multipath_alg_register); + +void multipath_alg_unregister(struct ip_mp_alg_ops *ops, enum ip_mp_alg n) +{ + struct ip_mp_alg_ops **slot; + + if (n < IP_MP_ALG_NONE || n > IP_MP_ALG_MAX) + return; + + spin_lock(&alg_table_lock); + slot = &ip_mp_alg_table[n]; + if (*slot == ops) + *slot = NULL; + spin_unlock(&alg_table_lock); + + synchronize_net(); +} +EXPORT_SYMBOL(multipath_alg_unregister); diff --git a/net/ipv4/multipath_drr.c b/net/ipv4/multipath_drr.c new file mode 100644 index 000000000000..9349686131fc --- /dev/null +++ b/net/ipv4/multipath_drr.c @@ -0,0 +1,265 @@ +/* + * Device round robin policy for multipath. + * + * + * Version: $Id: multipath_drr.c,v 1.1.2.1 2004/09/16 07:42:34 elueck Exp $ + * + * Authors: Einar Lueck + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct multipath_device { + int ifi; /* interface index of device */ + atomic_t usecount; + int allocated; +}; + +#define MULTIPATH_MAX_DEVICECANDIDATES 10 + +static struct multipath_device state[MULTIPATH_MAX_DEVICECANDIDATES]; +static DEFINE_SPINLOCK(state_lock); +static struct rtable *last_selection = NULL; + +static int inline __multipath_findslot(void) +{ + int i; + + for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) { + if (state[i].allocated == 0) + return i; + } + return -1; +} + +static int inline __multipath_finddev(int ifindex) +{ + int i; + + for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) { + if (state[i].allocated != 0 && + state[i].ifi == ifindex) + return i; + } + return -1; +} + +static int drr_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + int devidx; + + switch (event) { + case NETDEV_UNREGISTER: + case NETDEV_DOWN: + spin_lock_bh(&state_lock); + + devidx = __multipath_finddev(dev->ifindex); + if (devidx != -1) { + state[devidx].allocated = 0; + state[devidx].ifi = 0; + atomic_set(&state[devidx].usecount, 0); + } + + spin_unlock_bh(&state_lock); + break; + }; + + return NOTIFY_DONE; +} + +struct notifier_block drr_dev_notifier = { + .notifier_call = drr_dev_event, +}; + +static void drr_remove(struct rtable *rt) +{ + if (last_selection == rt) + last_selection = NULL; +} + +static void drr_safe_inc(atomic_t *usecount) +{ + int n; + + atomic_inc(usecount); + + n = atomic_read(usecount); + if (n <= 0) { + int i; + + spin_lock_bh(&state_lock); + + for (i = 0; i < MULTIPATH_MAX_DEVICECANDIDATES; i++) + atomic_set(&state[i].usecount, 0); + + spin_unlock_bh(&state_lock); + } +} + +static void drr_select_route(const struct flowi *flp, + struct rtable *first, struct rtable **rp) +{ + struct rtable *nh, *result, *cur_min; + int min_usecount = -1; + int devidx = -1; + int cur_min_devidx = -1; + + /* if necessary and possible utilize the old alternative */ + if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && + last_selection != NULL) { + result = last_selection; + *rp = result; + return; + } + + /* 1. make sure all alt. nexthops have the same GC related data */ + /* 2. 
determine the new candidate to be returned */ + result = NULL; + cur_min = NULL; + for (nh = rcu_dereference(first); nh; + nh = rcu_dereference(nh->u.rt_next)) { + if ((nh->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&nh->fl, flp)) { + int nh_ifidx = nh->u.dst.dev->ifindex; + + nh->u.dst.lastuse = jiffies; + nh->u.dst.__use++; + if (result != NULL) + continue; + + /* search for the output interface */ + + /* this is not SMP safe, only add/remove are + * SMP safe as wrong usecount updates have no big + * impact + */ + devidx = __multipath_finddev(nh_ifidx); + if (devidx == -1) { + /* add the interface to the array + * SMP safe + */ + spin_lock_bh(&state_lock); + + /* due to SMP: search again */ + devidx = __multipath_finddev(nh_ifidx); + if (devidx == -1) { + /* add entry for device */ + devidx = __multipath_findslot(); + if (devidx == -1) { + /* unlikely but possible */ + continue; + } + + state[devidx].allocated = 1; + state[devidx].ifi = nh_ifidx; + atomic_set(&state[devidx].usecount, 0); + min_usecount = 0; + } + + spin_unlock_bh(&state_lock); + } + + if (min_usecount == 0) { + /* if the device has not been used it is + * the primary target + */ + drr_safe_inc(&state[devidx].usecount); + result = nh; + } else { + int count = + atomic_read(&state[devidx].usecount); + + if (min_usecount == -1 || + count < min_usecount) { + cur_min = nh; + cur_min_devidx = devidx; + min_usecount = count; + } + } + } + } + + if (!result) { + if (cur_min) { + drr_safe_inc(&state[cur_min_devidx].usecount); + result = cur_min; + } else { + result = first; + } + } + + *rp = result; + last_selection = result; +} + +static struct ip_mp_alg_ops drr_ops = { + .mp_alg_select_route = drr_select_route, + .mp_alg_remove = drr_remove, +}; + +static int __init drr_init(void) +{ + int err = register_netdevice_notifier(&drr_dev_notifier); + + if (err) + return err; + + err = multipath_alg_register(&drr_ops, IP_MP_ALG_RR); + if (err) + goto fail; + + return 0; + +fail: + unregister_netdevice_notifier(&drr_dev_notifier); + return err; +} + +static void __exit drr_exit(void) +{ + unregister_netdevice_notifier(&drr_dev_notifier); + multipath_alg_unregister(&drr_ops, IP_MP_ALG_DRR); +} + +module_init(drr_init); +module_exit(drr_exit); diff --git a/net/ipv4/multipath_random.c b/net/ipv4/multipath_random.c new file mode 100644 index 000000000000..805a16e47de5 --- /dev/null +++ b/net/ipv4/multipath_random.c @@ -0,0 +1,128 @@ +/* + * Random policy for multipath. + * + * + * Version: $Id: multipath_random.c,v 1.1.2.3 2004/09/21 08:42:11 elueck Exp $ + * + * Authors: Einar Lueck + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MULTIPATH_MAX_CANDIDATES 40 + +/* interface to random number generation */ +static unsigned int RANDOM_SEED = 93186752; + +static inline unsigned int random(unsigned int ubound) +{ + static unsigned int a = 1588635695, + q = 2, + r = 1117695901; + + RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); + + return RANDOM_SEED % ubound; +} + + +static void random_select_route(const struct flowi *flp, + struct rtable *first, + struct rtable **rp) +{ + struct rtable *rt; + struct rtable *decision; + unsigned char candidate_count = 0; + + /* count all candidate */ + for (rt = rcu_dereference(first); rt; + rt = rcu_dereference(rt->u.rt_next)) { + if ((rt->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&rt->fl, flp)) + ++candidate_count; + } + + /* choose a random candidate */ + decision = first; + if (candidate_count > 1) { + unsigned char i = 0; + unsigned char candidate_no = (unsigned char) + random(candidate_count); + + /* find chosen candidate and adjust GC data for all candidates + * to ensure they stay in cache + */ + for (rt = first; rt; rt = rt->u.rt_next) { + if ((rt->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&rt->fl, flp)) { + rt->u.dst.lastuse = jiffies; + + if (i == candidate_no) + decision = rt; + + if (i >= candidate_count) + break; + + i++; + } + } + } + + decision->u.dst.__use++; + *rp = decision; +} + +static struct ip_mp_alg_ops random_ops = { + .mp_alg_select_route = random_select_route, +}; + +static int __init random_init(void) +{ + return multipath_alg_register(&random_ops, IP_MP_ALG_RANDOM); +} + +static void __exit random_exit(void) +{ + multipath_alg_unregister(&random_ops, IP_MP_ALG_RANDOM); +} + +module_init(random_init); +module_exit(random_exit); diff --git a/net/ipv4/multipath_rr.c b/net/ipv4/multipath_rr.c new file mode 100644 index 000000000000..554a82568160 --- /dev/null +++ b/net/ipv4/multipath_rr.c @@ -0,0 +1,115 @@ +/* + * Round robin policy for multipath. + * + * + * Version: $Id: multipath_rr.c,v 1.1.2.2 2004/09/16 07:42:34 elueck Exp $ + * + * Authors: Einar Lueck + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MULTIPATH_MAX_CANDIDATES 40 + +static struct rtable* last_used = NULL; + +static void rr_remove(struct rtable *rt) +{ + if (last_used == rt) + last_used = NULL; +} + +static void rr_select_route(const struct flowi *flp, + struct rtable *first, struct rtable **rp) +{ + struct rtable *nh, *result, *min_use_cand = NULL; + int min_use = -1; + + /* if necessary and possible utilize the old alternative */ + if ((flp->flags & FLOWI_FLAG_MULTIPATHOLDROUTE) != 0 && + last_used != NULL) { + result = last_used; + goto out; + } + + /* 1. make sure all alt. nexthops have the same GC related data + * 2. determine the new candidate to be returned + */ + result = NULL; + for (nh = rcu_dereference(first); nh; + nh = rcu_dereference(nh->u.rt_next)) { + if ((nh->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&nh->fl, flp)) { + nh->u.dst.lastuse = jiffies; + + if (min_use == -1 || nh->u.dst.__use < min_use) { + min_use = nh->u.dst.__use; + min_use_cand = nh; + } + } + } + result = min_use_cand; + if (!result) + result = first; + +out: + last_used = result; + result->u.dst.__use++; + *rp = result; +} + +static struct ip_mp_alg_ops rr_ops = { + .mp_alg_select_route = rr_select_route, + .mp_alg_remove = rr_remove, +}; + +static int __init rr_init(void) +{ + return multipath_alg_register(&rr_ops, IP_MP_ALG_RR); +} + +static void __exit rr_exit(void) +{ + multipath_alg_unregister(&rr_ops, IP_MP_ALG_RR); +} + +module_init(rr_init); +module_exit(rr_exit); diff --git a/net/ipv4/multipath_wrandom.c b/net/ipv4/multipath_wrandom.c new file mode 100644 index 000000000000..10b23e1bece6 --- /dev/null +++ b/net/ipv4/multipath_wrandom.c @@ -0,0 +1,344 @@ +/* + * Weighted random policy for multipath. + * + * + * Version: $Id: multipath_wrandom.c,v 1.1.2.3 2004/09/22 07:51:40 elueck Exp $ + * + * Authors: Einar Lueck + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MULTIPATH_STATE_SIZE 15 + +struct multipath_candidate { + struct multipath_candidate *next; + int power; + struct rtable *rt; +}; + +struct multipath_dest { + struct list_head list; + + const struct fib_nh *nh_info; + __u32 netmask; + __u32 network; + unsigned char prefixlen; + + struct rcu_head rcu; +}; + +struct multipath_bucket { + struct list_head head; + spinlock_t lock; +}; + +struct multipath_route { + struct list_head list; + + int oif; + __u32 gw; + struct list_head dests; + + struct rcu_head rcu; +}; + +/* state: primarily weight per route information */ +static struct multipath_bucket state[MULTIPATH_STATE_SIZE]; + +/* interface to random number generation */ +static unsigned int RANDOM_SEED = 93186752; + +static inline unsigned int random(unsigned int ubound) +{ + static unsigned int a = 1588635695, + q = 2, + r = 1117695901; + RANDOM_SEED = a*(RANDOM_SEED % q) - r*(RANDOM_SEED / q); + return RANDOM_SEED % ubound; +} + +static unsigned char __multipath_lookup_weight(const struct flowi *fl, + const struct rtable *rt) +{ + const int state_idx = rt->idev->dev->ifindex % MULTIPATH_STATE_SIZE; + struct multipath_route *r; + struct multipath_route *target_route = NULL; + struct multipath_dest *d; + int weight = 1; + + /* lookup the weight information for a certain route */ + rcu_read_lock(); + + /* find state entry for gateway or add one if necessary */ + list_for_each_entry_rcu(r, &state[state_idx].head, list) { + if (r->gw == rt->rt_gateway && + r->oif == rt->idev->dev->ifindex) { + target_route = r; + break; + } + } + + if (!target_route) { + /* this should not happen... 
but we are prepared */ + printk( KERN_CRIT"%s: missing state for gateway: %u and " \ + "device %d\n", __FUNCTION__, rt->rt_gateway, + rt->idev->dev->ifindex); + goto out; + } + + /* find state entry for destination */ + list_for_each_entry_rcu(d, &target_route->dests, list) { + __u32 targetnetwork = fl->fl4_dst & + (0xFFFFFFFF >> (32 - d->prefixlen)); + + if ((targetnetwork & d->netmask) == d->network) { + weight = d->nh_info->nh_weight; + goto out; + } + } + +out: + rcu_read_unlock(); + return weight; +} + +static void wrandom_init_state(void) +{ + int i; + + for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { + INIT_LIST_HEAD(&state[i].head); + spin_lock_init(&state[i].lock); + } +} + +static void wrandom_select_route(const struct flowi *flp, + struct rtable *first, + struct rtable **rp) +{ + struct rtable *rt; + struct rtable *decision; + struct multipath_candidate *first_mpc = NULL; + struct multipath_candidate *mpc, *last_mpc = NULL; + int power = 0; + int last_power; + int selector; + const size_t size_mpc = sizeof(struct multipath_candidate); + + /* collect all candidates and identify their weights */ + for (rt = rcu_dereference(first); rt; + rt = rcu_dereference(rt->u.rt_next)) { + if ((rt->u.dst.flags & DST_BALANCED) != 0 && + multipath_comparekeys(&rt->fl, flp)) { + struct multipath_candidate* mpc = + (struct multipath_candidate*) + kmalloc(size_mpc, GFP_KERNEL); + + if (!mpc) + return; + + power += __multipath_lookup_weight(flp, rt) * 10000; + + mpc->power = power; + mpc->rt = rt; + mpc->next = NULL; + + if (!first_mpc) + first_mpc = mpc; + else + last_mpc->next = mpc; + + last_mpc = mpc; + } + } + + /* choose a weighted random candidate */ + decision = first; + selector = random(power); + last_power = 0; + + /* select candidate, adjust GC data and cleanup local state */ + decision = first; + last_mpc = NULL; + for (mpc = first_mpc; mpc; mpc = mpc->next) { + mpc->rt->u.dst.lastuse = jiffies; + if (last_power <= selector && selector < mpc->power) + decision = mpc->rt; + + last_power = mpc->power; + if (last_mpc) + kfree(last_mpc); + + last_mpc = mpc; + } + + if (last_mpc) { + /* concurrent __multipath_flush may lead to !last_mpc */ + kfree(last_mpc); + } + + decision->u.dst.__use++; + *rp = decision; +} + +static void wrandom_set_nhinfo(__u32 network, + __u32 netmask, + unsigned char prefixlen, + const struct fib_nh *nh) +{ + const int state_idx = nh->nh_oif % MULTIPATH_STATE_SIZE; + struct multipath_route *r, *target_route = NULL; + struct multipath_dest *d, *target_dest = NULL; + + /* store the weight information for a certain route */ + spin_lock(&state[state_idx].lock); + + /* find state entry for gateway or add one if necessary */ + list_for_each_entry_rcu(r, &state[state_idx].head, list) { + if (r->gw == nh->nh_gw && r->oif == nh->nh_oif) { + target_route = r; + break; + } + } + + if (!target_route) { + const size_t size_rt = sizeof(struct multipath_route); + target_route = (struct multipath_route *) + kmalloc(size_rt, GFP_KERNEL); + + target_route->gw = nh->nh_gw; + target_route->oif = nh->nh_oif; + memset(&target_route->rcu, 0, sizeof(struct rcu_head)); + INIT_LIST_HEAD(&target_route->dests); + + list_add_rcu(&target_route->list, &state[state_idx].head); + } + + /* find state entry for destination or add one if necessary */ + list_for_each_entry_rcu(d, &target_route->dests, list) { + if (d->nh_info == nh) { + target_dest = d; + break; + } + } + + if (!target_dest) { + const size_t size_dst = sizeof(struct multipath_dest); + target_dest = (struct multipath_dest*) + 
kmalloc(size_dst, GFP_KERNEL); + + target_dest->nh_info = nh; + target_dest->network = network; + target_dest->netmask = netmask; + target_dest->prefixlen = prefixlen; + memset(&target_dest->rcu, 0, sizeof(struct rcu_head)); + + list_add_rcu(&target_dest->list, &target_route->dests); + } + /* else: we already stored this info for another destination => + * we are finished + */ + + spin_unlock(&state[state_idx].lock); +} + +static void __multipath_free(struct rcu_head *head) +{ + struct multipath_route *rt = container_of(head, struct multipath_route, + rcu); + kfree(rt); +} + +static void __multipath_free_dst(struct rcu_head *head) +{ + struct multipath_dest *dst = container_of(head, + struct multipath_dest, + rcu); + kfree(dst); +} + +static void wrandom_flush(void) +{ + int i; + + /* defere delete to all entries */ + for (i = 0; i < MULTIPATH_STATE_SIZE; ++i) { + struct multipath_route *r; + + spin_lock(&state[i].lock); + list_for_each_entry_rcu(r, &state[i].head, list) { + struct multipath_dest *d; + list_for_each_entry_rcu(d, &r->dests, list) { + list_del_rcu(&d->list); + call_rcu(&d->rcu, + __multipath_free_dst); + } + list_del_rcu(&r->list); + call_rcu(&r->rcu, + __multipath_free); + } + + spin_unlock(&state[i].lock); + } +} + +static struct ip_mp_alg_ops wrandom_ops = { + .mp_alg_select_route = wrandom_select_route, + .mp_alg_flush = wrandom_flush, + .mp_alg_set_nhinfo = wrandom_set_nhinfo, +}; + +static int __init wrandom_init(void) +{ + wrandom_init_state(); + + return multipath_alg_register(&wrandom_ops, IP_MP_ALG_WRANDOM); +} + +static void __exit wrandom_exit(void) +{ + multipath_alg_unregister(&wrandom_ops, IP_MP_ALG_WRANDOM); +} + +module_init(wrandom_init); +module_exit(wrandom_exit); diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig new file mode 100644 index 000000000000..46d4cb1c06f0 --- /dev/null +++ b/net/ipv4/netfilter/Kconfig @@ -0,0 +1,696 @@ +# +# IP netfilter configuration +# + +menu "IP: Netfilter Configuration" + depends on INET && NETFILTER + +# connection tracking, helpers and protocols +config IP_NF_CONNTRACK + tristate "Connection tracking (required for masq/NAT)" + ---help--- + Connection tracking keeps a record of what packets have passed + through your machine, in order to figure out how they are related + into connections. + + This is required to do Masquerading or other kinds of Network + Address Translation (except for Fast NAT). It can also be used to + enhance packet filtering (see `Connection state match support' + below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_CT_ACCT + bool "Connection tracking flow accounting" + depends on IP_NF_CONNTRACK + help + If this option is enabled, the connection tracking code will + keep per-flow packet and byte counters. + + Those counters can be used for flow-based accounting or the + `connbytes' match. + + If unsure, say `N'. + +config IP_NF_CONNTRACK_MARK + bool 'Connection mark tracking support' + help + This option enables support for connection marks, used by the + `CONNMARK' target and `connmark' match. Similar to the mark value + of packets, but this mark value is kept in the conntrack session + instead of the individual packets. + +config IP_NF_CT_PROTO_SCTP + tristate 'SCTP protocol connection tracking support (EXPERIMENTAL)' + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + With this option enabled, the connection tracking code will + be able to do state tracking on SCTP connections. 
+ + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config IP_NF_FTP + tristate "FTP protocol support" + depends on IP_NF_CONNTRACK + help + Tracking FTP connections is problematic: special helpers are + required for tracking them, and doing masquerading and other forms + of Network Address Translation on them. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_IRC + tristate "IRC protocol support" + depends on IP_NF_CONNTRACK + ---help--- + There is a commonly-used extension to IRC called + Direct Client-to-Client Protocol (DCC). This enables users to send + files to each other, and also chat to each other without the need + of a server. DCC Sending is used anywhere you send files over IRC, + and DCC Chat is most commonly used by Eggdrop bots. If you are + using NAT, this extension will enable you to send files and initiate + chats. Note that you do NOT need this extension to get files or + have others initiate chats, or everything else in IRC. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_TFTP + tristate "TFTP protocol support" + depends on IP_NF_CONNTRACK + help + TFTP connection tracking helper, this is required depending + on how restrictive your ruleset is. + If you are using a tftp client behind -j SNAT or -j MASQUERADING + you will need this. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_AMANDA + tristate "Amanda backup protocol support" + depends on IP_NF_CONNTRACK + help + If you are running the Amanda backup package + on this machine or machines that will be MASQUERADED through this + machine, then you may want to enable this feature. This allows the + connection tracking and natting code to allow the sub-channels that + Amanda requires for communication of the backup data, messages and + index. + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_QUEUE + tristate "Userspace queueing via NETLINK" + help + Netfilter has the ability to queue packets to user space: the + netlink device can be used to access them using this driver. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_IPTABLES + tristate "IP tables support (required for filtering/masq/NAT)" + help + iptables is a general, extensible packet identification framework. + The packet filtering and full NAT (masquerading, port forwarding, + etc) subsystems now use this: say `Y' or `M' here if you want to use + either of those. + + To compile it as a module, choose M here. If unsure, say N. + +# The matches. +config IP_NF_MATCH_LIMIT + tristate "limit match support" + depends on IP_NF_IPTABLES + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_IPRANGE + tristate "IP range match support" + depends on IP_NF_IPTABLES + help + This option makes possible to match IP addresses against IP address + ranges. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MAC + tristate "MAC address match support" + depends on IP_NF_IPTABLES + help + MAC matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. 
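[Editor's note] The `limit' match described above is, at its core, a token-bucket rate limiter: each rule carries a credit counter that refills at the configured average rate and is debited once per matching packet. The following is a minimal userspace sketch of that idea, not the kernel's ipt_limit code; the field names and refill constants are illustrative only.

    /* Token-bucket sketch of the "limit" match idea (illustrative only). */
    #include <stdio.h>

    struct rate_limit {
            double tokens;  /* current credit, in packets */
            double burst;   /* bucket depth, e.g. --limit-burst */
            double rate;    /* refill rate, packets per second */
            double last;    /* time of last refill, in seconds */
    };

    /* Return 1 if a packet seen at time "now" may match, 0 if it exceeds the rate. */
    static int limit_match(struct rate_limit *rl, double now)
    {
            rl->tokens += (now - rl->last) * rl->rate;
            rl->last = now;
            if (rl->tokens > rl->burst)
                    rl->tokens = rl->burst;
            if (rl->tokens >= 1.0) {
                    rl->tokens -= 1.0;
                    return 1;
            }
            return 0;
    }

    int main(void)
    {
            struct rate_limit rl = { .tokens = 5, .burst = 5, .rate = 2, .last = 0 };
            double t;

            for (t = 0.0; t < 5.0; t += 0.25)
                    printf("t=%.2f %s\n", t, limit_match(&rl, t) ? "match" : "over limit");
            return 0;
    }

Used together with the LOG target, this is what keeps a flood of packets from flooding the syslog as well.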
+ +config IP_NF_MATCH_PKTTYPE + tristate "Packet type match support" + depends on IP_NF_IPTABLES + help + Packet type matching allows you to match a packet by + its "class", eg. BROADCAST, MULTICAST, ... + + Typical usage: + iptables -A INPUT -m pkttype --pkt-type broadcast -j LOG + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MARK + tristate "netfilter MARK match support" + depends on IP_NF_IPTABLES + help + Netfilter mark matching allows you to match packets based on the + `nfmark' value in the packet. This can be set by the MARK target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_MULTIPORT + tristate "Multiple port match support" + depends on IP_NF_IPTABLES + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TOS + tristate "TOS match support" + depends on IP_NF_IPTABLES + help + TOS matching allows you to match packets based on the Type Of + Service fields of the IP packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_RECENT + tristate "recent match support" + depends on IP_NF_IPTABLES + help + This match is used for creating one or many lists of recently + used addresses and then matching against that/those list(s). + + Short options are available by using 'iptables -m recent -h' + Official Website: + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_ECN + tristate "ECN match support" + depends on IP_NF_IPTABLES + help + This option adds a `ECN' match, which allows you to match against + the IPv4 and TCP header ECN fields. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_DSCP + tristate "DSCP match support" + depends on IP_NF_IPTABLES + help + This option adds a `DSCP' match, which allows you to match against + the IPv4 header DSCP field (DSCP codepoint). + + The DSCP codepoint can have any value between 0x0 and 0x4f. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_AH_ESP + tristate "AH/ESP match support" + depends on IP_NF_IPTABLES + help + These two match extensions (`ah' and `esp') allow you to match a + range of SPIs inside AH or ESP headers of IPSec packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_LENGTH + tristate "LENGTH match support" + depends on IP_NF_IPTABLES + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TTL + tristate "TTL match support" + depends on IP_NF_IPTABLES + help + This adds CONFIG_IP_NF_MATCH_TTL option, which enabled the user + to match packets by their TTL value. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_TCPMSS + tristate "tcpmss match support" + depends on IP_NF_IPTABLES + help + This option adds a `tcpmss' match, which allows you to examine the + MSS value of TCP SYN packets, which control the maximum packet size + for that connection. + + To compile it as a module, choose M here. If unsure, say N. 
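[Editor's note] The `tcpmss' match above inspects one specific TCP option: kind 2 (Maximum Segment Size), which is only carried on SYN segments. A hedged, standalone sketch of how that option is located in the TCP option area follows; it is not the kernel's ipt_tcpmss parser, just the same walk over the kind/length-encoded option list.

    /* Scan a TCP option block for the MSS option (kind 2, length 4). */
    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Return the advertised MSS, or -1 if the option is absent or malformed. */
    static int find_mss(const uint8_t *opt, size_t optlen)
    {
            size_t i = 0;

            while (i < optlen) {
                    uint8_t kind = opt[i];

                    if (kind == 0)            /* End of option list */
                            break;
                    if (kind == 1) {          /* NOP: single byte */
                            i++;
                            continue;
                    }
                    if (i + 1 >= optlen || opt[i + 1] < 2)
                            return -1;        /* truncated or bogus length */
                    if (kind == 2 && opt[i + 1] == 4 && i + 3 < optlen)
                            return (opt[i + 2] << 8) | opt[i + 3];
                    i += opt[i + 1];
            }
            return -1;
    }

    int main(void)
    {
            /* NOP, NOP, MSS=1460 -- a typical SYN option layout. */
            uint8_t opts[] = { 1, 1, 2, 4, 0x05, 0xb4 };

            printf("MSS = %d\n", find_mss(opts, sizeof(opts)));
            return 0;
    }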
+ +config IP_NF_MATCH_HELPER + tristate "Helper match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + Helper matching allows you to match packets in dynamic connections + tracked by a conntrack-helper, ie. ip_conntrack_ftp + + To compile it as a module, choose M here. If unsure, say Y. + +config IP_NF_MATCH_STATE + tristate "Connection state match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + Connection state matching allows you to match packets based on their + relationship to a tracked connection (ie. previous packets). This + is a powerful tool for packet classification. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_CONNTRACK + tristate "Connection tracking match support" + depends on IP_NF_CONNTRACK && IP_NF_IPTABLES + help + This is a general conntrack match module, a superset of the state match. + + It allows matching on additional conntrack information, which is + useful in complex configurations, such as NAT gateways with multiple + internet links or tunnels. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_OWNER + tristate "Owner match support" + depends on IP_NF_IPTABLES + help + Packet owner matching allows you to match locally-generated packets + based on who created them: the user, group, process or session. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_PHYSDEV + tristate "Physdev match support" + depends on IP_NF_IPTABLES && BRIDGE_NETFILTER + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_MATCH_ADDRTYPE + tristate 'address type match support' + depends on IP_NF_IPTABLES + help + This option allows you to match what routing thinks of an address, + eg. UNICAST, LOCAL, BROADCAST, ... + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config IP_NF_MATCH_REALM + tristate 'realm match support' + depends on IP_NF_IPTABLES + select NET_CLS_ROUTE + help + This option adds a `realm' match, which allows you to use the realm + key from the routing subsystem inside iptables. + + This match pretty much resembles the CONFIG_NET_CLS_ROUTE4 option + in tc world. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config IP_NF_MATCH_SCTP + tristate 'SCTP protocol match support' + depends on IP_NF_IPTABLES + help + With this option enabled, you will be able to use the iptables + `sctp' match in order to match on SCTP source/destination ports + and SCTP chunk types. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config IP_NF_MATCH_COMMENT + tristate 'comment match support' + depends on IP_NF_IPTABLES + help + This option adds a `comment' dummy-match, which allows you to put + comments in your iptables ruleset. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +config IP_NF_MATCH_CONNMARK + tristate 'Connection mark match support' + depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES + help + This option adds a `connmark' match, which allows you to match the + connection mark value previously set for the session by `CONNMARK'. + + If you want to compile it as a module, say M here and read + . The module will be called + ipt_connmark.o. If unsure, say `N'. 
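[Editor's note] The `state' and `conntrack' matches above both work by looking up a packet's tuple in the connection-tracking table and classifying it relative to what is already recorded there. The toy below illustrates only that lookup-and-classify step; real conntrack uses a hash table, timeouts and much richer per-connection state, and every identifier here is made up for illustration.

    /* Toy "connection tracking" lookup: classify packets as NEW or ESTABLISHED. */
    #include <stdio.h>
    #include <stdint.h>

    struct tuple {
            uint32_t saddr, daddr;
            uint16_t sport, dport;
            uint8_t  proto;
    };

    #define MAX_CONNS 16
    static struct tuple table[MAX_CONNS];
    static int nconns;

    static int same_tuple(const struct tuple *a, const struct tuple *b)
    {
            return a->saddr == b->saddr && a->daddr == b->daddr &&
                   a->sport == b->sport && a->dport == b->dport &&
                   a->proto == b->proto;
    }

    /* Return "ESTABLISHED" if the tuple is already tracked, otherwise record
     * it and return "NEW" -- roughly the distinction the state match makes. */
    static const char *classify(const struct tuple *t)
    {
            int i;

            for (i = 0; i < nconns; i++)
                    if (same_tuple(&table[i], t))
                            return "ESTABLISHED";
            if (nconns < MAX_CONNS)
                    table[nconns++] = *t;
            return "NEW";
    }

    int main(void)
    {
            struct tuple t = { 0x0a000001, 0x0a000002, 12345, 80, 6 /* TCP */ };

            printf("first packet:  %s\n", classify(&t));  /* NEW */
            printf("second packet: %s\n", classify(&t));  /* ESTABLISHED */
            return 0;
    }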
+ +config IP_NF_MATCH_HASHLIMIT + tristate 'hashlimit match support' + depends on IP_NF_IPTABLES + help + This option adds a new iptables `hashlimit' match. + + As opposed to `limit', this match dynamically crates a hash table + of limit buckets, based on your selection of source/destination + ip addresses and/or ports. + + It enables you to express policies like `10kpps for any given + destination IP' or `500pps from any given source IP' with a single + IPtables rule. + +# `filter', generic and specific targets +config IP_NF_FILTER + tristate "Packet filtering" + depends on IP_NF_IPTABLES + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REJECT + tristate "REJECT target support" + depends on IP_NF_FILTER + help + The REJECT target allows a filtering rule to specify that an ICMP + error should be issued in response to an incoming packet, rather + than silently being dropped. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_LOG + tristate "LOG target support" + depends on IP_NF_IPTABLES + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table which records the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_ULOG + tristate "ULOG target support" + depends on IP_NF_IPTABLES + ---help--- + This option adds a `ULOG' target, which allows you to create rules in + any iptables table. The packet is passed to a userspace logging + daemon using netlink multicast sockets; unlike the LOG target + which can only be viewed through syslog. + + The apropriate userspace logging daemon (ulogd) may be obtained from + + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_TCPMSS + tristate "TCPMSS target support" + depends on IP_NF_IPTABLES + ---help--- + This option adds a `TCPMSS' target, which allows you to alter the + MSS value of TCP SYN packets, to control the maximum size for that + connection (usually limiting it to your outgoing interface's MTU + minus 40). + + This is used to overcome criminally braindead ISPs or servers which + block ICMP Fragmentation Needed packets. The symptoms of this + problem are that everything works fine from your Linux + firewall/router, but machines behind it can never exchange large + packets: + 1) Web browsers connect, then hang with no data received. + 2) Small mail works fine, but large emails hang. + 3) ssh works fine, but scp hangs after initial handshaking. + + Workaround: activate this option and add a rule to your firewall + configuration like: + + iptables -A FORWARD -p tcp --tcp-flags SYN,RST SYN \ + -j TCPMSS --clamp-mss-to-pmtu + + To compile it as a module, choose M here. If unsure, say N. + +# NAT + specific targets +config IP_NF_NAT + tristate "Full NAT" + depends on IP_NF_IPTABLES && IP_NF_CONNTRACK + help + The Full NAT option allows masquerading, port forwarding and other + forms of full Network Address Port Translation. It is controlled by + the `nat' table in iptables: see the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. 
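[Editor's note] Full NAT, as selected above, ultimately comes down to rewriting addresses or ports in every packet of a tracked connection and fixing the checksums afterwards. The sketch below shows only the address-rewrite step on a bare 20-byte IPv4 header with a full checksum recompute; it is illustrative, not the ip_nat code, which also adjusts TCP/UDP pseudo-header checksums and keeps per-connection NAT state.

    /* Rewrite the IPv4 source address and recompute the header checksum. */
    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    /* Standard 16-bit one's-complement sum over the IP header. */
    static uint16_t ip_checksum(const uint8_t *hdr, size_t len)
    {
            uint32_t sum = 0;
            size_t i;

            for (i = 0; i + 1 < len; i += 2)
                    sum += (hdr[i] << 8) | hdr[i + 1];
            while (sum >> 16)
                    sum = (sum & 0xffff) + (sum >> 16);
            return (uint16_t)~sum;
    }

    static void snat_rewrite(uint8_t *iphdr, uint32_t new_saddr)
    {
            uint16_t csum;

            /* Source address lives at bytes 12..15 of the IPv4 header. */
            iphdr[12] = (uint8_t)(new_saddr >> 24);
            iphdr[13] = (uint8_t)(new_saddr >> 16);
            iphdr[14] = (uint8_t)(new_saddr >> 8);
            iphdr[15] = (uint8_t)new_saddr;

            /* Zero the checksum field (bytes 10..11), then recompute it. */
            iphdr[10] = iphdr[11] = 0;
            csum = ip_checksum(iphdr, 20);
            iphdr[10] = (uint8_t)(csum >> 8);
            iphdr[11] = (uint8_t)(csum & 0xff);
    }

    int main(void)
    {
            uint8_t hdr[20] = { 0x45, 0, 0, 20, 0, 0, 0, 0, 64, 6, 0, 0,
                                10, 0, 0, 1,    /* saddr 10.0.0.1 */
                                10, 0, 0, 2 };  /* daddr 10.0.0.2 */

            snat_rewrite(hdr, 0xc0a80101);      /* rewrite saddr to 192.168.1.1 */
            printf("new saddr: %d.%d.%d.%d  checksum: %02x%02x\n",
                   hdr[12], hdr[13], hdr[14], hdr[15], hdr[10], hdr[11]);
            return 0;
    }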
+ +config IP_NF_NAT_NEEDED + bool + depends on IP_NF_NAT != n + default y + +config IP_NF_TARGET_MASQUERADE + tristate "MASQUERADE target support" + depends on IP_NF_NAT + help + Masquerading is a special case of NAT: all outgoing connections are + changed to seem to come from a particular interface's address, and + if the interface goes down, those connections are lost. This is + only useful for dialup accounts with dynamic IP address (ie. your IP + address will be different on next dialup). + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_REDIRECT + tristate "REDIRECT target support" + depends on IP_NF_NAT + help + REDIRECT is a special case of NAT: all incoming connections are + mapped onto the incoming interface's address, causing the packets to + come to the local machine instead of passing through. This is + useful for transparent proxies. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_NETMAP + tristate "NETMAP target support" + depends on IP_NF_NAT + help + NETMAP is an implementation of static 1:1 NAT mapping of network + addresses. It maps the network address part, while keeping the host + address part intact. It is similar to Fast NAT, except that + Netfilter's connection tracking doesn't work well with Fast NAT. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_SAME + tristate "SAME target support" + depends on IP_NF_NAT + help + This option adds a `SAME' target, which works like the standard SNAT + target, but attempts to give clients the same IP for all connections. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_SNMP_BASIC + tristate "Basic SNMP-ALG support (EXPERIMENTAL)" + depends on EXPERIMENTAL && IP_NF_NAT + ---help--- + + This module implements an Application Layer Gateway (ALG) for + SNMP payloads. In conjunction with NAT, it allows a network + management system to access multiple private networks with + conflicting addresses. It works by modifying IP addresses + inside SNMP payloads to match IP-layer NAT mapping. + + This is the "basic" form of SNMP-ALG, as described in RFC 2962 + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_NAT_IRC + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_IRC=y + default m if IP_NF_IRC=m + +# If they want FTP, set to $CONFIG_IP_NF_NAT (m or y), +# or $CONFIG_IP_NF_FTP (m or y), whichever is weaker. Argh. +config IP_NF_NAT_FTP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_FTP=y + default m if IP_NF_FTP=m + +config IP_NF_NAT_TFTP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_TFTP=y + default m if IP_NF_TFTP=m + +config IP_NF_NAT_AMANDA + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_AMANDA=y + default m if IP_NF_AMANDA=m + +# mangle + specific targets +config IP_NF_MANGLE + tristate "Packet mangling" + depends on IP_NF_IPTABLES + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can effect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. 
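[Editor's note] The NETMAP target described a little above performs a static 1:1 mapping: it swaps only the network part of an address and keeps the host bits intact, i.e. new = (map_net & mask) | (old & ~mask). A short sketch of that mapping with made-up prefixes and addresses:

    /* NETMAP-style static 1:1 mapping: replace the network bits, keep the host bits. */
    #include <stdio.h>
    #include <stdint.h>

    static uint32_t netmap(uint32_t addr, uint32_t map_net, uint32_t mask)
    {
            return (map_net & mask) | (addr & ~mask);
    }

    static void print_ip(uint32_t a)
    {
            printf("%u.%u.%u.%u", (unsigned)(a >> 24), (unsigned)((a >> 16) & 0xff),
                   (unsigned)((a >> 8) & 0xff), (unsigned)(a & 0xff));
    }

    int main(void)
    {
            uint32_t mask = 0xffffff00;          /* /24 */
            uint32_t host = 0x0a000045;          /* 10.0.0.69 */
            uint32_t mapped = netmap(host, 0xc0a80500 /* 192.168.5.0/24 */, mask);

            print_ip(host);   printf(" -> ");
            print_ip(mapped); printf("\n");      /* 192.168.5.69 */
            return 0;
    }

Because only the network bits change, the mapping is trivially reversible, which is what makes it usable in both the PREROUTING and POSTROUTING direction.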
+
+config IP_NF_TARGET_TOS
+	tristate "TOS target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `TOS' target, which allows you to create rules in
+	  the `mangle' table which alter the Type Of Service field of an IP
+	  packet prior to routing.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_ECN
+	tristate "ECN target support"
+	depends on IP_NF_MANGLE
+	---help---
+	  This option adds an `ECN' target, which can be used in the iptables
+	  mangle table.
+
+	  You can use this target to remove the ECN bits from the IPv4 header of
+	  an IP packet. This is particularly useful if you need to work around
+	  existing ECN blackholes on the internet, but don't want to disable
+	  ECN support in general.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_DSCP
+	tristate "DSCP target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `DSCP' target, which allows you to alter the
+	  value of the IPv4 header DSCP field (DSCP codepoint).
+
+	  The DSCP codepoint can have any value between 0x0 and 0x3f.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_MARK
+	tristate "MARK target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `MARK' target, which allows you to create rules
+	  in the `mangle' table which alter the netfilter mark (nfmark) field
+	  associated with the packet prior to routing. This can change
+	  the routing method (see `Use netfilter MARK value as routing
+	  key') and can also be used by other subsystems to change their
+	  behavior.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_CLASSIFY
+	tristate "CLASSIFY target support"
+	depends on IP_NF_MANGLE
+	help
+	  This option adds a `CLASSIFY' target, which enables the user to set
+	  the priority of a packet. Some qdiscs can use this value for
+	  classification, among these are:
+
+	  atm, cbq, dsmark, pfifo_fast, htb, prio
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_TARGET_CONNMARK
+	tristate 'CONNMARK target support'
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_MANGLE
+	help
+	  This option adds a `CONNMARK' target, which allows one to manipulate
+	  the connection mark value. Similar to the MARK target, but
+	  affects the connection mark value rather than the packet mark value.
+
+	  If you want to compile it as a module, say M here. The module will
+	  be called ipt_CONNMARK.o. If unsure, say `N'.
+
+config IP_NF_TARGET_CLUSTERIP
+	tristate "CLUSTERIP target support (EXPERIMENTAL)"
+	depends on IP_NF_CONNTRACK_MARK && IP_NF_IPTABLES && EXPERIMENTAL
+	help
+	  The CLUSTERIP target allows you to build load-balancing clusters of
+	  network servers without having a dedicated load-balancing
+	  router/server/switch.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+# raw + specific targets
+config IP_NF_RAW
+	tristate 'raw table support (required for NOTRACK/TRACE)'
+	depends on IP_NF_IPTABLES
+	help
+	  This option adds a `raw' table to iptables. This table is the very
+	  first in the netfilter framework and hooks in at the PREROUTING
+	  and OUTPUT chains.
+
+	  If you want to compile it as a module, say M here. If unsure,
+	  say `N'.
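
[Editor's aside on the TOS, DSCP and ECN targets above: they all rewrite parts of the same IPv4 byte. DSCP occupies the upper six bits of the former TOS octet and ECN the lower two, which is also why a DSCP codepoint ranges from 0x0 to 0x3f. The following is a hypothetical userspace sketch of that bit manipulation only, not the kernel's ipt_DSCP/ipt_ECN code; the sample values are arbitrary.]

    #include <stdio.h>
    #include <stdint.h>

    /* The IPv4 TOS octet is split into a 6-bit DSCP field (bits 7..2) and a
     * 2-bit ECN field (bits 1..0).  Setting the DSCP rewrites the upper six
     * bits while preserving ECN; clearing ECN zeroes the lower two bits
     * while preserving the DSCP codepoint. */
    static uint8_t set_dscp(uint8_t tos, uint8_t dscp)
    {
            return (uint8_t)(((dscp & 0x3f) << 2) | (tos & 0x03));
    }

    static uint8_t clear_ecn(uint8_t tos)
    {
            return (uint8_t)(tos & ~0x03);
    }

    int main(void)
    {
            uint8_t tos = 0x2b;                              /* DSCP 0x0a, ECN 0x3 */

            printf("set DSCP 0x2e: 0x%02x\n", set_dscp(tos, 0x2e)); /* 0xbb */
            printf("clear ECN:     0x%02x\n", clear_ecn(tos));      /* 0x28 */
            return 0;
    }
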
+
+config IP_NF_TARGET_NOTRACK
+	tristate 'NOTRACK target support'
+	depends on IP_NF_RAW
+	depends on IP_NF_CONNTRACK
+	help
+	  The NOTRACK target allows a rule to specify which packets should
+	  *not* enter the conntrack/NAT subsystem, with all the consequences
+	  (no ICMP error tracking, no protocol helpers for the selected
+	  packets).
+
+	  If you want to compile it as a module, say M here. If unsure,
+	  say `N'.
+
+
+# ARP tables
+config IP_NF_ARPTABLES
+	tristate "ARP tables support"
+	help
+	  arptables is a general, extensible packet identification framework.
+	  The ARP packet filtering and mangling (manipulation) subsystems
+	  use this: say Y or M here if you want to use either of those.
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_ARPFILTER
+	tristate "ARP packet filtering"
+	depends on IP_NF_ARPTABLES
+	help
+	  ARP packet filtering defines a table `filter', which has a series of
+	  rules for simple ARP packet filtering at local input and
+	  local output. On a bridge, you can also specify filtering rules
+	  for forwarded ARP packets. See the man page for arptables(8).
+
+	  To compile it as a module, choose M here. If unsure, say N.
+
+config IP_NF_ARP_MANGLE
+	tristate "ARP payload mangling"
+	depends on IP_NF_ARPTABLES
+	help
+	  Allows altering the ARP packet payload: source and destination
+	  hardware and network addresses.
+
+endmenu
+
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
new file mode 100644
index 000000000000..45796d5924dd
--- /dev/null
+++ b/net/ipv4/netfilter/Makefile
@@ -0,0 +1,89 @@
+#
+# Makefile for the netfilter modules on top of IPv4.
+#
+
+# objects for the standalone - connection tracking / NAT
+ip_conntrack-objs := ip_conntrack_standalone.o ip_conntrack_core.o ip_conntrack_proto_generic.o ip_conntrack_proto_tcp.o ip_conntrack_proto_udp.o ip_conntrack_proto_icmp.o
+iptable_nat-objs := ip_nat_standalone.o ip_nat_rule.o ip_nat_core.o ip_nat_helper.o ip_nat_proto_unknown.o ip_nat_proto_tcp.o ip_nat_proto_udp.o ip_nat_proto_icmp.o
+
+# connection tracking
+obj-$(CONFIG_IP_NF_CONNTRACK) += ip_conntrack.o
+
+# SCTP protocol connection tracking
+obj-$(CONFIG_IP_NF_CT_PROTO_SCTP) += ip_conntrack_proto_sctp.o
+
+# connection tracking helpers
+obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o
+obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o
+obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o
+obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o
+
+# NAT helpers
+obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o
+obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o
+obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o
+obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o
+
+# generic IP tables
+obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+
+# the instances of ip_tables
+obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
+obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o
+obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o
+obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o
+
+# matches
+obj-$(CONFIG_IP_NF_MATCH_HELPER) += ipt_helper.o
+obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_limit.o
+obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o
+obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o
+obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o
+obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o
+obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o
+obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o
+obj-$(CONFIG_IP_NF_MATCH_MULTIPORT) += ipt_multiport.o
+obj-$(CONFIG_IP_NF_MATCH_OWNER) += ipt_owner.o
+obj-$(CONFIG_IP_NF_MATCH_TOS) += ipt_tos.o
+obj-$(CONFIG_IP_NF_MATCH_RECENT) += ipt_recent.o
+obj-$(CONFIG_IP_NF_MATCH_ECN) += ipt_ecn.o +obj-$(CONFIG_IP_NF_MATCH_DSCP) += ipt_dscp.o +obj-$(CONFIG_IP_NF_MATCH_AH_ESP) += ipt_ah.o ipt_esp.o +obj-$(CONFIG_IP_NF_MATCH_LENGTH) += ipt_length.o +obj-$(CONFIG_IP_NF_MATCH_TTL) += ipt_ttl.o +obj-$(CONFIG_IP_NF_MATCH_STATE) += ipt_state.o +obj-$(CONFIG_IP_NF_MATCH_CONNMARK) += ipt_connmark.o +obj-$(CONFIG_IP_NF_MATCH_CONNTRACK) += ipt_conntrack.o +obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_tcpmss.o +obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o +obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o +obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o +obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o + +# targets +obj-$(CONFIG_IP_NF_TARGET_REJECT) += ipt_REJECT.o +obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TOS.o +obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o +obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o +obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o +obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o +obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o +obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o +obj-$(CONFIG_IP_NF_TARGET_SAME) += ipt_SAME.o +obj-$(CONFIG_IP_NF_TARGET_CLASSIFY) += ipt_CLASSIFY.o +obj-$(CONFIG_IP_NF_NAT_SNMP_BASIC) += ip_nat_snmp_basic.o +obj-$(CONFIG_IP_NF_TARGET_LOG) += ipt_LOG.o +obj-$(CONFIG_IP_NF_TARGET_CONNMARK) += ipt_CONNMARK.o +obj-$(CONFIG_IP_NF_TARGET_ULOG) += ipt_ULOG.o +obj-$(CONFIG_IP_NF_TARGET_TCPMSS) += ipt_TCPMSS.o +obj-$(CONFIG_IP_NF_TARGET_NOTRACK) += ipt_NOTRACK.o +obj-$(CONFIG_IP_NF_TARGET_CLUSTERIP) += ipt_CLUSTERIP.o + +# generic ARP tables +obj-$(CONFIG_IP_NF_ARPTABLES) += arp_tables.o +obj-$(CONFIG_IP_NF_ARP_MANGLE) += arpt_mangle.o + +# just filtering instance of ARP tables for now +obj-$(CONFIG_IP_NF_ARPFILTER) += arptable_filter.o + +obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c new file mode 100644 index 000000000000..df79f5ed6a0a --- /dev/null +++ b/net/ipv4/netfilter/arp_tables.c @@ -0,0 +1,1333 @@ +/* + * Packet matching code for ARP packets. + * + * Based heavily, if not almost entirely, upon ip_tables.c framework. + * + * Some ARP specific bits are: + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. Miller "); +MODULE_DESCRIPTION("arptables core"); + +/*#define DEBUG_ARP_TABLES*/ +/*#define DEBUG_ARP_TABLES_USER*/ + +#ifdef DEBUG_ARP_TABLES +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_ARP_TABLES_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define ARP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("ARP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define ARP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(arpt_mutex); + +#define ASSERT_READ_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) ARP_NF_ASSERT(down_trylock(&arpt_mutex) != 0) +#include +#include + +struct arpt_table_info { + unsigned int size; + unsigned int number; + unsigned int initial_entries; + unsigned int hook_entry[NF_ARP_NUMHOOKS]; + unsigned int underflow[NF_ARP_NUMHOOKS]; + char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); +}; + +static LIST_HEAD(arpt_target); +static LIST_HEAD(arpt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, + char *hdr_addr, int len) +{ + int i, ret; + + if (len > ARPT_DEV_ADDR_LEN_MAX) + len = ARPT_DEV_ADDR_LEN_MAX; + + ret = 0; + for (i = 0; i < len; i++) + ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; + + return (ret != 0); +} + +/* Returns whether packet matches rule or not. */ +static inline int arp_packet_match(const struct arphdr *arphdr, + struct net_device *dev, + const char *indev, + const char *outdev, + const struct arpt_arp *arpinfo) +{ + char *arpptr = (char *)(arphdr + 1); + char *src_devaddr, *tgt_devaddr; + u32 src_ipaddr, tgt_ipaddr; + int i, ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(arpinfo->invflags & invflg)) + + if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop, + ARPT_INV_ARPOP)) { + dprintf("ARP operation field mismatch.\n"); + dprintf("ar_op: %04x info->arpop: %04x info->arpop_mask: %04x\n", + arphdr->ar_op, arpinfo->arpop, arpinfo->arpop_mask); + return 0; + } + + if (FWINV((arphdr->ar_hrd & arpinfo->arhrd_mask) != arpinfo->arhrd, + ARPT_INV_ARPHRD)) { + dprintf("ARP hardware address format mismatch.\n"); + dprintf("ar_hrd: %04x info->arhrd: %04x info->arhrd_mask: %04x\n", + arphdr->ar_hrd, arpinfo->arhrd, arpinfo->arhrd_mask); + return 0; + } + + if (FWINV((arphdr->ar_pro & arpinfo->arpro_mask) != arpinfo->arpro, + ARPT_INV_ARPPRO)) { + dprintf("ARP protocol address format mismatch.\n"); + dprintf("ar_pro: %04x info->arpro: %04x info->arpro_mask: %04x\n", + arphdr->ar_pro, arpinfo->arpro, arpinfo->arpro_mask); + return 0; + } + + if (FWINV((arphdr->ar_hln & arpinfo->arhln_mask) != arpinfo->arhln, + ARPT_INV_ARPHLN)) { + dprintf("ARP hardware address length mismatch.\n"); + dprintf("ar_hln: %02x info->arhln: %02x info->arhln_mask: %02x\n", + arphdr->ar_hln, arpinfo->arhln, arpinfo->arhln_mask); + return 0; + } + + src_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&src_ipaddr, arpptr, sizeof(u32)); + arpptr += sizeof(u32); + tgt_devaddr = arpptr; + arpptr += dev->addr_len; + memcpy(&tgt_ipaddr, arpptr, sizeof(u32)); + + if (FWINV(arp_devaddr_compare(&arpinfo->src_devaddr, src_devaddr, dev->addr_len), + ARPT_INV_SRCDEVADDR) || + FWINV(arp_devaddr_compare(&arpinfo->tgt_devaddr, tgt_devaddr, dev->addr_len), + ARPT_INV_TGTDEVADDR)) { + dprintf("Source or target device address mismatch.\n"); + + return 0; + } + + if (FWINV((src_ipaddr & arpinfo->smsk.s_addr) != arpinfo->src.s_addr, + ARPT_INV_SRCIP) || + FWINV(((tgt_ipaddr & arpinfo->tmsk.s_addr) != 
arpinfo->tgt.s_addr), + ARPT_INV_TGTIP)) { + dprintf("Source or target IP address mismatch.\n"); + + dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", + NIPQUAD(src_ipaddr), + NIPQUAD(arpinfo->smsk.s_addr), + NIPQUAD(arpinfo->src.s_addr), + arpinfo->invflags & ARPT_INV_SRCIP ? " (INV)" : ""); + dprintf("TGT: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", + NIPQUAD(tgt_ipaddr), + NIPQUAD(arpinfo->tmsk.s_addr), + NIPQUAD(arpinfo->tgt.s_addr), + arpinfo->invflags & ARPT_INV_TGTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches. */ + for (i = 0, ret = 0; i < IFNAMSIZ; i++) { + ret |= (indev[i] ^ arpinfo->iniface[i]) + & arpinfo->iniface_mask[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, arpinfo->iniface, + arpinfo->invflags&ARPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + unsigned long odev; + memcpy(&odev, outdev + i*sizeof(unsigned long), + sizeof(unsigned long)); + ret |= (odev + ^ ((const unsigned long *)arpinfo->outiface)[i]) + & ((const unsigned long *)arpinfo->outiface_mask)[i]; + } + + if (FWINV(ret != 0, ARPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, arpinfo->outiface, + arpinfo->invflags&ARPT_INV_VIA_OUT ?" (INV)":""); + return 0; + } + + return 1; +} + +static inline int arp_checkentry(const struct arpt_arp *arp) +{ + if (arp->flags & ~ARPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + arp->flags & ~ARPT_F_MASK); + return 0; + } + if (arp->invflags & ~ARPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + arp->invflags & ~ARPT_INV_MASK); + return 0; + } + + return 1; +} + +static unsigned int arpt_error(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("arp_tables: error: '%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline struct arpt_entry *get_entry(void *base, unsigned int offset) +{ + return (struct arpt_entry *)(base + offset); +} + +unsigned int arpt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct arpt_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ]; + unsigned int verdict = NF_DROP; + struct arphdr *arp; + int hotdrop = 0; + struct arpt_entry *e, *back; + const char *indev, *outdev; + void *table_base; + + /* ARP header, plus 2 device addresses, plus 2 IP addresses. */ + if (!pskb_may_pull((*pskb), (sizeof(struct arphdr) + + (2 * (*pskb)->dev->addr_len) + + (2 * sizeof(u32))))) + return NF_DROP; + + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + read_lock_bh(&table->lock); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, + smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + back = get_entry(table_base, table->private->underflow[hook]); + + arp = (*pskb)->nh.arph; + do { + if (arp_packet_match(arp, (*pskb)->dev, indev, outdev, &e->arp)) { + struct arpt_entry_target *t; + int hdr_len; + + hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) + + (2 * (*pskb)->dev->addr_len); + ADD_COUNTER(e->counters, hdr_len, 1); + + t = arpt_get_target(e); + + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct arpt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? 
*/ + if (v != ARPT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct arpt_entry *next + = (void *)e + e->next_offset; + next->comefrom = + (void *)back - table_base; + + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + * abs. verdicts + */ + verdict = t->u.kernel.target->target(pskb, + hook, + in, out, + t->data, + userdata); + + /* Target might have changed stuff. */ + arp = (*pskb)->nh.arph; + + if (verdict == ARPT_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + read_unlock_bh(&table->lock); + + if (hotdrop) + return NF_DROP; + else + return verdict; +} + +static inline void *find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + duprintf("find_inlist: loading `%s%s'.\n", prefix, name); + request_module("%s%s", prefix, name); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct arpt_table *arpt_find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_tables, name, "arptable_", error, mutex); +} + +static struct arpt_target *arpt_find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&arpt_target, name, "arpt_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int unconditional(const struct arpt_arp *arp) +{ + unsigned int i; + + for (i = 0; i < sizeof(*arp)/sizeof(__u32); i++) + if (((__u32 *)arp)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + * there are loops. Puts hook bitmask in comefrom. + */ +static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + * to 0 as we leave), and comefrom to save source hook bitmask. + */ + for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct arpt_entry *e + = (struct arpt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. */ + e->counters.pcnt = pos; + + for (;;) { + struct arpt_standard_target *t + = (void *)arpt_get_target(e); + + if (e->comefrom & (1 << NF_ARP_NUMHOOKS)) { + printk("arptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_ARP_NUMHOOKS)); + + /* Unconditional return/END. 
*/ + if (e->target_offset == sizeof(struct arpt_entry) + && (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->arp)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + * big jump. + */ + do { + e->comefrom ^= (1<counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct arpt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct arpt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + ARPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct arpt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int standard_check(const struct arpt_entry_target *t, + unsigned int max_offset) +{ + struct arpt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != ARPT_ALIGN(sizeof(struct arpt_standard_target))) { + duprintf("arpt_standard_check: target size %u != %Zu\n", + t->u.target_size, + ARPT_ALIGN(sizeof(struct arpt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct arpt_entry)) { + duprintf("arpt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("arpt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static struct arpt_target arpt_standard_target; + +static inline int check_entry(struct arpt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct arpt_entry_target *t; + struct arpt_target *target; + int ret; + + if (!arp_checkentry(&e->arp)) { + duprintf("arp_tables: arp check failed %p %s.\n", e, name); + return -EINVAL; + } + + t = arpt_get_target(e); + target = arpt_find_target_lock(t->u.user.name, &ret, &arpt_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + goto out; + } + if (!try_module_get((target->me))) { + ret = -ENOENT; + goto out_unlock; + } + t->u.kernel.target = target; + up(&arpt_mutex); + + if (t->u.kernel.target == &arpt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto out; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("arp_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto out; + } + + (*i)++; + return 0; + +out_unlock: + up(&arpt_mutex); +out: + return ret; +} + +static inline int check_entry_size_and_hooks(struct arpt_entry *e, + struct arpt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 + || (unsigned char *)e + sizeof(struct arpt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct arpt_entry) + sizeof(struct 
arpt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_ARP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not ARPT_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct arpt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) +{ + struct arpt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + t = arpt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + * newinfo). + */ +static int translate_table(const char *name, + unsigned int valid_hooks, + struct arpt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + + /* Walk through entries, checking offsets. */ + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_ARP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) { + duprintf("Looping hook\n"); + return -ELOOP; + } + + /* Finally, each sanity check must pass */ + i = 0; + ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct arpt_table_info *replace_table(struct arpt_table *table, + unsigned int num_counters, + struct arpt_table_info *newinfo, + int *error) +{ + struct arpt_table_info *oldinfo; + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? 
*/ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int add_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void get_counters(const struct arpt_table_info *t, + struct arpt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int copy_entries_to_user(unsigned int total_size, + struct arpt_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct arpt_entry *e; + struct arpt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + * (other than comefrom, which userspace doesn't care + * about). + */ + countersize = sizeof(struct arpt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + struct arpt_entry_target *t; + + e = (struct arpt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct arpt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + t = arpt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct arpt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int get_entries(const struct arpt_get_entries *entries, + struct arpt_get_entries __user *uptr) +{ + int ret; + struct arpt_table *t; + + t = arpt_find_table_lock(entries->name, &ret, &arpt_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&arpt_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int do_replace(void __user *user, unsigned int len) +{ + int ret; + struct arpt_replace tmp; + struct arpt_table *t; + struct arpt_table_info *newinfo, *oldinfo; + struct arpt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct arpt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("arp_tables: Translated table\n"); + + t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! 
*/ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + /* Get a reference in advance, we're not allowed fail later */ + if (!try_module_get(t->me)) { + ret = -EBUSY; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + if (copy_to_user(tmp.counters, counters, + sizeof(struct arpt_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&arpt_mutex); + return ret; + + put_module: + module_put(t->me); + free_newinfo_counters_untrans_unlock: + up(&arpt_mutex); + free_newinfo_counters_untrans: + ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. + */ +static inline int add_counter_to_entry(struct arpt_entry *e, + const struct arpt_counters addme[], + unsigned int *i) +{ + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct arpt_counters_info tmp, *paddc; + struct arpt_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct arpt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = arpt_find_table_lock(tmp.name, &ret, &arpt_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + ARPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&arpt_mutex); + free: + vfree(paddc); + + return ret; +} + +static int do_arpt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case ARPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_arpt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case ARPT_SO_GET_INFO: { + char name[ARPT_TABLE_MAXNAMELEN]; + struct arpt_table *t; + + if (*len != sizeof(struct arpt_getinfo)) { + duprintf("length 
%u != %Zu\n", *len, + sizeof(struct arpt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; + t = arpt_find_table_lock(name, &ret, &arpt_mutex); + if (t) { + struct arpt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + strcpy(info.name, name); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&arpt_mutex); + } + } + break; + + case ARPT_SO_GET_ENTRIES: { + struct arpt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %Zu\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct arpt_get_entries) + get.size) { + duprintf("get_entries: %u != %Zu\n", *len, + sizeof(struct arpt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_arpt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int arpt_register_target(struct arpt_target *target) +{ + int ret; + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&arpt_target, target)) { + duprintf("arpt_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + } + up(&arpt_mutex); + return ret; +} + +void arpt_unregister_target(struct arpt_target *target) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_target, target); + up(&arpt_mutex); +} + +int arpt_register_table(struct arpt_table *table, + const struct arpt_replace *repl) +{ + int ret; + struct arpt_table_info *newinfo; + static struct arpt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct arpt_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) { + ret = -ENOMEM; + return ret; + } + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + duprintf("arpt_register_table: translate table gives %d\n", ret); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&arpt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&arpt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. 
*/ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&arpt_tables, table); + + unlock: + up(&arpt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void arpt_unregister_table(struct arpt_table *table) +{ + down(&arpt_mutex); + LIST_DELETE(&arpt_tables, table); + up(&arpt_mutex); + + /* Decrease module usage counts and free resources */ + ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct arpt_target arpt_standard_target = { + .name = ARPT_STANDARD_TARGET, +}; + +static struct arpt_target arpt_error_target = { + .name = ARPT_ERROR_TARGET, + .target = arpt_error, +}; + +static struct nf_sockopt_ops arpt_sockopts = { + .pf = PF_INET, + .set_optmin = ARPT_BASE_CTL, + .set_optmax = ARPT_SO_SET_MAX+1, + .set = do_arpt_set_ctl, + .get_optmin = ARPT_BASE_CTL, + .get_optmax = ARPT_SO_GET_MAX+1, + .get = do_arpt_get_ctl, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const struct arpt_table *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", t->name); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static int arpt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&arpt_mutex) != 0) + return 0; + + LIST_FIND(&arpt_tables, print_name, struct arpt_table *, + offset, buffer, length, &pos, &count); + + up(&arpt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&arpt_mutex); + list_append(&arpt_target, &arpt_standard_target); + list_append(&arpt_target, &arpt_error_target); + up(&arpt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&arpt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + + proc = proc_net_create("arp_tables_names", 0, arpt_get_tables); + if (!proc) { + nf_unregister_sockopt(&arpt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } +#endif + + printk("arp_tables: (C) 2002 David S. 
Miller\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&arpt_sockopts); +#ifdef CONFIG_PROC_FS + proc_net_remove("arp_tables_names"); +#endif +} + +EXPORT_SYMBOL(arpt_register_table); +EXPORT_SYMBOL(arpt_unregister_table); +EXPORT_SYMBOL(arpt_do_table); +EXPORT_SYMBOL(arpt_register_target); +EXPORT_SYMBOL(arpt_unregister_target); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c new file mode 100644 index 000000000000..3e592ec86482 --- /dev/null +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -0,0 +1,104 @@ +/* module that allows mangling of the arp payload */ +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("arptables arp payload mangle target"); + +static unsigned int +target(struct sk_buff **pskb, unsigned int hooknum, const struct net_device *in, + const struct net_device *out, const void *targinfo, void *userinfo) +{ + const struct arpt_mangle *mangle = targinfo; + struct arphdr *arp; + unsigned char *arpptr; + int pln, hln; + + if (skb_shared(*pskb) || skb_cloned(*pskb)) { + struct sk_buff *nskb; + + nskb = skb_copy(*pskb, GFP_ATOMIC); + if (!nskb) + return NF_DROP; + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); + kfree_skb(*pskb); + *pskb = nskb; + } + + arp = (*pskb)->nh.arph; + arpptr = (*pskb)->nh.raw + sizeof(*arp); + pln = arp->ar_pln; + hln = arp->ar_hln; + /* We assume that pln and hln were checked in the match */ + if (mangle->flags & ARPT_MANGLE_SDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, mangle->src_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_SIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, &mangle->u_s.src_ip, pln); + } + arpptr += pln; + if (mangle->flags & ARPT_MANGLE_TDEV) { + if (ARPT_DEV_ADDR_LEN_MAX < hln || + (arpptr + hln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, mangle->tgt_devaddr, hln); + } + arpptr += hln; + if (mangle->flags & ARPT_MANGLE_TIP) { + if (ARPT_MANGLE_ADDR_LEN_MAX < pln || + (arpptr + pln > (**pskb).tail)) + return NF_DROP; + memcpy(arpptr, &mangle->u_t.tgt_ip, pln); + } + return mangle->target; +} + +static int +checkentry(const char *tablename, const struct arpt_entry *e, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + const struct arpt_mangle *mangle = targinfo; + + if (mangle->flags & ~ARPT_MANGLE_MASK || + !(mangle->flags & ARPT_MANGLE_MASK)) + return 0; + + if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && + mangle->target != ARPT_CONTINUE) + return 0; + return 1; +} + +static struct arpt_target arpt_mangle_reg += { + .name = "mangle", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (arpt_register_target(&arpt_mangle_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + arpt_unregister_target(&arpt_mangle_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c new file mode 100644 index 000000000000..0d759f5a4ef0 --- /dev/null +++ b/net/ipv4/netfilter/arptable_filter.c @@ -0,0 +1,214 @@ +/* + * Filtering ARP tables module. + * + * Copyright (C) 2002 David S. Miller (davem@redhat.com) + * + */ + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("David S. 
Miller "); +MODULE_DESCRIPTION("arptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \ + (1 << NF_ARP_FORWARD)) + +/* Standard entry. */ +struct arpt_standard +{ + struct arpt_entry entry; + struct arpt_standard_target target; +}; + +struct arpt_error_target +{ + struct arpt_entry_target target; + char errorname[ARPT_FUNCTION_MAXNAMELEN]; +}; + +struct arpt_error +{ + struct arpt_entry entry; + struct arpt_error_target target; +}; + +static struct +{ + struct arpt_replace repl; + struct arpt_standard entries[3]; + struct arpt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct arpt_standard) * 3 + sizeof(struct arpt_error), + { [NF_ARP_IN] = 0, + [NF_ARP_OUT] = sizeof(struct arpt_standard), + [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), }, + { [NF_ARP_IN] = 0, + [NF_ARP_OUT] = sizeof(struct arpt_standard), + [NF_ARP_FORWARD] = 2 * sizeof(struct arpt_standard), }, + 0, NULL, { } }, + { + /* ARP_IN */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_OUT */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + }, + /* ARP_FORWARD */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_standard), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } + } + }, + /* ERROR */ + { + { + { + { 0 }, { 0 }, { 0 }, { 0 }, + 0, 0, + { { 0, }, { 0, } }, + { { 0, }, { 0, } }, + 0, 0, + 0, 0, + 0, 0, + "", "", { 0 }, { 0 }, + 0, 0 + }, + sizeof(struct arpt_entry), + sizeof(struct arpt_error), + 0, + { 0, 0 }, { } }, + { { { { ARPT_ALIGN(sizeof(struct arpt_error_target)), ARPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct arpt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .private = NULL, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c */ +static unsigned int arpt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return arpt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops arpt_ops[] = { + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_IN, + }, + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + }, + { + .hook = arpt_hook, + .owner = THIS_MODULE, + .pf = NF_ARP, + .hooknum = NF_ARP_FORWARD, + }, +}; + +static int __init init(void) +{ + int ret, i; + + /* Register table */ + ret = arpt_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + for (i = 0; i < ARRAY_SIZE(arpt_ops); i++) + if ((ret = nf_register_hook(&arpt_ops[i])) < 0) + goto cleanup_hooks; + return ret; + +cleanup_hooks: + while 
(--i >= 0) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < ARRAY_SIZE(arpt_ops); i++) + nf_unregister_hook(&arpt_ops[i]); + + arpt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c new file mode 100644 index 000000000000..3dbddd062605 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -0,0 +1,167 @@ +/* Amanda extension for IP connection tracking, Version 0.2 + * (C) 2002 by Brian J. Murrell + * based on HW's ip_conntrack_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Module load syntax: + * insmod ip_conntrack_amanda.o [master_timeout=n] + * + * Where master_timeout is the timeout (in seconds) of the master + * connection (port 10080). This defaults to 5 minutes but if + * your clients take longer than 5 minutes to do their work + * before getting back to the Amanda server, you can increase + * this value. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static unsigned int master_timeout = 300; + +MODULE_AUTHOR("Brian J. Murrell "); +MODULE_DESCRIPTION("Amanda connection tracking module"); +MODULE_LICENSE("GPL"); +module_param(master_timeout, int, 0600); +MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); + +static char *conns[] = { "DATA ", "MESG ", "INDEX " }; + +/* This is slow, but it's simple. --RR */ +static char amanda_buffer[65536]; +static DECLARE_LOCK(amanda_buffer_lock); + +unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + struct ip_conntrack_expect *exp; + char *data, *data_limit, *tmp; + unsigned int dataoff, i; + u_int16_t port, len; + int ret = NF_ACCEPT; + + /* Only look at packets from the Amanda server */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* increase the UDP timeout of the master connection as replies from + * Amanda clients to the server can be quite delayed */ + ip_ct_refresh_acct(ct, ctinfo, NULL, master_timeout * HZ); + + /* No data? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + if (net_ratelimit()) + printk("amanda_help: skblen = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + LOCK_BH(&amanda_buffer_lock); + skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); + data = amanda_buffer; + data_limit = amanda_buffer + (*pskb)->len - dataoff; + *data_limit = '\0'; + + /* Search for the CONNECT string */ + data = strstr(data, "CONNECT "); + if (!data) + goto out; + data += strlen("CONNECT "); + + /* Only search first line. 
*/ + if ((tmp = strchr(data, '\n'))) + *tmp = '\0'; + + for (i = 0; i < ARRAY_SIZE(conns); i++) { + char *match = strstr(data, conns[i]); + if (!match) + continue; + tmp = data = match + strlen(conns[i]); + port = simple_strtoul(data, &data, 10); + len = data - tmp; + if (port == 0 || len > 5) + break; + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + exp->expectfn = NULL; + exp->master = ct; + + exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip; + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->tuple.dst.u.tcp.port = htons(port); + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.protonum = 0xFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + + if (ip_nat_amanda_hook) + ret = ip_nat_amanda_hook(pskb, ctinfo, + tmp - amanda_buffer, + len, exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + } + +out: + UNLOCK_BH(&amanda_buffer_lock); + return ret; +} + +static struct ip_conntrack_helper amanda_helper = { + .max_expected = ARRAY_SIZE(conns), + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "amanda", + + .tuple = { .src = { .u = { __constant_htons(10080) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&amanda_helper); +} + +static int __init init(void) +{ + return ip_conntrack_helper_register(&amanda_helper); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c new file mode 100644 index 000000000000..28d9425d5c39 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -0,0 +1,1247 @@ +/* Connection state tracking for netfilter. This is separated from, + but required by, the NAT layer; it can also be used by an iptables + extension. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 23 Apr 2001: Harald Welte + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * 16 Jul 2002: Harald Welte + * - add usage/reference counts to ip_conntrack_expect + * - export ip_conntrack[_expect]_{find_get,put} functions + * */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This rwlock protects the main hash table, protocol/helper/expected + registrations, conntrack timers*/ +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include +#include +#include +#include +#include + +#define IP_CONNTRACK_VERSION "2.1" + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +DECLARE_RWLOCK(ip_conntrack_lock); + +/* ip_conntrack_standalone needs this */ +atomic_t ip_conntrack_count = ATOMIC_INIT(0); + +void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL; +LIST_HEAD(ip_conntrack_expect_list); +struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO]; +static LIST_HEAD(helpers); +unsigned int ip_conntrack_htable_size = 0; +int ip_conntrack_max; +struct list_head *ip_conntrack_hash; +static kmem_cache_t *ip_conntrack_cachep; +static kmem_cache_t *ip_conntrack_expect_cachep; +struct ip_conntrack ip_conntrack_untracked; +unsigned int ip_ct_log_invalid; +static LIST_HEAD(unconfirmed); +static int ip_conntrack_vmalloc; + +DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + +void +ip_conntrack_put(struct ip_conntrack *ct) +{ + IP_NF_ASSERT(ct); + nf_conntrack_put(&ct->ct_general); +} + +static int ip_conntrack_hash_rnd_initted; +static unsigned int ip_conntrack_hash_rnd; + +static u_int32_t +hash_conntrack(const struct ip_conntrack_tuple *tuple) +{ +#if 0 + dump_tuple(tuple); +#endif + return (jhash_3words(tuple->src.ip, + (tuple->dst.ip ^ tuple->dst.protonum), + (tuple->src.u.all | (tuple->dst.u.all << 16)), + ip_conntrack_hash_rnd) % ip_conntrack_htable_size); +} + +int +ip_ct_get_tuple(const struct iphdr *iph, + const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_protocol *protocol) +{ + /* Never happen */ + if (iph->frag_off & htons(IP_OFFSET)) { + printk("ip_conntrack_core: Frag of proto %u.\n", + iph->protocol); + return 0; + } + + tuple->src.ip = iph->saddr; + tuple->dst.ip = iph->daddr; + tuple->dst.protonum = iph->protocol; + tuple->dst.dir = IP_CT_DIR_ORIGINAL; + + return protocol->pkt_to_tuple(skb, dataoff, tuple); +} + +int +ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig, + const struct ip_conntrack_protocol *protocol) +{ + inverse->src.ip = orig->dst.ip; + inverse->dst.ip = orig->src.ip; + inverse->dst.protonum = orig->dst.protonum; + inverse->dst.dir = !orig->dst.dir; + + return protocol->invert_tuple(inverse, orig); +} + + +/* ip_conntrack_expect helper functions */ +static void destroy_expect(struct ip_conntrack_expect *exp) +{ + ip_conntrack_put(exp->master); + IP_NF_ASSERT(!timer_pending(&exp->timeout)); + kmem_cache_free(ip_conntrack_expect_cachep, exp); + CONNTRACK_STAT_INC(expect_delete); +} + +static void unlink_expect(struct ip_conntrack_expect *exp) +{ + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + list_del(&exp->list); + /* Logically in destroy_expect, but we hold the lock here. */ + exp->master->expecting--; +} + +static void expectation_timed_out(unsigned long ul_expect) +{ + struct ip_conntrack_expect *exp = (void *)ul_expect; + + WRITE_LOCK(&ip_conntrack_lock); + unlink_expect(exp); + WRITE_UNLOCK(&ip_conntrack_lock); + destroy_expect(exp); +} + +/* If an expectation for this connection is found, it gets delete from + * global list then returned. */ +static struct ip_conntrack_expect * +find_expectation(const struct ip_conntrack_tuple *tuple) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + /* If master is not in hash table yet (ie. packet hasn't left + this machine yet), how can other end know about expected? + Hence these are not the droids you are looking for (if + master ct never got confirmed, we'd hold a reference to it + and weird things would happen to future packets). 
*/ + if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) + && is_confirmed(i->master) + && del_timer(&i->timeout)) { + unlink_expect(i); + return i; + } + } + return NULL; +} + +/* delete all expectations for this conntrack */ +static void remove_expectations(struct ip_conntrack *ct) +{ + struct ip_conntrack_expect *i, *tmp; + + /* Optimization: most connection never expect any others. */ + if (ct->expecting == 0) + return; + + list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) { + if (i->master == ct && del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); + } + } +} + +static void +clean_from_lists(struct ip_conntrack *ct) +{ + unsigned int ho, hr; + + DEBUGP("clean_from_lists(%p)\n", ct); + MUST_BE_WRITE_LOCKED(&ip_conntrack_lock); + + ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]); + + /* Destroy all pending expectations */ + remove_expectations(ct); +} + +static void +destroy_conntrack(struct nf_conntrack *nfct) +{ + struct ip_conntrack *ct = (struct ip_conntrack *)nfct; + struct ip_conntrack_protocol *proto; + + DEBUGP("destroy_conntrack(%p)\n", ct); + IP_NF_ASSERT(atomic_read(&nfct->use) == 0); + IP_NF_ASSERT(!timer_pending(&ct->timeout)); + + /* To make sure we don't get any weird locking issues here: + * destroy_conntrack() MUST NOT be called with a write lock + * to ip_conntrack_lock!!! -HW */ + proto = ip_ct_find_proto(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum); + if (proto && proto->destroy) + proto->destroy(ct); + + if (ip_conntrack_destroyed) + ip_conntrack_destroyed(ct); + + WRITE_LOCK(&ip_conntrack_lock); + /* Expectations will have been removed in clean_from_lists, + * except TFTP can create an expectation on the first packet, + * before connection is in the list, so we need to clean here, + * too. */ + remove_expectations(ct); + + /* We overload first tuple to link into unconfirmed list. */ + if (!is_confirmed(ct)) { + BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list)); + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + } + + CONNTRACK_STAT_INC(delete); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (ct->master) + ip_conntrack_put(ct->master); + + DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct); + kmem_cache_free(ip_conntrack_cachep, ct); + atomic_dec(&ip_conntrack_count); +} + +static void death_by_timeout(unsigned long ul_conntrack) +{ + struct ip_conntrack *ct = (void *)ul_conntrack; + + WRITE_LOCK(&ip_conntrack_lock); + /* Inside lock so preempt is disabled on module removal path. + * Otherwise we can get spurious warnings. 
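+	 * The teardown itself: clean_from_lists() unlinks both tuple
+	 * directions from the hash and kills any pending expectations;
+	 * the final ip_conntrack_put() lets destroy_conntrack() run once
+	 * the last reference is gone.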
*/ + CONNTRACK_STAT_INC(delete_list); + clean_from_lists(ct); + WRITE_UNLOCK(&ip_conntrack_lock); + ip_conntrack_put(ct); +} + +static inline int +conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + return tuplehash_to_ctrack(i) != ignored_conntrack + && ip_ct_tuple_equal(tuple, &i->tuple); +} + +static struct ip_conntrack_tuple_hash * +__ip_conntrack_find(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + unsigned int hash = hash_conntrack(tuple); + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + list_for_each_entry(h, &ip_conntrack_hash[hash], list) { + if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) { + CONNTRACK_STAT_INC(found); + return h; + } + CONNTRACK_STAT_INC(searched); + } + + return NULL; +} + +/* Find a connection corresponding to a tuple. */ +struct ip_conntrack_tuple_hash * +ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + READ_UNLOCK(&ip_conntrack_lock); + + return h; +} + +/* Confirm a connection given skb; places it in hash table */ +int +__ip_conntrack_confirm(struct sk_buff **pskb) +{ + unsigned int hash, repl_hash; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* ipt_REJECT uses ip_conntrack_attach to attach related + ICMP/TCP RST packets in other direction. Actual packet + which created connection will be IP_CT_NEW or for an + expected connection, IP_CT_RELATED. */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + /* We're not in hash table, and we refuse to set up related + connections for unconfirmed conns. But packet copies and + REJECT will give spurious warnings here. */ + /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */ + + /* No external references means noone else could have + confirmed us. */ + IP_NF_ASSERT(!is_confirmed(ct)); + DEBUGP("Confirming conntrack %p\n", ct); + + WRITE_LOCK(&ip_conntrack_lock); + + /* See if there's one in the list already, including reverse: + NAT could have grabbed it without realizing, since we're + not in the hash. If there is, we lost race. */ + if (!LIST_FIND(&ip_conntrack_hash[hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL) + && !LIST_FIND(&ip_conntrack_hash[repl_hash], + conntrack_tuple_cmp, + struct ip_conntrack_tuple_hash *, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) { + /* Remove from unconfirmed list */ + list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list); + + list_prepend(&ip_conntrack_hash[hash], + &ct->tuplehash[IP_CT_DIR_ORIGINAL]); + list_prepend(&ip_conntrack_hash[repl_hash], + &ct->tuplehash[IP_CT_DIR_REPLY]); + /* Timer relative to confirmation time, not original + setting time, otherwise we'd get timer wrap in + weird delay cases. 
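+		   (Up to this point ->timeout.expires holds a relative
+		   number of jiffies, as set by ip_ct_refresh_acct() on the
+		   unconfirmed conntrack; adding jiffies below makes it an
+		   absolute expiry before the timer is armed.)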
*/ + ct->timeout.expires += jiffies; + add_timer(&ct->timeout); + atomic_inc(&ct->ct_general.use); + set_bit(IPS_CONFIRMED_BIT, &ct->status); + CONNTRACK_STAT_INC(insert); + WRITE_UNLOCK(&ip_conntrack_lock); + return NF_ACCEPT; + } + + CONNTRACK_STAT_INC(insert_failed); + WRITE_UNLOCK(&ip_conntrack_lock); + + return NF_DROP; +} + +/* Returns true if a connection correspondings to the tuple (required + for NAT). */ +int +ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + struct ip_conntrack_tuple_hash *h; + + READ_LOCK(&ip_conntrack_lock); + h = __ip_conntrack_find(tuple, ignored_conntrack); + READ_UNLOCK(&ip_conntrack_lock); + + return h != NULL; +} + +/* There's a small race here where we may free a just-assured + connection. Too bad: we're in trouble anyway. */ +static inline int unreplied(const struct ip_conntrack_tuple_hash *i) +{ + return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status)); +} + +static int early_drop(struct list_head *chain) +{ + /* Traverse backwards: gives us oldest, which is roughly LRU */ + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct = NULL; + int dropped = 0; + + READ_LOCK(&ip_conntrack_lock); + h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *); + if (h) { + ct = tuplehash_to_ctrack(h); + atomic_inc(&ct->ct_general.use); + } + READ_UNLOCK(&ip_conntrack_lock); + + if (!ct) + return dropped; + + if (del_timer(&ct->timeout)) { + death_by_timeout((unsigned long)ct); + dropped = 1; + CONNTRACK_STAT_INC(early_drop); + } + ip_conntrack_put(ct); + return dropped; +} + +static inline int helper_cmp(const struct ip_conntrack_helper *i, + const struct ip_conntrack_tuple *rtuple) +{ + return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask); +} + +static struct ip_conntrack_helper *ip_ct_find_helper(const struct ip_conntrack_tuple *tuple) +{ + return LIST_FIND(&helpers, helper_cmp, + struct ip_conntrack_helper *, + tuple); +} + +/* Allocate a new conntrack: we return -ENOMEM if classification + failed due to stress. Otherwise it really is unclassifiable. */ +static struct ip_conntrack_tuple_hash * +init_conntrack(const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *protocol, + struct sk_buff *skb) +{ + struct ip_conntrack *conntrack; + struct ip_conntrack_tuple repl_tuple; + size_t hash; + struct ip_conntrack_expect *exp; + + if (!ip_conntrack_hash_rnd_initted) { + get_random_bytes(&ip_conntrack_hash_rnd, 4); + ip_conntrack_hash_rnd_initted = 1; + } + + hash = hash_conntrack(tuple); + + if (ip_conntrack_max + && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) { + /* Try dropping from this hash chain. 
*/ + if (!early_drop(&ip_conntrack_hash[hash])) { + if (net_ratelimit()) + printk(KERN_WARNING + "ip_conntrack: table full, dropping" + " packet.\n"); + return ERR_PTR(-ENOMEM); + } + } + + if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) { + DEBUGP("Can't invert tuple.\n"); + return NULL; + } + + conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC); + if (!conntrack) { + DEBUGP("Can't allocate conntrack.\n"); + return ERR_PTR(-ENOMEM); + } + + memset(conntrack, 0, sizeof(*conntrack)); + atomic_set(&conntrack->ct_general.use, 1); + conntrack->ct_general.destroy = destroy_conntrack; + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *tuple; + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = repl_tuple; + if (!protocol->new(conntrack, skb)) { + kmem_cache_free(ip_conntrack_cachep, conntrack); + return NULL; + } + /* Don't set timer yet: wait for confirmation */ + init_timer(&conntrack->timeout); + conntrack->timeout.data = (unsigned long)conntrack; + conntrack->timeout.function = death_by_timeout; + + WRITE_LOCK(&ip_conntrack_lock); + exp = find_expectation(tuple); + + if (exp) { + DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n", + conntrack, exp); + /* Welcome, Mr. Bond. We've been expecting you... */ + __set_bit(IPS_EXPECTED_BIT, &conntrack->status); + conntrack->master = exp->master; +#if CONFIG_IP_NF_CONNTRACK_MARK + conntrack->mark = exp->master->mark; +#endif + nf_conntrack_get(&conntrack->master->ct_general); + CONNTRACK_STAT_INC(expect_new); + } else { + conntrack->helper = ip_ct_find_helper(&repl_tuple); + + CONNTRACK_STAT_INC(new); + } + + /* Overload tuple linked list to put us in unconfirmed list. */ + list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed); + + atomic_inc(&ip_conntrack_count); + WRITE_UNLOCK(&ip_conntrack_lock); + + if (exp) { + if (exp->expectfn) + exp->expectfn(conntrack, exp); + destroy_expect(exp); + } + + return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL]; +} + +/* On success, returns conntrack ptr, sets skb->nfct and ctinfo */ +static inline struct ip_conntrack * +resolve_normal_ct(struct sk_buff *skb, + struct ip_conntrack_protocol *proto, + int *set_reply, + unsigned int hooknum, + enum ip_conntrack_info *ctinfo) +{ + struct ip_conntrack_tuple tuple; + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack *ct; + + IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0); + + if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4, + &tuple,proto)) + return NULL; + + /* look for tuple match */ + h = ip_conntrack_find_get(&tuple, NULL); + if (!h) { + h = init_conntrack(&tuple, proto, skb); + if (!h) + return NULL; + if (IS_ERR(h)) + return (void *)h; + } + ct = tuplehash_to_ctrack(h); + + /* It exists; we have (non-exclusive) reference. */ + if (DIRECTION(h) == IP_CT_DIR_REPLY) { + *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY; + /* Please set reply bit if this packet OK */ + *set_reply = 1; + } else { + /* Once we've had two way comms, always ESTABLISHED. */ + if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { + DEBUGP("ip_conntrack_in: normal packet for %p\n", + ct); + *ctinfo = IP_CT_ESTABLISHED; + } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { + DEBUGP("ip_conntrack_in: related packet for %p\n", + ct); + *ctinfo = IP_CT_RELATED; + } else { + DEBUGP("ip_conntrack_in: new packet for %p\n", + ct); + *ctinfo = IP_CT_NEW; + } + *set_reply = 0; + } + skb->nfct = &ct->ct_general; + skb->nfctinfo = *ctinfo; + return ct; +} + +/* Netfilter hook itself. 
*/ +unsigned int ip_conntrack_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_conntrack_protocol *proto; + int set_reply; + int ret; + + /* Previously seen (loopback or untracked)? Ignore. */ + if ((*pskb)->nfct) { + CONNTRACK_STAT_INC(ignore); + return NF_ACCEPT; + } + + /* Never happen */ + if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) { + if (net_ratelimit()) { + printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n", + (*pskb)->nh.iph->protocol, hooknum); + } + return NF_DROP; + } + + /* FIXME: Do this right please. --RR */ + (*pskb)->nfcache |= NFC_UNKNOWN; + +/* Doesn't cover locally-generated broadcast, so not worth it. */ +#if 0 + /* Ignore broadcast: no `connection'. */ + if ((*pskb)->pkt_type == PACKET_BROADCAST) { + printk("Broadcast packet!\n"); + return NF_ACCEPT; + } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) + == htonl(0x000000FF)) { + printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + NIPQUAD((*pskb)->nh.iph->daddr), + (*pskb)->sk, (*pskb)->pkt_type); + } +#endif + + proto = ip_ct_find_proto((*pskb)->nh.iph->protocol); + + /* It may be an special packet, error, unclean... + * inverse of the return code tells to the netfilter + * core what to do with the packet. */ + if (proto->error != NULL + && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) { + CONNTRACK_STAT_INC(error); + CONNTRACK_STAT_INC(invalid); + return -ret; + } + + if (!(ct = resolve_normal_ct(*pskb, proto,&set_reply,hooknum,&ctinfo))) { + /* Not valid part of a connection */ + CONNTRACK_STAT_INC(invalid); + return NF_ACCEPT; + } + + if (IS_ERR(ct)) { + /* Too stressed to deal. */ + CONNTRACK_STAT_INC(drop); + return NF_DROP; + } + + IP_NF_ASSERT((*pskb)->nfct); + + ret = proto->packet(ct, *pskb, ctinfo); + if (ret < 0) { + /* Invalid: inverse of the return code tells + * the netfilter core what to do*/ + nf_conntrack_put((*pskb)->nfct); + (*pskb)->nfct = NULL; + CONNTRACK_STAT_INC(invalid); + return -ret; + } + + if (set_reply) + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + + return ret; +} + +int invert_tuplepr(struct ip_conntrack_tuple *inverse, + const struct ip_conntrack_tuple *orig) +{ + return ip_ct_invert_tuple(inverse, orig, + ip_ct_find_proto(orig->dst.protonum)); +} + +/* Would two expected things clash? */ +static inline int expect_clash(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) +{ + /* Part covered by intersection of masks must be unequal, + otherwise they clash */ + struct ip_conntrack_tuple intersect_mask + = { { a->mask.src.ip & b->mask.src.ip, + { a->mask.src.u.all & b->mask.src.u.all } }, + { a->mask.dst.ip & b->mask.dst.ip, + { a->mask.dst.u.all & b->mask.dst.u.all }, + a->mask.dst.protonum & b->mask.dst.protonum } }; + + return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask); +} + +static inline int expect_matches(const struct ip_conntrack_expect *a, + const struct ip_conntrack_expect *b) +{ + return a->master == b->master + && ip_ct_tuple_equal(&a->tuple, &b->tuple) + && ip_ct_tuple_equal(&a->mask, &b->mask); +} + +/* Generally a bad idea to call this: could have matched already. 
*/ +void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp) +{ + struct ip_conntrack_expect *i; + + WRITE_LOCK(&ip_conntrack_lock); + /* choose the the oldest expectation to evict */ + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, exp) && del_timer(&i->timeout)) { + unlink_expect(i); + WRITE_UNLOCK(&ip_conntrack_lock); + destroy_expect(i); + return; + } + } + WRITE_UNLOCK(&ip_conntrack_lock); +} + +struct ip_conntrack_expect *ip_conntrack_expect_alloc(void) +{ + struct ip_conntrack_expect *new; + + new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC); + if (!new) { + DEBUGP("expect_related: OOM allocating expect\n"); + return NULL; + } + new->master = NULL; + return new; +} + +void ip_conntrack_expect_free(struct ip_conntrack_expect *expect) +{ + kmem_cache_free(ip_conntrack_expect_cachep, expect); +} + +static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp) +{ + atomic_inc(&exp->master->ct_general.use); + exp->master->expecting++; + list_add(&exp->list, &ip_conntrack_expect_list); + + if (exp->master->helper->timeout) { + init_timer(&exp->timeout); + exp->timeout.data = (unsigned long)exp; + exp->timeout.function = expectation_timed_out; + exp->timeout.expires + = jiffies + exp->master->helper->timeout * HZ; + add_timer(&exp->timeout); + } else + exp->timeout.function = NULL; + + CONNTRACK_STAT_INC(expect_create); +} + +/* Race with expectations being used means we could have none to find; OK. */ +static void evict_oldest_expect(struct ip_conntrack *master) +{ + struct ip_conntrack_expect *i; + + list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) { + if (i->master == master) { + if (del_timer(&i->timeout)) { + unlink_expect(i); + destroy_expect(i); + } + break; + } + } +} + +static inline int refresh_timer(struct ip_conntrack_expect *i) +{ + if (!del_timer(&i->timeout)) + return 0; + + i->timeout.expires = jiffies + i->master->helper->timeout*HZ; + add_timer(&i->timeout); + return 1; +} + +int ip_conntrack_expect_related(struct ip_conntrack_expect *expect) +{ + struct ip_conntrack_expect *i; + int ret; + + DEBUGP("ip_conntrack_expect_related %p\n", related_to); + DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple); + DEBUGP("mask: "); DUMP_TUPLE(&expect->mask); + + WRITE_LOCK(&ip_conntrack_lock); + list_for_each_entry(i, &ip_conntrack_expect_list, list) { + if (expect_matches(i, expect)) { + /* Refresh timer: if it's dying, ignore.. */ + if (refresh_timer(i)) { + ret = 0; + /* We don't need the one they've given us. */ + ip_conntrack_expect_free(expect); + goto out; + } + } else if (expect_clash(i, expect)) { + ret = -EBUSY; + goto out; + } + } + + /* Will be over limit? */ + if (expect->master->helper->max_expected && + expect->master->expecting >= expect->master->helper->max_expected) + evict_oldest_expect(expect->master); + + ip_conntrack_expect_insert(expect); + ret = 0; +out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +/* Alter reply tuple (maybe alter helper). 
This is for NAT, and is + implicitly racy: see __ip_conntrack_confirm */ +void ip_conntrack_alter_reply(struct ip_conntrack *conntrack, + const struct ip_conntrack_tuple *newreply) +{ + WRITE_LOCK(&ip_conntrack_lock); + /* Should be unconfirmed, so not in hash table yet */ + IP_NF_ASSERT(!is_confirmed(conntrack)); + + DEBUGP("Altering reply tuple of %p to ", conntrack); + DUMP_TUPLE(newreply); + + conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; + if (!conntrack->master && conntrack->expecting == 0) + conntrack->helper = ip_ct_find_helper(newreply); + WRITE_UNLOCK(&ip_conntrack_lock); +} + +int ip_conntrack_helper_register(struct ip_conntrack_helper *me) +{ + BUG_ON(me->timeout == 0); + WRITE_LOCK(&ip_conntrack_lock); + list_prepend(&helpers, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + return 0; +} + +static inline int unhelp(struct ip_conntrack_tuple_hash *i, + const struct ip_conntrack_helper *me) +{ + if (tuplehash_to_ctrack(i)->helper == me) + tuplehash_to_ctrack(i)->helper = NULL; + return 0; +} + +void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me) +{ + unsigned int i; + struct ip_conntrack_expect *exp, *tmp; + + /* Need write lock here, to delete helper. */ + WRITE_LOCK(&ip_conntrack_lock); + LIST_DELETE(&helpers, me); + + /* Get rid of expectations */ + list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) { + if (exp->master->helper == me && del_timer(&exp->timeout)) { + unlink_expect(exp); + destroy_expect(exp); + } + } + /* Get rid of expecteds, set helpers to NULL. */ + LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me); + for (i = 0; i < ip_conntrack_htable_size; i++) + LIST_FIND_W(&ip_conntrack_hash[i], unhelp, + struct ip_conntrack_tuple_hash *, me); + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Someone could be still looking at the helper in a bh. */ + synchronize_net(); +} + +static inline void ct_add_counters(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb) +{ +#ifdef CONFIG_IP_NF_CT_ACCT + if (skb) { + ct->counters[CTINFO2DIR(ctinfo)].packets++; + ct->counters[CTINFO2DIR(ctinfo)].bytes += + ntohs(skb->nh.iph->tot_len); + } +#endif +} + +/* Refresh conntrack for this many jiffies and do accounting (if skb != NULL) */ +void ip_ct_refresh_acct(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + const struct sk_buff *skb, + unsigned long extra_jiffies) +{ + IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* If not in hash table, timer will not be active yet */ + if (!is_confirmed(ct)) { + ct->timeout.expires = extra_jiffies; + ct_add_counters(ct, ctinfo, skb); + } else { + WRITE_LOCK(&ip_conntrack_lock); + /* Need del_timer for race avoidance (may already be dying). */ + if (del_timer(&ct->timeout)) { + ct->timeout.expires = jiffies + extra_jiffies; + add_timer(&ct->timeout); + } + ct_add_counters(ct, ctinfo, skb); + WRITE_UNLOCK(&ip_conntrack_lock); + } +} + +/* Returns new sk_buff, or NULL */ +struct sk_buff * +ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user) +{ + struct sock *sk = skb->sk; +#ifdef CONFIG_NETFILTER_DEBUG + unsigned int olddebug = skb->nf_debug; +#endif + + if (sk) { + sock_hold(sk); + skb_orphan(skb); + } + + local_bh_disable(); + skb = ip_defrag(skb, user); + local_bh_enable(); + + if (!skb) { + if (sk) + sock_put(sk); + return skb; + } + + if (sk) { + skb_set_owner_w(skb, sk); + sock_put(sk); + } + + ip_send_check(skb->nh.iph); + skb->nfcache |= NFC_ALTERED; +#ifdef CONFIG_NETFILTER_DEBUG + /* Packet path as if nothing had happened. 
*/ + skb->nf_debug = olddebug; +#endif + return skb; +} + +/* Used by ipt_REJECT. */ +static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + /* This ICMP is in reverse direction to the packet which caused it */ + ct = ip_conntrack_get(skb, &ctinfo); + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) + ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY; + else + ctinfo = IP_CT_RELATED; + + /* Attach to new skbuff, and increment count */ + nskb->nfct = &ct->ct_general; + nskb->nfctinfo = ctinfo; + nf_conntrack_get(nskb->nfct); +} + +static inline int +do_iter(const struct ip_conntrack_tuple_hash *i, + int (*iter)(struct ip_conntrack *i, void *data), + void *data) +{ + return iter(tuplehash_to_ctrack(i), data); +} + +/* Bring out ya dead! */ +static struct ip_conntrack_tuple_hash * +get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data), + void *data, unsigned int *bucket) +{ + struct ip_conntrack_tuple_hash *h = NULL; + + WRITE_LOCK(&ip_conntrack_lock); + for (; *bucket < ip_conntrack_htable_size; (*bucket)++) { + h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + break; + } + if (!h) + h = LIST_FIND_W(&unconfirmed, do_iter, + struct ip_conntrack_tuple_hash *, iter, data); + if (h) + atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use); + WRITE_UNLOCK(&ip_conntrack_lock); + + return h; +} + +void +ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data) +{ + struct ip_conntrack_tuple_hash *h; + unsigned int bucket = 0; + + while ((h = get_next_corpse(iter, data, &bucket)) != NULL) { + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + /* Time to push up daises... */ + if (del_timer(&ct->timeout)) + death_by_timeout((unsigned long)ct); + /* ... else the timer will get him soon. */ + + ip_conntrack_put(ct); + } +} + +/* Fast function for those who don't want to parse /proc (and I don't + blame them). */ +/* Reversing the socket's dst/src point of view gives us the reply + mapping. */ +static int +getorigdst(struct sock *sk, int optval, void __user *user, int *len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ip_conntrack_tuple_hash *h; + struct ip_conntrack_tuple tuple; + + IP_CT_TUPLE_U_BLANK(&tuple); + tuple.src.ip = inet->rcv_saddr; + tuple.src.u.tcp.port = inet->sport; + tuple.dst.ip = inet->daddr; + tuple.dst.u.tcp.port = inet->dport; + tuple.dst.protonum = IPPROTO_TCP; + + /* We only do TCP at the moment: is there a better way? 
*/ + if (strcmp(sk->sk_prot->name, "TCP")) { + DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n"); + return -ENOPROTOOPT; + } + + if ((unsigned int) *len < sizeof(struct sockaddr_in)) { + DEBUGP("SO_ORIGINAL_DST: len %u not %u\n", + *len, sizeof(struct sockaddr_in)); + return -EINVAL; + } + + h = ip_conntrack_find_get(&tuple, NULL); + if (h) { + struct sockaddr_in sin; + struct ip_conntrack *ct = tuplehash_to_ctrack(h); + + sin.sin_family = AF_INET; + sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.u.tcp.port; + sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.ip; + + DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n", + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + ip_conntrack_put(ct); + if (copy_to_user(user, &sin, sizeof(sin)) != 0) + return -EFAULT; + else + return 0; + } + DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n", + NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port), + NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port)); + return -ENOENT; +} + +static struct nf_sockopt_ops so_getorigdst = { + .pf = PF_INET, + .get_optmin = SO_ORIGINAL_DST, + .get_optmax = SO_ORIGINAL_DST+1, + .get = &getorigdst, +}; + +static int kill_all(struct ip_conntrack *i, void *data) +{ + return 1; +} + +static void free_conntrack_hash(void) +{ + if (ip_conntrack_vmalloc) + vfree(ip_conntrack_hash); + else + free_pages((unsigned long)ip_conntrack_hash, + get_order(sizeof(struct list_head) + * ip_conntrack_htable_size)); +} + +/* Mishearing the voices in his head, our hero wonders how he's + supposed to kill the mall. */ +void ip_conntrack_cleanup(void) +{ + ip_ct_attach = NULL; + /* This makes sure all current packets have passed through + netfilter framework. Roll on, two-stage module + delete... */ + synchronize_net(); + + i_see_dead_people: + ip_ct_iterate_cleanup(kill_all, NULL); + if (atomic_read(&ip_conntrack_count) != 0) { + schedule(); + goto i_see_dead_people; + } + + kmem_cache_destroy(ip_conntrack_cachep); + kmem_cache_destroy(ip_conntrack_expect_cachep); + free_conntrack_hash(); + nf_unregister_sockopt(&so_getorigdst); +} + +static int hashsize; +module_param(hashsize, int, 0400); + +int __init ip_conntrack_init(void) +{ + unsigned int i; + int ret; + + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB + * machine has 256 buckets. >= 1GB machines have 8192 buckets. */ + if (hashsize) { + ip_conntrack_htable_size = hashsize; + } else { + ip_conntrack_htable_size + = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + ip_conntrack_htable_size = 8192; + if (ip_conntrack_htable_size < 16) + ip_conntrack_htable_size = 16; + } + ip_conntrack_max = 8 * ip_conntrack_htable_size; + + printk("ip_conntrack version %s (%u buckets, %d max)" + " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION, + ip_conntrack_htable_size, ip_conntrack_max, + sizeof(struct ip_conntrack)); + + ret = nf_register_sockopt(&so_getorigdst); + if (ret != 0) { + printk(KERN_ERR "Unable to register netfilter socket option\n"); + return ret; + } + + /* AK: the hash table is twice as big than needed because it + uses list_head. it would be much nicer to caches to use a + single pointer list head here. 
*/ + ip_conntrack_vmalloc = 0; + ip_conntrack_hash + =(void*)__get_free_pages(GFP_KERNEL, + get_order(sizeof(struct list_head) + *ip_conntrack_htable_size)); + if (!ip_conntrack_hash) { + ip_conntrack_vmalloc = 1; + printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n"); + ip_conntrack_hash = vmalloc(sizeof(struct list_head) + * ip_conntrack_htable_size); + } + if (!ip_conntrack_hash) { + printk(KERN_ERR "Unable to create ip_conntrack_hash\n"); + goto err_unreg_sockopt; + } + + ip_conntrack_cachep = kmem_cache_create("ip_conntrack", + sizeof(struct ip_conntrack), 0, + 0, NULL, NULL); + if (!ip_conntrack_cachep) { + printk(KERN_ERR "Unable to create ip_conntrack slab cache\n"); + goto err_free_hash; + } + + ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect", + sizeof(struct ip_conntrack_expect), + 0, 0, NULL, NULL); + if (!ip_conntrack_expect_cachep) { + printk(KERN_ERR "Unable to create ip_expect slab cache\n"); + goto err_free_conntrack_slab; + } + + /* Don't NEED lock here, but good form anyway. */ + WRITE_LOCK(&ip_conntrack_lock); + for (i = 0; i < MAX_IP_CT_PROTO; i++) + ip_ct_protos[i] = &ip_conntrack_generic_protocol; + /* Sew in builtin protocols. */ + ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp; + ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp; + ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp; + WRITE_UNLOCK(&ip_conntrack_lock); + + for (i = 0; i < ip_conntrack_htable_size; i++) + INIT_LIST_HEAD(&ip_conntrack_hash[i]); + + /* For use by ipt_REJECT */ + ip_ct_attach = ip_conntrack_attach; + + /* Set up fake conntrack: + - to never be deleted, not in any hashes */ + atomic_set(&ip_conntrack_untracked.ct_general.use, 1); + /* - and look it like as a confirmed connection */ + set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status); + + return ret; + +err_free_conntrack_slab: + kmem_cache_destroy(ip_conntrack_cachep); +err_free_hash: + free_conntrack_hash(); +err_unreg_sockopt: + nf_unregister_sockopt(&so_getorigdst); + + return -ENOMEM; +} diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c new file mode 100644 index 000000000000..12b88cbb11db --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -0,0 +1,501 @@ +/* FTP extension for IP connection tracking. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("ftp connection tracking helper"); + +/* This is slow, but it's simple. --RR */ +static char ftp_buffer[65536]; + +static DECLARE_LOCK(ip_ftp_lock); + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, int, &ports_c, 0400); + +static int loose; +module_param(loose, int, 0600); + +unsigned int (*ip_nat_ftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq); +EXPORT_SYMBOL_GPL(ip_nat_ftp_hook); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +static int try_rfc959(const char *, size_t, u_int32_t [], char); +static int try_eprt(const char *, size_t, u_int32_t [], char); +static int try_epsv_response(const char *, size_t, u_int32_t [], char); + +static struct ftp_search { + enum ip_conntrack_dir dir; + const char *pattern; + size_t plen; + char skip; + char term; + enum ip_ct_ftp_type ftptype; + int (*getnum)(const char *, size_t, u_int32_t[], char); +} search[] = { + { + IP_CT_DIR_ORIGINAL, + "PORT", sizeof("PORT") - 1, ' ', '\r', + IP_CT_FTP_PORT, + try_rfc959, + }, + { + IP_CT_DIR_REPLY, + "227 ", sizeof("227 ") - 1, '(', ')', + IP_CT_FTP_PASV, + try_rfc959, + }, + { + IP_CT_DIR_ORIGINAL, + "EPRT", sizeof("EPRT") - 1, ' ', '\r', + IP_CT_FTP_EPRT, + try_eprt, + }, + { + IP_CT_DIR_REPLY, + "229 ", sizeof("229 ") - 1, '(', ')', + IP_CT_FTP_EPSV, + try_epsv_response, + }, +}; + +static int try_number(const char *data, size_t dlen, u_int32_t array[], + int array_size, char sep, char term) +{ + u_int32_t i, len; + + memset(array, 0, sizeof(array[0])*array_size); + + /* Keep data pointing at next char. */ + for (i = 0, len = 0; len < dlen && i < array_size; len++, data++) { + if (*data >= '0' && *data <= '9') { + array[i] = array[i]*10 + *data - '0'; + } + else if (*data == sep) + i++; + else { + /* Unexpected character; true if it's the + terminator and we're finished. */ + if (*data == term && i == array_size - 1) + return len; + + DEBUGP("Char %u (got %u nums) `%u' unexpected\n", + len, i, *data); + return 0; + } + } + DEBUGP("Failed to fill %u numbers separated by %c\n", array_size, sep); + + return 0; +} + +/* Returns 0, or length of numbers: 192,168,1,1,5,6 */ +static int try_rfc959(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + return try_number(data, dlen, array, 6, ',', term); +} + +/* Grab port: number up to delimiter */ +static int get_port(const char *data, int start, size_t dlen, char delim, + u_int32_t array[2]) +{ + u_int16_t port = 0; + int i; + + for (i = start; i < dlen; i++) { + /* Finished? */ + if (data[i] == delim) { + if (port == 0) + break; + array[0] = port >> 8; + array[1] = port; + return i + 1; + } + else if (data[i] >= '0' && data[i] <= '9') + port = port*10 + data[i] - '0'; + else /* Some other crap */ + break; + } + return 0; +} + +/* Returns 0, or length of numbers: |1|132.235.1.2|6275| */ +static int try_eprt(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + char delim; + int length; + + /* First character is delimiter, then "1" for IPv4, then + delimiter again. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != '1' || data[2] != delim) + return 0; + + DEBUGP("EPRT: Got |1|!\n"); + /* Now we have IP address. */ + length = try_number(data + 3, dlen - 3, array, 4, '.', delim); + if (length == 0) + return 0; + + DEBUGP("EPRT: Got IP address!\n"); + /* Start offset includes initial "|1|", and trailing delimiter */ + return get_port(data, 3 + length + 1, dlen, delim, array+4); +} + +/* Returns 0, or length of numbers: |||6446| */ +static int try_epsv_response(const char *data, size_t dlen, u_int32_t array[6], + char term) +{ + char delim; + + /* Three delimiters. */ + if (dlen <= 3) return 0; + delim = data[0]; + if (isdigit(delim) || delim < 33 || delim > 126 + || data[1] != delim || data[2] != delim) + return 0; + + return get_port(data, 3, dlen, delim, array+4); +} + +/* Return 1 for match, 0 for accept, -1 for partial. 
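+   1: the pattern was found and the numbers following it parsed
+   (numoff/numlen and array[] are filled in).  0: the pattern is
+   definitely not there, let the packet through untouched.  -1: the
+   command looks truncated (or its numbers could not be parsed in
+   full); the caller drops such packets rather than lose track of the
+   data connection.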
*/ +static int find_pattern(const char *data, size_t dlen, + const char *pattern, size_t plen, + char skip, char term, + unsigned int *numoff, + unsigned int *numlen, + u_int32_t array[6], + int (*getnum)(const char *, size_t, u_int32_t[], char)) +{ + size_t i; + + DEBUGP("find_pattern `%s': dlen = %u\n", pattern, dlen); + if (dlen == 0) + return 0; + + if (dlen <= plen) { + /* Short packet: try for partial? */ + if (strnicmp(data, pattern, dlen) == 0) + return -1; + else return 0; + } + + if (strnicmp(data, pattern, plen) != 0) { +#if 0 + size_t i; + + DEBUGP("ftp: string mismatch\n"); + for (i = 0; i < plen; i++) { + DEBUGP("ftp:char %u `%c'(%u) vs `%c'(%u)\n", + i, data[i], data[i], + pattern[i], pattern[i]); + } +#endif + return 0; + } + + DEBUGP("Pattern matches!\n"); + /* Now we've found the constant string, try to skip + to the 'skip' character */ + for (i = plen; data[i] != skip; i++) + if (i == dlen - 1) return -1; + + /* Skip over the last character */ + i++; + + DEBUGP("Skipped up to `%c'!\n", skip); + + *numoff = i; + *numlen = getnum(data + i, dlen - i, array, term); + if (!*numlen) + return -1; + + DEBUGP("Match succeeded!\n"); + return 1; +} + +/* Look up to see if we're just after a \n. */ +static int find_nl_seq(u16 seq, const struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i; + + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) + if (info->seq_aft_nl[dir][i] == seq) + return 1; + return 0; +} + +/* We don't update if it's older than what we have. */ +static void update_nl_seq(u16 nl_seq, struct ip_ct_ftp_master *info, int dir) +{ + unsigned int i, oldest = NUM_SEQ_TO_REMEMBER; + + /* Look for oldest: if we find exact match, we're done. */ + for (i = 0; i < info->seq_aft_nl_num[dir]; i++) { + if (info->seq_aft_nl[dir][i] == nl_seq) + return; + + if (oldest == info->seq_aft_nl_num[dir] + || before(info->seq_aft_nl[dir][i], oldest)) + oldest = i; + } + + if (info->seq_aft_nl_num[dir] < NUM_SEQ_TO_REMEMBER) + info->seq_aft_nl[dir][info->seq_aft_nl_num[dir]++] = nl_seq; + else if (oldest != NUM_SEQ_TO_REMEMBER) + info->seq_aft_nl[dir][oldest] = nl_seq; +} + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + struct tcphdr _tcph, *th; + char *fb_ptr; + int ret; + u32 seq, array[6] = { 0 }; + int dir = CTINFO2DIR(ctinfo); + unsigned int matchlen, matchoff; + struct ip_ct_ftp_master *ct_ftp_info = &ct->help.ct_ftp_info; + struct ip_conntrack_expect *exp; + unsigned int i; + int found = 0, ends_in_nl; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED+IP_CT_IS_REPLY) { + DEBUGP("ftp: Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; + /* No data? */ + if (dataoff >= (*pskb)->len) { + DEBUGP("ftp: pskblen = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + datalen = (*pskb)->len - dataoff; + + LOCK_BH(&ip_ftp_lock); + fb_ptr = skb_header_pointer(*pskb, dataoff, + (*pskb)->len - dataoff, ftp_buffer); + BUG_ON(fb_ptr == NULL); + + ends_in_nl = (fb_ptr[datalen - 1] == '\n'); + seq = ntohl(th->seq) + datalen; + + /* Look up to see if we're just after a \n. */ + if (!find_nl_seq(ntohl(th->seq), ct_ftp_info, dir)) { + /* Now if this ends in \n, update ftp info. 
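+		   (This packet does not begin immediately after a newline
+		   we have recorded, so its contents are not parsed; we
+		   still jump to out_update_nl so that, if it ends in a
+		   newline itself, the position is remembered for the next
+		   packet.)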
*/ + DEBUGP("ip_conntrack_ftp_help: wrong seq pos %s(%u) or %s(%u)\n", + ct_ftp_info->seq_aft_nl[0][dir] + old_seq_aft_nl_set ? "":"(UNSET) ", old_seq_aft_nl); + ret = NF_ACCEPT; + goto out_update_nl; + } + + /* Initialize IP array to expected address (it's not mentioned + in EPSV responses) */ + array[0] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 24) & 0xFF; + array[1] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 16) & 0xFF; + array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; + array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; + + for (i = 0; i < ARRAY_SIZE(search); i++) { + if (search[i].dir != dir) continue; + + found = find_pattern(fb_ptr, (*pskb)->len - dataoff, + search[i].pattern, + search[i].plen, + search[i].skip, + search[i].term, + &matchoff, &matchlen, + array, + search[i].getnum); + if (found) break; + } + if (found == -1) { + /* We don't usually drop packets. After all, this is + connection tracking, not packet filtering. + However, it is necessary for accurate tracking in + this case. */ + if (net_ratelimit()) + printk("conntrack_ftp: partial %s %u+%u\n", + search[i].pattern, + ntohl(th->seq), datalen); + ret = NF_DROP; + goto out; + } else if (found == 0) { /* No match */ + ret = NF_ACCEPT; + goto out_update_nl; + } + + DEBUGP("conntrack_ftp: match `%s' (%u bytes at %u)\n", + fb_ptr + matchoff, matchlen, ntohl(th->seq) + matchoff); + + /* Allocate expectation which will be inserted */ + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* We refer to the reverse direction ("!dir") tuples here, + * because we're expecting something in the other direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + + if (htonl((array[0] << 24) | (array[1] << 16) | (array[2] << 8) | array[3]) + != ct->tuplehash[dir].tuple.src.ip) { + /* Enrico Scholz's passive FTP to partially RNAT'd ftp + server: it really wants us to connect to a + different IP address. Simply don't record it for + NAT. */ + DEBUGP("conntrack_ftp: NOT RECORDING: %u,%u,%u,%u != %u.%u.%u.%u\n", + array[0], array[1], array[2], array[3], + NIPQUAD(ct->tuplehash[dir].tuple.src.ip)); + + /* Thanks to Cristiano Lincoln Mattos + for reporting this potential + problem (DMZ machines opening holes to internal + networks, or the packet filter itself). */ + if (!loose) { + ret = NF_ACCEPT; + ip_conntrack_expect_free(exp); + goto out_update_nl; + } + exp->tuple.dst.ip = htonl((array[0] << 24) | (array[1] << 16) + | (array[2] << 8) | array[3]); + } + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.dst.u.tcp.port = htons(array[4] << 8 | array[5]); + exp->tuple.src.u.tcp.port = 0; /* Don't care. */ + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask = ((struct ip_conntrack_tuple) + { { 0xFFFFFFFF, { 0 } }, + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); + + exp->expectfn = NULL; + exp->master = ct; + + /* Now, NAT might want to mangle the packet, and register the + * (possibly changed) expectation itself. */ + if (ip_nat_ftp_hook) + ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + matchoff, matchlen, exp, &seq); + else { + /* Can't expect this? Best to drop packet now. */ + if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } else + ret = NF_ACCEPT; + } + +out_update_nl: + /* Now if this ends in \n, update ftp info. Seq may have been + * adjusted by NAT code. 
*/ + if (ends_in_nl) + update_nl_seq(seq, ct_ftp_info,dir); + out: + UNLOCK_BH(&ip_ftp_lock); + return ret; +} + +static struct ip_conntrack_helper ftp[MAX_PORTS]; +static char ftp_names[MAX_PORTS][10]; + +/* Not __exit: called from init() */ +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("ip_ct_ftp: unregistering helper for port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&ftp[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = FTP_PORT; + + for (i = 0; i < ports_c; i++) { + ftp[i].tuple.src.u.tcp.port = htons(ports[i]); + ftp[i].tuple.dst.protonum = IPPROTO_TCP; + ftp[i].mask.src.u.tcp.port = 0xFFFF; + ftp[i].mask.dst.protonum = 0xFF; + ftp[i].max_expected = 1; + ftp[i].timeout = 5 * 60; /* 5 minutes */ + ftp[i].me = THIS_MODULE; + ftp[i].help = help; + + tmpname = &ftp_names[i][0]; + if (ports[i] == FTP_PORT) + sprintf(tmpname, "ftp"); + else + sprintf(tmpname, "ftp-%d", ports[i]); + ftp[i].name = tmpname; + + DEBUGP("ip_ct_ftp: registering helper for port %d\n", + ports[i]); + ret = ip_conntrack_helper_register(&ftp[i]); + + if (ret) { + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_irc.c b/net/ipv4/netfilter/ip_conntrack_irc.c new file mode 100644 index 000000000000..33cc7348b6ee --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_irc.c @@ -0,0 +1,313 @@ +/* IRC extension for IP connection tracking, Version 1.21 + * (C) 2000-2002 by Harald Welte + * based on RR's ip_conntrack_ftp.c + * + * ip_conntrack_irc.c,v 1.21 2002/02/05 14:49:26 laforge Exp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + ** + * Module load syntax: + * insmod ip_conntrack_irc.o ports=port1,port2,...port + * max_dcc_channels=n dcc_timeout=secs + * + * please give the ports of all IRC servers You wish to connect to. + * If You don't specify ports, the default will be port 6667. + * With max_dcc_channels you can define the maximum number of not + * yet answered DCC channels per IRC session (default 8). + * With dcc_timeout you can specify how long the system waits for + * an expected DCC channel (default 300 seconds). + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +static int max_dcc_channels = 8; +static unsigned int dcc_timeout = 300; +/* This is slow, but it's simple. 
--RR */ +static char irc_buffer[65536]; +static DECLARE_LOCK(irc_buffer_lock); + +unsigned int (*ip_nat_irc_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_irc_hook); + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IRC (DCC) connection tracking helper"); +MODULE_LICENSE("GPL"); +module_param_array(ports, int, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of IRC servers"); +module_param(max_dcc_channels, int, 0400); +MODULE_PARM_DESC(max_dcc_channels, "max number of expected DCC channels per IRC session"); +module_param(dcc_timeout, int, 0400); +MODULE_PARM_DESC(dcc_timeout, "timeout on for unestablished DCC channels"); + +static char *dccprotos[] = { "SEND ", "CHAT ", "MOVE ", "TSEND ", "SCHAT " }; +#define MINMATCHLEN 5 + +#if 0 +#define DEBUGP(format, args...) printk(KERN_DEBUG "%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) +#endif + +static int parse_dcc(char *data, char *data_end, u_int32_t *ip, + u_int16_t *port, char **ad_beg_p, char **ad_end_p) +/* tries to get the ip_addr and port out of a dcc command + return value: -1 on failure, 0 on success + data pointer to first byte of DCC command data + data_end pointer to last byte of dcc command data + ip returns parsed ip of dcc command + port returns parsed port of dcc command + ad_beg_p returns pointer to first byte of addr data + ad_end_p returns pointer to last byte of addr data */ +{ + + /* at least 12: "AAAAAAAA P\1\n" */ + while (*data++ != ' ') + if (data > data_end - 12) + return -1; + + *ad_beg_p = data; + *ip = simple_strtoul(data, &data, 10); + + /* skip blanks between ip and port */ + while (*data == ' ') { + if (data >= data_end) + return -1; + data++; + } + + *port = simple_strtoul(data, &data, 10); + *ad_end_p = data; + + return 0; +} + +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff; + struct tcphdr _tcph, *th; + char *data, *data_limit, *ib_ptr; + int dir = CTINFO2DIR(ctinfo); + struct ip_conntrack_expect *exp; + u32 seq; + u_int32_t dcc_ip; + u_int16_t dcc_port; + int i, ret = NF_ACCEPT; + char *addr_beg_p, *addr_end_p; + + DEBUGP("entered\n"); + + /* If packet is coming from IRC server */ + if (dir == IP_CT_DIR_REPLY) + return NF_ACCEPT; + + /* Until there's been traffic both ways, don't look in packets. */ + if (ctinfo != IP_CT_ESTABLISHED + && ctinfo != IP_CT_ESTABLISHED + IP_CT_IS_REPLY) { + DEBUGP("Conntrackinfo = %u\n", ctinfo); + return NF_ACCEPT; + } + + /* Not a full tcp header? */ + th = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) + return NF_ACCEPT; + + /* No data? 
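+	   (i.e. the segment is TCP header only, with no payload to scan
+	   for DCC commands)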
*/ + dataoff = (*pskb)->nh.iph->ihl*4 + th->doff*4; + if (dataoff >= (*pskb)->len) + return NF_ACCEPT; + + LOCK_BH(&irc_buffer_lock); + ib_ptr = skb_header_pointer(*pskb, dataoff, + (*pskb)->len - dataoff, irc_buffer); + BUG_ON(ib_ptr == NULL); + + data = ib_ptr; + data_limit = ib_ptr + (*pskb)->len - dataoff; + + /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 + * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ + while (data < (data_limit - (19 + MINMATCHLEN))) { + if (memcmp(data, "\1DCC ", 5)) { + data++; + continue; + } + + data += 5; + /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + + DEBUGP("DCC found in master %u.%u.%u.%u:%u %u.%u.%u.%u:%u...\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest)); + + for (i = 0; i < ARRAY_SIZE(dccprotos); i++) { + if (memcmp(data, dccprotos[i], strlen(dccprotos[i]))) { + /* no match */ + continue; + } + + DEBUGP("DCC %s detected\n", dccprotos[i]); + data += strlen(dccprotos[i]); + /* we have at least + * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * data left (== 14/13 bytes) */ + if (parse_dcc((char *)data, data_limit, &dcc_ip, + &dcc_port, &addr_beg_p, &addr_end_p)) { + /* unable to parse */ + DEBUGP("unable to parse dcc command\n"); + continue; + } + DEBUGP("DCC bound ip/port: %u.%u.%u.%u:%u\n", + HIPQUAD(dcc_ip), dcc_port); + + /* dcc_ip can be the internal OR external (NAT'ed) IP + * Tiago Sousa */ + if (ct->tuplehash[dir].tuple.src.ip != htonl(dcc_ip) + && ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip != htonl(dcc_ip)) { + if (net_ratelimit()) + printk(KERN_WARNING + "Forged DCC command from " + "%u.%u.%u.%u: %u.%u.%u.%u:%u\n", + NIPQUAD(ct->tuplehash[dir].tuple.src.ip), + HIPQUAD(dcc_ip), dcc_port); + + continue; + } + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) { + ret = NF_DROP; + goto out; + } + + /* save position of address in dcc string, + * necessary for NAT */ + DEBUGP("tcph->seq = %u\n", th->seq); + seq = ntohl(th->seq) + (addr_beg_p - ib_ptr); + + /* We refer to the reverse direction ("!dir") + * tuples here, because we're expecting + * something in the other * direction. + * Doesn't matter unless NAT is happening. */ + exp->tuple = ((struct ip_conntrack_tuple) + { { 0, { 0 } }, + { ct->tuplehash[!dir].tuple.dst.ip, + { .tcp = { htons(dcc_port) } }, + IPPROTO_TCP }}); + exp->mask = ((struct ip_conntrack_tuple) + { { 0, { 0 } }, + { 0xFFFFFFFF, { .tcp = { 0xFFFF } }, 0xFF }}); + exp->expectfn = NULL; + exp->master = ct; + if (ip_nat_irc_hook) + ret = ip_nat_irc_hook(pskb, ctinfo, + addr_beg_p - ib_ptr, + addr_end_p - addr_beg_p, + exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + goto out; + } /* for .. NUM_DCCPROTO */ + } /* while data < ... 
*/ + + out: + UNLOCK_BH(&irc_buffer_lock); + return ret; +} + +static struct ip_conntrack_helper irc_helpers[MAX_PORTS]; +static char irc_names[MAX_PORTS][10]; + +static void fini(void); + +static int __init init(void) +{ + int i, ret; + struct ip_conntrack_helper *hlpr; + char *tmpname; + + if (max_dcc_channels < 1) { + printk("ip_conntrack_irc: max_dcc_channels must be a positive integer\n"); + return -EBUSY; + } + if (dcc_timeout < 0) { + printk("ip_conntrack_irc: dcc_timeout must be a positive integer\n"); + return -EBUSY; + } + + /* If no port given, default to standard irc port */ + if (ports_c == 0) + ports[ports_c++] = IRC_PORT; + + for (i = 0; i < ports_c; i++) { + hlpr = &irc_helpers[i]; + hlpr->tuple.src.u.tcp.port = htons(ports[i]); + hlpr->tuple.dst.protonum = IPPROTO_TCP; + hlpr->mask.src.u.tcp.port = 0xFFFF; + hlpr->mask.dst.protonum = 0xFF; + hlpr->max_expected = max_dcc_channels; + hlpr->timeout = dcc_timeout; + hlpr->me = THIS_MODULE; + hlpr->help = help; + + tmpname = &irc_names[i][0]; + if (ports[i] == IRC_PORT) + sprintf(tmpname, "irc"); + else + sprintf(tmpname, "irc-%d", i); + hlpr->name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = ip_conntrack_helper_register(hlpr); + + if (ret) { + printk("ip_conntrack_irc: ERROR registering port %d\n", + ports[i]); + fini(); + return -EBUSY; + } + } + return 0; +} + +/* This function is intentionally _NOT_ defined as __exit, because + * it is needed by the init function */ +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&irc_helpers[i]); + } +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_generic.c b/net/ipv4/netfilter/ip_conntrack_proto_generic.c new file mode 100644 index 000000000000..88c3712bd251 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_generic.c @@ -0,0 +1,75 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +unsigned long ip_ct_generic_timeout = 600*HZ; + +static int generic_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +static int generic_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.all = 0; + tuple->dst.u.all = 0; + + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int generic_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return 0; +} + +/* Print out the private part of the conntrack. */ +static int generic_print_conntrack(struct seq_file *s, + const struct ip_conntrack *state) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_generic_timeout); + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
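+   Returning 1 accepts the new connection; the generic protocol keeps
+   no per-connection state, so it always does.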
*/ +static int new(struct ip_conntrack *conntrack, const struct sk_buff *skb) +{ + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_generic_protocol = +{ + .proto = 0, + .name = "unknown", + .pkt_to_tuple = generic_pkt_to_tuple, + .invert_tuple = generic_invert_tuple, + .print_tuple = generic_print_tuple, + .print_conntrack = generic_print_conntrack, + .packet = packet, + .new = new, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c new file mode 100644 index 000000000000..602c74db3252 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -0,0 +1,279 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long ip_ct_icmp_timeout = 30*HZ; + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int icmp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->dst.u.icmp.type = hp->type; + tuple->src.u.icmp.id = hp->un.echo.id; + tuple->dst.u.icmp.code = hp->code; + + return 1; +} + +static int icmp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + /* Add 1; spaces filled with 0. */ + static u_int8_t invmap[] + = { [ICMP_ECHO] = ICMP_ECHOREPLY + 1, + [ICMP_ECHOREPLY] = ICMP_ECHO + 1, + [ICMP_TIMESTAMP] = ICMP_TIMESTAMPREPLY + 1, + [ICMP_TIMESTAMPREPLY] = ICMP_TIMESTAMP + 1, + [ICMP_INFO_REQUEST] = ICMP_INFO_REPLY + 1, + [ICMP_INFO_REPLY] = ICMP_INFO_REQUEST + 1, + [ICMP_ADDRESS] = ICMP_ADDRESSREPLY + 1, + [ICMP_ADDRESSREPLY] = ICMP_ADDRESS + 1}; + + if (orig->dst.u.icmp.type >= sizeof(invmap) + || !invmap[orig->dst.u.icmp.type]) + return 0; + + tuple->src.u.icmp.id = orig->src.u.icmp.id; + tuple->dst.u.icmp.type = invmap[orig->dst.u.icmp.type] - 1; + tuple->dst.u.icmp.code = orig->dst.u.icmp.code; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int icmp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "type=%u code=%u id=%u ", + tuple->dst.u.icmp.type, + tuple->dst.u.icmp.code, + ntohs(tuple->src.u.icmp.id)); +} + +/* Print out the private part of the conntrack. */ +static int icmp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, or -1 for invalid. 
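+   ("-1" meaning a negative verdict such as -NF_ACCEPT, which
+   ip_conntrack_in() negates before handing it to the netfilter core.)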
*/ +static int icmp_packet(struct ip_conntrack *ct, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + /* Try to delete connection immediately after all replies: + won't actually vanish as we still have skb, and del_timer + means this will only run once even if count hits zero twice + (theoretically possible with SMP) */ + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY) { + if (atomic_dec_and_test(&ct->proto.icmp.count) + && del_timer(&ct->timeout)) + ct->timeout.function((unsigned long)ct); + } else { + atomic_inc(&ct->proto.icmp.count); + ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_icmp_timeout); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int icmp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + static u_int8_t valid_new[] + = { [ICMP_ECHO] = 1, + [ICMP_TIMESTAMP] = 1, + [ICMP_INFO_REQUEST] = 1, + [ICMP_ADDRESS] = 1 }; + + if (conntrack->tuplehash[0].tuple.dst.u.icmp.type >= sizeof(valid_new) + || !valid_new[conntrack->tuplehash[0].tuple.dst.u.icmp.type]) { + /* Can't create a new ICMP `conn' with this. */ + DEBUGP("icmp: can't create new conn with type %u\n", + conntrack->tuplehash[0].tuple.dst.u.icmp.type); + DUMP_TUPLE(&conntrack->tuplehash[0].tuple); + return 0; + } + atomic_set(&conntrack->proto.icmp.count, 0); + return 1; +} + +static int +icmp_error_message(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct ip_conntrack_tuple innertuple, origtuple; + struct { + struct icmphdr icmp; + struct iphdr ip; + } _in, *inside; + struct ip_conntrack_protocol *innerproto; + struct ip_conntrack_tuple_hash *h; + int dataoff; + + IP_NF_ASSERT(skb->nfct == NULL); + + /* Not enough header? */ + inside = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_in), &_in); + if (inside == NULL) + return NF_ACCEPT; + + /* Ignore ICMP's containing fragments (shouldn't happen) */ + if (inside->ip.frag_off & htons(IP_OFFSET)) { + DEBUGP("icmp_error_track: fragment of proto %u\n", + inside->ip.protocol); + return NF_ACCEPT; + } + + innerproto = ip_ct_find_proto(inside->ip.protocol); + dataoff = skb->nh.iph->ihl*4 + sizeof(inside->icmp) + inside->ip.ihl*4; + /* Are they talking about one of our connections? */ + if (!ip_ct_get_tuple(&inside->ip, skb, dataoff, &origtuple, innerproto)) { + DEBUGP("icmp_error: ! get_tuple p=%u", inside->ip.protocol); + return NF_ACCEPT; + } + + /* Ordinarily, we'd expect the inverted tupleproto, but it's + been preserved inside the ICMP. 
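+	   That is, the header quoted inside the ICMP error belongs to the
+	   offending packet itself, so origtuple above is the original
+	   direction tuple and we invert it ourselves below before looking
+	   up the connection.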
*/ + if (!ip_ct_invert_tuple(&innertuple, &origtuple, innerproto)) { + DEBUGP("icmp_error_track: Can't invert tuple\n"); + return NF_ACCEPT; + } + + *ctinfo = IP_CT_RELATED; + + h = ip_conntrack_find_get(&innertuple, NULL); + if (!h) { + /* Locally generated ICMPs will match inverted if they + haven't been SNAT'ed yet */ + /* FIXME: NAT code has to handle half-done double NAT --RR */ + if (hooknum == NF_IP_LOCAL_OUT) + h = ip_conntrack_find_get(&origtuple, NULL); + + if (!h) { + DEBUGP("icmp_error_track: no match\n"); + return NF_ACCEPT; + } + /* Reverse direction from that found */ + if (DIRECTION(h) != IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } else { + if (DIRECTION(h) == IP_CT_DIR_REPLY) + *ctinfo += IP_CT_IS_REPLY; + } + + /* Update skb to refer to this connection */ + skb->nfct = &tuplehash_to_ctrack(h)->ct_general; + skb->nfctinfo = *ctinfo; + return -NF_ACCEPT; +} + +/* Small and modified version of icmp_rcv */ +static int +icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct icmphdr _ih, *icmph; + + /* Not enough header? */ + icmph = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_ih), &_ih); + if (icmph == NULL) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: short packet "); + return -NF_ACCEPT; + } + + /* See ip_conntrack_proto_tcp.c */ + if (hooknum != NF_IP_PRE_ROUTING) + goto checksum_skipped; + + switch (skb->ip_summed) { + case CHECKSUM_HW: + if (!(u16)csum_fold(skb->csum)) + break; + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad HW ICMP checksum "); + return -NF_ACCEPT; + case CHECKSUM_NONE: + if ((u16)csum_fold(skb_checksum(skb, 0, skb->len, 0))) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: bad ICMP checksum "); + return -NF_ACCEPT; + } + default: + break; + } + +checksum_skipped: + /* + * 18 is the highest 'known' ICMP type. Anything else is a mystery + * + * RFC 1122: 3.2.2 Unknown ICMP messages types MUST be silently + * discarded. + */ + if (icmph->type > NR_ICMP_TYPES) { + if (LOG_INVALID(IPPROTO_ICMP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_icmp: invalid ICMP type "); + return -NF_ACCEPT; + } + + /* Need to track icmp error message? */ + if (icmph->type != ICMP_DEST_UNREACH + && icmph->type != ICMP_SOURCE_QUENCH + && icmph->type != ICMP_TIME_EXCEEDED + && icmph->type != ICMP_PARAMETERPROB + && icmph->type != ICMP_REDIRECT) + return NF_ACCEPT; + + return icmp_error_message(skb, ctinfo, hooknum); +} + +struct ip_conntrack_protocol ip_conntrack_protocol_icmp = +{ + .proto = IPPROTO_ICMP, + .name = "icmp", + .pkt_to_tuple = icmp_pkt_to_tuple, + .invert_tuple = icmp_invert_tuple, + .print_tuple = icmp_print_tuple, + .print_conntrack = icmp_print_conntrack, + .packet = icmp_packet, + .new = icmp_new, + .error = icmp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_sctp.c b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c new file mode 100644 index 000000000000..ff8c34a860ff --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_sctp.c @@ -0,0 +1,649 @@ +/* + * Connection tracking protocol helper module for SCTP. + * + * SCTP is defined in RFC 2960. References to various sections in this code + * are to this RFC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +/* + * Added support for proc manipulation of timeouts. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#if 0 +#define DEBUGP(format, ...) printk(format, ## __VA_ARGS__) +#else +#define DEBUGP(format, args...) +#endif + +/* Protects conntrack->proto.sctp */ +static DECLARE_RWLOCK(sctp_lock); + +/* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR + + And so for me for SCTP :D -Kiran */ + +static const char *sctp_conntrack_names[] = { + "NONE", + "CLOSED", + "COOKIE_WAIT", + "COOKIE_ECHOED", + "ESTABLISHED", + "SHUTDOWN_SENT", + "SHUTDOWN_RECD", + "SHUTDOWN_ACK_SENT", +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +static unsigned long ip_ct_sctp_timeout_closed = 10 SECS; +static unsigned long ip_ct_sctp_timeout_cookie_wait = 3 SECS; +static unsigned long ip_ct_sctp_timeout_cookie_echoed = 3 SECS; +static unsigned long ip_ct_sctp_timeout_established = 5 DAYS; +static unsigned long ip_ct_sctp_timeout_shutdown_sent = 300 SECS / 1000; +static unsigned long ip_ct_sctp_timeout_shutdown_recd = 300 SECS / 1000; +static unsigned long ip_ct_sctp_timeout_shutdown_ack_sent = 3 SECS; + +static unsigned long * sctp_timeouts[] += { NULL, /* SCTP_CONNTRACK_NONE */ + &ip_ct_sctp_timeout_closed, /* SCTP_CONNTRACK_CLOSED */ + &ip_ct_sctp_timeout_cookie_wait, /* SCTP_CONNTRACK_COOKIE_WAIT */ + &ip_ct_sctp_timeout_cookie_echoed, /* SCTP_CONNTRACK_COOKIE_ECHOED */ + &ip_ct_sctp_timeout_established, /* SCTP_CONNTRACK_ESTABLISHED */ + &ip_ct_sctp_timeout_shutdown_sent, /* SCTP_CONNTRACK_SHUTDOWN_SENT */ + &ip_ct_sctp_timeout_shutdown_recd, /* SCTP_CONNTRACK_SHUTDOWN_RECD */ + &ip_ct_sctp_timeout_shutdown_ack_sent /* SCTP_CONNTRACK_SHUTDOWN_ACK_SENT */ + }; + +#define sNO SCTP_CONNTRACK_NONE +#define sCL SCTP_CONNTRACK_CLOSED +#define sCW SCTP_CONNTRACK_COOKIE_WAIT +#define sCE SCTP_CONNTRACK_COOKIE_ECHOED +#define sES SCTP_CONNTRACK_ESTABLISHED +#define sSS SCTP_CONNTRACK_SHUTDOWN_SENT +#define sSR SCTP_CONNTRACK_SHUTDOWN_RECD +#define sSA SCTP_CONNTRACK_SHUTDOWN_ACK_SENT +#define sIV SCTP_CONNTRACK_MAX + +/* + These are the descriptions of the states: + +NOTE: These state names are tantalizingly similar to the states of an +SCTP endpoint. But the interpretation of the states is a little different, +considering that these are the states of the connection and not of an end +point. Please note the subtleties. -Kiran + +NONE - Nothing so far. +COOKIE WAIT - We have seen an INIT chunk in the original direction, or also + an INIT_ACK chunk in the reply direction. +COOKIE ECHOED - We have seen a COOKIE_ECHO chunk in the original direction. +ESTABLISHED - We have seen a COOKIE_ACK in the reply direction. +SHUTDOWN_SENT - We have seen a SHUTDOWN chunk in the original direction. +SHUTDOWN_RECD - We have seen a SHUTDOWN chunk in the reply directoin. +SHUTDOWN_ACK_SENT - We have seen a SHUTDOWN_ACK chunk in the direction opposite + to that of the SHUTDOWN chunk. +CLOSED - We have seen a SHUTDOWN_COMPLETE chunk in the direction of + the SHUTDOWN chunk. Connection is closed. +*/ + +/* TODO + - I have assumed that the first INIT is in the original direction. + This messes things when an INIT comes in the reply direction in CLOSED + state. + - Check the error type in the reply dir before transitioning from +cookie echoed to closed. + - Sec 5.2.4 of RFC 2960 + - Multi Homing support. 
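[Editorial note] The timeout definitions above lean on plain textual macro expansion: "10 SECS" expands to "10 * HZ", and "300 SECS / 1000" therefore works out to 300 milliseconds for the two SHUTDOWN states. A tiny standalone illustration; HZ is fixed at 100 here purely so the printout has concrete numbers (the real value is a kernel configuration detail):

/* Editor's sketch of the "10 SECS" macro idiom; HZ = 100 is an assumption. */
#include <stdio.h>

#define HZ    100
#define SECS  * HZ
#define MINS  * 60 SECS
#define HOURS * 60 MINS
#define DAYS  * 24 HOURS

int main(void)
{
	printf("10 SECS         = %lu jiffies\n", (unsigned long)(10 SECS));
	printf("5 DAYS          = %lu jiffies\n", (unsigned long)(5 DAYS));
	printf("300 SECS / 1000 = %lu jiffies (300 ms)\n",
	       (unsigned long)(300 SECS / 1000));
	return 0;
}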
+*/ + +/* SCTP conntrack state transitions */ +static enum sctp_conntrack sctp_conntracks[2][9][SCTP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA}, +/* init_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sCL, sCL, sCW, sCE, sSS, sSS, sSR, sSA}, +/* shutdown_ack */ {sSA, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant have Stale cookie*/ +/* cookie_echo */ {sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA},/* 5.2.4 - Big TODO */ +/* cookie_ack */ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in orig dir */ +/* shutdown_comp*/ {sCL, sCL, sCW, sCE, sES, sSS, sSR, sCL} + }, + { +/* REPLY */ +/* sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA */ +/* init */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* INIT in sCL Big TODO */ +/* init_ack */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA}, +/* abort */ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, +/* shutdown */ {sIV, sCL, sCW, sCE, sSR, sSS, sSR, sSA}, +/* shutdown_ack */ {sIV, sCL, sCW, sCE, sES, sSA, sSA, sSA}, +/* error */ {sIV, sCL, sCW, sCL, sES, sSS, sSR, sSA}, +/* cookie_echo */ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA},/* Cant come in reply dir */ +/* cookie_ack */ {sIV, sCL, sCW, sES, sES, sSS, sSR, sSA}, +/* shutdown_comp*/ {sIV, sCL, sCW, sCE, sES, sSS, sSR, sCL} + } +}; + +static int sctp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + sctp_sctphdr_t _hdr, *hp; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.sctp.port = hp->source; + tuple->dst.u.sctp.port = hp->dest; + return 1; +} + +static int sctp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + tuple->src.u.sctp.port = orig->dst.u.sctp.port; + tuple->dst.u.sctp.port = orig->src.u.sctp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int sctp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.sctp.port), + ntohs(tuple->dst.u.sctp.port)); +} + +/* Print out the private part of the conntrack. 
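[Editorial note] The sctp_conntracks[direction][chunk][state] table above is the whole state machine; tracing a normal association setup through it goes NONE -> COOKIE_WAIT (INIT) -> COOKIE_WAIT (INIT ACK) -> COOKIE_ECHOED (COOKIE ECHO) -> ESTABLISHED (COOKIE ACK). A standalone walk over an abridged copy of the table (only the four rows exercised below; entries copied verbatim from the patch):

/* Editor's sketch: driving an abridged sctp_conntracks[] through a setup. */
#include <stdio.h>

enum { sNO, sCL, sCW, sCE, sES, sSS, sSR, sSA, sIV };   /* sIV = invalid */
enum { INIT, INIT_ACK, COOKIE_ECHO, COOKIE_ACK };       /* abridged row ids */
enum { ORIG = 0, REPLY = 1 };

static const char *names[] = { "NONE", "CLOSED", "COOKIE_WAIT",
	"COOKIE_ECHOED", "ESTABLISHED", "SHUTDOWN_SENT", "SHUTDOWN_RECD",
	"SHUTDOWN_ACK_SENT" };

static const int table[2][4][8] = {
	[ORIG] = {
		[INIT]        = { sCW, sCW, sCW, sCE, sES, sSS, sSR, sSA },
		[COOKIE_ECHO] = { sCL, sCL, sCE, sCE, sES, sSS, sSR, sSA },
	},
	[REPLY] = {
		[INIT_ACK]    = { sIV, sCL, sCW, sCE, sES, sSS, sSR, sSA },
		[COOKIE_ACK]  = { sIV, sCL, sCW, sES, sES, sSS, sSR, sSA },
	},
};

int main(void)
{
	struct { int dir, chunk; const char *what; } assoc[] = {
		{ ORIG, INIT, "INIT" },               { REPLY, INIT_ACK, "INIT ACK" },
		{ ORIG, COOKIE_ECHO, "COOKIE ECHO" }, { REPLY, COOKIE_ACK, "COOKIE ACK" },
	};
	int state = sNO, i;   /* sctp_new() evaluates the first chunk from NONE */

	for (i = 0; i < 4; i++) {
		state = table[assoc[i].dir][assoc[i].chunk][state];
		if (state == sIV) {
			printf("%s: invalid in this state\n", assoc[i].what);
			break;
		}
		printf("%-12s -> %s\n", assoc[i].what, names[state]);
	}
	return 0;
}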
*/ +static int sctp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + enum sctp_conntrack state; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + READ_LOCK(&sctp_lock); + state = conntrack->proto.sctp.state; + READ_UNLOCK(&sctp_lock); + + return seq_printf(s, "%s ", sctp_conntrack_names[state]); +} + +#define for_each_sctp_chunk(skb, sch, _sch, offset, count) \ +for (offset = skb->nh.iph->ihl * 4 + sizeof(sctp_sctphdr_t), count = 0; \ + offset < skb->len && \ + (sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch)); \ + offset += (htons(sch->length) + 3) & ~3, count++) + +/* Some validity checks to make sure the chunks are fine */ +static int do_basic_checks(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + char *map) +{ + u_int32_t offset, count; + sctp_chunkhdr_t _sch, *sch; + int flag; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + flag = 0; + + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + DEBUGP("Chunk Num: %d Type: %d\n", count, sch->type); + + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK + || sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + flag = 1; + } + + /* Cookie Ack/Echo chunks not the first OR + Init / Init Ack / Shutdown compl chunks not the only chunks */ + if ((sch->type == SCTP_CID_COOKIE_ACK + || sch->type == SCTP_CID_COOKIE_ECHO + || flag) + && count !=0 ) { + DEBUGP("Basic checks failed\n"); + return 1; + } + + if (map) { + set_bit(sch->type, (void *)map); + } + } + + DEBUGP("Basic checks passed\n"); + return 0; +} + +static int new_state(enum ip_conntrack_dir dir, + enum sctp_conntrack cur_state, + int chunk_type) +{ + int i; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + DEBUGP("Chunk type: %d\n", chunk_type); + + switch (chunk_type) { + case SCTP_CID_INIT: + DEBUGP("SCTP_CID_INIT\n"); + i = 0; break; + case SCTP_CID_INIT_ACK: + DEBUGP("SCTP_CID_INIT_ACK\n"); + i = 1; break; + case SCTP_CID_ABORT: + DEBUGP("SCTP_CID_ABORT\n"); + i = 2; break; + case SCTP_CID_SHUTDOWN: + DEBUGP("SCTP_CID_SHUTDOWN\n"); + i = 3; break; + case SCTP_CID_SHUTDOWN_ACK: + DEBUGP("SCTP_CID_SHUTDOWN_ACK\n"); + i = 4; break; + case SCTP_CID_ERROR: + DEBUGP("SCTP_CID_ERROR\n"); + i = 5; break; + case SCTP_CID_COOKIE_ECHO: + DEBUGP("SCTP_CID_COOKIE_ECHO\n"); + i = 6; break; + case SCTP_CID_COOKIE_ACK: + DEBUGP("SCTP_CID_COOKIE_ACK\n"); + i = 7; break; + case SCTP_CID_SHUTDOWN_COMPLETE: + DEBUGP("SCTP_CID_SHUTDOWN_COMPLETE\n"); + i = 8; break; + default: + /* Other chunks like DATA, SACK, HEARTBEAT and + its ACK do not cause a change in state */ + DEBUGP("Unknown chunk type, Will stay in %s\n", + sctp_conntrack_names[cur_state]); + return cur_state; + } + + DEBUGP("dir: %d cur_state: %s chunk_type: %d new_state: %s\n", + dir, sctp_conntrack_names[cur_state], chunk_type, + sctp_conntrack_names[sctp_conntracks[dir][i][cur_state]]); + + return sctp_conntracks[dir][i][cur_state]; +} + +/* Returns verdict for packet, or -1 for invalid. 
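[Editorial note] for_each_sctp_chunk() above steps through the packet by rounding each chunk's declared length up to the 4-byte padding boundary SCTP requires; the chunk length is big-endian on the wire. A standalone sketch of the same walk over a byte buffer (the chunk contents are fabricated):

/* Editor's sketch of the chunk walk: type(1) | flags(1) | length(2), with the
 * next chunk starting at the length rounded up to a multiple of 4. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

struct chunkhdr { uint8_t type, flags; uint16_t length; };

static void walk_chunks(const uint8_t *buf, size_t len)
{
	size_t off = 0;
	int count = 0;

	while (off + sizeof(struct chunkhdr) <= len) {
		struct chunkhdr ch;
		uint16_t clen;

		memcpy(&ch, buf + off, sizeof(ch));
		clen = ntohs(ch.length);
		if (clen < sizeof(ch))
			break;          /* malformed: would loop forever */
		printf("chunk %d: type %u, %u bytes\n", count++, ch.type, clen);
		off += (clen + 3) & ~3u;    /* round up to the padding boundary */
	}
}

int main(void)
{
	/* A 5-byte chunk (padded on the wire to 8) followed by a 4-byte one. */
	uint8_t pkt[12] = { 1, 0, 0, 5, 0xaa, 0, 0, 0,  3, 0, 0, 4 };

	walk_chunks(pkt, sizeof(pkt));
	return 0;
}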
*/ +static int sctp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + enum sctp_conntrack newconntrack, oldsctpstate; + struct iphdr *iph = skb->nh.iph; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); + if (sh == NULL) + return -1; + + if (do_basic_checks(conntrack, skb, map) != 0) + return -1; + + /* Check the verification tag (Sec 8.5) */ + if (!test_bit(SCTP_CID_INIT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_COMPLETE, (void *)map) + && !test_bit(SCTP_CID_COOKIE_ECHO, (void *)map) + && !test_bit(SCTP_CID_ABORT, (void *)map) + && !test_bit(SCTP_CID_SHUTDOWN_ACK, (void *)map) + && (sh->vtag != conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + DEBUGP("Verification tag check failed\n"); + return -1; + } + + oldsctpstate = newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + WRITE_LOCK(&sctp_lock); + + /* Special cases of Verification tag check (Sec 8.5.1) */ + if (sch->type == SCTP_CID_INIT) { + /* Sec 8.5.1 (A) */ + if (sh->vtag != 0) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_ABORT) { + /* Sec 8.5.1 (B) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)])) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_SHUTDOWN_COMPLETE) { + /* Sec 8.5.1 (C) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)]) + && !(sh->vtag == conntrack->proto.sctp.vtag + [1 - CTINFO2DIR(ctinfo)] + && (sch->flags & 1))) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } else if (sch->type == SCTP_CID_COOKIE_ECHO) { + /* Sec 8.5.1 (D) */ + if (!(sh->vtag == conntrack->proto.sctp.vtag[CTINFO2DIR(ctinfo)])) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + } + + oldsctpstate = conntrack->proto.sctp.state; + newconntrack = new_state(CTINFO2DIR(ctinfo), oldsctpstate, sch->type); + + /* Invalid */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: Invalid dir=%i ctype=%u conntrack=%u\n", + CTINFO2DIR(ctinfo), sch->type, oldsctpstate); + WRITE_UNLOCK(&sctp_lock); + return -1; + } + + /* If it is an INIT or an INIT ACK note down the vtag */ + if (sch->type == SCTP_CID_INIT + || sch->type == SCTP_CID_INIT_ACK) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) { + WRITE_UNLOCK(&sctp_lock); + return -1; + } + DEBUGP("Setting vtag %x for dir %d\n", + ih->init_tag, !CTINFO2DIR(ctinfo)); + conntrack->proto.sctp.vtag[!CTINFO2DIR(ctinfo)] = ih->init_tag; + } + + conntrack->proto.sctp.state = newconntrack; + WRITE_UNLOCK(&sctp_lock); + } + + ip_ct_refresh_acct(conntrack, ctinfo, skb, *sctp_timeouts[newconntrack]); + + if (oldsctpstate == SCTP_CONNTRACK_COOKIE_ECHOED + && CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY + && newconntrack == SCTP_CONNTRACK_ESTABLISHED) { + DEBUGP("Setting assured bit\n"); + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
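[Editorial note] sctp_packet() above enforces the RFC 2960 verification-tag rules: Sec 8.5 for ordinary chunks and the Sec 8.5.1 exceptions for INIT, ABORT, SHUTDOWN COMPLETE and COOKIE ECHO, while SHUTDOWN ACK is simply exempted from the generic check. A condensed standalone predicate; the function name and the flat vtag[2] argument are my own, not kernel API:

/* Editor's sketch of the verification-tag rules applied inline above.
 * vtag[dir] is the tag expected from direction dir. */
#include <stdint.h>
#include <stdio.h>

enum chunk { CID_DATA = 0, CID_INIT = 1, CID_ABORT = 6, CID_SHUTDOWN_ACK = 8,
	     CID_COOKIE_ECHO = 10, CID_SHUTDOWN_COMPLETE = 14 };

static int vtag_ok(enum chunk type, uint8_t flags, uint32_t tag,
		   const uint32_t vtag[2], int dir)
{
	switch (type) {
	case CID_INIT:                /* 8.5.1 (A): INIT must carry tag 0 */
		return tag == 0;
	case CID_ABORT:               /* 8.5.1 (B): either direction's tag */
		return tag == vtag[dir] || tag == vtag[1 - dir];
	case CID_SHUTDOWN_COMPLETE:   /* 8.5.1 (C): peer tag only with T bit */
		return tag == vtag[dir] ||
		       (tag == vtag[1 - dir] && (flags & 1));
	case CID_SHUTDOWN_ACK:        /* exempted from the generic check above */
		return 1;
	case CID_COOKIE_ECHO:         /* 8.5.1 (D) */
	default:                      /* 8.5: ordinary chunks (DATA, SACK, ...) */
		return tag == vtag[dir];
	}
}

int main(void)
{
	const uint32_t vtag[2] = { 0x11111111, 0x22222222 };

	printf("INIT with tag 0:      %d\n", vtag_ok(CID_INIT, 0, 0, vtag, 0));
	printf("DATA with wrong tag:  %d\n", vtag_ok(CID_DATA, 0, 0xdead, vtag, 0));
	printf("ABORT with reply tag: %d\n",
	       vtag_ok(CID_ABORT, 0, 0x22222222, vtag, 0));
	return 0;
}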
*/ +static int sctp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum sctp_conntrack newconntrack; + struct iphdr *iph = skb->nh.iph; + sctp_sctphdr_t _sctph, *sh; + sctp_chunkhdr_t _sch, *sch; + u_int32_t offset, count; + char map[256 / sizeof (char)] = {0}; + + DEBUGP(__FUNCTION__); + DEBUGP("\n"); + + sh = skb_header_pointer(skb, iph->ihl * 4, sizeof(_sctph), &_sctph); + if (sh == NULL) + return 0; + + if (do_basic_checks(conntrack, skb, map) != 0) + return 0; + + /* If an OOTB packet has any of these chunks discard (Sec 8.4) */ + if ((test_bit (SCTP_CID_ABORT, (void *)map)) + || (test_bit (SCTP_CID_SHUTDOWN_COMPLETE, (void *)map)) + || (test_bit (SCTP_CID_COOKIE_ACK, (void *)map))) { + return 0; + } + + newconntrack = SCTP_CONNTRACK_MAX; + for_each_sctp_chunk (skb, sch, _sch, offset, count) { + /* Don't need lock here: this conntrack not in circulation yet */ + newconntrack = new_state (IP_CT_DIR_ORIGINAL, + SCTP_CONNTRACK_NONE, sch->type); + + /* Invalid: delete conntrack */ + if (newconntrack == SCTP_CONNTRACK_MAX) { + DEBUGP("ip_conntrack_sctp: invalid new deleting.\n"); + return 0; + } + + /* Copy the vtag into the state info */ + if (sch->type == SCTP_CID_INIT) { + if (sh->vtag == 0) { + sctp_inithdr_t _inithdr, *ih; + + ih = skb_header_pointer(skb, offset + sizeof(sctp_chunkhdr_t), + sizeof(_inithdr), &_inithdr); + if (ih == NULL) + return 0; + + DEBUGP("Setting vtag %x for new conn\n", + ih->init_tag); + + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = + ih->init_tag; + } else { + /* Sec 8.5.1 (A) */ + return 0; + } + } + /* If it is a shutdown ack OOTB packet, we expect a return + shutdown complete, otherwise an ABORT Sec 8.4 (5) and (8) */ + else { + DEBUGP("Setting vtag %x for new conn OOTB\n", + sh->vtag); + conntrack->proto.sctp.vtag[IP_CT_DIR_REPLY] = sh->vtag; + } + + conntrack->proto.sctp.state = newconntrack; + } + + return 1; +} + +static struct ip_conntrack_protocol ip_conntrack_protocol_sctp = { + .proto = IPPROTO_SCTP, + .name = "sctp", + .pkt_to_tuple = sctp_pkt_to_tuple, + .invert_tuple = sctp_invert_tuple, + .print_tuple = sctp_print_tuple, + .print_conntrack = sctp_print_conntrack, + .packet = sctp_packet, + .new = sctp_new, + .destroy = NULL, + .me = THIS_MODULE +}; + +#ifdef CONFIG_SYSCTL +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_CLOSED, + .procname = "ip_conntrack_sctp_timeout_closed", + .data = &ip_ct_sctp_timeout_closed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_WAIT, + .procname = "ip_conntrack_sctp_timeout_cookie_wait", + .data = &ip_ct_sctp_timeout_cookie_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_COOKIE_ECHOED, + .procname = "ip_conntrack_sctp_timeout_cookie_echoed", + .data = &ip_ct_sctp_timeout_cookie_echoed, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_sctp_timeout_established", + .data = &ip_ct_sctp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_sent", + .data = &ip_ct_sctp_timeout_shutdown_sent, + .maxlen = 
sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD, + .procname = "ip_conntrack_sctp_timeout_shutdown_recd", + .data = &ip_ct_sctp_timeout_shutdown_recd, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT, + .procname = "ip_conntrack_sctp_timeout_shutdown_ack_sent", + .data = &ip_ct_sctp_timeout_shutdown_ack_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ip_ct_sysctl_header; +#endif + +static int __init init(void) +{ + int ret; + + ret = ip_conntrack_protocol_register(&ip_conntrack_protocol_sctp); + if (ret) { + printk("ip_conntrack_proto_sctp: protocol register failed\n"); + goto out; + } + +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + ret = -ENOMEM; + printk("ip_conntrack_proto_sctp: can't register to sysctl.\n"); + goto cleanup; + } +#endif + + return ret; + +#ifdef CONFIG_SYSCTL + cleanup: + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#endif + out: + DEBUGP("SCTP conntrack module loading %s\n", + ret ? "failed": "succeeded"); + return ret; +} + +static void __exit fini(void) +{ + ip_conntrack_protocol_unregister(&ip_conntrack_protocol_sctp); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); +#endif + DEBUGP("SCTP conntrack module unloaded\n"); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Netfilter connection tracking protocol helper for SCTP"); diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c new file mode 100644 index 000000000000..e800b16fc920 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -0,0 +1,1098 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Jozsef Kadlecsik : + * - Real stateful connection tracking + * - Modified state transitions table + * - Window scaling support added + * - SACK support added + * + * Willy Tarreau: + * - State table bugfixes + * - More robust state changes + * - Tuning timer parameters + * + * version 2.2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#define DEBUGP_VARS +#else +#define DEBUGP(format, args...) 
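[Editorial note] The nested ctl_table chain above (net -> ipv4 -> netfilter -> the per-state entries) surfaces the SCTP timeouts as files under /proc/sys/net/ipv4/netfilter/, and proc_dointvec_jiffies means userspace reads and writes them in seconds. A minimal reader, assuming CONFIG_SYSCTL is enabled and the module is loaded so the path actually exists:

/* Editor's sketch: reading one timeout registered by the tables above. */
#include <stdio.h>

int main(void)
{
	const char *path =
	    "/proc/sys/net/ipv4/netfilter/ip_conntrack_sctp_timeout_established";
	FILE *f = fopen(path, "r");
	unsigned long secs;

	if (!f || fscanf(f, "%lu", &secs) != 1) {
		perror(path);
		return 1;
	}
	printf("established timeout: %lu s (default is 5 days)\n", secs);
	fclose(f);
	return 0;
}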
+#endif + +/* Protects conntrack->proto.tcp */ +static DECLARE_RWLOCK(tcp_lock); + +/* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ +int ip_ct_tcp_be_liberal = 0; + +/* When connection is picked up from the middle, how many packets are required + to pass in each direction when we assume we are in sync - if any side uses + window scaling, we lost the game. + If it is set to zero, we disable picking up already established + connections. */ +int ip_ct_tcp_loose = 3; + +/* Max number of the retransmitted packets without receiving an (acceptable) + ACK from the destination. If this number is reached, a shorter timer + will be started. */ +int ip_ct_tcp_max_retrans = 3; + + /* FIXME: Examine ipfilter's timeouts and conntrack transitions more + closely. They're more complex. --RR */ + +static const char *tcp_conntrack_names[] = { + "NONE", + "SYN_SENT", + "SYN_RECV", + "ESTABLISHED", + "FIN_WAIT", + "CLOSE_WAIT", + "LAST_ACK", + "TIME_WAIT", + "CLOSE", + "LISTEN" +}; + +#define SECS * HZ +#define MINS * 60 SECS +#define HOURS * 60 MINS +#define DAYS * 24 HOURS + +unsigned long ip_ct_tcp_timeout_syn_sent = 2 MINS; +unsigned long ip_ct_tcp_timeout_syn_recv = 60 SECS; +unsigned long ip_ct_tcp_timeout_established = 5 DAYS; +unsigned long ip_ct_tcp_timeout_fin_wait = 2 MINS; +unsigned long ip_ct_tcp_timeout_close_wait = 60 SECS; +unsigned long ip_ct_tcp_timeout_last_ack = 30 SECS; +unsigned long ip_ct_tcp_timeout_time_wait = 2 MINS; +unsigned long ip_ct_tcp_timeout_close = 10 SECS; + +/* RFC1122 says the R2 limit should be at least 100 seconds. + Linux uses 15 packets as limit, which corresponds + to ~13-30min depending on RTO. */ +unsigned long ip_ct_tcp_timeout_max_retrans = 5 MINS; + +static unsigned long * tcp_timeouts[] += { NULL, /* TCP_CONNTRACK_NONE */ + &ip_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ + &ip_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ + &ip_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ + &ip_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ + &ip_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ + &ip_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ + &ip_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ + &ip_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ + NULL, /* TCP_CONNTRACK_LISTEN */ + }; + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sLI TCP_CONNTRACK_LISTEN +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +/* What TCP flags are set from RST/SYN/FIN/ACK. */ +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. 
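[Editorial note] The tcp_conntracks[] table laid out just below encodes this guessing game; running the ordinary three-way handshake through an abridged copy of it (my own reduction, with the three needed rows copied verbatim) lands on SYN_SENT, SYN_RECV and then ESTABLISHED:

/* Editor's sketch: the three-way handshake against an abridged state table. */
#include <stdio.h>

enum { sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI, sIV, sIG };
enum { SYN_SET, SYNACK_SET, ACK_SET };          /* abridged tcp_bit_set */
enum { ORIG = 0, REPLY = 1 };

static const int table[2][3][10] = {
	[ORIG] = {
		[SYN_SET] = { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV },
		[ACK_SET] = { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
	},
	[REPLY] = {
		[SYNACK_SET] = { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV },
	},
};

static const char *names[] = { "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED",
	"FIN_WAIT", "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "LISTEN" };

int main(void)
{
	struct { int dir, idx; const char *pkt; } flow[] = {
		{ ORIG,  SYN_SET,    "SYN     ->" },
		{ REPLY, SYNACK_SET, "SYN/ACK <-" },
		{ ORIG,  ACK_SET,    "ACK     ->" },
	};
	int state = sNO, i;

	for (i = 0; i < 3; i++) {
		/* This walk never leaves the named states, so names[] is safe. */
		state = table[flow[i].dir][flow[i].idx][state];
		printf("%s  state = %s\n", flow[i].pkt, names[state]);
	}
	return 0;
}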
+ * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection + * + * LISTEN state is not used. + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if they are invalid + * or we do not support the request (simultaneous open) + */ +static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + { +/* ORIGINAL */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sSR -> sIG Late retransmitted SYN? + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * A SYN/ACK from the client is always invalid: + * - either it tries to set up a simultaneous open, which is + * not supported; + * - or the firewall has just been inserted between the two hosts + * during the session set-up. The SYN will be retransmitted + * by the true client (or it'll time out). + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + }, + { +/* REPLY */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* + * sNO -> sIV Never reached. + * sSS -> sIV Simultaneous open, not supported + * sSR -> sIV Simultaneous open, not supported. + * sES -> sIV Server may not initiate a connection. + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. + * sCL -> sIV + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* + * sSS -> sSR Standard open. 
+ * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, +/* + * sSS -> sIV Server might not send FIN in this state. + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*ack*/ { sIV, sIV, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* + * sSS -> sIV Might be a half-open connection. + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } + } +}; + +static int tcp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct tcphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, 8, &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.tcp.port = hp->source; + tuple->dst.u.tcp.port = hp->dest; + + return 1; +} + +static int tcp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.tcp.port = orig->dst.u.tcp.port; + tuple->dst.u.tcp.port = orig->src.u.tcp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int tcp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.tcp.port), + ntohs(tuple->dst.u.tcp.port)); +} + +/* Print out the private part of the conntrack. */ +static int tcp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + enum tcp_conntrack state; + + READ_LOCK(&tcp_lock); + state = conntrack->proto.tcp.state; + READ_UNLOCK(&tcp_lock); + + return seq_printf(s, "%s ", tcp_conntrack_names[state]); +} + +static unsigned int get_conntrack_index(const struct tcphdr *tcph) +{ + if (tcph->rst) return TCP_RST_SET; + else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET); + else if (tcph->fin) return TCP_FIN_SET; + else if (tcph->ack) return TCP_ACK_SET; + else return TCP_NONE_SET; +} + +/* TCP connection tracking based on 'Real Stateful TCP Packet Filtering + in IP Filter' by Guido van Rooij. + + http://www.nluug.nl/events/sane2000/papers.html + http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz + + The boundaries and the conditions are changed according to RFC793: + the packet must intersect the window (i.e. segments may be + after the right or before the left edge) and thus receivers may ACK + segments after the right edge of the window. + + td_maxend = max(sack + max(win,1)) seen in reply packets + td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets + td_maxwin += seq + len - sender.td_maxend + if seq + len > sender.td_maxend + td_end = max(seq + len) seen in sent packets + + I. Upper bound for valid data: seq <= sender.td_maxend + II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin + III. Upper bound for valid ack: sack <= receiver.td_end + IV. 
Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + + where sack is the highest right edge of sack block found in the packet. + + The upper bound limit for a valid ack is not ignored - + we doesn't have to deal with fragments. +*/ + +static inline __u32 segment_seq_plus_len(__u32 seq, + size_t len, + struct iphdr *iph, + struct tcphdr *tcph) +{ + return (seq + len - (iph->ihl + tcph->doff)*4 + + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0)); +} + +/* Fixme: what about big packets? */ +#define MAXACKWINCONST 66000 +#define MAXACKWINDOW(sender) \ + ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin \ + : MAXACKWINCONST) + +/* + * Simplified tcp_parse_options routine from tcp_input.c + */ +static void tcp_options(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + struct ip_ct_tcp_state *state) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + state->td_scale = + state->flags = 0; + + while (length > 0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK_PERM + && opsize == TCPOLEN_SACK_PERM) + state->flags |= IP_CT_TCP_FLAG_SACK_PERM; + else if (opcode == TCPOPT_WINDOW + && opsize == TCPOLEN_WINDOW) { + state->td_scale = *(u_int8_t *)ptr; + + if (state->td_scale > 14) { + /* See RFC1323 */ + state->td_scale = 14; + } + state->flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static void tcp_sack(const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph, + __u32 *sack) +{ + unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; + unsigned char *ptr; + int length = (tcph->doff*4) - sizeof(struct tcphdr); + __u32 tmp; + + if (!length) + return; + + ptr = skb_header_pointer(skb, + (iph->ihl * 4) + sizeof(struct tcphdr), + length, buff); + BUG_ON(ptr == NULL); + + /* Fast path for timestamp-only option */ + if (length == TCPOLEN_TSTAMP_ALIGNED*4 + && *(__u32 *)ptr == + __constant_ntohl((TCPOPT_NOP << 24) + | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) + | TCPOLEN_TIMESTAMP)) + return; + + while (length > 0) { + int opcode=*ptr++; + int opsize, i; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + break; /* don't parse partial options */ + + if (opcode == TCPOPT_SACK + && opsize >= (TCPOLEN_SACK_BASE + + TCPOLEN_SACK_PERBLOCK) + && !((opsize - TCPOLEN_SACK_BASE) + % TCPOLEN_SACK_PERBLOCK)) { + for (i = 0; + i < (opsize - TCPOLEN_SACK_BASE); + i += TCPOLEN_SACK_PERBLOCK) { + tmp = ntohl(*((u_int32_t *)(ptr+i)+1)); + + if (after(tmp, *sack)) + *sack = tmp; + } + return; + } + ptr += opsize - 2; + length -= opsize; + } + } +} + +static int tcp_in_window(struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + struct iphdr *iph, + struct tcphdr *tcph) +{ + struct ip_ct_tcp_state *sender = &state->seen[dir]; + struct ip_ct_tcp_state *receiver = &state->seen[!dir]; + 
__u32 seq, ack, sack, end, win, swin; + int res; + + /* + * Get the required data from the packet. + */ + seq = ntohl(tcph->seq); + ack = sack = ntohl(tcph->ack_seq); + win = ntohs(tcph->window); + end = segment_seq_plus_len(seq, skb->len, iph, tcph); + + if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) + tcp_sack(skb, iph, tcph, &sack); + + DEBUGP("tcp_in_window: START\n"); + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack=%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + if (sender->td_end == 0) { + /* + * Initialize sender data. + */ + if (tcph->syn && tcph->ack) { + /* + * Outgoing SYN-ACK in reply to a SYN. + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + /* + * RFC 1323: + * Both sides must send the Window Scale option + * to enable window scaling in either direction. + */ + if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE + && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) + sender->td_scale = + receiver->td_scale = 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + sender->td_end = end; + sender->td_maxwin = (win == 0 ? 1 : win); + sender->td_maxend = end + sender->td_maxwin; + } + } else if (((state->state == TCP_CONNTRACK_SYN_SENT + && dir == IP_CT_DIR_ORIGINAL) + || (state->state == TCP_CONNTRACK_SYN_RECV + && dir == IP_CT_DIR_REPLY)) + && after(end, sender->td_end)) { + /* + * RFC 793: "if a TCP is reinitialized ... then it need + * not wait at all; it must only be sure to use sequence + * numbers larger than those recently used." + */ + sender->td_end = + sender->td_maxend = end; + sender->td_maxwin = (win == 0 ? 1 : win); + + tcp_options(skb, iph, tcph, sender); + } + + if (!(tcph->ack)) { + /* + * If there is no ACK, just pretend it was set and OK. + */ + ack = sack = receiver->td_end; + } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) == + (TCP_FLAG_ACK|TCP_FLAG_RST)) + && (ack == 0)) { + /* + * Broken TCP stacks, that set ACK in RST packets as well + * with zero ack value. + */ + ack = sack = receiver->td_end; + } + + if (seq == end + && (!tcph->rst + || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT))) + /* + * Packets contains no data: we assume it is valid + * and check the ack value only. + * However RST segments are always validated by their + * SEQ number, except when seq == 0 (reset sent answering + * SYN. 
+ */ + seq = end = sender->td_end; + + DEBUGP("tcp_in_window: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "seq=%u ack=%u sack =%u win=%u end=%u\n", + NIPQUAD(iph->saddr), ntohs(tcph->source), + NIPQUAD(iph->daddr), ntohs(tcph->dest), + seq, ack, sack, win, end); + DEBUGP("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + + DEBUGP("tcp_in_window: I=%i II=%i III=%i IV=%i\n", + before(seq, sender->td_maxend + 1), + after(end, sender->td_end - receiver->td_maxwin - 1), + before(sack, receiver->td_end + 1), + after(ack, receiver->td_end - MAXACKWINDOW(sender))); + + if (sender->loose || receiver->loose || + (before(seq, sender->td_maxend + 1) && + after(end, sender->td_end - receiver->td_maxwin - 1) && + before(sack, receiver->td_end + 1) && + after(ack, receiver->td_end - MAXACKWINDOW(sender)))) { + /* + * Take into account window scaling (RFC 1323). + */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* + * Update sender data. + */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) + sender->td_end = end; + /* + * Update receiver data. + */ + if (after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + + /* + * Check retransmissions. + */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir + && state->last_seq == seq + && state->last_ack == ack + && state->last_end == end) + state->retrans++; + else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->retrans = 0; + } + } + /* + * Close the window of disabled window tracking :-) + */ + if (sender->loose) + sender->loose--; + + res = 1; + } else { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: %s ", + before(seq, sender->td_maxend + 1) ? + after(end, sender->td_end - receiver->td_maxwin - 1) ? + before(sack, receiver->td_end + 1) ? + after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? 
"BUG" + : "ACK is under the lower bound (possible overly delayed ACK)" + : "ACK is over the upper bound (ACKed data not seen yet)" + : "SEQ is under the lower bound (already ACKed data retransmitted)" + : "SEQ is over the upper bound (over the window of the receiver)"); + + res = ip_ct_tcp_be_liberal; + } + + DEBUGP("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + "receiver end=%u maxend=%u maxwin=%u\n", + res, sender->td_end, sender->td_maxend, sender->td_maxwin, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin); + + return res; +} + +#ifdef CONFIG_IP_NF_NAT_NEEDED +/* Update sender->td_end after NAT successfully mangled the packet */ +void ip_conntrack_tcp_update(struct sk_buff *skb, + struct ip_conntrack *conntrack, + enum ip_conntrack_dir dir) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr *tcph = (void *)skb->nh.iph + skb->nh.iph->ihl*4; + __u32 end; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; +#endif + + end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, iph, tcph); + + WRITE_LOCK(&tcp_lock); + /* + * We have to worry for the ack in the reply packet only... + */ + if (after(end, conntrack->proto.tcp.seen[dir].td_end)) + conntrack->proto.tcp.seen[dir].td_end = end; + conntrack->proto.tcp.last_end = end; + WRITE_UNLOCK(&tcp_lock); + DEBUGP("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); +} + +#endif + +#define TH_FIN 0x01 +#define TH_SYN 0x02 +#define TH_RST 0x04 +#define TH_PUSH 0x08 +#define TH_ACK 0x10 +#define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 + +/* table of valid flag combinations - ECE and CWR are always valid */ +static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = +{ + [TH_SYN] = 1, + [TH_SYN|TH_ACK] = 1, + [TH_RST] = 1, + [TH_RST|TH_ACK] = 1, + [TH_RST|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK] = 1, + [TH_ACK] = 1, + [TH_ACK|TH_PUSH] = 1, + [TH_ACK|TH_URG] = 1, + [TH_ACK|TH_URG|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_PUSH] = 1, + [TH_FIN|TH_ACK|TH_URG] = 1, + [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1, +}; + +/* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ +static int tcp_error(struct sk_buff *skb, + enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + struct tcphdr _tcph, *th; + unsigned int tcplen = skb->len - iph->ihl * 4; + u_int8_t tcpflags; + + /* Smaller that minimal TCP header? */ + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: short packet "); + return -NF_ACCEPT; + } + + /* Not whole TCP header or malformed packet */ + if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. 
+ */ + /* FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, tcplen, IPPROTO_TCP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, iph->ihl*4, tcplen, 0))) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: bad TCP checksum "); + return -NF_ACCEPT; + } + + /* Check TCP flags. */ + tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR)); + if (!tcp_valid_flags[tcpflags]) { + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid TCP flag combination "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +/* Returns verdict for packet, or -1 for invalid. */ +static int tcp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + enum tcp_conntrack new_state, old_state; + enum ip_conntrack_dir dir; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; + unsigned long timeout; + unsigned int index; + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + WRITE_LOCK(&tcp_lock); + old_state = conntrack->proto.tcp.state; + dir = CTINFO2DIR(ctinfo); + index = get_conntrack_index(th); + new_state = tcp_conntracks[dir][index][old_state]; + + switch (new_state) { + case TCP_CONNTRACK_IGNORE: + /* Either SYN in ORIGINAL + * or SYN/ACK in REPLY. */ + if (index == TCP_SYNACK_SET + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && conntrack->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == + conntrack->proto.tcp.last_end) { + /* This SYN/ACK acknowledges a SYN that we earlier + * ignored as invalid. This means that the client and + * the server are both in sync, while the firewall is + * not. We kill this session and block the SYN/ACK so + * that the client cannot but retransmit its SYN and + * thus initiate a clean new session. + */ + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: killing out of sync session "); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_DROP; + } + conntrack->proto.tcp.last_index = index; + conntrack->proto.tcp.last_dir = dir; + conntrack->proto.tcp.last_seq = ntohl(th->seq); + conntrack->proto.tcp.last_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, iph, th); + + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid packet ignored "); + return NF_ACCEPT; + case TCP_CONNTRACK_MAX: + /* Invalid packet */ + DEBUGP("ip_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", + dir, get_conntrack_index(th), + old_state); + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid state "); + return -NF_ACCEPT; + case TCP_CONNTRACK_SYN_SENT: + if (old_state < TCP_CONNTRACK_TIME_WAIT) + break; + if ((conntrack->proto.tcp.seen[dir].flags & + IP_CT_TCP_FLAG_CLOSE_INIT) + || after(ntohl(th->seq), + conntrack->proto.tcp.seen[dir].td_end)) { + /* Attempt to reopen a closed connection. + * Delete this connection and look up again. 
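[Editorial note] tcp_error() above masks ECE/CWR out of byte 13 of the TCP header and then consults tcp_valid_flags[], so combinations such as SYN|FIN or a bare FIN are rejected outright. A standalone sketch of that filter, with the table entries copied from the patch:

/* Editor's sketch of the flag-combination filter in tcp_error(). */
#include <stdio.h>
#include <stdint.h>

#define TH_FIN  0x01
#define TH_SYN  0x02
#define TH_RST  0x04
#define TH_PUSH 0x08
#define TH_ACK  0x10
#define TH_URG  0x20
#define TH_ECE  0x40
#define TH_CWR  0x80

static const uint8_t valid[(TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG) + 1] = {
	[TH_SYN] = 1,               [TH_SYN|TH_ACK] = 1,
	[TH_RST] = 1,               [TH_RST|TH_ACK] = 1,
	[TH_RST|TH_ACK|TH_PUSH] = 1,
	[TH_FIN|TH_ACK] = 1,        [TH_FIN|TH_ACK|TH_PUSH] = 1,
	[TH_FIN|TH_ACK|TH_URG] = 1, [TH_FIN|TH_ACK|TH_URG|TH_PUSH] = 1,
	[TH_ACK] = 1,               [TH_ACK|TH_PUSH] = 1,
	[TH_ACK|TH_URG] = 1,        [TH_ACK|TH_URG|TH_PUSH] = 1,
};

static int flags_ok(uint8_t byte13)      /* 13th byte of the TCP header */
{
	return valid[byte13 & ~(TH_ECE | TH_CWR)];
}

int main(void)
{
	printf("SYN         -> %s\n", flags_ok(TH_SYN) ? "ok" : "invalid");
	printf("SYN|FIN     -> %s\n", flags_ok(TH_SYN|TH_FIN) ? "ok" : "invalid");
	printf("SYN|ECE|CWR -> %s\n",
	       flags_ok(TH_SYN|TH_ECE|TH_CWR) ? "ok" : "invalid");
	return 0;
}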
*/ + WRITE_UNLOCK(&tcp_lock); + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return -NF_REPEAT; + } else { + WRITE_UNLOCK(&tcp_lock); + if (LOG_INVALID(IPPROTO_TCP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_tcp: invalid SYN"); + return -NF_ACCEPT; + } + case TCP_CONNTRACK_CLOSE: + if (index == TCP_RST_SET + && test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) + && conntrack->proto.tcp.last_index == TCP_SYN_SET + && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + /* RST sent to invalid SYN we had let trough + * SYN was in window then, tear down connection. + * We skip window checking, because packet might ACK + * segments we ignored in the SYN. */ + goto in_window; + } + /* Just fall trough */ + default: + /* Keep compilers happy. */ + break; + } + + if (!tcp_in_window(&conntrack->proto.tcp, dir, index, + skb, iph, th)) { + WRITE_UNLOCK(&tcp_lock); + return -NF_ACCEPT; + } + in_window: + /* From now on we have got in-window packets */ + conntrack->proto.tcp.last_index = index; + + DEBUGP("tcp_conntracks: src=%u.%u.%u.%u:%hu dst=%u.%u.%u.%u:%hu " + "syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", + NIPQUAD(iph->saddr), ntohs(th->source), + NIPQUAD(iph->daddr), ntohs(th->dest), + (th->syn ? 1 : 0), (th->ack ? 1 : 0), + (th->fin ? 1 : 0), (th->rst ? 1 : 0), + old_state, new_state); + + conntrack->proto.tcp.state = new_state; + if (old_state != new_state + && (new_state == TCP_CONNTRACK_FIN_WAIT + || new_state == TCP_CONNTRACK_CLOSE)) + conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + timeout = conntrack->proto.tcp.retrans >= ip_ct_tcp_max_retrans + && *tcp_timeouts[new_state] > ip_ct_tcp_timeout_max_retrans + ? ip_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; + WRITE_UNLOCK(&tcp_lock); + + if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + /* If only reply is a RST, we can consider ourselves not to + have an established connection: this is a fairly common + problem case, so we can delete the conntrack + immediately. --RR */ + if (th->rst) { + if (del_timer(&conntrack->timeout)) + conntrack->timeout.function((unsigned long) + conntrack); + return NF_ACCEPT; + } + } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + && (old_state == TCP_CONNTRACK_SYN_RECV + || old_state == TCP_CONNTRACK_ESTABLISHED) + && new_state == TCP_CONNTRACK_ESTABLISHED) { + /* Set ASSURED if we see see valid ack in ESTABLISHED + after SYN_RECV or a valid answer for a picked up + connection. */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } + ip_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. 
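[Editorial note] tcp_in_window(), called just above, accepts a segment only when all four bounds documented earlier hold: I. seq <= sender.td_maxend, II. seq + len >= sender.td_end - receiver.td_maxwin, III. sack <= receiver.td_end, IV. ack >= receiver.td_end - MAXACKWINDOW. A condensed standalone predicate; before()/after() are re-derived here with the usual wrap-safe signed subtraction rather than taken from kernel headers, and the td_* numbers in main() are invented:

/* Editor's sketch of the four window bounds checked by tcp_in_window(). */
#include <stdint.h>
#include <stdio.h>

struct side { uint32_t td_end, td_maxend, td_maxwin; };

static int before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after(uint32_t a, uint32_t b)  { return before(b, a); }

#define MAXACKWINCONST 66000
#define MAXACKWINDOW(s) ((s)->td_maxwin > MAXACKWINCONST ? (s)->td_maxwin \
							 : MAXACKWINCONST)

static int in_window(const struct side *snd, const struct side *rcv,
		     uint32_t seq, uint32_t end, uint32_t ack, uint32_t sack)
{
	return before(seq, snd->td_maxend + 1) &&                /* I   */
	       after(end, snd->td_end - rcv->td_maxwin - 1) &&   /* II  */
	       before(sack, rcv->td_end + 1) &&                  /* III */
	       after(ack, rcv->td_end - MAXACKWINDOW(snd));      /* IV  */
}

int main(void)
{
	struct side snd = { .td_end = 1000, .td_maxend = 2000, .td_maxwin = 500 };
	struct side rcv = { .td_end = 5000, .td_maxend = 6000, .td_maxwin = 500 };

	/* Just inside the window, ACKing data the peer has actually sent. */
	printf("in window: %d\n", in_window(&snd, &rcv, 1100, 1200, 4900, 4900));
	/* Far beyond anything the receiver could have ACKed (violates I). */
	printf("in window: %d\n", in_window(&snd, &rcv, 90000, 90100, 4900, 4900));
	return 0;
}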
*/ +static int tcp_new(struct ip_conntrack *conntrack, + const struct sk_buff *skb) +{ + enum tcp_conntrack new_state; + struct iphdr *iph = skb->nh.iph; + struct tcphdr *th, _tcph; +#ifdef DEBUGP_VARS + struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; + struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; +#endif + + th = skb_header_pointer(skb, iph->ihl * 4, + sizeof(_tcph), &_tcph); + BUG_ON(th == NULL); + + /* Don't need lock here: this conntrack not in circulation yet */ + new_state + = tcp_conntracks[0][get_conntrack_index(th)] + [TCP_CONNTRACK_NONE]; + + /* Invalid: delete conntrack */ + if (new_state >= TCP_CONNTRACK_MAX) { + DEBUGP("ip_ct_tcp: invalid new deleting.\n"); + return 0; + } + + if (new_state == TCP_CONNTRACK_SYN_SENT) { + /* SYN packet */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end; + + tcp_options(skb, iph, th, &conntrack->proto.tcp.seen[0]); + conntrack->proto.tcp.seen[1].flags = 0; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = 0; + } else if (ip_ct_tcp_loose == 0) { + /* Don't try to pick up connections. */ + return 0; + } else { + /* + * We are in the middle of a connection, + * its history is lost for us. + * Let's try to use the data from the packet. + */ + conntrack->proto.tcp.seen[0].td_end = + segment_seq_plus_len(ntohl(th->seq), skb->len, + iph, th); + conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (conntrack->proto.tcp.seen[0].td_maxwin == 0) + conntrack->proto.tcp.seen[0].td_maxwin = 1; + conntrack->proto.tcp.seen[0].td_maxend = + conntrack->proto.tcp.seen[0].td_end + + conntrack->proto.tcp.seen[0].td_maxwin; + conntrack->proto.tcp.seen[0].td_scale = 0; + + /* We assume SACK. Should we assume window scaling too? 
*/ + conntrack->proto.tcp.seen[0].flags = + conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM; + conntrack->proto.tcp.seen[0].loose = + conntrack->proto.tcp.seen[1].loose = ip_ct_tcp_loose; + } + + conntrack->proto.tcp.seen[1].td_end = 0; + conntrack->proto.tcp.seen[1].td_maxend = 0; + conntrack->proto.tcp.seen[1].td_maxwin = 1; + conntrack->proto.tcp.seen[1].td_scale = 0; + + /* tcp_packet will set them */ + conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; + conntrack->proto.tcp.last_index = TCP_NONE_SET; + + DEBUGP("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " + "receiver end=%u maxend=%u maxwin=%u scale=%i\n", + sender->td_end, sender->td_maxend, sender->td_maxwin, + sender->td_scale, + receiver->td_end, receiver->td_maxend, receiver->td_maxwin, + receiver->td_scale); + return 1; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_tcp = +{ + .proto = IPPROTO_TCP, + .name = "tcp", + .pkt_to_tuple = tcp_pkt_to_tuple, + .invert_tuple = tcp_invert_tuple, + .print_tuple = tcp_print_tuple, + .print_conntrack = tcp_print_conntrack, + .packet = tcp_packet, + .new = tcp_new, + .error = tcp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c new file mode 100644 index 000000000000..5bc28a224623 --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -0,0 +1,146 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +unsigned long ip_ct_udp_timeout = 30*HZ; +unsigned long ip_ct_udp_timeout_stream = 180*HZ; + +static int udp_pkt_to_tuple(const struct sk_buff *skb, + unsigned int dataoff, + struct ip_conntrack_tuple *tuple) +{ + struct udphdr _hdr, *hp; + + /* Actually only need first 8 bytes. */ + hp = skb_header_pointer(skb, dataoff, sizeof(_hdr), &_hdr); + if (hp == NULL) + return 0; + + tuple->src.u.udp.port = hp->source; + tuple->dst.u.udp.port = hp->dest; + + return 1; +} + +static int udp_invert_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig) +{ + tuple->src.u.udp.port = orig->dst.u.udp.port; + tuple->dst.u.udp.port = orig->src.u.udp.port; + return 1; +} + +/* Print out the per-protocol part of the tuple. */ +static int udp_print_tuple(struct seq_file *s, + const struct ip_conntrack_tuple *tuple) +{ + return seq_printf(s, "sport=%hu dport=%hu ", + ntohs(tuple->src.u.udp.port), + ntohs(tuple->dst.u.udp.port)); +} + +/* Print out the private part of the conntrack. */ +static int udp_print_conntrack(struct seq_file *s, + const struct ip_conntrack *conntrack) +{ + return 0; +} + +/* Returns verdict for packet, and may modify conntracktype */ +static int udp_packet(struct ip_conntrack *conntrack, + const struct sk_buff *skb, + enum ip_conntrack_info ctinfo) +{ + /* If we've seen traffic both ways, this is some kind of UDP + stream. Extend timeout. 
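[Editorial note] The comment above describes the policy the branch that follows implements: one-way UDP stays on the short 30 second timeout, while a reply promotes the flow to the 180 second stream timeout and sets the ASSURED bit so it survives table pressure. A tiny standalone restatement, in seconds rather than jiffies:

/* Editor's sketch of the timeout policy in udp_packet(). */
#include <stdio.h>

#define UDP_TIMEOUT_SECS        30
#define UDP_TIMEOUT_STREAM_SECS 180

struct flow { int seen_reply, assured; unsigned int timeout; };

static void refresh(struct flow *f)
{
	if (f->seen_reply) {
		f->timeout = UDP_TIMEOUT_STREAM_SECS;
		f->assured = 1;          /* not a one-shot probe: keep it */
	} else {
		f->timeout = UDP_TIMEOUT_SECS;
	}
}

int main(void)
{
	struct flow f = { 0, 0, 0 };

	refresh(&f);                     /* first packet, no reply yet */
	printf("timeout=%us assured=%d\n", f.timeout, f.assured);
	f.seen_reply = 1;                /* a reply arrives */
	refresh(&f);
	printf("timeout=%us assured=%d\n", f.timeout, f.assured);
	return 0;
}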
*/ + if (test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + ip_ct_refresh_acct(conntrack, ctinfo, skb, + ip_ct_udp_timeout_stream); + /* Also, more likely to be important, and not a probe */ + set_bit(IPS_ASSURED_BIT, &conntrack->status); + } else + ip_ct_refresh_acct(conntrack, ctinfo, skb, ip_ct_udp_timeout); + + return NF_ACCEPT; +} + +/* Called when a new connection for this protocol found. */ +static int udp_new(struct ip_conntrack *conntrack, const struct sk_buff *skb) +{ + return 1; +} + +static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, + unsigned int hooknum) +{ + struct iphdr *iph = skb->nh.iph; + unsigned int udplen = skb->len - iph->ihl * 4; + struct udphdr _hdr, *hdr; + + /* Header is too small? */ + hdr = skb_header_pointer(skb, iph->ihl*4, sizeof(_hdr), &_hdr); + if (hdr == NULL) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: short packet "); + return -NF_ACCEPT; + } + + /* Truncated/malformed packets */ + if (ntohs(hdr->len) > udplen || ntohs(hdr->len) < sizeof(*hdr)) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: truncated/malformed packet "); + return -NF_ACCEPT; + } + + /* Packet with no checksum */ + if (!hdr->check) + return NF_ACCEPT; + + /* Checksum invalid? Ignore. + * We skip checking packets on the outgoing path + * because the semantic of CHECKSUM_HW is different there + * and moreover root might send raw packets. + * FIXME: Source route IP option packets --RR */ + if (hooknum == NF_IP_PRE_ROUTING + && csum_tcpudp_magic(iph->saddr, iph->daddr, udplen, IPPROTO_UDP, + skb->ip_summed == CHECKSUM_HW ? skb->csum + : skb_checksum(skb, iph->ihl*4, udplen, 0))) { + if (LOG_INVALID(IPPROTO_UDP)) + nf_log_packet(PF_INET, 0, skb, NULL, NULL, + "ip_ct_udp: bad UDP checksum "); + return -NF_ACCEPT; + } + + return NF_ACCEPT; +} + +struct ip_conntrack_protocol ip_conntrack_protocol_udp = +{ + .proto = IPPROTO_UDP, + .name = "udp", + .pkt_to_tuple = udp_pkt_to_tuple, + .invert_tuple = udp_invert_tuple, + .print_tuple = udp_print_tuple, + .print_conntrack = udp_print_conntrack, + .packet = udp_packet, + .new = udp_new, + .error = udp_error, +}; diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c new file mode 100644 index 000000000000..80a7bde2a57a --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -0,0 +1,961 @@ +/* This file contains all the functions required for the standalone + ip_conntrack module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_conntrack_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_conntrack_lock) + +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
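[Editorial note] The standalone module beginning here is what produces /proc/net/ip_conntrack; ct_seq_show() further down writes one line per connection: protocol name and number, seconds left on the timer, the protocol's private state, both tuples (with packet/byte counters only under CONFIG_IP_NF_CT_ACCT), optional [UNREPLIED]/[ASSURED] markers and the use count. The sample line below is fabricated to match that format, followed by a minimal parse of its fixed prefix:

/* Editor's sketch: the shape of a /proc/net/ip_conntrack entry. */
#include <stdio.h>

int main(void)
{
	const char *line =
	    "tcp      6 431999 ESTABLISHED src=10.0.0.5 dst=10.0.0.9 "
	    "sport=34512 dport=22 src=10.0.0.9 dst=10.0.0.5 "
	    "sport=22 dport=34512 [ASSURED] use=1\n";
	char proto[16];
	unsigned int protonum;
	long timeout;

	/* "%-8s %u %ld " is the prefix written by ct_seq_show(). */
	if (sscanf(line, "%15s %u %ld", proto, &protonum, &timeout) == 3)
		printf("%s (%u), %ld s left\n", proto, protonum, timeout);
	return 0;
}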
+#endif + +MODULE_LICENSE("GPL"); + +extern atomic_t ip_conntrack_count; +DECLARE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat); + +static int kill_proto(struct ip_conntrack *i, void *data) +{ + return (i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum == + *((u_int8_t *) data)); +} + +#ifdef CONFIG_PROC_FS +static int +print_tuple(struct seq_file *s, const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_protocol *proto) +{ + seq_printf(s, "src=%u.%u.%u.%u dst=%u.%u.%u.%u ", + NIPQUAD(tuple->src.ip), NIPQUAD(tuple->dst.ip)); + return proto->print_tuple(s, tuple); +} + +#ifdef CONFIG_IP_NF_CT_ACCT +static unsigned int +seq_print_counters(struct seq_file *s, + const struct ip_conntrack_counter *counter) +{ + return seq_printf(s, "packets=%llu bytes=%llu ", + (unsigned long long)counter->packets, + (unsigned long long)counter->bytes); +} +#else +#define seq_print_counters(x, y) 0 +#endif + +struct ct_iter_state { + unsigned int bucket; +}; + +static struct list_head *ct_get_first(struct seq_file *seq) +{ + struct ct_iter_state *st = seq->private; + + for (st->bucket = 0; + st->bucket < ip_conntrack_htable_size; + st->bucket++) { + if (!list_empty(&ip_conntrack_hash[st->bucket])) + return ip_conntrack_hash[st->bucket].next; + } + return NULL; +} + +static struct list_head *ct_get_next(struct seq_file *seq, struct list_head *head) +{ + struct ct_iter_state *st = seq->private; + + head = head->next; + while (head == &ip_conntrack_hash[st->bucket]) { + if (++st->bucket >= ip_conntrack_htable_size) + return NULL; + head = ip_conntrack_hash[st->bucket].next; + } + return head; +} + +static struct list_head *ct_get_idx(struct seq_file *seq, loff_t pos) +{ + struct list_head *head = ct_get_first(seq); + + if (head) + while (pos && (head = ct_get_next(seq, head))) + pos--; + return pos ? NULL : head; +} + +static void *ct_seq_start(struct seq_file *seq, loff_t *pos) +{ + READ_LOCK(&ip_conntrack_lock); + return ct_get_idx(seq, *pos); +} + +static void *ct_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + (*pos)++; + return ct_get_next(s, v); +} + +static void ct_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); +} + +static int ct_seq_show(struct seq_file *s, void *v) +{ + const struct ip_conntrack_tuple_hash *hash = v; + const struct ip_conntrack *conntrack = tuplehash_to_ctrack(hash); + struct ip_conntrack_protocol *proto; + + MUST_BE_READ_LOCKED(&ip_conntrack_lock); + IP_NF_ASSERT(conntrack); + + /* we only want to print DIR_ORIGINAL */ + if (DIRECTION(hash)) + return 0; + + proto = ip_ct_find_proto(conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple.dst.protonum); + IP_NF_ASSERT(proto); + + if (seq_printf(s, "%-8s %u %ld ", + proto->name, + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum, + timer_pending(&conntrack->timeout) + ? 
(long)(conntrack->timeout.expires - jiffies)/HZ + : 0) != 0) + return -ENOSPC; + + if (proto->print_conntrack(s, conntrack)) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_ORIGINAL])) + return -ENOSPC; + + if (!(test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status))) + if (seq_printf(s, "[UNREPLIED] ")) + return -ENOSPC; + + if (print_tuple(s, &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple, + proto)) + return -ENOSPC; + + if (seq_print_counters(s, &conntrack->counters[IP_CT_DIR_REPLY])) + return -ENOSPC; + + if (test_bit(IPS_ASSURED_BIT, &conntrack->status)) + if (seq_printf(s, "[ASSURED] ")) + return -ENOSPC; + +#if defined(CONFIG_IP_NF_CONNTRACK_MARK) + if (seq_printf(s, "mark=%lu ", conntrack->mark)) + return -ENOSPC; +#endif + + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) + return -ENOSPC; + + return 0; +} + +static struct seq_operations ct_seq_ops = { + .start = ct_seq_start, + .next = ct_seq_next, + .stop = ct_seq_stop, + .show = ct_seq_show +}; + +static int ct_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct ct_iter_state *st; + int ret; + + st = kmalloc(sizeof(struct ct_iter_state), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &ct_seq_ops); + if (ret) + goto out_free; + seq = file->private_data; + seq->private = st; + memset(st, 0, sizeof(struct ct_iter_state)); + return ret; +out_free: + kfree(st); + return ret; +} + +static struct file_operations ct_file_ops = { + .owner = THIS_MODULE, + .open = ct_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* expects */ +static void *exp_seq_start(struct seq_file *s, loff_t *pos) +{ + struct list_head *e = &ip_conntrack_expect_list; + loff_t i; + + /* strange seq_file api calls stop even if we fail, + * thus we need to grab lock since stop unlocks */ + READ_LOCK(&ip_conntrack_lock); + + if (list_empty(e)) + return NULL; + + for (i = 0; i <= *pos; i++) { + e = e->next; + if (e == &ip_conntrack_expect_list) + return NULL; + } + return e; +} + +static void *exp_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct list_head *e = v; + + e = e->next; + + if (e == &ip_conntrack_expect_list) + return NULL; + + return e; +} + +static void exp_seq_stop(struct seq_file *s, void *v) +{ + READ_UNLOCK(&ip_conntrack_lock); +} + +static int exp_seq_show(struct seq_file *s, void *v) +{ + struct ip_conntrack_expect *expect = v; + + if (expect->timeout.function) + seq_printf(s, "%ld ", timer_pending(&expect->timeout) + ? 
(long)(expect->timeout.expires - jiffies)/HZ : 0); + else + seq_printf(s, "- "); + + seq_printf(s, "proto=%u ", expect->tuple.dst.protonum); + + print_tuple(s, &expect->tuple, + ip_ct_find_proto(expect->tuple.dst.protonum)); + return seq_putc(s, '\n'); +} + +static struct seq_operations exp_seq_ops = { + .start = exp_seq_start, + .next = exp_seq_next, + .stop = exp_seq_stop, + .show = exp_seq_show +}; + +static int exp_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &exp_seq_ops); +} + +static struct file_operations exp_file_ops = { + .owner = THIS_MODULE, + .open = exp_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static void *ct_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void *ct_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return &per_cpu(ip_conntrack_stat, cpu); + } + + return NULL; +} + +static void ct_cpu_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int ct_cpu_seq_show(struct seq_file *seq, void *v) +{ + unsigned int nr_conntracks = atomic_read(&ip_conntrack_count); + struct ip_conntrack_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete\n"); + return 0; + } + + seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x " + "%08x %08x %08x %08x %08x %08x %08x %08x \n", + nr_conntracks, + st->searched, + st->found, + st->new, + st->invalid, + st->ignore, + st->delete, + st->delete_list, + st->insert, + st->insert_failed, + st->drop, + st->early_drop, + st->error, + + st->expect_new, + st->expect_create, + st->expect_delete + ); + return 0; +} + +static struct seq_operations ct_cpu_seq_ops = { + .start = ct_cpu_seq_start, + .next = ct_cpu_seq_next, + .stop = ct_cpu_seq_stop, + .show = ct_cpu_seq_show, +}; + +static int ct_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ct_cpu_seq_ops); +} + +static struct file_operations ct_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = ct_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +static unsigned int ip_confirm(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + + /* This is where we call the helper: as the packet goes out. */ + ct = ip_conntrack_get(*pskb, &ctinfo); + if (ct && ct->helper) { + unsigned int ret; + ret = ct->helper->help(pskb, ct, ctinfo); + if (ret != NF_ACCEPT) + return ret; + } + + /* We've seen it coming out the other side: confirm it */ + return ip_conntrack_confirm(pskb); +} + +static unsigned int ip_conntrack_defrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if !defined(CONFIG_IP_NF_NAT) && !defined(CONFIG_IP_NF_NAT_MODULE) + /* Previously seen (loopback)? Ignore. Do this before + fragment check. */ + if ((*pskb)->nfct) + return NF_ACCEPT; +#endif + + /* Gather fragments. 
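Conntrack works on complete datagrams (later fragments carry no transport + header), so fragments are reassembled here before the lookup; while pieces + are still missing ip_ct_gather_frags() returns NULL and the hook answers + NF_STOLEN.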
*/ + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, + hooknum == NF_IP_PRE_ROUTING ? + IP_DEFRAG_CONNTRACK_IN : + IP_DEFRAG_CONNTRACK_OUT); + if (!*pskb) + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ip_refrag(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct rtable *rt = (struct rtable *)(*pskb)->dst; + + /* We've seen it coming out the other side: confirm */ + if (ip_confirm(hooknum, pskb, in, out, okfn) != NF_ACCEPT) + return NF_DROP; + + /* Local packets are never produced too large for their + interface. We defragment them at LOCAL_OUT, however, + so we have to refragment them here. */ + if ((*pskb)->len > dst_mtu(&rt->u.dst) && + !skb_shinfo(*pskb)->tso_size) { + /* No hook can be after us, so this should be OK. */ + ip_fragment(*pskb, okfn); + return NF_STOLEN; + } + return NF_ACCEPT; +} + +static unsigned int ip_conntrack_local(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + return ip_conntrack_in(hooknum, pskb, in, out, okfn); +} + +/* Connection tracking may drop packets, but never alters them, so + make it the first hook. */ +static struct nf_hook_ops ip_conntrack_defrag_ops = { + .hook = ip_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ip_conntrack_in_ops = { + .hook = ip_conntrack_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_CONNTRACK, +}; + +static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = { + .hook = ip_conntrack_defrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK_DEFRAG, +}; + +static struct nf_hook_ops ip_conntrack_local_out_ops = { + .hook = ip_conntrack_local, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_CONNTRACK, +}; + +/* Refragmenter; last chance.
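ip_refrag is registered below at NF_IP_POST_ROUTING with priority + NF_IP_PRI_LAST, so every other hook, NAT included, has already processed the + packet before locally generated traffic that was defragmented at LOCAL_OUT is + re-split to the route MTU.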
*/ +static struct nf_hook_ops ip_conntrack_out_ops = { + .hook = ip_refrag, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_LAST, +}; + +static struct nf_hook_ops ip_conntrack_local_in_ops = { + .hook = ip_confirm, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_LAST-1, +}; + +/* Sysctl support */ + +#ifdef CONFIG_SYSCTL + +/* From ip_conntrack_core.c */ +extern int ip_conntrack_max; +extern unsigned int ip_conntrack_htable_size; + +/* From ip_conntrack_proto_tcp.c */ +extern unsigned long ip_ct_tcp_timeout_syn_sent; +extern unsigned long ip_ct_tcp_timeout_syn_recv; +extern unsigned long ip_ct_tcp_timeout_established; +extern unsigned long ip_ct_tcp_timeout_fin_wait; +extern unsigned long ip_ct_tcp_timeout_close_wait; +extern unsigned long ip_ct_tcp_timeout_last_ack; +extern unsigned long ip_ct_tcp_timeout_time_wait; +extern unsigned long ip_ct_tcp_timeout_close; +extern unsigned long ip_ct_tcp_timeout_max_retrans; +extern int ip_ct_tcp_loose; +extern int ip_ct_tcp_be_liberal; +extern int ip_ct_tcp_max_retrans; + +/* From ip_conntrack_proto_udp.c */ +extern unsigned long ip_ct_udp_timeout; +extern unsigned long ip_ct_udp_timeout_stream; + +/* From ip_conntrack_proto_icmp.c */ +extern unsigned long ip_ct_icmp_timeout; + +/* From ip_conntrack_proto_icmp.c */ +extern unsigned long ip_ct_generic_timeout; + +/* Log invalid packets of a given protocol */ +static int log_invalid_proto_min = 0; +static int log_invalid_proto_max = 255; + +static struct ctl_table_header *ip_ct_sysctl_header; + +static ctl_table ip_ct_sysctl_table[] = { + { + .ctl_name = NET_IPV4_NF_CONNTRACK_MAX, + .procname = "ip_conntrack_max", + .data = &ip_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_COUNT, + .procname = "ip_conntrack_count", + .data = &ip_conntrack_count, + .maxlen = sizeof(int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_BUCKETS, + .procname = "ip_conntrack_buckets", + .data = &ip_conntrack_htable_size, + .maxlen = sizeof(unsigned int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, + .procname = "ip_conntrack_tcp_timeout_syn_sent", + .data = &ip_ct_tcp_timeout_syn_sent, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_RECV, + .procname = "ip_conntrack_tcp_timeout_syn_recv", + .data = &ip_ct_tcp_timeout_syn_recv, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_ESTABLISHED, + .procname = "ip_conntrack_tcp_timeout_established", + .data = &ip_ct_tcp_timeout_established, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_FIN_WAIT, + .procname = "ip_conntrack_tcp_timeout_fin_wait", + .data = &ip_ct_tcp_timeout_fin_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE_WAIT, + .procname = "ip_conntrack_tcp_timeout_close_wait", + .data = &ip_ct_tcp_timeout_close_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = 
NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_LAST_ACK, + .procname = "ip_conntrack_tcp_timeout_last_ack", + .data = &ip_ct_tcp_timeout_last_ack, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_TIME_WAIT, + .procname = "ip_conntrack_tcp_timeout_time_wait", + .data = &ip_ct_tcp_timeout_time_wait, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_CLOSE, + .procname = "ip_conntrack_tcp_timeout_close", + .data = &ip_ct_tcp_timeout_close, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT, + .procname = "ip_conntrack_udp_timeout", + .data = &ip_ct_udp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_UDP_TIMEOUT_STREAM, + .procname = "ip_conntrack_udp_timeout_stream", + .data = &ip_ct_udp_timeout_stream, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_ICMP_TIMEOUT, + .procname = "ip_conntrack_icmp_timeout", + .data = &ip_ct_icmp_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_GENERIC_TIMEOUT, + .procname = "ip_conntrack_generic_timeout", + .data = &ip_ct_generic_timeout, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_LOG_INVALID, + .procname = "ip_conntrack_log_invalid", + .data = &ip_ct_log_invalid, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &log_invalid_proto_min, + .extra2 = &log_invalid_proto_max, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_MAX_RETRANS, + .procname = "ip_conntrack_tcp_timeout_max_retrans", + .data = &ip_ct_tcp_timeout_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, + .procname = "ip_conntrack_tcp_loose", + .data = &ip_ct_tcp_loose, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, + .procname = "ip_conntrack_tcp_be_liberal", + .data = &ip_ct_tcp_be_liberal, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, + .procname = "ip_conntrack_tcp_max_retrans", + .data = &ip_ct_tcp_max_retrans, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +#define NET_IP_CONNTRACK_MAX 2089 + +static ctl_table ip_ct_netfilter_table[] = { + { + .ctl_name = NET_IPV4_NETFILTER, + .procname = "netfilter", + .mode = 0555, + .child = ip_ct_sysctl_table, + }, + { + .ctl_name = NET_IP_CONNTRACK_MAX, + .procname = "ip_conntrack_max", + .data = &ip_conntrack_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_ipv4_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ip_ct_netfilter_table, + }, + { .ctl_name = 0 } +}; + +static ctl_table ip_ct_net_table[] = { + { + .ctl_name = 
CTL_NET, + .procname = "net", + .mode = 0555, + .child = ip_ct_ipv4_table, + }, + { .ctl_name = 0 } +}; + +EXPORT_SYMBOL(ip_ct_log_invalid); +#endif /* CONFIG_SYSCTL */ + +static int init_or_cleanup(int init) +{ +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *proc, *proc_exp, *proc_stat; +#endif + int ret = 0; + + if (!init) goto cleanup; + + ret = ip_conntrack_init(); + if (ret < 0) + goto cleanup_nothing; + +#ifdef CONFIG_PROC_FS + ret = -ENOMEM; + proc = proc_net_fops_create("ip_conntrack", 0440, &ct_file_ops); + if (!proc) goto cleanup_init; + + proc_exp = proc_net_fops_create("ip_conntrack_expect", 0440, + &exp_file_ops); + if (!proc_exp) goto cleanup_proc; + + proc_stat = create_proc_entry("ip_conntrack", S_IRUGO, proc_net_stat); + if (!proc_stat) + goto cleanup_proc_exp; + + proc_stat->proc_fops = &ct_cpu_seq_fops; + proc_stat->owner = THIS_MODULE; +#endif + + ret = nf_register_hook(&ip_conntrack_defrag_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing defrag hook.\n"); + goto cleanup_proc_stat; + } + ret = nf_register_hook(&ip_conntrack_defrag_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local_out defrag hook.\n"); + goto cleanup_defragops; + } + ret = nf_register_hook(&ip_conntrack_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register pre-routing hook.\n"); + goto cleanup_defraglocalops; + } + ret = nf_register_hook(&ip_conntrack_local_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_conntrack_out_ops); + if (ret < 0) { + printk("ip_conntrack: can't register post-routing hook.\n"); + goto cleanup_inandlocalops; + } + ret = nf_register_hook(&ip_conntrack_local_in_ops); + if (ret < 0) { + printk("ip_conntrack: can't register local in hook.\n"); + goto cleanup_inoutandlocalops; + } +#ifdef CONFIG_SYSCTL + ip_ct_sysctl_header = register_sysctl_table(ip_ct_net_table, 0); + if (ip_ct_sysctl_header == NULL) { + printk("ip_conntrack: can't register to sysctl.\n"); + ret = -ENOMEM; + goto cleanup_localinops; + } +#endif + + return ret; + + cleanup: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(ip_ct_sysctl_header); + cleanup_localinops: +#endif + nf_unregister_hook(&ip_conntrack_local_in_ops); + cleanup_inoutandlocalops: + nf_unregister_hook(&ip_conntrack_out_ops); + cleanup_inandlocalops: + nf_unregister_hook(&ip_conntrack_local_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_conntrack_in_ops); + cleanup_defraglocalops: + nf_unregister_hook(&ip_conntrack_defrag_local_out_ops); + cleanup_defragops: + nf_unregister_hook(&ip_conntrack_defrag_ops); + cleanup_proc_stat: +#ifdef CONFIG_PROC_FS + remove_proc_entry("ip_conntrack", proc_net_stat); + cleanup_proc_exp: + proc_net_remove("ip_conntrack_expect"); + cleanup_proc: + proc_net_remove("ip_conntrack"); + cleanup_init: +#endif /* CONFIG_PROC_FS */ + ip_conntrack_cleanup(); + cleanup_nothing: + return ret; +} + +/* FIXME: Allow NULL functions and sub in pointers to generic for + them. 
--RR */ +int ip_conntrack_protocol_register(struct ip_conntrack_protocol *proto) +{ + int ret = 0; + + WRITE_LOCK(&ip_conntrack_lock); + if (ip_ct_protos[proto->proto] != &ip_conntrack_generic_protocol) { + ret = -EBUSY; + goto out; + } + ip_ct_protos[proto->proto] = proto; + out: + WRITE_UNLOCK(&ip_conntrack_lock); + return ret; +} + +void ip_conntrack_protocol_unregister(struct ip_conntrack_protocol *proto) +{ + WRITE_LOCK(&ip_conntrack_lock); + ip_ct_protos[proto->proto] = &ip_conntrack_generic_protocol; + WRITE_UNLOCK(&ip_conntrack_lock); + + /* Somebody could be still looking at the proto in bh. */ + synchronize_net(); + + /* Remove all contrack entries for this protocol */ + ip_ct_iterate_cleanup(kill_proto, &proto->proto); +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +/* Some modules need us, but don't depend directly on any symbol. + They should call this. */ +void need_ip_conntrack(void) +{ +} + +EXPORT_SYMBOL(ip_conntrack_protocol_register); +EXPORT_SYMBOL(ip_conntrack_protocol_unregister); +EXPORT_SYMBOL(ip_ct_get_tuple); +EXPORT_SYMBOL(invert_tuplepr); +EXPORT_SYMBOL(ip_conntrack_alter_reply); +EXPORT_SYMBOL(ip_conntrack_destroyed); +EXPORT_SYMBOL(need_ip_conntrack); +EXPORT_SYMBOL(ip_conntrack_helper_register); +EXPORT_SYMBOL(ip_conntrack_helper_unregister); +EXPORT_SYMBOL(ip_ct_iterate_cleanup); +EXPORT_SYMBOL(ip_ct_refresh_acct); +EXPORT_SYMBOL(ip_ct_protos); +EXPORT_SYMBOL(ip_ct_find_proto); +EXPORT_SYMBOL(ip_conntrack_expect_alloc); +EXPORT_SYMBOL(ip_conntrack_expect_free); +EXPORT_SYMBOL(ip_conntrack_expect_related); +EXPORT_SYMBOL(ip_conntrack_unexpect_related); +EXPORT_SYMBOL(ip_conntrack_tuple_taken); +EXPORT_SYMBOL(ip_ct_gather_frags); +EXPORT_SYMBOL(ip_conntrack_htable_size); +EXPORT_SYMBOL(ip_conntrack_lock); +EXPORT_SYMBOL(ip_conntrack_hash); +EXPORT_SYMBOL(ip_conntrack_untracked); +EXPORT_SYMBOL_GPL(ip_conntrack_find_get); +EXPORT_SYMBOL_GPL(ip_conntrack_put); +#ifdef CONFIG_IP_NF_NAT_NEEDED +EXPORT_SYMBOL(ip_conntrack_tcp_update); +#endif diff --git a/net/ipv4/netfilter/ip_conntrack_tftp.c b/net/ipv4/netfilter/ip_conntrack_tftp.c new file mode 100644 index 000000000000..992fac3e36ee --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_tftp.c @@ -0,0 +1,159 @@ +/* (C) 2001-2002 Magnus Boden + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Version: 0.0.7 + * + * Thu 21 Mar 2002 Harald Welte + * - port to newnat API + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Magnus Boden "); +MODULE_DESCRIPTION("tftp connection tracking helper"); +MODULE_LICENSE("GPL"); + +#define MAX_PORTS 8 +static int ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, int, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of tftp servers"); + +#if 0 +#define DEBUGP(format, args...) printk("%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) 
+#endif + +unsigned int (*ip_nat_tftp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp); +EXPORT_SYMBOL_GPL(ip_nat_tftp_hook); + +static int tftp_help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + struct tftphdr _tftph, *tfh; + struct ip_conntrack_expect *exp; + unsigned int ret = NF_ACCEPT; + + tfh = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4+sizeof(struct udphdr), + sizeof(_tftph), &_tftph); + if (tfh == NULL) + return NF_ACCEPT; + + switch (ntohs(tfh->opcode)) { + /* RRQ and WRQ works the same way */ + case TFTP_OPCODE_READ: + case TFTP_OPCODE_WRITE: + DEBUGP(""); + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + exp = ip_conntrack_expect_alloc(); + if (exp == NULL) + return NF_DROP; + + exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + exp->mask.src.ip = 0xffffffff; + exp->mask.dst.ip = 0xffffffff; + exp->mask.dst.u.udp.port = 0xffff; + exp->mask.dst.protonum = 0xff; + exp->expectfn = NULL; + exp->master = ct; + + DEBUGP("expect: "); + DUMP_TUPLE(&exp->tuple); + DUMP_TUPLE(&exp->mask); + if (ip_nat_tftp_hook) + ret = ip_nat_tftp_hook(pskb, ctinfo, exp); + else if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + ret = NF_DROP; + } + break; + case TFTP_OPCODE_DATA: + case TFTP_OPCODE_ACK: + DEBUGP("Data/ACK opcode\n"); + break; + case TFTP_OPCODE_ERROR: + DEBUGP("Error opcode\n"); + break; + default: + DEBUGP("Unknown opcode\n"); + } + return NF_ACCEPT; +} + +static struct ip_conntrack_helper tftp[MAX_PORTS]; +static char tftp_names[MAX_PORTS][10]; + +static void fini(void) +{ + int i; + + for (i = 0 ; i < ports_c; i++) { + DEBUGP("unregistering helper for port %d\n", + ports[i]); + ip_conntrack_helper_unregister(&tftp[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = TFTP_PORT; + + for (i = 0; i < ports_c; i++) { + /* Create helper structure */ + memset(&tftp[i], 0, sizeof(struct ip_conntrack_helper)); + + tftp[i].tuple.dst.protonum = IPPROTO_UDP; + tftp[i].tuple.src.u.udp.port = htons(ports[i]); + tftp[i].mask.dst.protonum = 0xFF; + tftp[i].mask.src.u.udp.port = 0xFFFF; + tftp[i].max_expected = 1; + tftp[i].timeout = 5 * 60; /* 5 minutes */ + tftp[i].me = THIS_MODULE; + tftp[i].help = tftp_help; + + tmpname = &tftp_names[i][0]; + if (ports[i] == TFTP_PORT) + sprintf(tmpname, "tftp"); + else + sprintf(tmpname, "tftp-%d", i); + tftp[i].name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret=ip_conntrack_helper_register(&tftp[i]); + if (ret) { + printk("ERROR registering helper for port %d\n", + ports[i]); + fini(); + return(ret); + } + } + return(0); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_amanda.c b/net/ipv4/netfilter/ip_nat_amanda.c new file mode 100644 index 000000000000..da1f412583ed --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_amanda.c @@ -0,0 +1,88 @@ +/* Amanda extension for TCP NAT alteration. + * (C) 2002 by Brian J. Murrell + * based on a copy of HW's ip_nat_irc.c as well as other modules + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Module load syntax: + * insmod ip_nat_amanda.o + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +MODULE_AUTHOR("Brian J. Murrell "); +MODULE_DESCRIPTION("Amanda NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp) +{ + char buffer[sizeof("65535")]; + u_int16_t port; + unsigned int ret; + + /* Connection comes from client. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_ORIGINAL; + + /* When you see the packet, we need to NAT it the same as the + * this one (ie. same IP: it will be TCP and master is UDP). */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + sprintf(buffer, "%u", port); + ret = ip_nat_mangle_udp_packet(pskb, exp->master, ctinfo, + matchoff, matchlen, + buffer, strlen(buffer)); + if (ret != NF_ACCEPT) + ip_conntrack_unexpect_related(exp); + return ret; +} + +static void __exit fini(void) +{ + ip_nat_amanda_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_amanda_hook); + ip_nat_amanda_hook = help; + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_core.c b/net/ipv4/netfilter/ip_nat_core.c new file mode 100644 index 000000000000..162ceacfc29a --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_core.c @@ -0,0 +1,556 @@ +/* NAT for netfilter; shared with compatibility layer. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For tcp_prot in getorigdst */ +#include +#include +#include + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +DECLARE_RWLOCK(ip_nat_lock); + +/* Calculated at init based on memory size */ +static unsigned int ip_nat_htable_size; + +static struct list_head *bysource; +struct ip_nat_protocol *ip_nat_protos[MAX_IP_NAT_PROTO]; + + +/* We keep an extra hash for each conntrack, for fast searching. */ +static inline unsigned int +hash_by_src(const struct ip_conntrack_tuple *tuple) +{ + /* Original src, to ensure we map it consistently if poss. */ + return jhash_3words(tuple->src.ip, tuple->src.u.all, + tuple->dst.protonum, 0) % ip_nat_htable_size; +} + +/* Noone using conntrack by the time this called. 
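This is installed as ip_conntrack_destroyed in ip_nat_init() below, so it runs + from the conntrack destruction path; all it has to do is unlink the bysource + hash entry, and only when NAT setup actually completed (IPS_NAT_DONE_MASK).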
*/ +static void ip_nat_cleanup_conntrack(struct ip_conntrack *conn) +{ + if (!(conn->status & IPS_NAT_DONE_MASK)) + return; + + WRITE_LOCK(&ip_nat_lock); + list_del(&conn->nat.info.bysource); + WRITE_UNLOCK(&ip_nat_lock); +} + +/* We do checksum mangling, so if they were wrong before they're still + * wrong. Also works for incomplete packets (eg. ICMP dest + * unreachables.) */ +u_int16_t +ip_nat_cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) +{ + u_int32_t diffs[] = { oldvalinv, newval }; + return csum_fold(csum_partial((char *)diffs, sizeof(diffs), + oldcheck^0xFFFF)); +} + +/* Is this tuple already taken? (not by us) */ +int +ip_nat_used_tuple(const struct ip_conntrack_tuple *tuple, + const struct ip_conntrack *ignored_conntrack) +{ + /* Conntrack tracking doesn't keep track of outgoing tuples; only + incoming ones. NAT means they don't have a fixed mapping, + so we invert the tuple and look for the incoming reply. + + We could keep a separate hash if this proves too slow. */ + struct ip_conntrack_tuple reply; + + invert_tuplepr(&reply, tuple); + return ip_conntrack_tuple_taken(&reply, ignored_conntrack); +} + +/* If we source map this tuple so reply looks like reply_tuple, will + * that meet the constraints of range. */ +static int +in_range(const struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range) +{ + struct ip_nat_protocol *proto = ip_nat_find_proto(tuple->dst.protonum); + + /* If we are supposed to map IPs, then we must be in the + range specified, otherwise let this drag us onto a new src IP. */ + if (range->flags & IP_NAT_RANGE_MAP_IPS) { + if (ntohl(tuple->src.ip) < ntohl(range->min_ip) + || ntohl(tuple->src.ip) > ntohl(range->max_ip)) + return 0; + } + + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) + || proto->in_range(tuple, IP_NAT_MANIP_SRC, + &range->min, &range->max)) + return 1; + + return 0; +} + +static inline int +same_src(const struct ip_conntrack *ct, + const struct ip_conntrack_tuple *tuple) +{ + return (ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum + == tuple->dst.protonum + && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip + == tuple->src.ip + && ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u.all + == tuple->src.u.all); +} + +/* Only called for SRC manip */ +static int +find_appropriate_src(const struct ip_conntrack_tuple *tuple, + struct ip_conntrack_tuple *result, + const struct ip_nat_range *range) +{ + unsigned int h = hash_by_src(tuple); + struct ip_conntrack *ct; + + READ_LOCK(&ip_nat_lock); + list_for_each_entry(ct, &bysource[h], nat.info.bysource) { + if (same_src(ct, tuple)) { + /* Copy source part from reply tuple. */ + invert_tuplepr(result, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + result->dst = tuple->dst; + + if (in_range(result, range)) { + READ_UNLOCK(&ip_nat_lock); + return 1; + } + } + } + READ_UNLOCK(&ip_nat_lock); + return 0; +} + +/* For [FUTURE] fragmentation handling, we want the least-used + src-ip/dst-ip/proto triple. Fairness doesn't come into it. Thus + if the range specifies 1.2.3.4 ports 10000-10005 and 1.2.3.5 ports + 1-65535, we don't do pro-rata allocation based on ports; we choose + the ip with the lowest src-ip/dst-ip/proto usage. +*/ +static void +find_best_ips_proto(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + const struct ip_conntrack *conntrack, + enum ip_nat_manip_type maniptype) +{ + u_int32_t *var_ipp; + /* Host order */ + u_int32_t minip, maxip, j; + + /* No IP mapping? Do nothing. 
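Only IP_NAT_RANGE_MAP_IPS asks for the address itself to be rewritten; + otherwise just the per-proto part is made unique later. With a pool such as + 10.0.0.1-10.0.0.4 the pick below is minip + jhash_2words(src, dst, 0) % 4, + so a given client/server pair always maps to the same pool address.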
*/ + if (!(range->flags & IP_NAT_RANGE_MAP_IPS)) + return; + + if (maniptype == IP_NAT_MANIP_SRC) + var_ipp = &tuple->src.ip; + else + var_ipp = &tuple->dst.ip; + + /* Fast path: only one choice. */ + if (range->min_ip == range->max_ip) { + *var_ipp = range->min_ip; + return; + } + + /* Hashing source and destination IPs gives a fairly even + * spread in practice (if there are a small number of IPs + * involved, there usually aren't that many connections + * anyway). The consistency means that servers see the same + * client coming from the same IP (some Internet Banking sites + * like this), even across reboots. */ + minip = ntohl(range->min_ip); + maxip = ntohl(range->max_ip); + j = jhash_2words(tuple->src.ip, tuple->dst.ip, 0); + *var_ipp = htonl(minip + j % (maxip - minip + 1)); +} + +/* Manipulate the tuple into the range given. For NF_IP_POST_ROUTING, + * we change the source to map into the range. For NF_IP_PRE_ROUTING + * and NF_IP_LOCAL_OUT, we change the destination to map into the + * range. It might not be possible to get a unique tuple, but we try. + * At worst (or if we race), we will end up with a final duplicate in + * __ip_conntrack_confirm and drop the packet. */ +static void +get_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_conntrack_tuple *orig_tuple, + const struct ip_nat_range *range, + struct ip_conntrack *conntrack, + enum ip_nat_manip_type maniptype) +{ + struct ip_nat_protocol *proto + = ip_nat_find_proto(orig_tuple->dst.protonum); + + /* 1) If this srcip/proto/src-proto-part is currently mapped, + and that same mapping gives a unique tuple within the given + range, use that. + + This is only required for source (ie. NAT/masq) mappings. + So far, we don't do local source mappings, so multiple + manips not an issue. */ + if (maniptype == IP_NAT_MANIP_SRC) { + if (find_appropriate_src(orig_tuple, tuple, range)) { + DEBUGP("get_unique_tuple: Found current src map\n"); + if (!ip_nat_used_tuple(tuple, conntrack)) + return; + } + } + + /* 2) Select the least-used IP/proto combination in the given + range. */ + *tuple = *orig_tuple; + find_best_ips_proto(tuple, range, conntrack, maniptype); + + /* 3) The per-protocol part of the manip is made to map into + the range to make a unique tuple. */ + + /* Only bother mapping if it's not already in range and unique */ + if ((!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) + || proto->in_range(tuple, maniptype, &range->min, &range->max)) + && !ip_nat_used_tuple(tuple, conntrack)) + return; + + /* Last change: get protocol to try to obtain unique tuple. */ + proto->unique_tuple(tuple, range, maniptype, conntrack); +} + +unsigned int +ip_nat_setup_info(struct ip_conntrack *conntrack, + const struct ip_nat_range *range, + unsigned int hooknum) +{ + struct ip_conntrack_tuple curr_tuple, new_tuple; + struct ip_nat_info *info = &conntrack->nat.info; + int have_to_hash = !(conntrack->status & IPS_NAT_DONE_MASK); + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_POST_ROUTING + || hooknum == NF_IP_LOCAL_IN + || hooknum == NF_IP_LOCAL_OUT); + BUG_ON(ip_nat_initialized(conntrack, maniptype)); + + /* What we've got will look like inverse of reply. 
Normally + this is what is in the conntrack, except for prior + manipulations (future optimization: if num_manips == 0, + orig_tp = + conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple) */ + invert_tuplepr(&curr_tuple, + &conntrack->tuplehash[IP_CT_DIR_REPLY].tuple); + + get_unique_tuple(&new_tuple, &curr_tuple, range, conntrack, maniptype); + + if (!ip_ct_tuple_equal(&new_tuple, &curr_tuple)) { + struct ip_conntrack_tuple reply; + + /* Alter conntrack table so will recognize replies. */ + invert_tuplepr(&reply, &new_tuple); + ip_conntrack_alter_reply(conntrack, &reply); + + /* Non-atomic: we own this at the moment. */ + if (maniptype == IP_NAT_MANIP_SRC) + conntrack->status |= IPS_SRC_NAT; + else + conntrack->status |= IPS_DST_NAT; + } + + /* Place in source hash if this is the first time. */ + if (have_to_hash) { + unsigned int srchash + = hash_by_src(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL] + .tuple); + WRITE_LOCK(&ip_nat_lock); + list_add(&info->bysource, &bysource[srchash]); + WRITE_UNLOCK(&ip_nat_lock); + } + + /* It's done. */ + if (maniptype == IP_NAT_MANIP_DST) + set_bit(IPS_DST_NAT_DONE_BIT, &conntrack->status); + else + set_bit(IPS_SRC_NAT_DONE_BIT, &conntrack->status); + + return NF_ACCEPT; +} + +/* Returns true if succeeded. */ +static int +manip_pkt(u_int16_t proto, + struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *target, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph; + + (*pskb)->nfcache |= NFC_ALTERED; + if (!skb_ip_make_writable(pskb, iphdroff + sizeof(*iph))) + return 0; + + iph = (void *)(*pskb)->data + iphdroff; + + /* Manipulate protcol part. */ + if (!ip_nat_find_proto(proto)->manip_pkt(pskb, iphdroff, + target, maniptype)) + return 0; + + iph = (void *)(*pskb)->data + iphdroff; + + if (maniptype == IP_NAT_MANIP_SRC) { + iph->check = ip_nat_cheat_check(~iph->saddr, target->src.ip, + iph->check); + iph->saddr = target->src.ip; + } else { + iph->check = ip_nat_cheat_check(~iph->daddr, target->dst.ip, + iph->check); + iph->daddr = target->dst.ip; + } + return 1; +} + +/* Do packet manipulations according to ip_nat_setup_info. */ +unsigned int nat_packet(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int hooknum, + struct sk_buff **pskb) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + unsigned long statusbit; + enum ip_nat_manip_type mtype = HOOK2MANIP(hooknum); + + if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) + && (hooknum == NF_IP_POST_ROUTING || hooknum == NF_IP_LOCAL_IN)) { + DEBUGP("ip_nat_core: adjusting sequence number\n"); + /* future: put this in a l4-proto specific function, + * and call this function here. */ + if (!ip_nat_seq_adjust(pskb, ct, ctinfo)) + return NF_DROP; + } + + if (mtype == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + /* Non-atomic: these bits don't change. */ + if (ct->status & statusbit) { + struct ip_conntrack_tuple target; + + /* We are aiming to look like inverse of other direction. 
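For a source-NATed connection an original-direction packet must leave carrying + the mapped source, which is exactly the inverse of the stored reply tuple; + manip_pkt() then rewrites the chosen address and updates the IP checksum + incrementally via ip_nat_cheat_check().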
*/ + invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + + if (!manip_pkt(target.dst.protonum, pskb, 0, &target, mtype)) + return NF_DROP; + } + return NF_ACCEPT; +} + +/* Dir is direction ICMP is coming from (opposite to packet it contains) */ +int icmp_reply_translation(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_nat_manip_type manip, + enum ip_conntrack_dir dir) +{ + struct { + struct icmphdr icmp; + struct iphdr ip; + } *inside; + struct ip_conntrack_tuple inner, target; + int hdrlen = (*pskb)->nh.iph->ihl * 4; + + if (!skb_ip_make_writable(pskb, hdrlen + sizeof(*inside))) + return 0; + + inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + + /* We're actually going to mangle it beyond trivial checksum + adjustment, so make sure the current checksum is correct. */ + if ((*pskb)->ip_summed != CHECKSUM_UNNECESSARY) { + hdrlen = (*pskb)->nh.iph->ihl * 4; + if ((u16)csum_fold(skb_checksum(*pskb, hdrlen, + (*pskb)->len - hdrlen, 0))) + return 0; + } + + /* Must be RELATED */ + IP_NF_ASSERT((*pskb)->nfctinfo == IP_CT_RELATED || + (*pskb)->nfctinfo == IP_CT_RELATED+IP_CT_IS_REPLY); + + /* Redirects on non-null nats must be dropped, else they'll + start talking to each other without our translation, and be + confused... --RR */ + if (inside->icmp.type == ICMP_REDIRECT) { + /* If NAT isn't finished, assume it and drop. */ + if ((ct->status & IPS_NAT_DONE_MASK) != IPS_NAT_DONE_MASK) + return 0; + + if (ct->status & IPS_NAT_MASK) + return 0; + } + + DEBUGP("icmp_reply_translation: translating error %p manp %u dir %s\n", + *pskb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); + + if (!ip_ct_get_tuple(&inside->ip, *pskb, (*pskb)->nh.iph->ihl*4 + + sizeof(struct icmphdr) + inside->ip.ihl*4, + &inner, ip_ct_find_proto(inside->ip.protocol))) + return 0; + + /* Change inner back to look like incoming packet. We do the + opposite manip on this hook to normal, because it might not + pass all hooks (locally-generated ICMP). Consider incoming + packet: PREROUTING (DST manip), routing produces ICMP, goes + through POSTROUTING (which must correct the DST manip). */ + if (!manip_pkt(inside->ip.protocol, pskb, + (*pskb)->nh.iph->ihl*4 + + sizeof(inside->icmp), + &ct->tuplehash[!dir].tuple, + !manip)) + return 0; + + /* Reloading "inside" here since manip_pkt inner. */ + inside = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + inside->icmp.checksum = 0; + inside->icmp.checksum = csum_fold(skb_checksum(*pskb, hdrlen, + (*pskb)->len - hdrlen, + 0)); + + /* Change outer to look the reply to an incoming packet + * (proto 0 means don't invert per-proto part). */ + + /* Obviously, we need to NAT destination IP, but source IP + should be NAT'ed only if it is from a NAT'd host. + + Explanation: some people use NAT for anonymizing. Also, + CERT recommends dropping all packets from private IP + addresses (although ICMP errors from internal links with + such addresses are not too uncommon, as Alan Cox points + out) */ + if (manip != IP_NAT_MANIP_SRC + || ((*pskb)->nh.iph->saddr == ct->tuplehash[dir].tuple.src.ip)) { + invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, pskb, 0, &target, manip)) + return 0; + } + + return 1; +} + +/* Protocol registration. 
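A slot may only be claimed while it still holds ip_nat_unknown_protocol; a + second helper for the same protocol number gets -EBUSY.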
*/ +int ip_nat_protocol_register(struct ip_nat_protocol *proto) +{ + int ret = 0; + + WRITE_LOCK(&ip_nat_lock); + if (ip_nat_protos[proto->protonum] != &ip_nat_unknown_protocol) { + ret = -EBUSY; + goto out; + } + ip_nat_protos[proto->protonum] = proto; + out: + WRITE_UNLOCK(&ip_nat_lock); + return ret; +} + +/* Noone stores the protocol anywhere; simply delete it. */ +void ip_nat_protocol_unregister(struct ip_nat_protocol *proto) +{ + WRITE_LOCK(&ip_nat_lock); + ip_nat_protos[proto->protonum] = &ip_nat_unknown_protocol; + WRITE_UNLOCK(&ip_nat_lock); + + /* Someone could be still looking at the proto in a bh. */ + synchronize_net(); +} + +int __init ip_nat_init(void) +{ + size_t i; + + /* Leave them the same for the moment. */ + ip_nat_htable_size = ip_conntrack_htable_size; + + /* One vmalloc for both hash tables */ + bysource = vmalloc(sizeof(struct list_head) * ip_nat_htable_size); + if (!bysource) + return -ENOMEM; + + /* Sew in builtin protocols. */ + WRITE_LOCK(&ip_nat_lock); + for (i = 0; i < MAX_IP_NAT_PROTO; i++) + ip_nat_protos[i] = &ip_nat_unknown_protocol; + ip_nat_protos[IPPROTO_TCP] = &ip_nat_protocol_tcp; + ip_nat_protos[IPPROTO_UDP] = &ip_nat_protocol_udp; + ip_nat_protos[IPPROTO_ICMP] = &ip_nat_protocol_icmp; + WRITE_UNLOCK(&ip_nat_lock); + + for (i = 0; i < ip_nat_htable_size; i++) { + INIT_LIST_HEAD(&bysource[i]); + } + + /* FIXME: Man, this is a hack. */ + IP_NF_ASSERT(ip_conntrack_destroyed == NULL); + ip_conntrack_destroyed = &ip_nat_cleanup_conntrack; + + /* Initialize fake conntrack so that NAT will skip it */ + ip_conntrack_untracked.status |= IPS_NAT_DONE_MASK; + return 0; +} + +/* Clear NAT section of all conntracks, in case we're loaded again. */ +static int clean_nat(struct ip_conntrack *i, void *data) +{ + memset(&i->nat, 0, sizeof(i->nat)); + i->status &= ~(IPS_NAT_MASK | IPS_NAT_DONE_MASK | IPS_SEQ_ADJUST); + return 0; +} + +/* Not __exit: called from ip_nat_standalone.c:init_or_cleanup() --RR */ +void ip_nat_cleanup(void) +{ + ip_ct_iterate_cleanup(&clean_nat, NULL); + ip_conntrack_destroyed = NULL; + vfree(bysource); +} diff --git a/net/ipv4/netfilter/ip_nat_ftp.c b/net/ipv4/netfilter/ip_nat_ftp.c new file mode 100644 index 000000000000..c6000e794ad6 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_ftp.c @@ -0,0 +1,183 @@ +/* FTP extension for TCP NAT alteration. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("ftp NAT helper"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* FIXME: Time out? 
--RR */ + +static int +mangle_rfc959_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u32 *seq) +{ + char buffer[sizeof("nnn,nnn,nnn,nnn,nnn,nnn")]; + + sprintf(buffer, "%u,%u,%u,%u,%u,%u", + NIPQUAD(newip), port>>8, port&0xFF); + + DEBUGP("calling ip_nat_mangle_tcp_packet\n"); + + *seq += strlen(buffer) - matchlen; + return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, + matchlen, buffer, strlen(buffer)); +} + +/* |1|132.235.1.2|6275| */ +static int +mangle_eprt_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u32 *seq) +{ + char buffer[sizeof("|1|255.255.255.255|65535|")]; + + sprintf(buffer, "|1|%u.%u.%u.%u|%u|", NIPQUAD(newip), port); + + DEBUGP("calling ip_nat_mangle_tcp_packet\n"); + + *seq += strlen(buffer) - matchlen; + return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, + matchlen, buffer, strlen(buffer)); +} + +/* |1|132.235.1.2|6275| */ +static int +mangle_epsv_packet(struct sk_buff **pskb, + u_int32_t newip, + u_int16_t port, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u32 *seq) +{ + char buffer[sizeof("|||65535|")]; + + sprintf(buffer, "|||%u|", port); + + DEBUGP("calling ip_nat_mangle_tcp_packet\n"); + + *seq += strlen(buffer) - matchlen; + return ip_nat_mangle_tcp_packet(pskb, ct, ctinfo, matchoff, + matchlen, buffer, strlen(buffer)); +} + +static int (*mangle[])(struct sk_buff **, u_int32_t, u_int16_t, + unsigned int, + unsigned int, + struct ip_conntrack *, + enum ip_conntrack_info, + u32 *seq) += { [IP_CT_FTP_PORT] = mangle_rfc959_packet, + [IP_CT_FTP_PASV] = mangle_rfc959_packet, + [IP_CT_FTP_EPRT] = mangle_eprt_packet, + [IP_CT_FTP_EPSV] = mangle_epsv_packet +}; + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_ftp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + enum ip_ct_ftp_type type, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp, + u32 *seq) +{ + u_int32_t newip; + u_int16_t port; + int dir = CTINFO2DIR(ctinfo); + struct ip_conntrack *ct = exp->master; + + DEBUGP("FTP_NAT: type %i, off %u len %u\n", type, matchoff, matchlen); + + /* Connection will come from wherever this packet goes, hence !dir */ + newip = ct->tuplehash[!dir].tuple.dst.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + if (!mangle[type](pskb, newip, port, matchoff, matchlen, ct, ctinfo, + seq)) { + ip_conntrack_unexpect_related(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_ftp_hook = NULL; + /* Make sure noone calls it, meanwhile. 
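ip_nat_ftp_hook is read without locking by the conntrack ftp helper, so after + clearing it we let a grace period pass (synchronize_net()) before the module + text can go away.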
*/ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_ftp_hook); + ip_nat_ftp_hook = ip_nat_ftp; + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO __stringify(KBUILD_MODNAME) + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_helper.c b/net/ipv4/netfilter/ip_nat_helper.c new file mode 100644 index 000000000000..1637b96d8c01 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_helper.c @@ -0,0 +1,430 @@ +/* ip_nat_helper.c - generic support functions for NAT helpers + * + * (C) 2000-2002 Harald Welte + * (C) 2003-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 14 Jan 2002 Harald Welte : + * - add support for SACK adjustment + * 14 Mar 2002 Harald Welte : + * - merge SACK support into newnat API + * 16 Aug 2002 Brian J. Murrell : + * - make ip_nat_resize_packet more generic (TCP and UDP) + * - add ip_nat_mangle_udp_packet + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#define DUMP_OFFSET(x) printk("offset_before=%d, offset_after=%d, correction_pos=%u\n", x->offset_before, x->offset_after, x->correction_pos); +#else +#define DEBUGP(format, args...) +#define DUMP_OFFSET(x) +#endif + +static DECLARE_LOCK(ip_nat_seqofs_lock); + +/* Setup TCP sequence correction given this change at this sequence */ +static inline void +adjust_tcp_sequence(u32 seq, + int sizediff, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + int dir; + struct ip_nat_seq *this_way, *other_way; + + DEBUGP("ip_nat_resize_packet: old_size = %u, new_size = %u\n", + (*skb)->len, new_size); + + dir = CTINFO2DIR(ctinfo); + + this_way = &ct->nat.info.seq[dir]; + other_way = &ct->nat.info.seq[!dir]; + + DEBUGP("ip_nat_resize_packet: Seq_offset before: "); + DUMP_OFFSET(this_way); + + LOCK_BH(&ip_nat_seqofs_lock); + + /* SYN adjust. If it's uninitialized, or this is after last + * correction, record it: we don't handle more than one + * adjustment in the window, but do deal with common case of a + * retransmit */ + if (this_way->offset_before == this_way->offset_after + || before(this_way->correction_pos, seq)) { + this_way->correction_pos = seq; + this_way->offset_before = this_way->offset_after; + this_way->offset_after += sizediff; + } + UNLOCK_BH(&ip_nat_seqofs_lock); + + DEBUGP("ip_nat_resize_packet: Seq_offset after: "); + DUMP_OFFSET(this_way); +} + +/* Frobs data inside this packet, which is linear. 
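mangle_contents() memmove()s the tail of the payload to open (or close) the + gap, copies the replacement in, adjusts skb->len, and rewrites the IP total + length and header checksum; callers guarantee linearity by making the whole + skb writable first.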
*/ +static void mangle_contents(struct sk_buff *skb, + unsigned int dataoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + unsigned char *data; + + BUG_ON(skb_is_nonlinear(skb)); + data = (unsigned char *)skb->nh.iph + dataoff; + + /* move post-replacement */ + memmove(data + match_offset + rep_len, + data + match_offset + match_len, + skb->tail - (data + match_offset + match_len)); + + /* insert data from buffer */ + memcpy(data + match_offset, rep_buffer, rep_len); + + /* update skb info */ + if (rep_len > match_len) { + DEBUGP("ip_nat_mangle_packet: Extending packet by " + "%u from %u bytes\n", rep_len - match_len, + skb->len); + skb_put(skb, rep_len - match_len); + } else { + DEBUGP("ip_nat_mangle_packet: Shrinking packet from " + "%u from %u bytes\n", match_len - rep_len, + skb->len); + __skb_trim(skb, skb->len + rep_len - match_len); + } + + /* fix IP hdr checksum information */ + skb->nh.iph->tot_len = htons(skb->len); + ip_send_check(skb->nh.iph); +} + +/* Unusual, but possible case. */ +static int enlarge_skb(struct sk_buff **pskb, unsigned int extra) +{ + struct sk_buff *nskb; + + if ((*pskb)->len + extra > 65535) + return 0; + + nskb = skb_copy_expand(*pskb, skb_headroom(*pskb), extra, GFP_ATOMIC); + if (!nskb) + return 0; + + /* Transfer socket to new skb. */ + if ((*pskb)->sk) + skb_set_owner_w(nskb, (*pskb)->sk); +#ifdef CONFIG_NETFILTER_DEBUG + nskb->nf_debug = (*pskb)->nf_debug; +#endif + kfree_skb(*pskb); + *pskb = nskb; + return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX + * command in FTP). + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * */ +int +ip_nat_mangle_tcp_packet(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + struct iphdr *iph; + struct tcphdr *tcph; + int datalen; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return 0; + + if (rep_len > match_len + && rep_len - match_len > skb_tailroom(*pskb) + && !enlarge_skb(pskb, rep_len - match_len)) + return 0; + + SKB_LINEAR_ASSERT(*pskb); + + iph = (*pskb)->nh.iph; + tcph = (void *)iph + iph->ihl*4; + + mangle_contents(*pskb, iph->ihl*4 + tcph->doff*4, + match_offset, match_len, rep_buffer, rep_len); + + datalen = (*pskb)->len - iph->ihl*4; + tcph->check = 0; + tcph->check = tcp_v4_check(tcph, datalen, iph->saddr, iph->daddr, + csum_partial((char *)tcph, datalen, 0)); + + if (rep_len != match_len) { + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); + adjust_tcp_sequence(ntohl(tcph->seq), + (int)rep_len - (int)match_len, + ct, ctinfo); + /* Tell TCP window tracking about seq change */ + ip_conntrack_tcp_update(*pskb, ct, CTINFO2DIR(ctinfo)); + } + return 1; +} + +/* Generic function for mangling variable-length address changes inside + * NATed UDP connections (like the CONNECT DATA XXXXX MESG XXXXX INDEX XXXXX + * command in the Amanda protocol) + * + * Takes care about all the nasty sequence number changes, checksumming, + * skb enlargement, ... + * + * XXX - This function could be merged with ip_nat_mangle_tcp_packet which + * should be fairly easy to do. 
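The UDP case only has to refresh udph->len and, when the sender supplied a + checksum, recompute it; there is none of the sequence-number bookkeeping that + makes up the bulk of the TCP path.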
+ */ +int +ip_nat_mangle_udp_packet(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) +{ + struct iphdr *iph; + struct udphdr *udph; + + /* UDP helpers might accidentally mangle the wrong packet */ + iph = (*pskb)->nh.iph; + if ((*pskb)->len < iph->ihl*4 + sizeof(*udph) + + match_offset + match_len) + return 0; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return 0; + + if (rep_len > match_len + && rep_len - match_len > skb_tailroom(*pskb) + && !enlarge_skb(pskb, rep_len - match_len)) + return 0; + + iph = (*pskb)->nh.iph; + udph = (void *)iph + iph->ihl*4; + mangle_contents(*pskb, iph->ihl*4 + sizeof(*udph), + match_offset, match_len, rep_buffer, rep_len); + + /* update the length of the UDP packet */ + udph->len = htons((*pskb)->len - iph->ihl*4); + + /* fix udp checksum if udp checksum was previously calculated */ + if (udph->check) { + int datalen = (*pskb)->len - iph->ihl * 4; + udph->check = 0; + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, IPPROTO_UDP, + csum_partial((char *)udph, + datalen, 0)); + } + + return 1; +} + +/* Adjust one found SACK option including checksum correction */ +static void +sack_adjust(struct sk_buff *skb, + struct tcphdr *tcph, + unsigned int sackoff, + unsigned int sackend, + struct ip_nat_seq *natseq) +{ + while (sackoff < sackend) { + struct tcp_sack_block *sack; + u_int32_t new_start_seq, new_end_seq; + + sack = (void *)skb->data + sackoff; + if (after(ntohl(sack->start_seq) - natseq->offset_before, + natseq->correction_pos)) + new_start_seq = ntohl(sack->start_seq) + - natseq->offset_after; + else + new_start_seq = ntohl(sack->start_seq) + - natseq->offset_before; + new_start_seq = htonl(new_start_seq); + + if (after(ntohl(sack->end_seq) - natseq->offset_before, + natseq->correction_pos)) + new_end_seq = ntohl(sack->end_seq) + - natseq->offset_after; + else + new_end_seq = ntohl(sack->end_seq) + - natseq->offset_before; + new_end_seq = htonl(new_end_seq); + + DEBUGP("sack_adjust: start_seq: %d->%d, end_seq: %d->%d\n", + ntohl(sack->start_seq), new_start_seq, + ntohl(sack->end_seq), new_end_seq); + + tcph->check = + ip_nat_cheat_check(~sack->start_seq, new_start_seq, + ip_nat_cheat_check(~sack->end_seq, + new_end_seq, + tcph->check)); + sack->start_seq = new_start_seq; + sack->end_seq = new_end_seq; + sackoff += sizeof(*sack); + } +} + +/* TCP SACK sequence number adjustment */ +static inline unsigned int +ip_nat_sack_adjust(struct sk_buff **pskb, + struct tcphdr *tcph, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dir, optoff, optend; + + optoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct tcphdr); + optend = (*pskb)->nh.iph->ihl*4 + tcph->doff*4; + + if (!skb_ip_make_writable(pskb, optend)) + return 0; + + dir = CTINFO2DIR(ctinfo); + + while (optoff < optend) { + /* Usually: option, length. */ + unsigned char *op = (*pskb)->data + optoff; + + switch (op[0]) { + case TCPOPT_EOL: + return 1; + case TCPOPT_NOP: + optoff++; + continue; + default: + /* no partial options */ + if (optoff + 1 == optend + || optoff + op[1] > optend + || op[1] < 2) + return 0; + if (op[0] == TCPOPT_SACK + && op[1] >= 2+TCPOLEN_SACK_PERBLOCK + && ((op[1] - 2) % TCPOLEN_SACK_PERBLOCK) == 0) + sack_adjust(*pskb, tcph, optoff+2, + optoff+op[1], + &ct->nat.info.seq[!dir]); + optoff += op[1]; + } + } + return 1; +} + +/* TCP sequence number adjustment. 
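Before the sequence numbers themselves are rewritten by ip_nat_seq_adjust() below, the option walk above has to locate well-formed SACK blocks without tripping over truncated or bogus options. A hedged userspace sketch of that scan; find_sack and the sample option buffer are invented, and only the option-walking rules are carried over.

/* Hypothetical sketch of walking TCP options (kind, length, data) to find
 * a well-formed SACK option, mirroring the loop in ip_nat_sack_adjust(). */
#include <stdio.h>

#define TCPOPT_EOL              0
#define TCPOPT_NOP              1
#define TCPOPT_SACK             5
#define TCPOLEN_SACK_PERBLOCK   8

/* Returns the offset of the SACK option, or -1 if none or garbled. */
static int find_sack(const unsigned char *opt, int optlen)
{
        int off = 0;

        while (off < optlen) {
                switch (opt[off]) {
                case TCPOPT_EOL:
                        return -1;
                case TCPOPT_NOP:
                        off++;
                        continue;
                default:
                        /* Every other option carries a length byte. */
                        if (off + 1 == optlen || opt[off + 1] < 2 ||
                            off + opt[off + 1] > optlen)
                                return -1;      /* truncated or bogus */
                        if (opt[off] == TCPOPT_SACK &&
                            opt[off + 1] >= 2 + TCPOLEN_SACK_PERBLOCK &&
                            (opt[off + 1] - 2) % TCPOLEN_SACK_PERBLOCK == 0)
                                return off;
                        off += opt[off + 1];
                }
        }
        return -1;
}

int main(void)
{
        /* NOP, NOP, then SACK with one 8-byte block (all-zero seq values). */
        unsigned char opt[] = { 1, 1, 5, 10, 0, 0, 0, 0, 0, 0, 0, 0 };

        printf("SACK option at offset %d\n", find_sack(opt, (int)sizeof(opt)));
        return 0;
}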
Returns 1 on success, 0 on failure */ +int +ip_nat_seq_adjust(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + struct tcphdr *tcph; + int dir, newseq, newack; + struct ip_nat_seq *this_way, *other_way; + + dir = CTINFO2DIR(ctinfo); + + this_way = &ct->nat.info.seq[dir]; + other_way = &ct->nat.info.seq[!dir]; + + if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + return 0; + + tcph = (void *)(*pskb)->data + (*pskb)->nh.iph->ihl*4; + if (after(ntohl(tcph->seq), this_way->correction_pos)) + newseq = ntohl(tcph->seq) + this_way->offset_after; + else + newseq = ntohl(tcph->seq) + this_way->offset_before; + newseq = htonl(newseq); + + if (after(ntohl(tcph->ack_seq) - other_way->offset_before, + other_way->correction_pos)) + newack = ntohl(tcph->ack_seq) - other_way->offset_after; + else + newack = ntohl(tcph->ack_seq) - other_way->offset_before; + newack = htonl(newack); + + tcph->check = ip_nat_cheat_check(~tcph->seq, newseq, + ip_nat_cheat_check(~tcph->ack_seq, + newack, + tcph->check)); + + DEBUGP("Adjusting sequence number from %u->%u, ack from %u->%u\n", + ntohl(tcph->seq), ntohl(newseq), ntohl(tcph->ack_seq), + ntohl(newack)); + + tcph->seq = newseq; + tcph->ack_seq = newack; + + if (!ip_nat_sack_adjust(pskb, tcph, ct, ctinfo)) + return 0; + + ip_conntrack_tcp_update(*pskb, ct, dir); + + return 1; +} + +/* Setup NAT on this expected conntrack so it follows master. */ +/* If we fail to get a free NAT slot, we'll get dropped on confirm */ +void ip_nat_follow_master(struct ip_conntrack *ct, + struct ip_conntrack_expect *exp) +{ + struct ip_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(ct->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.dst.ip; + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = exp->saved_proto; + range.min_ip = range.max_ip + = ct->master->tuplehash[!exp->dir].tuple.src.ip; + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); +} diff --git a/net/ipv4/netfilter/ip_nat_irc.c b/net/ipv4/netfilter/ip_nat_irc.c new file mode 100644 index 000000000000..9c1ca3381d56 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_irc.c @@ -0,0 +1,125 @@ +/* IRC extension for TCP NAT alteration. + * (C) 2000-2001 by Harald Welte + * (C) 2004 Rusty Russell IBM Corporation + * based on a copy of RR's ip_nat_ftp.c + * + * ip_nat_irc.c,v 1.16 2001/12/06 07:42:10 laforge Exp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IRC (DCC) NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + unsigned int matchoff, + unsigned int matchlen, + struct ip_conntrack_expect *exp) +{ + u_int16_t port; + unsigned int ret; + + /* "4294967296 65635 " */ + char buffer[18]; + + DEBUGP("IRC_NAT: info (seq %u + %u) in %u\n", + expect->seq, exp_irc_info->len, + ntohl(tcph->seq)); + + /* Reply comes from server. */ + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + + /* When you see the packet, we need to NAT it the same as the + * this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + exp->tuple.dst.u.tcp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + + /* strlen("\1DCC CHAT chat AAAAAAAA P\1\n")=27 + * strlen("\1DCC SCHAT chat AAAAAAAA P\1\n")=28 + * strlen("\1DCC SEND F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC MOVE F AAAAAAAA P S\1\n")=26 + * strlen("\1DCC TSEND F AAAAAAAA P S\1\n")=27 + * AAAAAAAAA: bound addr (1.0.0.0==16777216, min 8 digits, + * 255.255.255.255==4294967296, 10 digits) + * P: bound port (min 1 d, max 5d (65635)) + * F: filename (min 1 d ) + * S: size (min 1 d ) + * 0x01, \n: terminators + */ + + /* AAA = "us", ie. where server normally talks to. */ + sprintf(buffer, "%u %u", + ntohl(exp->master->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip), + port); + DEBUGP("ip_nat_irc: Inserting '%s' == %u.%u.%u.%u, port %u\n", + buffer, NIPQUAD(exp->tuple.src.ip), port); + + ret = ip_nat_mangle_tcp_packet(pskb, exp->master, ctinfo, + matchoff, matchlen, buffer, + strlen(buffer)); + if (ret != NF_ACCEPT) + ip_conntrack_unexpect_related(exp); + return ret; +} + +static void __exit fini(void) +{ + ip_nat_irc_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_irc_hook); + ip_nat_irc_hook = help; + return 0; +} + +/* Prior to 2.6.11, we had a ports param. No longer, but don't break users. */ +static int warn_set(const char *val, struct kernel_param *kp) +{ + printk(KERN_INFO __stringify(KBUILD_MODNAME) + ": kernel >= 2.6.10 only uses 'ports' for conntrack modules\n"); + return 0; +} +module_param_call(ports, warn_set, NULL, NULL, 0); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_proto_icmp.c b/net/ipv4/netfilter/ip_nat_proto_icmp.c new file mode 100644 index 000000000000..a558cf0eee8a --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_icmp.c @@ -0,0 +1,115 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
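Looking back at the IRC helper above: the address it writes into the DCC argument is the 32-bit IPv4 address printed as a single host-order decimal number, which is why a buffer of 18 bytes is enough (at most 10 digits, a space, 5 digits and a terminator). A hedged sketch of that encoding; the address and port below are made-up examples.

/* Hypothetical sketch of the "address port" text the IRC helper inserts
 * into a rewritten DCC request. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint8_t a = 192, b = 0, c = 2, d = 1;   /* 192.0.2.1, example address */
        unsigned int port = 6667;
        char buffer[18];        /* 10 digits + ' ' + 5 digits + '\0' */

        uint32_t ip = ((uint32_t)a << 24) | ((uint32_t)b << 16) |
                      ((uint32_t)c << 8) | d;

        snprintf(buffer, sizeof(buffer), "%u %u", (unsigned)ip, port);
        printf("DCC argument: \"%s\"\n", buffer);  /* "3221225985 6667" */
        return 0;
}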
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int +icmp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return (tuple->src.u.icmp.id >= min->icmp.id + && tuple->src.u.icmp.id <= max->icmp.id); +} + +static int +icmp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t id; + unsigned int range_size + = (unsigned int)range->max.icmp.id - range->min.icmp.id + 1; + unsigned int i; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) + range_size = 0xFFFF; + + for (i = 0; i < range_size; i++, id++) { + tuple->src.u.icmp.id = range->min.icmp.id + (id % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +icmp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct icmphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + hdr = (struct icmphdr *)((*pskb)->data + hdroff); + + hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF, + tuple->src.u.icmp.id, + hdr->checksum); + hdr->un.echo.id = tuple->src.u.icmp.id; + return 1; +} + +static unsigned int +icmp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.icmp.id) + len += sprintf(buffer + len, "id=%u ", + ntohs(match->src.u.icmp.id)); + + if (mask->dst.u.icmp.type) + len += sprintf(buffer + len, "type=%u ", + ntohs(match->dst.u.icmp.type)); + + if (mask->dst.u.icmp.code) + len += sprintf(buffer + len, "code=%u ", + ntohs(match->dst.u.icmp.code)); + + return len; +} + +static unsigned int +icmp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.icmp.id != 0 || range->max.icmp.id != 0xFFFF) + return sprintf(buffer, "id %u-%u ", + ntohs(range->min.icmp.id), + ntohs(range->max.icmp.id)); + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_icmp += { "ICMP", IPPROTO_ICMP, + icmp_manip_pkt, + icmp_in_range, + icmp_unique_tuple, + icmp_print, + icmp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_tcp.c b/net/ipv4/netfilter/ip_nat_proto_tcp.c new file mode 100644 index 000000000000..a91cfceff272 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_tcp.c @@ -0,0 +1,178 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
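icmp_manip_pkt() above, like the TCP and UDP versions that follow, patches the existing checksum instead of recomputing it over the whole packet. Below is a hedged userspace sketch of that incremental update, written in the RFC 1624 form HC' = ~(~HC + ~m + m') rather than with the kernel's ip_nat_cheat_check() signature; fold and csum_replace16 are invented names.

/* Hypothetical sketch of updating a ones'-complement checksum in place
 * when one 16-bit field changes from 'oldv' to 'newv'. */
#include <stdint.h>
#include <stdio.h>

static uint16_t fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)sum;
}

/* RFC 1624: HC' = ~(~HC + ~m + m') */
static uint16_t csum_replace16(uint16_t check, uint16_t oldv, uint16_t newv)
{
        uint32_t sum = (uint16_t)~check;

        sum += (uint16_t)~oldv;
        sum += newv;
        return (uint16_t)~fold(sum);
}

int main(void)
{
        /* A checksum computed over two 16-bit words. */
        uint16_t a = 0x1234, b = 0xabcd;
        uint16_t check = (uint16_t)~fold((uint32_t)a + b);

        /* Change the second word and patch the checksum incrementally. */
        uint16_t b2 = 0xbeef;
        uint16_t patched = csum_replace16(check, b, b2);
        uint16_t full = (uint16_t)~fold((uint32_t)a + b2);

        printf("incremental %#06x, recomputed %#06x\n",
               (unsigned)patched, (unsigned)full);
        return 0;
}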
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +tcp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.tcp.port; + else + port = tuple->dst.u.tcp.port; + + return ntohs(port) >= ntohs(min->tcp.port) + && ntohs(port) <= ntohs(max->tcp.port); +} + +static int +tcp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.tcp.port; + else + portptr = &tuple->dst.u.tcp.port; + + /* If no range specified... */ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + /* Map privileged onto privileged. */ + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.tcp.port); + range_size = ntohs(range->max.tcp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) { + return 1; + } + } + return 0; +} + +static int +tcp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct tcphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport, oldport; + int hdrsize = 8; /* TCP connection tracking guarantees this much */ + + /* this could be a inner header returned in icmp packet; in such + cases we cannot update the checksum field since it is outside of + the 8 bytes of transport layer headers we are guaranteed */ + if ((*pskb)->len >= hdroff + sizeof(struct tcphdr)) + hdrsize = sizeof(struct tcphdr); + + if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct tcphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.tcp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.tcp.port; + portptr = &hdr->dest; + } + + oldport = *portptr; + *portptr = newport; + + if (hdrsize < sizeof(*hdr)) + return 1; + + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(oldport ^ 0xFFFF, + newport, + hdr->check)); + return 1; +} + +static unsigned int +tcp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.tcp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.tcp.port)); + + + if (mask->dst.u.tcp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.tcp.port)); + + return len; +} + +static unsigned int 
+tcp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) { + if (range->min.tcp.port == range->max.tcp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.tcp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.tcp.port), + ntohs(range->max.tcp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_tcp += { "TCP", IPPROTO_TCP, + tcp_manip_pkt, + tcp_in_range, + tcp_unique_tuple, + tcp_print, + tcp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_udp.c b/net/ipv4/netfilter/ip_nat_proto_udp.c new file mode 100644 index 000000000000..c669e3b5f5d0 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_udp.c @@ -0,0 +1,165 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +static int +udp_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + u_int16_t port; + + if (maniptype == IP_NAT_MANIP_SRC) + port = tuple->src.u.udp.port; + else + port = tuple->dst.u.udp.port; + + return ntohs(port) >= ntohs(min->udp.port) + && ntohs(port) <= ntohs(max->udp.port); +} + +static int +udp_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + static u_int16_t port, *portptr; + unsigned int range_size, min, i; + + if (maniptype == IP_NAT_MANIP_SRC) + portptr = &tuple->src.u.udp.port; + else + portptr = &tuple->dst.u.udp.port; + + /* If no range specified... 
*/ + if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) { + /* If it's dst rewrite, can't change port */ + if (maniptype == IP_NAT_MANIP_DST) + return 0; + + if (ntohs(*portptr) < 1024) { + /* Loose convention: >> 512 is credential passing */ + if (ntohs(*portptr)<512) { + min = 1; + range_size = 511 - min + 1; + } else { + min = 600; + range_size = 1023 - min + 1; + } + } else { + min = 1024; + range_size = 65535 - 1024 + 1; + } + } else { + min = ntohs(range->min.udp.port); + range_size = ntohs(range->max.udp.port) - min + 1; + } + + for (i = 0; i < range_size; i++, port++) { + *portptr = htons(min + port % range_size); + if (!ip_nat_used_tuple(tuple, conntrack)) + return 1; + } + return 0; +} + +static int +udp_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); + struct udphdr *hdr; + unsigned int hdroff = iphdroff + iph->ihl*4; + u32 oldip, newip; + u16 *portptr, newport; + + if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) + return 0; + + iph = (struct iphdr *)((*pskb)->data + iphdroff); + hdr = (struct udphdr *)((*pskb)->data + hdroff); + + if (maniptype == IP_NAT_MANIP_SRC) { + /* Get rid of src ip and src pt */ + oldip = iph->saddr; + newip = tuple->src.ip; + newport = tuple->src.u.udp.port; + portptr = &hdr->source; + } else { + /* Get rid of dst ip and dst pt */ + oldip = iph->daddr; + newip = tuple->dst.ip; + newport = tuple->dst.u.udp.port; + portptr = &hdr->dest; + } + if (hdr->check) /* 0 is a special case meaning no checksum */ + hdr->check = ip_nat_cheat_check(~oldip, newip, + ip_nat_cheat_check(*portptr ^ 0xFFFF, + newport, + hdr->check)); + *portptr = newport; + return 1; +} + +static unsigned int +udp_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + unsigned int len = 0; + + if (mask->src.u.udp.port) + len += sprintf(buffer + len, "srcpt=%u ", + ntohs(match->src.u.udp.port)); + + + if (mask->dst.u.udp.port) + len += sprintf(buffer + len, "dstpt=%u ", + ntohs(match->dst.u.udp.port)); + + return len; +} + +static unsigned int +udp_print_range(char *buffer, const struct ip_nat_range *range) +{ + if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) { + if (range->min.udp.port == range->max.udp.port) + return sprintf(buffer, "port %u ", + ntohs(range->min.udp.port)); + else + return sprintf(buffer, "ports %u-%u ", + ntohs(range->min.udp.port), + ntohs(range->max.udp.port)); + } + else return 0; +} + +struct ip_nat_protocol ip_nat_protocol_udp += { "UDP", IPPROTO_UDP, + udp_manip_pkt, + udp_in_range, + udp_unique_tuple, + udp_print, + udp_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_proto_unknown.c b/net/ipv4/netfilter/ip_nat_proto_unknown.c new file mode 100644 index 000000000000..f5525bd58d16 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_proto_unknown.c @@ -0,0 +1,70 @@ +/* The "unknown" protocol. This is what is used for protocols we + * don't understand. It's returned by ip_ct_find_proto(). + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
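The default policy in tcp_unique_tuple()/udp_unique_tuple() above keeps a rewritten source port in the same class as the original: low privileged ports stay below 512, the 512 to 1023 band maps into 600 to 1023, and unprivileged ports stay at or above 1024. A stand-alone sketch of just that policy; pick_range is an invented name and the sample ports are arbitrary.

/* Hypothetical sketch of the default source-port class selection. */
#include <stdio.h>

static void pick_range(unsigned int origport,
                       unsigned int *min, unsigned int *size)
{
        if (origport < 1024) {
                if (origport < 512) {           /* stay below 512 */
                        *min = 1;
                        *size = 511;
                } else {                        /* 512..1023 -> 600..1023 */
                        *min = 600;
                        *size = 1023 - 600 + 1;
                }
        } else {                                /* unprivileged stays unprivileged */
                *min = 1024;
                *size = 65535 - 1024 + 1;
        }
}

int main(void)
{
        unsigned int samples[] = { 53, 514, 40000 };
        unsigned int min, size;

        for (unsigned int i = 0; i < 3; i++) {
                pick_range(samples[i], &min, &size);
                printf("port %5u -> candidates %u..%u\n",
                       samples[i], min, min + size - 1);
        }
        return 0;
}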
+ */ + +#include +#include +#include +#include + +#include +#include +#include + +static int unknown_in_range(const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type manip_type, + const union ip_conntrack_manip_proto *min, + const union ip_conntrack_manip_proto *max) +{ + return 1; +} + +static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, + const struct ip_nat_range *range, + enum ip_nat_manip_type maniptype, + const struct ip_conntrack *conntrack) +{ + /* Sorry: we can't help you; if it's not unique, we can't frob + anything. */ + return 0; +} + +static int +unknown_manip_pkt(struct sk_buff **pskb, + unsigned int iphdroff, + const struct ip_conntrack_tuple *tuple, + enum ip_nat_manip_type maniptype) +{ + return 1; +} + +static unsigned int +unknown_print(char *buffer, + const struct ip_conntrack_tuple *match, + const struct ip_conntrack_tuple *mask) +{ + return 0; +} + +static unsigned int +unknown_print_range(char *buffer, const struct ip_nat_range *range) +{ + return 0; +} + +struct ip_nat_protocol ip_nat_unknown_protocol = { + "unknown", 0, + unknown_manip_pkt, + unknown_in_range, + unknown_unique_tuple, + unknown_print, + unknown_print_range +}; diff --git a/net/ipv4/netfilter/ip_nat_rule.c b/net/ipv4/netfilter/ip_nat_rule.c new file mode 100644 index 000000000000..581f097f5a24 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_rule.c @@ -0,0 +1,319 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* Everything about the rules for NAT. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define NAT_VALID_HOOKS ((1<range[0], hooknum); +} + +/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */ +static void warn_if_extra_mangle(u32 dstip, u32 srcip) +{ + static int warned = 0; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } }; + struct rtable *rt; + + if (ip_route_output_key(&rt, &fl) != 0) + return; + + if (rt->rt_src != srcip && !warned) { + printk("NAT: no longer support implicit source local NAT\n"); + printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n", + NIPQUAD(srcip), NIPQUAD(dstip)); + warned = 1; + } + ip_rt_put(rt); +} + +static unsigned int ipt_dnat_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr = targinfo; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + + /* Connection must be valid and new. 
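Stepping back: the ICMP, TCP, UDP and "unknown" tables above all expose the same small set of operations, and NAT falls back to the unknown entry for protocols it cannot rewrite. A hedged sketch of that dispatch pattern; the struct layout and names here are simplified inventions, not the real ip_nat_protocol.

/* Hypothetical sketch of the per-protocol ops-table pattern with an
 * "unknown" fallback that never claims a unique tuple. */
#include <stdio.h>

struct nat_proto_ops {
        const char *name;
        unsigned int protonum;
        int (*unique_tuple)(unsigned int *port); /* 1 = found a mapping */
};

static int tcp_pick(unsigned int *port)     { *port = 40000; return 1; }
static int unknown_pick(unsigned int *port) { (void)port; return 0; }

static const struct nat_proto_ops tcp_ops     = { "TCP", 6, tcp_pick };
static const struct nat_proto_ops unknown_ops = { "unknown", 0, unknown_pick };

static const struct nat_proto_ops *find_proto(unsigned int protonum)
{
        if (protonum == tcp_ops.protonum)
                return &tcp_ops;
        return &unknown_ops;    /* not understood: nothing can be frobbed */
}

int main(void)
{
        unsigned int port = 0;
        const struct nat_proto_ops *ops;

        ops = find_proto(47);   /* a protocol without a helper in this sketch */
        printf("%s: unique_tuple=%d\n", ops->name, ops->unique_tuple(&port));

        ops = find_proto(6);
        printf("%s: unique_tuple=%d port=%u\n",
               ops->name, ops->unique_tuple(&port), port);
        return 0;
}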
*/ + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + if (hooknum == NF_IP_LOCAL_OUT + && mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) + warn_if_extra_mangle((*pskb)->nh.iph->daddr, + mr->range[0].min_ip); + + return ip_nat_setup_info(ct, &mr->range[0], hooknum); +} + +static int ipt_snat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range_compat *mr = targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + printk("SNAT: multiple ranges no longer supported\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) { + DEBUGP("SNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + /* Only allow these for NAT. */ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("SNAT: wrong table %s\n", tablename); + return 0; + } + + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("SNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + return 1; +} + +static int ipt_dnat_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ip_nat_multi_range_compat *mr = targinfo; + + /* Must be a valid range */ + if (mr->rangesize != 1) { + printk("DNAT: multiple ranges no longer supported\n"); + return 0; + } + + if (targinfosize != IPT_ALIGN(sizeof(struct ip_nat_multi_range_compat))) { + DEBUGP("DNAT: Target size %u wrong for %u ranges\n", + targinfosize, mr->rangesize); + return 0; + } + + /* Only allow these for NAT. */ + if (strcmp(tablename, "nat") != 0) { + DEBUGP("DNAT: wrong table %s\n", tablename); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("DNAT: hook mask 0x%x bad\n", hook_mask); + return 0; + } + + return 1; +} + +inline unsigned int +alloc_null_binding(struct ip_conntrack *conntrack, + struct ip_nat_info *info, + unsigned int hooknum) +{ + /* Force range to this IP; let proto decide mapping for + per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). + Use reply in case it's already been mangled (eg local packet). + */ + u_int32_t ip + = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC + ? 
conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip + : conntrack->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + struct ip_nat_range range + = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; + + DEBUGP("Allocating NULL binding for %p (%u.%u.%u.%u)\n", conntrack, + NIPQUAD(ip)); + return ip_nat_setup_info(conntrack, &range, hooknum); +} + +int ip_nat_rule_find(struct sk_buff **pskb, + unsigned int hooknum, + const struct net_device *in, + const struct net_device *out, + struct ip_conntrack *ct, + struct ip_nat_info *info) +{ + int ret; + + ret = ipt_do_table(pskb, hooknum, in, out, &nat_table, NULL); + + if (ret == NF_ACCEPT) { + if (!ip_nat_initialized(ct, HOOK2MANIP(hooknum))) + /* NUL mapping */ + ret = alloc_null_binding(ct, info, hooknum); + } + return ret; +} + +static struct ipt_target ipt_snat_reg = { + .name = "SNAT", + .target = ipt_snat_target, + .checkentry = ipt_snat_checkentry, +}; + +static struct ipt_target ipt_dnat_reg = { + .name = "DNAT", + .target = ipt_dnat_target, + .checkentry = ipt_dnat_checkentry, +}; + +int __init ip_nat_rule_init(void) +{ + int ret; + + ret = ipt_register_table(&nat_table, &nat_initial_table.repl); + if (ret != 0) + return ret; + ret = ipt_register_target(&ipt_snat_reg); + if (ret != 0) + goto unregister_table; + + ret = ipt_register_target(&ipt_dnat_reg); + if (ret != 0) + goto unregister_snat; + + return ret; + + unregister_snat: + ipt_unregister_target(&ipt_snat_reg); + unregister_table: + ipt_unregister_table(&nat_table); + + return ret; +} + +void ip_nat_rule_cleanup(void) +{ + ipt_unregister_target(&ipt_dnat_reg); + ipt_unregister_target(&ipt_snat_reg); + ipt_unregister_table(&nat_table); +} diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c new file mode 100644 index 000000000000..2a48b6e635ae --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -0,0 +1,1347 @@ +/* + * ip_nat_snmp_basic.c + * + * Basic SNMP Application Layer Gateway + * + * This IP NAT module is intended for use with SNMP network + * discovery and monitoring applications where target networks use + * conflicting private address realms. + * + * Static NAT is used to remap the networks from the view of the network + * management system at the IP layer, and this module remaps some application + * layer addresses to match. + * + * The simplest form of ALG is performed, where only tagged IP addresses + * are modified. The module does not need to be MIB aware and only scans + * messages at the ASN.1/BER level. + * + * Currently, only SNMPv1 and SNMPv2 are supported. + * + * More information on ALG and associated issues can be found in + * RFC 2962 + * + * The ASB.1/BER parsing code is derived from the gxsnmp package by Gregory + * McLean & Jochen Friedrich, stripped down for use in the kernel. + * + * Copyright (c) 2000 RP Internet (www.rpi.net.au). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: James Morris + * + * Updates: + * 2000-08-06: Convert to new helper API (Harald Welte). + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("Basic SNMP Application Layer Gateway"); + +#define SNMP_PORT 161 +#define SNMP_TRAP_PORT 162 +#define NOCT1(n) (u_int8_t )((n) & 0xff) + +static int debug; +static DEFINE_SPINLOCK(snmp_lock); + +/* + * Application layer address mapping mimics the NAT mapping, but + * only for the first octet in this case (a more flexible system + * can be implemented if needed). + */ +struct oct1_map +{ + u_int8_t from; + u_int8_t to; +}; + + +/***************************************************************************** + * + * Basic ASN.1 decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* Class */ +#define ASN1_UNI 0 /* Universal */ +#define ASN1_APL 1 /* Application */ +#define ASN1_CTX 2 /* Context */ +#define ASN1_PRV 3 /* Private */ + +/* Tag */ +#define ASN1_EOC 0 /* End Of Contents */ +#define ASN1_BOL 1 /* Boolean */ +#define ASN1_INT 2 /* Integer */ +#define ASN1_BTS 3 /* Bit String */ +#define ASN1_OTS 4 /* Octet String */ +#define ASN1_NUL 5 /* Null */ +#define ASN1_OJI 6 /* Object Identifier */ +#define ASN1_OJD 7 /* Object Description */ +#define ASN1_EXT 8 /* External */ +#define ASN1_SEQ 16 /* Sequence */ +#define ASN1_SET 17 /* Set */ +#define ASN1_NUMSTR 18 /* Numerical String */ +#define ASN1_PRNSTR 19 /* Printable String */ +#define ASN1_TEXSTR 20 /* Teletext String */ +#define ASN1_VIDSTR 21 /* Video String */ +#define ASN1_IA5STR 22 /* IA5 String */ +#define ASN1_UNITIM 23 /* Universal Time */ +#define ASN1_GENTIM 24 /* General Time */ +#define ASN1_GRASTR 25 /* Graphical String */ +#define ASN1_VISSTR 26 /* Visible String */ +#define ASN1_GENSTR 27 /* General String */ + +/* Primitive / Constructed methods*/ +#define ASN1_PRI 0 /* Primitive */ +#define ASN1_CON 1 /* Constructed */ + +/* + * Error codes. + */ +#define ASN1_ERR_NOERROR 0 +#define ASN1_ERR_DEC_EMPTY 2 +#define ASN1_ERR_DEC_EOC_MISMATCH 3 +#define ASN1_ERR_DEC_LENGTH_MISMATCH 4 +#define ASN1_ERR_DEC_BADVALUE 5 + +/* + * ASN.1 context. 
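The class and tag constants above describe the BER identifier octet that the decoder below pulls apart. A hedged sketch of that layout, restricted to single-octet tags and short definite lengths; ber_hdr, ber_decode_hdr and the sample bytes are invented for this illustration.

/* Hypothetical sketch of decoding a BER identifier octet and a short
 * definite length, the common case in SNMPv1 messages. */
#include <stdio.h>

struct ber_hdr {
        unsigned int cls;       /* ASN1_UNI / ASN1_APL / ASN1_CTX / ASN1_PRV */
        unsigned int con;       /* primitive (0) or constructed (1) */
        unsigned int tag;
        unsigned int len;
};

/* Returns bytes consumed, or 0 for forms this sketch does not handle. */
static unsigned int ber_decode_hdr(const unsigned char *p, unsigned int avail,
                                   struct ber_hdr *h)
{
        if (avail < 2)
                return 0;
        h->cls = (p[0] & 0xC0) >> 6;
        h->con = (p[0] & 0x20) >> 5;
        h->tag = p[0] & 0x1F;
        if (h->tag == 0x1F || (p[1] & 0x80))
                return 0;       /* multi-octet tag, or long/indefinite length */
        h->len = p[1];
        return 2;
}

int main(void)
{
        /* 0x30 = universal, constructed, SEQUENCE; length 0x1d. */
        const unsigned char msg[] = { 0x30, 0x1d };
        struct ber_hdr h;

        if (ber_decode_hdr(msg, sizeof(msg), &h))
                printf("cls=%u con=%u tag=%u len=%u\n",
                       h.cls, h.con, h.tag, h.len);
        return 0;
}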
+ */ +struct asn1_ctx +{ + int error; /* Error condition */ + unsigned char *pointer; /* Octet just to be decoded */ + unsigned char *begin; /* First octet */ + unsigned char *end; /* Octet after last octet */ +}; + +/* + * Octet string (not null terminated) + */ +struct asn1_octstr +{ + unsigned char *data; + unsigned int len; +}; + +static void asn1_open(struct asn1_ctx *ctx, + unsigned char *buf, + unsigned int len) +{ + ctx->begin = buf; + ctx->end = buf + len; + ctx->pointer = buf; + ctx->error = ASN1_ERR_NOERROR; +} + +static unsigned char asn1_octet_decode(struct asn1_ctx *ctx, unsigned char *ch) +{ + if (ctx->pointer >= ctx->end) { + ctx->error = ASN1_ERR_DEC_EMPTY; + return 0; + } + *ch = *(ctx->pointer)++; + return 1; +} + +static unsigned char asn1_tag_decode(struct asn1_ctx *ctx, unsigned int *tag) +{ + unsigned char ch; + + *tag = 0; + + do + { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *tag <<= 7; + *tag |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_id_decode(struct asn1_ctx *ctx, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned char ch; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *cls = (ch & 0xC0) >> 6; + *con = (ch & 0x20) >> 5; + *tag = (ch & 0x1F); + + if (*tag == 0x1F) { + if (!asn1_tag_decode(ctx, tag)) + return 0; + } + return 1; +} + +static unsigned char asn1_length_decode(struct asn1_ctx *ctx, + unsigned int *def, + unsigned int *len) +{ + unsigned char ch, cnt; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch == 0x80) + *def = 0; + else { + *def = 1; + + if (ch < 0x80) + *len = ch; + else { + cnt = (unsigned char) (ch & 0x7F); + *len = 0; + + while (cnt > 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + *len <<= 8; + *len |= ch; + cnt--; + } + } + } + return 1; +} + +static unsigned char asn1_header_decode(struct asn1_ctx *ctx, + unsigned char **eoc, + unsigned int *cls, + unsigned int *con, + unsigned int *tag) +{ + unsigned int def, len; + + if (!asn1_id_decode(ctx, cls, con, tag)) + return 0; + + if (!asn1_length_decode(ctx, &def, &len)) + return 0; + + if (def) + *eoc = ctx->pointer + len; + else + *eoc = NULL; + return 1; +} + +static unsigned char asn1_eoc_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + unsigned char ch; + + if (eoc == 0) { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + if (ch != 0x00) { + ctx->error = ASN1_ERR_DEC_EOC_MISMATCH; + return 0; + } + return 1; + } else { + if (ctx->pointer != eoc) { + ctx->error = ASN1_ERR_DEC_LENGTH_MISMATCH; + return 0; + } + return 1; + } +} + +static unsigned char asn1_null_decode(struct asn1_ctx *ctx, unsigned char *eoc) +{ + ctx->pointer = eoc; + return 1; +} + +static unsigned char asn1_long_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = (signed char) ch; + len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_uint_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned int *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else 
len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned int)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_ulong_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long *integer) +{ + unsigned char ch; + unsigned int len; + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer = ch; + if (ch == 0) len = 0; + else len = 1; + + while (ctx->pointer < eoc) { + if (++len > sizeof (unsigned long)) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + return 0; + } + + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *integer <<= 8; + *integer |= ch; + } + return 1; +} + +static unsigned char asn1_octets_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned char **octets, + unsigned int *len) +{ + unsigned char *ptr; + + *len = 0; + + *octets = kmalloc(eoc - ctx->pointer, GFP_ATOMIC); + if (*octets == NULL) { + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + ptr = *octets; + while (ctx->pointer < eoc) { + if (!asn1_octet_decode(ctx, (unsigned char *)ptr++)) { + kfree(*octets); + *octets = NULL; + return 0; + } + (*len)++; + } + return 1; +} + +static unsigned char asn1_subid_decode(struct asn1_ctx *ctx, + unsigned long *subid) +{ + unsigned char ch; + + *subid = 0; + + do { + if (!asn1_octet_decode(ctx, &ch)) + return 0; + + *subid <<= 7; + *subid |= ch & 0x7F; + } while ((ch & 0x80) == 0x80); + return 1; +} + +static unsigned char asn1_oid_decode(struct asn1_ctx *ctx, + unsigned char *eoc, + unsigned long **oid, + unsigned int *len) +{ + unsigned long subid; + unsigned int size; + unsigned long *optr; + + size = eoc - ctx->pointer + 1; + *oid = kmalloc(size * sizeof(unsigned long), GFP_ATOMIC); + if (*oid == NULL) { + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + + optr = *oid; + + if (!asn1_subid_decode(ctx, &subid)) { + kfree(*oid); + *oid = NULL; + return 0; + } + + if (subid < 40) { + optr [0] = 0; + optr [1] = subid; + } else if (subid < 80) { + optr [0] = 1; + optr [1] = subid - 40; + } else { + optr [0] = 2; + optr [1] = subid - 80; + } + + *len = 2; + optr += 2; + + while (ctx->pointer < eoc) { + if (++(*len) > size) { + ctx->error = ASN1_ERR_DEC_BADVALUE; + kfree(*oid); + *oid = NULL; + return 0; + } + + if (!asn1_subid_decode(ctx, optr++)) { + kfree(*oid); + *oid = NULL; + return 0; + } + } + return 1; +} + +/***************************************************************************** + * + * SNMP decoding routines (gxsnmp author Dirk Wisse) + * + *****************************************************************************/ + +/* SNMP Versions */ +#define SNMP_V1 0 +#define SNMP_V2C 1 +#define SNMP_V2 2 +#define SNMP_V3 3 + +/* Default Sizes */ +#define SNMP_SIZE_COMM 256 +#define SNMP_SIZE_OBJECTID 128 +#define SNMP_SIZE_BUFCHR 256 +#define SNMP_SIZE_BUFINT 128 +#define SNMP_SIZE_SMALLOBJECTID 16 + +/* Requests */ +#define SNMP_PDU_GET 0 +#define SNMP_PDU_NEXT 1 +#define SNMP_PDU_RESPONSE 2 +#define SNMP_PDU_SET 3 +#define SNMP_PDU_TRAP1 4 +#define SNMP_PDU_BULK 5 +#define SNMP_PDU_INFORM 6 +#define SNMP_PDU_TRAP2 7 + +/* Errors */ +#define SNMP_NOERROR 0 +#define SNMP_TOOBIG 1 +#define SNMP_NOSUCHNAME 2 +#define SNMP_BADVALUE 3 +#define SNMP_READONLY 4 +#define SNMP_GENERROR 5 +#define SNMP_NOACCESS 6 +#define SNMP_WRONGTYPE 7 +#define SNMP_WRONGLENGTH 8 +#define SNMP_WRONGENCODING 9 +#define SNMP_WRONGVALUE 10 +#define SNMP_NOCREATION 
11 +#define SNMP_INCONSISTENTVALUE 12 +#define SNMP_RESOURCEUNAVAILABLE 13 +#define SNMP_COMMITFAILED 14 +#define SNMP_UNDOFAILED 15 +#define SNMP_AUTHORIZATIONERROR 16 +#define SNMP_NOTWRITABLE 17 +#define SNMP_INCONSISTENTNAME 18 + +/* General SNMP V1 Traps */ +#define SNMP_TRAP_COLDSTART 0 +#define SNMP_TRAP_WARMSTART 1 +#define SNMP_TRAP_LINKDOWN 2 +#define SNMP_TRAP_LINKUP 3 +#define SNMP_TRAP_AUTFAILURE 4 +#define SNMP_TRAP_EQPNEIGHBORLOSS 5 +#define SNMP_TRAP_ENTSPECIFIC 6 + +/* SNMPv1 Types */ +#define SNMP_NULL 0 +#define SNMP_INTEGER 1 /* l */ +#define SNMP_OCTETSTR 2 /* c */ +#define SNMP_DISPLAYSTR 2 /* c */ +#define SNMP_OBJECTID 3 /* ul */ +#define SNMP_IPADDR 4 /* uc */ +#define SNMP_COUNTER 5 /* ul */ +#define SNMP_GAUGE 6 /* ul */ +#define SNMP_TIMETICKS 7 /* ul */ +#define SNMP_OPAQUE 8 /* c */ + +/* Additional SNMPv2 Types */ +#define SNMP_UINTEGER 5 /* ul */ +#define SNMP_BITSTR 9 /* uc */ +#define SNMP_NSAP 10 /* uc */ +#define SNMP_COUNTER64 11 /* ul */ +#define SNMP_NOSUCHOBJECT 12 +#define SNMP_NOSUCHINSTANCE 13 +#define SNMP_ENDOFMIBVIEW 14 + +union snmp_syntax +{ + unsigned char uc[0]; /* 8 bit unsigned */ + char c[0]; /* 8 bit signed */ + unsigned long ul[0]; /* 32 bit unsigned */ + long l[0]; /* 32 bit signed */ +}; + +struct snmp_object +{ + unsigned long *id; + unsigned int id_len; + unsigned short type; + unsigned int syntax_len; + union snmp_syntax syntax; +}; + +struct snmp_request +{ + unsigned long id; + unsigned int error_status; + unsigned int error_index; +}; + +struct snmp_v1_trap +{ + unsigned long *id; + unsigned int id_len; + unsigned long ip_address; /* pointer */ + unsigned int general; + unsigned int specific; + unsigned long time; +}; + +/* SNMP types */ +#define SNMP_IPA 0 +#define SNMP_CNT 1 +#define SNMP_GGE 2 +#define SNMP_TIT 3 +#define SNMP_OPQ 4 +#define SNMP_C64 6 + +/* SNMP errors */ +#define SERR_NSO 0 +#define SERR_NSI 1 +#define SERR_EOM 2 + +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + u_int16_t *check); +struct snmp_cnv +{ + unsigned int class; + unsigned int tag; + int syntax; +}; + +static struct snmp_cnv snmp_conv [] = +{ + {ASN1_UNI, ASN1_NUL, SNMP_NULL}, + {ASN1_UNI, ASN1_INT, SNMP_INTEGER}, + {ASN1_UNI, ASN1_OTS, SNMP_OCTETSTR}, + {ASN1_UNI, ASN1_OTS, SNMP_DISPLAYSTR}, + {ASN1_UNI, ASN1_OJI, SNMP_OBJECTID}, + {ASN1_APL, SNMP_IPA, SNMP_IPADDR}, + {ASN1_APL, SNMP_CNT, SNMP_COUNTER}, /* Counter32 */ + {ASN1_APL, SNMP_GGE, SNMP_GAUGE}, /* Gauge32 == Unsigned32 */ + {ASN1_APL, SNMP_TIT, SNMP_TIMETICKS}, + {ASN1_APL, SNMP_OPQ, SNMP_OPAQUE}, + + /* SNMPv2 data types and errors */ + {ASN1_UNI, ASN1_BTS, SNMP_BITSTR}, + {ASN1_APL, SNMP_C64, SNMP_COUNTER64}, + {ASN1_CTX, SERR_NSO, SNMP_NOSUCHOBJECT}, + {ASN1_CTX, SERR_NSI, SNMP_NOSUCHINSTANCE}, + {ASN1_CTX, SERR_EOM, SNMP_ENDOFMIBVIEW}, + {0, 0, -1} +}; + +static unsigned char snmp_tag_cls2syntax(unsigned int tag, + unsigned int cls, + unsigned short *syntax) +{ + struct snmp_cnv *cnv; + + cnv = snmp_conv; + + while (cnv->syntax != -1) { + if (cnv->tag == tag && cnv->class == cls) { + *syntax = cnv->syntax; + return 1; + } + cnv++; + } + return 0; +} + +static unsigned char snmp_object_decode(struct asn1_ctx *ctx, + struct snmp_object **obj) +{ + unsigned int cls, con, tag, len, idlen; + unsigned short type; + unsigned char *eoc, *end, *p; + unsigned long *lp, *id; + unsigned long ul; + long l; + + *obj = NULL; + id = NULL; + + if (!asn1_header_decode(ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || 
con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &id, &idlen)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) { + kfree(id); + return 0; + } + + if (con != ASN1_PRI) { + kfree(id); + return 0; + } + + if (!snmp_tag_cls2syntax(tag, cls, &type)) { + kfree(id); + return 0; + } + + switch (type) { + case SNMP_INTEGER: + len = sizeof(long); + if (!asn1_long_decode(ctx, end, &l)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.l[0] = l; + break; + case SNMP_OCTETSTR: + case SNMP_OPAQUE: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, + GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.c, p, len); + kfree(p); + break; + case SNMP_NULL: + case SNMP_NOSUCHOBJECT: + case SNMP_NOSUCHINSTANCE: + case SNMP_ENDOFMIBVIEW: + len = 0; + *obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + if (!asn1_null_decode(ctx, end)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + break; + case SNMP_OBJECTID: + if (!asn1_oid_decode(ctx, end, (unsigned long **)&lp, &len)) { + kfree(id); + return 0; + } + len *= sizeof(unsigned long); + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.ul, lp, len); + kfree(lp); + break; + case SNMP_IPADDR: + if (!asn1_octets_decode(ctx, end, &p, &len)) { + kfree(id); + return 0; + } + if (len != 4) { + kfree(p); + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(p); + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + memcpy((*obj)->syntax.uc, p, len); + kfree(p); + break; + case SNMP_COUNTER: + case SNMP_GAUGE: + case SNMP_TIMETICKS: + len = sizeof(unsigned long); + if (!asn1_ulong_decode(ctx, end, &ul)) { + kfree(id); + return 0; + } + *obj = kmalloc(sizeof(struct snmp_object) + len, GFP_ATOMIC); + if (*obj == NULL) { + kfree(id); + if (net_ratelimit()) + printk("OOM in bsalg (%d)\n", __LINE__); + return 0; + } + (*obj)->syntax.ul[0] = ul; + break; + default: + kfree(id); + return 0; + } + + (*obj)->syntax_len = len; + (*obj)->type = type; + (*obj)->id = id; + (*obj)->id_len = idlen; + + if (!asn1_eoc_decode(ctx, eoc)) { + kfree(id); + kfree(*obj); + *obj = NULL; + return 0; + } + return 1; +} + +static unsigned char snmp_request_decode(struct asn1_ctx *ctx, + struct snmp_request *request) +{ + unsigned int cls, con, tag; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_ulong_decode(ctx, end, &request->id)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, 
&request->error_status)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + + if (!asn1_uint_decode(ctx, end, &request->error_index)) + return 0; + + return 1; +} + +/* + * Fast checksum update for possibly oddly-aligned UDP byte, from the + * code example in the draft. + */ +static void fast_csum(unsigned char *csum, + const unsigned char *optr, + const unsigned char *nptr, + int odd) +{ + long x, old, new; + + x = csum[0] * 256 + csum[1]; + + x =~ x & 0xFFFF; + + if (odd) old = optr[0] * 256; + else old = optr[0]; + + x -= old & 0xFFFF; + if (x <= 0) { + x--; + x &= 0xFFFF; + } + + if (odd) new = nptr[0] * 256; + else new = nptr[0]; + + x += new & 0xFFFF; + if (x & 0x10000) { + x++; + x &= 0xFFFF; + } + + x =~ x & 0xFFFF; + csum[0] = x / 256; + csum[1] = x & 0xFF; +} + +/* + * Mangle IP address. + * - begin points to the start of the snmp messgae + * - addr points to the start of the address + */ +static inline void mangle_address(unsigned char *begin, + unsigned char *addr, + const struct oct1_map *map, + u_int16_t *check) +{ + if (map->from == NOCT1(*addr)) { + u_int32_t old; + + if (debug) + memcpy(&old, (unsigned char *)addr, sizeof(old)); + + *addr = map->to; + + /* Update UDP checksum if being used */ + if (*check) { + unsigned char odd = !((addr - begin) % 2); + + fast_csum((unsigned char *)check, + &map->from, &map->to, odd); + + } + + if (debug) + printk(KERN_DEBUG "bsalg: mapped %u.%u.%u.%u to " + "%u.%u.%u.%u\n", NIPQUAD(old), NIPQUAD(*addr)); + } +} + +static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, + struct snmp_v1_trap *trap, + const struct oct1_map *map, + u_int16_t *check) +{ + unsigned int cls, con, tag, len; + unsigned char *end; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OJI) + return 0; + + if (!asn1_oid_decode(ctx, end, &trap->id, &trap->id_len)) + return 0; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_id_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_IPA) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_OTS))) + goto err_id_free; + + if (!asn1_octets_decode(ctx, end, (unsigned char **)&trap->ip_address, &len)) + goto err_id_free; + + /* IPv4 only */ + if (len != 4) + goto err_addr_free; + + mangle_address(ctx->begin, ctx->pointer - 4, map, check); + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->general)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + goto err_addr_free; + + if (!asn1_uint_decode(ctx, end, &trap->specific)) + goto err_addr_free; + + if (!asn1_header_decode(ctx, &end, &cls, &con, &tag)) + goto err_addr_free; + + if (!((cls == ASN1_APL && con == ASN1_PRI && tag == SNMP_TIT) || + (cls == ASN1_UNI && con == ASN1_PRI && tag == ASN1_INT))) + goto err_addr_free; + + if (!asn1_ulong_decode(ctx, end, &trap->time)) + goto err_addr_free; + + return 1; + +err_id_free: + kfree(trap->id); + +err_addr_free: + kfree((unsigned long *)trap->ip_address); + + return 0; +} + +/***************************************************************************** + * + * Misc. 
routines + * + *****************************************************************************/ + +static void hex_dump(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + if (i && !(i % 16)) + printk("\n"); + printk("%02x ", *(buf + i)); + } + printk("\n"); +} + +/* + * Parse and mangle SNMP message according to mapping. + * (And this is the fucking 'basic' method). + */ +static int snmp_parse_mangle(unsigned char *msg, + u_int16_t len, + const struct oct1_map *map, + u_int16_t *check) +{ + unsigned char *eoc, *end; + unsigned int cls, con, tag, vers, pdutype; + struct asn1_ctx ctx; + struct asn1_octstr comm; + struct snmp_object **obj; + + if (debug > 1) + hex_dump(msg, len); + + asn1_open(&ctx, msg, len); + + /* + * Start of SNMP message. + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + /* + * Version 1 or 2 handled. + */ + if (!asn1_header_decode(&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_INT) + return 0; + if (!asn1_uint_decode (&ctx, end, &vers)) + return 0; + if (debug > 1) + printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1); + if (vers > 1) + return 1; + + /* + * Community. + */ + if (!asn1_header_decode (&ctx, &end, &cls, &con, &tag)) + return 0; + if (cls != ASN1_UNI || con != ASN1_PRI || tag != ASN1_OTS) + return 0; + if (!asn1_octets_decode(&ctx, end, &comm.data, &comm.len)) + return 0; + if (debug > 1) { + unsigned int i; + + printk(KERN_DEBUG "bsalg: community: "); + for (i = 0; i < comm.len; i++) + printk("%c", comm.data[i]); + printk("\n"); + } + kfree(comm.data); + + /* + * PDU type + */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &pdutype)) + return 0; + if (cls != ASN1_CTX || con != ASN1_CON) + return 0; + if (debug > 1) { + unsigned char *pdus[] = { + [SNMP_PDU_GET] = "get", + [SNMP_PDU_NEXT] = "get-next", + [SNMP_PDU_RESPONSE] = "response", + [SNMP_PDU_SET] = "set", + [SNMP_PDU_TRAP1] = "trapv1", + [SNMP_PDU_BULK] = "bulk", + [SNMP_PDU_INFORM] = "inform", + [SNMP_PDU_TRAP2] = "trapv2" + }; + + if (pdutype > SNMP_PDU_TRAP2) + printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype); + else + printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]); + } + if (pdutype != SNMP_PDU_RESPONSE && + pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2) + return 1; + + /* + * Request header or v1 trap + */ + if (pdutype == SNMP_PDU_TRAP1) { + struct snmp_v1_trap trap; + unsigned char ret = snmp_trap_decode(&ctx, &trap, map, check); + + /* Discard trap allocations regardless */ + kfree(trap.id); + kfree((unsigned long *)trap.ip_address); + + if (!ret) + return ret; + + } else { + struct snmp_request req; + + if (!snmp_request_decode(&ctx, &req)) + return 0; + + if (debug > 1) + printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u " + "error_index=%u\n", req.id, req.error_status, + req.error_index); + } + + /* + * Loop through objects, look for IP addresses to mangle. 
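Each matching address is rewritten through the single-octet mapping held in struct oct1_map. A hedged sketch of that remapping with the UDP checksum fixup left out; the 10.x.x.x to 192.x.x.x mapping below is an invented example, not taken from the patch.

/* Hypothetical sketch of the first-octet remapping applied to IPv4
 * addresses embedded in SNMP payloads (cf. mangle_address()). */
#include <stdint.h>
#include <stdio.h>

struct oct1_map {
        uint8_t from;
        uint8_t to;
};

static void map_embedded_addr(uint8_t addr[4], const struct oct1_map *m)
{
        if (addr[0] == m->from)
                addr[0] = m->to;  /* checksum fixup omitted in this sketch */
}

int main(void)
{
        /* Pretend NAT presents the 10.x.x.x realm as 192.x.x.x. */
        struct oct1_map m = { .from = 10, .to = 192 };
        uint8_t addr[4] = { 10, 1, 2, 3 };

        map_embedded_addr(addr, &m);
        printf("%d.%d.%d.%d\n", addr[0], addr[1], addr[2], addr[3]);
        return 0;
}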
+ */ + if (!asn1_header_decode(&ctx, &eoc, &cls, &con, &tag)) + return 0; + + if (cls != ASN1_UNI || con != ASN1_CON || tag != ASN1_SEQ) + return 0; + + obj = kmalloc(sizeof(struct snmp_object), GFP_ATOMIC); + if (obj == NULL) { + if (net_ratelimit()) + printk(KERN_WARNING "OOM in bsalg(%d)\n", __LINE__); + return 0; + } + + while (!asn1_eoc_decode(&ctx, eoc)) { + unsigned int i; + + if (!snmp_object_decode(&ctx, obj)) { + if (*obj) { + if ((*obj)->id) + kfree((*obj)->id); + kfree(*obj); + } + kfree(obj); + return 0; + } + + if (debug > 1) { + printk(KERN_DEBUG "bsalg: object: "); + for (i = 0; i < (*obj)->id_len; i++) { + if (i > 0) + printk("."); + printk("%lu", (*obj)->id[i]); + } + printk(": type=%u\n", (*obj)->type); + + } + + if ((*obj)->type == SNMP_IPADDR) + mangle_address(ctx.begin, ctx.pointer - 4 , map, check); + + kfree((*obj)->id); + kfree(*obj); + } + kfree(obj); + + if (!asn1_eoc_decode(&ctx, eoc)) + return 0; + + return 1; +} + +/***************************************************************************** + * + * NAT routines. + * + *****************************************************************************/ + +/* + * SNMP translation routine. + */ +static int snmp_translate(struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + struct sk_buff **pskb) +{ + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + u_int16_t udplen = ntohs(udph->len); + u_int16_t paylen = udplen - sizeof(struct udphdr); + int dir = CTINFO2DIR(ctinfo); + struct oct1_map map; + + /* + * Determine mappping for application layer addresses based + * on NAT manipulations for the packet. + */ + if (dir == IP_CT_DIR_ORIGINAL) { + /* SNAT traps */ + map.from = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip); + map.to = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip); + } else { + /* DNAT replies */ + map.from = NOCT1(ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip); + map.to = NOCT1(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip); + } + + if (map.from == map.to) + return NF_ACCEPT; + + if (!snmp_parse_mangle((unsigned char *)udph + sizeof(struct udphdr), + paylen, &map, &udph->check)) { + if (net_ratelimit()) + printk(KERN_WARNING "bsalg: parser failed\n"); + return NF_DROP; + } + return NF_ACCEPT; +} + +/* We don't actually set up expectations, just adjust internal IP + * addresses if this is being NATted */ +static int help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + int dir = CTINFO2DIR(ctinfo); + unsigned int ret; + struct iphdr *iph = (*pskb)->nh.iph; + struct udphdr *udph = (struct udphdr *)((u_int32_t *)iph + iph->ihl); + + /* SNMP replies and originating SNMP traps get mangled */ + if (udph->source == ntohs(SNMP_PORT) && dir != IP_CT_DIR_REPLY) + return NF_ACCEPT; + if (udph->dest == ntohs(SNMP_TRAP_PORT) && dir != IP_CT_DIR_ORIGINAL) + return NF_ACCEPT; + + /* No NAT? */ + if (!(ct->status & IPS_NAT_MASK)) + return NF_ACCEPT; + + /* + * Make sure the packet length is ok. So far, we were only guaranteed + * to have a valid length IP header plus 8 bytes, which means we have + * enough room for a UDP header. Just verify the UDP length field so we + * can mess around with the payload. 
+ */ + if (ntohs(udph->len) != (*pskb)->len - (iph->ihl << 2)) { + if (net_ratelimit()) + printk(KERN_WARNING "SNMP: dropping malformed packet " + "src=%u.%u.%u.%u dst=%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); + return NF_DROP; + } + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + spin_lock_bh(&snmp_lock); + ret = snmp_translate(ct, ctinfo, pskb); + spin_unlock_bh(&snmp_lock); + return ret; +} + +static struct ip_conntrack_helper snmp_helper = { + .max_expected = 0, + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "snmp", + + .tuple = { .src = { .u = { __constant_htons(SNMP_PORT) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +static struct ip_conntrack_helper snmp_trap_helper = { + .max_expected = 0, + .timeout = 180, + .me = THIS_MODULE, + .help = help, + .name = "snmp_trap", + + .tuple = { .src = { .u = { __constant_htons(SNMP_TRAP_PORT) } }, + .dst = { .protonum = IPPROTO_UDP }, + }, + .mask = { .src = { .u = { 0xFFFF } }, + .dst = { .protonum = 0xFF }, + }, +}; + +/***************************************************************************** + * + * Module stuff. + * + *****************************************************************************/ + +static int __init init(void) +{ + int ret = 0; + + ret = ip_conntrack_helper_register(&snmp_helper); + if (ret < 0) + return ret; + ret = ip_conntrack_helper_register(&snmp_trap_helper); + if (ret < 0) { + ip_conntrack_helper_unregister(&snmp_helper); + return ret; + } + return ret; +} + +static void __exit fini(void) +{ + ip_conntrack_helper_unregister(&snmp_helper); + ip_conntrack_helper_unregister(&snmp_trap_helper); +} + +module_init(init); +module_exit(fini); + +module_param(debug, bool, 0600); diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c new file mode 100644 index 000000000000..dec4a74212cd --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_standalone.c @@ -0,0 +1,349 @@ +/* This file contains all the functions required for the standalone + ip_nat module. + + These are not required by the compatibility layer. +*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +/* + * 23 Apr 2001: Harald Welte + * - new API and handling of conntrack/nat helpers + * - now capable of multiple expectations for one master + * */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) +#define ASSERT_WRITE_LOCK(x) MUST_BE_WRITE_LOCKED(&ip_nat_lock) + +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +#define HOOKNAME(hooknum) ((hooknum) == NF_IP_POST_ROUTING ? "POST_ROUTING" \ + : ((hooknum) == NF_IP_PRE_ROUTING ? "PRE_ROUTING" \ + : ((hooknum) == NF_IP_LOCAL_OUT ? "LOCAL_OUT" \ + : ((hooknum) == NF_IP_LOCAL_IN ? 
"LOCAL_IN" \ + : "*ERROR*"))) + +static unsigned int +ip_nat_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + struct ip_nat_info *info; + /* maniptype == SRC for postrouting. */ + enum ip_nat_manip_type maniptype = HOOK2MANIP(hooknum); + + /* We never see fragments: conntrack defrags on pre-routing + and local-out, and ip_nat_out protects post-routing. */ + IP_NF_ASSERT(!((*pskb)->nh.iph->frag_off + & htons(IP_MF|IP_OFFSET))); + + (*pskb)->nfcache |= NFC_UNKNOWN; + + /* If we had a hardware checksum before, it's now invalid */ + if ((*pskb)->ip_summed == CHECKSUM_HW) + if (skb_checksum_help(*pskb, (out == NULL))) + return NF_DROP; + + ct = ip_conntrack_get(*pskb, &ctinfo); + /* Can't track? It's not due to stress, or conntrack would + have dropped it. Hence it's the user's responsibilty to + packet filter it out, or implement conntrack/NAT for that + protocol. 8) --RR */ + if (!ct) { + /* Exception: ICMP redirect to new connection (not in + hash table yet). We must not let this through, in + case we're doing NAT to the same network. */ + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + struct icmphdr _hdr, *hp; + + hp = skb_header_pointer(*pskb, + (*pskb)->nh.iph->ihl*4, + sizeof(_hdr), &_hdr); + if (hp != NULL && + hp->type == ICMP_REDIRECT) + return NF_DROP; + } + return NF_ACCEPT; + } + + switch (ctinfo) { + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { + if (!icmp_reply_translation(pskb, ct, maniptype, + CTINFO2DIR(ctinfo))) + return NF_DROP; + else + return NF_ACCEPT; + } + /* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */ + case IP_CT_NEW: + info = &ct->nat.info; + + /* Seen it before? This can happen for loopback, retrans, + or local packets.. */ + if (!ip_nat_initialized(ct, maniptype)) { + unsigned int ret; + + /* LOCAL_IN hook doesn't have a chain! */ + if (hooknum == NF_IP_LOCAL_IN) + ret = alloc_null_binding(ct, info, hooknum); + else + ret = ip_nat_rule_find(pskb, hooknum, + in, out, ct, + info); + + if (ret != NF_ACCEPT) { + return ret; + } + } else + DEBUGP("Already setup manip %s for ct %p\n", + maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", + ct); + break; + + default: + /* ESTABLISHED */ + IP_NF_ASSERT(ctinfo == IP_CT_ESTABLISHED + || ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY)); + info = &ct->nat.info; + } + + IP_NF_ASSERT(info); + return nat_packet(ct, ctinfo, hooknum, pskb); +} + +static unsigned int +ip_nat_in(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + u_int32_t saddr, daddr; + unsigned int ret; + + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr)) { + dst_release((*pskb)->dst); + (*pskb)->dst = NULL; + } + return ret; +} + +static unsigned int +ip_nat_out(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. 
*/ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + return NF_ACCEPT; + + /* We can hit fragment here; forwarded packets get + defragmented by connection tracking coming in, then + fragmented (grr) by the forward code. + + In future: If we have nfct != NULL, AND we have NAT + initialized, AND there is no helper, then we can do full + NAPT on the head, and IP-address-only NAT on the rest. + + I'm starting to have nightmares about fragments. */ + + if ((*pskb)->nh.iph->frag_off & htons(IP_MF|IP_OFFSET)) { + *pskb = ip_ct_gather_frags(*pskb, IP_DEFRAG_NAT_OUT); + + if (!*pskb) + return NF_STOLEN; + } + + return ip_nat_fn(hooknum, pskb, in, out, okfn); +} + +static unsigned int +ip_nat_local_fn(unsigned int hooknum, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + u_int32_t saddr, daddr; + unsigned int ret; + + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) + return NF_ACCEPT; + + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + + ret = ip_nat_fn(hooknum, pskb, in, out, okfn); + if (ret != NF_DROP && ret != NF_STOLEN + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr)) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + return ret; +} + +/* We must be after connection tracking and before packet filtering. */ + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_in_ops = { + .hook = ip_nat_in, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_NAT_DST, +}; + +/* After packet filtering, change source */ +static struct nf_hook_ops ip_nat_out_ops = { + .hook = ip_nat_out, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC, +}; + +/* Before packet filtering, change destination */ +static struct nf_hook_ops ip_nat_local_out_ops = { + .hook = ip_nat_local_fn, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_NAT_DST, +}; + +/* After packet filtering, change source for reply packets of LOCAL_OUT DNAT */ +static struct nf_hook_ops ip_nat_local_in_ops = { + .hook = ip_nat_fn, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_NAT_SRC, +}; + +static int init_or_cleanup(int init) +{ + int ret = 0; + + need_ip_conntrack(); + + if (!init) goto cleanup; + + ret = ip_nat_rule_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_nothing; + } + ret = ip_nat_init(); + if (ret < 0) { + printk("ip_nat_init: can't setup rules.\n"); + goto cleanup_rule_init; + } + ret = nf_register_hook(&ip_nat_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register in hook.\n"); + goto cleanup_nat; + } + ret = nf_register_hook(&ip_nat_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register out hook.\n"); + goto cleanup_inops; + } + ret = nf_register_hook(&ip_nat_local_out_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local out hook.\n"); + goto cleanup_outops; + } + ret = nf_register_hook(&ip_nat_local_in_ops); + if (ret < 0) { + printk("ip_nat_init: can't register local in hook.\n"); + goto cleanup_localoutops; + } + return ret; + + cleanup: + nf_unregister_hook(&ip_nat_local_in_ops); + cleanup_localoutops: + nf_unregister_hook(&ip_nat_local_out_ops); + cleanup_outops: + 
nf_unregister_hook(&ip_nat_out_ops); + cleanup_inops: + nf_unregister_hook(&ip_nat_in_ops); + cleanup_nat: + ip_nat_cleanup(); + cleanup_rule_init: + ip_nat_rule_cleanup(); + cleanup_nothing: + MUST_BE_READ_WRITE_UNLOCKED(&ip_nat_lock); + return ret; +} + +static int __init init(void) +{ + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +module_init(init); +module_exit(fini); + +EXPORT_SYMBOL(ip_nat_setup_info); +EXPORT_SYMBOL(ip_nat_protocol_register); +EXPORT_SYMBOL(ip_nat_protocol_unregister); +EXPORT_SYMBOL(ip_nat_cheat_check); +EXPORT_SYMBOL(ip_nat_mangle_tcp_packet); +EXPORT_SYMBOL(ip_nat_mangle_udp_packet); +EXPORT_SYMBOL(ip_nat_used_tuple); +EXPORT_SYMBOL(ip_nat_follow_master); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ip_nat_tftp.c b/net/ipv4/netfilter/ip_nat_tftp.c new file mode 100644 index 000000000000..0343e0d64674 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_tftp.c @@ -0,0 +1,70 @@ +/* (C) 2001-2002 Magnus Boden + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Version: 0.0.7 + * + * Thu 21 Mar 2002 Harald Welte + * - Port to newnat API + * + * This module currently supports DNAT: + * iptables -t nat -A PREROUTING -d x.x.x.x -j DNAT --to-dest x.x.x.y + * + * and SNAT: + * iptables -t nat -A POSTROUTING { -j MASQUERADE , -j SNAT --to-source x.x.x.x } + * + * It has not been tested with + * -j SNAT --to-source x.x.x.x-x.x.x.y since I only have one external ip + * If you do test this please let me know if it works or not. + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Magnus Boden "); +MODULE_DESCRIPTION("tftp NAT helper"); +MODULE_LICENSE("GPL"); + +static unsigned int help(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp) +{ + exp->saved_proto.udp.port = exp->tuple.dst.u.tcp.port; + exp->dir = IP_CT_DIR_REPLY; + exp->expectfn = ip_nat_follow_master; + if (ip_conntrack_expect_related(exp) != 0) { + ip_conntrack_expect_free(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_tftp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_tftp_hook); + ip_nat_tftp_hook = help; + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_queue.c b/net/ipv4/netfilter/ip_queue.c new file mode 100644 index 000000000000..9e40dffc204f --- /dev/null +++ b/net/ipv4/netfilter/ip_queue.c @@ -0,0 +1,741 @@ +/* + * This is a module which is used for queueing IPv4 packets and + * communicating with userspace via netlink. + * + * (C) 2000-2002 James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 2000-03-27: Simplified code (thanks to Andi Kleen for clues). + * 2000-05-20: Fixed notifier problems (following Miguel Freitas' report). + * 2000-06-19: Fixed so nfmark is copied to metadata (reported by Sebastian + * Zander). + * 2000-08-01: Added Nick Williams' MAC support. + * 2002-06-25: Code cleanup. + * 2005-01-10: Added /proc counter for dropped packets; fixed so + * packets aren't delivered to user space if they're going + * to be dropped. 
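ip_nat_standalone.c above and ip_queue.c below share the same init_or_cleanup(init) shape: set resources up in order, and let failure (or module unload) fall through a chain of labels that tears them down in reverse. A minimal stand-alone rendering of that idiom, with invented setup_a/b/c() steps:

/* init_or_cleanup.c - sketch of the register-then-unwind-in-reverse idiom
 * used by these modules; setup_a/b/c() are invented placeholders. */
#include <stdio.h>

static int  setup_a(void)    { puts("A up");     return 0;  }
static int  setup_b(void)    { puts("B up");     return 0;  }
static int  setup_c(void)    { puts("C failed"); return -1; }   /* pretend failure */
static void teardown_a(void) { puts("A down"); }
static void teardown_b(void) { puts("B down"); }
static void teardown_c(void) { puts("C down"); }

static int init_or_cleanup(int init)
{
    int ret = 0;

    if (!init)
        goto cleanup;

    ret = setup_a();
    if (ret < 0)
        goto out;
    ret = setup_b();
    if (ret < 0)
        goto cleanup_a;
    ret = setup_c();
    if (ret < 0)
        goto cleanup_b;
    return 0;

 cleanup:          /* full teardown on module exit */
    teardown_c();
 cleanup_b:        /* partial unwind when a later step failed */
    teardown_b();
 cleanup_a:
    teardown_a();
 out:
    return ret;
}

int main(void)
{
    printf("init -> %d\n", init_or_cleanup(1));
    return 0;
}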
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip_queue" +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip_queue_maxlen" + +struct ipq_rt_info { + __u8 tos; + __u32 daddr; + __u32 saddr; +}; + +struct ipq_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + struct ipq_rt_info rt_info; +}; + +typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); + +static unsigned char copy_mode = IPQ_COPY_NONE; +static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT; +static DEFINE_RWLOCK(queue_lock); +static int peer_pid; +static unsigned int copy_range; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl; +static LIST_HEAD(queue_list); +static DECLARE_MUTEX(ipqnl_sem); + +static void +ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) +{ + nf_reinject(entry->skb, entry->info, verdict); + kfree(entry); +} + +static inline void +__ipq_enqueue_entry(struct ipq_queue_entry *entry) +{ + list_add(&entry->list, &queue_list); + queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. + */ +static inline struct ipq_queue_entry * +__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue_list) { + struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__ipq_dequeue_entry(struct ipq_queue_entry *entry) +{ + list_del(&entry->list); + queue_total--; +} + +static inline struct ipq_queue_entry * +__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + entry = __ipq_find_entry(cmpfn, data); + if (entry == NULL) + return NULL; + + __ipq_dequeue_entry(entry); + return entry; +} + + +static inline void +__ipq_flush(int verdict) +{ + struct ipq_queue_entry *entry; + + while ((entry = __ipq_find_dequeue_entry(NULL, 0))) + ipq_issue_verdict(entry, verdict); +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + copy_mode = mode; + copy_range = range; + if (copy_range > 0xFFFF) + copy_range = 0xFFFF; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NF_DROP); +} + +static struct ipq_queue_entry * +ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + write_lock_bh(&queue_lock); + entry = __ipq_find_dequeue_entry(cmpfn, data); + write_unlock_bh(&queue_lock); + return entry; +} + +static void +ipq_flush(int verdict) +{ + write_lock_bh(&queue_lock); + __ipq_flush(verdict); + write_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + + read_lock_bh(&queue_lock); + + switch (copy_mode) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + 
size = NLMSG_SPACE(sizeof(*pmsg)); + data_len = 0; + break; + + case IPQ_COPY_PACKET: + if (copy_range == 0 || copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = copy_range; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + read_unlock_bh(&queue_lock); + return NULL; + } + + read_unlock_bh(&queue_lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + pmsg->timestamp_sec = entry->skb->stamp.tv_sec; + pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->mark = entry->skb->nfmark; + pmsg->hook = entry->info->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->info->indev) + strcpy(pmsg->indev_name, entry->info->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->info->outdev) + strcpy(pmsg->outdev_name, entry->info->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->info->indev && entry->skb->dev) { + pmsg->hw_type = entry->skb->dev->type; + if (entry->skb->dev->hard_header_parse) + pmsg->hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + pmsg->hw_addr); + } + + if (data_len) + if (skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + printk(KERN_ERR "ip_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct ipq_queue_entry *entry; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + printk(KERN_ERR "ip_queue: OOM in ipq_enqueue_packet()\n"); + return -ENOMEM; + } + + entry->info = info; + entry->skb = skb; + + if (entry->info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = skb->nh.iph; + + entry->rt_info.tos = iph->tos; + entry->rt_info.daddr = iph->daddr; + entry->rt_info.saddr = iph->saddr; + } + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + goto err_out_free; + + write_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip_queue: full at %d entries, " + "dropping packets(s). 
Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + write_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + write_unlock_bh(&queue_lock); + +err_out_free: + kfree(entry); + return status; +} + +static int +ipq_mangle_ipv4(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +{ + int diff; + struct iphdr *user_iph = (struct iphdr *)v->payload; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, v->data_len); + else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_ip_make_writable(&e->skb, v->data_len)) + return -ENOMEM; + memcpy(e->skb->data, v->payload, v->data_len); + e->skb->nfcache |= NFC_ALTERED; + + /* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. + */ + if (e->info->hook == NF_IP_LOCAL_OUT) { + struct iphdr *iph = e->skb->nh.iph; + + if (!(iph->tos == e->rt_info.tos + && iph->daddr == e->rt_info.daddr + && iph->saddr == e->rt_info.saddr)) + return ip_route_me_harder(&e->skb); + } + return 0; +} + +static inline int +id_cmp(struct ipq_queue_entry *e, unsigned long id) +{ + return (id == (unsigned long )e); +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct ipq_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv4(vmsg, entry) < 0) + verdict = NF_DROP; + + ipq_issue_verdict(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + write_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + write_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + struct ipq_queue_entry *entry; + + while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) + ipq_issue_verdict(entry, NF_DROP); 
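Everything queued here is found again by callback: ipq_find_dequeue_entry() walks the list with a cmpfn (id_cmp when a verdict arrives, dev_cmp when a device goes down), and the packet id handed to userspace is simply the entry's kernel address cast to unsigned long. A small userspace sketch of that predicate-driven dequeue, with invented types and no locking:

/* cmpfn_dequeue.c - sketch of ip_queue's predicate-driven dequeue: entries are
 * matched either by an opaque id (the entry pointer) or by interface index,
 * via a comparison callback.  Types invented for the example. */
#include <stdio.h>
#include <stdlib.h>

struct entry {
    struct entry *next;
    int ifindex;
};

typedef int (*cmpfn)(struct entry *, unsigned long);

static struct entry *queue;

static void enqueue(struct entry *e) { e->next = queue; queue = e; }

static struct entry *find_dequeue(cmpfn fn, unsigned long data)
{
    struct entry **pp, *e;

    for (pp = &queue; (e = *pp) != NULL; pp = &e->next) {
        if (!fn || fn(e, data)) {
            *pp = e->next;          /* unlink the match */
            return e;
        }
    }
    return NULL;
}

static int id_cmp(struct entry *e, unsigned long id)     { return (unsigned long)e == id; }
static int dev_cmp(struct entry *e, unsigned long ifidx) { return e->ifindex == (int)ifidx; }

int main(void)
{
    struct entry *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

    a->ifindex = 2;
    b->ifindex = 3;
    enqueue(a);
    enqueue(b);

    /* verdict for one packet: match on its opaque id */
    printf("by id:  %p -> %p\n", (void *)a, (void *)find_dequeue(id_cmp, (unsigned long)a));
    /* device went down: drop whatever was queued on ifindex 3 */
    printf("by dev: %p -> %p\n", (void *)b, (void *)find_dequeue(dev_cmp, 3));
    free(a);
    free(b);
    return 0;
}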
+} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags, nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = (struct nlmsghdr *)skb->data; + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + write_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + write_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + skblen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + return; +} + +static void +ipq_rcv_sk(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (down_trylock(&ipqnl_sem)) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + ipq_rcv_skb(skb); + kfree_skb(skb); + } + + up(&ipqnl_sem); + + } while (ipqnl && ipqnl->sk_receive_queue.qlen); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_FIREWALL && n->pid) { + write_lock_bh(&queue_lock); + if (n->pid == peer_pid) + __ipq_reset(); + write_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .ctl_name = NET_IPQ_QMAX, + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_dir_table[] = { + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ipq_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipq_dir_table + }, + { .ctl_name = 0 } +}; + +#ifdef CONFIG_PROC_FS +static int +ipq_get_info(char *buffer, char **start, off_t offset, int length) +{ + int len; + + read_lock_bh(&queue_lock); + + len = sprintf(buffer, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. 
length : %u\n" + "Queue dropped : %u\n" + "Netlink dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + read_unlock_bh(&queue_lock); + + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + else if (len < 0) + len = 0; + return len; +} +#endif /* CONFIG_PROC_FS */ + +static int +init_or_cleanup(int init) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc; + + if (!init) + goto cleanup; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(NETLINK_FIREWALL, ipq_rcv_sk); + if (ipqnl == NULL) { + printk(KERN_ERR "ip_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + if (proc) + proc->owner = THIS_MODULE; + else { + printk(KERN_ERR "ip_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } + + register_netdevice_notifier(&ipq_dev_notifier); + ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); + + status = nf_register_queue_handler(PF_INET, ipq_enqueue_packet, NULL); + if (status < 0) { + printk(KERN_ERR "ip_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup: + nf_unregister_queue_handler(PF_INET); + synchronize_net(); + ipq_flush(NF_DROP); + +cleanup_sysctl: + unregister_sysctl_table(ipq_sysctl_header); + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(IPQ_PROC_FS_NAME); + +cleanup_ipqnl: + sock_release(ipqnl->sk_socket); + down(&ipqnl_sem); + up(&ipqnl_sem); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("IPv4 packet queue handler"); +MODULE_AUTHOR("James Morris "); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c new file mode 100644 index 000000000000..8a54f92b8496 --- /dev/null +++ b/net/ipv4/netfilter/ip_tables.c @@ -0,0 +1,1964 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Jan 2002 Harald Welte + * - increase module usage count as soon as we have rules inside + * a table + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("IPv4 packet filter"); + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("IP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define IP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(ipt_mutex); + +/* Must have mutex */ +#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ipt_mutex) != 0) +#include +#include + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +/* + We keep a set of rules for each CPU, so we can avoid write-locking + them in the softirq when updating the counters and therefore + only need to read-lock in the softirq; doing a write_lock_bh() in user + context stops packets coming through and allows user context to read + the counters or update the rules. + + To be cache friendly on SMP, we arrange them like so: + [ n-entries ] + ... cache-align padding ... + [ n-entries ] + + Hence the start of any table is given by get_table() below. */ + +/* The table itself */ +struct ipt_table_info +{ + /* Size per table */ + unsigned int size; + /* Number of entries: FIXME. --RR */ + unsigned int number; + /* Initial number of entries. Needed for module usage count */ + unsigned int initial_entries; + + /* Entry points and underflows */ + unsigned int hook_entry[NF_IP_NUMHOOKS]; + unsigned int underflow[NF_IP_NUMHOOKS]; + + /* ipt_entry tables: one per CPU */ + char entries[0] ____cacheline_aligned; +}; + +static LIST_HEAD(ipt_target); +static LIST_HEAD(ipt_match); +static LIST_HEAD(ipt_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +#if 0 +#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) +#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) +#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) +#endif + +/* Returns whether matches rule or not. */ +static inline int +ip_packet_match(const struct iphdr *ip, + const char *indev, + const char *outdev, + const struct ipt_ip *ipinfo, + int isfrag) +{ + size_t i; + unsigned long ret; + +#define FWINV(bool,invflg) ((bool) ^ !!(ipinfo->invflags & invflg)) + + if (FWINV((ip->saddr&ipinfo->smsk.s_addr) != ipinfo->src.s_addr, + IPT_INV_SRCIP) + || FWINV((ip->daddr&ipinfo->dmsk.s_addr) != ipinfo->dst.s_addr, + IPT_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); + + dprintf("SRC: %u.%u.%u.%u. Mask: %u.%u.%u.%u. Target: %u.%u.%u.%u.%s\n", + NIPQUAD(ip->saddr), + NIPQUAD(ipinfo->smsk.s_addr), + NIPQUAD(ipinfo->src.s_addr), + ipinfo->invflags & IPT_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u.%u.%u.%u Mask: %u.%u.%u.%u Target: %u.%u.%u.%u.%s\n", + NIPQUAD(ip->daddr), + NIPQUAD(ipinfo->dmsk.s_addr), + NIPQUAD(ipinfo->dst.s_addr), + ipinfo->invflags & IPT_INV_DSTIP ? " (INV)" : ""); + return 0; + } + + /* Look for ifname matches; this should unroll nicely. 
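The interface test this comment refers to compares names one unsigned long at a time: XOR the packet's name against the rule's, mask with the rule's wildcard mask, and OR any leftovers into an accumulator; a non-zero result is a mismatch, which FWINV() then XORs with the rule's '!' flag. A stand-alone version of that comparison (struct iface_rule and iface_matches() are invented):

/* ifname_match.c - sketch of the word-at-a-time interface comparison used by
 * ip_packet_match(): IFNAMSIZ-byte names compared a long at a time under a
 * per-byte wildcard mask; the '!' flag just inverts the outcome. */
#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16

struct iface_rule {
    char iface[IFNAMSIZ];           /* e.g. "eth0", or "ppp" for ppp+ */
    unsigned char mask[IFNAMSIZ];   /* 0xFF where the name must match */
    int invert;                     /* rule written with '!' */
};

static int iface_matches(const struct iface_rule *r, const char name[IFNAMSIZ])
{
    unsigned long ret = 0;
    size_t i;

    for (i = 0; i < IFNAMSIZ / sizeof(unsigned long); i++) {
        unsigned long a, b, m;

        memcpy(&a, name + i * sizeof(a), sizeof(a));
        memcpy(&b, r->iface + i * sizeof(b), sizeof(b));
        memcpy(&m, r->mask + i * sizeof(m), sizeof(m));
        ret |= (a ^ b) & m;         /* surviving bits are real differences */
    }
    return !((ret != 0) ^ !!r->invert);   /* FWINV(): maybe-inverted mismatch */
}

int main(void)
{
    struct iface_rule r = { "eth0", { 0 }, 0 };
    char in[IFNAMSIZ] = "eth0";

    memset(r.mask, 0xFF, 5);        /* match "eth0" plus its terminator */
    printf("eth0 vs eth0: %d\n", iface_matches(&r, in));
    strcpy(in, "eth1");
    printf("eth1 vs eth0: %d\n", iface_matches(&r, in));
    return 0;
}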
*/ + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)indev)[i] + ^ ((const unsigned long *)ipinfo->iniface)[i]) + & ((const unsigned long *)ipinfo->iniface_mask)[i]; + } + + if (FWINV(ret != 0, IPT_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ipinfo->iniface, + ipinfo->invflags&IPT_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)outdev)[i] + ^ ((const unsigned long *)ipinfo->outiface)[i]) + & ((const unsigned long *)ipinfo->outiface_mask)[i]; + } + + if (FWINV(ret != 0, IPT_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ipinfo->outiface, + ipinfo->invflags&IPT_INV_VIA_OUT ?" (INV)":""); + return 0; + } + + /* Check specific protocol */ + if (ipinfo->proto + && FWINV(ip->protocol != ipinfo->proto, IPT_INV_PROTO)) { + dprintf("Packet protocol %hi does not match %hi.%s\n", + ip->protocol, ipinfo->proto, + ipinfo->invflags&IPT_INV_PROTO ? " (INV)":""); + return 0; + } + + /* If we have a fragment rule but the packet is not a fragment + * then we return zero */ + if (FWINV((ipinfo->flags&IPT_F_FRAG) && !isfrag, IPT_INV_FRAG)) { + dprintf("Fragment rule but not fragment.%s\n", + ipinfo->invflags & IPT_INV_FRAG ? " (INV)" : ""); + return 0; + } + + return 1; +} + +static inline int +ip_checkentry(const struct ipt_ip *ip) +{ + if (ip->flags & ~IPT_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ip->flags & ~IPT_F_MASK); + return 0; + } + if (ip->invflags & ~IPT_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ip->invflags & ~IPT_INV_MASK); + return 0; + } + return 1; +} + +static unsigned int +ipt_error(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("ip_tables: error: `%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline +int do_match(struct ipt_entry_match *m, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int offset, + int *hotdrop) +{ + /* Stop iteration if it doesn't match */ + if (!m->u.kernel.match->match(skb, in, out, m->data, offset, hotdrop)) + return 1; + else + return 0; +} + +static inline struct ipt_entry * +get_entry(void *base, unsigned int offset) +{ + return (struct ipt_entry *)(base + offset); +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ipt_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct ipt_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ] __attribute__((aligned(sizeof(long)))); + u_int16_t offset; + struct iphdr *ip; + u_int16_t datalen; + int hotdrop = 0; + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + void *table_base; + struct ipt_entry *e, *back; + + /* Initialization */ + ip = (*pskb)->nh.iph; + datalen = (*pskb)->len - ip->ihl * 4; + indev = in ? in->name : nulldevname; + outdev = out ? out->name : nulldevname; + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. 
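The first-fragment policy described above comes down to the 13-bit fragment offset: only a non-zero offset marks a non-first fragment, and only those are kept away from port-level matches. A minimal check in the same spirit, using the usual IP_MF/IP_OFFSET values:

/* frag_offset.c - sketch of the fragment test used by ipt_do_table(): a packet
 * is a non-first fragment exactly when its 13-bit fragment offset is non-zero;
 * such packets never match port-based rules. */
#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>

#define IP_OFFSET 0x1FFF    /* fragment offset mask (units of 8 bytes) */
#define IP_MF     0x2000    /* "more fragments" flag */

static int is_nonfirst_fragment(uint16_t frag_off_net)
{
    return (ntohs(frag_off_net) & IP_OFFSET) != 0;
}

int main(void)
{
    uint16_t first     = htons(IP_MF | 0);    /* offset 0, more fragments follow */
    uint16_t later     = htons(IP_MF | 185);  /* 185 * 8 = 1480 bytes in */
    uint16_t unfragged = htons(0);

    printf("first fragment : %d\n", is_nonfirst_fragment(first));
    printf("later fragment : %d\n", is_nonfirst_fragment(later));
    printf("not fragmented : %d\n", is_nonfirst_fragment(unfragged));
    return 0;
}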
*/ + offset = ntohs(ip->frag_off) & IP_OFFSET; + + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + +#ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ipt_entry *)table_base)->comefrom, + ((struct ipt_entry *)table_base)->comefrom); + } + ((struct ipt_entry *)table_base)->comefrom = 0x57acc001; +#endif + + /* For return from builtin chain */ + back = get_entry(table_base, table->private->underflow[hook]); + + do { + IP_NF_ASSERT(e); + IP_NF_ASSERT(back); + (*pskb)->nfcache |= e->nfcache; + if (ip_packet_match(ip, indev, outdev, &e->ip, offset)) { + struct ipt_entry_target *t; + + if (IPT_MATCH_ITERATE(e, do_match, + *pskb, in, out, + offset, &hotdrop) != 0) + goto no_match; + + ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1); + + t = ipt_get_target(e); + IP_NF_ASSERT(t->u.kernel.target); + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct ipt_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != IPT_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct ipt_entry *next + = (void *)e + e->next_offset; + next->comefrom + = (void *)back - table_base; + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + abs. verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ipt_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif + verdict = t->u.kernel.target->target(pskb, + in, out, + hook, + t->data, + userdata); + +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ipt_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IPT_CONTINUE) { + printk("Target %s reentered!\n", + t->u.kernel.target->name); + verdict = NF_DROP; + } + ((struct ipt_entry *)table_base)->comefrom + = 0x57acc001; +#endif + /* Target might have changed stuff. */ + ip = (*pskb)->nh.iph; + datalen = (*pskb)->len - ip->ihl * 4; + + if (verdict == IPT_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + + no_match: + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ipt_entry *)table_base)->comefrom = 0xdead57ac; +#endif + read_unlock_bh(&table->lock); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* + * These are weird, but module loading must not be done with mutex + * held (since they will register), and we have to have a single + * function to use try_then_request_module(). + */ + +/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ +static inline struct ipt_table *find_table_lock(const char *name) +{ + struct ipt_table *t; + + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &ipt_tables, list) + if (strcmp(t->name, name) == 0 && try_module_get(t->me)) + return t; + up(&ipt_mutex); + return NULL; +} + +/* Find match, grabs ref. 
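The v < 0 branch above is the core of the table walker: a standard target's verdict field either encodes an absolute netfilter verdict as a negative number, unpacked with (unsigned)(-v) - 1, or, when non-negative, is the byte offset of the rule to jump to, with IPT_RETURN popping back through the saved back pointer. A toy encoder/decoder for the negative-verdict convention (the NF_* values are the standard ones, everything else is illustrative):

/* verdict_encode.c - sketch of the standard-target verdict convention seen
 * above: negative values are absolute verdicts, non-negative values are jump
 * offsets into the table. */
#include <stdio.h>

#define NF_DROP   0
#define NF_ACCEPT 1

/* pack an absolute verdict into the standard target's verdict field */
static int encode_verdict(unsigned int nf_verdict)
{
    return -(int)nf_verdict - 1;
}

/* what ipt_do_table() does when it sees v < 0 */
static unsigned int decode_verdict(int v)
{
    return (unsigned int)(-v) - 1;
}

int main(void)
{
    int v_accept = encode_verdict(NF_ACCEPT);   /* -2 */
    int v_jump   = 176;                         /* offset of another rule */

    if (v_accept < 0)
        printf("absolute verdict %u\n", decode_verdict(v_accept));
    if (v_jump >= 0)
        printf("jump to offset %d\n", v_jump);
    return 0;
}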
Returns ERR_PTR() on error. */ +static inline struct ipt_match *find_match(const char *name, u8 revision) +{ + struct ipt_match *m; + int err = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(m, &ipt_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision == revision) { + if (try_module_get(m->me)) { + up(&ipt_mutex); + return m; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + up(&ipt_mutex); + return ERR_PTR(err); +} + +/* Find target, grabs ref. Returns ERR_PTR() on error. */ +static inline struct ipt_target *find_target(const char *name, u8 revision) +{ + struct ipt_target *t; + int err = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return ERR_PTR(-EINTR); + + list_for_each_entry(t, &ipt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision == revision) { + if (try_module_get(t->me)) { + up(&ipt_mutex); + return t; + } + } else + err = -EPROTOTYPE; /* Found something. */ + } + } + up(&ipt_mutex); + return ERR_PTR(err); +} + +struct ipt_target *ipt_find_target(const char *name, u8 revision) +{ + struct ipt_target *target; + + target = try_then_request_module(find_target(name, revision), + "ipt_%s", name); + if (IS_ERR(target) || !target) + return NULL; + return target; +} + +static int match_revfn(const char *name, u8 revision, int *bestp) +{ + struct ipt_match *m; + int have_rev = 0; + + list_for_each_entry(m, &ipt_match, list) { + if (strcmp(m->name, name) == 0) { + if (m->revision > *bestp) + *bestp = m->revision; + if (m->revision == revision) + have_rev = 1; + } + } + return have_rev; +} + +static int target_revfn(const char *name, u8 revision, int *bestp) +{ + struct ipt_target *t; + int have_rev = 0; + + list_for_each_entry(t, &ipt_target, list) { + if (strcmp(t->name, name) == 0) { + if (t->revision > *bestp) + *bestp = t->revision; + if (t->revision == revision) + have_rev = 1; + } + } + return have_rev; +} + +/* Returns true or false (if no such extension at all) */ +static inline int find_revision(const char *name, u8 revision, + int (*revfn)(const char *, u8, int *), + int *err) +{ + int have_rev, best = -1; + + if (down_interruptible(&ipt_mutex) != 0) { + *err = -EINTR; + return 1; + } + have_rev = revfn(name, revision, &best); + up(&ipt_mutex); + + /* Nothing at all? Return 0 to try loading module. */ + if (best == -1) { + *err = -ENOENT; + return 0; + } + + *err = best; + if (!have_rev) + *err = -EPROTONOSUPPORT; + return 1; +} + + +/* All zeroes == unconditional rule. */ +static inline int +unconditional(const struct ipt_ip *ip) +{ + unsigned int i; + + for (i = 0; i < sizeof(*ip)/sizeof(__u32); i++) + if (((__u32 *)ip)[i]) + return 0; + + return 1; +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ipt_entry *e + = (struct ipt_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. 
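find_revision() above answers two questions in one pass: whether the named extension exists at all (if not, the caller gets a chance to load a module), and whether it supports the revision userspace asked for, reporting the best revision it does know about. A compact userspace rendering of that decision, against an invented extension table:

/* revision_lookup.c - sketch of the match/target revision negotiation done by
 * find_revision(): report the best known revision, or an error distinguishing
 * "no such extension" from "wrong revision".  The table is invented. */
#include <stdio.h>
#include <string.h>
#include <errno.h>

struct extension { const char *name; int revision; };

static const struct extension extensions[] = {
    { "tcp",   0 },
    { "limit", 0 },
    { "limit", 1 },     /* newer revision of the same match */
};

/* returns >= 0: best revision available; < 0: -ENOENT or -EPROTONOSUPPORT */
static int find_revision(const char *name, int wanted)
{
    int best = -1, have_wanted = 0;
    size_t i;

    for (i = 0; i < sizeof(extensions) / sizeof(extensions[0]); i++) {
        if (strcmp(extensions[i].name, name) != 0)
            continue;
        if (extensions[i].revision > best)
            best = extensions[i].revision;
        if (extensions[i].revision == wanted)
            have_wanted = 1;
    }
    if (best == -1)
        return -ENOENT;             /* nothing registered under this name */
    return have_wanted ? best : -EPROTONOSUPPORT;
}

int main(void)
{
    printf("limit rev 1: %d\n", find_revision("limit", 1));
    printf("limit rev 7: %d\n", find_revision("limit", 7));
    printf("frobnicate : %d\n", find_revision("frobnicate", 0));
    return 0;
}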
*/ + e->counters.pcnt = pos; + + for (;;) { + struct ipt_standard_target *t + = (void *)ipt_get_target(e); + + if (e->comefrom & (1 << NF_IP_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ipt_entry) + && (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->ip)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<comefrom + & (1 << NF_IP_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ipt_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ipt_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + IPT_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ipt_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ipt_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.kernel.match->destroy) + m->u.kernel.match->destroy(m->data, + m->u.match_size - sizeof(*m)); + module_put(m->u.kernel.match->me); + return 0; +} + +static inline int +standard_check(const struct ipt_entry_target *t, + unsigned int max_offset) +{ + struct ipt_standard_target *targ = (void *)t; + + /* Check standard info. */ + if (t->u.target_size + != IPT_ALIGN(sizeof(struct ipt_standard_target))) { + duprintf("standard_check: target size %u != %u\n", + t->u.target_size, + IPT_ALIGN(sizeof(struct ipt_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ipt_entry)) { + duprintf("ipt_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ipt_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ipt_entry_match *m, + const char *name, + const struct ipt_ip *ip, + unsigned int hookmask, + unsigned int *i) +{ + struct ipt_match *match; + + match = try_then_request_module(find_match(m->u.user.name, + m->u.user.revision), + "ipt_%s", m->u.user.name); + if (IS_ERR(match) || !match) { + duprintf("check_match: `%s' not found\n", m->u.user.name); + return match ? 
PTR_ERR(match) : -ENOENT; + } + m->u.kernel.match = match; + + if (m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ip, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + module_put(m->u.kernel.match->me); + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ipt_target ipt_standard_target; + +static inline int +check_entry(struct ipt_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ipt_entry_target *t; + struct ipt_target *target; + int ret; + unsigned int j; + + if (!ip_checkentry(&e->ip)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IPT_MATCH_ITERATE(e, check_match, name, &e->ip, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ipt_get_target(e); + target = try_then_request_module(find_target(t->u.user.name, + t->u.user.revision), + "ipt_%s", t->u.user.name); + if (IS_ERR(target) || !target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + ret = target ? PTR_ERR(target) : -ENOENT; + goto cleanup_matches; + } + t->u.kernel.target = target; + + if (t->u.kernel.target == &ipt_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IPT_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ipt_entry *e, + struct ipt_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 + || (unsigned char *)e + sizeof(struct ipt_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP_NUMHOOKS; h++) { + if ((unsigned char *)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IPT_RETURN). 
--RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ipt_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ipt_entry *e, unsigned int *i) +{ + struct ipt_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IPT_MATCH_ITERATE(e, cleanup_match, NULL); + t = ipt_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ipt_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ipt_table_info * +replace_table(struct ipt_table *table, + unsigned int num_counters, + struct ipt_table_info *newinfo, + int *error) +{ + struct ipt_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ipt_entry *table_base; + unsigned int i; + + for (i = 0; i < num_possible_cpus(); i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? */ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. 
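get_counters() below folds the per-CPU rule copies back into one array: each CPU bumps only its own byte and packet counters, and a read is just a sum across CPUs. A stripped-down model of that layout with a fixed CPU count and an invented struct:

/* percpu_counters.c - sketch of the per-CPU counter layout summed by
 * get_counters(): one counter slot per rule per CPU, totals computed on read.
 * NCPUS/NRULES and the struct are invented for the example. */
#include <stdio.h>

#define NCPUS  2
#define NRULES 3

struct counters { unsigned long long pcnt, bcnt; };

static struct counters percpu[NCPUS][NRULES];

static void add_counter(int cpu, int rule, unsigned long long bytes)
{
    percpu[cpu][rule].pcnt += 1;
    percpu[cpu][rule].bcnt += bytes;
}

static void get_counters(struct counters total[NRULES])
{
    int cpu, rule;

    for (rule = 0; rule < NRULES; rule++)
        total[rule].pcnt = total[rule].bcnt = 0;
    for (cpu = 0; cpu < NCPUS; cpu++)
        for (rule = 0; rule < NRULES; rule++) {
            total[rule].pcnt += percpu[cpu][rule].pcnt;
            total[rule].bcnt += percpu[cpu][rule].bcnt;
        }
}

int main(void)
{
    struct counters total[NRULES];

    add_counter(0, 1, 100);     /* a packet counted on CPU 0 */
    add_counter(1, 1, 60);      /* and another on CPU 1, same rule */
    get_counters(total);
    printf("rule 1: %llu packets, %llu bytes\n", total[1].pcnt, total[1].bcnt);
    return 0;
}

Because reads always sum every CPU's slot, it does not matter which CPU a given increment landed on; that is also why the later shortcut of adding userspace-supplied counters to the first CPU's copy only still yields correct totals.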
*/ +static inline int +add_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ipt_table_info *t, + struct ipt_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ipt_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct ipt_entry *e; + struct ipt_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ipt_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ipt_entry_match *m; + struct ipt_entry_target *t; + + e = (struct ipt_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ipt_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ipt_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ipt_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ipt_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ipt_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ipt_get_entries *entries, + struct ipt_get_entries __user *uptr) +{ + int ret; + struct ipt_table *t; + + t = find_table_lock(entries->name); + if (t && !IS_ERR(t)) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + module_put(t->me); + up(&ipt_mutex); + } else + ret = t ? 
PTR_ERR(t) : -ENOENT; + + return ret; +} + +static int +do_replace(void __user *user, unsigned int len) +{ + int ret; + struct ipt_replace tmp; + struct ipt_table *t; + struct ipt_table_info *newinfo, *oldinfo; + struct ipt_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Hack: Causes ipchains to give correct error msg --RR */ + if (len != sizeof(tmp) + tmp.size) + return -ENOPROTOOPT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ipt_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = try_then_request_module(find_table_lock(tmp.name), + "iptable_%s", tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free_newinfo_counters_untrans; + } + + /* You lied! */ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto put_module; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + if (copy_to_user(tmp.counters, counters, + sizeof(struct ipt_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&ipt_mutex); + return ret; + + put_module: + module_put(t->me); + up(&ipt_mutex); + free_newinfo_counters_untrans: + IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
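do_replace() above also shows the recurring two-step read from userspace: copy the fixed-size header first, cross-check the advertised payload size against the actual length, and only then copy the variable-size blob that follows. The same shape in plain C, reading from a local buffer instead of copy_from_user(), with an invented header struct:

/* two_step_copy.c - sketch of the header-then-payload copy and length
 * cross-check done by do_replace()/do_add_counters(); the "user buffer" is a
 * local array here and the header struct is invented. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct replace_hdr {
    char name[16];
    unsigned int size;      /* bytes of payload following the header */
};

static int parse_replace(const void *user, size_t len)
{
    struct replace_hdr tmp;
    char *payload;

    if (len < sizeof(tmp))
        return -1;
    memcpy(&tmp, user, sizeof(tmp));            /* copy_from_user(&tmp, ...) */

    /* the advertised payload size must account for the rest of the buffer */
    if (len != sizeof(tmp) + tmp.size)
        return -1;

    payload = malloc(tmp.size ? tmp.size : 1);  /* vmalloc(newinfo) */
    if (!payload)
        return -1;
    memcpy(payload, (const char *)user + sizeof(tmp), tmp.size);

    printf("table %.16s: %u payload bytes\n", tmp.name, tmp.size);
    free(payload);
    return 0;
}

int main(void)
{
    unsigned char buf[sizeof(struct replace_hdr) + 8] = { 0 };
    struct replace_hdr *hdr = (struct replace_hdr *)buf;

    strcpy(hdr->name, "filter");
    hdr->size = 8;
    printf("ok:    %d\n", parse_replace(buf, sizeof(buf)));
    printf("short: %d\n", parse_replace(buf, sizeof(buf) - 1));
    return 0;
}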
*/ +static inline int +add_counter_to_entry(struct ipt_entry *e, + const struct ipt_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct ipt_counters_info tmp, *paddc; + struct ipt_table *t; + int ret = 0; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = find_table_lock(tmp.name); + if (!t || IS_ERR(t)) { + ret = t ? PTR_ERR(t) : -ENOENT; + goto free; + } + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IPT_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ipt_mutex); + module_put(t->me); + free: + vfree(paddc); + + return ret; +} + +static int +do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IPT_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ipt_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IPT_SO_GET_INFO: { + char name[IPT_TABLE_MAXNAMELEN]; + struct ipt_table *t; + + if (*len != sizeof(struct ipt_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ipt_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[IPT_TABLE_MAXNAMELEN-1] = '\0'; + + t = try_then_request_module(find_table_lock(name), + "iptable_%s", name); + if (t && !IS_ERR(t)) { + struct ipt_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + memcpy(info.name, name, sizeof(info.name)); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + up(&ipt_mutex); + module_put(t->me); + } else + ret = t ? 
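For context, the IPT_SO_GET_INFO handler above is driven from userspace through getsockopt() on a raw IPv4 socket; this is how the iptables tool queries a table before fetching its entries. A hedged userspace sketch (assuming the sanitized <linux/netfilter_ipv4/ip_tables.h> header is installed and the caller has CAP_NET_ADMIN; the exact size check mirrors the *len test above):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/netfilter_ipv4/ip_tables.h>

int main(void)
{
    struct ipt_getinfo info;
    socklen_t len = sizeof(info);   /* the kernel insists on an exact size match */
    int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);  /* needs CAP_NET_ADMIN */

    if (fd < 0) {
        perror("socket");
        return 1;
    }
    memset(&info, 0, sizeof(info));
    strncpy(info.name, "filter", sizeof(info.name) - 1);
    if (getsockopt(fd, IPPROTO_IP, IPT_SO_GET_INFO, &info, &len) < 0) {
        perror("IPT_SO_GET_INFO");
        return 1;
    }
    printf("table %s: %u entries, %u bytes, valid hooks 0x%x\n",
           info.name, info.num_entries, info.size, info.valid_hooks);
    return 0;
}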
PTR_ERR(t) : -ENOENT; + } + break; + + case IPT_SO_GET_ENTRIES: { + struct ipt_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ipt_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ipt_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + case IPT_SO_GET_REVISION_MATCH: + case IPT_SO_GET_REVISION_TARGET: { + struct ipt_get_revision rev; + int (*revfn)(const char *, u8, int *); + + if (*len != sizeof(rev)) { + ret = -EINVAL; + break; + } + if (copy_from_user(&rev, user, sizeof(rev)) != 0) { + ret = -EFAULT; + break; + } + + if (cmd == IPT_SO_GET_REVISION_TARGET) + revfn = target_revfn; + else + revfn = match_revfn; + + try_then_request_module(find_revision(rev.name, rev.revision, + revfn, &ret), + "ipt_%s", rev.name); + break; + } + + default: + duprintf("do_ipt_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int +ipt_register_target(struct ipt_target *target) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + list_add(&target->list, &ipt_target); + up(&ipt_mutex); + return ret; +} + +void +ipt_unregister_target(struct ipt_target *target) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_target, target); + up(&ipt_mutex); +} + +int +ipt_register_match(struct ipt_match *match) +{ + int ret; + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) + return ret; + + list_add(&match->list, &ipt_match); + up(&ipt_mutex); + + return ret; +} + +void +ipt_unregister_match(struct ipt_match *match) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_match, match); + up(&ipt_mutex); +} + +int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) +{ + int ret; + struct ipt_table_info *newinfo; + static struct ipt_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct ipt_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&ipt_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&ipt_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. 
*/ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&ipt_tables, table); + + unlock: + up(&ipt_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void ipt_unregister_table(struct ipt_table *table) +{ + down(&ipt_mutex); + LIST_DELETE(&ipt_tables, table); + up(&ipt_mutex); + + /* Decrease module usage counts and free resources */ + IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int optlen, + int invert, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + + duprintf("tcp_match: finding option\n"); + + if (!optlen) + return invert; + + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, + skb->nh.iph->ihl*4 + sizeof(struct tcphdr), + optlen, _opt); + if (op == NULL) { + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + const struct ipt_tcp *tcpinfo = matchinfo; + + if (offset) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + } + /* Must not be a fragment. */ + return 0; + } + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + th = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & IPT_TCP_INV_SRCPT))) + return 0; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & IPT_TCP_INV_DSTPT))) + return 0; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IPT_TCP_INV_FLAGS)) + return 0; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + *hotdrop = 1; + return 0; + } + if (!tcp_find_option(tcpinfo->option, skb, + th->doff*4 - sizeof(_tcph), + tcpinfo->invflags & IPT_TCP_INV_OPTION, + hotdrop)) + return 0; + } + return 1; +} + +/* Called when user tries to insert an entry of this type. 
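The two helpers above are small enough to exercise in isolation. The sketch below mirrors their logic in userspace: port_match() is a range test XORed with an invert flag, and the option scan walks the TCP option list as (kind, length) pairs, where kinds 0 (EOL) and 1 (NOP) carry no length byte. The option bytes are invented for the example:

#include <stdio.h>

/* 1 if port is inside [min,max], possibly inverted - same truth table as port_match() */
static int port_in_range(unsigned short min, unsigned short max,
                         unsigned short port, int invert)
{
    return (port >= min && port <= max) ^ invert;
}

/* walk TCP options looking for `option`; kinds 0 and 1 are single-byte */
static int find_option(const unsigned char *opt, unsigned int optlen,
                       unsigned char option)
{
    unsigned int i = 0;

    while (i < optlen) {
        if (opt[i] == option)
            return 1;
        if (opt[i] < 2)
            i++;
        else
            i += opt[i + 1] ? opt[i + 1] : 1;
    }
    return 0;
}

int main(void)
{
    /* MSS(2), SACK-permitted(4), timestamps(8), two NOPs, as on the wire */
    unsigned char opts[] = { 2, 4, 0x05, 0xb4, 4, 2, 8, 10,
                             0, 0, 0, 0, 0, 0, 0, 0, 1, 1 };

    printf("port 80 in 1..1023:     %d\n", port_in_range(1, 1023, 80, 0));
    printf("port 80 NOT in 1..1023: %d\n", port_in_range(1, 1023, 80, 1));
    printf("has SACK-permitted:     %d\n", find_option(opts, sizeof(opts), 4));
    printf("has window scale:       %d\n", find_option(opts, sizeof(opts), 3));
    return 0;
}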
*/ +static int +tcp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ip->proto == IPPROTO_TCP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_tcp)) + && !(tcpinfo->invflags & ~IPT_TCP_INV_MASK); +} + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct udphdr _udph, *uh; + const struct ipt_udp *udpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + uh = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & IPT_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & IPT_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +udp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ip->proto != IPPROTO_UDP || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_udp: Protocol %u != %u\n", ip->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_udp))) { + duprintf("ipt_udp: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_udp))); + return 0; + } + if (udpinfo->invflags & ~IPT_UDP_INV_MASK) { + duprintf("ipt_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return ((test_type == 0xFF) || (type == test_type && code >= min_code && code <= max_code)) + ^ invert; +} + +static int +icmp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct icmphdr _icmph, *ic; + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + ic = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_icmph), &_icmph); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return icmp_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->type, ic->code, + !!(icmpinfo->invflags&IPT_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. 
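The truth table of icmp_type_code_match() above is easy to check in isolation: a test type of 0xFF acts as a wildcard ("any ICMP"), otherwise the type must match exactly and the code must fall inside [min_code, max_code]; the invert flag flips the result. A small sketch with invented values:

#include <stdio.h>

static int icmp_type_code_match(unsigned char test_type,
                                unsigned char min_code, unsigned char max_code,
                                unsigned char type, unsigned char code,
                                int invert)
{
    return ((test_type == 0xFF) ||
            (type == test_type && code >= min_code && code <= max_code))
           ^ invert;
}

int main(void)
{
    /* "destination-unreachable, any code" against type 3 / code 1 */
    printf("%d\n", icmp_type_code_match(3, 0, 0xFF, 3, 1, 0));      /* 1 */
    /* wildcard type matches everything unless inverted */
    printf("%d\n", icmp_type_code_match(0xFF, 0, 0xFF, 11, 0, 0));  /* 1 */
    /* wrong type does not match */
    printf("%d\n", icmp_type_code_match(3, 0, 0xFF, 11, 0, 0));     /* 0 */
    return 0;
}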
*/ +static int +icmp_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ip->proto == IPPROTO_ICMP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_icmp)) + && !(icmpinfo->invflags & ~IPT_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ipt_target ipt_standard_target = { + .name = IPT_STANDARD_TARGET, +}; + +static struct ipt_target ipt_error_target = { + .name = IPT_ERROR_TARGET, + .target = ipt_error, +}; + +static struct nf_sockopt_ops ipt_sockopts = { + .pf = PF_INET, + .set_optmin = IPT_BASE_CTL, + .set_optmax = IPT_SO_SET_MAX+1, + .set = do_ipt_set_ctl, + .get_optmin = IPT_BASE_CTL, + .get_optmax = IPT_SO_GET_MAX+1, + .get = do_ipt_get_ctl, +}; + +static struct ipt_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, +}; + +static struct ipt_match udp_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, +}; + +static struct ipt_match icmp_matchstruct = { + .name = "icmp", + .match = &icmp_match, + .checkentry = &icmp_checkentry, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const char *i, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", + i + sizeof(struct list_head)); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static inline int print_target(const struct ipt_target *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if (t == &ipt_standard_target || t == &ipt_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, count); +} + +static int ipt_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_tables, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} + +static int ipt_get_targets(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_target, print_target, struct ipt_target *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static int ipt_get_matches(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ipt_mutex) != 0) + return 0; + + LIST_FIND(&ipt_match, print_name, void *, + offset, buffer, length, &pos, &count); + + up(&ipt_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static struct { char *name; get_info_t *get_info; } ipt_proc_entry[] = +{ { "ip_tables_names", ipt_get_tables }, + { "ip_tables_targets", ipt_get_targets }, + { "ip_tables_matches", ipt_get_matches }, + { NULL, NULL} }; +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be downing sem now, so we won't sleep */ + down(&ipt_mutex); + 
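The three proc entries registered above each export a flat list of registered names, one per line, so they can be inspected with nothing more than a file read. A minimal sketch (assuming the usual /proc/net mount point; names shown in the comment are examples):

#include <stdio.h>

int main(void)
{
    char line[64];
    FILE *f = fopen("/proc/net/ip_tables_names", "r");

    if (!f) {
        perror("ip_tables_names");
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        printf("table: %s", line);   /* e.g. "filter", "nat", "mangle" */
    fclose(f);
    return 0;
}

The companion files ip_tables_targets and ip_tables_matches are read the same way and list target and match names, with the built-in standard and error targets filtered out by print_target() above.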
list_append(&ipt_target, &ipt_standard_target); + list_append(&ipt_target, &ipt_error_target); + list_append(&ipt_match, &tcp_matchstruct); + list_append(&ipt_match, &udp_matchstruct); + list_append(&ipt_match, &icmp_matchstruct); + up(&ipt_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&ipt_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + int i; + + for (i = 0; ipt_proc_entry[i].name; i++) { + proc = proc_net_create(ipt_proc_entry[i].name, 0, + ipt_proc_entry[i].get_info); + if (!proc) { + while (--i >= 0) + proc_net_remove(ipt_proc_entry[i].name); + nf_unregister_sockopt(&ipt_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } + } +#endif + + printk("ip_tables: (C) 2000-2002 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ipt_sockopts); +#ifdef CONFIG_PROC_FS + { + int i; + for (i = 0; ipt_proc_entry[i].name; i++) + proc_net_remove(ipt_proc_entry[i].name); + } +#endif +} + +EXPORT_SYMBOL(ipt_register_table); +EXPORT_SYMBOL(ipt_unregister_table); +EXPORT_SYMBOL(ipt_register_match); +EXPORT_SYMBOL(ipt_unregister_match); +EXPORT_SYMBOL(ipt_do_table); +EXPORT_SYMBOL(ipt_register_target); +EXPORT_SYMBOL(ipt_unregister_target); +EXPORT_SYMBOL(ipt_find_target); + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLASSIFY.c b/net/ipv4/netfilter/ipt_CLASSIFY.c new file mode 100644 index 000000000000..9842e6e23184 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLASSIFY.c @@ -0,0 +1,92 @@ +/* + * This is a module which is used for setting the skb->priority field + * of an skb for qdisc classification. + */ + +/* (C) 2001-2002 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Patrick McHardy "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables qdisc classification target module"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_classify_target_info *clinfo = targinfo; + + if((*pskb)->priority != clinfo->priority) { + (*pskb)->priority = clinfo->priority; + (*pskb)->nfcache |= NFC_ALTERED; + } + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_classify_target_info))){ + printk(KERN_ERR "CLASSIFY: invalid size (%u != %Zu).\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_classify_target_info))); + return 0; + } + + if (hook_mask & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_POST_ROUTING))) { + printk(KERN_ERR "CLASSIFY: only valid in LOCAL_OUT, FORWARD " + "and POST_ROUTING.\n"); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_ERR "CLASSIFY: can only be called from " + "\"mangle\" table, not \"%s\".\n", + tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_classify_reg = { + .name = "CLASSIFY", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_classify_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_classify_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c new file mode 100644 index 000000000000..0f12e3a3dc73 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -0,0 +1,761 @@ +/* Cluster IP hashmark target + * (C) 2003-2004 by Harald Welte + * based on ideas of Fabio Olive Leite + * + * Development of this code funded by SuSE Linux AG, http://www.suse.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
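The priority value that the CLASSIFY target above copies into skb->priority is, by convention, a tc class handle: the major (qdisc) id in the upper 16 bits and the minor (class) id in the lower 16, i.e. the "MAJOR:MINOR" notation used by tc and, as I understand the userspace side, by iptables' --set-class option (that option is not part of this patch). A sketch of the encoding:

#include <stdio.h>

/* upper 16 bits: major id, lower 16 bits: minor id */
static unsigned int tc_handle(unsigned int major, unsigned int minor)
{
    return (major << 16) | (minor & 0xffff);
}

int main(void)
{
    /* the handle a rule classifying into class 1:10 would store */
    printf("1:10 -> 0x%08x\n", tc_handle(1, 10));   /* 0x0001000a */
    return 0;
}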
+ * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + +#define CLUSTERIP_VERSION "0.6" + +#define DEBUG_CLUSTERIP + +#ifdef DEBUG_CLUSTERIP +#define DEBUGP printk +#else +#define DEBUGP +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables target for CLUSTERIP"); + +struct clusterip_config { + struct list_head list; /* list of all configs */ + atomic_t refcount; /* reference count */ + + u_int32_t clusterip; /* the IP address */ + u_int8_t clustermac[ETH_ALEN]; /* the MAC address */ + struct net_device *dev; /* device */ + u_int16_t num_total_nodes; /* total number of nodes */ + u_int16_t num_local_nodes; /* number of local nodes */ + u_int16_t local_nodes[CLUSTERIP_MAX_NODES]; /* node number array */ + +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *pde; /* proc dir entry */ +#endif + enum clusterip_hashmode hash_mode; /* which hashing mode */ + u_int32_t hash_initval; /* hash initialization */ +}; + +static LIST_HEAD(clusterip_configs); + +/* clusterip_lock protects the clusterip_configs list _AND_ the configurable + * data within all structurses (num_local_nodes, local_nodes[]) */ +static DECLARE_RWLOCK(clusterip_lock); + +#ifdef CONFIG_PROC_FS +static struct file_operations clusterip_proc_fops; +static struct proc_dir_entry *clusterip_procdir; +#endif + +static inline void +clusterip_config_get(struct clusterip_config *c) { + atomic_inc(&c->refcount); +} + +static inline void +clusterip_config_put(struct clusterip_config *c) { + if (atomic_dec_and_test(&c->refcount)) { + WRITE_LOCK(&clusterip_lock); + list_del(&c->list); + WRITE_UNLOCK(&clusterip_lock); + dev_mc_delete(c->dev, c->clustermac, ETH_ALEN, 0); + dev_put(c->dev); + kfree(c); + } +} + + +static struct clusterip_config * +__clusterip_config_find(u_int32_t clusterip) +{ + struct list_head *pos; + + MUST_BE_READ_LOCKED(&clusterip_lock); + list_for_each(pos, &clusterip_configs) { + struct clusterip_config *c = list_entry(pos, + struct clusterip_config, list); + if (c->clusterip == clusterip) { + return c; + } + } + + return NULL; +} + +static inline struct clusterip_config * +clusterip_config_find_get(u_int32_t clusterip) +{ + struct clusterip_config *c; + + READ_LOCK(&clusterip_lock); + c = __clusterip_config_find(clusterip); + if (!c) { + READ_UNLOCK(&clusterip_lock); + return NULL; + } + atomic_inc(&c->refcount); + READ_UNLOCK(&clusterip_lock); + + return c; +} + +static struct clusterip_config * +clusterip_config_init(struct ipt_clusterip_tgt_info *i, u_int32_t ip, + struct net_device *dev) +{ + struct clusterip_config *c; + char buffer[16]; + + c = kmalloc(sizeof(*c), GFP_ATOMIC); + if (!c) + return NULL; + + memset(c, 0, sizeof(*c)); + c->dev = dev; + c->clusterip = ip; + memcpy(&c->clustermac, &i->clustermac, ETH_ALEN); + c->num_total_nodes = i->num_total_nodes; + c->num_local_nodes = i->num_local_nodes; + memcpy(&c->local_nodes, &i->local_nodes, sizeof(&c->local_nodes)); + c->hash_mode = i->hash_mode; + c->hash_initval = i->hash_initval; + atomic_set(&c->refcount, 1); + +#ifdef CONFIG_PROC_FS + /* create proc dir entry */ + sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(ip)); + c->pde = create_proc_entry(buffer, S_IWUSR|S_IRUSR, clusterip_procdir); + if (!c->pde) { + kfree(c); + return NULL; + } + c->pde->proc_fops = &clusterip_proc_fops; + c->pde->data = c; +#endif + + WRITE_LOCK(&clusterip_lock); + list_add(&c->list, &clusterip_configs); + 
WRITE_UNLOCK(&clusterip_lock); + + return c; +} + +static int +clusterip_add_node(struct clusterip_config *c, u_int16_t nodenum) +{ + int i; + + WRITE_LOCK(&clusterip_lock); + + if (c->num_local_nodes >= CLUSTERIP_MAX_NODES + || nodenum > CLUSTERIP_MAX_NODES) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + + /* check if we alrady have this number in our array */ + for (i = 0; i < c->num_local_nodes; i++) { + if (c->local_nodes[i] == nodenum) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + } + + c->local_nodes[c->num_local_nodes++] = nodenum; + + WRITE_UNLOCK(&clusterip_lock); + return 0; +} + +static int +clusterip_del_node(struct clusterip_config *c, u_int16_t nodenum) +{ + int i; + + WRITE_LOCK(&clusterip_lock); + + if (c->num_local_nodes <= 1 || nodenum > CLUSTERIP_MAX_NODES) { + WRITE_UNLOCK(&clusterip_lock); + return 1; + } + + for (i = 0; i < c->num_local_nodes; i++) { + if (c->local_nodes[i] == nodenum) { + int size = sizeof(u_int16_t)*(c->num_local_nodes-(i+1)); + memmove(&c->local_nodes[i], &c->local_nodes[i+1], size); + c->num_local_nodes--; + WRITE_UNLOCK(&clusterip_lock); + return 0; + } + } + + WRITE_UNLOCK(&clusterip_lock); + return 1; +} + +static inline u_int32_t +clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) +{ + struct iphdr *iph = skb->nh.iph; + unsigned long hashval; + u_int16_t sport, dport; + struct tcphdr *th; + struct udphdr *uh; + struct icmphdr *ih; + + switch (iph->protocol) { + case IPPROTO_TCP: + th = (void *)iph+iph->ihl*4; + sport = ntohs(th->source); + dport = ntohs(th->dest); + break; + case IPPROTO_UDP: + uh = (void *)iph+iph->ihl*4; + sport = ntohs(uh->source); + dport = ntohs(uh->dest); + break; + case IPPROTO_ICMP: + ih = (void *)iph+iph->ihl*4; + sport = ntohs(ih->un.echo.id); + dport = (ih->type<<8)|ih->code; + break; + default: + if (net_ratelimit()) { + printk(KERN_NOTICE "CLUSTERIP: unknown protocol `%u'\n", + iph->protocol); + } + sport = dport = 0; + } + + switch (config->hash_mode) { + case CLUSTERIP_HASHMODE_SIP: + hashval = jhash_1word(ntohl(iph->saddr), + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT: + hashval = jhash_2words(ntohl(iph->saddr), sport, + config->hash_initval); + break; + case CLUSTERIP_HASHMODE_SIP_SPT_DPT: + hashval = jhash_3words(ntohl(iph->saddr), sport, dport, + config->hash_initval); + break; + default: + /* to make gcc happy */ + hashval = 0; + /* This cannot happen, unless the check function wasn't called + * at rule load time */ + printk("CLUSTERIP: unknown mode `%u'\n", config->hash_mode); + BUG(); + break; + } + + /* node numbers are 1..n, not 0..n */ + return ((hashval % config->num_total_nodes)+1); +} + +static inline int +clusterip_responsible(struct clusterip_config *config, u_int32_t hash) +{ + int i; + + READ_LOCK(&clusterip_lock); + + if (config->num_local_nodes == 0) { + READ_UNLOCK(&clusterip_lock); + return 0; + } + + for (i = 0; i < config->num_local_nodes; i++) { + if (config->local_nodes[i] == hash) { + READ_UNLOCK(&clusterip_lock); + return 1; + } + } + + READ_UNLOCK(&clusterip_lock); + + return 0; +} + +/*********************************************************************** + * IPTABLES TARGET + ***********************************************************************/ + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_clusterip_tgt_info *cipinfo = targinfo; + enum ip_conntrack_info ctinfo; + 
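clusterip_hashfn() and clusterip_responsible() above boil down to: hash the configured packet fields, fold the hash onto node numbers 1..num_total_nodes, and accept the packet only if the local node list contains that number. A sketch with a stand-in hash (the kernel uses jhash seeded with the shared hash_initval; the multiplier below is only for illustration):

#include <stdio.h>

/* NOT jhash - just a deterministic stand-in for the example */
static unsigned int toy_hash(unsigned int saddr, unsigned int initval)
{
    return (saddr * 2654435761u) ^ initval;
}

/* node numbers are 1..n, never 0, exactly as in clusterip_hashfn() */
static unsigned int node_for(unsigned int saddr, unsigned int initval,
                             unsigned int num_total_nodes)
{
    return (toy_hash(saddr, initval) % num_total_nodes) + 1;
}

int main(void)
{
    unsigned int ip;

    /* every cluster member computes the same mapping, so exactly one
     * of them considers itself responsible for a given source */
    for (ip = 0x0a000001; ip <= 0x0a000004; ip++)
        printf("src 0x%08x -> node %u of 2\n",
               ip, node_for(ip, 0x12345678, 2));
    return 0;
}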
struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); + u_int32_t hash; + + /* don't need to clusterip_config_get() here, since refcount + * is only decremented by destroy() - and ip_tables guarantees + * that the ->target() function isn't called after ->destroy() */ + + if (!ct) { + printk(KERN_ERR "CLUSTERIP: no conntrack!\n"); + /* FIXME: need to drop invalid ones, since replies + * to outgoing connections of other nodes will be + * marked as INVALID */ + return NF_DROP; + } + + /* special case: ICMP error handling. conntrack distinguishes between + * error messages (RELATED) and information requests (see below) */ + if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP + && (ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_IS_REPLY+IP_CT_IS_REPLY)) + return IPT_CONTINUE; + + /* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO, + * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here + * on, which all have an ID field [relevant for hashing]. */ + + hash = clusterip_hashfn(*pskb, cipinfo->config); + + switch (ctinfo) { + case IP_CT_NEW: + ct->mark = hash; + break; + case IP_CT_RELATED: + case IP_CT_RELATED+IP_CT_IS_REPLY: + /* FIXME: we don't handle expectations at the + * moment. they can arrive on a different node than + * the master connection (e.g. FTP passive mode) */ + case IP_CT_ESTABLISHED: + case IP_CT_ESTABLISHED+IP_CT_IS_REPLY: + break; + default: + break; + } + +#ifdef DEBUG_CLUSTERP + DUMP_TUPLE(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); +#endif + DEBUGP("hash=%u ct_hash=%lu ", hash, ct->mark); + if (!clusterip_responsible(cipinfo->config, hash)) { + DEBUGP("not responsible\n"); + return NF_DROP; + } + DEBUGP("responsible\n"); + + /* despite being received via linklayer multicast, this is + * actually a unicast IP packet. 
TCP doesn't like PACKET_MULTICAST */ + (*pskb)->pkt_type = PACKET_HOST; + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_clusterip_tgt_info *cipinfo = targinfo; + + struct clusterip_config *config; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))) { + printk(KERN_WARNING "CLUSTERIP: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_clusterip_tgt_info))); + return 0; + } + + if (cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT && + cipinfo->hash_mode != CLUSTERIP_HASHMODE_SIP_SPT_DPT) { + printk(KERN_WARNING "CLUSTERIP: unknown mode `%u'\n", + cipinfo->hash_mode); + return 0; + + } + if (e->ip.dmsk.s_addr != 0xffffffff + || e->ip.dst.s_addr == 0) { + printk(KERN_ERR "CLUSTERIP: Please specify destination IP\n"); + return 0; + } + + /* FIXME: further sanity checks */ + + config = clusterip_config_find_get(e->ip.dst.s_addr); + if (!config) { + if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) { + printk(KERN_WARNING "CLUSTERIP: no config found for %u.%u.%u.%u, need 'new'\n", NIPQUAD(e->ip.dst.s_addr)); + return 0; + } else { + struct net_device *dev; + + if (e->ip.iniface[0] == '\0') { + printk(KERN_WARNING "CLUSTERIP: Please specify an interface name\n"); + return 0; + } + + dev = dev_get_by_name(e->ip.iniface); + if (!dev) { + printk(KERN_WARNING "CLUSTERIP: no such interface %s\n", e->ip.iniface); + return 0; + } + + config = clusterip_config_init(cipinfo, + e->ip.dst.s_addr, dev); + if (!config) { + printk(KERN_WARNING "CLUSTERIP: cannot allocate config\n"); + dev_put(dev); + return 0; + } + dev_mc_add(config->dev,config->clustermac, ETH_ALEN, 0); + } + } + + cipinfo->config = config; + + return 1; +} + +/* drop reference count of cluster config when rule is deleted */ +static void destroy(void *matchinfo, unsigned int matchinfosize) +{ + struct ipt_clusterip_tgt_info *cipinfo = matchinfo; + + /* we first remove the proc entry and then drop the reference + * count. 
In case anyone still accesses the file, the open/close + * functions are also incrementing the refcount on their own */ +#ifdef CONFIG_PROC_FS + remove_proc_entry(cipinfo->config->pde->name, + cipinfo->config->pde->parent); +#endif + clusterip_config_put(cipinfo->config); +} + +static struct ipt_target clusterip_tgt = { + .name = "CLUSTERIP", + .target = &target, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + + +/*********************************************************************** + * ARP MANGLING CODE + ***********************************************************************/ + +/* hardcoded for 48bit ethernet and 32bit ipv4 addresses */ +struct arp_payload { + u_int8_t src_hw[ETH_ALEN]; + u_int32_t src_ip; + u_int8_t dst_hw[ETH_ALEN]; + u_int32_t dst_ip; +} __attribute__ ((packed)); + +#ifdef CLUSTERIP_DEBUG +static void arp_print(struct arp_payload *payload) +{ +#define HBUFFERLEN 30 + char hbuffer[HBUFFERLEN]; + int j,k; + const char hexbuf[]= "0123456789abcdef"; + + for (k=0, j=0; k < HBUFFERLEN-3 && j < ETH_ALEN; j++) { + hbuffer[k++]=hexbuf[(payload->src_hw[j]>>4)&15]; + hbuffer[k++]=hexbuf[payload->src_hw[j]&15]; + hbuffer[k++]=':'; + } + hbuffer[--k]='\0'; + + printk("src %u.%u.%u.%u@%s, dst %u.%u.%u.%u\n", + NIPQUAD(payload->src_ip), hbuffer, + NIPQUAD(payload->dst_ip)); +} +#endif + +static unsigned int +arp_mangle(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct arphdr *arp = (*pskb)->nh.arph; + struct arp_payload *payload; + struct clusterip_config *c; + + /* we don't care about non-ethernet and non-ipv4 ARP */ + if (arp->ar_hrd != htons(ARPHRD_ETHER) + || arp->ar_pro != htons(ETH_P_IP) + || arp->ar_pln != 4 || arp->ar_hln != ETH_ALEN) + return NF_ACCEPT; + + /* we only want to mangle arp replies */ + if (arp->ar_op != htons(ARPOP_REPLY)) + return NF_ACCEPT; + + payload = (void *)(arp+1); + + /* if there is no clusterip configuration for the arp reply's + * source ip, we don't want to mangle it */ + c = clusterip_config_find_get(payload->src_ip); + if (!c) + return NF_ACCEPT; + + /* normally the linux kernel always replies to arp queries of + * addresses on different interfacs. 
However, in the CLUSTERIP case + * this wouldn't work, since we didn't subscribe the mcast group on + * other interfaces */ + if (c->dev != out) { + DEBUGP("CLUSTERIP: not mangling arp reply on different " + "interface: cip'%s'-skb'%s'\n", c->dev->name, out->name); + clusterip_config_put(c); + return NF_ACCEPT; + } + + /* mangle reply hardware address */ + memcpy(payload->src_hw, c->clustermac, arp->ar_hln); + +#ifdef CLUSTERIP_DEBUG + DEBUGP(KERN_DEBUG "CLUSTERIP mangled arp reply: "); + arp_print(payload); +#endif + + clusterip_config_put(c); + + return NF_ACCEPT; +} + +static struct nf_hook_ops cip_arp_ops = { + .hook = arp_mangle, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + .priority = -1 +}; + +/*********************************************************************** + * PROC DIR HANDLING + ***********************************************************************/ + +#ifdef CONFIG_PROC_FS + +static void *clusterip_seq_start(struct seq_file *s, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx; + + READ_LOCK(&clusterip_lock); + if (*pos >= c->num_local_nodes) + return NULL; + + nodeidx = kmalloc(sizeof(unsigned int), GFP_KERNEL); + if (!nodeidx) + return ERR_PTR(-ENOMEM); + + *nodeidx = *pos; + return nodeidx; +} + +static void *clusterip_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx = (unsigned int *)v; + + *pos = ++(*nodeidx); + if (*pos >= c->num_local_nodes) { + kfree(v); + return NULL; + } + return nodeidx; +} + +static void clusterip_seq_stop(struct seq_file *s, void *v) +{ + kfree(v); + + READ_UNLOCK(&clusterip_lock); +} + +static int clusterip_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct clusterip_config *c = pde->data; + unsigned int *nodeidx = (unsigned int *)v; + + if (*nodeidx != 0) + seq_putc(s, ','); + seq_printf(s, "%u", c->local_nodes[*nodeidx]); + + if (*nodeidx == c->num_local_nodes-1) + seq_putc(s, '\n'); + + return 0; +} + +static struct seq_operations clusterip_seq_ops = { + .start = clusterip_seq_start, + .next = clusterip_seq_next, + .stop = clusterip_seq_stop, + .show = clusterip_seq_show, +}; + +static int clusterip_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &clusterip_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + struct proc_dir_entry *pde = PDE(inode); + struct clusterip_config *c = pde->data; + + sf->private = pde; + + clusterip_config_get(c); + } + + return ret; +} + +static int clusterip_proc_release(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct clusterip_config *c = pde->data; + int ret; + + ret = seq_release(inode, file); + + if (!ret) + clusterip_config_put(c); + + return ret; +} + +static ssize_t clusterip_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *ofs) +{ +#define PROC_WRITELEN 10 + char buffer[PROC_WRITELEN+1]; + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct clusterip_config *c = pde->data; + unsigned long nodenum; + + if (copy_from_user(buffer, input, PROC_WRITELEN)) + return -EFAULT; + + if (*buffer == '+') { + nodenum = simple_strtoul(buffer+1, NULL, 10); + if (clusterip_add_node(c, nodenum)) + return -ENOMEM; + } else if (*buffer == '-') { + nodenum = simple_strtoul(buffer+1, NULL,10); + if (clusterip_del_node(c, nodenum)) + return -ENOENT; + } 
else + return -EIO; + + return size; +} + +static struct file_operations clusterip_proc_fops = { + .owner = THIS_MODULE, + .open = clusterip_proc_open, + .read = seq_read, + .write = clusterip_proc_write, + .llseek = seq_lseek, + .release = clusterip_proc_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static int init_or_cleanup(int fini) +{ + int ret; + + if (fini) + goto cleanup; + + if (ipt_register_target(&clusterip_tgt)) { + ret = -EINVAL; + goto cleanup_none; + } + + if (nf_register_hook(&cip_arp_ops) < 0) { + ret = -EINVAL; + goto cleanup_target; + } + +#ifdef CONFIG_PROC_FS + clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", proc_net); + if (!clusterip_procdir) { + printk(KERN_ERR "CLUSTERIP: Unable to proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_hook; + } +#endif /* CONFIG_PROC_FS */ + + printk(KERN_NOTICE "ClusterIP Version %s loaded successfully\n", + CLUSTERIP_VERSION); + + return 0; + +cleanup: + printk(KERN_NOTICE "ClusterIP Version %s unloading\n", + CLUSTERIP_VERSION); +#ifdef CONFIG_PROC_FS + remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent); +#endif +cleanup_hook: + nf_unregister_hook(&cip_arp_ops); +cleanup_target: + ipt_unregister_target(&clusterip_tgt); +cleanup_none: + return -EINVAL; +} + +static int __init init(void) +{ + return init_or_cleanup(0); +} + +static void __exit fini(void) +{ + init_or_cleanup(1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_CONNMARK.c b/net/ipv4/netfilter/ipt_CONNMARK.c new file mode 100644 index 000000000000..30ddd3e18eb7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_CONNMARK.c @@ -0,0 +1,118 @@ +/* This kernel module is used to modify the connection mark values, or + * to optionally restore the skb nfmark from the connection mark + * + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
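As the write handler above shows, each configured cluster address gets a file under /proc/net/ipt_CLUSTERIP/ that accepts "+<n>" to claim responsibility for node <n> and "-<n>" to drop it. A hedged usage sketch (the address below is an example, not taken from this patch):

#include <stdio.h>

int main(void)
{
    /* path is /proc/net/ipt_CLUSTERIP/<cluster ip> */
    FILE *f = fopen("/proc/net/ipt_CLUSTERIP/192.168.0.5", "w");

    if (!f) {
        perror("clusterip proc entry");
        return 1;
    }
    fputs("+2", f);   /* this machine now also answers as node 2 */
    fclose(f);
    return 0;
}

Reading the same file back goes through the seq_file handlers above and prints the comma-separated list of locally claimed node numbers.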
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +#include +#include +#include +#include + +MODULE_AUTHOR("Henrik Nordstrom "); +MODULE_DESCRIPTION("IP tables CONNMARK matching module"); +MODULE_LICENSE("GPL"); + +#include +#include +#include + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_connmark_target_info *markinfo = targinfo; + unsigned long diff; + unsigned long nfmark; + unsigned long newmark; + + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((*pskb), &ctinfo); + if (ct) { + switch(markinfo->mode) { + case IPT_CONNMARK_SET: + newmark = (ct->mark & ~markinfo->mask) | markinfo->mark; + if (newmark != ct->mark) + ct->mark = newmark; + break; + case IPT_CONNMARK_SAVE: + newmark = (ct->mark & ~markinfo->mask) | ((*pskb)->nfmark & markinfo->mask); + if (ct->mark != newmark) + ct->mark = newmark; + break; + case IPT_CONNMARK_RESTORE: + nfmark = (*pskb)->nfmark; + diff = (ct->mark ^ nfmark) & markinfo->mask; + if (diff != 0) { + (*pskb)->nfmark = nfmark ^ diff; + (*pskb)->nfcache |= NFC_ALTERED; + } + break; + } + } + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_connmark_target_info *matchinfo = targinfo; + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_connmark_target_info))) { + printk(KERN_WARNING "CONNMARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_connmark_target_info))); + return 0; + } + + if (matchinfo->mode == IPT_CONNMARK_RESTORE) { + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "CONNMARK: restore can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + } + + return 1; +} + +static struct ipt_target ipt_connmark_reg = { + .name = "CONNMARK", + .target = &target, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_connmark_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_connmark_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_DSCP.c b/net/ipv4/netfilter/ipt_DSCP.c new file mode 100644 index 000000000000..3ea4509099f9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_DSCP.c @@ -0,0 +1,106 @@ +/* iptables module for setting the IPv4 DSCP field, Version 1.8 + * + * (C) 2002 by Harald Welte + * based on ipt_FTOS.c (C) 2000 by Matthew G. Marsh + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * See RFC2474 for a description of the DSCP field within the IP Header. 
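The three CONNMARK modes above are plain mask arithmetic on 32-bit marks; the worked values below are invented but mirror the expressions in target() exactly:

#include <stdio.h>

int main(void)
{
    unsigned long ctmark = 0x00001234;  /* mark stored in the conntrack */
    unsigned long nfmark = 0x0000ab00;  /* mark carried by the skb */
    unsigned long mask   = 0x0000ff00;
    unsigned long mark   = 0x00005600;  /* value from the rule */
    unsigned long diff;

    /* SET: replace the masked part of the conntrack mark with the rule's value */
    printf("SET:     0x%08lx\n", (ctmark & ~mask) | mark);             /* 0x00005634 */

    /* SAVE: copy the masked bits of the packet mark into the conntrack */
    printf("SAVE:    0x%08lx\n", (ctmark & ~mask) | (nfmark & mask));  /* 0x0000ab34 */

    /* RESTORE: copy the masked bits of the conntrack mark back onto the skb,
     * written as nfmark ^ diff, exactly like the target above */
    diff = (ctmark ^ nfmark) & mask;
    printf("RESTORE: 0x%08lx\n", nfmark ^ diff);                       /* 0x00001200 */
    return 0;
}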
+ * + * ipt_DSCP.c,v 1.8 2002/08/06 18:41:57 laforge Exp +*/ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables DSCP modification module"); +MODULE_LICENSE("GPL"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_DSCP_info *dinfo = targinfo; + u_int8_t sh_dscp = ((dinfo->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK); + + + if (((*pskb)->nh.iph->tos & IPT_DSCP_MASK) != sh_dscp) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return NF_DROP; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos = ((*pskb)->nh.iph->tos & ~IPT_DSCP_MASK) + | sh_dscp; + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^ 0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const u_int8_t dscp = ((struct ipt_DSCP_info *)targinfo)->dscp; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_DSCP_info))) { + printk(KERN_WARNING "DSCP: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_DSCP_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "DSCP: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if ((dscp > IPT_DSCP_MAX)) { + printk(KERN_WARNING "DSCP: dscp %x out of range\n", dscp); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_dscp_reg = { + .name = "DSCP", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_dscp_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_dscp_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ECN.c b/net/ipv4/netfilter/ipt_ECN.c new file mode 100644 index 000000000000..ada9911118e9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_ECN.c @@ -0,0 +1,175 @@ +/* iptables module for the IPv4 and TCP ECN bits, Version 1.5 + * + * (C) 2002 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * ipt_ECN.c,v 1.5 2002/08/18 19:36:51 laforge Exp +*/ + +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables ECN modification module"); + +/* set ECT codepoint from IP header. + * return 0 if there was an error. 
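The diffs[]/csum_fold() dance in the DSCP target above is the standard incremental checksum update from RFC 1624: instead of recomputing the whole IP header checksum, add the one's complement of the old 16-bit word and the new word into the existing sum. A self-contained userspace sketch with a made-up header (all values arbitrary):

#include <stdio.h>

/* 16-bit one's complement addition with end-around carry */
static unsigned short oc_add(unsigned short a, unsigned short b)
{
    unsigned int s = (unsigned int)a + b;
    return (unsigned short)((s + (s >> 16)) & 0xffff);
}

/* Internet checksum over n 16-bit words */
static unsigned short cksum(const unsigned short *w, int n)
{
    unsigned short sum = 0;

    while (n--)
        sum = oc_add(sum, *w++);
    return (unsigned short)~sum;
}

int main(void)
{
    /* a made-up IPv4 header as ten 16-bit words; word 0 carries
     * version/IHL and TOS, word 5 is the header checksum */
    unsigned short hdr[10] = { 0x4500, 0x0054, 0x1c46, 0x4000,
                               0x4001, 0x0000, 0xc0a8, 0x0001,
                               0xc0a8, 0x00c7 };
    unsigned short old_word, new_word, sum;
    int i;

    hdr[5] = cksum(hdr, 10);                              /* full checksum, once */

    old_word = hdr[0];
    hdr[0] = (unsigned short)((hdr[0] & 0xff00) | 0x10);  /* rewrite the TOS byte */
    new_word = hdr[0];

    /* incremental update: checksum' = ~(~checksum + ~old_word + new_word) */
    sum = oc_add((unsigned short)~hdr[5], (unsigned short)~old_word);
    sum = oc_add(sum, new_word);
    hdr[5] = (unsigned short)~sum;

    /* receiver's view: the sum over the whole header is now all ones */
    sum = 0;
    for (i = 0; i < 10; i++)
        sum = oc_add(sum, hdr[i]);
    printf("verify: 0x%04x (expect 0xffff)\n", sum);
    return 0;
}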
*/ +static inline int +set_ect_ip(struct sk_buff **pskb, const struct ipt_ECN_info *einfo) +{ + if (((*pskb)->nh.iph->tos & IPT_ECN_IP_MASK) + != (einfo->ip_ect & IPT_ECN_IP_MASK)) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return 0; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos &= ~IPT_ECN_IP_MASK; + (*pskb)->nh.iph->tos |= (einfo->ip_ect & IPT_ECN_IP_MASK); + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return 1; +} + +/* Return 0 if there was an error. */ +static inline int +set_ect_tcp(struct sk_buff **pskb, const struct ipt_ECN_info *einfo, int inward) +{ + struct tcphdr _tcph, *tcph; + u_int16_t diffs[2]; + + /* Not enought header? */ + tcph = skb_header_pointer(*pskb, (*pskb)->nh.iph->ihl*4, + sizeof(_tcph), &_tcph); + if (!tcph) + return 0; + + if (!(einfo->operation & IPT_ECN_OP_SET_ECE + || tcph->ece == einfo->proto.tcp.ece) + && (!(einfo->operation & IPT_ECN_OP_SET_CWR + || tcph->cwr == einfo->proto.tcp.cwr))) + return 1; + + if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl*4+sizeof(*tcph))) + return 0; + tcph = (void *)(*pskb)->nh.iph + (*pskb)->nh.iph->ihl*4; + + diffs[0] = ((u_int16_t *)tcph)[6]; + if (einfo->operation & IPT_ECN_OP_SET_ECE) + tcph->ece = einfo->proto.tcp.ece; + if (einfo->operation & IPT_ECN_OP_SET_CWR) + tcph->cwr = einfo->proto.tcp.cwr; + diffs[1] = ((u_int16_t *)tcph)[6]; + diffs[0] = diffs[0] ^ 0xFFFF; + + if ((*pskb)->ip_summed != CHECKSUM_HW) + tcph->check = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + tcph->check^0xFFFF)); + else + if (skb_checksum_help(*pskb, inward)) + return 0; + (*pskb)->nfcache |= NFC_ALTERED; + return 1; +} + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_ECN_info *einfo = targinfo; + + if (einfo->operation & IPT_ECN_OP_SET_IP) + if (!set_ect_ip(pskb, einfo)) + return NF_DROP; + + if (einfo->operation & (IPT_ECN_OP_SET_ECE | IPT_ECN_OP_SET_CWR) + && (*pskb)->nh.iph->protocol == IPPROTO_TCP) + if (!set_ect_tcp(pskb, einfo, (out == NULL))) + return NF_DROP; + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_ECN_info *einfo = (struct ipt_ECN_info *)targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ECN_info))) { + printk(KERN_WARNING "ECN: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_ECN_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "ECN: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (einfo->operation & IPT_ECN_OP_MASK) { + printk(KERN_WARNING "ECN: unsupported ECN operation %x\n", + einfo->operation); + return 0; + } + if (einfo->ip_ect & ~IPT_ECN_IP_MASK) { + printk(KERN_WARNING "ECN: new ECT codepoint %x out of mask\n", + einfo->ip_ect); + return 0; + } + + if ((einfo->operation & (IPT_ECN_OP_SET_ECE|IPT_ECN_OP_SET_CWR)) + && (e->ip.proto != IPPROTO_TCP || (e->ip.invflags & IPT_INV_PROTO))) { + printk(KERN_WARNING "ECN: cannot use TCP operations on a " + "non-tcp rule\n"); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_ecn_reg = { + .name = 
"ECN", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_ecn_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_ecn_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c new file mode 100644 index 000000000000..ef08733d26da --- /dev/null +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -0,0 +1,485 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables syslog logging module"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Use lock to serialize, so printks don't overlap */ +static DEFINE_SPINLOCK(log_lock); + +/* One level of recursion won't kill us */ +static void dump_packet(const struct ipt_log_info *info, + const struct sk_buff *skb, + unsigned int iphoff) +{ + struct iphdr _iph, *ih; + + ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); + if (ih == NULL) { + printk("TRUNCATED"); + return; + } + + /* Important fields: + * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ + /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ + printk("SRC=%u.%u.%u.%u DST=%u.%u.%u.%u ", + NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); + + /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ + printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, + ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); + + /* Max length: 6 "CE DF MF " */ + if (ntohs(ih->frag_off) & IP_CE) + printk("CE "); + if (ntohs(ih->frag_off) & IP_DF) + printk("DF "); + if (ntohs(ih->frag_off) & IP_MF) + printk("MF "); + + /* Max length: 11 "FRAG:65535 " */ + if (ntohs(ih->frag_off) & IP_OFFSET) + printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + + if ((info->logflags & IPT_LOG_IPOPT) + && ih->ihl * 4 > sizeof(struct iphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct iphdr)], *op; + unsigned int i, optsize; + + optsize = ih->ihl * 4 - sizeof(struct iphdr); + op = skb_header_pointer(skb, iphoff+sizeof(_iph), + optsize, _opt); + if (op == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = 0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + /* Max length: 10 "PROTO=TCP " */ + printk("PROTO=TCP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 
ACK=4294967295 " */ + if (info->logflags & IPT_LOG_TCPSEQ) + printk("SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + printk("WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3F " */ + printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + printk("CWR "); + if (th->ece) + printk("ECE "); + if (th->urg) + printk("URG "); + if (th->ack) + printk("ACK "); + if (th->psh) + printk("PSH "); + if (th->rst) + printk("RST "); + if (th->syn) + printk("SYN "); + if (th->fin) + printk("FIN "); + /* Max length: 11 "URGP=65535 " */ + printk("URGP=%u ", ntohs(th->urg_ptr)); + + if ((info->logflags & IPT_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { + unsigned char _opt[4 * 15 - sizeof(struct tcphdr)]; + unsigned char *op; + unsigned int i, optsize; + + optsize = th->doff * 4 - sizeof(struct tcphdr); + op = skb_header_pointer(skb, + iphoff+ih->ihl*4+sizeof(_tcph), + optsize, _opt); + if (op == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i = 0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + /* Max length: 10 "PROTO=UDP " */ + printk("PROTO=UDP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_udph), &_udph); + if (uh == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMP: { + struct icmphdr _icmph, *ich; + static size_t required_len[NR_ICMP_TYPES+1] + = { [ICMP_ECHOREPLY] = 4, + [ICMP_DEST_UNREACH] + = 8 + sizeof(struct iphdr), + [ICMP_SOURCE_QUENCH] + = 8 + sizeof(struct iphdr), + [ICMP_REDIRECT] + = 8 + sizeof(struct iphdr), + [ICMP_ECHO] = 4, + [ICMP_TIME_EXCEEDED] + = 8 + sizeof(struct iphdr), + [ICMP_PARAMETERPROB] + = 8 + sizeof(struct iphdr), + [ICMP_TIMESTAMP] = 20, + [ICMP_TIMESTAMPREPLY] = 20, + [ICMP_ADDRESS] = 12, + [ICMP_ADDRESSREPLY] = 12 }; + + /* Max length: 11 "PROTO=ICMP " */ + printk("PROTO=ICMP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, + sizeof(_icmph), &_icmph); + if (ich == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + printk("TYPE=%u CODE=%u ", ich->type, ich->code); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + if (ich->type <= NR_ICMP_TYPES + && required_len[ich->type] + && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + switch (ich->type) { + case ICMP_ECHOREPLY: + case ICMP_ECHO: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + printk("ID=%u SEQ=%u ", + ntohs(ich->un.echo.id), + ntohs(ich->un.echo.sequence)); + break; + + case ICMP_PARAMETERPROB: + /* Max length: 14 "PARAMETER=255 " */ + printk("PARAMETER=%u ", + ntohl(ich->un.gateway) >> 24); + break; + case ICMP_REDIRECT: + /* Max length: 24 "GATEWAY=255.255.255.255 " */ + printk("GATEWAY=%u.%u.%u.%u ", + NIPQUAD(ich->un.gateway)); + /* Fall through */ + case ICMP_DEST_UNREACH: 
+ case ICMP_SOURCE_QUENCH: + case ICMP_TIME_EXCEEDED: + /* Max length: 3+maxlen */ + if (!iphoff) { /* Only recurse once. */ + printk("["); + dump_packet(info, skb, + iphoff + ih->ihl*4+sizeof(_icmph)); + printk("] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ich->type == ICMP_DEST_UNREACH + && ich->code == ICMP_FRAG_NEEDED) + printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + } + break; + } + /* Max Length */ + case IPPROTO_AH: { + struct ip_auth_hdr _ahdr, *ah; + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 9 "PROTO=AH " */ + printk("PROTO=AH "); + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ah = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(ah->spi)); + break; + } + case IPPROTO_ESP: { + struct ip_esp_hdr _esph, *eh; + + /* Max length: 10 "PROTO=ESP " */ + printk("PROTO=ESP "); + + if (ntohs(ih->frag_off) & IP_OFFSET) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + eh = skb_header_pointer(skb, iphoff+ih->ihl*4, + sizeof(_esph), &_esph); + if (eh == NULL) { + printk("INCOMPLETE [%u bytes] ", + skb->len - iphoff - ih->ihl*4); + break; + } + + /* Length: 15 "SPI=0xF1234567 " */ + printk("SPI=0x%x ", ntohl(eh->spi)); + break; + } + /* Max length: 10 "PROTO 255 " */ + default: + printk("PROTO=%u ", ih->protocol); + } + + /* Max length: 15 "UID=4294967295 " */ + if ((info->logflags & IPT_LOG_UID) && !iphoff && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } + + /* Proto Max log string length */ + /* IP: 40+46+6+11+127 = 230 */ + /* TCP: 10+max(25,20+30+13+9+32+11+127) = 252 */ + /* UDP: 10+max(25,20) = 35 */ + /* ICMP: 11+max(25, 18+25+max(19,14,24+3+n+10,3+n+10)) = 91+n */ + /* ESP: 10+max(25)+15 = 50 */ + /* AH: 9+max(25)+15 = 49 */ + /* unknown: 10 */ + + /* (ICMP allows recursion one level deep) */ + /* maxlen = IP + ICMP + IP + max(TCP,UDP,ICMP,unknown) */ + /* maxlen = 230+ 91 + 230 + 252 = 803 */ +} + +static void +ipt_log_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ipt_log_info *loginfo, + const char *level_string, + const char *prefix) +{ + spin_lock_bh(&log_lock); + printk(level_string); + printk("%sIN=%s OUT=%s ", + prefix == NULL ? loginfo->prefix : prefix, + in ? in->name : "", + out ? out->name : ""); +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge) { + struct net_device *physindev = skb->nf_bridge->physindev; + struct net_device *physoutdev = skb->nf_bridge->physoutdev; + + if (physindev && in != physindev) + printk("PHYSIN=%s ", physindev->name); + if (physoutdev && out != physoutdev) + printk("PHYSOUT=%s ", physoutdev->name); + } +#endif + + if (in && !out) { + /* MAC logging for input chain only. */ + printk("MAC="); + if (skb->dev && skb->dev->hard_header_len + && skb->mac.raw != (void*)skb->nh.iph) { + int i; + unsigned char *p = skb->mac.raw; + for (i = 0; i < skb->dev->hard_header_len; i++,p++) + printk("%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? 
' ':':'); + } else + printk(" "); + } + + dump_packet(loginfo, skb, 0); + printk("\n"); + spin_unlock_bh(&log_lock); +} + +static unsigned int +ipt_log_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_log_info *loginfo = targinfo; + char level_string[4] = "< >"; + + level_string[1] = '0' + (loginfo->level % 8); + ipt_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + + return IPT_CONTINUE; +} + +static void +ipt_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ipt_log_info loginfo = { + .level = 0, + .logflags = IPT_LOG_MASK, + .prefix = "" + }; + + ipt_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); +} + +static int ipt_log_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_log_info *loginfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_log_info))) { + DEBUGP("LOG: targinfosize %u != %u\n", + targinfosize, IPT_ALIGN(sizeof(struct ipt_log_info))); + return 0; + } + + if (loginfo->level >= 8) { + DEBUGP("LOG: level %u >= 8\n", loginfo->level); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + DEBUGP("LOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix)-1]); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_log_reg = { + .name = "LOG", + .target = ipt_log_target, + .checkentry = ipt_log_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_log_reg)) + return -EINVAL; + if (nflog) + nf_log_register(PF_INET, &ipt_logfn); + + return 0; +} + +static void __exit fini(void) +{ + if (nflog) + nf_log_unregister(PF_INET, &ipt_logfn); + ipt_unregister_target(&ipt_log_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MARK.c b/net/ipv4/netfilter/ipt_MARK.c new file mode 100644 index 000000000000..33c6f9b63b8d --- /dev/null +++ b/net/ipv4/netfilter/ipt_MARK.c @@ -0,0 +1,162 @@ +/* This is a module which is used for setting the NFMARK field of an skb. */ + +/* (C) 1999-2001 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
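 */

/* Editorial sketch (illustration only, not part of the original patch):
 * revision 1 of the MARK target below can overwrite, AND or OR the packet
 * mark with the value given in the rule.  The three modes reduce to this
 * stand-alone helper; the enum constants are local stand-ins for the real
 * IPT_MARK_SET / IPT_MARK_AND / IPT_MARK_OR values.
 */
enum mark_example_mode { EXAMPLE_MARK_SET, EXAMPLE_MARK_AND, EXAMPLE_MARK_OR };

static inline unsigned long mark_example(unsigned long nfmark,
					 unsigned long mark,
					 enum mark_example_mode mode)
{
	switch (mode) {
	case EXAMPLE_MARK_AND:
		return nfmark & mark;	/* keep only the bits also set in mark */
	case EXAMPLE_MARK_OR:
		return nfmark | mark;	/* add the bits set in mark */
	default:
		return mark;		/* SET: replace the mark entirely */
	}
}

/*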
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("iptables MARK modification module"); + +static unsigned int +target_v0(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info *markinfo = targinfo; + + if((*pskb)->nfmark != markinfo->mark) { + (*pskb)->nfmark = markinfo->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static unsigned int +target_v1(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_mark_target_info_v1 *markinfo = targinfo; + int mark = 0; + + switch (markinfo->mode) { + case IPT_MARK_SET: + mark = markinfo->mark; + break; + + case IPT_MARK_AND: + mark = (*pskb)->nfmark & markinfo->mark; + break; + + case IPT_MARK_OR: + mark = (*pskb)->nfmark | markinfo->mark; + break; + } + + if((*pskb)->nfmark != mark) { + (*pskb)->nfmark = mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + + +static int +checkentry_v0(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info))) { + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static int +checkentry_v1(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + struct ipt_mark_target_info_v1 *markinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))){ + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_mark_target_info_v1))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (markinfo->mode != IPT_MARK_SET + && markinfo->mode != IPT_MARK_AND + && markinfo->mode != IPT_MARK_OR) { + printk(KERN_WARNING "MARK: unknown mode %u\n", + markinfo->mode); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_mark_reg_v0 = { + .name = "MARK", + .target = target_v0, + .checkentry = checkentry_v0, + .me = THIS_MODULE, + .revision = 0, +}; + +static struct ipt_target ipt_mark_reg_v1 = { + .name = "MARK", + .target = target_v1, + .checkentry = checkentry_v1, + .me = THIS_MODULE, + .revision = 1, +}; + +static int __init init(void) +{ + int err; + + err = ipt_register_target(&ipt_mark_reg_v0); + if (!err) { + err = ipt_register_target(&ipt_mark_reg_v1); + if (err) + ipt_unregister_target(&ipt_mark_reg_v0); + } + return err; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_mark_reg_v0); + ipt_unregister_target(&ipt_mark_reg_v1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c new file mode 100644 index 000000000000..57e9f6cf1c36 --- /dev/null +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -0,0 +1,207 @@ +/* Masquerade. 
Simple mapping which alters range to a local IP address + (depending on route). */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables MASQUERADE target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Lock protects masq region inside conntrack */ +static DECLARE_RWLOCK(masq_lock); + +/* FIXME: Multiple targets. --RR */ +static int +masquerade_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("masquerade_check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("masquerade_check: size %u != %u.\n", + targinfosize, sizeof(*mr)); + return 0; + } + if (hook_mask & ~(1 << NF_IP_POST_ROUTING)) { + DEBUGP("masquerade_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("masquerade_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("masquerade_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +masquerade_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + const struct ip_nat_multi_range_compat *mr; + struct ip_nat_range newrange; + struct rtable *rt; + u_int32_t newsrc; + + IP_NF_ASSERT(hooknum == NF_IP_POST_ROUTING); + + /* FIXME: For the moment, don't do local packets, breaks + testsuite for 2.3.49 --RR */ + if ((*pskb)->sk) + return NF_ACCEPT; + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED + || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)); + + mr = targinfo; + rt = (struct rtable *)(*pskb)->dst; + newsrc = inet_select_addr(out, rt->rt_gateway, RT_SCOPE_UNIVERSE); + if (!newsrc) { + printk("MASQUERADE: %s ate my IP address\n", out->name); + return NF_DROP; + } + + WRITE_LOCK(&masq_lock); + ct->nat.masq_index = out->ifindex; + WRITE_UNLOCK(&masq_lock); + + /* Transfer from original range. */ + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newsrc, newsrc, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static inline int +device_cmp(struct ip_conntrack *i, void *ifindex) +{ + int ret; + + READ_LOCK(&masq_lock); + ret = (i->nat.masq_index == (int)(long)ifindex); + READ_UNLOCK(&masq_lock); + + return ret; +} + +static int masq_device_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + + if (event == NETDEV_DOWN) { + /* Device was downed. Search entire table for + conntracks which were associated with that device, + and forget them. 
*/ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static int masq_inet_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (event == NETDEV_DOWN) { + /* IP address was deleted. Search entire table for + conntracks which were associated with that device, + and forget them. */ + IP_NF_ASSERT(dev->ifindex != 0); + + ip_ct_iterate_cleanup(device_cmp, (void *)(long)dev->ifindex); + } + + return NOTIFY_DONE; +} + +static struct notifier_block masq_dev_notifier = { + .notifier_call = masq_device_event, +}; + +static struct notifier_block masq_inet_notifier = { + .notifier_call = masq_inet_event, +}; + +static struct ipt_target masquerade = { + .name = "MASQUERADE", + .target = masquerade_target, + .checkentry = masquerade_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int ret; + + ret = ipt_register_target(&masquerade); + + if (ret == 0) { + /* Register for device down reports */ + register_netdevice_notifier(&masq_dev_notifier); + /* Register IP address change reports */ + register_inetaddr_notifier(&masq_inet_notifier); + } + + return ret; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&masquerade); + unregister_netdevice_notifier(&masq_dev_notifier); + unregister_inetaddr_notifier(&masq_inet_notifier); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_NETMAP.c b/net/ipv4/netfilter/ipt_NETMAP.c new file mode 100644 index 000000000000..06254b29d034 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NETMAP.c @@ -0,0 +1,117 @@ +/* NETMAP - static NAT mapping of IP network addresses (1:1). + * The mapping can be applied to source (POSTROUTING), + * destination (PREROUTING), or both (with separate rules). + */ + +/* (C) 2000-2001 Svenning Soerensen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#define MODULENAME "NETMAP" +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Svenning Soerensen "); +MODULE_DESCRIPTION("iptables 1:1 NAT mapping of IP networks target"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
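
/* Illustrative aside (editorial, not part of the original patch): NETMAP
 * keeps the host part of the address and swaps in the network part of the
 * configured range, so mapping onto 10.0.0.0/24 turns x.y.z.7 into 10.0.0.7.
 * The arithmetic used by target() below, in stand-alone form:
 */
static inline unsigned int netmap_example(unsigned int orig_ip,
					  unsigned int min_ip,
					  unsigned int max_ip)
{
	unsigned int netmask = ~(min_ip ^ max_ip);	/* 1-bits where the range is fixed */

	return (orig_ip & ~netmask) | (min_ip & netmask);
}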
+#endif + +static int +check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP(MODULENAME":check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP(MODULENAME":check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_POST_ROUTING))) { + DEBUGP(MODULENAME":check: bad hooks %x.\n", hook_mask); + return 0; + } + if (!(mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)) { + DEBUGP(MODULENAME":check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP(MODULENAME":check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t new_ip, netmask; + const struct ip_nat_multi_range_compat *mr = targinfo; + struct ip_nat_range newrange; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_POST_ROUTING); + ct = ip_conntrack_get(*pskb, &ctinfo); + + netmask = ~(mr->range[0].min_ip ^ mr->range[0].max_ip); + + if (hooknum == NF_IP_PRE_ROUTING) + new_ip = (*pskb)->nh.iph->daddr & ~netmask; + else + new_ip = (*pskb)->nh.iph->saddr & ~netmask; + new_ip |= mr->range[0].min_ip & netmask; + + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + new_ip, new_ip, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target target_module = { + .name = MODULENAME, + .target = target, + .checkentry = check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_target(&target_module); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&target_module); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_NOTRACK.c b/net/ipv4/netfilter/ipt_NOTRACK.c new file mode 100644 index 000000000000..a4bb9b3bc292 --- /dev/null +++ b/net/ipv4/netfilter/ipt_NOTRACK.c @@ -0,0 +1,76 @@ +/* This is a module which is used for setting up fake conntracks + * on packets so that they are not seen by the conntrack/NAT code. + */ +#include +#include + +#include +#include + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + /* Previously seen (loopback)? Ignore. */ + if ((*pskb)->nfct != NULL) + return IPT_CONTINUE; + + /* Attach fake conntrack entry. + If there is a real ct entry corresponding to this packet, + it'll hang around till timing out. We don't deal with it + for performance reasons.
JK */ + (*pskb)->nfct = &ip_conntrack_untracked.ct_general; + (*pskb)->nfctinfo = IP_CT_NEW; + nf_conntrack_get((*pskb)->nfct); + + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != 0) { + printk(KERN_WARNING "NOTRACK: targinfosize %u != 0\n", + targinfosize); + return 0; + } + + if (strcmp(tablename, "raw") != 0) { + printk(KERN_WARNING "NOTRACK: can only be called from \"raw\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_notrack_reg = { + .name = "NOTRACK", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + if (ipt_register_target(&ipt_notrack_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_notrack_reg); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/netfilter/ipt_REDIRECT.c b/net/ipv4/netfilter/ipt_REDIRECT.c new file mode 100644 index 000000000000..d2e13447678e --- /dev/null +++ b/net/ipv4/netfilter/ipt_REDIRECT.c @@ -0,0 +1,129 @@ +/* Redirect. Simple mapping which alters dst to a local IP address. */ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables REDIRECT target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* FIXME: Take multiple ranges --RR */ +static int +redirect_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip_nat_multi_range_compat *mr = targinfo; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("redirect_check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("redirect_check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT))) { + DEBUGP("redirect_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->range[0].flags & IP_NAT_RANGE_MAP_IPS) { + DEBUGP("redirect_check: bad MAP_IPS.\n"); + return 0; + } + if (mr->rangesize != 1) { + DEBUGP("redirect_check: bad rangesize %u.\n", mr->rangesize); + return 0; + } + return 1; +} + +static unsigned int +redirect_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t newdst; + const struct ip_nat_multi_range_compat *mr = targinfo; + struct ip_nat_range newrange; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING + || hooknum == NF_IP_LOCAL_OUT); + + ct = ip_conntrack_get(*pskb, &ctinfo); + IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); + + /* Local packets: make them go to loopback */ + if (hooknum == NF_IP_LOCAL_OUT) + newdst = htonl(0x7F000001); + else { + struct in_device *indev; + + /* Device might not have an associated in_device.
*/ + indev = (struct in_device *)(*pskb)->dev->ip_ptr; + if (indev == NULL || indev->ifa_list == NULL) + return NF_DROP; + + /* Grab first address on interface. */ + newdst = indev->ifa_list->ifa_local; + } + + /* Transfer from original range. */ + newrange = ((struct ip_nat_range) + { mr->range[0].flags | IP_NAT_RANGE_MAP_IPS, + newdst, newdst, + mr->range[0].min, mr->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target redirect_reg = { + .name = "REDIRECT", + .target = redirect_target, + .checkentry = redirect_check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&redirect_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&redirect_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c new file mode 100644 index 000000000000..266d64979286 --- /dev/null +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -0,0 +1,335 @@ +/* + * This is a module which is used for rejecting packets. + * Added support for customized reject packets (Jozsef Kadlecsik). + * Added support for ICMP type-3-code-13 (Maciej Soltysiak). [RFC 1812] + */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_BRIDGE_NETFILTER +#include +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables REJECT target module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static inline struct rtable *route_reverse(struct sk_buff *skb, + struct tcphdr *tcph, int hook) +{ + struct iphdr *iph = skb->nh.iph; + struct dst_entry *odst; + struct flowi fl = {}; + struct rtable *rt; + + /* We don't require ip forwarding to be enabled to be able to + * send a RST reply for bridged traffic. */ + if (hook != NF_IP_FORWARD +#ifdef CONFIG_BRIDGE_NETFILTER + || (skb->nf_bridge && skb->nf_bridge->mask & BRNF_BRIDGED) +#endif + ) { + fl.nl_u.ip4_u.daddr = iph->saddr; + if (hook == NF_IP_LOCAL_IN) + fl.nl_u.ip4_u.saddr = iph->daddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + + if (ip_route_output_key(&rt, &fl) != 0) + return NULL; + } else { + /* non-local src, find valid iif to satisfy + * rp-filter when calling ip_route_input. 
*/ + fl.nl_u.ip4_u.daddr = iph->daddr; + if (ip_route_output_key(&rt, &fl) != 0) + return NULL; + + odst = skb->dst; + if (ip_route_input(skb, iph->saddr, iph->daddr, + RT_TOS(iph->tos), rt->u.dst.dev) != 0) { + dst_release(&rt->u.dst); + return NULL; + } + dst_release(&rt->u.dst); + rt = (struct rtable *)skb->dst; + skb->dst = odst; + + fl.nl_u.ip4_u.daddr = iph->saddr; + fl.nl_u.ip4_u.saddr = iph->daddr; + fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); + } + + if (rt->u.dst.error) { + dst_release(&rt->u.dst); + return NULL; + } + + fl.proto = IPPROTO_TCP; + fl.fl_ip_sport = tcph->dest; + fl.fl_ip_dport = tcph->source; + + if (xfrm_lookup((struct dst_entry **)&rt, &fl, NULL, 0)) { + dst_release(&rt->u.dst); + rt = NULL; + } + + return rt; +} + +/* Send RST reply */ +static void send_reset(struct sk_buff *oldskb, int hook) +{ + struct sk_buff *nskb; + struct tcphdr _otcph, *oth, *tcph; + struct rtable *rt; + u_int16_t tmp_port; + u_int32_t tmp_addr; + int needs_ack; + int hh_len; + + /* IP header checks: fragment. */ + if (oldskb->nh.iph->frag_off & htons(IP_OFFSET)) + return; + + oth = skb_header_pointer(oldskb, oldskb->nh.iph->ihl * 4, + sizeof(_otcph), &_otcph); + if (oth == NULL) + return; + + /* No RST for RST. */ + if (oth->rst) + return; + + /* FIXME: Check checksum --RR */ + if ((rt = route_reverse(oldskb, oth, hook)) == NULL) + return; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + /* We need a linear, writeable skb. We also need to expand + headroom in case hh_len of incoming interface < hh_len of + outgoing interface */ + nskb = skb_copy_expand(oldskb, hh_len, skb_tailroom(oldskb), + GFP_ATOMIC); + if (!nskb) { + dst_release(&rt->u.dst); + return; + } + + dst_release(nskb->dst); + nskb->dst = &rt->u.dst; + + /* This packet will not be the same as the other: clear nf fields */ + nf_reset(nskb); + nskb->nfcache = 0; + nskb->nfmark = 0; +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(nskb->nf_bridge); + nskb->nf_bridge = NULL; +#endif + + tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); + + /* Swap source and dest */ + tmp_addr = nskb->nh.iph->saddr; + nskb->nh.iph->saddr = nskb->nh.iph->daddr; + nskb->nh.iph->daddr = tmp_addr; + tmp_port = tcph->source; + tcph->source = tcph->dest; + tcph->dest = tmp_port; + + /* Truncate to length (no data) */ + tcph->doff = sizeof(struct tcphdr)/4; + skb_trim(nskb, nskb->nh.iph->ihl*4 + sizeof(struct tcphdr)); + nskb->nh.iph->tot_len = htons(nskb->len); + + if (tcph->ack) { + needs_ack = 0; + tcph->seq = oth->ack_seq; + tcph->ack_seq = 0; + } else { + needs_ack = 1; + tcph->ack_seq = htonl(ntohl(oth->seq) + oth->syn + oth->fin + + oldskb->len - oldskb->nh.iph->ihl*4 + - (oth->doff<<2)); + tcph->seq = 0; + } + + /* Reset flags */ + ((u_int8_t *)tcph)[13] = 0; + tcph->rst = 1; + tcph->ack = needs_ack; + + tcph->window = 0; + tcph->urg_ptr = 0; + + /* Adjust TCP checksum */ + tcph->check = 0; + tcph->check = tcp_v4_check(tcph, sizeof(struct tcphdr), + nskb->nh.iph->saddr, + nskb->nh.iph->daddr, + csum_partial((char *)tcph, + sizeof(struct tcphdr), 0)); + + /* Adjust IP TTL, DF */ + nskb->nh.iph->ttl = MAXTTL; + /* Set DF, id = 0 */ + nskb->nh.iph->frag_off = htons(IP_DF); + nskb->nh.iph->id = 0; + + /* Adjust IP checksum */ + nskb->nh.iph->check = 0; + nskb->nh.iph->check = ip_fast_csum((unsigned char *)nskb->nh.iph, + nskb->nh.iph->ihl); + + /* "Never happens" */ + if (nskb->len > dst_mtu(nskb->dst)) + goto free_nskb; + + nf_ct_attach(nskb, oldskb); + + NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, nskb, NULL, nskb->dst->dev, + dst_output); 
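
	/* Editorial illustration (not part of the original patch): the
	 * sequence numbers picked above follow RFC 793.  For an incoming SYN
	 * with seq 1000 and no payload the RST goes out with seq 0, ACK set
	 * and ack_seq 1001; if the offending segment already carried ACK, its
	 * ack_seq is simply reused as the RST's seq.  Kept under #if 0 so it
	 * is never compiled:
	 */
#if 0
	{
		unsigned long in_seq = 1000, syn = 1, fin = 0, payload = 0;
		unsigned long rst_ack_seq = in_seq + syn + fin + payload;	/* 1001 */
	}
#endif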
+ return; + + free_nskb: + kfree_skb(nskb); +} + +static inline void send_unreach(struct sk_buff *skb_in, int code) +{ + icmp_send(skb_in, ICMP_DEST_UNREACH, code, 0); +} + +static unsigned int reject(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_reject_info *reject = targinfo; + + /* Our naive response construction doesn't deal with IP + options, and probably shouldn't try. */ + if ((*pskb)->nh.iph->ihl<<2 != sizeof(struct iphdr)) + return NF_DROP; + + /* WARNING: This code causes reentry within iptables. + This means that the iptables jump stack is now crap. We + must return an absolute verdict. --RR */ + switch (reject->with) { + case IPT_ICMP_NET_UNREACHABLE: + send_unreach(*pskb, ICMP_NET_UNREACH); + break; + case IPT_ICMP_HOST_UNREACHABLE: + send_unreach(*pskb, ICMP_HOST_UNREACH); + break; + case IPT_ICMP_PROT_UNREACHABLE: + send_unreach(*pskb, ICMP_PROT_UNREACH); + break; + case IPT_ICMP_PORT_UNREACHABLE: + send_unreach(*pskb, ICMP_PORT_UNREACH); + break; + case IPT_ICMP_NET_PROHIBITED: + send_unreach(*pskb, ICMP_NET_ANO); + break; + case IPT_ICMP_HOST_PROHIBITED: + send_unreach(*pskb, ICMP_HOST_ANO); + break; + case IPT_ICMP_ADMIN_PROHIBITED: + send_unreach(*pskb, ICMP_PKT_FILTERED); + break; + case IPT_TCP_RESET: + send_reset(*pskb, hooknum); + case IPT_ICMP_ECHOREPLY: + /* Doesn't happen. */ + break; + } + + return NF_DROP; +} + +static int check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_reject_info *rejinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_reject_info))) { + DEBUGP("REJECT: targinfosize %u != 0\n", targinfosize); + return 0; + } + + /* Only allow these for packet filtering. */ + if (strcmp(tablename, "filter") != 0) { + DEBUGP("REJECT: bad table `%s'.\n", tablename); + return 0; + } + if ((hook_mask & ~((1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_OUT))) != 0) { + DEBUGP("REJECT: bad hook mask %X\n", hook_mask); + return 0; + } + + if (rejinfo->with == IPT_ICMP_ECHOREPLY) { + printk("REJECT: ECHOREPLY no longer supported.\n"); + return 0; + } else if (rejinfo->with == IPT_TCP_RESET) { + /* Must specify that it's a TCP packet */ + if (e->ip.proto != IPPROTO_TCP + || (e->ip.invflags & IPT_INV_PROTO)) { + DEBUGP("REJECT: TCP_RESET invalid for non-tcp\n"); + return 0; + } + } + + return 1; +} + +static struct ipt_target ipt_reject_reg = { + .name = "REJECT", + .target = reject, + .checkentry = check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_reject_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_reject_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_SAME.c b/net/ipv4/netfilter/ipt_SAME.c new file mode 100644 index 000000000000..7a0536d864ac --- /dev/null +++ b/net/ipv4/netfilter/ipt_SAME.c @@ -0,0 +1,211 @@ +/* Same. Just like SNAT, only try to make the connections + * between client A and server B always have the same source ip. + * + * (C) 2000 Paul `Rusty' Russell + * (C) 2001 Martin Josefsson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 010320 Martin Josefsson + * * copied ipt_BALANCE.c to ipt_SAME.c and changed a few things. 
+ * 010728 Martin Josefsson + * * added --nodst to not include destination-ip in new source + * calculations. + * * added some more sanity-checks. + * 010729 Martin Josefsson + * * fixed a buggy if-statement in same_check(), should have + * used ntohl() but didn't. + * * added support for multiple ranges. IPT_SAME_MAX_RANGE is + * defined in linux/include/linux/netfilter_ipv4/ipt_SAME.h + * and is currently set to 10. + * * added support for 1-address range, nice to have now that + * we have multiple ranges. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson "); +MODULE_DESCRIPTION("iptables special SNAT module for consistent sourceip"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +same_check(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + unsigned int count, countess, rangeip, index = 0; + struct ipt_same_info *mr = targinfo; + + mr->ipnum = 0; + + if (strcmp(tablename, "nat") != 0) { + DEBUGP("same_check: bad table `%s'.\n", tablename); + return 0; + } + if (targinfosize != IPT_ALIGN(sizeof(*mr))) { + DEBUGP("same_check: size %u.\n", targinfosize); + return 0; + } + if (hook_mask & ~(1 << NF_IP_PRE_ROUTING | 1 << NF_IP_POST_ROUTING)) { + DEBUGP("same_check: bad hooks %x.\n", hook_mask); + return 0; + } + if (mr->rangesize < 1) { + DEBUGP("same_check: need at least one dest range.\n"); + return 0; + } + if (mr->rangesize > IPT_SAME_MAX_RANGE) { + DEBUGP("same_check: too many ranges specified, maximum " + "is %u ranges\n", + IPT_SAME_MAX_RANGE); + return 0; + } + for (count = 0; count < mr->rangesize; count++) { + if (ntohl(mr->range[count].min_ip) > + ntohl(mr->range[count].max_ip)) { + DEBUGP("same_check: min_ip is larger than max_ip in " + "range `%u.%u.%u.%u-%u.%u.%u.%u'.\n", + NIPQUAD(mr->range[count].min_ip), + NIPQUAD(mr->range[count].max_ip)); + return 0; + } + if (!(mr->range[count].flags & IP_NAT_RANGE_MAP_IPS)) { + DEBUGP("same_check: bad MAP_IPS.\n"); + return 0; + } + rangeip = (ntohl(mr->range[count].max_ip) - + ntohl(mr->range[count].min_ip) + 1); + mr->ipnum += rangeip; + + DEBUGP("same_check: range %u, ipnum = %u\n", count, rangeip); + } + DEBUGP("same_check: total ipaddresses = %u\n", mr->ipnum); + + mr->iparray = kmalloc((sizeof(u_int32_t) * mr->ipnum), GFP_KERNEL); + if (!mr->iparray) { + DEBUGP("same_check: Couldn't allocate %u bytes " + "for %u ipaddresses!\n", + (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); + return 0; + } + DEBUGP("same_check: Allocated %u bytes for %u ipaddresses.\n", + (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); + + for (count = 0; count < mr->rangesize; count++) { + for (countess = ntohl(mr->range[count].min_ip); + countess <= ntohl(mr->range[count].max_ip); + countess++) { + mr->iparray[index] = countess; + DEBUGP("same_check: Added ipaddress `%u.%u.%u.%u' " + "in index %u.\n", + HIPQUAD(countess), index); + index++; + } + } + return 1; +} + +static void +same_destroy(void *targinfo, + unsigned int targinfosize) +{ + struct ipt_same_info *mr = targinfo; + + kfree(mr->iparray); + + DEBUGP("same_destroy: Deallocated %u bytes for %u ipaddresses.\n", + (sizeof(u_int32_t) * mr->ipnum), mr->ipnum); +} + +static unsigned int +same_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void 
*userinfo) +{ + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + u_int32_t tmpip, aindex, new_ip; + const struct ipt_same_info *same = targinfo; + struct ip_nat_range newrange; + const struct ip_conntrack_tuple *t; + + IP_NF_ASSERT(hooknum == NF_IP_PRE_ROUTING || + hooknum == NF_IP_POST_ROUTING); + ct = ip_conntrack_get(*pskb, &ctinfo); + + t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + + /* Base new source on real src ip and optionally dst ip, + giving some hope for consistency across reboots. + Here we calculate the index in same->iparray which + holds the ipaddress we should use */ + + tmpip = ntohl(t->src.ip); + + if (!(same->info & IPT_SAME_NODST)) + tmpip += ntohl(t->dst.ip); + + aindex = tmpip % same->ipnum; + + new_ip = htonl(same->iparray[aindex]); + + DEBUGP("ipt_SAME: src=%u.%u.%u.%u dst=%u.%u.%u.%u, " + "new src=%u.%u.%u.%u\n", + NIPQUAD(t->src.ip), NIPQUAD(t->dst.ip), + NIPQUAD(new_ip)); + + /* Transfer from original range. */ + newrange = ((struct ip_nat_range) + { same->range[0].flags, new_ip, new_ip, + /* FIXME: Use ports from correct range! */ + same->range[0].min, same->range[0].max }); + + /* Hand modified range to generic setup. */ + return ip_nat_setup_info(ct, &newrange, hooknum); +} + +static struct ipt_target same_reg = { + .name = "SAME", + .target = same_target, + .checkentry = same_check, + .destroy = same_destroy, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&same_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&same_reg); +} + +module_init(init); +module_exit(fini); + diff --git a/net/ipv4/netfilter/ipt_TCPMSS.c b/net/ipv4/netfilter/ipt_TCPMSS.c new file mode 100644 index 000000000000..1049050b2bfb --- /dev/null +++ b/net/ipv4/netfilter/ipt_TCPMSS.c @@ -0,0 +1,262 @@ +/* + * This is a module which is used for setting the MSS option in TCP packets. + * + * Copyright (C) 2000 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("iptables TCP MSS modification module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static u_int16_t +cheat_check(u_int32_t oldvalinv, u_int32_t newval, u_int16_t oldcheck) +{ + u_int32_t diffs[] = { oldvalinv, newval }; + return csum_fold(csum_partial((char *)diffs, sizeof(diffs), + oldcheck^0xFFFF)); +} + +static inline unsigned int +optlen(const u_int8_t *opt, unsigned int offset) +{ + /* Beware zero-length options: make finite progress */ + if (opt[offset] <= TCPOPT_NOP || opt[offset+1] == 0) return 1; + else return opt[offset+1]; +} + +static unsigned int +ipt_tcpmss_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_tcpmss_info *tcpmssinfo = targinfo; + struct tcphdr *tcph; + struct iphdr *iph; + u_int16_t tcplen, newtotlen, oldval, newmss; + unsigned int i; + u_int8_t *opt; + + if (!skb_ip_make_writable(pskb, (*pskb)->len)) + return NF_DROP; + + iph = (*pskb)->nh.iph; + tcplen = (*pskb)->len - iph->ihl*4; + + tcph = (void *)iph + iph->ihl*4; + + /* Since it passed flags test in tcp match, we know it is + not a fragment, and has data >= tcp header length.
SYN + packets should not contain data: if they did, then we risk + running over MTU, sending Frag Needed and breaking things + badly. --RR */ + if (tcplen != tcph->doff*4) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_tcpmss_target: bad length (%d bytes)\n", + (*pskb)->len); + return NF_DROP; + } + + if(tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) { + if(!(*pskb)->dst) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_tcpmss_target: no dst?! can't determine path-MTU\n"); + return NF_DROP; /* or IPT_CONTINUE ?? */ + } + + if(dst_mtu((*pskb)->dst) <= (sizeof(struct iphdr) + sizeof(struct tcphdr))) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_tcpmss_target: unknown or invalid path-MTU (%d)\n", dst_mtu((*pskb)->dst)); + return NF_DROP; /* or IPT_CONTINUE ?? */ + } + + newmss = dst_mtu((*pskb)->dst) - sizeof(struct iphdr) - sizeof(struct tcphdr); + } else + newmss = tcpmssinfo->mss; + + opt = (u_int8_t *)tcph; + for (i = sizeof(struct tcphdr); i < tcph->doff*4; i += optlen(opt, i)){ + if ((opt[i] == TCPOPT_MSS) && + ((tcph->doff*4 - i) >= TCPOLEN_MSS) && + (opt[i+1] == TCPOLEN_MSS)) { + u_int16_t oldmss; + + oldmss = (opt[i+2] << 8) | opt[i+3]; + + if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) && + (oldmss <= newmss)) + return IPT_CONTINUE; + + opt[i+2] = (newmss & 0xff00) >> 8; + opt[i+3] = (newmss & 0x00ff); + + tcph->check = cheat_check(htons(oldmss)^0xFFFF, + htons(newmss), + tcph->check); + + DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu" + "->%u.%u.%u.%u:%hu changed TCP MSS option" + " (from %u to %u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + ntohs(tcph->source), + NIPQUAD((*pskb)->nh.iph->daddr), + ntohs(tcph->dest), + oldmss, newmss); + goto retmodified; + } + } + + /* + * MSS Option not found ?! add it.. + */ + if (skb_tailroom((*pskb)) < TCPOLEN_MSS) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(*pskb, skb_headroom(*pskb), + TCPOLEN_MSS, GFP_ATOMIC); + if (!newskb) { + if (net_ratelimit()) + printk(KERN_ERR "ipt_tcpmss_target:" + " unable to allocate larger skb\n"); + return NF_DROP; + } + + kfree_skb(*pskb); + *pskb = newskb; + iph = (*pskb)->nh.iph; + tcph = (void *)iph + iph->ihl*4; + } + + skb_put((*pskb), TCPOLEN_MSS); + + opt = (u_int8_t *)tcph + sizeof(struct tcphdr); + memmove(opt + TCPOLEN_MSS, opt, tcplen - sizeof(struct tcphdr)); + + tcph->check = cheat_check(htons(tcplen) ^ 0xFFFF, + htons(tcplen + TCPOLEN_MSS), tcph->check); + tcplen += TCPOLEN_MSS; + + opt[0] = TCPOPT_MSS; + opt[1] = TCPOLEN_MSS; + opt[2] = (newmss & 0xff00) >> 8; + opt[3] = (newmss & 0x00ff); + + tcph->check = cheat_check(~0, *((u_int32_t *)opt), tcph->check); + + oldval = ((u_int16_t *)tcph)[6]; + tcph->doff += TCPOLEN_MSS/4; + tcph->check = cheat_check(oldval ^ 0xFFFF, + ((u_int16_t *)tcph)[6], tcph->check); + + newtotlen = htons(ntohs(iph->tot_len) + TCPOLEN_MSS); + iph->check = cheat_check(iph->tot_len ^ 0xFFFF, + newtotlen, iph->check); + iph->tot_len = newtotlen; + + DEBUGP(KERN_INFO "ipt_tcpmss_target: %u.%u.%u.%u:%hu" + "->%u.%u.%u.%u:%hu added TCP MSS option (%u)\n", + NIPQUAD((*pskb)->nh.iph->saddr), + ntohs(tcph->source), + NIPQUAD((*pskb)->nh.iph->daddr), + ntohs(tcph->dest), + newmss); + + retmodified: + /* We never hw checksum SYN packets. 
*/ + BUG_ON((*pskb)->ip_summed == CHECKSUM_HW); + + (*pskb)->nfcache |= NFC_UNKNOWN | NFC_ALTERED; + return IPT_CONTINUE; +} + +#define TH_SYN 0x02 + +static inline int find_syn_match(const struct ipt_entry_match *m) +{ + const struct ipt_tcp *tcpinfo = (const struct ipt_tcp *)m->data; + + if (strcmp(m->u.kernel.match->name, "tcp") == 0 + && (tcpinfo->flg_cmp & TH_SYN) + && !(tcpinfo->invflags & IPT_TCP_INV_FLAGS)) + return 1; + + return 0; +} + +/* Must specify -p tcp --syn/--tcp-flags SYN */ +static int +ipt_tcpmss_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ipt_tcpmss_info *tcpmssinfo = targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tcpmss_info))) { + DEBUGP("ipt_tcpmss_checkentry: targinfosize %u != %u\n", + targinfosize, IPT_ALIGN(sizeof(struct ipt_tcpmss_info))); + return 0; + } + + + if((tcpmssinfo->mss == IPT_TCPMSS_CLAMP_PMTU) && + ((hook_mask & ~((1 << NF_IP_FORWARD) + | (1 << NF_IP_LOCAL_OUT) + | (1 << NF_IP_POST_ROUTING))) != 0)) { + printk("TCPMSS: path-MTU clamping only supported in FORWARD, OUTPUT and POSTROUTING hooks\n"); + return 0; + } + + if (e->ip.proto == IPPROTO_TCP + && !(e->ip.invflags & IPT_INV_PROTO) + && IPT_MATCH_ITERATE(e, find_syn_match)) + return 1; + + printk("TCPMSS: Only works on TCP SYN packets\n"); + return 0; +} + +static struct ipt_target ipt_tcpmss_reg = { + .name = "TCPMSS", + .target = ipt_tcpmss_target, + .checkentry = ipt_tcpmss_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_tcpmss_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_tcpmss_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_TOS.c b/net/ipv4/netfilter/ipt_TOS.c new file mode 100644 index 000000000000..85c70d240f8b --- /dev/null +++ b/net/ipv4/netfilter/ipt_TOS.c @@ -0,0 +1,105 @@ +/* This is a module which is used for setting the TOS field of a packet. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
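 */

/* Illustrative sketch (editorial addition, not from the patch): the TOS
 * target below keeps the top three precedence bits of the IPv4 TOS byte and
 * replaces only the TOS bits proper, so a packet with TOS byte 0x68 that is
 * rewritten to IPTOS_LOWDELAY (0x10) leaves as 0x70.  Stand-alone form of
 * the update, where 0xe0 mirrors IPTOS_PREC_MASK from the kernel headers:
 */
static inline unsigned char tos_update_example(unsigned char old_tos,
					       unsigned char new_tos)
{
	return (old_tos & 0xe0) | new_tos;	/* keep precedence, set TOS bits */
}

/*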
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables TOS mangling module"); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_tos_target_info *tosinfo = targinfo; + + if (((*pskb)->nh.iph->tos & IPTOS_TOS_MASK) != tosinfo->tos) { + u_int16_t diffs[2]; + + if (!skb_ip_make_writable(pskb, sizeof(struct iphdr))) + return NF_DROP; + + diffs[0] = htons((*pskb)->nh.iph->tos) ^ 0xFFFF; + (*pskb)->nh.iph->tos + = ((*pskb)->nh.iph->tos & IPTOS_PREC_MASK) + | tosinfo->tos; + diffs[1] = htons((*pskb)->nh.iph->tos); + (*pskb)->nh.iph->check + = csum_fold(csum_partial((char *)diffs, + sizeof(diffs), + (*pskb)->nh.iph->check + ^0xFFFF)); + (*pskb)->nfcache |= NFC_ALTERED; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const u_int8_t tos = ((struct ipt_tos_target_info *)targinfo)->tos; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_tos_target_info))) { + printk(KERN_WARNING "TOS: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_tos_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "TOS: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + if (tos != IPTOS_LOWDELAY + && tos != IPTOS_THROUGHPUT + && tos != IPTOS_RELIABILITY + && tos != IPTOS_MINCOST + && tos != IPTOS_NORMALSVC) { + printk(KERN_WARNING "TOS: bad tos value %#x\n", tos); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_tos_reg = { + .name = "TOS", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_tos_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_tos_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ULOG.c b/net/ipv4/netfilter/ipt_ULOG.c new file mode 100644 index 000000000000..6f2cefbe16cd --- /dev/null +++ b/net/ipv4/netfilter/ipt_ULOG.c @@ -0,0 +1,419 @@ +/* + * netfilter module for userspace packet logging daemons + * + * (C) 2000-2004 by Harald Welte + * + * 2000/09/22 ulog-cprange feature added + * 2001/01/04 in-kernel queue as proposed by Sebastian Zander + * + * 2001/01/30 per-rule nlgroup conflicts with global queue. + * nlgroup now global (sysctl) + * 2001/04/19 ulog-queue reworked, now fixed buffer size specified at + * module loadtime -HW + * 2002/07/07 remove broken nflog_rcv() function -HW + * 2002/08/29 fix shifted/unshifted nlgroup bug -HW + * 2002/10/30 fix uninitialized mac_len field - + * 2004/10/25 fix erroneous calculation of 'len' parameter to NLMSG_PUT + * resulting in bogus 'error during NLMSG_PUT' messages. + * + * (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This module accepts two parameters: + * + * nlbufsiz: + * The parameter specifies how big the buffer for each netlink multicast + * group is. e.g. 
If you say nlbufsiz=8192, up to eight kb of packets will + * get accumulated in the kernel until they are sent to userspace. It is + * NOT possible to allocate more than 128kB, and it is strongly discouraged, + * because atomically allocating 128kB inside the network rx softirq is not + * reliable. Please also keep in mind that this buffer size is allocated for + * each nlgroup you are using, so the total kernel memory usage increases + * by that factor. + * + * flushtimeout: + * Specify, after how many hundredths of a second the queue should be + * flushed even if it is not full yet. + * + * ipt_ULOG.c,v 1.22 2002/10/30 09:07:31 laforge Exp + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables userspace logging module"); + +#define ULOG_NL_EVENT 111 /* Harald's favorite number */ +#define ULOG_MAXNLGROUPS 32 /* numer of nlgroups */ + +#if 0 +#define DEBUGP(format, args...) printk("%s:%s:" format, \ + __FILE__, __FUNCTION__ , ## args) +#else +#define DEBUGP(format, args...) +#endif + +#define PRINTR(format, args...) do { if (net_ratelimit()) printk(format , ## args); } while (0) + +static unsigned int nlbufsiz = 4096; +module_param(nlbufsiz, uint, 0600); /* FIXME: Check size < 128k --RR */ +MODULE_PARM_DESC(nlbufsiz, "netlink buffer size"); + +static unsigned int flushtimeout = 10; +module_param(flushtimeout, int, 0600); +MODULE_PARM_DESC(flushtimeout, "buffer flush timeout (hundredths of a second)"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +/* global data structures */ + +typedef struct { + unsigned int qlen; /* number of nlmsgs' in the skb */ + struct nlmsghdr *lastnlh; /* netlink header of last msg in skb */ + struct sk_buff *skb; /* the pre-allocated skb */ + struct timer_list timer; /* the timer function */ +} ulog_buff_t; + +static ulog_buff_t ulog_buffers[ULOG_MAXNLGROUPS]; /* array of buffers */ + +static struct sock *nflognl; /* our socket */ +static DECLARE_LOCK(ulog_lock); /* spinlock */ + +/* send one ulog_buff_t to userspace */ +static void ulog_send(unsigned int nlgroupnum) +{ + ulog_buff_t *ub = &ulog_buffers[nlgroupnum]; + + if (timer_pending(&ub->timer)) { + DEBUGP("ipt_ULOG: ulog_send: timer was pending, deleting\n"); + del_timer(&ub->timer); + } + + /* last nlmsg needs NLMSG_DONE */ + if (ub->qlen > 1) + ub->lastnlh->nlmsg_type = NLMSG_DONE; + + NETLINK_CB(ub->skb).dst_groups = (1 << nlgroupnum); + DEBUGP("ipt_ULOG: throwing %d packets to netlink mask %u\n", + ub->qlen, nlgroupnum); + netlink_broadcast(nflognl, ub->skb, 0, (1 << nlgroupnum), GFP_ATOMIC); + + ub->qlen = 0; + ub->skb = NULL; + ub->lastnlh = NULL; + +} + + +/* timer function to flush queue in flushtimeout time */ +static void ulog_timer(unsigned long data) +{ + DEBUGP("ipt_ULOG: timer function called, calling ulog_send\n"); + + /* lock to protect against somebody modifying our structure + * from ipt_ulog_target at the same time */ + LOCK_BH(&ulog_lock); + ulog_send(data); + UNLOCK_BH(&ulog_lock); +} + +static struct sk_buff *ulog_alloc_skb(unsigned int size) +{ + struct sk_buff *skb; + + /* alloc skb which should be big enough for a whole + * multipart message. 
WARNING: has to be <= 131000 + * due to slab allocator restrictions */ + + skb = alloc_skb(nlbufsiz, GFP_ATOMIC); + if (!skb) { + PRINTR("ipt_ULOG: can't alloc whole buffer %ub!\n", + nlbufsiz); + + /* try to allocate only as much as we need for + * current packet */ + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + PRINTR("ipt_ULOG: can't even allocate %ub\n", size); + } + + return skb; +} + +static void ipt_ulog_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ipt_ulog_info *loginfo, + const char *prefix) +{ + ulog_buff_t *ub; + ulog_packet_msg_t *pm; + size_t size, copy_len; + struct nlmsghdr *nlh; + + /* ffs == find first bit set, necessary because userspace + * is already shifting groupnumber, but we need unshifted. + * ffs() returns [1..32], we need [0..31] */ + unsigned int groupnum = ffs(loginfo->nl_group) - 1; + + /* calculate the size of the skb needed */ + if ((loginfo->copy_range == 0) || + (loginfo->copy_range > skb->len)) { + copy_len = skb->len; + } else { + copy_len = loginfo->copy_range; + } + + size = NLMSG_SPACE(sizeof(*pm) + copy_len); + + ub = &ulog_buffers[groupnum]; + + LOCK_BH(&ulog_lock); + + if (!ub->skb) { + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } else if (ub->qlen >= loginfo->qthreshold || + size > skb_tailroom(ub->skb)) { + /* either the queue len is too high or we don't have + * enough room in nlskb left. send it to userspace. */ + + ulog_send(groupnum); + + if (!(ub->skb = ulog_alloc_skb(size))) + goto alloc_failure; + } + + DEBUGP("ipt_ULOG: qlen %d, qthreshold %d\n", ub->qlen, + loginfo->qthreshold); + + /* NLMSG_PUT contains a hidden goto nlmsg_failure !!! */ + nlh = NLMSG_PUT(ub->skb, 0, ub->qlen, ULOG_NL_EVENT, + sizeof(*pm)+copy_len); + ub->qlen++; + + pm = NLMSG_DATA(nlh); + + /* We might not have a timestamp, get one */ + if (skb->stamp.tv_sec == 0) + do_gettimeofday((struct timeval *)&skb->stamp); + + /* copy hook, prefix, timestamp, payload, etc. */ + pm->data_len = copy_len; + pm->timestamp_sec = skb->stamp.tv_sec; + pm->timestamp_usec = skb->stamp.tv_usec; + pm->mark = skb->nfmark; + pm->hook = hooknum; + if (prefix != NULL) + strncpy(pm->prefix, prefix, sizeof(pm->prefix)); + else if (loginfo->prefix[0] != '\0') + strncpy(pm->prefix, loginfo->prefix, sizeof(pm->prefix)); + else + *(pm->prefix) = '\0'; + + if (in && in->hard_header_len > 0 + && skb->mac.raw != (void *) skb->nh.iph + && in->hard_header_len <= ULOG_MAC_LEN) { + memcpy(pm->mac, skb->mac.raw, in->hard_header_len); + pm->mac_len = in->hard_header_len; + } else + pm->mac_len = 0; + + if (in) + strncpy(pm->indev_name, in->name, sizeof(pm->indev_name)); + else + pm->indev_name[0] = '\0'; + + if (out) + strncpy(pm->outdev_name, out->name, sizeof(pm->outdev_name)); + else + pm->outdev_name[0] = '\0'; + + /* copy_len <= skb->len, so can't fail. 
*/ + if (skb_copy_bits(skb, 0, pm->payload, copy_len) < 0) + BUG(); + + /* check if we are building multi-part messages */ + if (ub->qlen > 1) { + ub->lastnlh->nlmsg_flags |= NLM_F_MULTI; + } + + ub->lastnlh = nlh; + + /* if timer isn't already running, start it */ + if (!timer_pending(&ub->timer)) { + ub->timer.expires = jiffies + flushtimeout * HZ / 100; + add_timer(&ub->timer); + } + + /* if threshold is reached, send message to userspace */ + if (ub->qlen >= loginfo->qthreshold) { + if (loginfo->qthreshold > 1) + nlh->nlmsg_type = NLMSG_DONE; + ulog_send(groupnum); + } + + UNLOCK_BH(&ulog_lock); + + return; + +nlmsg_failure: + PRINTR("ipt_ULOG: error during NLMSG_PUT\n"); + +alloc_failure: + PRINTR("ipt_ULOG: Error building netlink message\n"); + + UNLOCK_BH(&ulog_lock); +} + +static unsigned int ipt_ulog_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, void *userinfo) +{ + struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; + + ipt_ulog_packet(hooknum, *pskb, in, out, loginfo, NULL); + + return IPT_CONTINUE; +} + +static void ipt_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ipt_ulog_info loginfo = { + .nl_group = ULOG_DEFAULT_NLGROUP, + .copy_range = 0, + .qthreshold = ULOG_DEFAULT_QTHRESHOLD, + .prefix = "" + }; + + ipt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static int ipt_ulog_checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hookmask) +{ + struct ipt_ulog_info *loginfo = (struct ipt_ulog_info *) targinfo; + + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_ulog_info))) { + DEBUGP("ipt_ULOG: targinfosize %u != 0\n", targinfosize); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix) - 1] != '\0') { + DEBUGP("ipt_ULOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix) - 1]); + return 0; + } + + if (loginfo->qthreshold > ULOG_MAX_QLEN) { + DEBUGP("ipt_ULOG: queue threshold %i > MAX_QLEN\n", + loginfo->qthreshold); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_ulog_reg = { + .name = "ULOG", + .target = ipt_ulog_target, + .checkentry = ipt_ulog_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int i; + + DEBUGP("ipt_ULOG: init module\n"); + + if (nlbufsiz >= 128*1024) { + printk("Netlink buffer has to be <= 128kB\n"); + return -EINVAL; + } + + /* initialize ulog_buffers */ + for (i = 0; i < ULOG_MAXNLGROUPS; i++) { + init_timer(&ulog_buffers[i].timer); + ulog_buffers[i].timer.function = ulog_timer; + ulog_buffers[i].timer.data = i; + } + + nflognl = netlink_kernel_create(NETLINK_NFLOG, NULL); + if (!nflognl) + return -ENOMEM; + + if (ipt_register_target(&ipt_ulog_reg) != 0) { + sock_release(nflognl->sk_socket); + return -EINVAL; + } + if (nflog) + nf_log_register(PF_INET, &ipt_logfn); + + return 0; +} + +static void __exit fini(void) +{ + ulog_buff_t *ub; + int i; + + DEBUGP("ipt_ULOG: cleanup_module\n"); + + if (nflog) + nf_log_unregister(PF_INET, &ipt_logfn); + ipt_unregister_target(&ipt_ulog_reg); + sock_release(nflognl->sk_socket); + + /* remove pending timers and free allocated skb's */ + for (i = 0; i < ULOG_MAXNLGROUPS; i++) { + ub = &ulog_buffers[i]; + if (timer_pending(&ub->timer)) { + DEBUGP("timer was pending, deleting\n"); + del_timer(&ub->timer); + } + + if (ub->skb) { + kfree_skb(ub->skb); + ub->skb = 
NULL; + } + } + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c new file mode 100644 index 000000000000..f5909a4c3fc7 --- /dev/null +++ b/net/ipv4/netfilter/ipt_addrtype.c @@ -0,0 +1,77 @@ +/* + * iptables module to match inet_addr_type() of an ip. + * + * Copyright (c) 2004 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("iptables addrtype match"); + +static inline int match_type(u_int32_t addr, u_int16_t mask) +{ + return !!(mask & (1 << inet_addr_type(addr))); +} + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_addrtype_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + int ret = 1; + + if (info->source) + ret &= match_type(iph->saddr, info->source)^info->invert_source; + if (info->dest) + ret &= match_type(iph->daddr, info->dest)^info->invert_dest; + + return ret; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_addrtype_info))) { + printk(KERN_ERR "ipt_addrtype: invalid size (%u != %Zu)\n.", + matchsize, IPT_ALIGN(sizeof(struct ipt_addrtype_info))); + return 0; + } + + return 1; +} + +static struct ipt_match addrtype_match = { + .name = "addrtype", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&addrtype_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&addrtype_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ah.c b/net/ipv4/netfilter/ipt_ah.c new file mode 100644 index 000000000000..a0fea847cb72 --- /dev/null +++ b/net/ipv4/netfilter/ipt_ah.c @@ -0,0 +1,117 @@ +/* Kernel module to match AH parameters. */ +/* (C) 1999-2000 Yon Uriarte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte "); +MODULE_DESCRIPTION("iptables AH SPI match module"); + +#ifdef DEBUG_CONNTRACK +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + duprintf("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r=(spi >= min && spi <= max) ^ invert; + duprintf(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ip_auth_hdr _ahdr, *ah; + const struct ipt_ah *ahinfo = matchinfo; + + /* Must not be a fragment. 
*/ + if (offset) + return 0; + + ah = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ahdr), &_ahdr); + if (ah == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil AH tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IPT_AH_INV_SPI)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_ah *ahinfo = matchinfo; + + /* Must specify proto == AH, and no unknown invflags */ + if (ip->proto != IPPROTO_AH || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_ah: Protocol %u != %u\n", ip->proto, + IPPROTO_AH); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_ah))) { + duprintf("ipt_ah: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_ah))); + return 0; + } + if (ahinfo->invflags & ~IPT_AH_INV_MASK) { + duprintf("ipt_ah: unknown flags %X\n", + ahinfo->invflags); + return 0; + } + + return 1; +} + +static struct ipt_match ah_match = { + .name = "ah", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ah_match); +} + +static void __exit cleanup(void) +{ + ipt_unregister_match(&ah_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv4/netfilter/ipt_comment.c b/net/ipv4/netfilter/ipt_comment.c new file mode 100644 index 000000000000..6b76a1ea5245 --- /dev/null +++ b/net/ipv4/netfilter/ipt_comment.c @@ -0,0 +1,59 @@ +/* + * Implements a dummy match to allow attaching comments to rules + * + * 2003-05-13 Brad Fisher (brad@info-link.net) + */ + +#include +#include +#include +#include + +MODULE_AUTHOR("Brad Fisher "); +MODULE_DESCRIPTION("iptables comment match module"); +MODULE_LICENSE("GPL"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + /* We always match */ + return 1; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* Check the size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_comment_info))) + return 0; + return 1; +} + +static struct ipt_match comment_match = { + .name = "comment", + .match = match, + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&comment_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&comment_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_connmark.c b/net/ipv4/netfilter/ipt_connmark.c new file mode 100644 index 000000000000..2706f96cea55 --- /dev/null +++ b/net/ipv4/netfilter/ipt_connmark.c @@ -0,0 +1,81 @@ +/* This kernel module matches connection mark values set by the + * CONNMARK target + * + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include + +MODULE_AUTHOR("Henrik Nordstrom "); +MODULE_DESCRIPTION("IP tables connmark match module"); +MODULE_LICENSE("GPL"); + +#include +#include +#include + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_connmark_info *info = matchinfo; + enum ip_conntrack_info ctinfo; + struct ip_conntrack *ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) + return 0; + + return ((ct->mark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_connmark_info))) + return 0; + + return 1; +} + +static struct ipt_match connmark_match = { + .name = "connmark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&connmark_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&connmark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_conntrack.c b/net/ipv4/netfilter/ipt_conntrack.c new file mode 100644 index 000000000000..c1d22801b7cf --- /dev/null +++ b/net/ipv4/netfilter/ipt_conntrack.c @@ -0,0 +1,136 @@ +/* Kernel module to match connection tracking information. + * Superset of Rusty's minimalistic state match. + * + * (C) 2001 Marc Boucher (marc@mbsi.ca). + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
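The connmark test above is a masked equality: a packet matches when (ct->mark & mask) == mark, optionally inverted. A small standalone sketch with made-up mark values, showing why a mask of 0x0f makes the upper bits of the connection mark irrelevant:

#include <stdio.h>
#include <stdint.h>

/* masked-equality test used by the connmark (and plain mark) matches */
static int mark_match(uint32_t ctmark, uint32_t mark, uint32_t mask, int invert)
{
	return ((ctmark & mask) == mark) ^ invert;
}

int main(void)
{
	/* with mask 0x0f only the low nibble of the conntrack mark counts */
	printf("%d\n", mark_match(0x31, 0x1, 0x0f, 0));	/* 1: 0x31 & 0x0f == 0x1 */
	printf("%d\n", mark_match(0x32, 0x1, 0x0f, 0));	/* 0: low nibble is 0x2 */
	printf("%d\n", mark_match(0x32, 0x1, 0x0f, 1));	/* 1: inverted */
	return 0;
}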
+ */ + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("iptables connection tracking match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_conntrack_info *sinfo = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + +#define FWINV(bool,invflg) ((bool) ^ !!(sinfo->invflags & invflg)) + + if (ct == &ip_conntrack_untracked) + statebit = IPT_CONNTRACK_STATE_UNTRACKED; + else if (ct) + statebit = IPT_CONNTRACK_STATE_BIT(ctinfo); + else + statebit = IPT_CONNTRACK_STATE_INVALID; + + if(sinfo->flags & IPT_CONNTRACK_STATE) { + if (ct) { + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip) + statebit |= IPT_CONNTRACK_STATE_SNAT; + + if(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip != + ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip) + statebit |= IPT_CONNTRACK_STATE_DNAT; + } + + if (FWINV((statebit & sinfo->statemask) == 0, IPT_CONNTRACK_STATE)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_PROTO) { + if (!ct || FWINV(ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum, IPT_CONNTRACK_PROTO)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip, IPT_CONNTRACK_ORIGSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_ORIGDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) != sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip, IPT_CONNTRACK_ORIGDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLSRC) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip&sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].src.ip, IPT_CONNTRACK_REPLSRC)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_REPLDST) { + if (!ct || FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip&sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) != sinfo->tuple[IP_CT_DIR_REPLY].dst.ip, IPT_CONNTRACK_REPLDST)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_STATUS) { + if (!ct || FWINV((ct->status & sinfo->statusmask) == 0, IPT_CONNTRACK_STATUS)) + return 0; + } + + if(sinfo->flags & IPT_CONNTRACK_EXPIRES) { + unsigned long expires; + + if(!ct) + return 0; + + expires = timer_pending(&ct->timeout) ? 
(ct->timeout.expires - jiffies)/HZ : 0; + + if (FWINV(!(expires >= sinfo->expires_min && expires <= sinfo->expires_max), IPT_CONNTRACK_EXPIRES)) + return 0; + } + + return 1; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_conntrack_info))) + return 0; + + return 1; +} + +static struct ipt_match conntrack_match = { + .name = "conntrack", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&conntrack_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&conntrack_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_dscp.c b/net/ipv4/netfilter/ipt_dscp.c new file mode 100644 index 000000000000..5df52a64a5d4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_dscp.c @@ -0,0 +1,63 @@ +/* IP tables module for matching the value of the IPv4 DSCP field + * + * ipt_dscp.c,v 1.3 2002/08/05 19:00:21 laforge Exp + * + * (C) 2002 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables DSCP matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_dscp_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + u_int8_t sh_dscp = ((info->dscp << IPT_DSCP_SHIFT) & IPT_DSCP_MASK); + + return ((iph->tos&IPT_DSCP_MASK) == sh_dscp) ^ info->invert; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_dscp_info))) + return 0; + + return 1; +} + +static struct ipt_match dscp_match = { + .name = "dscp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&dscp_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&dscp_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ecn.c b/net/ipv4/netfilter/ipt_ecn.c new file mode 100644 index 000000000000..b6f7181e89cc --- /dev/null +++ b/net/ipv4/netfilter/ipt_ecn.c @@ -0,0 +1,131 @@ +/* IP tables module for matching the value of the IPv4 and TCP ECN bits + * + * ipt_ecn.c,v 1.3 2002/05/29 15:09:00 laforge Exp + * + * (C) 2002 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables ECN matching module"); +MODULE_LICENSE("GPL"); + +static inline int match_ip(const struct sk_buff *skb, + const struct ipt_ecn_info *einfo) +{ + return ((skb->nh.iph->tos&IPT_ECN_IP_MASK) == einfo->ip_ect); +} + +static inline int match_tcp(const struct sk_buff *skb, + const struct ipt_ecn_info *einfo, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + + /* In practice, TCP match does this, so can't fail. 
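The dscp match above compares only the upper six bits of the TOS byte: the user-supplied codepoint is shifted into position and both sides are masked. Assuming the usual header constants, an IPT_DSCP_SHIFT of 2 and an IPT_DSCP_MASK of 0xfc (neither is visible in this hunk), the arithmetic for the EF codepoint works out like this:

#include <stdio.h>
#include <stdint.h>

#define DSCP_SHIFT	2	/* assumed value of IPT_DSCP_SHIFT */
#define DSCP_MASK	0xfc	/* assumed value of IPT_DSCP_MASK */

int main(void)
{
	uint8_t dscp = 0x2e;	/* EF (expedited forwarding) */
	uint8_t tos  = 0xb8;	/* TOS byte as seen on the wire */
	uint8_t sh_dscp = (dscp << DSCP_SHIFT) & DSCP_MASK;

	printf("shifted DSCP: 0x%02x\n", (unsigned)sh_dscp);		/* 0xb8 */
	printf("match: %d\n", (tos & DSCP_MASK) == sh_dscp);		/* 1 */
	return 0;
}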
But let's + * be good citizens. + */ + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) { + *hotdrop = 0; + return 0; + } + + if (einfo->operation & IPT_ECN_OP_MATCH_ECE) { + if (einfo->invert & IPT_ECN_OP_MATCH_ECE) { + if (th->ece == 1) + return 0; + } else { + if (th->ece == 0) + return 0; + } + } + + if (einfo->operation & IPT_ECN_OP_MATCH_CWR) { + if (einfo->invert & IPT_ECN_OP_MATCH_CWR) { + if (th->cwr == 1) + return 0; + } else { + if (th->cwr == 0) + return 0; + } + } + + return 1; +} + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_ecn_info *info = matchinfo; + + if (info->operation & IPT_ECN_OP_MATCH_IP) + if (!match_ip(skb, info)) + return 0; + + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR)) { + if (skb->nh.iph->protocol != IPPROTO_TCP) + return 0; + if (!match_tcp(skb, info, hotdrop)) + return 0; + } + + return 1; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_ecn_info *info = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_ecn_info))) + return 0; + + if (info->operation & IPT_ECN_OP_MATCH_MASK) + return 0; + + if (info->invert & IPT_ECN_OP_MATCH_MASK) + return 0; + + if (info->operation & (IPT_ECN_OP_MATCH_ECE|IPT_ECN_OP_MATCH_CWR) + && ip->proto != IPPROTO_TCP) { + printk(KERN_WARNING "ipt_ecn: can't match TCP bits in rule for" + " non-tcp packets\n"); + return 0; + } + + return 1; +} + +static struct ipt_match ecn_match = { + .name = "ecn", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ecn_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ecn_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_esp.c b/net/ipv4/netfilter/ipt_esp.c new file mode 100644 index 000000000000..e1d0dd31e117 --- /dev/null +++ b/net/ipv4/netfilter/ipt_esp.c @@ -0,0 +1,118 @@ +/* Kernel module to match ESP parameters. */ + +/* (C) 1999-2000 Yon Uriarte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Yon Uriarte "); +MODULE_DESCRIPTION("iptables ESP SPI match module"); + +#ifdef DEBUG_CONNTRACK +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + duprintf("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r=(spi >= min && spi <= max) ^ invert; + duprintf(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ip_esp_hdr _esp, *eh; + const struct ipt_esp *espinfo = matchinfo; + + /* Must not be a fragment. 
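The ECE/CWR handling in match_tcp() above implements a per-flag require/forbid scheme: when a bit is selected in the operation mask the packet must have that TCP flag set, unless the same bit also appears in the invert mask, in which case the flag must be clear. A compact restatement of that logic as a standalone helper, using stand-in flag values rather than the real IPT_ECN_OP_MATCH_* constants:

#include <stdio.h>

#define OP_MATCH_ECE 0x01	/* stand-in for IPT_ECN_OP_MATCH_ECE */
#define OP_MATCH_CWR 0x02	/* stand-in for IPT_ECN_OP_MATCH_CWR */

/* returns 1 if the ece/cwr bits satisfy the operation/invert masks */
static int ecn_tcp_ok(unsigned op, unsigned inv, int ece, int cwr)
{
	if ((op & OP_MATCH_ECE) && ece != !(inv & OP_MATCH_ECE))
		return 0;
	if ((op & OP_MATCH_CWR) && cwr != !(inv & OP_MATCH_CWR))
		return 0;
	return 1;
}

int main(void)
{
	/* rule requires ECE set and CWR clear */
	printf("%d\n", ecn_tcp_ok(OP_MATCH_ECE | OP_MATCH_CWR, OP_MATCH_CWR, 1, 0)); /* 1 */
	printf("%d\n", ecn_tcp_ok(OP_MATCH_ECE | OP_MATCH_CWR, OP_MATCH_CWR, 1, 1)); /* 0 */
	return 0;
}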
*/ + if (offset) + return 0; + + eh = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_esp), &_esp); + if (eh == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("Dropping evil ESP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return spi_match(espinfo->spis[0], espinfo->spis[1], + ntohl(eh->spi), + !!(espinfo->invflags & IPT_ESP_INV_SPI)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ipt_esp *espinfo = matchinfo; + + /* Must specify proto == ESP, and no unknown invflags */ + if (ip->proto != IPPROTO_ESP || (ip->invflags & IPT_INV_PROTO)) { + duprintf("ipt_esp: Protocol %u != %u\n", ip->proto, + IPPROTO_ESP); + return 0; + } + if (matchinfosize != IPT_ALIGN(sizeof(struct ipt_esp))) { + duprintf("ipt_esp: matchsize %u != %u\n", + matchinfosize, IPT_ALIGN(sizeof(struct ipt_esp))); + return 0; + } + if (espinfo->invflags & ~IPT_ESP_INV_MASK) { + duprintf("ipt_esp: unknown flags %X\n", + espinfo->invflags); + return 0; + } + + return 1; +} + +static struct ipt_match esp_match = { + .name = "esp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&esp_match); +} + +static void __exit cleanup(void) +{ + ipt_unregister_match(&esp_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c new file mode 100644 index 000000000000..f1937190cd77 --- /dev/null +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -0,0 +1,731 @@ +/* iptables match extension to limit the number of packets per second + * seperately for each hashbucket (sourceip/sourceport/dstip/dstport) + * + * (C) 2003-2004 by Harald Welte + * + * $Id: ipt_hashlimit.c 3244 2004-10-20 16:24:29Z laforge@netfilter.org $ + * + * Development of this code was funded by Astaro AG, http://www.astaro.com/ + * + * based on ipt_limit.c by: + * Jérôme de Vivie + * Hervé Eychenne + * Rusty Russell + * + * The general idea is to create a hash table for every dstip and have a + * seperate limit counter per tuple. This way you can do something like 'limit + * the number of syn packets for each of my internal addresses. + * + * Ideally this would just be implemented as a general 'hash' match, which would + * allow us to attach any iptables target to it's hash buckets. But this is + * not possible in the current iptables architecture. As always, pkttables for + * 2.7.x will help ;) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* FIXME: this is just for IP_NF_ASSERRT */ +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("iptables match for limiting per hash-bucket"); + +/* need to declare this at the top */ +static struct proc_dir_entry *hashlimit_procdir; +static struct file_operations dl_file_ops; + +/* hash table crap */ + +struct dsthash_dst { + u_int32_t src_ip; + u_int32_t dst_ip; + /* ports have to be consecutive !!! 
*/ + u_int16_t src_port; + u_int16_t dst_port; +}; + +struct dsthash_ent { + /* static / read-only parts in the beginning */ + struct hlist_node node; + struct dsthash_dst dst; + + /* modified structure members in the end */ + unsigned long expires; /* precalculated expiry time */ + struct { + unsigned long prev; /* last modification */ + u_int32_t credit; + u_int32_t credit_cap, cost; + } rateinfo; +}; + +struct ipt_hashlimit_htable { + struct hlist_node node; /* global list of all htables */ + atomic_t use; + + struct hashlimit_cfg cfg; /* config */ + + /* used internally */ + spinlock_t lock; /* lock for list_head */ + u_int32_t rnd; /* random seed for hash */ + struct timer_list timer; /* timer for gc */ + atomic_t count; /* number entries in table */ + + /* seq_file stuff */ + struct proc_dir_entry *pde; + + struct hlist_head hash[0]; /* hashtable itself */ +}; + +static DECLARE_LOCK(hashlimit_lock); /* protects htables list */ +static DECLARE_MUTEX(hlimit_mutex); /* additional checkentry protection */ +static HLIST_HEAD(hashlimit_htables); +static kmem_cache_t *hashlimit_cachep; + +static inline int dst_cmp(const struct dsthash_ent *ent, struct dsthash_dst *b) +{ + return (ent->dst.dst_ip == b->dst_ip + && ent->dst.dst_port == b->dst_port + && ent->dst.src_port == b->src_port + && ent->dst.src_ip == b->src_ip); +} + +static inline u_int32_t +hash_dst(const struct ipt_hashlimit_htable *ht, const struct dsthash_dst *dst) +{ + return (jhash_3words(dst->dst_ip, (dst->dst_port<<16 | dst->src_port), + dst->src_ip, ht->rnd) % ht->cfg.size); +} + +static inline struct dsthash_ent * +__dsthash_find(const struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + struct hlist_node *pos; + u_int32_t hash = hash_dst(ht, dst); + + if (!hlist_empty(&ht->hash[hash])) + hlist_for_each_entry(ent, pos, &ht->hash[hash], node) { + if (dst_cmp(ent, dst)) { + return ent; + } + } + + return NULL; +} + +/* allocate dsthash_ent, initialize dst, put in htable and lock it */ +static struct dsthash_ent * +__dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) +{ + struct dsthash_ent *ent; + + /* initialize hash with random val at the time we allocate + * the first hashtable entry */ + if (!ht->rnd) + get_random_bytes(&ht->rnd, 4); + + if (ht->cfg.max && + atomic_read(&ht->count) >= ht->cfg.max) { + /* FIXME: do something. question is what.. 
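As hash_dst() and __dsthash_find() above show, each packet is folded into a (src ip, dst ip, src port, dst port) tuple, hashed, and reduced modulo the table size to pick a bucket; entries in that bucket are then compared field by field with dst_cmp(). A userspace sketch of that two-step lookup, using a trivial stand-in mixer instead of the kernel's jhash_3words() (the real hash and its random seed are not reproduced here):

#include <stdio.h>
#include <stdint.h>

struct dst {
	uint32_t src_ip, dst_ip;
	uint16_t src_port, dst_port;
};

/* stand-in mixer; the kernel uses jhash_3words() seeded with ht->rnd */
static uint32_t mix(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	uint32_t h = seed;
	h = h * 31 + a;
	h = h * 31 + b;
	h = h * 31 + c;
	return h;
}

static uint32_t bucket_of(const struct dst *d, uint32_t seed, uint32_t size)
{
	return mix(d->dst_ip, (uint32_t)d->dst_port << 16 | d->src_port,
		   d->src_ip, seed) % size;
}

int main(void)
{
	struct dst d = { 0x0a000001, 0x0a000002, 12345, 80 };	/* made-up flow */

	printf("bucket %u of 64\n", (unsigned)bucket_of(&d, 0x12345678, 64));
	return 0;
}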
*/ + if (net_ratelimit()) + printk(KERN_WARNING + "ipt_hashlimit: max count of %u reached\n", + ht->cfg.max); + return NULL; + } + + ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC); + if (!ent) { + if (net_ratelimit()) + printk(KERN_ERR + "ipt_hashlimit: can't allocate dsthash_ent\n"); + return NULL; + } + + atomic_inc(&ht->count); + + ent->dst.dst_ip = dst->dst_ip; + ent->dst.dst_port = dst->dst_port; + ent->dst.src_ip = dst->src_ip; + ent->dst.src_port = dst->src_port; + + hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]); + + return ent; +} + +static inline void +__dsthash_free(struct ipt_hashlimit_htable *ht, struct dsthash_ent *ent) +{ + hlist_del(&ent->node); + kmem_cache_free(hashlimit_cachep, ent); + atomic_dec(&ht->count); +} +static void htable_gc(unsigned long htlong); + +static int htable_create(struct ipt_hashlimit_info *minfo) +{ + int i; + unsigned int size; + struct ipt_hashlimit_htable *hinfo; + + if (minfo->cfg.size) + size = minfo->cfg.size; + else { + size = (((num_physpages << PAGE_SHIFT) / 16384) + / sizeof(struct list_head)); + if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + size = 8192; + if (size < 16) + size = 16; + } + /* FIXME: don't use vmalloc() here or anywhere else -HW */ + hinfo = vmalloc(sizeof(struct ipt_hashlimit_htable) + + (sizeof(struct list_head) * size)); + if (!hinfo) { + printk(KERN_ERR "ipt_hashlimit: Unable to create hashtable\n"); + return -1; + } + minfo->hinfo = hinfo; + + /* copy match config into hashtable config */ + memcpy(&hinfo->cfg, &minfo->cfg, sizeof(hinfo->cfg)); + hinfo->cfg.size = size; + if (!hinfo->cfg.max) + hinfo->cfg.max = 8 * hinfo->cfg.size; + else if (hinfo->cfg.max < hinfo->cfg.size) + hinfo->cfg.max = hinfo->cfg.size; + + for (i = 0; i < hinfo->cfg.size; i++) + INIT_HLIST_HEAD(&hinfo->hash[i]); + + atomic_set(&hinfo->count, 0); + atomic_set(&hinfo->use, 1); + hinfo->rnd = 0; + spin_lock_init(&hinfo->lock); + hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); + if (!hinfo->pde) { + vfree(hinfo); + return -1; + } + hinfo->pde->proc_fops = &dl_file_ops; + hinfo->pde->data = hinfo; + + init_timer(&hinfo->timer); + hinfo->timer.expires = jiffies + msecs_to_jiffies(hinfo->cfg.gc_interval); + hinfo->timer.data = (unsigned long )hinfo; + hinfo->timer.function = htable_gc; + add_timer(&hinfo->timer); + + LOCK_BH(&hashlimit_lock); + hlist_add_head(&hinfo->node, &hashlimit_htables); + UNLOCK_BH(&hashlimit_lock); + + return 0; +} + +static int select_all(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he) +{ + return 1; +} + +static int select_gc(struct ipt_hashlimit_htable *ht, struct dsthash_ent *he) +{ + return (jiffies >= he->expires); +} + +static void htable_selective_cleanup(struct ipt_hashlimit_htable *ht, + int (*select)(struct ipt_hashlimit_htable *ht, + struct dsthash_ent *he)) +{ + int i; + + IP_NF_ASSERT(ht->cfg.size && ht->cfg.max); + + /* lock hash table and iterate over it */ + spin_lock_bh(&ht->lock); + for (i = 0; i < ht->cfg.size; i++) { + struct dsthash_ent *dh; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(dh, pos, n, &ht->hash[i], node) { + if ((*select)(ht, dh)) + __dsthash_free(ht, dh); + } + } + spin_unlock_bh(&ht->lock); +} + +/* hash table garbage collector, run by timer */ +static void htable_gc(unsigned long htlong) +{ + struct ipt_hashlimit_htable *ht = (struct ipt_hashlimit_htable *)htlong; + + htable_selective_cleanup(ht, select_gc); + + /* re-add the timer accordingly */ + ht->timer.expires = jiffies + msecs_to_jiffies(ht->cfg.gc_interval); + 
add_timer(&ht->timer); +} + +static void htable_destroy(struct ipt_hashlimit_htable *hinfo) +{ + /* remove timer, if it is pending */ + if (timer_pending(&hinfo->timer)) + del_timer(&hinfo->timer); + + /* remove proc entry */ + remove_proc_entry(hinfo->pde->name, hashlimit_procdir); + + htable_selective_cleanup(hinfo, select_all); + vfree(hinfo); +} + +static struct ipt_hashlimit_htable *htable_find_get(char *name) +{ + struct ipt_hashlimit_htable *hinfo; + struct hlist_node *pos; + + LOCK_BH(&hashlimit_lock); + hlist_for_each_entry(hinfo, pos, &hashlimit_htables, node) { + if (!strcmp(name, hinfo->pde->name)) { + atomic_inc(&hinfo->use); + UNLOCK_BH(&hashlimit_lock); + return hinfo; + } + } + UNLOCK_BH(&hashlimit_lock); + + return NULL; +} + +static void htable_put(struct ipt_hashlimit_htable *hinfo) +{ + if (atomic_dec_and_test(&hinfo->use)) { + LOCK_BH(&hashlimit_lock); + hlist_del(&hinfo->node); + UNLOCK_BH(&hashlimit_lock); + htable_destroy(hinfo); + } +} + + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maximum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32 ie. +*/ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +/* Precision saver. */ +static inline u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_HASHLIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_HASHLIMIT_SCALE; +} + +static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) +{ + dh->rateinfo.credit += (now - xchg(&dh->rateinfo.prev, now)) + * CREDITS_PER_JIFFY; + if (dh->rateinfo.credit > dh->rateinfo.credit_cap) + dh->rateinfo.credit = dh->rateinfo.credit_cap; +} + +static inline int get_ports(const struct sk_buff *skb, int offset, + u16 ports[2]) +{ + union { + struct tcphdr th; + struct udphdr uh; + sctp_sctphdr_t sctph; + } hdr_u, *ptr_u; + + /* Must not be a fragment. */ + if (offset) + return 1; + + /* Must be big enough to read ports (both UDP and TCP have + them at the start). 
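get_ports() can cover TCP, UDP and SCTP with a single 8-byte skb_header_pointer() read because all three protocols place the 16-bit source and destination ports in the first four bytes of their headers, which is what the union of the three header types relies on. A minimal illustration of that shared layout with simplified header structs (not the real kernel definitions):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* simplified views of the three transport headers: the port pair comes first */
struct tcp_like  { uint16_t source, dest; uint32_t seq; /* ... */ };
struct udp_like  { uint16_t source, dest; uint16_t len, check; };
struct sctp_like { uint16_t source, dest; uint32_t vtag; /* ... */ };

int main(void)
{
	printf("tcp  src/dst at %zu/%zu\n",
	       offsetof(struct tcp_like, source), offsetof(struct tcp_like, dest));
	printf("udp  src/dst at %zu/%zu\n",
	       offsetof(struct udp_like, source), offsetof(struct udp_like, dest));
	printf("sctp src/dst at %zu/%zu\n",
	       offsetof(struct sctp_like, source), offsetof(struct sctp_like, dest));
	return 0;	/* all three print 0/2 */
}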
*/ + ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); + if (!ptr_u) + return 1; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + ports[0] = ptr_u->th.source; + ports[1] = ptr_u->th.dest; + break; + case IPPROTO_UDP: + ports[0] = ptr_u->uh.source; + ports[1] = ptr_u->uh.dest; + break; + case IPPROTO_SCTP: + ports[0] = ptr_u->sctph.source; + ports[1] = ptr_u->sctph.dest; + break; + default: + /* all other protocols don't supprot per-port hash + * buckets */ + ports[0] = ports[1] = 0; + break; + } + + return 0; +} + + +static int +hashlimit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ipt_hashlimit_info *r = + ((struct ipt_hashlimit_info *)matchinfo)->u.master; + struct ipt_hashlimit_htable *hinfo = r->hinfo; + unsigned long now = jiffies; + struct dsthash_ent *dh; + struct dsthash_dst dst; + + /* build 'dst' according to hinfo->cfg and current packet */ + memset(&dst, 0, sizeof(dst)); + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DIP) + dst.dst_ip = skb->nh.iph->daddr; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SIP) + dst.src_ip = skb->nh.iph->saddr; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT + ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { + u_int16_t ports[2]; + if (get_ports(skb, offset, ports)) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + *hotdrop = 1; + return 0; + } + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) + dst.src_port = ports[0]; + if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT) + dst.dst_port = ports[1]; + } + + spin_lock_bh(&hinfo->lock); + dh = __dsthash_find(hinfo, &dst); + if (!dh) { + dh = __dsthash_alloc_init(hinfo, &dst); + + if (!dh) { + /* enomem... don't match == DROP */ + if (net_ratelimit()) + printk(KERN_ERR "%s: ENOMEM\n", __FUNCTION__); + spin_unlock_bh(&hinfo->lock); + return 0; + } + + dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire); + + dh->rateinfo.prev = jiffies; + dh->rateinfo.credit = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.credit_cap = user2credits(hinfo->cfg.avg * + hinfo->cfg.burst); + dh->rateinfo.cost = user2credits(hinfo->cfg.avg); + + spin_unlock_bh(&hinfo->lock); + return 1; + } + + /* update expiration timeout */ + dh->expires = now + msecs_to_jiffies(hinfo->cfg.expire); + + rateinfo_recalc(dh, now); + if (dh->rateinfo.credit >= dh->rateinfo.cost) { + /* We're underlimit. */ + dh->rateinfo.credit -= dh->rateinfo.cost; + spin_unlock_bh(&hinfo->lock); + return 1; + } + + spin_unlock_bh(&hinfo->lock); + + /* default case: we're overlimit, thus don't match */ + return 0; +} + +static int +hashlimit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_hashlimit_info *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_hashlimit_info))) + return 0; + + /* Check for overflow. 
*/ + if (r->cfg.burst == 0 + || user2credits(r->cfg.avg * r->cfg.burst) < + user2credits(r->cfg.avg)) { + printk(KERN_ERR "ipt_hashlimit: Overflow, try lower: %u/%u\n", + r->cfg.avg, r->cfg.burst); + return 0; + } + + if (r->cfg.mode == 0 + || r->cfg.mode > (IPT_HASHLIMIT_HASH_DPT + |IPT_HASHLIMIT_HASH_DIP + |IPT_HASHLIMIT_HASH_SIP + |IPT_HASHLIMIT_HASH_SPT)) + return 0; + + if (!r->cfg.gc_interval) + return 0; + + if (!r->cfg.expire) + return 0; + + /* This is the best we've got: We cannot release and re-grab lock, + * since checkentry() is called before ip_tables.c grabs ipt_mutex. + * We also cannot grab the hashtable spinlock, since htable_create will + * call vmalloc, and that can sleep. And we cannot just re-search + * the list of htable's in htable_create(), since then we would + * create duplicate proc files. -HW */ + down(&hlimit_mutex); + r->hinfo = htable_find_get(r->name); + if (!r->hinfo && (htable_create(r) != 0)) { + up(&hlimit_mutex); + return 0; + } + up(&hlimit_mutex); + + /* Ugly hack: For SMP, we only want to use one set */ + r->u.master = r; + + return 1; +} + +static void +hashlimit_destroy(void *matchinfo, unsigned int matchsize) +{ + struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + + htable_put(r->hinfo); +} + +static struct ipt_match ipt_hashlimit = { + .name = "hashlimit", + .match = hashlimit_match, + .checkentry = hashlimit_checkentry, + .destroy = hashlimit_destroy, + .me = THIS_MODULE +}; + +/* PROC stuff */ + +static void *dl_seq_start(struct seq_file *s, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket; + + spin_lock_bh(&htable->lock); + if (*pos >= htable->cfg.size) + return NULL; + + bucket = kmalloc(sizeof(unsigned int), GFP_ATOMIC); + if (!bucket) + return ERR_PTR(-ENOMEM); + + *bucket = *pos; + return bucket; +} + +static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + + *pos = ++(*bucket); + if (*pos >= htable->cfg.size) { + kfree(v); + return NULL; + } + return bucket; +} + +static void dl_seq_stop(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + + kfree(bucket); + + spin_unlock_bh(&htable->lock); +} + +static inline int dl_seq_real_show(struct dsthash_ent *ent, struct seq_file *s) +{ + /* recalculate to show accurate numbers */ + rateinfo_recalc(ent, jiffies); + + return seq_printf(s, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u\n", + (long)(ent->expires - jiffies)/HZ, + NIPQUAD(ent->dst.src_ip), ntohs(ent->dst.src_port), + NIPQUAD(ent->dst.dst_ip), ntohs(ent->dst.dst_port), + ent->rateinfo.credit, ent->rateinfo.credit_cap, + ent->rateinfo.cost); +} + +static int dl_seq_show(struct seq_file *s, void *v) +{ + struct proc_dir_entry *pde = s->private; + struct ipt_hashlimit_htable *htable = pde->data; + unsigned int *bucket = (unsigned int *)v; + struct dsthash_ent *ent; + struct hlist_node *pos; + + if (!hlist_empty(&htable->hash[*bucket])) + hlist_for_each_entry(ent, pos, &htable->hash[*bucket], node) { + if (dl_seq_real_show(ent, s)) { + /* buffer was filled and unable to print that tuple */ + return 1; + } + } + + return 0; +} + +static struct seq_operations dl_seq_ops = { + .start = dl_seq_start, + .next = dl_seq_next, + .stop = dl_seq_stop, + .show = dl_seq_show +}; + 
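Each entry in the per-table proc file is printed by dl_seq_real_show() above as the remaining lifetime in seconds, the source and destination endpoints, and the current credit, credit cap and cost. A userspace sketch that parses one such line back into numbers; the sample line is fabricated to follow the seq_printf() format shown above and is not real output:

#include <stdio.h>

int main(void)
{
	/* fabricated sample in the dl_seq_real_show() format */
	const char *line = "57 10.0.0.1:12345->10.0.0.2:80 24576 25600 5120";
	long expires;
	unsigned s[4], d[4], sport, dport, credit, cap, cost;

	if (sscanf(line, "%ld %u.%u.%u.%u:%u->%u.%u.%u.%u:%u %u %u %u",
		   &expires, &s[0], &s[1], &s[2], &s[3], &sport,
		   &d[0], &d[1], &d[2], &d[3], &dport,
		   &credit, &cap, &cost) == 14)
		printf("%u.%u.%u.%u:%u has %u of %u credits left\n",
		       s[0], s[1], s[2], s[3], sport, credit, cap);
	return 0;
}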
+static int dl_proc_open(struct inode *inode, struct file *file) +{ + int ret = seq_open(file, &dl_seq_ops); + + if (!ret) { + struct seq_file *sf = file->private_data; + sf->private = PDE(inode); + } + return ret; +} + +static struct file_operations dl_file_ops = { + .owner = THIS_MODULE, + .open = dl_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +static int init_or_fini(int fini) +{ + int ret = 0; + + if (fini) + goto cleanup; + + if (ipt_register_match(&ipt_hashlimit)) { + ret = -EINVAL; + goto cleanup_nothing; + } + + hashlimit_cachep = kmem_cache_create("ipt_hashlimit", + sizeof(struct dsthash_ent), 0, + 0, NULL, NULL); + if (!hashlimit_cachep) { + printk(KERN_ERR "Unable to create ipt_hashlimit slab cache\n"); + ret = -ENOMEM; + goto cleanup_unreg_match; + } + + hashlimit_procdir = proc_mkdir("ipt_hashlimit", proc_net); + if (!hashlimit_procdir) { + printk(KERN_ERR "Unable to create proc dir entry\n"); + ret = -ENOMEM; + goto cleanup_free_slab; + } + + return ret; + +cleanup: + remove_proc_entry("ipt_hashlimit", proc_net); +cleanup_free_slab: + kmem_cache_destroy(hashlimit_cachep); +cleanup_unreg_match: + ipt_unregister_match(&ipt_hashlimit); +cleanup_nothing: + return ret; + +} + +static int __init init(void) +{ + return init_or_fini(0); +} + +static void __exit fini(void) +{ + init_or_fini(1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_helper.c b/net/ipv4/netfilter/ipt_helper.c new file mode 100644 index 000000000000..33fdf364d3d3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_helper.c @@ -0,0 +1,113 @@ +/* iptables module to match on related connections */ +/* + * (C) 2001 Martin Josefsson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Mar 2002 Harald Welte : + * - Port to newnat infrastructure + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Martin Josefsson "); +MODULE_DESCRIPTION("iptables helper match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_helper_info *info = matchinfo; + struct ip_conntrack *ct; + enum ip_conntrack_info ctinfo; + int ret = info->invert; + + ct = ip_conntrack_get((struct sk_buff *)skb, &ctinfo); + if (!ct) { + DEBUGP("ipt_helper: Eek! 
invalid conntrack?\n"); + return ret; + } + + if (!ct->master) { + DEBUGP("ipt_helper: conntrack %p has no master\n", ct); + return ret; + } + + READ_LOCK(&ip_conntrack_lock); + if (!ct->master->helper) { + DEBUGP("ipt_helper: master ct %p has no helper\n", + exp->expectant); + goto out_unlock; + } + + DEBUGP("master's name = %s , info->name = %s\n", + ct->master->helper->name, info->name); + + if (info->name[0] == '\0') + ret ^= 1; + else + ret ^= !strncmp(ct->master->helper->name, info->name, + strlen(ct->master->helper->name)); +out_unlock: + READ_UNLOCK(&ip_conntrack_lock); + return ret; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_helper_info *info = matchinfo; + + info->name[29] = '\0'; + + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_helper_info))) + return 0; + + return 1; +} + +static struct ipt_match helper_match = { + .name = "helper", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&helper_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&helper_match); +} + +module_init(init); +module_exit(fini); + diff --git a/net/ipv4/netfilter/ipt_iprange.c b/net/ipv4/netfilter/ipt_iprange.c new file mode 100644 index 000000000000..b835b7b2e560 --- /dev/null +++ b/net/ipv4/netfilter/ipt_iprange.c @@ -0,0 +1,99 @@ +/* + * iptables module to match IP address ranges + * + * (C) 2003 Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Jozsef Kadlecsik "); +MODULE_DESCRIPTION("iptables arbitrary IP range match module"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_iprange_info *info = matchinfo; + const struct iphdr *iph = skb->nh.iph; + + if (info->flags & IPRANGE_SRC) { + if (((ntohl(iph->saddr) < ntohl(info->src.min_ip)) + || (ntohl(iph->saddr) > ntohl(info->src.max_ip))) + ^ !!(info->flags & IPRANGE_SRC_INV)) { + DEBUGP("src IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->saddr), + info->flags & IPRANGE_SRC_INV ? "(INV) " : "", + NIPQUAD(info->src.min_ip), + NIPQUAD(info->src.max_ip)); + return 0; + } + } + if (info->flags & IPRANGE_DST) { + if (((ntohl(iph->daddr) < ntohl(info->dst.min_ip)) + || (ntohl(iph->daddr) > ntohl(info->dst.max_ip))) + ^ !!(info->flags & IPRANGE_DST_INV)) { + DEBUGP("dst IP %u.%u.%u.%u NOT in range %s" + "%u.%u.%u.%u-%u.%u.%u.%u\n", + NIPQUAD(iph->daddr), + info->flags & IPRANGE_DST_INV ? 
"(INV) " : "", + NIPQUAD(info->dst.min_ip), + NIPQUAD(info->dst.max_ip)); + return 0; + } + } + return 1; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* verify size */ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_iprange_info))) + return 0; + + return 1; +} + +static struct ipt_match iprange_match = +{ + .list = { NULL, NULL }, + .name = "iprange", + .match = &match, + .checkentry = &check, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&iprange_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&iprange_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_length.c b/net/ipv4/netfilter/ipt_length.c new file mode 100644 index 000000000000..4eabcfbda9d1 --- /dev/null +++ b/net/ipv4/netfilter/ipt_length.c @@ -0,0 +1,64 @@ +/* Kernel module to match packet length. */ +/* (C) 1999-2001 James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include + +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("IP tables packet length matching module"); +MODULE_LICENSE("GPL"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.iph->tot_len); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_length_info))) + return 0; + + return 1; +} + +static struct ipt_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&length_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&length_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_limit.c b/net/ipv4/netfilter/ipt_limit.c new file mode 100644 index 000000000000..0c24dcc703a5 --- /dev/null +++ b/net/ipv4/netfilter/ipt_limit.c @@ -0,0 +1,157 @@ +/* Kernel module to control the rate + * + * 2 September 1999: Changed from the target RATE to the match + * `limit', removed logging. Did I mention that + * Alexey is a fucking genius? + * Rusty Russell (rusty@rustcorp.com.au). */ + +/* (C) 1999 Jérôme de Vivie + * (C) 1999 Hervé Eychenne + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne "); +MODULE_DESCRIPTION("iptables rate limit match"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. 
The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To get the maxmum range, we multiply by this factor (ie. you get N + credits per jiffy). We want to allow a rate as low as 1 per day + (slowest userspace tool allows), which means + CREDITS_PER_JIFFY*HZ*60*60*24 < 2^32. ie. */ +#define MAX_CPJ (0xFFFFFFFF / (HZ*60*60*24)) + +/* Repeated shift and or gives us all 1s, final shift and add 1 gives + * us the power of 2 below the theoretical max, so GCC simply does a + * shift. */ +#define _POW2_BELOW2(x) ((x)|((x)>>1)) +#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2)) +#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4)) +#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8)) +#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16)) +#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1) + +#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ) + +static int +ipt_limit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + struct ipt_rateinfo *r = ((struct ipt_rateinfo *)matchinfo)->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; + if (r->credit > r->credit_cap) + r->credit = r->credit_cap; + + if (r->credit >= r->cost) { + /* We're not limited. */ + r->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return 1; + } + + spin_unlock_bh(&limit_lock); + return 0; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IPT_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IPT_LIMIT_SCALE; +} + +static int +ipt_limit_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ipt_rateinfo *r = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_rateinfo))) + return 0; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + printk("Overflow in ipt_limit, try lower: %u/%u\n", + r->avg, r->burst); + return 0; + } + + /* User avg in seconds * IPT_LIMIT_SCALE: convert to jiffies * + 128. */ + r->prev = jiffies; + r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + + /* For SMP, we only want to use one set of counters. 
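The comment above describes the token-bucket arithmetic, and the numbers are easiest to see worked out. The sketch below restates user2credits() and the CREDITS_PER_JIFFY derivation in userspace, assuming HZ=100, a scale constant of 10000, and that the iptables userspace encodes "--limit 5/second --limit-burst 5" as avg = 10000/5 = 2000 with burst = 5; none of those values are visible in this hunk:

#include <stdio.h>
#include <stdint.h>

#define HZ		100	/* assumed */
#define LIMIT_SCALE	10000	/* assumed value of IPT_LIMIT_SCALE */
#define MAX_CPJ (0xFFFFFFFFu / (HZ*60*60*24))
#define _POW2_BELOW2(x) ((x)|((x)>>1))
#define _POW2_BELOW4(x) (_POW2_BELOW2(x)|_POW2_BELOW2((x)>>2))
#define _POW2_BELOW8(x) (_POW2_BELOW4(x)|_POW2_BELOW4((x)>>4))
#define _POW2_BELOW16(x) (_POW2_BELOW8(x)|_POW2_BELOW8((x)>>8))
#define _POW2_BELOW32(x) (_POW2_BELOW16(x)|_POW2_BELOW16((x)>>16))
#define POW2_BELOW32(x) ((_POW2_BELOW32(x)>>1) + 1)
#define CREDITS_PER_JIFFY POW2_BELOW32(MAX_CPJ)

static uint32_t user2credits(uint32_t user)
{
	if (user > 0xFFFFFFFFu / (HZ*CREDITS_PER_JIFFY))
		return (user / LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY;
	return (user * HZ * CREDITS_PER_JIFFY) / LIMIT_SCALE;
}

int main(void)
{
	uint32_t avg = 2000, burst = 5;

	printf("credits per jiffy: %u\n", (unsigned)CREDITS_PER_JIFFY);	/* 256 */
	printf("cost per packet:   %u\n", user2credits(avg));		/* 5120 */
	printf("credit cap:        %u\n", user2credits(avg * burst));	/* 25600 */
	/* an idle second refills HZ*CREDITS_PER_JIFFY = 25600 credits,
	 * i.e. exactly five packets' worth: 5/second on average. */
	return 0;
}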
*/ + r->master = r; + + return 1; +} + +static struct ipt_match ipt_limit_reg = { + .name = "limit", + .match = ipt_limit_match, + .checkentry = ipt_limit_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ipt_register_match(&ipt_limit_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ipt_limit_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mac.c b/net/ipv4/netfilter/ipt_mac.c new file mode 100644 index 000000000000..11a459e33f25 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mac.c @@ -0,0 +1,79 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables mac matching module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_mac_info *info = matchinfo; + + /* Is mac pointer valid? */ + return (skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data + /* If so, compare... */ + && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) + == 0) ^ info->invert)); +} + +static int +ipt_mac_checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + /* FORWARD isn't always valid, but it's nice to be able to do --RR */ + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD))) { + printk("ipt_mac: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mac_info))) + return 0; + + return 1; +} + +static struct ipt_match mac_match = { + .name = "mac", + .match = &match, + .checkentry = &ipt_mac_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&mac_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mac_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_mark.c b/net/ipv4/netfilter/ipt_mark.c new file mode 100644 index 000000000000..8955728127b9 --- /dev/null +++ b/net/ipv4/netfilter/ipt_mark.c @@ -0,0 +1,64 @@ +/* Kernel module to match NFMARK values. */ + +/* (C) 1999-2001 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("iptables mark matching module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_mark_info *info = matchinfo; + + return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_mark_info))) + return 0; + + return 1; +} + +static struct ipt_match mark_match = { + .name = "mark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&mark_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&mark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_multiport.c b/net/ipv4/netfilter/ipt_multiport.c new file mode 100644 index 000000000000..99e8188162e2 --- /dev/null +++ b/net/ipv4/netfilter/ipt_multiport.c @@ -0,0 +1,212 @@ +/* Kernel module to match one of a list of TCP/UDP ports: ports are in + the same place so we can treat them as equal. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables multiple port match module"); + +#if 0 +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the port is matched by the test, 0 otherwise. */ +static inline int +ports_match(const u_int16_t *portlist, enum ipt_multiport_flags flags, + u_int8_t count, u_int16_t src, u_int16_t dst) +{ + unsigned int i; + for (i=0; icount; i++) { + s = minfo->ports[i]; + + if (minfo->pflags[i]) { + /* range port matching */ + e = minfo->ports[++i]; + duprintf("src or dst matches with %d-%d?\n", s, e); + + if (minfo->flags == IPT_MULTIPORT_SOURCE + && src >= s && src <= e) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_DESTINATION + && dst >= s && dst <= e) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_EITHER + && ((dst >= s && dst <= e) + || (src >= s && src <= e))) + return 1 ^ minfo->invert; + } else { + /* exact port matching */ + duprintf("src or dst matches with %d?\n", s); + + if (minfo->flags == IPT_MULTIPORT_SOURCE + && src == s) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_DESTINATION + && dst == s) + return 1 ^ minfo->invert; + if (minfo->flags == IPT_MULTIPORT_EITHER + && (src == s || dst == s)) + return 1 ^ minfo->invert; + } + } + + return minfo->invert; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + u16 _ports[2], *pptr; + const struct ipt_multiport *multiinfo = matchinfo; + + if (offset) + return 0; + + pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ports), _ports); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. 
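ports_match() above (revision 0) walks a flat list of ports, while the revision 1 variant also understands ranges via its pflags array. A simplified userspace sketch of the list form, checking a packet's ports against a small list; invert handling is left out and the flag values and ports are stand-ins, not the real IPT_MULTIPORT_* constants:

#include <stdio.h>
#include <stdint.h>

enum { MP_SOURCE, MP_DESTINATION, MP_EITHER };	/* stand-in flag values */

static int list_ports_match(const uint16_t *list, unsigned count, int flags,
			    uint16_t src, uint16_t dst)
{
	unsigned i;

	for (i = 0; i < count; i++) {
		if (flags == MP_SOURCE && src == list[i])
			return 1;
		if (flags == MP_DESTINATION && dst == list[i])
			return 1;
		if (flags == MP_EITHER && (src == list[i] || dst == list[i]))
			return 1;
	}
	return 0;
}

int main(void)
{
	uint16_t ports[] = { 25, 80, 443 };

	printf("%d\n", list_ports_match(ports, 3, MP_DESTINATION, 40000, 80));	/* 1 */
	printf("%d\n", list_ports_match(ports, 3, MP_DESTINATION, 25, 8080));	/* 0 */
	return 0;
}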
Hence, no choice but to drop. + */ + duprintf("ipt_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return ports_match(multiinfo->ports, + multiinfo->flags, multiinfo->count, + ntohs(pptr[0]), ntohs(pptr[1])); +} + +static int +match_v1(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + u16 _ports[2], *pptr; + const struct ipt_multiport_v1 *multiinfo = matchinfo; + + if (offset) + return 0; + + pptr = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_ports), _ports); + if (pptr == NULL) { + /* We've been asked to examine this packet, and we + * can't. Hence, no choice but to drop. + */ + duprintf("ipt_multiport:" + " Dropping evil offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return ports_match_v1(multiinfo, ntohs(pptr[0]), ntohs(pptr[1])); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport))); +} + +static int +checkentry_v1(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + return (matchsize == IPT_ALIGN(sizeof(struct ipt_multiport_v1))); +} + +static struct ipt_match multiport_match = { + .name = "multiport", + .revision = 0, + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static struct ipt_match multiport_match_v1 = { + .name = "multiport", + .revision = 1, + .match = &match_v1, + .checkentry = &checkentry_v1, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + int err; + + err = ipt_register_match(&multiport_match); + if (!err) { + err = ipt_register_match(&multiport_match_v1); + if (err) + ipt_unregister_match(&multiport_match); + } + + return err; +} + +static void __exit fini(void) +{ + ipt_unregister_match(&multiport_match); + ipt_unregister_match(&multiport_match_v1); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_owner.c b/net/ipv4/netfilter/ipt_owner.c new file mode 100644 index 000000000000..3b9065e06381 --- /dev/null +++ b/net/ipv4/netfilter/ipt_owner.c @@ -0,0 +1,217 @@ +/* Kernel module to match various things tied to sockets associated with + locally generated outgoing packets. */ + +/* (C) 2000 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("iptables owner match"); + +static int +match_comm(const struct sk_buff *skb, const char *comm) +{ + struct task_struct *g, *p; + struct files_struct *files; + int i; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if(strncmp(p->comm, comm, sizeof(p->comm))) + continue; + + task_lock(p); + files = p->files; + if(files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == + skb->sk->sk_socket->file) { + spin_unlock(&files->file_lock); + task_unlock(p); + read_unlock(&tasklist_lock); + return 1; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); + } while_each_thread(g, p); + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_pid(const struct sk_buff *skb, pid_t pid) +{ + struct task_struct *p; + struct files_struct *files; + int i; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out; + task_lock(p); + files = p->files; + if(files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == + skb->sk->sk_socket->file) { + spin_unlock(&files->file_lock); + task_unlock(p); + read_unlock(&tasklist_lock); + return 1; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); +out: + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_sid(const struct sk_buff *skb, pid_t sid) +{ + struct task_struct *g, *p; + struct file *file = skb->sk->sk_socket->file; + int i, found=0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + struct files_struct *files; + if (p->signal->session != sid) + continue; + + task_lock(p); + files = p->files; + if (files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == file) { + found = 1; + break; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); + if (found) + goto out; + } while_each_thread(g, p); +out: + read_unlock(&tasklist_lock); + + return found; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_owner_info *info = matchinfo; + + if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) + return 0; + + if(info->match & IPT_OWNER_UID) { + if ((skb->sk->sk_socket->file->f_uid != info->uid) ^ + !!(info->invert & IPT_OWNER_UID)) + return 0; + } + + if(info->match & IPT_OWNER_GID) { + if ((skb->sk->sk_socket->file->f_gid != info->gid) ^ + !!(info->invert & IPT_OWNER_GID)) + return 0; + } + + if(info->match & IPT_OWNER_PID) { + if (!match_pid(skb, info->pid) ^ + !!(info->invert & IPT_OWNER_PID)) + return 0; + } + + if(info->match & IPT_OWNER_SID) { + if (!match_sid(skb, info->sid) ^ + !!(info->invert & IPT_OWNER_SID)) + return 0; + } + + if(info->match & IPT_OWNER_COMM) { + if (!match_comm(skb, info->comm) ^ + !!(info->invert & IPT_OWNER_COMM)) + return 0; + } + + return 1; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_POST_ROUTING))) { + printk("ipt_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); + return 0; + } + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_owner_info))) { + printk("Matchsize %u != %Zu\n", matchsize, + IPT_ALIGN(sizeof(struct 
ipt_owner_info))); + return 0; + } +#ifdef CONFIG_SMP + /* files->file_lock can not be used in a BH */ + if (((struct ipt_owner_info *)matchinfo)->match + & (IPT_OWNER_PID|IPT_OWNER_SID|IPT_OWNER_COMM)) { + printk("ipt_owner: pid, sid and command matching is broken " + "on SMP.\n"); + return 0; + } +#endif + return 1; +} + +static struct ipt_match owner_match = { + .name = "owner", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&owner_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&owner_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c new file mode 100644 index 000000000000..1a53924041fc --- /dev/null +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -0,0 +1,134 @@ +/* Kernel module to match the bridge port in and + * out device for IP packets coming into contact with a bridge. */ + +/* (C) 2001-2003 Bart De Schuymer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#define MATCH 1 +#define NOMATCH 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("iptables bridge physical device match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + int i; + static const char nulldevname[IFNAMSIZ]; + const struct ipt_physdev_info *info = matchinfo; + unsigned int ret; + const char *indev, *outdev; + struct nf_bridge_info *nf_bridge; + + /* Not a bridged IP packet or no info available yet: + * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if + * the destination device will be a bridge. */ + if (!(nf_bridge = skb->nf_bridge)) { + /* Return MATCH if the invert flags of the used options are on */ + if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) && + !(info->invert & IPT_PHYSDEV_OP_BRIDGED)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_ISIN) && + !(info->invert & IPT_PHYSDEV_OP_ISIN)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_ISOUT) && + !(info->invert & IPT_PHYSDEV_OP_ISOUT)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_IN) && + !(info->invert & IPT_PHYSDEV_OP_IN)) + return NOMATCH; + if ((info->bitmask & IPT_PHYSDEV_OP_OUT) && + !(info->invert & IPT_PHYSDEV_OP_OUT)) + return NOMATCH; + return MATCH; + } + + /* This only makes sense in the FORWARD and POSTROUTING chains */ + if ((info->bitmask & IPT_PHYSDEV_OP_BRIDGED) && + (!!(nf_bridge->mask & BRNF_BRIDGED) ^ + !(info->invert & IPT_PHYSDEV_OP_BRIDGED))) + return NOMATCH; + + if ((info->bitmask & IPT_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & IPT_PHYSDEV_OP_ISIN))) || + (info->bitmask & IPT_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & IPT_PHYSDEV_OP_ISOUT)))) + return NOMATCH; + + if (!(info->bitmask & IPT_PHYSDEV_OP_IN)) + goto match_outdev; + indev = nf_bridge->physindev ? 
nf_bridge->physindev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)indev)[i] + ^ ((const unsigned int *)info->physindev)[i]) + & ((const unsigned int *)info->in_mask)[i]; + } + + if ((ret == 0) ^ !(info->invert & IPT_PHYSDEV_OP_IN)) + return NOMATCH; + +match_outdev: + if (!(info->bitmask & IPT_PHYSDEV_OP_OUT)) + return MATCH; + outdev = nf_bridge->physoutdev ? + nf_bridge->physoutdev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)outdev)[i] + ^ ((const unsigned int *)info->physoutdev)[i]) + & ((const unsigned int *)info->out_mask)[i]; + } + + return (ret != 0) ^ !(info->invert & IPT_PHYSDEV_OP_OUT); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_physdev_info *info = matchinfo; + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_physdev_info))) + return 0; + if (!(info->bitmask & IPT_PHYSDEV_OP_MASK) || + info->bitmask & ~IPT_PHYSDEV_OP_MASK) + return 0; + return 1; +} + +static struct ipt_match physdev_match = { + .name = "physdev", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&physdev_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&physdev_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_pkttype.c b/net/ipv4/netfilter/ipt_pkttype.c new file mode 100644 index 000000000000..8ddb1dc5e5ae --- /dev/null +++ b/net/ipv4/netfilter/ipt_pkttype.c @@ -0,0 +1,70 @@ +/* (C) 1999-2001 Michal Ludvig + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
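The physdev match above compares interface names a word at a time: it XORs the configured name against the actual physindev/physoutdev name and ANDs the result with a mask, so a zeroed tail in the mask acts as a wildcard (userspace presumably fills the mask only up to the fixed prefix for "eth+"-style patterns). A byte-wise, standalone sketch of the same idea; only the technique comes from the code above, the names are made up:

#define EX_IFNAMSIZ 16  /* mirrors IFNAMSIZ; everything here is illustrative */

static int ex_ifname_match(const char name[EX_IFNAMSIZ],
                           const char pattern[EX_IFNAMSIZ],
                           const unsigned char mask[EX_IFNAMSIZ])
{
        unsigned int i, diff = 0;

        for (i = 0; i < EX_IFNAMSIZ; i++)
                diff |= ((unsigned char)name[i] ^ (unsigned char)pattern[i])
                        & mask[i];

        return diff == 0;       /* 1 = match, before any inversion */
}

With pattern "eth" and a mask that covers only those three bytes, both "eth0" and "eth1" match; the kernel version performs the same comparison four bytes at a time for speed.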
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Michal Ludvig "); +MODULE_DESCRIPTION("IP tables match to match on linklayer packet type"); + +static int match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_pkttype_info *info = matchinfo; + + return (skb->pkt_type == info->pkttype) ^ info->invert; +} + +static int checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ +/* + if (hook_mask + & ~((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_IN) + | (1 << NF_IP_FORWARD))) { + printk("ipt_pkttype: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } +*/ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_pkttype_info))) + return 0; + + return 1; +} + +static struct ipt_match pkttype_match = { + .name = "pkttype", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&pkttype_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&pkttype_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_realm.c b/net/ipv4/netfilter/ipt_realm.c new file mode 100644 index 000000000000..54a6897ebaa6 --- /dev/null +++ b/net/ipv4/netfilter/ipt_realm.c @@ -0,0 +1,76 @@ +/* IP tables module for matching the routing realm + * + * $Id: ipt_realm.c,v 1.3 2004/03/05 13:25:40 laforge Exp $ + * + * (C) 2003 by Sampsa Ranta + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Sampsa Ranta "); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables realm match"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_realm_info *info = matchinfo; + struct dst_entry *dst = skb->dst; + + return (info->id == (dst->tclassid & info->mask)) ^ info->invert; +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP_POST_ROUTING) | (1 << NF_IP_FORWARD) | + (1 << NF_IP_LOCAL_OUT) | (1 << NF_IP_LOCAL_IN))) { + printk("ipt_realm: only valid for POST_ROUTING, LOCAL_OUT, " + "LOCAL_IN or FORWARD.\n"); + return 0; + } + if (matchsize != IPT_ALIGN(sizeof(struct ipt_realm_info))) { + printk("ipt_realm: invalid matchsize.\n"); + return 0; + } + return 1; +} + +static struct ipt_match realm_match = { + .name = "realm", + .match = match, + .checkentry = check, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&realm_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&realm_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c new file mode 100644 index 000000000000..25ab9fabdcba --- /dev/null +++ b/net/ipv4/netfilter/ipt_recent.c @@ -0,0 +1,1002 @@ +/* Kernel module to check if the source address has been seen recently. 
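The pkttype and realm matches above both return their test XORed with an invert flag, which is the idiom used throughout these modules: one code path serves both the plain and the negated ("!") form of an option. A tiny standalone illustration with made-up values:

#include <stdio.h>

/* (condition) ^ invert: invert == 0 keeps the result, invert == 1 flips it */
static int ex_match_tos(unsigned int tos, unsigned int want, int invert)
{
        return (tos == want) ^ invert;
}

int main(void)
{
        printf("%d %d\n",
               ex_match_tos(0x10, 0x10, 0),     /* 1: matches */
               ex_match_tos(0x10, 0x10, 1));    /* 0: same test, inverted */
        return 0;
}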
*/ +/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ +/* Author: Stephen Frost */ +/* Project Page: http://snowman.net/projects/ipt_recent/ */ +/* This software is distributed under the terms of the GPL, Version 2 */ +/* This copyright does not cover user programs that use kernel services + * by normal system calls. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#undef DEBUG +#define HASH_LOG 9 + +/* Defaults, these can be overridden on the module command-line. */ +static int ip_list_tot = 100; +static int ip_pkt_list_tot = 20; +static int ip_list_hash_size = 0; +static int ip_list_perms = 0644; +#ifdef DEBUG +static int debug = 1; +#endif + +static char version[] = +KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost . http://snowman.net/projects/ipt_recent/\n"; + +MODULE_AUTHOR("Stephen Frost "); +MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); +MODULE_LICENSE("GPL"); +module_param(ip_list_tot, int, 0400); +module_param(ip_pkt_list_tot, int, 0400); +module_param(ip_list_hash_size, int, 0400); +module_param(ip_list_perms, int, 0400); +#ifdef DEBUG +module_param(debug, int, 0600); +MODULE_PARM_DESC(debug,"debugging level, defaults to 1"); +#endif +MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); +MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); + +/* Structure of our list of recently seen addresses. */ +struct recent_ip_list { + u_int32_t addr; + u_int8_t ttl; + unsigned long last_seen; + unsigned long *last_pkts; + u_int32_t oldest_pkt; + u_int32_t hash_entry; + u_int32_t time_pos; +}; + +struct time_info_list { + u_int32_t position; + u_int32_t time; +}; + +/* Structure of our linked list of tables of recent lists. */ +struct recent_ip_tables { + char name[IPT_RECENT_NAME_LEN]; + int count; + int time_pos; + struct recent_ip_list *table; + struct recent_ip_tables *next; + spinlock_t list_lock; + int *hash_table; + struct time_info_list *time_info; +#ifdef CONFIG_PROC_FS + struct proc_dir_entry *status_proc; +#endif /* CONFIG_PROC_FS */ +}; + +/* Our current list of addresses we have recently seen. + * Only added to on a --set, and only updated on --set || --update + */ +static struct recent_ip_tables *r_tables = NULL; + +/* We protect r_list with this spinlock so two processors are not modifying + * the list at the same time. + */ +static DEFINE_SPINLOCK(recent_lock); + +#ifdef CONFIG_PROC_FS +/* Our /proc/net/ipt_recent entry */ +static struct proc_dir_entry *proc_net_ipt_recent = NULL; +#endif + +/* Function declaration for later. 
*/ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop); + +/* Function to hash a given address into the hash table of table_size size */ +static int hash_func(unsigned int addr, int table_size) +{ + int result = 0; + unsigned int value = addr; + do { result ^= value; } while((value >>= HASH_LOG)); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", + result & (table_size - 1), + addr, + table_size); +#endif + + return(result & (table_size - 1)); +} + +#ifdef CONFIG_PROC_FS +/* This is the function which produces the output for our /proc output + * interface which lists each IP address, the last seen time and the + * other recent times the address was seen. + */ + +static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +{ + int len = 0, count, last_len = 0, pkt_count; + off_t pos = 0; + off_t begin = 0; + struct recent_ip_tables *curr_table; + + curr_table = (struct recent_ip_tables*) data; + + spin_lock_bh(&curr_table->list_lock); + for(count = 0; count < ip_list_tot; count++) { + if(!curr_table->table[count].addr) continue; + last_len = len; + len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); + len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); + len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); + len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); + len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); + for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(!curr_table->table[count].last_pkts[pkt_count]) break; + len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); + } + len += sprintf(buffer+len,"\n"); + pos = begin + len; + if(pos < offset) { len = 0; begin = pos; } + if(pos > offset + length) { len = last_len; break; } + } + + *start = buffer + (offset - begin); + len -= (offset - begin); + if(len > length) len = length; + + spin_unlock_bh(&curr_table->list_lock); + return len; +} + +/* ip_recent_ctrl provides an interface for users to modify the table + * directly. This allows adding entries, removing entries, and + * flushing the entire table. 
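hash_func() above folds the 32-bit address onto itself in HASH_LOG-bit chunks and then masks with table_size - 1, which only spreads entries evenly when the table size is a power of two; init() below rounds ip_list_hash_size up accordingly. A standalone copy for quick experimentation, assuming the module's defaults:

#include <stdio.h>

#define EX_HASH_LOG 9   /* same folding width as HASH_LOG above */

static unsigned int ex_hash(unsigned int addr, unsigned int table_size)
{
        unsigned int result = 0;

        do {
                result ^= addr;
        } while ((addr >>= EX_HASH_LOG));

        return result & (table_size - 1);       /* table_size must be 2^n */
}

int main(void)
{
        /* 512 slots, which init() picks for the default ip_list_tot of 100 */
        printf("%u\n", ex_hash(0xc0a80001u, 512));      /* 192.168.0.1 */
        return 0;
}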
+ * This is done by opening up the appropriate table for writing and + * sending one of: + * xx.xx.xx.xx -- Add entry to table with current time + * +xx.xx.xx.xx -- Add entry to table with current time + * -xx.xx.xx.xx -- Remove entry from table + * clear -- Flush table, remove all entries + */ + +static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) +{ + static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; + u_int32_t val; + int base, used = 0; + char c, *cp; + union iaddr { + uint8_t bytes[4]; + uint32_t word; + } res; + uint8_t *pp = res.bytes; + int digit; + + char buffer[20]; + int len, check_set = 0, count; + u_int32_t addr = 0; + struct sk_buff *skb; + struct ipt_recent_info *info; + struct recent_ip_tables *curr_table; + + curr_table = (struct recent_ip_tables*) data; + + if(size > 20) len = 20; else len = size; + + if(copy_from_user(buffer,input,len)) return -EFAULT; + + if(len < 20) buffer[len] = '\0'; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); +#endif + + cp = buffer; + while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } + + /* Check if we are asked to flush the entire table */ + if(!memcmp(cp,"clear",5)) { + used += 5; + spin_lock_bh(&curr_table->list_lock); + curr_table->time_pos = 0; + for(count = 0; count < ip_list_hash_size; count++) { + curr_table->hash_table[count] = -1; + } + for(count = 0; count < ip_list_tot; count++) { + curr_table->table[count].last_seen = 0; + curr_table->table[count].addr = 0; + curr_table->table[count].ttl = 0; + memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + curr_table->table[count].oldest_pkt = 0; + curr_table->table[count].time_pos = 0; + curr_table->time_info[count].position = count; + curr_table->time_info[count].time = 0; + } + spin_unlock_bh(&curr_table->list_lock); + return used; + } + + check_set = IPT_RECENT_SET; + switch(*cp) { + case '+': check_set = IPT_RECENT_SET; cp++; used++; break; + case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; + default: if(!isdigit(*cp)) return (used+1); break; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); +#endif + /* Get addr (effectively inet_aton()) */ + /* Shamelessly stolen from libc, a function in the kernel for doing + * this would, of course, be greatly preferred, but our options appear + * to be rather limited, so we will just do it ourselves here. + */ + res.word = 0; + + c = *cp; + for(;;) { + if(!isdigit(c)) return used; + val = 0; base = 10; digit = 0; + if(c == '0') { + c = *++cp; + if(c == 'x' || c == 'X') base = 16, c = *++cp; + else { base = 8; digit = 1; } + } + for(;;) { + if(isascii(c) && isdigit(c)) { + if(base == 8 && (c == '8' || c == '0')) return used; + val = (val * base) + (c - '0'); + c = *++cp; + digit = 1; + } else if(base == 16 && isascii(c) && isxdigit(c)) { + val = (val << 4) | (c + 10 - (islower(c) ? 
'a' : 'A')); + c = *++cp; + digit = 1; + } else break; + } + if(c == '.') { + if(pp > res.bytes + 2 || val > 0xff) return used; + *pp++ = val; + c = *++cp; + } else break; + } + used = cp - buffer; + if(c != '\0' && (!isascii(c) || !isspace(c))) return used; + if(c == '\n') used++; + if(!digit) return used; + + if(val > max[pp - res.bytes]) return used; + addr = res.word | htonl(val); + + if(!addr && check_set == IPT_RECENT_SET) return used; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); +#endif + + /* Set up and just call match */ + info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); + if(!info) { return -ENOMEM; } + info->seconds = 0; + info->hit_count = 0; + info->check_set = check_set; + info->invert = 0; + info->side = IPT_RECENT_SOURCE; + strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); + info->name[IPT_RECENT_NAME_LEN-1] = '\0'; + + skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); + if (!skb) { + used = -ENOMEM; + goto out_free_info; + } + skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); + if (!skb->nh.iph) { + used = -ENOMEM; + goto out_free_skb; + } + + skb->nh.iph->saddr = addr; + skb->nh.iph->daddr = 0; + /* Clear ttl since we have no way of knowing it */ + skb->nh.iph->ttl = 0; + match(skb,NULL,NULL,info,0,NULL); + + kfree(skb->nh.iph); +out_free_skb: + kfree(skb); +out_free_info: + kfree(info); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); +#endif + return used; +} + +#endif /* CONFIG_PROC_FS */ + +/* 'match' is our primary function, called by the kernel whenever a rule is + * hit with our module as an option to it. + * What this function does depends on what was specifically asked of it by + * the user: + * --set -- Add or update last seen time of the source address of the packet + * -- matchinfo->check_set == IPT_RECENT_SET + * --rcheck -- Just check if the source address is in the list + * -- matchinfo->check_set == IPT_RECENT_CHECK + * --update -- If the source address is in the list, update last_seen + * -- matchinfo->check_set == IPT_RECENT_UPDATE + * --remove -- If the source address is in the list, remove it + * -- matchinfo->check_set == IPT_RECENT_REMOVE + * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds + * -- matchinfo->seconds + * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times + * -- matchinfo->hit_count + * --seconds and --hitcount can be combined + */ +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + int pkt_count, hits_found, ans; + unsigned long now; + const struct ipt_recent_info *info = matchinfo; + u_int32_t addr = 0, time_temp; + u_int8_t ttl = skb->nh.iph->ttl; + int *hash_table; + int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; + struct time_info_list *time_info; + struct recent_ip_tables *curr_table; + struct recent_ip_tables *last_table; + struct recent_ip_list *r_list; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); +#endif + + /* Default is false ^ info->invert */ + ans = info->invert; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); +#endif + + /* if out != NULL then routing has been done and TTL changed. + * We change it back here internally for match what came in before routing. 
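The write handler above drives the /proc/net/ipt_recent/<table> file created for each rule, accepting the commands listed before ip_recent_ctrl(). A minimal userspace sketch, assuming a table that an iptables rule named "example" (one command per write, since the parser consumes a single address or keyword):

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/net/ipt_recent/example", "w");

        if (!f)
                return 1;
        fputs("+192.168.0.1\n", f);     /* add/update this address now */
        return fclose(f) ? 1 : 0;
}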
*/ + if(out) ttl++; + + /* Find the right table */ + spin_lock_bh(&recent_lock); + curr_table = r_tables; + while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); +#endif + + spin_unlock_bh(&recent_lock); + + /* Table with this name not found, match impossible */ + if(!curr_table) { return ans; } + + /* Make sure no one is changing the list while we work with it */ + spin_lock_bh(&curr_table->list_lock); + + r_list = curr_table->table; + if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; + + if(!addr) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); +#endif + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); +#endif + + /* Get jiffies now in case they changed while we were waiting for a lock */ + now = jiffies; + hash_table = curr_table->hash_table; + time_info = curr_table->time_info; + + orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); + /* Hash entry at this result used */ + /* Check for TTL match if requested. If TTL is zero then a match would never + * happen, so match regardless of existing TTL in that case. Zero means the + * entry was added via the /proc interface anyway, so we will just use the + * first TTL we get for that IP address. */ + if(info->check_set & IPT_RECENT_TTL) { + while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && + (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { + /* Collision in hash table */ + hash_result = (hash_result + 1) % ip_list_hash_size; + } + } else { + while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { + /* Collision in hash table */ + hash_result = (hash_result + 1) % ip_list_hash_size; + } + } + + if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { + /* IP not in list and not asked to SET */ + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + + /* Check if we need to handle the collision, do not need to on REMOVE */ + if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", + orig_hash_result, + hash_result, + r_list[hash_table[orig_hash_result]].addr, + addr); +#endif + + /* We had a collision. + * orig_hash_result is where we started, hash_result is where we ended up. 
+ * So, swap them because we are likely to see the same guy again sooner */ +#ifdef DEBUG + if(debug) { + printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); + printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", + r_list[hash_table[orig_hash_result]].hash_entry); + } +#endif + + r_list[hash_table[orig_hash_result]].hash_entry = hash_result; + + + temp = hash_table[orig_hash_result]; +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); +#endif + hash_table[orig_hash_result] = hash_table[hash_result]; + hash_table[hash_result] = temp; + temp = hash_result; + hash_result = orig_hash_result; + orig_hash_result = temp; + time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; + if(hash_table[hash_result] != -1) { + r_list[hash_table[hash_result]].hash_entry = hash_result; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); +#endif + } + + if(hash_table[hash_result] == -1) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", + hash_result, addr); +#endif + + /* New item found and IPT_RECENT_SET, so we need to add it */ + location = time_info[curr_table->time_pos].position; + hash_table[r_list[location].hash_entry] = -1; + hash_table[hash_result] = location; + memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + r_list[location].time_pos = curr_table->time_pos; + r_list[location].addr = addr; + r_list[location].ttl = ttl; + r_list[location].last_seen = now; + r_list[location].oldest_pkt = 1; + r_list[location].last_pkts[0] = now; + r_list[location].hash_entry = hash_result; + time_info[curr_table->time_pos].time = r_list[location].last_seen; + curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; + + ans = !info->invert; + } else { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", + hash_result, + addr); +#endif + + /* Existing item found */ + location = hash_table[hash_result]; + /* We have a match on address, now to make sure it meets all requirements for a + * full match. 
*/ + if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { + if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; + if(info->seconds && !info->hit_count) { + if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; + } + if(info->seconds && info->hit_count) { + for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; + } + if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + } + if(info->hit_count && !info->seconds) { + for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { + if(r_list[location].last_pkts[pkt_count] == 0) break; + hits_found++; + } + if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; + } + } +#ifdef DEBUG + if(debug) { + if(ans) + printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); + else + printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); + } +#endif + + /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the + * current timestamp to the last_seen. */ + if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); +#endif + /* Have to update our time info */ + time_loc = r_list[location].time_pos; + time_info[time_loc].time = now; + time_info[time_loc].position = location; + while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { + time_temp = time_info[time_loc].time; + time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; + time_info[(time_loc+1)%ip_list_tot].time = time_temp; + time_temp = time_info[time_loc].position; + time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; + time_info[(time_loc+1)%ip_list_tot].position = time_temp; + r_list[time_info[time_loc].position].time_pos = time_loc; + r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; + time_loc = (time_loc+1) % ip_list_tot; + } + r_list[location].time_pos = time_loc; + r_list[location].ttl = ttl; + r_list[location].last_pkts[r_list[location].oldest_pkt] = now; + r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; + r_list[location].last_seen = now; + } + /* If we have been asked to remove the entry from the list, just set it to 0 */ + if(info->check_set & IPT_RECENT_REMOVE) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); +#endif + /* Check if this is part of a collision chain */ + while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { + orig_hash_result++; + if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { + /* Found collision chain, how deep does this rabbit hole go? 
*/ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); +#endif + end_collision_chain = orig_hash_result; + } + } + if(end_collision_chain != -1) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); +#endif + /* Part of a collision chain, swap it with the end of the chain + * before removing. */ + r_list[hash_table[end_collision_chain]].hash_entry = hash_result; + temp = hash_table[end_collision_chain]; + hash_table[end_collision_chain] = hash_table[hash_result]; + hash_table[hash_result] = temp; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + hash_result = end_collision_chain; + r_list[hash_table[hash_result]].hash_entry = hash_result; + time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; + } + location = hash_table[hash_result]; + hash_table[r_list[location].hash_entry] = -1; + time_loc = r_list[location].time_pos; + time_info[time_loc].time = 0; + time_info[time_loc].position = location; + while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { + time_temp = time_info[time_loc].time; + time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; + time_info[(time_loc+1)%ip_list_tot].time = time_temp; + time_temp = time_info[time_loc].position; + time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; + time_info[(time_loc+1)%ip_list_tot].position = time_temp; + r_list[time_info[time_loc].position].time_pos = time_loc; + r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; + time_loc = (time_loc+1) % ip_list_tot; + } + r_list[location].time_pos = time_loc; + r_list[location].last_seen = 0; + r_list[location].addr = 0; + r_list[location].ttl = 0; + memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(u_int32_t)); + r_list[location].oldest_pkt = 0; + ans = !info->invert; + } + spin_unlock_bh(&curr_table->list_lock); + return ans; + } + + spin_unlock_bh(&curr_table->list_lock); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); +#endif + return ans; +} + +/* This function is to verify that the rule given during the userspace iptables + * command is correct. + * If the command is valid then we check if the table name referred to by the + * rule exists, if not it is created. 
+ */ +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + int flag = 0, c; + unsigned long *hold; + const struct ipt_recent_info *info = matchinfo; + struct recent_ip_tables *curr_table, *find_table, *last_table; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); +#endif + + if (matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return 0; + + /* seconds and hit_count only valid for CHECK/UPDATE */ + if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } + if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } + if(info->check_set & IPT_RECENT_CHECK) flag++; + if(info->check_set & IPT_RECENT_UPDATE) flag++; + + /* One and only one of these should ever be set */ + if(flag != 1) return 0; + + /* Name must be set to something */ + if(!info->name || !info->name[0]) return 0; + + /* Things look good, create a list for this if it does not exist */ + /* Lock the linked list while we play with it */ + spin_lock_bh(&recent_lock); + + /* Look for an entry with this name already created */ + /* Finds the end of the list and the entry before the end if current name does not exist */ + find_table = r_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + + /* If a table already exists just increment the count on that table and return */ + if(find_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); +#endif + find_table->count++; + spin_unlock_bh(&recent_lock); + return 1; + } + + spin_unlock_bh(&recent_lock); + + /* Table with this name not found */ + /* Allocate memory for new linked list item */ + +#ifdef DEBUG + if(debug) { + printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); + printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + } +#endif + + curr_table = vmalloc(sizeof(struct recent_ip_tables)); + if(curr_table == NULL) return 0; + + spin_lock_init(&curr_table->list_lock); + curr_table->next = NULL; + curr_table->count = 1; + curr_table->time_pos = 0; + strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); + curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; + + /* Allocate memory for this table and the list of packets in each entry. 
*/ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", + sizeof(struct recent_ip_list)*ip_list_tot, + info->name); +#endif + + curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); + if(curr_table->table == NULL) { vfree(curr_table); return 0; } + memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", + sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); +#endif + + hold = vmalloc(sizeof(u_int32_t)*ip_pkt_list_tot*ip_list_tot); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); +#endif + if(hold == NULL) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + for(c = 0; c < ip_list_tot; c++) { + curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; + } + + /* Allocate memory for the hash table */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", + sizeof(int)*ip_list_hash_size); +#endif + + curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); + if(!curr_table->hash_table) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + + for(c = 0; c < ip_list_hash_size; c++) { + curr_table->hash_table[c] = -1; + } + + /* Allocate memory for the time info */ +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", + sizeof(struct time_info_list)*ip_list_tot); +#endif + + curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); + if(!curr_table->time_info) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + for(c = 0; c < ip_list_tot; c++) { + curr_table->time_info[c].position = c; + curr_table->time_info[c].time = 0; + } + + /* Put the new table in place */ + spin_lock_bh(&recent_lock); + find_table = r_tables; + while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); + + /* If a table already exists just increment the count on that table and return */ + if(find_table) { + find_table->count++; + spin_unlock_bh(&recent_lock); +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); +#endif + vfree(curr_table->time_info); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 1; + } + if(!last_table) r_tables = curr_table; else last_table->next = curr_table; + + spin_unlock_bh(&recent_lock); + +#ifdef CONFIG_PROC_FS + /* Create our proc 'status' entry. 
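The allocation sequence above frees every earlier buffer by hand on each failure path. Purely as an illustration of the same sequence in the kernel's usual goto-unwind style (not the patch's code; it reuses the structures, module parameters, and allocation sizes defined above, and omits the field initialisation that follows allocation):

static struct recent_ip_tables *ex_alloc_recent_table(void)
{
        struct recent_ip_tables *t;
        unsigned long *hold;
        int c;

        t = vmalloc(sizeof(struct recent_ip_tables));
        if (!t)
                return NULL;
        t->table = vmalloc(sizeof(struct recent_ip_list) * ip_list_tot);
        if (!t->table)
                goto free_tables;
        hold = vmalloc(sizeof(u_int32_t) * ip_pkt_list_tot * ip_list_tot);
        if (!hold)
                goto free_list;
        for (c = 0; c < ip_list_tot; c++)
                t->table[c].last_pkts = hold + c * ip_pkt_list_tot;
        t->hash_table = vmalloc(sizeof(int) * ip_list_hash_size);
        if (!t->hash_table)
                goto free_pkts;
        t->time_info = vmalloc(sizeof(struct time_info_list) * ip_list_tot);
        if (!t->time_info)
                goto free_hash;
        return t;

free_hash:
        vfree(t->hash_table);
free_pkts:
        vfree(hold);
free_list:
        vfree(t->table);
free_tables:
        vfree(t);
        return NULL;
}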
*/ + curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); + if (!curr_table->status_proc) { + printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); + /* Destroy the created table */ + spin_lock_bh(&recent_lock); + last_table = NULL; + curr_table = r_tables; + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); +#endif + spin_unlock_bh(&recent_lock); + return 0; + } + while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); +#endif + spin_unlock_bh(&recent_lock); + return 0; + } + if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; + spin_unlock_bh(&recent_lock); + vfree(curr_table->time_info); + vfree(curr_table->hash_table); + vfree(hold); + vfree(curr_table->table); + vfree(curr_table); + return 0; + } + + curr_table->status_proc->owner = THIS_MODULE; + curr_table->status_proc->data = curr_table; + wmb(); + curr_table->status_proc->read_proc = ip_recent_get_info; + curr_table->status_proc->write_proc = ip_recent_ctrl; +#endif /* CONFIG_PROC_FS */ + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); +#endif + + return 1; +} + +/* This function is called in the event that a rule matching this module is + * removed. + * When this happens we need to check if there are no other rules matching + * the table given. If that is the case then we remove the table and clean + * up its memory. + */ +static void +destroy(void *matchinfo, unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_ip_tables *curr_table, *last_table; + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); +#endif + + if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; + + /* Lock the linked list while we play with it */ + spin_lock_bh(&recent_lock); + + /* Look for an entry with this name already created */ + /* Finds the end of the list and the entry before the end if current name does not exist */ + last_table = NULL; + curr_table = r_tables; + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); + + /* If a table does not exist then do nothing and return */ + if(!curr_table) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + + curr_table->count--; + + /* If count is still non-zero then there are still rules referenceing it so we do nothing */ + if(curr_table->count) { +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); +#endif + spin_unlock_bh(&recent_lock); + return; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); +#endif + + /* Count must be zero so we remove this table from the list */ + if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; + + spin_unlock_bh(&recent_lock); + + /* lock to make sure any 
late-runners still using this after we removed it from + * the list finish up then remove everything */ + spin_lock_bh(&curr_table->list_lock); + spin_unlock_bh(&curr_table->list_lock); + +#ifdef CONFIG_PROC_FS + if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +#endif /* CONFIG_PROC_FS */ + vfree(curr_table->table[0].last_pkts); + vfree(curr_table->table); + vfree(curr_table->hash_table); + vfree(curr_table->time_info); + vfree(curr_table); + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); +#endif + + return; +} + +/* This is the structure we pass to ipt_register to register our + * module with iptables. + */ +static struct ipt_match recent_match = { + .name = "recent", + .match = &match, + .checkentry = &checkentry, + .destroy = &destroy, + .me = THIS_MODULE +}; + +/* Kernel module initialization. */ +static int __init init(void) +{ + int err, count; + + printk(version); +#ifdef CONFIG_PROC_FS + proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); + if(!proc_net_ipt_recent) return -ENOMEM; +#endif + + if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { + printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); + ip_list_hash_size = 0; + } + + if(!ip_list_hash_size) { + ip_list_hash_size = ip_list_tot*3; + count = 2*2; + while(ip_list_hash_size > count) count = count*2; + ip_list_hash_size = count; + } + +#ifdef DEBUG + if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); +#endif + + err = ipt_register_match(&recent_match); + if (err) + remove_proc_entry("ipt_recent", proc_net); + return err; +} + +/* Kernel module destruction. */ +static void __exit fini(void) +{ + ipt_unregister_match(&recent_match); + + remove_proc_entry("ipt_recent",proc_net); +} + +/* Register our module with the kernel. */ +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_sctp.c b/net/ipv4/netfilter/ipt_sctp.c new file mode 100644 index 000000000000..fe2b327bcaa4 --- /dev/null +++ b/net/ipv4/netfilter/ipt_sctp.c @@ -0,0 +1,203 @@ +#include +#include +#include +#include + +#include +#include + +#ifdef DEBUG_SCTP +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) 
+#endif + +#define SCCHECK(cond, option, flag, invflag) (!((flag) & (option)) \ + || (!!((invflag) & (option)) ^ (cond))) + +static int +match_flags(const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + u_int8_t chunktype, + u_int8_t chunkflags) +{ + int i; + + for (i = 0; i < flag_count; i++) { + if (flag_info[i].chunktype == chunktype) { + return (chunkflags & flag_info[i].flag_mask) == flag_info[i].flag; + } + } + + return 1; +} + +static int +match_packet(const struct sk_buff *skb, + const u_int32_t *chunkmap, + int chunk_match_type, + const struct ipt_sctp_flag_info *flag_info, + const int flag_count, + int *hotdrop) +{ + int offset; + u_int32_t chunkmapcopy[256 / sizeof (u_int32_t)]; + sctp_chunkhdr_t _sch, *sch; + +#ifdef DEBUG_SCTP + int i = 0; +#endif + + if (chunk_match_type == SCTP_CHUNK_MATCH_ALL) { + SCTP_CHUNKMAP_COPY(chunkmapcopy, chunkmap); + } + + offset = skb->nh.iph->ihl * 4 + sizeof (sctp_sctphdr_t); + do { + sch = skb_header_pointer(skb, offset, sizeof(_sch), &_sch); + if (sch == NULL) { + duprintf("Dropping invalid SCTP packet.\n"); + *hotdrop = 1; + return 0; + } + + duprintf("Chunk num: %d\toffset: %d\ttype: %d\tlength: %d\tflags: %x\n", + ++i, offset, sch->type, htons(sch->length), sch->flags); + + offset += (htons(sch->length) + 3) & ~3; + + duprintf("skb->len: %d\toffset: %d\n", skb->len, offset); + + if (SCTP_CHUNKMAP_IS_SET(chunkmap, sch->type)) { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ANY: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return 1; + } + break; + + case SCTP_CHUNK_MATCH_ALL: + if (match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + SCTP_CHUNKMAP_CLEAR(chunkmapcopy, sch->type); + } + break; + + case SCTP_CHUNK_MATCH_ONLY: + if (!match_flags(flag_info, flag_count, + sch->type, sch->flags)) { + return 0; + } + break; + } + } else { + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ONLY: + return 0; + } + } + } while (offset < skb->len); + + switch (chunk_match_type) { + case SCTP_CHUNK_MATCH_ALL: + return SCTP_CHUNKMAP_IS_CLEAR(chunkmap); + case SCTP_CHUNK_MATCH_ANY: + return 0; + case SCTP_CHUNK_MATCH_ONLY: + return 1; + } + + /* This will never be reached, but required to stop compiler whine */ + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_sctp_info *info; + sctp_sctphdr_t _sh, *sh; + + info = (const struct ipt_sctp_info *)matchinfo; + + if (offset) { + duprintf("Dropping non-first fragment.. 
FIXME\n"); + return 0; + } + + sh = skb_header_pointer(skb, skb->nh.iph->ihl*4, sizeof(_sh), &_sh); + if (sh == NULL) { + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + duprintf("spt: %d\tdpt: %d\n", ntohs(sh->source), ntohs(sh->dest)); + + return SCCHECK(((ntohs(sh->source) >= info->spts[0]) + && (ntohs(sh->source) <= info->spts[1])), + IPT_SCTP_SRC_PORTS, info->flags, info->invflags) + && SCCHECK(((ntohs(sh->dest) >= info->dpts[0]) + && (ntohs(sh->dest) <= info->dpts[1])), + IPT_SCTP_DEST_PORTS, info->flags, info->invflags) + && SCCHECK(match_packet(skb, info->chunkmap, info->chunk_match_type, + info->flag_info, info->flag_count, + hotdrop), + IPT_SCTP_CHUNK_TYPES, info->flags, info->invflags); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ipt_sctp_info *info; + + info = (const struct ipt_sctp_info *)matchinfo; + + return ip->proto == IPPROTO_SCTP + && !(ip->invflags & IPT_INV_PROTO) + && matchsize == IPT_ALIGN(sizeof(struct ipt_sctp_info)) + && !(info->flags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~IPT_SCTP_VALID_FLAGS) + && !(info->invflags & ~info->flags) + && ((!(info->flags & IPT_SCTP_CHUNK_TYPES)) || + (info->chunk_match_type & + (SCTP_CHUNK_MATCH_ALL + | SCTP_CHUNK_MATCH_ANY + | SCTP_CHUNK_MATCH_ONLY))); +} + +static struct ipt_match sctp_match = +{ + .list = { NULL, NULL}, + .name = "sctp", + .match = &match, + .checkentry = &checkentry, + .destroy = NULL, + .me = THIS_MODULE +}; + +static int __init init(void) +{ + return ipt_register_match(&sctp_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&sctp_match); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Kiran Kumar Immidi"); +MODULE_DESCRIPTION("Match for SCTP protocol packets"); + diff --git a/net/ipv4/netfilter/ipt_state.c b/net/ipv4/netfilter/ipt_state.c new file mode 100644 index 000000000000..b1511b97ea5f --- /dev/null +++ b/net/ipv4/netfilter/ipt_state.c @@ -0,0 +1,74 @@ +/* Kernel module to match connection tracking information. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
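The SCCHECK() macro used by the SCTP match above packs "skip the test unless this option was requested, otherwise apply it with optional inversion" into a single expression. Spelled out as an equivalent plain function, for illustration only:

static int ex_sccheck(int cond, unsigned int option,
                      unsigned int flags, unsigned int invflags)
{
        if (!(flags & option))                  /* option not requested */
                return 1;
        return !!cond ^ !!(invflags & option);  /* test, possibly inverted */
}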
+ */ + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Rusty Russell "); +MODULE_DESCRIPTION("iptables connection tracking state match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_state_info *sinfo = matchinfo; + enum ip_conntrack_info ctinfo; + unsigned int statebit; + + if (skb->nfct == &ip_conntrack_untracked.ct_general) + statebit = IPT_STATE_UNTRACKED; + else if (!ip_conntrack_get(skb, &ctinfo)) + statebit = IPT_STATE_INVALID; + else + statebit = IPT_STATE_BIT(ctinfo); + + return (sinfo->statemask & statebit); +} + +static int check(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_state_info))) + return 0; + + return 1; +} + +static struct ipt_match state_match = { + .name = "state", + .match = &match, + .checkentry = &check, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + need_ip_conntrack(); + return ipt_register_match(&state_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&state_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_tcpmss.c b/net/ipv4/netfilter/ipt_tcpmss.c new file mode 100644 index 000000000000..4dc9b16ab4a3 --- /dev/null +++ b/net/ipv4/netfilter/ipt_tcpmss.c @@ -0,0 +1,127 @@ +/* Kernel module to match TCP MSS values. */ + +/* Copyright (C) 2000 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +#define TH_SYN 0x02 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("iptables TCP MSS match module"); + +/* Returns 1 if the mss option is set and matched by the range, 0 otherwise */ +static inline int +mssoption_match(u_int16_t min, u_int16_t max, + const struct sk_buff *skb, + int invert, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u8 _opt[15 * 4 - sizeof(_tcph)], *op; + unsigned int i, optlen; + + /* If we don't have the whole header, drop packet. */ + th = skb_header_pointer(skb, skb->nh.iph->ihl * 4, + sizeof(_tcph), &_tcph); + if (th == NULL) + goto dropit; + + /* Malformed. */ + if (th->doff*4 < sizeof(*th)) + goto dropit; + + optlen = th->doff*4 - sizeof(*th); + if (!optlen) + goto out; + + /* Truncated options. 
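The scan that follows walks the TCP option block looking for kind TCPOPT_MSS (2) with length TCPOLEN_MSS (4) and reads the value big-endian. As a worked example using the bytes a typical Ethernet SYN carries:

#include <stdio.h>

int main(void)
{
        /* kind 2, length 4, value 0x05b4: the classic 1460-byte MSS */
        unsigned char op[] = { 0x02, 0x04, 0x05, 0xb4 };
        unsigned int mssval = (op[2] << 8) | op[3];

        printf("mss=%u\n", mssval);     /* prints mss=1460 */
        return 0;
}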
*/ + op = skb_header_pointer(skb, skb->nh.iph->ihl * 4 + sizeof(*th), + optlen, _opt); + if (op == NULL) + goto dropit; + + for (i = 0; i < optlen; ) { + if (op[i] == TCPOPT_MSS + && (optlen - i) >= TCPOLEN_MSS + && op[i+1] == TCPOLEN_MSS) { + u_int16_t mssval; + + mssval = (op[i+2] << 8) | op[i+3]; + + return (mssval >= min && mssval <= max) ^ invert; + } + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } +out: + return invert; + + dropit: + *hotdrop = 1; + return 0; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_tcpmss_match_info *info = matchinfo; + + return mssoption_match(info->mss_min, info->mss_max, skb, + info->invert, hotdrop); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_tcpmss_match_info))) + return 0; + + /* Must specify -p tcp */ + if (ip->proto != IPPROTO_TCP || (ip->invflags & IPT_INV_PROTO)) { + printk("tcpmss: Only works on TCP packets\n"); + return 0; + } + + return 1; +} + +static struct ipt_match tcpmss_match = { + .name = "tcpmss", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&tcpmss_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&tcpmss_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_tos.c b/net/ipv4/netfilter/ipt_tos.c new file mode 100644 index 000000000000..086a1bb61e3e --- /dev/null +++ b/net/ipv4/netfilter/ipt_tos.c @@ -0,0 +1,64 @@ +/* Kernel module to match TOS values. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("iptables TOS match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_tos_info *info = matchinfo; + + return (skb->nh.iph->tos == info->tos) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_tos_info))) + return 0; + + return 1; +} + +static struct ipt_match tos_match = { + .name = "tos", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&tos_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&tos_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ipt_ttl.c b/net/ipv4/netfilter/ipt_ttl.c new file mode 100644 index 000000000000..219aa9de88cc --- /dev/null +++ b/net/ipv4/netfilter/ipt_ttl.c @@ -0,0 +1,79 @@ +/* IP tables module for matching the value of the TTL + * + * ipt_ttl.c,v 1.5 2000/11/13 11:16:08 laforge Exp + * + * (C) 2000,2001 by Harald Welte + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include + +#include +#include + +MODULE_AUTHOR("Harald Welte "); +MODULE_DESCRIPTION("IP tables TTL matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, int *hotdrop) +{ + const struct ipt_ttl_info *info = matchinfo; + + switch (info->mode) { + case IPT_TTL_EQ: + return (skb->nh.iph->ttl == info->ttl); + break; + case IPT_TTL_NE: + return (!(skb->nh.iph->ttl == info->ttl)); + break; + case IPT_TTL_LT: + return (skb->nh.iph->ttl < info->ttl); + break; + case IPT_TTL_GT: + return (skb->nh.iph->ttl > info->ttl); + break; + default: + printk(KERN_WARNING "ipt_ttl: unknown mode %d\n", + info->mode); + return 0; + } + + return 0; +} + +static int checkentry(const char *tablename, const struct ipt_ip *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_ttl_info))) + return 0; + + return 1; +} + +static struct ipt_match ttl_match = { + .name = "ttl", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&ttl_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&ttl_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c new file mode 100644 index 000000000000..260a4f0a2a90 --- /dev/null +++ b/net/ipv4/netfilter/iptable_filter.c @@ -0,0 +1,194 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_IP_LOCAL_IN) | (1 << NF_IP_FORWARD) | (1 << NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[3]; + struct ipt_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ipt_standard) * 3 + sizeof(struct ipt_error), + { [NF_IP_LOCAL_IN] = 0, + [NF_IP_FORWARD] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + { [NF_IP_LOCAL_IN] = 0, + [NF_IP_FORWARD] = sizeof(struct ipt_standard), + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ipt_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + return ipt_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_FILTER, + }, + { + .hook = ipt_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_FILTER, + }, + { + .hook = ipt_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_FILTER, + }, +}; + +/* Default to forward because I got too much mail already. 
*/ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ipt_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c new file mode 100644 index 000000000000..160eb11b6e2f --- /dev/null +++ b/net/ipv4/netfilter/iptable_mangle.c @@ -0,0 +1,260 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Extended to all five netfilter hooks by Brad Chapman & Harald Welte + */ +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("iptables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | \ + (1 << NF_IP_LOCAL_IN) | \ + (1 << NF_IP_FORWARD) | \ + (1 << NF_IP_LOCAL_OUT) | \ + (1 << NF_IP_POST_ROUTING)) + +/* Ouch - five different hooks? Maybe this should be a config option..... 
-- BC */ +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[5]; + struct ipt_error term; +} initial_table __initdata += { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ipt_standard) * 5 + sizeof(struct ipt_error), + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 }, + { [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_IN] = sizeof(struct ipt_standard), + [NF_IP_FORWARD] = sizeof(struct ipt_standard) * 2, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) * 3, + [NF_IP_POST_ROUTING] = sizeof(struct ipt_standard) * 4 }, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_IN */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_standard), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + }, + /* ERROR */ + { { { { 0 }, { 0 }, { 0 }, { 0 }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ipt_entry), + sizeof(struct ipt_error), + 0, { 0, 0 }, { } }, + { { { { IPT_ALIGN(sizeof(struct ipt_error_target)), IPT_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ipt_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_route_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static unsigned int +ipt_local_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int ret; + u_int8_t tos; + u_int32_t saddr, daddr; + unsigned long nfmark; + + /* root is playing with raw sockets. 
*/ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ipt_hook: happy cracking.\n"); + return NF_ACCEPT; + } + + /* Save things which could affect route */ + nfmark = (*pskb)->nfmark; + saddr = (*pskb)->nh.iph->saddr; + daddr = (*pskb)->nh.iph->daddr; + tos = (*pskb)->nh.iph->tos; + + ret = ipt_do_table(pskb, hook, in, out, &packet_mangler, NULL); + /* Reroute for ANY change. */ + if (ret != NF_DROP && ret != NF_STOLEN && ret != NF_QUEUE + && ((*pskb)->nh.iph->saddr != saddr + || (*pskb)->nh.iph->daddr != daddr +#ifdef CONFIG_IP_ROUTE_FWMARK + || (*pskb)->nfmark != nfmark +#endif + || (*pskb)->nh.iph->tos != tos)) + return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP; + + return ret; +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_IN, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_FORWARD, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_MANGLE, + }, + { + .hook = ipt_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_POST_ROUTING, + .priority = NF_IP_PRI_MANGLE, + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_mangler, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + ret = nf_register_hook(&ipt_ops[3]); + if (ret < 0) + goto cleanup_hook2; + + ret = nf_register_hook(&ipt_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: + nf_unregister_hook(&ipt_ops[3]); + cleanup_hook2: + nf_unregister_hook(&ipt_ops[2]); + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_mangler); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_mangler); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c new file mode 100644 index 000000000000..01b4a3c814d3 --- /dev/null +++ b/net/ipv4/netfilter/iptable_raw.c @@ -0,0 +1,156 @@ +/* + * 'raw' table, which is the very first hooked in at PRE_ROUTING and LOCAL_OUT . 
+ * + * Copyright (C) 2003 Jozsef Kadlecsik + */ +#include +#include + +#define RAW_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[2]; + struct ipt_error term; +} initial_table __initdata = { + .repl = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .num_entries = 3, + .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), + .hook_entry = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + .underflow = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + }, + .entries = { + /* PRE_ROUTING */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + + /* LOCAL_OUT */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + }, + /* ERROR */ + .term = { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_error), + }, + .target = { + .target = { + .u = { + .user = { + .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), + .name = IPT_ERROR_TARGET, + }, + }, + }, + .errorname = "ERROR", + }, + } +}; + +static struct ipt_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ipt_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_raw, NULL); +} + +/* 'raw' is the very first table. */ +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_hook, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_RAW + }, + { + .hook = ipt_hook, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_RAW + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_raw, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_raw); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_raw); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c new file mode 100644 index 000000000000..912bbcc7f415 --- /dev/null +++ b/net/ipv4/proc.c @@ -0,0 +1,382 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. It is mainly used for debugging and + * statistics. 
+ * + * Version: $Id: proc.c,v 1.45 2001/05/16 16:45:35 davem Exp $ + * + * Authors: Fred N. van Kempen, + * Gerald J. Heim, + * Fred Baumgarten, + * Erik Schoenfelder, + * + * Fixes: + * Alan Cox : UDP sockets show the rxqueue/txqueue + * using hint flag for the netinfo. + * Pauline Middelink : identd support + * Alan Cox : Make /proc safer. + * Erik Schoenfelder : /proc/net/snmp + * Alan Cox : Handle dead sockets properly. + * Gerhard Koerting : Show both timers + * Alan Cox : Allow inode to be NULL (kernel socket) + * Andi Kleen : Add support for open_requests and + * split functions for more readibility. + * Andi Kleen : Add support for /proc/net/netstat + * Arnaldo C. Melo : Convert to seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int fold_prot_inuse(struct proto *proto) +{ + int res = 0; + int cpu; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + res += proto->stats[cpu].inuse; + + return res; +} + +/* + * Report socket allocation statistics [mea@utu.fi] + */ +static int sockstat_seq_show(struct seq_file *seq, void *v) +{ + /* From net/socket.c */ + extern void socket_seq_show(struct seq_file *seq); + + socket_seq_show(seq); + seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %d\n", + fold_prot_inuse(&tcp_prot), atomic_read(&tcp_orphan_count), + tcp_tw_count, atomic_read(&tcp_sockets_allocated), + atomic_read(&tcp_memory_allocated)); + seq_printf(seq, "UDP: inuse %d\n", fold_prot_inuse(&udp_prot)); + seq_printf(seq, "RAW: inuse %d\n", fold_prot_inuse(&raw_prot)); + seq_printf(seq, "FRAG: inuse %d memory %d\n", ip_frag_nqueues, + atomic_read(&ip_frag_mem)); + return 0; +} + +static int sockstat_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sockstat_seq_show, NULL); +} + +static struct file_operations sockstat_seq_fops = { + .owner = THIS_MODULE, + .open = sockstat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static unsigned long +fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + res += *(((unsigned long *) per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *) per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} + +/* snmp items */ +static struct snmp_mib snmp4_ipstats_list[] = { + SNMP_MIB_ITEM("InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("InHdrErrors", IPSTATS_MIB_INHDRERRORS), + SNMP_MIB_ITEM("InAddrErrors", IPSTATS_MIB_INADDRERRORS), + SNMP_MIB_ITEM("ForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), + SNMP_MIB_ITEM("InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), + SNMP_MIB_ITEM("InDiscards", IPSTATS_MIB_INDISCARDS), + SNMP_MIB_ITEM("InDelivers", IPSTATS_MIB_INDELIVERS), + SNMP_MIB_ITEM("OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("OutDiscards", IPSTATS_MIB_OUTDISCARDS), + SNMP_MIB_ITEM("OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), + SNMP_MIB_ITEM("ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), + SNMP_MIB_ITEM("ReasmReqds", IPSTATS_MIB_REASMREQDS), + SNMP_MIB_ITEM("ReasmOKs", IPSTATS_MIB_REASMOKS), + SNMP_MIB_ITEM("ReasmFails", IPSTATS_MIB_REASMFAILS), + SNMP_MIB_ITEM("FragOKs", IPSTATS_MIB_FRAGOKS), + SNMP_MIB_ITEM("FragFails", IPSTATS_MIB_FRAGFAILS), + 
SNMP_MIB_ITEM("FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_icmp_list[] = { + SNMP_MIB_ITEM("InMsgs", ICMP_MIB_INMSGS), + SNMP_MIB_ITEM("InErrors", ICMP_MIB_INERRORS), + SNMP_MIB_ITEM("InDestUnreachs", ICMP_MIB_INDESTUNREACHS), + SNMP_MIB_ITEM("InTimeExcds", ICMP_MIB_INTIMEEXCDS), + SNMP_MIB_ITEM("InParmProbs", ICMP_MIB_INPARMPROBS), + SNMP_MIB_ITEM("InSrcQuenchs", ICMP_MIB_INSRCQUENCHS), + SNMP_MIB_ITEM("InRedirects", ICMP_MIB_INREDIRECTS), + SNMP_MIB_ITEM("InEchos", ICMP_MIB_INECHOS), + SNMP_MIB_ITEM("InEchoReps", ICMP_MIB_INECHOREPS), + SNMP_MIB_ITEM("InTimestamps", ICMP_MIB_INTIMESTAMPS), + SNMP_MIB_ITEM("InTimestampReps", ICMP_MIB_INTIMESTAMPREPS), + SNMP_MIB_ITEM("InAddrMasks", ICMP_MIB_INADDRMASKS), + SNMP_MIB_ITEM("InAddrMaskReps", ICMP_MIB_INADDRMASKREPS), + SNMP_MIB_ITEM("OutMsgs", ICMP_MIB_OUTMSGS), + SNMP_MIB_ITEM("OutErrors", ICMP_MIB_OUTERRORS), + SNMP_MIB_ITEM("OutDestUnreachs", ICMP_MIB_OUTDESTUNREACHS), + SNMP_MIB_ITEM("OutTimeExcds", ICMP_MIB_OUTTIMEEXCDS), + SNMP_MIB_ITEM("OutParmProbs", ICMP_MIB_OUTPARMPROBS), + SNMP_MIB_ITEM("OutSrcQuenchs", ICMP_MIB_OUTSRCQUENCHS), + SNMP_MIB_ITEM("OutRedirects", ICMP_MIB_OUTREDIRECTS), + SNMP_MIB_ITEM("OutEchos", ICMP_MIB_OUTECHOS), + SNMP_MIB_ITEM("OutEchoReps", ICMP_MIB_OUTECHOREPS), + SNMP_MIB_ITEM("OutTimestamps", ICMP_MIB_OUTTIMESTAMPS), + SNMP_MIB_ITEM("OutTimestampReps", ICMP_MIB_OUTTIMESTAMPREPS), + SNMP_MIB_ITEM("OutAddrMasks", ICMP_MIB_OUTADDRMASKS), + SNMP_MIB_ITEM("OutAddrMaskReps", ICMP_MIB_OUTADDRMASKREPS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_tcp_list[] = { + SNMP_MIB_ITEM("RtoAlgorithm", TCP_MIB_RTOALGORITHM), + SNMP_MIB_ITEM("RtoMin", TCP_MIB_RTOMIN), + SNMP_MIB_ITEM("RtoMax", TCP_MIB_RTOMAX), + SNMP_MIB_ITEM("MaxConn", TCP_MIB_MAXCONN), + SNMP_MIB_ITEM("ActiveOpens", TCP_MIB_ACTIVEOPENS), + SNMP_MIB_ITEM("PassiveOpens", TCP_MIB_PASSIVEOPENS), + SNMP_MIB_ITEM("AttemptFails", TCP_MIB_ATTEMPTFAILS), + SNMP_MIB_ITEM("EstabResets", TCP_MIB_ESTABRESETS), + SNMP_MIB_ITEM("CurrEstab", TCP_MIB_CURRESTAB), + SNMP_MIB_ITEM("InSegs", TCP_MIB_INSEGS), + SNMP_MIB_ITEM("OutSegs", TCP_MIB_OUTSEGS), + SNMP_MIB_ITEM("RetransSegs", TCP_MIB_RETRANSSEGS), + SNMP_MIB_ITEM("InErrs", TCP_MIB_INERRS), + SNMP_MIB_ITEM("OutRsts", TCP_MIB_OUTRSTS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_udp_list[] = { + SNMP_MIB_ITEM("InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp4_net_list[] = { + SNMP_MIB_ITEM("SyncookiesSent", LINUX_MIB_SYNCOOKIESSENT), + SNMP_MIB_ITEM("SyncookiesRecv", LINUX_MIB_SYNCOOKIESRECV), + SNMP_MIB_ITEM("SyncookiesFailed", LINUX_MIB_SYNCOOKIESFAILED), + SNMP_MIB_ITEM("EmbryonicRsts", LINUX_MIB_EMBRYONICRSTS), + SNMP_MIB_ITEM("PruneCalled", LINUX_MIB_PRUNECALLED), + SNMP_MIB_ITEM("RcvPruned", LINUX_MIB_RCVPRUNED), + SNMP_MIB_ITEM("OfoPruned", LINUX_MIB_OFOPRUNED), + SNMP_MIB_ITEM("OutOfWindowIcmps", LINUX_MIB_OUTOFWINDOWICMPS), + SNMP_MIB_ITEM("LockDroppedIcmps", LINUX_MIB_LOCKDROPPEDICMPS), + SNMP_MIB_ITEM("ArpFilter", LINUX_MIB_ARPFILTER), + SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), + SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), + SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), + SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED), + SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), + SNMP_MIB_ITEM("PAWSEstab", 
LINUX_MIB_PAWSESTABREJECTED), + SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), + SNMP_MIB_ITEM("DelayedACKLocked", LINUX_MIB_DELAYEDACKLOCKED), + SNMP_MIB_ITEM("DelayedACKLost", LINUX_MIB_DELAYEDACKLOST), + SNMP_MIB_ITEM("ListenOverflows", LINUX_MIB_LISTENOVERFLOWS), + SNMP_MIB_ITEM("ListenDrops", LINUX_MIB_LISTENDROPS), + SNMP_MIB_ITEM("TCPPrequeued", LINUX_MIB_TCPPREQUEUED), + SNMP_MIB_ITEM("TCPDirectCopyFromBacklog", LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG), + SNMP_MIB_ITEM("TCPDirectCopyFromPrequeue", LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE), + SNMP_MIB_ITEM("TCPPrequeueDropped", LINUX_MIB_TCPPREQUEUEDROPPED), + SNMP_MIB_ITEM("TCPHPHits", LINUX_MIB_TCPHPHITS), + SNMP_MIB_ITEM("TCPHPHitsToUser", LINUX_MIB_TCPHPHITSTOUSER), + SNMP_MIB_ITEM("TCPPureAcks", LINUX_MIB_TCPPUREACKS), + SNMP_MIB_ITEM("TCPHPAcks", LINUX_MIB_TCPHPACKS), + SNMP_MIB_ITEM("TCPRenoRecovery", LINUX_MIB_TCPRENORECOVERY), + SNMP_MIB_ITEM("TCPSackRecovery", LINUX_MIB_TCPSACKRECOVERY), + SNMP_MIB_ITEM("TCPSACKReneging", LINUX_MIB_TCPSACKRENEGING), + SNMP_MIB_ITEM("TCPFACKReorder", LINUX_MIB_TCPFACKREORDER), + SNMP_MIB_ITEM("TCPSACKReorder", LINUX_MIB_TCPSACKREORDER), + SNMP_MIB_ITEM("TCPRenoReorder", LINUX_MIB_TCPRENOREORDER), + SNMP_MIB_ITEM("TCPTSReorder", LINUX_MIB_TCPTSREORDER), + SNMP_MIB_ITEM("TCPFullUndo", LINUX_MIB_TCPFULLUNDO), + SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), + SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), + SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), + SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), + SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), + SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), + SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), + SNMP_MIB_ITEM("TCPLossFailures", LINUX_MIB_TCPLOSSFAILURES), + SNMP_MIB_ITEM("TCPFastRetrans", LINUX_MIB_TCPFASTRETRANS), + SNMP_MIB_ITEM("TCPForwardRetrans", LINUX_MIB_TCPFORWARDRETRANS), + SNMP_MIB_ITEM("TCPSlowStartRetrans", LINUX_MIB_TCPSLOWSTARTRETRANS), + SNMP_MIB_ITEM("TCPTimeouts", LINUX_MIB_TCPTIMEOUTS), + SNMP_MIB_ITEM("TCPRenoRecoveryFail", LINUX_MIB_TCPRENORECOVERYFAIL), + SNMP_MIB_ITEM("TCPSackRecoveryFail", LINUX_MIB_TCPSACKRECOVERYFAIL), + SNMP_MIB_ITEM("TCPSchedulerFailed", LINUX_MIB_TCPSCHEDULERFAILED), + SNMP_MIB_ITEM("TCPRcvCollapsed", LINUX_MIB_TCPRCVCOLLAPSED), + SNMP_MIB_ITEM("TCPDSACKOldSent", LINUX_MIB_TCPDSACKOLDSENT), + SNMP_MIB_ITEM("TCPDSACKOfoSent", LINUX_MIB_TCPDSACKOFOSENT), + SNMP_MIB_ITEM("TCPDSACKRecv", LINUX_MIB_TCPDSACKRECV), + SNMP_MIB_ITEM("TCPDSACKOfoRecv", LINUX_MIB_TCPDSACKOFORECV), + SNMP_MIB_ITEM("TCPAbortOnSyn", LINUX_MIB_TCPABORTONSYN), + SNMP_MIB_ITEM("TCPAbortOnData", LINUX_MIB_TCPABORTONDATA), + SNMP_MIB_ITEM("TCPAbortOnClose", LINUX_MIB_TCPABORTONCLOSE), + SNMP_MIB_ITEM("TCPAbortOnMemory", LINUX_MIB_TCPABORTONMEMORY), + SNMP_MIB_ITEM("TCPAbortOnTimeout", LINUX_MIB_TCPABORTONTIMEOUT), + SNMP_MIB_ITEM("TCPAbortOnLinger", LINUX_MIB_TCPABORTONLINGER), + SNMP_MIB_ITEM("TCPAbortFailed", LINUX_MIB_TCPABORTFAILED), + SNMP_MIB_ITEM("TCPMemoryPressures", LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_SENTINEL +}; + +/* + * Called from the PROCfs module. This outputs /proc/net/snmp. + */ +static int snmp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + seq_puts(seq, "Ip: Forwarding DefaultTTL"); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_ipstats_list[i].name); + + seq_printf(seq, "\nIp: %d %d", + ipv4_devconf.forwarding ? 
1 : 2, sysctl_ip_default_ttl); + + for (i = 0; snmp4_ipstats_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) ip_statistics, + snmp4_ipstats_list[i].entry)); + + seq_puts(seq, "\nIcmp:"); + for (i = 0; snmp4_icmp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_icmp_list[i].name); + + seq_puts(seq, "\nIcmp:"); + for (i = 0; snmp4_icmp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) icmp_statistics, + snmp4_icmp_list[i].entry)); + + seq_puts(seq, "\nTcp:"); + for (i = 0; snmp4_tcp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_tcp_list[i].name); + + seq_puts(seq, "\nTcp:"); + for (i = 0; snmp4_tcp_list[i].name != NULL; i++) { + /* MaxConn field is signed, RFC 2012 */ + if (snmp4_tcp_list[i].entry == TCP_MIB_MAXCONN) + seq_printf(seq, " %ld", + fold_field((void **) tcp_statistics, + snmp4_tcp_list[i].entry)); + else + seq_printf(seq, " %lu", + fold_field((void **) tcp_statistics, + snmp4_tcp_list[i].entry)); + } + + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_udp_list[i].name); + + seq_puts(seq, "\nUdp:"); + for (i = 0; snmp4_udp_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) udp_statistics, + snmp4_udp_list[i].entry)); + + seq_putc(seq, '\n'); + return 0; +} + +static int snmp_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, snmp_seq_show, NULL); +} + +static struct file_operations snmp_seq_fops = { + .owner = THIS_MODULE, + .open = snmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Output /proc/net/netstat + */ +static int netstat_seq_show(struct seq_file *seq, void *v) +{ + int i; + + seq_puts(seq, "TcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %s", snmp4_net_list[i].name); + + seq_puts(seq, "\nTcpExt:"); + for (i = 0; snmp4_net_list[i].name != NULL; i++) + seq_printf(seq, " %lu", + fold_field((void **) net_statistics, + snmp4_net_list[i].entry)); + + seq_putc(seq, '\n'); + return 0; +} + +static int netstat_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, netstat_seq_show, NULL); +} + +static struct file_operations netstat_seq_fops = { + .owner = THIS_MODULE, + .open = netstat_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int __init ip_misc_proc_init(void) +{ + int rc = 0; + + if (!proc_net_fops_create("netstat", S_IRUGO, &netstat_seq_fops)) + goto out_netstat; + + if (!proc_net_fops_create("snmp", S_IRUGO, &snmp_seq_fops)) + goto out_snmp; + + if (!proc_net_fops_create("sockstat", S_IRUGO, &sockstat_seq_fops)) + goto out_sockstat; +out: + return rc; +out_sockstat: + proc_net_remove("snmp"); +out_snmp: + proc_net_remove("netstat"); +out_netstat: + rc = -ENOMEM; + goto out; +} + diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c new file mode 100644 index 000000000000..90a587cacaa4 --- /dev/null +++ b/net/ipv4/protocol.c @@ -0,0 +1,101 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * INET protocol dispatch tables. + * + * Version: $Id: protocol.c,v 1.14 2001/05/18 02:25:49 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * + * Fixes: + * Alan Cox : Ahah! udp icmp errors don't work because + * udp_err is never called! 
+ * Alan Cox : Added new fields for init and ready for + * proper fragmentation (_NO_ 4K limits!) + * Richard Colella : Hang on hash collision + * Vince Laviano : Modified inet_del_protocol() to correctly + * maintain copy bit. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct net_protocol *inet_protos[MAX_INET_PROTOS]; +static DEFINE_SPINLOCK(inet_proto_lock); + +/* + * Add a protocol handler to the hash tables + */ + +int inet_add_protocol(struct net_protocol *prot, unsigned char protocol) +{ + int hash, ret; + + hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet_proto_lock); + if (inet_protos[hash]) { + ret = -1; + } else { + inet_protos[hash] = prot; + ret = 0; + } + spin_unlock_bh(&inet_proto_lock); + + return ret; +} + +/* + * Remove a protocol from the hash tables. + */ + +int inet_del_protocol(struct net_protocol *prot, unsigned char protocol) +{ + int hash, ret; + + hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet_proto_lock); + if (inet_protos[hash] == prot) { + inet_protos[hash] = NULL; + ret = 0; + } else { + ret = -1; + } + spin_unlock_bh(&inet_proto_lock); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(inet_add_protocol); +EXPORT_SYMBOL(inet_del_protocol); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c new file mode 100644 index 000000000000..93624a32eb9a --- /dev/null +++ b/net/ipv4/raw.c @@ -0,0 +1,888 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * RAW - implementation of IP "raw" sockets. + * + * Version: $Id: raw.c,v 1.64 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * + * Fixes: + * Alan Cox : verify_area() fixed up + * Alan Cox : ICMP error handling + * Alan Cox : EMSGSIZE if you send too big a packet + * Alan Cox : Now uses generic datagrams and shared + * skbuff library. No more peek crashes, + * no more backlogs + * Alan Cox : Checks sk->broadcast. + * Alan Cox : Uses skb_free_datagram/skb_copy_datagram + * Alan Cox : Raw passes ip options too + * Alan Cox : Setsocketopt added + * Alan Cox : Fixed error return for broadcasts + * Alan Cox : Removed wake_up calls + * Alan Cox : Use ttl/tos + * Alan Cox : Cleaned up old debugging + * Alan Cox : Use new kernel side addresses + * Arnt Gulbrandsen : Fixed MSG_DONTROUTE in raw sockets. + * Alan Cox : BSD style RAW socket demultiplexing. + * Alan Cox : Beginnings of mrouted support. + * Alan Cox : Added IP_HDRINCL option. + * Alan Cox : Skip broadcast check if BSDism set. + * David S. Miller : New socket lookup architecture. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE]; +DEFINE_RWLOCK(raw_v4_lock); + +static void raw_v4_hash(struct sock *sk) +{ + struct hlist_head *head = &raw_v4_htable[inet_sk(sk)->num & + (RAWV4_HTABLE_SIZE - 1)]; + + write_lock_bh(&raw_v4_lock); + sk_add_node(sk, head); + sock_prot_inc_use(sk->sk_prot); + write_unlock_bh(&raw_v4_lock); +} + +static void raw_v4_unhash(struct sock *sk) +{ + write_lock_bh(&raw_v4_lock); + if (sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(&raw_v4_lock); +} + +struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, + unsigned long raddr, unsigned long laddr, + int dif) +{ + struct hlist_node *node; + + sk_for_each_from(sk, node) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == num && + !(inet->daddr && inet->daddr != raddr) && + !(inet->rcv_saddr && inet->rcv_saddr != laddr) && + !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) + goto found; /* gotcha */ + } + sk = NULL; +found: + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmp_filter(struct sock *sk, struct sk_buff *skb) +{ + int type; + + if (!pskb_may_pull(skb, sizeof(struct icmphdr))) + return 1; + + type = skb->h.icmph->type; + if (type < 32) { + __u32 data = raw_sk(sk)->filter.data; + + return ((1 << type) & data) != 0; + } + + /* Do not block unknown ICMP types */ + return 0; +} + +/* IP input processing comes here for RAW socket delivery. + * Caller owns SKB, so we must make clones. + * + * RFC 1122: SHOULD pass TOS value up to the transport layer. + * -> It does. And not only TOS, but all IP header. + */ +void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash) +{ + struct sock *sk; + struct hlist_head *head; + + read_lock(&raw_v4_lock); + head = &raw_v4_htable[hash]; + if (hlist_empty(head)) + goto out; + sk = __raw_v4_lookup(__sk_head(head), iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + + while (sk) { + if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! */ + if (clone) + raw_rcv(sk, clone); + } + sk = __raw_v4_lookup(sk_next(sk), iph->protocol, + iph->saddr, iph->daddr, + skb->dev->ifindex); + } +out: + read_unlock(&raw_v4_lock); +} + +void raw_err (struct sock *sk, struct sk_buff *skb, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int err = 0; + int harderr = 0; + + /* Report error on raw socket, if: + 1. User requested ip_recverr. + 2. Socket is connected (otherwise the error indication + is useless without ip_recverr and error is hard. 
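For illustration only (not part of this patch): a minimal userspace sketch of how the per-socket filter consumed by icmp_filter() above gets installed, via the ICMP_FILTER option on SOL_RAW that raw_setsockopt() handles later in this file. A set bit in filter.data blocks that ICMP type, so masking out everything except ICMP_ECHOREPLY delivers only echo replies. The header names and error handling are assumptions of the sketch, not taken from the patch.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/icmp.h>		/* struct icmp_filter, ICMP_FILTER, ICMP_ECHOREPLY */

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_ICMP);	/* needs CAP_NET_RAW */
	struct icmp_filter filt = { .data = ~(1U << ICMP_ECHOREPLY) };

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	/* A set bit blocks the corresponding ICMP type (see icmp_filter() above). */
	if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0)
		perror("setsockopt(ICMP_FILTER)");

	/* recvfrom(fd, ...) would now see ICMP echo replies only. */
	return 0;
}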
+ */ + if (!inet->recverr && sk->sk_state != TCP_ESTABLISHED) + return; + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + return; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + err = EHOSTUNREACH; + if (code > NR_ICMP_UNREACH) + break; + err = icmp_err_convert[code].errno; + harderr = icmp_err_convert[code].fatal; + if (code == ICMP_FRAG_NEEDED) { + harderr = inet->pmtudisc != IP_PMTUDISC_DONT; + err = EMSGSIZE; + } + } + + if (inet->recverr) { + struct iphdr *iph = (struct iphdr*)skb->data; + u8 *payload = skb->data + (iph->ihl << 2); + + if (inet->hdrincl) + payload = skb->data; + ip_icmp_error(sk, skb, err, 0, info, payload); + } + + if (inet->recverr || harderr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } +} + +static int raw_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + /* Charge it to the socket. */ + + if (sock_queue_rcv_skb(sk, skb) < 0) { + /* FIXME: increment a raw drops counter here */ + kfree_skb(skb); + return NET_RX_DROP; + } + + return NET_RX_SUCCESS; +} + +int raw_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_push(skb, skb->data - skb->nh.raw); + + raw_rcv_skb(sk, skb); + return 0; +} + +static int raw_send_hdrinc(struct sock *sk, void *from, int length, + struct rtable *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + int hh_len; + struct iphdr *iph; + struct sk_buff *skb; + int err; + + if (length > rt->u.dst.dev->mtu) { + ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, + rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hh_len); + + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; + + skb->h.raw = skb->nh.raw; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + /* We don't modify invalid header */ + if (length >= sizeof(*iph) && iph->ihl * 4 <= length) { + if (!iph->saddr) + iph->saddr = rt->rt_src; + iph->check = 0; + iph->tot_len = htons(length); + if (!iph->id) + ip_select_ident(iph, &rt->u.dst, NULL); + + iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); + } + + err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; +out: + return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void raw_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + int i; + + if (!msg->msg_iov) + return; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl->proto) { + case IPPROTO_ICMP: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. 
*/ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + get_user(fl->fl_icmp_type, type); + __get_user(fl->fl_icmp_code, code); + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } +} + +static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipcm_cookie ipc; + struct rtable *rt = NULL; + int free = 0; + u32 daddr; + u32 saddr; + u8 tos; + int err; + + err = -EMSGSIZE; + if (len < 0 || len > 0xFFFF) + goto out; + + /* + * Check the flags. + */ + + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) /* Mirror BSD error message */ + goto out; /* compatibility */ + + /* + * Get and verify the address. + */ + + if (msg->msg_namelen) { + struct sockaddr_in *usin = (struct sockaddr_in*)msg->msg_name; + err = -EINVAL; + if (msg->msg_namelen < sizeof(*usin)) + goto out; + if (usin->sin_family != AF_INET) { + static int complained; + if (!complained++) + printk(KERN_INFO "%s forgot to set AF_INET in " + "raw sendmsg. Fix it!\n", + current->comm); + err = -EAFNOSUPPORT; + if (usin->sin_family) + goto out; + } + daddr = usin->sin_addr.s_addr; + /* ANK: I did not forget to get protocol from port field. + * I just do not know, who uses this weirdness. + * IP_HDRINCL is much more convenient. + */ + } else { + err = -EDESTADDRREQ; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + daddr = inet->daddr; + } + + ipc.addr = inet->saddr; + ipc.opt = NULL; + ipc.oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + err = ip_cmsg_send(msg, &ipc); + if (err) + goto out; + if (ipc.opt) + free = 1; + } + + saddr = ipc.addr; + ipc.addr = daddr; + + if (!ipc.opt) + ipc.opt = inet->opt; + + if (ipc.opt) { + err = -EINVAL; + /* Linux does not mangle headers on raw sockets, + * so that IP options + IP_HDRINCL is non-sense. + */ + if (inet->hdrincl) + goto done; + if (ipc.opt->srr) { + if (!daddr) + goto done; + daddr = ipc.opt->faddr; + } + } + tos = RT_CONN_FLAGS(sk); + if (msg->msg_flags & MSG_DONTROUTE) + tos |= RTO_ONLINK; + + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + } + + { + struct flowi fl = { .oif = ipc.oif, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = saddr, + .tos = tos } }, + .proto = inet->hdrincl ? IPPROTO_RAW : + sk->sk_protocol, + }; + if (!inet->hdrincl) + raw_probe_proto_opt(&fl, msg); + + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); + } + if (err) + goto done; + + err = -EACCES; + if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) + goto done; + + if (msg->msg_flags & MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + if (inet->hdrincl) + err = raw_send_hdrinc(sk, msg->msg_iov, len, + rt, msg->msg_flags); + + else { + if (!ipc.addr) + ipc.addr = rt->rt_dst; + lock_sock(sk); + err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, + &ipc, rt, msg->msg_flags); + if (err) + ip_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = ip_push_pending_frames(sk); + release_sock(sk); + } +done: + if (free) + kfree(ipc.opt); + ip_rt_put(rt); + +out: return err < 0 ? err : len; + +do_confirm: + dst_confirm(&rt->u.dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static void raw_close(struct sock *sk, long timeout) +{ + /* + * Raw sockets may have direct kernel refereneces. Kill them. 
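For illustration only (not part of this patch): a minimal userspace sketch of the IP_HDRINCL path that raw_sendmsg() hands to raw_send_hdrinc() above. Opening the socket with IPPROTO_RAW implies IP_HDRINCL, and header fields left at zero (saddr, id, tot_len, check) are filled in by the kernel exactly as raw_send_hdrinc() shows. The destination address and header values here are arbitrary example choices.

#include <stdio.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <netinet/ip.h>		/* struct iphdr */
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_RAW);	/* needs CAP_NET_RAW */
	char buf[sizeof(struct iphdr) + 8] = { 0 };
	struct iphdr *iph = (struct iphdr *)buf;
	struct sockaddr_in dst = { .sin_family = AF_INET };

	if (fd < 0) {
		perror("socket");
		return 1;
	}

	inet_pton(AF_INET, "192.0.2.1", &dst.sin_addr);	/* TEST-NET example address */

	iph->version  = 4;
	iph->ihl      = 5;
	iph->ttl      = 64;
	iph->protocol = IPPROTO_RAW;
	iph->daddr    = dst.sin_addr.s_addr;
	/* saddr, id, tot_len and check stay 0: raw_send_hdrinc() fills them in. */

	if (sendto(fd, buf, sizeof(buf), 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		perror("sendto");

	close(fd);
	return 0;
}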
+ */ + ip_ra_control(sk, 0, NULL); + + sk_common_release(sk); +} + +/* This gets rid of all the nasties in af_inet. -DaveM */ +static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *addr = (struct sockaddr_in *) uaddr; + int ret = -EINVAL; + int chk_addr_ret; + + if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) + goto out; + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + ret = -EADDRNOTAVAIL; + if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && + chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) + goto out; + inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; + if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) + inet->saddr = 0; /* Use device */ + sk_dst_reset(sk); + ret = 0; +out: return ret; +} + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. + */ + +static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + size_t copied = 0; + int err = -EOPNOTSUPP; + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + + if (flags & MSG_OOB) + goto out; + + if (addr_len) + *addr_len = sizeof(*sin); + + if (flags & MSG_ERRQUEUE) { + err = ip_recv_error(sk, msg, len); + goto out; + } + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto done; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) { + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + memset(&sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + if (flags & MSG_TRUNC) + copied = skb->len; +done: + skb_free_datagram(sk, skb); +out: return err ? 
err : copied; +} + +static int raw_init(struct sock *sk) +{ + struct raw_sock *rp = raw_sk(sk); + + if (inet_sk(sk)->num == IPPROTO_ICMP) + memset(&rp->filter, 0, sizeof(rp->filter)); + return 0; +} + +static int raw_seticmpfilter(struct sock *sk, char __user *optval, int optlen) +{ + if (optlen > sizeof(struct icmp_filter)) + optlen = sizeof(struct icmp_filter); + if (copy_from_user(&raw_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; +} + +static int raw_geticmpfilter(struct sock *sk, char __user *optval, int __user *optlen) +{ + int len, ret = -EFAULT; + + if (get_user(len, optlen)) + goto out; + ret = -EINVAL; + if (len < 0) + goto out; + if (len > sizeof(struct icmp_filter)) + len = sizeof(struct icmp_filter); + ret = -EFAULT; + if (put_user(len, optlen) || + copy_to_user(optval, &raw_sk(sk)->filter, len)) + goto out; + ret = 0; +out: return ret; +} + +static int raw_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + if (level != SOL_RAW) + return ip_setsockopt(sk, level, optname, optval, optlen); + + if (optname == ICMP_FILTER) { + if (inet_sk(sk)->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + else + return raw_seticmpfilter(sk, optval, optlen); + } + return -ENOPROTOOPT; +} + +static int raw_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + if (level != SOL_RAW) + return ip_getsockopt(sk, level, optname, optval, optlen); + + if (optname == ICMP_FILTER) { + if (inet_sk(sk)->num != IPPROTO_ICMP) + return -EOPNOTSUPP; + else + return raw_geticmpfilter(sk, optval, optlen); + } + return -ENOPROTOOPT; +} + +static int raw_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch (cmd) { + case SIOCOUTQ: { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: { + struct sk_buff *skb; + int amount = 0; + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->len; + spin_unlock_irq(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: +#ifdef CONFIG_IP_MROUTE + return ipmr_ioctl(sk, cmd, (void __user *)arg); +#else + return -ENOIOCTLCMD; +#endif + } +} + +struct proto raw_prot = { + .name = "RAW", + .owner = THIS_MODULE, + .close = raw_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = raw_ioctl, + .init = raw_init, + .setsockopt = raw_setsockopt, + .getsockopt = raw_getsockopt, + .sendmsg = raw_sendmsg, + .recvmsg = raw_recvmsg, + .bind = raw_bind, + .backlog_rcv = raw_rcv_skb, + .hash = raw_v4_hash, + .unhash = raw_v4_unhash, + .obj_size = sizeof(struct raw_sock), +}; + +#ifdef CONFIG_PROC_FS +struct raw_iter_state { + int bucket; +}; + +#define raw_seq_private(seq) ((struct raw_iter_state *)(seq)->private) + +static struct sock *raw_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct raw_iter_state* state = raw_seq_private(seq); + + for (state->bucket = 0; state->bucket < RAWV4_HTABLE_SIZE; ++state->bucket) { + struct hlist_node *node; + + sk_for_each(sk, node, &raw_v4_htable[state->bucket]) + if (sk->sk_family == PF_INET) + goto found; + } + sk = NULL; +found: + return sk; +} + +static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) +{ + struct raw_iter_state* state = raw_seq_private(seq); + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sk->sk_family != PF_INET); + + if (!sk && ++state->bucket < RAWV4_HTABLE_SIZE) { + sk = 
sk_head(&raw_v4_htable[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = raw_get_first(seq); + + if (sk) + while (pos && (sk = raw_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *raw_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&raw_v4_lock); + return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = raw_get_first(seq); + else + sk = raw_get_next(seq, v); + ++*pos; + return sk; +} + +static void raw_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&raw_v4_lock); +} + +static __inline__ char *get_raw_sock(struct sock *sp, char *tmpbuf, int i) +{ + struct inet_sock *inet = inet_sk(sp); + unsigned int dest = inet->daddr, + src = inet->rcv_saddr; + __u16 destp = 0, + srcp = inet->num; + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + i, src, srcp, dest, destp, sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); + return tmpbuf; +} + +static int raw_seq_show(struct seq_file *seq, void *v) +{ + char tmpbuf[129]; + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + else { + struct raw_iter_state *state = raw_seq_private(seq); + + seq_printf(seq, "%-127s\n", + get_raw_sock(v, tmpbuf, state->bucket)); + } + return 0; +} + +static struct seq_operations raw_seq_ops = { + .start = raw_seq_start, + .next = raw_seq_next, + .stop = raw_seq_stop, + .show = raw_seq_show, +}; + +static int raw_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct raw_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &raw_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations raw_seq_fops = { + .owner = THIS_MODULE, + .open = raw_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init raw_proc_init(void) +{ + if (!proc_net_fops_create("raw", S_IRUGO, &raw_seq_fops)) + return -ENOMEM; + return 0; +} + +void __init raw_proc_exit(void) +{ + proc_net_remove("raw"); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv4/route.c b/net/ipv4/route.c new file mode 100644 index 000000000000..9f91a116d919 --- /dev/null +++ b/net/ipv4/route.c @@ -0,0 +1,3177 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * ROUTE - implementation of the IP router. + * + * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Alan Cox, + * Linus Torvalds, + * Alexey Kuznetsov, + * + * Fixes: + * Alan Cox : Verify area fixes. 
+ * Alan Cox : cli() protects routing changes + * Rui Oliveira : ICMP routing table updates + * (rco@di.uminho.pt) Routing table insertion and update + * Linus Torvalds : Rewrote bits to be sensible + * Alan Cox : Added BSD route gw semantics + * Alan Cox : Super /proc >4K + * Alan Cox : MTU in route table + * Alan Cox : MSS actually. Also added the window + * clamper. + * Sam Lantinga : Fixed route matching in rt_del() + * Alan Cox : Routing cache support. + * Alan Cox : Removed compatibility cruft. + * Alan Cox : RTF_REJECT support. + * Alan Cox : TCP irtt support. + * Jonathan Naylor : Added Metric support. + * Miquel van Smoorenburg : BSD API fixes. + * Miquel van Smoorenburg : Metrics. + * Alan Cox : Use __u32 properly + * Alan Cox : Aligned routing errors more closely with BSD + * our system is still very different. + * Alan Cox : Faster /proc handling + * Alexey Kuznetsov : Massive rework to support tree based routing, + * routing caches and better behaviour. + * + * Olaf Erb : irtt wasn't being copied right. + * Bjorn Ekwall : Kerneld route support. + * Alan Cox : Multicast fixed (I hope) + * Pavel Krauz : Limited broadcast fixed + * Mike McLagan : Routing by source + * Alexey Kuznetsov : End of old history. Split to fib.c and + * route.c and rewritten from scratch. + * Andi Kleen : Load-limit warning messages. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Vitaly E. Lavrov : Race condition in ip_route_input_slow. + * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. + * Vladimir V. Ivanov : IP rule info (flowid) is really useful. + * Marc Boucher : routing by fwmark + * Robert Olsson : Added rt_cache statistics + * Arnaldo C. Melo : Convert proc stuff to seq_file + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#define RT_FL_TOS(oldflp) \ + ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) + +#define IP_MAX_MTU 0xFFF0 + +#define RT_GC_TIMEOUT (300*HZ) + +static int ip_rt_min_delay = 2 * HZ; +static int ip_rt_max_delay = 10 * HZ; +static int ip_rt_max_size; +static int ip_rt_gc_timeout = RT_GC_TIMEOUT; +static int ip_rt_gc_interval = 60 * HZ; +static int ip_rt_gc_min_interval = HZ / 2; +static int ip_rt_redirect_number = 9; +static int ip_rt_redirect_load = HZ / 50; +static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); +static int ip_rt_error_cost = HZ; +static int ip_rt_error_burst = 5 * HZ; +static int ip_rt_gc_elasticity = 8; +static int ip_rt_mtu_expires = 10 * 60 * HZ; +static int ip_rt_min_pmtu = 512 + 20 + 20; +static int ip_rt_min_advmss = 256; +static int ip_rt_secret_interval = 10 * 60 * HZ; +static unsigned long rt_deadline; + +#define RTprint(a...) printk(KERN_DEBUG a) + +static struct timer_list rt_flush_timer; +static struct timer_list rt_periodic_timer; +static struct timer_list rt_secret_timer; + +/* + * Interface to generic destination cache. 
+ */ + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); +static void ipv4_dst_destroy(struct dst_entry *dst); +static void ipv4_dst_ifdown(struct dst_entry *dst, + struct net_device *dev, int how); +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); +static void ipv4_link_failure(struct sk_buff *skb); +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); +static int rt_garbage_collect(void); + + +static struct dst_ops ipv4_dst_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .gc = rt_garbage_collect, + .check = ipv4_dst_check, + .destroy = ipv4_dst_destroy, + .ifdown = ipv4_dst_ifdown, + .negative_advice = ipv4_negative_advice, + .link_failure = ipv4_link_failure, + .update_pmtu = ip_rt_update_pmtu, + .entry_size = sizeof(struct rtable), +}; + +#define ECN_OR_COST(class) TC_PRIO_##class + +__u8 ip_tos2prio[16] = { + TC_PRIO_BESTEFFORT, + ECN_OR_COST(FILLER), + TC_PRIO_BESTEFFORT, + ECN_OR_COST(BESTEFFORT), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_BULK, + ECN_OR_COST(BULK), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE, + ECN_OR_COST(INTERACTIVE), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK), + TC_PRIO_INTERACTIVE_BULK, + ECN_OR_COST(INTERACTIVE_BULK) +}; + + +/* + * Route cache. + */ + +/* The locking scheme is rather straight forward: + * + * 1) Read-Copy Update protects the buckets of the central route hash. + * 2) Only writers remove entries, and they hold the lock + * as they look at rtable reference counts. + * 3) Only readers acquire references to rtable entries, + * they do so with atomic increments and with the + * lock held. + */ + +struct rt_hash_bucket { + struct rtable *chain; + spinlock_t lock; +} __attribute__((__aligned__(8))); + +static struct rt_hash_bucket *rt_hash_table; +static unsigned rt_hash_mask; +static int rt_hash_log; +static unsigned int rt_hash_rnd; + +struct rt_cache_stat *rt_cache_stat; + +static int rt_intern_hash(unsigned hash, struct rtable *rth, + struct rtable **res); + +static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) +{ + return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) + & rt_hash_mask); +} + +#ifdef CONFIG_PROC_FS +struct rt_cache_iter_state { + int bucket; +}; + +static struct rtable *rt_cache_get_first(struct seq_file *seq) +{ + struct rtable *r = NULL; + struct rt_cache_iter_state *st = seq->private; + + for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + if (r) + break; + rcu_read_unlock_bh(); + } + return r; +} + +static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r) +{ + struct rt_cache_iter_state *st = rcu_dereference(seq->private); + + r = r->u.rt_next; + while (!r) { + rcu_read_unlock_bh(); + if (--st->bucket < 0) + break; + rcu_read_lock_bh(); + r = rt_hash_table[st->bucket].chain; + } + return r; +} + +static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) +{ + struct rtable *r = rt_cache_get_first(seq); + + if (r) + while (pos && (r = rt_cache_get_next(seq, r))) + --pos; + return pos ? NULL : r; +} + +static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) +{ + return *pos ? 
rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct rtable *r = NULL; + + if (v == SEQ_START_TOKEN) + r = rt_cache_get_first(seq); + else + r = rt_cache_get_next(seq, v); + ++*pos; + return r; +} + +static void rt_cache_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) + rcu_read_unlock_bh(); +} + +static int rt_cache_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" + "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" + "HHUptod\tSpecDst"); + else { + struct rtable *r = v; + char temp[256]; + + sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" + "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", + r->u.dst.dev ? r->u.dst.dev->name : "*", + (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, + r->rt_flags, atomic_read(&r->u.dst.__refcnt), + r->u.dst.__use, 0, (unsigned long)r->rt_src, + (dst_metric(&r->u.dst, RTAX_ADVMSS) ? + (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), + dst_metric(&r->u.dst, RTAX_WINDOW), + (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + + dst_metric(&r->u.dst, RTAX_RTTVAR)), + r->fl.fl4_tos, + r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, + r->u.dst.hh ? (r->u.dst.hh->hh_output == + dev_queue_xmit) : 0, + r->rt_spec_dst); + seq_printf(seq, "%-127s\n", temp); + } + return 0; +} + +static struct seq_operations rt_cache_seq_ops = { + .start = rt_cache_seq_start, + .next = rt_cache_seq_next, + .stop = rt_cache_seq_stop, + .show = rt_cache_seq_show, +}; + +static int rt_cache_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + rc = seq_open(file, &rt_cache_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations rt_cache_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + + +static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) +{ + int cpu; + + if (*pos == 0) + return SEQ_START_TOKEN; + + for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(rt_cache_stat, cpu); + } + return NULL; +} + +static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + int cpu; + + for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + if (!cpu_possible(cpu)) + continue; + *pos = cpu+1; + return per_cpu_ptr(rt_cache_stat, cpu); + } + return NULL; + +} + +static void rt_cpu_seq_stop(struct seq_file *seq, void *v) +{ + +} + +static int rt_cpu_seq_show(struct seq_file *seq, void *v) +{ + struct rt_cache_stat *st = v; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "entries in_hit in_slow_tot in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); + return 0; + } + + seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " + " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", + atomic_read(&ipv4_dst_ops.entries), + st->in_hit, + st->in_slow_tot, + st->in_slow_mc, + st->in_no_route, + st->in_brd, + st->in_martian_dst, + st->in_martian_src, + + st->out_hit, + st->out_slow_tot, + st->out_slow_mc, + + 
st->gc_total, + st->gc_ignored, + st->gc_goal_miss, + st->gc_dst_overflow, + st->in_hlist_search, + st->out_hlist_search + ); + return 0; +} + +static struct seq_operations rt_cpu_seq_ops = { + .start = rt_cpu_seq_start, + .next = rt_cpu_seq_next, + .stop = rt_cpu_seq_stop, + .show = rt_cpu_seq_show, +}; + + +static int rt_cpu_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rt_cpu_seq_ops); +} + +static struct file_operations rt_cpu_seq_fops = { + .owner = THIS_MODULE, + .open = rt_cpu_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +static __inline__ void rt_free(struct rtable *rt) +{ + multipath_remove(rt); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static __inline__ void rt_drop(struct rtable *rt) +{ + multipath_remove(rt); + ip_rt_put(rt); + call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); +} + +static __inline__ int rt_fast_clean(struct rtable *rth) +{ + /* Kill broadcast/multicast entries very aggresively, if they + collide in hash table with more useful entries */ + return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && + rth->fl.iif && rth->u.rt_next; +} + +static __inline__ int rt_valuable(struct rtable *rth) +{ + return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || + rth->u.dst.expires; +} + +static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) +{ + unsigned long age; + int ret = 0; + + if (atomic_read(&rth->u.dst.__refcnt)) + goto out; + + ret = 1; + if (rth->u.dst.expires && + time_after_eq(jiffies, rth->u.dst.expires)) + goto out; + + age = jiffies - rth->u.dst.lastuse; + ret = 0; + if ((age <= tmo1 && !rt_fast_clean(rth)) || + (age <= tmo2 && rt_valuable(rth))) + goto out; + ret = 1; +out: return ret; +} + +/* Bits of score are: + * 31: very valuable + * 30: not quite useless + * 29..0: usage counter + */ +static inline u32 rt_score(struct rtable *rt) +{ + u32 score = jiffies - rt->u.dst.lastuse; + + score = ~score & ~(3<<30); + + if (rt_valuable(rt)) + score |= (1<<31); + + if (!rt->fl.iif || + !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) + score |= (1<<30); + + return score; +} + +static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) +{ + return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && + fl1->iif == fl2->iif; +} + +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED +static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, + struct rtable *expentry, + int *removed_count) +{ + int passedexpired = 0; + struct rtable **nextstep = NULL; + struct rtable **rthp = chain_head; + struct rtable *rth; + + if (removed_count) + *removed_count = 0; + + while ((rth = *rthp) != NULL) { + if (rth == expentry) + passedexpired = 1; + + if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 && + compare_keys(&(*rthp)->fl, &expentry->fl)) { + if (*rthp == expentry) { + *rthp = rth->u.rt_next; + continue; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + if (removed_count) + ++(*removed_count); + } + } else { + if (!((*rthp)->u.dst.flags & DST_BALANCED) && + passedexpired && !nextstep) + nextstep = &rth->u.rt_next; + + rthp = &rth->u.rt_next; + } + } + + rt_free(expentry); + if (removed_count) + ++(*removed_count); + + return nextstep; +} +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + + +/* This runs via a timer and thus is always in BH context. 
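+ * Each run scans only a slice of the hash table, resuming at the bucket
+ * where the previous run stopped (the static "rover" below), so the work
+ * done per timer firing stays bounded.  Within a chain the timeout "tmo"
+ * is halved for every entry that is kept, so long chains are pruned more
+ * aggressively than short ones.  The timer is re-armed at the end via
+ * mod_timer(&rt_periodic_timer, ...).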
*/ +static void rt_check_expire(unsigned long dummy) +{ + static int rover; + int i = rover, t; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + + for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; + t -= ip_rt_gc_timeout) { + unsigned long tmo = ip_rt_gc_timeout; + + i = (i + 1) & rt_hash_mask; + rthp = &rt_hash_table[i].chain; + + spin_lock(&rt_hash_table[i].lock); + while ((rth = *rthp) != NULL) { + if (rth->u.dst.expires) { + /* Entry is expired even if it is in use */ + if (time_before_eq(now, rth->u.dst.expires)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } + + /* Cleanup aged off entries. */ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + /* remove all related balanced entries if necessary */ + if (rth->u.dst.flags & DST_BALANCED) { + rthp = rt_remove_balanced_route( + &rt_hash_table[i].chain, + rth, NULL); + if (!rthp) + break; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + *rthp = rth->u.rt_next; + rt_free(rth); +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + } + spin_unlock(&rt_hash_table[i].lock); + + /* Fallback loop breaker. */ + if (time_after(jiffies, now)) + break; + } + rover = i; + mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); +} + +/* This can run from both BH and non-BH contexts, the latter + * in the case of a forced flush event. + */ +static void rt_run_flush(unsigned long dummy) +{ + int i; + struct rtable *rth, *next; + + rt_deadline = 0; + + get_random_bytes(&rt_hash_rnd, 4); + + for (i = rt_hash_mask; i >= 0; i--) { + spin_lock_bh(&rt_hash_table[i].lock); + rth = rt_hash_table[i].chain; + if (rth) + rt_hash_table[i].chain = NULL; + spin_unlock_bh(&rt_hash_table[i].lock); + + for (; rth; rth = next) { + next = rth->u.rt_next; + rt_free(rth); + } + } +} + +static DEFINE_SPINLOCK(rt_flush_lock); + +void rt_cache_flush(int delay) +{ + unsigned long now = jiffies; + int user_mode = !in_softirq(); + + if (delay < 0) + delay = ip_rt_min_delay; + + /* flush existing multipath state*/ + multipath_flush(); + + spin_lock_bh(&rt_flush_lock); + + if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { + long tmo = (long)(rt_deadline - now); + + /* If flush timer is already running + and flush request is not immediate (delay > 0): + + if deadline is not achieved, prolongate timer to "delay", + otherwise fire it at deadline time. + */ + + if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) + tmo = 0; + + if (delay > tmo) + delay = tmo; + } + + if (delay <= 0) { + spin_unlock_bh(&rt_flush_lock); + rt_run_flush(0); + return; + } + + if (rt_deadline == 0) + rt_deadline = now + ip_rt_max_delay; + + mod_timer(&rt_flush_timer, now+delay); + spin_unlock_bh(&rt_flush_lock); +} + +static void rt_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + + rt_cache_flush(0); + mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); +} + +/* + Short description of GC goals. + + We want to build algorithm, which will keep routing cache + at some equilibrium point, when number of aged off entries + is kept approximately equal to newly generated ones. + + Current expiration strength is variable "expire". + We try to adjust it dynamically, so that if networking + is idle expires is large enough to keep enough of warm entries, + and when load increases it reduces to limit cache size. 
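+
+   Concretely (see rt_garbage_collect below): each run starts from
+
+	goal = entries - (ip_rt_gc_elasticity << rt_hash_log)
+
+   and evicts entries that rt_may_expire() judges older than "expire"
+   until the goal is met.  If the goal is missed, expire is halved and
+   the scan repeated; when the goal is reached, expire is raised again
+   by ip_rt_gc_min_interval per run, capped at ip_rt_gc_timeout.  So
+   "expire" settles near the age at which eviction balances creation.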
+ */ + +static int rt_garbage_collect(void) +{ + static unsigned long expire = RT_GC_TIMEOUT; + static unsigned long last_gc; + static int rover; + static int equilibrium; + struct rtable *rth, **rthp; + unsigned long now = jiffies; + int goal; + + /* + * Garbage collection is pretty expensive, + * do not make it too frequently. + */ + + RT_CACHE_STAT_INC(gc_total); + + if (now - last_gc < ip_rt_gc_min_interval && + atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { + RT_CACHE_STAT_INC(gc_ignored); + goto out; + } + + /* Calculate number of entries, which we want to expire now. */ + goal = atomic_read(&ipv4_dst_ops.entries) - + (ip_rt_gc_elasticity << rt_hash_log); + if (goal <= 0) { + if (equilibrium < ipv4_dst_ops.gc_thresh) + equilibrium = ipv4_dst_ops.gc_thresh; + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + if (goal > 0) { + equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); + goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + } + } else { + /* We are in dangerous area. Try to reduce cache really + * aggressively. + */ + goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); + equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + } + + if (now - last_gc >= ip_rt_gc_min_interval) + last_gc = now; + + if (goal <= 0) { + equilibrium += goal; + goto work_done; + } + + do { + int i, k; + + for (i = rt_hash_mask, k = rover; i >= 0; i--) { + unsigned long tmo = expire; + + k = (k + 1) & rt_hash_mask; + rthp = &rt_hash_table[k].chain; + spin_lock_bh(&rt_hash_table[k].lock); + while ((rth = *rthp) != NULL) { + if (!rt_may_expire(rth, tmo, expire)) { + tmo >>= 1; + rthp = &rth->u.rt_next; + continue; + } +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + /* remove all related balanced entries + * if necessary + */ + if (rth->u.dst.flags & DST_BALANCED) { + int r; + + rthp = rt_remove_balanced_route( + &rt_hash_table[i].chain, + rth, + &r); + goal -= r; + if (!rthp) + break; + } else { + *rthp = rth->u.rt_next; + rt_free(rth); + goal--; + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + *rthp = rth->u.rt_next; + rt_free(rth); + goal--; +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + } + spin_unlock_bh(&rt_hash_table[k].lock); + if (goal <= 0) + break; + } + rover = k; + + if (goal <= 0) + goto work_done; + + /* Goal is not achieved. We stop process if: + + - if expire reduced to zero. Otherwise, expire is halfed. + - if table is not full. + - if we are called from interrupt. + - jiffies check is just fallback/debug loop breaker. + We will not spin here for long time in any case. 
+ */ + + RT_CACHE_STAT_INC(gc_goal_miss); + + if (expire == 0) + break; + + expire >>= 1; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, + atomic_read(&ipv4_dst_ops.entries), goal, i); +#endif + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + goto out; + } while (!in_softirq() && time_before_eq(jiffies, now)); + + if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + goto out; + if (net_ratelimit()) + printk(KERN_WARNING "dst cache overflow\n"); + RT_CACHE_STAT_INC(gc_dst_overflow); + return 1; + +work_done: + expire += ip_rt_gc_min_interval; + if (expire > ip_rt_gc_timeout || + atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + expire = ip_rt_gc_timeout; +#if RT_CACHE_DEBUG >= 2 + printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, + atomic_read(&ipv4_dst_ops.entries), goal, rover); +#endif +out: return 0; +} + +static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +{ + struct rtable *rth, **rthp; + unsigned long now; + struct rtable *cand, **candp; + u32 min_score; + int chain_length; + int attempts = !in_softirq(); + +restart: + chain_length = 0; + min_score = ~(u32)0; + cand = NULL; + candp = NULL; + now = jiffies; + + rthp = &rt_hash_table[hash].chain; + + spin_lock_bh(&rt_hash_table[hash].lock); + while ((rth = *rthp) != NULL) { +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (!(rth->u.dst.flags & DST_BALANCED) && + compare_keys(&rth->fl, &rt->fl)) { +#else + if (compare_keys(&rth->fl, &rt->fl)) { +#endif + /* Put it first */ + *rthp = rth->u.rt_next; + /* + * Since lookup is lockfree, the deletion + * must be visible to another weakly ordered CPU before + * the insertion at the start of the hash chain. + */ + rcu_assign_pointer(rth->u.rt_next, + rt_hash_table[hash].chain); + /* + * Since lookup is lockfree, the update writes + * must be ordered for consistency on SMP. + */ + rcu_assign_pointer(rt_hash_table[hash].chain, rth); + + rth->u.dst.__use++; + dst_hold(&rth->u.dst); + rth->u.dst.lastuse = now; + spin_unlock_bh(&rt_hash_table[hash].lock); + + rt_drop(rt); + *rp = rth; + return 0; + } + + if (!atomic_read(&rth->u.dst.__refcnt)) { + u32 score = rt_score(rth); + + if (score <= min_score) { + cand = rth; + candp = rthp; + min_score = score; + } + } + + chain_length++; + + rthp = &rth->u.rt_next; + } + + if (cand) { + /* ip_rt_gc_elasticity used to be average length of chain + * length, when exceeded gc becomes really aggressive. + * + * The second limit is less certain. At the moment it allows + * only 2 entries per bucket. We will see. + */ + if (chain_length > ip_rt_gc_elasticity) { + *candp = cand->u.rt_next; + rt_free(cand); + } + } + + /* Try to bind route to arp only if it is output + route or unicast forwarding path. + */ + if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { + int err = arp_bind_neighbour(&rt->u.dst); + if (err) { + spin_unlock_bh(&rt_hash_table[hash].lock); + + if (err != -ENOBUFS) { + rt_drop(rt); + return err; + } + + /* Neighbour tables are full and nothing + can be released. Try to shrink route cache, + it is most likely it holds some neighbour records. 
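+
+			   Below this is done by temporarily forcing the
+			   most aggressive GC parameters (ip_rt_gc_elasticity
+			   = 1, ip_rt_gc_min_interval = 0), running
+			   rt_garbage_collect() once, restoring the saved
+			   values and retrying the insert.  Only one retry is
+			   made, and only when we are not in softirq context
+			   (see "attempts" above).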
+ */ + if (attempts-- > 0) { + int saved_elasticity = ip_rt_gc_elasticity; + int saved_int = ip_rt_gc_min_interval; + ip_rt_gc_elasticity = 1; + ip_rt_gc_min_interval = 0; + rt_garbage_collect(); + ip_rt_gc_min_interval = saved_int; + ip_rt_gc_elasticity = saved_elasticity; + goto restart; + } + + if (net_ratelimit()) + printk(KERN_WARNING "Neighbour table overflow.\n"); + rt_drop(rt); + return -ENOBUFS; + } + } + + rt->u.rt_next = rt_hash_table[hash].chain; +#if RT_CACHE_DEBUG >= 2 + if (rt->u.rt_next) { + struct rtable *trt; + printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, + NIPQUAD(rt->rt_dst)); + for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next) + printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); + printk("\n"); + } +#endif + rt_hash_table[hash].chain = rt; + spin_unlock_bh(&rt_hash_table[hash].lock); + *rp = rt; + return 0; +} + +void rt_bind_peer(struct rtable *rt, int create) +{ + static DEFINE_SPINLOCK(rt_peer_lock); + struct inet_peer *peer; + + peer = inet_getpeer(rt->rt_dst, create); + + spin_lock_bh(&rt_peer_lock); + if (rt->peer == NULL) { + rt->peer = peer; + peer = NULL; + } + spin_unlock_bh(&rt_peer_lock); + if (peer) + inet_putpeer(peer); +} + +/* + * Peer allocation may fail only in serious out-of-memory conditions. However + * we still can generate some output. + * Random ID selection looks a bit dangerous because we have no chances to + * select ID being unique in a reasonable period of time. + * But broken packet identifier may be better than no packet at all. + */ +static void ip_select_fb_ident(struct iphdr *iph) +{ + static DEFINE_SPINLOCK(ip_fb_id_lock); + static u32 ip_fallback_id; + u32 salt; + + spin_lock_bh(&ip_fb_id_lock); + salt = secure_ip_id(ip_fallback_id ^ iph->daddr); + iph->id = htons(salt & 0xFFFF); + ip_fallback_id = salt; + spin_unlock_bh(&ip_fb_id_lock); +} + +void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) +{ + struct rtable *rt = (struct rtable *) dst; + + if (rt) { + if (rt->peer == NULL) + rt_bind_peer(rt, 1); + + /* If peer is attached to destination, it is never detached, + so that we need not to grab a lock to dereference it. 
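+
+	   rt_bind_peer() above sets rt->peer at most once, under
+	   rt_peer_lock; a racing caller simply drops its duplicate with
+	   inet_putpeer().  So a non-NULL peer seen here stays valid for
+	   the lifetime of the route.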
+ */ + if (rt->peer) { + iph->id = htons(inet_getid(rt->peer, more)); + return; + } + } else + printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", NET_CALLER(iph)); + + ip_select_fb_ident(iph); +} + +static void rt_del(unsigned hash, struct rtable *rt) +{ + struct rtable **rthp; + + spin_lock_bh(&rt_hash_table[hash].lock); + ip_rt_put(rt); + for (rthp = &rt_hash_table[hash].chain; *rthp; + rthp = &(*rthp)->u.rt_next) + if (*rthp == rt) { + *rthp = rt->u.rt_next; + rt_free(rt); + break; + } + spin_unlock_bh(&rt_hash_table[hash].lock); +} + +void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, + u32 saddr, u8 tos, struct net_device *dev) +{ + int i, k; + struct in_device *in_dev = in_dev_get(dev); + struct rtable *rth, **rthp; + u32 skeys[2] = { saddr, 0 }; + int ikeys[2] = { dev->ifindex, 0 }; + + tos &= IPTOS_RT_MASK; + + if (!in_dev) + return; + + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) + || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) + goto reject_redirect; + + if (!IN_DEV_SHARED_MEDIA(in_dev)) { + if (!inet_addr_onlink(in_dev, new_gw, old_gw)) + goto reject_redirect; + if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) + goto reject_redirect; + } else { + if (inet_addr_type(new_gw) != RTN_UNICAST) + goto reject_redirect; + } + + for (i = 0; i < 2; i++) { + for (k = 0; k < 2; k++) { + unsigned hash = rt_hash_code(daddr, + skeys[i] ^ (ikeys[k] << 5), + tos); + + rthp=&rt_hash_table[hash].chain; + + rcu_read_lock(); + while ((rth = rcu_dereference(*rthp)) != NULL) { + struct rtable *rt; + + if (rth->fl.fl4_dst != daddr || + rth->fl.fl4_src != skeys[i] || + rth->fl.fl4_tos != tos || + rth->fl.oif != ikeys[k] || + rth->fl.iif != 0) { + rthp = &rth->u.rt_next; + continue; + } + + if (rth->rt_dst != daddr || + rth->rt_src != saddr || + rth->u.dst.error || + rth->rt_gateway != old_gw || + rth->u.dst.dev != dev) + break; + + dst_hold(&rth->u.dst); + rcu_read_unlock(); + + rt = dst_alloc(&ipv4_dst_ops); + if (rt == NULL) { + ip_rt_put(rth); + in_dev_put(in_dev); + return; + } + + /* Copy all the information. */ + *rt = *rth; + INIT_RCU_HEAD(&rt->u.dst.rcu_head); + rt->u.dst.__use = 1; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.child = NULL; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + if (rt->idev) + in_dev_hold(rt->idev); + rt->u.dst.obsolete = 0; + rt->u.dst.lastuse = jiffies; + rt->u.dst.path = &rt->u.dst; + rt->u.dst.neighbour = NULL; + rt->u.dst.hh = NULL; + rt->u.dst.xfrm = NULL; + + rt->rt_flags |= RTCF_REDIRECTED; + + /* Gateway is different ... 
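+
+			   The cached entry is never modified in place,
+			   since lookups walk the chain under rcu_read_lock()
+			   only; the copy made above carries the new gateway
+			   and RTCF_REDIRECTED and replaces the old entry
+			   via rt_del() and rt_intern_hash() below.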
*/ + rt->rt_gateway = new_gw; + + /* Redirect received -> path was valid */ + dst_confirm(&rth->u.dst); + + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + + if (arp_bind_neighbour(&rt->u.dst) || + !(rt->u.dst.neighbour->nud_state & + NUD_VALID)) { + if (rt->u.dst.neighbour) + neigh_event_send(rt->u.dst.neighbour, NULL); + ip_rt_put(rth); + rt_drop(rt); + goto do_next; + } + + rt_del(hash, rth); + if (!rt_intern_hash(hash, rt, &rt)) + ip_rt_put(rt); + goto do_next; + } + rcu_read_unlock(); + do_next: + ; + } + } + in_dev_put(in_dev); + return; + +reject_redirect: +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about " + "%u.%u.%u.%u ignored.\n" + " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, " + "tos %02x\n", + NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), + NIPQUAD(saddr), NIPQUAD(daddr), tos); +#endif + in_dev_put(in_dev); +} + +static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable*)dst; + struct dst_entry *ret = dst; + + if (rt) { + if (dst->obsolete) { + ip_rt_put(rt); + ret = NULL; + } else if ((rt->rt_flags & RTCF_REDIRECTED) || + rt->u.dst.expires) { + unsigned hash = rt_hash_code(rt->fl.fl4_dst, + rt->fl.fl4_src ^ + (rt->fl.oif << 5), + rt->fl.fl4_tos); +#if RT_CACHE_DEBUG >= 1 + printk(KERN_DEBUG "ip_rt_advice: redirect to " + "%u.%u.%u.%u/%02x dropped\n", + NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); +#endif + rt_del(hash, rt); + ret = NULL; + } + } + return ret; +} + +/* + * Algorithm: + * 1. The first ip_rt_redirect_number redirects are sent + * with exponential backoff, then we stop sending them at all, + * assuming that the host ignores our redirects. + * 2. If we did not see packets requiring redirects + * during ip_rt_redirect_silence, we assume that the host + * forgot redirected route and start to send redirects again. + * + * This algorithm is much cheaper and more intelligent than dumb load limiting + * in icmp.c. + * + * NOTE. Do not forget to inhibit load limiting for redirects (redundant) + * and "frag. need" (breaks PMTU discovery) in icmp.c. + */ + +void ip_rt_send_redirect(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + + if (!in_dev) + return; + + if (!IN_DEV_TX_REDIRECTS(in_dev)) + goto out; + + /* No redirected packets during ip_rt_redirect_silence; + * reset the algorithm. + */ + if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) + rt->u.dst.rate_tokens = 0; + + /* Too many ignored redirects; do not send anything + * set u.dst.rate_last to the last seen redirected packet. + */ + if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { + rt->u.dst.rate_last = jiffies; + goto out; + } + + /* Check for load limit; set rate_last to the latest sent + * redirect. 
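+	 *
+	 * I.e. redirect number n+1 is sent only after a silence of
+	 * (ip_rt_redirect_load << n) jiffies since redirect n, so the
+	 * spacing doubles with every redirect sent; once rate_tokens
+	 * reaches ip_rt_redirect_number (checked above) nothing more is
+	 * sent until the ip_rt_redirect_silence timeout resets it.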
+ */ + if (time_after(jiffies, + (rt->u.dst.rate_last + + (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { + icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); + rt->u.dst.rate_last = jiffies; + ++rt->u.dst.rate_tokens; +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && + rt->u.dst.rate_tokens == ip_rt_redirect_number && + net_ratelimit()) + printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " + "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", + NIPQUAD(rt->rt_src), rt->rt_iif, + NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); +#endif + } +out: + in_dev_put(in_dev); +} + +static int ip_error(struct sk_buff *skb) +{ + struct rtable *rt = (struct rtable*)skb->dst; + unsigned long now; + int code; + + switch (rt->u.dst.error) { + case EINVAL: + default: + goto out; + case EHOSTUNREACH: + code = ICMP_HOST_UNREACH; + break; + case ENETUNREACH: + code = ICMP_NET_UNREACH; + break; + case EACCES: + code = ICMP_PKT_FILTERED; + break; + } + + now = jiffies; + rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; + if (rt->u.dst.rate_tokens > ip_rt_error_burst) + rt->u.dst.rate_tokens = ip_rt_error_burst; + rt->u.dst.rate_last = now; + if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { + rt->u.dst.rate_tokens -= ip_rt_error_cost; + icmp_send(skb, ICMP_DEST_UNREACH, code, 0); + } + +out: kfree_skb(skb); + return 0; +} + +/* + * The last two values are not from the RFC but + * are needed for AMPRnet AX.25 paths. + */ + +static unsigned short mtu_plateau[] = +{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; + +static __inline__ unsigned short guess_mtu(unsigned short old_mtu) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) + if (old_mtu > mtu_plateau[i]) + return mtu_plateau[i]; + return 68; +} + +unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) +{ + int i; + unsigned short old_mtu = ntohs(iph->tot_len); + struct rtable *rth; + u32 skeys[2] = { iph->saddr, 0, }; + u32 daddr = iph->daddr; + u8 tos = iph->tos & IPTOS_RT_MASK; + unsigned short est_mtu = 0; + + if (ipv4_config.no_pmtu_disc) + return 0; + + for (i = 0; i < 2; i++) { + unsigned hash = rt_hash_code(daddr, skeys[i], tos); + + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.rt_next)) { + if (rth->fl.fl4_dst == daddr && + rth->fl.fl4_src == skeys[i] && + rth->rt_dst == daddr && + rth->rt_src == iph->saddr && + rth->fl.fl4_tos == tos && + rth->fl.iif == 0 && + !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { + unsigned short mtu = new_mtu; + + if (new_mtu < 68 || new_mtu >= old_mtu) { + + /* BSD 4.2 compatibility hack :-( */ + if (mtu == 0 && + old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && + old_mtu >= 68 + (iph->ihl << 2)) + old_mtu -= iph->ihl << 2; + + mtu = guess_mtu(old_mtu); + } + if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { + if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { + dst_confirm(&rth->u.dst); + if (mtu < ip_rt_min_pmtu) { + mtu = ip_rt_min_pmtu; + rth->u.dst.metrics[RTAX_LOCK-1] |= + (1 << RTAX_MTU); + } + rth->u.dst.metrics[RTAX_MTU-1] = mtu; + dst_set_expires(&rth->u.dst, + ip_rt_mtu_expires); + } + est_mtu = mtu; + } + } + } + rcu_read_unlock(); + } + return est_mtu ? 
: new_mtu; +} + +static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && + !(dst_metric_locked(dst, RTAX_MTU))) { + if (mtu < ip_rt_min_pmtu) { + mtu = ip_rt_min_pmtu; + dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); + } + dst->metrics[RTAX_MTU-1] = mtu; + dst_set_expires(dst, ip_rt_mtu_expires); + } +} + +static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) +{ + return NULL; +} + +static void ipv4_dst_destroy(struct dst_entry *dst) +{ + struct rtable *rt = (struct rtable *) dst; + struct inet_peer *peer = rt->peer; + struct in_device *idev = rt->idev; + + if (peer) { + rt->peer = NULL; + inet_putpeer(peer); + } + + if (idev) { + rt->idev = NULL; + in_dev_put(idev); + } +} + +static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ + struct rtable *rt = (struct rtable *) dst; + struct in_device *idev = rt->idev; + if (dev != &loopback_dev && idev && idev->dev == dev) { + struct in_device *loopback_idev = in_dev_get(&loopback_dev); + if (loopback_idev) { + rt->idev = loopback_idev; + in_dev_put(idev); + } + } +} + +static void ipv4_link_failure(struct sk_buff *skb) +{ + struct rtable *rt; + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + + rt = (struct rtable *) skb->dst; + if (rt) + dst_set_expires(&rt->u.dst, 0); +} + +static int ip_rt_bug(struct sk_buff *skb) +{ + printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", + NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), + skb->dev ? skb->dev->name : "?"); + kfree_skb(skb); + return 0; +} + +/* + We do not cache source address of outgoing interface, + because it is used only by IP RR, TS and SRR options, + so that it out of fast path. + + BTW remember: "addr" is allowed to be not aligned + in IP options! 
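+
+   Hence the memcpy() of the 4 byte result below instead of a direct
+   u32 store, and a fresh fib_lookup() each time for input routes.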
+ */ + +void ip_rt_get_source(u8 *addr, struct rtable *rt) +{ + u32 src; + struct fib_result res; + + if (rt->fl.iif == 0) + src = rt->rt_src; + else if (fib_lookup(&rt->fl, &res) == 0) { + src = FIB_RES_PREFSRC(res); + fib_res_put(&res); + } else + src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, + RT_SCOPE_UNIVERSE); + memcpy(addr, &src, 4); +} + +#ifdef CONFIG_NET_CLS_ROUTE +static void set_class_tag(struct rtable *rt, u32 tag) +{ + if (!(rt->u.dst.tclassid & 0xFFFF)) + rt->u.dst.tclassid |= tag & 0xFFFF; + if (!(rt->u.dst.tclassid & 0xFFFF0000)) + rt->u.dst.tclassid |= tag & 0xFFFF0000; +} +#endif + +static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) +{ + struct fib_info *fi = res->fi; + + if (fi) { + if (FIB_RES_GW(*res) && + FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + rt->rt_gateway = FIB_RES_GW(*res); + memcpy(rt->u.dst.metrics, fi->fib_metrics, + sizeof(rt->u.dst.metrics)); + if (fi->fib_mtu == 0) { + rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; + if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && + rt->rt_gateway != rt->rt_dst && + rt->u.dst.dev->mtu > 576) + rt->u.dst.metrics[RTAX_MTU-1] = 576; + } +#ifdef CONFIG_NET_CLS_ROUTE + rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; +#endif + } else + rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; + + if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; + if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) + rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; + if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) + rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, + ip_rt_min_advmss); + if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) + rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; + +#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_MULTIPLE_TABLES + set_class_tag(rt, fib_rules_tclass(res)); +#endif + set_class_tag(rt, itag); +#endif + rt->rt_type = res->type; +} + +static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct net_device *dev, int our) +{ + unsigned hash; + struct rtable *rth; + u32 spec_dst; + struct in_device *in_dev = in_dev_get(dev); + u32 itag = 0; + + /* Primary sanity checks. 
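+
+	   The source must not be multicast, badclass or loopback and the
+	   frame must be IP; a zero source is accepted only for link-local
+	   multicast destinations, everything else is checked with
+	   fib_validate_source().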
*/ + + if (in_dev == NULL) + return -EINVAL; + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || + skb->protocol != htons(ETH_P_IP)) + goto e_inval; + + if (ZERONET(saddr)) { + if (!LOCAL_MCAST(daddr)) + goto e_inval; + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + } else if (fib_validate_source(saddr, 0, tos, 0, + dev, &spec_dst, &itag) < 0) + goto e_inval; + + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) + goto e_nobufs; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = daddr; + rth->rt_dst = daddr; + rth->fl.fl4_tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; +#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +#endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->rt_type = RTN_MULTICAST; + rth->rt_flags = RTCF_MULTICAST; + if (our) { + rth->u.dst.input= ip_local_deliver; + rth->rt_flags |= RTCF_LOCAL; + } + +#ifdef CONFIG_IP_MROUTE + if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) + rth->u.dst.input = ip_mr_input; +#endif + RT_CACHE_STAT_INC(in_slow_mc); + + in_dev_put(in_dev); + hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos); + return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); + +e_nobufs: + in_dev_put(in_dev); + return -ENOBUFS; + +e_inval: + in_dev_put(in_dev); + return -EINVAL; +} + + +static void ip_handle_martian_source(struct net_device *dev, + struct in_device *in_dev, + struct sk_buff *skb, + u32 daddr, + u32 saddr) +{ + RT_CACHE_STAT_INC(in_martian_src); +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { + /* + * RFC1812 recommendation, if source is martian, + * the only hint is MAC header. + */ + printk(KERN_WARNING "martian source %u.%u.%u.%u from " + "%u.%u.%u.%u, on dev %s\n", + NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + if (dev->hard_header_len) { + int i; + unsigned char *p = skb->mac.raw; + printk(KERN_WARNING "ll header: "); + for (i = 0; i < dev->hard_header_len; i++, p++) { + printk("%02x", *p); + if (i < (dev->hard_header_len - 1)) + printk(":"); + } + printk("\n"); + } + } +#endif +} + +static inline int __mkroute_input(struct sk_buff *skb, + struct fib_result* res, + struct in_device *in_dev, + u32 daddr, u32 saddr, u32 tos, + struct rtable **result) +{ + + struct rtable *rth; + int err; + struct in_device *out_dev; + unsigned flags = 0; + u32 spec_dst, itag; + + /* get a working reference to the output device */ + out_dev = in_dev_get(FIB_RES_DEV(*res)); + if (out_dev == NULL) { + if (net_ratelimit()) + printk(KERN_CRIT "Bug in ip_route_input" \ + "_slow(). Please, report\n"); + return -EINVAL; + } + + + err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), + in_dev->dev, &spec_dst, &itag); + if (err < 0) { + ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, + saddr); + + err = -EINVAL; + goto cleanup; + } + + if (err) + flags |= RTCF_DIRECTSRC; + + if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && + (IN_DEV_SHARED_MEDIA(out_dev) || + inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) + flags |= RTCF_DOREDIRECT; + + if (skb->protocol != htons(ETH_P_IP)) { + /* Not IP (i.e. ARP). 
Do not create route, if it is + * invalid for proxy arp. DNAT routes are always valid. + */ + if (out_dev == in_dev && !(flags & RTCF_DNAT)) { + err = -EINVAL; + goto cleanup; + } + } + + + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) { + err = -ENOBUFS; + goto cleanup; + } + + rth->u.dst.flags= DST_HOST; +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (res->fi->fib_nhs > 1) + rth->u.dst.flags |= DST_BALANCED; +#endif + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + if (in_dev->cnf.no_xfrm) + rth->u.dst.flags |= DST_NOXFRM; + rth->fl.fl4_dst = daddr; + rth->rt_dst = daddr; + rth->fl.fl4_tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; +#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; + rth->rt_gateway = daddr; + rth->rt_iif = + rth->fl.iif = in_dev->dev->ifindex; + rth->u.dst.dev = (out_dev)->dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->fl.oif = 0; + rth->rt_spec_dst= spec_dst; + + rth->u.dst.input = ip_forward; + rth->u.dst.output = ip_output; + + rt_set_nexthop(rth, res, itag); + + rth->rt_flags = flags; + + *result = rth; + err = 0; + cleanup: + /* release the working reference to the output device */ + in_dev_put(out_dev); + return err; +} + +static inline int ip_mkroute_input_def(struct sk_buff *skb, + struct fib_result* res, + const struct flowi *fl, + struct in_device *in_dev, + u32 daddr, u32 saddr, u32 tos) +{ + struct rtable* rth; + int err; + unsigned hash; + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) + fib_select_multipath(fl, res); +#endif + + /* create a routing cache entry */ + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); + if (err) + return err; + atomic_set(&rth->u.dst.__refcnt, 1); + + /* put it into the cache */ + hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); + return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); +} + +static inline int ip_mkroute_input(struct sk_buff *skb, + struct fib_result* res, + const struct flowi *fl, + struct in_device *in_dev, + u32 daddr, u32 saddr, u32 tos) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + struct rtable* rth; + unsigned char hop, hopcount, lasthop; + int err = -EINVAL; + unsigned int hash; + + if (res->fi) + hopcount = res->fi->fib_nhs; + else + hopcount = 1; + + lasthop = hopcount - 1; + + /* distinguish between multipath and singlepath */ + if (hopcount < 2) + return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, + saddr, tos); + + /* add all alternatives to the routing cache */ + for (hop = 0; hop < hopcount; hop++) { + res->nh_sel = hop; + + /* create a routing cache entry */ + err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, + &rth); + if (err) + return err; + + /* put it into the cache */ + hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); + err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + if (err) + return err; + + /* forward hop information to multipath impl. */ + multipath_set_nhinfo(rth, + FIB_RES_NETWORK(*res), + FIB_RES_NETMASK(*res), + res->prefixlen, + &FIB_RES_NH(*res)); + + /* only for the last hop the reference count is handled + * outside + */ + if (hop == lasthop) + atomic_set(&(skb->dst->__refcnt), 1); + } + return err; +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); +#endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ +} + + +/* + * NOTE. 
We drop all the packets that has local source + * addresses, because every properly looped back packet + * must have correct destination already attached by output routine. + * + * Such approach solves two big problems: + * 1. Not simplex devices are handled properly. + * 2. IP spoofing attempts are filtered with 100% of guarantee. + */ + +static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct net_device *dev) +{ + struct fib_result res; + struct in_device *in_dev = in_dev_get(dev); + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = saddr, + .tos = tos, + .scope = RT_SCOPE_UNIVERSE, +#ifdef CONFIG_IP_ROUTE_FWMARK + .fwmark = skb->nfmark +#endif + } }, + .iif = dev->ifindex }; + unsigned flags = 0; + u32 itag = 0; + struct rtable * rth; + unsigned hash; + u32 spec_dst; + int err = -EINVAL; + int free_res = 0; + + /* IP on this device is disabled. */ + + if (!in_dev) + goto out; + + /* Check for the most weird martians, which can be not detected + by fib_lookup. + */ + + if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) + goto martian_source; + + if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) + goto brd_input; + + /* Accept zero addresses only to limited broadcast; + * I even do not know to fix it or not. Waiting for complains :-) + */ + if (ZERONET(saddr)) + goto martian_source; + + if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) + goto martian_destination; + + /* + * Now we are ready to route packet. + */ + if ((err = fib_lookup(&fl, &res)) != 0) { + if (!IN_DEV_FORWARD(in_dev)) + goto e_inval; + goto no_route; + } + free_res = 1; + + RT_CACHE_STAT_INC(in_slow_tot); + + if (res.type == RTN_BROADCAST) + goto brd_input; + + if (res.type == RTN_LOCAL) { + int result; + result = fib_validate_source(saddr, daddr, tos, + loopback_dev.ifindex, + dev, &spec_dst, &itag); + if (result < 0) + goto martian_source; + if (result) + flags |= RTCF_DIRECTSRC; + spec_dst = daddr; + goto local_input; + } + + if (!IN_DEV_FORWARD(in_dev)) + goto e_inval; + if (res.type != RTN_UNICAST) + goto martian_destination; + + err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); + if (err == -ENOBUFS) + goto e_nobufs; + if (err == -EINVAL) + goto e_inval; + +done: + in_dev_put(in_dev); + if (free_res) + fib_res_put(&res); +out: return err; + +brd_input: + if (skb->protocol != htons(ETH_P_IP)) + goto e_inval; + + if (ZERONET(saddr)) + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); + else { + err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, + &itag); + if (err < 0) + goto martian_source; + if (err) + flags |= RTCF_DIRECTSRC; + } + flags |= RTCF_BROADCAST; + res.type = RTN_BROADCAST; + RT_CACHE_STAT_INC(in_brd); + +local_input: + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) + goto e_nobufs; + + rth->u.dst.output= ip_rt_bug; + + atomic_set(&rth->u.dst.__refcnt, 1); + rth->u.dst.flags= DST_HOST; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + rth->fl.fl4_dst = daddr; + rth->rt_dst = daddr; + rth->fl.fl4_tos = tos; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= skb->nfmark; +#endif + rth->fl.fl4_src = saddr; + rth->rt_src = saddr; +#ifdef CONFIG_NET_CLS_ROUTE + rth->u.dst.tclassid = itag; +#endif + rth->rt_iif = + rth->fl.iif = dev->ifindex; + rth->u.dst.dev = &loopback_dev; + dev_hold(rth->u.dst.dev); + rth->idev = in_dev_get(rth->u.dst.dev); + rth->rt_gateway = daddr; + rth->rt_spec_dst= spec_dst; + rth->u.dst.input= ip_local_deliver; + rth->rt_flags = flags|RTCF_LOCAL; + if (res.type == 
RTN_UNREACHABLE) { + rth->u.dst.input= ip_error; + rth->u.dst.error= -err; + rth->rt_flags &= ~RTCF_LOCAL; + } + rth->rt_type = res.type; + hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); + err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); + goto done; + +no_route: + RT_CACHE_STAT_INC(in_no_route); + spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); + res.type = RTN_UNREACHABLE; + goto local_input; + + /* + * Do not cache martian addresses: they should be logged (RFC1812) + */ +martian_destination: + RT_CACHE_STAT_INC(in_martian_dst); +#ifdef CONFIG_IP_ROUTE_VERBOSE + if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) + printk(KERN_WARNING "martian destination %u.%u.%u.%u from " + "%u.%u.%u.%u, dev %s\n", + NIPQUAD(daddr), NIPQUAD(saddr), dev->name); +#endif +e_inval: + err = -EINVAL; + goto done; + +e_nobufs: + err = -ENOBUFS; + goto done; + +martian_source: + ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); + goto e_inval; +} + +int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, + u8 tos, struct net_device *dev) +{ + struct rtable * rth; + unsigned hash; + int iif = dev->ifindex; + + tos &= IPTOS_RT_MASK; + hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); + + rcu_read_lock(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.rt_next)) { + if (rth->fl.fl4_dst == daddr && + rth->fl.fl4_src == saddr && + rth->fl.iif == iif && + rth->fl.oif == 0 && +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == skb->nfmark && +#endif + rth->fl.fl4_tos == tos) { + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); + rth->u.dst.__use++; + RT_CACHE_STAT_INC(in_hit); + rcu_read_unlock(); + skb->dst = (struct dst_entry*)rth; + return 0; + } + RT_CACHE_STAT_INC(in_hlist_search); + } + rcu_read_unlock(); + + /* Multicast recognition logic is moved from route cache to here. + The problem was that too many Ethernet cards have broken/missing + hardware multicast filters :-( As result the host on multicasting + network acquires a lot of useless route cache entries, sort of + SDR messages from all the world. Now we try to get rid of them. + Really, provided software IP multicast filter is organized + reasonably (at least, hashed), it does not result in a slowdown + comparing with route cache reject entries. + Note, that multicast routers are not affected, because + route cache entry is created eventually. 
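+
+	   So a multicast destination that misses the cache is first checked
+	   against our own filter with ip_check_mc(); only if we are a member
+	   (or a multicast router, with CONFIG_IP_MROUTE) is a cache entry
+	   built via ip_route_input_mc(), otherwise the packet is rejected
+	   without polluting the cache.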
+ */ + if (MULTICAST(daddr)) { + struct in_device *in_dev; + + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev)) != NULL) { + int our = ip_check_mc(in_dev, daddr, saddr, + skb->nh.iph->protocol); + if (our +#ifdef CONFIG_IP_MROUTE + || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) +#endif + ) { + rcu_read_unlock(); + return ip_route_input_mc(skb, daddr, saddr, + tos, dev, our); + } + } + rcu_read_unlock(); + return -EINVAL; + } + return ip_route_input_slow(skb, daddr, saddr, tos, dev); +} + +static inline int __mkroute_output(struct rtable **result, + struct fib_result* res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) +{ + struct rtable *rth; + struct in_device *in_dev; + u32 tos = RT_FL_TOS(oldflp); + int err = 0; + + if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + return -EINVAL; + + if (fl->fl4_dst == 0xFFFFFFFF) + res->type = RTN_BROADCAST; + else if (MULTICAST(fl->fl4_dst)) + res->type = RTN_MULTICAST; + else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) + return -EINVAL; + + if (dev_out->flags & IFF_LOOPBACK) + flags |= RTCF_LOCAL; + + /* get work reference to inet device */ + in_dev = in_dev_get(dev_out); + if (!in_dev) + return -EINVAL; + + if (res->type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST | RTCF_LOCAL; + if (res->fi) { + fib_info_put(res->fi); + res->fi = NULL; + } + } else if (res->type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST|RTCF_LOCAL; + if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, + oldflp->proto)) + flags &= ~RTCF_LOCAL; + /* If multicast route do not exist use + default one, but do not gateway in this case. + Yes, it is hack. + */ + if (res->fi && res->prefixlen < 4) { + fib_info_put(res->fi); + res->fi = NULL; + } + } + + + rth = dst_alloc(&ipv4_dst_ops); + if (!rth) { + err = -ENOBUFS; + goto cleanup; + } + + rth->u.dst.flags= DST_HOST; +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (res->fi) { + rth->rt_multipath_alg = res->fi->fib_mp_alg; + if (res->fi->fib_nhs > 1) + rth->u.dst.flags |= DST_BALANCED; + } +#endif + if (in_dev->cnf.no_xfrm) + rth->u.dst.flags |= DST_NOXFRM; + if (in_dev->cnf.no_policy) + rth->u.dst.flags |= DST_NOPOLICY; + + rth->fl.fl4_dst = oldflp->fl4_dst; + rth->fl.fl4_tos = tos; + rth->fl.fl4_src = oldflp->fl4_src; + rth->fl.oif = oldflp->oif; +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark= oldflp->fl4_fwmark; +#endif + rth->rt_dst = fl->fl4_dst; + rth->rt_src = fl->fl4_src; + rth->rt_iif = oldflp->oif ? 
: dev_out->ifindex; + /* get references to the devices that are to be hold by the routing + cache entry */ + rth->u.dst.dev = dev_out; + dev_hold(dev_out); + rth->idev = in_dev_get(dev_out); + rth->rt_gateway = fl->fl4_dst; + rth->rt_spec_dst= fl->fl4_src; + + rth->u.dst.output=ip_output; + + RT_CACHE_STAT_INC(out_slow_tot); + + if (flags & RTCF_LOCAL) { + rth->u.dst.input = ip_local_deliver; + rth->rt_spec_dst = fl->fl4_dst; + } + if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { + rth->rt_spec_dst = fl->fl4_src; + if (flags & RTCF_LOCAL && + !(dev_out->flags & IFF_LOOPBACK)) { + rth->u.dst.output = ip_mc_output; + RT_CACHE_STAT_INC(out_slow_mc); + } +#ifdef CONFIG_IP_MROUTE + if (res->type == RTN_MULTICAST) { + if (IN_DEV_MFORWARD(in_dev) && + !LOCAL_MCAST(oldflp->fl4_dst)) { + rth->u.dst.input = ip_mr_input; + rth->u.dst.output = ip_mc_output; + } + } +#endif + } + + rt_set_nexthop(rth, res, 0); + + rth->rt_flags = flags; + + *result = rth; + cleanup: + /* release work reference to inet device */ + in_dev_put(in_dev); + + return err; +} + +static inline int ip_mkroute_output_def(struct rtable **rp, + struct fib_result* res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) +{ + struct rtable *rth; + int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); + unsigned hash; + if (err == 0) { + u32 tos = RT_FL_TOS(oldflp); + + atomic_set(&rth->u.dst.__refcnt, 1); + + hash = rt_hash_code(oldflp->fl4_dst, + oldflp->fl4_src ^ (oldflp->oif << 5), tos); + err = rt_intern_hash(hash, rth, rp); + } + + return err; +} + +static inline int ip_mkroute_output(struct rtable** rp, + struct fib_result* res, + const struct flowi *fl, + const struct flowi *oldflp, + struct net_device *dev_out, + unsigned flags) +{ +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + u32 tos = RT_FL_TOS(oldflp); + unsigned char hop; + unsigned hash; + int err = -EINVAL; + struct rtable *rth; + + if (res->fi && res->fi->fib_nhs > 1) { + unsigned char hopcount = res->fi->fib_nhs; + + for (hop = 0; hop < hopcount; hop++) { + struct net_device *dev2nexthop; + + res->nh_sel = hop; + + /* hold a work reference to the output device */ + dev2nexthop = FIB_RES_DEV(*res); + dev_hold(dev2nexthop); + + err = __mkroute_output(&rth, res, fl, oldflp, + dev2nexthop, flags); + + if (err != 0) + goto cleanup; + + hash = rt_hash_code(oldflp->fl4_dst, + oldflp->fl4_src ^ + (oldflp->oif << 5), tos); + err = rt_intern_hash(hash, rth, rp); + + /* forward hop information to multipath impl. */ + multipath_set_nhinfo(rth, + FIB_RES_NETWORK(*res), + FIB_RES_NETMASK(*res), + res->prefixlen, + &FIB_RES_NH(*res)); + cleanup: + /* release work reference to output device */ + dev_put(dev2nexthop); + + if (err != 0) + return err; + } + atomic_set(&(*rp)->u.dst.__refcnt, 1); + return err; + } else { + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, + flags); + } +#else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ + return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); +#endif +} + +/* + * Major route resolver routine. + */ + +static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) +{ + u32 tos = RT_FL_TOS(oldflp); + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = oldflp->fl4_dst, + .saddr = oldflp->fl4_src, + .tos = tos & IPTOS_RT_MASK, + .scope = ((tos & RTO_ONLINK) ? 
+ RT_SCOPE_LINK : + RT_SCOPE_UNIVERSE), +#ifdef CONFIG_IP_ROUTE_FWMARK + .fwmark = oldflp->fl4_fwmark +#endif + } }, + .iif = loopback_dev.ifindex, + .oif = oldflp->oif }; + struct fib_result res; + unsigned flags = 0; + struct net_device *dev_out = NULL; + int free_res = 0; + int err; + + + res.fi = NULL; +#ifdef CONFIG_IP_MULTIPLE_TABLES + res.r = NULL; +#endif + + if (oldflp->fl4_src) { + err = -EINVAL; + if (MULTICAST(oldflp->fl4_src) || + BADCLASS(oldflp->fl4_src) || + ZERONET(oldflp->fl4_src)) + goto out; + + /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ + dev_out = ip_dev_find(oldflp->fl4_src); + if (dev_out == NULL) + goto out; + + /* I removed check for oif == dev_out->oif here. + It was wrong for two reasons: + 1. ip_dev_find(saddr) can return wrong iface, if saddr is + assigned to multiple interfaces. + 2. Moreover, we are allowed to send packets with saddr + of another iface. --ANK + */ + + if (oldflp->oif == 0 + && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) { + /* Special hack: user can direct multicasts + and limited broadcast via necessary interface + without fiddling with IP_MULTICAST_IF or IP_PKTINFO. + This hack is not just for fun, it allows + vic,vat and friends to work. + They bind socket to loopback, set ttl to zero + and expect that it will work. + From the viewpoint of routing cache they are broken, + because we are not allowed to build multicast path + with loopback source addr (look, routing cache + cannot know, that ttl is zero, so that packet + will not leave this host and route is valid). + Luckily, this hack is good workaround. + */ + + fl.oif = dev_out->ifindex; + goto make_route; + } + if (dev_out) + dev_put(dev_out); + dev_out = NULL; + } + + + if (oldflp->oif) { + dev_out = dev_get_by_index(oldflp->oif); + err = -ENODEV; + if (dev_out == NULL) + goto out; + if (__in_dev_get(dev_out) == NULL) { + dev_put(dev_out); + goto out; /* Wrong error code */ + } + + if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) { + if (!fl.fl4_src) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); + goto make_route; + } + if (!fl.fl4_src) { + if (MULTICAST(oldflp->fl4_dst)) + fl.fl4_src = inet_select_addr(dev_out, 0, + fl.fl4_scope); + else if (!oldflp->fl4_dst) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_HOST); + } + } + + if (!fl.fl4_dst) { + fl.fl4_dst = fl.fl4_src; + if (!fl.fl4_dst) + fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + if (dev_out) + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + fl.oif = loopback_dev.ifindex; + res.type = RTN_LOCAL; + flags |= RTCF_LOCAL; + goto make_route; + } + + if (fib_lookup(&fl, &res)) { + res.fi = NULL; + if (oldflp->oif) { + /* Apparently, routing tables are wrong. Assume, + that the destination is on link. + + WHY? DW. + Because we are allowed to send to iface + even if it has NO routes and NO assigned + addresses. When oif is specified, routing + tables are looked up with only one purpose: + to catch if destination is gatewayed, rather than + direct. Moreover, if MSG_DONTROUTE is set, + we send packet, ignoring both routing tables + and ifaddr state. --ANK + + + We could make it even if oif is unknown, + likely IPv6, but we do not. 
+ */ + + if (fl.fl4_src == 0) + fl.fl4_src = inet_select_addr(dev_out, 0, + RT_SCOPE_LINK); + res.type = RTN_UNICAST; + goto make_route; + } + if (dev_out) + dev_put(dev_out); + err = -ENETUNREACH; + goto out; + } + free_res = 1; + + if (res.type == RTN_LOCAL) { + if (!fl.fl4_src) + fl.fl4_src = fl.fl4_dst; + if (dev_out) + dev_put(dev_out); + dev_out = &loopback_dev; + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + if (res.fi) + fib_info_put(res.fi); + res.fi = NULL; + flags |= RTCF_LOCAL; + goto make_route; + } + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + if (res.fi->fib_nhs > 1 && fl.oif == 0) + fib_select_multipath(&fl, &res); + else +#endif + if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) + fib_select_default(&fl, &res); + + if (!fl.fl4_src) + fl.fl4_src = FIB_RES_PREFSRC(res); + + if (dev_out) + dev_put(dev_out); + dev_out = FIB_RES_DEV(res); + dev_hold(dev_out); + fl.oif = dev_out->ifindex; + + +make_route: + err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); + + + if (free_res) + fib_res_put(&res); + if (dev_out) + dev_put(dev_out); +out: return err; +} + +int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) +{ + unsigned hash; + struct rtable *rth; + + hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); + + rcu_read_lock_bh(); + for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; + rth = rcu_dereference(rth->u.rt_next)) { + if (rth->fl.fl4_dst == flp->fl4_dst && + rth->fl.fl4_src == flp->fl4_src && + rth->fl.iif == 0 && + rth->fl.oif == flp->oif && +#ifdef CONFIG_IP_ROUTE_FWMARK + rth->fl.fl4_fwmark == flp->fl4_fwmark && +#endif + !((rth->fl.fl4_tos ^ flp->fl4_tos) & + (IPTOS_RT_MASK | RTO_ONLINK))) { + + /* check for multipath routes and choose one if + * necessary + */ + if (multipath_select_route(flp, rth, rp)) { + dst_hold(&(*rp)->u.dst); + RT_CACHE_STAT_INC(out_hit); + rcu_read_unlock_bh(); + return 0; + } + + rth->u.dst.lastuse = jiffies; + dst_hold(&rth->u.dst); + rth->u.dst.__use++; + RT_CACHE_STAT_INC(out_hit); + rcu_read_unlock_bh(); + *rp = rth; + return 0; + } + RT_CACHE_STAT_INC(out_hlist_search); + } + rcu_read_unlock_bh(); + + return ip_route_output_slow(rp, flp); +} + +int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) +{ + int err; + + if ((err = __ip_route_output_key(rp, flp)) != 0) + return err; + + if (flp->proto) { + if (!flp->fl4_src) + flp->fl4_src = (*rp)->rt_src; + if (!flp->fl4_dst) + flp->fl4_dst = (*rp)->rt_dst; + return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); + } + + return 0; +} + +int ip_route_output_key(struct rtable **rp, struct flowi *flp) +{ + return ip_route_output_flow(rp, flp, NULL, 0); +} + +static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, + int nowait) +{ + struct rtable *rt = (struct rtable*)skb->dst; + struct rtmsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; +#ifdef CONFIG_IP_MROUTE + struct rtattr *eptr; +#endif + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + r = NLMSG_DATA(nlh); + nlh->nlmsg_flags = (nowait && pid) ? 
NLM_F_MULTI : 0; + r->rtm_family = AF_INET; + r->rtm_dst_len = 32; + r->rtm_src_len = 0; + r->rtm_tos = rt->fl.fl4_tos; + r->rtm_table = RT_TABLE_MAIN; + r->rtm_type = rt->rt_type; + r->rtm_scope = RT_SCOPE_UNIVERSE; + r->rtm_protocol = RTPROT_UNSPEC; + r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; + if (rt->rt_flags & RTCF_NOTIFY) + r->rtm_flags |= RTM_F_NOTIFY; + RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); + if (rt->fl.fl4_src) { + r->rtm_src_len = 32; + RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src); + } + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); +#ifdef CONFIG_NET_CLS_ROUTE + if (rt->u.dst.tclassid) + RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); +#endif +#ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED + if (rt->rt_multipath_alg != IP_MP_ALG_NONE) { + __u32 alg = rt->rt_multipath_alg; + + RTA_PUT(skb, RTA_MP_ALGO, 4, &alg); + } +#endif + if (rt->fl.iif) + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); + else if (rt->rt_src != rt->fl.fl4_src) + RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); + if (rt->rt_dst != rt->rt_gateway) + RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + goto rtattr_failure; + ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); + ci.rta_used = rt->u.dst.__use; + ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); + if (rt->u.dst.expires) + ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies); + else + ci.rta_expires = 0; + ci.rta_error = rt->u.dst.error; + ci.rta_id = ci.rta_ts = ci.rta_tsage = 0; + if (rt->peer) { + ci.rta_id = rt->peer->ip_id_count; + if (rt->peer->tcp_ts_stamp) { + ci.rta_ts = rt->peer->tcp_ts; + ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; + } + } +#ifdef CONFIG_IP_MROUTE + eptr = (struct rtattr*)skb->tail; +#endif + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); + if (rt->fl.iif) { +#ifdef CONFIG_IP_MROUTE + u32 dst = rt->rt_dst; + + if (MULTICAST(dst) && !LOCAL_MCAST(dst) && + ipv4_devconf.mc_forwarding) { + int err = ipmr_get_route(skb, r, nowait); + if (err <= 0) { + if (!nowait) { + if (err == 0) + return 0; + goto nlmsg_failure; + } else { + if (err == -EMSGSIZE) + goto nlmsg_failure; + ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; + } + } + } else +#endif + RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + struct rtmsg *rtm = NLMSG_DATA(nlh); + struct rtable *rt = NULL; + u32 dst = 0; + u32 src = 0; + int iif = 0; + int err = -ENOBUFS; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + goto out; + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. 
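+
+	   The skb carries no real data; it only has to look enough like a
+	   received IP frame for the ip_route_input() path below (taken when
+	   RTA_IIF is supplied) not to trip over the missing headers.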
+ */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + + if (rta[RTA_SRC - 1]) + memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4); + if (rta[RTA_DST - 1]) + memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4); + if (rta[RTA_IIF - 1]) + memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int)); + + if (iif) { + struct net_device *dev = __dev_get_by_index(iif); + err = -ENODEV; + if (!dev) + goto out_free; + skb->protocol = htons(ETH_P_IP); + skb->dev = dev; + local_bh_disable(); + err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); + local_bh_enable(); + rt = (struct rtable*)skb->dst; + if (!err && rt->u.dst.error) + err = -rt->u.dst.error; + } else { + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst, + .saddr = src, + .tos = rtm->rtm_tos } } }; + int oif = 0; + if (rta[RTA_OIF - 1]) + memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); + fl.oif = oif; + err = ip_route_output_key(&rt, &fl); + } + if (err) + goto out_free; + + skb->dst = &rt->u.dst; + if (rtm->rtm_flags & RTM_F_NOTIFY) + rt->rt_flags |= RTCF_NOTIFY; + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + + err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + RTM_NEWROUTE, 0); + if (!err) + goto out_free; + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; +out: return err; + +out_free: + kfree_skb(skb); + goto out; +} + +int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rtable *rt; + int h, s_h; + int idx, s_idx; + + s_h = cb->args[0]; + s_idx = idx = cb->args[1]; + for (h = 0; h <= rt_hash_mask; h++) { + if (h < s_h) continue; + if (h > s_h) + s_idx = 0; + rcu_read_lock_bh(); + for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; + rt = rcu_dereference(rt->u.rt_next), idx++) { + if (idx < s_idx) + continue; + skb->dst = dst_clone(&rt->u.dst); + if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, + RTM_NEWROUTE, 1) <= 0) { + dst_release(xchg(&skb->dst, NULL)); + rcu_read_unlock_bh(); + goto done; + } + dst_release(xchg(&skb->dst, NULL)); + } + rcu_read_unlock_bh(); + } + +done: + cb->args[0] = h; + cb->args[1] = idx; + return skb->len; +} + +void ip_rt_multicast_event(struct in_device *in_dev) +{ + rt_cache_flush(0); +} + +#ifdef CONFIG_SYSCTL +static int flush_delay; + +static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, + struct file *filp, void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + rt_cache_flush(flush_delay); + return 0; + } + + return -EINVAL; +} + +static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, + int __user *name, + int nlen, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, + size_t newlen, + void **context) +{ + int delay; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(delay, (int __user *)newval)) + return -EFAULT; + rt_cache_flush(delay); + return 0; +} + +ctl_table ipv4_route_table[] = { + { + .ctl_name = NET_IPV4_ROUTE_FLUSH, + .procname = "flush", + .data = &flush_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_sysctl_rtcache_flush, + .strategy = &ipv4_sysctl_rtcache_flush_strategy, + }, + { + .ctl_name = NET_IPV4_ROUTE_MIN_DELAY, + .procname = "min_delay", + .data = &ip_rt_min_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = 
NET_IPV4_ROUTE_MAX_DELAY, + .procname = "max_delay", + .data = &ip_rt_max_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_THRESH, + .procname = "gc_thresh", + .data = &ipv4_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, + .procname = "max_size", + .data = &ip_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + /* Deprecated. Use gc_min_interval_ms */ + + .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, + .procname = "gc_min_interval", + .data = &ip_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, + .procname = "gc_min_interval_ms", + .data = &ip_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, + .procname = "gc_timeout", + .data = &ip_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, + .procname = "gc_interval", + .data = &ip_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, + .procname = "redirect_load", + .data = &ip_rt_redirect_load, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, + .procname = "redirect_number", + .data = &ip_rt_redirect_number, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, + .procname = "redirect_silence", + .data = &ip_rt_redirect_silence, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_ERROR_COST, + .procname = "error_cost", + .data = &ip_rt_error_cost, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, + .procname = "error_burst", + .data = &ip_rt_error_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, + .procname = "gc_elasticity", + .data = &ip_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, + .procname = "mtu_expires", + .data = &ip_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, + .procname = "min_pmtu", + .data = &ip_rt_min_pmtu, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, + .procname = "min_adv_mss", + .data = &ip_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, + .procname = "secret_interval", + .data = &ip_rt_secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { .ctl_name = 0 } +}; +#endif + +#ifdef 
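The ipv4_route_table above is registered as the "route" child of the ipv4 sysctl directory, so its entries show up under /proc/sys/net/ipv4/route/. As a usage sketch (not part of the patch, needs root): writing a delay to the "flush" entry runs ipv4_sysctl_rtcache_flush(), which drops the routing cache, while reading the same file is rejected with EINVAL.

/* Writing an integer to /proc/sys/net/ipv4/route/flush runs
 * ipv4_sysctl_rtcache_flush(), which calls rt_cache_flush() with the
 * written delay; reading the file returns EINVAL.  Illustration only,
 * requires root. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

	if (!f) {
		perror("route/flush");
		return 1;
	}
	fputs("0\n", f);	/* flush the routing cache immediately */
	fclose(f);
	return 0;
}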
CONFIG_NET_CLS_ROUTE +struct ip_rt_acct *ip_rt_acct; + +/* This code sucks. But you should have seen it before! --RR */ + +/* IP route accounting ptr for this logical cpu number. */ +#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256) + +#ifdef CONFIG_PROC_FS +static int ip_rt_acct_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + unsigned int i; + + if ((offset & 3) || (length & 3)) + return -EIO; + + if (offset >= sizeof(struct ip_rt_acct) * 256) { + *eof = 1; + return 0; + } + + if (offset + length >= sizeof(struct ip_rt_acct) * 256) { + length = sizeof(struct ip_rt_acct) * 256 - offset; + *eof = 1; + } + + offset /= sizeof(u32); + + if (length > 0) { + u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset; + u32 *dst = (u32 *) buffer; + + /* Copy first cpu. */ + *start = buffer; + memcpy(dst, src, length); + + /* Add the other cpus in, one int at a time */ + for_each_cpu(i) { + unsigned int j; + + src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; + + for (j = 0; j < length/4; j++) + dst[j] += src[j]; + } + } + return length; +} +#endif /* CONFIG_PROC_FS */ +#endif /* CONFIG_NET_CLS_ROUTE */ + +static __initdata unsigned long rhash_entries; +static int __init set_rhash_entries(char *str) +{ + if (!str) + return 0; + rhash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("rhash_entries=", set_rhash_entries); + +int __init ip_rt_init(void) +{ + int i, order, goal, rc = 0; + + rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ + (jiffies ^ (jiffies >> 7))); + +#ifdef CONFIG_NET_CLS_ROUTE + for (order = 0; + (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) + /* NOTHING */; + ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); + if (!ip_rt_acct) + panic("IP: failed to allocate ip_rt_acct\n"); + memset(ip_rt_acct, 0, PAGE_SIZE << order); +#endif + + ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", + sizeof(struct rtable), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!ipv4_dst_ops.kmem_cachep) + panic("IP: failed to allocate ip_dst_cache\n"); + + goal = num_physpages >> (26 - PAGE_SHIFT); + if (rhash_entries) + goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; + for (order = 0; (1UL << order) < goal; order++) + /* NOTHING */; + + do { + rt_hash_mask = (1UL << order) * PAGE_SIZE / + sizeof(struct rt_hash_bucket); + while (rt_hash_mask & (rt_hash_mask - 1)) + rt_hash_mask--; + rt_hash_table = (struct rt_hash_bucket *) + __get_free_pages(GFP_ATOMIC, order); + } while (rt_hash_table == NULL && --order > 0); + + if (!rt_hash_table) + panic("Failed to allocate IP route cache hash table\n"); + + printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", + rt_hash_mask, + (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); + + for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) + /* NOTHING */; + + rt_hash_mask--; + for (i = 0; i <= rt_hash_mask; i++) { + spin_lock_init(&rt_hash_table[i].lock); + rt_hash_table[i].chain = NULL; + } + + ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); + ip_rt_max_size = (rt_hash_mask + 1) * 16; + + rt_cache_stat = alloc_percpu(struct rt_cache_stat); + if (!rt_cache_stat) + return -ENOMEM; + + devinet_init(); + ip_fib_init(); + + init_timer(&rt_flush_timer); + rt_flush_timer.function = rt_run_flush; + init_timer(&rt_periodic_timer); + rt_periodic_timer.function = rt_check_expire; + init_timer(&rt_secret_timer); + rt_secret_timer.function = rt_secret_rebuild; + + /* All the timers, started at system 
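ip_rt_init() above sizes the route cache by picking a page order for the goal bucket count, rounding the resulting bucket count down to a power of two, and deriving rt_hash_log and rt_hash_mask from it. A standalone model of just that arithmetic (the allocation-retry loop is omitted), with made-up input values, for illustration:

/* Standalone model of the hash sizing in ip_rt_init(): pick a page order,
 * round the bucket count down to a power of two, then derive the log2 and
 * the final index mask.  PAGE_SIZE, the bucket size and the goal are
 * example values, not read from a real machine. */
#include <stdio.h>

#define PAGE_SIZE   4096UL
#define BUCKET_SIZE 8UL		/* pretend sizeof(struct rt_hash_bucket) */

int main(void)
{
	unsigned long goal = 16384;	/* pretend num_physpages >> (26 - PAGE_SHIFT) */
	unsigned long order, mask, log;

	for (order = 0; (1UL << order) < goal; order++)
		/* nothing */;

	mask = (1UL << order) * PAGE_SIZE / BUCKET_SIZE;
	while (mask & (mask - 1))	/* round down to a power of two */
		mask--;

	for (log = 0; (1UL << log) != mask; log++)
		/* nothing */;

	mask--;				/* becomes an index mask */
	printf("order=%lu buckets=%lu rt_hash_log=%lu rt_hash_mask=%#lx\n",
	       order, mask + 1, log, mask);
	return 0;
}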
startup tend + to synchronize. Perturb it a bit. + */ + rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + + ip_rt_gc_interval; + add_timer(&rt_periodic_timer); + + rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + + ip_rt_secret_interval; + add_timer(&rt_secret_timer); + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ + if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || + !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, + proc_net_stat))) { + free_percpu(rt_cache_stat); + return -ENOMEM; + } + rtstat_pde->proc_fops = &rt_cpu_seq_fops; + } +#ifdef CONFIG_NET_CLS_ROUTE + create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); +#endif +#endif +#ifdef CONFIG_XFRM + xfrm_init(); + xfrm4_init(); +#endif + return rc; +} + +EXPORT_SYMBOL(__ip_select_ident); +EXPORT_SYMBOL(ip_route_input); +EXPORT_SYMBOL(ip_route_output_key); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c new file mode 100644 index 000000000000..e923d2f021aa --- /dev/null +++ b/net/ipv4/syncookies.c @@ -0,0 +1,279 @@ +/* + * Syncookies implementation for the Linux kernel + * + * Copyright (C) 1997 Andi Kleen + * Based on ideas by D.J.Bernstein and Eric Schenk. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * $Id: syncookies.c,v 1.18 2002/02/01 22:01:04 davem Exp $ + * + * Missing: IPv6 support. + */ + +#include +#include +#include +#include +#include +#include + +extern int sysctl_tcp_syncookies; + +static __u32 syncookie_secret[2][16-3+SHA_DIGEST_WORDS]; + +static __init int init_syncookies(void) +{ + get_random_bytes(syncookie_secret, sizeof(syncookie_secret)); + return 0; +} +module_init(init_syncookies); + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) + +static u32 cookie_hash(u32 saddr, u32 daddr, u32 sport, u32 dport, + u32 count, int c) +{ + __u32 tmp[16 + 5 + SHA_WORKSPACE_WORDS]; + + memcpy(tmp + 3, syncookie_secret[c], sizeof(syncookie_secret[c])); + tmp[0] = saddr; + tmp[1] = daddr; + tmp[2] = (sport << 16) + dport; + tmp[3] = count; + sha_transform(tmp + 16, (__u8 *)tmp, tmp + 16 + 5); + + return tmp[17]; +} + +static __u32 secure_tcp_syn_cookie(__u32 saddr, __u32 daddr, __u16 sport, + __u16 dport, __u32 sseq, __u32 count, + __u32 data) +{ + /* + * Compute the secure sequence number. + * The output should be: + * HASH(sec1,saddr,sport,daddr,dport,sec1) + sseq + (count * 2^24) + * + (HASH(sec2,saddr,sport,daddr,dport,count,sec2) % 2^24). + * Where sseq is their sequence number and count increases every + * minute by 1. + * As an extra hack, we add a small "data" value that encodes the + * MSS into the second hash value. + */ + + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + + sseq + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) + & COOKIEMASK)); +} + +/* + * This retrieves the small "data" value from the syncookie. + * If the syncookie is bad, the data returned will be out of + * range. This must be checked by the caller. + * + * The count value used to generate the cookie must be within + * "maxdiff" if the current (passed-in) "count". The return value + * is (__u32)-1 if this test fails. 
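The comment in secure_tcp_syn_cookie() gives the cookie layout: the minute counter lands in the top COOKIEBITS bits and the (hash + data) term in the low 24 bits, which check_tcp_syn_cookie() further below peels apart again. A toy model of both directions with a stand-in hash instead of sha_transform(), only to show that the decode step recovers the small data value as long as the counter has not advanced by more than maxdiff; it is not the kernel implementation.

/* Toy model of the cookie layout described above.  toy_hash() is an
 * arbitrary stand-in for the SHA-based cookie_hash(); the point is only
 * that decode() gets the "data" value (the MSS index) back while the
 * counter difference stays under maxdiff. */
#include <stdio.h>
#include <stdint.h>

#define COOKIEBITS 24
#define COOKIEMASK (((uint32_t)1 << COOKIEBITS) - 1)

static uint32_t toy_hash(uint32_t saddr, uint32_t daddr, uint32_t ports,
			 uint32_t count, int c)
{
	/* Any fixed mixing function works for the demonstration. */
	uint32_t x = saddr ^ (daddr * 2654435761u) ^ ports ^ count ^ (c * 40503u);
	x ^= x >> 13; x *= 2246822519u; x ^= x >> 16;
	return x;
}

static uint32_t encode(uint32_t s, uint32_t d, uint32_t p,
		       uint32_t sseq, uint32_t count, uint32_t data)
{
	return toy_hash(s, d, p, 0, 0) + sseq + (count << COOKIEBITS) +
	       ((toy_hash(s, d, p, count, 1) + data) & COOKIEMASK);
}

static uint32_t decode(uint32_t cookie, uint32_t s, uint32_t d, uint32_t p,
		       uint32_t sseq, uint32_t count, uint32_t maxdiff)
{
	uint32_t diff;

	/* Strip away the layers, exactly as check_tcp_syn_cookie() does. */
	cookie -= toy_hash(s, d, p, 0, 0) + sseq;
	diff = (count - (cookie >> COOKIEBITS)) & ((uint32_t)-1 >> COOKIEBITS);
	if (diff >= maxdiff)
		return (uint32_t)-1;
	return (cookie - toy_hash(s, d, p, count - diff, 1)) & COOKIEMASK;
}

int main(void)
{
	uint32_t cookie = encode(0x0a000001, 0x0a000002, (1234 << 16) | 80,
				 0x11223344, 100, 6 /* MSS index */);

	/* Two minutes later: count has advanced by 2, still under maxdiff=4,
	 * so the original data value (6) comes back out. */
	printf("decoded data = %u\n",
	       (unsigned)decode(cookie, 0x0a000001, 0x0a000002,
				(1234 << 16) | 80, 0x11223344, 102, 4));
	return 0;
}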
+ */ +static __u32 check_tcp_syn_cookie(__u32 cookie, __u32 saddr, __u32 daddr, + __u16 sport, __u16 dport, __u32 sseq, + __u32 count, __u32 maxdiff) +{ + __u32 diff; + + /* Strip away the layers from the cookie */ + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + /* Cookie is now reduced to (count * 2^24) ^ (hash % 2^24) */ + diff = (count - (cookie >> COOKIEBITS)) & ((__u32) - 1 >> COOKIEBITS); + if (diff >= maxdiff) + return (__u32)-1; + + return (cookie - + cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) + & COOKIEMASK; /* Leaving the data behind */ +} + +/* + * This table has to be sorted and terminated with (__u16)-1. + * XXX generate a better table. + * Unresolved Issues: HIPPI with a 64k MSS is not well supported. + */ +static __u16 const msstab[] = { + 64 - 1, + 256 - 1, + 512 - 1, + 536 - 1, + 1024 - 1, + 1440 - 1, + 1460 - 1, + 4312 - 1, + (__u16)-1 +}; +/* The number doesn't include the -1 terminator */ +#define NUM_MSS (ARRAY_SIZE(msstab) - 1) + +/* + * Generate a syncookie. mssp points to the mss, which is returned + * rounded down to the value encoded in the cookie. + */ +__u32 cookie_v4_init_sequence(struct sock *sk, struct sk_buff *skb, __u16 *mssp) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mssind; + const __u16 mss = *mssp; + + + tp->last_synq_overflow = jiffies; + + /* XXX sort msstab[] by probability? Binary search? */ + for (mssind = 0; mss > msstab[mssind + 1]; mssind++) + ; + *mssp = msstab[mssind] + 1; + + NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESSENT); + + return secure_tcp_syn_cookie(skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->h.th->source, skb->h.th->dest, + ntohl(skb->h.th->seq), + jiffies / (HZ * 60), mssind); +} + +/* + * This (misnamed) value is the age of syncookie which is permitted. + * Its ideal value should be dependent on TCP_TIMEOUT_INIT and + * sysctl_tcp_retries1. It's a rather complicated formula (exponential + * backoff) to compute at runtime so it's currently hardcoded here. + */ +#define COUNTER_TRIES 4 +/* + * Check if a ack sequence number is a valid syncookie. + * Return the decoded mss if it is, or 0 if not. + */ +static inline int cookie_check(struct sk_buff *skb, __u32 cookie) +{ + __u32 seq; + __u32 mssind; + + seq = ntohl(skb->h.th->seq)-1; + mssind = check_tcp_syn_cookie(cookie, + skb->nh.iph->saddr, skb->nh.iph->daddr, + skb->h.th->source, skb->h.th->dest, + seq, jiffies / (HZ * 60), COUNTER_TRIES); + + return mssind < NUM_MSS ? 
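cookie_v4_init_sequence() above never stores the raw MSS; it stores an index into msstab[] and hands the connection a rounded-down MSS. A standalone walk through that lookup with 1400 as an example MSS (it rounds down to 1024, the largest table value not above it):

/* Standalone walk through the msstab[] lookup used when generating a
 * cookie.  The table is copied from above: entries are "limit - 1",
 * terminated by (u16)-1. */
#include <stdio.h>

static const unsigned short msstab[] = {
	63, 255, 511, 535, 1023, 1439, 1459, 4311, 0xffff
};

int main(void)
{
	unsigned short mss = 1400;	/* example MSS advertised in the SYN */
	int mssind;

	/* Find the largest table value not exceeding the advertised MSS,
	 * exactly as cookie_v4_init_sequence() does. */
	for (mssind = 0; mss > msstab[mssind + 1]; mssind++)
		;

	/* The small index is what actually goes into the cookie; the
	 * connection then runs with the rounded-down MSS. */
	printf("index %d encoded, MSS rounded down to %u\n",
	       mssind, msstab[mssind] + 1);
	return 0;
}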
msstab[mssind] + 1 : 0; +} + +extern struct or_calltable or_ipv4; + +static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sock *child; + + child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + if (child) + tcp_acceptq_queue(sk, req, child); + else + tcp_openreq_free(req); + + return child; +} + +struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, + struct ip_options *opt) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 cookie = ntohl(skb->h.th->ack_seq) - 1; + struct sock *ret = sk; + struct open_request *req; + int mss; + struct rtable *rt; + __u8 rcv_wscale; + + if (!sysctl_tcp_syncookies || !skb->h.th->ack) + goto out; + + if (time_after(jiffies, tp->last_synq_overflow + TCP_TIMEOUT_INIT) || + (mss = cookie_check(skb, cookie)) == 0) { + NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESFAILED); + goto out; + } + + NET_INC_STATS_BH(LINUX_MIB_SYNCOOKIESRECV); + + req = tcp_openreq_alloc(); + ret = NULL; + if (!req) + goto out; + + req->rcv_isn = htonl(skb->h.th->seq) - 1; + req->snt_isn = cookie; + req->mss = mss; + req->rmt_port = skb->h.th->source; + req->af.v4_req.loc_addr = skb->nh.iph->daddr; + req->af.v4_req.rmt_addr = skb->nh.iph->saddr; + req->class = &or_ipv4; /* for savety */ + req->af.v4_req.opt = NULL; + + /* We throwed the options of the initial SYN away, so we hope + * the ACK carries the same options again (see RFC1122 4.2.3.8) + */ + if (opt && opt->optlen) { + int opt_size = sizeof(struct ip_options) + opt->optlen; + + req->af.v4_req.opt = kmalloc(opt_size, GFP_ATOMIC); + if (req->af.v4_req.opt) { + if (ip_options_echo(req->af.v4_req.opt, skb)) { + kfree(req->af.v4_req.opt); + req->af.v4_req.opt = NULL; + } + } + } + + req->snd_wscale = req->rcv_wscale = req->tstamp_ok = 0; + req->wscale_ok = req->sack_ok = 0; + req->expires = 0UL; + req->retrans = 0; + + /* + * We need to lookup the route here to get at the correct + * window size. We should better make sure that the window size + * hasn't changed since we received the original syn, but I see + * no easy way to do this. + */ + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + .saddr = req->af.v4_req.loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = skb->h.th->dest, + .dport = skb->h.th->source } } }; + if (ip_route_output_key(&rt, &fl)) { + tcp_openreq_free(req); + goto out; + } + } + + /* Try to redo what tcp_v4_send_synack did. */ + req->window_clamp = dst_metric(&rt->u.dst, RTAX_WINDOW); + tcp_select_initial_window(tcp_full_space(sk), req->mss, + &req->rcv_wnd, &req->window_clamp, + 0, &rcv_wscale); + /* BTW win scale with syncookies is 0 by definition */ + req->rcv_wscale = rcv_wscale; + + ret = get_cookie_sock(sk, skb, req, &rt->u.dst); +out: return ret; +} diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c new file mode 100644 index 000000000000..3aafb298c1c1 --- /dev/null +++ b/net/ipv4/sysctl_net_ipv4.c @@ -0,0 +1,698 @@ +/* + * sysctl_net_ipv4.c: sysctl interface to net IPV4 subsystem. + * + * $Id: sysctl_net_ipv4.c,v 1.50 2001/10/20 00:00:11 davem Exp $ + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipv4 directory entry (empty =) ). 
[MS] + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* From af_inet.c */ +extern int sysctl_ip_nonlocal_bind; + +/* From icmp.c */ +extern int sysctl_icmp_echo_ignore_all; +extern int sysctl_icmp_echo_ignore_broadcasts; +extern int sysctl_icmp_ignore_bogus_error_responses; + +/* From ip_fragment.c */ +extern int sysctl_ipfrag_low_thresh; +extern int sysctl_ipfrag_high_thresh; +extern int sysctl_ipfrag_time; +extern int sysctl_ipfrag_secret_interval; + +/* From ip_output.c */ +extern int sysctl_ip_dynaddr; + +/* From icmp.c */ +extern int sysctl_icmp_ratelimit; +extern int sysctl_icmp_ratemask; + +/* From igmp.c */ +extern int sysctl_igmp_max_memberships; +extern int sysctl_igmp_max_msf; + +/* From inetpeer.c */ +extern int inet_peer_threshold; +extern int inet_peer_minttl; +extern int inet_peer_maxttl; +extern int inet_peer_gc_mintime; +extern int inet_peer_gc_maxtime; + +#ifdef CONFIG_SYSCTL +static int tcp_retr1_max = 255; +static int ip_local_port_range_min[] = { 1, 1 }; +static int ip_local_port_range_max[] = { 65535, 65535 }; +#endif + +struct ipv4_config ipv4_config; + +extern ctl_table ipv4_route_table[]; + +#ifdef CONFIG_SYSCTL + +static +int ipv4_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int val = ipv4_devconf.forwarding; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && ipv4_devconf.forwarding != val) + inet_forward_change(); + + return ret; +} + +static int ipv4_sysctl_forward_strategy(ctl_table *table, + int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + + if (newlen != sizeof(int)) + return -EINVAL; + + if (get_user(new, (int __user *)newval)) + return -EFAULT; + + if (new == *valp) + return 0; + + if (oldval && oldlenp) { + size_t len; + + if (get_user(len, oldlenp)) + return -EFAULT; + + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + *valp = new; + inet_forward_change(); + return 1; +} + +ctl_table ipv4_table[] = { + { + .ctl_name = NET_IPV4_TCP_TIMESTAMPS, + .procname = "tcp_timestamps", + .data = &sysctl_tcp_timestamps, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_WINDOW_SCALING, + .procname = "tcp_window_scaling", + .data = &sysctl_tcp_window_scaling, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_SACK, + .procname = "tcp_sack", + .data = &sysctl_tcp_sack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_RETRANS_COLLAPSE, + .procname = "tcp_retrans_collapse", + .data = &sysctl_tcp_retrans_collapse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_FORWARD, + .procname = "ip_forward", + .data = &ipv4_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_sysctl_forward, + .strategy = &ipv4_sysctl_forward_strategy + }, + { + .ctl_name = NET_IPV4_DEFAULT_TTL, + .procname = "ip_default_ttl", + .data = &sysctl_ip_default_ttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv4_doint_and_flush, + .strategy = &ipv4_doint_and_flush_strategy, + }, + { + 
.ctl_name = NET_IPV4_AUTOCONFIG, + .procname = "ip_autoconfig", + .data = &ipv4_config.autoconfig, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_NO_PMTU_DISC, + .procname = "ip_no_pmtu_disc", + .data = &ipv4_config.no_pmtu_disc, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_NONLOCAL_BIND, + .procname = "ip_nonlocal_bind", + .data = &sysctl_ip_nonlocal_bind, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_SYN_RETRIES, + .procname = "tcp_syn_retries", + .data = &sysctl_tcp_syn_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_SYNACK_RETRIES, + .procname = "tcp_synack_retries", + .data = &sysctl_tcp_synack_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_ORPHANS, + .procname = "tcp_max_orphans", + .data = &sysctl_tcp_max_orphans, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_TW_BUCKETS, + .procname = "tcp_max_tw_buckets", + .data = &sysctl_tcp_max_tw_buckets, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_HIGH_THRESH, + .procname = "ipfrag_high_thresh", + .data = &sysctl_ipfrag_high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_LOW_THRESH, + .procname = "ipfrag_low_thresh", + .data = &sysctl_ipfrag_low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_DYNADDR, + .procname = "ip_dynaddr", + .data = &sysctl_ip_dynaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_TIME, + .procname = "ipfrag_time", + .data = &sysctl_ipfrag_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_KEEPALIVE_TIME, + .procname = "tcp_keepalive_time", + .data = &sysctl_tcp_keepalive_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_KEEPALIVE_PROBES, + .procname = "tcp_keepalive_probes", + .data = &sysctl_tcp_keepalive_probes, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_KEEPALIVE_INTVL, + .procname = "tcp_keepalive_intvl", + .data = &sysctl_tcp_keepalive_intvl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_TCP_RETRIES1, + .procname = "tcp_retries1", + .data = &sysctl_tcp_retries1, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra2 = &tcp_retr1_max + }, + { + .ctl_name = NET_IPV4_TCP_RETRIES2, + .procname = "tcp_retries2", + .data = &sysctl_tcp_retries2, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_TCP_FIN_TIMEOUT, + .procname = "tcp_fin_timeout", + .data = &sysctl_tcp_fin_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, +#ifdef CONFIG_SYN_COOKIES + { + .ctl_name = NET_TCP_SYNCOOKIES, + .procname = 
"tcp_syncookies", + .data = &sysctl_tcp_syncookies, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif + { + .ctl_name = NET_TCP_TW_RECYCLE, + .procname = "tcp_tw_recycle", + .data = &sysctl_tcp_tw_recycle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_ABORT_ON_OVERFLOW, + .procname = "tcp_abort_on_overflow", + .data = &sysctl_tcp_abort_on_overflow, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_STDURG, + .procname = "tcp_stdurg", + .data = &sysctl_tcp_stdurg, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_RFC1337, + .procname = "tcp_rfc1337", + .data = &sysctl_tcp_rfc1337, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MAX_SYN_BACKLOG, + .procname = "tcp_max_syn_backlog", + .data = &sysctl_max_syn_backlog, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_LOCAL_PORT_RANGE, + .procname = "ip_local_port_range", + .data = &sysctl_local_port_range, + .maxlen = sizeof(sysctl_local_port_range), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = ip_local_port_range_min, + .extra2 = ip_local_port_range_max + }, + { + .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_ALL, + .procname = "icmp_echo_ignore_all", + .data = &sysctl_icmp_echo_ignore_all, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_ECHO_IGNORE_BROADCASTS, + .procname = "icmp_echo_ignore_broadcasts", + .data = &sysctl_icmp_echo_ignore_broadcasts, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_IGNORE_BOGUS_ERROR_RESPONSES, + .procname = "icmp_ignore_bogus_error_responses", + .data = &sysctl_icmp_ignore_bogus_error_responses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ROUTE, + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = ipv4_route_table + }, +#ifdef CONFIG_IP_MULTICAST + { + .ctl_name = NET_IPV4_IGMP_MAX_MEMBERSHIPS, + .procname = "igmp_max_memberships", + .data = &sysctl_igmp_max_memberships, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + +#endif + { + .ctl_name = NET_IPV4_IGMP_MAX_MSF, + .procname = "igmp_max_msf", + .data = &sysctl_igmp_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_INET_PEER_THRESHOLD, + .procname = "inet_peer_threshold", + .data = &inet_peer_threshold, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_INET_PEER_MINTTL, + .procname = "inet_peer_minttl", + .data = &inet_peer_minttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_INET_PEER_MAXTTL, + .procname = "inet_peer_maxttl", + .data = &inet_peer_maxttl, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_INET_PEER_GC_MINTIME, + .procname = "inet_peer_gc_mintime", + .data = &inet_peer_gc_mintime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV4_INET_PEER_GC_MAXTIME, + 
.procname = "inet_peer_gc_maxtime", + .data = &inet_peer_gc_maxtime, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_TCP_ORPHAN_RETRIES, + .procname = "tcp_orphan_retries", + .data = &sysctl_tcp_orphan_retries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_FACK, + .procname = "tcp_fack", + .data = &sysctl_tcp_fack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_REORDERING, + .procname = "tcp_reordering", + .data = &sysctl_tcp_reordering, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_ECN, + .procname = "tcp_ecn", + .data = &sysctl_tcp_ecn, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_DSACK, + .procname = "tcp_dsack", + .data = &sysctl_tcp_dsack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_MEM, + .procname = "tcp_mem", + .data = &sysctl_tcp_mem, + .maxlen = sizeof(sysctl_tcp_mem), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_WMEM, + .procname = "tcp_wmem", + .data = &sysctl_tcp_wmem, + .maxlen = sizeof(sysctl_tcp_wmem), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_RMEM, + .procname = "tcp_rmem", + .data = &sysctl_tcp_rmem, + .maxlen = sizeof(sysctl_tcp_rmem), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_APP_WIN, + .procname = "tcp_app_win", + .data = &sysctl_tcp_app_win, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_ADV_WIN_SCALE, + .procname = "tcp_adv_win_scale", + .data = &sysctl_tcp_adv_win_scale, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_RATELIMIT, + .procname = "icmp_ratelimit", + .data = &sysctl_icmp_ratelimit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_ICMP_RATEMASK, + .procname = "icmp_ratemask", + .data = &sysctl_icmp_ratemask, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_TW_REUSE, + .procname = "tcp_tw_reuse", + .data = &sysctl_tcp_tw_reuse, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_FRTO, + .procname = "tcp_frto", + .data = &sysctl_tcp_frto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_TCP_LOW_LATENCY, + .procname = "tcp_low_latency", + .data = &sysctl_tcp_low_latency, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV4_IPFRAG_SECRET_INTERVAL, + .procname = "ipfrag_secret_interval", + .data = &sysctl_ipfrag_secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_TCP_NO_METRICS_SAVE, + .procname = "tcp_no_metrics_save", + .data = &sysctl_tcp_nometrics_save, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_WESTWOOD, + .procname = "tcp_westwood", + .data = &sysctl_tcp_westwood, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS, + .procname = "tcp_vegas_cong_avoid", + 
.data = &sysctl_tcp_vegas_cong_avoid, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_ALPHA, + .procname = "tcp_vegas_alpha", + .data = &sysctl_tcp_vegas_alpha, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_BETA, + .procname = "tcp_vegas_beta", + .data = &sysctl_tcp_vegas_beta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_VEGAS_GAMMA, + .procname = "tcp_vegas_gamma", + .data = &sysctl_tcp_vegas_gamma, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC, + .procname = "tcp_bic", + .data = &sysctl_tcp_bic, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_FAST_CONVERGENCE, + .procname = "tcp_bic_fast_convergence", + .data = &sysctl_tcp_bic_fast_convergence, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_LOW_WINDOW, + .procname = "tcp_bic_low_window", + .data = &sysctl_tcp_bic_low_window, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_MODERATE_RCVBUF, + .procname = "tcp_moderate_rcvbuf", + .data = &sysctl_tcp_moderate_rcvbuf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_TSO_WIN_DIVISOR, + .procname = "tcp_tso_win_divisor", + .data = &sysctl_tcp_tso_win_divisor, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_BIC_BETA, + .procname = "tcp_bic_beta", + .data = &sysctl_tcp_bic_beta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { .ctl_name = 0 } +}; + +#endif /* CONFIG_SYSCTL */ + +EXPORT_SYMBOL(ipv4_config); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c new file mode 100644 index 000000000000..5cff56af7855 --- /dev/null +++ b/net/ipv4/tcp.c @@ -0,0 +1,2386 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + * + * Fixes: + * Alan Cox : Numerous verify_area() calls + * Alan Cox : Set the ACK bit on a reset + * Alan Cox : Stopped it crashing if it closed while + * sk->inuse=1 and was trying to connect + * (tcp_err()). + * Alan Cox : All icmp error handling was broken + * pointers passed where wrong and the + * socket was looked up backwards. Nobody + * tested any icmp error code obviously. + * Alan Cox : tcp_err() now handled properly. It + * wakes people on errors. poll + * behaves and the icmp error race + * has gone by moving it into sock.c + * Alan Cox : tcp_send_reset() fixed to work for + * everything not just packets for + * unknown sockets. + * Alan Cox : tcp option processing. + * Alan Cox : Reset tweaked (still not 100%) [Had + * syn rule wrong] + * Herp Rosmanith : More reset fixes + * Alan Cox : No longer acks invalid rst frames. + * Acking any kind of RST is right out. 
+ * Alan Cox : Sets an ignore me flag on an rst + * receive otherwise odd bits of prattle + * escape still + * Alan Cox : Fixed another acking RST frame bug. + * Should stop LAN workplace lockups. + * Alan Cox : Some tidyups using the new skb list + * facilities + * Alan Cox : sk->keepopen now seems to work + * Alan Cox : Pulls options out correctly on accepts + * Alan Cox : Fixed assorted sk->rqueue->next errors + * Alan Cox : PSH doesn't end a TCP read. Switched a + * bit to skb ops. + * Alan Cox : Tidied tcp_data to avoid a potential + * nasty. + * Alan Cox : Added some better commenting, as the + * tcp is hard to follow + * Alan Cox : Removed incorrect check for 20 * psh + * Michael O'Reilly : ack < copied bug fix. + * Johannes Stille : Misc tcp fixes (not all in yet). + * Alan Cox : FIN with no memory -> CRASH + * Alan Cox : Added socket option proto entries. + * Also added awareness of them to accept. + * Alan Cox : Added TCP options (SOL_TCP) + * Alan Cox : Switched wakeup calls to callbacks, + * so the kernel can layer network + * sockets. + * Alan Cox : Use ip_tos/ip_ttl settings. + * Alan Cox : Handle FIN (more) properly (we hope). + * Alan Cox : RST frames sent on unsynchronised + * state ack error. + * Alan Cox : Put in missing check for SYN bit. + * Alan Cox : Added tcp_select_window() aka NET2E + * window non shrink trick. + * Alan Cox : Added a couple of small NET2E timer + * fixes + * Charles Hedrick : TCP fixes + * Toomas Tamm : TCP window fixes + * Alan Cox : Small URG fix to rlogin ^C ack fight + * Charles Hedrick : Rewrote most of it to actually work + * Linus : Rewrote tcp_read() and URG handling + * completely + * Gerhard Koerting: Fixed some missing timer handling + * Matthew Dillon : Reworked TCP machine states as per RFC + * Gerhard Koerting: PC/TCP workarounds + * Adam Caldwell : Assorted timer/timing errors + * Matthew Dillon : Fixed another RST bug + * Alan Cox : Move to kernel side addressing changes. + * Alan Cox : Beginning work on TCP fastpathing + * (not yet usable) + * Arnt Gulbrandsen: Turbocharged tcp_check() routine. + * Alan Cox : TCP fast path debugging + * Alan Cox : Window clamping + * Michael Riepe : Bug in tcp_check() + * Matt Dillon : More TCP improvements and RST bug fixes + * Matt Dillon : Yet more small nasties remove from the + * TCP code (Be very nice to this man if + * tcp finally works 100%) 8) + * Alan Cox : BSD accept semantics. + * Alan Cox : Reset on closedown bug. + * Peter De Schrijver : ENOTCONN check missing in tcp_sendto(). + * Michael Pall : Handle poll() after URG properly in + * all cases. + * Michael Pall : Undo the last fix in tcp_read_urg() + * (multi URG PUSH broke rlogin). + * Michael Pall : Fix the multi URG PUSH problem in + * tcp_readable(), poll() after URG + * works now. + * Michael Pall : recv(...,MSG_OOB) never blocks in the + * BSD api. + * Alan Cox : Changed the semantics of sk->socket to + * fix a race and a signal problem with + * accept() and async I/O. + * Alan Cox : Relaxed the rules on tcp_sendto(). + * Yury Shevchuk : Really fixed accept() blocking problem. + * Craig I. Hagan : Allow for BSD compatible TIME_WAIT for + * clients/servers which listen in on + * fixed ports. + * Alan Cox : Cleaned the above up and shrank it to + * a sensible code size. + * Alan Cox : Self connect lockup fix. + * Alan Cox : No connect to multicast. + * Ross Biro : Close unaccepted children on master + * socket close. + * Alan Cox : Reset tracing code. + * Alan Cox : Spurious resets on shutdown. 
+ * Alan Cox : Giant 15 minute/60 second timer error + * Alan Cox : Small whoops in polling before an + * accept. + * Alan Cox : Kept the state trace facility since + * it's handy for debugging. + * Alan Cox : More reset handler fixes. + * Alan Cox : Started rewriting the code based on + * the RFC's for other useful protocol + * references see: Comer, KA9Q NOS, and + * for a reference on the difference + * between specifications and how BSD + * works see the 4.4lite source. + * A.N.Kuznetsov : Don't time wait on completion of tidy + * close. + * Linus Torvalds : Fin/Shutdown & copied_seq changes. + * Linus Torvalds : Fixed BSD port reuse to work first syn + * Alan Cox : Reimplemented timers as per the RFC + * and using multiple timers for sanity. + * Alan Cox : Small bug fixes, and a lot of new + * comments. + * Alan Cox : Fixed dual reader crash by locking + * the buffers (much like datagram.c) + * Alan Cox : Fixed stuck sockets in probe. A probe + * now gets fed up of retrying without + * (even a no space) answer. + * Alan Cox : Extracted closing code better + * Alan Cox : Fixed the closing state machine to + * resemble the RFC. + * Alan Cox : More 'per spec' fixes. + * Jorge Cwik : Even faster checksumming. + * Alan Cox : tcp_data() doesn't ack illegal PSH + * only frames. At least one pc tcp stack + * generates them. + * Alan Cox : Cache last socket. + * Alan Cox : Per route irtt. + * Matt Day : poll()->select() match BSD precisely on error + * Alan Cox : New buffers + * Marc Tamsky : Various sk->prot->retransmits and + * sk->retransmits misupdating fixed. + * Fixed tcp_write_timeout: stuck close, + * and TCP syn retries gets used now. + * Mark Yarvis : In tcp_read_wakeup(), don't send an + * ack if state is TCP_CLOSED. + * Alan Cox : Look up device on a retransmit - routes may + * change. Doesn't yet cope with MSS shrink right + * but it's a start! + * Marc Tamsky : Closing in closing fixes. + * Mike Shaver : RFC1122 verifications. + * Alan Cox : rcv_saddr errors. + * Alan Cox : Block double connect(). + * Alan Cox : Small hooks for enSKIP. + * Alexey Kuznetsov: Path MTU discovery. + * Alan Cox : Support soft errors. + * Alan Cox : Fix MTU discovery pathological case + * when the remote claims no mtu! + * Marc Tamsky : TCP_CLOSE fix. + * Colin (G3TNE) : Send a reset on syn ack replies in + * window but wrong (fixes NT lpd problems) + * Pedro Roque : Better TCP window handling, delayed ack. + * Joerg Reuter : No modification of locked buffers in + * tcp_do_retransmit() + * Eric Schenk : Changed receiver side silly window + * avoidance algorithm to BSD style + * algorithm. This doubles throughput + * against machines running Solaris, + * and seems to result in general + * improvement. + * Stefan Magdalinski : adjusted tcp_readable() to fix FIONREAD + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * Keith Owens : Do proper merging with partial SKB's in + * tcp_do_sendmsg to avoid burstiness. + * Eric Schenk : Fix fast close down bug with + * shutdown() followed by close(). + * Andi Kleen : Make poll agree with SIGIO + * Salvatore Sanfilippo : Support SO_LINGER with linger == 1 and + * lingertime == 0 (RFC 793 ABORT Call) + * Hirokazu Takahashi : Use copy_from_user() instead of + * csum_and_copy_from_user() if possible. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + * + * Description of States: + * + * TCP_SYN_SENT sent a connection request, waiting for ack + * + * TCP_SYN_RECV received a connection request, sent ack, + * waiting for final ack in three-way handshake. + * + * TCP_ESTABLISHED connection established + * + * TCP_FIN_WAIT1 our side has shutdown, waiting to complete + * transmission of remaining buffered data + * + * TCP_FIN_WAIT2 all buffered data sent, waiting for remote + * to shutdown + * + * TCP_CLOSING both sides have shutdown but we still have + * data we have to finish sending + * + * TCP_TIME_WAIT timeout to catch resent junk before entering + * closed, can only be entered from FIN_WAIT2 + * or CLOSING. Required because the other end + * may not have gotten our last ACK causing it + * to retransmit the data packet (which we ignore) + * + * TCP_CLOSE_WAIT remote side has shutdown and is waiting for + * us to finish writing our data and to shutdown + * (we have to close() to move on to LAST_ACK) + * + * TCP_LAST_ACK out side has shutdown after remote has + * shutdown. There may still be data in our + * buffer that we have to finish sending + * + * TCP_CLOSE socket is finished + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +#include +#include + +int sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT; + +DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics); + +kmem_cache_t *tcp_openreq_cachep; +kmem_cache_t *tcp_bucket_cachep; +kmem_cache_t *tcp_timewait_cachep; + +atomic_t tcp_orphan_count = ATOMIC_INIT(0); + +int sysctl_tcp_mem[3]; +int sysctl_tcp_wmem[3] = { 4 * 1024, 16 * 1024, 128 * 1024 }; +int sysctl_tcp_rmem[3] = { 4 * 1024, 87380, 87380 * 2 }; + +EXPORT_SYMBOL(sysctl_tcp_mem); +EXPORT_SYMBOL(sysctl_tcp_rmem); +EXPORT_SYMBOL(sysctl_tcp_wmem); + +atomic_t tcp_memory_allocated; /* Current allocated memory. */ +atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ + +EXPORT_SYMBOL(tcp_memory_allocated); +EXPORT_SYMBOL(tcp_sockets_allocated); + +/* + * Pressure flag: try to collapse. + * Technical note: it is used by multiple contexts non atomically. + * All the sk_stream_mem_schedule() is of this nature: accounting + * is strict, actions are advisory and have some latency. + */ +int tcp_memory_pressure; + +EXPORT_SYMBOL(tcp_memory_pressure); + +void tcp_enter_memory_pressure(void) +{ + if (!tcp_memory_pressure) { + NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); + tcp_memory_pressure = 1; + } +} + +EXPORT_SYMBOL(tcp_enter_memory_pressure); + +/* + * LISTEN is a special case for poll.. + */ +static __inline__ unsigned int tcp_listen_poll(struct sock *sk, + poll_table *wait) +{ + return tcp_sk(sk)->accept_queue ? (POLLIN | POLLRDNORM) : 0; +} + +/* + * Wait for a TCP event. + * + * Note that we don't need to lock the socket, as the upper poll layers + * take care of normal races (between the test and the event) and we don't + * go look at any of the socket buffers directly. 
+ */ +unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + unsigned int mask; + struct sock *sk = sock->sk; + struct tcp_sock *tp = tcp_sk(sk); + + poll_wait(file, sk->sk_sleep, wait); + if (sk->sk_state == TCP_LISTEN) + return tcp_listen_poll(sk, wait); + + /* Socket is not locked. We are protected from async events + by poll logic and correct handling of state changes + made by another threads is impossible in any case. + */ + + mask = 0; + if (sk->sk_err) + mask = POLLERR; + + /* + * POLLHUP is certainly not done right. But poll() doesn't + * have a notion of HUP in just one direction, and for a + * socket the read side is more interesting. + * + * Some poll() documentation says that POLLHUP is incompatible + * with the POLLOUT/POLLWR flags, so somebody should check this + * all. But careful, it tends to be safer to return too many + * bits than too few, and you can easily break real applications + * if you don't tell them that something has hung up! + * + * Check-me. + * + * Check number 1. POLLHUP is _UNMASKABLE_ event (see UNIX98 and + * our fs/select.c). It means that after we received EOF, + * poll always returns immediately, making impossible poll() on write() + * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP + * if and only if shutdown has been made in both directions. + * Actually, it is interesting to look how Solaris and DUX + * solve this dilemma. I would prefer, if PULLHUP were maskable, + * then we could set it on SND_SHUTDOWN. BTW examples given + * in Stevens' books assume exactly this behaviour, it explains + * why PULLHUP is incompatible with POLLOUT. --ANK + * + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK + */ + if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLIN | POLLRDNORM; + + /* Connected? */ + if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + /* Potential race condition. If read of tp below will + * escape above sk->sk_state, we can be illegally awaken + * in SYN_* states. */ + if ((tp->rcv_nxt != tp->copied_seq) && + (tp->urg_seq != tp->copied_seq || + tp->rcv_nxt != tp->copied_seq + 1 || + sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + mask |= POLLIN | POLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { /* send SIGIO later */ + set_bit(SOCK_ASYNC_NOSPACE, + &sk->sk_socket->flags); + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); + + /* Race breaker. If space is freed after + * wspace test but before the flags are set, + * IO signal will be lost. + */ + if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) + mask |= POLLOUT | POLLWRNORM; + } + } + + if (tp->urg_data & TCP_URG_VALID) + mask |= POLLPRI; + } + return mask; +} + +int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + struct tcp_sock *tp = tcp_sk(sk); + int answ; + + switch (cmd) { + case SIOCINQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + lock_sock(sk); + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else if (sock_flag(sk, SOCK_URGINLINE) || + !tp->urg_data || + before(tp->urg_seq, tp->copied_seq) || + !before(tp->urg_seq, tp->rcv_nxt)) { + answ = tp->rcv_nxt - tp->copied_seq; + + /* Subtract 1, if FIN is in queue. 
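The long comment in tcp_poll() is ultimately about what a poll()ing application observes: POLLIN covers both readable data and a received FIN, POLLHUP only fires once both directions are shut down, and POLLPRI means pending urgent data. A hedged userspace sketch of that consumer side; fd is assumed to be a connected TCP socket.

/* Userspace view of the mask computed by tcp_poll(): POLLIN covers both
 * data and EOF (RCV_SHUTDOWN), POLLHUP only fires once both directions
 * are shut down.  "fd" is assumed to be a connected TCP socket. */
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

void wait_on_socket(int fd)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
	char buf[4096];

	if (poll(&pfd, 1, -1) < 0)
		return;

	if (pfd.revents & POLLIN) {
		ssize_t n = read(fd, buf, sizeof(buf));
		if (n == 0)
			puts("peer sent FIN (POLLIN with EOF)");
	}
	if (pfd.revents & POLLOUT)
		puts("socket writable");	/* enough send buffer space */
	if (pfd.revents & POLLHUP)
		puts("both directions shut down");
	if (pfd.revents & POLLPRI)
		puts("urgent data pending");	/* tp->urg_data & TCP_URG_VALID */
}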
*/ + if (answ && !skb_queue_empty(&sk->sk_receive_queue)) + answ -= + ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin; + } else + answ = tp->urg_seq - tp->copied_seq; + release_sock(sk); + break; + case SIOCATMARK: + answ = tp->urg_data && tp->urg_seq == tp->copied_seq; + break; + case SIOCOUTQ: + if (sk->sk_state == TCP_LISTEN) + return -EINVAL; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) + answ = 0; + else + answ = tp->write_seq - tp->snd_una; + break; + default: + return -ENOIOCTLCMD; + }; + + return put_user(answ, (int __user *)arg); +} + + +int tcp_listen_start(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt; + + sk->sk_max_ack_backlog = 0; + sk->sk_ack_backlog = 0; + tp->accept_queue = tp->accept_queue_tail = NULL; + rwlock_init(&tp->syn_wait_lock); + tcp_delack_init(tp); + + lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL); + if (!lopt) + return -ENOMEM; + + memset(lopt, 0, sizeof(struct tcp_listen_opt)); + for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++) + if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog) + break; + get_random_bytes(&lopt->hash_rnd, 4); + + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = lopt; + write_unlock_bh(&tp->syn_wait_lock); + + /* There is race window here: we announce ourselves listening, + * but this transition is still not validated by get_port(). + * It is OK, because this socket enters to hash table only + * after validation is complete. + */ + sk->sk_state = TCP_LISTEN; + if (!sk->sk_prot->get_port(sk, inet->num)) { + inet->sport = htons(inet->num); + + sk_dst_reset(sk); + sk->sk_prot->hash(sk); + + return 0; + } + + sk->sk_state = TCP_CLOSE; + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + kfree(lopt); + return -EADDRINUSE; +} + +/* + * This routine closes sockets which have been at least partially + * opened, but not yet accepted. + */ + +static void tcp_listen_stop (struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *acc_req = tp->accept_queue; + struct open_request *req; + int i; + + tcp_delete_keepalive_timer(sk); + + /* make all the listen_opt local to us */ + write_lock_bh(&tp->syn_wait_lock); + tp->listen_opt = NULL; + write_unlock_bh(&tp->syn_wait_lock); + tp->accept_queue = tp->accept_queue_tail = NULL; + + if (lopt->qlen) { + for (i = 0; i < TCP_SYNQ_HSIZE; i++) { + while ((req = lopt->syn_table[i]) != NULL) { + lopt->syn_table[i] = req->dl_next; + lopt->qlen--; + tcp_openreq_free(req); + + /* Following specs, it would be better either to send FIN + * (and enter FIN-WAIT-1, it is normal close) + * or to send active reset (abort). + * Certainly, it is pretty dangerous while synflood, but it is + * bad justification for our negligence 8) + * To be honest, we are not able to make either + * of the variants now. 
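tcp_listen_start() above sizes the SYN queue as the smallest power of two that is at least sysctl_max_syn_backlog, never below 2^6. A standalone model of that calculation; 256 is only an example value for the sysctl, not a claim about its default.

/* Model of the max_qlen_log calculation in tcp_listen_start(): smallest
 * power of two >= the sysctl, floored at 2^6.  256 is an example value
 * for sysctl_max_syn_backlog. */
#include <stdio.h>

int main(void)
{
	int max_syn_backlog = 256;	/* example value */
	int max_qlen_log;

	for (max_qlen_log = 6; ; max_qlen_log++)
		if ((1 << max_qlen_log) >= max_syn_backlog)
			break;

	printf("SYN queue limited to %d entries (2^%d)\n",
	       1 << max_qlen_log, max_qlen_log);
	return 0;
}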
--ANK + */ + } + } + } + BUG_TRAP(!lopt->qlen); + + kfree(lopt); + + while ((req = acc_req) != NULL) { + struct sock *child = req->sk; + + acc_req = req->dl_next; + + local_bh_disable(); + bh_lock_sock(child); + BUG_TRAP(!sock_owned_by_user(child)); + sock_hold(child); + + tcp_disconnect(child, O_NONBLOCK); + + sock_orphan(child); + + atomic_inc(&tcp_orphan_count); + + tcp_destroy_sock(child); + + bh_unlock_sock(child); + local_bh_enable(); + sock_put(child); + + sk_acceptq_removed(sk); + tcp_openreq_fastfree(req); + } + BUG_TRAP(!sk->sk_ack_backlog); +} + +static inline void tcp_mark_push(struct tcp_sock *tp, struct sk_buff *skb) +{ + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + tp->pushed_seq = tp->write_seq; +} + +static inline int forced_push(struct tcp_sock *tp) +{ + return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); +} + +static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + skb->csum = 0; + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = tp->write_seq; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = 0; + skb_header_release(skb); + __skb_queue_tail(&sk->sk_write_queue, skb); + sk_charge_skb(sk, skb); + if (!sk->sk_send_head) + sk->sk_send_head = skb; + else if (tp->nonagle&TCP_NAGLE_PUSH) + tp->nonagle &= ~TCP_NAGLE_PUSH; +} + +static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, + struct sk_buff *skb) +{ + if (flags & MSG_OOB) { + tp->urg_mode = 1; + tp->snd_up = tp->write_seq; + TCP_SKB_CB(skb)->sacked |= TCPCB_URG; + } +} + +static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags, + int mss_now, int nonagle) +{ + if (sk->sk_send_head) { + struct sk_buff *skb = sk->sk_write_queue.prev; + if (!(flags & MSG_MORE) || forced_push(tp)) + tcp_mark_push(tp, skb); + tcp_mark_urg(tp, flags, skb); + __tcp_push_pending_frames(sk, tp, mss_now, + (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); + } +} + +static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, + size_t psize, int flags) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mss_now; + int err; + ssize_t copied; + long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. 
*/ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (psize > 0) { + struct sk_buff *skb = sk->sk_write_queue.prev; + struct page *page = pages[poffset / PAGE_SIZE]; + int copy, i, can_coalesce; + int offset = poffset % PAGE_SIZE; + int size = min_t(size_t, psize, PAGE_SIZE - offset); + + if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) { +new_segment: + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + skb = sk_stream_alloc_pskb(sk, 0, 0, + sk->sk_allocation); + if (!skb) + goto wait_for_memory; + + skb_entail(sk, tp, skb); + copy = mss_now; + } + + if (copy > size) + copy = size; + + i = skb_shinfo(skb)->nr_frags; + can_coalesce = skb_can_coalesce(skb, i, page, offset); + if (!can_coalesce && i >= MAX_SKB_FRAGS) { + tcp_mark_push(tp, skb); + goto new_segment; + } + if (sk->sk_forward_alloc < copy && + !sk_stream_mem_schedule(sk, copy, 0)) + goto wait_for_memory; + + if (can_coalesce) { + skb_shinfo(skb)->frags[i - 1].size += copy; + } else { + get_page(page); + skb_fill_page_desc(skb, i, page, offset, copy); + } + + skb->len += copy; + skb->data_len += copy; + skb->truesize += copy; + sk->sk_wmem_queued += copy; + sk->sk_forward_alloc -= copy; + skb->ip_summed = CHECKSUM_HW; + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + skb_shinfo(skb)->tso_segs = 0; + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + + copied += copy; + poffset += copy; + if (!(psize -= copy)) + goto out; + + if (skb->len != mss_now || (flags & MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); + } else if (skb == sk->sk_send_head) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + } + +out: + if (copied) + tcp_push(sk, tp, flags, mss_now, tp->nonagle); + return copied; + +do_error: + if (copied) + goto out; +out_err: + return sk_stream_error(sk, flags, err); +} + +ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, + size_t size, int flags) +{ + ssize_t res; + struct sock *sk = sock->sk; + +#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) + + if (!(sk->sk_route_caps & NETIF_F_SG) || + !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) + return sock_no_sendpage(sock, page, offset, size, flags); + +#undef TCP_ZC_CSUM_FLAGS + + lock_sock(sk); + TCP_CHECK_TIMER(sk); + res = do_tcp_sendpages(sk, &page, offset, size, flags); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return res; +} + +#define TCP_PAGE(sk) (sk->sk_sndmsg_page) +#define TCP_OFF(sk) (sk->sk_sndmsg_off) + +static inline int select_size(struct sock *sk, struct tcp_sock *tp) +{ + int tmp = tp->mss_cache_std; + + if (sk->sk_route_caps & NETIF_F_SG) { + int pgbreak = SKB_MAX_HEAD(MAX_TCP_HEADER); + + if (tmp >= pgbreak && + tmp <= pgbreak + (MAX_SKB_FRAGS - 1) * PAGE_SIZE) + tmp = pgbreak; + } + return tmp; +} + +int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t size) +{ + 
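do_tcp_sendpages()/tcp_sendpage() above implement the page-based send path that sendfile(2) on a TCP socket normally reaches, falling back to sock_no_sendpage() when the route's device lacks SG plus hardware checksumming. A hedged userspace sketch; sock is assumed to be a connected TCP socket and path an ordinary file.

/* Hypothetical userspace counterpart: sendfile(2) from a regular file to
 * a connected TCP socket, which on the kernel side lands in
 * tcp_sendpage() above (or sock_no_sendpage() without SG/csum offload).
 * "sock" and "path" are placeholders for this sketch. */
#include <fcntl.h>
#include <sys/sendfile.h>
#include <sys/stat.h>
#include <unistd.h>

int send_whole_file(int sock, const char *path)
{
	struct stat st;
	off_t off = 0;
	int fd = open(path, O_RDONLY);

	if (fd < 0)
		return -1;
	if (fstat(fd, &st) < 0) {
		close(fd);
		return -1;
	}
	/* Each sendfile() call pushes file pages onto the socket; off is
	 * advanced by the kernel as data is queued. */
	while (off < st.st_size) {
		ssize_t n = sendfile(sock, fd, &off, st.st_size - off);
		if (n <= 0)
			break;		/* error or nothing more to push */
	}
	close(fd);
	return off == st.st_size ? 0 : -1;
}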
struct iovec *iov; + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int iovlen, flags; + int mss_now; + int err, copied; + long timeo; + + lock_sock(sk); + TCP_CHECK_TIMER(sk); + + flags = msg->msg_flags; + timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT); + + /* Wait for a connection to finish. */ + if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) + if ((err = sk_stream_wait_connect(sk, &timeo)) != 0) + goto out_err; + + /* This should be in poll */ + clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + + /* Ok commence sending. */ + iovlen = msg->msg_iovlen; + iov = msg->msg_iov; + copied = 0; + + err = -EPIPE; + if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) + goto do_error; + + while (--iovlen >= 0) { + int seglen = iov->iov_len; + unsigned char __user *from = iov->iov_base; + + iov++; + + while (seglen > 0) { + int copy; + + skb = sk->sk_write_queue.prev; + + if (!sk->sk_send_head || + (copy = mss_now - skb->len) <= 0) { + +new_segment: + /* Allocate new segment. If the interface is SG, + * allocate skb fitting to single page. + */ + if (!sk_stream_memory_free(sk)) + goto wait_for_sndbuf; + + skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), + 0, sk->sk_allocation); + if (!skb) + goto wait_for_memory; + + /* + * Check whether we can use HW checksum. + */ + if (sk->sk_route_caps & + (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | + NETIF_F_HW_CSUM)) + skb->ip_summed = CHECKSUM_HW; + + skb_entail(sk, tp, skb); + copy = mss_now; + } + + /* Try to append data to the end of skb. */ + if (copy > seglen) + copy = seglen; + + /* Where to copy to? */ + if (skb_tailroom(skb) > 0) { + /* We have some space in skb head. Superb! */ + if (copy > skb_tailroom(skb)) + copy = skb_tailroom(skb); + if ((err = skb_add_data(skb, from, copy)) != 0) + goto do_fault; + } else { + int merge = 0; + int i = skb_shinfo(skb)->nr_frags; + struct page *page = TCP_PAGE(sk); + int off = TCP_OFF(sk); + + if (skb_can_coalesce(skb, i, page, off) && + off != PAGE_SIZE) { + /* We can extend the last page + * fragment. */ + merge = 1; + } else if (i == MAX_SKB_FRAGS || + (!i && + !(sk->sk_route_caps & NETIF_F_SG))) { + /* Need to add new fragment and cannot + * do this because interface is non-SG, + * or because all the page slots are + * busy. */ + tcp_mark_push(tp, skb); + goto new_segment; + } else if (page) { + /* If page is cached, align + * offset to L1 cache boundary + */ + off = (off + L1_CACHE_BYTES - 1) & + ~(L1_CACHE_BYTES - 1); + if (off == PAGE_SIZE) { + put_page(page); + TCP_PAGE(sk) = page = NULL; + } + } + + if (!page) { + /* Allocate new cache page. */ + if (!(page = sk_stream_alloc_page(sk))) + goto wait_for_memory; + off = 0; + } + + if (copy > PAGE_SIZE - off) + copy = PAGE_SIZE - off; + + /* Time to copy data. We are close to + * the end! */ + err = skb_copy_to_page(sk, from, skb, page, + off, copy); + if (err) { + /* If this page was new, give it to the + * socket so it does not get leaked. + */ + if (!TCP_PAGE(sk)) { + TCP_PAGE(sk) = page; + TCP_OFF(sk) = 0; + } + goto do_error; + } + + /* Update the skb. 
*/ + if (merge) { + skb_shinfo(skb)->frags[i - 1].size += + copy; + } else { + skb_fill_page_desc(skb, i, page, off, copy); + if (TCP_PAGE(sk)) { + get_page(page); + } else if (off + copy < PAGE_SIZE) { + get_page(page); + TCP_PAGE(sk) = page; + } + } + + TCP_OFF(sk) = off + copy; + } + + if (!copied) + TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_PSH; + + tp->write_seq += copy; + TCP_SKB_CB(skb)->end_seq += copy; + skb_shinfo(skb)->tso_segs = 0; + + from += copy; + copied += copy; + if ((seglen -= copy) == 0 && iovlen == 0) + goto out; + + if (skb->len != mss_now || (flags & MSG_OOB)) + continue; + + if (forced_push(tp)) { + tcp_mark_push(tp, skb); + __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); + } else if (skb == sk->sk_send_head) + tcp_push_one(sk, mss_now); + continue; + +wait_for_sndbuf: + set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); +wait_for_memory: + if (copied) + tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + + if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) + goto do_error; + + mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); + } + } + +out: + if (copied) + tcp_push(sk, tp, flags, mss_now, tp->nonagle); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return copied; + +do_fault: + if (!skb->len) { + if (sk->sk_send_head == skb) + sk->sk_send_head = NULL; + __skb_unlink(skb, skb->list); + sk_stream_free_skb(sk, skb); + } + +do_error: + if (copied) + goto out; +out_err: + err = sk_stream_error(sk, flags, err); + TCP_CHECK_TIMER(sk); + release_sock(sk); + return err; +} + +/* + * Handle reading urgent data. BSD has very simple semantics for + * this, no blocking and very strange errors 8) + */ + +static int tcp_recv_urg(struct sock *sk, long timeo, + struct msghdr *msg, int len, int flags, + int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* No URG data to read. */ + if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || + tp->urg_data == TCP_URG_READ) + return -EINVAL; /* Yes this is right ! */ + + if (sk->sk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DONE)) + return -ENOTCONN; + + if (tp->urg_data & TCP_URG_VALID) { + int err = 0; + char c = tp->urg_data; + + if (!(flags & MSG_PEEK)) + tp->urg_data = TCP_URG_READ; + + /* Read urgent data. */ + msg->msg_flags |= MSG_OOB; + + if (len > 0) { + if (!(flags & MSG_TRUNC)) + err = memcpy_toiovec(msg->msg_iov, &c, 1); + len = 1; + } else + msg->msg_flags |= MSG_TRUNC; + + return err ? -EFAULT : len; + } + + if (sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN)) + return 0; + + /* Fixed the recv(..., MSG_OOB) behaviour. BSD docs and + * the available implementations agree in this case: + * this call should never block, independent of the + * blocking state of the socket. + * Mike + */ + return -EAGAIN; +} + +/* Clean up the receive buffer for full frames taken by the user, + * then send an ACK if necessary. COPIED is the number of bytes + * tcp_recvmsg has given to the user so far, it speeds up the + * calculation of whether or not we must ACK for the sake of + * a window update. + */ +static void cleanup_rbuf(struct sock *sk, int copied) +{ + struct tcp_sock *tp = tcp_sk(sk); + int time_to_ack = 0; + +#if TCP_DEBUG + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + + BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); +#endif + + if (tcp_ack_scheduled(tp)) { + /* Delayed ACKs frequently hit locked sockets during bulk + * receive. 
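Editorial note: tcp_recv_urg() above implements the classic BSD out-of-band semantics: at most one byte, never blocking, and -EINVAL if the socket uses SO_OOBINLINE or the byte was already consumed. From user space that corresponds to a one-byte read with MSG_OOB; a hedged sketch:

#include <errno.h>
#include <sys/socket.h>

/* Illustrative only: try to fetch the single urgent byte, if any.
 * Returns 1 with *out set, 0 if no urgent data is pending, -1 on error. */
static int read_urgent_byte(int sock, char *out)
{
        ssize_t n = recv(sock, out, 1, MSG_OOB);

        if (n == 1)
                return 1;
        if (n < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINVAL))
                return 0;
        return -1;
}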
*/ + if (tp->ack.blocked || + /* Once-per-two-segments ACK was not sent by tcp_input.c */ + tp->rcv_nxt - tp->rcv_wup > tp->ack.rcv_mss || + /* + * If this read emptied read buffer, we send ACK, if + * connection is not bidirectional, user drained + * receive buffer and there was a small segment + * in queue. + */ + (copied > 0 && (tp->ack.pending & TCP_ACK_PUSHED) && + !tp->ack.pingpong && !atomic_read(&sk->sk_rmem_alloc))) + time_to_ack = 1; + } + + /* We send an ACK if we can now advertise a non-zero window + * which has been raised "significantly". + * + * Even if window raised up to infinity, do not send window open ACK + * in states, where we will not receive more. It is useless. + */ + if (copied > 0 && !time_to_ack && !(sk->sk_shutdown & RCV_SHUTDOWN)) { + __u32 rcv_window_now = tcp_receive_window(tp); + + /* Optimize, __tcp_select_window() is not cheap. */ + if (2*rcv_window_now <= tp->window_clamp) { + __u32 new_window = __tcp_select_window(sk); + + /* Send ACK now, if this read freed lots of space + * in our buffer. Certainly, new_window is new window. + * We can advertise it now, if it is not less than current one. + * "Lots" means "at least twice" here. + */ + if (new_window && new_window >= 2 * rcv_window_now) + time_to_ack = 1; + } + } + if (time_to_ack) + tcp_send_ack(sk); +} + +static void tcp_prequeue_process(struct sock *sk) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + + NET_ADD_STATS_USER(LINUX_MIB_TCPPREQUEUED, skb_queue_len(&tp->ucopy.prequeue)); + + /* RX process wants to run with disabled BHs, though it is not + * necessary */ + local_bh_disable(); + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->sk_backlog_rcv(sk, skb); + local_bh_enable(); + + /* Clear memory counter. */ + tp->ucopy.memory = 0; +} + +static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) +{ + struct sk_buff *skb; + u32 offset; + + skb_queue_walk(&sk->sk_receive_queue, skb) { + offset = seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len || skb->h.th->fin) { + *off = offset; + return skb; + } + } + return NULL; +} + +/* + * This routine provides an alternative to tcp_recvmsg() for routines + * that would like to handle copying from skbuffs directly in 'sendfile' + * fashion. + * Note: + * - It is assumed that the socket was locked by the caller. + * - The routine does not block. + * - At present, there is no support for reading OOB data + * or for 'peeking' the socket using this routine + * (although both would be easy to implement). + */ +int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, + sk_read_actor_t recv_actor) +{ + struct sk_buff *skb; + struct tcp_sock *tp = tcp_sk(sk); + u32 seq = tp->copied_seq; + u32 offset; + int copied = 0; + + if (sk->sk_state == TCP_LISTEN) + return -ENOTCONN; + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + if (offset < skb->len) { + size_t used, len; + + len = skb->len - offset; + /* Stop reading if we hit a patch of urgent data */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - seq; + if (urg_offset < len) + len = urg_offset; + if (!len) + break; + } + used = recv_actor(desc, skb, offset, len); + if (used <= len) { + seq += used; + copied += used; + offset += used; + } + if (offset != skb->len) + break; + } + if (skb->h.th->fin) { + sk_eat_skb(sk, skb); + ++seq; + break; + } + sk_eat_skb(sk, skb); + if (!desc->count) + break; + } + tp->copied_seq = seq; + + tcp_rcv_space_adjust(sk); + + /* Clean up data we have read: This will do ACK frames. 
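Editorial note: the window-update branch of cleanup_rbuf() only calls __tcp_select_window() once the advertised window has shrunk to half of window_clamp, and then ACKs only if the freshly selectable window would at least double the old one. A self-contained restatement of that decision (names are local to the sketch):

/* Mirrors the "significant window raise" test in cleanup_rbuf(). */
static int window_update_ack_needed(unsigned int rcv_window_now,
                                    unsigned int window_clamp,
                                    unsigned int new_window)
{
        if (2 * rcv_window_now > window_clamp)
                return 0;       /* window not small enough to bother */
        return new_window && new_window >= 2 * rcv_window_now;
}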
*/ + if (copied) + cleanup_rbuf(sk, copied); + return copied; +} + +/* + * This routine copies from a sock struct into the user buffer. + * + * Technical note: in 2.3 we work on _locked_ socket, so that + * tricks with *seq access order and skb->users are not required. + * Probably, code can be easily improved even more. + */ + +int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int nonblock, int flags, int *addr_len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int copied = 0; + u32 peek_seq; + u32 *seq; + unsigned long used; + int err; + int target; /* Read at least this many bytes */ + long timeo; + struct task_struct *user_recv = NULL; + + lock_sock(sk); + + TCP_CHECK_TIMER(sk); + + err = -ENOTCONN; + if (sk->sk_state == TCP_LISTEN) + goto out; + + timeo = sock_rcvtimeo(sk, nonblock); + + /* Urgent data needs to be handled specially. */ + if (flags & MSG_OOB) + goto recv_urg; + + seq = &tp->copied_seq; + if (flags & MSG_PEEK) { + peek_seq = tp->copied_seq; + seq = &peek_seq; + } + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); + + do { + struct sk_buff *skb; + u32 offset; + + /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ + if (tp->urg_data && tp->urg_seq == *seq) { + if (copied) + break; + if (signal_pending(current)) { + copied = timeo ? sock_intr_errno(timeo) : -EAGAIN; + break; + } + } + + /* Next get a buffer. */ + + skb = skb_peek(&sk->sk_receive_queue); + do { + if (!skb) + break; + + /* Now that we have two receive queues this + * shouldn't happen. + */ + if (before(*seq, TCP_SKB_CB(skb)->seq)) { + printk(KERN_INFO "recvmsg bug: copied %X " + "seq %X\n", *seq, TCP_SKB_CB(skb)->seq); + break; + } + offset = *seq - TCP_SKB_CB(skb)->seq; + if (skb->h.th->syn) + offset--; + if (offset < skb->len) + goto found_ok_skb; + if (skb->h.th->fin) + goto found_fin_ok; + BUG_TRAP(flags & MSG_PEEK); + skb = skb->next; + } while (skb != (struct sk_buff *)&sk->sk_receive_queue); + + /* Well, if we have backlog, try to process it now yet. */ + + if (copied >= target && !sk->sk_backlog.tail) + break; + + if (copied) { + if (sk->sk_err || + sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || + !timeo || + signal_pending(current) || + (flags & MSG_PEEK)) + break; + } else { + if (sock_flag(sk, SOCK_DONE)) + break; + + if (sk->sk_err) { + copied = sock_error(sk); + break; + } + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + if (sk->sk_state == TCP_CLOSE) { + if (!sock_flag(sk, SOCK_DONE)) { + /* This occurs when user tries to read + * from never connected socket. + */ + copied = -ENOTCONN; + break; + } + break; + } + + if (!timeo) { + copied = -EAGAIN; + break; + } + + if (signal_pending(current)) { + copied = sock_intr_errno(timeo); + break; + } + } + + cleanup_rbuf(sk, copied); + + if (tp->ucopy.task == user_recv) { + /* Install new reader */ + if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) { + user_recv = current; + tp->ucopy.task = user_recv; + tp->ucopy.iov = msg->msg_iov; + } + + tp->ucopy.len = len; + + BUG_TRAP(tp->copied_seq == tp->rcv_nxt || + (flags & (MSG_PEEK | MSG_TRUNC))); + + /* Ugly... If prequeue is not empty, we have to + * process it before releasing socket, otherwise + * order will be broken at second iteration. + * More elegant solution is required!!! + * + * Look: we have the following (pseudo)queues: + * + * 1. packets in flight + * 2. backlog + * 3. prequeue + * 4. receive_queue + * + * Each queue can be processed only if the next ones + * are empty. 
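Editorial note: the `target` computed from sock_rcvlowat() above is what keeps tcp_recvmsg() waiting: with MSG_WAITALL it is the full request, otherwise it is SO_RCVLOWAT (default 1). For reference, raising the low-water mark from user space looks like this (sketch, no error handling):

#include <sys/socket.h>

/* Ask the kernel not to wake the reader until at least "bytes" are queued.
 * (TCP still returns early on errors, EOF and signals.) */
static int set_rcvlowat(int sock, int bytes)
{
        return setsockopt(sock, SOL_SOCKET, SO_RCVLOWAT, &bytes, sizeof(bytes));
}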
At this point we have empty receive_queue. + * But prequeue _can_ be not empty after 2nd iteration, + * when we jumped to start of loop because backlog + * processing added something to receive_queue. + * We cannot release_sock(), because backlog contains + * packets arrived _after_ prequeued ones. + * + * Shortly, algorithm is clear --- to process all + * the queues in order. We could make it more directly, + * requeueing packets from backlog to prequeue, if + * is not empty. It is more elegant, but eats cycles, + * unfortunately. + */ + if (skb_queue_len(&tp->ucopy.prequeue)) + goto do_prequeue; + + /* __ Set realtime policy in scheduler __ */ + } + + if (copied >= target) { + /* Do not sleep, just process backlog. */ + release_sock(sk); + lock_sock(sk); + } else + sk_wait_data(sk, &timeo); + + if (user_recv) { + int chunk; + + /* __ Restore normal policy in scheduler __ */ + + if ((chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + len -= chunk; + copied += chunk; + } + + if (tp->rcv_nxt == tp->copied_seq && + skb_queue_len(&tp->ucopy.prequeue)) { +do_prequeue: + tcp_prequeue_process(sk); + + if ((chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + len -= chunk; + copied += chunk; + } + } + } + if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { + if (net_ratelimit()) + printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", + current->comm, current->pid); + peek_seq = tp->copied_seq; + } + continue; + + found_ok_skb: + /* Ok so how much can we use? */ + used = skb->len - offset; + if (len < used) + used = len; + + /* Do we have urgent data here? */ + if (tp->urg_data) { + u32 urg_offset = tp->urg_seq - *seq; + if (urg_offset < used) { + if (!urg_offset) { + if (!sock_flag(sk, SOCK_URGINLINE)) { + ++*seq; + offset++; + used--; + if (!used) + goto skip_copy; + } + } else + used = urg_offset; + } + } + + if (!(flags & MSG_TRUNC)) { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + } + + *seq += used; + copied += used; + len -= used; + + tcp_rcv_space_adjust(sk); + +skip_copy: + if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { + tp->urg_data = 0; + tcp_fast_path_check(sk, tp); + } + if (used + offset < skb->len) + continue; + + if (skb->h.th->fin) + goto found_fin_ok; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + continue; + + found_fin_ok: + /* Process the FIN. */ + ++*seq; + if (!(flags & MSG_PEEK)) + sk_eat_skb(sk, skb); + break; + } while (len > 0); + + if (user_recv) { + if (skb_queue_len(&tp->ucopy.prequeue)) { + int chunk; + + tp->ucopy.len = copied > 0 ? len : 0; + + tcp_prequeue_process(sk); + + if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { + NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + len -= chunk; + copied += chunk; + } + } + + tp->ucopy.task = NULL; + tp->ucopy.len = 0; + } + + /* According to UNIX98, msg_name/msg_namelen are ignored + * on connected socket. I was just happy when found this 8) --ANK + */ + + /* Clean up data we have read: This will do ACK frames. */ + cleanup_rbuf(sk, copied); + + TCP_CHECK_TIMER(sk); + release_sock(sk); + return copied; + +out: + TCP_CHECK_TIMER(sk); + release_sock(sk); + return err; + +recv_urg: + err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + goto out; +} + +/* + * State processing on a close. 
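Editorial note: tcp_recvmsg() supports MSG_PEEK by advancing a private peek_seq instead of tp->copied_seq (and warns when two peekers race). The user-space pattern the code has to cope with is peek-then-read; sketched below, error handling omitted:

#include <sys/socket.h>

/* Look at up to "len" queued bytes without consuming them, then decide
 * whether to actually pull them out of the socket. Illustrative only. */
static ssize_t peek_then_consume(int sock, void *buf, size_t len, int consume)
{
        ssize_t n = recv(sock, buf, len, MSG_PEEK);

        if (n > 0 && consume)
                n = recv(sock, buf, n, 0);      /* same bytes, now consumed */
        return n;
}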
This implements the state shift for + * sending our FIN frame. Note that we only send a FIN for some + * states. A shutdown() may have already sent the FIN, or we may be + * closed. + */ + +static unsigned char new_state[16] = { + /* current state: new state: action: */ + /* (Invalid) */ TCP_CLOSE, + /* TCP_ESTABLISHED */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_SYN_SENT */ TCP_CLOSE, + /* TCP_SYN_RECV */ TCP_FIN_WAIT1 | TCP_ACTION_FIN, + /* TCP_FIN_WAIT1 */ TCP_FIN_WAIT1, + /* TCP_FIN_WAIT2 */ TCP_FIN_WAIT2, + /* TCP_TIME_WAIT */ TCP_CLOSE, + /* TCP_CLOSE */ TCP_CLOSE, + /* TCP_CLOSE_WAIT */ TCP_LAST_ACK | TCP_ACTION_FIN, + /* TCP_LAST_ACK */ TCP_LAST_ACK, + /* TCP_LISTEN */ TCP_CLOSE, + /* TCP_CLOSING */ TCP_CLOSING, +}; + +static int tcp_close_state(struct sock *sk) +{ + int next = (int)new_state[sk->sk_state]; + int ns = next & TCP_STATE_MASK; + + tcp_set_state(sk, ns); + + return next & TCP_ACTION_FIN; +} + +/* + * Shutdown the sending side of a connection. Much like close except + * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD). + */ + +void tcp_shutdown(struct sock *sk, int how) +{ + /* We need to grab some memory, and put together a FIN, + * and then put it into the queue to be sent. + * Tim MacKenzie(tym@dibbler.cs.monash.edu.au) 4 Dec '92. + */ + if (!(how & SEND_SHUTDOWN)) + return; + + /* If we've already sent a FIN, or it's a closed state, skip this. */ + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_SYN_SENT | + TCPF_SYN_RECV | TCPF_CLOSE_WAIT)) { + /* Clear out any half completed packets. FIN if needed. */ + if (tcp_close_state(sk)) + tcp_send_fin(sk); + } +} + +/* + * At this point, there should be no process reference to this + * socket, and thus no user references at all. Therefore we + * can assume the socket waitqueue is inactive and nobody will + * try to jump onto it. + */ +void tcp_destroy_sock(struct sock *sk) +{ + BUG_TRAP(sk->sk_state == TCP_CLOSE); + BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + + /* It cannot be in hash table! */ + BUG_TRAP(sk_unhashed(sk)); + + /* If it has not 0 inet_sk(sk)->num, it must be bound */ + BUG_TRAP(!inet_sk(sk)->num || tcp_sk(sk)->bind_hash); + + sk->sk_prot->destroy(sk); + + sk_stream_kill_queues(sk); + + xfrm_sk_free_policy(sk); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&sk->sk_refcnt) != 1) { + printk(KERN_DEBUG "Destruction TCP %p delayed, c=%d\n", + sk, atomic_read(&sk->sk_refcnt)); + } +#endif + + atomic_dec(&tcp_orphan_count); + sock_put(sk); +} + +void tcp_close(struct sock *sk, long timeout) +{ + struct sk_buff *skb; + int data_was_unread = 0; + + lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); + + /* Special case. */ + tcp_listen_stop(sk); + + goto adjudge_to_death; + } + + /* We need to flush the recv. buffs. We do this only on the + * descriptor close, not protocol-sourced closes, because the + * reader process may not have drained the data yet! + */ + while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { + u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - + skb->h.th->fin; + data_was_unread += len; + __kfree_skb(skb); + } + + sk_stream_mem_reclaim(sk); + + /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section + * 3.10, we send a RST here because data was lost. 
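Editorial note: the new_state[] table above packs two things into one entry: the next state in the low bits plus a TCP_ACTION_FIN flag above TCP_STATE_MASK, which tcp_close_state() splits apart. A standalone restatement of that decode (the flag and mask values here are placeholders, not the kernel's):

#include <stdio.h>

#define STATE_MASK  0x0f        /* placeholder for TCP_STATE_MASK */
#define ACTION_FIN  0x10        /* placeholder for TCP_ACTION_FIN */

/* Decode a packed "new_state" entry: returns nonzero if a FIN must be
 * sent, and writes the state to transition to via *next_state. */
static int close_state(unsigned char packed, int *next_state)
{
        *next_state = packed & STATE_MASK;
        return packed & ACTION_FIN;
}

int main(void)
{
        int ns;
        /* e.g. ESTABLISHED -> FIN_WAIT1 with a FIN queued */
        int send_fin = close_state(4 /* hypothetical FIN_WAIT1 */ | ACTION_FIN, &ns);
        printf("next=%d send_fin=%d\n", ns, !!send_fin);
        return 0;
}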
To + * witness the awful effects of the old behavior of always + * doing a FIN, run an older 2.1.x kernel or 2.0.x, start + * a bulk GET in an FTP client, suspend the process, wait + * for the client to advertise a zero window, then kill -9 + * the FTP client, wheee... Note: timeout is always zero + * in such a case. + */ + if (data_was_unread) { + /* Unread data was tossed, zap the connection. */ + NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_KERNEL); + } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { + /* Check zero linger _after_ checking for unread data. */ + sk->sk_prot->disconnect(sk, 0); + NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + } else if (tcp_close_state(sk)) { + /* We FIN if the application ate all the data before + * zapping the connection. + */ + + /* RED-PEN. Formally speaking, we have broken TCP state + * machine. State transitions: + * + * TCP_ESTABLISHED -> TCP_FIN_WAIT1 + * TCP_SYN_RECV -> TCP_FIN_WAIT1 (forget it, it's impossible) + * TCP_CLOSE_WAIT -> TCP_LAST_ACK + * + * are legal only when FIN has been sent (i.e. in window), + * rather than queued out of window. Purists blame. + * + * F.e. "RFC state" is ESTABLISHED, + * if Linux state is FIN-WAIT-1, but FIN is still not sent. + * + * The visible declinations are that sometimes + * we enter time-wait state, when it is not required really + * (harmless), do not send active resets, when they are + * required by specs (TCP_ESTABLISHED, TCP_CLOSE_WAIT, when + * they look as CLOSING or LAST_ACK for Linux) + * Probably, I missed some more holelets. + * --ANK + */ + tcp_send_fin(sk); + } + + sk_stream_wait_close(sk, timeout); + +adjudge_to_death: + /* It is the last release_sock in its life. It will remove backlog. */ + release_sock(sk); + + + /* Now socket is owned by kernel and we acquire BH lock + to finish close. No need to check for user refs. + */ + local_bh_disable(); + bh_lock_sock(sk); + BUG_TRAP(!sock_owned_by_user(sk)); + + sock_hold(sk); + sock_orphan(sk); + + /* This is a (useful) BSD violating of the RFC. There is a + * problem with TCP as specified in that the other end could + * keep a socket open forever with no application left this end. + * We use a 3 minute timeout (about the same as BSD) then kill + * our end. If they send after that then tough - BUT: long enough + * that we won't make the old 4*rto = almost no time - whoops + * reset mistake. + * + * Nope, it was not mistake. It is really desired behaviour + * f.e. on http servers, when such sockets are useless, but + * consume significant resources. Let's do it with special + * linger2 option. 
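Editorial note: the zero-linger branch in tcp_close() (SOCK_LINGER set, sk_lingertime zero) is the case an application selects with SO_LINGER {1, 0}: the close becomes an abortive disconnect instead of the FIN handshake. Sketch of the user-space side:

#include <sys/socket.h>
#include <unistd.h>

/* Request an abortive close: the subsequent close() resets the
 * connection rather than lingering in FIN-WAIT/TIME-WAIT. Use sparingly. */
static int close_with_reset(int sock)
{
        struct linger lg = { .l_onoff = 1, .l_linger = 0 };

        setsockopt(sock, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
        return close(sock);
}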
--ANK + */ + + if (sk->sk_state == TCP_FIN_WAIT2) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->linger2 < 0) { + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + } else { + int tmo = tcp_fin_time(tp); + + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tcp_fin_time(tp)); + } else { + atomic_inc(&tcp_orphan_count); + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + } + if (sk->sk_state != TCP_CLOSE) { + sk_stream_mem_reclaim(sk); + if (atomic_read(&tcp_orphan_count) > sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "TCP: too many of orphaned " + "sockets\n"); + tcp_set_state(sk, TCP_CLOSE); + tcp_send_active_reset(sk, GFP_ATOMIC); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + } + } + atomic_inc(&tcp_orphan_count); + + if (sk->sk_state == TCP_CLOSE) + tcp_destroy_sock(sk); + /* Otherwise, socket is reprieved until protocol close. */ + +out: + bh_unlock_sock(sk); + local_bh_enable(); + sock_put(sk); +} + +/* These states need RST on ABORT according to RFC793 */ + +static inline int tcp_need_reset(int state) +{ + return (1 << state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_FIN_WAIT1 | + TCPF_FIN_WAIT2 | TCPF_SYN_RECV); +} + +int tcp_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + int err = 0; + int old_state = sk->sk_state; + + if (old_state != TCP_CLOSE) + tcp_set_state(sk, TCP_CLOSE); + + /* ABORT function of RFC793 */ + if (old_state == TCP_LISTEN) { + tcp_listen_stop(sk); + } else if (tcp_need_reset(old_state) || + (tp->snd_nxt != tp->write_seq && + (1 << old_state) & (TCPF_CLOSING | TCPF_LAST_ACK))) { + /* The last check adjusts for discrepance of Linux wrt. RFC + * states + */ + tcp_send_active_reset(sk, gfp_any()); + sk->sk_err = ECONNRESET; + } else if (old_state == TCP_SYN_SENT) + sk->sk_err = ECONNRESET; + + tcp_clear_xmit_timers(sk); + __skb_queue_purge(&sk->sk_receive_queue); + sk_stream_writequeue_purge(sk); + __skb_queue_purge(&tp->out_of_order_queue); + + inet->dport = 0; + + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + sk->sk_shutdown = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->srtt = 0; + if ((tp->write_seq += tp->max_window + 2) == 0) + tp->write_seq = 1; + tp->backoff = 0; + tp->snd_cwnd = 2; + tp->probes_out = 0; + tp->packets_out = 0; + tp->snd_ssthresh = 0x7fffffff; + tp->snd_cwnd_cnt = 0; + tcp_set_ca_state(tp, TCP_CA_Open); + tcp_clear_retrans(tp); + tcp_delack_init(tp); + sk->sk_send_head = NULL; + tp->rx_opt.saw_tstamp = 0; + tcp_sack_reset(&tp->rx_opt); + __sk_dst_reset(sk); + + BUG_TRAP(!inet->num || tp->bind_hash); + + sk->sk_error_report(sk); + return err; +} + +/* + * Wait for an incoming connection, avoid race + * conditions. This must be called with the socket locked. + */ +static int wait_for_connect(struct sock *sk, long timeo) +{ + struct tcp_sock *tp = tcp_sk(sk); + DEFINE_WAIT(wait); + int err; + + /* + * True wake-one mechanism for incoming connections: only + * one process gets woken up, not the 'whole herd'. + * Since we do not 'race & poll' for established sockets + * anymore, the common case will execute the loop only once. 
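Editorial note: tcp_disconnect() above is typically reached from user space via connect() with an AF_UNSPEC address, which the address-family glue (not part of this file) turns into a call to sk->sk_prot->disconnect(). A minimal sketch of that usage:

#include <string.h>
#include <sys/socket.h>

/* Detach a socket from its peer; for TCP this is expected to end up in
 * tcp_disconnect(), aborting the connection and resetting most state. */
static int disconnect_socket(int sock)
{
        struct sockaddr sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_family = AF_UNSPEC;
        return connect(sock, &sa, sizeof(sa));
}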
+ * + * Subtle issue: "add_wait_queue_exclusive()" will be added + * after any current non-exclusive waiters, and we know that + * it will always _stay_ after any new non-exclusive waiters + * because all non-exclusive waiters are added at the + * beginning of the wait-queue. As such, it's ok to "drop" + * our exclusiveness temporarily when we get woken up without + * having to remove and re-insert us on the wait queue. + */ + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + release_sock(sk); + if (!tp->accept_queue) + timeo = schedule_timeout(timeo); + lock_sock(sk); + err = 0; + if (tp->accept_queue) + break; + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + break; + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + err = -EAGAIN; + if (!timeo) + break; + } + finish_wait(sk->sk_sleep, &wait); + return err; +} + +/* + * This will accept the next outstanding connection. + */ + +struct sock *tcp_accept(struct sock *sk, int flags, int *err) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct open_request *req; + struct sock *newsk; + int error; + + lock_sock(sk); + + /* We need to make sure that this socket is listening, + * and that it has something pending. + */ + error = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out; + + /* Find already established connection */ + if (!tp->accept_queue) { + long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK); + + /* If this is a non blocking socket don't sleep */ + error = -EAGAIN; + if (!timeo) + goto out; + + error = wait_for_connect(sk, timeo); + if (error) + goto out; + } + + req = tp->accept_queue; + if ((tp->accept_queue = req->dl_next) == NULL) + tp->accept_queue_tail = NULL; + + newsk = req->sk; + sk_acceptq_removed(sk); + tcp_openreq_fastfree(req); + BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); + release_sock(sk); + return newsk; + +out: + release_sock(sk); + *err = error; + return NULL; +} + +/* + * Socket option code for TCP. + */ +int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, + int optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int val; + int err = 0; + + if (level != SOL_TCP) + return tp->af_specific->setsockopt(sk, level, optname, + optval, optlen); + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + lock_sock(sk); + + switch (optname) { + case TCP_MAXSEG: + /* Values greater than interface MTU won't take effect. However + * at the point when this call is done we typically don't yet + * know which interface is going to be used */ + if (val < 8 || val > MAX_TCP_WINDOW) { + err = -EINVAL; + break; + } + tp->rx_opt.user_mss = val; + break; + + case TCP_NODELAY: + if (val) { + /* TCP_NODELAY is weaker than TCP_CORK, so that + * this option on corked socket is remembered, but + * it is not activated until cork is cleared. + * + * However, when TCP_NODELAY is set we make + * an explicit push, which overrides even TCP_CORK + * for currently queued segments. + */ + tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk, tp); + } else { + tp->nonagle &= ~TCP_NAGLE_OFF; + } + break; + + case TCP_CORK: + /* When set indicates to always queue non-full frames. + * Later the user clears this option and we transmit + * any pending partial frames in the queue. This is + * meant to be used alongside sendfile() to get properly + * filled frames when the user (for example) must write + * out headers with a write() call first and then use + * sendfile to send out the data parts. 
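Editorial note: the TCP_CORK comment below describes the intended usage pattern: cork, write the headers, hand the body to sendfile() (which comes back in through tcp_sendpage()), then uncork so the pending partial frame goes out. A user-space sketch of that sequence, error handling trimmed:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/sendfile.h>
#include <sys/socket.h>
#include <unistd.h>

/* Send an application header followed by a file body as well-filled
 * segments. "filefd" and the sizes are assumed valid; 0 on success. */
static int send_header_and_file(int sock, const void *hdr, size_t hdrlen,
                                int filefd, size_t filelen)
{
        int on = 1, off = 0;
        off_t fileoff = 0;

        setsockopt(sock, IPPROTO_TCP, TCP_CORK, &on, sizeof(on));
        if (write(sock, hdr, hdrlen) != (ssize_t)hdrlen)
                return -1;
        while (fileoff < (off_t)filelen)
                if (sendfile(sock, filefd, &fileoff, filelen - fileoff) <= 0)
                        return -1;
        /* clearing the cork pushes any remaining partial frame */
        return setsockopt(sock, IPPROTO_TCP, TCP_CORK, &off, sizeof(off));
}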
+ * + * TCP_CORK can be set together with TCP_NODELAY and it is + * stronger than TCP_NODELAY. + */ + if (val) { + tp->nonagle |= TCP_NAGLE_CORK; + } else { + tp->nonagle &= ~TCP_NAGLE_CORK; + if (tp->nonagle&TCP_NAGLE_OFF) + tp->nonagle |= TCP_NAGLE_PUSH; + tcp_push_pending_frames(sk, tp); + } + break; + + case TCP_KEEPIDLE: + if (val < 1 || val > MAX_TCP_KEEPIDLE) + err = -EINVAL; + else { + tp->keepalive_time = val * HZ; + if (sock_flag(sk, SOCK_KEEPOPEN) && + !((1 << sk->sk_state) & + (TCPF_CLOSE | TCPF_LISTEN))) { + __u32 elapsed = tcp_time_stamp - tp->rcv_tstamp; + if (tp->keepalive_time > elapsed) + elapsed = tp->keepalive_time - elapsed; + else + elapsed = 0; + tcp_reset_keepalive_timer(sk, elapsed); + } + } + break; + case TCP_KEEPINTVL: + if (val < 1 || val > MAX_TCP_KEEPINTVL) + err = -EINVAL; + else + tp->keepalive_intvl = val * HZ; + break; + case TCP_KEEPCNT: + if (val < 1 || val > MAX_TCP_KEEPCNT) + err = -EINVAL; + else + tp->keepalive_probes = val; + break; + case TCP_SYNCNT: + if (val < 1 || val > MAX_TCP_SYNCNT) + err = -EINVAL; + else + tp->syn_retries = val; + break; + + case TCP_LINGER2: + if (val < 0) + tp->linger2 = -1; + else if (val > sysctl_tcp_fin_timeout / HZ) + tp->linger2 = 0; + else + tp->linger2 = val * HZ; + break; + + case TCP_DEFER_ACCEPT: + tp->defer_accept = 0; + if (val > 0) { + /* Translate value in seconds to number of + * retransmits */ + while (tp->defer_accept < 32 && + val > ((TCP_TIMEOUT_INIT / HZ) << + tp->defer_accept)) + tp->defer_accept++; + tp->defer_accept++; + } + break; + + case TCP_WINDOW_CLAMP: + if (!val) { + if (sk->sk_state != TCP_CLOSE) { + err = -EINVAL; + break; + } + tp->window_clamp = 0; + } else + tp->window_clamp = val < SOCK_MIN_RCVBUF / 2 ? + SOCK_MIN_RCVBUF / 2 : val; + break; + + case TCP_QUICKACK: + if (!val) { + tp->ack.pingpong = 1; + } else { + tp->ack.pingpong = 0; + if ((1 << sk->sk_state) & + (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && + tcp_ack_scheduled(tp)) { + tp->ack.pending |= TCP_ACK_PUSHED; + cleanup_rbuf(sk, 1); + if (!(val & 1)) + tp->ack.pingpong = 1; + } + } + break; + + default: + err = -ENOPROTOOPT; + break; + }; + release_sock(sk); + return err; +} + +/* Return information about state of tcp endpoint in API format. 
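Editorial note: the TCP_DEFER_ACCEPT case above converts a timeout given in seconds into a count of SYN-ACK retransmit periods, since the retransmit interval starting at TCP_TIMEOUT_INIT doubles each round. The same conversion as a standalone helper (TCP_TIMEOUT_INIT/HZ passed in as a plain number of seconds):

/* Translate a defer-accept timeout in seconds into the number of
 * retransmit periods used internally, mirroring tcp_setsockopt(). */
static int defer_accept_periods(int seconds, int timeout_init_secs)
{
        int periods = 0;

        if (seconds <= 0)
                return 0;
        while (periods < 32 && seconds > (timeout_init_secs << periods))
                periods++;
        return periods + 1;
}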
*/ +void tcp_get_info(struct sock *sk, struct tcp_info *info) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 now = tcp_time_stamp; + + memset(info, 0, sizeof(*info)); + + info->tcpi_state = sk->sk_state; + info->tcpi_ca_state = tp->ca_state; + info->tcpi_retransmits = tp->retransmits; + info->tcpi_probes = tp->probes_out; + info->tcpi_backoff = tp->backoff; + + if (tp->rx_opt.tstamp_ok) + info->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->rx_opt.sack_ok) + info->tcpi_options |= TCPI_OPT_SACK; + if (tp->rx_opt.wscale_ok) { + info->tcpi_options |= TCPI_OPT_WSCALE; + info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; + info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; + } + + if (tp->ecn_flags&TCP_ECN_OK) + info->tcpi_options |= TCPI_OPT_ECN; + + info->tcpi_rto = jiffies_to_usecs(tp->rto); + info->tcpi_ato = jiffies_to_usecs(tp->ack.ato); + info->tcpi_snd_mss = tp->mss_cache_std; + info->tcpi_rcv_mss = tp->ack.rcv_mss; + + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + info->tcpi_lost = tp->lost_out; + info->tcpi_retrans = tp->retrans_out; + info->tcpi_fackets = tp->fackets_out; + + info->tcpi_last_data_sent = jiffies_to_msecs(now - tp->lsndtime); + info->tcpi_last_data_recv = jiffies_to_msecs(now - tp->ack.lrcvtime); + info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); + + info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; + info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; + info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; + info->tcpi_snd_ssthresh = tp->snd_ssthresh; + info->tcpi_snd_cwnd = tp->snd_cwnd; + info->tcpi_advmss = tp->advmss; + info->tcpi_reordering = tp->reordering; + + info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_space = tp->rcvq_space.space; + + info->tcpi_total_retrans = tp->total_retrans; +} + +EXPORT_SYMBOL_GPL(tcp_get_info); + +int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, + int __user *optlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int val, len; + + if (level != SOL_TCP) + return tp->af_specific->getsockopt(sk, level, optname, + optval, optlen); + + if (get_user(len, optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if (len < 0) + return -EINVAL; + + switch (optname) { + case TCP_MAXSEG: + val = tp->mss_cache_std; + if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + val = tp->rx_opt.user_mss; + break; + case TCP_NODELAY: + val = !!(tp->nonagle&TCP_NAGLE_OFF); + break; + case TCP_CORK: + val = !!(tp->nonagle&TCP_NAGLE_CORK); + break; + case TCP_KEEPIDLE: + val = (tp->keepalive_time ? : sysctl_tcp_keepalive_time) / HZ; + break; + case TCP_KEEPINTVL: + val = (tp->keepalive_intvl ? : sysctl_tcp_keepalive_intvl) / HZ; + break; + case TCP_KEEPCNT: + val = tp->keepalive_probes ? : sysctl_tcp_keepalive_probes; + break; + case TCP_SYNCNT: + val = tp->syn_retries ? : sysctl_tcp_syn_retries; + break; + case TCP_LINGER2: + val = tp->linger2; + if (val >= 0) + val = (val ? : sysctl_tcp_fin_timeout) / HZ; + break; + case TCP_DEFER_ACCEPT: + val = !tp->defer_accept ? 
0 : ((TCP_TIMEOUT_INIT / HZ) << + (tp->defer_accept - 1)); + break; + case TCP_WINDOW_CLAMP: + val = tp->window_clamp; + break; + case TCP_INFO: { + struct tcp_info info; + + if (get_user(len, optlen)) + return -EFAULT; + + tcp_get_info(sk, &info); + + len = min_t(unsigned int, len, sizeof(info)); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &info, len)) + return -EFAULT; + return 0; + } + case TCP_QUICKACK: + val = !tp->ack.pingpong; + break; + default: + return -ENOPROTOOPT; + }; + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + + +extern void __skb_cb_too_small_for_tcp(int, int); +extern void tcpdiag_init(void); + +static __initdata unsigned long thash_entries; +static int __init set_thash_entries(char *str) +{ + if (!str) + return 0; + thash_entries = simple_strtoul(str, &str, 0); + return 1; +} +__setup("thash_entries=", set_thash_entries); + +void __init tcp_init(void) +{ + struct sk_buff *skb = NULL; + int order, i; + + if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) + __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), + sizeof(skb->cb)); + + tcp_openreq_cachep = kmem_cache_create("tcp_open_request", + sizeof(struct open_request), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!tcp_openreq_cachep) + panic("tcp_init: Cannot alloc open_request cache."); + + tcp_bucket_cachep = kmem_cache_create("tcp_bind_bucket", + sizeof(struct tcp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!tcp_bucket_cachep) + panic("tcp_init: Cannot alloc tcp_bind_bucket cache."); + + tcp_timewait_cachep = kmem_cache_create("tcp_tw_bucket", + sizeof(struct tcp_tw_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!tcp_timewait_cachep) + panic("tcp_init: Cannot alloc tcp_tw_bucket cache."); + + /* Size and allocate the main established and bind bucket + * hash tables. + * + * The methodology is similar to that of the buffer cache. + */ + tcp_ehash = (struct tcp_ehash_bucket *) + alloc_large_system_hash("TCP established", + sizeof(struct tcp_ehash_bucket), + thash_entries, + (num_physpages >= 128 * 1024) ? + (25 - PAGE_SHIFT) : + (27 - PAGE_SHIFT), + HASH_HIGHMEM, + &tcp_ehash_size, + NULL, + 0); + tcp_ehash_size = (1 << tcp_ehash_size) >> 1; + for (i = 0; i < (tcp_ehash_size << 1); i++) { + rwlock_init(&tcp_ehash[i].lock); + INIT_HLIST_HEAD(&tcp_ehash[i].chain); + } + + tcp_bhash = (struct tcp_bind_hashbucket *) + alloc_large_system_hash("TCP bind", + sizeof(struct tcp_bind_hashbucket), + tcp_ehash_size, + (num_physpages >= 128 * 1024) ? + (25 - PAGE_SHIFT) : + (27 - PAGE_SHIFT), + HASH_HIGHMEM, + &tcp_bhash_size, + NULL, + 64 * 1024); + tcp_bhash_size = 1 << tcp_bhash_size; + for (i = 0; i < tcp_bhash_size; i++) { + spin_lock_init(&tcp_bhash[i].lock); + INIT_HLIST_HEAD(&tcp_bhash[i].chain); + } + + /* Try to be a bit smarter and adjust defaults depending + * on available memory. 
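Editorial note: tcp_get_info() above is what backs the TCP_INFO getsockopt; user space reads the whole structure in one call. A minimal sketch, assuming <netinet/tcp.h> exposes struct tcp_info as on Linux:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>
#include <sys/socket.h>

/* Dump a few of the counters tcp_get_info() exports. Illustrative only. */
static void print_tcp_info(int sock)
{
        struct tcp_info info;
        socklen_t len = sizeof(info);

        if (getsockopt(sock, IPPROTO_TCP, TCP_INFO, &info, &len) == 0)
                printf("rtt=%uus cwnd=%u retrans=%u\n",
                       info.tcpi_rtt, info.tcpi_snd_cwnd,
                       info.tcpi_total_retrans);
}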
+ */ + for (order = 0; ((1 << order) << PAGE_SHIFT) < + (tcp_bhash_size * sizeof(struct tcp_bind_hashbucket)); + order++) + ; + if (order > 4) { + sysctl_local_port_range[0] = 32768; + sysctl_local_port_range[1] = 61000; + sysctl_tcp_max_tw_buckets = 180000; + sysctl_tcp_max_orphans = 4096 << (order - 4); + sysctl_max_syn_backlog = 1024; + } else if (order < 3) { + sysctl_local_port_range[0] = 1024 * (3 - order); + sysctl_tcp_max_tw_buckets >>= (3 - order); + sysctl_tcp_max_orphans >>= (3 - order); + sysctl_max_syn_backlog = 128; + } + tcp_port_rover = sysctl_local_port_range[0] - 1; + + sysctl_tcp_mem[0] = 768 << order; + sysctl_tcp_mem[1] = 1024 << order; + sysctl_tcp_mem[2] = 1536 << order; + + if (order < 3) { + sysctl_tcp_wmem[2] = 64 * 1024; + sysctl_tcp_rmem[0] = PAGE_SIZE; + sysctl_tcp_rmem[1] = 43689; + sysctl_tcp_rmem[2] = 2 * 43689; + } + + printk(KERN_INFO "TCP: Hash tables configured " + "(established %d bind %d)\n", + tcp_ehash_size << 1, tcp_bhash_size); +} + +EXPORT_SYMBOL(tcp_accept); +EXPORT_SYMBOL(tcp_close); +EXPORT_SYMBOL(tcp_destroy_sock); +EXPORT_SYMBOL(tcp_disconnect); +EXPORT_SYMBOL(tcp_getsockopt); +EXPORT_SYMBOL(tcp_ioctl); +EXPORT_SYMBOL(tcp_openreq_cachep); +EXPORT_SYMBOL(tcp_poll); +EXPORT_SYMBOL(tcp_read_sock); +EXPORT_SYMBOL(tcp_recvmsg); +EXPORT_SYMBOL(tcp_sendmsg); +EXPORT_SYMBOL(tcp_sendpage); +EXPORT_SYMBOL(tcp_setsockopt); +EXPORT_SYMBOL(tcp_shutdown); +EXPORT_SYMBOL(tcp_statistics); +EXPORT_SYMBOL(tcp_timewait_cachep); diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c new file mode 100644 index 000000000000..313c1408da33 --- /dev/null +++ b/net/ipv4/tcp_diag.c @@ -0,0 +1,802 @@ +/* + * tcp_diag.c Module for monitoring TCP sockets. + * + * Version: $Id: tcp_diag.c,v 1.3 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Alexey Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
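Editorial note: tcp_init() above scales several defaults by the page order needed for the bind hash table; the tcp_mem watermarks in particular are fixed multiples of 2^order. Restated on its own, with the values copied straight from the code (this is a description, not tuning guidance):

/* How tcp_init() derives the tcp_mem[] page thresholds from "order". */
static void scale_tcp_mem(int order, long tcp_mem[3])
{
        tcp_mem[0] = 768L  << order;    /* below this: no memory pressure  */
        tcp_mem[1] = 1024L << order;    /* above this: enter pressure mode */
        tcp_mem[2] = 1536L << order;    /* hard limit on TCP page usage    */
}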
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +#include + +struct tcpdiag_entry +{ + u32 *saddr; + u32 *daddr; + u16 sport; + u16 dport; + u16 family; + u16 userlocks; +}; + +static struct sock *tcpnl; + + +#define TCPDIAG_PUT(skb, attrtype, attrlen) \ +({ int rtalen = RTA_LENGTH(attrlen); \ + struct rtattr *rta; \ + if (skb_tailroom(skb) < RTA_ALIGN(rtalen)) goto nlmsg_failure; \ + rta = (void*)__skb_put(skb, RTA_ALIGN(rtalen)); \ + rta->rta_type = attrtype; \ + rta->rta_len = rtalen; \ + RTA_DATA(rta); }) + +static int tcpdiag_fill(struct sk_buff *skb, struct sock *sk, + int ext, u32 pid, u32 seq, u16 nlmsg_flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcpdiagmsg *r; + struct nlmsghdr *nlh; + struct tcp_info *info = NULL; + struct tcpdiag_meminfo *minfo = NULL; + struct tcpvegas_info *vinfo = NULL; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh->nlmsg_flags = nlmsg_flags; + r = NLMSG_DATA(nlh); + if (sk->sk_state != TCP_TIME_WAIT) { + if (ext & (1<<(TCPDIAG_MEMINFO-1))) + minfo = TCPDIAG_PUT(skb, TCPDIAG_MEMINFO, sizeof(*minfo)); + if (ext & (1<<(TCPDIAG_INFO-1))) + info = TCPDIAG_PUT(skb, TCPDIAG_INFO, sizeof(*info)); + + if ((tcp_is_westwood(tp) || tcp_is_vegas(tp)) + && (ext & (1<<(TCPDIAG_VEGASINFO-1)))) + vinfo = TCPDIAG_PUT(skb, TCPDIAG_VEGASINFO, sizeof(*vinfo)); + } + r->tcpdiag_family = sk->sk_family; + r->tcpdiag_state = sk->sk_state; + r->tcpdiag_timer = 0; + r->tcpdiag_retrans = 0; + + r->id.tcpdiag_if = sk->sk_bound_dev_if; + r->id.tcpdiag_cookie[0] = (u32)(unsigned long)sk; + r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1); + + if (r->tcpdiag_state == TCP_TIME_WAIT) { + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket*)sk; + long tmo = tw->tw_ttd - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.tcpdiag_sport = tw->tw_sport; + r->id.tcpdiag_dport = tw->tw_dport; + r->id.tcpdiag_src[0] = tw->tw_rcv_saddr; + r->id.tcpdiag_dst[0] = tw->tw_daddr; + r->tcpdiag_state = tw->tw_substate; + r->tcpdiag_timer = 3; + r->tcpdiag_expires = (tmo*1000+HZ-1)/HZ; + r->tcpdiag_rqueue = 0; + r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = 0; + r->tcpdiag_inode = 0; +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + &tw->tw_v6_rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + &tw->tw_v6_daddr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + return skb->len; + } + + r->id.tcpdiag_sport = inet->sport; + r->id.tcpdiag_dport = inet->dport; + r->id.tcpdiag_src[0] = inet->rcv_saddr; + r->id.tcpdiag_dst[0] = inet->daddr; + +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + &np->rcv_saddr); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + &np->daddr); + } +#endif + +#define EXPIRES_IN_MS(tmo) ((tmo-jiffies)*1000+HZ-1)/HZ + + if (tp->pending == TCP_TIME_RETRANS) { + r->tcpdiag_timer = 1; + r->tcpdiag_retrans = tp->retransmits; + r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + } else if (tp->pending == TCP_TIME_PROBE0) { + r->tcpdiag_timer = 4; + r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_expires = EXPIRES_IN_MS(tp->timeout); + } else if (timer_pending(&sk->sk_timer)) { + r->tcpdiag_timer = 2; + r->tcpdiag_retrans = tp->probes_out; + r->tcpdiag_expires = 
EXPIRES_IN_MS(sk->sk_timer.expires); + } else { + r->tcpdiag_timer = 0; + r->tcpdiag_expires = 0; + } +#undef EXPIRES_IN_MS + + r->tcpdiag_rqueue = tp->rcv_nxt - tp->copied_seq; + r->tcpdiag_wqueue = tp->write_seq - tp->snd_una; + r->tcpdiag_uid = sock_i_uid(sk); + r->tcpdiag_inode = sock_i_ino(sk); + + if (minfo) { + minfo->tcpdiag_rmem = atomic_read(&sk->sk_rmem_alloc); + minfo->tcpdiag_wmem = sk->sk_wmem_queued; + minfo->tcpdiag_fmem = sk->sk_forward_alloc; + minfo->tcpdiag_tmem = atomic_read(&sk->sk_wmem_alloc); + } + + if (info) + tcp_get_info(sk, info); + + if (vinfo) { + if (tcp_is_vegas(tp)) { + vinfo->tcpv_enabled = tp->vegas.doing_vegas_now; + vinfo->tcpv_rttcnt = tp->vegas.cntRTT; + vinfo->tcpv_rtt = jiffies_to_usecs(tp->vegas.baseRTT); + vinfo->tcpv_minrtt = jiffies_to_usecs(tp->vegas.minRTT); + } else { + vinfo->tcpv_enabled = 0; + vinfo->tcpv_rttcnt = 0; + vinfo->tcpv_rtt = jiffies_to_usecs(tp->westwood.rtt); + vinfo->tcpv_minrtt = jiffies_to_usecs(tp->westwood.rtt_min); + } + } + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, + int dif); +#ifdef CONFIG_IP_TCPDIAG_IPV6 +extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif); +#else +static inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif) +{ + return NULL; +} +#endif + +static int tcpdiag_get_exact(struct sk_buff *in_skb, const struct nlmsghdr *nlh) +{ + int err; + struct sock *sk; + struct tcpdiagreq *req = NLMSG_DATA(nlh); + struct sk_buff *rep; + + if (req->tcpdiag_family == AF_INET) { + sk = tcp_v4_lookup(req->id.tcpdiag_dst[0], req->id.tcpdiag_dport, + req->id.tcpdiag_src[0], req->id.tcpdiag_sport, + req->id.tcpdiag_if); + } +#ifdef CONFIG_IP_TCPDIAG_IPV6 + else if (req->tcpdiag_family == AF_INET6) { + sk = tcp_v6_lookup((struct in6_addr*)req->id.tcpdiag_dst, req->id.tcpdiag_dport, + (struct in6_addr*)req->id.tcpdiag_src, req->id.tcpdiag_sport, + req->id.tcpdiag_if); + } +#endif + else { + return -EINVAL; + } + + if (sk == NULL) + return -ENOENT; + + err = -ESTALE; + if ((req->id.tcpdiag_cookie[0] != TCPDIAG_NOCOOKIE || + req->id.tcpdiag_cookie[1] != TCPDIAG_NOCOOKIE) && + ((u32)(unsigned long)sk != req->id.tcpdiag_cookie[0] || + (u32)((((unsigned long)sk) >> 31) >> 1) != req->id.tcpdiag_cookie[1])) + goto out; + + err = -ENOMEM; + rep = alloc_skb(NLMSG_SPACE(sizeof(struct tcpdiagmsg)+ + sizeof(struct tcpdiag_meminfo)+ + sizeof(struct tcp_info)+64), GFP_KERNEL); + if (!rep) + goto out; + + if (tcpdiag_fill(rep, sk, req->tcpdiag_ext, + NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, 0) <= 0) + BUG(); + + err = netlink_unicast(tcpnl, rep, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; + +out: + if (sk) { + if (sk->sk_state == TCP_TIME_WAIT) + tcp_tw_put((struct tcp_tw_bucket*)sk); + else + sock_put(sk); + } + return err; +} + +static int bitstring_match(const u32 *a1, const u32 *a2, int bits) +{ + int words = bits >> 5; + + bits &= 0x1f; + + if (words) { + if (memcmp(a1, a2, words << 2)) + return 0; + } + if (bits) { + __u32 w1, w2; + __u32 mask; + + w1 = a1[words]; + w2 = a2[words]; + + mask = htonl((0xffffffff) << (32 - bits)); + + if ((w1 ^ w2) & mask) + return 0; + } + + return 1; +} + + +static int tcpdiag_bc_run(const void *bc, int len, + const struct tcpdiag_entry *entry) +{ + while (len > 0) { + int yes = 1; + const struct 
tcpdiag_bc_op *op = bc; + + switch (op->code) { + case TCPDIAG_BC_NOP: + break; + case TCPDIAG_BC_JMP: + yes = 0; + break; + case TCPDIAG_BC_S_GE: + yes = entry->sport >= op[1].no; + break; + case TCPDIAG_BC_S_LE: + yes = entry->dport <= op[1].no; + break; + case TCPDIAG_BC_D_GE: + yes = entry->dport >= op[1].no; + break; + case TCPDIAG_BC_D_LE: + yes = entry->dport <= op[1].no; + break; + case TCPDIAG_BC_AUTO: + yes = !(entry->userlocks & SOCK_BINDPORT_LOCK); + break; + case TCPDIAG_BC_S_COND: + case TCPDIAG_BC_D_COND: + { + struct tcpdiag_hostcond *cond = (struct tcpdiag_hostcond*)(op+1); + u32 *addr; + + if (cond->port != -1 && + cond->port != (op->code == TCPDIAG_BC_S_COND ? + entry->sport : entry->dport)) { + yes = 0; + break; + } + + if (cond->prefix_len == 0) + break; + + if (op->code == TCPDIAG_BC_S_COND) + addr = entry->saddr; + else + addr = entry->daddr; + + if (bitstring_match(addr, cond->addr, cond->prefix_len)) + break; + if (entry->family == AF_INET6 && + cond->family == AF_INET) { + if (addr[0] == 0 && addr[1] == 0 && + addr[2] == htonl(0xffff) && + bitstring_match(addr+3, cond->addr, cond->prefix_len)) + break; + } + yes = 0; + break; + } + } + + if (yes) { + len -= op->yes; + bc += op->yes; + } else { + len -= op->no; + bc += op->no; + } + } + return (len == 0); +} + +static int valid_cc(const void *bc, int len, int cc) +{ + while (len >= 0) { + const struct tcpdiag_bc_op *op = bc; + + if (cc > len) + return 0; + if (cc == len) + return 1; + if (op->yes < 4) + return 0; + len -= op->yes; + bc += op->yes; + } + return 0; +} + +static int tcpdiag_bc_audit(const void *bytecode, int bytecode_len) +{ + const unsigned char *bc = bytecode; + int len = bytecode_len; + + while (len > 0) { + struct tcpdiag_bc_op *op = (struct tcpdiag_bc_op*)bc; + +//printk("BC: %d %d %d {%d} / %d\n", op->code, op->yes, op->no, op[1].no, len); + switch (op->code) { + case TCPDIAG_BC_AUTO: + case TCPDIAG_BC_S_COND: + case TCPDIAG_BC_D_COND: + case TCPDIAG_BC_S_GE: + case TCPDIAG_BC_S_LE: + case TCPDIAG_BC_D_GE: + case TCPDIAG_BC_D_LE: + if (op->yes < 4 || op->yes > len+4) + return -EINVAL; + case TCPDIAG_BC_JMP: + if (op->no < 4 || op->no > len+4) + return -EINVAL; + if (op->no < len && + !valid_cc(bytecode, bytecode_len, len-op->no)) + return -EINVAL; + break; + case TCPDIAG_BC_NOP: + if (op->yes < 4 || op->yes > len+4) + return -EINVAL; + break; + default: + return -EINVAL; + } + bc += op->yes; + len -= op->yes; + } + return len == 0 ? 
0 : -EINVAL; +} + +static int tcpdiag_dump_sock(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + struct tcpdiag_entry entry; + struct rtattr *bc = (struct rtattr *)(r + 1); + struct inet_sock *inet = inet_sk(sk); + + entry.family = sk->sk_family; +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (entry.family == AF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + entry.saddr = np->rcv_saddr.s6_addr32; + entry.daddr = np->daddr.s6_addr32; + } else +#endif + { + entry.saddr = &inet->rcv_saddr; + entry.daddr = &inet->daddr; + } + entry.sport = inet->num; + entry.dport = ntohs(inet->dport); + entry.userlocks = sk->sk_userlocks; + + if (!tcpdiag_bc_run(RTA_DATA(bc), RTA_PAYLOAD(bc), &entry)) + return 0; + } + + return tcpdiag_fill(skb, sk, r->tcpdiag_ext, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI); +} + +static int tcpdiag_fill_req(struct sk_buff *skb, struct sock *sk, + struct open_request *req, + u32 pid, u32 seq) +{ + struct inet_sock *inet = inet_sk(sk); + unsigned char *b = skb->tail; + struct tcpdiagmsg *r; + struct nlmsghdr *nlh; + long tmo; + + nlh = NLMSG_PUT(skb, pid, seq, TCPDIAG_GETSOCK, sizeof(*r)); + nlh->nlmsg_flags = NLM_F_MULTI; + r = NLMSG_DATA(nlh); + + r->tcpdiag_family = sk->sk_family; + r->tcpdiag_state = TCP_SYN_RECV; + r->tcpdiag_timer = 1; + r->tcpdiag_retrans = req->retrans; + + r->id.tcpdiag_if = sk->sk_bound_dev_if; + r->id.tcpdiag_cookie[0] = (u32)(unsigned long)req; + r->id.tcpdiag_cookie[1] = (u32)(((unsigned long)req >> 31) >> 1); + + tmo = req->expires - jiffies; + if (tmo < 0) + tmo = 0; + + r->id.tcpdiag_sport = inet->sport; + r->id.tcpdiag_dport = req->rmt_port; + r->id.tcpdiag_src[0] = req->af.v4_req.loc_addr; + r->id.tcpdiag_dst[0] = req->af.v4_req.rmt_addr; + r->tcpdiag_expires = jiffies_to_msecs(tmo), + r->tcpdiag_rqueue = 0; + r->tcpdiag_wqueue = 0; + r->tcpdiag_uid = sock_i_uid(sk); + r->tcpdiag_inode = 0; +#ifdef CONFIG_IP_TCPDIAG_IPV6 + if (r->tcpdiag_family == AF_INET6) { + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_src, + &req->af.v6_req.loc_addr); + ipv6_addr_copy((struct in6_addr *)r->id.tcpdiag_dst, + &req->af.v6_req.rmt_addr); + } +#endif + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tcpdiag_dump_reqs(struct sk_buff *skb, struct sock *sk, + struct netlink_callback *cb) +{ + struct tcpdiag_entry entry; + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt; + struct rtattr *bc = NULL; + struct inet_sock *inet = inet_sk(sk); + int j, s_j; + int reqnum, s_reqnum; + int err = 0; + + s_j = cb->args[3]; + s_reqnum = cb->args[4]; + + if (s_j > 0) + s_j--; + + entry.family = sk->sk_family; + + read_lock_bh(&tp->syn_wait_lock); + + lopt = tp->listen_opt; + if (!lopt || !lopt->qlen) + goto out; + + if (cb->nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(*r))) { + bc = (struct rtattr *)(r + 1); + entry.sport = inet->num; + entry.userlocks = sk->sk_userlocks; + } + + for (j = s_j; j < TCP_SYNQ_HSIZE; j++) { + struct open_request *req, *head = lopt->syn_table[j]; + + reqnum = 0; + for (req = head; req; reqnum++, req = req->dl_next) { + if (reqnum < s_reqnum) + continue; + if (r->id.tcpdiag_dport != req->rmt_port && + r->id.tcpdiag_dport) + continue; + + if (bc) { + entry.saddr = +#ifdef CONFIG_IP_TCPDIAG_IPV6 + (entry.family == AF_INET6) ? 
+ req->af.v6_req.loc_addr.s6_addr32 : +#endif + &req->af.v4_req.loc_addr; + entry.daddr = +#ifdef CONFIG_IP_TCPDIAG_IPV6 + (entry.family == AF_INET6) ? + req->af.v6_req.rmt_addr.s6_addr32 : +#endif + &req->af.v4_req.rmt_addr; + entry.dport = ntohs(req->rmt_port); + + if (!tcpdiag_bc_run(RTA_DATA(bc), + RTA_PAYLOAD(bc), &entry)) + continue; + } + + err = tcpdiag_fill_req(skb, sk, req, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq); + if (err < 0) { + cb->args[3] = j + 1; + cb->args[4] = reqnum; + goto out; + } + } + + s_reqnum = 0; + } + +out: + read_unlock_bh(&tp->syn_wait_lock); + + return err; +} + +static int tcpdiag_dump(struct sk_buff *skb, struct netlink_callback *cb) +{ + int i, num; + int s_i, s_num; + struct tcpdiagreq *r = NLMSG_DATA(cb->nlh); + + s_i = cb->args[1]; + s_num = num = cb->args[2]; + + if (cb->args[0] == 0) { + if (!(r->tcpdiag_states&(TCPF_LISTEN|TCPF_SYN_RECV))) + goto skip_listen_ht; + tcp_listen_lock(); + for (i = s_i; i < TCP_LHTABLE_SIZE; i++) { + struct sock *sk; + struct hlist_node *node; + + num = 0; + sk_for_each(sk, node, &tcp_listening_hash[i]) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) { + num++; + continue; + } + + if (r->id.tcpdiag_sport != inet->sport && + r->id.tcpdiag_sport) + goto next_listen; + + if (!(r->tcpdiag_states&TCPF_LISTEN) || + r->id.tcpdiag_dport || + cb->args[3] > 0) + goto syn_recv; + + if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + tcp_listen_unlock(); + goto done; + } + +syn_recv: + if (!(r->tcpdiag_states&TCPF_SYN_RECV)) + goto next_listen; + + if (tcpdiag_dump_reqs(skb, sk, cb) < 0) { + tcp_listen_unlock(); + goto done; + } + +next_listen: + cb->args[3] = 0; + cb->args[4] = 0; + ++num; + } + + s_num = 0; + cb->args[3] = 0; + cb->args[4] = 0; + } + tcp_listen_unlock(); +skip_listen_ht: + cb->args[0] = 1; + s_i = num = s_num = 0; + } + + if (!(r->tcpdiag_states&~(TCPF_LISTEN|TCPF_SYN_RECV))) + return skb->len; + + for (i = s_i; i < tcp_ehash_size; i++) { + struct tcp_ehash_bucket *head = &tcp_ehash[i]; + struct sock *sk; + struct hlist_node *node; + + if (i > s_i) + s_num = 0; + + read_lock_bh(&head->lock); + + num = 0; + sk_for_each(sk, node, &head->chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_normal; + if (!(r->tcpdiag_states & (1 << sk->sk_state))) + goto next_normal; + if (r->id.tcpdiag_sport != inet->sport && + r->id.tcpdiag_sport) + goto next_normal; + if (r->id.tcpdiag_dport != inet->dport && r->id.tcpdiag_dport) + goto next_normal; + if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_normal: + ++num; + } + + if (r->tcpdiag_states&TCPF_TIME_WAIT) { + sk_for_each(sk, node, + &tcp_ehash[i + tcp_ehash_size].chain) { + struct inet_sock *inet = inet_sk(sk); + + if (num < s_num) + goto next_dying; + if (r->id.tcpdiag_sport != inet->sport && + r->id.tcpdiag_sport) + goto next_dying; + if (r->id.tcpdiag_dport != inet->dport && + r->id.tcpdiag_dport) + goto next_dying; + if (tcpdiag_dump_sock(skb, sk, cb) < 0) { + read_unlock_bh(&head->lock); + goto done; + } +next_dying: + ++num; + } + } + read_unlock_bh(&head->lock); + } + +done: + cb->args[1] = i; + cb->args[2] = num; + return skb->len; +} + +static int tcpdiag_dump_done(struct netlink_callback *cb) +{ + return 0; +} + + +static __inline__ int +tcpdiag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +{ + if (!(nlh->nlmsg_flags&NLM_F_REQUEST)) + return 0; + + if (nlh->nlmsg_type != TCPDIAG_GETSOCK) + goto err_inval; + + if (NLMSG_LENGTH(sizeof(struct tcpdiagreq)) > skb->len) + 
goto err_inval; + + if (nlh->nlmsg_flags&NLM_F_DUMP) { + if (nlh->nlmsg_len > 4 + NLMSG_SPACE(sizeof(struct tcpdiagreq))) { + struct rtattr *rta = (struct rtattr*)(NLMSG_DATA(nlh) + sizeof(struct tcpdiagreq)); + if (rta->rta_type != TCPDIAG_REQ_BYTECODE || + rta->rta_len < 8 || + rta->rta_len > nlh->nlmsg_len - NLMSG_SPACE(sizeof(struct tcpdiagreq))) + goto err_inval; + if (tcpdiag_bc_audit(RTA_DATA(rta), RTA_PAYLOAD(rta))) + goto err_inval; + } + return netlink_dump_start(tcpnl, skb, nlh, + tcpdiag_dump, + tcpdiag_dump_done); + } else { + return tcpdiag_get_exact(skb, nlh); + } + +err_inval: + return -EINVAL; +} + + +static inline void tcpdiag_rcv_skb(struct sk_buff *skb) +{ + int err; + struct nlmsghdr * nlh; + + if (skb->len >= NLMSG_SPACE(0)) { + nlh = (struct nlmsghdr *)skb->data; + if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) + return; + err = tcpdiag_rcv_msg(skb, nlh); + if (err || nlh->nlmsg_flags & NLM_F_ACK) + netlink_ack(skb, nlh, err); + } +} + +static void tcpdiag_rcv(struct sock *sk, int len) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + tcpdiag_rcv_skb(skb); + kfree_skb(skb); + } +} + +static int __init tcpdiag_init(void) +{ + tcpnl = netlink_kernel_create(NETLINK_TCPDIAG, tcpdiag_rcv); + if (tcpnl == NULL) + return -ENOMEM; + return 0; +} + +static void __exit tcpdiag_exit(void) +{ + sock_release(tcpnl->sk_socket); +} + +module_init(tcpdiag_init); +module_exit(tcpdiag_exit); +MODULE_LICENSE("GPL"); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c new file mode 100644 index 000000000000..250492735902 --- /dev/null +++ b/net/ipv4/tcp_input.c @@ -0,0 +1,4959 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_input.c,v 1.243 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +/* + * Changes: + * Pedro Roque : Fast Retransmit/Recovery. + * Two receive queues. + * Retransmit queue handled by TCP. + * Better retransmit timer handling. + * New congestion avoidance. + * Header prediction. + * Variable renaming. + * + * Eric : Fast Retransmit. + * Randy Scott : MSS option defines. + * Eric Schenk : Fixes to slow start algorithm. + * Eric Schenk : Yet another double ACK bug. + * Eric Schenk : Delayed ACK bug fixes. + * Eric Schenk : Floyd style fast retrans war avoidance. + * David S. Miller : Don't allow zero congestion window. + * Eric Schenk : Fix retransmitter so that it sends + * next packet on ack of previous packet. + * Andi Kleen : Moved open_request checking here + * and process RSTs for open_requests. + * Andi Kleen : Better prune_queue, and other fixes. + * Andrey Savochkin: Fix RTT measurements in the presnce of + * timestamps. + * Andrey Savochkin: Check sequence numbers correctly when + * removing SACKs due to in sequence incoming + * data segments. + * Andi Kleen: Make sure we never ack data there is not + * enough room for. Also make this condition + * a fatal error if it might still happen. 
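Editorial note: tcpdiag_rcv_msg() above accepts TCPDIAG_GETSOCK requests over a NETLINK_TCPDIAG socket, typically as an NLM_F_DUMP of all sockets (the interface the ss(8) tool later built on). A rough user-space sketch of issuing such a dump; the request structures below are illustrative stand-ins whose layout is assumed from the field names used in the code, since the real uapi header is not part of this file, and the TCPDIAG_GETSOCK value is likewise assumed:

#include <linux/netlink.h>
#include <string.h>
#include <sys/socket.h>

/* Stand-in for the sockid/request structures from the uapi header. */
struct diag_sockid {
        unsigned short sport, dport;            /* network byte order */
        unsigned int   src[4], dst[4];
        unsigned int   ifindex;
        unsigned int   cookie[2];
};

struct diag_req {
        unsigned char      family, src_len, dst_len, ext;
        struct diag_sockid id;
        unsigned int       states;              /* bitmask of wanted TCPF_* states */
        unsigned int       dbs;
};

/* Ask the module to dump every IPv4 TCP socket. "nlsock" is assumed to be
 * a socket(AF_NETLINK, SOCK_RAW, NETLINK_TCPDIAG) descriptor. */
static int request_tcp_dump(int nlsock)
{
        struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
        struct {
                struct nlmsghdr nlh;
                struct diag_req req;
        } msg;

        memset(&msg, 0, sizeof(msg));
        msg.nlh.nlmsg_len   = sizeof(msg);
        msg.nlh.nlmsg_type  = 18;               /* TCPDIAG_GETSOCK, value assumed */
        msg.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
        msg.req.family      = AF_INET;
        msg.req.states      = ~0u;              /* all states */

        return sendto(nlsock, &msg, sizeof(msg), 0,
                      (struct sockaddr *)&kernel, sizeof(kernel)) < 0 ? -1 : 0;
}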
+ * Andi Kleen: Add tcp_measure_rcv_mss to make + * connections with MSS +#include +#include +#include +#include +#include +#include +#include + +int sysctl_tcp_timestamps = 1; +int sysctl_tcp_window_scaling = 1; +int sysctl_tcp_sack = 1; +int sysctl_tcp_fack = 1; +int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH; +int sysctl_tcp_ecn; +int sysctl_tcp_dsack = 1; +int sysctl_tcp_app_win = 31; +int sysctl_tcp_adv_win_scale = 2; + +int sysctl_tcp_stdurg; +int sysctl_tcp_rfc1337; +int sysctl_tcp_max_orphans = NR_FILE; +int sysctl_tcp_frto; +int sysctl_tcp_nometrics_save; +int sysctl_tcp_westwood; +int sysctl_tcp_vegas_cong_avoid; + +int sysctl_tcp_moderate_rcvbuf = 1; + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +int sysctl_tcp_vegas_alpha = 1<rx_opt.sack_ok == 0) +#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) +#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) + +#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) + +/* Adapt the MSS value used to make delayed ack decision to the + * real world. + */ +static inline void tcp_measure_rcv_mss(struct tcp_sock *tp, + struct sk_buff *skb) +{ + unsigned int len, lss; + + lss = tp->ack.last_seg_size; + tp->ack.last_seg_size = 0; + + /* skb->len may jitter because of SACKs, even if peer + * sends good full-sized frames. + */ + len = skb->len; + if (len >= tp->ack.rcv_mss) { + tp->ack.rcv_mss = len; + } else { + /* Otherwise, we make more careful check taking into account, + * that SACKs block is variable. + * + * "len" is invariant segment length, including TCP header. + */ + len += skb->data - skb->h.raw; + if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || + /* If PSH is not set, packet should be + * full sized, provided peer TCP is not badly broken. + * This observation (if it is correct 8)) allows + * to handle super-low mtu links fairly. + */ + (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && + !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + /* Subtract also invariant (if peer is RFC compliant), + * tcp header plus fixed timestamp option length. + * Resulting "len" is MSS free of SACK jitter. + */ + len -= tp->tcp_header_len; + tp->ack.last_seg_size = len; + if (len == lss) { + tp->ack.rcv_mss = len; + return; + } + } + tp->ack.pending |= TCP_ACK_PUSHED; + } +} + +static void tcp_incr_quickack(struct tcp_sock *tp) +{ + unsigned quickacks = tp->rcv_wnd/(2*tp->ack.rcv_mss); + + if (quickacks==0) + quickacks=2; + if (quickacks > tp->ack.quick) + tp->ack.quick = min(quickacks, TCP_MAX_QUICKACKS); +} + +void tcp_enter_quickack_mode(struct tcp_sock *tp) +{ + tcp_incr_quickack(tp); + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; +} + +/* Send ACKs quickly, if "quick" count is not exhausted + * and the session is not interactive. + */ + +static __inline__ int tcp_in_quickack_mode(struct tcp_sock *tp) +{ + return (tp->ack.quick && !tp->ack.pingpong); +} + +/* Buffer size and advertised window tuning. + * + * 1. Tuning sk->sk_sndbuf, when connection enters established state. + */ + +static void tcp_fixup_sndbuf(struct sock *sk) +{ + int sndmem = tcp_sk(sk)->rx_opt.mss_clamp + MAX_TCP_HEADER + 16 + + sizeof(struct sk_buff); + + if (sk->sk_sndbuf < 3 * sndmem) + sk->sk_sndbuf = min(3 * sndmem, sysctl_tcp_wmem[2]); +} + +/* 2. 
Tuning advertised window (window_clamp, rcv_ssthresh) + * + * All tcp_full_space() is split to two parts: "network" buffer, allocated + * forward and advertised in receiver window (tp->rcv_wnd) and + * "application buffer", required to isolate scheduling/application + * latencies from network. + * window_clamp is maximal advertised window. It can be less than + * tcp_full_space(), in this case tcp_full_space() - window_clamp + * is reserved for "application" buffer. The less window_clamp is + * the smoother our behaviour from viewpoint of network, but the lower + * throughput and the higher sensitivity of the connection to losses. 8) + * + * rcv_ssthresh is more strict window_clamp used at "slow start" + * phase to predict further behaviour of this connection. + * It is used for two goals: + * - to enforce header prediction at sender, even when application + * requires some significant "application buffer". It is check #1. + * - to prevent pruning of receive queue because of misprediction + * of receiver window. Check #2. + * + * The scheme does not work when sender sends good segments opening + * window and then starts to feed us spagetti. But it should work + * in common situations. Otherwise, we have to rely on queue collapsing. + */ + +/* Slow part of check#2. */ +static int __tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + /* Optimize this! */ + int truesize = tcp_win_from_space(skb->truesize)/2; + int window = tcp_full_space(sk)/2; + + while (tp->rcv_ssthresh <= window) { + if (truesize <= skb->len) + return 2*tp->ack.rcv_mss; + + truesize >>= 1; + window >>= 1; + } + return 0; +} + +static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + /* Check #1 */ + if (tp->rcv_ssthresh < tp->window_clamp && + (int)tp->rcv_ssthresh < tcp_space(sk) && + !tcp_memory_pressure) { + int incr; + + /* Check #2. Increase window, if skb with such overhead + * will fit to rcvbuf in future. + */ + if (tcp_win_from_space(skb->truesize) <= skb->len) + incr = 2*tp->advmss; + else + incr = __tcp_grow_window(sk, tp, skb); + + if (incr) { + tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); + tp->ack.quick |= 1; + } + } +} + +/* 3. Tuning rcvbuf, when connection enters established state. */ + +static void tcp_fixup_rcvbuf(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int rcvmem = tp->advmss + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff); + + /* Try to select rcvbuf so that 4 mss-sized segments + * will fit to window and correspoding skbs will fit to our rcvbuf. + * (was 3; 4 is minimum to allow fast retransmit to work.) + */ + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + if (sk->sk_rcvbuf < 4 * rcvmem) + sk->sk_rcvbuf = min(4 * rcvmem, sysctl_tcp_rmem[2]); +} + +/* 4. Try to fixup all. It is made iimediately after connection enters + * established state. + */ +static void tcp_init_buffer_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int maxwin; + + if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) + tcp_fixup_rcvbuf(sk); + if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) + tcp_fixup_sndbuf(sk); + + tp->rcvq_space.space = tp->rcv_wnd; + + maxwin = tcp_full_space(sk); + + if (tp->window_clamp >= maxwin) { + tp->window_clamp = maxwin; + + if (sysctl_tcp_app_win && maxwin > 4 * tp->advmss) + tp->window_clamp = max(maxwin - + (maxwin >> sysctl_tcp_app_win), + 4 * tp->advmss); + } + + /* Force reservation of one segment. 
*/ + if (sysctl_tcp_app_win && + tp->window_clamp > 2 * tp->advmss && + tp->window_clamp + tp->advmss > maxwin) + tp->window_clamp = max(2 * tp->advmss, maxwin - tp->advmss); + + tp->rcv_ssthresh = min(tp->rcv_ssthresh, tp->window_clamp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void init_bictcp(struct tcp_sock *tp) +{ + tp->bictcp.cnt = 0; + + tp->bictcp.last_max_cwnd = 0; + tp->bictcp.last_cwnd = 0; + tp->bictcp.last_stamp = 0; +} + +/* 5. Recalculate window clamp after socket hit its memory bounds. */ +static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb; + unsigned int app_win = tp->rcv_nxt - tp->copied_seq; + int ofo_win = 0; + + tp->ack.quick = 0; + + skb_queue_walk(&tp->out_of_order_queue, skb) { + ofo_win += skb->len; + } + + /* If overcommit is due to out of order segments, + * do not clamp window. Try to expand rcvbuf instead. + */ + if (ofo_win) { + if (sk->sk_rcvbuf < sysctl_tcp_rmem[2] && + !(sk->sk_userlocks & SOCK_RCVBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) + sk->sk_rcvbuf = min(atomic_read(&sk->sk_rmem_alloc), + sysctl_tcp_rmem[2]); + } + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf) { + app_win += ofo_win; + if (atomic_read(&sk->sk_rmem_alloc) >= 2 * sk->sk_rcvbuf) + app_win >>= 1; + if (app_win > tp->ack.rcv_mss) + app_win -= tp->ack.rcv_mss; + app_win = max(app_win, 2U*tp->advmss); + + if (!ofo_win) + tp->window_clamp = min(tp->window_clamp, app_win); + tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); + } +} + +/* Receiver "autotuning" code. + * + * The algorithm for RTT estimation w/o timestamps is based on + * Dynamic Right-Sizing (DRS) by Wu Feng and Mike Fisk of LANL. + * + * + * More detail on this code can be found at + * , + * though this reference is out of date. A new paper + * is pending. + */ +static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) +{ + u32 new_sample = tp->rcv_rtt_est.rtt; + long m = sample; + + if (m == 0) + m = 1; + + if (new_sample != 0) { + /* If we sample in larger samples in the non-timestamp + * case, we could grossly overestimate the RTT especially + * with chatty applications or bulk transfer apps which + * are stalled on filesystem I/O. + * + * Also, since we are only going for a minimum in the + * non-timestamp case, we do not smoothe things out + * else with timestamps disabled convergance takes too + * long. + */ + if (!win_dep) { + m -= (new_sample >> 3); + new_sample += m; + } else if (m < new_sample) + new_sample = m << 3; + } else { + /* No previous mesaure. */ + new_sample = m << 3; + } + + if (tp->rcv_rtt_est.rtt != new_sample) + tp->rcv_rtt_est.rtt = new_sample; +} + +static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) +{ + if (tp->rcv_rtt_est.time == 0) + goto new_measure; + if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) + return; + tcp_rcv_rtt_update(tp, + jiffies - tp->rcv_rtt_est.time, + 1); + +new_measure: + tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; + tp->rcv_rtt_est.time = tcp_time_stamp; +} + +static inline void tcp_rcv_rtt_measure_ts(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tp->rx_opt.rcv_tsecr && + (TCP_SKB_CB(skb)->end_seq - + TCP_SKB_CB(skb)->seq >= tp->ack.rcv_mss)) + tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); +} + +/* + * This function should be called every time data is copied to user space. + * It calculates the appropriate TCP receive buffer space. 
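/*
 * Illustrative sketch (editor's addition, not part of the original patch):
 * the arithmetic behind the receive-buffer autotuning ("DRS") discussed
 * here, reduced to a stand-alone function. The MSS, per-segment overhead
 * and upper bound below are example values, not kernel constants, and the
 * tcp_win_from_space() scaling is omitted for brevity.
 */
#include <stdio.h>

#define EXAMPLE_MSS       1460            /* assumed advertised MSS            */
#define EXAMPLE_OVERHEAD   256            /* assumed per-segment metadata cost */
#define EXAMPLE_RMEM_MAX  (4 * 1024 * 1024)

/* copied_per_rtt: bytes the application consumed during the last measured RTT */
static int drs_rcvbuf(int copied_per_rtt)
{
	int space = 2 * copied_per_rtt;       /* headroom: twice what one RTT used */
	int segs = space / EXAMPLE_MSS;
	int rcvmem = EXAMPLE_MSS + EXAMPLE_OVERHEAD;

	if (segs < 1)
		segs = 1;
	space = segs * rcvmem;                /* charge per-segment overhead too   */
	if (space > EXAMPLE_RMEM_MAX)
		space = EXAMPLE_RMEM_MAX;
	return space;
}

int main(void)
{
	printf("copied 64KB per RTT -> rcvbuf %d bytes\n", drs_rcvbuf(64 * 1024));
	return 0;
}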
+ */ +void tcp_rcv_space_adjust(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int time; + int space; + + if (tp->rcvq_space.time == 0) + goto new_measure; + + time = tcp_time_stamp - tp->rcvq_space.time; + if (time < (tp->rcv_rtt_est.rtt >> 3) || + tp->rcv_rtt_est.rtt == 0) + return; + + space = 2 * (tp->copied_seq - tp->rcvq_space.seq); + + space = max(tp->rcvq_space.space, space); + + if (tp->rcvq_space.space != space) { + int rcvmem; + + tp->rcvq_space.space = space; + + if (sysctl_tcp_moderate_rcvbuf) { + int new_clamp = space; + + /* Receive space grows, normalize in order to + * take into account packet headers and sk_buff + * structure overhead. + */ + space /= tp->advmss; + if (!space) + space = 1; + rcvmem = (tp->advmss + MAX_TCP_HEADER + + 16 + sizeof(struct sk_buff)); + while (tcp_win_from_space(rcvmem) < tp->advmss) + rcvmem += 128; + space *= rcvmem; + space = min(space, sysctl_tcp_rmem[2]); + if (space > sk->sk_rcvbuf) { + sk->sk_rcvbuf = space; + + /* Make the window clamp follow along. */ + tp->window_clamp = new_clamp; + } + } + } + +new_measure: + tp->rcvq_space.seq = tp->copied_seq; + tp->rcvq_space.time = tcp_time_stamp; +} + +/* There is something which you must keep in mind when you analyze the + * behavior of the tp->ato delayed ack timeout interval. When a + * connection starts up, we want to ack as quickly as possible. The + * problem is that "good" TCP's do slow start at the beginning of data + * transmission. The means that until we send the first few ACK's the + * sender will sit on his end and only queue most of his data, because + * he can only send snd_cwnd unacked packets at any given time. For + * each ACK we send, he increments snd_cwnd and transmits more of his + * queue. -DaveM + */ +static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +{ + u32 now; + + tcp_schedule_ack(tp); + + tcp_measure_rcv_mss(tp, skb); + + tcp_rcv_rtt_measure(tp); + + now = tcp_time_stamp; + + if (!tp->ack.ato) { + /* The _first_ data packet received, initialize + * delayed ACK engine. + */ + tcp_incr_quickack(tp); + tp->ack.ato = TCP_ATO_MIN; + } else { + int m = now - tp->ack.lrcvtime; + + if (m <= TCP_ATO_MIN/2) { + /* The fastest case is the first. */ + tp->ack.ato = (tp->ack.ato>>1) + TCP_ATO_MIN/2; + } else if (m < tp->ack.ato) { + tp->ack.ato = (tp->ack.ato>>1) + m; + if (tp->ack.ato > tp->rto) + tp->ack.ato = tp->rto; + } else if (m > tp->rto) { + /* Too long gap. Apparently sender falled to + * restart window, so that we send ACKs quickly. + */ + tcp_incr_quickack(tp); + sk_stream_mem_reclaim(sk); + } + } + tp->ack.lrcvtime = now; + + TCP_ECN_check_ce(tp, skb); + + if (skb->len >= 128) + tcp_grow_window(sk, tp, skb); +} + +/* When starting a new connection, pin down the current choice of + * congestion algorithm. + */ +void tcp_ca_init(struct tcp_sock *tp) +{ + if (sysctl_tcp_westwood) + tp->adv_cong = TCP_WESTWOOD; + else if (sysctl_tcp_bic) + tp->adv_cong = TCP_BIC; + else if (sysctl_tcp_vegas_cong_avoid) { + tp->adv_cong = TCP_VEGAS; + tp->vegas.baseRTT = 0x7fffffff; + tcp_vegas_enable(tp); + } +} + +/* Do RTT sampling needed for Vegas. 
+ * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static inline void vegas_rtt_calc(struct tcp_sock *tp, __u32 rtt) +{ + __u32 vrtt = rtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < tp->vegas.baseRTT) + tp->vegas.baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt); + tp->vegas.cntRTT++; +} + +/* Called to compute a smoothed rtt estimate. The data fed to this + * routine either comes from timestamps, or from segments that were + * known _not_ to have been retransmitted [see Karn/Partridge + * Proceedings SIGCOMM 87]. The algorithm is from the SIGCOMM 88 + * piece by Van Jacobson. + * NOTE: the next three routines used to be one big routine. + * To save cycles in the RFC 1323 implementation it was better to break + * it up into three procedures. -- erics + */ +static void tcp_rtt_estimator(struct tcp_sock *tp, __u32 mrtt) +{ + long m = mrtt; /* RTT */ + + if (tcp_vegas_enabled(tp)) + vegas_rtt_calc(tp, mrtt); + + /* The following amusing code comes from Jacobson's + * article in SIGCOMM '88. Note that rtt and mdev + * are scaled versions of rtt and mean deviation. + * This is designed to be as fast as possible + * m stands for "measurement". + * + * On a 1990 paper the rto value is changed to: + * RTO = rtt + 4 * mdev + * + * Funny. This algorithm seems to be very broken. + * These formulae increase RTO, when it should be decreased, increase + * too slowly, when it should be incresed fastly, decrease too fastly + * etc. I guess in BSD RTO takes ONE value, so that it is absolutely + * does not matter how to _calculate_ it. Seems, it was trap + * that VJ failed to avoid. 8) + */ + if(m == 0) + m = 1; + if (tp->srtt != 0) { + m -= (tp->srtt >> 3); /* m is now error in rtt est */ + tp->srtt += m; /* rtt = 7/8 rtt + 1/8 new */ + if (m < 0) { + m = -m; /* m is now abs(error) */ + m -= (tp->mdev >> 2); /* similar update on mdev */ + /* This is similar to one of Eifel findings. + * Eifel blocks mdev updates when rtt decreases. + * This solution is a bit different: we use finer gain + * for mdev in this case (alpha*beta). + * Like Eifel it also prevents growth of rto, + * but also it limits too fast rto decreases, + * happening in pure Eifel. + */ + if (m > 0) + m >>= 3; + } else { + m -= (tp->mdev >> 2); /* similar update on mdev */ + } + tp->mdev += m; /* mdev = 3/4 mdev + 1/4 new */ + if (tp->mdev > tp->mdev_max) { + tp->mdev_max = tp->mdev; + if (tp->mdev_max > tp->rttvar) + tp->rttvar = tp->mdev_max; + } + if (after(tp->snd_una, tp->rtt_seq)) { + if (tp->mdev_max < tp->rttvar) + tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; + tp->rtt_seq = tp->snd_nxt; + tp->mdev_max = TCP_RTO_MIN; + } + } else { + /* no previous measure. */ + tp->srtt = m<<3; /* take the measured time to be rtt */ + tp->mdev = m<<1; /* make sure rto = 3*rtt */ + tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->rtt_seq = tp->snd_nxt; + } + + tcp_westwood_update_rtt(tp, tp->srtt >> 3); +} + +/* Calculate rto without backoff. This is the second half of Van Jacobson's + * routine referred to above. 
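/*
 * Illustrative sketch (editor's addition, not part of the original patch):
 * Jacobson's estimator that the comments above walk through, in the same
 * scaled fixed point the kernel uses (srtt holds 8*rtt, mdev holds 4*mdev),
 * so srtt = 7/8 srtt + 1/8 m and RTO = rtt + 4*mdev reduce to shifts. The
 * mdev_max/rttvar windowing and the RTO clamping are left out.
 */
#include <stdint.h>
#include <stdio.h>

struct rtt_est {
	uint32_t srtt;                        /* smoothed RTT, scaled by 8   */
	uint32_t mdev;                        /* mean deviation, scaled by 4 */
};

static void rtt_sample(struct rtt_est *e, uint32_t m /* measured RTT, in jiffies */)
{
	if (m == 0)
		m = 1;
	if (e->srtt != 0) {
		int32_t err = (int32_t)m - (int32_t)(e->srtt >> 3);
		e->srtt += err;                           /* srtt = 7/8 srtt + 1/8 m     */
		if (err < 0)
			err = -err;
		e->mdev += err - (e->mdev >> 2);          /* mdev = 3/4 mdev + 1/4 |err| */
	} else {
		e->srtt = m << 3;                         /* first sample: srtt = m      */
		e->mdev = m << 1;                         /* so the initial rto is 3*m   */
	}
}

static uint32_t rtt_rto(const struct rtt_est *e)
{
	return (e->srtt >> 3) + e->mdev;                  /* rto = srtt + 4*mdev, unscaled */
}

int main(void)
{
	struct rtt_est e = { 0, 0 };
	uint32_t samples[] = { 100, 110, 90, 300, 95 };
	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rtt_sample(&e, samples[i]);
		printf("sample=%u srtt=%u rto=%u\n",
		       (unsigned)samples[i], (unsigned)(e.srtt >> 3), (unsigned)rtt_rto(&e));
	}
	return 0;
}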
+ */ +static inline void tcp_set_rto(struct tcp_sock *tp) +{ + /* Old crap is replaced with new one. 8) + * + * More seriously: + * 1. If rtt variance happened to be less 50msec, it is hallucination. + * It cannot be less due to utterly erratic ACK generation made + * at least by solaris and freebsd. "Erratic ACKs" has _nothing_ + * to do with delayed acks, because at cwnd>2 true delack timeout + * is invisible. Actually, Linux-2.4 also generates erratic + * ACKs in some curcumstances. + */ + tp->rto = (tp->srtt >> 3) + tp->rttvar; + + /* 2. Fixups made earlier cannot be right. + * If we do not estimate RTO correctly without them, + * all the algo is pure shit and should be replaced + * with correct one. It is exaclty, which we pretend to do. + */ +} + +/* NOTE: clamping at TCP_RTO_MIN is not required, current algo + * guarantees that rto is higher. + */ +static inline void tcp_bound_rto(struct tcp_sock *tp) +{ + if (tp->rto > TCP_RTO_MAX) + tp->rto = TCP_RTO_MAX; +} + +/* Save metrics learned by this TCP session. + This function is called only, when TCP finishes successfully + i.e. when it enters TIME-WAIT or goes from LAST-ACK to CLOSE. + */ +void tcp_update_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (sysctl_tcp_nometrics_save) + return; + + dst_confirm(dst); + + if (dst && (dst->flags&DST_HOST)) { + int m; + + if (tp->backoff || !tp->srtt) { + /* This session failed to estimate rtt. Why? + * Probably, no packets returned in time. + * Reset our results. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) + dst->metrics[RTAX_RTT-1] = 0; + return; + } + + m = dst_metric(dst, RTAX_RTT) - tp->srtt; + + /* If newly calculated rtt larger than stored one, + * store new one. Otherwise, use EWMA. Remember, + * rtt overestimation is always better than underestimation. + */ + if (!(dst_metric_locked(dst, RTAX_RTT))) { + if (m <= 0) + dst->metrics[RTAX_RTT-1] = tp->srtt; + else + dst->metrics[RTAX_RTT-1] -= (m>>3); + } + + if (!(dst_metric_locked(dst, RTAX_RTTVAR))) { + if (m < 0) + m = -m; + + /* Scale deviation to rttvar fixed point */ + m >>= 1; + if (m < tp->mdev) + m = tp->mdev; + + if (m >= dst_metric(dst, RTAX_RTTVAR)) + dst->metrics[RTAX_RTTVAR-1] = m; + else + dst->metrics[RTAX_RTTVAR-1] -= + (dst->metrics[RTAX_RTTVAR-1] - m)>>2; + } + + if (tp->snd_ssthresh >= 0xFFFF) { + /* Slow start still did not finish. */ + if (dst_metric(dst, RTAX_SSTHRESH) && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + (tp->snd_cwnd >> 1) > dst_metric(dst, RTAX_SSTHRESH)) + dst->metrics[RTAX_SSTHRESH-1] = tp->snd_cwnd >> 1; + if (!dst_metric_locked(dst, RTAX_CWND) && + tp->snd_cwnd > dst_metric(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = tp->snd_cwnd; + } else if (tp->snd_cwnd > tp->snd_ssthresh && + tp->ca_state == TCP_CA_Open) { + /* Cong. avoidance phase, cwnd is reliable. */ + if (!dst_metric_locked(dst, RTAX_SSTHRESH)) + dst->metrics[RTAX_SSTHRESH-1] = + max(tp->snd_cwnd >> 1, tp->snd_ssthresh); + if (!dst_metric_locked(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_cwnd) >> 1; + } else { + /* Else slow start did not finish, cwnd is non-sense, + ssthresh may be also invalid. 
+ */ + if (!dst_metric_locked(dst, RTAX_CWND)) + dst->metrics[RTAX_CWND-1] = (dst->metrics[RTAX_CWND-1] + tp->snd_ssthresh) >> 1; + if (dst->metrics[RTAX_SSTHRESH-1] && + !dst_metric_locked(dst, RTAX_SSTHRESH) && + tp->snd_ssthresh > dst->metrics[RTAX_SSTHRESH-1]) + dst->metrics[RTAX_SSTHRESH-1] = tp->snd_ssthresh; + } + + if (!dst_metric_locked(dst, RTAX_REORDERING)) { + if (dst->metrics[RTAX_REORDERING-1] < tp->reordering && + tp->reordering != sysctl_tcp_reordering) + dst->metrics[RTAX_REORDERING-1] = tp->reordering; + } + } +} + +/* Numbers are taken from RFC2414. */ +__u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) +{ + __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); + + if (!cwnd) { + if (tp->mss_cache_std > 1460) + cwnd = 2; + else + cwnd = (tp->mss_cache_std > 1095) ? 3 : 4; + } + return min_t(__u32, cwnd, tp->snd_cwnd_clamp); +} + +/* Initialize metrics on socket. */ + +static void tcp_init_metrics(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + + if (dst == NULL) + goto reset; + + dst_confirm(dst); + + if (dst_metric_locked(dst, RTAX_CWND)) + tp->snd_cwnd_clamp = dst_metric(dst, RTAX_CWND); + if (dst_metric(dst, RTAX_SSTHRESH)) { + tp->snd_ssthresh = dst_metric(dst, RTAX_SSTHRESH); + if (tp->snd_ssthresh > tp->snd_cwnd_clamp) + tp->snd_ssthresh = tp->snd_cwnd_clamp; + } + if (dst_metric(dst, RTAX_REORDERING) && + tp->reordering != dst_metric(dst, RTAX_REORDERING)) { + tp->rx_opt.sack_ok &= ~2; + tp->reordering = dst_metric(dst, RTAX_REORDERING); + } + + if (dst_metric(dst, RTAX_RTT) == 0) + goto reset; + + if (!tp->srtt && dst_metric(dst, RTAX_RTT) < (TCP_TIMEOUT_INIT << 3)) + goto reset; + + /* Initial rtt is determined from SYN,SYN-ACK. + * The segment is small and rtt may appear much + * less than real one. Use per-dst memory + * to make it more realistic. + * + * A bit of theory. RTT is time passed after "normal" sized packet + * is sent until it is ACKed. In normal curcumstances sending small + * packets force peer to delay ACKs and calculation is correct too. + * The algorithm is adaptive and, provided we follow specs, it + * NEVER underestimate RTT. BUT! If peer tries to make some clever + * tricks sort of "quick acks" for time long enough to decrease RTT + * to low value, and then abruptly stops to do it and starts to delay + * ACKs, wait for troubles. + */ + if (dst_metric(dst, RTAX_RTT) > tp->srtt) { + tp->srtt = dst_metric(dst, RTAX_RTT); + tp->rtt_seq = tp->snd_nxt; + } + if (dst_metric(dst, RTAX_RTTVAR) > tp->mdev) { + tp->mdev = dst_metric(dst, RTAX_RTTVAR); + tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + } + tcp_set_rto(tp); + tcp_bound_rto(tp); + if (tp->rto < TCP_TIMEOUT_INIT && !tp->rx_opt.saw_tstamp) + goto reset; + tp->snd_cwnd = tcp_init_cwnd(tp, dst); + tp->snd_cwnd_stamp = tcp_time_stamp; + return; + +reset: + /* Play conservative. If timestamps are not + * supported, TCP will fail to recalculate correct + * rtt, if initial rto is too small. FORGET ALL AND RESET! + */ + if (!tp->rx_opt.saw_tstamp && tp->srtt) { + tp->srtt = 0; + tp->mdev = tp->mdev_max = tp->rttvar = TCP_TIMEOUT_INIT; + tp->rto = TCP_TIMEOUT_INIT; + } +} + +static void tcp_update_reordering(struct tcp_sock *tp, int metric, int ts) +{ + if (metric > tp->reordering) { + tp->reordering = min(TCP_MAX_REORDERING, metric); + + /* This exciting event is worth to be remembered. 
8) */ + if (ts) + NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); + else if (IsReno(tp)) + NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); + else if (IsFack(tp)) + NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); +#if FASTRETRANS_DEBUG > 1 + printk(KERN_DEBUG "Disorder%d %d %u f%u s%u rr%d\n", + tp->rx_opt.sack_ok, tp->ca_state, + tp->reordering, + tp->fackets_out, + tp->sacked_out, + tp->undo_marker ? tp->undo_retrans : 0); +#endif + /* Disable FACK yet. */ + tp->rx_opt.sack_ok &= ~2; + } +} + +/* This procedure tags the retransmission queue when SACKs arrive. + * + * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). + * Packets in queue with these bits set are counted in variables + * sacked_out, retrans_out and lost_out, correspondingly. + * + * Valid combinations are: + * Tag InFlight Description + * 0 1 - orig segment is in flight. + * S 0 - nothing flies, orig reached receiver. + * L 0 - nothing flies, orig lost by net. + * R 2 - both orig and retransmit are in flight. + * L|R 1 - orig is lost, retransmit is in flight. + * S|R 1 - orig reached receiver, retrans is still in flight. + * (L|S|R is logically valid, it could occur when L|R is sacked, + * but it is equivalent to plain S and code short-curcuits it to S. + * L|S is logically invalid, it would mean -1 packet in flight 8)) + * + * These 6 states form finite state machine, controlled by the following events: + * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) + * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) + * 3. Loss detection event of one of three flavors: + * A. Scoreboard estimator decided the packet is lost. + * A'. Reno "three dupacks" marks head of queue lost. + * A''. Its FACK modfication, head until snd.fack is lost. + * B. SACK arrives sacking data transmitted after never retransmitted + * hole was sent out. + * C. SACK arrives sacking SND.NXT at the moment, when the + * segment was retransmitted. + * 4. D-SACK added new rule: D-SACK changes any tag to S. + * + * It is pleasant to note, that state diagram turns out to be commutative, + * so that we are allowed not to be bothered by order of our actions, + * when multiple events arrive simultaneously. (see the function below). + * + * Reordering detection. + * -------------------- + * Reordering metric is maximal distance, which a packet can be displaced + * in packet stream. With SACKs we can estimate it: + * + * 1. SACK fills old hole and the corresponding segment was not + * ever retransmitted -> reordering. Alas, we cannot use it + * when segment was retransmitted. + * 2. The last flaw is solved with D-SACK. D-SACK arrives + * for retransmitted and already SACKed segment -> reordering.. + * Both of these heuristics are not used in Loss state, when we cannot + * account for retransmits accurately. + */ +static int +tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2); + int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; + int reord = tp->packets_out; + int prior_fackets; + u32 lost_retrans = 0; + int flag = 0; + int i; + + /* So, SACKs for already sent large segments will be lost. + * Not good, but alternative is to resegment the queue. 
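/*
 * Illustrative sketch (editor's addition, not part of the original patch):
 * how the S/R/L tag bits described above map onto the scoreboard counters,
 * and how packets-in-flight falls out of them. The tag names mirror the
 * kernel's TCPCB_* bits, but the types here are stand-alone examples.
 */
#include <stdio.h>

#define TAG_SACKED  0x01                  /* S: receiver reported it via SACK */
#define TAG_RETRANS 0x02                  /* R: we retransmitted it           */
#define TAG_LOST    0x04                  /* L: scoreboard considers it lost  */

struct scoreboard {
	unsigned packets_out, sacked_out, lost_out, retrans_out;
};

static void account(struct scoreboard *sb, const unsigned char *tags, unsigned n)
{
	sb->packets_out = n;
	sb->sacked_out = sb->lost_out = sb->retrans_out = 0;
	for (unsigned i = 0; i < n; i++) {
		if (tags[i] & TAG_SACKED)
			sb->sacked_out++;
		if (tags[i] & TAG_LOST)
			sb->lost_out++;
		if (tags[i] & TAG_RETRANS)
			sb->retrans_out++;
	}
}

static unsigned in_flight(const struct scoreboard *sb)
{
	unsigned left_out = sb->sacked_out + sb->lost_out;    /* left the network */
	return sb->packets_out - left_out + sb->retrans_out;
}

int main(void)
{
	/* tags 0, S, L|R, R, 0 -- compare with the InFlight column in the table above */
	unsigned char tags[] = { 0, TAG_SACKED, TAG_LOST | TAG_RETRANS, TAG_RETRANS, 0 };
	struct scoreboard sb;

	account(&sb, tags, 5);
	printf("in_flight = %u\n", in_flight(&sb));           /* 1+0+1+2+1 = 5 */
	return 0;
}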
*/ + if (sk->sk_route_caps & NETIF_F_TSO) { + sk->sk_route_caps &= ~NETIF_F_TSO; + sock_set_flag(sk, SOCK_NO_LARGESEND); + tp->mss_cache = tp->mss_cache_std; + } + + if (!tp->sacked_out) + tp->fackets_out = 0; + prior_fackets = tp->fackets_out; + + for (i=0; istart_seq); + __u32 end_seq = ntohl(sp->end_seq); + int fack_count = 0; + int dup_sack = 0; + + /* Check for D-SACK. */ + if (i == 0) { + u32 ack = TCP_SKB_CB(ack_skb)->ack_seq; + + if (before(start_seq, ack)) { + dup_sack = 1; + tp->rx_opt.sack_ok |= 4; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + } else if (num_sacks > 1 && + !after(end_seq, ntohl(sp[1].end_seq)) && + !before(start_seq, ntohl(sp[1].start_seq))) { + dup_sack = 1; + tp->rx_opt.sack_ok |= 4; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + } + + /* D-SACK for already forgotten data... + * Do dumb counting. */ + if (dup_sack && + !after(end_seq, prior_snd_una) && + after(end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* Eliminate too old ACKs, but take into + * account more or less fresh ones, they can + * contain valid SACK info. + */ + if (before(ack, prior_snd_una - tp->max_window)) + return 0; + } + + /* Event "B" in the comment above. */ + if (after(end_seq, tp->high_seq)) + flag |= FLAG_DATA_LOST; + + sk_stream_for_retrans_queue(skb, sk) { + u8 sacked = TCP_SKB_CB(skb)->sacked; + int in_sack; + + /* The retransmission queue is always in order, so + * we can short-circuit the walk early. + */ + if(!before(TCP_SKB_CB(skb)->seq, end_seq)) + break; + + fack_count += tcp_skb_pcount(skb); + + in_sack = !after(start_seq, TCP_SKB_CB(skb)->seq) && + !before(end_seq, TCP_SKB_CB(skb)->end_seq); + + /* Account D-SACK for retransmitted packet. */ + if ((dup_sack && in_sack) && + (sacked & TCPCB_RETRANS) && + after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + tp->undo_retrans--; + + /* The frame is ACKed. */ + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) { + if (sacked&TCPCB_RETRANS) { + if ((dup_sack && in_sack) && + (sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } else { + /* If it was in a hole, we detected reordering. */ + if (fack_count < prior_fackets && + !(sacked&TCPCB_SACKED_ACKED)) + reord = min(fack_count, reord); + } + + /* Nothing to do; acked frame is about to be dropped. */ + continue; + } + + if ((sacked&TCPCB_SACKED_RETRANS) && + after(end_seq, TCP_SKB_CB(skb)->ack_seq) && + (!lost_retrans || after(end_seq, lost_retrans))) + lost_retrans = end_seq; + + if (!in_sack) + continue; + + if (!(sacked&TCPCB_SACKED_ACKED)) { + if (sacked & TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, + * we do not clear RETRANS, believing + * that retransmission is still in flight. + */ + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp->lost_out -= tcp_skb_pcount(skb); + tp->retrans_out -= tcp_skb_pcount(skb); + } + } else { + /* New sack for not retransmitted frame, + * which was in hole. It is reordering. + */ + if (!(sacked & TCPCB_RETRANS) && + fack_count < prior_fackets) + reord = min(fack_count, reord); + + if (sacked & TCPCB_LOST) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + tp->lost_out -= tcp_skb_pcount(skb); + } + } + + TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; + flag |= FLAG_DATA_SACKED; + tp->sacked_out += tcp_skb_pcount(skb); + + if (fack_count > tp->fackets_out) + tp->fackets_out = fack_count; + } else { + if (dup_sack && (sacked&TCPCB_RETRANS)) + reord = min(fack_count, reord); + } + + /* D-SACK. 
We can detect redundant retransmission + * in S|R and plain R frames and clear it. + * undo_retrans is decreased above, L|R frames + * are accounted above as well. + */ + if (dup_sack && + (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS)) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + } + } + + /* Check for lost retransmit. This superb idea is + * borrowed from "ratehalving". Event "C". + * Later note: FACK people cheated me again 8), + * we have to account for reordering! Ugly, + * but should help. + */ + if (lost_retrans && tp->ca_state == TCP_CA_Recovery) { + struct sk_buff *skb; + + sk_stream_for_retrans_queue(skb, sk) { + if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) + break; + if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + continue; + if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && + after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && + (IsFack(tp) || + !before(lost_retrans, + TCP_SKB_CB(skb)->ack_seq + tp->reordering * + tp->mss_cache_std))) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + + if (!(TCP_SKB_CB(skb)->sacked&(TCPCB_LOST|TCPCB_SACKED_ACKED))) { + tp->lost_out += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + flag |= FLAG_DATA_SACKED; + NET_INC_STATS_BH(LINUX_MIB_TCPLOSTRETRANSMIT); + } + } + } + } + + tp->left_out = tp->sacked_out + tp->lost_out; + + if ((reord < tp->fackets_out) && tp->ca_state != TCP_CA_Loss) + tcp_update_reordering(tp, ((tp->fackets_out + 1) - reord), 0); + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + BUG_TRAP((int)tcp_packets_in_flight(tp) >= 0); +#endif + return flag; +} + +/* RTO occurred, but do not yet enter loss state. Instead, transmit two new + * segments to see from the next ACKs whether any data was really missing. + * If the RTO was spurious, new ACKs should arrive. + */ +void tcp_enter_frto(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + tp->frto_counter = 1; + + if (tp->ca_state <= TCP_CA_Disorder || + tp->snd_una == tp->high_seq || + (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(tp); + if (!tcp_westwood_ssthresh(tp)) + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } + + /* Have to clear retransmission markers here to keep the bookkeeping + * in shape, even though we are not yet in Loss state. + * If something was really lost, it is eventually caught up + * in tcp_enter_frto_loss. + */ + tp->retrans_out = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = 0; + + sk_stream_for_retrans_queue(skb, sk) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_RETRANS; + } + tcp_sync_left_out(tp); + + tcp_set_ca_state(tp, TCP_CA_Open); + tp->frto_highmark = tp->snd_nxt; +} + +/* Enter Loss state after F-RTO was applied. Dupack arrived after RTO, + * which indicates that we should follow the traditional RTO recovery, + * i.e. mark everything lost and do go-back-N retransmission. 
+ */ +static void tcp_enter_frto_loss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int cnt = 0; + + tp->sacked_out = 0; + tp->lost_out = 0; + tp->fackets_out = 0; + + sk_stream_for_retrans_queue(skb, sk) { + cnt += tcp_skb_pcount(skb); + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { + + /* Do not mark those segments lost that were + * forward transmitted after RTO + */ + if (!after(TCP_SKB_CB(skb)->end_seq, + tp->frto_highmark)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } + } else { + tp->sacked_out += tcp_skb_pcount(skb); + tp->fackets_out = cnt; + } + } + tcp_sync_left_out(tp); + + tp->snd_cwnd = tp->frto_counter + tcp_packets_in_flight(tp)+1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->undo_marker = 0; + tp->frto_counter = 0; + + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(tp, TCP_CA_Loss); + tp->high_seq = tp->frto_highmark; + TCP_ECN_queue_cwr(tp); + + init_bictcp(tp); +} + +void tcp_clear_retrans(struct tcp_sock *tp) +{ + tp->left_out = 0; + tp->retrans_out = 0; + + tp->fackets_out = 0; + tp->sacked_out = 0; + tp->lost_out = 0; + + tp->undo_marker = 0; + tp->undo_retrans = 0; +} + +/* Enter Loss state. If "how" is not zero, forget all SACK information + * and reset tags completely, otherwise preserve SACKs. If receiver + * dropped its ofo queue, we will know this due to reneging detection. + */ +void tcp_enter_loss(struct sock *sk, int how) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int cnt = 0; + + /* Reduce ssthresh if it has not yet been made inside this window. */ + if (tp->ca_state <= TCP_CA_Disorder || tp->snd_una == tp->high_seq || + (tp->ca_state == TCP_CA_Loss && !tp->retransmits)) { + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + } + tp->snd_cwnd = 1; + tp->snd_cwnd_cnt = 0; + tp->snd_cwnd_stamp = tcp_time_stamp; + + tcp_clear_retrans(tp); + + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ + if (!how) + tp->undo_marker = tp->snd_una; + + sk_stream_for_retrans_queue(skb, sk) { + cnt += tcp_skb_pcount(skb); + if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) + tp->undo_marker = 0; + TCP_SKB_CB(skb)->sacked &= (~TCPCB_TAGBITS)|TCPCB_SACKED_ACKED; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) || how) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_ACKED; + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } else { + tp->sacked_out += tcp_skb_pcount(skb); + tp->fackets_out = cnt; + } + } + tcp_sync_left_out(tp); + + tp->reordering = min_t(unsigned int, tp->reordering, + sysctl_tcp_reordering); + tcp_set_ca_state(tp, TCP_CA_Loss); + tp->high_seq = tp->snd_nxt; + TCP_ECN_queue_cwr(tp); +} + +static int tcp_check_sack_reneging(struct sock *sk, struct tcp_sock *tp) +{ + struct sk_buff *skb; + + /* If ACK arrived pointing to a remembered SACK, + * it means that our remembered SACKs do not reflect + * real state of receiver i.e. + * receiver _host_ is heavily congested (or buggy). + * Do processing similar to RTO timeout. 
+ */ + if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { + NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); + + tcp_enter_loss(sk, 1); + tp->retransmits++; + tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 1; + } + return 0; +} + +static inline int tcp_fackets_out(struct tcp_sock *tp) +{ + return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; +} + +static inline int tcp_skb_timedout(struct tcp_sock *tp, struct sk_buff *skb) +{ + return (tcp_time_stamp - TCP_SKB_CB(skb)->when > tp->rto); +} + +static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +{ + return tp->packets_out && + tcp_skb_timedout(tp, skb_peek(&sk->sk_write_queue)); +} + +/* Linux NewReno/SACK/FACK/ECN state machine. + * -------------------------------------- + * + * "Open" Normal state, no dubious events, fast path. + * "Disorder" In all the respects it is "Open", + * but requires a bit more attention. It is entered when + * we see some SACKs or dupacks. It is split of "Open" + * mainly to move some processing from fast path to slow one. + * "CWR" CWND was reduced due to some Congestion Notification event. + * It can be ECN, ICMP source quench, local device congestion. + * "Recovery" CWND was reduced, we are fast-retransmitting. + * "Loss" CWND was reduced due to RTO timeout or SACK reneging. + * + * tcp_fastretrans_alert() is entered: + * - each incoming ACK, if state is not "Open" + * - when arrived ACK is unusual, namely: + * * SACK + * * Duplicate ACK. + * * ECN ECE. + * + * Counting packets in flight is pretty simple. + * + * in_flight = packets_out - left_out + retrans_out + * + * packets_out is SND.NXT-SND.UNA counted in packets. + * + * retrans_out is number of retransmitted segments. + * + * left_out is number of segments left network, but not ACKed yet. + * + * left_out = sacked_out + lost_out + * + * sacked_out: Packets, which arrived to receiver out of order + * and hence not ACKed. With SACKs this number is simply + * amount of SACKed data. Even without SACKs + * it is easy to give pretty reliable estimate of this number, + * counting duplicate ACKs. + * + * lost_out: Packets lost by network. TCP has no explicit + * "loss notification" feedback from network (for now). + * It means that this number can be only _guessed_. + * Actually, it is the heuristics to predict lossage that + * distinguishes different algorithms. + * + * F.e. after RTO, when all the queue is considered as lost, + * lost_out = packets_out and in_flight = retrans_out. + * + * Essentially, we have now two algorithms counting + * lost packets. + * + * FACK: It is the simplest heuristics. As soon as we decided + * that something is lost, we decide that _all_ not SACKed + * packets until the most forward SACK are lost. I.e. + * lost_out = fackets_out - sacked_out and left_out = fackets_out. + * It is absolutely correct estimate, if network does not reorder + * packets. And it loses any connection to reality when reordering + * takes place. We use FACK by default until reordering + * is suspected on the path to this destination. + * + * NewReno: when Recovery is entered, we assume that one segment + * is lost (classic Reno). While we are in Recovery and + * a partial ACK arrives, we assume that one more packet + * is lost (NewReno). This heuristics are the same in NewReno + * and SACK. + * + * Imagine, that's all! Forget about all this shamanism about CWND inflation + * deflation etc. 
CWND is real congestion window, never inflated, changes + * only according to classic VJ rules. + * + * Really tricky (and requiring careful tuning) part of algorithm + * is hidden in functions tcp_time_to_recover() and tcp_xmit_retransmit_queue(). + * The first determines the moment _when_ we should reduce CWND and, + * hence, slow down forward transmission. In fact, it determines the moment + * when we decide that hole is caused by loss, rather than by a reorder. + * + * tcp_xmit_retransmit_queue() decides, _what_ we should retransmit to fill + * holes, caused by lost packets. + * + * And the most logically complicated part of algorithm is undo + * heuristics. We detect false retransmits due to both too early + * fast retransmit (reordering) and underestimated RTO, analyzing + * timestamps and D-SACKs. When we detect that some segments were + * retransmitted by mistake and CWND reduction was wrong, we undo + * window reduction and abort recovery phase. This logic is hidden + * inside several functions named tcp_try_undo_. + */ + +/* This function decides, when we should leave Disordered state + * and enter Recovery phase, reducing congestion window. + * + * Main question: may we further continue forward transmission + * with the same cwnd? + */ +static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +{ + __u32 packets_out; + + /* Trick#1: The loss is proven. */ + if (tp->lost_out) + return 1; + + /* Not-A-Trick#2 : Classic rule... */ + if (tcp_fackets_out(tp) > tp->reordering) + return 1; + + /* Trick#3 : when we use RFC2988 timer restart, fast + * retransmit can be triggered by timeout of queue head. + */ + if (tcp_head_timedout(sk, tp)) + return 1; + + /* Trick#4: It is still not OK... But will it be useful to delay + * recovery more? + */ + packets_out = tp->packets_out; + if (packets_out <= tp->reordering && + tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && + !tcp_may_send_now(sk, tp)) { + /* We have nothing to send. This connection is limited + * either by receiver window or by application. + */ + return 1; + } + + return 0; +} + +/* If we receive more dupacks than we expected counting segments + * in assumption of absent reordering, interpret this as reordering. + * The only another reason could be bug in receiver TCP. + */ +static void tcp_check_reno_reordering(struct tcp_sock *tp, int addend) +{ + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if ((tp->sacked_out + holes) > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + tcp_update_reordering(tp, tp->packets_out+addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out++; + tcp_check_reno_reordering(tp, 0); + tcp_sync_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) +{ + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(tp, acked); + tcp_sync_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out = 0; + tp->left_out = tp->lost_out; +} + +/* Mark head of queue up as lost. 
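/*
 * Illustrative sketch (editor's addition, not part of the original patch):
 * the "time to enter Recovery" test discussed above, reduced to its two
 * classic triggers -- already-proven loss, and more (F)ACKed-ahead
 * segments than the current reordering estimate. The timeout-based and
 * stalled-sender triggers of the real tcp_time_to_recover() are omitted.
 */
#include <stdbool.h>

struct loss_state {
	unsigned lost_out;                    /* segments already marked lost       */
	unsigned fackets_out;                 /* forward-most SACK count (FACK)     */
	unsigned reordering;                  /* reordering estimate, in packets    */
};

static bool time_to_recover(const struct loss_state *s)
{
	if (s->lost_out)                      /* Trick#1: the loss is proven        */
		return true;
	if (s->fackets_out > s->reordering)   /* classic dupack/FACK threshold rule */
		return true;
	return false;
}

int main(void)
{
	struct loss_state s = { .lost_out = 0, .fackets_out = 4, .reordering = 3 };
	return time_to_recover(&s) ? 0 : 1;   /* 4 SACKed ahead of a hole > 3 */
}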
*/ +static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, + int packets, u32 high_seq) +{ + struct sk_buff *skb; + int cnt = packets; + + BUG_TRAP(cnt <= tp->packets_out); + + sk_stream_for_retrans_queue(skb, sk) { + cnt -= tcp_skb_pcount(skb); + if (cnt < 0 || after(TCP_SKB_CB(skb)->end_seq, high_seq)) + break; + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } + } + tcp_sync_left_out(tp); +} + +/* Account newly detected lost packet(s) */ + +static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +{ + if (IsFack(tp)) { + int lost = tp->fackets_out - tp->reordering; + if (lost <= 0) + lost = 1; + tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + } else { + tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + } + + /* New heuristics: it is possible only after we switched + * to restart timer each time when something is ACKed. + * Hence, we can detect timed out packets during fast + * retransmit without falling to slow start. + */ + if (tcp_head_timedout(sk, tp)) { + struct sk_buff *skb; + + sk_stream_for_retrans_queue(skb, sk) { + if (tcp_skb_timedout(tp, skb) && + !(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + } + } + tcp_sync_left_out(tp); + } +} + +/* CWND moderation, preventing bursts due to too big ACKs + * in dubious situations. + */ +static inline void tcp_moderate_cwnd(struct tcp_sock *tp) +{ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)+tcp_max_burst(tp)); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Decrease cwnd each second ack. */ + +static void tcp_cwnd_down(struct tcp_sock *tp) +{ + int decr = tp->snd_cwnd_cnt + 1; + __u32 limit; + + /* + * TCP Westwood + * Here limit is evaluated as BWestimation*RTTmin (for obtaining it + * in packets we use mss_cache). If sysctl_tcp_westwood is off + * tcp_westwood_bw_rttmin() returns 0. In such case snd_ssthresh is + * still used as usual. It prevents other strange cases in which + * BWE*RTTmin could assume value 0. It should not happen but... + */ + + if (!(limit = tcp_westwood_bw_rttmin(tp))) + limit = tp->snd_ssthresh/2; + + tp->snd_cwnd_cnt = decr&1; + decr >>= 1; + + if (decr && tp->snd_cwnd > limit) + tp->snd_cwnd -= decr; + + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* Nothing was retransmitted or returned timestamp is less + * than timestamp of the first retransmission. + */ +static inline int tcp_packet_delayed(struct tcp_sock *tp) +{ + return !tp->retrans_stamp || + (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0); +} + +/* Undo procedures. */ + +#if FASTRETRANS_DEBUG > 1 +static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +{ + struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", + msg, + NIPQUAD(inet->daddr), ntohs(inet->dport), + tp->snd_cwnd, tp->left_out, + tp->snd_ssthresh, tp->prior_ssthresh, + tp->packets_out); +} +#else +#define DBGUNDO(x...) 
do { } while (0) +#endif + +static void tcp_undo_cwr(struct tcp_sock *tp, int undo) +{ + if (tp->prior_ssthresh) { + if (tcp_is_bic(tp)) + tp->snd_cwnd = max(tp->snd_cwnd, tp->bictcp.last_max_cwnd); + else + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh<<1); + + if (undo && tp->prior_ssthresh > tp->snd_ssthresh) { + tp->snd_ssthresh = tp->prior_ssthresh; + TCP_ECN_withdraw_cwr(tp); + } + } else { + tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh); + } + tcp_moderate_cwnd(tp); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline int tcp_may_undo(struct tcp_sock *tp) +{ + return tp->undo_marker && + (!tp->undo_retrans || tcp_packet_delayed(tp)); +} + +/* People celebrate: "We love our President!" */ +static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +{ + if (tcp_may_undo(tp)) { + /* Happy end! We did not retransmit anything + * or our original transmission succeeded. + */ + DBGUNDO(sk, tp, tp->ca_state == TCP_CA_Loss ? "loss" : "retrans"); + tcp_undo_cwr(tp, 1); + if (tp->ca_state == TCP_CA_Loss) + NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); + tp->undo_marker = 0; + } + if (tp->snd_una == tp->high_seq && IsReno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + tcp_moderate_cwnd(tp); + return 1; + } + tcp_set_ca_state(tp, TCP_CA_Open); + return 0; +} + +/* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ +static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +{ + if (tp->undo_marker && !tp->undo_retrans) { + DBGUNDO(sk, tp, "D-SACK"); + tcp_undo_cwr(tp, 1); + tp->undo_marker = 0; + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); + } +} + +/* Undo during fast recovery after partial ACK. */ + +static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, + int acked) +{ + /* Partial ACK arrived. Force Hoe's retransmit. */ + int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + + if (tcp_may_undo(tp)) { + /* Plain luck! Hole if filled with delayed + * packet, rather than with a retransmit. + */ + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + tcp_update_reordering(tp, tcp_fackets_out(tp)+acked, 1); + + DBGUNDO(sk, tp, "Hoe"); + tcp_undo_cwr(tp, 0); + NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); + + /* So... Do not make Hoe's retransmit yet. + * If the first packet was delayed, the rest + * ones are most probably delayed as well. + */ + failed = 0; + } + return failed; +} + +/* Undo during loss recovery after partial ACK. 
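/*
 * Illustrative sketch (editor's addition, not part of the original patch):
 * the timestamp test behind the undo heuristics here. A window reduction
 * may be undone when nothing was retransmitted, or when the peer's echoed
 * timestamp predates the first retransmission -- proof that the original
 * transmission, not the retransmit, is what got through.
 */
#include <stdbool.h>
#include <stdint.h>

static bool packet_delayed(uint32_t retrans_stamp, bool saw_tsecr, uint32_t rcv_tsecr)
{
	if (retrans_stamp == 0)                       /* nothing was retransmitted */
		return true;
	/* signed difference handles timestamp wrap, as the (__s32) cast does above */
	return saw_tsecr && (int32_t)(rcv_tsecr - retrans_stamp) < 0;
}

static bool may_undo(uint32_t undo_marker, unsigned undo_retrans,
		     uint32_t retrans_stamp, bool saw_tsecr, uint32_t rcv_tsecr)
{
	return undo_marker &&
	       (!undo_retrans ||
		packet_delayed(retrans_stamp, saw_tsecr, rcv_tsecr));
}

int main(void)
{
	/* nothing retransmitted yet: any reduction may be undone */
	return may_undo(1, 0, 0, false, 0) ? 0 : 1;
}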
*/ +static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +{ + if (tcp_may_undo(tp)) { + struct sk_buff *skb; + sk_stream_for_retrans_queue(skb, sk) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; + } + DBGUNDO(sk, tp, "partial loss"); + tp->lost_out = 0; + tp->left_out = tp->sacked_out; + tcp_undo_cwr(tp, 1); + NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); + tp->retransmits = 0; + tp->undo_marker = 0; + if (!IsReno(tp)) + tcp_set_ca_state(tp, TCP_CA_Open); + return 1; + } + return 0; +} + +static inline void tcp_complete_cwr(struct tcp_sock *tp) +{ + if (tcp_westwood_cwnd(tp)) + tp->snd_ssthresh = tp->snd_cwnd; + else + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +{ + tp->left_out = tp->sacked_out; + + if (tp->retrans_out == 0) + tp->retrans_stamp = 0; + + if (flag&FLAG_ECE) + tcp_enter_cwr(tp); + + if (tp->ca_state != TCP_CA_CWR) { + int state = TCP_CA_Open; + + if (tp->left_out || tp->retrans_out || tp->undo_marker) + state = TCP_CA_Disorder; + + if (tp->ca_state != state) { + tcp_set_ca_state(tp, state); + tp->high_seq = tp->snd_nxt; + } + tcp_moderate_cwnd(tp); + } else { + tcp_cwnd_down(tp); + } +} + +/* Process an event, which can update packets-in-flight not trivially. + * Main goal of this function is to calculate new estimate for left_out, + * taking into account both packets sitting in receiver's buffer and + * packets lost by network. + * + * Besides that it does CWND reduction, when packet loss is detected + * and changes state of machine. + * + * It does _not_ decide what to send, it is made in function + * tcp_xmit_retransmit_queue(). + */ +static void +tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, + int prior_packets, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + + /* Some technical things: + * 1. Reno does not count dupacks (sacked_out) automatically. */ + if (!tp->packets_out) + tp->sacked_out = 0; + /* 2. SACK counts snd_fack in packets inaccurately. */ + if (tp->sacked_out == 0) + tp->fackets_out = 0; + + /* Now state machine starts. + * A. ECE, hence prohibit cwnd undoing, the reduction is required. */ + if (flag&FLAG_ECE) + tp->prior_ssthresh = 0; + + /* B. In all the states check for reneging SACKs. */ + if (tp->sacked_out && tcp_check_sack_reneging(sk, tp)) + return; + + /* C. Process data loss notification, provided it is valid. */ + if ((flag&FLAG_DATA_LOST) && + before(tp->snd_una, tp->high_seq) && + tp->ca_state != TCP_CA_Open && + tp->fackets_out > tp->reordering) { + tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); + } + + /* D. Synchronize left_out to current state. */ + tcp_sync_left_out(tp); + + /* E. Check state exit conditions. State can be terminated + * when high_seq is ACKed. */ + if (tp->ca_state == TCP_CA_Open) { + if (!sysctl_tcp_frto) + BUG_TRAP(tp->retrans_out == 0); + tp->retrans_stamp = 0; + } else if (!before(tp->snd_una, tp->high_seq)) { + switch (tp->ca_state) { + case TCP_CA_Loss: + tp->retransmits = 0; + if (tcp_try_undo_recovery(sk, tp)) + return; + break; + + case TCP_CA_CWR: + /* CWR is to be held something *above* high_seq + * is ACKed for CWR bit to reach receiver. 
*/ + if (tp->snd_una != tp->high_seq) { + tcp_complete_cwr(tp); + tcp_set_ca_state(tp, TCP_CA_Open); + } + break; + + case TCP_CA_Disorder: + tcp_try_undo_dsack(sk, tp); + if (!tp->undo_marker || + /* For SACK case do not Open to allow to undo + * catching for all duplicate ACKs. */ + IsReno(tp) || tp->snd_una != tp->high_seq) { + tp->undo_marker = 0; + tcp_set_ca_state(tp, TCP_CA_Open); + } + break; + + case TCP_CA_Recovery: + if (IsReno(tp)) + tcp_reset_reno_sack(tp); + if (tcp_try_undo_recovery(sk, tp)) + return; + tcp_complete_cwr(tp); + break; + } + } + + /* F. Process state. */ + switch (tp->ca_state) { + case TCP_CA_Recovery: + if (prior_snd_una == tp->snd_una) { + if (IsReno(tp) && is_dupack) + tcp_add_reno_sack(tp); + } else { + int acked = prior_packets - tp->packets_out; + if (IsReno(tp)) + tcp_remove_reno_sacks(sk, tp, acked); + is_dupack = tcp_try_undo_partial(sk, tp, acked); + } + break; + case TCP_CA_Loss: + if (flag&FLAG_DATA_ACKED) + tp->retransmits = 0; + if (!tcp_try_undo_loss(sk, tp)) { + tcp_moderate_cwnd(tp); + tcp_xmit_retransmit_queue(sk); + return; + } + if (tp->ca_state != TCP_CA_Open) + return; + /* Loss is undone; fall through to processing in Open state. */ + default: + if (IsReno(tp)) { + if (tp->snd_una != prior_snd_una) + tcp_reset_reno_sack(tp); + if (is_dupack) + tcp_add_reno_sack(tp); + } + + if (tp->ca_state == TCP_CA_Disorder) + tcp_try_undo_dsack(sk, tp); + + if (!tcp_time_to_recover(sk, tp)) { + tcp_try_to_open(sk, tp, flag); + return; + } + + /* Otherwise enter Recovery state */ + + if (IsReno(tp)) + NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); + + tp->high_seq = tp->snd_nxt; + tp->prior_ssthresh = 0; + tp->undo_marker = tp->snd_una; + tp->undo_retrans = tp->retrans_out; + + if (tp->ca_state < TCP_CA_CWR) { + if (!(flag&FLAG_ECE)) + tp->prior_ssthresh = tcp_current_ssthresh(tp); + tp->snd_ssthresh = tcp_recalc_ssthresh(tp); + TCP_ECN_queue_cwr(tp); + } + + tp->snd_cwnd_cnt = 0; + tcp_set_ca_state(tp, TCP_CA_Recovery); + } + + if (is_dupack || tcp_head_timedout(sk, tp)) + tcp_update_scoreboard(sk, tp); + tcp_cwnd_down(tp); + tcp_xmit_retransmit_queue(sk); +} + +/* Read draft-ietf-tcplw-high-performance before mucking + * with this code. (Superceeds RFC1323) + */ +static void tcp_ack_saw_tstamp(struct tcp_sock *tp, int flag) +{ + __u32 seq_rtt; + + /* RTTM Rule: A TSecr value received in a segment is used to + * update the averaged RTT measurement only if the segment + * acknowledges some new data, i.e., only if it advances the + * left edge of the send window. + * + * See draft-ietf-tcplw-high-performance-00, section 3.3. + * 1998/04/10 Andrey V. Savochkin + * + * Changed: reset backoff as soon as we see the first valid sample. + * If we do not, we get strongly overstimated rto. With timestamps + * samples are accepted even from very old segments: f.e., when rtt=1 + * increases to 8, we retransmit 5 times and after 8 seconds delayed + * answer arrives rto becomes 120 seconds! If at least one of segments + * in window is lost... Voila. --ANK (010210) + */ + seq_rtt = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->backoff = 0; + tcp_bound_rto(tp); +} + +static void tcp_ack_no_tstamp(struct tcp_sock *tp, u32 seq_rtt, int flag) +{ + /* We don't have a timestamp. Can only use + * packets that are not retransmitted to determine + * rtt estimates. Also, we must not reset the + * backoff for rto until we get a non-retransmitted + * packet. 
This allows us to deal with a situation + * where the network delay has increased suddenly. + * I.e. Karn's algorithm. (SIGCOMM '87, p5.) + */ + + if (flag & FLAG_RETRANS_DATA_ACKED) + return; + + tcp_rtt_estimator(tp, seq_rtt); + tcp_set_rto(tp); + tp->backoff = 0; + tcp_bound_rto(tp); +} + +static inline void tcp_ack_update_rtt(struct tcp_sock *tp, + int flag, s32 seq_rtt) +{ + /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) + tcp_ack_saw_tstamp(tp, flag); + else if (seq_rtt >= 0) + tcp_ack_no_tstamp(tp, seq_rtt, flag); +} + +/* + * Compute congestion window to use. + * + * This is from the implementation of BICTCP in + * Lison-Xu, Kahaled Harfoush, and Injog Rhee. + * "Binary Increase Congestion Control for Fast, Long Distance + * Networks" in InfoComm 2004 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp.pdf + * + * Unless BIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ +static inline __u32 bictcp_cwnd(struct tcp_sock *tp) +{ + /* orignal Reno behaviour */ + if (!tcp_is_bic(tp)) + return tp->snd_cwnd; + + if (tp->bictcp.last_cwnd == tp->snd_cwnd && + (s32)(tcp_time_stamp - tp->bictcp.last_stamp) <= (HZ>>5)) + return tp->bictcp.cnt; + + tp->bictcp.last_cwnd = tp->snd_cwnd; + tp->bictcp.last_stamp = tcp_time_stamp; + + /* start off normal */ + if (tp->snd_cwnd <= sysctl_tcp_bic_low_window) + tp->bictcp.cnt = tp->snd_cwnd; + + /* binary increase */ + else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd) { + __u32 dist = (tp->bictcp.last_max_cwnd - tp->snd_cwnd) + / BICTCP_B; + + if (dist > BICTCP_MAX_INCREMENT) + /* linear increase */ + tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; + else if (dist <= 1U) + /* binary search increase */ + tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR + / BICTCP_B; + else + /* binary search increase */ + tp->bictcp.cnt = tp->snd_cwnd / dist; + } else { + /* slow start amd linear increase */ + if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + BICTCP_B) + /* slow start */ + tp->bictcp.cnt = tp->snd_cwnd * BICTCP_FUNC_OF_MIN_INCR + / BICTCP_B; + else if (tp->snd_cwnd < tp->bictcp.last_max_cwnd + + BICTCP_MAX_INCREMENT*(BICTCP_B-1)) + /* slow start */ + tp->bictcp.cnt = tp->snd_cwnd * (BICTCP_B-1) + / (tp->snd_cwnd-tp->bictcp.last_max_cwnd); + else + /* linear increase */ + tp->bictcp.cnt = tp->snd_cwnd / BICTCP_MAX_INCREMENT; + } + return tp->bictcp.cnt; +} + +/* This is Jacobson's slow start and congestion avoidance. + * SIGCOMM '88, p. 328. + */ +static inline void reno_cong_avoid(struct tcp_sock *tp) +{ + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* In "safe" area, increase. */ + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } else { + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= bictcp_cwnd(tp)) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt=0; + } else + tp->snd_cwnd_cnt++; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +/* This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. 
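/*
 * Illustrative sketch (editor's addition, not part of the original patch):
 * the growth rule at the heart of bictcp_cwnd() above. It returns the
 * "cnt" divisor: the window grows by one segment per cnt ACKs, so roughly
 * cwnd/cnt segments per RTT. The constants are example stand-ins for the
 * BICTCP_* tunables, and the kernel's extra smoothing factor near the old
 * maximum is simplified away.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_LOW_WINDOW     14              /* below this, behave like plain Reno */
#define EX_B               4              /* binary-search reduction factor     */
#define EX_MAX_INCREMENT  32              /* cap on segments added per RTT      */

static uint32_t bic_cnt(uint32_t cwnd, uint32_t last_max_cwnd)
{
	if (cwnd <= EX_LOW_WINDOW)
		return cwnd;                              /* Reno: one segment per RTT   */

	if (cwnd < last_max_cwnd) {
		/* Below the last known maximum: binary-search towards it. */
		uint32_t dist = (last_max_cwnd - cwnd) / EX_B;

		if (dist > EX_MAX_INCREMENT)
			return cwnd / EX_MAX_INCREMENT;   /* far away: additive increase */
		if (dist <= 1)
			return cwnd * (EX_B - 1);         /* very close: creep carefully */
		return cwnd / dist;                       /* binary search increase      */
	}

	/* Above the old maximum: probe slowly at first, then go linear again. */
	if (cwnd < last_max_cwnd + EX_B)
		return cwnd * (EX_B - 1);
	return cwnd / EX_MAX_INCREMENT;
}

int main(void)
{
	printf("cwnd=40, last_max=100 -> one segment per %u ACKs\n",
	       (unsigned)bic_cnt(40, 100));
	printf("cwnd=98, last_max=100 -> one segment per %u ACKs\n",
	       (unsigned)bic_cnt(98, 100));
	return 0;
}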
+ * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + */ +static void vegas_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) +{ + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, tp->vegas.beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) / + tp->mss_cache_std; + old_snd_cwnd = tp->vegas.beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + tp->vegas.beg_snd_una = tp->vegas.beg_snd_nxt; + tp->vegas.beg_snd_nxt = tp->snd_nxt; + tp->vegas.beg_snd_cwnd = tp->snd_cwnd; + + /* Take into account the current RTT sample too, to + * decrease the impact of delayed acks. This double counts + * this sample since we count it for the next window as well, + * but that's not too awful, since we're taking the min, + * rather than averaging. + */ + vegas_rtt_calc(tp, seq_rtt); + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. 
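A worked example (illustration only, with made-up numbers and a stand-in for V_PARAM_SHIFT) of the comparison computed just below: Vegas estimates how many extra segments are queued in the network as diff = old_wnd - old_wnd * baseRTT / rtt, kept in fixed point.

#include <stdint.h>

#define SKETCH_V_SHIFT 1        /* stand-in for V_PARAM_SHIFT */

static uint32_t vegas_diff_sketch(uint32_t old_wnd, uint32_t base_rtt,
                                  uint32_t rtt)
{
        /* target_cwnd = (actual rate) * baseRTT, in fixed point */
        uint32_t target = ((old_wnd * base_rtt) << SKETCH_V_SHIFT) / rtt;

        /* e.g. old_wnd = 20, baseRTT = 100, rtt = 125:
         * target = 16 segments, diff = 4 segments of queued backlog,
         * which is then weighed against alpha/beta/gamma.
         */
        return (old_wnd << SKETCH_V_SHIFT) - target;
}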
+ * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (tp->vegas.cntRTT <= 2) { + /* We don't have enough RTT samples to do the Vegas + * calculation, so we'll behave like Reno. + */ + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd++; + } else { + u32 rtt, target_cwnd, diff; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = tp->vegas.minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + target_cwnd = ((old_wnd * tp->vegas.baseRTT) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* Slow start. */ + if (diff > sysctl_tcp_vegas_gamma) { + /* Going too fast. Time to slow down + * and switch to congestion avoidance. + */ + tp->snd_ssthresh = 2; + + /* Set cwnd to match the actual rate + * exactly: + * cwnd = (actual rate) * baseRTT + * Then we add 1 because the integer + * truncation robs us of full link + * utilization. + */ + tp->snd_cwnd = min(tp->snd_cwnd, + (target_cwnd >> + V_PARAM_SHIFT)+1); + + } + } else { + /* Congestion avoidance. */ + u32 next_snd_cwnd; + + /* Figure out where we would like cwnd + * to be. + */ + if (diff > sysctl_tcp_vegas_beta) { + /* The old window was too fast, so + * we slow down. + */ + next_snd_cwnd = old_snd_cwnd - 1; + } else if (diff < sysctl_tcp_vegas_alpha) { + /* We don't have enough extra packets + * in the network, so speed up. + */ + next_snd_cwnd = old_snd_cwnd + 1; + } else { + /* Sending just as fast as we + * should be. + */ + next_snd_cwnd = old_snd_cwnd; + } + + /* Adjust cwnd upward or downward, toward the + * desired value. + */ + if (next_snd_cwnd > tp->snd_cwnd) + tp->snd_cwnd++; + else if (next_snd_cwnd < tp->snd_cwnd) + tp->snd_cwnd--; + } + } + + /* Wipe the slate clean for the next RTT. */ + tp->vegas.cntRTT = 0; + tp->vegas.minRTT = 0x7fffffff; + } + + /* The following code is executed for every ack we receive, + * except for conditions checked in should_advance_cwnd() + * before the call to tcp_cong_avoid(). Mainly this means that + * we only execute this code if the ack actually acked some + * data. + */ + + /* If we are in slow start, increase our cwnd in response to this ACK. + * (If we are not in slow start then we are in congestion avoidance, + * and adjust our congestion window only once per RTT. See the code + * above.) + */ + if (tp->snd_cwnd <= tp->snd_ssthresh) + tp->snd_cwnd++; + + /* to keep cwnd from growing without bound */ + tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp); + + /* Make sure that we are never so timid as to reduce our cwnd below + * 2 MSS. 
+ * + * Going below 2 MSS would risk huge delayed ACKs from our receiver. + */ + tp->snd_cwnd = max(tp->snd_cwnd, 2U); + + tp->snd_cwnd_stamp = tcp_time_stamp; +} + +static inline void tcp_cong_avoid(struct tcp_sock *tp, u32 ack, u32 seq_rtt) +{ + if (tcp_vegas_enabled(tp)) + vegas_cong_avoid(tp, ack, seq_rtt); + else + reno_cong_avoid(tp); +} + +/* Restart timer after forward progress on connection. + * RFC2988 recommends to restart timer to now+rto. + */ + +static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +{ + if (!tp->packets_out) { + tcp_clear_xmit_timer(sk, TCP_TIME_RETRANS); + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } +} + +/* There is one downside to this scheme. Although we keep the + * ACK clock ticking, adjusting packet counters and advancing + * congestion window, we do not liberate socket send buffer + * space. + * + * Mucking with skb->truesize and sk->sk_wmem_alloc et al. + * then making a write space wakeup callback is a possible + * future enhancement. WARNING: it is not trivial to make. + */ +static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, + __u32 now, __s32 *seq_rtt) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u32 seq = tp->snd_una; + __u32 packets_acked; + int acked = 0; + + /* If we get here, the whole TSO packet has not been + * acked. + */ + BUG_ON(!after(scb->end_seq, seq)); + + packets_acked = tcp_skb_pcount(skb); + if (tcp_trim_head(sk, skb, seq - scb->seq)) + return 0; + packets_acked -= tcp_skb_pcount(skb); + + if (packets_acked) { + __u8 sacked = scb->sacked; + + acked |= FLAG_DATA_ACKED; + if (sacked) { + if (sacked & TCPCB_RETRANS) { + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= packets_acked; + acked |= FLAG_RETRANS_DATA_ACKED; + *seq_rtt = -1; + } else if (*seq_rtt < 0) + *seq_rtt = now - scb->when; + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= packets_acked; + if (sacked & TCPCB_LOST) + tp->lost_out -= packets_acked; + if (sacked & TCPCB_URG) { + if (tp->urg_mode && + !before(seq, tp->snd_up)) + tp->urg_mode = 0; + } + } else if (*seq_rtt < 0) + *seq_rtt = now - scb->when; + + if (tp->fackets_out) { + __u32 dval = min(tp->fackets_out, packets_acked); + tp->fackets_out -= dval; + } + tp->packets_out -= packets_acked; + + BUG_ON(tcp_skb_pcount(skb) == 0); + BUG_ON(!before(scb->seq, scb->end_seq)); + } + + return acked; +} + + +/* Remove acknowledged frames from the retransmission queue. */ +static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + __u32 now = tcp_time_stamp; + int acked = 0; + __s32 seq_rtt = -1; + + while ((skb = skb_peek(&sk->sk_write_queue)) && + skb != sk->sk_send_head) { + struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + __u8 sacked = scb->sacked; + + /* If our packet is before the ack sequence we can + * discard it as it's confirmed to have arrived at + * the other end. + */ + if (after(scb->end_seq, tp->snd_una)) { + if (tcp_skb_pcount(skb) > 1) + acked |= tcp_tso_acked(sk, skb, + now, &seq_rtt); + break; + } + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. 
+ */ + if (!(scb->flags & TCPCB_FLAG_SYN)) { + acked |= FLAG_DATA_ACKED; + } else { + acked |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + if (sacked) { + if (sacked & TCPCB_RETRANS) { + if(sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(skb); + acked |= FLAG_RETRANS_DATA_ACKED; + seq_rtt = -1; + } else if (seq_rtt < 0) + seq_rtt = now - scb->when; + if (sacked & TCPCB_SACKED_ACKED) + tp->sacked_out -= tcp_skb_pcount(skb); + if (sacked & TCPCB_LOST) + tp->lost_out -= tcp_skb_pcount(skb); + if (sacked & TCPCB_URG) { + if (tp->urg_mode && + !before(scb->end_seq, tp->snd_up)) + tp->urg_mode = 0; + } + } else if (seq_rtt < 0) + seq_rtt = now - scb->when; + tcp_dec_pcount_approx(&tp->fackets_out, skb); + tcp_packets_out_dec(tp, skb); + __skb_unlink(skb, skb->list); + sk_stream_free_skb(sk, skb); + } + + if (acked&FLAG_ACKED) { + tcp_ack_update_rtt(tp, acked, seq_rtt); + tcp_ack_packets_out(sk, tp); + } + +#if FASTRETRANS_DEBUG > 0 + BUG_TRAP((int)tp->sacked_out >= 0); + BUG_TRAP((int)tp->lost_out >= 0); + BUG_TRAP((int)tp->retrans_out >= 0); + if (!tp->packets_out && tp->rx_opt.sack_ok) { + if (tp->lost_out) { + printk(KERN_DEBUG "Leak l=%u %d\n", + tp->lost_out, tp->ca_state); + tp->lost_out = 0; + } + if (tp->sacked_out) { + printk(KERN_DEBUG "Leak s=%u %d\n", + tp->sacked_out, tp->ca_state); + tp->sacked_out = 0; + } + if (tp->retrans_out) { + printk(KERN_DEBUG "Leak r=%u %d\n", + tp->retrans_out, tp->ca_state); + tp->retrans_out = 0; + } + } +#endif + *seq_rtt_p = seq_rtt; + return acked; +} + +static void tcp_ack_probe(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Was it a usable window open? */ + + if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + tp->snd_una + tp->snd_wnd)) { + tp->backoff = 0; + tcp_clear_xmit_timer(sk, TCP_TIME_PROBE0); + /* Socket must be waked up by subsequent tcp_data_snd_check(). + * This function is not for random using! + */ + } else { + tcp_reset_xmit_timer(sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } +} + +static inline int tcp_ack_is_dubious(struct tcp_sock *tp, int flag) +{ + return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + tp->ca_state != TCP_CA_Open); +} + +static inline int tcp_may_raise_cwnd(struct tcp_sock *tp, int flag) +{ + return (!(flag & FLAG_ECE) || tp->snd_cwnd < tp->snd_ssthresh) && + !((1<ca_state)&(TCPF_CA_Recovery|TCPF_CA_CWR)); +} + +/* Check that window update is acceptable. + * The function assumes that snd_una<=ack<=snd_next. + */ +static inline int tcp_may_update_window(struct tcp_sock *tp, u32 ack, + u32 ack_seq, u32 nwin) +{ + return (after(ack, tp->snd_una) || + after(ack_seq, tp->snd_wl1) || + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); +} + +/* Update our send window. + * + * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 + * and in FreeBSD. NetBSD's one is even worse.) is wrong. + */ +static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb, u32 ack, u32 ack_seq) +{ + int flag = 0; + u32 nwin = ntohs(skb->h.th->window); + + if (likely(!skb->h.th->syn)) + nwin <<= tp->rx_opt.snd_wscale; + + if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { + flag |= FLAG_WIN_UPDATE; + tcp_update_wl(tp, ack, ack_seq); + + if (tp->snd_wnd != nwin) { + tp->snd_wnd = nwin; + + /* Note, it is the only place, where + * fast path is recovered for sending TCP. 
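For reference (sketch only; helper names are invented): tcp_may_raise_cwnd() above refuses to grow cwnd while the connection's state bit falls inside TCPF_CA_Recovery|TCPF_CA_CWR, and tcp_may_update_window() implements the RFC 793 window-update acceptance test, restated here with a stand-in for the kernel's after() macro.

#include <stdint.h>

static int after32(uint32_t a, uint32_t b)
{
        return (int32_t)(a - b) > 0;
}

/* Accept a window update if the ACK advances snd_una, or the segment is
 * newer than the one that last updated the window, or it is the same
 * segment but advertises a larger window.
 */
static int may_update_window(uint32_t ack, uint32_t snd_una,
                             uint32_t ack_seq, uint32_t snd_wl1,
                             uint32_t nwin, uint32_t snd_wnd)
{
        return after32(ack, snd_una) ||
               after32(ack_seq, snd_wl1) ||
               (ack_seq == snd_wl1 && nwin > snd_wnd);
}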
+ */ + tcp_fast_path_check(sk, tp); + + if (nwin > tp->max_window) { + tp->max_window = nwin; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + } + + tp->snd_una = ack; + + return flag; +} + +static void tcp_process_frto(struct sock *sk, u32 prior_snd_una) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_sync_left_out(tp); + + if (tp->snd_una == prior_snd_una || + !before(tp->snd_una, tp->frto_highmark)) { + /* RTO was caused by loss, start retransmitting in + * go-back-N slow start + */ + tcp_enter_frto_loss(sk); + return; + } + + if (tp->frto_counter == 1) { + /* First ACK after RTO advances the window: allow two new + * segments out. + */ + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; + } else { + /* Also the second ACK after RTO advances the window. + * The RTO was likely spurious. Reduce cwnd and continue + * in congestion avoidance + */ + tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); + tcp_moderate_cwnd(tp); + } + + /* F-RTO affects on two new ACKs following RTO. + * At latest on third ACK the TCP behavor is back to normal. + */ + tp->frto_counter = (tp->frto_counter + 1) % 3; +} + +/* + * TCP Westwood+ + */ + +/* + * @init_westwood + * This function initializes fields used in TCP Westwood+. We can't + * get no information about RTTmin at this time so we simply set it to + * TCP_WESTWOOD_INIT_RTT. This value was chosen to be too conservative + * since in this way we're sure it will be updated in a consistent + * way as soon as possible. It will reasonably happen within the first + * RTT period of the connection lifetime. + */ + +static void init_westwood(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.bw_ns_est = 0; + tp->westwood.bw_est = 0; + tp->westwood.accounted = 0; + tp->westwood.cumul_ack = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + tp->westwood.rtt = TCP_WESTWOOD_INIT_RTT; + tp->westwood.rtt_min = TCP_WESTWOOD_INIT_RTT; + tp->westwood.snd_una = tp->snd_una; +} + +/* + * @westwood_do_filter + * Low-pass filter. Implemented using constant coeffients. + */ + +static inline __u32 westwood_do_filter(__u32 a, __u32 b) +{ + return (((7 * a) + b) >> 3); +} + +static void westwood_filter(struct sock *sk, __u32 delta) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.bw_ns_est = + westwood_do_filter(tp->westwood.bw_ns_est, + tp->westwood.bk / delta); + tp->westwood.bw_est = + westwood_do_filter(tp->westwood.bw_est, + tp->westwood.bw_ns_est); +} + +/* + * @westwood_update_rttmin + * It is used to update RTTmin. In this case we MUST NOT use + * WESTWOOD_RTT_MIN minimum bound since we could be on a LAN! + */ + +static inline __u32 westwood_update_rttmin(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u32 rttmin = tp->westwood.rtt_min; + + if (tp->westwood.rtt != 0 && + (tp->westwood.rtt < tp->westwood.rtt_min || !rttmin)) + rttmin = tp->westwood.rtt; + + return rttmin; +} + +/* + * @westwood_acked + * Evaluate increases for dk. + */ + +static inline __u32 westwood_acked(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + + return tp->snd_una - tp->westwood.snd_una; +} + +/* + * @westwood_new_window + * It evaluates if we are receiving data inside the same RTT window as + * when we started. + * Return value: + * It returns 0 if we are still evaluating samples in the same RTT + * window, 1 if the sample has to be considered in the next window. 
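A small numeric illustration (not part of the patch) of westwood_do_filter() above: the bandwidth estimate is a 7/8-weighted moving average, so each new sample pulls the estimate one eighth of the way toward it.

#include <stdint.h>

static uint32_t westwood_filter_sketch(uint32_t old_est, uint32_t sample)
{
        /* e.g. old_est = 800, sample = 1600:  (7*800 + 1600) >> 3 = 900 */
        return ((7 * old_est) + sample) >> 3;
}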
+ */ + +static int westwood_new_window(const struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + __u32 left_bound; + __u32 rtt; + int ret = 0; + + left_bound = tp->westwood.rtt_win_sx; + rtt = max(tp->westwood.rtt, (u32) TCP_WESTWOOD_RTT_MIN); + + /* + * A RTT-window has passed. Be careful since if RTT is less than + * 50ms we don't filter but we continue 'building the sample'. + * This minimum limit was choosen since an estimation on small + * time intervals is better to avoid... + * Obvioulsy on a LAN we reasonably will always have + * right_bound = left_bound + WESTWOOD_RTT_MIN + */ + + if ((left_bound + rtt) < tcp_time_stamp) + ret = 1; + + return ret; +} + +/* + * @westwood_update_window + * It updates RTT evaluation window if it is the right moment to do + * it. If so it calls filter for evaluating bandwidth. + */ + +static void __westwood_update_window(struct sock *sk, __u32 now) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 delta = now - tp->westwood.rtt_win_sx; + + if (delta) { + if (tp->westwood.rtt) + westwood_filter(sk, delta); + + tp->westwood.bk = 0; + tp->westwood.rtt_win_sx = tcp_time_stamp; + } +} + + +static void westwood_update_window(struct sock *sk, __u32 now) +{ + if (westwood_new_window(sk)) + __westwood_update_window(sk, now); +} + +/* + * @__tcp_westwood_fast_bw + * It is called when we are in fast path. In particular it is called when + * header prediction is successfull. In such case infact update is + * straight forward and doesn't need any particular care. + */ + +static void __tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + westwood_update_window(sk, tcp_time_stamp); + + tp->westwood.bk += westwood_acked(sk); + tp->westwood.snd_una = tp->snd_una; + tp->westwood.rtt_min = westwood_update_rttmin(sk); +} + +static inline void tcp_westwood_fast_bw(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_is_westwood(tcp_sk(sk))) + __tcp_westwood_fast_bw(sk, skb); +} + + +/* + * @westwood_dupack_update + * It updates accounted and cumul_ack when receiving a dupack. + */ + +static void westwood_dupack_update(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.accounted += tp->mss_cache_std; + tp->westwood.cumul_ack = tp->mss_cache_std; +} + +static inline int westwood_may_change_cumul(struct tcp_sock *tp) +{ + return (tp->westwood.cumul_ack > tp->mss_cache_std); +} + +static inline void westwood_partial_update(struct tcp_sock *tp) +{ + tp->westwood.accounted -= tp->westwood.cumul_ack; + tp->westwood.cumul_ack = tp->mss_cache_std; +} + +static inline void westwood_complete_update(struct tcp_sock *tp) +{ + tp->westwood.cumul_ack -= tp->westwood.accounted; + tp->westwood.accounted = 0; +} + +/* + * @westwood_acked_count + * This function evaluates cumul_ack for evaluating dk in case of + * delayed or partial acks. + */ + +static inline __u32 westwood_acked_count(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->westwood.cumul_ack = westwood_acked(sk); + + /* If cumul_ack is 0 this is a dupack since it's not moving + * tp->snd_una. 
+ */ + if (!(tp->westwood.cumul_ack)) + westwood_dupack_update(sk); + + if (westwood_may_change_cumul(tp)) { + /* Partial or delayed ack */ + if (tp->westwood.accounted >= tp->westwood.cumul_ack) + westwood_partial_update(tp); + else + westwood_complete_update(tp); + } + + tp->westwood.snd_una = tp->snd_una; + + return tp->westwood.cumul_ack; +} + + +/* + * @__tcp_westwood_slow_bw + * It is called when something is going wrong..even if there could + * be no problems! Infact a simple delayed packet may trigger a + * dupack. But we need to be careful in such case. + */ + +static void __tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + westwood_update_window(sk, tcp_time_stamp); + + tp->westwood.bk += westwood_acked_count(sk); + tp->westwood.rtt_min = westwood_update_rttmin(sk); +} + +static inline void tcp_westwood_slow_bw(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_is_westwood(tcp_sk(sk))) + __tcp_westwood_slow_bw(sk, skb); +} + +/* This routine deals with incoming acks, but not outgoing ones. */ +static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 prior_snd_una = tp->snd_una; + u32 ack_seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + u32 prior_in_flight; + s32 seq_rtt; + int prior_packets; + + /* If the ack is newer than sent or older than previous acks + * then we can probably ignore it. + */ + if (after(ack, tp->snd_nxt)) + goto uninteresting_ack; + + if (before(ack, prior_snd_una)) + goto old_ack; + + if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) { + /* Window is constant, pure forward advance. + * No more checks are required. + * Note, we use the fact that SND.UNA>=SND.WL2. + */ + tcp_update_wl(tp, ack, ack_seq); + tp->snd_una = ack; + tcp_westwood_fast_bw(sk, skb); + flag |= FLAG_WIN_UPDATE; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPACKS); + } else { + if (ack_seq != TCP_SKB_CB(skb)->end_seq) + flag |= FLAG_DATA; + else + NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); + + flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + + if (TCP_SKB_CB(skb)->sacked) + flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); + + if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + flag |= FLAG_ECE; + + tcp_westwood_slow_bw(sk,skb); + } + + /* We passed data and got it acked, remove any soft error + * log. Something worked... + */ + sk->sk_err_soft = 0; + tp->rcv_tstamp = tcp_time_stamp; + prior_packets = tp->packets_out; + if (!prior_packets) + goto no_queue; + + prior_in_flight = tcp_packets_in_flight(tp); + + /* See if we can take anything off of the retransmit queue. */ + flag |= tcp_clean_rtx_queue(sk, &seq_rtt); + + if (tp->frto_counter) + tcp_process_frto(sk, prior_snd_una); + + if (tcp_ack_is_dubious(tp, flag)) { + /* Advanve CWND, if state allows this. */ + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) && + tcp_may_raise_cwnd(tp, flag)) + tcp_cong_avoid(tp, ack, seq_rtt); + tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); + } else { + if ((flag & FLAG_DATA_ACKED) && + (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd)) + tcp_cong_avoid(tp, ack, seq_rtt); + } + + if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) + dst_confirm(sk->sk_dst_cache); + + return 1; + +no_queue: + tp->probes_out = 0; + + /* If this ack opens up a zero window, clear backoff. It was + * being used to time the probes, and is probably far higher than + * it needs to be for normal retransmission. 
+ */ + if (sk->sk_send_head) + tcp_ack_probe(sk); + return 1; + +old_ack: + if (TCP_SKB_CB(skb)->sacked) + tcp_sacktag_write_queue(sk, skb, prior_snd_una); + +uninteresting_ack: + SOCK_DEBUG(sk, "Ack %u out of %u:%u\n", ack, tp->snd_una, tp->snd_nxt); + return 0; +} + + +/* Look for tcp options. Normally only called on SYN and SYNACK packets. + * But, this can also be called on packets in the established flow when + * the fast version below fails. + */ +void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) +{ + unsigned char *ptr; + struct tcphdr *th = skb->h.th; + int length=(th->doff*4)-sizeof(struct tcphdr); + + ptr = (unsigned char *)(th + 1); + opt_rx->saw_tstamp = 0; + + while(length>0) { + int opcode=*ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ + length--; + continue; + default: + opsize=*ptr++; + if (opsize < 2) /* "silly options" */ + return; + if (opsize > length) + return; /* don't parse partial options */ + switch(opcode) { + case TCPOPT_MSS: + if(opsize==TCPOLEN_MSS && th->syn && !estab) { + u16 in_mss = ntohs(get_unaligned((__u16 *)ptr)); + if (in_mss) { + if (opt_rx->user_mss && opt_rx->user_mss < in_mss) + in_mss = opt_rx->user_mss; + opt_rx->mss_clamp = in_mss; + } + } + break; + case TCPOPT_WINDOW: + if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (sysctl_tcp_window_scaling) { + __u8 snd_wscale = *(__u8 *) ptr; + opt_rx->wscale_ok = 1; + if (snd_wscale > 14) { + if(net_ratelimit()) + printk(KERN_INFO "tcp_parse_options: Illegal window " + "scaling value %d >14 received.\n", + snd_wscale); + snd_wscale = 14; + } + opt_rx->snd_wscale = snd_wscale; + } + break; + case TCPOPT_TIMESTAMP: + if(opsize==TCPOLEN_TIMESTAMP) { + if ((estab && opt_rx->tstamp_ok) || + (!estab && sysctl_tcp_timestamps)) { + opt_rx->saw_tstamp = 1; + opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr)); + opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4))); + } + } + break; + case TCPOPT_SACK_PERM: + if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (sysctl_tcp_sack) { + opt_rx->sack_ok = 1; + tcp_sack_reset(opt_rx); + } + } + break; + + case TCPOPT_SACK: + if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && + opt_rx->sack_ok) { + TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; + } + }; + ptr+=opsize-2; + length-=opsize; + }; + } +} + +/* Fast parse options. This hopes to only see timestamps. + * If it is wrong it falls back on tcp_parse_options(). 
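For reference (sketch only; the macro name is made up): the fast option parser that follows expects the common layout NOP, NOP, TIMESTAMP (kind 8, length 10), which packs into a single 32-bit word, 0x0101080a, so one compare replaces the whole option walk above.

#include <stdint.h>

/* NOP(1), NOP(1), TIMESTAMP kind(8), length(10) as one host-order word. */
#define SKETCH_TSTAMP_WORD ((1u << 24) | (1u << 16) | (8u << 8) | 10u)

static int is_aligned_timestamp(uint32_t first_option_word)
{
        return first_option_word == SKETCH_TSTAMP_WORD;   /* 0x0101080a */
}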
+ */ +static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) +{ + if (th->doff == sizeof(struct tcphdr)>>2) { + tp->rx_opt.saw_tstamp = 0; + return 0; + } else if (tp->rx_opt.tstamp_ok && + th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) { + __u32 *ptr = (__u32 *)(th + 1); + if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) { + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + return 1; + } + } + tcp_parse_options(skb, &tp->rx_opt, 1); + return 1; +} + +static inline void tcp_store_ts_recent(struct tcp_sock *tp) +{ + tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; + tp->rx_opt.ts_recent_stamp = xtime.tv_sec; +} + +static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +{ + if (tp->rx_opt.saw_tstamp && !after(seq, tp->rcv_wup)) { + /* PAWS bug workaround wrt. ACK frames, the PAWS discard + * extra check below makes sure this can only happen + * for pure ACK frames. -DaveM + * + * Not only, also it occurs for expired timestamps. + */ + + if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + tcp_store_ts_recent(tp); + } +} + +/* Sorry, PAWS as specified is broken wrt. pure-ACKs -DaveM + * + * It is not fatal. If this ACK does _not_ change critical state (seqs, window) + * it can pass through stack. So, the following predicate verifies that + * this segment is not used for anything but congestion avoidance or + * fast retransmit. Moreover, we even are able to eliminate most of such + * second order effects, if we apply some small "replay" window (~RTO) + * to timestamp space. + * + * All these measures still do not guarantee that we reject wrapped ACKs + * on networks with high bandwidth, when sequence space is recycled fastly, + * but it guarantees that such events will be very rare and do not affect + * connection seriously. This doesn't look nice, but alas, PAWS is really + * buggy extension. + * + * [ Later note. Even worse! It is buggy for segments _with_ data. RFC + * states that events when retransmit arrives after original data are rare. + * It is a blatant lie. VJ forgot about fast retransmit! 8)8) It is + * the biggest problem on large power networks even with minor reordering. + * OK, let's give it small replay window. If peer clock is even 1hz, it is safe + * up to bandwidth of 18Gigabit/sec. 8) ] + */ + +static int tcp_disordered_ack(struct tcp_sock *tp, struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 ack = TCP_SKB_CB(skb)->ack_seq; + + return (/* 1. Pure ACK with correct sequence number. */ + (th->ack && seq == TCP_SKB_CB(skb)->end_seq && seq == tp->rcv_nxt) && + + /* 2. ... and duplicate ACK. */ + ack == tp->snd_una && + + /* 3. ... and does not update window. */ + !tcp_may_update_window(tp, ack, seq, ntohs(th->window) << tp->rx_opt.snd_wscale) && + + /* 4. ... and sits in replay window. */ + (s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) <= (tp->rto*1024)/HZ); +} + +static inline int tcp_paws_discard(struct tcp_sock *tp, struct sk_buff *skb) +{ + return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && + xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + !tcp_disordered_ack(tp, skb)); +} + +/* Check segment sequence number for validity. 
+ * + * Segment controls are considered valid, if the segment + * fits to the window after truncation to the window. Acceptability + * of data (and SYN, FIN, of course) is checked separately. + * See tcp_data_queue(), for example. + * + * Also, controls (RST is main one) are accepted using RCV.WUP instead + * of RCV.NXT. Peer still did not advance his SND.UNA when we + * delayed ACK, so that hisSND.UNA<=ourRCV.WUP. + * (borrowed from freebsd) + */ + +static inline int tcp_sequence(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + return !before(end_seq, tp->rcv_wup) && + !after(seq, tp->rcv_nxt + tcp_receive_window(tp)); +} + +/* When we get a reset we do this. */ +static void tcp_reset(struct sock *sk) +{ + /* We want the right error as BSD sees it (and indeed as we do). */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + + tcp_done(sk); +} + +/* + * Process the FIN bit. This now behaves as it is supposed to work + * and the FIN takes effect when it is validly part of sequence + * space. Not before when we get holes. + * + * If we are ESTABLISHED, a received fin moves us to CLOSE-WAIT + * (and thence onto LAST-ACK and finally, CLOSE, we never enter + * TIME-WAIT) + * + * If we are in FINWAIT-1, a received FIN indicates simultaneous + * close and we go into CLOSING (and later onto TIME-WAIT) + * + * If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT. + */ +static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_schedule_ack(tp); + + sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { + case TCP_SYN_RECV: + case TCP_ESTABLISHED: + /* Move to CLOSE_WAIT */ + tcp_set_state(sk, TCP_CLOSE_WAIT); + tp->ack.pingpong = 1; + break; + + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + /* Received a retransmission of the FIN, do + * nothing. + */ + break; + case TCP_LAST_ACK: + /* RFC793: Remain in the LAST-ACK state. */ + break; + + case TCP_FIN_WAIT1: + /* This case occurs when a simultaneous close + * happens, we must ack the received FIN and + * enter the CLOSING state. + */ + tcp_send_ack(sk); + tcp_set_state(sk, TCP_CLOSING); + break; + case TCP_FIN_WAIT2: + /* Received a FIN -- send ACK and enter TIME_WAIT. */ + tcp_send_ack(sk); + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + break; + default: + /* Only TCP_LISTEN and TCP_CLOSE are left, in these + * cases we should never reach this piece of code. + */ + printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", + __FUNCTION__, sk->sk_state); + break; + }; + + /* It _is_ possible, that we have something out-of-order _after_ FIN. + * Probably, we should reset in this case. For now drop them. + */ + __skb_queue_purge(&tp->out_of_order_queue); + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + sk_stream_mem_reclaim(sk); + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + + /* Do not send POLL_HUP for half duplex close. 
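Restated as a sketch (helper names invented) because it is easy to misread: tcp_sequence() earlier in this hunk accepts a segment only if it is not entirely to the left of RCV.WUP and does not start beyond the right edge of the advertised receive window.

#include <stdint.h>

static int before32(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after32(uint32_t a, uint32_t b)  { return (int32_t)(a - b) > 0; }

static int seq_acceptable(uint32_t seq, uint32_t end_seq,
                          uint32_t rcv_wup, uint32_t rcv_nxt,
                          uint32_t rcv_wnd)
{
        return !before32(end_seq, rcv_wup) &&
               !after32(seq, rcv_nxt + rcv_wnd);
}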
*/ + if (sk->sk_shutdown == SHUTDOWN_MASK || + sk->sk_state == TCP_CLOSE) + sk_wake_async(sk, 1, POLL_HUP); + else + sk_wake_async(sk, 1, POLL_IN); + } +} + +static __inline__ int +tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +{ + if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { + if (before(seq, sp->start_seq)) + sp->start_seq = seq; + if (after(end_seq, sp->end_seq)) + sp->end_seq = end_seq; + return 1; + } + return 0; +} + +static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (before(seq, tp->rcv_nxt)) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFOSENT); + + tp->rx_opt.dsack = 1; + tp->duplicate_sack[0].start_seq = seq; + tp->duplicate_sack[0].end_seq = end_seq; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + 1, 4 - tp->rx_opt.tstamp_ok); + } +} + +static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +{ + if (!tp->rx_opt.dsack) + tcp_dsack_set(tp, seq, end_seq); + else + tcp_sack_extend(tp->duplicate_sack, seq, end_seq); +} + +static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + tcp_enter_quickack_mode(tp); + + if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) + end_seq = tp->rcv_nxt; + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, end_seq); + } + } + + tcp_send_ack(sk); +} + +/* These routines update the SACK block as out-of-order packets arrive or + * in-order packets close up the sequence space. + */ +static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) +{ + int this_sack; + struct tcp_sack_block *sp = &tp->selective_acks[0]; + struct tcp_sack_block *swalk = sp+1; + + /* See if the recent change to the first SACK eats into + * or hits the sequence space of other SACK blocks, if so coalesce. + */ + for (this_sack = 1; this_sack < tp->rx_opt.num_sacks; ) { + if (tcp_sack_extend(sp, swalk->start_seq, swalk->end_seq)) { + int i; + + /* Zap SWALK, by moving every further SACK up by one slot. + * Decrease num_sacks. + */ + tp->rx_opt.num_sacks--; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + sp[i] = sp[i+1]; + continue; + } + this_sack++, swalk++; + } +} + +static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +{ + __u32 tmp; + + tmp = sack1->start_seq; + sack1->start_seq = sack2->start_seq; + sack2->start_seq = tmp; + + tmp = sack1->end_seq; + sack1->end_seq = sack2->end_seq; + sack2->end_seq = tmp; +} + +static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int cur_sacks = tp->rx_opt.num_sacks; + int this_sack; + + if (!cur_sacks) + goto new_sack; + + for (this_sack=0; this_sack0; this_sack--, sp--) + tcp_sack_swap(sp, sp-1); + if (cur_sacks > 1) + tcp_sack_maybe_coalesce(tp); + return; + } + } + + /* Could not find an adjacent existing SACK, build a new one, + * put it at the front, and shift everyone else down. We + * always know there is at least one SACK present already here. + * + * If the sack array is full, forget about the last one. 
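For context (illustration only, not the patch's code): tcp_sack_new_ofo_skb() above walks the existing blocks, and when tcp_sack_extend() can merge the new range into one of them it swaps that block to the front with tcp_sack_swap(). The sketch below shows the merge rule itself, e.g. {100,200} extended by {180,260} becomes {100,260}.

#include <stdint.h>

struct sack_sketch { uint32_t start_seq, end_seq; };

static int before32(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static int after32(uint32_t a, uint32_t b)  { return (int32_t)(a - b) > 0; }

static int sack_extend_sketch(struct sack_sketch *sp,
                              uint32_t seq, uint32_t end_seq)
{
        if (!after32(seq, sp->end_seq) && !after32(sp->start_seq, end_seq)) {
                if (before32(seq, sp->start_seq))
                        sp->start_seq = seq;
                if (after32(end_seq, sp->end_seq))
                        sp->end_seq = end_seq;
                return 1;       /* ranges touch or overlap: merged */
        }
        return 0;               /* disjoint: caller adds a new block */
}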
+ */ + if (this_sack >= 4) { + this_sack--; + tp->rx_opt.num_sacks--; + sp--; + } + for(; this_sack > 0; this_sack--, sp--) + *sp = *(sp-1); + +new_sack: + /* Build the new head SACK, and we're done. */ + sp->start_seq = seq; + sp->end_seq = end_seq; + tp->rx_opt.num_sacks++; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); +} + +/* RCV.NXT advances, some SACKs should be eaten. */ + +static void tcp_sack_remove(struct tcp_sock *tp) +{ + struct tcp_sack_block *sp = &tp->selective_acks[0]; + int num_sacks = tp->rx_opt.num_sacks; + int this_sack; + + /* Empty ofo queue, hence, all the SACKs are eaten. Clear. */ + if (skb_queue_len(&tp->out_of_order_queue) == 0) { + tp->rx_opt.num_sacks = 0; + tp->rx_opt.eff_sacks = tp->rx_opt.dsack; + return; + } + + for(this_sack = 0; this_sack < num_sacks; ) { + /* Check if the start of the sack is covered by RCV.NXT. */ + if (!before(tp->rcv_nxt, sp->start_seq)) { + int i; + + /* RCV.NXT must cover all the block! */ + BUG_TRAP(!before(tp->rcv_nxt, sp->end_seq)); + + /* Zap this SACK, by moving forward any other SACKS. */ + for (i=this_sack+1; i < num_sacks; i++) + tp->selective_acks[i-1] = tp->selective_acks[i]; + num_sacks--; + continue; + } + this_sack++; + sp++; + } + if (num_sacks != tp->rx_opt.num_sacks) { + tp->rx_opt.num_sacks = num_sacks; + tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); + } +} + +/* This one checks to see if we can put data from the + * out_of_order queue into the receive_queue. + */ +static void tcp_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + __u32 dsack_high = tp->rcv_nxt; + struct sk_buff *skb; + + while ((skb = skb_peek(&tp->out_of_order_queue)) != NULL) { + if (after(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + + if (before(TCP_SKB_CB(skb)->seq, dsack_high)) { + __u32 dsack = dsack_high; + if (before(TCP_SKB_CB(skb)->end_seq, dsack_high)) + dsack_high = TCP_SKB_CB(skb)->end_seq; + tcp_dsack_extend(tp, TCP_SKB_CB(skb)->seq, dsack); + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + SOCK_DEBUG(sk, "ofo packet was already received \n"); + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + continue; + } + SOCK_DEBUG(sk, "ofo requeuing : rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + __skb_unlink(skb, skb->list); + __skb_queue_tail(&sk->sk_receive_queue, skb); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->h.th->fin) + tcp_fin(skb, sk, skb->h.th); + } +} + +static int tcp_prune_queue(struct sock *sk); + +static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct tcp_sock *tp = tcp_sk(sk); + int eaten = -1; + + if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) + goto drop; + + th = skb->h.th; + __skb_pull(skb, th->doff*4); + + TCP_ECN_accept_cwr(tp, skb); + + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = min_t(unsigned int, tp->rx_opt.num_sacks, + 4 - tp->rx_opt.tstamp_ok); + } + + /* Queue data for delivery to the user. + * Packets in sequence go to the receive queue. + * Out of sequence packets to the out_of_order_queue. + */ + if (TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + if (tcp_receive_window(tp) == 0) + goto out_of_window; + + /* Ok. In sequence. In window. 
*/ + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && tp->ucopy.len && + sock_owned_by_user(sk) && !tp->urg_data) { + int chunk = min_t(unsigned int, skb->len, + tp->ucopy.len); + + __set_current_state(TASK_RUNNING); + + local_bh_enable(); + if (!skb_copy_datagram_iovec(skb, 0, tp->ucopy.iov, chunk)) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + eaten = (chunk == skb->len && !th->fin); + tcp_rcv_space_adjust(sk); + } + local_bh_disable(); + } + + if (eaten <= 0) { +queue_and_out: + if (eaten < 0 && + (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_stream_rmem_schedule(sk, skb))) { + if (tcp_prune_queue(sk) < 0 || + !sk_stream_rmem_schedule(sk, skb)) + goto drop; + } + sk_stream_set_owner_r(skb, sk); + __skb_queue_tail(&sk->sk_receive_queue, skb); + } + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if(skb->len) + tcp_event_data_recv(sk, tp, skb); + if(th->fin) + tcp_fin(skb, sk, th); + + if (skb_queue_len(&tp->out_of_order_queue)) { + tcp_ofo_queue(sk); + + /* RFC2581. 4.2. SHOULD send immediate ACK, when + * gap in queue is filled. + */ + if (!skb_queue_len(&tp->out_of_order_queue)) + tp->ack.pingpong = 0; + } + + if (tp->rx_opt.num_sacks) + tcp_sack_remove(tp); + + tcp_fast_path_check(sk, tp); + + if (eaten > 0) + __kfree_skb(skb); + else if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + return; + } + + if (!after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) { + /* A retransmit, 2nd most common case. Force an immediate ack. */ + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + +out_of_window: + tcp_enter_quickack_mode(tp); + tcp_schedule_ack(tp); +drop: + __kfree_skb(skb); + return; + } + + /* Out of window. F.e. zero window probe. */ + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt + tcp_receive_window(tp))) + goto out_of_window; + + tcp_enter_quickack_mode(tp); + + if (before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + /* Partial packet, seq < rcv_next < end_seq */ + SOCK_DEBUG(sk, "partial packet: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq); + + tcp_dsack_set(tp, TCP_SKB_CB(skb)->seq, tp->rcv_nxt); + + /* If window is closed, drop tail of packet. But after + * remembering D-SACK for its head made in previous line. + */ + if (!tcp_receive_window(tp)) + goto out_of_window; + goto queue_and_out; + } + + TCP_ECN_check_ce(tp, skb); + + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + !sk_stream_rmem_schedule(sk, skb)) { + if (tcp_prune_queue(sk) < 0 || + !sk_stream_rmem_schedule(sk, skb)) + goto drop; + } + + /* Disable header prediction. */ + tp->pred_flags = 0; + tcp_schedule_ack(tp); + + SOCK_DEBUG(sk, "out of order segment: rcv_next %X seq %X - %X\n", + tp->rcv_nxt, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq); + + sk_stream_set_owner_r(skb, sk); + + if (!skb_peek(&tp->out_of_order_queue)) { + /* Initial out of order segment, build 1 SACK. 
*/ + if (tp->rx_opt.sack_ok) { + tp->rx_opt.num_sacks = 1; + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = 1; + tp->selective_acks[0].start_seq = TCP_SKB_CB(skb)->seq; + tp->selective_acks[0].end_seq = + TCP_SKB_CB(skb)->end_seq; + } + __skb_queue_head(&tp->out_of_order_queue,skb); + } else { + struct sk_buff *skb1 = tp->out_of_order_queue.prev; + u32 seq = TCP_SKB_CB(skb)->seq; + u32 end_seq = TCP_SKB_CB(skb)->end_seq; + + if (seq == TCP_SKB_CB(skb1)->end_seq) { + __skb_append(skb1, skb); + + if (!tp->rx_opt.num_sacks || + tp->selective_acks[0].end_seq != seq) + goto add_sack; + + /* Common case: data arrive in order after hole. */ + tp->selective_acks[0].end_seq = end_seq; + return; + } + + /* Find place to insert this segment. */ + do { + if (!after(TCP_SKB_CB(skb1)->seq, seq)) + break; + } while ((skb1 = skb1->prev) != + (struct sk_buff*)&tp->out_of_order_queue); + + /* Do skb overlap to previous one? */ + if (skb1 != (struct sk_buff*)&tp->out_of_order_queue && + before(seq, TCP_SKB_CB(skb1)->end_seq)) { + if (!after(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + /* All the bits are present. Drop. */ + __kfree_skb(skb); + tcp_dsack_set(tp, seq, end_seq); + goto add_sack; + } + if (after(seq, TCP_SKB_CB(skb1)->seq)) { + /* Partial overlap. */ + tcp_dsack_set(tp, seq, TCP_SKB_CB(skb1)->end_seq); + } else { + skb1 = skb1->prev; + } + } + __skb_insert(skb, skb1, skb1->next, &tp->out_of_order_queue); + + /* And clean segments covered by new one as whole. */ + while ((skb1 = skb->next) != + (struct sk_buff*)&tp->out_of_order_queue && + after(end_seq, TCP_SKB_CB(skb1)->seq)) { + if (before(end_seq, TCP_SKB_CB(skb1)->end_seq)) { + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, end_seq); + break; + } + __skb_unlink(skb1, skb1->list); + tcp_dsack_extend(tp, TCP_SKB_CB(skb1)->seq, TCP_SKB_CB(skb1)->end_seq); + __kfree_skb(skb1); + } + +add_sack: + if (tp->rx_opt.sack_ok) + tcp_sack_new_ofo_skb(sk, seq, end_seq); + } +} + +/* Collapse contiguous sequence of skbs head..tail with + * sequence numbers start..end. + * Segments with FIN/SYN are not collapsed (only because this + * simplifies code) + */ +static void +tcp_collapse(struct sock *sk, struct sk_buff *head, + struct sk_buff *tail, u32 start, u32 end) +{ + struct sk_buff *skb; + + /* First, check that queue is collapsable and find + * the point where collapsing can be useful. */ + for (skb = head; skb != tail; ) { + /* No new bits? It is possible on ofo queue. */ + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + struct sk_buff *next = skb->next; + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + skb = next; + continue; + } + + /* The first skb to collapse is: + * - not SYN/FIN and + * - bloated or contains data before "start" or + * overlaps to the next one. + */ + if (!skb->h.th->syn && !skb->h.th->fin && + (tcp_win_from_space(skb->truesize) > skb->len || + before(TCP_SKB_CB(skb)->seq, start) || + (skb->next != tail && + TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb->next)->seq))) + break; + + /* Decided to skip this, advance start seq. */ + start = TCP_SKB_CB(skb)->end_seq; + skb = skb->next; + } + if (skb == tail || skb->h.th->syn || skb->h.th->fin) + return; + + while (before(start, end)) { + struct sk_buff *nskb; + int header = skb_headroom(skb); + int copy = SKB_MAX_ORDER(header, 0); + + /* Too big header? This can happen with IPv6. 
*/ + if (copy < 0) + return; + if (end-start < copy) + copy = end-start; + nskb = alloc_skb(copy+header, GFP_ATOMIC); + if (!nskb) + return; + skb_reserve(nskb, header); + memcpy(nskb->head, skb->head, header); + nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); + nskb->h.raw = nskb->head + (skb->h.raw-skb->head); + nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); + memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); + TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; + __skb_insert(nskb, skb->prev, skb, skb->list); + sk_stream_set_owner_r(nskb, sk); + + /* Copy data, releasing collapsed skbs. */ + while (copy > 0) { + int offset = start - TCP_SKB_CB(skb)->seq; + int size = TCP_SKB_CB(skb)->end_seq - start; + + if (offset < 0) BUG(); + if (size > 0) { + size = min(copy, size); + if (skb_copy_bits(skb, offset, skb_put(nskb, size), size)) + BUG(); + TCP_SKB_CB(nskb)->end_seq += size; + copy -= size; + start += size; + } + if (!before(start, TCP_SKB_CB(skb)->end_seq)) { + struct sk_buff *next = skb->next; + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); + skb = next; + if (skb == tail || skb->h.th->syn || skb->h.th->fin) + return; + } + } + } +} + +/* Collapse ofo queue. Algorithm: select contiguous sequence of skbs + * and tcp_collapse() them until all the queue is collapsed. + */ +static void tcp_collapse_ofo_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); + struct sk_buff *head; + u32 start, end; + + if (skb == NULL) + return; + + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + head = skb; + + for (;;) { + skb = skb->next; + + /* Segment is terminated when we see gap or when + * we are at the end of all the queue. */ + if (skb == (struct sk_buff *)&tp->out_of_order_queue || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { + tcp_collapse(sk, head, skb, start, end); + head = skb; + if (skb == (struct sk_buff *)&tp->out_of_order_queue) + break; + /* Start new segment */ + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; + } else { + if (before(TCP_SKB_CB(skb)->seq, start)) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) + end = TCP_SKB_CB(skb)->end_seq; + } + } +} + +/* Reduce allocated memory if we can, trying to get + * the socket within its memory limits again. + * + * Return less than zero if we should start dropping frames + * until the socket owning process reads some of the data + * to stabilize the situation. + */ +static int tcp_prune_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + SOCK_DEBUG(sk, "prune_queue: c=%x\n", tp->copied_seq); + + NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); + + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk, tp); + else if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + + tcp_collapse_ofo_queue(sk); + tcp_collapse(sk, sk->sk_receive_queue.next, + (struct sk_buff*)&sk->sk_receive_queue, + tp->copied_seq, tp->rcv_nxt); + sk_stream_mem_reclaim(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + + /* Collapsing did not help, destructive actions follow. + * This must not ever occur. */ + + /* First, purge the out_of_order queue. */ + if (skb_queue_len(&tp->out_of_order_queue)) { + NET_ADD_STATS_BH(LINUX_MIB_OFOPRUNED, + skb_queue_len(&tp->out_of_order_queue)); + __skb_queue_purge(&tp->out_of_order_queue); + + /* Reset SACK state. 
A conforming SACK implementation will + * do the same at a timeout based retransmit. When a connection + * is in a sad state like this, we care only about integrity + * of the connection not performance. + */ + if (tp->rx_opt.sack_ok) + tcp_sack_reset(&tp->rx_opt); + sk_stream_mem_reclaim(sk); + } + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) + return 0; + + /* If we are really being abused, tell the caller to silently + * drop receive data on the floor. It will get retransmitted + * and hopefully then we'll have sufficient space. + */ + NET_INC_STATS_BH(LINUX_MIB_RCVPRUNED); + + /* Massive buffer overcommit. */ + tp->pred_flags = 0; + return -1; +} + + +/* RFC2861, slow part. Adjust cwnd, after it was not full during one rto. + * As additional protections, we do not touch cwnd in retransmission phases, + * and if application hit its sndbuf limit recently. + */ +void tcp_cwnd_application_limited(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->ca_state == TCP_CA_Open && + sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) { + /* Limited by application or receiver window. */ + u32 win_used = max(tp->snd_cwnd_used, 2U); + if (win_used < tp->snd_cwnd) { + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1; + } + tp->snd_cwnd_used = 0; + } + tp->snd_cwnd_stamp = tcp_time_stamp; +} + + +/* When incoming ACK allowed to free some skb from write_queue, + * we remember this event in flag SOCK_QUEUE_SHRUNK and wake up socket + * on the exit from tcp input handler. + * + * PROBLEM: sndbuf expansion does not work well with largesend. + */ +static void tcp_new_space(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->packets_out < tp->snd_cwnd && + !(sk->sk_userlocks & SOCK_SNDBUF_LOCK) && + !tcp_memory_pressure && + atomic_read(&tcp_memory_allocated) < sysctl_tcp_mem[0]) { + int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) + + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), + demanded = max_t(unsigned int, tp->snd_cwnd, + tp->reordering + 1); + sndmem *= 2*demanded; + if (sndmem > sk->sk_sndbuf) + sk->sk_sndbuf = min(sndmem, sysctl_tcp_wmem[2]); + tp->snd_cwnd_stamp = tcp_time_stamp; + } + + sk->sk_write_space(sk); +} + +static inline void tcp_check_space(struct sock *sk) +{ + if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { + sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); + if (sk->sk_socket && + test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) + tcp_new_space(sk); + } +} + +static void __tcp_data_snd_check(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (after(TCP_SKB_CB(skb)->end_seq, tp->snd_una + tp->snd_wnd) || + tcp_packets_in_flight(tp) >= tp->snd_cwnd || + tcp_write_xmit(sk, tp->nonagle)) + tcp_check_probe_timer(sk, tp); +} + +static __inline__ void tcp_data_snd_check(struct sock *sk) +{ + struct sk_buff *skb = sk->sk_send_head; + + if (skb != NULL) + __tcp_data_snd_check(sk, skb); + tcp_check_space(sk); +} + +/* + * Check if sending an ack is needed. + */ +static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > tp->ack.rcv_mss + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). Or... + */ + && __tcp_select_window(sk) >= tp->rcv_wnd) || + /* We ACK each frame or... */ + tcp_in_quickack_mode(tp) || + /* We have out of order data. 
*/ + (ofo_possible && + skb_peek(&tp->out_of_order_queue))) { + /* Then ack it now */ + tcp_send_ack(sk); + } else { + /* Else, send delayed ack. */ + tcp_send_delayed_ack(sk); + } +} + +static __inline__ void tcp_ack_snd_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + if (!tcp_ack_scheduled(tp)) { + /* We sent a data segment already. */ + return; + } + __tcp_ack_snd_check(sk, 1); +} + +/* + * This routine is only called when we have urgent data + * signalled. Its the 'slow' part of tcp_urg. It could be + * moved inline now as tcp_urg is only called from one + * place. We handle URGent data wrong. We have to - as + * BSD still doesn't use the correction from RFC961. + * For 1003.1g we should support a new option TCP_STDURG to permit + * either form (or just set the sysctl tcp_stdurg). + */ + +static void tcp_check_urg(struct sock * sk, struct tcphdr * th) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 ptr = ntohs(th->urg_ptr); + + if (ptr && !sysctl_tcp_stdurg) + ptr--; + ptr += ntohl(th->seq); + + /* Ignore urgent data that we've already seen and read. */ + if (after(tp->copied_seq, ptr)) + return; + + /* Do not replay urg ptr. + * + * NOTE: interesting situation not covered by specs. + * Misbehaving sender may send urg ptr, pointing to segment, + * which we already have in ofo queue. We are not able to fetch + * such data and will stay in TCP_URG_NOTYET until will be eaten + * by recvmsg(). Seems, we are not obliged to handle such wicked + * situations. But it is worth to think about possibility of some + * DoSes using some hypothetical application level deadlock. + */ + if (before(ptr, tp->rcv_nxt)) + return; + + /* Do we already have a newer (or duplicate) urgent pointer? */ + if (tp->urg_data && !after(ptr, tp->urg_seq)) + return; + + /* Tell the world about our new urgent pointer. */ + sk_send_sigurg(sk); + + /* We may be adding urgent data when the last byte read was + * urgent. To do this requires some care. We cannot just ignore + * tp->copied_seq since we would read the last urgent byte again + * as data, nor can we alter copied_seq until this data arrives + * or we break the sematics of SIOCATMARK (and thus sockatmark()) + * + * NOTE. Double Dutch. Rendering to plain English: author of comment + * above did something sort of send("A", MSG_OOB); send("B", MSG_OOB); + * and expect that both A and B disappear from stream. This is _wrong_. + * Though this happens in BSD with high probability, this is occasional. + * Any application relying on this is buggy. Note also, that fix "works" + * only in this artificial test. Insert some normal data between A and B and we will + * decline of BSD again. Verdict: it is better to remove to trap + * buggy users. + */ + if (tp->urg_seq == tp->copied_seq && tp->urg_data && + !sock_flag(sk, SOCK_URGINLINE) && + tp->copied_seq != tp->rcv_nxt) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + tp->copied_seq++; + if (skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)) { + __skb_unlink(skb, skb->list); + __kfree_skb(skb); + } + } + + tp->urg_data = TCP_URG_NOTYET; + tp->urg_seq = ptr; + + /* Disable header prediction. */ + tp->pred_flags = 0; +} + +/* This is the 'fast' part of urgent handling. */ +static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Check if we get a new urgent pointer - normally not. */ + if (th->urg) + tcp_check_urg(sk,th); + + /* Do we wait for any urgent data? - normally not... 
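A sketch (names invented) of the pointer arithmetic in tcp_check_urg() above: with the BSD interpretation (tcp_stdurg unset) the urgent pointer is taken to point one byte past the last urgent byte, hence the decrement before adding the segment sequence number.

#include <stdint.h>

static uint32_t urg_seq_sketch(uint32_t seg_seq, uint16_t urg_ptr, int stdurg)
{
        uint32_t ptr = urg_ptr;

        if (ptr && !stdurg)
                ptr--;          /* BSD style: back up to the last urgent byte */
        return seg_seq + ptr;
}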
*/ + if (tp->urg_data == TCP_URG_NOTYET) { + u32 ptr = tp->urg_seq - ntohl(th->seq) + (th->doff * 4) - + th->syn; + + /* Is the urgent pointer pointing into this packet? */ + if (ptr < skb->len) { + u8 tmp; + if (skb_copy_bits(skb, ptr, &tmp, 1)) + BUG(); + tp->urg_data = TCP_URG_VALID | tmp; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, 0); + } + } +} + +static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int err; + + local_bh_enable(); + if (skb->ip_summed==CHECKSUM_UNNECESSARY) + err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); + else + err = skb_copy_and_csum_datagram_iovec(skb, hlen, + tp->ucopy.iov); + + if (!err) { + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + } + + local_bh_disable(); + return err; +} + +static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + int result; + + if (sock_owned_by_user(sk)) { + local_bh_enable(); + result = __tcp_checksum_complete(skb); + local_bh_disable(); + } else { + result = __tcp_checksum_complete(skb); + } + return result; +} + +static __inline__ int +tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __tcp_checksum_complete_user(sk, skb); +} + +/* + * TCP receive function for the ESTABLISHED state. + * + * It is split into a fast path and a slow path. The fast path is + * disabled when: + * - A zero window was announced from us - zero window probing + * is only handled properly in the slow path. + * - Out of order segments arrived. + * - Urgent data is expected. + * - There is no buffer space left. + * - Unexpected TCP flags/window values/header lengths are received + * (detected by checking the TCP header against pred_flags). + * - Data is sent in both directions. Fast path only supports pure senders + * or pure receivers (this means either the sequence number or the ack + * value must stay constant). + * - Unexpected TCP option. + * + * When these conditions are not satisfied it drops into a standard + * receive procedure patterned after RFC793 to handle all cases. + * The first three cases are guaranteed by proper pred_flags setting, + * the rest is checked inline. Fast processing is turned on in + * tcp_data_queue when everything is OK. + */ +int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* + * Header prediction. + * The code loosely follows the one in the famous + * "30 instruction TCP receive" Van Jacobson mail. + * + * Van's trick is to deposit buffers into socket queue + * on a device interrupt, to call tcp_recv function + * on the receive process context and checksum and copy + * the buffer to user space. smart... + * + * Our current scheme is not silly either but we take the + * extra cost of the net_bh soft interrupt processing... + * We do checksum and copy also but from device to kernel. + */ + + tp->rx_opt.saw_tstamp = 0; + + /* pred_flags is 0xS?10 << 16 + snd_wnd + * if header_prediction is to be made + * 'S' will always be tp->tcp_header_len >> 2 + * '?' will be 0 for the fast path, otherwise pred_flags is 0 to + * turn it off (when there are holes in the receive + * space for instance) + * PSH flag is ignored.
+ */ + + if ((tcp_flag_word(th) & TCP_HP_BITS) == tp->pred_flags && + TCP_SKB_CB(skb)->seq == tp->rcv_nxt) { + int tcp_header_len = tp->tcp_header_len; + + /* Timestamp header prediction: tcp_header_len + * is automatically equal to th->doff*4 due to pred_flags + * match. + */ + + /* Check timestamp */ + if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) { + __u32 *ptr = (__u32 *)(th + 1); + + /* No? Slow path! */ + if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) + | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) + goto slow_path; + + tp->rx_opt.saw_tstamp = 1; + ++ptr; + tp->rx_opt.rcv_tsval = ntohl(*ptr); + ++ptr; + tp->rx_opt.rcv_tsecr = ntohl(*ptr); + + /* If PAWS failed, check it more carefully in slow path */ + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) < 0) + goto slow_path; + + /* DO NOT update ts_recent here, if checksum fails + * and timestamp was corrupted part, it will result + * in a hung connection since we will drop all + * future packets due to the PAWS test. + */ + } + + if (len <= tcp_header_len) { + /* Bulk data transfer: sender */ + if (len == tcp_header_len) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(tp, skb); + + /* We know that such packets are checksummed + * on entry. + */ + tcp_ack(sk, skb, 0); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } else { /* Header too small */ + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; + } + } else { + int eaten = 0; + + if (tp->ucopy.task == current && + tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len && + sock_owned_by_user(sk)) { + __set_current_state(TASK_RUNNING); + + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(tp, skb); + + __skb_pull(skb, tcp_header_len); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); + eaten = 1; + } + } + if (!eaten) { + if (tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* Predicted packet is in window by definition. + * seq == rcv_nxt and rcv_wup <= rcv_nxt. + * Hence, check seq<=rcv_wup reduces to: + */ + if (tcp_header_len == + (sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) && + tp->rcv_nxt == tp->rcv_wup) + tcp_store_ts_recent(tp); + + tcp_rcv_rtt_measure_ts(tp, skb); + + if ((int)skb->truesize > sk->sk_forward_alloc) + goto step5; + + NET_INC_STATS_BH(LINUX_MIB_TCPHPHITS); + + /* Bulk data transfer: receiver */ + __skb_pull(skb,tcp_header_len); + __skb_queue_tail(&sk->sk_receive_queue, skb); + sk_stream_set_owner_r(skb, sk); + tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; + } + + tcp_event_data_recv(sk, tp, skb); + + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { + /* Well, only one small jumplet in fast path... 
*/ + tcp_ack(sk, skb, FLAG_DATA); + tcp_data_snd_check(sk); + if (!tcp_ack_scheduled(tp)) + goto no_ack; + } + + if (eaten) { + if (tcp_in_quickack_mode(tp)) { + tcp_send_ack(sk); + } else { + tcp_send_delayed_ack(sk); + } + } else { + __tcp_ack_snd_check(sk, 0); + } + +no_ack: + if (eaten) + __kfree_skb(skb); + else + sk->sk_data_ready(sk, 0); + return 0; + } + } + +slow_path: + if (len < (th->doff<<2) || tcp_checksum_complete_user(sk, skb)) + goto csum_error; + + /* + * RFC1323: H1. Apply PAWS check first. + */ + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Resets are accepted even if PAWS failed. + + ts_recent update must be made after we are sure + that the packet is in window. + */ + } + + /* + * Standard slow path. + */ + + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + /* RFC793, page 37: "In all states except SYN-SENT, all reset + * (RST) segments are validated by checking their SEQ-fields." + * And page 69: "If an incoming segment is not acceptable, + * an acknowledgment should be sent in reply (unless the RST bit + * is set, if so drop the segment and return)". + */ + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + TCP_INC_STATS_BH(TCP_MIB_INERRS); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return 1; + } + +step5: + if(th->ack) + tcp_ack(sk, skb, FLAG_SLOWPATH); + + tcp_rcv_rtt_measure_ts(tp, skb); + + /* Process urgent data. */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + tcp_data_queue(sk, skb); + + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + return 0; + +csum_error: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + +discard: + __kfree_skb(skb); + return 0; +} + +static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int saved_clamp = tp->rx_opt.mss_clamp; + + tcp_parse_options(skb, &tp->rx_opt, 0); + + if (th->ack) { + /* rfc793: + * "If the state is SYN-SENT then + * first check the ACK bit + * If the ACK bit is set + * If SEG.ACK =< ISS, or SEG.ACK > SND.NXT, send + * a reset (unless the RST bit is set, if so drop + * the segment and return)" + * + * We do not send data with SYN, so that RFC-correct + * test reduces to: + */ + if (TCP_SKB_CB(skb)->ack_seq != tp->snd_nxt) + goto reset_and_undo; + + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !between(tp->rx_opt.rcv_tsecr, tp->retrans_stamp, + tcp_time_stamp)) { + NET_INC_STATS_BH(LINUX_MIB_PAWSACTIVEREJECTED); + goto reset_and_undo; + } + + /* Now ACK is acceptable. + * + * "If the RST bit is set + * If the ACK was acceptable then signal the user "error: + * connection reset", drop the segment, enter CLOSED state, + * delete TCB, and return." + */ + + if (th->rst) { + tcp_reset(sk); + goto discard; + } + + /* rfc793: + * "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + * + * See note below! + * --ANK(990513) + */ + if (!th->syn) + goto discard_and_undo; + + /* rfc793: + * "If the SYN bit is on ... + * are acceptable then ... + * (our SYN has been ACKed), change the connection + * state to ESTABLISHED..." 
+ */ + + TCP_ECN_rcv_synack(tp, th); + if (tp->ecn_flags&TCP_ECN_OK) + sock_set_flag(sk, SOCK_NO_LARGESEND); + + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tcp_ack(sk, skb, FLAG_SLOWPATH); + + /* Ok.. it's good. Set up sequence numbers and + * move to established. + */ + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. + */ + tp->snd_wnd = ntohs(th->window); + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, TCP_SKB_CB(skb)->seq); + + if (!tp->rx_opt.wscale_ok) { + tp->rx_opt.snd_wscale = tp->rx_opt.rcv_wscale = 0; + tp->window_clamp = min(tp->window_clamp, 65535U); + } + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + tcp_store_ts_recent(tp); + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + if (tp->rx_opt.sack_ok && sysctl_tcp_fack) + tp->rx_opt.sack_ok |= 2; + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + /* Remember, tcp_poll() does not lock socket! + * Change state from SYN-SENT only after copied_seq + * is initialized. */ + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + + /* Make sure socket is routed, for correct metrics. */ + tp->af_specific->rebuild_header(sk); + + tcp_init_metrics(sk); + + /* Prevent spurious tcp_cwnd_restart() on first data + * packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_init_buffer_space(sk); + + if (sock_flag(sk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tp)); + + if (!tp->rx_opt.snd_wscale) + __tcp_fast_path_on(tp, tp->snd_wnd); + else + tp->pred_flags = 0; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, 0, POLL_OUT); + } + + if (sk->sk_write_pending || tp->defer_accept || tp->ack.pingpong) { + /* Save one ACK. Data will be ready after + * several ticks, if write_pending is set. + * + * It may be deleted, but with this feature tcpdumps + * look so _wonderfully_ clever, that I was not able + * to stand against the temptation 8) --ANK + */ + tcp_schedule_ack(tp); + tp->ack.lrcvtime = tcp_time_stamp; + tp->ack.ato = TCP_ATO_MIN; + tcp_incr_quickack(tp); + tcp_enter_quickack_mode(tp); + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + +discard: + __kfree_skb(skb); + return 0; + } else { + tcp_send_ack(sk); + } + return -1; + } + + /* No ACK in the segment */ + + if (th->rst) { + /* rfc793: + * "If the RST bit is set + * + * Otherwise (no ACK) drop the segment and return." + */ + + goto discard_and_undo; + } + + /* PAWS check. */ + if (tp->rx_opt.ts_recent_stamp && tp->rx_opt.saw_tstamp && tcp_paws_check(&tp->rx_opt, 0)) + goto discard_and_undo; + + if (th->syn) { + /* We see SYN without ACK. It is attempt of + * simultaneous connect with crossed SYNs. + * Particularly, it can be connect to self. + */ + tcp_set_state(sk, TCP_SYN_RECV); + + if (tp->rx_opt.saw_tstamp) { + tp->rx_opt.tstamp_ok = 1; + tcp_store_ts_recent(tp); + tp->tcp_header_len = + sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + tp->tcp_header_len = sizeof(struct tcphdr); + } + + tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1; + tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1; + + /* RFC1323: The window in SYN & SYN/ACK segments is + * never scaled. 
+ */ + tp->snd_wnd = ntohs(th->window); + tp->snd_wl1 = TCP_SKB_CB(skb)->seq; + tp->max_window = tp->snd_wnd; + + TCP_ECN_rcv_syn(tp, th); + if (tp->ecn_flags&TCP_ECN_OK) + sock_set_flag(sk, SOCK_NO_LARGESEND); + + tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_initialize_rcv_mss(sk); + + + tcp_send_synack(sk); +#if 0 + /* Note, we could accept data and URG from this segment. + * There are no obstacles to make this. + * + * However, if we ignore data in ACKless segments sometimes, + * we have no reasons to accept it sometimes. + * Also, seems the code doing it in step6 of tcp_rcv_state_process + * is not flawless. So, discard packet for sanity. + * Uncomment this return to process the data. + */ + return -1; +#else + goto discard; +#endif + } + /* "fifth, if neither of the SYN or RST bits is set then + * drop the segment and return." + */ + +discard_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + goto discard; + +reset_and_undo: + tcp_clear_options(&tp->rx_opt); + tp->rx_opt.mss_clamp = saved_clamp; + return 1; +} + + +/* + * This function implements the receiving procedure of RFC 793 for + * all states except ESTABLISHED and TIME_WAIT. + * It's called from both tcp_v4_rcv and tcp_v6_rcv and should be + * address independent. + */ + +int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_sock *tp = tcp_sk(sk); + int queued = 0; + + tp->rx_opt.saw_tstamp = 0; + + switch (sk->sk_state) { + case TCP_CLOSE: + goto discard; + + case TCP_LISTEN: + if(th->ack) + return 1; + + if(th->rst) + goto discard; + + if(th->syn) { + if(tp->af_specific->conn_request(sk, skb) < 0) + return 1; + + init_westwood(sk); + init_bictcp(tp); + + /* Now we have several options: In theory there is + * nothing else in the frame. KA9Q has an option to + * send data with the syn, BSD accepts data with the + * syn up to the [to be] advertised window and + * Solaris 2.1 gives you a protocol error. For now + * we just ignore it, that fits the spec precisely + * and avoids incompatibilities. It would be nice in + * future to drop through and process the data. + * + * Now that TTCP is starting to be used we ought to + * queue this data. + * But, this leaves one open to an easy denial of + * service attack, and SYN cookies can't defend + * against this problem. So, we drop the data + * in the interest of security over speed. + */ + goto discard; + } + goto discard; + + case TCP_SYN_SENT: + init_westwood(sk); + init_bictcp(tp); + + queued = tcp_rcv_synsent_state_process(sk, skb, th, len); + if (queued >= 0) + return queued; + + /* Do step6 onward by hand. */ + tcp_urg(sk, skb, th); + __kfree_skb(skb); + tcp_data_snd_check(sk); + return 0; + } + + if (tcp_fast_parse_options(skb, th, tp) && tp->rx_opt.saw_tstamp && + tcp_paws_discard(tp, skb)) { + if (!th->rst) { + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + tcp_send_dupack(sk, skb); + goto discard; + } + /* Reset is accepted even if it did not pass PAWS. */ + } + + /* step 1: check sequence number */ + if (!tcp_sequence(tp, TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)) { + if (!th->rst) + tcp_send_dupack(sk, skb); + goto discard; + } + + /* step 2: check RST bit */ + if(th->rst) { + tcp_reset(sk); + goto discard; + } + + tcp_replace_ts_recent(tp, TCP_SKB_CB(skb)->seq); + + /* step 3: check security and precedence [ignored] */ + + /* step 4: + * + * Check for a SYN in window. 
+ */ + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONSYN); + tcp_reset(sk); + return 1; + } + + /* step 5: check the ACK field */ + if (th->ack) { + int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); + + switch(sk->sk_state) { + case TCP_SYN_RECV: + if (acceptable) { + tp->copied_seq = tp->rcv_nxt; + mb(); + tcp_set_state(sk, TCP_ESTABLISHED); + sk->sk_state_change(sk); + + /* Note, that this wakeup is only for marginal + * crossed SYN case. Passively open sockets + * are not waked up, because sk->sk_sleep == + * NULL and sk->sk_socket == NULL. + */ + if (sk->sk_socket) { + sk_wake_async(sk,0,POLL_OUT); + } + + tp->snd_una = TCP_SKB_CB(skb)->ack_seq; + tp->snd_wnd = ntohs(th->window) << + tp->rx_opt.snd_wscale; + tcp_init_wl(tp, TCP_SKB_CB(skb)->ack_seq, + TCP_SKB_CB(skb)->seq); + + /* tcp_ack considers this ACK as duplicate + * and does not calculate rtt. + * Fix it at least with timestamps. + */ + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr && + !tp->srtt) + tcp_ack_saw_tstamp(tp, 0); + + if (tp->rx_opt.tstamp_ok) + tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; + + /* Make sure socket is routed, for + * correct metrics. + */ + tp->af_specific->rebuild_header(sk); + + tcp_init_metrics(sk); + + /* Prevent spurious tcp_cwnd_restart() on + * first data packet. + */ + tp->lsndtime = tcp_time_stamp; + + tcp_initialize_rcv_mss(sk); + tcp_init_buffer_space(sk); + tcp_fast_path_on(tp); + } else { + return 1; + } + break; + + case TCP_FIN_WAIT1: + if (tp->snd_una == tp->write_seq) { + tcp_set_state(sk, TCP_FIN_WAIT2); + sk->sk_shutdown |= SEND_SHUTDOWN; + dst_confirm(sk->sk_dst_cache); + + if (!sock_flag(sk, SOCK_DEAD)) + /* Wake up lingering close() */ + sk->sk_state_change(sk); + else { + int tmo; + + if (tp->linger2 < 0 || + (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + tcp_done(sk); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + return 1; + } + + tmo = tcp_fin_time(tp); + if (tmo > TCP_TIMEWAIT_LEN) { + tcp_reset_keepalive_timer(sk, tmo - TCP_TIMEWAIT_LEN); + } else if (th->fin || sock_owned_by_user(sk)) { + /* Bad case. We could lose such FIN otherwise. + * It is not a big problem, but it looks confusing + * and not so rare event. We still can lose it now, + * if it spins in bh_lock_sock(), but it is really + * marginal case. + */ + tcp_reset_keepalive_timer(sk, tmo); + } else { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto discard; + } + } + } + break; + + case TCP_CLOSING: + if (tp->snd_una == tp->write_seq) { + tcp_time_wait(sk, TCP_TIME_WAIT, 0); + goto discard; + } + break; + + case TCP_LAST_ACK: + if (tp->snd_una == tp->write_seq) { + tcp_update_metrics(sk); + tcp_done(sk); + goto discard; + } + break; + } + } else + goto discard; + + /* step 6: check the URG bit */ + tcp_urg(sk, skb, th); + + /* step 7: process the segment text */ + switch (sk->sk_state) { + case TCP_CLOSE_WAIT: + case TCP_CLOSING: + case TCP_LAST_ACK: + if (!before(TCP_SKB_CB(skb)->seq, tp->rcv_nxt)) + break; + case TCP_FIN_WAIT1: + case TCP_FIN_WAIT2: + /* RFC 793 says to queue data in these states, + * RFC 1122 says we MUST send a reset. + * BSD 4.4 also does reset. 
+ */ + if (sk->sk_shutdown & RCV_SHUTDOWN) { + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONDATA); + tcp_reset(sk); + return 1; + } + } + /* Fall through */ + case TCP_ESTABLISHED: + tcp_data_queue(sk, skb); + queued = 1; + break; + } + + /* tcp_data could move socket to TIME-WAIT */ + if (sk->sk_state != TCP_CLOSE) { + tcp_data_snd_check(sk); + tcp_ack_snd_check(sk); + } + + if (!queued) { +discard: + __kfree_skb(skb); + } + return 0; +} + +EXPORT_SYMBOL(sysctl_tcp_ecn); +EXPORT_SYMBOL(sysctl_tcp_reordering); +EXPORT_SYMBOL(tcp_parse_options); +EXPORT_SYMBOL(tcp_rcv_established); +EXPORT_SYMBOL(tcp_rcv_state_process); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c new file mode 100644 index 000000000000..3ac6659869c4 --- /dev/null +++ b/net/ipv4/tcp_ipv4.c @@ -0,0 +1,2663 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $ + * + * IPv4 specific functions + * + * + * code split from: + * linux/ipv4/tcp.c + * linux/ipv4/tcp_input.c + * linux/ipv4/tcp_output.c + * + * See tcp.c for author information + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * David S. Miller : New socket lookup architecture. + * This code is dedicated to John Dyson. + * David S. Miller : Change semantics of established hash, + * half is devoted to TIME_WAIT sockets + * and the rest go in the other half. + * Andi Kleen : Add support for syncookies and fixed + * some bugs: ip options weren't passed to + * the TCP layer, missed a check for an + * ACK bit. + * Andi Kleen : Implemented fast path mtu discovery. + * Fixed many serious bugs in the + * open_request handling and moved + * most of it into the af independent code. + * Added tail drop and some other bugfixes. + * Added new listen sematics. + * Mike McLagan : Routing by source + * Juan Jose Ciarlante: ip_dynaddr bits + * Andi Kleen: various fixes. + * Vitaly E. Lavrov : Transparent proxy revived after year + * coma. + * Andi Kleen : Fix new listen. + * Andi Kleen : Fix accept error reporting. + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +extern int sysctl_ip_dynaddr; +int sysctl_tcp_tw_reuse; +int sysctl_tcp_low_latency; + +/* Check TCP sequence numbers in ICMP packets. 
*/ +#define ICMP_MIN_LENGTH 8 + +/* Socket used for sending RSTs */ +static struct socket *tcp_socket; + +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +struct tcp_hashinfo __cacheline_aligned tcp_hashinfo = { + .__tcp_lhash_lock = RW_LOCK_UNLOCKED, + .__tcp_lhash_users = ATOMIC_INIT(0), + .__tcp_lhash_wait + = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.__tcp_lhash_wait), + .__tcp_portalloc_lock = SPIN_LOCK_UNLOCKED +}; + +/* + * This array holds the first and last local port number. + * For high-usage systems, use sysctl to change this to + * 32768-61000 + */ +int sysctl_local_port_range[2] = { 1024, 4999 }; +int tcp_port_rover = 1024 - 1; + +static __inline__ int tcp_hashfn(__u32 laddr, __u16 lport, + __u32 faddr, __u16 fport) +{ + int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h & (tcp_ehash_size - 1); +} + +static __inline__ int tcp_sk_hashfn(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + __u32 laddr = inet->rcv_saddr; + __u16 lport = inet->num; + __u32 faddr = inet->daddr; + __u16 fport = inet->dport; + + return tcp_hashfn(laddr, lport, faddr, fport); +} + +/* Allocate and initialize a new TCP local port bind bucket. + * The bindhash mutex for snum's hash chain must be held here. + */ +struct tcp_bind_bucket *tcp_bucket_create(struct tcp_bind_hashbucket *head, + unsigned short snum) +{ + struct tcp_bind_bucket *tb = kmem_cache_alloc(tcp_bucket_cachep, + SLAB_ATOMIC); + if (tb) { + tb->port = snum; + tb->fastreuse = 0; + INIT_HLIST_HEAD(&tb->owners); + hlist_add_head(&tb->node, &head->chain); + } + return tb; +} + +/* Caller must hold hashbucket lock for this tb with local BH disabled */ +void tcp_bucket_destroy(struct tcp_bind_bucket *tb) +{ + if (hlist_empty(&tb->owners)) { + __hlist_del(&tb->node); + kmem_cache_free(tcp_bucket_cachep, tb); + } +} + +/* Caller must disable local BH processing. */ +static __inline__ void __tcp_inherit_port(struct sock *sk, struct sock *child) +{ + struct tcp_bind_hashbucket *head = + &tcp_bhash[tcp_bhashfn(inet_sk(child)->num)]; + struct tcp_bind_bucket *tb; + + spin_lock(&head->lock); + tb = tcp_sk(sk)->bind_hash; + sk_add_bind_node(child, &tb->owners); + tcp_sk(child)->bind_hash = tb; + spin_unlock(&head->lock); +} + +inline void tcp_inherit_port(struct sock *sk, struct sock *child) +{ + local_bh_disable(); + __tcp_inherit_port(sk, child); + local_bh_enable(); +} + +void tcp_bind_hash(struct sock *sk, struct tcp_bind_bucket *tb, + unsigned short snum) +{ + inet_sk(sk)->num = snum; + sk_add_bind_node(sk, &tb->owners); + tcp_sk(sk)->bind_hash = tb; +} + +static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) +{ + const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); + struct sock *sk2; + struct hlist_node *node; + int reuse = sk->sk_reuse; + + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + !tcp_v6_ipv6only(sk2) && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { + if (!reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) { + const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) + break; + } + } + } + return node != NULL; +} + +/* Obtain a reference to a local port for the given sock, + * if snum is zero it means select any available local port. 
+ */ +static int tcp_v4_get_port(struct sock *sk, unsigned short snum) +{ + struct tcp_bind_hashbucket *head; + struct hlist_node *node; + struct tcp_bind_bucket *tb; + int ret; + + local_bh_disable(); + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + do { + rover++; + if (rover < low || rover > high) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD is + * non-NULL and we hold it's mutex. + */ + snum = rover; + } else { + head = &tcp_bhash[tcp_bhashfn(snum)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (!hlist_empty(&tb->owners)) { + if (sk->sk_reuse > 1) + goto success; + if (tb->fastreuse > 0 && + sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (tcp_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; +success: + if (!tcp_sk(sk)->bind_hash) + tcp_bind_hash(sk, tb, snum); + BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +/* Get rid of any references to a local port held by the + * given sock. + */ +static void __tcp_put_port(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_bind_hashbucket *head = &tcp_bhash[tcp_bhashfn(inet->num)]; + struct tcp_bind_bucket *tb; + + spin_lock(&head->lock); + tb = tcp_sk(sk)->bind_hash; + __sk_del_bind_node(sk); + tcp_sk(sk)->bind_hash = NULL; + inet->num = 0; + tcp_bucket_destroy(tb); + spin_unlock(&head->lock); +} + +void tcp_put_port(struct sock *sk) +{ + local_bh_disable(); + __tcp_put_port(sk); + local_bh_enable(); +} + +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it can be very bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines (wake up each + * exclusive lock release). It should be ifdefed really. 
+ */ + +void tcp_listen_wlock(void) +{ + write_lock(&tcp_lhash_lock); + + if (atomic_read(&tcp_lhash_users)) { + DEFINE_WAIT(wait); + + for (;;) { + prepare_to_wait_exclusive(&tcp_lhash_wait, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&tcp_lhash_users)) + break; + write_unlock_bh(&tcp_lhash_lock); + schedule(); + write_lock_bh(&tcp_lhash_lock); + } + + finish_wait(&tcp_lhash_wait, &wait); + } +} + +static __inline__ void __tcp_v4_hash(struct sock *sk, const int listen_possible) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + if (listen_possible && sk->sk_state == TCP_LISTEN) { + list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + lock = &tcp_lhash_lock; + tcp_listen_wlock(); + } else { + list = &tcp_ehash[(sk->sk_hashent = tcp_sk_hashfn(sk))].chain; + lock = &tcp_ehash[sk->sk_hashent].lock; + write_lock(lock); + } + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); + if (listen_possible && sk->sk_state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); +} + +static void tcp_v4_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + local_bh_disable(); + __tcp_v4_hash(sk, 1); + local_bh_enable(); + } +} + +void tcp_unhash(struct sock *sk) +{ + rwlock_t *lock; + + if (sk_unhashed(sk)) + goto ende; + + if (sk->sk_state == TCP_LISTEN) { + local_bh_disable(); + tcp_listen_wlock(); + lock = &tcp_lhash_lock; + } else { + struct tcp_ehash_bucket *head = &tcp_ehash[sk->sk_hashent]; + lock = &head->lock; + write_lock_bh(&head->lock); + } + + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(lock); + + ende: + if (sk->sk_state == TCP_LISTEN) + wake_up(&tcp_lhash_wait); +} + +/* Don't inline this cruft. Here are some nice properties to + * exploit here. The BSD API does not allow a listening TCP + * to specify the remote port nor the remote address for the + * connection. So always assume those are both wildcarded + * during the search since they can never be otherwise. + */ +static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, + unsigned short hnum, int dif) +{ + struct sock *result = NULL, *sk; + struct hlist_node *node; + int score, hiscore; + + hiscore=-1; + sk_for_each(sk, node, head) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + __u32 rcv_saddr = inet->rcv_saddr; + + score = (sk->sk_family == PF_INET ? 1 : 0); + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if (score == 5) + return sk; + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + return result; +} + +/* Optimize the common listener case. 
*/ +static inline struct sock *tcp_v4_lookup_listener(u32 daddr, + unsigned short hnum, int dif) +{ + struct sock *sk = NULL; + struct hlist_head *head; + + read_lock(&tcp_lhash_lock); + head = &tcp_listening_hash[tcp_lhashfn(hnum)]; + if (!hlist_empty(head)) { + struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + + if (inet->num == hnum && !sk->sk_node.next && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && + (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && + !sk->sk_bound_dev_if) + goto sherry_cache; + sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); + } + if (sk) { +sherry_cache: + sock_hold(sk); + } + read_unlock(&tcp_lhash_lock); + return sk; +} + +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * Local BH must be disabled here. + */ + +static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, + u32 daddr, u16 hnum, + int dif) +{ + struct tcp_ehash_bucket *head; + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + struct sock *sk; + struct hlist_node *node; + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + int hash = tcp_hashfn(daddr, hnum, saddr, sport); + head = &tcp_ehash[hash]; + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! */ + } + + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) + goto hit; + } + sk = NULL; +out: + read_unlock(&head->lock); + return sk; +hit: + sock_hold(sk); + goto out; +} + +static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 hnum, int dif) +{ + struct sock *sk = __tcp_v4_lookup_established(saddr, sport, + daddr, hnum, dif); + + return sk ? : tcp_v4_lookup_listener(daddr, hnum, dif); +} + +inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, + u16 dport, int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(tcp_v4_lookup); + +static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + return secure_tcp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + skb->h.th->dest, + skb->h.th->source); +} + +/* called with local bh disabled */ +static int __tcp_v4_check_established(struct sock *sk, __u16 lport, + struct tcp_tw_bucket **twp) +{ + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) + __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); + int hash = tcp_hashfn(daddr, lport, saddr, inet->dport); + struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct sock *sk2; + struct hlist_node *node; + struct tcp_tw_bucket *tw; + + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + tw = (struct tcp_tw_bucket *)sk2; + + if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { + struct tcp_sock *tp = tcp_sk(sk); + + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it + is safe provided sequence spaces do not + overlap i.e. 
at data rates <= 80Mbit/sec. + + Actually, the idea is close to VJ's one, + only timestamp cache is held not per host, + but per port pair and TW bucket is used + as state holder. + + If TW bucket has been already destroyed we + fall back to VJ's scheme and use initial + timestamp retrieved from peer table. + */ + if (tw->tw_ts_recent_stamp && + (!twp || (sysctl_tcp_tw_reuse && + xtime.tv_sec - + tw->tw_ts_recent_stamp > 1))) { + if ((tp->write_seq = + tw->tw_snd_nxt + 65535 + 2) == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + sock_hold(sk2); + goto unique; + } else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hashent = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + tcp_tw_deschedule(tw); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + tcp_tw_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 connect_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + + return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +static inline int tcp_v4_hash_connect(struct sock *sk) +{ + unsigned short snum = inet_sk(sk)->num; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + connect_port_offset(sk); + struct hlist_node *node; + struct tcp_tw_bucket *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &tcp_bhash[tcp_bhashfn(port)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + tb_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__tcp_v4_check_established(sk, + port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = tcp_bucket_create(head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + tcp_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __tcp_v4_hash(sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + } + + ret = 0; + goto out; + } + + head = &tcp_bhash[tcp_bhashfn(snum)]; + tb = tcp_sk(sk)->bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __tcp_v4_hash(sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ + ret = __tcp_v4_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +/* This will initiate an outgoing connection. */ +int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct rtable *rt; + u32 daddr, nexthop; + int tmp; + int err; + + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + if (usin->sin_family != AF_INET) + return -EAFNOSUPPORT; + + nexthop = daddr = usin->sin_addr.s_addr; + if (inet->opt && inet->opt->srr) { + if (!daddr) + return -EINVAL; + nexthop = inet->opt->faddr; + } + + tmp = ip_route_connect(&rt, nexthop, inet->saddr, + RT_CONN_FLAGS(sk), sk->sk_bound_dev_if, + IPPROTO_TCP, + inet->sport, usin->sin_port, sk); + if (tmp < 0) + return tmp; + + if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) { + ip_rt_put(rt); + return -ENETUNREACH; + } + + if (!inet->opt || !inet->opt->srr) + daddr = rt->rt_dst; + + if (!inet->saddr) + inet->saddr = rt->rt_src; + inet->rcv_saddr = inet->saddr; + + if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) { + /* Reset inherited state */ + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + if (sysctl_tcp_tw_recycle && + !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) { + struct inet_peer *peer = rt_get_peer(rt); + + /* VJ's idea. We save last timestamp seen from + * the destination in peer table, when entering state TIME-WAIT + * and initialize rx_opt.ts_recent from it, when trying new connection. + */ + + if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) { + tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp; + tp->rx_opt.ts_recent = peer->tcp_ts; + } + } + + inet->dport = usin->sin_port; + inet->daddr = daddr; + + tp->ext_header_len = 0; + if (inet->opt) + tp->ext_header_len = inet->opt->optlen; + + tp->rx_opt.mss_clamp = 536; + + /* Socket identity is still unknown (sport may be zero). + * However we set state to SYN-SENT and not releasing socket + * lock select source port, enter ourselves into the hash tables and + * complete initialization after this. + */ + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v4_hash_connect(sk); + if (err) + goto failure; + + err = ip_route_newports(&rt, inet->sport, inet->dport, sk); + if (err) + goto failure; + + /* OK, now commit destination to socket. */ + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + + if (!tp->write_seq) + tp->write_seq = secure_tcp_sequence_number(inet->saddr, + inet->daddr, + inet->sport, + usin->sin_port); + + inet->id = tp->write_seq ^ jiffies; + + err = tcp_connect(sk); + rt = NULL; + if (err) + goto failure; + + return 0; + +failure: + /* This unhashes the socket and releases the local port, if necessary. 
*/ + tcp_set_state(sk, TCP_CLOSE); + ip_rt_put(rt); + sk->sk_route_caps = 0; + inet->dport = 0; + return err; +} + +static __inline__ int tcp_v4_iif(struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd) +{ + return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1)); +} + +static struct open_request *tcp_v4_search_req(struct tcp_sock *tp, + struct open_request ***prevp, + __u16 rport, + __u32 raddr, __u32 laddr) +{ + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + + for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->af.v4_req.rmt_addr == raddr && + req->af.v4_req.loc_addr == laddr && + TCP_INET_FAMILY(req->class->family)) { + BUG_TRAP(!req->sk); + *prevp = prev; + break; + } + } + + return req; +} + +static void tcp_v4_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + u32 h = tcp_v4_synq_hash(req->af.v4_req.rmt_addr, req->rmt_port, lopt->hash_rnd); + + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->sk = NULL; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); +} + + +/* + * This routine does path mtu discovery as defined in RFC1191. + */ +static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, + u32 mtu) +{ + struct dst_entry *dst; + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + + /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs + * sent out by Linux are always <576 bytes, so they should go through + * unfragmented). + */ + if (sk->sk_state == TCP_LISTEN) + return; + + /* We don't check in the dst entry whether pmtu discovery is forbidden + * on this route. We just assume that no packet-too-big packets + * are sent back when pmtu discovery is not active. + * There is a small race when the user changes this flag in the + * route, but I think that's acceptable. + */ + if ((dst = __sk_dst_check(sk, 0)) == NULL) + return; + + dst->ops->update_pmtu(dst, mtu); + + /* Something is about to go wrong... Remember the soft error + * in case this connection is not able to recover. + */ + if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst)) + sk->sk_err_soft = EMSGSIZE; + + mtu = dst_mtu(dst); + + if (inet->pmtudisc != IP_PMTUDISC_DONT && + tp->pmtu_cookie > mtu) { + tcp_sync_mss(sk, mtu); + + /* Resend the TCP packet because it's + * clear that the old packet has been + * dropped. This is the new "fast" path mtu + * discovery. + */ + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the tcp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better.
+ * + */ + +void tcp_v4_err(struct sk_buff *skb, u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2)); + struct tcp_sock *tp; + struct inet_sock *inet; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + __u32 seq; + int err; + + if (skb->len < (iph->ihl << 2) + 8) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, + th->source, tcp_v4_iif(skb)); + if (!sk) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + if (sk->sk_state == TCP_TIME_WAIT) { + tcp_tw_put((struct tcp_tw_bucket *)sk); + return; + } + + bh_lock_sock(sk); + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. + */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + switch (type) { + case ICMP_SOURCE_QUENCH: + /* Just silently ignore these. */ + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out; + + if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ + if (!sock_owned_by_user(sk)) + do_pmtu_discovery(sk, iph, info); + goto out; + } + + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + goto out; + } + + switch (sk->sk_state) { + struct open_request *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = tcp_v4_search_req(tp, &prev, th->dest, + iph->daddr, iph->saddr); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + an established socket here. + */ + BUG_TRAP(!req->sk); + + if (seq != req->snt_isn) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + /* + * Still in SYN_RECV, just remove it silently. + * There is no good way to pass the error to the newly + * created socket, and POSIX does not want network + * errors returned from accept(). + */ + tcp_synq_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can f.e. if SYNs crossed. + */ + if (!sock_owned_by_user(sk)) { + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + + sk->sk_error_report(sk); + + tcp_done(sk); + } else { + sk->sk_err_soft = err; + } + goto out; + } + + /* If we've already connected we will keep trying + * until we time out, or the user gives up. + * + * rfc1122 4.2.3.9 allows to consider as hard errors + * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too, + * but it is obsoleted by pmtu discovery). + * + * Note, that in modern internet, where routing is unreliable + * and in each dark corner broken firewalls sit, sending random + * errors ordered by their masters even this two messages finally lose + * their original sense (even Linux sends invalid PORT_UNREACHs) + * + * Now we are in compliance with RFCs. + * --ANK (980905) + */ + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* This routine computes an IPv4 TCP checksum. 
*/ +void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + + if (skb->ip_summed == CHECKSUM_HW) { + th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); + skb->csum = offsetof(struct tcphdr, check); + } else { + th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr, + csum_partial((char *)th, + th->doff << 2, + skb->csum)); + } +} + +/* + * This routine will send an RST to the other tcp. + * + * Someone asks: why I NEVER use socket parameters (TOS, TTL etc.) + * for reset. + * Answer: if a packet caused RST, it is not for a socket + * existing in our system, if it is matched to a socket, + * it is just duplicate segment or bug in other side's TCP. + * So that we build reply only basing on parameters + * arrived with segment. + * Exception: precedence violation. We do not implement it in any case. + */ + +static void tcp_v4_send_reset(struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct tcphdr rth; + struct ip_reply_arg arg; + + /* Never send a reset in response to a reset. */ + if (th->rst) + return; + + if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL) + return; + + /* Swap the send and the receive. */ + memset(&rth, 0, sizeof(struct tcphdr)); + rth.dest = th->source; + rth.source = th->dest; + rth.doff = sizeof(struct tcphdr) / 4; + rth.rst = 1; + + if (th->ack) { + rth.seq = th->ack_seq; + } else { + rth.ack = 1; + rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff << 2)); + } + + memset(&arg, 0, sizeof arg); + arg.iov[0].iov_base = (unsigned char *)&rth; + arg.iov[0].iov_len = sizeof rth; + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + sizeof(struct tcphdr), IPPROTO_TCP, 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth); + + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); +} + +/* The code following below sending ACKs in SYN-RECV and TIME-WAIT states + outside socket context is ugly, certainly. What can I do? + */ + +static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, + u32 win, u32 ts) +{ + struct tcphdr *th = skb->h.th; + struct { + struct tcphdr th; + u32 tsopt[3]; + } rep; + struct ip_reply_arg arg; + + memset(&rep.th, 0, sizeof(struct tcphdr)); + memset(&arg, 0, sizeof arg); + + arg.iov[0].iov_base = (unsigned char *)&rep; + arg.iov[0].iov_len = sizeof(rep.th); + if (ts) { + rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + rep.tsopt[1] = htonl(tcp_time_stamp); + rep.tsopt[2] = htonl(ts); + arg.iov[0].iov_len = sizeof(rep); + } + + /* Swap the send and the receive. 
*/ + rep.th.dest = th->source; + rep.th.source = th->dest; + rep.th.doff = arg.iov[0].iov_len / 4; + rep.th.seq = htonl(seq); + rep.th.ack_seq = htonl(ack); + rep.th.ack = 1; + rep.th.window = htons(win); + + arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr, + skb->nh.iph->saddr, /*XXX*/ + arg.iov[0].iov_len, IPPROTO_TCP, 0); + arg.csumoffset = offsetof(struct tcphdr, check) / 2; + + ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len); + + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); +} + +static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, + tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + + tcp_tw_put(tw); +} + +static void tcp_v4_or_send_ack(struct sk_buff *skb, struct open_request *req) +{ + tcp_v4_send_ack(skb, req->snt_isn + 1, req->rcv_isn + 1, req->rcv_wnd, + req->ts_recent); +} + +static struct dst_entry* tcp_v4_route_req(struct sock *sk, + struct open_request *req) +{ + struct rtable *rt; + struct ip_options *opt = req->af.v4_req.opt; + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = ((opt && opt->srr) ? + opt->faddr : + req->af.v4_req.rmt_addr), + .saddr = req->af.v4_req.loc_addr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = inet_sk(sk)->sport, + .dport = req->rmt_port } } }; + + if (ip_route_output_flow(&rt, &fl, sk, 0)) { + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { + ip_rt_put(rt); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + return NULL; + } + return &rt->u.dst; +} + +/* + * Send a SYN-ACK after having received an ACK. + * This still operates on a open_request only, not on a big + * socket. + */ +static int tcp_v4_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + int err = -1; + struct sk_buff * skb; + + /* First, grab a route. */ + if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + goto out; + + skb = tcp_make_synack(sk, dst, req); + + if (skb) { + struct tcphdr *th = skb->h.th; + + th->check = tcp_v4_check(th, skb->len, + req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, + csum_partial((char *)th, skb->len, + skb->csum)); + + err = ip_build_and_send_pkt(skb, sk, req->af.v4_req.loc_addr, + req->af.v4_req.rmt_addr, + req->af.v4_req.opt); + if (err == NET_XMIT_CN) + err = 0; + } + +out: + dst_release(dst); + return err; +} + +/* + * IPv4 open_request destructor. + */ +static void tcp_v4_or_free(struct open_request *req) +{ + if (req->af.v4_req.opt) + kfree(req->af.v4_req.opt); +} + +static inline void syn_flood_warning(struct sk_buff *skb) +{ + static unsigned long warntime; + + if (time_after(jiffies, (warntime + HZ * 60))) { + warntime = jiffies; + printk(KERN_INFO + "possible SYN flooding on port %d. Sending cookies.\n", + ntohs(skb->h.th->dest)); + } +} + +/* + * Save and compile IPv4 options into the open_request if needed. + */ +static inline struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) +{ + struct ip_options *opt = &(IPCB(skb)->opt); + struct ip_options *dopt = NULL; + + if (opt && opt->optlen) { + int opt_size = optlength(opt); + dopt = kmalloc(opt_size, GFP_ATOMIC); + if (dopt) { + if (ip_options_echo(dopt, skb)) { + kfree(dopt); + dopt = NULL; + } + } + } + return dopt; +} + +/* + * Maximum number of SYN_RECV sockets in queue per LISTEN socket. 
+ * One SYN_RECV socket costs about 80bytes on a 32bit machine. + * It would be better to replace it with a global counter for all sockets + * but then some measure against one socket starving all other sockets + * would be needed. + * + * It was 128 by default. Experiments with real servers show, that + * it is absolutely not enough even at 100conn/sec. 256 cures most + * of problems. This value is adjusted to 128 for very small machines + * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb). + * Further increasing requires to change hash table size. + */ +int sysctl_max_syn_backlog = 256; + +struct or_calltable or_ipv4 = { + .family = PF_INET, + .rtx_syn_ack = tcp_v4_send_synack, + .send_ack = tcp_v4_or_send_ack, + .destructor = tcp_v4_or_free, + .send_reset = tcp_v4_send_reset, +}; + +int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_options_received tmp_opt; + struct open_request *req; + __u32 saddr = skb->nh.iph->saddr; + __u32 daddr = skb->nh.iph->daddr; + __u32 isn = TCP_SKB_CB(skb)->when; + struct dst_entry *dst = NULL; +#ifdef CONFIG_SYN_COOKIES + int want_cookie = 0; +#else +#define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ +#endif + + /* Never answer to SYNs send to broadcast or multicast */ + if (((struct rtable *)skb->dst)->rt_flags & + (RTCF_BROADCAST | RTCF_MULTICAST)) + goto drop; + + /* TW buckets are converted to open requests without + * limitations, they conserve resources and peer is + * evidently real one. + */ + if (tcp_synq_is_full(sk) && !isn) { +#ifdef CONFIG_SYN_COOKIES + if (sysctl_tcp_syncookies) { + want_cookie = 1; + } else +#endif + goto drop; + } + + /* Accept backlog is full. If we have already queued enough + * of warm entries in syn queue, drop request. It is better than + * clogging syn queue with openreqs with exponentially increasing + * timeout. + */ + if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; + + req = tcp_openreq_alloc(); + if (!req) + goto drop; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = 536; + tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss; + + tcp_parse_options(skb, &tmp_opt, 0); + + if (want_cookie) { + tcp_clear_options(&tmp_opt); + tmp_opt.saw_tstamp = 0; + } + + if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) { + /* Some OSes (unknown ones, but I see them on web server, which + * contains information interesting only for windows' + * users) do not send their stamp in SYN. It is easy case. + * We simply do not advertise TS support. + */ + tmp_opt.saw_tstamp = 0; + tmp_opt.tstamp_ok = 0; + } + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + + tcp_openreq_init(req, &tmp_opt, skb); + + req->af.v4_req.loc_addr = daddr; + req->af.v4_req.rmt_addr = saddr; + req->af.v4_req.opt = tcp_v4_save_options(sk, skb); + req->class = &or_ipv4; + if (!want_cookie) + TCP_ECN_create_request(req, skb->h.th); + + if (want_cookie) { +#ifdef CONFIG_SYN_COOKIES + syn_flood_warning(skb); +#endif + isn = cookie_v4_init_sequence(sk, skb, &req->mss); + } else if (!isn) { + struct inet_peer *peer = NULL; + + /* VJ's idea. We save last timestamp seen + * from the destination in peer table, when entering + * state TIME-WAIT, and check against it before + * accepting new connection request. + * + * If "isn" is not zero, this request hit alive + * timewait bucket, so that all the necessary checks + * are made in the function processing timewait state. 
+ */ + if (tmp_opt.saw_tstamp && + sysctl_tcp_tw_recycle && + (dst = tcp_v4_route_req(sk, req)) != NULL && + (peer = rt_get_peer((struct rtable *)dst)) != NULL && + peer->v4daddr == saddr) { + if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL && + (s32)(peer->tcp_ts - req->ts_recent) > + TCP_PAWS_WINDOW) { + NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED); + dst_release(dst); + goto drop_and_free; + } + } + /* Kill the following clause, if you dislike this way. */ + else if (!sysctl_tcp_syncookies && + (sysctl_max_syn_backlog - tcp_synq_len(sk) < + (sysctl_max_syn_backlog >> 2)) && + (!peer || !peer->tcp_ts_stamp) && + (!dst || !dst_metric(dst, RTAX_RTT))) { + /* Without syncookies last quarter of + * backlog is filled with destinations, + * proven to be alive. + * It means that we continue to communicate + * to destinations, already remembered + * to the moment of synflood. + */ + NETDEBUG(if (net_ratelimit()) \ + printk(KERN_DEBUG "TCP: drop open " + "request from %u.%u." + "%u.%u/%u\n", \ + NIPQUAD(saddr), + ntohs(skb->h.th->source))); + dst_release(dst); + goto drop_and_free; + } + + isn = tcp_v4_init_sequence(sk, skb); + } + req->snt_isn = isn; + + if (tcp_v4_send_synack(sk, req, dst)) + goto drop_and_free; + + if (want_cookie) { + tcp_openreq_free(req); + } else { + tcp_v4_synq_add(sk, req); + } + return 0; + +drop_and_free: + tcp_openreq_free(req); +drop: + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + return 0; +} + + +/* + * The three way handshake has completed - we got a valid synack - + * now create the new socket. + */ +struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + + if (sk_acceptq_is_full(sk)) + goto exit_overflow; + + if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL) + goto exit; + + newsk = tcp_create_openreq_child(sk, req, skb); + if (!newsk) + goto exit; + + newsk->sk_dst_cache = dst; + tcp_v4_setup_caps(newsk, dst); + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newinet->daddr = req->af.v4_req.rmt_addr; + newinet->rcv_saddr = req->af.v4_req.loc_addr; + newinet->saddr = req->af.v4_req.loc_addr; + newinet->opt = req->af.v4_req.opt; + req->af.v4_req.opt = NULL; + newinet->mc_index = tcp_v4_iif(skb); + newinet->mc_ttl = skb->nh.iph->ttl; + newtp->ext_header_len = 0; + if (newinet->opt) + newtp->ext_header_len = newinet->opt->optlen; + newinet->id = newtp->write_seq ^ jiffies; + + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + tcp_initialize_rcv_mss(newsk); + + __tcp_v4_hash(newsk, 0); + __tcp_inherit_port(sk, newsk); + + return newsk; + +exit_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +exit: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + dst_release(dst); + return NULL; +} + +static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th; + struct iphdr *iph = skb->nh.iph; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *nsk; + struct open_request **prev; + /* Find possible connection requests. 
*/ + struct open_request *req = tcp_v4_search_req(tp, &prev, th->source, + iph->saddr, iph->daddr); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, + th->source, + skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket *)nsk); + return NULL; + } + +#ifdef CONFIG_SYN_COOKIES + if (!th->rst && !th->syn && th->ack) + sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +} + +static int tcp_v4_checksum_init(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, + skb->nh.iph->daddr, skb->csum)) + return 0; + + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "hw tcp v4 csum failed\n")); + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->len <= 76) { + if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr, + skb->nh.iph->daddr, + skb_checksum(skb, 0, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v4_check(skb->h.th, skb->len, + skb->nh.iph->saddr, + skb->nh.iph->daddr, 0); + } + return 0; +} + + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + return 0; + } + + if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v4_hnd_req(sk, skb); + if (!nsk) + goto discard; + + if (nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; + return 0; + } + } + + TCP_CHECK_TIMER(sk); + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + return 0; + +reset: + tcp_v4_send_reset(skb); +discard: + kfree_skb(skb); + /* Be careful here. If this function gets more complicated and + * gcc suffers from register pressure on the x86, sk (in %ebx) + * might be destroyed here. This current version compiles correctly, + * but you have been warned. + */ + return 0; + +csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; +} + +/* + * From tcp_input.c + */ + +int tcp_v4_rcv(struct sk_buff *skb) +{ + struct tcphdr *th; + struct sock *sk; + int ret; + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* Count it even if it's bad */ + TCP_INC_STATS_BH(TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = skb->h.th; + + if (th->doff < sizeof(struct tcphdr) / 4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff * 4)) + goto discard_it; + + /* An explanation is required here, I think. + * Packet length and doff are validated by header prediction, + * provided case of th->doff==0 is elimineted. + * So, we defer the checks. 
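 * [Editor's aside, not part of the original patch: a small worked example of
 *  the sequence bookkeeping done just below, using made-up numbers. For a
 *  segment with seq = 1000, a 32-byte header (doff = 8), skb->len = 132 and
 *  FIN set, the payload is 132 - 32 = 100 bytes and end_seq becomes
 *  1000 + 0 (syn) + 1 (fin) + 132 - 32 = 1101, i.e. the payload bytes plus
 *  the one sequence number consumed by the FIN.]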
*/ + if ((skb->ip_summed != CHECKSUM_UNNECESSARY && + tcp_v4_checksum_init(skb) < 0)) + goto bad_packet; + + th = skb->h.th; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff * 4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = skb->nh.iph->tos; + TCP_SKB_CB(skb)->sacked = 0; + + sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + tcp_v4_iif(skb)); + + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + + return ret; + +no_tcp_socket: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + } else { + tcp_v4_send_reset(skb); + } + +discard_it: + /* Discard frame. */ + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + + if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(TCP_MIB_INERRS); + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, skb->len)) { + case TCP_TW_SYN: { + struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, + ntohs(th->dest), + tcp_v4_iif(skb)); + if (sk2) { + tcp_tw_deschedule((struct tcp_tw_bucket *)sk); + tcp_tw_put((struct tcp_tw_bucket *)sk); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v4_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +/* With per-bucket locks this operation is not-atomic, so that + * this version is not worse. + */ +static void __tcp_v4_rehash(struct sock *sk) +{ + sk->sk_prot->unhash(sk); + sk->sk_prot->hash(sk); +} + +static int tcp_v4_reselect_saddr(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + int err; + struct rtable *rt; + __u32 old_saddr = inet->saddr; + __u32 new_saddr; + __u32 daddr = inet->daddr; + + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + /* Query new route. */ + err = ip_route_connect(&rt, daddr, 0, + RT_CONN_FLAGS(sk), + sk->sk_bound_dev_if, + IPPROTO_TCP, + inet->sport, inet->dport, sk); + if (err) + return err; + + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + + new_saddr = rt->rt_src; + + if (new_saddr == old_saddr) + return 0; + + if (sysctl_ip_dynaddr > 1) { + printk(KERN_INFO "tcp_v4_rebuild_header(): shifting inet->" + "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n", + NIPQUAD(old_saddr), + NIPQUAD(new_saddr)); + } + + inet->saddr = new_saddr; + inet->rcv_saddr = new_saddr; + + /* XXX The only one ugly spot where we need to + * XXX really change the sockets identity after + * XXX it has entered the hashes. -DaveM + * + * Besides that, it does not check for connection + * uniqueness. Wait for troubles. 
+ */ + __tcp_v4_rehash(sk); + return 0; +} + +int tcp_v4_rebuild_header(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_check(sk, 0); + u32 daddr; + int err; + + /* Route is OK, nothing to do. */ + if (rt) + return 0; + + /* Reroute. */ + daddr = inet->daddr; + if (inet->opt && inet->opt->srr) + daddr = inet->opt->faddr; + + { + struct flowi fl = { .oif = sk->sk_bound_dev_if, + .nl_u = { .ip4_u = + { .daddr = daddr, + .saddr = inet->saddr, + .tos = RT_CONN_FLAGS(sk) } }, + .proto = IPPROTO_TCP, + .uli_u = { .ports = + { .sport = inet->sport, + .dport = inet->dport } } }; + + err = ip_route_output_flow(&rt, &fl, sk, 0); + } + if (!err) { + __sk_dst_set(sk, &rt->u.dst); + tcp_v4_setup_caps(sk, &rt->u.dst); + return 0; + } + + /* Routing failed... */ + sk->sk_route_caps = 0; + + if (!sysctl_ip_dynaddr || + sk->sk_state != TCP_SYN_SENT || + (sk->sk_userlocks & SOCK_BINDADDR_LOCK) || + (err = tcp_v4_reselect_saddr(sk)) != 0) + sk->sk_err_soft = -err; + + return err; +} + +static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; + struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +/* VJ's idea. Save last timestamp seen from this destination + * and hold it at least for normal timewait interval to use for duplicate + * segment detection in subsequent connections, before they enter synchronized + * state. + */ + +int tcp_v4_remember_stamp(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct rtable *rt = (struct rtable *)__sk_dst_get(sk); + struct inet_peer *peer = NULL; + int release_it = 0; + + if (!rt || rt->rt_dst != inet->daddr) { + peer = inet_getpeer(inet->daddr, 1); + release_it = 1; + } else { + if (!rt->peer) + rt_bind_peer(rt, 1); + peer = rt->peer; + } + + if (peer) { + if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) { + peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp; + peer->tcp_ts = tp->rx_opt.ts_recent; + } + if (release_it) + inet_putpeer(peer); + return 1; + } + + return 0; +} + +int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw) +{ + struct inet_peer *peer = NULL; + + peer = inet_getpeer(tw->tw_daddr, 1); + + if (peer) { + if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 || + (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec && + peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) { + peer->tcp_ts_stamp = tw->tw_ts_recent_stamp; + peer->tcp_ts = tw->tw_ts_recent; + } + inet_putpeer(peer); + return 1; + } + + return 0; +} + +struct tcp_func ipv4_specific = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = tcp_v4_rebuild_header, + .conn_request = tcp_v4_conn_request, + .syn_recv_sock = tcp_v4_syn_recv_sock, + .remember_stamp = tcp_v4_remember_stamp, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = v4_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. 
+ */ +static int tcp_v4_init_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + tp->rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_ssthresh = 0x7fffffff; /* Infinity */ + tp->snd_cwnd_clamp = ~0; + tp->mss_cache_std = tp->mss_cache = 536; + + tp->reordering = sysctl_tcp_reordering; + + sk->sk_state = TCP_CLOSE; + + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + tp->af_specific = &ipv4_specific; + + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + atomic_inc(&tcp_sockets_allocated); + + return 0; +} + +int tcp_v4_destroy_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_clear_xmit_timers(sk); + + /* Cleanup up the write buffer. */ + sk_stream_writequeue_purge(sk); + + /* Cleans up our, hopefully empty, out_of_order_queue. */ + __skb_queue_purge(&tp->out_of_order_queue); + + /* Clean prequeue, it must be empty really */ + __skb_queue_purge(&tp->ucopy.prequeue); + + /* Clean up a referenced TCP bind bucket. */ + if (tp->bind_hash) + tcp_put_port(sk); + + /* + * If sendmsg cached page exists, toss it. + */ + if (sk->sk_sndmsg_page) { + __free_page(sk->sk_sndmsg_page); + sk->sk_sndmsg_page = NULL; + } + + atomic_dec(&tcp_sockets_allocated); + + return 0; +} + +EXPORT_SYMBOL(tcp_v4_destroy_sock); + +#ifdef CONFIG_PROC_FS +/* Proc filesystem TCP sock list dumping. */ + +static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head) +{ + return hlist_empty(head) ? NULL : + list_entry(head->first, struct tcp_tw_bucket, tw_node); +} + +static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw) +{ + return tw->tw_node.next ? 
+ hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL; +} + +static void *listening_get_next(struct seq_file *seq, void *cur) +{ + struct tcp_sock *tp; + struct hlist_node *node; + struct sock *sk = cur; + struct tcp_iter_state* st = seq->private; + + if (!sk) { + st->bucket = 0; + sk = sk_head(&tcp_listening_hash[0]); + goto get_sk; + } + + ++st->num; + + if (st->state == TCP_SEQ_STATE_OPENREQ) { + struct open_request *req = cur; + + tp = tcp_sk(st->syn_wait_sk); + req = req->dl_next; + while (1) { + while (req) { + if (req->class->family == st->family) { + cur = req; + goto out; + } + req = req->dl_next; + } + if (++st->sbucket >= TCP_SYNQ_HSIZE) + break; +get_req: + req = tp->listen_opt->syn_table[st->sbucket]; + } + sk = sk_next(st->syn_wait_sk); + st->state = TCP_SEQ_STATE_LISTENING; + read_unlock_bh(&tp->syn_wait_lock); + } else { + tp = tcp_sk(sk); + read_lock_bh(&tp->syn_wait_lock); + if (tp->listen_opt && tp->listen_opt->qlen) + goto start_req; + read_unlock_bh(&tp->syn_wait_lock); + sk = sk_next(sk); + } +get_sk: + sk_for_each_from(sk, node) { + if (sk->sk_family == st->family) { + cur = sk; + goto out; + } + tp = tcp_sk(sk); + read_lock_bh(&tp->syn_wait_lock); + if (tp->listen_opt && tp->listen_opt->qlen) { +start_req: + st->uid = sock_i_uid(sk); + st->syn_wait_sk = sk; + st->state = TCP_SEQ_STATE_OPENREQ; + st->sbucket = 0; + goto get_req; + } + read_unlock_bh(&tp->syn_wait_lock); + } + if (++st->bucket < TCP_LHTABLE_SIZE) { + sk = sk_head(&tcp_listening_hash[st->bucket]); + goto get_sk; + } + cur = NULL; +out: + return cur; +} + +static void *listening_get_idx(struct seq_file *seq, loff_t *pos) +{ + void *rc = listening_get_next(seq, NULL); + + while (rc && *pos) { + rc = listening_get_next(seq, rc); + --*pos; + } + return rc; +} + +static void *established_get_first(struct seq_file *seq) +{ + struct tcp_iter_state* st = seq->private; + void *rc = NULL; + + for (st->bucket = 0; st->bucket < tcp_ehash_size; ++st->bucket) { + struct sock *sk; + struct hlist_node *node; + struct tcp_tw_bucket *tw; + + /* We can reschedule _before_ having picked the target: */ + cond_resched_softirq(); + + read_lock(&tcp_ehash[st->bucket].lock); + sk_for_each(sk, node, &tcp_ehash[st->bucket].chain) { + if (sk->sk_family != st->family) { + continue; + } + rc = sk; + goto out; + } + st->state = TCP_SEQ_STATE_TIME_WAIT; + tw_for_each(tw, node, + &tcp_ehash[st->bucket + tcp_ehash_size].chain) { + if (tw->tw_family != st->family) { + continue; + } + rc = tw; + goto out; + } + read_unlock(&tcp_ehash[st->bucket].lock); + st->state = TCP_SEQ_STATE_ESTABLISHED; + } +out: + return rc; +} + +static void *established_get_next(struct seq_file *seq, void *cur) +{ + struct sock *sk = cur; + struct tcp_tw_bucket *tw; + struct hlist_node *node; + struct tcp_iter_state* st = seq->private; + + ++st->num; + + if (st->state == TCP_SEQ_STATE_TIME_WAIT) { + tw = cur; + tw = tw_next(tw); +get_tw: + while (tw && tw->tw_family != st->family) { + tw = tw_next(tw); + } + if (tw) { + cur = tw; + goto out; + } + read_unlock(&tcp_ehash[st->bucket].lock); + st->state = TCP_SEQ_STATE_ESTABLISHED; + + /* We can reschedule between buckets: */ + cond_resched_softirq(); + + if (++st->bucket < tcp_ehash_size) { + read_lock(&tcp_ehash[st->bucket].lock); + sk = sk_head(&tcp_ehash[st->bucket].chain); + } else { + cur = NULL; + goto out; + } + } else + sk = sk_next(sk); + + sk_for_each_from(sk, node) { + if (sk->sk_family == st->family) + goto found; + } + + st->state = TCP_SEQ_STATE_TIME_WAIT; + tw = tw_head(&tcp_ehash[st->bucket 
+ tcp_ehash_size].chain); + goto get_tw; +found: + cur = sk; +out: + return cur; +} + +static void *established_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc = established_get_first(seq); + + while (rc && pos) { + rc = established_get_next(seq, rc); + --pos; + } + return rc; +} + +static void *tcp_get_idx(struct seq_file *seq, loff_t pos) +{ + void *rc; + struct tcp_iter_state* st = seq->private; + + tcp_listen_lock(); + st->state = TCP_SEQ_STATE_LISTENING; + rc = listening_get_idx(seq, &pos); + + if (!rc) { + tcp_listen_unlock(); + local_bh_disable(); + st->state = TCP_SEQ_STATE_ESTABLISHED; + rc = established_get_idx(seq, pos); + } + + return rc; +} + +static void *tcp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct tcp_iter_state* st = seq->private; + st->state = TCP_SEQ_STATE_LISTENING; + st->num = 0; + return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + void *rc = NULL; + struct tcp_iter_state* st; + + if (v == SEQ_START_TOKEN) { + rc = tcp_get_idx(seq, 0); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + case TCP_SEQ_STATE_LISTENING: + rc = listening_get_next(seq, v); + if (!rc) { + tcp_listen_unlock(); + local_bh_disable(); + st->state = TCP_SEQ_STATE_ESTABLISHED; + rc = established_get_first(seq); + } + break; + case TCP_SEQ_STATE_ESTABLISHED: + case TCP_SEQ_STATE_TIME_WAIT: + rc = established_get_next(seq, v); + break; + } +out: + ++*pos; + return rc; +} + +static void tcp_seq_stop(struct seq_file *seq, void *v) +{ + struct tcp_iter_state* st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_OPENREQ: + if (v) { + struct tcp_sock *tp = tcp_sk(st->syn_wait_sk); + read_unlock_bh(&tp->syn_wait_lock); + } + case TCP_SEQ_STATE_LISTENING: + if (v != SEQ_START_TOKEN) + tcp_listen_unlock(); + break; + case TCP_SEQ_STATE_TIME_WAIT: + case TCP_SEQ_STATE_ESTABLISHED: + if (v) + read_unlock(&tcp_ehash[st->bucket].lock); + local_bh_enable(); + break; + } +} + +static int tcp_seq_open(struct inode *inode, struct file *file) +{ + struct tcp_seq_afinfo *afinfo = PDE(inode)->data; + struct seq_file *seq; + struct tcp_iter_state *s; + int rc; + + if (unlikely(afinfo == NULL)) + return -EINVAL; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + memset(s, 0, sizeof(*s)); + s->family = afinfo->family; + s->seq_ops.start = tcp_seq_start; + s->seq_ops.next = tcp_seq_next; + s->seq_ops.show = afinfo->seq_show; + s->seq_ops.stop = tcp_seq_stop; + + rc = seq_open(file, &s->seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +int tcp_proc_register(struct tcp_seq_afinfo *afinfo) +{ + int rc = 0; + struct proc_dir_entry *p; + + if (!afinfo) + return -EINVAL; + afinfo->seq_fops->owner = afinfo->owner; + afinfo->seq_fops->open = tcp_seq_open; + afinfo->seq_fops->read = seq_read; + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + + p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + if (p) + p->data = afinfo; + else + rc = -ENOMEM; + return rc; +} + +void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo) +{ + if (!afinfo) + return; + proc_net_remove(afinfo->name); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); +} + +static void get_openreq4(struct sock *sk, struct open_request *req, + char *tmpbuf, int i, int uid) +{ + int ttd = req->expires - jiffies; + + 
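/* [Editor's aside, not part of the original patch: the sprintf below emits
 *  one /proc/net/tcp row for a pending open_request, matching the header
 *  printed by tcp4_seq_show() ("sl local_address rem_address st tx_queue
 *  rx_queue tr tm->when retrnsmt uid timeout inode"); addresses and ports
 *  are printed in hex, the state column is always TCP_SYN_RECV, and only
 *  the expire timer is ever reported as active for these entries.]
 */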
sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p", + i, + req->af.v4_req.loc_addr, + ntohs(inet_sk(sk)->sport), + req->af.v4_req.rmt_addr, + ntohs(req->rmt_port), + TCP_SYN_RECV, + 0, 0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + atomic_read(&sk->sk_refcnt), + req); +} + +static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i) +{ + int timer_active; + unsigned long timer_expires; + struct tcp_sock *tp = tcp_sk(sp); + struct inet_sock *inet = inet_sk(sp); + unsigned int dest = inet->daddr; + unsigned int src = inet->rcv_saddr; + __u16 destp = ntohs(inet->dport); + __u16 srcp = ntohs(inet->sport); + + if (tp->pending == TCP_TIME_RETRANS) { + timer_active = 1; + timer_expires = tp->timeout; + } else if (tp->pending == TCP_TIME_PROBE0) { + timer_active = 4; + timer_expires = tp->timeout; + } else if (timer_pending(&sp->sk_timer)) { + timer_active = 2; + timer_expires = sp->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX " + "%08X %5d %8d %lu %d %p %u %u %u %u %d", + i, src, srcp, dest, destp, sp->sk_state, + tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq, + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + tp->retransmits, + sock_i_uid(sp), + tp->probes_out, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong, + tp->snd_cwnd, + tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh); +} + +static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i) +{ + unsigned int dest, src; + __u16 destp, srcp; + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = tw->tw_daddr; + src = tw->tw_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p", + i, src, srcp, dest, destp, tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); +} + +#define TMPSZ 150 + +static int tcp4_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state* st; + char tmpbuf[TMPSZ + 1]; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "%-*s\n", TMPSZ - 1, + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp4_sock(v, tmpbuf, st->num); + break; + case TCP_SEQ_STATE_OPENREQ: + get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait4_sock(v, tmpbuf, st->num); + break; + } + seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf); +out: + return 0; +} + +static struct file_operations tcp4_seq_fops; +static struct tcp_seq_afinfo tcp4_seq_afinfo = { + .owner = THIS_MODULE, + .name = "tcp", + .family = AF_INET, + .seq_show = tcp4_seq_show, + .seq_fops = &tcp4_seq_fops, +}; + +int __init tcp4_proc_init(void) +{ + return tcp_proc_register(&tcp4_seq_afinfo); +} + +void tcp4_proc_exit(void) +{ + tcp_proc_unregister(&tcp4_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +struct proto tcp_prot = { + .name = "TCP", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v4_connect, + 
.disconnect = tcp_disconnect, + .accept = tcp_accept, + .ioctl = tcp_ioctl, + .init = tcp_v4_init_sock, + .destroy = tcp_v4_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .sendmsg = tcp_sendmsg, + .recvmsg = tcp_recvmsg, + .backlog_rcv = tcp_v4_do_rcv, + .hash = tcp_v4_hash, + .unhash = tcp_unhash, + .get_port = tcp_v4_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp_sock), +}; + + + +void __init tcp_v4_init(struct net_proto_family *ops) +{ + int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket); + if (err < 0) + panic("Failed to create the TCP control socket.\n"); + tcp_socket->sk->sk_allocation = GFP_ATOMIC; + inet_sk(tcp_socket->sk)->uc_ttl = -1; + + /* Unhash it so that IP input processing does not even + * see it, we do not wish this socket to see incoming + * packets. + */ + tcp_socket->sk->sk_prot->unhash(tcp_socket->sk); +} + +EXPORT_SYMBOL(ipv4_specific); +EXPORT_SYMBOL(tcp_bind_hash); +EXPORT_SYMBOL(tcp_bucket_create); +EXPORT_SYMBOL(tcp_hashinfo); +EXPORT_SYMBOL(tcp_inherit_port); +EXPORT_SYMBOL(tcp_listen_wlock); +EXPORT_SYMBOL(tcp_port_rover); +EXPORT_SYMBOL(tcp_prot); +EXPORT_SYMBOL(tcp_put_port); +EXPORT_SYMBOL(tcp_unhash); +EXPORT_SYMBOL(tcp_v4_conn_request); +EXPORT_SYMBOL(tcp_v4_connect); +EXPORT_SYMBOL(tcp_v4_do_rcv); +EXPORT_SYMBOL(tcp_v4_rebuild_header); +EXPORT_SYMBOL(tcp_v4_remember_stamp); +EXPORT_SYMBOL(tcp_v4_send_check); +EXPORT_SYMBOL(tcp_v4_syn_recv_sock); + +#ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(tcp_proc_register); +EXPORT_SYMBOL(tcp_proc_unregister); +#endif +EXPORT_SYMBOL(sysctl_local_port_range); +EXPORT_SYMBOL(sysctl_max_syn_backlog); +EXPORT_SYMBOL(sysctl_tcp_low_latency); +EXPORT_SYMBOL(sysctl_tcp_tw_reuse); + diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c new file mode 100644 index 000000000000..fd70509f0d53 --- /dev/null +++ b/net/ipv4/tcp_minisocks.c @@ -0,0 +1,1077 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL +#define SYNC_INIT 0 /* let the user enable it */ +#else +#define SYNC_INIT 1 +#endif + +int sysctl_tcp_tw_recycle; +int sysctl_tcp_max_tw_buckets = NR_FILE*2; + +int sysctl_tcp_syncookies = SYNC_INIT; +int sysctl_tcp_abort_on_overflow; + +static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo); + +static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) +{ + if (seq == s_win) + return 1; + if (after(end_seq, s_win) && before(seq, e_win)) + return 1; + return (seq == e_win && seq == end_seq); +} + +/* New-style handling of TIME_WAIT sockets. 
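 * [Editor's aside, not part of the original patch: a worked example of the
 *  tcp_in_window() helper just above, with made-up numbers. For a receive
 *  window spanning s_win = 1000 to e_win = 1500: a segment with seq = 1200,
 *  end_seq = 1300 is accepted because it overlaps the window; a zero-length
 *  ACK with seq = end_seq = 1000 is accepted through the seq == s_win
 *  shortcut; a wholly old segment with seq = 800, end_seq = 900 is rejected.]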
*/ + +int tcp_tw_count; + + +/* Must be called with locally disabled BHs. */ +static void tcp_timewait_kill(struct tcp_tw_bucket *tw) +{ + struct tcp_ehash_bucket *ehead; + struct tcp_bind_hashbucket *bhead; + struct tcp_bind_bucket *tb; + + /* Unlink from established hashes. */ + ehead = &tcp_ehash[tw->tw_hashent]; + write_lock(&ehead->lock); + if (hlist_unhashed(&tw->tw_node)) { + write_unlock(&ehead->lock); + return; + } + __hlist_del(&tw->tw_node); + sk_node_init(&tw->tw_node); + write_unlock(&ehead->lock); + + /* Disassociate with bind bucket. */ + bhead = &tcp_bhash[tcp_bhashfn(tw->tw_num)]; + spin_lock(&bhead->lock); + tb = tw->tw_tb; + __hlist_del(&tw->tw_bind_node); + tw->tw_tb = NULL; + tcp_bucket_destroy(tb); + spin_unlock(&bhead->lock); + +#ifdef INET_REFCNT_DEBUG + if (atomic_read(&tw->tw_refcnt) != 1) { + printk(KERN_DEBUG "tw_bucket %p refcnt=%d\n", tw, + atomic_read(&tw->tw_refcnt)); + } +#endif + tcp_tw_put(tw); +} + +/* + * * Main purpose of TIME-WAIT state is to close connection gracefully, + * when one of ends sits in LAST-ACK or CLOSING retransmitting FIN + * (and, probably, tail of data) and one or more our ACKs are lost. + * * What is TIME-WAIT timeout? It is associated with maximal packet + * lifetime in the internet, which results in wrong conclusion, that + * it is set to catch "old duplicate segments" wandering out of their path. + * It is not quite correct. This timeout is calculated so that it exceeds + * maximal retransmission timeout enough to allow to lose one (or more) + * segments sent by peer and our ACKs. This time may be calculated from RTO. + * * When TIME-WAIT socket receives RST, it means that another end + * finally closed and we are allowed to kill TIME-WAIT too. + * * Second purpose of TIME-WAIT is catching old duplicate segments. + * Well, certainly it is pure paranoia, but if we load TIME-WAIT + * with this semantics, we MUST NOT kill TIME-WAIT state with RSTs. + * * If we invented some more clever way to catch duplicates + * (f.e. based on PAWS), we could truncate TIME-WAIT to several RTOs. + * + * The algorithm below is based on FORMAL INTERPRETATION of RFCs. + * When you compare it to RFCs, please, read section SEGMENT ARRIVES + * from the very beginning. + * + * NOTE. With recycling (and later with fin-wait-2) TW bucket + * is _not_ stateless. It means, that strictly speaking we must + * spinlock it. I do not want! Well, probability of misbehaviour + * is ridiculously low and, seems, we could use some mb() tricks + * to avoid misread sequence numbers, states etc. --ANK + */ +enum tcp_tw_status +tcp_timewait_state_process(struct tcp_tw_bucket *tw, struct sk_buff *skb, + struct tcphdr *th, unsigned len) +{ + struct tcp_options_received tmp_opt; + int paws_reject = 0; + + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr) >> 2) && tw->tw_ts_recent_stamp) { + tcp_parse_options(skb, &tmp_opt, 0); + + if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = tw->tw_ts_recent; + tmp_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + paws_reject = tcp_paws_check(&tmp_opt, th->rst); + } + } + + if (tw->tw_substate == TCP_FIN_WAIT2) { + /* Just repeat all the checks of tcp_rcv_state_process() */ + + /* Out of window, send ACK */ + if (paws_reject || + !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + tw->tw_rcv_nxt, + tw->tw_rcv_nxt + tw->tw_rcv_wnd)) + return TCP_TW_ACK; + + if (th->rst) + goto kill; + + if (th->syn && !before(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt)) + goto kill_with_rst; + + /* Dup ACK? 
*/ + if (!after(TCP_SKB_CB(skb)->end_seq, tw->tw_rcv_nxt) || + TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) { + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* New data or FIN. If new data arrive after half-duplex close, + * reset. + */ + if (!th->fin || + TCP_SKB_CB(skb)->end_seq != tw->tw_rcv_nxt + 1) { +kill_with_rst: + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + return TCP_TW_RST; + } + + /* FIN arrived, enter true time-wait state. */ + tw->tw_substate = TCP_TIME_WAIT; + tw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq; + if (tmp_opt.saw_tstamp) { + tw->tw_ts_recent_stamp = xtime.tv_sec; + tw->tw_ts_recent = tmp_opt.rcv_tsval; + } + + /* I am shamed, but failed to make it more elegant. + * Yes, it is direct reference to IP, which is impossible + * to generalize to IPv6. Taking into account that IPv6 + * does not understand recycling in any case, it is not + * a big problem in practice. --ANK */ + if (tw->tw_family == AF_INET && + sysctl_tcp_tw_recycle && tw->tw_ts_recent_stamp && + tcp_v4_tw_remember_stamp(tw)) + tcp_tw_schedule(tw, tw->tw_timeout); + else + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + return TCP_TW_ACK; + } + + /* + * Now real TIME-WAIT state. + * + * RFC 1122: + * "When a connection is [...] on TIME-WAIT state [...] + * [a TCP] MAY accept a new SYN from the remote TCP to + * reopen the connection directly, if it: + * + * (1) assigns its initial sequence number for the new + * connection to be larger than the largest sequence + * number it used on the previous connection incarnation, + * and + * + * (2) returns to TIME-WAIT state if the SYN turns out + * to be an old duplicate". + */ + + if (!paws_reject && + (TCP_SKB_CB(skb)->seq == tw->tw_rcv_nxt && + (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) { + /* In window segment, it may be only reset or bare ack. */ + + if (th->rst) { + /* This is TIME_WAIT assassination, in two flavors. + * Oh well... nobody has a sufficient solution to this + * protocol bug yet. + */ + if (sysctl_tcp_rfc1337 == 0) { +kill: + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + } + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + + if (tmp_opt.saw_tstamp) { + tw->tw_ts_recent = tmp_opt.rcv_tsval; + tw->tw_ts_recent_stamp = xtime.tv_sec; + } + + tcp_tw_put(tw); + return TCP_TW_SUCCESS; + } + + /* Out of window segment. + + All the segments are ACKed immediately. + + The only exception is new SYN. We accept it, if it is + not old duplicate and we are not in danger to be killed + by delayed old duplicates. RFC check, that it has + newer sequence number, works at rates <40Mbit/sec. + However, if paws works, it is reliable AND even more, + we even may relax silly seq space cutoff. + + RED-PEN: we violate main RFC requirement, if this SYN will appear + old duplicate (i.e. we receive RST in reply to SYN-ACK), + we must return socket to time-wait state. It is not good, + but not fatal yet. + */ + + if (th->syn && !th->rst && !th->ack && !paws_reject && + (after(TCP_SKB_CB(skb)->seq, tw->tw_rcv_nxt) || + (tmp_opt.saw_tstamp && (s32)(tw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) { + u32 isn = tw->tw_snd_nxt + 65535 + 2; + if (isn == 0) + isn++; + TCP_SKB_CB(skb)->when = isn; + return TCP_TW_SYN; + } + + if (paws_reject) + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + + if(!th->rst) { + /* In this case we must reset the TIMEWAIT timer. + * + * If it is ACKless SYN it may be both old duplicate + * and new good SYN with random sequence number <isn>. + * Do not reschedule in the last case. + */ + if (paws_reject || th->ack) + tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN); + + /* Send ACK.
Note, we do not put the bucket, + * it will be released by caller. + */ + return TCP_TW_ACK; + } + tcp_tw_put(tw); + return TCP_TW_SUCCESS; +} + +/* Enter the time wait state. This is called with locally disabled BH. + * Essentially we whip up a timewait bucket, copy the + * relevant info into it from the SK, and mess with hash chains + * and list linkage. + */ +static void __tcp_tw_hashdance(struct sock *sk, struct tcp_tw_bucket *tw) +{ + struct tcp_ehash_bucket *ehead = &tcp_ehash[sk->sk_hashent]; + struct tcp_bind_hashbucket *bhead; + + /* Step 1: Put TW into bind hash. Original socket stays there too. + Note, that any socket with inet_sk(sk)->num != 0 MUST be bound in + binding cache, even if it is closed. + */ + bhead = &tcp_bhash[tcp_bhashfn(inet_sk(sk)->num)]; + spin_lock(&bhead->lock); + tw->tw_tb = tcp_sk(sk)->bind_hash; + BUG_TRAP(tcp_sk(sk)->bind_hash); + tw_add_bind_node(tw, &tw->tw_tb->owners); + spin_unlock(&bhead->lock); + + write_lock(&ehead->lock); + + /* Step 2: Remove SK from established hash. */ + if (__sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + + /* Step 3: Hash TW into TIMEWAIT half of established hash table. */ + tw_add_node(tw, &(ehead + tcp_ehash_size)->chain); + atomic_inc(&tw->tw_refcnt); + + write_unlock(&ehead->lock); +} + +/* + * Move a socket to time-wait or dead fin-wait-2 state. + */ +void tcp_time_wait(struct sock *sk, int state, int timeo) +{ + struct tcp_tw_bucket *tw = NULL; + struct tcp_sock *tp = tcp_sk(sk); + int recycle_ok = 0; + + if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp) + recycle_ok = tp->af_specific->remember_stamp(sk); + + if (tcp_tw_count < sysctl_tcp_max_tw_buckets) + tw = kmem_cache_alloc(tcp_timewait_cachep, SLAB_ATOMIC); + + if(tw != NULL) { + struct inet_sock *inet = inet_sk(sk); + int rto = (tp->rto<<2) - (tp->rto>>1); + + /* Give us an identity. */ + tw->tw_daddr = inet->daddr; + tw->tw_rcv_saddr = inet->rcv_saddr; + tw->tw_bound_dev_if = sk->sk_bound_dev_if; + tw->tw_num = inet->num; + tw->tw_state = TCP_TIME_WAIT; + tw->tw_substate = state; + tw->tw_sport = inet->sport; + tw->tw_dport = inet->dport; + tw->tw_family = sk->sk_family; + tw->tw_reuse = sk->sk_reuse; + tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; + atomic_set(&tw->tw_refcnt, 1); + + tw->tw_hashent = sk->sk_hashent; + tw->tw_rcv_nxt = tp->rcv_nxt; + tw->tw_snd_nxt = tp->snd_nxt; + tw->tw_rcv_wnd = tcp_receive_window(tp); + tw->tw_ts_recent = tp->rx_opt.ts_recent; + tw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp; + tw_dead_node_init(tw); + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + ipv6_addr_copy(&tw->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_v6_ipv6only = np->ipv6only; + } else { + memset(&tw->tw_v6_daddr, 0, sizeof(tw->tw_v6_daddr)); + memset(&tw->tw_v6_rcv_saddr, 0, sizeof(tw->tw_v6_rcv_saddr)); + tw->tw_v6_ipv6only = 0; + } +#endif + /* Linkage updates. */ + __tcp_tw_hashdance(sk, tw); + + /* Get the TIME_WAIT timeout firing. */ + if (timeo < rto) + timeo = rto; + + if (recycle_ok) { + tw->tw_timeout = rto; + } else { + tw->tw_timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; + } + + tcp_tw_schedule(tw, timeo); + tcp_tw_put(tw); + } else { + /* Sorry, if we're out of memory, just CLOSE this + * socket up. We've got bigger problems than + * non-graceful socket closings. 
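 * [Editor's aside, not part of the original patch: in the success path above,
 *  the local rto is computed as (tp->rto << 2) - (tp->rto >> 1), i.e.
 *  4*RTO - 0.5*RTO = 3.5*RTO; when timestamp recycling is enabled that value
 *  becomes tw_timeout instead of the full TCP_TIMEWAIT_LEN, matching the
 *  3.5*RTO rationale written out in tcp_tw_schedule() below.]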
+ */ + if (net_ratelimit()) + printk(KERN_INFO "TCP: time wait bucket table overflow\n"); + } + + tcp_update_metrics(sk); + tcp_done(sk); +} + +/* Kill off TIME_WAIT sockets once their lifetime has expired. */ +static int tcp_tw_death_row_slot; + +static void tcp_twkill(unsigned long); + +/* TIME_WAIT reaping mechanism. */ +#define TCP_TWKILL_SLOTS 8 /* Please keep this a power of 2. */ +#define TCP_TWKILL_PERIOD (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS) + +#define TCP_TWKILL_QUOTA 100 + +static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS]; +static DEFINE_SPINLOCK(tw_death_lock); +static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0); +static void twkill_work(void *); +static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL); +static u32 twkill_thread_slots; + +/* Returns non-zero if quota exceeded. */ +static int tcp_do_twkill_work(int slot, unsigned int quota) +{ + struct tcp_tw_bucket *tw; + struct hlist_node *node; + unsigned int killed; + int ret; + + /* NOTE: compare this to previous version where lock + * was released after detaching chain. It was racy, + * because tw buckets are scheduled in not serialized context + * in 2.3 (with netfilter), and with softnet it is common, because + * soft irqs are not sequenced. + */ + killed = 0; + ret = 0; +rescan: + tw_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) { + __tw_del_dead_node(tw); + spin_unlock(&tw_death_lock); + tcp_timewait_kill(tw); + tcp_tw_put(tw); + killed++; + spin_lock(&tw_death_lock); + if (killed > quota) { + ret = 1; + break; + } + + /* While we dropped tw_death_lock, another cpu may have + * killed off the next TW bucket in the list, therefore + * do a fresh re-read of the hlist head node with the + * lock reacquired. We still use the hlist traversal + * macro in order to get the prefetches. + */ + goto rescan; + } + + tcp_tw_count -= killed; + NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed); + + return ret; +} + +static void tcp_twkill(unsigned long dummy) +{ + int need_timer, ret; + + spin_lock(&tw_death_lock); + + if (tcp_tw_count == 0) + goto out; + + need_timer = 0; + ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA); + if (ret) { + twkill_thread_slots |= (1 << tcp_tw_death_row_slot); + mb(); + schedule_work(&tcp_twkill_work); + need_timer = 1; + } else { + /* We purged the entire slot, anything left? */ + if (tcp_tw_count) + need_timer = 1; + } + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + if (need_timer) + mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD); +out: + spin_unlock(&tw_death_lock); +} + +extern void twkill_slots_invalid(void); + +static void twkill_work(void *dummy) +{ + int i; + + if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8)) + twkill_slots_invalid(); + + while (twkill_thread_slots) { + spin_lock_bh(&tw_death_lock); + for (i = 0; i < TCP_TWKILL_SLOTS; i++) { + if (!(twkill_thread_slots & (1 << i))) + continue; + + while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) { + if (need_resched()) { + spin_unlock_bh(&tw_death_lock); + schedule(); + spin_lock_bh(&tw_death_lock); + } + } + + twkill_thread_slots &= ~(1 << i); + } + spin_unlock_bh(&tw_death_lock); + } +} + +/* These are always called from BH context. See callers in + * tcp_input.c to verify this. + */ + +/* This is for handling early-kills of TIME_WAIT sockets. 
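 * [Editor's aside, not part of the original patch: a rough feel for the
 *  reaper cadence defined above. With the 60 second TIME-WAIT length noted
 *  in tcp_tw_schedule()'s comment split over TCP_TWKILL_SLOTS = 8 slots,
 *  tcp_tw_timer fires about every 7.5 seconds, and each run stops after
 *  roughly TCP_TWKILL_QUOTA = 100 buckets, deferring any remainder of the
 *  slot to the tcp_twkill_work workqueue.]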
*/ +void tcp_tw_deschedule(struct tcp_tw_bucket *tw) +{ + spin_lock(&tw_death_lock); + if (tw_del_dead_node(tw)) { + tcp_tw_put(tw); + if (--tcp_tw_count == 0) + del_timer(&tcp_tw_timer); + } + spin_unlock(&tw_death_lock); + tcp_timewait_kill(tw); +} + +/* Short-time timewait calendar */ + +static int tcp_twcal_hand = -1; +static int tcp_twcal_jiffie; +static void tcp_twcal_tick(unsigned long); +static struct timer_list tcp_twcal_timer = + TIMER_INITIALIZER(tcp_twcal_tick, 0, 0); +static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS]; + +static void tcp_tw_schedule(struct tcp_tw_bucket *tw, int timeo) +{ + struct hlist_head *list; + int slot; + + /* timeout := RTO * 3.5 + * + * 3.5 = 1+2+0.5 to wait for two retransmits. + * + * RATIONALE: if FIN arrived and we entered TIME-WAIT state, + * our ACK acking that FIN can be lost. If N subsequent retransmitted + * FINs (or previous segments) are lost (probability of such event + * is p^(N+1), where p is the probability of losing a single packet and + * time to detect the loss is about RTO*(2^N - 1) with exponential + * backoff). Normal timewait length is calculated so that we + * wait at least for one retransmitted FIN (maximal RTO is 120sec). + * [ BTW Linux, following BSD, violates this requirement by waiting + * only for 60sec; we should wait at least for 240 secs. + * Well, 240 consumes too many resources 8) + * ] + * This interval is not reduced to catch old duplicates and + * responses to our wandering segments living for two MSLs. + * However, if we use PAWS to detect + * old duplicates, we can reduce the interval to bounds required + * by RTO, rather than MSL. So, if peer understands PAWS, we + * kill tw bucket after 3.5*RTO (it is important that this number + * is greater than TS tick!) and detect old duplicates with help + * of PAWS.
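 * [Editor's aside, not part of the original patch: putting hypothetical
 *  numbers on the figure above, with RTO = 200ms a PAWS-capable peer lets the
 *  bucket die after roughly 3.5 * 200ms = 700ms instead of the normal 60
 *  seconds, while still covering two retransmitted FINs (the first after
 *  1 RTO, the second 2 RTOs later) plus half an RTO of slack, per the
 *  1+2+0.5 breakdown.]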
+ */ + slot = (timeo + (1<> TCP_TW_RECYCLE_TICK; + + spin_lock(&tw_death_lock); + + /* Unlink it, if it was scheduled */ + if (tw_del_dead_node(tw)) + tcp_tw_count--; + else + atomic_inc(&tw->tw_refcnt); + + if (slot >= TCP_TW_RECYCLE_SLOTS) { + /* Schedule to slow timer */ + if (timeo >= TCP_TIMEWAIT_LEN) { + slot = TCP_TWKILL_SLOTS-1; + } else { + slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD; + if (slot >= TCP_TWKILL_SLOTS) + slot = TCP_TWKILL_SLOTS-1; + } + tw->tw_ttd = jiffies + timeo; + slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1); + list = &tcp_tw_death_row[slot]; + } else { + tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK); + + if (tcp_twcal_hand < 0) { + tcp_twcal_hand = 0; + tcp_twcal_jiffie = jiffies; + tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<tw_death_node, list); + + if (tcp_tw_count++ == 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); + spin_unlock(&tw_death_lock); +} + +void tcp_twcal_tick(unsigned long dummy) +{ + int n, slot; + unsigned long j; + unsigned long now = jiffies; + int killed = 0; + int adv = 0; + + spin_lock(&tw_death_lock); + if (tcp_twcal_hand < 0) + goto out; + + slot = tcp_twcal_hand; + j = tcp_twcal_jiffie; + + for (n=0; nsk_prot -acme */ + struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0); + + if(newsk != NULL) { + struct tcp_sock *newtp; + struct sk_filter *filter; + + memcpy(newsk, sk, sizeof(struct tcp_sock)); + newsk->sk_state = TCP_SYN_RECV; + + /* SANITY */ + sk_node_init(&newsk->sk_node); + tcp_sk(newsk)->bind_hash = NULL; + + /* Clone the TCP header template */ + inet_sk(newsk)->dport = req->rmt_port; + + sock_lock_init(newsk); + bh_lock_sock(newsk); + + rwlock_init(&newsk->sk_dst_lock); + atomic_set(&newsk->sk_rmem_alloc, 0); + skb_queue_head_init(&newsk->sk_receive_queue); + atomic_set(&newsk->sk_wmem_alloc, 0); + skb_queue_head_init(&newsk->sk_write_queue); + atomic_set(&newsk->sk_omem_alloc, 0); + newsk->sk_wmem_queued = 0; + newsk->sk_forward_alloc = 0; + + sock_reset_flag(newsk, SOCK_DONE); + newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; + newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL; + newsk->sk_send_head = NULL; + rwlock_init(&newsk->sk_callback_lock); + skb_queue_head_init(&newsk->sk_error_queue); + newsk->sk_write_space = sk_stream_write_space; + + if ((filter = newsk->sk_filter) != NULL) + sk_filter_charge(newsk, filter); + + if (unlikely(xfrm_sk_clone_policy(newsk))) { + /* It is still raw copy of parent, so invalidate + * destructor and make plain sk_free() */ + newsk->sk_destruct = NULL; + sk_free(newsk); + return NULL; + } + + /* Now setup tcp_sock */ + newtp = tcp_sk(newsk); + newtp->pred_flags = 0; + newtp->rcv_nxt = req->rcv_isn + 1; + newtp->snd_nxt = req->snt_isn + 1; + newtp->snd_una = req->snt_isn + 1; + newtp->snd_sml = req->snt_isn + 1; + + tcp_prequeue_init(newtp); + + tcp_init_wl(newtp, req->snt_isn, req->rcv_isn); + + newtp->retransmits = 0; + newtp->backoff = 0; + newtp->srtt = 0; + newtp->mdev = TCP_TIMEOUT_INIT; + newtp->rto = TCP_TIMEOUT_INIT; + + newtp->packets_out = 0; + newtp->left_out = 0; + newtp->retrans_out = 0; + newtp->sacked_out = 0; + newtp->fackets_out = 0; + newtp->snd_ssthresh = 0x7fffffff; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. 
-DaveM + */ + newtp->snd_cwnd = 2; + newtp->snd_cwnd_cnt = 0; + + newtp->frto_counter = 0; + newtp->frto_highmark = 0; + + tcp_set_ca_state(newtp, TCP_CA_Open); + tcp_init_xmit_timers(newsk); + skb_queue_head_init(&newtp->out_of_order_queue); + newtp->rcv_wup = req->rcv_isn + 1; + newtp->write_seq = req->snt_isn + 1; + newtp->pushed_seq = newtp->write_seq; + newtp->copied_seq = req->rcv_isn + 1; + + newtp->rx_opt.saw_tstamp = 0; + + newtp->rx_opt.dsack = 0; + newtp->rx_opt.eff_sacks = 0; + + newtp->probes_out = 0; + newtp->rx_opt.num_sacks = 0; + newtp->urg_data = 0; + newtp->listen_opt = NULL; + newtp->accept_queue = newtp->accept_queue_tail = NULL; + /* Deinitialize syn_wait_lock to trap illegal accesses. */ + memset(&newtp->syn_wait_lock, 0, sizeof(newtp->syn_wait_lock)); + + /* Back to base struct sock members. */ + newsk->sk_err = 0; + newsk->sk_priority = 0; + atomic_set(&newsk->sk_refcnt, 2); +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet_sock_nr); +#endif + atomic_inc(&tcp_sockets_allocated); + + if (sock_flag(newsk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(newsk, + keepalive_time_when(newtp)); + newsk->sk_socket = NULL; + newsk->sk_sleep = NULL; + + newtp->rx_opt.tstamp_ok = req->tstamp_ok; + if((newtp->rx_opt.sack_ok = req->sack_ok) != 0) { + if (sysctl_tcp_fack) + newtp->rx_opt.sack_ok |= 2; + } + newtp->window_clamp = req->window_clamp; + newtp->rcv_ssthresh = req->rcv_wnd; + newtp->rcv_wnd = req->rcv_wnd; + newtp->rx_opt.wscale_ok = req->wscale_ok; + if (newtp->rx_opt.wscale_ok) { + newtp->rx_opt.snd_wscale = req->snd_wscale; + newtp->rx_opt.rcv_wscale = req->rcv_wscale; + } else { + newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0; + newtp->window_clamp = min(newtp->window_clamp, 65535U); + } + newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale; + newtp->max_window = newtp->snd_wnd; + + if (newtp->rx_opt.tstamp_ok) { + newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent_stamp = xtime.tv_sec; + newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; + } else { + newtp->rx_opt.ts_recent_stamp = 0; + newtp->tcp_header_len = sizeof(struct tcphdr); + } + if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len) + newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len; + newtp->rx_opt.mss_clamp = req->mss; + TCP_ECN_openreq_child(newtp, req); + if (newtp->ecn_flags&TCP_ECN_OK) + sock_set_flag(newsk, SOCK_NO_LARGESEND); + + tcp_ca_init(newtp); + + TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS); + } + return newsk; +} + +/* + * Process an incoming packet for SYN_RECV sockets represented + * as an open_request. + */ + +struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, + struct open_request *req, + struct open_request **prev) +{ + struct tcphdr *th = skb->h.th; + struct tcp_sock *tp = tcp_sk(sk); + u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); + int paws_reject = 0; + struct tcp_options_received tmp_opt; + struct sock *child; + + tmp_opt.saw_tstamp = 0; + if (th->doff > (sizeof(struct tcphdr)>>2)) { + tcp_parse_options(skb, &tmp_opt, 0); + + if (tmp_opt.saw_tstamp) { + tmp_opt.ts_recent = req->ts_recent; + /* We do not store true stamp, but it is not required, + * it can be estimated (approximately) + * from another data. + */ + tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<retrans); + paws_reject = tcp_paws_check(&tmp_opt, th->rst); + } + } + + /* Check for pure retransmitted SYN. 
*/ + if (TCP_SKB_CB(skb)->seq == req->rcv_isn && + flg == TCP_FLAG_SYN && + !paws_reject) { + /* + * RFC793 draws (Incorrectly! It was fixed in RFC1122) + * this case on figure 6 and figure 8, but formal + * protocol description says NOTHING. + * To be more exact, it says that we should send ACK, + * because this segment (at least, if it has no data) + * is out of window. + * + * CONCLUSION: RFC793 (even with RFC1122) DOES NOT + * describe SYN-RECV state. All the description + * is wrong, we cannot believe to it and should + * rely only on common sense and implementation + * experience. + * + * Enforce "SYN-ACK" according to figure 8, figure 6 + * of RFC793, fixed by RFC1122. + */ + req->class->rtx_syn_ack(sk, req, NULL); + return NULL; + } + + /* Further reproduces section "SEGMENT ARRIVES" + for state SYN-RECEIVED of RFC793. + It is broken, however, it does not work only + when SYNs are crossed. + + You would think that SYN crossing is impossible here, since + we should have a SYN_SENT socket (from connect()) on our end, + but this is not true if the crossed SYNs were sent to both + ends by a malicious third party. We must defend against this, + and to do that we first verify the ACK (as per RFC793, page + 36) and reset if it is invalid. Is this a true full defense? + To convince ourselves, let us consider a way in which the ACK + test can still pass in this 'malicious crossed SYNs' case. + Malicious sender sends identical SYNs (and thus identical sequence + numbers) to both A and B: + + A: gets SYN, seq=7 + B: gets SYN, seq=7 + + By our good fortune, both A and B select the same initial + send sequence number of seven :-) + + A: sends SYN|ACK, seq=7, ack_seq=8 + B: sends SYN|ACK, seq=7, ack_seq=8 + + So we are now A eating this SYN|ACK, ACK test passes. So + does sequence test, SYN is truncated, and thus we consider + it a bare ACK. + + If tp->defer_accept, we silently drop this bare ACK. Otherwise, + we create an established connection. Both ends (listening sockets) + accept the new incoming connection and try to talk to each other. 8-) + + Note: This case is both harmless, and rare. Possibility is about the + same as us discovering intelligent life on another plant tomorrow. + + But generally, we should (RFC lies!) to accept ACK + from SYNACK both here and in tcp_rcv_state_process(). + tcp_rcv_state_process() does not, hence, we do not too. + + Note that the case is absolutely generic: + we cannot optimize anything here without + violating protocol. All the checks must be made + before attempt to create socket. + */ + + /* RFC793 page 36: "If the connection is in any non-synchronized state ... + * and the incoming segment acknowledges something not yet + * sent (the segment carries an unaccaptable ACK) ... + * a reset is sent." + * + * Invalid ACK: reset will be sent by listening socket + */ + if ((flg & TCP_FLAG_ACK) && + (TCP_SKB_CB(skb)->ack_seq != req->snt_isn+1)) + return sk; + + /* Also, it would be not so bad idea to check rcv_tsecr, which + * is essentially ACK extension and too early or too late values + * should cause reset in unsynchronized states. + */ + + /* RFC793: "first check sequence number". */ + + if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, + req->rcv_isn+1, req->rcv_isn+1+req->rcv_wnd)) { + /* Out of window: send ACK and drop. */ + if (!(flg & TCP_FLAG_RST)) + req->class->send_ack(skb, req); + if (paws_reject) + NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED); + return NULL; + } + + /* In sequence, PAWS is OK. 
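 * [Editor's aside, not part of the original patch: to make the ACK validity
 *  test earlier in this function concrete with an invented number, if our
 *  SYN-ACK carried snt_isn = 4000 then the only acceptable handshake ACK
 *  acknowledges 4001; a segment with any other ack_seq is handed back to the
 *  listening socket, which answers it with a reset, the RFC793 page 36
 *  behaviour quoted above.]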
*/ + + if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, req->rcv_isn+1)) + req->ts_recent = tmp_opt.rcv_tsval; + + if (TCP_SKB_CB(skb)->seq == req->rcv_isn) { + /* Truncate SYN, it is out of window starting + at req->rcv_isn+1. */ + flg &= ~TCP_FLAG_SYN; + } + + /* RFC793: "second check the RST bit" and + * "fourth, check the SYN bit" + */ + if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN)) + goto embryonic_reset; + + /* ACK sequence verified above, just make sure ACK is + * set. If ACK not set, just silently drop the packet. + */ + if (!(flg & TCP_FLAG_ACK)) + return NULL; + + /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */ + if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == req->rcv_isn+1) { + req->acked = 1; + return NULL; + } + + /* OK, ACK is valid, create big socket and + * feed this segment to it. It will repeat all + * the tests. THIS SEGMENT MUST MOVE SOCKET TO + * ESTABLISHED STATE. If it will be dropped after + * socket is created, wait for troubles. + */ + child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + if (child == NULL) + goto listen_overflow; + + tcp_synq_unlink(tp, req, prev); + tcp_synq_removed(sk, req); + + tcp_acceptq_queue(sk, req, child); + return child; + + listen_overflow: + if (!sysctl_tcp_abort_on_overflow) { + req->acked = 1; + return NULL; + } + + embryonic_reset: + NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS); + if (!(flg & TCP_FLAG_RST)) + req->class->send_reset(skb); + + tcp_synq_drop(sk, req, prev); + return NULL; +} + +/* + * Queue segment on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket. + */ + +int tcp_child_process(struct sock *parent, struct sock *child, + struct sk_buff *skb) +{ + int ret = 0; + int state = child->sk_state; + + if (!sock_owned_by_user(child)) { + ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); + + /* Wakeup parent, send SIGIO */ + if (state == TCP_SYN_RECV && child->sk_state != state) + parent->sk_data_ready(parent, 0); + } else { + /* Alas, it is possible again, because we do lookup + * in main socket hash table and lock on listening + * socket does not protect us more. + */ + sk_add_backlog(child, skb); + } + + bh_unlock_sock(child); + sock_put(child); + return ret; +} + +EXPORT_SYMBOL(tcp_check_req); +EXPORT_SYMBOL(tcp_child_process); +EXPORT_SYMBOL(tcp_create_openreq_child); +EXPORT_SYMBOL(tcp_timewait_state_process); +EXPORT_SYMBOL(tcp_tw_deschedule); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c new file mode 100644 index 000000000000..13c14cb6dee4 --- /dev/null +++ b/net/ipv4/tcp_output.c @@ -0,0 +1,1739 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +/* + * Changes: Pedro Roque : Retransmit queue handled by TCP. + * : Fragmentation on mtu decrease + * : Segment collapse on retransmit + * : AF independence + * + * Linus Torvalds : send_delayed_ack + * David S. Miller : Charge memory using the right skb + * during syn/ack processing. + * David S. Miller : Output engine completely rewritten. 
+ * Andrea Arcangeli: SYNACK carry ts_recent in tsecr. + * Cacophonix Gaul : draft-minshall-nagle-01 + * J Hadi Salim : ECN support + * + */ + +#include <net/tcp.h> + +#include <linux/compiler.h> +#include <linux/module.h> +#include <linux/smp_lock.h> + +/* People can turn this off for buggy TCP's found in printers etc. */ +int sysctl_tcp_retrans_collapse = 1; + +/* This limits the percentage of the congestion window which we + * will allow a single TSO frame to consume. Building TSO frames + * which are too large can cause TCP streams to be bursty. + */ +int sysctl_tcp_tso_win_divisor = 8; + +static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) +{ + sk->sk_send_head = skb->next; + if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) + sk->sk_send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_packets_out_inc(sk, tp, skb); +} + +/* SND.NXT, if window was not shrunk. + * If window has been shrunk, what should we make? It is not clear at all. + * Using SND.UNA we will fail to open window, SND.NXT is out of window. :-( + * Anything in between SND.UNA...SND.UNA+SND.WND also can be already + * invalid. OK, let's make this for now: + */ +static inline __u32 tcp_acceptable_seq(struct sock *sk, struct tcp_sock *tp) +{ + if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + return tp->snd_nxt; + else + return tp->snd_una+tp->snd_wnd; +} + +/* Calculate mss to advertise in SYN segment. + * RFC1122, RFC1063, draft-ietf-tcpimpl-pmtud-01 state that: + * + * 1. It is independent of path mtu. + * 2. Ideally, it is the maximal possible segment size i.e. 65535-40. + * 3. For IPv4 it is reasonable to calculate it from maximal MTU of + * attached devices, because some buggy hosts are confused by + * large MSS. + * 4. We do not do 3, we advertise MSS, calculated from first + * hop device mtu, but allow raising it to ip_rt_min_advmss. + * This may be overridden via information stored in routing table. + * 5. Value 65535 for MSS is valid in IPv6 and means "as large as possible, + * probably even Jumbo". + */ +static __u16 tcp_advertise_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + int mss = tp->advmss; + + if (dst && dst_metric(dst, RTAX_ADVMSS) < mss) { + mss = dst_metric(dst, RTAX_ADVMSS); + tp->advmss = mss; + } + + return (__u16)mss; +} + +/* RFC2861. Reset CWND after an idle period longer than RTO to "restart window". + * This is the first part of the cwnd validation mechanism. */ +static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst) +{ + s32 delta = tcp_time_stamp - tp->lsndtime; + u32 restart_cwnd = tcp_init_cwnd(tp, dst); + u32 cwnd = tp->snd_cwnd; + + if (tcp_is_vegas(tp)) + tcp_vegas_enable(tp); + + tp->snd_ssthresh = tcp_current_ssthresh(tp); + restart_cwnd = min(restart_cwnd, cwnd); + + while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd) + cwnd >>= 1; + tp->snd_cwnd = max(cwnd, restart_cwnd); + tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd_used = 0; +} + +static inline void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) +{ + u32 now = tcp_time_stamp; + + if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto) + tcp_cwnd_restart(tp, __sk_dst_get(sk)); + + tp->lsndtime = now; + + /* If it is a reply within ato after the last received + * packet, enter pingpong mode.
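+ *
+ * Illustrative aside on tcp_cwnd_restart() above, not part of the
+ * original patch (the numbers are invented): with rto == 200ms, an
+ * idle time of 700ms, snd_cwnd == 32 and restart_cwnd == 4, the loop
+ * halves cwnd once per rto of idle time, 32 -> 16 -> 8 -> 4, and
+ * snd_cwnd restarts from max(4, 4) == 4, close to the initial window.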
+ */ + if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato) + tp->ack.pingpong = 1; +} + +static __inline__ void tcp_event_ack_sent(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tcp_dec_quickack_mode(tp); + tcp_clear_xmit_timer(sk, TCP_TIME_DACK); +} + +/* Determine a window scaling and initial window to offer. + * Based on the assumption that the given amount of space + * will be offered. Store the results in the tp structure. + * NOTE: for smooth operation initial space offering should + * be a multiple of mss if possible. We assume here that mss >= 1. + * This MUST be enforced by all callers. + */ +void tcp_select_initial_window(int __space, __u32 mss, + __u32 *rcv_wnd, __u32 *window_clamp, + int wscale_ok, __u8 *rcv_wscale) +{ + unsigned int space = (__space < 0 ? 0 : __space); + + /* If no clamp set the clamp to the max possible scaled window */ + if (*window_clamp == 0) + (*window_clamp) = (65535 << 14); + space = min(*window_clamp, space); + + /* Quantize space offering to a multiple of mss if possible. */ + if (space > mss) + space = (space / mss) * mss; + + /* NOTE: offering an initial window larger than 32767 + * will break some buggy TCP stacks. We try to be nice. + * If we are not window scaling, then this truncates + * our initial window offering to 32k. There should also + * be a sysctl option to stop being nice. + */ + (*rcv_wnd) = min(space, MAX_TCP_WINDOW); + (*rcv_wscale) = 0; + if (wscale_ok) { + /* Set window scaling on max possible window + * See RFC1323 for an explanation of the limit to 14 + */ + space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max); + while (space > 65535 && (*rcv_wscale) < 14) { + space >>= 1; + (*rcv_wscale)++; + } + } + + /* Set initial window to value enough for senders, + * following RFC1414. Senders, not following this RFC, + * will be satisfied with 2. + */ + if (mss > (1<<*rcv_wscale)) { + int init_cwnd = 4; + if (mss > 1460*3) + init_cwnd = 2; + else if (mss > 1460) + init_cwnd = 3; + if (*rcv_wnd > init_cwnd*mss) + *rcv_wnd = init_cwnd*mss; + } + + /* Set the clamp no higher than max representable value */ + (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); +} + +/* Chose a new window to advertise, update state in tcp_sock for the + * socket, and return result with RFC1323 scaling applied. The return + * value can be stuffed directly into th->window for an outgoing + * frame. + */ +static __inline__ u16 tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 cur_win = tcp_receive_window(tp); + u32 new_win = __tcp_select_window(sk); + + /* Never shrink the offered window */ + if(new_win < cur_win) { + /* Danger Will Robinson! + * Don't update rcv_wup/rcv_wnd here or else + * we will not be able to advertise a zero + * window in time. --DaveM + * + * Relax Will Robinson. + */ + new_win = cur_win; + } + tp->rcv_wnd = new_win; + tp->rcv_wup = tp->rcv_nxt; + + /* Make sure we do not exceed the maximum possible + * scaled window. + */ + if (!tp->rx_opt.rcv_wscale) + new_win = min(new_win, MAX_TCP_WINDOW); + else + new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale)); + + /* RFC1323 scaling applied */ + new_win >>= tp->rx_opt.rcv_wscale; + + /* If we advertise zero window, disable fast path. */ + if (new_win == 0) + tp->pred_flags = 0; + + return new_win; +} + + +/* This routine actually transmits TCP packets queued in by + * tcp_do_sendmsg(). This is used by both the initial + * transmission and possible later retransmissions. + * All SKB's seen here are completely headerless. 
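+ *
+ * Illustrative aside on tcp_select_initial_window() above, not part of
+ * the original patch (the sizes are invented): if the largest possible
+ * receive buffer, max(sysctl_tcp_rmem[2], sysctl_rmem_max), is 4MB,
+ * the shift loop needs rcv_wscale == 7 to fit it into the 16-bit
+ * window field (4194304 >> 7 == 32768 <= 65535), so every window we
+ * later advertise is sent right-shifted by 7 and the peer scales it
+ * back up.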
It is our + * job to build the TCP header, and pass the packet down to + * IP so it can do the same plus pass the packet off to the + * device. + * + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. + */ +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb) +{ + if (skb != NULL) { + struct inet_sock *inet = inet_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); + int tcp_header_size = tp->tcp_header_len; + struct tcphdr *th; + int sysctl_flags; + int err; + + BUG_ON(!tcp_skb_pcount(skb)); + +#define SYSCTL_FLAG_TSTAMPS 0x1 +#define SYSCTL_FLAG_WSCALE 0x2 +#define SYSCTL_FLAG_SACK 0x4 + + sysctl_flags = 0; + if (tcb->flags & TCPCB_FLAG_SYN) { + tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; + if(sysctl_tcp_timestamps) { + tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_TSTAMPS; + } + if(sysctl_tcp_window_scaling) { + tcp_header_size += TCPOLEN_WSCALE_ALIGNED; + sysctl_flags |= SYSCTL_FLAG_WSCALE; + } + if(sysctl_tcp_sack) { + sysctl_flags |= SYSCTL_FLAG_SACK; + if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) + tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; + } + } else if (tp->rx_opt.eff_sacks) { + /* A SACK is 2 pad bytes, a 2 byte header, plus + * 2 32-bit sequence numbers for each SACK block. + */ + tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + } + + /* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ + if (tcp_is_vegas(tp) && tcp_packets_in_flight(tp) == 0) + tcp_vegas_enable(tp); + + th = (struct tcphdr *) skb_push(skb, tcp_header_size); + skb->h.th = th; + skb_set_owner_w(skb, sk); + + /* Build TCP header and checksum it. */ + th->source = inet->sport; + th->dest = inet->dport; + th->seq = htonl(tcb->seq); + th->ack_seq = htonl(tp->rcv_nxt); + *(((__u16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | tcb->flags); + if (tcb->flags & TCPCB_FLAG_SYN) { + /* RFC1323: The window in SYN & SYN/ACK segments + * is never scaled. + */ + th->window = htons(tp->rcv_wnd); + } else { + th->window = htons(tcp_select_window(sk)); + } + th->check = 0; + th->urg_ptr = 0; + + if (tp->urg_mode && + between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) { + th->urg_ptr = htons(tp->snd_up-tcb->seq); + th->urg = 1; + } + + if (tcb->flags & TCPCB_FLAG_SYN) { + tcp_syn_build_options((__u32 *)(th + 1), + tcp_advertise_mss(sk), + (sysctl_flags & SYSCTL_FLAG_TSTAMPS), + (sysctl_flags & SYSCTL_FLAG_SACK), + (sysctl_flags & SYSCTL_FLAG_WSCALE), + tp->rx_opt.rcv_wscale, + tcb->when, + tp->rx_opt.ts_recent); + } else { + tcp_build_and_update_options((__u32 *)(th + 1), + tp, tcb->when); + + TCP_ECN_send(sk, tp, skb, tcp_header_size); + } + tp->af_specific->send_check(sk, th, skb->len, skb); + + if (tcb->flags & TCPCB_FLAG_ACK) + tcp_event_ack_sent(sk); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, skb, sk); + + TCP_INC_STATS(TCP_MIB_OUTSEGS); + + err = tp->af_specific->queue_xmit(skb, 0); + if (err <= 0) + return err; + + tcp_enter_cwr(tp); + + /* NET_XMIT_CN is special. It does not guarantee, + * that this packet is lost. 
It tells that device + * is about to start to drop packets or already + * drops some packets of the same priority and + * invokes us to send less aggressively. + */ + return err == NET_XMIT_CN ? 0 : err; + } + return -ENOBUFS; +#undef SYSCTL_FLAG_TSTAMPS +#undef SYSCTL_FLAG_WSCALE +#undef SYSCTL_FLAG_SACK +} + + +/* This routine just queue's the buffer + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, + * otherwise socket can stall. + */ +static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + + /* Advance write_seq and place onto the write_queue. */ + tp->write_seq = TCP_SKB_CB(skb)->end_seq; + skb_header_release(skb); + __skb_queue_tail(&sk->sk_write_queue, skb); + sk_charge_skb(sk, skb); + + /* Queue it, remembering where we must start sending. */ + if (sk->sk_send_head == NULL) + sk->sk_send_head = skb; +} + +static inline void tcp_tso_set_push(struct sk_buff *skb) +{ + /* Force push to be on for any TSO frames to workaround + * problems with busted implementations like Mac OS-X that + * hold off socket receive wakeups until push is seen. + */ + if (tcp_skb_pcount(skb) > 1) + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; +} + +/* Send _single_ skb sitting at the send head. This function requires + * true push pending frames to setup probe timer etc. + */ +void tcp_push_one(struct sock *sk, unsigned cur_mss) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = sk->sk_send_head; + + if (tcp_snd_test(tp, skb, cur_mss, TCP_NAGLE_PUSH)) { + /* Send it out now. */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation))) { + sk->sk_send_head = NULL; + tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; + tcp_packets_out_inc(sk, tp, skb); + return; + } + } +} + +void tcp_set_skb_tso_segs(struct sk_buff *skb, unsigned int mss_std) +{ + if (skb->len <= mss_std) { + /* Avoid the costly divide in the normal + * non-TSO case. + */ + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + } else { + unsigned int factor; + + factor = skb->len + (mss_std - 1); + factor /= mss_std; + skb_shinfo(skb)->tso_segs = factor; + skb_shinfo(skb)->tso_size = mss_std; + } +} + +/* Function to create two new TCP segments. Shrinks the given segment + * to the specified size and appends a new segment with the rest of the + * packet to the list. This won't be called frequently, I hope. + * Remember, these are still headerless SKBs at this point. + */ +static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + int nsize; + u16 flags; + + nsize = skb_headlen(skb) - len; + if (nsize < 0) + nsize = 0; + + if (skb_cloned(skb) && + skb_is_nonlinear(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + /* Get a new skb... force flag on. */ + buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC); + if (buff == NULL) + return -ENOMEM; /* We'll just try again later. */ + sk_charge_skb(sk, buff); + + /* Correct the sequence numbers. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len; + TCP_SKB_CB(buff)->end_seq = TCP_SKB_CB(skb)->end_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(buff)->seq; + + /* PSH and FIN should only be set in the second packet. 
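+ *
+ * Illustrative aside, not part of the original patch (the numbers are
+ * invented): splitting an skb with seq == 100, end_seq == 300 at
+ * len == 120 leaves the original covering [100, 220) and the new buff
+ * covering [220, 300); any PSH or FIN in the original flags is moved
+ * onto buff below, because only the tail of the data may carry them.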
*/ + flags = TCP_SKB_CB(skb)->flags; + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(buff)->flags = flags; + TCP_SKB_CB(buff)->sacked = + (TCP_SKB_CB(skb)->sacked & + (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL)); + TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; + + if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) { + /* Copy and checksum data tail into the new buffer. */ + buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + nsize, 0); + + skb_trim(skb, len); + + skb->csum = csum_block_sub(skb->csum, buff->csum, len); + } else { + skb->ip_summed = CHECKSUM_HW; + skb_split(skb, buff, len); + } + + buff->ip_summed = skb->ip_summed; + + /* Looks stupid, but our code really uses the "when" of + * skbs, which it never sent before. --ANK + */ + TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tp->lost_out -= tcp_skb_pcount(skb); + tp->left_out -= tcp_skb_pcount(skb); + } + + /* Fix up tso_factor for both original and new SKB. */ + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + tcp_set_skb_tso_segs(buff, tp->mss_cache_std); + + if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) { + tp->lost_out += tcp_skb_pcount(skb); + tp->left_out += tcp_skb_pcount(skb); + } + + if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) { + tp->lost_out += tcp_skb_pcount(buff); + tp->left_out += tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ + __skb_append(skb, buff); + + return 0; +} + +/* This is similar to __pskb_pull_head() (it will go to core/skbuff.c + * eventually). The difference is that pulled data is not copied, but + * immediately discarded. + */ +static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len) +{ + int i, k, eat; + + eat = len; + k = 0; + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + if (skb_shinfo(skb)->frags[i].size <= eat) { + put_page(skb_shinfo(skb)->frags[i].page); + eat -= skb_shinfo(skb)->frags[i].size; + } else { + skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i]; + if (eat) { + skb_shinfo(skb)->frags[k].page_offset += eat; + skb_shinfo(skb)->frags[k].size -= eat; + eat = 0; + } + k++; + } + } + skb_shinfo(skb)->nr_frags = k; + + skb->tail = skb->data; + skb->data_len -= len; + skb->len = skb->data_len; + return skb->tail; +} + +int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) +{ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return -ENOMEM; + + if (len <= skb_headlen(skb)) { + __skb_pull(skb, len); + } else { + if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL) + return -ENOMEM; + } + + TCP_SKB_CB(skb)->seq += len; + skb->ip_summed = CHECKSUM_HW; + + skb->truesize -= len; + sk->sk_wmem_queued -= len; + sk->sk_forward_alloc += len; + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + + /* Any change of skb->len requires recalculation of tso + * factor and mss. + */ + if (tcp_skb_pcount(skb) > 1) + tcp_set_skb_tso_segs(skb, tcp_skb_mss(skb)); + + return 0; +} + +/* This function synchronizes snd mss to current pmtu/exthdr set. + + tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT count + TCP options, but includes only the bare TCP header. + + tp->rx_opt.mss_clamp is mss negotiated at connection setup. + It is the minimum of user_mss and the mss received with the SYN. + It also does not include TCP options. + + tp->pmtu_cookie is the last pmtu seen by this function. + + tp->mss_cache is current effective sending mss, including + all tcp options except for SACKs.
It is evaluated, + taking into account current pmtu, but never exceeds + tp->rx_opt.mss_clamp. + + NOTE1. rfc1122 clearly states that advertised MSS + DOES NOT include either tcp or ip options. + + NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside + this function. --ANK (980731) + */ + +unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) +{ + struct tcp_sock *tp = tcp_sk(sk); + int mss_now; + + /* Calculate base mss without TCP options: + It is MMS_S - sizeof(tcphdr) of rfc1122 + */ + mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + + /* Clamp it (mss_clamp does not include tcp options) */ + if (mss_now > tp->rx_opt.mss_clamp) + mss_now = tp->rx_opt.mss_clamp; + + /* Now subtract optional transport overhead */ + mss_now -= tp->ext_header_len; + + /* Then reserve room for full set of TCP options and 8 bytes of data */ + if (mss_now < 48) + mss_now = 48; + + /* Now subtract TCP options size, not including SACKs */ + mss_now -= tp->tcp_header_len - sizeof(struct tcphdr); + + /* Bound mss with half of window */ + if (tp->max_window && mss_now > (tp->max_window>>1)) + mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); + + /* And store cached results */ + tp->pmtu_cookie = pmtu; + tp->mss_cache = tp->mss_cache_std = mss_now; + + return mss_now; +} + +/* Compute the current effective MSS, taking SACKs and IP options, + * and even PMTU discovery events into account. + * + * LARGESEND note: !urg_mode is overkill, only frames up to snd_up + * cannot be large. However, taking into account rare use of URG, this + * is not a big flaw. + */ + +unsigned int tcp_current_mss(struct sock *sk, int large) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct dst_entry *dst = __sk_dst_get(sk); + unsigned int do_large, mss_now; + + mss_now = tp->mss_cache_std; + if (dst) { + u32 mtu = dst_mtu(dst); + if (mtu != tp->pmtu_cookie) + mss_now = tcp_sync_mss(sk, mtu); + } + + do_large = (large && + (sk->sk_route_caps & NETIF_F_TSO) && + !tp->urg_mode); + + if (do_large) { + unsigned int large_mss, factor, limit; + + large_mss = 65535 - tp->af_specific->net_header_len - + tp->ext_header_len - tp->tcp_header_len; + + if (tp->max_window && large_mss > (tp->max_window>>1)) + large_mss = max((tp->max_window>>1), + 68U - tp->tcp_header_len); + + factor = large_mss / mss_now; + + /* Always keep large mss multiple of real mss, but + * do not exceed 1/tso_win_divisor of the congestion window + * so we can keep the ACK clock ticking and minimize + * bursting. + */ + limit = tp->snd_cwnd; + if (sysctl_tcp_tso_win_divisor) + limit /= sysctl_tcp_tso_win_divisor; + limit = max(1U, limit); + if (factor > limit) + factor = limit; + + tp->mss_cache = mss_now * factor; + + mss_now = tp->mss_cache; + } + + if (tp->rx_opt.eff_sacks) + mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + + (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); + return mss_now; +} + +/* This routine writes packets to the network. It advances the + * send_head. This happens as incoming acks open up the remote + * window for us. + * + * Returns 1, if no segments are in flight and we have queued segments, but + * cannot send anything now because of SWS or another problem. + */ +int tcp_write_xmit(struct sock *sk, int nonagle) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int mss_now; + + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and all + * will be happy. 
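+ *
+ * Illustrative aside on tcp_sync_mss() above, not part of the original
+ * patch (assumes plain IPv4, no extension headers, timestamps on and
+ * an mss_clamp of at least 1460): a pmtu of 1500 gives
+ * 1500 - 20 - 20 == 1460, and subtracting the 12 bytes of aligned
+ * timestamp option (tcp_header_len - 20) caches a sending mss of 1448,
+ * while the mss advertised in the SYN typically stays 1460, since the
+ * advertised value never counts options (NOTE1 above).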
+ */ + if (sk->sk_state != TCP_CLOSE) { + struct sk_buff *skb; + int sent_pkts = 0; + + /* Account for SACKS, we may need to fragment due to this. + * It is just like the real MSS changing on us midstream. + * We also handle things correctly when the user adds some + * IP options mid-stream. Silly to do, but cover it. + */ + mss_now = tcp_current_mss(sk, 1); + + while ((skb = sk->sk_send_head) && + tcp_snd_test(tp, skb, mss_now, + tcp_skb_is_last(sk, skb) ? nonagle : + TCP_NAGLE_PUSH)) { + if (skb->len > mss_now) { + if (tcp_fragment(sk, skb, mss_now)) + break; + } + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) + break; + + /* Advance the send_head. This one is sent out. + * This call will increment packets_out. + */ + update_send_head(sk, tp, skb); + + tcp_minshall_update(tp, mss_now, skb); + sent_pkts = 1; + } + + if (sent_pkts) { + tcp_cwnd_validate(sk, tp); + return 0; + } + + return !tp->packets_out && sk->sk_send_head; + } + return 0; +} + +/* This function returns the amount that we can raise the + * usable window based on the following constraints + * + * 1. The window can never be shrunk once it is offered (RFC 793) + * 2. We limit memory per socket + * + * RFC 1122: + * "the suggested [SWS] avoidance algorithm for the receiver is to keep + * RECV.NEXT + RCV.WIN fixed until: + * RCV.BUFF - RCV.USER - RCV.WINDOW >= min(1/2 RCV.BUFF, MSS)" + * + * i.e. don't raise the right edge of the window until you can raise + * it at least MSS bytes. + * + * Unfortunately, the recommended algorithm breaks header prediction, + * since header prediction assumes th->window stays fixed. + * + * Strictly speaking, keeping th->window fixed violates the receiver + * side SWS prevention criteria. The problem is that under this rule + * a stream of single byte packets will cause the right side of the + * window to always advance by a single byte. + * + * Of course, if the sender implements sender side SWS prevention + * then this will not be a problem. + * + * BSD seems to make the following compromise: + * + * If the free space is less than the 1/4 of the maximum + * space available and the free space is less than 1/2 mss, + * then set the window to 0. + * [ Actually, bsd uses MSS and 1/4 of maximal _window_ ] + * Otherwise, just prevent the window from shrinking + * and from being larger than the largest representable value. + * + * This prevents incremental opening of the window in the regime + * where TCP is limited by the speed of the reader side taking + * data out of the TCP receive queue. It does nothing about + * those cases where the window is constrained on the sender side + * because the pipeline is full. + * + * BSD also seems to "accidentally" limit itself to windows that are a + * multiple of MSS, at least until the free space gets quite small. + * This would appear to be a side effect of the mbuf implementation. + * Combining these two algorithms results in the observed behavior + * of having a fixed window size at almost all times. + * + * Below we obtain similar behavior by forcing the offered window to + * a multiple of the mss when it is feasible to do so. + * + * Note, we don't "adjust" for TIMESTAMP or SACK option bytes. + * Regular options like TIMESTAMP are taken into account. + */ +u32 __tcp_select_window(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + /* MSS for the peer's data. Previous verions used mss_clamp + * here. 
I don't know if the value based on our guesses + * of peer's MSS is better for performance. It's more correct + * but may be worse for performance because of rcv_mss + * fluctuations. --SAW 1998/11/1 + */ + int mss = tp->ack.rcv_mss; + int free_space = tcp_space(sk); + int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk)); + int window; + + if (mss > full_space) + mss = full_space; + + if (free_space < full_space/2) { + tp->ack.quick = 0; + + if (tcp_memory_pressure) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + + if (free_space < mss) + return 0; + } + + if (free_space > tp->rcv_ssthresh) + free_space = tp->rcv_ssthresh; + + /* Don't do rounding if we are using window scaling, since the + * scaled window will not line up with the MSS boundary anyway. + */ + window = tp->rcv_wnd; + if (tp->rx_opt.rcv_wscale) { + window = free_space; + + /* Advertise enough space so that it won't get scaled away. + * Important case: prevent zero window announcement if + * 1<<rcv_wscale > mss. + */ + if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) + window = (((window >> tp->rx_opt.rcv_wscale) + 1) + << tp->rx_opt.rcv_wscale); + } else { + /* Get the largest window that is a nice multiple of mss. + * Window clamp already applied above. + * If our current window offering is within 1 mss of the + * free space we just keep it. This prevents the divide + * and multiply from happening most of the time. + * We also don't do any window rounding when the free space + * is too small. + */ + if (window <= free_space - mss || window > free_space) + window = (free_space/mss)*mss; + } + + return window; +} + +/* Attempt to collapse two adjacent SKB's during retransmission. */ +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *next_skb = skb->next; + + /* The first test we must make is that neither of these two + * SKB's are still referenced by someone else. + */ + if (!skb_cloned(skb) && !skb_cloned(next_skb)) { + int skb_size = skb->len, next_skb_size = next_skb->len; + u16 flags = TCP_SKB_CB(skb)->flags; + + /* Also punt if next skb has been SACK'd. */ + if(TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) + return; + + /* Next skb is out of window. */ + if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) + return; + + /* Punt if not enough space exists in the first SKB for + * the data in the second, or the total combined payload + * would exceed the MSS. + */ + if ((next_skb_size > skb_tailroom(skb)) || + ((skb_size + next_skb_size) > mss_now)) + return; + + BUG_ON(tcp_skb_pcount(skb) != 1 || + tcp_skb_pcount(next_skb) != 1); + + /* Ok. We will be able to collapse the packet. */ + __skb_unlink(next_skb, next_skb->list); + + memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size); + + if (next_skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_HW; + + if (skb->ip_summed != CHECKSUM_HW) + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags = flags; + + /* All done, get rid of second SKB and account for it so + * packet counting does not break.
+ */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(next_skb); + if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) { + tp->lost_out -= tcp_skb_pcount(next_skb); + tp->left_out -= tcp_skb_pcount(next_skb); + } + /* Reno case is special. Sigh... */ + if (!tp->rx_opt.sack_ok && tp->sacked_out) { + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + tp->left_out -= tcp_skb_pcount(next_skb); + } + + /* Not quite right: it can be > snd.fack, but + * it is better to underestimate fackets. + */ + tcp_dec_pcount_approx(&tp->fackets_out, next_skb); + tcp_packets_out_dec(tp, next_skb); + sk_stream_free_skb(sk, next_skb); + } +} + +/* Do a simple retransmit without using the backoff mechanisms in + * tcp_timer. This is used for path mtu discovery. + * The socket is already locked here. + */ +void tcp_simple_retransmit(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + unsigned int mss = tcp_current_mss(sk, 0); + int lost = 0; + + sk_stream_for_retrans_queue(skb, sk) { + if (skb->len > mss && + !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; + tp->retrans_out -= tcp_skb_pcount(skb); + } + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); + lost = 1; + } + } + } + + if (!lost) + return; + + tcp_sync_left_out(tp); + + /* Don't muck with the congestion window here. + * Reason is that we do not increase amount of _data_ + * in network, but units changed and effective + * cwnd/ssthresh really reduced now. + */ + if (tp->ca_state != TCP_CA_Loss) { + tp->high_seq = tp->snd_nxt; + tp->snd_ssthresh = tcp_current_ssthresh(tp); + tp->prior_ssthresh = 0; + tp->undo_marker = 0; + tcp_set_ca_state(tp, TCP_CA_Loss); + } + tcp_xmit_retransmit_queue(sk); +} + +/* This retransmits one SKB. Policy decisions and retransmit queue + * state updates are done by the caller. Returns non-zero if an + * error occurred which prevented the send. + */ +int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int cur_mss = tcp_current_mss(sk, 0); + int err; + + /* Do not sent more than we queued. 1/4 is reserved for possible + * copying overhead: frgagmentation, tunneling, mangling etc. + */ + if (atomic_read(&sk->sk_wmem_alloc) > + min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf)) + return -EAGAIN; + + if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) { + if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + BUG(); + + if (sk->sk_route_caps & NETIF_F_TSO) { + sk->sk_route_caps &= ~NETIF_F_TSO; + sock_set_flag(sk, SOCK_NO_LARGESEND); + tp->mss_cache = tp->mss_cache_std; + } + + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) + return -ENOMEM; + } + + /* If receiver has shrunk his window, and skb is out of + * new window, do not retransmit it. The exception is the + * case, when window is shrunk to zero. In this case + * our retransmit serves as a zero window probe. + */ + if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd) + && TCP_SKB_CB(skb)->seq != tp->snd_una) + return -EAGAIN; + + if (skb->len > cur_mss) { + int old_factor = tcp_skb_pcount(skb); + int new_factor; + + if (tcp_fragment(sk, skb, cur_mss)) + return -ENOMEM; /* We'll try again later. */ + + /* New SKB created, account for it. 
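+ *
+ * Illustrative aside, not part of the original patch (the numbers are
+ * invented and assume the cached mss equals cur_mss == 1460): a 4000
+ * byte TSO skb counts as 3 packets; after tcp_fragment() the head is
+ * 1460 bytes (1 packet) and the tail 2540 bytes (2 packets), so the
+ * adjustment below, -(3 - 1) + 2, leaves packets_out unchanged.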
*/ + new_factor = tcp_skb_pcount(skb); + tp->packets_out -= old_factor - new_factor; + tp->packets_out += tcp_skb_pcount(skb->next); + } + + /* Collapse two adjacent packets if worthwhile and we can. */ + if(!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && + (skb->len < (cur_mss >> 1)) && + (skb->next != sk->sk_send_head) && + (skb->next != (struct sk_buff *)&sk->sk_write_queue) && + (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(skb->next)->nr_frags == 0) && + (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(skb->next) == 1) && + (sysctl_tcp_retrans_collapse != 0)) + tcp_retrans_try_collapse(sk, skb, cur_mss); + + if(tp->af_specific->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + /* Some Solaris stacks overoptimize and ignore the FIN on a + * retransmit when old data is attached. So strip it off + * since it is cheap to do so and saves bytes on the network. + */ + if(skb->len > 0 && + (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && + tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { + if (!pskb_trim(skb, 0)) { + TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + skb->ip_summed = CHECKSUM_NONE; + skb->csum = 0; + } + } + + /* Make a copy, if the first transmission SKB clone we made + * is still in somebody's hands, else make a clone. + */ + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + + err = tcp_transmit_skb(sk, (skb_cloned(skb) ? + pskb_copy(skb, GFP_ATOMIC): + skb_clone(skb, GFP_ATOMIC))); + + if (err == 0) { + /* Update global TCP statistics. */ + TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + + tp->total_retrans++; + +#if FASTRETRANS_DEBUG > 0 + if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (net_ratelimit()) + printk(KERN_DEBUG "retrans_out leaked.\n"); + } +#endif + TCP_SKB_CB(skb)->sacked |= TCPCB_RETRANS; + tp->retrans_out += tcp_skb_pcount(skb); + + /* Save stamp of the first retransmit. */ + if (!tp->retrans_stamp) + tp->retrans_stamp = TCP_SKB_CB(skb)->when; + + tp->undo_retrans++; + + /* snd_nxt is stored to detect loss of retransmitted segment, + * see tcp_input.c tcp_sacktag_write_queue(). + */ + TCP_SKB_CB(skb)->ack_seq = tp->snd_nxt; + } + return err; +} + +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + int packet_cnt = tp->lost_out; + + /* First pass: retransmit lost packets. */ + if (packet_cnt) { + sk_stream_for_retrans_queue(skb, sk) { + __u8 sacked = TCP_SKB_CB(skb)->sacked; + + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. 
+ */ + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + return; + + if (sacked&TCPCB_LOST) { + if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { + if (tcp_retransmit_skb(sk, skb)) + return; + if (tp->ca_state != TCP_CA_Loss) + NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); + + if (skb == + skb_peek(&sk->sk_write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + } + + packet_cnt -= tcp_skb_pcount(skb); + if (packet_cnt <= 0) + break; + } + } + } + + /* OK, demanded retransmission is finished. */ + + /* Forward retransmissions are possible only during Recovery. */ + if (tp->ca_state != TCP_CA_Recovery) + return; + + /* No forward retransmissions in Reno are possible. */ + if (!tp->rx_opt.sack_ok) + return; + + /* Yeah, we have to make difficult choice between forward transmission + * and retransmission... Both ways have their merits... + * + * For now we do not retransmit anything, while we have some new + * segments to send. + */ + + if (tcp_may_send_now(sk, tp)) + return; + + packet_cnt = 0; + + sk_stream_for_retrans_queue(skb, sk) { + /* Similar to the retransmit loop above we + * can pretend that the retransmitted SKB + * we send out here will be composed of one + * real MSS sized packet because tcp_retransmit_skb() + * will fragment it if necessary. + */ + if (++packet_cnt > tp->fackets_out) + break; + + if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) + break; + + if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + continue; + + /* Ok, retransmit it. */ + if (tcp_retransmit_skb(sk, skb)) + break; + + if (skb == skb_peek(&sk->sk_write_queue)) + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + + NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); + } +} + + +/* Send a fin. The caller locks the socket for us. This cannot be + * allowed to fail queueing a FIN frame under any circumstances. + */ +void tcp_send_fin(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue); + int mss_now; + + /* Optimization, tack on the FIN if we have a queue of + * unsent frames. But be careful about outgoing SACKS + * and IP options. + */ + mss_now = tcp_current_mss(sk, 1); + + if (sk->sk_send_head != NULL) { + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN; + TCP_SKB_CB(skb)->end_seq++; + tp->write_seq++; + } else { + /* Socket is locked, keep trying until memory is available. */ + for (;;) { + skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL); + if (skb) + break; + yield(); + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + + /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ + TCP_SKB_CB(skb)->seq = tp->write_seq; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_queue_skb(sk, skb); + } + __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_OFF); +} + +/* We get here when a process closes a file descriptor (either due to + * an explicit close() or as a byproduct of exit()'ing) and there + * was unread data in the receive queue. This behavior is recommended + * by draft-ietf-tcpimpl-prob-03.txt section 3.10. -DaveM + */ +void tcp_send_active_reset(struct sock *sk, int priority) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* NOTE: No TCP options attached and we never retransmit this. 
*/ + skb = alloc_skb(MAX_TCP_HEADER, priority); + if (!skb) { + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + + /* Send it off. */ + TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_transmit_skb(sk, skb)) + NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); +} + +/* WARNING: This routine must only be called when we have already sent + * a SYN packet that crossed the incoming SYN that caused this routine + * to get called. If this assumption fails then the initial rcv_wnd + * and rcv_wscale values will not be correct. + */ +int tcp_send_synack(struct sock *sk) +{ + struct sk_buff* skb; + + skb = skb_peek(&sk->sk_write_queue); + if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); + return -EFAULT; + } + if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (skb_cloned(skb)) { + struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); + if (nskb == NULL) + return -ENOMEM; + __skb_unlink(skb, &sk->sk_write_queue); + skb_header_release(nskb); + __skb_queue_head(&sk->sk_write_queue, nskb); + sk_stream_free_skb(sk, skb); + sk_charge_skb(sk, nskb); + skb = nskb; + } + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ACK; + TCP_ECN_send_synack(tcp_sk(sk), skb); + } + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); +} + +/* + * Prepare a SYN-ACK. + */ +struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct open_request *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcphdr *th; + int tcp_header_size; + struct sk_buff *skb; + + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); + if (skb == NULL) + return NULL; + + /* Reserve space for headers. */ + skb_reserve(skb, MAX_TCP_HEADER); + + skb->dst = dst_clone(dst); + + tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + + (req->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (req->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + /* SACK_PERM is in the place of NOP NOP of TS */ + ((req->sack_ok && !req->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size); + + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + if (dst->dev->features&NETIF_F_TSO) + req->ecn_ok = 0; + TCP_ECN_make_synack(req, th); + th->source = inet_sk(sk)->sport; + th->dest = req->rmt_port; + TCP_SKB_CB(skb)->seq = req->snt_isn; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + TCP_SKB_CB(skb)->sacked = 0; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(req->rcv_isn + 1); + if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ + __u8 rcv_wscale; + /* Set this up on the first call only */ + req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + /* tcp_full_space because it is guaranteed to be the first packet */ + tcp_select_initial_window(tcp_full_space(sk), + dst_metric(dst, RTAX_ADVMSS) - (req->tstamp_ok ? 
TCPOLEN_TSTAMP_ALIGNED : 0), + &req->rcv_wnd, + &req->window_clamp, + req->wscale_ok, + &rcv_wscale); + req->rcv_wscale = rcv_wscale; + } + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ + th->window = htons(req->rcv_wnd); + + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), req->tstamp_ok, + req->sack_ok, req->wscale_ok, req->rcv_wscale, + TCP_SKB_CB(skb)->when, + req->ts_recent); + + skb->csum = 0; + th->doff = (tcp_header_size >> 2); + TCP_INC_STATS(TCP_MIB_OUTSEGS); + return skb; +} + +/* + * Do all connect socket setups that can be done AF independent. + */ +static inline void tcp_connect_init(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + struct tcp_sock *tp = tcp_sk(sk); + __u8 rcv_wscale; + + /* We'll fix this up when we get a response from the other end. + * See tcp_input.c:tcp_rcv_state_process case TCP_SYN_SENT. + */ + tp->tcp_header_len = sizeof(struct tcphdr) + + (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0); + + /* If user gave his TCP_MAXSEG, record it to clamp */ + if (tp->rx_opt.user_mss) + tp->rx_opt.mss_clamp = tp->rx_opt.user_mss; + tp->max_window = 0; + tcp_sync_mss(sk, dst_mtu(dst)); + + if (!tp->window_clamp) + tp->window_clamp = dst_metric(dst, RTAX_WINDOW); + tp->advmss = dst_metric(dst, RTAX_ADVMSS); + tcp_initialize_rcv_mss(sk); + tcp_ca_init(tp); + + tcp_select_initial_window(tcp_full_space(sk), + tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), + &tp->rcv_wnd, + &tp->window_clamp, + sysctl_tcp_window_scaling, + &rcv_wscale); + + tp->rx_opt.rcv_wscale = rcv_wscale; + tp->rcv_ssthresh = tp->rcv_wnd; + + sk->sk_err = 0; + sock_reset_flag(sk, SOCK_DONE); + tp->snd_wnd = 0; + tcp_init_wl(tp, tp->write_seq, 0); + tp->snd_una = tp->write_seq; + tp->snd_sml = tp->write_seq; + tp->rcv_nxt = 0; + tp->rcv_wup = 0; + tp->copied_seq = 0; + + tp->rto = TCP_TIMEOUT_INIT; + tp->retransmits = 0; + tcp_clear_retrans(tp); +} + +/* + * Build a SYN and send it off. + */ +int tcp_connect(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + + tcp_connect_init(sk); + + buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation); + if (unlikely(buff == NULL)) + return -ENOBUFS; + + /* Reserve space for headers. */ + skb_reserve(buff, MAX_TCP_HEADER); + + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; + TCP_ECN_send_syn(sk, tp, buff); + TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; + buff->csum = 0; + TCP_SKB_CB(buff)->seq = tp->write_seq++; + TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; + tcp_ca_init(tp); + + /* Send it off. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tp->retrans_stamp = TCP_SKB_CB(buff)->when; + skb_header_release(buff); + __skb_queue_tail(&sk->sk_write_queue, buff); + sk_charge_skb(sk, buff); + tp->packets_out += tcp_skb_pcount(buff); + tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL)); + TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + + /* Timer for repeating the SYN until an answer. */ + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + return 0; +} + +/* Send out a delayed ack, the caller does the policy checking + * to see if we should even be here. See tcp_input.c:tcp_ack_snd_check() + * for details. 
+ */ +void tcp_send_delayed_ack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int ato = tp->ack.ato; + unsigned long timeout; + + if (ato > TCP_DELACK_MIN) { + int max_ato = HZ/2; + + if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED)) + max_ato = TCP_DELACK_MAX; + + /* Slow path, intersegment interval is "high". */ + + /* If some rtt estimate is known, use it to bound delayed ack. + * Do not use tp->rto here, use results of rtt measurements + * directly. + */ + if (tp->srtt) { + int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + + if (rtt < max_ato) + max_ato = rtt; + } + + ato = min(ato, max_ato); + } + + /* Stay within the limit we were given */ + timeout = jiffies + ato; + + /* Use new timeout only if there wasn't a older one earlier. */ + if (tp->ack.pending&TCP_ACK_TIMER) { + /* If delack timer was blocked or is about to expire, + * send ACK now. + */ + if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) { + tcp_send_ack(sk); + return; + } + + if (!time_before(timeout, tp->ack.timeout)) + timeout = tp->ack.timeout; + } + tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER; + tp->ack.timeout = timeout; + sk_reset_timer(sk, &tp->delack_timer, timeout); +} + +/* This routine sends an ack and also updates the window. */ +void tcp_send_ack(struct sock *sk) +{ + /* If we have been reset, we may not send again. */ + if (sk->sk_state != TCP_CLOSE) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; + + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + tcp_schedule_ack(tp); + tp->ack.ato = TCP_ATO_MIN; + tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX); + return; + } + + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_TCP_HEADER); + buff->csum = 0; + TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(buff)->sacked = 0; + skb_shinfo(buff)->tso_segs = 1; + skb_shinfo(buff)->tso_size = 0; + + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp); + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff); + } +} + +/* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. + * + * Question: what should we make while urgent mode? + * 4.4BSD forces sending single byte of data. We cannot send + * out of window data, because we have SND.NXT==SND.MAX... + * + * Current solution: to send TWO zero-length segments in urgent mode: + * one is with SEG.SEQ=SND.UNA to deliver urgent pointer, another is + * out-of-date with SND.UNA-1 to probe window. + */ +static int tcp_xmit_probe_skb(struct sock *sk, int urgent) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + /* We don't queue it, tcp_transmit_skb() sets ownership. */ + skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (skb == NULL) + return -1; + + /* Reserve space for headers and set control bits. */ + skb_reserve(skb, MAX_TCP_HEADER); + skb->csum = 0; + TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; + TCP_SKB_CB(skb)->sacked = urgent; + skb_shinfo(skb)->tso_segs = 1; + skb_shinfo(skb)->tso_size = 0; + + /* Use a previous sequence. This should cause the other + * end to send an ack. Don't queue or clone SKB, just + * send it. + */ + TCP_SKB_CB(skb)->seq = urgent ? 
tp->snd_una : tp->snd_una - 1; + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + return tcp_transmit_skb(sk, skb); +} + +int tcp_write_wakeup(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + + if ((skb = sk->sk_send_head) != NULL && + before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { + int err; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + if (tcp_fragment(sk, skb, seg_size)) + return -1; + /* SWS override triggered forced fragmentation. + * Disable TSO, the connection is too sick. */ + if (sk->sk_route_caps & NETIF_F_TSO) { + sock_set_flag(sk, SOCK_NO_LARGESEND); + sk->sk_route_caps &= ~NETIF_F_TSO; + tp->mss_cache = tp->mss_cache_std; + } + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(skb, tp->mss_cache_std); + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_tso_set_push(skb); + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); + if (!err) { + update_send_head(sk, tp, skb); + } + return err; + } else { + if (tp->urg_mode && + between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF)) + tcp_xmit_probe_skb(sk, TCPCB_URG); + return tcp_xmit_probe_skb(sk, 0); + } + } + return -1; +} + +/* A window probe timeout has occurred. If window is not closed send + * a partial packet else a zero probe. + */ +void tcp_send_probe0(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int err; + + err = tcp_write_wakeup(sk); + + if (tp->packets_out || !sk->sk_send_head) { + /* Cancel probe timer, if it is not required. */ + tp->probes_out = 0; + tp->backoff = 0; + return; + } + + if (err <= 0) { + if (tp->backoff < sysctl_tcp_retries2) + tp->backoff++; + tp->probes_out++; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RTO_MAX)); + } else { + /* If packet was not sent due to local congestion, + * do not backoff and do not remember probes_out. + * Let local senders to fight for local resources. + * + * Use accumulated backoff yet. + */ + if (!tp->probes_out) + tp->probes_out=1; + tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, + min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL)); + } +} + +EXPORT_SYMBOL(tcp_connect); +EXPORT_SYMBOL(tcp_make_synack); +EXPORT_SYMBOL(tcp_simple_retransmit); +EXPORT_SYMBOL(tcp_sync_mss); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c new file mode 100644 index 000000000000..85b279f1e935 --- /dev/null +++ b/net/ipv4/tcp_timer.c @@ -0,0 +1,656 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Implementation of the Transmission Control Protocol(TCP). + * + * Version: $Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. 
van Kempen, + * Mark Evans, + * Corey Minyard + * Florian La Roche, + * Charles Hedrick, + * Linus Torvalds, + * Alan Cox, + * Matthew Dillon, + * Arnt Gulbrandsen, + * Jorge Cwik, + */ + +#include <linux/module.h> +#include <net/tcp.h> + +int sysctl_tcp_syn_retries = TCP_SYN_RETRIES; +int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES; +int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME; +int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES; +int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL; +int sysctl_tcp_retries1 = TCP_RETR1; +int sysctl_tcp_retries2 = TCP_RETR2; +int sysctl_tcp_orphan_retries; + +static void tcp_write_timer(unsigned long); +static void tcp_delack_timer(unsigned long); +static void tcp_keepalive_timer (unsigned long data); + +#ifdef TCP_DEBUG +const char tcp_timer_bug_msg[] = KERN_DEBUG "tcpbug: unknown timer value\n"; +EXPORT_SYMBOL(tcp_timer_bug_msg); +#endif + +/* + * Using different timers for retransmit, delayed acks and probes. + * We may wish to use just one timer maintaining a list of expire jiffies + * to optimize. + */ + +void tcp_init_xmit_timers(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + init_timer(&tp->retransmit_timer); + tp->retransmit_timer.function=&tcp_write_timer; + tp->retransmit_timer.data = (unsigned long) sk; + tp->pending = 0; + + init_timer(&tp->delack_timer); + tp->delack_timer.function=&tcp_delack_timer; + tp->delack_timer.data = (unsigned long) sk; + tp->ack.pending = 0; + + init_timer(&sk->sk_timer); + sk->sk_timer.function = &tcp_keepalive_timer; + sk->sk_timer.data = (unsigned long)sk; +} + +void tcp_clear_xmit_timers(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->pending = 0; + sk_stop_timer(sk, &tp->retransmit_timer); + + tp->ack.pending = 0; + tp->ack.blocked = 0; + sk_stop_timer(sk, &tp->delack_timer); + + sk_stop_timer(sk, &sk->sk_timer); +} + +static void tcp_write_err(struct sock *sk) +{ + sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT; + sk->sk_error_report(sk); + + tcp_done(sk); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT); +} + +/* Do not allow orphaned sockets to eat all our resources. + * This is a direct violation of the TCP specs, but it is required + * to prevent DoS attacks. It is called when a retransmission timeout + * or zero probe timeout occurs on an orphaned socket. + * + * The criterion is still not confirmed experimentally and may change. + * We kill the socket if: + * 1. The number of orphaned sockets exceeds an administratively configured + * limit. + * 2. We are under strong memory pressure. + */ +static int tcp_out_of_resources(struct sock *sk, int do_reset) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orphans = atomic_read(&tcp_orphan_count); + + /* If the peer does not open its window for a long time, or did not + * transmit anything for a long time, penalize it. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset) + orphans <<= 1; + + /* If some dubious ICMP arrived, penalize even more. */ + if (sk->sk_err_soft) + orphans <<= 1; + + if (orphans >= sysctl_tcp_max_orphans || + (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && + atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + if (net_ratelimit()) + printk(KERN_INFO "Out of socket memory\n"); + + /* Catch exceptional cases, when connection requires reset. + * 1. Last segment was sent recently. */ + if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || + /* 2. Window is closed.
*/ + (!tp->snd_wnd && !tp->packets_out)) + do_reset = 1; + if (do_reset) + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_done(sk); + NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + return 1; + } + return 0; +} + +/* Calculate maximal number or retries on an orphaned socket. */ +static int tcp_orphan_retries(struct sock *sk, int alive) +{ + int retries = sysctl_tcp_orphan_retries; /* May be zero. */ + + /* We know from an ICMP that something is wrong. */ + if (sk->sk_err_soft && !alive) + retries = 0; + + /* However, if socket sent something recently, select some safe + * number of retries. 8 corresponds to >100 seconds with minimal + * RTO of 200msec. */ + if (retries == 0 && alive) + retries = 8; + return retries; +} + +/* A write timeout has occurred. Process the after effects. */ +static int tcp_write_timeout(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int retry_until; + + if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { + if (tp->retransmits) + dst_negative_advice(&sk->sk_dst_cache); + retry_until = tp->syn_retries ? : sysctl_tcp_syn_retries; + } else { + if (tp->retransmits >= sysctl_tcp_retries1) { + /* NOTE. draft-ietf-tcpimpl-pmtud-01.txt requires pmtu black + hole detection. :-( + + It is place to make it. It is not made. I do not want + to make it. It is disguisting. It does not work in any + case. Let me to cite the same draft, which requires for + us to implement this: + + "The one security concern raised by this memo is that ICMP black holes + are often caused by over-zealous security administrators who block + all ICMP messages. It is vitally important that those who design and + deploy security systems understand the impact of strict filtering on + upper-layer protocols. The safest web site in the world is worthless + if most TCP implementations cannot transfer data from it. It would + be far nicer to have all of the black holes fixed rather than fixing + all of the TCP implementations." + + Golden words :-). + */ + + dst_negative_advice(&sk->sk_dst_cache); + } + + retry_until = sysctl_tcp_retries2; + if (sock_flag(sk, SOCK_DEAD)) { + int alive = (tp->rto < TCP_RTO_MAX); + + retry_until = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || tp->retransmits < retry_until)) + return 1; + } + } + + if (tp->retransmits >= retry_until) { + /* Has it gone just too far? */ + tcp_write_err(sk); + return 1; + } + return 0; +} + +static void tcp_delack_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_sock *tp = tcp_sk(sk); + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. */ + tp->ack.blocked = 1; + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED); + sk_reset_timer(sk, &tp->delack_timer, jiffies + TCP_DELACK_MIN); + goto out_unlock; + } + + sk_stream_mem_reclaim(sk); + + if (sk->sk_state == TCP_CLOSE || !(tp->ack.pending & TCP_ACK_TIMER)) + goto out; + + if (time_after(tp->ack.timeout, jiffies)) { + sk_reset_timer(sk, &tp->delack_timer, tp->ack.timeout); + goto out; + } + tp->ack.pending &= ~TCP_ACK_TIMER; + + if (skb_queue_len(&tp->ucopy.prequeue)) { + struct sk_buff *skb; + + NET_ADD_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED, + skb_queue_len(&tp->ucopy.prequeue)); + + while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) + sk->sk_backlog_rcv(sk, skb); + + tp->ucopy.memory = 0; + } + + if (tcp_ack_scheduled(tp)) { + if (!tp->ack.pingpong) { + /* Delayed ACK missed: inflate ATO. 
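+ *
+ * Illustrative aside, not part of the original patch (the numbers are
+ * invented): with ato == 40ms and rto == 200ms, each missed delayed
+ * ACK doubles ato, 40 -> 80 -> 160 -> 200, and the min() below keeps
+ * it from ever exceeding the retransmission timeout.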
*/ + tp->ack.ato = min(tp->ack.ato << 1, tp->rto); + } else { + /* Delayed ACK missed: leave pingpong mode and + * deflate ATO. + */ + tp->ack.pingpong = 0; + tp->ack.ato = TCP_ATO_MIN; + } + tcp_send_ack(sk); + NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS); + } + TCP_CHECK_TIMER(sk); + +out: + if (tcp_memory_pressure) + sk_stream_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +} + +static void tcp_probe_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + int max_probes; + + if (tp->packets_out || !sk->sk_send_head) { + tp->probes_out = 0; + return; + } + + /* *WARNING* RFC 1122 forbids this + * + * It doesn't AFAIK, because we kill the retransmit timer -AK + * + * FIXME: We ought not to do it, Solaris 2.5 actually has fixing + * this behaviour in Solaris down as a bug fix. [AC] + * + * Let me to explain. probes_out is zeroed by incoming ACKs + * even if they advertise zero window. Hence, connection is killed only + * if we received no ACKs for normal connection timeout. It is not killed + * only because window stays zero for some time, window may be zero + * until armageddon and even later. We are in full accordance + * with RFCs, only probe timer combines both retransmission timeout + * and probe timeout in one bottle. --ANK + */ + max_probes = sysctl_tcp_retries2; + + if (sock_flag(sk, SOCK_DEAD)) { + int alive = ((tp->rto<backoff) < TCP_RTO_MAX); + + max_probes = tcp_orphan_retries(sk, alive); + + if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes)) + return; + } + + if (tp->probes_out > max_probes) { + tcp_write_err(sk); + } else { + /* Only send another probe if we didn't close things up. */ + tcp_send_probe0(sk); + } +} + +/* + * The TCP retransmit timer. + */ + +static void tcp_retransmit_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (!tp->packets_out) + goto out; + + BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue)); + + if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) && + !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) { + /* Receiver dastardly shrinks window. Our retransmits + * become zero probes, but we should not timeout this + * connection. If the socket is an orphan, time it out, + * we cannot allow such beasts to hang infinitely. + */ +#ifdef TCP_DEBUG + if (net_ratelimit()) { + struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. 
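
The delayed-ACK adjustment above (inflate the ACK timeout for a quiet bulk flow, fall back to the minimum when leaving interactive mode) can be sketched on its own; the struct and parameter names below are stand-ins, not the kernel's.

/* Stand-alone sketch of the ATO update in tcp_delack_timer(); not kernel
 * code.  "pingpong" means the flow has looked interactive so far. */
struct delack_state {
	unsigned int ato;	/* current delayed-ACK timeout */
	int pingpong;		/* 1 while the flow looks interactive */
};

static void delack_missed(struct delack_state *s, unsigned int rto,
			  unsigned int ato_min)
{
	if (!s->pingpong) {
		/* Delayed ACK missed: inflate ATO, capped at the RTO. */
		s->ato <<= 1;
		if (s->ato > rto)
			s->ato = rto;
	} else {
		/* Leave pingpong mode and deflate ATO. */
		s->pingpong = 0;
		s->ato = ato_min;
	}
}
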
Repaired.\n", + NIPQUAD(inet->daddr), htons(inet->dport), + inet->num, tp->snd_una, tp->snd_nxt); + } +#endif + if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) { + tcp_write_err(sk); + goto out; + } + tcp_enter_loss(sk, 0); + tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + __sk_dst_reset(sk); + goto out_reset_timer; + } + + if (tcp_write_timeout(sk)) + goto out; + + if (tp->retransmits == 0) { + if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) { + if (tp->rx_opt.sack_ok) { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL); + else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES); + } else { + if (tp->ca_state == TCP_CA_Recovery) + NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL); + else + NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES); + } + } else if (tp->ca_state == TCP_CA_Loss) { + NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES); + } else { + NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS); + } + } + + if (tcp_use_frto(sk)) { + tcp_enter_frto(sk); + } else { + tcp_enter_loss(sk, 0); + } + + if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) { + /* Retransmission failed because of local congestion, + * do not backoff. + */ + if (!tp->retransmits) + tp->retransmits=1; + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, + min(tp->rto, TCP_RESOURCE_PROBE_INTERVAL)); + goto out; + } + + /* Increase the timeout each time we retransmit. Note that + * we do not increase the rtt estimate. rto is initialized + * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests + * that doubling rto each time is the least we can get away with. + * In KA9Q, Karn uses this for the first few times, and then + * goes to quadratic. netBSD doubles, but only goes up to *64, + * and clamps at 1 to 64 sec afterwards. Note that 120 sec is + * defined in the protocol as the maximum possible RTT. I guess + * we'll have to use something other than TCP to talk to the + * University of Mars. + * + * PAWS allows us longer timeouts and large windows, so once + * implemented ftp to mars will work nicely. We will have to fix + * the 120 second clamps though! + */ + tp->backoff++; + tp->retransmits++; + +out_reset_timer: + tp->rto = min(tp->rto << 1, TCP_RTO_MAX); + tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto); + if (tp->retransmits > sysctl_tcp_retries1) + __sk_dst_reset(sk); + +out:; +} + +static void tcp_write_timer(unsigned long data) +{ + struct sock *sk = (struct sock*)data; + struct tcp_sock *tp = tcp_sk(sk); + int event; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later */ + sk_reset_timer(sk, &tp->retransmit_timer, jiffies + (HZ / 20)); + goto out_unlock; + } + + if (sk->sk_state == TCP_CLOSE || !tp->pending) + goto out; + + if (time_after(tp->timeout, jiffies)) { + sk_reset_timer(sk, &tp->retransmit_timer, tp->timeout); + goto out; + } + + event = tp->pending; + tp->pending = 0; + + switch (event) { + case TCP_TIME_RETRANS: + tcp_retransmit_timer(sk); + break; + case TCP_TIME_PROBE0: + tcp_probe_timer(sk); + break; + } + TCP_CHECK_TIMER(sk); + +out: + sk_stream_mem_reclaim(sk); +out_unlock: + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * Timer for listening sockets + */ + +static void tcp_synack_timer(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + int max_retries = tp->syn_retries ? 
: sysctl_tcp_synack_retries; + int thresh = max_retries; + unsigned long now = jiffies; + struct open_request **reqp, *req; + int i, budget; + + if (lopt == NULL || lopt->qlen == 0) + return; + + /* Normally all the openreqs are young and become mature + * (i.e. converted to established socket) for first timeout. + * If synack was not acknowledged for 3 seconds, it means + * one of the following things: synack was lost, ack was lost, + * rtt is high or nobody planned to ack (i.e. synflood). + * When server is a bit loaded, queue is populated with old + * open requests, reducing effective size of queue. + * When server is well loaded, queue size reduces to zero + * after several minutes of work. It is not synflood, + * it is normal operation. The solution is pruning + * too old entries overriding normal timeout, when + * situation becomes dangerous. + * + * Essentially, we reserve half of room for young + * embrions; and abort old ones without pity, if old + * ones are about to clog our table. + */ + if (lopt->qlen>>(lopt->max_qlen_log-1)) { + int young = (lopt->qlen_young<<1); + + while (thresh > 2) { + if (lopt->qlen < young) + break; + thresh--; + young <<= 1; + } + } + + if (tp->defer_accept) + max_retries = tp->defer_accept; + + budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL)); + i = lopt->clock_hand; + + do { + reqp=&lopt->syn_table[i]; + while ((req = *reqp) != NULL) { + if (time_after_eq(now, req->expires)) { + if ((req->retrans < thresh || + (req->acked && req->retrans < max_retries)) + && !req->class->rtx_syn_ack(sk, req, NULL)) { + unsigned long timeo; + + if (req->retrans++ == 0) + lopt->qlen_young--; + timeo = min((TCP_TIMEOUT_INIT << req->retrans), + TCP_RTO_MAX); + req->expires = now + timeo; + reqp = &req->dl_next; + continue; + } + + /* Drop this request */ + write_lock(&tp->syn_wait_lock); + *reqp = req->dl_next; + write_unlock(&tp->syn_wait_lock); + lopt->qlen--; + if (req->retrans == 0) + lopt->qlen_young--; + tcp_openreq_free(req); + continue; + } + reqp = &req->dl_next; + } + + i = (i+1)&(TCP_SYNQ_HSIZE-1); + + } while (--budget > 0); + + lopt->clock_hand = i; + + if (lopt->qlen) + tcp_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL); +} + +void tcp_delete_keepalive_timer (struct sock *sk) +{ + sk_stop_timer(sk, &sk->sk_timer); +} + +void tcp_reset_keepalive_timer (struct sock *sk, unsigned long len) +{ + sk_reset_timer(sk, &sk->sk_timer, jiffies + len); +} + +void tcp_set_keepalive(struct sock *sk, int val) +{ + if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)) + return; + + if (val && !sock_flag(sk, SOCK_KEEPOPEN)) + tcp_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk))); + else if (!val) + tcp_delete_keepalive_timer(sk); +} + + +static void tcp_keepalive_timer (unsigned long data) +{ + struct sock *sk = (struct sock *) data; + struct tcp_sock *tp = tcp_sk(sk); + __u32 elapsed; + + /* Only process if socket is not in use. */ + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { + /* Try again later. 
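
The threshold adaptation above, which lowers the number of SYN-ACK retransmissions as old entries start to crowd out young ones, can be read as a pure function of the queue counters. The sketch below uses plain integers in place of the listen_opt fields and is an illustration, not kernel code.

/* Sketch of the thresh computation in tcp_synack_timer(): once the queue
 * is more than half full, allow fewer retransmissions (but at least 2)
 * whenever old entries outnumber twice, four times, ... the young ones. */
static int synack_retx_thresh(int qlen, int qlen_young, int max_qlen_log,
			      int max_retries)
{
	int thresh = max_retries;

	if (qlen >> (max_qlen_log - 1)) {	/* queue over half full */
		int young = qlen_young << 1;

		while (thresh > 2) {
			if (qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
	return thresh;
}
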
*/ + tcp_reset_keepalive_timer (sk, HZ/20); + goto out; + } + + if (sk->sk_state == TCP_LISTEN) { + tcp_synack_timer(sk); + goto out; + } + + if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) { + if (tp->linger2 >= 0) { + int tmo = tcp_fin_time(tp) - TCP_TIMEWAIT_LEN; + + if (tmo > 0) { + tcp_time_wait(sk, TCP_FIN_WAIT2, tmo); + goto out; + } + } + tcp_send_active_reset(sk, GFP_ATOMIC); + goto death; + } + + if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE) + goto out; + + elapsed = keepalive_time_when(tp); + + /* It is alive without keepalive 8) */ + if (tp->packets_out || sk->sk_send_head) + goto resched; + + elapsed = tcp_time_stamp - tp->rcv_tstamp; + + if (elapsed >= keepalive_time_when(tp)) { + if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) || + (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) { + tcp_send_active_reset(sk, GFP_ATOMIC); + tcp_write_err(sk); + goto out; + } + if (tcp_write_wakeup(sk) <= 0) { + tp->probes_out++; + elapsed = keepalive_intvl_when(tp); + } else { + /* If keepalive was lost due to local congestion, + * try harder. + */ + elapsed = TCP_RESOURCE_PROBE_INTERVAL; + } + } else { + /* It is tp->rcv_tstamp + keepalive_time_when(tp) */ + elapsed = keepalive_time_when(tp) - elapsed; + } + + TCP_CHECK_TIMER(sk); + sk_stream_mem_reclaim(sk); + +resched: + tcp_reset_keepalive_timer (sk, elapsed); + goto out; + +death: + tcp_done(sk); + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + +EXPORT_SYMBOL(tcp_clear_xmit_timers); +EXPORT_SYMBOL(tcp_delete_keepalive_timer); +EXPORT_SYMBOL(tcp_init_xmit_timers); +EXPORT_SYMBOL(tcp_reset_keepalive_timer); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c new file mode 100644 index 000000000000..6baddfbedca3 --- /dev/null +++ b/net/ipv4/udp.c @@ -0,0 +1,1575 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * The User Datagram Protocol (UDP). + * + * Version: $Id: udp.c,v 1.102 2002/02/01 22:01:04 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Arnt Gulbrandsen, + * Alan Cox, + * Hirokazu Takahashi, + * + * Fixes: + * Alan Cox : verify_area() calls + * Alan Cox : stopped close while in use off icmp + * messages. Not a fix but a botch that + * for udp at least is 'valid'. + * Alan Cox : Fixed icmp handling properly + * Alan Cox : Correct error for oversized datagrams + * Alan Cox : Tidied select() semantics. + * Alan Cox : udp_err() fixed properly, also now + * select and read wake correctly on errors + * Alan Cox : udp_send verify_area moved to avoid mem leak + * Alan Cox : UDP can count its memory + * Alan Cox : send to an unknown connection causes + * an ECONNREFUSED off the icmp, but + * does NOT close. + * Alan Cox : Switched to new sk_buff handlers. No more backlog! + * Alan Cox : Using generic datagram code. Even smaller and the PEEK + * bug no longer crashes it. + * Fred Van Kempen : Net2e support for sk->broadcast. + * Alan Cox : Uses skb_free_datagram + * Alan Cox : Added get/set sockopt support. + * Alan Cox : Broadcasting without option set returns EACCES. + * Alan Cox : No wakeup calls. Instead we now use the callbacks. + * Alan Cox : Use ip_tos and ip_ttl + * Alan Cox : SNMP Mibs + * Alan Cox : MSG_DONTROUTE, and 0.0.0.0 support. + * Matt Dillon : UDP length checks. + * Alan Cox : Smarter af_inet used properly. + * Alan Cox : Use new kernel side addressing. 
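
Referring back to the keepalive path in tcp_keepalive_timer() above: the per-socket overrides it honours (tp->keepalive_time, tp->keepalive_intvl, tp->keepalive_probes) are what applications set through the standard TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT socket options, while SO_KEEPALIVE arms sk_timer in the first place. A minimal user-space sketch with arbitrary example values:

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1, idle = 60, intvl = 10, cnt = 5;

	if (fd < 0)
		return 1;
	setsockopt(fd, SOL_SOCKET, SO_KEEPALIVE, &on, sizeof(on));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPIDLE, &idle, sizeof(idle));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &intvl, sizeof(intvl));
	setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &cnt, sizeof(cnt));
	printf("keepalive: idle %d s, probe every %d s, give up after %d probes\n",
	       idle, intvl, cnt);
	close(fd);
	return 0;
}
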
+ * Alan Cox : Incorrect return on truncated datagram receive. + * Arnt Gulbrandsen : New udp_send and stuff + * Alan Cox : Cache last socket + * Alan Cox : Route cache + * Jon Peatfield : Minor efficiency fix to sendto(). + * Mike Shaver : RFC1122 checks. + * Alan Cox : Nonblocking error fix. + * Willy Konynenberg : Transparent proxying support. + * Mike McLagan : Routing by source + * David S. Miller : New socket lookup architecture. + * Last socket cache retained as it + * does have a high hit rate. + * Olaf Kirch : Don't linearise iovec on sendmsg. + * Andi Kleen : Some cleanups, cache destination entry + * for connect. + * Vitaly E. Lavrov : Transparent proxy revived after year coma. + * Melvin Smith : Check msg_name not msg_namelen in sendto(), + * return ENOTCONN for unconnected sockets (POSIX) + * Janos Farkas : don't deliver multi/broadcasts to a different + * bound-to-device socket + * Hirokazu Takahashi : HW checksumming for outgoing UDP + * datagrams. + * Hirokazu Takahashi : sendfile() on UDP works now. + * Arnaldo C. Melo : convert /proc/net/udp to seq_file + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov: allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Derek Atkins : Add Encapulation Support + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Snmp MIB for the UDP layer + */ + +DEFINE_SNMP_STAT(struct udp_mib, udp_statistics); + +struct hlist_head udp_hash[UDP_HTABLE_SIZE]; +DEFINE_RWLOCK(udp_hash_lock); + +/* Shared by v4/v6 udp. 
*/ +int udp_port_rover; + +static int udp_v4_get_port(struct sock *sk, unsigned short snum) +{ + struct hlist_node *node; + struct sock *sk2; + struct inet_sock *inet = inet_sk(sk); + + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; + + if (udp_port_rover > sysctl_local_port_range[1] || + udp_port_rover < sysctl_local_port_range[0]) + udp_port_rover = sysctl_local_port_range[0]; + best_size_so_far = 32767; + best = result = udp_port_rover; + for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + struct hlist_head *list; + int size; + + list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + goto gotit; + } + size = 0; + sk_for_each(sk2, node, list) + if (++size >= best_size_so_far) + goto next; + best_size_so_far = size; + best = result; + next:; + } + result = best; + for(i = 0; i < (1 << 16) / UDP_HTABLE_SIZE; i++, result += UDP_HTABLE_SIZE) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } + if (i >= (1 << 16) / UDP_HTABLE_SIZE) + goto fail; +gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, + &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { + struct inet_sock *inet2 = inet_sk(sk2); + + if (inet2->num == snum && + sk2 != sk && + !ipv6_only_sock(sk2) && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!inet2->rcv_saddr || + !inet->rcv_saddr || + inet2->rcv_saddr == inet->rcv_saddr) && + (!sk2->sk_reuse || !sk->sk_reuse)) + goto fail; + } + } + inet->num = snum; + if (sk_unhashed(sk)) { + struct hlist_head *h = &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]; + + sk_add_node(sk, h); + sock_prot_inc_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); + return 0; + +fail: + write_unlock_bh(&udp_hash_lock); + return 1; +} + +static void udp_v4_hash(struct sock *sk) +{ + BUG(); +} + +static void udp_v4_unhash(struct sock *sk) +{ + write_lock_bh(&udp_hash_lock); + if (sk_del_node_init(sk)) { + inet_sk(sk)->num = 0; + sock_prot_dec_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); +} + +/* UDP is nearly always wildcards out the wazoo, it makes no sense to try + * harder than this. -DaveM + */ +static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) +{ + struct sock *sk, *result = NULL; + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; + + sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && !ipv6_only_sock(sk)) { + int score = (sk->sk_family == PF_INET ? 
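
The port search above walks one candidate port per hash chain, remembering the least-loaded chain and taking an empty one immediately. A loose sketch of that idea follows; it is not a faithful reimplementation, the table size and range handling are simplified, and the array of chain lengths stands in for the real hash table.

#define HTABLE_SIZE 128		/* stand-in for UDP_HTABLE_SIZE */

/* Returns a candidate port; assumes hi - lo is larger than the table. */
static int pick_ephemeral_port(const int chain_len[HTABLE_SIZE], int rover,
			       int lo, int hi)
{
	int best = rover, best_len = 32767, result = rover, i;

	for (i = 0; i < HTABLE_SIZE; i++, result++) {
		int len;

		if (result > hi)	/* wrap back into the allowed range */
			result = lo + ((result - lo) & (HTABLE_SIZE - 1));
		len = chain_len[result & (HTABLE_SIZE - 1)];
		if (len == 0)
			return result;	/* empty chain: take it at once */
		if (len < best_len) {	/* otherwise remember the emptiest */
			best_len = len;
			best = result;
		}
	}
	return best;	/* the kernel then probes ports within this chain */
}
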
1 : 0); + if (inet->rcv_saddr) { + if (inet->rcv_saddr != daddr) + continue; + score+=2; + } + if (inet->daddr) { + if (inet->daddr != saddr) + continue; + score+=2; + } + if (inet->dport) { + if (inet->dport != sport) + continue; + score+=2; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score+=2; + } + if(score == 9) { + result = sk; + break; + } else if(score > badness) { + result = sk; + badness = score; + } + } + } + return result; +} + +static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, + u32 daddr, u16 dport, int dif) +{ + struct sock *sk; + + read_lock(&udp_hash_lock); + sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); + if (sk) + sock_hold(sk); + read_unlock(&udp_hash_lock); + return sk; +} + +static inline struct sock *udp_v4_mcast_next(struct sock *sk, + u16 loc_port, u32 loc_addr, + u16 rmt_port, u32 rmt_addr, + int dif) +{ + struct hlist_node *node; + struct sock *s = sk; + unsigned short hnum = ntohs(loc_port); + + sk_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); + + if (inet->num != hnum || + (inet->daddr && inet->daddr != rmt_addr) || + (inet->dport != rmt_port && inet->dport) || + (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || + ipv6_only_sock(s) || + (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) + continue; + if (!ip_mc_sf_allow(s, loc_addr, rmt_addr, dif)) + continue; + goto found; + } + s = NULL; +found: + return s; +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. + * Header points to the ip header of the error packet. We move + * on past this. Then (as it used to claim before adjustment) + * header points to the first 8 bytes of the udp header. We need + * to find the appropriate port. + */ + +void udp_err(struct sk_buff *skb, u32 info) +{ + struct inet_sock *inet; + struct iphdr *iph = (struct iphdr*)skb->data; + struct udphdr *uh = (struct udphdr*)(skb->data+(iph->ihl<<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + int harderr; + int err; + + sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + if (sk == NULL) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; /* No socket for error */ + } + + err = 0; + harderr = 0; + inet = inet_sk(sk); + + switch (type) { + default: + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + case ICMP_SOURCE_QUENCH: + goto out; + case ICMP_PARAMETERPROB: + err = EPROTO; + harderr = 1; + break; + case ICMP_DEST_UNREACH: + if (code == ICMP_FRAG_NEEDED) { /* Path MTU discovery */ + if (inet->pmtudisc != IP_PMTUDISC_DONT) { + err = EMSGSIZE; + harderr = 1; + break; + } + goto out; + } + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + } + + /* + * RFC1122: OK. Passes ICMP errors back to application, as per + * 4.1.3.3. + */ + if (!inet->recverr) { + if (!harderr || sk->sk_state != TCP_ESTABLISHED) + goto out; + } else { + ip_icmp_error(sk, skb, err, uh->dest, info, (u8*)(uh+1)); + } + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +/* + * Throw away all pending data and cancel the corking. Socket is locked. 
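
The scoring above amounts to: start from 1 for an AF_INET socket, then add 2 for every field the socket pinned down and that matches the packet (local address, remote address, remote port, bound device); a perfect score of 9 ends the search early. A restatement as a helper, purely illustrative:

/* Each *_match flag is 1 only when the socket specified that field and it
 * matches the packet; unspecified (wildcard) fields contribute nothing. */
static int udp_lookup_score(int af_inet, int saddr_match, int daddr_match,
			    int dport_match, int dev_match)
{
	return (af_inet ? 1 : 0) +
	       2 * saddr_match + 2 * daddr_match +
	       2 * dport_match + 2 * dev_match;	/* 9 == perfect match */
}
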
+ */ +static void udp_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip_flush_pending_frames(sk); + } +} + +/* + * Push out all pending data as one UDP datagram. Socket is locked. + */ +static int udp_push_pending_frames(struct sock *sk, struct udp_sock *up) +{ + struct inet_sock *inet = inet_sk(sk); + struct flowi *fl = &inet->cork.fl; + struct sk_buff *skb; + struct udphdr *uh; + int err = 0; + + /* Grab the skbuff where UDP header space exists. */ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = skb->h.uh; + uh->source = fl->fl_ip_sport; + uh->dest = fl->fl_ip_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (sk->sk_no_check == UDP_CSUM_NOXMIT) { + skb->ip_summed = CHECKSUM_NONE; + goto send; + } + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* + * Only one fragment on the socket. + */ + if (skb->ip_summed == CHECKSUM_HW) { + skb->csum = offsetof(struct udphdr, check); + uh->check = ~csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, + up->len, IPPROTO_UDP, 0); + } else { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, + up->len, IPPROTO_UDP, skb->csum); + if (uh->check == 0) + uh->check = -1; + } + } else { + unsigned int csum = 0; + /* + * HW-checksum won't work as there are two or more + * fragments on the socket so that all csums of sk_buffs + * should be together. + */ + if (skb->ip_summed == CHECKSUM_HW) { + int offset = (unsigned char *)uh - skb->data; + skb->csum = skb_checksum(skb, offset, skb->len - offset, 0); + + skb->ip_summed = CHECKSUM_NONE; + } else { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + } + + skb_queue_walk(&sk->sk_write_queue, skb) { + csum = csum_add(csum, skb->csum); + } + uh->check = csum_tcpudp_magic(fl->fl4_src, fl->fl4_dst, + up->len, IPPROTO_UDP, csum); + if (uh->check == 0) + uh->check = -1; + } +send: + err = ip_push_pending_frames(sk); +out: + up->len = 0; + up->pending = 0; + return err; +} + + +static unsigned short udp_check(struct udphdr *uh, int len, unsigned long saddr, unsigned long daddr, unsigned long base) +{ + return(csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base)); +} + +int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len) +{ + struct inet_sock *inet = inet_sk(sk); + struct udp_sock *up = udp_sk(sk); + int ulen = len; + struct ipcm_cookie ipc; + struct rtable *rt = NULL; + int free = 0; + int connected = 0; + u32 daddr, faddr, saddr; + u16 dport; + u8 tos; + int err; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + + if (len > 0xFFFF) + return -EMSGSIZE; + + /* + * Check the flags. + */ + + if (msg->msg_flags&MSG_OOB) /* Mirror BSD error message compatibility */ + return -EOPNOTSUPP; + + ipc.opt = NULL; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. + */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET)) { + release_sock(sk); + return -EINVAL; + } + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + /* + * Get and verify the address. 
+ */ + if (msg->msg_name) { + struct sockaddr_in * usin = (struct sockaddr_in*)msg->msg_name; + if (msg->msg_namelen < sizeof(*usin)) + return -EINVAL; + if (usin->sin_family != AF_INET) { + if (usin->sin_family != AF_UNSPEC) + return -EAFNOSUPPORT; + } + + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + if (dport == 0) + return -EINVAL; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = inet->daddr; + dport = inet->dport; + /* Open fast path for connected socket. + Route will not be used, if at least one option is set. + */ + connected = 1; + } + ipc.addr = inet->saddr; + + ipc.oif = sk->sk_bound_dev_if; + if (msg->msg_controllen) { + err = ip_cmsg_send(msg, &ipc); + if (err) + return err; + if (ipc.opt) + free = 1; + connected = 0; + } + if (!ipc.opt) + ipc.opt = inet->opt; + + saddr = ipc.addr; + ipc.addr = faddr = daddr; + + if (ipc.opt && ipc.opt->srr) { + if (!daddr) + return -EINVAL; + faddr = ipc.opt->faddr; + connected = 0; + } + tos = RT_TOS(inet->tos); + if (sock_flag(sk, SOCK_LOCALROUTE) || + (msg->msg_flags & MSG_DONTROUTE) || + (ipc.opt && ipc.opt->is_strictroute)) { + tos |= RTO_ONLINK; + connected = 0; + } + + if (MULTICAST(daddr)) { + if (!ipc.oif) + ipc.oif = inet->mc_index; + if (!saddr) + saddr = inet->mc_addr; + connected = 0; + } + + if (connected) + rt = (struct rtable*)sk_dst_check(sk, 0); + + if (rt == NULL) { + struct flowi fl = { .oif = ipc.oif, + .nl_u = { .ip4_u = + { .daddr = faddr, + .saddr = saddr, + .tos = tos } }, + .proto = IPPROTO_UDP, + .uli_u = { .ports = + { .sport = inet->sport, + .dport = dport } } }; + err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); + if (err) + goto out; + + err = -EACCES; + if ((rt->rt_flags & RTCF_BROADCAST) && + !sock_flag(sk, SOCK_BROADCAST)) + goto out; + if (connected) + sk_dst_set(sk, dst_clone(&rt->u.dst)); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + saddr = rt->rt_src; + if (!ipc.addr) + daddr = ipc.addr = rt->rt_dst; + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 2\n")); + err = -EINVAL; + goto out; + } + /* + * Now cork the socket to pend data. + */ + inet->cork.fl.fl4_dst = daddr; + inet->cork.fl.fl_ip_dport = dport; + inet->cork.fl.fl4_src = saddr; + inet->cork.fl.fl_ip_sport = inet->sport; + up->pending = AF_INET; + +do_append_data: + up->len += ulen; + err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, + sizeof(struct udphdr), &ipc, rt, + corkreq ? msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_flush_pending_frames(sk); + else if (!corkreq) + err = udp_push_pending_frames(sk, up); + release_sock(sk); + +out: + ip_rt_put(rt); + if (free) + kfree(ipc.opt); + if (!err) { + UDP_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS); + return len; + } + return err; + +do_confirm: + dst_confirm(&rt->u.dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int udp_sendpage(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct udp_sock *up = udp_sk(sk); + int ret; + + if (!up->pending) { + struct msghdr msg = { .msg_flags = flags|MSG_MORE }; + + /* Call udp_sendmsg to specify destination address which + * sendpage interface can't pass. + * This will succeed only when the socket is connected. 
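
The corking path above is what the UDP_CORK socket option (handled further down in udp_setsockopt()) and the MSG_MORE flag drive from user space: while the cork is in place every send() is appended to one pending datagram, and removing the cork pushes it out as a single packet. A minimal sketch; the fallback definition of UDP_CORK assumes the kernel's value in case the libc headers lack it.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>

#ifndef UDP_CORK
#define UDP_CORK 1		/* assumed from the kernel's UDP sockopts */
#endif

int main(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(9) };	/* discard */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int on = 1, off = 0;

	if (fd < 0)
		return 1;
	inet_pton(AF_INET, "127.0.0.1", &dst.sin_addr);
	if (connect(fd, (struct sockaddr *)&dst, sizeof(dst)) < 0)
		return 1;

	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
	send(fd, "hello ", 6, 0);	/* queued, not yet transmitted */
	send(fd, "world", 5, 0);	/* appended to the same datagram */
	setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
	/* removing the cork emits one 11-byte UDP datagram */

	close(fd);
	return 0;
}
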
+ */ + ret = udp_sendmsg(NULL, sk, &msg, 0); + if (ret < 0) + return ret; + } + + lock_sock(sk); + + if (unlikely(!up->pending)) { + release_sock(sk); + + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp cork app bug 3\n")); + return -EINVAL; + } + + ret = ip_append_page(sk, page, offset, size, flags); + if (ret == -EOPNOTSUPP) { + release_sock(sk); + return sock_no_sendpage(sk->sk_socket, page, offset, + size, flags); + } + if (ret < 0) { + udp_flush_pending_frames(sk); + goto out; + } + + up->len += size; + if (!(up->corkflag || (flags&MSG_MORE))) + ret = udp_push_pending_frames(sk, up); + if (!ret) + ret = size; +out: + release_sock(sk); + return ret; +} + +/* + * IOCTL requests applicable to the UDP protocol + */ + +int udp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) + { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + + case SIOCINQ: + { + struct sk_buff *skb; + unsigned long amount; + + amount = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount + * of this packet since that is all + * that will be read. + */ + amount = skb->len - sizeof(struct udphdr); + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: + return -ENOIOCTLCMD; + } + return(0); +} + +static __inline__ int __udp_checksum_complete(struct sk_buff *skb) +{ + return (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum)); +} + +static __inline__ int udp_checksum_complete(struct sk_buff *skb) +{ + return skb->ip_summed != CHECKSUM_UNNECESSARY && + __udp_checksum_complete(skb); +} + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +static int udp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, + size_t len, int noblock, int flags, int *addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct sockaddr_in *sin = (struct sockaddr_in *)msg->msg_name; + struct sk_buff *skb; + int copied, err; + + /* + * Check any passed addresses + */ + if (addr_len) + *addr_len=sizeof(*sin); + + if (flags & MSG_ERRQUEUE) + return ip_recv_error(sk, msg, len); + +try_again: + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if (__udp_checksum_complete(skb)) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + + if (err == -EINVAL) + goto csum_copy_err; + } + + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (sin) + { + sin->sin_family = AF_INET; + sin->sin_port = skb->h.uh->source; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len - sizeof(struct udphdr); + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + + /* Clear queue. 
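
The two ioctls above are usable directly from applications: SIOCOUTQ reports the bytes still queued for transmission, and SIOCINQ the payload size of the next datagram waiting in the receive queue. A small sketch:

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/sockios.h>	/* SIOCINQ, SIOCOUTQ */
#include <unistd.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int inq = 0, outq = 0;

	if (fd < 0)
		return 1;
	if (ioctl(fd, SIOCINQ, &inq) == 0 && ioctl(fd, SIOCOUTQ, &outq) == 0)
		printf("next datagram: %d bytes, unsent: %d bytes\n",
		       inq, outq);
	close(fd);
	return 0;
}
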
*/ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + skb_free_datagram(sk, skb); + + if (noblock) + return -EAGAIN; + goto try_again; +} + + +int udp_disconnect(struct sock *sk, int flags) +{ + struct inet_sock *inet = inet_sk(sk); + /* + * 1003.1g - break association. + */ + + sk->sk_state = TCP_CLOSE; + inet->daddr = 0; + inet->dport = 0; + sk->sk_bound_dev_if = 0; + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + + if (!(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) { + sk->sk_prot->unhash(sk); + inet->sport = 0; + } + sk_dst_reset(sk); + return 0; +} + +static void udp_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +/* return: + * 1 if the the UDP system should process it + * 0 if we should drop this packet + * -1 if it should get processed by xfrm4_rcv_encap + */ +static int udp_encap_rcv(struct sock * sk, struct sk_buff *skb) +{ +#ifndef CONFIG_XFRM + return 1; +#else + struct udp_sock *up = udp_sk(sk); + struct udphdr *uh = skb->h.uh; + struct iphdr *iph; + int iphlen, len; + + __u8 *udpdata = (__u8 *)uh + sizeof(struct udphdr); + __u32 *udpdata32 = (__u32 *)udpdata; + __u16 encap_type = up->encap_type; + + /* if we're overly short, let UDP handle it */ + if (udpdata > skb->tail) + return 1; + + /* if this is not encapsulated socket, then just return now */ + if (!encap_type) + return 1; + + len = skb->tail - udpdata; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + return 0; + } else if (len > sizeof(struct ip_esp_hdr) && udpdata32[0] != 0 ) { + /* ESP Packet without Non-ESP header */ + len = sizeof(struct udphdr); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + /* Check if this is a keepalive packet. If so, eat it. */ + if (len == 1 && udpdata[0] == 0xff) { + return 0; + } else if (len > 2 * sizeof(u32) + sizeof(struct ip_esp_hdr) && + udpdata32[0] == 0 && udpdata32[1] == 0) { + + /* ESP Packet with Non-IKE marker */ + len = sizeof(struct udphdr) + 2 * sizeof(u32); + } else + /* Must be an IKE packet.. pass it through */ + return 1; + break; + } + + /* At this point we are sure that this is an ESPinUDP packet, + * so we need to remove 'len' bytes from the packet (the UDP + * header and optional ESP marker bytes) and then modify the + * protocol to ESP, and then call into the transform receiver. + */ + + /* Now we can update and verify the packet length... */ + iph = skb->nh.iph; + iphlen = iph->ihl << 2; + iph->tot_len = htons(ntohs(iph->tot_len) - len); + if (skb->len < iphlen + len) { + /* packet is too small!?! */ + return 0; + } + + /* pull the data buffer up to the ESP header and set the + * transport header to point to ESP. Keep UDP on the stack + * for later. + */ + skb->h.raw = skb_pull(skb, len); + + /* modify the protocol (it's ESP!) */ + iph->protocol = IPPROTO_ESP; + + /* and let the caller know to send this into the ESP processor... */ + return -1; +#endif +} + +/* returns: + * -1: error + * 0: success + * >0: "udp encap" protocol resubmission + * + * Note that in the success and error cases, the skb is assumed to + * have either been requeued or freed. 
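
The encap_type checked above is configured per socket with the UDP_ENCAP option that udp_setsockopt() accepts later in this file; IKE daemons use it on their NAT-traversal socket (conventionally UDP port 4500) so that ESP-in-UDP packets are handed to the xfrm input path instead of being delivered as application data. A user-space sketch; the numeric fallbacks assume the kernel's constants in case the libc headers do not provide them.

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef UDP_ENCAP
#define UDP_ENCAP			100	/* assumed kernel values */
#define UDP_ENCAP_ESPINUDP_NON_IKE	1
#define UDP_ENCAP_ESPINUDP		2
#endif

int main(void)
{
	struct sockaddr_in addr = { .sin_family = AF_INET,
				    .sin_port = htons(4500) };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	int encap = UDP_ENCAP_ESPINUDP;

	if (fd < 0)
		return 1;
	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) == 0 &&
	    setsockopt(fd, IPPROTO_UDP, UDP_ENCAP, &encap, sizeof(encap)) == 0)
		printf("ESP-in-UDP decapsulation enabled on port 4500\n");
	close(fd);
	return 0;
}
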
+ */ +static int udp_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + struct udp_sock *up = udp_sk(sk); + + /* + * Charge it to the socket, dropping if the queue is full. + */ + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return -1; + } + + if (up->encap_type) { + /* + * This is an encapsulation socket, so let's see if this is + * an encapsulated packet. + * If it's a keepalive packet, then just eat it. + * If it's an encapsulateed packet, then pass it to the + * IPsec xfrm input and return the response + * appropriately. Otherwise, just fall through and + * pass this up the UDP socket. + */ + int ret; + + ret = udp_encap_rcv(sk, skb); + if (ret == 0) { + /* Eat the packet .. */ + kfree_skb(skb); + return 0; + } + if (ret < 0) { + /* process the ESP packet */ + ret = xfrm4_rcv_encap(skb, up->encap_type); + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); + return -ret; + } + /* FALLTHROUGH -- it's a UDP Packet */ + } + + if (sk->sk_filter && skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (__udp_checksum_complete(skb)) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return -1; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (sock_queue_rcv_skb(sk,skb)<0) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return -1; + } + UDP_INC_STATS_BH(UDP_MIB_INDATAGRAMS); + return 0; +} + +/* + * Multicasts and broadcasts go to each listener. + * + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. + */ +static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, + u32 saddr, u32 daddr) +{ + struct sock *sk; + int dif; + + read_lock(&udp_hash_lock); + sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + dif = skb->dev->ifindex; + sk = udp_v4_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (sk) { + struct sock *sknext = NULL; + + do { + struct sk_buff *skb1 = skb; + + sknext = udp_v4_mcast_next(sk_next(sk), uh->dest, daddr, + uh->source, saddr, dif); + if(sknext) + skb1 = skb_clone(skb, GFP_ATOMIC); + + if(skb1) { + int ret = udp_queue_rcv_skb(sk, skb1); + if (ret > 0) + /* we should probably re-process instead + * of dropping packets here. */ + kfree_skb(skb1); + } + sk = sknext; + } while(sknext); + } else + kfree_skb(skb); + read_unlock(&udp_hash_lock); + return 0; +} + +/* Initialize UDP checksum. If exited with zero value (success), + * CHECKSUM_UNNECESSARY means, that no more checks are required. + * Otherwise, csum completion requires chacksumming packet body, + * including udp header and folding it to skb->csum. + */ +static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, + unsigned short ulen, u32 saddr, u32 daddr) +{ + if (uh->check == 0) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!udp_check(uh, ulen, saddr, daddr, skb->csum)) + return 0; + NETDEBUG(if (net_ratelimit()) printk(KERN_DEBUG "udp v4 hw csum failure.\n")); + skb->ip_summed = CHECKSUM_NONE; + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = csum_tcpudp_nofold(saddr, daddr, ulen, IPPROTO_UDP, 0); + /* Probably, we should checksum udp header (it should be in cache + * in any case) and data in tiny packets (< rx copybreak). + */ + return 0; +} + +/* + * All we need to do is get the socket, and then do a checksum. 
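
udp_check() and udp_checksum_init() above delegate the arithmetic to csum_tcpudp_magic()/csum_partial(); the underlying computation is the ordinary one's-complement checksum over a pseudo header (addresses, protocol, UDP length) plus the UDP header and payload. A stand-alone, byte-oriented sketch of that computation, written for clarity rather than speed:

#include <stdint.h>
#include <string.h>

/* One's-complement accumulation over 16-bit big-endian words. */
static uint32_t csum_add_bytes(uint32_t sum, const uint8_t *p, size_t len)
{
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)((p[i] << 8) | p[i + 1]);
	if (len & 1)
		sum += (uint32_t)(p[len - 1] << 8);	/* pad the odd byte */
	return sum;
}

/* saddr/daddr/udp are byte arrays in network byte order; udplen covers the
 * UDP header plus payload, with the checksum field set to zero. */
static uint16_t udp_checksum(const uint8_t saddr[4], const uint8_t daddr[4],
			     const uint8_t *udp, uint16_t udplen)
{
	uint8_t pseudo[12];
	uint32_t sum = 0;

	memcpy(pseudo, saddr, 4);
	memcpy(pseudo + 4, daddr, 4);
	pseudo[8] = 0;
	pseudo[9] = 17;			/* IPPROTO_UDP */
	pseudo[10] = udplen >> 8;
	pseudo[11] = udplen & 0xff;

	sum = csum_add_bytes(sum, pseudo, sizeof(pseudo));
	sum = csum_add_bytes(sum, udp, udplen);

	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	/* a sender transmits 0xffff when this comes out as 0 (see
	 * "uh->check = -1" above) */
	return (uint16_t)~sum;
}
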
+ */ + +int udp_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct udphdr *uh; + unsigned short ulen; + struct rtable *rt = (struct rtable*)skb->dst; + u32 saddr = skb->nh.iph->saddr; + u32 daddr = skb->nh.iph->daddr; + int len = skb->len; + + /* + * Validate the packet and the UDP length. + */ + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto no_header; + + uh = skb->h.uh; + + ulen = ntohs(uh->len); + + if (ulen > len || ulen < sizeof(*uh)) + goto short_packet; + + if (pskb_trim(skb, ulen)) + goto short_packet; + + if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) + goto csum_error; + + if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) + return udp_v4_mcast_deliver(skb, uh, saddr, daddr); + + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); + + if (sk != NULL) { + int ret = udp_queue_rcv_skb(sk, skb); + sock_put(sk); + + /* a return value > 0 means to resubmit the input, but + * it it wants the return to be -protocol, or 0 + */ + if (ret > 0) + return -ret; + return 0; + } + + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto drop; + + /* No socket. Drop packet silently, if checksum is wrong */ + if (udp_checksum_complete(skb)) + goto csum_error; + + UDP_INC_STATS_BH(UDP_MIB_NOPORTS); + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); + + /* + * Hmm. We got an UDP packet to a port to which we + * don't wanna listen. Ignore it. + */ + kfree_skb(skb); + return(0); + +short_packet: + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "UDP: short packet: From %u.%u.%u.%u:%u %d/%d to %u.%u.%u.%u:%u\n", + NIPQUAD(saddr), + ntohs(uh->source), + ulen, + len, + NIPQUAD(daddr), + ntohs(uh->dest))); +no_header: + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return(0); + +csum_error: + /* + * RFC1122: OK. Discards the bad packet silently (as far as + * the network is concerned, anyway) as per 4.1.3.4 (MUST). + */ + NETDEBUG(if (net_ratelimit()) + printk(KERN_DEBUG "UDP: bad checksum. 
From %d.%d.%d.%d:%d to %d.%d.%d.%d:%d ulen %d\n", + NIPQUAD(saddr), + ntohs(uh->source), + NIPQUAD(daddr), + ntohs(uh->dest), + ulen)); +drop: + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return(0); +} + +static int udp_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_flush_pending_frames(sk); + release_sock(sk); + return 0; +} + +/* + * Socket option code for UDP + */ +static int udp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val; + int err = 0; + + if (level != SOL_UDP) + return ip_setsockopt(sk, level, optname, optval, optlen); + + if(optlencorkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + udp_push_pending_frames(sk, up); + release_sock(sk); + } + break; + + case UDP_ENCAP: + switch (val) { + case 0: + case UDP_ENCAP_ESPINUDP: + case UDP_ENCAP_ESPINUDP_NON_IKE: + up->encap_type = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + break; + + default: + err = -ENOPROTOOPT; + break; + }; + + return err; +} + +static int udp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val, len; + + if (level != SOL_UDP) + return ip_getsockopt(sk, level, optname, optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if(len < 0) + return -EINVAL; + + switch(optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + +/** + * udp_poll - wait for a UDP event. + * @file - file struct + * @sock - socket + * @wait - poll table + * + * This is same as datagram poll, except for the special case of + * blocking sockets. If application is using a blocking fd + * and a packet with checksum error is in the queue; + * then it could get return from select indicating data available + * but then block when reading it. Add special case code + * to work around these arguably broken applications. 
+ */ +unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + unsigned int mask = datagram_poll(file, sock, wait); + struct sock *sk = sock->sk; + + /* Check for false positives due to checksum errors */ + if ( (mask & POLLRDNORM) && + !(file->f_flags & O_NONBLOCK) && + !(sk->sk_shutdown & RCV_SHUTDOWN)){ + struct sk_buff_head *rcvq = &sk->sk_receive_queue; + struct sk_buff *skb; + + spin_lock_irq(&rcvq->lock); + while ((skb = skb_peek(rcvq)) != NULL) { + if (udp_checksum_complete(skb)) { + UDP_INC_STATS_BH(UDP_MIB_INERRORS); + __skb_unlink(skb, rcvq); + kfree_skb(skb); + } else { + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + } + } + spin_unlock_irq(&rcvq->lock); + + /* nothing to see, move along */ + if (skb == NULL) + mask &= ~(POLLIN | POLLRDNORM); + } + + return mask; + +} + +struct proto udp_prot = { + .name = "UDP", + .owner = THIS_MODULE, + .close = udp_close, + .connect = ip4_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udp_destroy_sock, + .setsockopt = udp_setsockopt, + .getsockopt = udp_getsockopt, + .sendmsg = udp_sendmsg, + .recvmsg = udp_recvmsg, + .sendpage = udp_sendpage, + .backlog_rcv = udp_queue_rcv_skb, + .hash = udp_v4_hash, + .unhash = udp_v4_unhash, + .get_port = udp_v4_get_port, + .obj_size = sizeof(struct udp_sock), +}; + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static struct sock *udp_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct udp_iter_state *state = seq->private; + + for (state->bucket = 0; state->bucket < UDP_HTABLE_SIZE; ++state->bucket) { + struct hlist_node *node; + sk_for_each(sk, node, &udp_hash[state->bucket]) { + if (sk->sk_family == state->family) + goto found; + } + } + sk = NULL; +found: + return sk; +} + +static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk) +{ + struct udp_iter_state *state = seq->private; + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sk->sk_family != state->family); + + if (!sk && ++state->bucket < UDP_HTABLE_SIZE) { + sk = sk_head(&udp_hash[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *udp_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = udp_get_first(seq); + + if (sk) + while(pos && (sk = udp_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *udp_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&udp_hash_lock); + return *pos ? 
udp_get_idx(seq, *pos-1) : (void *)1; +} + +static void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == (void *)1) + sk = udp_get_idx(seq, 0); + else + sk = udp_get_next(seq, v); + + ++*pos; + return sk; +} + +static void udp_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&udp_hash_lock); +} + +static int udp_seq_open(struct inode *inode, struct file *file) +{ + struct udp_seq_afinfo *afinfo = PDE(inode)->data; + struct seq_file *seq; + int rc = -ENOMEM; + struct udp_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + memset(s, 0, sizeof(*s)); + s->family = afinfo->family; + s->seq_ops.start = udp_seq_start; + s->seq_ops.next = udp_seq_next; + s->seq_ops.show = afinfo->seq_show; + s->seq_ops.stop = udp_seq_stop; + + rc = seq_open(file, &s->seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +/* ------------------------------------------------------------------------ */ +int udp_proc_register(struct udp_seq_afinfo *afinfo) +{ + struct proc_dir_entry *p; + int rc = 0; + + if (!afinfo) + return -EINVAL; + afinfo->seq_fops->owner = afinfo->owner; + afinfo->seq_fops->open = udp_seq_open; + afinfo->seq_fops->read = seq_read; + afinfo->seq_fops->llseek = seq_lseek; + afinfo->seq_fops->release = seq_release_private; + + p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops); + if (p) + p->data = afinfo; + else + rc = -ENOMEM; + return rc; +} + +void udp_proc_unregister(struct udp_seq_afinfo *afinfo) +{ + if (!afinfo) + return; + proc_net_remove(afinfo->name); + memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops)); +} + +/* ------------------------------------------------------------------------ */ +static void udp4_format_sock(struct sock *sp, char *tmpbuf, int bucket) +{ + struct inet_sock *inet = inet_sk(sp); + unsigned int dest = inet->daddr; + unsigned int src = inet->rcv_saddr; + __u16 destp = ntohs(inet->dport); + __u16 srcp = ntohs(inet->sport); + + sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X" + " %02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p", + bucket, src, srcp, dest, destp, sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, sock_i_uid(sp), 0, sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); +} + +static int udp4_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%-127s\n", + " sl local_address rem_address st tx_queue " + "rx_queue tr tm->when retrnsmt uid timeout " + "inode"); + else { + char tmpbuf[129]; + struct udp_iter_state *state = seq->private; + + udp4_format_sock(v, tmpbuf, state->bucket); + seq_printf(seq, "%-127s\n", tmpbuf); + } + return 0; +} + +/* ------------------------------------------------------------------------ */ +static struct file_operations udp4_seq_fops; +static struct udp_seq_afinfo udp4_seq_afinfo = { + .owner = THIS_MODULE, + .name = "udp", + .family = AF_INET, + .seq_show = udp4_seq_show, + .seq_fops = &udp4_seq_fops, +}; + +int __init udp4_proc_init(void) +{ + return udp_proc_register(&udp4_seq_afinfo); +} + +void udp4_proc_exit(void) +{ + udp_proc_unregister(&udp4_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +EXPORT_SYMBOL(udp_disconnect); +EXPORT_SYMBOL(udp_hash); +EXPORT_SYMBOL(udp_hash_lock); +EXPORT_SYMBOL(udp_ioctl); +EXPORT_SYMBOL(udp_port_rover); +EXPORT_SYMBOL(udp_prot); +EXPORT_SYMBOL(udp_sendmsg); +EXPORT_SYMBOL(udp_poll); + +#ifdef CONFIG_PROC_FS 
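
The format string in udp4_format_sock() above defines the layout of /proc/net/udp: hex addresses and ports, with the address printed as the raw 32-bit value the socket stores (so 127.0.0.1 appears as 0100007F on a little-endian machine). A small reader that pulls out the local port and state, for illustration:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/net/udp", "r");
	char line[256];
	unsigned int slot, laddr, lport, raddr, rport, state;

	if (!f)
		return 1;
	if (fgets(line, sizeof(line), f)) {		/* skip header */
		while (fgets(line, sizeof(line), f)) {
			if (sscanf(line, "%u: %x:%x %x:%x %x", &slot, &laddr,
				   &lport, &raddr, &rport, &state) == 6)
				printf("slot %u: local port %u, state %#x\n",
				       slot, lport, state);
		}
	}
	fclose(f);
	return 0;
}
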
+EXPORT_SYMBOL(udp_proc_register); +EXPORT_SYMBOL(udp_proc_unregister); +#endif diff --git a/net/ipv4/utils.c b/net/ipv4/utils.c new file mode 100644 index 000000000000..6aecd7a43534 --- /dev/null +++ b/net/ipv4/utils.c @@ -0,0 +1,59 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Various kernel-resident INET utility functions; mainly + * for format conversion and debugging output. + * + * Version: $Id: utils.c,v 1.8 2000/10/03 07:29:01 anton Exp $ + * + * Author: Fred N. van Kempen, + * + * Fixes: + * Alan Cox : verify_area check. + * Alan Cox : removed old debugging. + * Andi Kleen : add net_ratelimit() + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include + +/* + * Convert an ASCII string to binary IP. + */ + +__u32 in_aton(const char *str) +{ + unsigned long l; + unsigned int val; + int i; + + l = 0; + for (i = 0; i < 4; i++) + { + l <<= 8; + if (*str != '\0') + { + val = 0; + while (*str != '\0' && *str != '.') + { + val *= 10; + val += *str - '0'; + str++; + } + l |= val; + if (*str != '\0') + str++; + } + } + return(htonl(l)); +} + +EXPORT_SYMBOL(in_aton); diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c new file mode 100644 index 000000000000..2d3849c38a0f --- /dev/null +++ b/net/ipv4/xfrm4_input.c @@ -0,0 +1,160 @@ +/* + * xfrm4_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * Derek Atkins + * Add Encapsulation support + * + */ + +#include +#include +#include +#include +#include + +int xfrm4_rcv(struct sk_buff *skb) +{ + return xfrm4_rcv_encap(skb, 0); +} + +EXPORT_SYMBOL(xfrm4_rcv); + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) +{ + switch (nexthdr) { + case IPPROTO_IPIP: + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + return -EINVAL; + *spi = skb->nh.iph->saddr; + *seq = 0; + return 0; + } + + return xfrm_parse_spi(skb, nexthdr, spi, seq); +} + +int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) +{ + int err; + u32 spi, seq; + struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; + struct xfrm_state *x; + int xfrm_nr = 0; + int decaps = 0; + + if ((err = xfrm4_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) != 0) + goto drop; + + do { + struct iphdr *iph = skb->nh.iph; + + if (xfrm_nr == XFRM_MAX_DEPTH) + goto drop; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, iph->protocol, AF_INET); + if (x == NULL) + goto drop; + + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) + goto drop_unlock; + + if (x->props.replay_window && xfrm_replay_check(x, seq)) + goto drop_unlock; + + if (xfrm_state_check_expire(x)) + goto drop_unlock; + + xfrm_vec[xfrm_nr].decap.decap_type = encap_type; + if (x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb)) + goto drop_unlock; + + /* only the first xfrm gets the encap type */ + encap_type = 0; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + 
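
Looking back at in_aton() in utils.c above: it is the kernel-resident counterpart of the libc dotted-quad parsers, returning the address in network byte order (though without any validation of the input). A quick user-space comparison using inet_addr():

#include <stdio.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned int a = inet_addr("192.168.0.1");

	/* in_aton("192.168.0.1") in the kernel yields the same value,
	 * htonl(0xC0A80001). */
	printf("0x%08X\n", (unsigned int)ntohl(a));	/* 0xC0A80001 */
	return 0;
}
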
x->curlft.packets++; + + spin_unlock(&x->lock); + + xfrm_vec[xfrm_nr++].xvec = x; + + iph = skb->nh.iph; + + if (x->props.mode) { + if (iph->protocol != IPPROTO_IPIP) + goto drop; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto drop; + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + decaps = 1; + break; + } + + if ((err = xfrm_parse_spi(skb, skb->nh.iph->protocol, &spi, &seq)) < 0) + goto drop; + } while (!err); + + /* Allocate new secpath or COW existing one. */ + + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + sp = secpath_dup(skb->sp); + if (!sp) + goto drop; + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) + goto drop; + + memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); + skb->sp->len += xfrm_nr; + + if (decaps) { + if (!(skb->dev->flags&IFF_LOOPBACK)) { + dst_release(skb->dst); + skb->dst = NULL; + } + netif_rx(skb); + return 0; + } else { + return -skb->nh.iph->protocol; + } + +drop_unlock: + spin_unlock(&x->lock); + xfrm_state_put(x); +drop: + while (--xfrm_nr >= 0) + xfrm_state_put(xfrm_vec[xfrm_nr].xvec); + + kfree_skb(skb); + return 0; +} diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c new file mode 100644 index 000000000000..af2392ae5769 --- /dev/null +++ b/net/ipv4/xfrm4_output.c @@ -0,0 +1,141 @@ +/* + * xfrm4_output.c - Common IPsec encapsulation code for IPv4. + * Copyright (c) 2004 Herbert Xu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * In transport mode, the IP header will be moved forward to make space + * for the encapsulation header. + * + * In tunnel mode, the top IP header will be constructed per RFC 2401. + * The following fields in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. 
+ */ +static void xfrm4_encap(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + if (!x->props.mode) { + skb->h.raw += iph->ihl*4; + memmove(top_iph, iph, iph->ihl*4); + return; + } + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + if (x->props.flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = iph->frag_off & htons(IP_DF); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); +} + +static int xfrm4_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst; + struct iphdr *iph = skb->nh.iph; + + if (IPCB(skb)->flags & IPSKB_XFRM_TUNNEL_SIZE) + goto out; + + IPCB(skb)->flags |= IPSKB_XFRM_TUNNEL_SIZE; + + if (!(iph->frag_off & htons(IP_DF)) || skb->local_df) + goto out; + + dst = skb->dst; + mtu = dst_mtu(dst); + if (skb->len > mtu) { + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); + ret = -EMSGSIZE; + } +out: + return ret; +} + +int xfrm4_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + int err; + + if (skb->ip_summed == CHECKSUM_HW) { + err = skb_checksum_help(skb, 0); + if (err) + goto error_nolock; + } + + if (x->props.mode) { + err = xfrm4_tunnel_check_size(skb); + if (err) + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; + + xfrm4_encap(skb); + + err = x->type->output(x, skb); + if (err) + goto error; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock_bh(&x->lock); + + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + err = NET_XMIT_BYPASS; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c new file mode 100644 index 000000000000..7fe2afd2e669 --- /dev/null +++ b/net/ipv4/xfrm4_policy.c @@ -0,0 +1,281 @@ +/* + * xfrm4_policy.c + * + * Changes: + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include + +static struct dst_ops xfrm4_dst_ops; +static struct xfrm_policy_afinfo xfrm4_policy_afinfo; + +static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; + +static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +{ + return __ip_route_output_key((struct rtable**)dst, fl); +} + +static struct dst_entry * +__xfrm4_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + read_lock_bh(&policy->lock); + for (dst = policy->bundles; dst; dst = dst->next) { + struct xfrm_dst *xdst = (struct xfrm_dst*)dst; + if (xdst->u.rt.fl.oif == fl->oif && /*XXX*/ + xdst->u.rt.fl.fl4_dst == fl->fl4_dst && + xdst->u.rt.fl.fl4_src == fl->fl4_src && + xfrm_bundle_ok(xdst, fl, AF_INET)) { + dst_clone(dst); + break; + } + } + read_unlock_bh(&policy->lock); + return dst; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. 
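
A side note on xfrm4_tunnel_check_size() above: before encapsulating, a tunnel-mode packet that carries DF and exceeds the route MTU is answered with ICMP "fragmentation needed" rather than being silently fragmented after encapsulation. The decision reduces to a small predicate; plain parameters replace the skb and dst here, so this is only a sketch.

/* Returns non-zero when the caller should send ICMP_FRAG_NEEDED with the
 * given MTU and drop the packet (the kernel then fails with -EMSGSIZE). */
static int tunnel_packet_too_big(int pkt_len, int mtu, int df_set,
				 int local_df)
{
	if (!df_set || local_df)
		return 0;	/* fragmentation is still allowed */
	return pkt_len > mtu;
}
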
+ */ + +static int +__xfrm4_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p) +{ + struct dst_entry *dst, *dst_prev; + struct rtable *rt0 = (struct rtable*)(*dst_p); + struct rtable *rt = rt0; + u32 remote = fl->fl4_dst; + u32 local = fl->fl4_src; + struct flowi fl_tunnel = { + .nl_u = { + .ip4_u = { + .saddr = local, + .daddr = remote + } + } + }; + int i; + int err; + int header_len = 0; + int trailer_len = 0; + + dst = dst_prev = NULL; + dst_hold(&rt->u.dst); + + for (i = 0; i < nx; i++) { + struct dst_entry *dst1 = dst_alloc(&xfrm4_dst_ops); + struct xfrm_dst *xdst; + int tunnel = 0; + + if (unlikely(dst1 == NULL)) { + err = -ENOBUFS; + dst_release(&rt->u.dst); + goto error; + } + + if (!dst) + dst = dst1; + else { + dst_prev->child = dst1; + dst1->flags |= DST_NOHASH; + dst_clone(dst1); + } + + xdst = (struct xfrm_dst *)dst1; + xdst->route = &rt->u.dst; + + dst1->next = dst_prev; + dst_prev = dst1; + if (xfrm[i]->props.mode) { + remote = xfrm[i]->id.daddr.a4; + local = xfrm[i]->props.saddr.a4; + tunnel = 1; + } + header_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + + if (tunnel) { + fl_tunnel.fl4_src = local; + fl_tunnel.fl4_dst = remote; + err = xfrm_dst_lookup((struct xfrm_dst **)&rt, + &fl_tunnel, AF_INET); + if (err) + goto error; + } else + dst_hold(&rt->u.dst); + } + + dst_prev->child = &rt->u.dst; + dst->path = &rt->u.dst; + + *dst_p = dst; + dst = dst_prev; + + dst_prev = *dst_p; + i = 0; + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; + x->u.rt.fl = *fl; + + dst_prev->xfrm = xfrm[i++]; + dst_prev->dev = rt->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + dst_prev->obsolete = -1; + dst_prev->flags |= DST_HOST; + dst_prev->lastuse = jiffies; + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); + + /* Copy neighbout for reachability confirmation */ + dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); + dst_prev->input = rt->u.dst.input; + dst_prev->output = xfrm4_output; + if (rt->peer) + atomic_inc(&rt->peer->refcnt); + x->u.rt.peer = rt->peer; + /* Sheit... I remember I did this right. 
Apparently, + * it was magically lost, so this code needs audit */ + x->u.rt.rt_flags = rt0->rt_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); + x->u.rt.rt_type = rt->rt_type; + x->u.rt.rt_src = rt0->rt_src; + x->u.rt.rt_dst = rt0->rt_dst; + x->u.rt.rt_gateway = rt->rt_gateway; + x->u.rt.rt_spec_dst = rt0->rt_spec_dst; + header_len -= x->u.dst.xfrm->props.header_len; + trailer_len -= x->u.dst.xfrm->props.trailer_len; + } + + xfrm_init_pmtu(dst); + return 0; + +error: + if (dst) + dst_free(dst); + return err; +} + +static void +_decode_session4(struct sk_buff *skb, struct flowi *fl) +{ + struct iphdr *iph = skb->nh.iph; + u8 *xprth = skb->nh.raw + iph->ihl*4; + + memset(fl, 0, sizeof(struct flowi)); + if (!(iph->frag_off & htons(IP_MF | IP_OFFSET))) { + switch (iph->protocol) { + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u16 *ports = (u16 *)xprth; + + fl->fl_ip_sport = ports[0]; + fl->fl_ip_dport = ports[1]; + } + break; + + case IPPROTO_ICMP: + if (pskb_may_pull(skb, xprth + 2 - skb->data)) { + u8 *icmp = xprth; + + fl->fl_icmp_type = icmp[0]; + fl->fl_icmp_code = icmp[1]; + } + break; + + case IPPROTO_ESP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u32 *ehdr = (u32 *)xprth; + + fl->fl_ipsec_spi = ehdr[0]; + } + break; + + case IPPROTO_AH: + if (pskb_may_pull(skb, xprth + 8 - skb->data)) { + u32 *ah_hdr = (u32*)xprth; + + fl->fl_ipsec_spi = ah_hdr[1]; + } + break; + + case IPPROTO_COMP: + if (pskb_may_pull(skb, xprth + 4 - skb->data)) { + u16 *ipcomp_hdr = (u16 *)xprth; + + fl->fl_ipsec_spi = ntohl(ntohs(ipcomp_hdr[1])); + } + break; + default: + fl->fl_ipsec_spi = 0; + break; + }; + } + fl->proto = iph->protocol; + fl->fl4_dst = iph->daddr; + fl->fl4_src = iph->saddr; +} + +static inline int xfrm4_garbage_collect(void) +{ + read_lock(&xfrm4_policy_afinfo.lock); + xfrm4_policy_afinfo.garbage_collect(); + read_unlock(&xfrm4_policy_afinfo.lock); + return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); +} + +static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static struct dst_ops xfrm4_dst_ops = { + .family = AF_INET, + .protocol = __constant_htons(ETH_P_IP), + .gc = xfrm4_garbage_collect, + .update_pmtu = xfrm4_update_pmtu, + .gc_thresh = 1024, + .entry_size = sizeof(struct xfrm_dst), +}; + +static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { + .family = AF_INET, + .lock = RW_LOCK_UNLOCKED, + .type_map = &xfrm4_type_map, + .dst_ops = &xfrm4_dst_ops, + .dst_lookup = xfrm4_dst_lookup, + .find_bundle = __xfrm4_find_bundle, + .bundle_create = __xfrm4_bundle_create, + .decode_session = _decode_session4, +}; + +static void __init xfrm4_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm4_policy_afinfo); +} + +static void __exit xfrm4_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm4_policy_afinfo); +} + +void __init xfrm4_init(void) +{ + xfrm4_state_init(); + xfrm4_policy_init(); +} + diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c new file mode 100644 index 000000000000..223a2e83853f --- /dev/null +++ b/net/ipv4/xfrm4_state.c @@ -0,0 +1,126 @@ +/* + * xfrm4_state.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include + +static struct xfrm_state_afinfo xfrm4_state_afinfo; + +static void +__xfrm4_init_tempsel(struct xfrm_state *x, struct flowi *fl, + 
struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + x->sel.daddr.a4 = fl->fl4_dst; + x->sel.saddr.a4 = fl->fl4_src; + x->sel.dport = xfrm_flowi_dport(fl); + x->sel.dport_mask = ~0; + x->sel.sport = xfrm_flowi_sport(fl); + x->sel.sport_mask = ~0; + x->sel.prefixlen_d = 32; + x->sel.prefixlen_s = 32; + x->sel.proto = fl->proto; + x->sel.ifindex = fl->oif; + x->id = tmpl->id; + if (x->id.daddr.a4 == 0) + x->id.daddr.a4 = daddr->a4; + x->props.saddr = tmpl->saddr; + if (x->props.saddr.a4 == 0) + x->props.saddr.a4 = saddr->a4; + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET; +} + +static struct xfrm_state * +__xfrm4_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto) +{ + unsigned h = __xfrm4_spi_hash(daddr, spi, proto); + struct xfrm_state *x; + + list_for_each_entry(x, xfrm4_state_afinfo.state_byspi+h, byspi) { + if (x->props.family == AF_INET && + spi == x->id.spi && + daddr->a4 == x->id.daddr.a4 && + proto == x->id.proto) { + xfrm_state_hold(x); + return x; + } + } + return NULL; +} + +static struct xfrm_state * +__xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create) +{ + struct xfrm_state *x, *x0; + unsigned h = __xfrm4_dst_hash(daddr); + + x0 = NULL; + + list_for_each_entry(x, xfrm4_state_afinfo.state_bydst+h, bydst) { + if (x->props.family == AF_INET && + daddr->a4 == x->id.daddr.a4 && + mode == x->props.mode && + proto == x->id.proto && + saddr->a4 == x->props.saddr.a4 && + reqid == x->props.reqid && + x->km.state == XFRM_STATE_ACQ && + !x->id.spi) { + x0 = x; + break; + } + } + if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) { + x0->sel.daddr.a4 = daddr->a4; + x0->sel.saddr.a4 = saddr->a4; + x0->sel.prefixlen_d = 32; + x0->sel.prefixlen_s = 32; + x0->props.saddr.a4 = saddr->a4; + x0->km.state = XFRM_STATE_ACQ; + x0->id.daddr.a4 = daddr->a4; + x0->id.proto = proto; + x0->props.family = AF_INET; + x0->props.mode = mode; + x0->props.reqid = reqid; + x0->props.family = AF_INET; + x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x0); + x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; + add_timer(&x0->timer); + xfrm_state_hold(x0); + list_add_tail(&x0->bydst, xfrm4_state_afinfo.state_bydst+h); + wake_up(&km_waitq); + } + if (x0) + xfrm_state_hold(x0); + return x0; +} + +static struct xfrm_state_afinfo xfrm4_state_afinfo = { + .family = AF_INET, + .lock = RW_LOCK_UNLOCKED, + .init_tempsel = __xfrm4_init_tempsel, + .state_lookup = __xfrm4_state_lookup, + .find_acq = __xfrm4_find_acq, +}; + +void __init xfrm4_state_init(void) +{ + xfrm_state_register_afinfo(&xfrm4_state_afinfo); +} + +void __exit xfrm4_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm4_state_afinfo); +} + diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c new file mode 100644 index 000000000000..413191f585f6 --- /dev/null +++ b/net/ipv4/xfrm4_tunnel.c @@ -0,0 +1,144 @@ +/* xfrm4_tunnel.c: Generic IP tunnel transformer. + * + * Copyright (C) 2003 David S. 
Miller (davem@redhat.com) + */ + +#include +#include +#include +#include +#include + +static int ipip_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph; + + iph = skb->nh.iph; + iph->tot_len = htons(skb->len); + ip_send_check(iph); + + return 0; +} + +static int ipip_xfrm_rcv(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_tunnel *ipip_handler; +static DECLARE_MUTEX(xfrm4_tunnel_sem); + +int xfrm4_tunnel_register(struct xfrm_tunnel *handler) +{ + int ret; + + down(&xfrm4_tunnel_sem); + ret = 0; + if (ipip_handler != NULL) + ret = -EINVAL; + if (!ret) + ipip_handler = handler; + up(&xfrm4_tunnel_sem); + + return ret; +} + +EXPORT_SYMBOL(xfrm4_tunnel_register); + +int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler) +{ + int ret; + + down(&xfrm4_tunnel_sem); + ret = 0; + if (ipip_handler != handler) + ret = -EINVAL; + if (!ret) + ipip_handler = NULL; + up(&xfrm4_tunnel_sem); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(xfrm4_tunnel_deregister); + +static int ipip_rcv(struct sk_buff *skb) +{ + struct xfrm_tunnel *handler = ipip_handler; + + /* Tunnel devices take precedence. */ + if (handler && handler->handler(skb) == 0) + return 0; + + return xfrm4_rcv(skb); +} + +static void ipip_err(struct sk_buff *skb, u32 info) +{ + struct xfrm_tunnel *handler = ipip_handler; + u32 arg = info; + + if (handler) + handler->err_handler(skb, &arg); +} + +static int ipip_init_state(struct xfrm_state *x, void *args) +{ + if (!x->props.mode) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct iphdr); + + return 0; +} + +static void ipip_destroy(struct xfrm_state *x) +{ +} + +static struct xfrm_type ipip_type = { + .description = "IPIP", + .owner = THIS_MODULE, + .proto = IPPROTO_IPIP, + .init_state = ipip_init_state, + .destructor = ipip_destroy, + .input = ipip_xfrm_rcv, + .output = ipip_output +}; + +static struct net_protocol ipip_protocol = { + .handler = ipip_rcv, + .err_handler = ipip_err, + .no_policy = 1, +}; + +static int __init ipip_init(void) +{ + if (xfrm_register_type(&ipip_type, AF_INET) < 0) { + printk(KERN_INFO "ipip init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet_add_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) { + printk(KERN_INFO "ipip init: can't add protocol\n"); + xfrm_unregister_type(&ipip_type, AF_INET); + return -EAGAIN; + } + return 0; +} + +static void __exit ipip_fini(void) +{ + if (inet_del_protocol(&ipip_protocol, IPPROTO_IPIP) < 0) + printk(KERN_INFO "ipip close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipip_type, AF_INET) < 0) + printk(KERN_INFO "ipip close: can't remove xfrm type\n"); +} + +module_init(ipip_init); +module_exit(ipip_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig new file mode 100644 index 000000000000..e66ca9381cfd --- /dev/null +++ b/net/ipv6/Kconfig @@ -0,0 +1,79 @@ +# +# IPv6 configuration +# +config IPV6_PRIVACY + bool "IPv6: Privacy Extensions (RFC 3041) support" + depends on IPV6 + ---help--- + Privacy Extensions for Stateless Address Autoconfiguration in IPv6 + support. With this option, additional periodically-alter + pseudo-random global-scope unicast address(es) will assigned to + your interface(s). + + By default, kernel do not generate temporary addresses. + To use temporary addresses, do + + echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr + + See for details. 
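+	  The setting can also be changed for a single interface, e.g. for
+	  an interface named eth0:
+
+	  echo 2 >/proc/sys/net/ipv6/conf/eth0/use_tempaddr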
+ +config INET6_AH + tristate "IPv6: AH transformation" + depends on IPV6 + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + ---help--- + Support for IPsec AH. + + If unsure, say Y. + +config INET6_ESP + tristate "IPv6: ESP transformation" + depends on IPV6 + select XFRM + select CRYPTO + select CRYPTO_HMAC + select CRYPTO_MD5 + select CRYPTO_SHA1 + select CRYPTO_DES + ---help--- + Support for IPsec ESP. + + If unsure, say Y. + +config INET6_IPCOMP + tristate "IPv6: IPComp transformation" + depends on IPV6 + select XFRM + select INET6_TUNNEL + select CRYPTO + select CRYPTO_DEFLATE + ---help--- + Support for IP Payload Compression Protocol (IPComp) (RFC3173), + typically needed for IPsec. + + If unsure, say Y. + +config INET6_TUNNEL + tristate "IPv6: tunnel transformation" + depends on IPV6 + select XFRM + ---help--- + Support for generic IPv6-in-IPv6 tunnel transformation, which is + required by the IPv6-in-IPv6 tunneling module as well as tunnel mode + IPComp. + + If unsure, say Y. + +config IPV6_TUNNEL + tristate "IPv6: IPv6-in-IPv6 tunnel" + depends on IPV6 + select INET6_TUNNEL + ---help--- + Support for IPv6-in-IPv6 tunnels described in RFC 2473. + + If unsure, say N. + diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile new file mode 100644 index 000000000000..b39e04940590 --- /dev/null +++ b/net/ipv6/Makefile @@ -0,0 +1,25 @@ +# +# Makefile for the Linux TCP/IP (INET6) layer. +# + +obj-$(CONFIG_IPV6) += ipv6.o + +ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ + route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ + protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ + exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ + ip6_flowlabel.o ipv6_syms.o + +ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ + xfrm6_output.o +ipv6-objs += $(ipv6-y) + +obj-$(CONFIG_INET6_AH) += ah6.o +obj-$(CONFIG_INET6_ESP) += esp6.o +obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o +obj-$(CONFIG_INET6_TUNNEL) += xfrm6_tunnel.o +obj-$(CONFIG_NETFILTER) += netfilter/ + +obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o + +obj-y += exthdrs_core.o diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c new file mode 100644 index 000000000000..5ffde14ddc09 --- /dev/null +++ b/net/ipv6/addrconf.c @@ -0,0 +1,3615 @@ +/* + * IPv6 Address [auto]configuration + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * $Id: addrconf.c,v 1.69 2001/10/31 21:55:54 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Janos Farkas : delete timer on ifdown + * + * Andi Kleen : kill double kfree on module + * unload. + * Maciej W. Rozycki : FDDI support + * sekiya@USAGI : Don't send too many RS + * packets. + * yoshfuji@USAGI : Fixed interval between DAD + * packets. + * YOSHIFUJI Hideaki @USAGI : improved accuracy of + * address validation timer. + * YOSHIFUJI Hideaki @USAGI : Privacy Extensions (RFC3041) + * support. + * Yuji SEKIYA @USAGI : Don't assign a same IPv6 + * address on a same interface. + * YOSHIFUJI Hideaki @USAGI : ARCnet support + * YOSHIFUJI Hideaki @USAGI : convert /proc/net/if_inet6 to + * seq_file. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPV6_PRIVACY +#include +#include +#include +#endif + +#include + +#include +#include + +/* Set to 3 to get tracing... */ +#define ACONF_DEBUG 2 + +#if ACONF_DEBUG >= 3 +#define ADBG(x) printk x +#else +#define ADBG(x) +#endif + +#define INFINITY_LIFE_TIME 0xFFFFFFFF +#define TIME_DELTA(a,b) ((unsigned long)((long)(a) - (long)(b))) + +#ifdef CONFIG_SYSCTL +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p); +static void addrconf_sysctl_unregister(struct ipv6_devconf *p); +#endif + +#ifdef CONFIG_IPV6_PRIVACY +static int __ipv6_regen_rndid(struct inet6_dev *idev); +static int __ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr); +static void ipv6_regen_rndid(unsigned long data); + +static int desync_factor = MAX_DESYNC_FACTOR * HZ; +static struct crypto_tfm *md5_tfm; +static DEFINE_SPINLOCK(md5_tfm_lock); +#endif + +static int ipv6_count_addresses(struct inet6_dev *idev); + +/* + * Configured unicast address hash table + */ +static struct inet6_ifaddr *inet6_addr_lst[IN6_ADDR_HSIZE]; +static DEFINE_RWLOCK(addrconf_hash_lock); + +/* Protects inet6 devices */ +DEFINE_RWLOCK(addrconf_lock); + +static void addrconf_verify(unsigned long); + +static struct timer_list addr_chk_timer = + TIMER_INITIALIZER(addrconf_verify, 0, 0); +static DEFINE_SPINLOCK(addrconf_verify_lock); + +static void addrconf_join_anycast(struct inet6_ifaddr *ifp); +static void addrconf_leave_anycast(struct inet6_ifaddr *ifp); + +static int addrconf_ifdown(struct net_device *dev, int how); + +static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags); +static void addrconf_dad_timer(unsigned long data); +static void addrconf_dad_completed(struct inet6_ifaddr *ifp); +static void addrconf_rs_timer(unsigned long data); +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifa); + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo); +static int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev); + +static struct notifier_block *inet6addr_chain; + +struct ipv6_devconf ipv6_devconf = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .force_mld_version = 0, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + .temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, +}; + +static struct ipv6_devconf ipv6_devconf_dflt = { + .forwarding = 0, + .hop_limit = IPV6_DEFAULT_HOPLIMIT, + .mtu6 = IPV6_MIN_MTU, + .accept_ra = 1, + .accept_redirects = 1, + .autoconf = 1, + .dad_transmits = 1, + .rtr_solicits = MAX_RTR_SOLICITATIONS, + .rtr_solicit_interval = RTR_SOLICITATION_INTERVAL, + .rtr_solicit_delay = MAX_RTR_SOLICITATION_DELAY, +#ifdef CONFIG_IPV6_PRIVACY + .use_tempaddr = 0, + 
.temp_valid_lft = TEMP_VALID_LIFETIME, + .temp_prefered_lft = TEMP_PREFERRED_LIFETIME, + .regen_max_retry = REGEN_MAX_RETRY, + .max_desync_factor = MAX_DESYNC_FACTOR, +#endif + .max_addresses = IPV6_MAX_ADDRESSES, +}; + +/* IPv6 Wildcard Address and Loopback Address defined by RFC2553 */ +#if 0 +const struct in6_addr in6addr_any = IN6ADDR_ANY_INIT; +#endif +const struct in6_addr in6addr_loopback = IN6ADDR_LOOPBACK_INIT; + +int ipv6_addr_type(const struct in6_addr *addr) +{ + int type; + u32 st; + + st = addr->s6_addr32[0]; + + if ((st & htonl(0xFF000000)) == htonl(0xFF000000)) { + type = IPV6_ADDR_MULTICAST; + + switch((st & htonl(0x00FF0000))) { + case __constant_htonl(0x00010000): + type |= IPV6_ADDR_LOOPBACK; + break; + + case __constant_htonl(0x00020000): + type |= IPV6_ADDR_LINKLOCAL; + break; + + case __constant_htonl(0x00050000): + type |= IPV6_ADDR_SITELOCAL; + break; + }; + return type; + } + + type = IPV6_ADDR_UNICAST; + + /* Consider all addresses with the first three bits different of + 000 and 111 as finished. + */ + if ((st & htonl(0xE0000000)) != htonl(0x00000000) && + (st & htonl(0xE0000000)) != htonl(0xE0000000)) + return type; + + if ((st & htonl(0xFFC00000)) == htonl(0xFE800000)) + return (IPV6_ADDR_LINKLOCAL | type); + + if ((st & htonl(0xFFC00000)) == htonl(0xFEC00000)) + return (IPV6_ADDR_SITELOCAL | type); + + if ((addr->s6_addr32[0] | addr->s6_addr32[1]) == 0) { + if (addr->s6_addr32[2] == 0) { + if (addr->s6_addr32[3] == 0) + return IPV6_ADDR_ANY; + + if (addr->s6_addr32[3] == htonl(0x00000001)) + return (IPV6_ADDR_LOOPBACK | type); + + return (IPV6_ADDR_COMPATv4 | type); + } + + if (addr->s6_addr32[2] == htonl(0x0000ffff)) + return IPV6_ADDR_MAPPED; + } + + st &= htonl(0xFF000000); + if (st == 0) + return IPV6_ADDR_RESERVED; + st &= htonl(0xFE000000); + if (st == htonl(0x02000000)) + return IPV6_ADDR_RESERVED; /* for NSAP */ + if (st == htonl(0x04000000)) + return IPV6_ADDR_RESERVED; /* for IPX */ + return type; +} + +static void addrconf_del_timer(struct inet6_ifaddr *ifp) +{ + if (del_timer(&ifp->timer)) + __in6_ifa_put(ifp); +} + +enum addrconf_timer_t +{ + AC_NONE, + AC_DAD, + AC_RS, +}; + +static void addrconf_mod_timer(struct inet6_ifaddr *ifp, + enum addrconf_timer_t what, + unsigned long when) +{ + if (!del_timer(&ifp->timer)) + in6_ifa_hold(ifp); + + switch (what) { + case AC_DAD: + ifp->timer.function = addrconf_dad_timer; + break; + case AC_RS: + ifp->timer.function = addrconf_rs_timer; + break; + default:; + } + ifp->timer.expires = jiffies + when; + add_timer(&ifp->timer); +} + +/* Nobody refers to this device, we may destroy it. */ + +void in6_dev_finish_destroy(struct inet6_dev *idev) +{ + struct net_device *dev = idev->dev; + BUG_TRAP(idev->addr_list==NULL); + BUG_TRAP(idev->mc_list==NULL); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "in6_dev_finish_destroy: %s\n", dev ? 
dev->name : "NIL"); +#endif + dev_put(dev); + if (!idev->dead) { + printk("Freeing alive inet6 device %p\n", idev); + return; + } + snmp6_free_dev(idev); + kfree(idev); +} + +static struct inet6_dev * ipv6_add_dev(struct net_device *dev) +{ + struct inet6_dev *ndev; + + ASSERT_RTNL(); + + if (dev->mtu < IPV6_MIN_MTU) + return NULL; + + ndev = kmalloc(sizeof(struct inet6_dev), GFP_KERNEL); + + if (ndev) { + memset(ndev, 0, sizeof(struct inet6_dev)); + + rwlock_init(&ndev->lock); + ndev->dev = dev; + memcpy(&ndev->cnf, &ipv6_devconf_dflt, sizeof(ndev->cnf)); + ndev->cnf.mtu6 = dev->mtu; + ndev->cnf.sysctl = NULL; + ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl); + if (ndev->nd_parms == NULL) { + kfree(ndev); + return NULL; + } + /* We refer to the device */ + dev_hold(dev); + + if (snmp6_alloc_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot allocate memory for statistics; dev=%s.\n", + __FUNCTION__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + if (snmp6_register_dev(ndev) < 0) { + ADBG((KERN_WARNING + "%s(): cannot create /proc/net/dev_snmp6/%s\n", + __FUNCTION__, dev->name)); + neigh_parms_release(&nd_tbl, ndev->nd_parms); + ndev->dead = 1; + in6_dev_finish_destroy(ndev); + return NULL; + } + + /* One reference from device. We must do this before + * we invoke __ipv6_regen_rndid(). + */ + in6_dev_hold(ndev); + +#ifdef CONFIG_IPV6_PRIVACY + get_random_bytes(ndev->rndid, sizeof(ndev->rndid)); + get_random_bytes(ndev->entropy, sizeof(ndev->entropy)); + init_timer(&ndev->regen_timer); + ndev->regen_timer.function = ipv6_regen_rndid; + ndev->regen_timer.data = (unsigned long) ndev; + if ((dev->flags&IFF_LOOPBACK) || + dev->type == ARPHRD_TUNNEL || + dev->type == ARPHRD_SIT) { + printk(KERN_INFO + "Disabled Privacy Extensions on device %p(%s)\n", + dev, dev->name); + ndev->cnf.use_tempaddr = -1; + } else { + in6_dev_hold(ndev); + ipv6_regen_rndid((unsigned long) ndev); + } +#endif + + write_lock_bh(&addrconf_lock); + dev->ip6_ptr = ndev; + write_unlock_bh(&addrconf_lock); + + ipv6_mc_init_dev(ndev); + ndev->tstamp = jiffies; +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(dev, ndev->nd_parms, NET_IPV6, + NET_IPV6_NEIGH, "ipv6", + &ndisc_ifinfo_sysctl_change, + NULL); + addrconf_sysctl_register(ndev, &ndev->cnf); +#endif + } + return ndev; +} + +static struct inet6_dev * ipv6_find_idev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + if ((idev = __in6_dev_get(dev)) == NULL) { + if ((idev = ipv6_add_dev(dev)) == NULL) + return NULL; + } + if (dev->flags&IFF_UP) + ipv6_mc_up(idev); + return idev; +} + +#ifdef CONFIG_SYSCTL +static void dev_forward_change(struct inet6_dev *idev) +{ + struct net_device *dev; + struct inet6_ifaddr *ifa; + struct in6_addr addr; + + if (!idev) + return; + dev = idev->dev; + if (dev && (dev->flags & IFF_MULTICAST)) { + ipv6_addr_all_routers(&addr); + + if (idev->cnf.forwarding) + ipv6_dev_mc_inc(dev, &addr); + else + ipv6_dev_mc_dec(dev, &addr); + } + for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { + if (idev->cnf.forwarding) + addrconf_join_anycast(ifa); + else + addrconf_leave_anycast(ifa); + } +} + + +static void addrconf_forward_change(void) +{ + struct net_device *dev; + struct inet6_dev *idev; + + read_lock(&dev_base_lock); + for (dev=dev_base; dev; dev=dev->next) { + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { + int changed = (!idev->cnf.forwarding) ^ (!ipv6_devconf.forwarding); + idev->cnf.forwarding = 
ipv6_devconf.forwarding; + if (changed) + dev_forward_change(idev); + } + read_unlock(&addrconf_lock); + } + read_unlock(&dev_base_lock); +} +#endif + +/* Nobody refers to this ifaddr, destroy it */ + +void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp) +{ + BUG_TRAP(ifp->if_next==NULL); + BUG_TRAP(ifp->lst_next==NULL); +#ifdef NET_REFCNT_DEBUG + printk(KERN_DEBUG "inet6_ifa_finish_destroy\n"); +#endif + + in6_dev_put(ifp->idev); + + if (del_timer(&ifp->timer)) + printk("Timer is still running, when freeing ifa=%p\n", ifp); + + if (!ifp->dead) { + printk("Freeing alive inet6 address %p\n", ifp); + return; + } + dst_release(&ifp->rt->u.dst); + + kfree(ifp); +} + +/* On success it returns ifp with increased reference count */ + +static struct inet6_ifaddr * +ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, int pfxlen, + int scope, unsigned flags) +{ + struct inet6_ifaddr *ifa = NULL; + struct rt6_info *rt; + int hash; + int err = 0; + + read_lock_bh(&addrconf_lock); + if (idev->dead) { + err = -ENODEV; /*XXX*/ + goto out2; + } + + write_lock(&addrconf_hash_lock); + + /* Ignore adding duplicate addresses on an interface */ + if (ipv6_chk_same_addr(addr, idev->dev)) { + ADBG(("ipv6_add_addr: already assigned\n")); + err = -EEXIST; + goto out; + } + + ifa = kmalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC); + + if (ifa == NULL) { + ADBG(("ipv6_add_addr: malloc failed\n")); + err = -ENOBUFS; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 0); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + goto out; + } + + memset(ifa, 0, sizeof(struct inet6_ifaddr)); + ipv6_addr_copy(&ifa->addr, addr); + + spin_lock_init(&ifa->lock); + init_timer(&ifa->timer); + ifa->timer.data = (unsigned long) ifa; + ifa->scope = scope; + ifa->prefix_len = pfxlen; + ifa->flags = flags | IFA_F_TENTATIVE; + ifa->cstamp = ifa->tstamp = jiffies; + + ifa->idev = idev; + in6_dev_hold(idev); + /* For caller */ + in6_ifa_hold(ifa); + + /* Add to big hash table */ + hash = ipv6_addr_hash(addr); + + ifa->lst_next = inet6_addr_lst[hash]; + inet6_addr_lst[hash] = ifa; + in6_ifa_hold(ifa); + write_unlock(&addrconf_hash_lock); + + write_lock(&idev->lock); + /* Add to inet6_dev unicast addr list. 
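+ * The address already holds one reference for the caller and one for
+ * the global hash table; another is taken below for this per-device
+ * list (and one more for the temporary-address list when
+ * IFA_F_TEMPORARY is set).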
*/ + ifa->if_next = idev->addr_list; + idev->addr_list = ifa; + +#ifdef CONFIG_IPV6_PRIVACY + if (ifa->flags&IFA_F_TEMPORARY) { + ifa->tmp_next = idev->tempaddr_list; + idev->tempaddr_list = ifa; + in6_ifa_hold(ifa); + } +#endif + + ifa->rt = rt; + + in6_ifa_hold(ifa); + write_unlock(&idev->lock); +out2: + read_unlock_bh(&addrconf_lock); + + if (unlikely(err == 0)) + notifier_call_chain(&inet6addr_chain, NETDEV_UP, ifa); + else { + kfree(ifa); + ifa = ERR_PTR(err); + } + + return ifa; +out: + write_unlock(&addrconf_hash_lock); + goto out2; +} + +/* This function wants to get referenced ifp and releases it before return */ + +static void ipv6_del_addr(struct inet6_ifaddr *ifp) +{ + struct inet6_ifaddr *ifa, **ifap; + struct inet6_dev *idev = ifp->idev; + int hash; + int deleted = 0, onlink = 0; + unsigned long expires = jiffies; + + hash = ipv6_addr_hash(&ifp->addr); + + ifp->dead = 1; + + write_lock_bh(&addrconf_hash_lock); + for (ifap = &inet6_addr_lst[hash]; (ifa=*ifap) != NULL; + ifap = &ifa->lst_next) { + if (ifa == ifp) { + *ifap = ifa->lst_next; + __in6_ifa_put(ifp); + ifa->lst_next = NULL; + break; + } + } + write_unlock_bh(&addrconf_hash_lock); + + write_lock_bh(&idev->lock); +#ifdef CONFIG_IPV6_PRIVACY + if (ifp->flags&IFA_F_TEMPORARY) { + for (ifap = &idev->tempaddr_list; (ifa=*ifap) != NULL; + ifap = &ifa->tmp_next) { + if (ifa == ifp) { + *ifap = ifa->tmp_next; + if (ifp->ifpub) { + in6_ifa_put(ifp->ifpub); + ifp->ifpub = NULL; + } + __in6_ifa_put(ifp); + ifa->tmp_next = NULL; + break; + } + } + } +#endif + + for (ifap = &idev->addr_list; (ifa=*ifap) != NULL; + ifap = &ifa->if_next) { + if (ifa == ifp) { + *ifap = ifa->if_next; + __in6_ifa_put(ifp); + ifa->if_next = NULL; + if (!(ifp->flags & IFA_F_PERMANENT) || onlink > 0) + break; + deleted = 1; + } else if (ifp->flags & IFA_F_PERMANENT) { + if (ipv6_prefix_equal(&ifa->addr, &ifp->addr, + ifp->prefix_len)) { + if (ifa->flags & IFA_F_PERMANENT) { + onlink = 1; + if (deleted) + break; + } else { + unsigned long lifetime; + + if (!onlink) + onlink = -1; + + spin_lock(&ifa->lock); + lifetime = min_t(unsigned long, + ifa->valid_lft, 0x7fffffffUL/HZ); + if (time_before(expires, + ifa->tstamp + lifetime * HZ)) + expires = ifa->tstamp + lifetime * HZ; + spin_unlock(&ifa->lock); + } + } + } + } + write_unlock_bh(&idev->lock); + + ipv6_ifa_notify(RTM_DELADDR, ifp); + + notifier_call_chain(&inet6addr_chain,NETDEV_DOWN,ifp); + + addrconf_del_timer(ifp); + + /* + * Purge or update corresponding prefix + * + * 1) we don't purge prefix here if address was not permanent. + * prefix is managed by its own lifetime. + * 2) if there're no addresses, delete prefix. + * 3) if there're still other permanent address(es), + * corresponding prefix is still permanent. + * 4) otherwise, update prefix lifetime to the + * longest valid lifetime among the corresponding + * addresses on the device. + * Note: subsequent RA will update lifetime. 
+ * + * --yoshfuji + */ + if ((ifp->flags & IFA_F_PERMANENT) && onlink < 1) { + struct in6_addr prefix; + struct rt6_info *rt; + + ipv6_addr_prefix(&prefix, &ifp->addr, ifp->prefix_len); + rt = rt6_lookup(&prefix, NULL, ifp->idev->dev->ifindex, 1); + + if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { + if (onlink == 0) { + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } else if (!(rt->rt6i_flags & RTF_EXPIRES)) { + rt->rt6i_expires = expires; + rt->rt6i_flags |= RTF_EXPIRES; + } + } + dst_release(&rt->u.dst); + } + + in6_ifa_put(ifp); +} + +#ifdef CONFIG_IPV6_PRIVACY +static int ipv6_create_tempaddr(struct inet6_ifaddr *ifp, struct inet6_ifaddr *ift) +{ + struct inet6_dev *idev = ifp->idev; + struct in6_addr addr, *tmpaddr; + unsigned long tmp_prefered_lft, tmp_valid_lft, tmp_cstamp, tmp_tstamp; + int tmp_plen; + int ret = 0; + int max_addresses; + + write_lock(&idev->lock); + if (ift) { + spin_lock_bh(&ift->lock); + memcpy(&addr.s6_addr[8], &ift->addr.s6_addr[8], 8); + spin_unlock_bh(&ift->lock); + tmpaddr = &addr; + } else { + tmpaddr = NULL; + } +retry: + in6_dev_hold(idev); + if (idev->cnf.use_tempaddr <= 0) { + write_unlock(&idev->lock); + printk(KERN_INFO + "ipv6_create_tempaddr(): use_tempaddr is disabled.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + spin_lock_bh(&ifp->lock); + if (ifp->regen_count++ >= idev->cnf.regen_max_retry) { + idev->cnf.use_tempaddr = -1; /*XXX*/ + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration time exceeded. disabled temporary address support.\n"); + in6_dev_put(idev); + ret = -1; + goto out; + } + in6_ifa_hold(ifp); + memcpy(addr.s6_addr, ifp->addr.s6_addr, 8); + if (__ipv6_try_regen_rndid(idev, tmpaddr) < 0) { + spin_unlock_bh(&ifp->lock); + write_unlock(&idev->lock); + printk(KERN_WARNING + "ipv6_create_tempaddr(): regeneration of randomized interface id failed.\n"); + in6_ifa_put(ifp); + in6_dev_put(idev); + ret = -1; + goto out; + } + memcpy(&addr.s6_addr[8], idev->rndid, 8); + tmp_valid_lft = min_t(__u32, + ifp->valid_lft, + idev->cnf.temp_valid_lft); + tmp_prefered_lft = min_t(__u32, + ifp->prefered_lft, + idev->cnf.temp_prefered_lft - desync_factor / HZ); + tmp_plen = ifp->prefix_len; + max_addresses = idev->cnf.max_addresses; + tmp_cstamp = ifp->cstamp; + tmp_tstamp = ifp->tstamp; + spin_unlock_bh(&ifp->lock); + + write_unlock(&idev->lock); + ift = !max_addresses || + ipv6_count_addresses(idev) < max_addresses ? + ipv6_add_addr(idev, &addr, tmp_plen, + ipv6_addr_type(&addr)&IPV6_ADDR_SCOPE_MASK, IFA_F_TEMPORARY) : NULL; + if (!ift || IS_ERR(ift)) { + in6_ifa_put(ifp); + in6_dev_put(idev); + printk(KERN_INFO + "ipv6_create_tempaddr(): retry temporary address regeneration.\n"); + tmpaddr = &addr; + write_lock(&idev->lock); + goto retry; + } + + spin_lock_bh(&ift->lock); + ift->ifpub = ifp; + ift->valid_lft = tmp_valid_lft; + ift->prefered_lft = tmp_prefered_lft; + ift->cstamp = tmp_cstamp; + ift->tstamp = tmp_tstamp; + spin_unlock_bh(&ift->lock); + + addrconf_dad_start(ift, 0); + in6_ifa_put(ift); + in6_dev_put(idev); +out: + return ret; +} +#endif + +/* + * Choose an appropriate source address + * should do: + * i) get an address with an appropriate scope + * ii) see if there is a specific route for the destination and use + * an address of the attached interface + * iii) don't use deprecated addresses + */ +static int inline ipv6_saddr_pref(const struct inet6_ifaddr *ifp, u8 invpref) +{ + int pref; + pref = ifp->flags&IFA_F_DEPRECATED ? 
0 : 2; +#ifdef CONFIG_IPV6_PRIVACY + pref |= (ifp->flags^invpref)&IFA_F_TEMPORARY ? 0 : 1; +#endif + return pref; +} + +#ifdef CONFIG_IPV6_PRIVACY +#define IPV6_GET_SADDR_MAXSCORE(score) ((score) == 3) +#else +#define IPV6_GET_SADDR_MAXSCORE(score) (score) +#endif + +int ipv6_dev_get_saddr(struct net_device *dev, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + struct inet6_ifaddr *ifp = NULL; + struct inet6_ifaddr *match = NULL; + struct inet6_dev *idev; + int scope; + int err; + int hiscore = -1, score; + + scope = ipv6_addr_scope(daddr); + + /* + * known dev + * search dev and walk through dev addresses + */ + + if (dev) { + if (dev->flags & IFF_LOOPBACK) + scope = IFA_HOST; + + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == scope) { + if (ifp->flags&IFA_F_TENTATIVE) + continue; +#ifdef CONFIG_IPV6_PRIVACY + score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); +#else + score = ipv6_saddr_pref(ifp, 0); +#endif + if (score <= hiscore) + continue; + + if (match) + in6_ifa_put(match); + match = ifp; + hiscore = score; + in6_ifa_hold(ifp); + + if (IPV6_GET_SADDR_MAXSCORE(score)) { + read_unlock_bh(&idev->lock); + read_unlock(&addrconf_lock); + goto out; + } + } + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + } + + if (scope == IFA_LINK) + goto out; + + /* + * dev == NULL or search failed for specified dev + */ + + read_lock(&dev_base_lock); + read_lock(&addrconf_lock); + for (dev = dev_base; dev; dev=dev->next) { + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == scope) { + if (ifp->flags&IFA_F_TENTATIVE) + continue; +#ifdef CONFIG_IPV6_PRIVACY + score = ipv6_saddr_pref(ifp, idev->cnf.use_tempaddr > 1 ? IFA_F_TEMPORARY : 0); +#else + score = ipv6_saddr_pref(ifp, 0); +#endif + if (score <= hiscore) + continue; + + if (match) + in6_ifa_put(match); + match = ifp; + hiscore = score; + in6_ifa_hold(ifp); + + if (IPV6_GET_SADDR_MAXSCORE(score)) { + read_unlock_bh(&idev->lock); + goto out_unlock_base; + } + } + } + read_unlock_bh(&idev->lock); + } + } + +out_unlock_base: + read_unlock(&addrconf_lock); + read_unlock(&dev_base_lock); + +out: + err = -EADDRNOTAVAIL; + if (match) { + ipv6_addr_copy(saddr, &match->addr); + err = 0; + in6_ifa_put(match); + } + + return err; +} + + +int ipv6_get_saddr(struct dst_entry *dst, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + return ipv6_dev_get_saddr(dst ? 
((struct rt6_info *)dst)->rt6i_idev->dev : NULL, daddr, saddr); +} + + +int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev; + int err = -EADDRNOTAVAIL; + + read_lock(&addrconf_lock); + if ((idev = __in6_dev_get(dev)) != NULL) { + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + ipv6_addr_copy(addr, &ifp->addr); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + return err; +} + +static int ipv6_count_addresses(struct inet6_dev *idev) +{ + int cnt = 0; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) + cnt++; + read_unlock_bh(&idev->lock); + return cnt; +} + +int ipv6_chk_addr(struct in6_addr *addr, struct net_device *dev, int strict) +{ + struct inet6_ifaddr * ifp; + u8 hash = ipv6_addr_hash(addr); + + read_lock_bh(&addrconf_hash_lock); + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr) && + !(ifp->flags&IFA_F_TENTATIVE)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) + break; + } + } + read_unlock_bh(&addrconf_hash_lock); + return ifp != NULL; +} + +static +int ipv6_chk_same_addr(const struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_ifaddr * ifp; + u8 hash = ipv6_addr_hash(addr); + + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev) + break; + } + } + return ifp != NULL; +} + +struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device *dev, int strict) +{ + struct inet6_ifaddr * ifp; + u8 hash = ipv6_addr_hash(addr); + + read_lock_bh(&addrconf_hash_lock); + for(ifp = inet6_addr_lst[hash]; ifp; ifp=ifp->lst_next) { + if (ipv6_addr_equal(&ifp->addr, addr)) { + if (dev == NULL || ifp->idev->dev == dev || + !(ifp->scope&(IFA_LINK|IFA_HOST) || strict)) { + in6_ifa_hold(ifp); + break; + } + } + } + read_unlock_bh(&addrconf_hash_lock); + + return ifp; +} + +int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) +{ + const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; + const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; + u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + int sk_ipv6only = ipv6_only_sock(sk); + int sk2_ipv6only = tcp_v6_ipv6only(sk2); + int addr_type = ipv6_addr_type(sk_rcv_saddr6); + int addr_type2 = sk2_rcv_saddr6 ? 
ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED; + + if (!sk2_rcv_saddr && !sk_ipv6only) + return 1; + + if (addr_type2 == IPV6_ADDR_ANY && + !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED)) + return 1; + + if (addr_type == IPV6_ADDR_ANY && + !(sk_ipv6only && addr_type2 == IPV6_ADDR_MAPPED)) + return 1; + + if (sk2_rcv_saddr6 && + ipv6_addr_equal(sk_rcv_saddr6, sk2_rcv_saddr6)) + return 1; + + if (addr_type == IPV6_ADDR_MAPPED && + !sk2_ipv6only && + (!sk2_rcv_saddr || !sk_rcv_saddr || sk_rcv_saddr == sk2_rcv_saddr)) + return 1; + + return 0; +} + +/* Gets referenced address, destroys ifaddr */ + +void addrconf_dad_failure(struct inet6_ifaddr *ifp) +{ + if (net_ratelimit()) + printk(KERN_INFO "%s: duplicate address detected!\n", ifp->idev->dev->name); + if (ifp->flags&IFA_F_PERMANENT) { + spin_lock_bh(&ifp->lock); + addrconf_del_timer(ifp); + ifp->flags |= IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + in6_ifa_put(ifp); +#ifdef CONFIG_IPV6_PRIVACY + } else if (ifp->flags&IFA_F_TEMPORARY) { + struct inet6_ifaddr *ifpub; + spin_lock_bh(&ifp->lock); + ifpub = ifp->ifpub; + if (ifpub) { + in6_ifa_hold(ifpub); + spin_unlock_bh(&ifp->lock); + ipv6_create_tempaddr(ifpub, ifp); + in6_ifa_put(ifpub); + } else { + spin_unlock_bh(&ifp->lock); + } + ipv6_del_addr(ifp); +#endif + } else + ipv6_del_addr(ifp); +} + + +/* Join to solicited addr multicast group. */ + +void addrconf_join_solict(struct net_device *dev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); +} + +void addrconf_leave_solict(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct in6_addr maddr; + + if (idev->dev->flags&(IFF_LOOPBACK|IFF_NOARP)) + return; + + addrconf_addr_solict_mult(addr, &maddr); + __ipv6_dev_mc_dec(idev, &maddr); +} + +void addrconf_join_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + ipv6_dev_ac_inc(ifp->idev->dev, &addr); +} + +void addrconf_leave_anycast(struct inet6_ifaddr *ifp) +{ + struct in6_addr addr; + ipv6_addr_prefix(&addr, &ifp->addr, ifp->prefix_len); + if (ipv6_addr_any(&addr)) + return; + __ipv6_dev_ac_dec(ifp->idev, &addr); +} + +static int ipv6_generate_eui64(u8 *eui, struct net_device *dev) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_FDDI: + case ARPHRD_IEEE802_TR: + if (dev->addr_len != ETH_ALEN) + return -1; + memcpy(eui, dev->dev_addr, 3); + memcpy(eui + 5, dev->dev_addr + 3, 3); + + /* + * The zSeries OSA network cards can be shared among various + * OS instances, but the OSA cards have only one MAC address. + * This leads to duplicate address conflicts in conjunction + * with IPv6 if more than one instance uses the same card. + * + * The driver for these cards can deliver a unique 16-bit + * identifier for each instance sharing the same card. It is + * placed instead of 0xFFFE in the interface identifier. The + * "u" bit of the interface identifier is not inverted in this + * case. Hence the resulting interface identifier has local + * scope according to RFC2373. 
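+ *
+ * In the common case the EUI-64 is formed by splitting the 48-bit MAC
+ * address, inserting 0xFFFE in the middle and inverting the
+ * universal/local bit, e.g. 00:0a:95:9d:68:16 becomes
+ * 02:0a:95:ff:fe:9d:68:16.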
+ */ + if (dev->dev_id) { + eui[3] = (dev->dev_id >> 8) & 0xFF; + eui[4] = dev->dev_id & 0xFF; + } else { + eui[3] = 0xFF; + eui[4] = 0xFE; + eui[0] ^= 2; + } + return 0; + case ARPHRD_ARCNET: + /* XXX: inherit EUI-64 from other interface -- yoshfuji */ + if (dev->addr_len != ARCNET_ALEN) + return -1; + memset(eui, 0, 7); + eui[7] = *(u8*)dev->dev_addr; + return 0; + case ARPHRD_INFINIBAND: + if (dev->addr_len != INFINIBAND_ALEN) + return -1; + memcpy(eui, dev->dev_addr + 12, 8); + eui[0] |= 2; + return 0; + } + return -1; +} + +static int ipv6_inherit_eui64(u8 *eui, struct inet6_dev *idev) +{ + int err = -1; + struct inet6_ifaddr *ifp; + + read_lock_bh(&idev->lock); + for (ifp=idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->scope == IFA_LINK && !(ifp->flags&IFA_F_TENTATIVE)) { + memcpy(eui, ifp->addr.s6_addr+8, 8); + err = 0; + break; + } + } + read_unlock_bh(&idev->lock); + return err; +} + +#ifdef CONFIG_IPV6_PRIVACY +/* (re)generation of randomized interface identifier (RFC 3041 3.2, 3.5) */ +static int __ipv6_regen_rndid(struct inet6_dev *idev) +{ + struct net_device *dev; + struct scatterlist sg[2]; + + sg[0].page = virt_to_page(idev->entropy); + sg[0].offset = offset_in_page(idev->entropy); + sg[0].length = 8; + sg[1].page = virt_to_page(idev->work_eui64); + sg[1].offset = offset_in_page(idev->work_eui64); + sg[1].length = 8; + + dev = idev->dev; + + if (ipv6_generate_eui64(idev->work_eui64, dev)) { + printk(KERN_INFO + "__ipv6_regen_rndid(idev=%p): cannot get EUI64 identifier; use random bytes.\n", + idev); + get_random_bytes(idev->work_eui64, sizeof(idev->work_eui64)); + } +regen: + spin_lock(&md5_tfm_lock); + if (unlikely(md5_tfm == NULL)) { + spin_unlock(&md5_tfm_lock); + return -1; + } + crypto_digest_init(md5_tfm); + crypto_digest_update(md5_tfm, sg, 2); + crypto_digest_final(md5_tfm, idev->work_digest); + spin_unlock(&md5_tfm_lock); + + memcpy(idev->rndid, &idev->work_digest[0], 8); + idev->rndid[0] &= ~0x02; + memcpy(idev->entropy, &idev->work_digest[8], 8); + + /* + * : + * check if generated address is not inappropriate + * + * - Reserved subnet anycast (RFC 2526) + * 11111101 11....11 1xxxxxxx + * - ISATAP (draft-ietf-ngtrans-isatap-13.txt) 5.1 + * 00-00-5E-FE-xx-xx-xx-xx + * - value 0 + * - XXX: already assigned to an address on the device + */ + if (idev->rndid[0] == 0xfd && + (idev->rndid[1]&idev->rndid[2]&idev->rndid[3]&idev->rndid[4]&idev->rndid[5]&idev->rndid[6]) == 0xff && + (idev->rndid[7]&0x80)) + goto regen; + if ((idev->rndid[0]|idev->rndid[1]) == 0) { + if (idev->rndid[2] == 0x5e && idev->rndid[3] == 0xfe) + goto regen; + if ((idev->rndid[2]|idev->rndid[3]|idev->rndid[4]|idev->rndid[5]|idev->rndid[6]|idev->rndid[7]) == 0x00) + goto regen; + } + + return 0; +} + +static void ipv6_regen_rndid(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *) data; + unsigned long expires; + + read_lock_bh(&addrconf_lock); + write_lock_bh(&idev->lock); + + if (idev->dead) + goto out; + + if (__ipv6_regen_rndid(idev) < 0) + goto out; + + expires = jiffies + + idev->cnf.temp_prefered_lft * HZ - + idev->cnf.regen_max_retry * idev->cnf.dad_transmits * idev->nd_parms->retrans_time - desync_factor; + if (time_before(expires, jiffies)) { + printk(KERN_WARNING + "ipv6_regen_rndid(): too short regeneration interval; timer disabled for %s.\n", + idev->dev->name); + goto out; + } + + if (!mod_timer(&idev->regen_timer, expires)) + in6_dev_hold(idev); + +out: + write_unlock_bh(&idev->lock); + read_unlock_bh(&addrconf_lock); + in6_dev_put(idev); +} + +static int 
__ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpaddr) { + int ret = 0; + + if (tmpaddr && memcmp(idev->rndid, &tmpaddr->s6_addr[8], 8) == 0) + ret = __ipv6_regen_rndid(idev); + return ret; +} +#endif + +/* + * Add prefix route. + */ + +static void +addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev, + unsigned long expires, unsigned flags) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(rtmsg)); + ipv6_addr_copy(&rtmsg.rtmsg_dst, pfx); + rtmsg.rtmsg_dst_len = plen; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_info = expires; + rtmsg.rtmsg_flags = RTF_UP|flags; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + + /* Prevent useless cloning on PtP SIT. + This thing is done here expecting that the whole + class of non-broadcast devices need not cloning. + */ + if (dev->type == ARPHRD_SIT && (dev->flags&IFF_POINTOPOINT)) + rtmsg.rtmsg_flags |= RTF_NONEXTHOP; + + ip6_route_add(&rtmsg, NULL, NULL); +} + +/* Create "default" multicast route to the interface */ + +static void addrconf_add_mroute(struct net_device *dev) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(rtmsg)); + ipv6_addr_set(&rtmsg.rtmsg_dst, + htonl(0xFF000000), 0, 0, 0); + rtmsg.rtmsg_dst_len = 8; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + rtmsg.rtmsg_ifindex = dev->ifindex; + rtmsg.rtmsg_flags = RTF_UP; + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + ip6_route_add(&rtmsg, NULL, NULL); +} + +static void sit_route_add(struct net_device *dev) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(rtmsg)); + + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + rtmsg.rtmsg_metric = IP6_RT_PRIO_ADDRCONF; + + /* prefix length - 96 bits "::d.d.d.d" */ + rtmsg.rtmsg_dst_len = 96; + rtmsg.rtmsg_flags = RTF_UP|RTF_NONEXTHOP; + rtmsg.rtmsg_ifindex = dev->ifindex; + + ip6_route_add(&rtmsg, NULL, NULL); +} + +static void addrconf_add_lroute(struct net_device *dev) +{ + struct in6_addr addr; + + ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0); + addrconf_prefix_route(&addr, 64, dev, 0, 0); +} + +static struct inet6_dev *addrconf_add_dev(struct net_device *dev) +{ + struct inet6_dev *idev; + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) + return NULL; + + /* Add default multicast route */ + addrconf_add_mroute(dev); + + /* Add link local route */ + addrconf_add_lroute(dev); + return idev; +} + +void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len) +{ + struct prefix_info *pinfo; + __u32 valid_lft; + __u32 prefered_lft; + int addr_type; + unsigned long rt_expires; + struct inet6_dev *in6_dev; + + pinfo = (struct prefix_info *) opt; + + if (len < sizeof(struct prefix_info)) { + ADBG(("addrconf: prefix option too short\n")); + return; + } + + /* + * Validation checks ([ADDRCONF], page 19) + */ + + addr_type = ipv6_addr_type(&pinfo->prefix); + + if (addr_type & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL)) + return; + + valid_lft = ntohl(pinfo->valid); + prefered_lft = ntohl(pinfo->prefered); + + if (prefered_lft > valid_lft) { + if (net_ratelimit()) + printk(KERN_WARNING "addrconf: prefix option has invalid lifetime\n"); + return; + } + + in6_dev = in6_dev_get(dev); + + if (in6_dev == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "addrconf: device %s not configured\n", dev->name); + return; + } + + /* + * Two things going on here: + * 1) Add routes for on-link prefixes + * 2) Configure prefixes with the auto flag set + */ + + /* Avoid arithmetic overflow. 
Really, we could + save rt_expires in seconds, likely valid_lft, + but it would require division in fib gc, that it + not good. + */ + if (valid_lft >= 0x7FFFFFFF/HZ) + rt_expires = 0; + else + rt_expires = jiffies + valid_lft * HZ; + + if (pinfo->onlink) { + struct rt6_info *rt; + rt = rt6_lookup(&pinfo->prefix, NULL, dev->ifindex, 1); + + if (rt && ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0)) { + if (rt->rt6i_flags&RTF_EXPIRES) { + if (valid_lft == 0) { + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } else { + rt->rt6i_expires = rt_expires; + } + } + } else if (valid_lft) { + addrconf_prefix_route(&pinfo->prefix, pinfo->prefix_len, + dev, rt_expires, RTF_ADDRCONF|RTF_EXPIRES|RTF_PREFIX_RT); + } + if (rt) + dst_release(&rt->u.dst); + } + + /* Try to figure out our local address for this prefix */ + + if (pinfo->autoconf && in6_dev->cnf.autoconf) { + struct inet6_ifaddr * ifp; + struct in6_addr addr; + int create = 0, update_lft = 0; + + if (pinfo->prefix_len == 64) { + memcpy(&addr, &pinfo->prefix, 8); + if (ipv6_generate_eui64(addr.s6_addr + 8, dev) && + ipv6_inherit_eui64(addr.s6_addr + 8, in6_dev)) { + in6_dev_put(in6_dev); + return; + } + goto ok; + } + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6 addrconf: prefix with wrong length %d\n", + pinfo->prefix_len); + in6_dev_put(in6_dev); + return; + +ok: + + ifp = ipv6_get_ifaddr(&addr, dev, 1); + + if (ifp == NULL && valid_lft) { + int max_addresses = in6_dev->cnf.max_addresses; + + /* Do not allow to create too much of autoconfigured + * addresses; this would be too easy way to crash kernel. + */ + if (!max_addresses || + ipv6_count_addresses(in6_dev) < max_addresses) + ifp = ipv6_add_addr(in6_dev, &addr, pinfo->prefix_len, + addr_type&IPV6_ADDR_SCOPE_MASK, 0); + + if (!ifp || IS_ERR(ifp)) { + in6_dev_put(in6_dev); + return; + } + + update_lft = create = 1; + ifp->cstamp = jiffies; + addrconf_dad_start(ifp, RTF_ADDRCONF|RTF_PREFIX_RT); + } + + if (ifp) { + int flags; + unsigned long now; +#ifdef CONFIG_IPV6_PRIVACY + struct inet6_ifaddr *ift; +#endif + u32 stored_lft; + + /* update lifetime (RFC2462 5.5.3 e) */ + spin_lock(&ifp->lock); + now = jiffies; + if (ifp->valid_lft > (now - ifp->tstamp) / HZ) + stored_lft = ifp->valid_lft - (now - ifp->tstamp) / HZ; + else + stored_lft = 0; + if (!update_lft && stored_lft) { + if (valid_lft > MIN_VALID_LIFETIME || + valid_lft > stored_lft) + update_lft = 1; + else if (stored_lft <= MIN_VALID_LIFETIME) { + /* valid_lft <= stored_lft is always true */ + /* XXX: IPsec */ + update_lft = 0; + } else { + valid_lft = MIN_VALID_LIFETIME; + if (valid_lft < prefered_lft) + prefered_lft = valid_lft; + update_lft = 1; + } + } + + if (update_lft) { + ifp->valid_lft = valid_lft; + ifp->prefered_lft = prefered_lft; + ifp->tstamp = now; + flags = ifp->flags; + ifp->flags &= ~IFA_F_DEPRECATED; + spin_unlock(&ifp->lock); + + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ifp); + } else + spin_unlock(&ifp->lock); + +#ifdef CONFIG_IPV6_PRIVACY + read_lock_bh(&in6_dev->lock); + /* update all temporary addresses in the list */ + for (ift=in6_dev->tempaddr_list; ift; ift=ift->tmp_next) { + /* + * When adjusting the lifetimes of an existing + * temporary address, only lower the lifetimes. + * Implementations must not increase the + * lifetimes of an existing temporary address + * when processing a Prefix Information Option. 
+ */ + spin_lock(&ift->lock); + flags = ift->flags; + if (ift->valid_lft > valid_lft && + ift->valid_lft - valid_lft > (jiffies - ift->tstamp) / HZ) + ift->valid_lft = valid_lft + (jiffies - ift->tstamp) / HZ; + if (ift->prefered_lft > prefered_lft && + ift->prefered_lft - prefered_lft > (jiffies - ift->tstamp) / HZ) + ift->prefered_lft = prefered_lft + (jiffies - ift->tstamp) / HZ; + spin_unlock(&ift->lock); + if (!(flags&IFA_F_TENTATIVE)) + ipv6_ifa_notify(0, ift); + } + + if (create && in6_dev->cnf.use_tempaddr > 0) { + /* + * When a new public address is created as described in [ADDRCONF], + * also create a new temporary address. + */ + read_unlock_bh(&in6_dev->lock); + ipv6_create_tempaddr(ifp, NULL); + } else { + read_unlock_bh(&in6_dev->lock); + } +#endif + in6_ifa_put(ifp); + addrconf_verify(0); + } + } + inet6_prefix_notify(RTM_NEWPREFIX, in6_dev, pinfo); + in6_dev_put(in6_dev); +} + +/* + * Set destination address. + * Special case for SIT interfaces where we create a new "virtual" + * device. + */ +int addrconf_set_dstaddr(void __user *arg) +{ + struct in6_ifreq ireq; + struct net_device *dev; + int err = -EINVAL; + + rtnl_lock(); + + err = -EFAULT; + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + goto err_exit; + + dev = __dev_get_by_index(ireq.ifr6_ifindex); + + err = -ENODEV; + if (dev == NULL) + goto err_exit; + + if (dev->type == ARPHRD_SIT) { + struct ifreq ifr; + mm_segment_t oldfs; + struct ip_tunnel_parm p; + + err = -EADDRNOTAVAIL; + if (!(ipv6_addr_type(&ireq.ifr6_addr) & IPV6_ADDR_COMPATv4)) + goto err_exit; + + memset(&p, 0, sizeof(p)); + p.iph.daddr = ireq.ifr6_addr.s6_addr32[3]; + p.iph.saddr = 0; + p.iph.version = 4; + p.iph.ihl = 5; + p.iph.protocol = IPPROTO_IPV6; + p.iph.ttl = 64; + ifr.ifr_ifru.ifru_data = (void __user *)&p; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL); + set_fs(oldfs); + + if (err == 0) { + err = -ENOBUFS; + if ((dev = __dev_get_by_name(p.name)) == NULL) + goto err_exit; + err = dev_open(dev); + } + } + +err_exit: + rtnl_unlock(); + return err; +} + +/* + * Manual configuration of address on an interface + */ +static int inet6_addr_add(int ifindex, struct in6_addr *pfx, int plen) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + int scope; + + ASSERT_RTNL(); + + if ((dev = __dev_get_by_index(ifindex)) == NULL) + return -ENODEV; + + if (!(dev->flags&IFF_UP)) + return -ENETDOWN; + + if ((idev = addrconf_add_dev(dev)) == NULL) + return -ENOBUFS; + + scope = ipv6_addr_scope(pfx); + + ifp = ipv6_add_addr(idev, pfx, plen, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + return 0; + } + + return PTR_ERR(ifp); +} + +static int inet6_addr_del(int ifindex, struct in6_addr *pfx, int plen) +{ + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct net_device *dev; + + if ((dev = __dev_get_by_index(ifindex)) == NULL) + return -ENODEV; + + if ((idev = __in6_dev_get(dev)) == NULL) + return -ENXIO; + + read_lock_bh(&idev->lock); + for (ifp = idev->addr_list; ifp; ifp=ifp->if_next) { + if (ifp->prefix_len == plen && + ipv6_addr_equal(pfx, &ifp->addr)) { + in6_ifa_hold(ifp); + read_unlock_bh(&idev->lock); + + ipv6_del_addr(ifp); + + /* If the last address is deleted administratively, + disable IPv6 on this interface. 
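+ * (addrconf_ifdown(dev, 1) below performs the full teardown of the
+ * inet6_dev, including its routes and multicast state.)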
+ */ + if (idev->addr_list == NULL) + addrconf_ifdown(idev->dev, 1); + return 0; + } + } + read_unlock_bh(&idev->lock); + return -EADDRNOTAVAIL; +} + + +int addrconf_add_ifaddr(void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_add(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +int addrconf_del_ifaddr(void __user *arg) +{ + struct in6_ifreq ireq; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (copy_from_user(&ireq, arg, sizeof(struct in6_ifreq))) + return -EFAULT; + + rtnl_lock(); + err = inet6_addr_del(ireq.ifr6_ifindex, &ireq.ifr6_addr, ireq.ifr6_prefixlen); + rtnl_unlock(); + return err; +} + +static void sit_add_v4_addrs(struct inet6_dev *idev) +{ + struct inet6_ifaddr * ifp; + struct in6_addr addr; + struct net_device *dev; + int scope; + + ASSERT_RTNL(); + + memset(&addr, 0, sizeof(struct in6_addr)); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + + if (idev->dev->flags&IFF_POINTOPOINT) { + addr.s6_addr32[0] = htonl(0xfe800000); + scope = IFA_LINK; + } else { + scope = IPV6_ADDR_COMPATv4; + } + + if (addr.s6_addr32[3]) { + ifp = ipv6_add_addr(idev, &addr, 128, scope, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } + return; + } + + for (dev = dev_base; dev != NULL; dev = dev->next) { + struct in_device * in_dev = __in_dev_get(dev); + if (in_dev && (dev->flags & IFF_UP)) { + struct in_ifaddr * ifa; + + int flag = scope; + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + int plen; + + addr.s6_addr32[3] = ifa->ifa_local; + + if (ifa->ifa_scope == RT_SCOPE_LINK) + continue; + if (ifa->ifa_scope >= RT_SCOPE_HOST) { + if (idev->dev->flags&IFF_POINTOPOINT) + continue; + flag |= IFA_HOST; + } + if (idev->dev->flags&IFF_POINTOPOINT) + plen = 64; + else + plen = 96; + + ifp = ipv6_add_addr(idev, &addr, plen, flag, + IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } + } + } + } +} + +static void init_loopback(struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr * ifp; + + /* ::1 */ + + ASSERT_RTNL(); + + if ((idev = ipv6_find_idev(dev)) == NULL) { + printk(KERN_DEBUG "init loopback: add_dev failed\n"); + return; + } + + ifp = ipv6_add_addr(idev, &in6addr_loopback, 128, IFA_HOST, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + spin_lock_bh(&ifp->lock); + ifp->flags &= ~IFA_F_TENTATIVE; + spin_unlock_bh(&ifp->lock); + ipv6_ifa_notify(RTM_NEWADDR, ifp); + in6_ifa_put(ifp); + } +} + +static void addrconf_add_linklocal(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct inet6_ifaddr * ifp; + + ifp = ipv6_add_addr(idev, addr, 64, IFA_LINK, IFA_F_PERMANENT); + if (!IS_ERR(ifp)) { + addrconf_dad_start(ifp, 0); + in6_ifa_put(ifp); + } +} + +static void addrconf_dev_config(struct net_device *dev) +{ + struct in6_addr addr; + struct inet6_dev * idev; + + ASSERT_RTNL(); + + if ((dev->type != ARPHRD_ETHER) && + (dev->type != ARPHRD_FDDI) && + (dev->type != ARPHRD_IEEE802_TR) && + (dev->type != ARPHRD_ARCNET) && + (dev->type != ARPHRD_INFINIBAND)) { + /* Alas, we support only Ethernet autoconfiguration. 
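+ * (More precisely: only the link types checked above that can supply
+ * an EUI-64, i.e. Ethernet, FDDI, Token Ring, ARCnet and InfiniBand.)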
*/
+ return;
+ }
+
+ idev = addrconf_add_dev(dev);
+ if (idev == NULL)
+ return;
+
+ memset(&addr, 0, sizeof(struct in6_addr));
+ addr.s6_addr32[0] = htonl(0xFE800000);
+
+ if (ipv6_generate_eui64(addr.s6_addr + 8, dev) == 0)
+ addrconf_add_linklocal(idev, &addr);
+}
+
+static void addrconf_sit_config(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ /*
+ * Configure the tunnel with one of our IPv4
+ * addresses... we should configure all of
+ * our v4 addrs in the tunnel
+ */
+
+ if ((idev = ipv6_find_idev(dev)) == NULL) {
+ printk(KERN_DEBUG "init sit: add_dev failed\n");
+ return;
+ }
+
+ sit_add_v4_addrs(idev);
+
+ if (dev->flags&IFF_POINTOPOINT) {
+ addrconf_add_mroute(dev);
+ addrconf_add_lroute(dev);
+ } else
+ sit_route_add(dev);
+}
+
+static inline int
+ipv6_inherit_linklocal(struct inet6_dev *idev, struct net_device *link_dev)
+{
+ struct in6_addr lladdr;
+
+ if (!ipv6_get_lladdr(link_dev, &lladdr)) {
+ addrconf_add_linklocal(idev, &lladdr);
+ return 0;
+ }
+ return -1;
+}
+
+static void ip6_tnl_add_linklocal(struct inet6_dev *idev)
+{
+ struct net_device *link_dev;
+
+ /* first try to inherit the link-local address from the link device */
+ if (idev->dev->iflink &&
+ (link_dev = __dev_get_by_index(idev->dev->iflink))) {
+ if (!ipv6_inherit_linklocal(idev, link_dev))
+ return;
+ }
+ /* then try to inherit it from any device */
+ for (link_dev = dev_base; link_dev; link_dev = link_dev->next) {
+ if (!ipv6_inherit_linklocal(idev, link_dev))
+ return;
+ }
+ printk(KERN_DEBUG "init ip6-ip6: add_linklocal failed\n");
+}
+
+/*
+ * Autoconfigure tunnel with a link-local address so routing protocols,
+ * DHCPv6, MLD etc. can be run over the virtual link
+ */
+
+static void addrconf_ip6_tnl_config(struct net_device *dev)
+{
+ struct inet6_dev *idev;
+
+ ASSERT_RTNL();
+
+ if ((idev = addrconf_add_dev(dev)) == NULL) {
+ printk(KERN_DEBUG "init ip6-ip6: add_dev failed\n");
+ return;
+ }
+ ip6_tnl_add_linklocal(idev);
+ addrconf_add_mroute(dev);
+}
+
+static int addrconf_notify(struct notifier_block *this, unsigned long event,
+ void * data)
+{
+ struct net_device *dev = (struct net_device *) data;
+ struct inet6_dev *idev = __in6_dev_get(dev);
+
+ switch(event) {
+ case NETDEV_UP:
+ switch(dev->type) {
+ case ARPHRD_SIT:
+ addrconf_sit_config(dev);
+ break;
+ case ARPHRD_TUNNEL6:
+ addrconf_ip6_tnl_config(dev);
+ break;
+ case ARPHRD_LOOPBACK:
+ init_loopback(dev);
+ break;
+
+ default:
+ addrconf_dev_config(dev);
+ break;
+ };
+ if (idev) {
+ /* If the MTU changed while the interface was down, the new
+ MTU must be reflected in the idev as well as in the routes
+ once the interface comes back up.
+ */
+ if (idev->cnf.mtu6 != dev->mtu && dev->mtu >= IPV6_MIN_MTU) {
+ rt6_mtu_change(dev, dev->mtu);
+ idev->cnf.mtu6 = dev->mtu;
+ }
+ idev->tstamp = jiffies;
+ inet6_ifinfo_notify(RTM_NEWLINK, idev);
+ /* If the MTU changed while the interface was down and is now
+ lower than IPV6_MIN_MTU, stop IPv6 on this interface.
+ */
+ if (dev->mtu < IPV6_MIN_MTU)
+ addrconf_ifdown(dev, event != NETDEV_DOWN);
+ }
+ break;
+
+ case NETDEV_CHANGEMTU:
+ if (idev && dev->mtu >= IPV6_MIN_MTU) {
+ rt6_mtu_change(dev, dev->mtu);
+ idev->cnf.mtu6 = dev->mtu;
+ break;
+ }
+
+ /* MTU fell below IPV6_MIN_MTU; fall through and stop IPv6 on this interface. */
+
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ /*
+ * Remove all addresses from this interface.
+ */
+ addrconf_ifdown(dev, event != NETDEV_DOWN);
+ break;
+ case NETDEV_CHANGE:
+ break;
+ case NETDEV_CHANGENAME:
+#ifdef CONFIG_SYSCTL
+ if (idev) {
+ addrconf_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->nd_parms);
+ neigh_sysctl_register(dev, idev->nd_parms,
+ NET_IPV6, NET_IPV6_NEIGH, "ipv6",
+ &ndisc_ifinfo_sysctl_change,
+ NULL);
+ addrconf_sysctl_register(idev, &idev->cnf);
+ }
+#endif
+ break;
+ };
+
+ return NOTIFY_OK;
+}
+
+/*
+ * addrconf module should be notified of a device going up
+ */
+static struct notifier_block ipv6_dev_notf = {
+ .notifier_call = addrconf_notify,
+ .priority = 0
+};
+
+static int addrconf_ifdown(struct net_device *dev, int how)
+{
+ struct inet6_dev *idev;
+ struct inet6_ifaddr *ifa, **bifa;
+ int i;
+
+ ASSERT_RTNL();
+
+ if (dev == &loopback_dev && how == 1)
+ how = 0;
+
+ rt6_ifdown(dev);
+ neigh_ifdown(&nd_tbl, dev);
+
+ idev = __in6_dev_get(dev);
+ if (idev == NULL)
+ return -ENODEV;
+
+ /* Step 1: remove reference to ipv6 device from parent device.
+ Do not dev_put!
+ */
+ if (how == 1) {
+ write_lock_bh(&addrconf_lock);
+ dev->ip6_ptr = NULL;
+ idev->dead = 1;
+ write_unlock_bh(&addrconf_lock);
+
+ /* Step 1.5: remove snmp6 entry */
+ snmp6_unregister_dev(idev);
+
+ }
+
+ /* Step 2: clear hash table */
+ for (i=0; i < IN6_ADDR_HSIZE; i++) {
+ bifa = &inet6_addr_lst[i];
+
+ write_lock_bh(&addrconf_hash_lock);
+ while ((ifa = *bifa) != NULL) {
+ if (ifa->idev == idev) {
+ *bifa = ifa->lst_next;
+ ifa->lst_next = NULL;
+ addrconf_del_timer(ifa);
+ in6_ifa_put(ifa);
+ continue;
+ }
+ bifa = &ifa->lst_next;
+ }
+ write_unlock_bh(&addrconf_hash_lock);
+ }
+
+ write_lock_bh(&idev->lock);
+
+ /* Step 3: clear flags for stateless addrconf */
+ if (how != 1)
+ idev->if_flags &= ~(IF_RS_SENT|IF_RA_RCVD);
+
+ /* Step 4: clear address list */
+#ifdef CONFIG_IPV6_PRIVACY
+ if (how == 1 && del_timer(&idev->regen_timer))
+ in6_dev_put(idev);
+
+ /* clear tempaddr list */
+ while ((ifa = idev->tempaddr_list) != NULL) {
+ idev->tempaddr_list = ifa->tmp_next;
+ ifa->tmp_next = NULL;
+ ifa->dead = 1;
+ write_unlock_bh(&idev->lock);
+ spin_lock_bh(&ifa->lock);
+
+ if (ifa->ifpub) {
+ in6_ifa_put(ifa->ifpub);
+ ifa->ifpub = NULL;
+ }
+ spin_unlock_bh(&ifa->lock);
+ in6_ifa_put(ifa);
+ write_lock_bh(&idev->lock);
+ }
+#endif
+ while ((ifa = idev->addr_list) != NULL) {
+ idev->addr_list = ifa->if_next;
+ ifa->if_next = NULL;
+ ifa->dead = 1;
+ addrconf_del_timer(ifa);
+ write_unlock_bh(&idev->lock);
+
+ __ipv6_ifa_notify(RTM_DELADDR, ifa);
+ in6_ifa_put(ifa);
+
+ write_lock_bh(&idev->lock);
+ }
+ write_unlock_bh(&idev->lock);
+
+ /* Step 5: Discard multicast list */
+
+ if (how == 1)
+ ipv6_mc_destroy_dev(idev);
+ else
+ ipv6_mc_down(idev);
+
+ /* Step 6: netlink notification of this interface */
+ idev->tstamp = jiffies;
+ inet6_ifinfo_notify(RTM_NEWLINK, idev);
+
+ /* Shut down the device (if it is being unregistered) */
+
+ if (how == 1) {
+#ifdef CONFIG_SYSCTL
+ addrconf_sysctl_unregister(&idev->cnf);
+ neigh_sysctl_unregister(idev->nd_parms);
+#endif
+ neigh_parms_release(&nd_tbl, idev->nd_parms);
+ neigh_ifdown(&nd_tbl, dev);
+ in6_dev_put(idev);
+ }
+ return 0;
+}
+
+static void addrconf_rs_timer(unsigned long data)
+{
+ struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+
+ if (ifp->idev->cnf.forwarding)
+ goto out;
+
+ if (ifp->idev->if_flags & IF_RA_RCVD) {
+ /*
+ * Announcement received after solicitation
+ * was sent
+ */
+ goto out;
+ }
+
+ spin_lock(&ifp->lock);
+ if (ifp->probes++ < ifp->idev->cnf.rtr_solicits) {
+ struct in6_addr all_routers;
+
+ /* The wait after the last probe can be shorter */
+ addrconf_mod_timer(ifp, AC_RS,
+ (ifp->probes == ifp->idev->cnf.rtr_solicits) ?
+ ifp->idev->cnf.rtr_solicit_delay :
+ ifp->idev->cnf.rtr_solicit_interval);
+ spin_unlock(&ifp->lock);
+
+ ipv6_addr_all_routers(&all_routers);
+
+ ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers);
+ } else {
+ spin_unlock(&ifp->lock);
+ /*
+ * Note: we do not support deprecated "all on-link"
+ * assumption any longer.
+ */
+ printk(KERN_DEBUG "%s: no IPv6 routers present\n",
+ ifp->idev->dev->name);
+ }
+
+out:
+ in6_ifa_put(ifp);
+}
+
+/*
+ * Duplicate Address Detection
+ */
+static void addrconf_dad_start(struct inet6_ifaddr *ifp, int flags)
+{
+ struct inet6_dev *idev = ifp->idev;
+ struct net_device *dev = idev->dev;
+ unsigned long rand_num;
+
+ addrconf_join_solict(dev, &ifp->addr);
+
+ if (ifp->prefix_len != 128 && (ifp->flags&IFA_F_PERMANENT))
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len, dev, 0,
+ flags);
+
+ net_srandom(ifp->addr.s6_addr32[3]);
+ rand_num = net_random() % (idev->cnf.rtr_solicit_delay ? : 1);
+
+ read_lock_bh(&idev->lock);
+ if (ifp->dead)
+ goto out;
+ spin_lock_bh(&ifp->lock);
+
+ if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
+ !(ifp->flags&IFA_F_TENTATIVE)) {
+ ifp->flags &= ~IFA_F_TENTATIVE;
+ spin_unlock_bh(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+
+ addrconf_dad_completed(ifp);
+ return;
+ }
+
+ ifp->probes = idev->cnf.dad_transmits;
+ addrconf_mod_timer(ifp, AC_DAD, rand_num);
+
+ spin_unlock_bh(&ifp->lock);
+out:
+ read_unlock_bh(&idev->lock);
+}
+
+static void addrconf_dad_timer(unsigned long data)
+{
+ struct inet6_ifaddr *ifp = (struct inet6_ifaddr *) data;
+ struct inet6_dev *idev = ifp->idev;
+ struct in6_addr unspec;
+ struct in6_addr mcaddr;
+
+ read_lock_bh(&idev->lock);
+ if (idev->dead) {
+ read_unlock_bh(&idev->lock);
+ goto out;
+ }
+ spin_lock_bh(&ifp->lock);
+ if (ifp->probes == 0) {
+ /*
+ * DAD was successful
+ */
+
+ ifp->flags &= ~IFA_F_TENTATIVE;
+ spin_unlock_bh(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+
+ addrconf_dad_completed(ifp);
+
+ goto out;
+ }
+
+ ifp->probes--;
+ addrconf_mod_timer(ifp, AC_DAD, ifp->idev->nd_parms->retrans_time);
+ spin_unlock_bh(&ifp->lock);
+ read_unlock_bh(&idev->lock);
+
+ /* send a neighbour solicitation for our addr */
+ memset(&unspec, 0, sizeof(unspec));
+ addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
+ ndisc_send_ns(ifp->idev->dev, NULL, &ifp->addr, &mcaddr, &unspec);
+out:
+ in6_ifa_put(ifp);
+}
+
+static void addrconf_dad_completed(struct inet6_ifaddr *ifp)
+{
+ struct net_device * dev = ifp->idev->dev;
+
+ /*
+ * Configure the address for reception. Now it is valid.
+ */
+
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
+
+ /* If the added prefix is link-local and forwarding is off,
+ start sending router solicitations.
+ */
+
+ if (ifp->idev->cnf.forwarding == 0 &&
+ ifp->idev->cnf.rtr_solicits > 0 &&
+ (dev->flags&IFF_LOOPBACK) == 0 &&
+ (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) {
+ struct in6_addr all_routers;
+
+ ipv6_addr_all_routers(&all_routers);
+
+ /*
+ * If a host has already performed a random delay
+ * [...] as part of DAD [...]
there is no need + * to delay again before sending the first RS + */ + ndisc_send_rs(ifp->idev->dev, &ifp->addr, &all_routers); + + spin_lock_bh(&ifp->lock); + ifp->probes = 1; + ifp->idev->if_flags |= IF_RS_SENT; + addrconf_mod_timer(ifp, AC_RS, ifp->idev->cnf.rtr_solicit_interval); + spin_unlock_bh(&ifp->lock); + } +} + +#ifdef CONFIG_PROC_FS +struct if6_iter_state { + int bucket; +}; + +static struct inet6_ifaddr *if6_get_first(struct seq_file *seq) +{ + struct inet6_ifaddr *ifa = NULL; + struct if6_iter_state *state = seq->private; + + for (state->bucket = 0; state->bucket < IN6_ADDR_HSIZE; ++state->bucket) { + ifa = inet6_addr_lst[state->bucket]; + if (ifa) + break; + } + return ifa; +} + +static struct inet6_ifaddr *if6_get_next(struct seq_file *seq, struct inet6_ifaddr *ifa) +{ + struct if6_iter_state *state = seq->private; + + ifa = ifa->lst_next; +try_again: + if (!ifa && ++state->bucket < IN6_ADDR_HSIZE) { + ifa = inet6_addr_lst[state->bucket]; + goto try_again; + } + return ifa; +} + +static struct inet6_ifaddr *if6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct inet6_ifaddr *ifa = if6_get_first(seq); + + if (ifa) + while(pos && (ifa = if6_get_next(seq, ifa)) != NULL) + --pos; + return pos ? NULL : ifa; +} + +static void *if6_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&addrconf_hash_lock); + return if6_get_idx(seq, *pos); +} + +static void *if6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct inet6_ifaddr *ifa; + + ifa = if6_get_next(seq, v); + ++*pos; + return ifa; +} + +static void if6_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&addrconf_hash_lock); +} + +static int if6_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_ifaddr *ifp = (struct inet6_ifaddr *)v; + seq_printf(seq, + "%04x%04x%04x%04x%04x%04x%04x%04x %02x %02x %02x %02x %8s\n", + NIP6(ifp->addr), + ifp->idev->dev->ifindex, + ifp->prefix_len, + ifp->scope, + ifp->flags, + ifp->idev->dev->name); + return 0; +} + +static struct seq_operations if6_seq_ops = { + .start = if6_seq_start, + .next = if6_seq_next, + .show = if6_seq_show, + .stop = if6_seq_stop, +}; + +static int if6_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct if6_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + memset(s, 0, sizeof(*s)); + + rc = seq_open(file, &if6_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations if6_fops = { + .owner = THIS_MODULE, + .open = if6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init if6_proc_init(void) +{ + if (!proc_net_fops_create("if_inet6", S_IRUGO, &if6_fops)) + return -ENOMEM; + return 0; +} + +void if6_proc_exit(void) +{ + proc_net_remove("if_inet6"); +} +#endif /* CONFIG_PROC_FS */ + +/* + * Periodic address status verification + */ + +static void addrconf_verify(unsigned long foo) +{ + struct inet6_ifaddr *ifp; + unsigned long now, next; + int i; + + spin_lock_bh(&addrconf_verify_lock); + now = jiffies; + next = now + ADDR_CHECK_FREQUENCY; + + del_timer(&addr_chk_timer); + + for (i=0; i < IN6_ADDR_HSIZE; i++) { + +restart: + write_lock(&addrconf_hash_lock); + for (ifp=inet6_addr_lst[i]; ifp; ifp=ifp->lst_next) { + unsigned long age; +#ifdef CONFIG_IPV6_PRIVACY + unsigned long regen_advance; +#endif + + if (ifp->flags & IFA_F_PERMANENT) + continue; + + spin_lock(&ifp->lock); + age = (now 
- ifp->tstamp) / HZ;
+
+#ifdef CONFIG_IPV6_PRIVACY
+ regen_advance = ifp->idev->cnf.regen_max_retry *
+ ifp->idev->cnf.dad_transmits *
+ ifp->idev->nd_parms->retrans_time / HZ;
+#endif
+
+ if (age >= ifp->valid_lft) {
+ spin_unlock(&ifp->lock);
+ in6_ifa_hold(ifp);
+ write_unlock(&addrconf_hash_lock);
+ ipv6_del_addr(ifp);
+ goto restart;
+ } else if (age >= ifp->prefered_lft) {
+ /* jiffies - ifp->tstamp > age >= ifp->prefered_lft */
+ int deprecate = 0;
+
+ if (!(ifp->flags&IFA_F_DEPRECATED)) {
+ deprecate = 1;
+ ifp->flags |= IFA_F_DEPRECATED;
+ }
+
+ if (time_before(ifp->tstamp + ifp->valid_lft * HZ, next))
+ next = ifp->tstamp + ifp->valid_lft * HZ;
+
+ spin_unlock(&ifp->lock);
+
+ if (deprecate) {
+ in6_ifa_hold(ifp);
+ write_unlock(&addrconf_hash_lock);
+
+ ipv6_ifa_notify(0, ifp);
+ in6_ifa_put(ifp);
+ goto restart;
+ }
+#ifdef CONFIG_IPV6_PRIVACY
+ } else if ((ifp->flags&IFA_F_TEMPORARY) &&
+ !(ifp->flags&IFA_F_TENTATIVE)) {
+ if (age >= ifp->prefered_lft - regen_advance) {
+ struct inet6_ifaddr *ifpub = ifp->ifpub;
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+ if (!ifp->regen_count && ifpub) {
+ ifp->regen_count++;
+ in6_ifa_hold(ifp);
+ in6_ifa_hold(ifpub);
+ spin_unlock(&ifp->lock);
+ write_unlock(&addrconf_hash_lock);
+ ipv6_create_tempaddr(ifpub, ifp);
+ in6_ifa_put(ifpub);
+ in6_ifa_put(ifp);
+ goto restart;
+ }
+ } else if (time_before(ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ - regen_advance * HZ;
+ spin_unlock(&ifp->lock);
+#endif
+ } else {
+ /* ifp->prefered_lft <= ifp->valid_lft */
+ if (time_before(ifp->tstamp + ifp->prefered_lft * HZ, next))
+ next = ifp->tstamp + ifp->prefered_lft * HZ;
+ spin_unlock(&ifp->lock);
+ }
+ }
+ write_unlock(&addrconf_hash_lock);
+ }
+
+ addr_chk_timer.expires = time_before(next, jiffies + HZ) ?
jiffies + HZ : next; + add_timer(&addr_chk_timer); + spin_unlock_bh(&addrconf_verify_lock); +} + +static int +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *pfx; + + pfx = NULL; + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*pfx)) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_LOCAL-1]); + } + if (pfx == NULL) + return -EINVAL; + + return inet6_addr_del(ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifaddrmsg *ifm = NLMSG_DATA(nlh); + struct in6_addr *pfx; + + pfx = NULL; + if (rta[IFA_ADDRESS-1]) { + if (RTA_PAYLOAD(rta[IFA_ADDRESS-1]) < sizeof(*pfx)) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_ADDRESS-1]); + } + if (rta[IFA_LOCAL-1]) { + if (pfx && memcmp(pfx, RTA_DATA(rta[IFA_LOCAL-1]), sizeof(*pfx))) + return -EINVAL; + pfx = RTA_DATA(rta[IFA_LOCAL-1]); + } + if (pfx == NULL) + return -EINVAL; + + return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); +} + +static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = ifa->prefix_len; + ifm->ifa_flags = ifa->flags; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ifa->scope&IFA_HOST) + ifm->ifa_scope = RT_SCOPE_HOST; + else if (ifa->scope&IFA_LINK) + ifm->ifa_scope = RT_SCOPE_LINK; + else if (ifa->scope&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifa->idev->dev->ifindex; + RTA_PUT(skb, IFA_ADDRESS, 16, &ifa->addr); + if (!(ifa->flags&IFA_F_PERMANENT)) { + ci.ifa_prefered = ifa->prefered_lft; + ci.ifa_valid = ifa->valid_lft; + if (ci.ifa_prefered != INFINITY_LIFE_TIME) { + long tval = (jiffies - ifa->tstamp)/HZ; + ci.ifa_prefered -= tval; + if (ci.ifa_valid != INFINITY_LIFE_TIME) + ci.ifa_valid -= tval; + } + } else { + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + } + ci.cstamp = (__u32)(TIME_DELTA(ifa->cstamp, INITIAL_JIFFIES) / HZ * 100 + + TIME_DELTA(ifa->cstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.tstamp = (__u32)(TIME_DELTA(ifa->tstamp, INITIAL_JIFFIES) / HZ * 100 + + TIME_DELTA(ifa->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_fill_ifmcaddr(struct sk_buff *skb, struct ifmcaddr6 *ifmca, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = 128; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ipv6_addr_scope(&ifmca->mca_addr)&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifmca->idev->dev->ifindex; + RTA_PUT(skb, 
IFA_MULTICAST, 16, &ifmca->mca_addr); + ci.cstamp = (__u32)(TIME_DELTA(ifmca->mca_cstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifmca->mca_cstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.tstamp = (__u32)(TIME_DELTA(ifmca->mca_tstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifmca->mca_tstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_fill_ifacaddr(struct sk_buff *skb, struct ifacaddr6 *ifaca, + u32 pid, u32 seq, int event) +{ + struct ifaddrmsg *ifm; + struct nlmsghdr *nlh; + struct ifa_cacheinfo ci; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*ifm)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + ifm = NLMSG_DATA(nlh); + ifm->ifa_family = AF_INET6; + ifm->ifa_prefixlen = 128; + ifm->ifa_flags = IFA_F_PERMANENT; + ifm->ifa_scope = RT_SCOPE_UNIVERSE; + if (ipv6_addr_scope(&ifaca->aca_addr)&IFA_SITE) + ifm->ifa_scope = RT_SCOPE_SITE; + ifm->ifa_index = ifaca->aca_idev->dev->ifindex; + RTA_PUT(skb, IFA_ANYCAST, 16, &ifaca->aca_addr); + ci.cstamp = (__u32)(TIME_DELTA(ifaca->aca_cstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifaca->aca_cstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.tstamp = (__u32)(TIME_DELTA(ifaca->aca_tstamp, INITIAL_JIFFIES) / HZ + * 100 + TIME_DELTA(ifaca->aca_tstamp, INITIAL_JIFFIES) % HZ + * 100 / HZ); + ci.ifa_prefered = INFINITY_LIFE_TIME; + ci.ifa_valid = INFINITY_LIFE_TIME; + RTA_PUT(skb, IFA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +enum addr_type_t +{ + UNICAST_ADDR, + MULTICAST_ADDR, + ANYCAST_ADDR, +}; + +static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, + enum addr_type_t type) +{ + int idx, ip_idx; + int s_idx, s_ip_idx; + int err = 1; + struct net_device *dev; + struct inet6_dev *idev = NULL; + struct inet6_ifaddr *ifa; + struct ifmcaddr6 *ifmca; + struct ifacaddr6 *ifaca; + + s_idx = cb->args[0]; + s_ip_idx = ip_idx = cb->args[1]; + read_lock(&dev_base_lock); + + for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_ip_idx = 0; + ip_idx = 0; + if ((idev = in6_dev_get(dev)) == NULL) + continue; + read_lock_bh(&idev->lock); + switch (type) { + case UNICAST_ADDR: + /* unicast address */ + for (ifa = idev->addr_list; ifa; + ifa = ifa->if_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) + goto done; + } + /* temp addr */ +#ifdef CONFIG_IPV6_PRIVACY + for (ifa = idev->tempaddr_list; ifa; + ifa = ifa->tmp_next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifaddr(skb, ifa, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWADDR)) <= 0) + goto done; + } +#endif + break; + case MULTICAST_ADDR: + /* multicast address */ + for (ifmca = idev->mc_list; ifmca; + ifmca = ifmca->next, ip_idx++) { + if (ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifmcaddr(skb, ifmca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_GETMULTICAST)) <= 0) + goto done; + } + break; + case ANYCAST_ADDR: + /* anycast address */ + for (ifaca = idev->ac_list; ifaca; + ifaca = ifaca->aca_next, ip_idx++) { + if 
(ip_idx < s_ip_idx) + continue; + if ((err = inet6_fill_ifacaddr(skb, ifaca, + NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_GETANYCAST)) <= 0) + goto done; + } + break; + default: + break; + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } +done: + if (err <= 0) { + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + read_unlock(&dev_base_lock); + cb->args[0] = idx; + cb->args[1] = ip_idx; + return skb->len; +} + +static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = UNICAST_ADDR; + return inet6_dump_addr(skb, cb, type); +} + +static int inet6_dump_ifmcaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = MULTICAST_ADDR; + return inet6_dump_addr(skb, cb, type); +} + + +static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) +{ + enum addr_type_t type = ANYCAST_ADDR; + return inet6_dump_addr(skb, cb, type); +} + +static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, ENOBUFS); + return; + } + if (inet6_fill_ifaddr(skb, ifa, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFADDR, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFADDR; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFADDR, GFP_ATOMIC); +} + +static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, + __s32 *array, int bytes) +{ + memset(array, 0, bytes); + array[DEVCONF_FORWARDING] = cnf->forwarding; + array[DEVCONF_HOPLIMIT] = cnf->hop_limit; + array[DEVCONF_MTU6] = cnf->mtu6; + array[DEVCONF_ACCEPT_RA] = cnf->accept_ra; + array[DEVCONF_ACCEPT_REDIRECTS] = cnf->accept_redirects; + array[DEVCONF_AUTOCONF] = cnf->autoconf; + array[DEVCONF_DAD_TRANSMITS] = cnf->dad_transmits; + array[DEVCONF_RTR_SOLICITS] = cnf->rtr_solicits; + array[DEVCONF_RTR_SOLICIT_INTERVAL] = cnf->rtr_solicit_interval; + array[DEVCONF_RTR_SOLICIT_DELAY] = cnf->rtr_solicit_delay; + array[DEVCONF_FORCE_MLD_VERSION] = cnf->force_mld_version; +#ifdef CONFIG_IPV6_PRIVACY + array[DEVCONF_USE_TEMPADDR] = cnf->use_tempaddr; + array[DEVCONF_TEMP_VALID_LFT] = cnf->temp_valid_lft; + array[DEVCONF_TEMP_PREFERED_LFT] = cnf->temp_prefered_lft; + array[DEVCONF_REGEN_MAX_RETRY] = cnf->regen_max_retry; + array[DEVCONF_MAX_DESYNC_FACTOR] = cnf->max_desync_factor; +#endif + array[DEVCONF_MAX_ADDRESSES] = cnf->max_addresses; +} + +static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, + u32 pid, u32 seq, int event) +{ + struct net_device *dev = idev->dev; + __s32 *array = NULL; + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rtattr *subattr; + __u32 mtu = dev->mtu; + struct ifla_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*r)); + if (pid) nlh->nlmsg_flags |= NLM_F_MULTI; + r = NLMSG_DATA(nlh); + r->ifi_family = AF_INET6; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = 0; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + + if (dev->addr_len) + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + + subattr = (struct rtattr*)skb->tail; + + RTA_PUT(skb, IFLA_PROTINFO, 0, NULL); + + /* return the device flags */ + 
RTA_PUT(skb, IFLA_INET6_FLAGS, sizeof(__u32), &idev->if_flags); + + /* return interface cacheinfo */ + ci.max_reasm_len = IPV6_MAXPLEN; + ci.tstamp = (__u32)(TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) / HZ * 100 + + TIME_DELTA(idev->tstamp, INITIAL_JIFFIES) % HZ * 100 / HZ); + ci.reachable_time = idev->nd_parms->reachable_time; + ci.retrans_time = idev->nd_parms->retrans_time; + RTA_PUT(skb, IFLA_INET6_CACHEINFO, sizeof(ci), &ci); + + /* return the device sysctl params */ + if ((array = kmalloc(DEVCONF_MAX * sizeof(*array), GFP_ATOMIC)) == NULL) + goto rtattr_failure; + ipv6_store_devconf(&idev->cnf, array, DEVCONF_MAX * sizeof(*array)); + RTA_PUT(skb, IFLA_INET6_CONF, DEVCONF_MAX * sizeof(*array), array); + + /* XXX - Statistics/MC not implemented */ + subattr->rta_len = skb->tail - (u8*)subattr; + + nlh->nlmsg_len = skb->tail - b; + kfree(array); + return skb->len; + +nlmsg_failure: +rtattr_failure: + if (array) + kfree(array); + skb_trim(skb, b - skb->data); + return -1; +} + +static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, err; + int s_idx = cb->args[0]; + struct net_device *dev; + struct inet6_dev *idev; + + read_lock(&dev_base_lock); + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if ((idev = in6_dev_get(dev)) == NULL) + continue; + err = inet6_fill_ifinfo(skb, idev, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWLINK); + in6_dev_put(idev); + if (err <= 0) + break; + } + read_unlock(&dev_base_lock); + cb->args[0] = idx; + + return skb->len; +} + +void inet6_ifinfo_notify(int event, struct inet6_dev *idev) +{ + struct sk_buff *skb; + /* 128 bytes ?? */ + int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, ENOBUFS); + return; + } + if (inet6_fill_ifinfo(skb, idev, 0, 0, event) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_IFINFO, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_IFINFO; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_IFINFO, GFP_ATOMIC); +} + +static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, + struct prefix_info *pinfo, u32 pid, u32 seq, int event) +{ + struct prefixmsg *pmsg; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct prefix_cacheinfo ci; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*pmsg)); + + if (pid) + nlh->nlmsg_flags |= NLM_F_MULTI; + + pmsg = NLMSG_DATA(nlh); + pmsg->prefix_family = AF_INET6; + pmsg->prefix_ifindex = idev->dev->ifindex; + pmsg->prefix_len = pinfo->prefix_len; + pmsg->prefix_type = pinfo->type; + + pmsg->prefix_flags = 0; + if (pinfo->onlink) + pmsg->prefix_flags |= IF_PREFIX_ONLINK; + if (pinfo->autoconf) + pmsg->prefix_flags |= IF_PREFIX_AUTOCONF; + + RTA_PUT(skb, PREFIX_ADDRESS, sizeof(pinfo->prefix), &pinfo->prefix); + + ci.preferred_time = ntohl(pinfo->prefered); + ci.valid_time = ntohl(pinfo->valid); + RTA_PUT(skb, PREFIX_CACHEINFO, sizeof(ci), &ci); + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static void inet6_prefix_notify(int event, struct inet6_dev *idev, + struct prefix_info *pinfo) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, ENOBUFS); + return; + } + if (inet6_fill_prefix(skb, idev, pinfo, 0, 0, event) < 0) { + kfree_skb(skb); + 
netlink_set_err(rtnl, 0, RTMGRP_IPV6_PREFIX, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_PREFIX; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_PREFIX, GFP_ATOMIC); +} + +static struct rtnetlink_link inet6_rtnetlink_table[RTM_MAX - RTM_BASE + 1] = { + [RTM_GETLINK - RTM_BASE] = { .dumpit = inet6_dump_ifinfo, }, + [RTM_NEWADDR - RTM_BASE] = { .doit = inet6_rtm_newaddr, }, + [RTM_DELADDR - RTM_BASE] = { .doit = inet6_rtm_deladdr, }, + [RTM_GETADDR - RTM_BASE] = { .dumpit = inet6_dump_ifaddr, }, + [RTM_GETMULTICAST - RTM_BASE] = { .dumpit = inet6_dump_ifmcaddr, }, + [RTM_GETANYCAST - RTM_BASE] = { .dumpit = inet6_dump_ifacaddr, }, + [RTM_NEWROUTE - RTM_BASE] = { .doit = inet6_rtm_newroute, }, + [RTM_DELROUTE - RTM_BASE] = { .doit = inet6_rtm_delroute, }, + [RTM_GETROUTE - RTM_BASE] = { .doit = inet6_rtm_getroute, + .dumpit = inet6_dump_fib, }, +}; + +static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + inet6_ifa_notify(event ? : RTM_NEWADDR, ifp); + + switch (event) { + case RTM_NEWADDR: + dst_hold(&ifp->rt->u.dst); + if (ip6_ins_rt(ifp->rt, NULL, NULL)) + dst_release(&ifp->rt->u.dst); + if (ifp->idev->cnf.forwarding) + addrconf_join_anycast(ifp); + break; + case RTM_DELADDR: + if (ifp->idev->cnf.forwarding) + addrconf_leave_anycast(ifp); + addrconf_leave_solict(ifp->idev, &ifp->addr); + dst_hold(&ifp->rt->u.dst); + if (ip6_del_rt(ifp->rt, NULL, NULL)) + dst_free(&ifp->rt->u.dst); + else + dst_release(&ifp->rt->u.dst); + break; + } +} + +static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) +{ + read_lock_bh(&addrconf_lock); + if (likely(ifp->idev->dead == 0)) + __ipv6_ifa_notify(event, ifp); + read_unlock_bh(&addrconf_lock); +} + +#ifdef CONFIG_SYSCTL + +static +int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + int ret; + + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + + if (write && valp != &ipv6_devconf_dflt.forwarding) { + if (valp != &ipv6_devconf.forwarding) { + if ((!*valp) ^ (!val)) { + struct inet6_dev *idev = (struct inet6_dev *)ctl->extra1; + if (idev == NULL) + return ret; + dev_forward_change(idev); + } + } else { + ipv6_devconf_dflt.forwarding = ipv6_devconf.forwarding; + addrconf_forward_change(); + } + if (*valp) + rt6_purge_dflt_routers(); + } + + return ret; +} + +static int addrconf_sysctl_forward_strategy(ctl_table *table, + int __user *name, int nlen, + void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + int *valp = table->data; + int new; + + if (!newval || !newlen) + return 0; + if (newlen != sizeof(int)) + return -EINVAL; + if (get_user(new, (int __user *)newval)) + return -EFAULT; + if (new == *valp) + return 0; + if (oldval && oldlenp) { + size_t len; + if (get_user(len, oldlenp)) + return -EFAULT; + if (len) { + if (len > table->maxlen) + len = table->maxlen; + if (copy_to_user(oldval, valp, len)) + return -EFAULT; + if (put_user(len, oldlenp)) + return -EFAULT; + } + } + + if (valp != &ipv6_devconf_dflt.forwarding) { + if (valp != &ipv6_devconf.forwarding) { + struct inet6_dev *idev = (struct inet6_dev *)table->extra1; + int changed; + if (unlikely(idev == NULL)) + return -ENODEV; + changed = (!*valp) ^ (!new); + *valp = new; + if (changed) + dev_forward_change(idev); + } else { + *valp = new; + addrconf_forward_change(); + } + + if (*valp) + rt6_purge_dflt_routers(); + } else + *valp = new; + + return 1; +} + +static 
struct addrconf_sysctl_table +{ + struct ctl_table_header *sysctl_header; + ctl_table addrconf_vars[__NET_IPV6_MAX]; + ctl_table addrconf_dev[2]; + ctl_table addrconf_conf_dir[2]; + ctl_table addrconf_proto_dir[2]; + ctl_table addrconf_root_dir[2]; +} addrconf_sysctl = { + .sysctl_header = NULL, + .addrconf_vars = { + { + .ctl_name = NET_IPV6_FORWARDING, + .procname = "forwarding", + .data = &ipv6_devconf.forwarding, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &addrconf_sysctl_forward, + .strategy = &addrconf_sysctl_forward_strategy, + }, + { + .ctl_name = NET_IPV6_HOP_LIMIT, + .procname = "hop_limit", + .data = &ipv6_devconf.hop_limit, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .ctl_name = NET_IPV6_MTU, + .procname = "mtu", + .data = &ipv6_devconf.mtu6, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ACCEPT_RA, + .procname = "accept_ra", + .data = &ipv6_devconf.accept_ra, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ACCEPT_REDIRECTS, + .procname = "accept_redirects", + .data = &ipv6_devconf.accept_redirects, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_AUTOCONF, + .procname = "autoconf", + .data = &ipv6_devconf.autoconf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_DAD_TRANSMITS, + .procname = "dad_transmits", + .data = &ipv6_devconf.dad_transmits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_RTR_SOLICITS, + .procname = "router_solicitations", + .data = &ipv6_devconf.rtr_solicits, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_RTR_SOLICIT_INTERVAL, + .procname = "router_solicitation_interval", + .data = &ipv6_devconf.rtr_solicit_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_RTR_SOLICIT_DELAY, + .procname = "router_solicitation_delay", + .data = &ipv6_devconf.rtr_solicit_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_FORCE_MLD_VERSION, + .procname = "force_mld_version", + .data = &ipv6_devconf.force_mld_version, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#ifdef CONFIG_IPV6_PRIVACY + { + .ctl_name = NET_IPV6_USE_TEMPADDR, + .procname = "use_tempaddr", + .data = &ipv6_devconf.use_tempaddr, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_TEMP_VALID_LFT, + .procname = "temp_valid_lft", + .data = &ipv6_devconf.temp_valid_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_TEMP_PREFERED_LFT, + .procname = "temp_prefered_lft", + .data = &ipv6_devconf.temp_prefered_lft, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_REGEN_MAX_RETRY, + .procname = "regen_max_retry", + .data = &ipv6_devconf.regen_max_retry, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_MAX_DESYNC_FACTOR, + .procname = "max_desync_factor", + .data = &ipv6_devconf.max_desync_factor, + .maxlen = sizeof(int), + .mode = 0644, + 
.proc_handler = &proc_dointvec, + }, +#endif + { + .ctl_name = NET_IPV6_MAX_ADDRESSES, + .procname = "max_addresses", + .data = &ipv6_devconf.max_addresses, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_dev = { + { + .ctl_name = NET_PROTO_CONF_ALL, + .procname = "all", + .mode = 0555, + .child = addrconf_sysctl.addrconf_vars, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_conf_dir = { + { + .ctl_name = NET_IPV6_CONF, + .procname = "conf", + .mode = 0555, + .child = addrconf_sysctl.addrconf_dev, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_proto_dir = { + { + .ctl_name = NET_IPV6, + .procname = "ipv6", + .mode = 0555, + .child = addrconf_sysctl.addrconf_conf_dir, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, + .addrconf_root_dir = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = addrconf_sysctl.addrconf_proto_dir, + }, + { + .ctl_name = 0, /* sentinel */ + } + }, +}; + +static void addrconf_sysctl_register(struct inet6_dev *idev, struct ipv6_devconf *p) +{ + int i; + struct net_device *dev = idev ? idev->dev : NULL; + struct addrconf_sysctl_table *t; + char *dev_name = NULL; + + t = kmalloc(sizeof(*t), GFP_KERNEL); + if (t == NULL) + return; + memcpy(t, &addrconf_sysctl, sizeof(*t)); + for (i=0; t->addrconf_vars[i].data; i++) { + t->addrconf_vars[i].data += (char*)p - (char*)&ipv6_devconf; + t->addrconf_vars[i].de = NULL; + t->addrconf_vars[i].extra1 = idev; /* embedded; no ref */ + } + if (dev) { + dev_name = dev->name; + t->addrconf_dev[0].ctl_name = dev->ifindex; + } else { + dev_name = "default"; + t->addrconf_dev[0].ctl_name = NET_PROTO_CONF_DEFAULT; + } + + /* + * Make a copy of dev_name, because '.procname' is regarded as const + * by sysctl and we wouldn't want anyone to change it under our feet + * (see SIOCSIFNAME). + */ + dev_name = net_sysctl_strdup(dev_name); + if (!dev_name) + goto free; + + t->addrconf_dev[0].procname = dev_name; + + t->addrconf_dev[0].child = t->addrconf_vars; + t->addrconf_dev[0].de = NULL; + t->addrconf_conf_dir[0].child = t->addrconf_dev; + t->addrconf_conf_dir[0].de = NULL; + t->addrconf_proto_dir[0].child = t->addrconf_conf_dir; + t->addrconf_proto_dir[0].de = NULL; + t->addrconf_root_dir[0].child = t->addrconf_proto_dir; + t->addrconf_root_dir[0].de = NULL; + + t->sysctl_header = register_sysctl_table(t->addrconf_root_dir, 0); + if (t->sysctl_header == NULL) + goto free_procname; + else + p->sysctl = t; + return; + + /* error path */ + free_procname: + kfree(dev_name); + free: + kfree(t); + + return; +} + +static void addrconf_sysctl_unregister(struct ipv6_devconf *p) +{ + if (p->sysctl) { + struct addrconf_sysctl_table *t = p->sysctl; + p->sysctl = NULL; + unregister_sysctl_table(t->sysctl_header); + kfree(t->addrconf_dev[0].procname); + kfree(t); + } +} + + +#endif + +/* + * Device notifier + */ + +int register_inet6addr_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&inet6addr_chain, nb); +} + +int unregister_inet6addr_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&inet6addr_chain,nb); +} + +/* + * Init / cleanup code + */ + +int __init addrconf_init(void) +{ + int err = 0; + + /* The addrconf netdev notifier requires that loopback_dev + * has it's ipv6 private information allocated and setup + * before it can bring up and give link-local addresses + * to other devices which are up. 
+ * + * Unfortunately, loopback_dev is not necessarily the first + * entry in the global dev_base list of net devices. In fact, + * it is likely to be the very last entry on that list. + * So this causes the notifier registry below to try and + * give link-local addresses to all devices besides loopback_dev + * first, then loopback_dev, which cases all the non-loopback_dev + * devices to fail to get a link-local address. + * + * So, as a temporary fix, allocate the ipv6 structure for + * loopback_dev first by hand. + * Longer term, all of the dependencies ipv6 has upon the loopback + * device and it being up should be removed. + */ + rtnl_lock(); + if (!ipv6_add_dev(&loopback_dev)) + err = -ENOMEM; + rtnl_unlock(); + if (err) + return err; + + register_netdevice_notifier(&ipv6_dev_notf); + +#ifdef CONFIG_IPV6_PRIVACY + md5_tfm = crypto_alloc_tfm("md5", 0); + if (unlikely(md5_tfm == NULL)) + printk(KERN_WARNING + "failed to load transform for md5\n"); +#endif + + addrconf_verify(0); + rtnetlink_links[PF_INET6] = inet6_rtnetlink_table; +#ifdef CONFIG_SYSCTL + addrconf_sysctl.sysctl_header = + register_sysctl_table(addrconf_sysctl.addrconf_root_dir, 0); + addrconf_sysctl_register(NULL, &ipv6_devconf_dflt); +#endif + + return 0; +} + +void __exit addrconf_cleanup(void) +{ + struct net_device *dev; + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int i; + + unregister_netdevice_notifier(&ipv6_dev_notf); + + rtnetlink_links[PF_INET6] = NULL; +#ifdef CONFIG_SYSCTL + addrconf_sysctl_unregister(&ipv6_devconf_dflt); + addrconf_sysctl_unregister(&ipv6_devconf); +#endif + + rtnl_lock(); + + /* + * clean dev list. + */ + + for (dev=dev_base; dev; dev=dev->next) { + if ((idev = __in6_dev_get(dev)) == NULL) + continue; + addrconf_ifdown(dev, 1); + } + addrconf_ifdown(&loopback_dev, 2); + + /* + * Check hash table. + */ + + write_lock_bh(&addrconf_hash_lock); + for (i=0; i < IN6_ADDR_HSIZE; i++) { + for (ifa=inet6_addr_lst[i]; ifa; ) { + struct inet6_ifaddr *bifa; + + bifa = ifa; + ifa = ifa->lst_next; + printk(KERN_DEBUG "bug: IPv6 address leakage detected: ifa=%p\n", bifa); + /* Do not free it; something is wrong. + Now we can investigate it with debugger. + */ + } + } + write_unlock_bh(&addrconf_hash_lock); + + del_timer(&addr_chk_timer); + + rtnl_unlock(); + +#ifdef CONFIG_IPV6_PRIVACY + if (likely(md5_tfm != NULL)) { + crypto_free_tfm(md5_tfm); + md5_tfm = NULL; + } +#endif + +#ifdef CONFIG_PROC_FS + proc_net_remove("if_inet6"); +#endif +} diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c new file mode 100644 index 000000000000..768b11703daf --- /dev/null +++ b/net/ipv6/af_inet6.c @@ -0,0 +1,867 @@ +/* + * PF_INET6 socket protocol family + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Adapted from linux/net/ipv4/af_inet.c + * + * $Id: af_inet6.c,v 1.66 2002/02/01 22:01:04 davem Exp $ + * + * Fixes: + * piggy, Karl Knutson : Socket protocol table + * Hideaki YOSHIFUJI : sin6_scope_id support + * Arnaldo Melo : check proc_net_create return, cleanups + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_IPV6_TUNNEL +#include +#endif + +#include +#include + +MODULE_AUTHOR("Cast of dozens"); +MODULE_DESCRIPTION("IPv6 protocol stack for Linux"); +MODULE_LICENSE("GPL"); + +/* IPv6 procfs goodies... */ + +#ifdef CONFIG_PROC_FS +extern int raw6_proc_init(void); +extern void raw6_proc_exit(void); +extern int tcp6_proc_init(void); +extern void tcp6_proc_exit(void); +extern int udp6_proc_init(void); +extern void udp6_proc_exit(void); +extern int ipv6_misc_proc_init(void); +extern void ipv6_misc_proc_exit(void); +extern int ac6_proc_init(void); +extern void ac6_proc_exit(void); +extern int if6_proc_init(void); +extern void if6_proc_exit(void); +#endif + +int sysctl_ipv6_bindv6only; + +#ifdef INET_REFCNT_DEBUG +atomic_t inet6_sock_nr; +#endif + +/* The inetsw table contains everything that inet_create needs to + * build a new socket. + */ +static struct list_head inetsw6[SOCK_MAX]; +static DEFINE_SPINLOCK(inetsw6_lock); + +static void inet6_sock_destruct(struct sock *sk) +{ + inet_sock_destruct(sk); + +#ifdef INET_REFCNT_DEBUG + atomic_dec(&inet6_sock_nr); +#endif +} + +static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) +{ + const int offset = sk->sk_prot->obj_size - sizeof(struct ipv6_pinfo); + + return (struct ipv6_pinfo *)(((u8 *)sk) + offset); +} + +static int inet6_create(struct socket *sock, int protocol) +{ + struct inet_sock *inet; + struct ipv6_pinfo *np; + struct sock *sk; + struct list_head *p; + struct inet_protosw *answer; + struct proto *answer_prot; + unsigned char answer_flags; + char answer_no_check; + int rc; + + /* Look for the requested type/protocol pair. */ + answer = NULL; + rcu_read_lock(); + list_for_each_rcu(p, &inetsw6[sock->type]) { + answer = list_entry(p, struct inet_protosw, list); + + /* Check the non-wild match. */ + if (protocol == answer->protocol) { + if (protocol != IPPROTO_IP) + break; + } else { + /* Check for the two wild cases. 
*/ + if (IPPROTO_IP == protocol) { + protocol = answer->protocol; + break; + } + if (IPPROTO_IP == answer->protocol) + break; + } + answer = NULL; + } + + rc = -ESOCKTNOSUPPORT; + if (!answer) + goto out_rcu_unlock; + rc = -EPERM; + if (answer->capability > 0 && !capable(answer->capability)) + goto out_rcu_unlock; + rc = -EPROTONOSUPPORT; + if (!protocol) + goto out_rcu_unlock; + + sock->ops = answer->ops; + + answer_prot = answer->prot; + answer_no_check = answer->no_check; + answer_flags = answer->flags; + rcu_read_unlock(); + + BUG_TRAP(answer_prot->slab != NULL); + + rc = -ENOBUFS; + sk = sk_alloc(PF_INET6, GFP_KERNEL, answer_prot, 1); + if (sk == NULL) + goto out; + + sock_init_data(sock, sk); + + rc = 0; + sk->sk_no_check = answer_no_check; + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = 1; + + inet = inet_sk(sk); + + if (SOCK_RAW == sock->type) { + inet->num = protocol; + if (IPPROTO_RAW == protocol) + inet->hdrincl = 1; + } + + sk->sk_destruct = inet6_sock_destruct; + sk->sk_family = PF_INET6; + sk->sk_protocol = protocol; + + sk->sk_backlog_rcv = answer->prot->backlog_rcv; + + inet_sk(sk)->pinet6 = np = inet6_sk_generic(sk); + np->hop_limit = -1; + np->mcast_hops = -1; + np->mc_loop = 1; + np->pmtudisc = IPV6_PMTUDISC_WANT; + np->ipv6only = sysctl_ipv6_bindv6only; + + /* Init the ipv4 part of the socket since we can have sockets + * using v6 API for ipv4. + */ + inet->uc_ttl = -1; + + inet->mc_loop = 1; + inet->mc_ttl = 1; + inet->mc_index = 0; + inet->mc_list = NULL; + + if (ipv4_config.no_pmtu_disc) + inet->pmtudisc = IP_PMTUDISC_DONT; + else + inet->pmtudisc = IP_PMTUDISC_WANT; + + +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); + atomic_inc(&inet_sock_nr); +#endif + if (inet->num) { + /* It assumes that any protocol which allows + * the user to assign a number at socket + * creation time automatically shares. + */ + inet->sport = ntohs(inet->num); + sk->sk_prot->hash(sk); + } + if (sk->sk_prot->init) { + rc = sk->sk_prot->init(sk); + if (rc) { + sk_common_release(sk); + goto out; + } + } +out: + return rc; +out_rcu_unlock: + rcu_read_unlock(); + goto out; +} + + +/* bind for INET6 API */ +int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *addr=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + __u32 v4addr = 0; + unsigned short snum; + int addr_type = 0; + int err = 0; + + /* If the socket has its own bind function then use it. */ + if (sk->sk_prot->bind) + return sk->sk_prot->bind(sk, uaddr, addr_len); + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); + if ((addr_type & IPV6_ADDR_MULTICAST) && sock->type == SOCK_STREAM) + return -EINVAL; + + snum = ntohs(addr->sin6_port); + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + lock_sock(sk); + + /* Check these errors (active socket, double bind). */ + if (sk->sk_state != TCP_CLOSE || inet->num) { + err = -EINVAL; + goto out; + } + + /* Check if the address belongs to the host. 
*/ + if (addr_type == IPV6_ADDR_MAPPED) { + v4addr = addr->sin6_addr.s6_addr32[3]; + if (inet_addr_type(v4addr) != RTN_LOCAL) { + err = -EADDRNOTAVAIL; + goto out; + } + } else { + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another one + * is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out; + } + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out; + } + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. + */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) { + if (dev) + dev_put(dev); + err = -EADDRNOTAVAIL; + goto out; + } + } + if (dev) + dev_put(dev); + } + } + + inet->rcv_saddr = v4addr; + inet->saddr = v4addr; + + ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + + if (!(addr_type & IPV6_ADDR_MULTICAST)) + ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + + /* Make sure we are allowed to bind here. */ + if (sk->sk_prot->get_port(sk, snum)) { + inet_reset_saddr(sk); + err = -EADDRINUSE; + goto out; + } + + if (addr_type != IPV6_ADDR_ANY) + sk->sk_userlocks |= SOCK_BINDADDR_LOCK; + if (snum) + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + inet->sport = ntohs(inet->num); + inet->dport = 0; + inet->daddr = 0; +out: + release_sock(sk); + return err; +} + +int inet6_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (sk == NULL) + return -EINVAL; + + /* Free mc lists */ + ipv6_sock_mc_close(sk); + + /* Free ac lists */ + ipv6_sock_ac_close(sk); + + return inet_release(sock); +} + +int inet6_destroy_sock(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt; + + /* + * Release destination entry + */ + + sk_dst_reset(sk); + + /* Release rx options */ + + if ((skb = xchg(&np->pktoptions, NULL)) != NULL) + kfree_skb(skb); + + /* Free flowlabels */ + fl6_free_socklist(sk); + + /* Free tx options */ + + if ((opt = xchg(&np->opt, NULL)) != NULL) + sock_kfree_s(sk, opt, opt->tot_len); + + return 0; +} + +/* + * This does both peername and sockname. 
+ */ + +int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_in6 *sin=(struct sockaddr_in6 *)uaddr; + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (peer) { + if (!inet->dport) + return -ENOTCONN; + if (((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) && + peer == 1) + return -ENOTCONN; + sin->sin6_port = inet->dport; + ipv6_addr_copy(&sin->sin6_addr, &np->daddr); + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; + } else { + if (ipv6_addr_any(&np->rcv_saddr)) + ipv6_addr_copy(&sin->sin6_addr, &np->saddr); + else + ipv6_addr_copy(&sin->sin6_addr, &np->rcv_saddr); + + sin->sin6_port = inet->sport; + } + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = sk->sk_bound_dev_if; + *uaddr_len = sizeof(*sin); + return(0); +} + +int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err = -EINVAL; + + switch(cmd) + { + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + case SIOCADDRT: + case SIOCDELRT: + + return(ipv6_route_ioctl(cmd,(void __user *)arg)); + + case SIOCSIFADDR: + return addrconf_add_ifaddr((void __user *) arg); + case SIOCDIFADDR: + return addrconf_del_ifaddr((void __user *) arg); + case SIOCSIFDSTADDR: + return addrconf_set_dstaddr((void __user *) arg); + default: + if (!sk->sk_prot->ioctl || + (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) + return(dev_ioctl(cmd,(void __user *) arg)); + return err; + } + /*NOTREACHED*/ + return(0); +} + +struct proto_ops inet6_stream_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = inet_accept, /* ok */ + .getname = inet6_getname, + .poll = tcp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = inet_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = tcp_sendpage +}; + +struct proto_ops inet6_dgram_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = udp_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct net_proto_family inet6_family_ops = { + .family = PF_INET6, + .create = inet6_create, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_SYSCTL +extern void ipv6_sysctl_register(void); +extern void ipv6_sysctl_unregister(void); +#endif + +/* Same as inet6_dgram_ops, sans udp_poll. 
*/ +static struct proto_ops inet6_sockraw_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, /* ok */ + .socketpair = sock_no_socketpair, /* a do nothing */ + .accept = sock_no_accept, /* a do nothing */ + .getname = inet6_getname, + .poll = datagram_poll, /* ok */ + .ioctl = inet6_ioctl, /* must change */ + .listen = sock_no_listen, /* ok */ + .shutdown = inet_shutdown, /* ok */ + .setsockopt = sock_common_setsockopt, /* ok */ + .getsockopt = sock_common_getsockopt, /* ok */ + .sendmsg = inet_sendmsg, /* ok */ + .recvmsg = sock_common_recvmsg, /* ok */ + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct inet_protosw rawv6_protosw = { + .type = SOCK_RAW, + .protocol = IPPROTO_IP, /* wild card */ + .prot = &rawv6_prot, + .ops = &inet6_sockraw_ops, + .capability = CAP_NET_RAW, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_REUSE, +}; + +void +inet6_register_protosw(struct inet_protosw *p) +{ + struct list_head *lh; + struct inet_protosw *answer; + int protocol = p->protocol; + struct list_head *last_perm; + + spin_lock_bh(&inetsw6_lock); + + if (p->type >= SOCK_MAX) + goto out_illegal; + + /* If we are trying to override a permanent protocol, bail. */ + answer = NULL; + last_perm = &inetsw6[p->type]; + list_for_each(lh, &inetsw6[p->type]) { + answer = list_entry(lh, struct inet_protosw, list); + + /* Check only the non-wild match. */ + if (INET_PROTOSW_PERMANENT & answer->flags) { + if (protocol == answer->protocol) + break; + last_perm = lh; + } + + answer = NULL; + } + if (answer) + goto out_permanent; + + /* Add the new entry after the last permanent entry if any, so that + * the new entry does not override a permanent entry when matched with + * a wild-card protocol. But it is allowed to override any existing + * non-permanent entry. This means that when we remove this entry, the + * system automatically returns to the old behavior. 
+ */ + list_add_rcu(&p->list, last_perm); +out: + spin_unlock_bh(&inetsw6_lock); + return; + +out_permanent: + printk(KERN_ERR "Attempt to override permanent protocol %d.\n", + protocol); + goto out; + +out_illegal: + printk(KERN_ERR + "Ignoring attempt to register invalid socket type %d.\n", + p->type); + goto out; +} + +void +inet6_unregister_protosw(struct inet_protosw *p) +{ + if (INET_PROTOSW_PERMANENT & p->flags) { + printk(KERN_ERR + "Attempt to unregister permanent protocol %d.\n", + p->protocol); + } else { + spin_lock_bh(&inetsw6_lock); + list_del_rcu(&p->list); + spin_unlock_bh(&inetsw6_lock); + + synchronize_net(); + } +} + +int +snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) +{ + if (ptr == NULL) + return -EINVAL; + + ptr[0] = __alloc_percpu(mibsize, mibalign); + if (!ptr[0]) + goto err0; + + ptr[1] = __alloc_percpu(mibsize, mibalign); + if (!ptr[1]) + goto err1; + + return 0; + +err1: + free_percpu(ptr[0]); + ptr[0] = NULL; +err0: + return -ENOMEM; +} + +void +snmp6_mib_free(void *ptr[2]) +{ + if (ptr == NULL) + return; + if (ptr[0]) + free_percpu(ptr[0]); + if (ptr[1]) + free_percpu(ptr[1]); + ptr[0] = ptr[1] = NULL; +} + +static int __init init_ipv6_mibs(void) +{ + if (snmp6_mib_init((void **)ipv6_statistics, sizeof (struct ipstats_mib), + __alignof__(struct ipstats_mib)) < 0) + goto err_ip_mib; + if (snmp6_mib_init((void **)icmpv6_statistics, sizeof (struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp_mib; + if (snmp6_mib_init((void **)udp_stats_in6, sizeof (struct udp_mib), + __alignof__(struct udp_mib)) < 0) + goto err_udp_mib; + return 0; + +err_udp_mib: + snmp6_mib_free((void **)icmpv6_statistics); +err_icmp_mib: + snmp6_mib_free((void **)ipv6_statistics); +err_ip_mib: + return -ENOMEM; + +} + +static void cleanup_ipv6_mibs(void) +{ + snmp6_mib_free((void **)ipv6_statistics); + snmp6_mib_free((void **)icmpv6_statistics); + snmp6_mib_free((void **)udp_stats_in6); +} + +extern int ipv6_misc_proc_init(void); + +static int __init inet6_init(void) +{ + struct sk_buff *dummy_skb; + struct list_head *r; + int err; + +#ifdef MODULE +#if 0 /* FIXME --RR */ + if (!mod_member_present(&__this_module, can_unload)) + return -EINVAL; + + __this_module.can_unload = &ipv6_unload; +#endif +#endif + + if (sizeof(struct inet6_skb_parm) > sizeof(dummy_skb->cb)) { + printk(KERN_CRIT "inet6_proto_init: size fault\n"); + return -EINVAL; + } + + err = proto_register(&tcpv6_prot, 1); + if (err) + goto out; + + err = proto_register(&udpv6_prot, 1); + if (err) + goto out_unregister_tcp_proto; + + err = proto_register(&rawv6_prot, 1); + if (err) + goto out_unregister_udp_proto; + + + /* Register the socket-side information for inet6_create. */ + for(r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) + INIT_LIST_HEAD(r); + + /* We MUST register RAW sockets before we create the ICMP6, + * IGMP6, or NDISC control sockets. + */ + inet6_register_protosw(&rawv6_protosw); + + /* Register the family here so that the init calls below will + * be able to create sockets. (?? is this dangerous ??) + */ + (void) sock_register(&inet6_family_ops); + + /* Initialise ipv6 mibs */ + err = init_ipv6_mibs(); + if (err) + goto out_unregister_raw_proto; + + /* + * ipngwg API draft makes clear that the correct semantics + * for TCP and UDP is to consider one TCP and UDP instance + * in a host availiable by both INET and INET6 APIs and + * able to communicate via both network protocols. 
+ */ + +#ifdef CONFIG_SYSCTL + ipv6_sysctl_register(); +#endif + err = icmpv6_init(&inet6_family_ops); + if (err) + goto icmp_fail; + err = ndisc_init(&inet6_family_ops); + if (err) + goto ndisc_fail; + err = igmp6_init(&inet6_family_ops); + if (err) + goto igmp_fail; + /* Create /proc/foo6 entries. */ +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (raw6_proc_init()) + goto proc_raw6_fail; + if (tcp6_proc_init()) + goto proc_tcp6_fail; + if (udp6_proc_init()) + goto proc_udp6_fail; + if (ipv6_misc_proc_init()) + goto proc_misc6_fail; + + if (ac6_proc_init()) + goto proc_anycast6_fail; + if (if6_proc_init()) + goto proc_if6_fail; +#endif + ipv6_packet_init(); + ip6_route_init(); + ip6_flowlabel_init(); + err = addrconf_init(); + if (err) + goto addrconf_fail; + sit_init(); + + /* Init v6 extension headers. */ + ipv6_rthdr_init(); + ipv6_frag_init(); + ipv6_nodata_init(); + ipv6_destopt_init(); + + /* Init v6 transport protocols. */ + udpv6_init(); + tcpv6_init(); + err = 0; +out: + return err; + +addrconf_fail: + ip6_flowlabel_cleanup(); + ip6_route_cleanup(); + ipv6_packet_cleanup(); +#ifdef CONFIG_PROC_FS + if6_proc_exit(); +proc_if6_fail: + ac6_proc_exit(); +proc_anycast6_fail: + ipv6_misc_proc_exit(); +proc_misc6_fail: + udp6_proc_exit(); +proc_udp6_fail: + tcp6_proc_exit(); +proc_tcp6_fail: + raw6_proc_exit(); +proc_raw6_fail: +#endif + igmp6_cleanup(); +igmp_fail: + ndisc_cleanup(); +ndisc_fail: + icmpv6_cleanup(); +icmp_fail: +#ifdef CONFIG_SYSCTL + ipv6_sysctl_unregister(); +#endif + cleanup_ipv6_mibs(); +out_unregister_raw_proto: + proto_unregister(&rawv6_prot); +out_unregister_udp_proto: + proto_unregister(&udpv6_prot); +out_unregister_tcp_proto: + proto_unregister(&tcpv6_prot); + goto out; +} +module_init(inet6_init); + +static void __exit inet6_exit(void) +{ + /* First of all disallow new sockets creation. */ + sock_unregister(PF_INET6); +#ifdef CONFIG_PROC_FS + if6_proc_exit(); + ac6_proc_exit(); + ipv6_misc_proc_exit(); + udp6_proc_exit(); + tcp6_proc_exit(); + raw6_proc_exit(); +#endif + /* Cleanup code parts. */ + sit_cleanup(); + ip6_flowlabel_cleanup(); + addrconf_cleanup(); + ip6_route_cleanup(); + ipv6_packet_cleanup(); + igmp6_cleanup(); + ndisc_cleanup(); + icmpv6_cleanup(); +#ifdef CONFIG_SYSCTL + ipv6_sysctl_unregister(); +#endif + cleanup_ipv6_mibs(); + proto_unregister(&rawv6_prot); + proto_unregister(&udpv6_prot); + proto_unregister(&tcpv6_prot); +} +module_exit(inet6_exit); + +MODULE_ALIAS_NETPROTO(PF_INET6); diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c new file mode 100644 index 000000000000..e3ecf626cbf7 --- /dev/null +++ b/net/ipv6/ah6.c @@ -0,0 +1,478 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro + * + * This file is derived from net/ipv4/ah.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int zero_out_mutable_opts(struct ipv6_opt_hdr *opthdr) +{ + u8 *opt = (u8 *)opthdr; + int len = ipv6_optlen(opthdr); + int off = 0; + int optlen = 0; + + off += 2; + len -= 2; + + while (len > 0) { + + switch (opt[off]) { + + case IPV6_TLV_PAD0: + optlen = 1; + break; + default: + if (len < 2) + goto bad; + optlen = opt[off+1]+2; + if (len < optlen) + goto bad; + if (opt[off] & 0x20) + memset(&opt[off+2], 0, opt[off+1]); + break; + } + + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; + +bad: + return 0; +} + +/** + * ipv6_rearrange_rthdr - rearrange IPv6 routing header + * @iph: IPv6 header + * @rthdr: routing header + * + * Rearrange the destination address in @iph and the addresses in @rthdr + * so that they appear in the order they will at the final destination. + * See Appendix A2 of RFC 2402 for details. + */ +static void ipv6_rearrange_rthdr(struct ipv6hdr *iph, struct ipv6_rt_hdr *rthdr) +{ + int segments, segments_left; + struct in6_addr *addrs; + struct in6_addr final_addr; + + segments_left = rthdr->segments_left; + if (segments_left == 0) + return; + rthdr->segments_left = 0; + + /* The value of rthdr->hdrlen has been verified either by the system + * call if it is locally generated, or by ipv6_rthdr_rcv() for incoming + * packets. So we can assume that it is even and that segments is + * greater than or equal to segments_left. + * + * For the same reason we can assume that this option is of type 0. + */ + segments = rthdr->hdrlen >> 1; + + addrs = ((struct rt0_hdr *)rthdr)->addr; + ipv6_addr_copy(&final_addr, addrs + segments - 1); + + addrs += segments - segments_left; + memmove(addrs + 1, addrs, (segments_left - 1) * sizeof(*addrs)); + + ipv6_addr_copy(addrs, &iph->daddr); + ipv6_addr_copy(&iph->daddr, &final_addr); +} + +static int ipv6_clear_mutable_options(struct ipv6hdr *iph, int len) +{ + union { + struct ipv6hdr *iph; + struct ipv6_opt_hdr *opth; + struct ipv6_rt_hdr *rth; + char *raw; + } exthdr = { .iph = iph }; + char *end = exthdr.raw + len; + int nexthdr = iph->nexthdr; + + exthdr.iph++; + + while (exthdr.raw < end) { + switch (nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!zero_out_mutable_opts(exthdr.opth)) { + LIMIT_NETDEBUG(printk( + KERN_WARNING "overrun %sopts\n", + nexthdr == NEXTHDR_HOP ? 
+ "hop" : "dest")); + return -EINVAL; + } + break; + + case NEXTHDR_ROUTING: + ipv6_rearrange_rthdr(iph, exthdr.rth); + break; + + default : + return 0; + } + + nexthdr = exthdr.opth->nexthdr; + exthdr.raw += ipv6_optlen(exthdr.opth); + } + + return 0; +} + +static int ah6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int extlen; + struct ipv6hdr *top_iph; + struct ip_auth_hdr *ah; + struct ah_data *ahp; + u8 nexthdr; + char tmp_base[8]; + struct { + struct in6_addr daddr; + char hdrs[0]; + } *tmp_ext; + + top_iph = (struct ipv6hdr *)skb->data; + top_iph->payload_len = htons(skb->len - sizeof(*top_iph)); + + nexthdr = *skb->nh.raw; + *skb->nh.raw = IPPROTO_AH; + + /* When there are no extension headers, we only need to save the first + * 8 bytes of the base IP header. + */ + memcpy(tmp_base, top_iph, sizeof(tmp_base)); + + tmp_ext = NULL; + extlen = skb->h.raw - (unsigned char *)(top_iph + 1); + if (extlen) { + extlen += sizeof(*tmp_ext); + tmp_ext = kmalloc(extlen, GFP_ATOMIC); + if (!tmp_ext) { + err = -ENOMEM; + goto error; + } + memcpy(tmp_ext, &top_iph->daddr, extlen); + err = ipv6_clear_mutable_options(top_iph, + extlen - sizeof(*tmp_ext) + + sizeof(*top_iph)); + if (err) + goto error_free_iph; + } + + ah = (struct ip_auth_hdr *)skb->h.raw; + ah->nexthdr = nexthdr; + + top_iph->priority = 0; + top_iph->flow_lbl[0] = 0; + top_iph->flow_lbl[1] = 0; + top_iph->flow_lbl[2] = 0; + top_iph->hop_limit = 0; + + ahp = x->data; + ah->hdrlen = (XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + + ahp->icv_trunc_len) >> 2) - 2; + + ah->reserved = 0; + ah->spi = x->id.spi; + ah->seq_no = htonl(++x->replay.oseq); + ahp->icv(ahp, skb, ah->auth_data); + + err = 0; + + memcpy(top_iph, tmp_base, sizeof(tmp_base)); + if (tmp_ext) { + memcpy(&top_iph->daddr, tmp_ext, extlen); +error_free_iph: + kfree(tmp_ext); + } + +error: + return err; +} + +static int ah6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + /* + * Before process AH + * [IPv6][Ext1][Ext2][AH][Dest][Payload] + * |<-------------->| hdr_len + * + * To erase AH: + * Keeping copy of cleared headers. After AH processing, + * Moving the pointer of skb->nh.raw by using skb_pull as long as AH + * header length. Then copy back the copy as long as hdr_len + * If destination header following AH exists, copy it into after [Ext2]. + * + * |<>|[IPv6][Ext1][Ext2][Dest][Payload] + * There is offset of AH before IPv6 header after the process. + */ + + struct ipv6_auth_hdr *ah; + struct ah_data *ahp; + unsigned char *tmp_hdr = NULL; + u16 hdr_len; + u16 ah_hlen; + int nexthdr; + + if (!pskb_may_pull(skb, sizeof(struct ip_auth_hdr))) + goto out; + + /* We are going to _remove_ AH header to keep sockets happy, + * so... Later this can change. 
*/ + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto out; + + hdr_len = skb->data - skb->nh.raw; + ah = (struct ipv6_auth_hdr*)skb->data; + ahp = x->data; + nexthdr = ah->nexthdr; + ah_hlen = (ah->hdrlen + 2) << 2; + + if (ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_full_len) && + ah_hlen != XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len)) + goto out; + + if (!pskb_may_pull(skb, ah_hlen)) + goto out; + + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) + goto out; + memcpy(tmp_hdr, skb->nh.raw, hdr_len); + if (ipv6_clear_mutable_options(skb->nh.ipv6h, hdr_len)) + goto out; + skb->nh.ipv6h->priority = 0; + skb->nh.ipv6h->flow_lbl[0] = 0; + skb->nh.ipv6h->flow_lbl[1] = 0; + skb->nh.ipv6h->flow_lbl[2] = 0; + skb->nh.ipv6h->hop_limit = 0; + + { + u8 auth_data[MAX_AH_AUTH_LEN]; + + memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); + memset(ah->auth_data, 0, ahp->icv_trunc_len); + skb_push(skb, skb->data - skb->nh.raw); + ahp->icv(ahp, skb, ah->auth_data); + if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { + LIMIT_NETDEBUG( + printk(KERN_WARNING "ipsec ah authentication error\n")); + x->stats.integrity_failed++; + goto free_out; + } + } + + skb->nh.raw = skb_pull(skb, ah_hlen); + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + skb_pull(skb, hdr_len); + skb->h.raw = skb->data; + + + kfree(tmp_hdr); + + return nexthdr; + +free_out: + kfree(tmp_hdr); +out: + return -EINVAL; +} + +static void ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ip_auth_hdr *ah = (struct ip_auth_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, ah->spi, IPPROTO_AH, AF_INET6); + if (!x) + return; + + NETDEBUG(printk(KERN_DEBUG "pmtu discovery on SA AH/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(ah->spi), NIP6(iph->daddr))); + + xfrm_state_put(x); +} + +static int ah6_init_state(struct xfrm_state *x, void *args) +{ + struct ah_data *ahp = NULL; + struct xfrm_algo_desc *aalg_desc; + + if (!x->aalg) + goto error; + + /* null auth can use a zero length key */ + if (x->aalg->alg_key_len > 512) + goto error; + + if (x->encap) + goto error; + + ahp = kmalloc(sizeof(*ahp), GFP_KERNEL); + if (ahp == NULL) + return -ENOMEM; + + memset(ahp, 0, sizeof(*ahp)); + + ahp->key = x->aalg->alg_key; + ahp->key_len = (x->aalg->alg_key_len+7)/8; + ahp->tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (!ahp->tfm) + goto error; + ahp->icv = ah_hmac_digest; + + /* + * Lookup the algorithm description maintained by xfrm_algo, + * verify crypto transform properties, and store information + * we need for AH processing. This lookup cannot fail here + * after a successful crypto_alloc_tfm(). 
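ah6_init_state derives icv_full_len and icv_trunc_len from the xfrm algorithm description looked up just below, and ah6_output above encodes the AH length field as 32-bit words minus 2, per RFC 2402. A small arithmetic check for the common HMAC-SHA1-96 case, assuming the usual 12-byte fixed AH header and a 160-bit ICV truncated to 96 bits:

#include <stdio.h>

#define XFRM_ALIGN8(len)  (((len) + 7) & ~7)

int main(void)
{
    int fixed_ah  = 12;   /* next hdr, len, reserved, SPI, sequence number */
    int icv_trunc = 12;   /* HMAC-SHA1-96: 160-bit digest truncated to 96 bits */
    int total     = XFRM_ALIGN8(fixed_ah + icv_trunc);  /* 24, already 8-aligned */
    int hdrlen    = (total >> 2) - 2;                   /* 32-bit words minus 2 */

    printf("AH total %d bytes, hdrlen field = %d\n", total, hdrlen);  /* 24, 4 */
    return 0;
}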
+ */ + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if (aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(ahp->tfm)) { + printk(KERN_INFO "AH: %s digestsize %u != %hu\n", + x->aalg->alg_name, crypto_tfm_alg_digestsize(ahp->tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + ahp->icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + ahp->icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + BUG_ON(ahp->icv_trunc_len > MAX_AH_AUTH_LEN); + + ahp->work_icv = kmalloc(ahp->icv_full_len, GFP_KERNEL); + if (!ahp->work_icv) + goto error; + + x->props.header_len = XFRM_ALIGN8(sizeof(struct ipv6_auth_hdr) + ahp->icv_trunc_len); + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + x->data = ahp; + + return 0; + +error: + if (ahp) { + if (ahp->work_icv) + kfree(ahp->work_icv); + if (ahp->tfm) + crypto_free_tfm(ahp->tfm); + kfree(ahp); + } + return -EINVAL; +} + +static void ah6_destroy(struct xfrm_state *x) +{ + struct ah_data *ahp = x->data; + + if (!ahp) + return; + + if (ahp->work_icv) { + kfree(ahp->work_icv); + ahp->work_icv = NULL; + } + if (ahp->tfm) { + crypto_free_tfm(ahp->tfm); + ahp->tfm = NULL; + } + kfree(ahp); +} + +static struct xfrm_type ah6_type = +{ + .description = "AH6", + .owner = THIS_MODULE, + .proto = IPPROTO_AH, + .init_state = ah6_init_state, + .destructor = ah6_destroy, + .input = ah6_input, + .output = ah6_output +}; + +static struct inet6_protocol ah6_protocol = { + .handler = xfrm6_rcv, + .err_handler = ah6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ah6_init(void) +{ + if (xfrm_register_type(&ah6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add xfrm type\n"); + return -EAGAIN; + } + + if (inet6_add_protocol(&ah6_protocol, IPPROTO_AH) < 0) { + printk(KERN_INFO "ipv6 ah init: can't add protocol\n"); + xfrm_unregister_type(&ah6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit ah6_fini(void) +{ + if (inet6_del_protocol(&ah6_protocol, IPPROTO_AH) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove protocol\n"); + + if (xfrm_unregister_type(&ah6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ah close: can't remove xfrm type\n"); + +} + +module_init(ah6_init); +module_exit(ah6_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/anycast.c b/net/ipv6/anycast.c new file mode 100644 index 000000000000..5d22ca3cca2e --- /dev/null +++ b/net/ipv6/anycast.c @@ -0,0 +1,594 @@ +/* + * Anycast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * David L Stevens (dlstevens@us.ibm.com) + * + * based heavily on net/ipv6/mcast.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr); + +/* Big ac list lock for all the sockets */ +static DEFINE_RWLOCK(ipv6_sk_ac_lock); + +static int +ip6_onlink(struct in6_addr *addr, struct net_device *dev) +{ + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; + int onlink; + + onlink = 0; + read_lock(&addrconf_lock); + idev = __in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (ifa=idev->addr_list; ifa; ifa=ifa->if_next) { + onlink = ipv6_prefix_equal(addr, &ifa->addr, + ifa->prefix_len); + if (onlink) + break; + } + read_unlock_bh(&idev->lock); + } + read_unlock(&addrconf_lock); + return onlink; +} + +/* + * socket join an anycast group + */ + +int ipv6_sock_ac_join(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct inet6_dev *idev; + struct ipv6_ac_socklist *pac; + int ishost = !ipv6_devconf.forwarding; + int err = 0; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (ipv6_addr_is_multicast(addr)) + return -EINVAL; + if (ipv6_chk_addr(addr, NULL, 0)) + return -EINVAL; + + pac = sock_kmalloc(sk, sizeof(struct ipv6_ac_socklist), GFP_KERNEL); + if (pac == NULL) + return -ENOMEM; + pac->acl_next = NULL; + ipv6_addr_copy(&pac->acl_addr, addr); + + if (ifindex == 0) { + struct rt6_info *rt; + + rt = rt6_lookup(addr, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dev_hold(dev); + dst_release(&rt->u.dst); + } else if (ishost) { + err = -EADDRNOTAVAIL; + goto out_free_pac; + } else { + /* router, no matching interface: just pick one */ + + dev = dev_get_by_flags(IFF_UP, IFF_UP|IFF_LOOPBACK); + } + } else + dev = dev_get_by_index(ifindex); + + if (dev == NULL) { + err = -ENODEV; + goto out_free_pac; + } + + idev = in6_dev_get(dev); + if (!idev) { + if (ifindex) + err = -ENODEV; + else + err = -EADDRNOTAVAIL; + goto out_dev_put; + } + /* reset ishost, now that we have a specific device */ + ishost = !idev->cnf.forwarding; + in6_dev_put(idev); + + pac->acl_ifindex = dev->ifindex; + + /* XXX + * For hosts, allow link-local or matching prefix anycasts. + * This obviates the need for propagating anycast routes while + * still allowing some non-router anycast participation. 
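ipv6_sock_ac_join above is the kernel side of per-socket anycast membership: it requires CAP_NET_ADMIN, rejects multicast and locally assigned unicast addresses, and, per the comment above, lets a host join only link-local or on-link anycast addresses. A user-space sketch of requesting membership, assuming the Linux IPV6_JOIN_ANYCAST socket option (normally provided by the system headers; the fallback value, interface name and address below are illustrative) and sufficient privilege:

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/socket.h>

#ifndef IPV6_JOIN_ANYCAST
#define IPV6_JOIN_ANYCAST 27    /* Linux-specific value, see <linux/in6.h> */
#endif

int main(void)
{
    int fd = socket(AF_INET6, SOCK_DGRAM, 0);
    struct ipv6_mreq mreq;

    if (fd < 0) { perror("socket"); return 1; }

    memset(&mreq, 0, sizeof(mreq));
    /* example anycast address; on a host it must be link-local or on-link */
    inet_pton(AF_INET6, "2001:db8::42", &mreq.ipv6mr_multiaddr);
    mreq.ipv6mr_interface = if_nametoindex("eth0");   /* 0 lets the kernel pick */

    if (setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_ANYCAST,
                   &mreq, sizeof(mreq)) < 0)
        perror("IPV6_JOIN_ANYCAST (needs CAP_NET_ADMIN)");
    else
        printf("joined anycast group\n");
    return 0;
}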
+ */ + if (!ip6_onlink(addr, dev)) { + if (ishost) + err = -EADDRNOTAVAIL; + if (err) + goto out_dev_put; + } + + err = ipv6_dev_ac_inc(dev, addr); + if (err) + goto out_dev_put; + + write_lock_bh(&ipv6_sk_ac_lock); + pac->acl_next = np->ipv6_ac_list; + np->ipv6_ac_list = pac; + write_unlock_bh(&ipv6_sk_ac_lock); + + dev_put(dev); + + return 0; + +out_dev_put: + dev_put(dev); +out_free_pac: + sock_kfree_s(sk, pac, sizeof(*pac)); + return err; +} + +/* + * socket leave an anycast group + */ +int ipv6_sock_ac_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev; + struct ipv6_ac_socklist *pac, *prev_pac; + + write_lock_bh(&ipv6_sk_ac_lock); + prev_pac = NULL; + for (pac = np->ipv6_ac_list; pac; pac = pac->acl_next) { + if ((ifindex == 0 || pac->acl_ifindex == ifindex) && + ipv6_addr_equal(&pac->acl_addr, addr)) + break; + prev_pac = pac; + } + if (!pac) { + write_unlock_bh(&ipv6_sk_ac_lock); + return -ENOENT; + } + if (prev_pac) + prev_pac->acl_next = pac->acl_next; + else + np->ipv6_ac_list = pac->acl_next; + + write_unlock_bh(&ipv6_sk_ac_lock); + + dev = dev_get_by_index(pac->acl_ifindex); + if (dev) { + ipv6_dev_ac_dec(dev, &pac->acl_addr); + dev_put(dev); + } + sock_kfree_s(sk, pac, sizeof(*pac)); + return 0; +} + +void ipv6_sock_ac_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct net_device *dev = NULL; + struct ipv6_ac_socklist *pac; + int prev_index; + + write_lock_bh(&ipv6_sk_ac_lock); + pac = np->ipv6_ac_list; + np->ipv6_ac_list = NULL; + write_unlock_bh(&ipv6_sk_ac_lock); + + prev_index = 0; + while (pac) { + struct ipv6_ac_socklist *next = pac->acl_next; + + if (pac->acl_ifindex != prev_index) { + if (dev) + dev_put(dev); + dev = dev_get_by_index(pac->acl_ifindex); + prev_index = pac->acl_ifindex; + } + if (dev) + ipv6_dev_ac_dec(dev, &pac->acl_addr); + sock_kfree_s(sk, pac, sizeof(*pac)); + pac = next; + } + if (dev) + dev_put(dev); +} + +#if 0 +/* The function is not used, which is funny. Apparently, author + * supposed to use it to filter out datagrams inside udp/raw but forgot. + * + * It is OK, anycasts are not special comparing to delivery to unicasts. + */ + +int inet6_ac_check(struct sock *sk, struct in6_addr *addr, int ifindex) +{ + struct ipv6_ac_socklist *pac; + struct ipv6_pinfo *np = inet6_sk(sk); + int found; + + found = 0; + read_lock(&ipv6_sk_ac_lock); + for (pac=np->ipv6_ac_list; pac; pac=pac->acl_next) { + if (ifindex && pac->acl_ifindex != ifindex) + continue; + found = ipv6_addr_equal(&pac->acl_addr, addr); + if (found) + break; + } + read_unlock(&ipv6_sk_ac_lock); + + return found; +} + +#endif + +static void aca_put(struct ifacaddr6 *ac) +{ + if (atomic_dec_and_test(&ac->aca_refcnt)) { + in6_dev_put(ac->aca_idev); + dst_release(&ac->aca_rt->u.dst); + kfree(ac); + } +} + +/* + * device anycast group inc (add if not found) + */ +int ipv6_dev_ac_inc(struct net_device *dev, struct in6_addr *addr) +{ + struct ifacaddr6 *aca; + struct inet6_dev *idev; + struct rt6_info *rt; + int err; + + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + err = -ENODEV; + goto out; + } + + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) { + aca->aca_users++; + err = 0; + goto out; + } + } + + /* + * not found: create a new one. 
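ipv6_dev_ac_inc below creates the per-device entry with its reference count preset to 2: one reference is owned by idev->ac_list, the other is the temporary reference the function keeps while it installs the route and sends the solicitation, dropped by the final aca_put(). A stand-alone sketch of that idiom using C11 atomics; the struct and free() here merely stand in for the kernel's object and its kfree/in6_dev_put/dst_release teardown:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct anycast_entry {
    atomic_int refcnt;
    /* address, cached route, ... */
};

static void entry_put(struct anycast_entry *aca)
{
    if (atomic_fetch_sub(&aca->refcnt, 1) == 1)
        free(aca);                  /* last reference: release everything */
}

int main(void)
{
    struct anycast_entry *aca = malloc(sizeof(*aca));

    if (!aca)
        return 1;
    /* one reference for the list the entry is published on, one for the
     * code that keeps using 'aca' after the lock is dropped */
    atomic_init(&aca->refcnt, 2);

    /* ... publish on the list, install the route, send the solicitation ... */
    entry_put(aca);                 /* drop the temporary reference */

    /* later, when the entry is unlinked from the list: */
    entry_put(aca);                 /* last put frees it */
    printf("done\n");
    return 0;
}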
+ */ + + aca = kmalloc(sizeof(struct ifacaddr6), GFP_ATOMIC); + + if (aca == NULL) { + err = -ENOMEM; + goto out; + } + + rt = addrconf_dst_alloc(idev, addr, 1); + if (IS_ERR(rt)) { + kfree(aca); + err = PTR_ERR(rt); + goto out; + } + + memset(aca, 0, sizeof(struct ifacaddr6)); + + ipv6_addr_copy(&aca->aca_addr, addr); + aca->aca_idev = idev; + aca->aca_rt = rt; + aca->aca_users = 1; + /* aca_tstamp should be updated upon changes */ + aca->aca_cstamp = aca->aca_tstamp = jiffies; + atomic_set(&aca->aca_refcnt, 2); + spin_lock_init(&aca->aca_lock); + + aca->aca_next = idev->ac_list; + idev->ac_list = aca; + write_unlock_bh(&idev->lock); + + dst_hold(&rt->u.dst); + if (ip6_ins_rt(rt, NULL, NULL)) + dst_release(&rt->u.dst); + + addrconf_join_solict(dev, &aca->aca_addr); + + aca_put(aca); + return 0; +out: + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return err; +} + +/* + * device anycast group decrement + */ +int __ipv6_dev_ac_dec(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct ifacaddr6 *aca, *prev_aca; + + write_lock_bh(&idev->lock); + prev_aca = NULL; + for (aca = idev->ac_list; aca; aca = aca->aca_next) { + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + prev_aca = aca; + } + if (!aca) { + write_unlock_bh(&idev->lock); + return -ENOENT; + } + if (--aca->aca_users > 0) { + write_unlock_bh(&idev->lock); + return 0; + } + if (prev_aca) + prev_aca->aca_next = aca->aca_next; + else + idev->ac_list = aca->aca_next; + write_unlock_bh(&idev->lock); + addrconf_leave_solict(idev, &aca->aca_addr); + + dst_hold(&aca->aca_rt->u.dst); + if (ip6_del_rt(aca->aca_rt, NULL, NULL)) + dst_free(&aca->aca_rt->u.dst); + else + dst_release(&aca->aca_rt->u.dst); + + aca_put(aca); + return 0; +} + +static int ipv6_dev_ac_dec(struct net_device *dev, struct in6_addr *addr) +{ + int ret; + struct inet6_dev *idev = in6_dev_get(dev); + if (idev == NULL) + return -ENODEV; + ret = __ipv6_dev_ac_dec(idev, addr); + in6_dev_put(idev); + return ret; +} + +/* + * check if the interface has this anycast address + */ +static int ipv6_chk_acast_dev(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev; + struct ifacaddr6 *aca; + + idev = in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (aca = idev->ac_list; aca; aca = aca->aca_next) + if (ipv6_addr_equal(&aca->aca_addr, addr)) + break; + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + return aca != 0; + } + return 0; +} + +/* + * check if given interface (or any, if dev==0) has this anycast address + */ +int ipv6_chk_acast_addr(struct net_device *dev, struct in6_addr *addr) +{ + if (dev) + return ipv6_chk_acast_dev(dev, addr); + read_lock(&dev_base_lock); + for (dev=dev_base; dev; dev=dev->next) + if (ipv6_chk_acast_dev(dev, addr)) + break; + read_unlock(&dev_base_lock); + return dev != 0; +} + + +#ifdef CONFIG_PROC_FS +struct ac6_iter_state { + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define ac6_seq_private(seq) ((struct ac6_iter_state *)(seq)->private) + +static inline struct ifacaddr6 *ac6_get_first(struct seq_file *seq) +{ + struct ifacaddr6 *im = NULL; + struct ac6_iter_state *state = ac6_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL; + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; + idev = in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->ac_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + } + return im; +} + +static struct ifacaddr6 *ac6_get_next(struct 
seq_file *seq, struct ifacaddr6 *im) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + + im = im->aca_next; + while (!im) { + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->ac_list; + } + return im; +} + +static struct ifacaddr6 *ac6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifacaddr6 *im = ac6_get_first(seq); + if (im) + while (pos && (im = ac6_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *ac6_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return ac6_get_idx(seq, *pos); +} + +static void *ac6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifacaddr6 *im; + im = ac6_get_next(seq, v); + ++*pos; + return im; +} + +static void ac6_seq_stop(struct seq_file *seq, void *v) +{ + struct ac6_iter_state *state = ac6_seq_private(seq); + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + read_unlock(&dev_base_lock); +} + +static int ac6_seq_show(struct seq_file *seq, void *v) +{ + struct ifacaddr6 *im = (struct ifacaddr6 *)v; + struct ac6_iter_state *state = ac6_seq_private(seq); + + seq_printf(seq, + "%-4d %-15s " + "%04x%04x%04x%04x%04x%04x%04x%04x " + "%5d\n", + state->dev->ifindex, state->dev->name, + NIP6(im->aca_addr), + im->aca_users); + return 0; +} + +static struct seq_operations ac6_seq_ops = { + .start = ac6_seq_start, + .next = ac6_seq_next, + .stop = ac6_seq_stop, + .show = ac6_seq_show, +}; + +static int ac6_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ac6_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ac6_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations ac6_seq_fops = { + .owner = THIS_MODULE, + .open = ac6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init ac6_proc_init(void) +{ + if (!proc_net_fops_create("anycast6", S_IRUGO, &ac6_seq_fops)) + return -ENOMEM; + + return 0; +} + +void ac6_proc_exit(void) +{ + proc_net_remove("anycast6"); +} +#endif + diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c new file mode 100644 index 000000000000..65b9375df57d --- /dev/null +++ b/net/ipv6/datagram.c @@ -0,0 +1,600 @@ +/* + * common UDP/RAW code + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * $Id: datagram.c,v 1.24 2002/02/01 22:01:04 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +int ip6_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr, *final_p = NULL, final; + struct dst_entry *dst; + struct flowi fl; + struct ip6_flowlabel *flowlabel = NULL; + int addr_type; + int err; + + if (usin->sin6_family == AF_INET) { + if (__ipv6_only_sock(sk)) + return -EAFNOSUPPORT; + err = ip4_datagram_connect(sk, uaddr, addr_len); + goto ipv4_connected; + } + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl, 0, sizeof(fl)); + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + } + } + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if (addr_type == IPV6_ADDR_ANY) { + /* + * connect to self + */ + usin->sin6_addr.s6_addr[15] = 0x01; + } + + daddr = &usin->sin6_addr; + + if (addr_type == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + + if (__ipv6_only_sock(sk)) { + err = -ENETUNREACH; + goto out; + } + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + sin.sin_port = usin->sin6_port; + + err = ip4_datagram_connect(sk, + (struct sockaddr*) &sin, + sizeof(sin)); + +ipv4_connected: + if (err) + goto out; + + ipv6_addr_set(&np->daddr, 0, 0, htonl(0x0000ffff), inet->daddr); + + if (ipv6_addr_any(&np->saddr)) { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000ffff), + inet->saddr); + } + + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000ffff), + inet->rcv_saddr); + } + goto out; + } + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) { + err = -EINVAL; + goto out; + } + sk->sk_bound_dev_if = usin->sin6_scope_id; + if (!sk->sk_bound_dev_if && + (addr_type & IPV6_ADDR_MULTICAST)) + fl.oif = np->mcast_oif; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) { + err = -EINVAL; + goto out; + } + } + + ipv6_addr_copy(&np->daddr, daddr); + np->flow_label = fl.fl6_flowlabel; + + inet->dport = usin->sin6_port; + + /* + * Check for a route to destination an obtain the + * destination cache for it. 
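The link-local branch above is why connecting a datagram socket to an fe80:: peer fails with EINVAL unless the socket is already bound to a device or the caller supplies sin6_scope_id. A minimal user-space illustration (interface name, port and address are examples only, not from the patch):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>
#include <net/if.h>
#include <netinet/in.h>
#include <sys/socket.h>

int main(void)
{
    int fd = socket(AF_INET6, SOCK_DGRAM, 0);
    struct sockaddr_in6 peer;

    if (fd < 0) { perror("socket"); return 1; }

    memset(&peer, 0, sizeof(peer));
    peer.sin6_family = AF_INET6;
    peer.sin6_port = htons(9);                    /* discard service, example only */
    inet_pton(AF_INET6, "fe80::1", &peer.sin6_addr);
    peer.sin6_scope_id = if_nametoindex("eth0");  /* required for link-local */

    if (connect(fd, (struct sockaddr *)&peer, sizeof(peer)) < 0)
        perror("connect");                        /* EINVAL if scope_id is 0 */
    else
        printf("connected; route and source address are now cached\n");
    return 0;
}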
+ */ + + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (!fl.oif && (addr_type&IPV6_ADDR_MULTICAST)) + fl.oif = np->mcast_oif; + + if (flowlabel) { + if (flowlabel->opt && flowlabel->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) flowlabel->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + } else if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + dst_release(dst); + goto out; + } + + /* source address lookup done in ip6_dst_lookup */ + + if (ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&np->saddr, &fl.fl6_src); + + if (ipv6_addr_any(&np->rcv_saddr)) { + ipv6_addr_copy(&np->rcv_saddr, &fl.fl6_src); + inet->rcv_saddr = LOOPBACK4_IPV6; + } + + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl.fl6_dst, &np->daddr) ? + &np->daddr : NULL); + + sk->sk_state = TCP_ESTABLISHED; +out: + fl6_sock_release(flowlabel); + return err; +} + +void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, + u16 port, u32 info, u8 *payload) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct icmp6hdr *icmph = (struct icmp6hdr *)skb->h.raw; + struct sock_exterr_skb *serr; + + if (!np->recverr) + return; + + skb = skb_clone(skb, GFP_ATOMIC); + if (!skb) + return; + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_ICMP6; + serr->ee.ee_type = icmph->icmp6_type; + serr->ee.ee_code = icmph->icmp6_code; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&(((struct ipv6hdr*)(icmph+1))->daddr) - skb->nh.raw; + serr->port = port; + + skb->h.raw = payload; + __skb_pull(skb, payload - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +void ipv6_local_error(struct sock *sk, int err, struct flowi *fl, u32 info) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct ipv6hdr *iph; + struct sk_buff *skb; + + if (!np->recverr) + return; + + skb = alloc_skb(sizeof(struct ipv6hdr), GFP_ATOMIC); + if (!skb) + return; + + iph = (struct ipv6hdr*)skb_put(skb, sizeof(struct ipv6hdr)); + skb->nh.ipv6h = iph; + ipv6_addr_copy(&iph->daddr, &fl->fl6_dst); + + serr = SKB_EXT_ERR(skb); + serr->ee.ee_errno = err; + serr->ee.ee_origin = SO_EE_ORIGIN_LOCAL; + serr->ee.ee_type = 0; + serr->ee.ee_code = 0; + serr->ee.ee_pad = 0; + serr->ee.ee_info = info; + serr->ee.ee_data = 0; + serr->addr_offset = (u8*)&iph->daddr - skb->nh.raw; + serr->port = fl->fl_ip_dport; + + skb->h.raw = skb->tail; + __skb_pull(skb, skb->tail - skb->data); + + if (sock_queue_err_skb(sk, skb)) + kfree_skb(skb); +} + +/* + * Handle MSG_ERRQUEUE + */ +int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sock_exterr_skb *serr; + struct sk_buff *skb, *skb2; + struct sockaddr_in6 *sin; + struct { + struct sock_extended_err ee; + struct sockaddr_in6 offender; + } errhdr; + int err; + int copied; + + err = -EAGAIN; + skb = skb_dequeue(&sk->sk_error_queue); + if (skb == NULL) + goto out; + + copied = skb->len; + if 
(copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free_skb; + + sock_recv_timestamp(msg, sk, skb); + + serr = SKB_EXT_ERR(skb); + + sin = (struct sockaddr_in6 *)msg->msg_name; + if (sin) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_port = serr->port; + sin->sin6_scope_id = 0; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { + ipv6_addr_copy(&sin->sin6_addr, + (struct in6_addr *)(skb->nh.raw + serr->addr_offset)); + if (np->sndflow) + sin->sin6_flowinfo = *(u32*)(skb->nh.raw + serr->addr_offset - 24) & IPV6_FLOWINFO_MASK; + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + ipv6_addr_set(&sin->sin6_addr, 0, 0, + htonl(0xffff), + *(u32*)(skb->nh.raw + serr->addr_offset)); + } + } + + memcpy(&errhdr.ee, &serr->ee, sizeof(struct sock_extended_err)); + sin = &errhdr.offender; + sin->sin6_family = AF_UNSPEC; + if (serr->ee.ee_origin != SO_EE_ORIGIN_LOCAL) { + sin->sin6_family = AF_INET6; + sin->sin6_flowinfo = 0; + sin->sin6_scope_id = 0; + if (serr->ee.ee_origin == SO_EE_ORIGIN_ICMP6) { + ipv6_addr_copy(&sin->sin6_addr, &skb->nh.ipv6h->saddr); + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin->sin6_scope_id = IP6CB(skb)->iif; + } else { + struct inet_sock *inet = inet_sk(sk); + + ipv6_addr_set(&sin->sin6_addr, 0, 0, + htonl(0xffff), + skb->nh.iph->saddr); + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } + } + + put_cmsg(msg, SOL_IPV6, IPV6_RECVERR, sizeof(errhdr), &errhdr); + + /* Now we could try to dump offended packet options */ + + msg->msg_flags |= MSG_ERRQUEUE; + err = copied; + + /* Reset and regenerate socket error */ + spin_lock_irq(&sk->sk_error_queue.lock); + sk->sk_err = 0; + if ((skb2 = skb_peek(&sk->sk_error_queue)) != NULL) { + sk->sk_err = SKB_EXT_ERR(skb2)->ee.ee_errno; + spin_unlock_irq(&sk->sk_error_queue.lock); + sk->sk_error_report(sk); + } else { + spin_unlock_irq(&sk->sk_error_queue.lock); + } + +out_free_skb: + kfree_skb(skb); +out: + return err; +} + + + +int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + + src_info.ipi6_ifindex = opt->iif; + ipv6_addr_copy(&src_info.ipi6_addr, &skb->nh.ipv6h->daddr); + put_cmsg(msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + + if (np->rxopt.bits.rxhlim) { + int hlim = skb->nh.ipv6h->hop_limit; + put_cmsg(msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + + if (np->rxopt.bits.rxflow && (*(u32*)skb->nh.raw & IPV6_FLOWINFO_MASK)) { + u32 flowinfo = *(u32*)skb->nh.raw & IPV6_FLOWINFO_MASK; + put_cmsg(msg, SOL_IPV6, IPV6_FLOWINFO, sizeof(flowinfo), &flowinfo); + } + if (np->rxopt.bits.hopopts && opt->hop) { + u8 *ptr = skb->nh.raw + opt->hop; + put_cmsg(msg, SOL_IPV6, IPV6_HOPOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.dstopts && opt->dst0) { + u8 *ptr = skb->nh.raw + opt->dst0; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, (ptr[1]+1)<<3, ptr); + } + if (np->rxopt.bits.srcrt && opt->srcrt) { + struct ipv6_rt_hdr *rthdr = (struct ipv6_rt_hdr *)(skb->nh.raw + opt->srcrt); + put_cmsg(msg, SOL_IPV6, IPV6_RTHDR, (rthdr->hdrlen+1) << 3, rthdr); + } + if (np->rxopt.bits.dstopts && opt->dst1) { + u8 *ptr = skb->nh.raw + opt->dst1; + put_cmsg(msg, SOL_IPV6, IPV6_DSTOPTS, 
(ptr[1]+1)<<3, ptr); + } + return 0; +} + +int datagram_send_ctl(struct msghdr *msg, struct flowi *fl, + struct ipv6_txoptions *opt, + int *hlimit) +{ + struct in6_pktinfo *src_info; + struct cmsghdr *cmsg; + struct ipv6_rt_hdr *rthdr; + struct ipv6_opt_hdr *hdr; + int len; + int err = 0; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + int addr_type; + struct net_device *dev = NULL; + + if (!CMSG_OK(msg, cmsg)) { + err = -EINVAL; + goto exit_f; + } + + if (cmsg->cmsg_level != SOL_IPV6) + continue; + + switch (cmsg->cmsg_type) { + case IPV6_PKTINFO: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct in6_pktinfo))) { + err = -EINVAL; + goto exit_f; + } + + src_info = (struct in6_pktinfo *)CMSG_DATA(cmsg); + + if (src_info->ipi6_ifindex) { + if (fl->oif && src_info->ipi6_ifindex != fl->oif) + return -EINVAL; + fl->oif = src_info->ipi6_ifindex; + } + + addr_type = ipv6_addr_type(&src_info->ipi6_addr); + + if (addr_type == IPV6_ADDR_ANY) + break; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (!src_info->ipi6_ifindex) + return -EINVAL; + else { + dev = dev_get_by_index(src_info->ipi6_ifindex); + if (!dev) + return -ENODEV; + } + } + if (!ipv6_chk_addr(&src_info->ipi6_addr, dev, 0)) { + if (dev) + dev_put(dev); + err = -EINVAL; + goto exit_f; + } + if (dev) + dev_put(dev); + + ipv6_addr_copy(&fl->fl6_src, &src_info->ipi6_addr); + break; + + case IPV6_FLOWINFO: + if (cmsg->cmsg_len < CMSG_LEN(4)) { + err = -EINVAL; + goto exit_f; + } + + if (fl->fl6_flowlabel&IPV6_FLOWINFO_MASK) { + if ((fl->fl6_flowlabel^*(u32 *)CMSG_DATA(cmsg))&~IPV6_FLOWINFO_MASK) { + err = -EINVAL; + goto exit_f; + } + } + fl->fl6_flowlabel = IPV6_FLOWINFO_MASK & *(u32 *)CMSG_DATA(cmsg); + break; + + case IPV6_HOPOPTS: + if (opt->hopopt || cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + opt->opt_nflen += len; + opt->hopopt = hdr; + break; + + case IPV6_DSTOPTS: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_opt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + hdr = (struct ipv6_opt_hdr *)CMSG_DATA(cmsg); + len = ((hdr->hdrlen + 1) << 3); + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + if (!capable(CAP_NET_RAW)) { + err = -EPERM; + goto exit_f; + } + if (opt->dst1opt) { + err = -EINVAL; + goto exit_f; + } + opt->opt_flen += len; + opt->dst1opt = hdr; + break; + + case IPV6_RTHDR: + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct ipv6_rt_hdr))) { + err = -EINVAL; + goto exit_f; + } + + rthdr = (struct ipv6_rt_hdr *)CMSG_DATA(cmsg); + + /* + * TYPE 0 + */ + if (rthdr->type) { + err = -EINVAL; + goto exit_f; + } + + len = ((rthdr->hdrlen + 1) << 3); + + if (cmsg->cmsg_len < CMSG_LEN(len)) { + err = -EINVAL; + goto exit_f; + } + + /* segments left must also match */ + if ((rthdr->hdrlen >> 1) != rthdr->segments_left) { + err = -EINVAL; + goto exit_f; + } + + opt->opt_nflen += len; + opt->srcrt = rthdr; + + if (opt->dst1opt) { + int dsthdrlen = ((opt->dst1opt->hdrlen+1)<<3); + + opt->opt_nflen += dsthdrlen; + opt->dst0opt = opt->dst1opt; + opt->dst1opt = NULL; + opt->opt_flen -= dsthdrlen; + } + + break; + + case IPV6_HOPLIMIT: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(int))) { + err = -EINVAL; + goto exit_f; + } + + *hlimit = *(int *)CMSG_DATA(cmsg); + break; + + default: + LIMIT_NETDEBUG( + printk(KERN_DEBUG 
"invalid cmsg type: %d\n", cmsg->cmsg_type)); + err = -EINVAL; + break; + }; + } + +exit_f: + return err; +} diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c new file mode 100644 index 000000000000..be7095d6babe --- /dev/null +++ b/net/ipv6/esp6.c @@ -0,0 +1,424 @@ +/* + * Copyright (C)2002 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors + * + * Mitsuru KANDA @USAGI : IPv6 Support + * Kazunori MIYAZAWA @USAGI : + * Kunihiro Ishiguro + * + * This file is derived from net/ipv4/esp.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + int hdr_len; + struct ipv6hdr *top_iph; + struct ipv6_esp_hdr *esph; + struct crypto_tfm *tfm; + struct esp_data *esp; + struct sk_buff *trailer; + int blksize; + int clen; + int alen; + int nfrags; + + esp = x->data; + hdr_len = skb->h.raw - skb->data + + sizeof(*esph) + esp->conf.ivlen; + + /* Strip IP+ESP header. */ + __skb_pull(skb, hdr_len); + + /* Now skb is pure payload to encrypt */ + err = -ENOMEM; + + /* Round to block size */ + clen = skb->len; + + alen = esp->auth.icv_trunc_len; + tfm = esp->conf.tfm; + blksize = (crypto_tfm_alg_blocksize(tfm) + 3) & ~3; + clen = (clen + 2 + blksize-1)&~(blksize-1); + if (esp->conf.padlen) + clen = (clen + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + if ((nfrags = skb_cow_data(skb, clen-skb->len+alen, &trailer)) < 0) { + goto error; + } + + /* Fill padding... 
*/ + do { + int i; + for (i=0; ilen - 2; i++) + *(u8*)(trailer->tail + i) = i+1; + } while (0); + *(u8*)(trailer->tail + clen-skb->len - 2) = (clen - skb->len)-2; + pskb_put(skb, trailer, clen - skb->len); + + top_iph = (struct ipv6hdr *)__skb_push(skb, hdr_len); + esph = (struct ipv6_esp_hdr *)skb->h.raw; + top_iph->payload_len = htons(skb->len + alen - sizeof(*top_iph)); + *(u8*)(trailer->tail - 1) = *skb->nh.raw; + *skb->nh.raw = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(++x->replay.oseq); + + if (esp->conf.ivlen) + crypto_cipher_set_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + + do { + struct scatterlist *sg = &esp->sgbuf[0]; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) + goto error; + } + skb_to_sgvec(skb, sg, esph->enc_data+esp->conf.ivlen-skb->data, clen); + crypto_cipher_encrypt(tfm, sg, sg, clen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + } while (0); + + if (esp->conf.ivlen) { + memcpy(esph->enc_data, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + crypto_cipher_get_iv(tfm, esp->conf.ivec, crypto_tfm_alg_ivsize(tfm)); + } + + if (esp->auth.icv_full_len) { + esp->auth.icv(esp, skb, (u8*)esph-skb->data, + sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen+clen, trailer->tail); + pskb_put(skb, trailer, alen); + } + + err = 0; + +error: + return err; +} + +static int esp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + struct ipv6hdr *iph; + struct ipv6_esp_hdr *esph; + struct esp_data *esp = x->data; + struct sk_buff *trailer; + int blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + int alen = esp->auth.icv_trunc_len; + int elen = skb->len - sizeof(struct ipv6_esp_hdr) - esp->conf.ivlen - alen; + + int hdr_len = skb->h.raw - skb->nh.raw; + int nfrags; + unsigned char *tmp_hdr = NULL; + int ret = 0; + + if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { + ret = -EINVAL; + goto out_nofree; + } + + if (elen <= 0 || (elen & (blksize-1))) { + ret = -EINVAL; + goto out_nofree; + } + + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) { + ret = -ENOMEM; + goto out_nofree; + } + memcpy(tmp_hdr, skb->nh.raw, hdr_len); + + /* If integrity check is required, do this. */ + if (esp->auth.icv_full_len) { + u8 sum[esp->auth.icv_full_len]; + u8 sum1[alen]; + + esp->auth.icv(esp, skb, 0, skb->len-alen, sum); + + if (skb_copy_bits(skb, skb->len-alen, sum1, alen)) + BUG(); + + if (unlikely(memcmp(sum, sum1, alen))) { + x->stats.integrity_failed++; + ret = -EINVAL; + goto out; + } + } + + if ((nfrags = skb_cow_data(skb, 0, &trailer)) < 0) { + ret = -EINVAL; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + esph = (struct ipv6_esp_hdr*)skb->data; + iph = skb->nh.ipv6h; + + /* Get ivec. This can be wrong, check against another impls. 
*/ + if (esp->conf.ivlen) + crypto_cipher_set_iv(esp->conf.tfm, esph->enc_data, crypto_tfm_alg_ivsize(esp->conf.tfm)); + + { + u8 nexthdr[2]; + struct scatterlist *sg = &esp->sgbuf[0]; + u8 padlen; + + if (unlikely(nfrags > ESP_NUM_FAST_SG)) { + sg = kmalloc(sizeof(struct scatterlist)*nfrags, GFP_ATOMIC); + if (!sg) { + ret = -ENOMEM; + goto out; + } + } + skb_to_sgvec(skb, sg, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen, elen); + crypto_cipher_decrypt(esp->conf.tfm, sg, sg, elen); + if (unlikely(sg != &esp->sgbuf[0])) + kfree(sg); + + if (skb_copy_bits(skb, skb->len-alen-2, nexthdr, 2)) + BUG(); + + padlen = nexthdr[0]; + if (padlen+2 >= elen) { + LIMIT_NETDEBUG( + printk(KERN_WARNING "ipsec esp packet is garbage padlen=%d, elen=%d\n", padlen+2, elen)); + ret = -EINVAL; + goto out; + } + /* ... check padding bits here. Silly. :-) */ + + pskb_trim(skb, skb->len - alen - padlen - 2); + skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); + skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + ret = nexthdr[1]; + } + +out: + kfree(tmp_hdr); +out_nofree: + return ret; +} + +static u32 esp6_get_max_size(struct xfrm_state *x, int mtu) +{ + struct esp_data *esp = x->data; + u32 blksize = crypto_tfm_alg_blocksize(esp->conf.tfm); + + if (x->props.mode) { + mtu = (mtu + 2 + blksize-1)&~(blksize-1); + } else { + /* The worst case. */ + mtu += 2 + blksize; + } + if (esp->conf.padlen) + mtu = (mtu + esp->conf.padlen-1)&~(esp->conf.padlen-1); + + return mtu + x->props.header_len + esp->auth.icv_full_len; +} + +static void esp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ipv6_esp_hdr *esph = (struct ipv6_esp_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && + type != ICMPV6_PKT_TOOBIG) + return; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, esph->spi, IPPROTO_ESP, AF_INET6); + if (!x) + return; + printk(KERN_DEBUG "pmtu discovery on SA ESP/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + ntohl(esph->spi), NIP6(iph->daddr)); + xfrm_state_put(x); +} + +static void esp6_destroy(struct xfrm_state *x) +{ + struct esp_data *esp = x->data; + + if (!esp) + return; + + if (esp->conf.tfm) { + crypto_free_tfm(esp->conf.tfm); + esp->conf.tfm = NULL; + } + if (esp->conf.ivec) { + kfree(esp->conf.ivec); + esp->conf.ivec = NULL; + } + if (esp->auth.tfm) { + crypto_free_tfm(esp->auth.tfm); + esp->auth.tfm = NULL; + } + if (esp->auth.work_icv) { + kfree(esp->auth.work_icv); + esp->auth.work_icv = NULL; + } + kfree(esp); +} + +static int esp6_init_state(struct xfrm_state *x, void *args) +{ + struct esp_data *esp = NULL; + + /* null auth and encryption can have zero length keys */ + if (x->aalg) { + if (x->aalg->alg_key_len > 512) + goto error; + } + if (x->ealg == NULL) + goto error; + + if (x->encap) + goto error; + + esp = kmalloc(sizeof(*esp), GFP_KERNEL); + if (esp == NULL) + return -ENOMEM; + + memset(esp, 0, sizeof(*esp)); + + if (x->aalg) { + struct xfrm_algo_desc *aalg_desc; + + esp->auth.key = x->aalg->alg_key; + esp->auth.key_len = (x->aalg->alg_key_len+7)/8; + esp->auth.tfm = crypto_alloc_tfm(x->aalg->alg_name, 0); + if (esp->auth.tfm == NULL) + goto error; + esp->auth.icv = esp_hmac_digest; + + aalg_desc = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + BUG_ON(!aalg_desc); + + if 
(aalg_desc->uinfo.auth.icv_fullbits/8 != + crypto_tfm_alg_digestsize(esp->auth.tfm)) { + printk(KERN_INFO "ESP: %s digestsize %u != %hu\n", + x->aalg->alg_name, + crypto_tfm_alg_digestsize(esp->auth.tfm), + aalg_desc->uinfo.auth.icv_fullbits/8); + goto error; + } + + esp->auth.icv_full_len = aalg_desc->uinfo.auth.icv_fullbits/8; + esp->auth.icv_trunc_len = aalg_desc->uinfo.auth.icv_truncbits/8; + + esp->auth.work_icv = kmalloc(esp->auth.icv_full_len, GFP_KERNEL); + if (!esp->auth.work_icv) + goto error; + } + esp->conf.key = x->ealg->alg_key; + esp->conf.key_len = (x->ealg->alg_key_len+7)/8; + if (x->props.ealgo == SADB_EALG_NULL) + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_ECB); + else + esp->conf.tfm = crypto_alloc_tfm(x->ealg->alg_name, CRYPTO_TFM_MODE_CBC); + if (esp->conf.tfm == NULL) + goto error; + esp->conf.ivlen = crypto_tfm_alg_ivsize(esp->conf.tfm); + esp->conf.padlen = 0; + if (esp->conf.ivlen) { + esp->conf.ivec = kmalloc(esp->conf.ivlen, GFP_KERNEL); + if (unlikely(esp->conf.ivec == NULL)) + goto error; + get_random_bytes(esp->conf.ivec, esp->conf.ivlen); + } + if (crypto_cipher_setkey(esp->conf.tfm, esp->conf.key, esp->conf.key_len)) + goto error; + x->props.header_len = sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + x->data = esp; + return 0; + +error: + x->data = esp; + esp6_destroy(x); + x->data = NULL; + return -EINVAL; +} + +static struct xfrm_type esp6_type = +{ + .description = "ESP6", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .init_state = esp6_init_state, + .destructor = esp6_destroy, + .get_max_size = esp6_get_max_size, + .input = esp6_input, + .output = esp6_output +}; + +static struct inet6_protocol esp6_protocol = { + .handler = xfrm6_rcv, + .err_handler = esp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init esp6_init(void) +{ + if (xfrm_register_type(&esp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&esp6_protocol, IPPROTO_ESP) < 0) { + printk(KERN_INFO "ipv6 esp init: can't add protocol\n"); + xfrm_unregister_type(&esp6_type, AF_INET6); + return -EAGAIN; + } + + return 0; +} + +static void __exit esp6_fini(void) +{ + if (inet6_del_protocol(&esp6_protocol, IPPROTO_ESP) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove protocol\n"); + if (xfrm_unregister_type(&esp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 esp close: can't remove xfrm type\n"); +} + +module_init(esp6_init); +module_exit(esp6_fini); + +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c new file mode 100644 index 000000000000..e0839eafc3a9 --- /dev/null +++ b/net/ipv6/exthdrs.c @@ -0,0 +1,575 @@ +/* + * Extension Header handling for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Andi Kleen + * Alexey Kuznetsov + * + * $Id: exthdrs.c,v 1.13 2001/06/19 15:58:56 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * yoshfuji : ensure not to overrun while parsing + * tlv options. + * Mitsuru KANDA @USAGI and: Remove ipv6_parse_exthdrs(). + * YOSHIFUJI Hideaki @USAGI Register inbound extension header + * handlers as inet6_protocol{}. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Parsing tlv encoded headers. + * + * Parsing function "func" returns 1, if parsing succeed + * and 0, if it failed. + * It MUST NOT touch skb->h. + */ + +struct tlvtype_proc { + int type; + int (*func)(struct sk_buff *skb, int offset); +}; + +/********************* + Generic functions + *********************/ + +/* An unknown option is detected, decide what to do */ + +static int ip6_tlvopt_unknown(struct sk_buff *skb, int optoff) +{ + switch ((skb->nh.raw[optoff] & 0xC0) >> 6) { + case 0: /* ignore */ + return 1; + + case 1: /* drop packet */ + break; + + case 3: /* Send ICMP if not a multicast address and drop packet */ + /* Actually, it is redundant check. icmp_send + will recheck in any case. + */ + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) + break; + case 2: /* send ICMP PARM PROB regardless and drop packet */ + icmpv6_param_prob(skb, ICMPV6_UNK_OPTION, optoff); + return 0; + }; + + kfree_skb(skb); + return 0; +} + +/* Parse tlv encoded option header (hop-by-hop or destination) */ + +static int ip6_parse_tlv(struct tlvtype_proc *procs, struct sk_buff *skb) +{ + struct tlvtype_proc *curr; + int off = skb->h.raw - skb->nh.raw; + int len = ((skb->h.raw[1]+1)<<3); + + if ((skb->h.raw + len) - skb->data > skb_headlen(skb)) + goto bad; + + off += 2; + len -= 2; + + while (len > 0) { + int optlen = skb->nh.raw[off+1]+2; + + switch (skb->nh.raw[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + + case IPV6_TLV_PADN: + break; + + default: /* Other TLV code so scan list */ + if (optlen > len) + goto bad; + for (curr=procs; curr->type >= 0; curr++) { + if (curr->type == skb->nh.raw[off]) { + /* type specific length/alignment + checks will be performed in the + func(). */ + if (curr->func(skb, off) == 0) + return 0; + break; + } + } + if (curr->type < 0) { + if (ip6_tlvopt_unknown(skb, off) == 0) + return 0; + } + break; + } + off += optlen; + len -= optlen; + } + if (len == 0) + return 1; +bad: + kfree_skb(skb); + return 0; +} + +/***************************** + Destination options header. + *****************************/ + +static struct tlvtype_proc tlvprocdestopt_lst[] = { + /* No destination options are defined now */ + {-1, NULL} +}; + +static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + struct inet6_skb_parm *opt = IP6CB(skb); + + if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || + !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + opt->dst1 = skb->h.raw - skb->nh.raw; + + if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) { + skb->h.raw += ((skb->h.raw[1]+1)<<3); + *nhoffp = opt->dst1; + return 1; + } + + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + return -1; +} + +static struct inet6_protocol destopt_protocol = { + .handler = ipv6_destopt_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_destopt_init(void) +{ + if (inet6_add_protocol(&destopt_protocol, IPPROTO_DSTOPTS) < 0) + printk(KERN_ERR "ipv6_destopt_init: Could not register protocol\n"); +} + +/******************************** + NONE header. No data in packet. 
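ip6_parse_tlv and ip6_tlvopt_unknown above implement the RFC 2460 option walk: Pad1 is a single zero byte, PadN is skipped by its length, and for unrecognized options the two high bits of the type select the action (00 skip, 01 drop silently, 10 send ICMP Parameter Problem, 11 send it only if the destination is not multicast). A user-space sketch of the same walk over a raw option area, with no skb or ICMP machinery:

#include <stdio.h>

/* Walk a Hop-by-Hop/Destination option area (the bytes that follow the
 * 2-byte next-header/length prefix).  Returns 0 on success, -1 if the
 * area is malformed. */
static int walk_tlv_opts(const unsigned char *opt, int len)
{
    int off = 0;

    while (off < len) {
        int type = opt[off], optlen;

        if (type == 0) {        /* Pad1 is a single zero byte */
            off++;
            continue;
        }
        if (off + 2 > len || (optlen = opt[off + 1] + 2) > len - off)
            return -1;          /* truncated option */
        if (type != 1)          /* type 1 is PadN, nothing to report */
            printf("option %#x len %d, unknown-action %d\n",
                   type, optlen, (type & 0xC0) >> 6);
        off += optlen;
    }
    return 0;
}

int main(void)
{
    /* Router Alert (type 5, length 2, value 0 = MLD) followed by PadN(2) */
    unsigned char opts[] = { 0x05, 0x02, 0x00, 0x00, 0x01, 0x00 };

    return walk_tlv_opts(opts, sizeof(opts)) ? 1 : 0;
}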
+ ********************************/ + +static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + + kfree_skb(skb); + return 0; +} + +static struct inet6_protocol nodata_protocol = { + .handler = ipv6_nodata_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_nodata_init(void) +{ + if (inet6_add_protocol(&nodata_protocol, IPPROTO_NONE) < 0) + printk(KERN_ERR "ipv6_nodata_init: Could not register protocol\n"); +} + +/******************************** + Routing header. + ********************************/ + +static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + struct inet6_skb_parm *opt = IP6CB(skb); + struct in6_addr *addr; + struct in6_addr daddr; + int n, i; + + struct ipv6_rt_hdr *hdr; + struct rt0_hdr *rthdr; + + if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+8) || + !pskb_may_pull(skb, (skb->h.raw-skb->data)+((skb->h.raw[1]+1)<<3))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + kfree_skb(skb); + return -1; + } + + hdr = (struct ipv6_rt_hdr *) skb->h.raw; + + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr) || + skb->pkt_type != PACKET_HOST) { + IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + +looped_back: + if (hdr->segments_left == 0) { + opt->srcrt = skb->h.raw - skb->nh.raw; + skb->h.raw += (hdr->hdrlen + 1) << 3; + opt->dst0 = opt->dst1; + opt->dst1 = 0; + *nhoffp = (&hdr->nexthdr) - skb->nh.raw; + return 1; + } + + if (hdr->type != IPV6_SRCRT_TYPE_0) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->type) - skb->nh.raw); + return -1; + } + + if (hdr->hdrlen & 0x01) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->hdrlen) - skb->nh.raw); + return -1; + } + + /* + * This is the routing header forwarding algorithm from + * RFC 2460, page 16. + */ + + n = hdr->hdrlen >> 1; + + if (hdr->segments_left > n) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, (&hdr->segments_left) - skb->nh.raw); + return -1; + } + + /* We are about to mangle packet header. Be careful! + Do not damage packets queued somewhere. 
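The forwarding code that follows is the RFC 2460 type 0 routing header step: decrement segments_left, take the address at 1-based index n - segments_left, swap it with the destination address, and re-route the packet. A stand-alone sketch of just that address bookkeeping, with 16-byte arrays standing in for struct in6_addr:

#include <stdio.h>
#include <string.h>

/* One RFC 2460 type-0 routing header step.
 * addrs[0..n-1] is the address vector, daddr the current destination. */
static void rthdr_step(unsigned char addrs[][16], int n,
                       unsigned char daddr[16], int *segments_left)
{
    unsigned char tmp[16];
    int i = n - --(*segments_left);    /* 1-based index of the next hop */

    memcpy(tmp, addrs[i - 1], 16);
    memcpy(addrs[i - 1], daddr, 16);   /* record where we are now */
    memcpy(daddr, tmp, 16);            /* forward to the next listed hop */
}

int main(void)
{
    unsigned char addrs[2][16] = { { 1 }, { 2 } };   /* stand-ins for hops */
    unsigned char daddr[16] = { 9 };                 /* current destination */
    int segments_left = 2, n = 2;

    rthdr_step(addrs, n, daddr, &segments_left);
    printf("segments_left=%d, new daddr[0]=%d, addrs[0][0]=%d\n",
           segments_left, daddr[0], addrs[0][0]);    /* 1, 1, 9 */
    return 0;
}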
+ */ + if (skb_cloned(skb)) { + struct sk_buff *skb2 = skb_copy(skb, GFP_ATOMIC); + kfree_skb(skb); + /* the copy is a forwarded packet */ + if (skb2 == NULL) { + IP6_INC_STATS_BH(IPSTATS_MIB_OUTDISCARDS); + return -1; + } + *skbp = skb = skb2; + opt = IP6CB(skb2); + hdr = (struct ipv6_rt_hdr *) skb2->h.raw; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + + i = n - --hdr->segments_left; + + rthdr = (struct rt0_hdr *) hdr; + addr = rthdr->addr; + addr += i - 1; + + if (ipv6_addr_is_multicast(addr)) { + IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); + kfree_skb(skb); + return -1; + } + + ipv6_addr_copy(&daddr, addr); + ipv6_addr_copy(addr, &skb->nh.ipv6h->daddr); + ipv6_addr_copy(&skb->nh.ipv6h->daddr, &daddr); + + dst_release(xchg(&skb->dst, NULL)); + ip6_route_input(skb); + if (skb->dst->error) { + skb_push(skb, skb->data - skb->nh.raw); + dst_input(skb); + return -1; + } + + if (skb->dst->dev->flags&IFF_LOOPBACK) { + if (skb->nh.ipv6h->hop_limit <= 1) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, skb->dev); + kfree_skb(skb); + return -1; + } + skb->nh.ipv6h->hop_limit--; + goto looped_back; + } + + skb_push(skb, skb->data - skb->nh.raw); + dst_input(skb); + return -1; +} + +static struct inet6_protocol rthdr_protocol = { + .handler = ipv6_rthdr_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_rthdr_init(void) +{ + if (inet6_add_protocol(&rthdr_protocol, IPPROTO_ROUTING) < 0) + printk(KERN_ERR "ipv6_rthdr_init: Could not register protocol\n"); +}; + +/* + This function inverts received rthdr. + NOTE: specs allow to make it automatically only if + packet authenticated. + + I will not discuss it here (though, I am really pissed off at + this stupid requirement making rthdr idea useless) + + Actually, it creates severe problems for us. + Embryonic requests has no associated sockets, + so that user have no control over it and + cannot not only to set reply options, but + even to know, that someone wants to connect + without success. :-( + + For now we need to test the engine, so that I created + temporary (or permanent) backdoor. + If listening socket set IPV6_RTHDR to 2, then we invert header. + --ANK (980729) + */ + +struct ipv6_txoptions * +ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) +{ + /* Received rthdr: + + [ H1 -> H2 -> ... H_prev ] daddr=ME + + Inverted result: + [ H_prev -> ... -> H1 ] daddr =sender + + Note, that IP output engine will rewrite this rthdr + by rotating it left by one addr. + */ + + int n, i; + struct rt0_hdr *rthdr = (struct rt0_hdr*)hdr; + struct rt0_hdr *irthdr; + struct ipv6_txoptions *opt; + int hdrlen = ipv6_optlen(hdr); + + if (hdr->segments_left || + hdr->type != IPV6_SRCRT_TYPE_0 || + hdr->hdrlen & 0x01) + return NULL; + + n = hdr->hdrlen >> 1; + opt = sock_kmalloc(sk, sizeof(*opt) + hdrlen, GFP_ATOMIC); + if (opt == NULL) + return NULL; + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + hdrlen; + opt->srcrt = (void*)(opt+1); + opt->opt_nflen = hdrlen; + + memcpy(opt->srcrt, hdr, sizeof(*hdr)); + irthdr = (struct rt0_hdr*)opt->srcrt; + /* Obsolete field, MBZ, when originated by us */ + irthdr->bitmap = 0; + opt->srcrt->segments_left = n; + for (i=0; i<n; i++) + memcpy(irthdr->addr+i, rthdr->addr+(n-1-i), 16); + return opt; +} + +/********************************** + Hop-by-hop options.
+ **********************************/ + +/* Router Alert as of RFC 2711 */ + +static int ipv6_hop_ra(struct sk_buff *skb, int optoff) +{ + if (skb->nh.raw[optoff+1] == 2) { + IP6CB(skb)->ra = optoff; + return 1; + } + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ipv6_hop_ra: wrong RA length %d\n", skb->nh.raw[optoff+1])); + kfree_skb(skb); + return 0; +} + +/* Jumbo payload */ + +static int ipv6_hop_jumbo(struct sk_buff *skb, int optoff) +{ + u32 pkt_len; + + if (skb->nh.raw[optoff+1] != 4 || (optoff&3) != 2) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ipv6_hop_jumbo: wrong jumbo opt length/alignment %d\n", skb->nh.raw[optoff+1])); + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + pkt_len = ntohl(*(u32*)(skb->nh.raw+optoff+2)); + if (pkt_len <= IPV6_MAXPLEN) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff+2); + return 0; + } + if (skb->nh.ipv6h->payload_len) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, optoff); + return 0; + } + + if (pkt_len > skb->len - sizeof(struct ipv6hdr)) { + IP6_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } + if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { + __pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr)); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } + return 1; + +drop: + kfree_skb(skb); + return 0; +} + +static struct tlvtype_proc tlvprochopopt_lst[] = { + { + .type = IPV6_TLV_ROUTERALERT, + .func = ipv6_hop_ra, + }, + { + .type = IPV6_TLV_JUMBO, + .func = ipv6_hop_jumbo, + }, + { -1, } +}; + +int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff) +{ + IP6CB(skb)->hop = sizeof(struct ipv6hdr); + if (ip6_parse_tlv(tlvprochopopt_lst, skb)) + return sizeof(struct ipv6hdr); + return -1; +} + +/* + * Creating outbound headers. + * + * "build" functions work when skb is filled from head to tail (datagram) + * "push" functions work when headers are added from tail to head (tcp) + * + * In both cases we assume, that caller reserved enough room + * for headers. 
+ */ + +static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, + struct ipv6_rt_hdr *opt, + struct in6_addr **addr_p) +{ + struct rt0_hdr *phdr, *ihdr; + int hops; + + ihdr = (struct rt0_hdr *) opt; + + phdr = (struct rt0_hdr *) skb_push(skb, (ihdr->rt_hdr.hdrlen + 1) << 3); + memcpy(phdr, ihdr, sizeof(struct rt0_hdr)); + + hops = ihdr->rt_hdr.hdrlen >> 1; + + if (hops > 1) + memcpy(phdr->addr, ihdr->addr + 1, + (hops - 1) * sizeof(struct in6_addr)); + + ipv6_addr_copy(phdr->addr + (hops - 1), *addr_p); + *addr_p = ihdr->addr; + + phdr->rt_hdr.nexthdr = *proto; + *proto = NEXTHDR_ROUTING; +} + +static void ipv6_push_exthdr(struct sk_buff *skb, u8 *proto, u8 type, struct ipv6_opt_hdr *opt) +{ + struct ipv6_opt_hdr *h = (struct ipv6_opt_hdr *)skb_push(skb, ipv6_optlen(opt)); + + memcpy(h, opt, ipv6_optlen(opt)); + h->nexthdr = *proto; + *proto = type; +} + +void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, + u8 *proto, + struct in6_addr **daddr) +{ + if (opt->srcrt) + ipv6_push_rthdr(skb, proto, opt->srcrt, daddr); + if (opt->dst0opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst0opt); + if (opt->hopopt) + ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); +} + +void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) +{ + if (opt->dst1opt) + ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); +} + +struct ipv6_txoptions * +ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) +{ + struct ipv6_txoptions *opt2; + + opt2 = sock_kmalloc(sk, opt->tot_len, GFP_ATOMIC); + if (opt2) { + long dif = (char*)opt2 - (char*)opt; + memcpy(opt2, opt, opt->tot_len); + if (opt2->hopopt) + *((char**)&opt2->hopopt) += dif; + if (opt2->dst0opt) + *((char**)&opt2->dst0opt) += dif; + if (opt2->dst1opt) + *((char**)&opt2->dst1opt) += dif; + if (opt2->srcrt) + *((char**)&opt2->srcrt) += dif; + } + return opt2; +} diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c new file mode 100644 index 000000000000..6dda815c013f --- /dev/null +++ b/net/ipv6/exthdrs_core.c @@ -0,0 +1,109 @@ +/* + * IPv6 library code, needed by static components when full IPv6 support is + * not configured or static. + */ +#include + +/* + * find out if nexthdr is a well-known extension header or a protocol + */ + +int ipv6_ext_hdr(u8 nexthdr) +{ + /* + * find out if nexthdr is an extension header or a protocol + */ + return ( (nexthdr == NEXTHDR_HOP) || + (nexthdr == NEXTHDR_ROUTING) || + (nexthdr == NEXTHDR_FRAGMENT) || + (nexthdr == NEXTHDR_AUTH) || + (nexthdr == NEXTHDR_NONE) || + (nexthdr == NEXTHDR_DEST) ); +} + +/* + * Skip any extension headers. This is used by the ICMP module. + * + * Note that strictly speaking this conflicts with RFC 2460 4.0: + * ...The contents and semantics of each extension header determine whether + * or not to proceed to the next header. Therefore, extension headers must + * be processed strictly in the order they appear in the packet; a + * receiver must not, for example, scan through a packet looking for a + * particular kind of extension header and process that header prior to + * processing all preceding ones. + * + * We do exactly this. This is a protocol bug. We can't decide after a + * seeing an unknown discard-with-error flavour TLV option if it's a + * ICMP error message or not (errors should never be send in reply to + * ICMP error messages). + * + * But I see no other way to do this. This might need to be reexamined + * when Linux implements ESP (and maybe AUTH) headers. 
+ * --AK + * + * This function parses (probably truncated) exthdr set "hdr" + * of length "len". "nexthdrp" initially points to some place, + * where type of the first header can be found. + * + * It skips all well-known exthdrs, and returns pointer to the start + * of unparsable area i.e. the first header with unknown type. + * If it is not NULL *nexthdr is updated by type/protocol of this header. + * + * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL. + * - it may return pointer pointing beyond end of packet, + * if the last recognized header is truncated in the middle. + * - if packet is truncated, so that all parsed headers are skipped, + * it returns NULL. + * - First fragment header is skipped, not-first ones + * are considered as unparsable. + * - ESP is unparsable for now and considered like + * normal payload protocol. + * - Note also special handling of AUTH header. Thanks to IPsec wizards. + * + * --ANK (980726) + */ + +int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len) +{ + u8 nexthdr = *nexthdrp; + + while (ipv6_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return -1; + if (nexthdr == NEXTHDR_NONE) + return -1; + hp = skb_header_pointer(skb, start, sizeof(_hdr), &_hdr); + if (hp == NULL) + BUG(); + if (nexthdr == NEXTHDR_FRAGMENT) { + unsigned short _frag_off, *fp; + fp = skb_header_pointer(skb, + start+offsetof(struct frag_hdr, + frag_off), + sizeof(_frag_off), + &_frag_off); + if (fp == NULL) + return -1; + + if (ntohs(*fp) & ~0x7) + break; + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + nexthdr = hp->nexthdr; + len -= hdrlen; + start += hdrlen; + } + + *nexthdrp = nexthdr; + return start; +} + +EXPORT_SYMBOL(ipv6_ext_hdr); +EXPORT_SYMBOL(ipv6_skip_exthdr); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c new file mode 100644 index 000000000000..87b9082ceab2 --- /dev/null +++ b/net/ipv6/icmp.c @@ -0,0 +1,822 @@ +/* + * Internet Control Message Protocol (ICMPv6) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * $Id: icmp.c,v 1.38 2002/02/08 03:57:19 davem Exp $ + * + * Based on net/ipv4/icmp.c + * + * RFC 1885 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Andi Kleen : exception handling + * Andi Kleen add rate limits. never reply to a icmp. + * add more length checks and other fixes. + * yoshfuji : ensure to sent parameter problem for + * fragments. + * YOSHIFUJI Hideaki @USAGI: added sysctl for icmp rate limit. + * Randy Dunlap and + * YOSHIFUJI Hideaki @USAGI: Per-interface statistics support + * Kazunori MIYAZAWA @USAGI: change output process to use ip6_append_data + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics); + +/* + * The ICMP socket(s). This is the most convenient way to flow control + * our ICMP output as well as maintain a clean interface throughout + * all layers. 
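A rough user-space analogue of the skipping loop in ipv6_skip_exthdr() above, working on a flat buffer rather than an skb. The fragment-header handling is simplified (the kernel additionally stops at non-first fragments) and the names are local to this sketch.

#include <stdio.h>

enum { NH_HOP = 0, NH_ROUTING = 43, NH_FRAGMENT = 44, NH_AUTH = 51,
       NH_NONE = 59, NH_DEST = 60 };

static int is_ext_hdr(unsigned char nh)
{
        return nh == NH_HOP || nh == NH_ROUTING || nh == NH_FRAGMENT ||
               nh == NH_AUTH || nh == NH_NONE || nh == NH_DEST;
}

/* Returns the offset of the upper-layer header, or -1 on truncation/none.
 * "buf" starts at the first extension header, "nexthdr" holds the Next
 * Header value taken from the fixed IPv6 header and is updated in place. */
static int skip_exthdrs(const unsigned char *buf, int len, unsigned char *nexthdr)
{
        int off = 0;

        while (is_ext_hdr(*nexthdr)) {
                int hdrlen;

                if (*nexthdr == NH_NONE || len - off < 2)
                        return -1;
                if (*nexthdr == NH_FRAGMENT)
                        hdrlen = 8;                       /* fixed size */
                else if (*nexthdr == NH_AUTH)
                        hdrlen = (buf[off + 1] + 2) << 2; /* 4-byte units */
                else
                        hdrlen = (buf[off + 1] + 1) << 3; /* 8-byte units */

                if (hdrlen > len - off)
                        return -1;
                *nexthdr = buf[off];                      /* chained Next Header */
                off += hdrlen;
        }
        return off;
}

int main(void)
{
        /* Hop-by-hop header (8 bytes) followed by TCP (protocol 6) */
        unsigned char pkt[8] = { 6 /* next header */, 0 /* hdr ext len */ };
        unsigned char nh = NH_HOP;

        printf("upper layer at offset %d, protocol %u\n",
               skip_exthdrs(pkt, sizeof(pkt), &nh), nh);
        return 0;
}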
All Socketless IP sends will soon be gone. + * + * On SMP we have one ICMP socket per-cpu. + */ +static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL; +#define icmpv6_socket __get_cpu_var(__icmpv6_socket) + +static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp); + +static struct inet6_protocol icmpv6_protocol = { + .handler = icmpv6_rcv, + .flags = INET6_PROTO_FINAL, +}; + +static __inline__ int icmpv6_xmit_lock(void) +{ + local_bh_disable(); + + if (unlikely(!spin_trylock(&icmpv6_socket->sk->sk_lock.slock))) { + /* This can happen if the output path (f.e. SIT or + * ip6ip6 tunnel) signals dst_link_failure() for an + * outgoing ICMP6 packet. + */ + local_bh_enable(); + return 1; + } + return 0; +} + +static __inline__ void icmpv6_xmit_unlock(void) +{ + spin_unlock_bh(&icmpv6_socket->sk->sk_lock.slock); +} + +/* + * Slightly more convenient version of icmpv6_send. + */ +void icmpv6_param_prob(struct sk_buff *skb, int code, int pos) +{ + icmpv6_send(skb, ICMPV6_PARAMPROB, code, pos, skb->dev); + kfree_skb(skb); +} + +/* + * Figure out, may we reply to this packet with icmp error. + * + * We do not reply, if: + * - it was icmp error message. + * - it is truncated, so that it is known, that protocol is ICMPV6 + * (i.e. in the middle of some exthdr) + * + * --ANK (980726) + */ + +static int is_ineligible(struct sk_buff *skb) +{ + int ptr = (u8*)(skb->nh.ipv6h+1) - skb->data; + int len = skb->len - ptr; + __u8 nexthdr = skb->nh.ipv6h->nexthdr; + + if (len < 0) + return 1; + + ptr = ipv6_skip_exthdr(skb, ptr, &nexthdr, len); + if (ptr < 0) + return 0; + if (nexthdr == IPPROTO_ICMPV6) { + u8 _type, *tp; + tp = skb_header_pointer(skb, + ptr+offsetof(struct icmp6hdr, icmp6_type), + sizeof(_type), &_type); + if (tp == NULL || + !(*tp & ICMPV6_INFOMSG_MASK)) + return 1; + } + return 0; +} + +static int sysctl_icmpv6_time = 1*HZ; + +/* + * Check the ICMP output rate limit + */ +static inline int icmpv6_xrlim_allow(struct sock *sk, int type, + struct flowi *fl) +{ + struct dst_entry *dst; + int res = 0; + + /* Informational messages are not limited. */ + if (type & ICMPV6_INFOMSG_MASK) + return 1; + + /* Do not limit pmtu discovery, it would break it. */ + if (type == ICMPV6_PKT_TOOBIG) + return 1; + + /* + * Look up the output route. + * XXX: perhaps the expire for routing entries cloned by + * this lookup should be more aggressive (not longer than timeout). + */ + dst = ip6_route_output(sk, fl); + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + } else if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) { + res = 1; + } else { + struct rt6_info *rt = (struct rt6_info *)dst; + int tmo = sysctl_icmpv6_time; + + /* Give more bandwidth to wider prefixes. 
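The shift just below (tmo >>= (128 - plen) >> 5) gives shorter rate-limit intervals to wider routes: the base interval is halved once for every full 32 bits the destination prefix leaves uncovered. A quick user-space check of that arithmetic, assuming a base interval of 1*HZ with HZ = 1000:

#include <stdio.h>

#define HZ 1000

static int scaled_timeout(int base, int plen)
{
        int tmo = base;

        if (plen < 128)
                tmo >>= (128 - plen) >> 5;      /* halve per 32 uncovered bits */
        return tmo;
}

int main(void)
{
        int plens[] = { 128, 64, 48, 0 };

        for (unsigned i = 0; i < sizeof(plens) / sizeof(plens[0]); i++)
                printf("plen %3d -> %d ticks\n", plens[i],
                       scaled_timeout(1 * HZ, plens[i]));
        return 0;       /* prints 1000, 250, 250, 62 */
}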
*/ + if (rt->rt6i_dst.plen < 128) + tmo >>= ((128 - rt->rt6i_dst.plen)>>5); + + res = xrlim_allow(dst, tmo); + } + dst_release(dst); + return res; +} + +/* + * an inline helper for the "simple" if statement below + * checks if parameter problem report is caused by an + * unrecognized IPv6 option that has the Option Type + * highest-order two bits set to 10 + */ + +static __inline__ int opt_unrec(struct sk_buff *skb, __u32 offset) +{ + u8 _optval, *op; + + offset += skb->nh.raw - skb->data; + op = skb_header_pointer(skb, offset, sizeof(_optval), &_optval); + if (op == NULL) + return 1; + return (*op & 0xC0) == 0x80; +} + +static int icmpv6_push_pending_frames(struct sock *sk, struct flowi *fl, struct icmp6hdr *thdr, int len) +{ + struct sk_buff *skb; + struct icmp6hdr *icmp6h; + int err = 0; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + icmp6h = (struct icmp6hdr*) skb->h.raw; + memcpy(icmp6h, thdr, sizeof(struct icmp6hdr)); + icmp6h->icmp6_cksum = 0; + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + skb->csum = csum_partial((char *)icmp6h, + sizeof(struct icmp6hdr), skb->csum); + icmp6h->icmp6_cksum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, + skb->csum); + } else { + u32 tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + + tmp_csum = csum_partial((char *)icmp6h, + sizeof(struct icmp6hdr), tmp_csum); + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, tmp_csum); + icmp6h->icmp6_cksum = tmp_csum; + } + if (icmp6h->icmp6_cksum == 0) + icmp6h->icmp6_cksum = -1; + ip6_push_pending_frames(sk); +out: + return err; +} + +struct icmpv6_msg { + struct sk_buff *skb; + int offset; +}; + +static int icmpv6_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb) +{ + struct icmpv6_msg *msg = (struct icmpv6_msg *) from; + struct sk_buff *org_skb = msg->skb; + __u32 csum = 0; + + csum = skb_copy_and_csum_bits(org_skb, msg->offset + offset, + to, len, csum); + skb->csum = csum_block_add(skb->csum, csum, odd); + return 0; +} + +/* + * Send an ICMP message in response to a packet in error + */ +void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, + struct net_device *dev) +{ + struct inet6_dev *idev = NULL; + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct sock *sk = icmpv6_socket->sk; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *saddr = NULL; + struct dst_entry *dst; + struct icmp6hdr tmp_hdr; + struct flowi fl; + struct icmpv6_msg msg; + int iif = 0; + int addr_type = 0; + int len; + int hlimit; + int err = 0; + + if ((u8*)hdr < skb->head || (u8*)(hdr+1) > skb->tail) + return; + + /* + * Make sure we respect the rules + * i.e. RFC 1885 2.4(e) + * Rule (e.1) is enforced by not using icmpv6_send + * in any code that processes icmp errors. + */ + addr_type = ipv6_addr_type(&hdr->daddr); + + if (ipv6_chk_addr(&hdr->daddr, skb->dev, 0)) + saddr = &hdr->daddr; + + /* + * Dest addr check + */ + + if ((addr_type & IPV6_ADDR_MULTICAST || skb->pkt_type != PACKET_HOST)) { + if (type != ICMPV6_PKT_TOOBIG && + !(type == ICMPV6_PARAMPROB && + code == ICMPV6_UNK_OPTION && + (opt_unrec(skb, info)))) + return; + + saddr = NULL; + } + + addr_type = ipv6_addr_type(&hdr->saddr); + + /* + * Source addr check + */ + + if (addr_type & IPV6_ADDR_LINKLOCAL) + iif = skb->dev->ifindex; + + /* + * Must not send if we know that source is Anycast also. + * for now we don't know that. 
+ */ + if ((addr_type == IPV6_ADDR_ANY) || (addr_type & IPV6_ADDR_MULTICAST)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmpv6_send: addr_any/mcast source\n")); + return; + } + + /* + * Never answer to a ICMP packet. + */ + if (is_ineligible(skb)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmpv6_send: no reply to icmp error\n")); + return; + } + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_ICMPV6; + ipv6_addr_copy(&fl.fl6_dst, &hdr->saddr); + if (saddr) + ipv6_addr_copy(&fl.fl6_src, saddr); + fl.oif = iif; + fl.fl_icmp_type = type; + fl.fl_icmp_code = code; + + if (icmpv6_xmit_lock()) + return; + + if (!icmpv6_xrlim_allow(sk, type, &fl)) + goto out; + + tmp_hdr.icmp6_type = type; + tmp_hdr.icmp6_code = code; + tmp_hdr.icmp6_cksum = 0; + tmp_hdr.icmp6_pointer = htonl(info); + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out_dst_release; + + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + + msg.skb = skb; + msg.offset = skb->nh.raw - skb->data; + + len = skb->len - msg.offset; + len = min_t(unsigned int, len, IPV6_MIN_MTU - sizeof(struct ipv6hdr) -sizeof(struct icmp6hdr)); + if (len < 0) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmp: len problem\n")); + goto out_dst_release; + } + + idev = in6_dev_get(skb->dev); + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, + len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), + hlimit, NULL, &fl, (struct rt6_info*)dst, + MSG_DONTWAIT); + if (err) { + ip6_flush_pending_frames(sk); + goto out_put; + } + err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, len + sizeof(struct icmp6hdr)); + + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_OUTDESTUNREACHS, type - ICMPV6_DEST_UNREACH); + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); + +out_put: + if (likely(idev != NULL)) + in6_dev_put(idev); +out_dst_release: + dst_release(dst); +out: + icmpv6_xmit_unlock(); +} + +static void icmpv6_echo_reply(struct sk_buff *skb) +{ + struct sock *sk = icmpv6_socket->sk; + struct inet6_dev *idev; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *saddr = NULL; + struct icmp6hdr *icmph = (struct icmp6hdr *) skb->h.raw; + struct icmp6hdr tmp_hdr; + struct flowi fl; + struct icmpv6_msg msg; + struct dst_entry *dst; + int err = 0; + int hlimit; + + saddr = &skb->nh.ipv6h->daddr; + + if (!ipv6_unicast_destination(skb)) + saddr = NULL; + + memcpy(&tmp_hdr, icmph, sizeof(tmp_hdr)); + tmp_hdr.icmp6_type = ICMPV6_ECHO_REPLY; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_ICMPV6; + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + if (saddr) + ipv6_addr_copy(&fl.fl6_src, saddr); + fl.oif = skb->dev->ifindex; + fl.fl_icmp_type = ICMPV6_ECHO_REPLY; + + if (icmpv6_xmit_lock()) + return; + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out_dst_release; + + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + + idev = in6_dev_get(skb->dev); + + msg.skb = skb; + 
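The min_t() clamp in icmpv6_send() above bounds the whole error message to the IPv6 minimum MTU, so at most 1280 - 40 - 8 = 1232 bytes of the offending packet are ever quoted. A small arithmetic sketch with assumed constant values (the real sizes come from the kernel headers):

#include <stdio.h>

#define IPV6_MIN_MTU    1280
#define IPV6_HDR_LEN    40      /* fixed IPv6 header */
#define ICMP6_HDR_LEN   8       /* ICMPv6 header */

int main(void)
{
        int quoted_max = IPV6_MIN_MTU - IPV6_HDR_LEN - ICMP6_HDR_LEN;
        int offender_len = 1500;        /* example original packet length */
        int quoted = offender_len < quoted_max ? offender_len : quoted_max;

        printf("at most %d bytes quoted (here: %d)\n", quoted_max, quoted);
        return 0;       /* prints "at most 1232 bytes quoted (here: 1232)" */
}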
msg.offset = 0; + + err = ip6_append_data(sk, icmpv6_getfrag, &msg, skb->len + sizeof(struct icmp6hdr), + sizeof(struct icmp6hdr), hlimit, NULL, &fl, + (struct rt6_info*)dst, MSG_DONTWAIT); + + if (err) { + ip6_flush_pending_frames(sk); + goto out_put; + } + err = icmpv6_push_pending_frames(sk, &fl, &tmp_hdr, skb->len + sizeof(struct icmp6hdr)); + + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTECHOREPLIES); + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS); + +out_put: + if (likely(idev != NULL)) + in6_dev_put(idev); +out_dst_release: + dst_release(dst); +out: + icmpv6_xmit_unlock(); +} + +static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info) +{ + struct in6_addr *saddr, *daddr; + struct inet6_protocol *ipprot; + struct sock *sk; + int inner_offset; + int hash; + u8 nexthdr; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + return; + + nexthdr = ((struct ipv6hdr *)skb->data)->nexthdr; + if (ipv6_ext_hdr(nexthdr)) { + /* now skip over extension headers */ + inner_offset = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &nexthdr, skb->len - sizeof(struct ipv6hdr)); + if (inner_offset<0) + return; + } else { + inner_offset = sizeof(struct ipv6hdr); + } + + /* Checkin header including 8 bytes of inner protocol header. */ + if (!pskb_may_pull(skb, inner_offset+8)) + return; + + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + + /* BUGGG_FUTURE: we should try to parse exthdrs in this packet. + Without this we will not able f.e. to make source routed + pmtu discovery. + Corresponding argument (opt) to notifiers is already added. + --ANK (980726) + */ + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + rcu_read_lock(); + ipprot = rcu_dereference(inet6_protos[hash]); + if (ipprot && ipprot->err_handler) + ipprot->err_handler(skb, NULL, type, code, inner_offset, info); + rcu_read_unlock(); + + read_lock(&raw_v6_lock); + if ((sk = sk_head(&raw_v6_htable[hash])) != NULL) { + while((sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr))) { + rawv6_err(sk, skb, NULL, type, code, inner_offset, info); + sk = sk_next(sk); + } + } + read_unlock(&raw_v6_lock); +} + +/* + * Handle icmp messages + */ + +static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct net_device *dev = skb->dev; + struct inet6_dev *idev = __in6_dev_get(dev); + struct in6_addr *saddr, *daddr; + struct ipv6hdr *orig_hdr; + struct icmp6hdr *hdr; + int type; + + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INMSGS); + + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + + /* Perform checksum. 
*/ + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb->csum)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ICMPv6 hw checksum failed\n")); + skb->ip_summed = CHECKSUM_NONE; + } + } + if (skb->ip_summed == CHECKSUM_NONE) { + if (csum_ipv6_magic(saddr, daddr, skb->len, IPPROTO_ICMPV6, + skb_checksum(skb, 0, skb->len, 0))) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ICMPv6 checksum failed [%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x > %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x]\n", + NIP6(*saddr), NIP6(*daddr))); + goto discard_it; + } + } + + if (!pskb_pull(skb, sizeof(struct icmp6hdr))) + goto discard_it; + + hdr = (struct icmp6hdr *) skb->h.raw; + + type = hdr->icmp6_type; + + if (type >= ICMPV6_DEST_UNREACH && type <= ICMPV6_PARAMPROB) + ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INDESTUNREACHS, type - ICMPV6_DEST_UNREACH); + else if (type >= ICMPV6_ECHO_REQUEST && type <= NDISC_REDIRECT) + ICMP6_INC_STATS_OFFSET_BH(idev, ICMP6_MIB_INECHOS, type - ICMPV6_ECHO_REQUEST); + + switch (type) { + case ICMPV6_ECHO_REQUEST: + icmpv6_echo_reply(skb); + break; + + case ICMPV6_ECHO_REPLY: + /* we couldn't care less */ + break; + + case ICMPV6_PKT_TOOBIG: + /* BUGGG_FUTURE: if packet contains rthdr, we cannot update + standard destination cache. Seems, only "advanced" + destination cache will allow to solve this problem + --ANK (980726) + */ + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto discard_it; + hdr = (struct icmp6hdr *) skb->h.raw; + orig_hdr = (struct ipv6hdr *) (hdr + 1); + rt6_pmtu_discovery(&orig_hdr->daddr, &orig_hdr->saddr, dev, + ntohl(hdr->icmp6_mtu)); + + /* + * Drop through to notify + */ + + case ICMPV6_DEST_UNREACH: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + break; + + case NDISC_ROUTER_SOLICITATION: + case NDISC_ROUTER_ADVERTISEMENT: + case NDISC_NEIGHBOUR_SOLICITATION: + case NDISC_NEIGHBOUR_ADVERTISEMENT: + case NDISC_REDIRECT: + ndisc_rcv(skb); + break; + + case ICMPV6_MGM_QUERY: + igmp6_event_query(skb); + break; + + case ICMPV6_MGM_REPORT: + igmp6_event_report(skb); + break; + + case ICMPV6_MGM_REDUCTION: + case ICMPV6_NI_QUERY: + case ICMPV6_NI_REPLY: + case ICMPV6_MLD2_REPORT: + case ICMPV6_DHAAD_REQUEST: + case ICMPV6_DHAAD_REPLY: + case ICMPV6_MOBILE_PREFIX_SOL: + case ICMPV6_MOBILE_PREFIX_ADV: + break; + + default: + LIMIT_NETDEBUG( + printk(KERN_DEBUG "icmpv6: msg of unknown type\n")); + + /* informational */ + if (type & ICMPV6_INFOMSG_MASK) + break; + + /* + * error of unknown type. + * must pass to upper level + */ + + icmpv6_notify(skb, type, hdr->icmp6_code, hdr->icmp6_mtu); + }; + kfree_skb(skb); + return 0; + +discard_it: + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS); + kfree_skb(skb); + return 0; +} + +int __init icmpv6_init(struct net_proto_family *ops) +{ + struct sock *sk; + int err, i, j; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, + &per_cpu(__icmpv6_socket, i)); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the ICMP6 control socket " + "(err %d).\n", + err); + goto fail; + } + + sk = per_cpu(__icmpv6_socket, i)->sk; + sk->sk_allocation = GFP_ATOMIC; + + /* Enough space for 2 64K ICMP packets, including + * sk_buff struct overhead. 
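csum_ipv6_magic() above folds the message sum with a pseudo-header of source address, destination address, upper-layer length and the ICMPv6 next-header value (58). The stand-alone sketch below redoes that arithmetic the slow way: it computes the checksum with the field zeroed, writes it back, and shows that recomputation over the filled-in message yields zero. It makes no attempt to match the kernel's optimized csum helpers; all names are local to the example.

#include <stdio.h>
#include <stdint.h>

static uint32_t sum16(const uint8_t *p, uint32_t len, uint32_t sum)
{
        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)                        /* odd trailing byte */
                sum += (uint32_t)p[0] << 8;
        return sum;
}

static uint16_t icmpv6_checksum(const uint8_t saddr[16], const uint8_t daddr[16],
                                const uint8_t *msg, uint32_t len)
{
        uint32_t sum = 0;

        sum = sum16(saddr, 16, sum);
        sum = sum16(daddr, 16, sum);
        sum += len;                     /* upper-layer packet length */
        sum += 58;                      /* next header = ICMPv6 */
        sum = sum16(msg, len, sum);
        while (sum >> 16)               /* fold carries */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        uint8_t src[16] = { 0xfe, 0x80, [15] = 1 }, dst[16] = { 0xfe, 0x80, [15] = 2 };
        uint8_t echo[8] = { 128, 0, 0, 0, 0, 1, 0, 1 }; /* echo request, id 1, seq 1 */
        uint16_t csum = icmpv6_checksum(src, dst, echo, sizeof(echo));

        echo[2] = csum >> 8;            /* fill in the checksum field ... */
        echo[3] = csum & 0xff;
        /* ... and re-verify: the recomputed value must now be zero */
        printf("checksum 0x%04x, verify 0x%04x\n", csum,
               icmpv6_checksum(src, dst, echo, sizeof(echo)));
        return 0;
}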
+ */ + sk->sk_sndbuf = + (2 * ((64 * 1024) + sizeof(struct sk_buff))); + + sk->sk_prot->unhash(sk); + } + + + if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) { + printk(KERN_ERR "Failed to register ICMP6 protocol\n"); + err = -EAGAIN; + goto fail; + } + + return 0; + + fail: + for (j = 0; j < i; j++) { + if (!cpu_possible(j)) + continue; + sock_release(per_cpu(__icmpv6_socket, j)); + } + + return err; +} + +void icmpv6_cleanup(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + sock_release(per_cpu(__icmpv6_socket, i)); + } + inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6); +} + +static struct icmp6_err { + int err; + int fatal; +} tab_unreach[] = { + { /* NOROUTE */ + .err = ENETUNREACH, + .fatal = 0, + }, + { /* ADM_PROHIBITED */ + .err = EACCES, + .fatal = 1, + }, + { /* Was NOT_NEIGHBOUR, now reserved */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* ADDR_UNREACH */ + .err = EHOSTUNREACH, + .fatal = 0, + }, + { /* PORT_UNREACH */ + .err = ECONNREFUSED, + .fatal = 1, + }, +}; + +int icmpv6_err_convert(int type, int code, int *err) +{ + int fatal = 0; + + *err = EPROTO; + + switch (type) { + case ICMPV6_DEST_UNREACH: + fatal = 1; + if (code <= ICMPV6_PORT_UNREACH) { + *err = tab_unreach[code].err; + fatal = tab_unreach[code].fatal; + } + break; + + case ICMPV6_PKT_TOOBIG: + *err = EMSGSIZE; + break; + + case ICMPV6_PARAMPROB: + *err = EPROTO; + fatal = 1; + break; + + case ICMPV6_TIME_EXCEED: + *err = EHOSTUNREACH; + break; + }; + + return fatal; +} + +#ifdef CONFIG_SYSCTL +ctl_table ipv6_icmp_table[] = { + { + .ctl_name = NET_IPV6_ICMP_RATELIMIT, + .procname = "ratelimit", + .data = &sysctl_icmpv6_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 }, +}; +#endif + diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c new file mode 100644 index 000000000000..405740b75abb --- /dev/null +++ b/net/ipv6/ip6_fib.c @@ -0,0 +1,1225 @@ +/* + * Linux INET6 implementation + * Forwarding Information Database + * + * Authors: + * Pedro Roque + * + * $Id: ip6_fib.c,v 1.25 2001/10/31 21:55:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * Yuji SEKIYA @USAGI: Support default route on router node; + * remove ip6_null_entry from the top of + * routing table. + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +#include +#endif + +#include +#include +#include + +#include +#include + +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RT6_TRACE(x...) 
do { ; } while (0) +#endif + +struct rt6_statistics rt6_stats; + +static kmem_cache_t * fib6_node_kmem; + +enum fib_walk_state_t +{ +#ifdef CONFIG_IPV6_SUBTREES + FWS_S, +#endif + FWS_L, + FWS_R, + FWS_C, + FWS_U +}; + +struct fib6_cleaner_t +{ + struct fib6_walker_t w; + int (*func)(struct rt6_info *, void *arg); + void *arg; +}; + +DEFINE_RWLOCK(fib6_walker_lock); + + +#ifdef CONFIG_IPV6_SUBTREES +#define FWS_INIT FWS_S +#define SUBTREE(fn) ((fn)->subtree) +#else +#define FWS_INIT FWS_L +#define SUBTREE(fn) NULL +#endif + +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt); +static struct fib6_node * fib6_repair_tree(struct fib6_node *fn); + +/* + * A routing update causes an increase of the serial number on the + * affected subtree. This allows for cached routes to be asynchronously + * tested when modifications are made to the destination cache as a + * result of redirects, path MTU changes, etc. + */ + +static __u32 rt_sernum; + +static struct timer_list ip6_fib_timer = TIMER_INITIALIZER(fib6_run_gc, 0, 0); + +struct fib6_walker_t fib6_walker_list = { + .prev = &fib6_walker_list, + .next = &fib6_walker_list, +}; + +#define FOR_WALKERS(w) for ((w)=fib6_walker_list.next; (w) != &fib6_walker_list; (w)=(w)->next) + +static __inline__ u32 fib6_new_sernum(void) +{ + u32 n = ++rt_sernum; + if ((__s32)n <= 0) + rt_sernum = n = 1; + return n; +} + +/* + * Auxiliary address test functions for the radix tree. + * + * These assume a 32bit processor (although it will work on + * 64bit processors) + */ + +/* + * test bit + */ + +static __inline__ int addr_bit_set(void *token, int fn_bit) +{ + __u32 *addr = token; + + return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5]; +} + +/* + * find the first different bit between two addresses + * length of address must be a multiple of 32bits + */ + +static __inline__ int addr_diff(void *token1, void *token2, int addrlen) +{ + __u32 *a1 = token1; + __u32 *a2 = token2; + int i; + + addrlen >>= 2; + + for (i = 0; i < addrlen; i++) { + __u32 xb; + + xb = a1[i] ^ a2[i]; + + if (xb) { + int j = 31; + + xb = ntohl(xb); + + while ((xb & (1 << j)) == 0) + j--; + + return (i * 32 + 31 - j); + } + } + + /* + * we should *never* get to this point since that + * would mean the addrs are equal + * + * However, we do get to it 8) And exacly, when + * addresses are equal 8) + * + * ip route add 1111::/128 via ... + * ip route add 1111::/64 via ... + * and we are here. + * + * Ideally, this function should stop comparison + * at prefix length. It does not, but it is still OK, + * if returned value is greater than prefix length. + * --ANK (980803) + */ + + return addrlen<<5; +} + +static __inline__ struct fib6_node * node_alloc(void) +{ + struct fib6_node *fn; + + if ((fn = kmem_cache_alloc(fib6_node_kmem, SLAB_ATOMIC)) != NULL) + memset(fn, 0, sizeof(struct fib6_node)); + + return fn; +} + +static __inline__ void node_free(struct fib6_node * fn) +{ + kmem_cache_free(fib6_node_kmem, fn); +} + +static __inline__ void rt6_release(struct rt6_info *rt) +{ + if (atomic_dec_and_test(&rt->rt6i_ref)) + dst_free(&rt->u.dst); +} + + +/* + * Routing Table + * + * return the appropriate node for a routing tree "add" operation + * by either creating and inserting or by returning an existing + * node. 
+ */ + +static struct fib6_node * fib6_add_1(struct fib6_node *root, void *addr, + int addrlen, int plen, + int offset) +{ + struct fib6_node *fn, *in, *ln; + struct fib6_node *pn = NULL; + struct rt6key *key; + int bit; + int dir = 0; + __u32 sernum = fib6_new_sernum(); + + RT6_TRACE("fib6_add_1\n"); + + /* insert node in tree */ + + fn = root; + + do { + key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + goto insert_above; + + /* + * Exact match ? + */ + + if (plen == fn->fn_bit) { + /* clean up an intermediate node */ + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_release(fn->leaf); + fn->leaf = NULL; + } + + fn->fn_sernum = sernum; + + return fn; + } + + /* + * We have more bits to go + */ + + /* Try to walk down on tree. */ + fn->fn_sernum = sernum; + dir = addr_bit_set(addr, fn->fn_bit); + pn = fn; + fn = dir ? fn->right: fn->left; + } while (fn); + + /* + * We walked to the bottom of tree. + * Create new leaf node without children. + */ + + ln = node_alloc(); + + if (ln == NULL) + return NULL; + ln->fn_bit = plen; + + ln->parent = pn; + ln->fn_sernum = sernum; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + return ln; + + +insert_above: + /* + * split since we don't have a common prefix anymore or + * we have a less significant route. + * we've to insert an intermediate node on the list + * this new node will point to the one we need to create + * and the current + */ + + pn = fn->parent; + + /* find 1st bit in difference between the 2 addrs. + + See comment in addr_diff: bit may be an invalid value, + but if it is >= plen, the value is ignored in any case. + */ + + bit = addr_diff(addr, &key->addr, addrlen); + + /* + * (intermediate)[in] + * / \ + * (new leaf node)[ln] (old node)[fn] + */ + if (plen > bit) { + in = node_alloc(); + ln = node_alloc(); + + if (in == NULL || ln == NULL) { + if (in) + node_free(in); + if (ln) + node_free(ln); + return NULL; + } + + /* + * new intermediate node. + * RTN_RTINFO will + * be off since that an address that chooses one of + * the branches would not match less specific routes + * in the other branch + */ + + in->fn_bit = bit; + + in->parent = pn; + in->leaf = fn->leaf; + atomic_inc(&in->leaf->rt6i_ref); + + in->fn_sernum = sernum; + + /* update parent pointer */ + if (dir) + pn->right = in; + else + pn->left = in; + + ln->fn_bit = plen; + + ln->parent = in; + fn->parent = in; + + ln->fn_sernum = sernum; + + if (addr_bit_set(addr, bit)) { + in->right = ln; + in->left = fn; + } else { + in->left = ln; + in->right = fn; + } + } else { /* plen <= bit */ + + /* + * (new leaf node)[ln] + * / \ + * (old node)[fn] NULL + */ + + ln = node_alloc(); + + if (ln == NULL) + return NULL; + + ln->fn_bit = plen; + + ln->parent = pn; + + ln->fn_sernum = sernum; + + if (dir) + pn->right = ln; + else + pn->left = ln; + + if (addr_bit_set(&key->addr, plen)) + ln->right = fn; + else + ln->left = fn; + + fn->parent = ln; + } + return ln; +} + +/* + * Insert routing information in a node. 
+ */ + +static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, + struct nlmsghdr *nlh) +{ + struct rt6_info *iter = NULL; + struct rt6_info **ins; + + ins = &fn->leaf; + + if (fn->fn_flags&RTN_TL_ROOT && + fn->leaf == &ip6_null_entry && + !(rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) ){ + fn->leaf = rt; + rt->u.next = NULL; + goto out; + } + + for (iter = fn->leaf; iter; iter=iter->u.next) { + /* + * Search for duplicates + */ + + if (iter->rt6i_metric == rt->rt6i_metric) { + /* + * Same priority level + */ + + if (iter->rt6i_dev == rt->rt6i_dev && + iter->rt6i_idev == rt->rt6i_idev && + ipv6_addr_equal(&iter->rt6i_gateway, + &rt->rt6i_gateway)) { + if (!(iter->rt6i_flags&RTF_EXPIRES)) + return -EEXIST; + iter->rt6i_expires = rt->rt6i_expires; + if (!(rt->rt6i_flags&RTF_EXPIRES)) { + iter->rt6i_flags &= ~RTF_EXPIRES; + iter->rt6i_expires = 0; + } + return -EEXIST; + } + } + + if (iter->rt6i_metric > rt->rt6i_metric) + break; + + ins = &iter->u.next; + } + + /* + * insert node + */ + +out: + rt->u.next = iter; + *ins = rt; + rt->rt6i_node = fn; + atomic_inc(&rt->rt6i_ref); + inet6_rt_notify(RTM_NEWROUTE, rt, nlh); + rt6_stats.fib_rt_entries++; + + if ((fn->fn_flags & RTN_RTINFO) == 0) { + rt6_stats.fib_route_nodes++; + fn->fn_flags |= RTN_RTINFO; + } + + return 0; +} + +static __inline__ void fib6_start_gc(struct rt6_info *rt) +{ + if (ip6_fib_timer.expires == 0 && + (rt->rt6i_flags & (RTF_EXPIRES|RTF_CACHE))) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); +} + +void fib6_force_start_gc(void) +{ + if (ip6_fib_timer.expires == 0) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); +} + +/* + * Add routing information to the routing tree. + * / + * with source addr info in sub-trees + */ + +int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_node *fn; + int err = -ENOMEM; + + fn = fib6_add_1(root, &rt->rt6i_dst.addr, sizeof(struct in6_addr), + rt->rt6i_dst.plen, offsetof(struct rt6_info, rt6i_dst)); + + if (fn == NULL) + goto out; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen) { + struct fib6_node *sn; + + if (fn->subtree == NULL) { + struct fib6_node *sfn; + + /* + * Create subtree. + * + * fn[main tree] + * | + * sfn[subtree root] + * \ + * sn[new leaf node] + */ + + /* Create subtree root node */ + sfn = node_alloc(); + if (sfn == NULL) + goto st_failure; + + sfn->leaf = &ip6_null_entry; + atomic_inc(&ip6_null_entry.rt6i_ref); + sfn->fn_flags = RTN_ROOT; + sfn->fn_sernum = fib6_new_sernum(); + + /* Now add the first leaf node to new subtree */ + + sn = fib6_add_1(sfn, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src)); + + if (sn == NULL) { + /* If it is failed, discard just allocated + root, and then (in st_failure) stale node + in main tree. 
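fib6_add_rt2node() above inserts a route into the per-node list ordered by metric, walking a pointer-to-pointer ("ins") so that inserting before the head needs no special case. The same idiom in isolation, on a minimal metric-ordered list (types and names are local to the example):

#include <stdio.h>

struct route {
        int metric;
        struct route *next;
};

static void insert_by_metric(struct route **head, struct route *rt)
{
        struct route **ins = head;

        while (*ins && (*ins)->metric <= rt->metric)
                ins = &(*ins)->next;    /* walk the "next" fields themselves */
        rt->next = *ins;
        *ins = rt;                      /* works for head and middle alike */
}

int main(void)
{
        struct route a = { 256, NULL }, b = { 1, NULL }, c = { 100, NULL };
        struct route *head = NULL;

        insert_by_metric(&head, &a);
        insert_by_metric(&head, &b);
        insert_by_metric(&head, &c);
        for (struct route *r = head; r; r = r->next)
                printf("%d ", r->metric);       /* 1 100 256 */
        printf("\n");
        return 0;
}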
+ */ + node_free(sfn); + goto st_failure; + } + + /* Now link new subtree to main tree */ + sfn->parent = fn; + fn->subtree = sfn; + if (fn->leaf == NULL) { + fn->leaf = rt; + atomic_inc(&rt->rt6i_ref); + } + } else { + sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr, + sizeof(struct in6_addr), rt->rt6i_src.plen, + offsetof(struct rt6_info, rt6i_src)); + + if (sn == NULL) + goto st_failure; + } + + fn = sn; + } +#endif + + err = fib6_add_rt2node(fn, rt, nlh); + + if (err == 0) { + fib6_start_gc(rt); + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + } + +out: + if (err) + dst_free(&rt->u.dst); + return err; + +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree creation failed, probably main tree node + is orphan. If it is, shoot it. + */ +st_failure: + if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) + fib6_repair_tree(fn); + dst_free(&rt->u.dst); + return err; +#endif +} + +/* + * Routing tree lookup + * + */ + +struct lookup_args { + int offset; /* key offset on rt6_info */ + struct in6_addr *addr; /* search key */ +}; + +static struct fib6_node * fib6_lookup_1(struct fib6_node *root, + struct lookup_args *args) +{ + struct fib6_node *fn; + int dir; + + /* + * Descend on a tree + */ + + fn = root; + + for (;;) { + struct fib6_node *next; + + dir = addr_bit_set(args->addr, fn->fn_bit); + + next = dir ? fn->right : fn->left; + + if (next) { + fn = next; + continue; + } + + break; + } + + while ((fn->fn_flags & RTN_ROOT) == 0) { +#ifdef CONFIG_IPV6_SUBTREES + if (fn->subtree) { + struct fib6_node *st; + struct lookup_args *narg; + + narg = args + 1; + + if (narg->addr) { + st = fib6_lookup_1(fn->subtree, narg); + + if (st && !(st->fn_flags & RTN_ROOT)) + return st; + } + } +#endif + + if (fn->fn_flags & RTN_RTINFO) { + struct rt6key *key; + + key = (struct rt6key *) ((u8 *) fn->leaf + + args->offset); + + if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) + return fn; + } + + fn = fn->parent; + } + + return NULL; +} + +struct fib6_node * fib6_lookup(struct fib6_node *root, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + struct lookup_args args[2]; + struct fib6_node *fn; + + args[0].offset = offsetof(struct rt6_info, rt6i_dst); + args[0].addr = daddr; + +#ifdef CONFIG_IPV6_SUBTREES + args[1].offset = offsetof(struct rt6_info, rt6i_src); + args[1].addr = saddr; +#endif + + fn = fib6_lookup_1(root, args); + + if (fn == NULL || fn->fn_flags & RTN_TL_ROOT) + fn = root; + + return fn; +} + +/* + * Get node with specified destination prefix (and source prefix, + * if subtrees are used) + */ + + +static struct fib6_node * fib6_locate_1(struct fib6_node *root, + struct in6_addr *addr, + int plen, int offset) +{ + struct fib6_node *fn; + + for (fn = root; fn ; ) { + struct rt6key *key = (struct rt6key *)((u8 *)fn->leaf + offset); + + /* + * Prefix match + */ + if (plen < fn->fn_bit || + !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) + return NULL; + + if (plen == fn->fn_bit) + return fn; + + /* + * We have more bits to go + */ + if (addr_bit_set(addr, fn->fn_bit)) + fn = fn->right; + else + fn = fn->left; + } + return NULL; +} + +struct fib6_node * fib6_locate(struct fib6_node *root, + struct in6_addr *daddr, int dst_len, + struct in6_addr *saddr, int src_len) +{ + struct fib6_node *fn; + + fn = fib6_locate_1(root, daddr, dst_len, + offsetof(struct rt6_info, rt6i_dst)); + +#ifdef CONFIG_IPV6_SUBTREES + if (src_len) { + BUG_TRAP(saddr!=NULL); + if (fn == NULL) + fn = fn->subtree; + if (fn) + fn = fib6_locate_1(fn, saddr, src_len, + offsetof(struct rt6_info, rt6i_src)); + } 
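The lookups above descend the tree one address bit at a time (addr_bit_set(), defined earlier in this file) and split intermediate nodes at the first differing bit (addr_diff()). A byte-wise user-space equivalent of those two helpers, with hypothetical names:

#include <stdio.h>
#include <stdint.h>

/* Is bit "bit" (0 = most significant bit of byte 0) set in a 128-bit key? */
static int addr_bit(const uint8_t addr[16], int bit)
{
        return (addr[bit >> 3] >> (7 - (bit & 7))) & 1;
}

/* Index of the first bit where two 128-bit keys differ (128 if equal). */
static int first_diff_bit(const uint8_t a[16], const uint8_t b[16])
{
        for (int bit = 0; bit < 128; bit++)
                if (addr_bit(a, bit) != addr_bit(b, bit))
                        return bit;
        return 128;
}

int main(void)
{
        uint8_t p1[16] = { 0x20, 0x01, 0x0d, 0xb8 };            /* 2001:db8::/32 */
        uint8_t p2[16] = { 0x20, 0x01, 0x0d, 0xb8, 0x80 };      /* 2001:db8:8000::/33 */

        printf("first differing bit: %d\n", first_diff_bit(p1, p2));    /* 32 */
        printf("bit 32 of p2: %d\n", addr_bit(p2, 32));                 /* 1 */
        return 0;
}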
+#endif + + if (fn && fn->fn_flags&RTN_RTINFO) + return fn; + + return NULL; +} + + +/* + * Deletion + * + */ + +static struct rt6_info * fib6_find_prefix(struct fib6_node *fn) +{ + if (fn->fn_flags&RTN_ROOT) + return &ip6_null_entry; + + while(fn) { + if(fn->left) + return fn->left->leaf; + + if(fn->right) + return fn->right->leaf; + + fn = SUBTREE(fn); + } + return NULL; +} + +/* + * Called to trim the tree of intermediate nodes when possible. "fn" + * is the node we want to try and remove. + */ + +static struct fib6_node * fib6_repair_tree(struct fib6_node *fn) +{ + int children; + int nstate; + struct fib6_node *child, *pn; + struct fib6_walker_t *w; + int iter = 0; + + for (;;) { + RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter); + iter++; + + BUG_TRAP(!(fn->fn_flags&RTN_RTINFO)); + BUG_TRAP(!(fn->fn_flags&RTN_TL_ROOT)); + BUG_TRAP(fn->leaf==NULL); + + children = 0; + child = NULL; + if (fn->right) child = fn->right, children |= 1; + if (fn->left) child = fn->left, children |= 2; + + if (children == 3 || SUBTREE(fn) +#ifdef CONFIG_IPV6_SUBTREES + /* Subtree root (i.e. fn) may have one child */ + || (children && fn->fn_flags&RTN_ROOT) +#endif + ) { + fn->leaf = fib6_find_prefix(fn); +#if RT6_DEBUG >= 2 + if (fn->leaf==NULL) { + BUG_TRAP(fn->leaf); + fn->leaf = &ip6_null_entry; + } +#endif + atomic_inc(&fn->leaf->rt6i_ref); + return fn->parent; + } + + pn = fn->parent; +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + SUBTREE(pn) = NULL; + nstate = FWS_L; + } else { + BUG_TRAP(!(fn->fn_flags&RTN_ROOT)); +#endif + if (pn->right == fn) pn->right = child; + else if (pn->left == fn) pn->left = child; +#if RT6_DEBUG >= 2 + else BUG_TRAP(0); +#endif + if (child) + child->parent = pn; + nstate = FWS_R; +#ifdef CONFIG_IPV6_SUBTREES + } +#endif + + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (child == NULL) { + if (w->root == fn) { + w->root = w->node = NULL; + RT6_TRACE("W %p adjusted by delroot 1\n", w); + } else if (w->node == fn) { + RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate); + w->node = pn; + w->state = nstate; + } + } else { + if (w->root == fn) { + w->root = child; + RT6_TRACE("W %p adjusted by delroot 2\n", w); + } + if (w->node == fn) { + w->node = child; + if (children&2) { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_R ? FWS_U : FWS_INIT; + } else { + RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state); + w->state = w->state>=FWS_C ? 
FWS_U : FWS_INIT; + } + } + } + } + read_unlock(&fib6_walker_lock); + + node_free(fn); + if (pn->fn_flags&RTN_RTINFO || SUBTREE(pn)) + return pn; + + rt6_release(pn->leaf); + pn->leaf = NULL; + fn = pn; + } +} + +static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, + struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_walker_t *w; + struct rt6_info *rt = *rtp; + + RT6_TRACE("fib6_del_route\n"); + + /* Unlink it */ + *rtp = rt->u.next; + rt->rt6i_node = NULL; + rt6_stats.fib_rt_entries--; + rt6_stats.fib_discarded_routes++; + + /* Adjust walkers */ + read_lock(&fib6_walker_lock); + FOR_WALKERS(w) { + if (w->state == FWS_C && w->leaf == rt) { + RT6_TRACE("walker %p adjusted by delroute\n", w); + w->leaf = rt->u.next; + if (w->leaf == NULL) + w->state = FWS_U; + } + } + read_unlock(&fib6_walker_lock); + + rt->u.next = NULL; + + if (fn->leaf == NULL && fn->fn_flags&RTN_TL_ROOT) + fn->leaf = &ip6_null_entry; + + /* If it was last route, expunge its radix tree node */ + if (fn->leaf == NULL) { + fn->fn_flags &= ~RTN_RTINFO; + rt6_stats.fib_route_nodes--; + fn = fib6_repair_tree(fn); + } + + if (atomic_read(&rt->rt6i_ref) != 1) { + /* This route is used as dummy address holder in some split + * nodes. It is not leaked, but it still holds other resources, + * which must be released in time. So, scan ascendant nodes + * and replace dummy references to this route with references + * to still alive ones. + */ + while (fn) { + if (!(fn->fn_flags&RTN_RTINFO) && fn->leaf == rt) { + fn->leaf = fib6_find_prefix(fn); + atomic_inc(&fn->leaf->rt6i_ref); + rt6_release(rt); + } + fn = fn->parent; + } + /* No more references are possible at this point. */ + if (atomic_read(&rt->rt6i_ref) != 1) BUG(); + } + + inet6_rt_notify(RTM_DELROUTE, rt, nlh); + rt6_release(rt); +} + +int fib6_del(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_node *fn = rt->rt6i_node; + struct rt6_info **rtp; + +#if RT6_DEBUG >= 2 + if (rt->u.dst.obsolete>0) { + BUG_TRAP(fn==NULL); + return -ENOENT; + } +#endif + if (fn == NULL || rt == &ip6_null_entry) + return -ENOENT; + + BUG_TRAP(fn->fn_flags&RTN_RTINFO); + + if (!(rt->rt6i_flags&RTF_CACHE)) + fib6_prune_clones(fn, rt); + + /* + * Walk the leaf entries looking for ourself + */ + + for (rtp = &fn->leaf; *rtp; rtp = &(*rtp)->u.next) { + if (*rtp == rt) { + fib6_del_route(fn, rtp, nlh, _rtattr); + return 0; + } + } + return -ENOENT; +} + +/* + * Tree traversal function. + * + * Certainly, it is not interrupt safe. + * However, it is internally reenterable wrt itself and fib6_add/fib6_del. + * It means, that we can modify tree during walking + * and use this function for garbage collection, clone pruning, + * cleaning tree when a device goes down etc. etc. + * + * It guarantees that every node will be traversed, + * and that it will be traversed only once. + * + * Callback function w->func may return: + * 0 -> continue walking. + * positive value -> walking is suspended (used by tree dumps, + * and probably by gc, if it will be split to several slices) + * negative value -> terminate walking. + * + * The function itself returns: + * 0 -> walk is complete. + * >0 -> walk is incomplete (i.e. suspended) + * <0 -> walk is terminated by an error. 
+ */ + +int fib6_walk_continue(struct fib6_walker_t *w) +{ + struct fib6_node *fn, *pn; + + for (;;) { + fn = w->node; + if (fn == NULL) + return 0; + + if (w->prune && fn != w->root && + fn->fn_flags&RTN_RTINFO && w->state < FWS_C) { + w->state = FWS_C; + w->leaf = fn->leaf; + } + switch (w->state) { +#ifdef CONFIG_IPV6_SUBTREES + case FWS_S: + if (SUBTREE(fn)) { + w->node = SUBTREE(fn); + continue; + } + w->state = FWS_L; +#endif + case FWS_L: + if (fn->left) { + w->node = fn->left; + w->state = FWS_INIT; + continue; + } + w->state = FWS_R; + case FWS_R: + if (fn->right) { + w->node = fn->right; + w->state = FWS_INIT; + continue; + } + w->state = FWS_C; + w->leaf = fn->leaf; + case FWS_C: + if (w->leaf && fn->fn_flags&RTN_RTINFO) { + int err = w->func(w); + if (err) + return err; + continue; + } + w->state = FWS_U; + case FWS_U: + if (fn == w->root) + return 0; + pn = fn->parent; + w->node = pn; +#ifdef CONFIG_IPV6_SUBTREES + if (SUBTREE(pn) == fn) { + BUG_TRAP(fn->fn_flags&RTN_ROOT); + w->state = FWS_L; + continue; + } +#endif + if (pn->left == fn) { + w->state = FWS_R; + continue; + } + if (pn->right == fn) { + w->state = FWS_C; + w->leaf = w->node->leaf; + continue; + } +#if RT6_DEBUG >= 2 + BUG_TRAP(0); +#endif + } + } +} + +int fib6_walk(struct fib6_walker_t *w) +{ + int res; + + w->state = FWS_INIT; + w->node = w->root; + + fib6_walker_link(w); + res = fib6_walk_continue(w); + if (res <= 0) + fib6_walker_unlink(w); + return res; +} + +static int fib6_clean_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + struct fib6_cleaner_t *c = (struct fib6_cleaner_t*)w; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = c->func(rt, c->arg); + if (res < 0) { + w->leaf = rt; + res = fib6_del(rt, NULL, NULL); + if (res) { +#if RT6_DEBUG >= 2 + printk(KERN_DEBUG "fib6_clean_node: del failed: rt=%p@%p err=%d\n", rt, rt->rt6i_node, res); +#endif + continue; + } + return 0; + } + BUG_TRAP(res==0); + } + w->leaf = rt; + return 0; +} + +/* + * Convenient frontend to tree walker. + * + * func is called on each route. + * It may return -1 -> delete this route. + * 0 -> continue walking + * + * prune==1 -> only immediate children of node (certainly, + * ignoring pure split nodes) will be scanned. + */ + +void fib6_clean_tree(struct fib6_node *root, + int (*func)(struct rt6_info *, void *arg), + int prune, void *arg) +{ + struct fib6_cleaner_t c; + + c.w.root = root; + c.w.func = fib6_clean_node; + c.w.prune = prune; + c.func = func; + c.arg = arg; + + fib6_walk(&c.w); +} + +static int fib6_prune_clone(struct rt6_info *rt, void *arg) +{ + if (rt->rt6i_flags & RTF_CACHE) { + RT6_TRACE("pruning clone %p\n", rt); + return -1; + } + + return 0; +} + +static void fib6_prune_clones(struct fib6_node *fn, struct rt6_info *rt) +{ + fib6_clean_tree(fn, fib6_prune_clone, 1, rt); +} + +/* + * Garbage collection + */ + +static struct fib6_gc_args +{ + int timeout; + int more; +} gc_args; + +static int fib6_age(struct rt6_info *rt, void *arg) +{ + unsigned long now = jiffies; + + /* + * check addrconf expiration here. + * Routes are expired even if they are in use. + * + * Also age clones. Note, that clones are aged out + * only if they are not in use now. 
+ */ + + if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) { + if (time_after(now, rt->rt6i_expires)) { + RT6_TRACE("expiring %p\n", rt); + rt6_reset_dflt_pointer(rt); + return -1; + } + gc_args.more++; + } else if (rt->rt6i_flags & RTF_CACHE) { + if (atomic_read(&rt->u.dst.__refcnt) == 0 && + time_after_eq(now, rt->u.dst.lastuse + gc_args.timeout)) { + RT6_TRACE("aging clone %p\n", rt); + return -1; + } else if ((rt->rt6i_flags & RTF_GATEWAY) && + (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) { + RT6_TRACE("purging route %p via non-router but gateway\n", + rt); + return -1; + } + gc_args.more++; + } + + return 0; +} + +static DEFINE_SPINLOCK(fib6_gc_lock); + +void fib6_run_gc(unsigned long dummy) +{ + if (dummy != ~0UL) { + spin_lock_bh(&fib6_gc_lock); + gc_args.timeout = dummy ? (int)dummy : ip6_rt_gc_interval; + } else { + local_bh_disable(); + if (!spin_trylock(&fib6_gc_lock)) { + mod_timer(&ip6_fib_timer, jiffies + HZ); + local_bh_enable(); + return; + } + gc_args.timeout = ip6_rt_gc_interval; + } + gc_args.more = 0; + + + write_lock_bh(&rt6_lock); + ndisc_dst_gc(&gc_args.more); + fib6_clean_tree(&ip6_routing_table, fib6_age, 0, NULL); + write_unlock_bh(&rt6_lock); + + if (gc_args.more) + mod_timer(&ip6_fib_timer, jiffies + ip6_rt_gc_interval); + else { + del_timer(&ip6_fib_timer); + ip6_fib_timer.expires = 0; + } + spin_unlock_bh(&fib6_gc_lock); +} + +void __init fib6_init(void) +{ + fib6_node_kmem = kmem_cache_create("fib6_nodes", + sizeof(struct fib6_node), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!fib6_node_kmem) + panic("cannot create fib6_nodes cache"); +} + +void fib6_gc_cleanup(void) +{ + del_timer(&ip6_fib_timer); + kmem_cache_destroy(fib6_node_kmem); +} diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c new file mode 100644 index 000000000000..a93f6dc51979 --- /dev/null +++ b/net/ipv6/ip6_flowlabel.c @@ -0,0 +1,706 @@ +/* + * ip6_flowlabel.c IPv6 flowlabel manager. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define FL_MIN_LINGER 6 /* Minimal linger. It is set to 6sec specified + in old IPv6 RFC. Well, it was reasonable value. 
+ */ +#define FL_MAX_LINGER 60 /* Maximal linger timeout */ + +/* FL hash table */ + +#define FL_MAX_PER_SOCK 32 +#define FL_MAX_SIZE 4096 +#define FL_HASH_MASK 255 +#define FL_HASH(l) (ntohl(l)&FL_HASH_MASK) + +static atomic_t fl_size = ATOMIC_INIT(0); +static struct ip6_flowlabel *fl_ht[FL_HASH_MASK+1]; + +static void ip6_fl_gc(unsigned long dummy); +static struct timer_list ip6_fl_gc_timer = TIMER_INITIALIZER(ip6_fl_gc, 0, 0); + +/* FL hash table lock: it protects only of GC */ + +static DEFINE_RWLOCK(ip6_fl_lock); + +/* Big socket sock */ + +static DEFINE_RWLOCK(ip6_sk_fl_lock); + + +static __inline__ struct ip6_flowlabel * __fl_lookup(u32 label) +{ + struct ip6_flowlabel *fl; + + for (fl=fl_ht[FL_HASH(label)]; fl; fl = fl->next) { + if (fl->label == label) + return fl; + } + return NULL; +} + +static struct ip6_flowlabel * fl_lookup(u32 label) +{ + struct ip6_flowlabel *fl; + + read_lock_bh(&ip6_fl_lock); + fl = __fl_lookup(label); + if (fl) + atomic_inc(&fl->users); + read_unlock_bh(&ip6_fl_lock); + return fl; +} + + +static void fl_free(struct ip6_flowlabel *fl) +{ + if (fl) + kfree(fl->opt); + kfree(fl); +} + +static void fl_release(struct ip6_flowlabel *fl) +{ + write_lock_bh(&ip6_fl_lock); + + fl->lastuse = jiffies; + if (atomic_dec_and_test(&fl->users)) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (fl->opt && fl->share == IPV6_FL_S_EXCL) { + struct ipv6_txoptions *opt = fl->opt; + fl->opt = NULL; + kfree(opt); + } + if (!timer_pending(&ip6_fl_gc_timer) || + time_after(ip6_fl_gc_timer.expires, ttd)) + mod_timer(&ip6_fl_gc_timer, ttd); + } + + write_unlock_bh(&ip6_fl_lock); +} + +static void ip6_fl_gc(unsigned long dummy) +{ + int i; + unsigned long now = jiffies; + unsigned long sched = 0; + + write_lock(&ip6_fl_lock); + + for (i=0; i<=FL_HASH_MASK; i++) { + struct ip6_flowlabel *fl, **flp; + flp = &fl_ht[i]; + while ((fl=*flp) != NULL) { + if (atomic_read(&fl->users) == 0) { + unsigned long ttd = fl->lastuse + fl->linger; + if (time_after(ttd, fl->expires)) + fl->expires = ttd; + ttd = fl->expires; + if (time_after_eq(now, ttd)) { + *flp = fl->next; + fl_free(fl); + atomic_dec(&fl_size); + continue; + } + if (!sched || time_before(ttd, sched)) + sched = ttd; + } + flp = &fl->next; + } + } + if (!sched && atomic_read(&fl_size)) + sched = now + FL_MAX_LINGER; + if (sched) { + ip6_fl_gc_timer.expires = sched; + add_timer(&ip6_fl_gc_timer); + } + write_unlock(&ip6_fl_lock); +} + +static int fl_intern(struct ip6_flowlabel *fl, __u32 label) +{ + fl->label = label & IPV6_FLOWLABEL_MASK; + + write_lock_bh(&ip6_fl_lock); + if (label == 0) { + for (;;) { + fl->label = htonl(net_random())&IPV6_FLOWLABEL_MASK; + if (fl->label) { + struct ip6_flowlabel *lfl; + lfl = __fl_lookup(fl->label); + if (lfl == NULL) + break; + } + } + } + + fl->lastuse = jiffies; + fl->next = fl_ht[FL_HASH(fl->label)]; + fl_ht[FL_HASH(fl->label)] = fl; + atomic_inc(&fl_size); + write_unlock_bh(&ip6_fl_lock); + return 0; +} + + + +/* Socket flowlabel lists */ + +struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label) +{ + struct ipv6_fl_socklist *sfl; + struct ipv6_pinfo *np = inet6_sk(sk); + + label &= IPV6_FLOWLABEL_MASK; + + for (sfl=np->ipv6_fl_list; sfl; sfl = sfl->next) { + struct ip6_flowlabel *fl = sfl->fl; + if (fl->label == label) { + fl->lastuse = jiffies; + atomic_inc(&fl->users); + return fl; + } + } + return NULL; +} + +void fl6_free_socklist(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + 
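Flow labels are kept in the 256-bucket chained hash above, keyed by FL_HASH(l) = ntohl(l) & FL_HASH_MASK. A minimal user-space sketch of that layout; the struct and helpers are local to this example, not kernel API.

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>          /* ntohl()/htonl() */

#define FL_HASH_MASK 255
#define FL_HASH(label) (ntohl(label) & FL_HASH_MASK)

struct flowlabel {
        uint32_t label;         /* stored in network byte order */
        struct flowlabel *next;
};

static struct flowlabel *fl_ht[FL_HASH_MASK + 1];

static void fl_insert(struct flowlabel *fl)
{
        unsigned int h = FL_HASH(fl->label);

        fl->next = fl_ht[h];    /* push at the head of the chain */
        fl_ht[h] = fl;
}

static struct flowlabel *fl_find(uint32_t label)
{
        for (struct flowlabel *fl = fl_ht[FL_HASH(label)]; fl; fl = fl->next)
                if (fl->label == label)
                        return fl;
        return NULL;
}

int main(void)
{
        struct flowlabel a = { htonl(0x12345), NULL };

        fl_insert(&a);
        printf("bucket %u, found=%d\n", (unsigned)FL_HASH(a.label),
               fl_find(htonl(0x12345)) == &a);
        return 0;       /* bucket 0x45 = 69, found=1 */
}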
struct ipv6_fl_socklist *sfl; + + while ((sfl = np->ipv6_fl_list) != NULL) { + np->ipv6_fl_list = sfl->next; + fl_release(sfl->fl); + kfree(sfl); + } +} + +/* Service routines */ + + +/* + It is the only difficult place. flowlabel enforces equal headers + before and including routing header, however user may supply options + following rthdr. + */ + +struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions * opt_space, + struct ip6_flowlabel * fl, + struct ipv6_txoptions * fopt) +{ + struct ipv6_txoptions * fl_opt = fl->opt; + + if (fopt == NULL || fopt->opt_flen == 0) + return fl_opt; + + if (fl_opt != NULL) { + opt_space->hopopt = fl_opt->hopopt; + opt_space->dst0opt = fl_opt->dst0opt; + opt_space->srcrt = fl_opt->srcrt; + opt_space->opt_nflen = fl_opt->opt_nflen; + } else { + if (fopt->opt_nflen == 0) + return fopt; + opt_space->hopopt = NULL; + opt_space->dst0opt = NULL; + opt_space->srcrt = NULL; + opt_space->opt_nflen = 0; + } + opt_space->dst1opt = fopt->dst1opt; + opt_space->auth = fopt->auth; + opt_space->opt_flen = fopt->opt_flen; + return opt_space; +} + +static unsigned long check_linger(unsigned long ttl) +{ + if (ttl < FL_MIN_LINGER) + return FL_MIN_LINGER*HZ; + if (ttl > FL_MAX_LINGER && !capable(CAP_NET_ADMIN)) + return 0; + return ttl*HZ; +} + +static int fl6_renew(struct ip6_flowlabel *fl, unsigned long linger, unsigned long expires) +{ + linger = check_linger(linger); + if (!linger) + return -EPERM; + expires = check_linger(expires); + if (!expires) + return -EPERM; + fl->lastuse = jiffies; + if (time_before(fl->linger, linger)) + fl->linger = linger; + if (time_before(expires, fl->linger)) + expires = fl->linger; + if (time_before(fl->expires, fl->lastuse + expires)) + fl->expires = fl->lastuse + expires; + return 0; +} + +static struct ip6_flowlabel * +fl_create(struct in6_flowlabel_req *freq, char __user *optval, int optlen, int *err_p) +{ + struct ip6_flowlabel *fl; + int olen; + int addr_type; + int err; + + err = -ENOMEM; + fl = kmalloc(sizeof(*fl), GFP_KERNEL); + if (fl == NULL) + goto done; + memset(fl, 0, sizeof(*fl)); + + olen = optlen - CMSG_ALIGN(sizeof(*freq)); + if (olen > 0) { + struct msghdr msg; + struct flowi flowi; + int junk; + + err = -ENOMEM; + fl->opt = kmalloc(sizeof(*fl->opt) + olen, GFP_KERNEL); + if (fl->opt == NULL) + goto done; + + memset(fl->opt, 0, sizeof(*fl->opt)); + fl->opt->tot_len = sizeof(*fl->opt) + olen; + err = -EFAULT; + if (copy_from_user(fl->opt+1, optval+CMSG_ALIGN(sizeof(*freq)), olen)) + goto done; + + msg.msg_controllen = olen; + msg.msg_control = (void*)(fl->opt+1); + flowi.oif = 0; + + err = datagram_send_ctl(&msg, &flowi, fl->opt, &junk); + if (err) + goto done; + err = -EINVAL; + if (fl->opt->opt_flen) + goto done; + if (fl->opt->opt_nflen == 0) { + kfree(fl->opt); + fl->opt = NULL; + } + } + + fl->expires = jiffies; + err = fl6_renew(fl, freq->flr_linger, freq->flr_expires); + if (err) + goto done; + fl->share = freq->flr_share; + addr_type = ipv6_addr_type(&freq->flr_dst); + if ((addr_type&IPV6_ADDR_MAPPED) + || addr_type == IPV6_ADDR_ANY) + goto done; + ipv6_addr_copy(&fl->dst, &freq->flr_dst); + atomic_set(&fl->users, 1); + switch (fl->share) { + case IPV6_FL_S_EXCL: + case IPV6_FL_S_ANY: + break; + case IPV6_FL_S_PROCESS: + fl->owner = current->pid; + break; + case IPV6_FL_S_USER: + fl->owner = current->euid; + break; + default: + err = -EINVAL; + goto done; + } + return fl; + +done: + fl_free(fl); + *err_p = err; + return NULL; +} + +static int mem_check(struct sock *sk) +{ + struct ipv6_pinfo *np = 
inet6_sk(sk); + struct ipv6_fl_socklist *sfl; + int room = FL_MAX_SIZE - atomic_read(&fl_size); + int count = 0; + + if (room > FL_MAX_SIZE - FL_MAX_PER_SOCK) + return 0; + + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) + count++; + + if (room <= 0 || + ((count >= FL_MAX_PER_SOCK || + (count > 0 && room < FL_MAX_SIZE/2) || room < FL_MAX_SIZE/4) + && !capable(CAP_NET_ADMIN))) + return -ENOBUFS; + + return 0; +} + +static int ipv6_hdr_cmp(struct ipv6_opt_hdr *h1, struct ipv6_opt_hdr *h2) +{ + if (h1 == h2) + return 0; + if (h1 == NULL || h2 == NULL) + return 1; + if (h1->hdrlen != h2->hdrlen) + return 1; + return memcmp(h1+1, h2+1, ((h1->hdrlen+1)<<3) - sizeof(*h1)); +} + +static int ipv6_opt_cmp(struct ipv6_txoptions *o1, struct ipv6_txoptions *o2) +{ + if (o1 == o2) + return 0; + if (o1 == NULL || o2 == NULL) + return 1; + if (o1->opt_nflen != o2->opt_nflen) + return 1; + if (ipv6_hdr_cmp(o1->hopopt, o2->hopopt)) + return 1; + if (ipv6_hdr_cmp(o1->dst0opt, o2->dst0opt)) + return 1; + if (ipv6_hdr_cmp((struct ipv6_opt_hdr *)o1->srcrt, (struct ipv6_opt_hdr *)o2->srcrt)) + return 1; + return 0; +} + +int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen) +{ + int err; + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_flowlabel_req freq; + struct ipv6_fl_socklist *sfl1=NULL; + struct ipv6_fl_socklist *sfl, **sflp; + struct ip6_flowlabel *fl; + + if (optlen < sizeof(freq)) + return -EINVAL; + + if (copy_from_user(&freq, optval, sizeof(freq))) + return -EFAULT; + + switch (freq.flr_action) { + case IPV6_FL_A_PUT: + write_lock_bh(&ip6_sk_fl_lock); + for (sflp = &np->ipv6_fl_list; (sfl=*sflp)!=NULL; sflp = &sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK)) + np->flow_label &= ~IPV6_FLOWLABEL_MASK; + *sflp = sfl->next; + write_unlock_bh(&ip6_sk_fl_lock); + fl_release(sfl->fl); + kfree(sfl); + return 0; + } + } + write_unlock_bh(&ip6_sk_fl_lock); + return -ESRCH; + + case IPV6_FL_A_RENEW: + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + err = fl6_renew(sfl->fl, freq.flr_linger, freq.flr_expires); + read_unlock_bh(&ip6_sk_fl_lock); + return err; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (freq.flr_share == IPV6_FL_S_NONE && capable(CAP_NET_ADMIN)) { + fl = fl_lookup(freq.flr_label); + if (fl) { + err = fl6_renew(fl, freq.flr_linger, freq.flr_expires); + fl_release(fl); + return err; + } + } + return -ESRCH; + + case IPV6_FL_A_GET: + if (freq.flr_label & ~IPV6_FLOWLABEL_MASK) + return -EINVAL; + + fl = fl_create(&freq, optval, optlen, &err); + if (fl == NULL) + return err; + sfl1 = kmalloc(sizeof(*sfl1), GFP_KERNEL); + + if (freq.flr_label) { + struct ip6_flowlabel *fl1 = NULL; + + err = -EEXIST; + read_lock_bh(&ip6_sk_fl_lock); + for (sfl = np->ipv6_fl_list; sfl; sfl = sfl->next) { + if (sfl->fl->label == freq.flr_label) { + if (freq.flr_flags&IPV6_FL_F_EXCL) { + read_unlock_bh(&ip6_sk_fl_lock); + goto done; + } + fl1 = sfl->fl; + atomic_inc(&fl->users); + break; + } + } + read_unlock_bh(&ip6_sk_fl_lock); + + if (fl1 == NULL) + fl1 = fl_lookup(freq.flr_label); + if (fl1) { + err = -EEXIST; + if (freq.flr_flags&IPV6_FL_F_EXCL) + goto release; + err = -EPERM; + if (fl1->share == IPV6_FL_S_EXCL || + fl1->share != fl->share || + fl1->owner != fl->owner) + goto release; + + err = -EINVAL; + if (!ipv6_addr_equal(&fl1->dst, &fl->dst) || + ipv6_opt_cmp(fl1->opt, fl->opt)) + goto release; + + err = -ENOMEM; + if (sfl1 == NULL) + 
goto release; + if (fl->linger > fl1->linger) + fl1->linger = fl->linger; + if ((long)(fl->expires - fl1->expires) > 0) + fl1->expires = fl->expires; + write_lock_bh(&ip6_sk_fl_lock); + sfl1->fl = fl1; + sfl1->next = np->ipv6_fl_list; + np->ipv6_fl_list = sfl1; + write_unlock_bh(&ip6_sk_fl_lock); + fl_free(fl); + return 0; + +release: + fl_release(fl1); + goto done; + } + } + err = -ENOENT; + if (!(freq.flr_flags&IPV6_FL_F_CREATE)) + goto done; + + err = -ENOMEM; + if (sfl1 == NULL || (err = mem_check(sk)) != 0) + goto done; + + err = fl_intern(fl, freq.flr_label); + if (err) + goto done; + + /* Do not check for fault */ + if (!freq.flr_label) + copy_to_user(&((struct in6_flowlabel_req __user *) optval)->flr_label, + &fl->label, sizeof(fl->label)); + + sfl1->fl = fl; + sfl1->next = np->ipv6_fl_list; + np->ipv6_fl_list = sfl1; + return 0; + + default: + return -EINVAL; + } + +done: + fl_free(fl); + kfree(sfl1); + return err; +} + +#ifdef CONFIG_PROC_FS + +struct ip6fl_iter_state { + int bucket; +}; + +#define ip6fl_seq_private(seq) ((struct ip6fl_iter_state *)(seq)->private) + +static struct ip6_flowlabel *ip6fl_get_first(struct seq_file *seq) +{ + struct ip6_flowlabel *fl = NULL; + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + + for (state->bucket = 0; state->bucket <= FL_HASH_MASK; ++state->bucket) { + if (fl_ht[state->bucket]) { + fl = fl_ht[state->bucket]; + break; + } + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_next(struct seq_file *seq, struct ip6_flowlabel *fl) +{ + struct ip6fl_iter_state *state = ip6fl_seq_private(seq); + + fl = fl->next; + while (!fl) { + if (++state->bucket <= FL_HASH_MASK) + fl = fl_ht[state->bucket]; + } + return fl; +} + +static struct ip6_flowlabel *ip6fl_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_flowlabel *fl = ip6fl_get_first(seq); + if (fl) + while (pos && (fl = ip6fl_get_next(seq, fl)) != NULL) + --pos; + return pos ? NULL : fl; +} + +static void *ip6fl_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock_bh(&ip6_fl_lock); + return *pos ? ip6fl_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *ip6fl_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_flowlabel *fl; + + if (v == SEQ_START_TOKEN) + fl = ip6fl_get_first(seq); + else + fl = ip6fl_get_next(seq, v); + ++*pos; + return fl; +} + +static void ip6fl_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ip6_fl_lock); +} + +static void ip6fl_fl_seq_show(struct seq_file *seq, struct ip6_flowlabel *fl) +{ + while(fl) { + seq_printf(seq, + "%05X %-1d %-6d %-6d %-6ld %-8ld " + "%02x%02x%02x%02x%02x%02x%02x%02x " + "%-4d\n", + (unsigned)ntohl(fl->label), + fl->share, + (unsigned)fl->owner, + atomic_read(&fl->users), + fl->linger/HZ, + (long)(fl->expires - jiffies)/HZ, + NIP6(fl->dst), + fl->opt ? 
fl->opt->opt_nflen : 0); + fl = fl->next; + } +} + +static int ip6fl_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Label S Owner Users Linger Expires " + "Dst Opt\n"); + else + ip6fl_fl_seq_show(seq, v); + return 0; +} + +static struct seq_operations ip6fl_seq_ops = { + .start = ip6fl_seq_start, + .next = ip6fl_seq_next, + .stop = ip6fl_seq_stop, + .show = ip6fl_seq_show, +}; + +static int ip6fl_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct ip6fl_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &ip6fl_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations ip6fl_seq_fops = { + .owner = THIS_MODULE, + .open = ip6fl_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + + +void ip6_flowlabel_init(void) +{ +#ifdef CONFIG_PROC_FS + proc_net_fops_create("ip6_flowlabel", S_IRUGO, &ip6fl_seq_fops); +#endif +} + +void ip6_flowlabel_cleanup(void) +{ + del_timer(&ip6_fl_gc_timer); +#ifdef CONFIG_PROC_FS + proc_net_remove("ip6_flowlabel"); +#endif +} diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c new file mode 100644 index 000000000000..866f10726c58 --- /dev/null +++ b/net/ipv6/ip6_input.c @@ -0,0 +1,269 @@ +/* + * IPv6 input + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Ian P. Morris + * + * $Id: ip6_input.c,v 1.19 2000/12/13 18:31:50 davem Exp $ + * + * Based in linux/net/ipv4/ip_input.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +/* Changes + * + * Mitsuru KANDA @USAGI and + * YOSHIFUJI Hideaki @USAGI: Remove ipv6_parse_exthdrs(). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + + + +static inline int ip6_rcv_finish( struct sk_buff *skb) +{ + if (skb->dst == NULL) + ip6_route_input(skb); + + return dst_input(skb); +} + +int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct ipv6hdr *hdr; + u32 pkt_len; + + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + IP6_INC_STATS_BH(IPSTATS_MIB_INRECEIVES); + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IP6_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + goto out; + } + + /* + * Store incoming device index. When the packet will + * be queued, we cannot refer to skb->dev anymore. + * + * BTW, when we send a packet for our own local address on a + * non-loopback interface (e.g. ethX), it is being delivered + * via the loopback interface (lo) here; skb->dev = &loopback_dev. + * It, however, should be considered as if it is being + * arrived via the sending interface (ethX), because of the + * nature of scoping architecture. --yoshfuji + */ + IP6CB(skb)->iif = skb->dst ? 
((struct rt6_info *)skb->dst)->rt6i_idev->dev->ifindex : dev->ifindex; + + if (skb->len < sizeof(struct ipv6hdr)) + goto err; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + + hdr = skb->nh.ipv6h; + + if (hdr->version != 6) + goto err; + + pkt_len = ntohs(hdr->payload_len); + + /* pkt_len may be zero if Jumbo payload option is present */ + if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { + if (pkt_len + sizeof(struct ipv6hdr) > skb->len) + goto truncated; + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + goto drop; + } + hdr = skb->nh.ipv6h; + } + + if (hdr->nexthdr == NEXTHDR_HOP) { + skb->h.raw = (u8*)(hdr+1); + if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + return 0; + } + hdr = skb->nh.ipv6h; + } + + return NF_HOOK(PF_INET6,NF_IP6_PRE_ROUTING, skb, dev, NULL, ip6_rcv_finish); +truncated: + IP6_INC_STATS_BH(IPSTATS_MIB_INTRUNCATEDPKTS); +err: + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); +drop: + kfree_skb(skb); +out: + return 0; +} + +/* + * Deliver the packet to the host + */ + + +static inline int ip6_input_finish(struct sk_buff *skb) +{ + struct inet6_protocol *ipprot; + struct sock *raw_sk; + unsigned int nhoff; + int nexthdr; + u8 hash; + + skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr); + + /* + * Parse extension headers + */ + + nexthdr = skb->nh.ipv6h->nexthdr; + nhoff = offsetof(struct ipv6hdr, nexthdr); + + /* Skip hop-by-hop options, they are already parsed. */ + if (nexthdr == NEXTHDR_HOP) { + nhoff = sizeof(struct ipv6hdr); + nexthdr = skb->h.raw[0]; + skb->h.raw += (skb->h.raw[1]+1)<<3; + } + + rcu_read_lock(); +resubmit: + if (!pskb_pull(skb, skb->h.raw - skb->data)) + goto discard; + nexthdr = skb->nh.raw[nhoff]; + + raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]); + if (raw_sk) + ipv6_raw_deliver(skb, nexthdr); + + hash = nexthdr & (MAX_INET_PROTOS - 1); + if ((ipprot = rcu_dereference(inet6_protos[hash])) != NULL) { + int ret; + + if (ipprot->flags & INET6_PROTO_FINAL) { + struct ipv6hdr *hdr; + + skb_postpull_rcsum(skb, skb->nh.raw, + skb->h.raw - skb->nh.raw); + hdr = skb->nh.ipv6h; + if (ipv6_addr_is_multicast(&hdr->daddr) && + !ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, + &hdr->saddr) && + !ipv6_is_mld(skb, nexthdr)) + goto discard; + } + if (!(ipprot->flags & INET6_PROTO_NOPOLICY) && + !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + ret = ipprot->handler(&skb, &nhoff); + if (ret > 0) + goto resubmit; + else if (ret == 0) + IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + } else { + if (!raw_sk) { + if (xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + IP6_INC_STATS_BH(IPSTATS_MIB_INUNKNOWNPROTOS); + icmpv6_param_prob(skb, ICMPV6_UNK_NEXTHDR, nhoff); + } + } else { + IP6_INC_STATS_BH(IPSTATS_MIB_INDELIVERS); + kfree_skb(skb); + } + } + rcu_read_unlock(); + return 0; + +discard: + IP6_INC_STATS_BH(IPSTATS_MIB_INDISCARDS); + rcu_read_unlock(); + kfree_skb(skb); + return 0; +} + + +int ip6_input(struct sk_buff *skb) +{ + return NF_HOOK(PF_INET6,NF_IP6_LOCAL_IN, skb, skb->dev, NULL, ip6_input_finish); +} + +int ip6_mc_input(struct sk_buff *skb) +{ + struct ipv6hdr *hdr; + int deliver; + + IP6_INC_STATS_BH(IPSTATS_MIB_INMCASTPKTS); + + hdr = skb->nh.ipv6h; + deliver = likely(!(skb->dev->flags & (IFF_PROMISC|IFF_ALLMULTI))) || + ipv6_chk_mcast_addr(skb->dev, &hdr->daddr, NULL); + + /* + * IPv6 multicast router mode isnt currently 
supported. + */ +#if 0 + if (ipv6_config.multicast_route) { + int addr_type; + + addr_type = ipv6_addr_type(&hdr->daddr); + + if (!(addr_type & (IPV6_ADDR_LOOPBACK | IPV6_ADDR_LINKLOCAL))) { + struct sk_buff *skb2; + struct dst_entry *dst; + + dst = skb->dst; + + if (deliver) { + skb2 = skb_clone(skb, GFP_ATOMIC); + dst_output(skb2); + } else { + dst_output(skb); + return 0; + } + } + } +#endif + + if (likely(deliver)) { + ip6_input(skb); + return 0; + } + /* discard */ + kfree_skb(skb); + + return 0; +} diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c new file mode 100644 index 000000000000..49208ba75094 --- /dev/null +++ b/net/ipv6/ip6_output.c @@ -0,0 +1,1197 @@ +/* + * IPv6 output functions + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * $Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $ + * + * Based on linux/net/ipv4/ip_output.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * A.N.Kuznetsov : airthmetics in fragmentation. + * extension headers are implemented. + * route changes now work. + * ip6_forward does not confuse sniffers. + * etc. + * + * H. von Brand : Added missing #include + * Imran Patel : frag id should be in NBO + * Kazunori MIYAZAWA @USAGI + * : add ip6_append_data and related functions + * for datagram xmit + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)); + +static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr) +{ + static u32 ipv6_fragmentation_id = 1; + static DEFINE_SPINLOCK(ip6_id_lock); + + spin_lock_bh(&ip6_id_lock); + fhdr->identification = htonl(ipv6_fragmentation_id); + if (++ipv6_fragmentation_id == 0) + ipv6_fragmentation_id = 1; + spin_unlock_bh(&ip6_id_lock); +} + +static inline int ip6_output_finish(struct sk_buff *skb) +{ + + struct dst_entry *dst = skb->dst; + struct hh_cache *hh = dst->hh; + + if (hh) { + int hh_alen; + + read_lock_bh(&hh->hh_lock); + hh_alen = HH_DATA_ALIGN(hh->hh_len); + memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); + read_unlock_bh(&hh->hh_lock); + skb_push(skb, hh->hh_len); + return hh->hh_output(skb); + } else if (dst->neighbour) + return dst->neighbour->output(skb); + + IP6_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + kfree_skb(skb); + return -EINVAL; + +} + +/* dev_loopback_xmit for use with netfilter. */ +static int ip6_dev_loopback_xmit(struct sk_buff *newskb) +{ + newskb->mac.raw = newskb->data; + __skb_pull(newskb, newskb->nh.raw - newskb->data); + newskb->pkt_type = PACKET_LOOPBACK; + newskb->ip_summed = CHECKSUM_UNNECESSARY; + BUG_TRAP(newskb->dst); + + netif_rx(newskb); + return 0; +} + + +static int ip6_output2(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct net_device *dev = dst->dev; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + if (ipv6_addr_is_multicast(&skb->nh.ipv6h->daddr)) { + struct ipv6_pinfo* np = skb->sk ? 
inet6_sk(skb->sk) : NULL; + + if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) && + ipv6_chk_mcast_addr(dev, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr)) { + struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); + + /* Do not check for IFF_ALLMULTI; multicast routing + is not supported in any case. + */ + if (newskb) + NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, newskb, NULL, + newskb->dev, + ip6_dev_loopback_xmit); + + if (skb->nh.ipv6h->hop_limit == 0) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + return 0; + } + } + + IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + } + + return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish); +} + +int ip6_output(struct sk_buff *skb) +{ + if (skb->len > dst_mtu(skb->dst) || dst_allfrag(skb->dst)) + return ip6_fragment(skb, ip6_output2); + else + return ip6_output2(skb); +} + +#ifdef CONFIG_NETFILTER +int ip6_route_me_harder(struct sk_buff *skb) +{ + struct ipv6hdr *iph = skb->nh.ipv6h; + struct dst_entry *dst; + struct flowi fl = { + .oif = skb->sk ? skb->sk->sk_bound_dev_if : 0, + .nl_u = + { .ip6_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, } }, + .proto = iph->nexthdr, + }; + + dst = ip6_route_output(skb->sk, &fl); + + if (dst->error) { + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + LIMIT_NETDEBUG( + printk(KERN_DEBUG "ip6_route_me_harder: No more route.\n")); + dst_release(dst); + return -EINVAL; + } + + /* Drop old route. */ + dst_release(skb->dst); + + skb->dst = dst; + return 0; +} +#endif + +static inline int ip6_maybe_reroute(struct sk_buff *skb) +{ +#ifdef CONFIG_NETFILTER + if (skb->nfcache & NFC_ALTERED){ + if (ip6_route_me_harder(skb) != 0){ + kfree_skb(skb); + return -EINVAL; + } + } +#endif /* CONFIG_NETFILTER */ + return dst_output(skb); +} + +/* + * xmit an sk_buff (used by TCP) + */ + +int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl, + struct ipv6_txoptions *opt, int ipfragok) +{ + struct ipv6_pinfo *np = sk ? inet6_sk(sk) : NULL; + struct in6_addr *first_hop = &fl->fl6_dst; + struct dst_entry *dst = skb->dst; + struct ipv6hdr *hdr; + u8 proto = fl->proto; + int seg_len = skb->len; + int hlimit; + u32 mtu; + + if (opt) { + int head_room; + + /* First: exthdrs may take lots of space (~8K for now) + MAX_HEADER is not enough. 
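
The comment above motivates the headroom check that follows: with per-socket extension headers present, the buffer must leave room for the fragmentable and non-fragmentable option bytes plus the 40-byte IPv6 header and the link-layer reserve. A minimal sketch of that arithmetic; LL_RESERVED is a stand-in for the device-dependent LL_RESERVED_SPACE() value and the option sizes in main() are assumed example figures.

#include <stdio.h>

#define IPV6_HDR_LEN 40
#define LL_RESERVED  16    /* assumed stand-in for LL_RESERVED_SPACE(dev) */

/* Headroom needed in front of the transport payload when opt_nflen
 * non-fragmentable and opt_flen fragmentable option bytes are pushed. */
static int required_headroom(int opt_nflen, int opt_flen)
{
	return opt_nflen + opt_flen + IPV6_HDR_LEN + LL_RESERVED;
}

int main(void)
{
	/* e.g. an 8-byte hop-by-hop option plus a 24-byte routing header */
	printf("%d bytes of headroom\n", required_headroom(8 + 24, 0));
	return 0;
}
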
+ */ + head_room = opt->opt_nflen + opt->opt_flen; + seg_len += head_room; + head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); + + if (skb_headroom(skb) < head_room) { + struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); + kfree_skb(skb); + skb = skb2; + if (skb == NULL) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return -ENOBUFS; + } + if (sk) + skb_set_owner_w(skb, sk); + } + if (opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop); + } + + hdr = skb->nh.ipv6h = (struct ipv6hdr*)skb_push(skb, sizeof(struct ipv6hdr)); + + /* + * Fill in the IPv6 header + */ + + *(u32*)hdr = htonl(0x60000000) | fl->fl6_flowlabel; + hlimit = -1; + if (np) + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + + hdr->payload_len = htons(seg_len); + hdr->nexthdr = proto; + hdr->hop_limit = hlimit; + + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); + ipv6_addr_copy(&hdr->daddr, first_hop); + + mtu = dst_mtu(dst); + if ((skb->len <= mtu) || ipfragok) { + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + return NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, ip6_maybe_reroute); + } + + if (net_ratelimit()) + printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n"); + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; +} + +/* + * To avoid extra problems ND packets are send through this + * routine. It's code duplication but I really want to avoid + * extra checks since ipv6_build_header is used by TCP (which + * is for us performance critical) + */ + +int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev, + struct in6_addr *saddr, struct in6_addr *daddr, + int proto, int len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + int totlen; + + skb->protocol = htons(ETH_P_IPV6); + skb->dev = dev; + + totlen = len + sizeof(struct ipv6hdr); + + hdr = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr)); + skb->nh.ipv6h = hdr; + + *(u32*)hdr = htonl(0x60000000); + + hdr->payload_len = htons(len); + hdr->nexthdr = proto; + hdr->hop_limit = np->hop_limit; + + ipv6_addr_copy(&hdr->saddr, saddr); + ipv6_addr_copy(&hdr->daddr, daddr); + + return 0; +} + +static int ip6_call_ra_chain(struct sk_buff *skb, int sel) +{ + struct ip6_ra_chain *ra; + struct sock *last = NULL; + + read_lock(&ip6_ra_lock); + for (ra = ip6_ra_chain; ra; ra = ra->next) { + struct sock *sk = ra->sk; + if (sk && ra->sel == sel) { + if (last) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) + rawv6_rcv(last, skb2); + } + last = sk; + } + } + + if (last) { + rawv6_rcv(last, skb); + read_unlock(&ip6_ra_lock); + return 1; + } + read_unlock(&ip6_ra_lock); + return 0; +} + +static inline int ip6_forward_finish(struct sk_buff *skb) +{ + return dst_output(skb); +} + +int ip6_forward(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct inet6_skb_parm *opt = IP6CB(skb); + + if (ipv6_devconf.forwarding == 0) + goto error; + + if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { + IP6_INC_STATS(IPSTATS_MIB_INDISCARDS); + goto drop; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* + * We DO NOT make any processing on + * RA packets, pushing them to user level AS IS + * without ane WARRANTY that application will be able + * to interpret them. 
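
When a forwarded packet carries the Router Alert hop-by-hop option, the check that follows hands it to any interested raw socket keyed by the option's 16-bit value, which the code reads big-endian as (ptr[2]<<8) + ptr[3]. A minimal sketch of that extraction over an assumed option buffer (type 0x05, length 2, value 0, the value RFC 2711 assigns to MLD messages).

#include <stdio.h>
#include <stdint.h>

/* Read the 16-bit Router Alert value from the option's TLV bytes:
 * opt[0] = type, opt[1] = length, opt[2..3] = value in network order. */
static unsigned int router_alert_value(const uint8_t *opt)
{
	return (unsigned int)((opt[2] << 8) | opt[3]);
}

int main(void)
{
	const uint8_t ra_opt[4] = { 0x05, 0x02, 0x00, 0x00 };

	printf("router alert value: %u\n", router_alert_value(ra_opt));
	return 0;
}
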
The reason is that we + * cannot make anything clever here. + * + * We are not end-node, so that if packet contains + * AH/ESP, we cannot make anything. + * Defragmentation also would be mistake, RA packets + * cannot be fragmented, because there is no warranty + * that different fragments will go along one path. --ANK + */ + if (opt->ra) { + u8 *ptr = skb->nh.raw + opt->ra; + if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3])) + return 0; + } + + /* + * check and decrement ttl + */ + if (hdr->hop_limit <= 1) { + /* Force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, + 0, skb->dev); + + kfree_skb(skb); + return -ETIMEDOUT; + } + + if (!xfrm6_route_forward(skb)) { + IP6_INC_STATS(IPSTATS_MIB_INDISCARDS); + goto drop; + } + dst = skb->dst; + + /* IPv6 specs say nothing about it, but it is clear that we cannot + send redirects to source routed frames. + */ + if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0) { + struct in6_addr *target = NULL; + struct rt6_info *rt; + struct neighbour *n = dst->neighbour; + + /* + * incoming and outgoing devices are the same + * send a redirect. + */ + + rt = (struct rt6_info *) dst; + if ((rt->rt6i_flags & RTF_GATEWAY)) + target = (struct in6_addr*)&n->primary_key; + else + target = &hdr->daddr; + + /* Limit redirects both by destination (here) + and by source (inside ndisc_send_redirect) + */ + if (xrlim_allow(dst, 1*HZ)) + ndisc_send_redirect(skb, n, target); + } else if (ipv6_addr_type(&hdr->saddr)&(IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK + |IPV6_ADDR_LINKLOCAL)) { + /* This check is security critical. */ + goto error; + } + + if (skb->len > dst_mtu(dst)) { + /* Again, force OUTPUT device used as source address */ + skb->dev = dst->dev; + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev); + IP6_INC_STATS_BH(IPSTATS_MIB_INTOOBIGERRORS); + IP6_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS); + kfree_skb(skb); + return -EMSGSIZE; + } + + if (skb_cow(skb, dst->dev->hard_header_len)) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + goto drop; + } + + hdr = skb->nh.ipv6h; + + /* Mangling hops number delayed to point after skb COW */ + + hdr->hop_limit--; + + IP6_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS); + return NF_HOOK(PF_INET6,NF_IP6_FORWARD, skb, skb->dev, dst->dev, ip6_forward_finish); + +error: + IP6_INC_STATS_BH(IPSTATS_MIB_INADDRERRORS); +drop: + kfree_skb(skb); + return -EINVAL; +} + +static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) +{ + to->pkt_type = from->pkt_type; + to->priority = from->priority; + to->protocol = from->protocol; + to->security = from->security; + dst_release(to->dst); + to->dst = dst_clone(from->dst); + to->dev = from->dev; + +#ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; +#endif +#ifdef CONFIG_NETFILTER + to->nfmark = from->nfmark; + /* Connection association is same as pre-frag packet */ + to->nfct = from->nfct; + nf_conntrack_get(to->nfct); + to->nfctinfo = from->nfctinfo; +#ifdef CONFIG_BRIDGE_NETFILTER + nf_bridge_put(to->nf_bridge); + to->nf_bridge = from->nf_bridge; + nf_bridge_get(to->nf_bridge); +#endif +#ifdef CONFIG_NETFILTER_DEBUG + to->nf_debug = from->nf_debug; +#endif +#endif +} + +int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.ipv6h + 1); + unsigned int packet_len = skb->tail - skb->nh.raw; + int found_rhdr = 0; + *nexthdr = &skb->nh.ipv6h->nexthdr; + + while (offset + 1 <= 
packet_len) { + + switch (**nexthdr) { + + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_DEST: + if (**nexthdr == NEXTHDR_ROUTING) found_rhdr = 1; + if (**nexthdr == NEXTHDR_DEST && found_rhdr) return offset; + offset += ipv6_optlen(exthdr); + *nexthdr = &exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + default : + return offset; + } + } + + return offset; +} + +static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) +{ + struct net_device *dev; + struct sk_buff *frag; + struct rt6_info *rt = (struct rt6_info*)skb->dst; + struct ipv6hdr *tmp_hdr; + struct frag_hdr *fh; + unsigned int mtu, hlen, left, len; + u32 frag_id = 0; + int ptr, offset = 0, err=0; + u8 *prevhdr, nexthdr = 0; + + dev = rt->u.dst.dev; + hlen = ip6_find_1stfragopt(skb, &prevhdr); + nexthdr = *prevhdr; + + mtu = dst_mtu(&rt->u.dst) - hlen - sizeof(struct frag_hdr); + + if (skb_shinfo(skb)->frag_list) { + int first_len = skb_pagelen(skb); + + if (first_len - hlen > mtu || + ((first_len - hlen) & 7) || + skb_cloned(skb)) + goto slow_path; + + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* Correct geometry. */ + if (frag->len > mtu || + ((frag->len & 7) && frag->next) || + skb_headroom(frag) < hlen) + goto slow_path; + + /* Correct socket ownership. */ + if (frag->sk == NULL) + goto slow_path; + + /* Partially cloned skb? */ + if (skb_shared(frag)) + goto slow_path; + } + + err = 0; + offset = 0; + frag = skb_shinfo(skb)->frag_list; + skb_shinfo(skb)->frag_list = NULL; + /* BUILD HEADER */ + + tmp_hdr = kmalloc(hlen, GFP_ATOMIC); + if (!tmp_hdr) { + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return -ENOMEM; + } + + *prevhdr = NEXTHDR_FRAGMENT; + memcpy(tmp_hdr, skb->nh.raw, hlen); + __skb_pull(skb, hlen); + fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr)); + skb->nh.raw = __skb_push(skb, hlen); + memcpy(skb->nh.raw, tmp_hdr, hlen); + + ipv6_select_ident(skb, fh); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(IP6_MF); + frag_id = fh->identification; + + first_len = skb_pagelen(skb); + skb->data_len = first_len - skb_headlen(skb); + skb->len = first_len; + skb->nh.ipv6h->payload_len = htons(first_len - sizeof(struct ipv6hdr)); + + + for (;;) { + /* Prepare header of the next frame, + * before previous one went down. */ + if (frag) { + frag->ip_summed = CHECKSUM_NONE; + frag->h.raw = frag->data; + fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr)); + frag->nh.raw = __skb_push(frag, hlen); + memcpy(frag->nh.raw, tmp_hdr, hlen); + offset += skb->len - hlen - sizeof(struct frag_hdr); + fh->nexthdr = nexthdr; + fh->reserved = 0; + fh->frag_off = htons(offset); + if (frag->next != NULL) + fh->frag_off |= htons(IP6_MF); + fh->identification = frag_id; + frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + ip6_copy_metadata(frag, skb); + } + + err = output(skb); + if (err || !frag) + break; + + skb = frag; + frag = skb->next; + skb->next = NULL; + } + + if (tmp_hdr) + kfree(tmp_hdr); + + if (err == 0) { + IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); + return 0; + } + + while (frag) { + skb = frag->next; + kfree_skb(frag); + frag = skb; + } + + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; + } + +slow_path: + left = skb->len - hlen; /* Space per frame */ + ptr = hlen; /* Where to start from */ + + /* + * Fragment the datagram. + */ + + *prevhdr = NEXTHDR_FRAGMENT; + + /* + * Keep copying data until we run out. 
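
Each pass of the loop below carves off at most mtu bytes of payload, where mtu has already been reduced by the unfragmentable header length and the 8-byte fragment header, and every fragment except the last is trimmed to a multiple of 8 because the fragment offset field counts 8-octet units. A minimal sketch of that length choice with assumed figures.

#include <stdio.h>

/* Payload length of the next fragment: at most 'mtu' bytes, rounded down
 * to a multiple of 8 unless this is the final fragment. */
static unsigned int next_frag_len(unsigned int left, unsigned int mtu)
{
	unsigned int len = left < mtu ? left : mtu;

	if (len < left)		/* not the last fragment */
		len &= ~7u;
	return len;
}

int main(void)
{
	/* 1500-byte link MTU, 40-byte IPv6 header, 8-byte fragment header:
	 * 1452 usable bytes, trimmed to 1448 for every non-final fragment. */
	printf("%u\n", next_frag_len(4000, 1500 - 40 - 8));
	return 0;
}
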
+ */ + while(left > 0) { + len = left; + /* IF: it doesn't fit, use 'mtu' - the data space left */ + if (len > mtu) + len = mtu; + /* IF: we are not sending upto and including the packet end + then align the next start on an eight byte boundary */ + if (len < left) { + len &= ~7; + } + /* + * Allocate buffer. + */ + + if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) { + NETDEBUG(printk(KERN_INFO "IPv6: frag: no memory for new fragment!\n")); + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + err = -ENOMEM; + goto fail; + } + + /* + * Set up data on packet + */ + + ip6_copy_metadata(frag, skb); + skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev)); + skb_put(frag, len + hlen + sizeof(struct frag_hdr)); + frag->nh.raw = frag->data; + fh = (struct frag_hdr*)(frag->data + hlen); + frag->h.raw = frag->data + hlen + sizeof(struct frag_hdr); + + /* + * Charge the memory for the fragment to any owner + * it might possess + */ + if (skb->sk) + skb_set_owner_w(frag, skb->sk); + + /* + * Copy the packet header into the new buffer. + */ + memcpy(frag->nh.raw, skb->data, hlen); + + /* + * Build fragment header. + */ + fh->nexthdr = nexthdr; + fh->reserved = 0; + if (frag_id) { + ipv6_select_ident(skb, fh); + frag_id = fh->identification; + } else + fh->identification = frag_id; + + /* + * Copy a block of the IP datagram. + */ + if (skb_copy_bits(skb, ptr, frag->h.raw, len)) + BUG(); + left -= len; + + fh->frag_off = htons(offset); + if (left > 0) + fh->frag_off |= htons(IP6_MF); + frag->nh.ipv6h->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); + + ptr += len; + offset += len; + + /* + * Put this fragment into the sending queue. + */ + + IP6_INC_STATS(IPSTATS_MIB_FRAGCREATES); + + err = output(frag); + if (err) + goto fail; + } + kfree_skb(skb); + IP6_INC_STATS(IPSTATS_MIB_FRAGOKS); + return err; + +fail: + kfree_skb(skb); + IP6_INC_STATS(IPSTATS_MIB_FRAGFAILS); + return err; +} + +int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl) +{ + int err = 0; + + *dst = NULL; + if (sk) { + struct ipv6_pinfo *np = inet6_sk(sk); + + *dst = sk_dst_check(sk, np->dst_cookie); + if (*dst) { + struct rt6_info *rt = (struct rt6_info*)*dst; + + /* Yes, checking route validity in not connected + case is not very simple. Take into account, + that we do not support routing by source, TOS, + and MSG_DONTROUTE --ANK (980726) + + 1. If route was host route, check that + cached destination is current. + If it is network route, we still may + check its validity using saved pointer + to the last used address: daddr_cache. + We do not want to save whole address now, + (because main consumer of this service + is tcp, which has not this problem), + so that the last trick works only on connected + sockets. + 2. oif also should be the same. 
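
The condition that follows is easier to read inverted: the cached route is reused only when it either is a /128 host route to the requested destination or matches the socket's daddr_cache, and when any requested outgoing interface agrees with the cached route's device. A minimal boolean sketch of that predicate; the parameter names are assumptions of the sketch rather than kernel fields.

#include <stdio.h>
#include <stdbool.h>

/* Reuse a cached IPv6 route? Mirrors the validity test in ip6_dst_lookup():
 * the destination must still be pinned down and a requested oif must not
 * contradict the cached route's device. */
static bool cached_route_ok(bool host_route_to_daddr, bool daddr_cache_matches,
			    int requested_oif, int cached_oif)
{
	bool dst_ok = host_route_to_daddr || daddr_cache_matches;
	bool oif_ok = requested_oif == 0 || requested_oif == cached_oif;

	return dst_ok && oif_ok;
}

int main(void)
{
	printf("%d\n", cached_route_ok(true, false, 0, 2));   /* 1: reuse         */
	printf("%d\n", cached_route_ok(false, false, 0, 2));  /* 0: look up again */
	return 0;
}
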
+ */ + + if (((rt->rt6i_dst.plen != 128 || + !ipv6_addr_equal(&fl->fl6_dst, &rt->rt6i_dst.addr)) + && (np->daddr_cache == NULL || + !ipv6_addr_equal(&fl->fl6_dst, np->daddr_cache))) + || (fl->oif && fl->oif != (*dst)->dev->ifindex)) { + dst_release(*dst); + *dst = NULL; + } + } + } + + if (*dst == NULL) + *dst = ip6_route_output(sk, fl); + + if ((err = (*dst)->error)) + goto out_err_release; + + if (ipv6_addr_any(&fl->fl6_src)) { + err = ipv6_get_saddr(*dst, &fl->fl6_dst, &fl->fl6_src); + + if (err) { +#if IP6_DEBUG >= 2 + printk(KERN_DEBUG "ip6_dst_lookup: " + "no available source address\n"); +#endif + goto out_err_release; + } + } + + return 0; + +out_err_release: + dst_release(*dst); + *dst = NULL; + return err; +} + +int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), + void *from, int length, int transhdrlen, + int hlimit, struct ipv6_txoptions *opt, struct flowi *fl, struct rt6_info *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + unsigned int maxfraglen, fragheaderlen; + int exthdrlen; + int hh_len; + int mtu; + int copy; + int err; + int offset = 0; + int csummode = CHECKSUM_NONE; + + if (flags&MSG_PROBE) + return 0; + if (skb_queue_empty(&sk->sk_write_queue)) { + /* + * setup for corking + */ + if (opt) { + if (np->cork.opt == NULL) { + np->cork.opt = kmalloc(opt->tot_len, + sk->sk_allocation); + if (unlikely(np->cork.opt == NULL)) + return -ENOBUFS; + } else if (np->cork.opt->tot_len < opt->tot_len) { + printk(KERN_DEBUG "ip6_append_data: invalid option length\n"); + return -EINVAL; + } + memcpy(np->cork.opt, opt, opt->tot_len); + inet->cork.flags |= IPCORK_OPT; + /* need source address above miyazawa*/ + } + dst_hold(&rt->u.dst); + np->cork.rt = rt; + inet->cork.fl = *fl; + np->cork.hop_limit = hlimit; + inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path); + if (dst_allfrag(rt->u.dst.path)) + inet->cork.flags |= IPCORK_ALLFRAG; + inet->cork.length = 0; + sk->sk_sndmsg_page = NULL; + sk->sk_sndmsg_off = 0; + exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0); + length += exthdrlen; + transhdrlen += exthdrlen; + } else { + rt = np->cork.rt; + fl = &inet->cork.fl; + if (inet->cork.flags & IPCORK_OPT) + opt = np->cork.opt; + transhdrlen = 0; + exthdrlen = 0; + mtu = inet->cork.fragsize; + } + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0); + maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr); + + if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { + if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { + ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen); + return -EMSGSIZE; + } + } + + /* + * Let's try using as much space as possible. + * Use MTU if total length of the message fits into the MTU. + * Otherwise, we need to reserve fragment header and + * fragment alignment (= 8-15 octects, in total). + * + * Note that we may need to "move" the data from the tail of + * of the buffer to the new fragment when we split + * the message. + * + * FIXME: It may be fragmented into multiple chunks + * at once if non-fragmentable extension headers + * are too large. + * --yoshfuji + */ + + inet->cork.length += length; + + if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) + goto alloc_new_skb; + + while (length > 0) { + /* Check if the remaining data fits into current packet. 
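
The corking geometry computed a few lines above fixes how large each queued fragment may grow: the fragmentable part of the MTU is rounded down to an 8-byte multiple and room is kept for the 8-byte fragment header. A minimal worked example of that arithmetic for a plain 1500-byte MTU with no destination options (assumed figures).

#include <stdio.h>

#define IPV6_HDR_LEN 40
#define FRAG_HDR_LEN 8

/* Largest length an individual corked fragment may reach, mirroring the
 * maxfraglen computation in ip6_append_data(). */
static unsigned int max_frag_len(unsigned int mtu, unsigned int opt_nflen)
{
	unsigned int fragheaderlen = IPV6_HDR_LEN + opt_nflen;

	return ((mtu - fragheaderlen) & ~7u) + fragheaderlen - FRAG_HDR_LEN;
}

int main(void)
{
	/* 1500-byte MTU, no extension headers: ((1500-40) & ~7) + 40 - 8 = 1488 */
	printf("%u\n", max_frag_len(1500, 0));
	return 0;
}
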
*/ + copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; + if (copy < length) + copy = maxfraglen - skb->len; + + if (copy <= 0) { + char *data; + unsigned int datalen; + unsigned int fraglen; + unsigned int fraggap; + unsigned int alloclen; + struct sk_buff *skb_prev; +alloc_new_skb: + skb_prev = skb; + + /* There's no room in the current skb */ + if (skb_prev) + fraggap = skb_prev->len - maxfraglen; + else + fraggap = 0; + + /* + * If remaining data exceeds the mtu, + * we know we need more fragment(s). + */ + datalen = length + fraggap; + if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) + datalen = maxfraglen - fragheaderlen; + + fraglen = datalen + fragheaderlen; + if ((flags & MSG_MORE) && + !(rt->u.dst.dev->features&NETIF_F_SG)) + alloclen = mtu; + else + alloclen = datalen + fragheaderlen; + + /* + * The last fragment gets additional space at tail. + * Note: we overallocate on fragments with MSG_MODE + * because we have no idea if we're the last one. + */ + if (datalen == length + fraggap) + alloclen += rt->u.dst.trailer_len; + + /* + * We just reserve space for fragment header. + * Note: this may be overallocation if the message + * (without MSG_MORE) fits into the MTU. + */ + alloclen += sizeof(struct frag_hdr); + + if (transhdrlen) { + skb = sock_alloc_send_skb(sk, + alloclen + hh_len, + (flags & MSG_DONTWAIT), &err); + } else { + skb = NULL; + if (atomic_read(&sk->sk_wmem_alloc) <= + 2 * sk->sk_sndbuf) + skb = sock_wmalloc(sk, + alloclen + hh_len, 1, + sk->sk_allocation); + if (unlikely(skb == NULL)) + err = -ENOBUFS; + } + if (skb == NULL) + goto error; + /* + * Fill in the control structures + */ + skb->ip_summed = csummode; + skb->csum = 0; + /* reserve for fragmentation */ + skb_reserve(skb, hh_len+sizeof(struct frag_hdr)); + + /* + * Find where to start putting bytes + */ + data = skb_put(skb, fraglen); + skb->nh.raw = data + exthdrlen; + data += fragheaderlen; + skb->h.raw = data + exthdrlen; + + if (fraggap) { + skb->csum = skb_copy_and_csum_bits( + skb_prev, maxfraglen, + data + transhdrlen, fraggap, 0); + skb_prev->csum = csum_sub(skb_prev->csum, + skb->csum); + data += fraggap; + skb_trim(skb_prev, maxfraglen); + } + copy = datalen - transhdrlen - fraggap; + if (copy < 0) { + err = -EINVAL; + kfree_skb(skb); + goto error; + } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) { + err = -EFAULT; + kfree_skb(skb); + goto error; + } + + offset += copy; + length -= datalen - fraggap; + transhdrlen = 0; + exthdrlen = 0; + csummode = CHECKSUM_NONE; + + /* + * Put the packet on the pending queue + */ + __skb_queue_tail(&sk->sk_write_queue, skb); + continue; + } + + if (copy > length) + copy = length; + + if (!(rt->u.dst.dev->features&NETIF_F_SG)) { + unsigned int off; + + off = skb->len; + if (getfrag(from, skb_put(skb, copy), + offset, copy, off, skb) < 0) { + __skb_trim(skb, off); + err = -EFAULT; + goto error; + } + } else { + int i = skb_shinfo(skb)->nr_frags; + skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1]; + struct page *page = sk->sk_sndmsg_page; + int off = sk->sk_sndmsg_off; + unsigned int left; + + if (page && (left = PAGE_SIZE - off) > 0) { + if (copy >= left) + copy = left; + if (page != frag->page) { + if (i == MAX_SKB_FRAGS) { + err = -EMSGSIZE; + goto error; + } + get_page(page); + skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0); + frag = &skb_shinfo(skb)->frags[i]; + } + } else if(i < 
MAX_SKB_FRAGS) { + if (copy > PAGE_SIZE) + copy = PAGE_SIZE; + page = alloc_pages(sk->sk_allocation, 0); + if (page == NULL) { + err = -ENOMEM; + goto error; + } + sk->sk_sndmsg_page = page; + sk->sk_sndmsg_off = 0; + + skb_fill_page_desc(skb, i, page, 0, 0); + frag = &skb_shinfo(skb)->frags[i]; + skb->truesize += PAGE_SIZE; + atomic_add(PAGE_SIZE, &sk->sk_wmem_alloc); + } else { + err = -EMSGSIZE; + goto error; + } + if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) { + err = -EFAULT; + goto error; + } + sk->sk_sndmsg_off += copy; + frag->size += copy; + skb->len += copy; + skb->data_len += copy; + } + offset += copy; + length -= copy; + } + return 0; +error: + inet->cork.length -= length; + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +int ip6_push_pending_frames(struct sock *sk) +{ + struct sk_buff *skb, *tmp_skb; + struct sk_buff **tail_skb; + struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6hdr *hdr; + struct ipv6_txoptions *opt = np->cork.opt; + struct rt6_info *rt = np->cork.rt; + struct flowi *fl = &inet->cork.fl; + unsigned char proto = fl->proto; + int err = 0; + + if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL) + goto out; + tail_skb = &(skb_shinfo(skb)->frag_list); + + /* move skb->data to ip header from ext header */ + if (skb->data < skb->nh.raw) + __skb_pull(skb, skb->nh.raw - skb->data); + while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) { + __skb_pull(tmp_skb, skb->h.raw - skb->nh.raw); + *tail_skb = tmp_skb; + tail_skb = &(tmp_skb->next); + skb->len += tmp_skb->len; + skb->data_len += tmp_skb->len; +#if 0 /* Logically correct, but useless work, ip_fragment() will have to undo */ + skb->truesize += tmp_skb->truesize; + __sock_put(tmp_skb->sk); + tmp_skb->destructor = NULL; + tmp_skb->sk = NULL; +#endif + } + + ipv6_addr_copy(final_dst, &fl->fl6_dst); + __skb_pull(skb, skb->h.raw - skb->nh.raw); + if (opt && opt->opt_flen) + ipv6_push_frag_opts(skb, opt, &proto); + if (opt && opt->opt_nflen) + ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst); + + skb->nh.ipv6h = hdr = (struct ipv6hdr*) skb_push(skb, sizeof(struct ipv6hdr)); + + *(u32*)hdr = fl->fl6_flowlabel | htonl(0x60000000); + + if (skb->len <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) + hdr->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + else + hdr->payload_len = 0; + hdr->hop_limit = np->cork.hop_limit; + hdr->nexthdr = proto; + ipv6_addr_copy(&hdr->saddr, &fl->fl6_src); + ipv6_addr_copy(&hdr->daddr, final_dst); + + skb->dst = dst_clone(&rt->u.dst); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dst->dev, dst_output); + if (err) { + if (err > 0) + err = inet->recverr ? 
net_xmit_errno(err) : 0; + if (err) + goto error; + } + +out: + inet->cork.flags &= ~IPCORK_OPT; + if (np->cork.opt) { + kfree(np->cork.opt); + np->cork.opt = NULL; + } + if (np->cork.rt) { + dst_release(&np->cork.rt->u.dst); + np->cork.rt = NULL; + inet->cork.flags &= ~IPCORK_ALLFRAG; + } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); + return err; +error: + goto out; +} + +void ip6_flush_pending_frames(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + + while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); + } + + inet->cork.flags &= ~IPCORK_OPT; + + if (np->cork.opt) { + kfree(np->cork.opt); + np->cork.opt = NULL; + } + if (np->cork.rt) { + dst_release(&np->cork.rt->u.dst); + np->cork.rt = NULL; + inet->cork.flags &= ~IPCORK_ALLFRAG; + } + memset(&inet->cork.fl, 0, sizeof(inet->cork.fl)); +} diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c new file mode 100644 index 000000000000..3b1c9fa184ae --- /dev/null +++ b/net/ipv6/ip6_tunnel.c @@ -0,0 +1,1163 @@ +/* + * IPv6 over IPv6 tunnel device + * Linux INET6 implementation + * + * Authors: + * Ville Nuorvala + * + * $Id$ + * + * Based on: + * linux/net/ipv6/sit.c + * + * RFC 2473 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Ville Nuorvala"); +MODULE_DESCRIPTION("IPv6-in-IPv6 tunnel"); +MODULE_LICENSE("GPL"); + +#define IPV6_TLV_TEL_DST_SIZE 8 + +#ifdef IP6_TNL_DEBUG +#define IP6_TNL_TRACE(x...) printk(KERN_DEBUG "%s:" x "\n", __FUNCTION__) +#else +#define IP6_TNL_TRACE(x...) do {;} while(0) +#endif + +#define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) + +#define HASH_SIZE 32 + +#define HASH(addr) (((addr)->s6_addr32[0] ^ (addr)->s6_addr32[1] ^ \ + (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ + (HASH_SIZE - 1)) + +static int ip6ip6_fb_tnl_dev_init(struct net_device *dev); +static int ip6ip6_tnl_dev_init(struct net_device *dev); +static void ip6ip6_tnl_dev_setup(struct net_device *dev); + +/* the IPv6 tunnel fallback device */ +static struct net_device *ip6ip6_fb_tnl_dev; + + +/* lists for storing tunnels in use */ +static struct ip6_tnl *tnls_r_l[HASH_SIZE]; +static struct ip6_tnl *tnls_wc[1]; +static struct ip6_tnl **tnls[2] = { tnls_wc, tnls_r_l }; + +/* lock for the tunnel lists */ +static DEFINE_RWLOCK(ip6ip6_lock); + +static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) +{ + struct dst_entry *dst = t->dst_cache; + + if (dst && dst->obsolete && + dst->ops->check(dst, t->dst_cookie) == NULL) { + t->dst_cache = NULL; + dst_release(dst); + return NULL; + } + + return dst; +} + +static inline void ip6_tnl_dst_reset(struct ip6_tnl *t) +{ + dst_release(t->dst_cache); + t->dst_cache = NULL; +} + +static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + t->dst_cookie = rt->rt6i_node ? 
rt->rt6i_node->fn_sernum : 0; + dst_release(t->dst_cache); + t->dst_cache = dst; +} + +/** + * ip6ip6_tnl_lookup - fetch tunnel matching the end-point addresses + * @remote: the address of the tunnel exit-point + * @local: the address of the tunnel entry-point + * + * Return: + * tunnel matching given end-points if found, + * else fallback tunnel if its device is up, + * else %NULL + **/ + +static struct ip6_tnl * +ip6ip6_tnl_lookup(struct in6_addr *remote, struct in6_addr *local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip6_tnl *t; + + for (t = tnls_r_l[h0 ^ h1]; t; t = t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr) && + (t->dev->flags & IFF_UP)) + return t; + } + if ((t = tnls_wc[0]) != NULL && (t->dev->flags & IFF_UP)) + return t; + + return NULL; +} + +/** + * ip6ip6_bucket - get head of list matching given tunnel parameters + * @p: parameters containing tunnel end-points + * + * Description: + * ip6ip6_bucket() returns the head of the list matching the + * &struct in6_addr entries laddr and raddr in @p. + * + * Return: head of IPv6 tunnel list + **/ + +static struct ip6_tnl ** +ip6ip6_bucket(struct ip6_tnl_parm *p) +{ + struct in6_addr *remote = &p->raddr; + struct in6_addr *local = &p->laddr; + unsigned h = 0; + int prio = 0; + + if (!ipv6_addr_any(remote) || !ipv6_addr_any(local)) { + prio = 1; + h = HASH(remote) ^ HASH(local); + } + return &tnls[prio][h]; +} + +/** + * ip6ip6_tnl_link - add tunnel to hash table + * @t: tunnel to be added + **/ + +static void +ip6ip6_tnl_link(struct ip6_tnl *t) +{ + struct ip6_tnl **tp = ip6ip6_bucket(&t->parms); + + t->next = *tp; + write_lock_bh(&ip6ip6_lock); + *tp = t; + write_unlock_bh(&ip6ip6_lock); +} + +/** + * ip6ip6_tnl_unlink - remove tunnel from hash table + * @t: tunnel to be removed + **/ + +static void +ip6ip6_tnl_unlink(struct ip6_tnl *t) +{ + struct ip6_tnl **tp; + + for (tp = ip6ip6_bucket(&t->parms); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ip6ip6_lock); + *tp = t->next; + write_unlock_bh(&ip6ip6_lock); + break; + } + } +} + +/** + * ip6_tnl_create() - create a new tunnel + * @p: tunnel parameters + * @pt: pointer to new tunnel + * + * Description: + * Create tunnel matching given parameters. + * + * Return: + * 0 on success + **/ + +static int +ip6_tnl_create(struct ip6_tnl_parm *p, struct ip6_tnl **pt) +{ + struct net_device *dev; + struct ip6_tnl *t; + char name[IFNAMSIZ]; + int err; + + if (p->name[0]) { + strlcpy(name, p->name, IFNAMSIZ); + } else { + int i; + for (i = 1; i < IP6_TNL_MAX; i++) { + sprintf(name, "ip6tnl%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i == IP6_TNL_MAX) + return -ENOBUFS; + } + dev = alloc_netdev(sizeof (*t), name, ip6ip6_tnl_dev_setup); + if (dev == NULL) + return -ENOMEM; + + t = dev->priv; + dev->init = ip6ip6_tnl_dev_init; + t->parms = *p; + + if ((err = register_netdevice(dev)) < 0) { + free_netdev(dev); + return err; + } + dev_hold(dev); + + ip6ip6_tnl_link(t); + *pt = t; + return 0; +} + +/** + * ip6ip6_tnl_locate - find or create tunnel matching given parameters + * @p: tunnel parameters + * @create: != 0 if allowed to create new tunnel if no match found + * + * Description: + * ip6ip6_tnl_locate() first tries to locate an existing tunnel + * based on @parms. If this is unsuccessful, but @create is set a new + * tunnel device is created and registered for use. 
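
Tunnels live in HASH_SIZE (32) buckets: each endpoint address is folded by XOR-ing its four 32-bit words and masking with HASH_SIZE - 1, and a configured tunnel's bucket is the XOR of the two folded endpoints; a tunnel with both endpoints unset goes to the separate wildcard slot instead. A minimal sketch of the folding, using plain uint32_t[4] arrays in place of struct in6_addr and ignoring byte order, both assumptions of the sketch.

#include <stdio.h>
#include <stdint.h>

#define HASH_SIZE 32

/* Fold an IPv6 address, seen as four 32-bit words, into a bucket index. */
static unsigned int hash_addr(const uint32_t a[4])
{
	return (a[0] ^ a[1] ^ a[2] ^ a[3]) & (HASH_SIZE - 1);
}

/* Bucket used for a configured tunnel: XOR of both endpoint hashes. */
static unsigned int tunnel_bucket(const uint32_t local[4], const uint32_t remote[4])
{
	return hash_addr(remote) ^ hash_addr(local);
}

int main(void)
{
	const uint32_t local[4]  = { 0x20010db8, 0, 0, 1 };
	const uint32_t remote[4] = { 0x20010db8, 0, 0, 2 };

	printf("bucket %u of %u\n", tunnel_bucket(local, remote), HASH_SIZE);
	return 0;
}
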
+ * + * Return: + * 0 if tunnel located or created, + * -EINVAL if parameters incorrect, + * -ENODEV if no matching tunnel available + **/ + +static int +ip6ip6_tnl_locate(struct ip6_tnl_parm *p, struct ip6_tnl **pt, int create) +{ + struct in6_addr *remote = &p->raddr; + struct in6_addr *local = &p->laddr; + struct ip6_tnl *t; + + if (p->proto != IPPROTO_IPV6) + return -EINVAL; + + for (t = *ip6ip6_bucket(p); t; t = t->next) { + if (ipv6_addr_equal(local, &t->parms.laddr) && + ipv6_addr_equal(remote, &t->parms.raddr)) { + *pt = t; + return (create ? -EEXIST : 0); + } + } + if (!create) + return -ENODEV; + + return ip6_tnl_create(p, pt); +} + +/** + * ip6ip6_tnl_dev_uninit - tunnel device uninitializer + * @dev: the device to be destroyed + * + * Description: + * ip6ip6_tnl_dev_uninit() removes tunnel from its list + **/ + +static void +ip6ip6_tnl_dev_uninit(struct net_device *dev) +{ + struct ip6_tnl *t = dev->priv; + + if (dev == ip6ip6_fb_tnl_dev) { + write_lock_bh(&ip6ip6_lock); + tnls_wc[0] = NULL; + write_unlock_bh(&ip6ip6_lock); + } else { + ip6ip6_tnl_unlink(t); + } + ip6_tnl_dst_reset(t); + dev_put(dev); +} + +/** + * parse_tvl_tnl_enc_lim - handle encapsulation limit option + * @skb: received socket buffer + * + * Return: + * 0 if none was found, + * else index to encapsulation limit + **/ + +static __u16 +parse_tlv_tnl_enc_lim(struct sk_buff *skb, __u8 * raw) +{ + struct ipv6hdr *ipv6h = (struct ipv6hdr *) raw; + __u8 nexthdr = ipv6h->nexthdr; + __u16 off = sizeof (*ipv6h); + + while (ipv6_ext_hdr(nexthdr) && nexthdr != NEXTHDR_NONE) { + __u16 optlen = 0; + struct ipv6_opt_hdr *hdr; + if (raw + off + sizeof (*hdr) > skb->data && + !pskb_may_pull(skb, raw - skb->data + off + sizeof (*hdr))) + break; + + hdr = (struct ipv6_opt_hdr *) (raw + off); + if (nexthdr == NEXTHDR_FRAGMENT) { + struct frag_hdr *frag_hdr = (struct frag_hdr *) hdr; + if (frag_hdr->frag_off) + break; + optlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) { + optlen = (hdr->hdrlen + 2) << 2; + } else { + optlen = ipv6_optlen(hdr); + } + if (nexthdr == NEXTHDR_DEST) { + __u16 i = off + 2; + while (1) { + struct ipv6_tlv_tnl_enc_lim *tel; + + /* No more room for encapsulation limit */ + if (i + sizeof (*tel) > off + optlen) + break; + + tel = (struct ipv6_tlv_tnl_enc_lim *) &raw[i]; + /* return index of option if found and valid */ + if (tel->type == IPV6_TLV_TNL_ENCAP_LIMIT && + tel->length == 1) + return i; + /* else jump to next option */ + if (tel->type) + i += tel->length + 2; + else + i++; + } + } + nexthdr = hdr->nexthdr; + off += optlen; + } + return 0; +} + +/** + * ip6ip6_err - tunnel error handler + * + * Description: + * ip6ip6_err() should handle errors in the tunnel according + * to the specifications in RFC 2473. + **/ + +static void +ip6ip6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *ipv6h = (struct ipv6hdr *) skb->data; + struct ip6_tnl *t; + int rel_msg = 0; + int rel_type = ICMPV6_DEST_UNREACH; + int rel_code = ICMPV6_ADDR_UNREACH; + __u32 rel_info = 0; + __u16 len; + + /* If the packet doesn't contain the original IPv6 header we are + in trouble since we might need the source address for further + processing of the error. 
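
Inside a destination options header, parse_tlv_tnl_enc_lim() above walks type/length/value options one byte at a time: a zero type is a single Pad1 octet, anything else advances by length + 2 to skip the type and length octets as well as the value. A minimal sketch of that scan over an assumed option area, looking for the tunnel encapsulation limit option (type 4 per RFC 2473) and returning its offset or -1.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

#define OPT_PAD1          0
#define OPT_TNL_ENCAP_LIM 4	/* tunnel encapsulation limit, RFC 2473 */

/* Scan the option area of a destination options header for 'wanted'.
 * Returns the offset of the option's type octet, or -1 if absent. */
static int find_tlv(const uint8_t *opts, size_t len, uint8_t wanted)
{
	size_t i = 0;

	while (i + 2 <= len) {
		if (opts[i] == OPT_PAD1) {	/* one-byte padding */
			i++;
			continue;
		}
		if (opts[i] == wanted)
			return (int)i;
		i += opts[i + 1] + 2;		/* skip type, length, value */
	}
	return -1;
}

int main(void)
{
	/* PadN of one byte, then an encapsulation limit option with value 4 */
	const uint8_t opts[] = { 0x01, 0x01, 0x00, 0x04, 0x01, 0x04 };

	printf("offset %d\n", find_tlv(opts, sizeof(opts), OPT_TNL_ENCAP_LIM));
	return 0;
}
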
*/ + + read_lock(&ip6ip6_lock); + if ((t = ip6ip6_tnl_lookup(&ipv6h->daddr, &ipv6h->saddr)) == NULL) + goto out; + + switch (type) { + __u32 teli; + struct ipv6_tlv_tnl_enc_lim *tel; + __u32 mtu; + case ICMPV6_DEST_UNREACH: + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Path to destination invalid " + "or inactive!\n", t->parms.name); + rel_msg = 1; + break; + case ICMPV6_TIME_EXCEED: + if (code == ICMPV6_EXC_HOPLIMIT) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small hop limit or " + "routing loop in tunnel!\n", + t->parms.name); + rel_msg = 1; + } + break; + case ICMPV6_PARAMPROB: + /* ignore if parameter problem not caused by a tunnel + encapsulation limit sub-option */ + if (code != ICMPV6_HDR_FIELD) { + break; + } + teli = parse_tlv_tnl_enc_lim(skb, skb->data); + + if (teli && teli == ntohl(info) - 2) { + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->data[teli]; + if (tel->encap_limit == 0) { + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Too small encapsulation " + "limit or routing loop in " + "tunnel!\n", t->parms.name); + rel_msg = 1; + } + } + break; + case ICMPV6_PKT_TOOBIG: + mtu = ntohl(info) - offset; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + t->dev->mtu = mtu; + + if ((len = sizeof (*ipv6h) + ipv6h->payload_len) > mtu) { + rel_type = ICMPV6_PKT_TOOBIG; + rel_code = 0; + rel_info = mtu; + rel_msg = 1; + } + break; + } + if (rel_msg && pskb_may_pull(skb, offset + sizeof (*ipv6h))) { + struct rt6_info *rt; + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (!skb2) + goto out; + + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, offset); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt = rt6_lookup(&skb2->nh.ipv6h->saddr, NULL, 0, 0); + + if (rt && rt->rt6i_dev) + skb2->dev = rt->rt6i_dev; + + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + + if (rt) + dst_release(&rt->u.dst); + + kfree_skb(skb2); + } +out: + read_unlock(&ip6ip6_lock); +} + +static inline void ip6ip6_ecn_decapsulate(struct ipv6hdr *outer_iph, + struct sk_buff *skb) +{ + struct ipv6hdr *inner_iph = skb->nh.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/** + * ip6ip6_rcv - decapsulate IPv6 packet and retransmit it locally + * @skb: received socket buffer + * + * Return: 0 + **/ + +static int +ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct ipv6hdr *ipv6h; + struct ip6_tnl *t; + + if (!pskb_may_pull(skb, sizeof (*ipv6h))) + goto discard; + + ipv6h = skb->nh.ipv6h; + + read_lock(&ip6ip6_lock); + + if ((t = ip6ip6_tnl_lookup(&ipv6h->saddr, &ipv6h->daddr)) != NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return 0; + } + + if (!(t->parms.flags & IP6_TNL_F_CAP_RCV)) { + t->stat.rx_dropped++; + read_unlock(&ip6ip6_lock); + goto discard; + } + secpath_reset(skb); + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); + skb->dev = t->dev; + dst_release(skb->dst); + skb->dst = NULL; + if (t->parms.flags & IP6_TNL_F_RCV_DSCP_COPY) + ipv6_copy_dscp(ipv6h, skb->nh.ipv6h); + ip6ip6_ecn_decapsulate(ipv6h, skb); + t->stat.rx_packets++; + t->stat.rx_bytes += skb->len; + netif_rx(skb); + read_unlock(&ip6ip6_lock); + return 0; + } + read_unlock(&ip6ip6_lock); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); +discard: + return 1; +} + +static inline struct 
ipv6_txoptions *create_tel(__u8 encap_limit) +{ + struct ipv6_tlv_tnl_enc_lim *tel; + struct ipv6_txoptions *opt; + __u8 *raw; + + int opt_len = sizeof(*opt) + 8; + + if (!(opt = kmalloc(opt_len, GFP_ATOMIC))) { + return NULL; + } + memset(opt, 0, opt_len); + opt->tot_len = opt_len; + opt->dst0opt = (struct ipv6_opt_hdr *) (opt + 1); + opt->opt_nflen = 8; + + tel = (struct ipv6_tlv_tnl_enc_lim *) (opt->dst0opt + 1); + tel->type = IPV6_TLV_TNL_ENCAP_LIMIT; + tel->length = 1; + tel->encap_limit = encap_limit; + + raw = (__u8 *) opt->dst0opt; + raw[5] = IPV6_TLV_PADN; + raw[6] = 1; + + return opt; +} + +/** + * ip6ip6_tnl_addr_conflict - compare packet addresses to tunnel's own + * @t: the outgoing tunnel device + * @hdr: IPv6 header from the incoming packet + * + * Description: + * Avoid trivial tunneling loop by checking that tunnel exit-point + * doesn't match source of incoming packet. + * + * Return: + * 1 if conflict, + * 0 else + **/ + +static inline int +ip6ip6_tnl_addr_conflict(struct ip6_tnl *t, struct ipv6hdr *hdr) +{ + return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); +} + +/** + * ip6ip6_tnl_xmit - encapsulate packet and send + * @skb: the outgoing socket buffer + * @dev: the outgoing tunnel device + * + * Description: + * Build new header and do some sanity checks on the packet before sending + * it. + * + * Return: + * 0 + **/ + +static int +ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + struct net_device_stats *stats = &t->stat; + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + struct ipv6_txoptions *opt = NULL; + int encap_limit = -1; + __u16 offset; + struct flowi fl; + struct dst_entry *dst; + struct net_device *tdev; + int mtu; + int max_headroom = sizeof(struct ipv6hdr); + u8 proto; + int err; + int pkt_len; + int dsfield; + + if (t->recursion++) { + stats->collisions++; + goto tx_err; + } + if (skb->protocol != htons(ETH_P_IPV6) || + !(t->parms.flags & IP6_TNL_F_CAP_XMIT) || + ip6ip6_tnl_addr_conflict(t, ipv6h)) { + goto tx_err; + } + if ((offset = parse_tlv_tnl_enc_lim(skb, skb->nh.raw)) > 0) { + struct ipv6_tlv_tnl_enc_lim *tel; + tel = (struct ipv6_tlv_tnl_enc_lim *) &skb->nh.raw[offset]; + if (tel->encap_limit == 0) { + icmpv6_send(skb, ICMPV6_PARAMPROB, + ICMPV6_HDR_FIELD, offset + 2, skb->dev); + goto tx_err; + } + encap_limit = tel->encap_limit - 1; + } else if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) { + encap_limit = t->parms.encap_limit; + } + memcpy(&fl, &t->fl, sizeof (fl)); + proto = fl.proto; + + dsfield = ipv6_get_dsfield(ipv6h); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_TCLASS)) + fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_TCLASS_MASK); + if ((t->parms.flags & IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl.fl6_flowlabel |= (*(__u32 *) ipv6h & IPV6_FLOWLABEL_MASK); + + if (encap_limit >= 0 && (opt = create_tel(encap_limit)) == NULL) + goto tx_err; + + if ((dst = ip6_tnl_dst_check(t)) != NULL) + dst_hold(dst); + else + dst = ip6_route_output(NULL, &fl); + + if (dst->error || xfrm_lookup(&dst, &fl, NULL, 0) < 0) + goto tx_err_link_failure; + + tdev = dst->dev; + + if (tdev == dev) { + stats->collisions++; + if (net_ratelimit()) + printk(KERN_WARNING + "%s: Local routing loop detected!\n", + t->parms.name); + goto tx_err_dst_release; + } + mtu = dst_mtu(dst) - sizeof (*ipv6h); + if (opt) { + max_headroom += 8; + mtu -= 8; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (skb->dst && mtu < dst_mtu(skb->dst)) { + struct rt6_info *rt = (struct rt6_info *) skb->dst; + rt->rt6i_flags |= 
RTF_MODIFIED; + rt->u.dst.metrics[RTAX_MTU-1] = mtu; + } + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + goto tx_err_dst_release; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom += LL_RESERVED_SPACE(tdev); + + if (skb_headroom(skb) < max_headroom || + skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb; + + if (!(new_skb = skb_realloc_headroom(skb, max_headroom))) + goto tx_err_dst_release; + + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + kfree_skb(skb); + skb = new_skb; + } + dst_release(skb->dst); + skb->dst = dst_clone(dst); + + skb->h.raw = skb->nh.raw; + + if (opt) + ipv6_push_nfrag_opts(skb, opt, &proto, NULL); + + skb->nh.raw = skb_push(skb, sizeof(struct ipv6hdr)); + ipv6h = skb->nh.ipv6h; + *(u32*)ipv6h = fl.fl6_flowlabel | htonl(0x60000000); + dsfield = INET_ECN_encapsulate(0, dsfield); + ipv6_change_dsfield(ipv6h, ~INET_ECN_MASK, dsfield); + ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + ipv6h->hop_limit = t->parms.hop_limit; + ipv6h->nexthdr = proto; + ipv6_addr_copy(&ipv6h->saddr, &fl.fl6_src); + ipv6_addr_copy(&ipv6h->daddr, &fl.fl6_dst); + nf_reset(skb); + pkt_len = skb->len; + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, + skb->dst->dev, dst_output); + + if (err == NET_XMIT_SUCCESS || err == NET_XMIT_CN) { + stats->tx_bytes += pkt_len; + stats->tx_packets++; + } else { + stats->tx_errors++; + stats->tx_aborted_errors++; + } + ip6_tnl_dst_store(t, dst); + + if (opt) + kfree(opt); + + t->recursion--; + return 0; +tx_err_link_failure: + stats->tx_carrier_errors++; + dst_link_failure(skb); +tx_err_dst_release: + dst_release(dst); + if (opt) + kfree(opt); +tx_err: + stats->tx_errors++; + stats->tx_dropped++; + kfree_skb(skb); + t->recursion--; + return 0; +} + +static void ip6_tnl_set_cap(struct ip6_tnl *t) +{ + struct ip6_tnl_parm *p = &t->parms; + struct in6_addr *laddr = &p->laddr; + struct in6_addr *raddr = &p->raddr; + int ltype = ipv6_addr_type(laddr); + int rtype = ipv6_addr_type(raddr); + + p->flags &= ~(IP6_TNL_F_CAP_XMIT|IP6_TNL_F_CAP_RCV); + + if (ltype != IPV6_ADDR_ANY && rtype != IPV6_ADDR_ANY && + ((ltype|rtype) & + (IPV6_ADDR_UNICAST| + IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL| + IPV6_ADDR_MAPPED|IPV6_ADDR_RESERVED)) == IPV6_ADDR_UNICAST) { + struct net_device *ldev = NULL; + int l_ok = 1; + int r_ok = 1; + + if (p->link) + ldev = dev_get_by_index(p->link); + + if (ltype&IPV6_ADDR_UNICAST && !ipv6_chk_addr(laddr, ldev, 0)) + l_ok = 0; + + if (rtype&IPV6_ADDR_UNICAST && ipv6_chk_addr(raddr, NULL, 0)) + r_ok = 0; + + if (l_ok && r_ok) { + if (ltype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_XMIT; + if (rtype&IPV6_ADDR_UNICAST) + p->flags |= IP6_TNL_F_CAP_RCV; + } + if (ldev) + dev_put(ldev); + } +} + +static void ip6ip6_tnl_link_config(struct ip6_tnl *t) +{ + struct net_device *dev = t->dev; + struct ip6_tnl_parm *p = &t->parms; + struct flowi *fl = &t->fl; + + memcpy(&dev->dev_addr, &p->laddr, sizeof(struct in6_addr)); + memcpy(&dev->broadcast, &p->raddr, sizeof(struct in6_addr)); + + /* Set up flowi template */ + ipv6_addr_copy(&fl->fl6_src, &p->laddr); + ipv6_addr_copy(&fl->fl6_dst, &p->raddr); + fl->oif = p->link; + fl->fl6_flowlabel = 0; + + if (!(p->flags&IP6_TNL_F_USE_ORIG_TCLASS)) + fl->fl6_flowlabel |= IPV6_TCLASS_MASK & p->flowinfo; + if (!(p->flags&IP6_TNL_F_USE_ORIG_FLOWLABEL)) + fl->fl6_flowlabel |= IPV6_FLOWLABEL_MASK & p->flowinfo; + + ip6_tnl_set_cap(t); + + if (p->flags&IP6_TNL_F_CAP_XMIT && p->flags&IP6_TNL_F_CAP_RCV) + dev->flags 
|= IFF_POINTOPOINT; + else + dev->flags &= ~IFF_POINTOPOINT; + + dev->iflink = p->link; + + if (p->flags & IP6_TNL_F_CAP_XMIT) { + struct rt6_info *rt = rt6_lookup(&p->raddr, &p->laddr, + p->link, 0); + + if (rt == NULL) + return; + + if (rt->rt6i_dev) { + dev->hard_header_len = rt->rt6i_dev->hard_header_len + + sizeof (struct ipv6hdr); + + dev->mtu = rt->rt6i_dev->mtu - sizeof (struct ipv6hdr); + + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dst_release(&rt->u.dst); + } +} + +/** + * ip6ip6_tnl_change - update the tunnel parameters + * @t: tunnel to be changed + * @p: tunnel configuration parameters + * + * Description: + * ip6ip6_tnl_change() updates the tunnel parameters + **/ + +static int +ip6ip6_tnl_change(struct ip6_tnl *t, struct ip6_tnl_parm *p) +{ + ipv6_addr_copy(&t->parms.laddr, &p->laddr); + ipv6_addr_copy(&t->parms.raddr, &p->raddr); + t->parms.flags = p->flags; + t->parms.hop_limit = p->hop_limit; + t->parms.encap_limit = p->encap_limit; + t->parms.flowinfo = p->flowinfo; + ip6ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6ip6_tnl_ioctl - configure ipv6 tunnels from userspace + * @dev: virtual device associated with tunnel + * @ifr: parameters passed from userspace + * @cmd: command to be performed + * + * Description: + * ip6ip6_tnl_ioctl() is used for managing IPv6 tunnels + * from userspace. + * + * The possible commands are the following: + * %SIOCGETTUNNEL: get tunnel parameters for device + * %SIOCADDTUNNEL: add tunnel matching given tunnel parameters + * %SIOCCHGTUNNEL: change tunnel parameters to those given + * %SIOCDELTUNNEL: delete tunnel + * + * The fallback device "ip6tnl0", created during module + * initialization, can be used for creating other tunnel devices.
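+ * + * Tunnel parameters are exchanged as a struct ip6_tnl_parm through + * @ifr->ifr_ifru.ifru_data; %SIOCADDTUNNEL, %SIOCCHGTUNNEL and + * %SIOCDELTUNNEL require %CAP_NET_ADMIN.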
+ * + * Return: + * 0 on success, + * %-EFAULT if unable to copy data to or from userspace, + * %-EPERM if current process hasn't %CAP_NET_ADMIN set + * %-EINVAL if passed tunnel parameters are invalid, + * %-EEXIST if changing a tunnel's parameters would cause a conflict + * %-ENODEV if attempting to change or delete a nonexisting device + **/ + +static int +ip6ip6_tnl_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + int create; + struct ip6_tnl_parm p; + struct ip6_tnl *t = NULL; + + switch (cmd) { + case SIOCGETTUNNEL: + if (dev == ip6ip6_fb_tnl_dev) { + if (copy_from_user(&p, + ifr->ifr_ifru.ifru_data, + sizeof (p))) { + err = -EFAULT; + break; + } + if ((err = ip6ip6_tnl_locate(&p, &t, 0)) == -ENODEV) + t = (struct ip6_tnl *) dev->priv; + else if (err) + break; + } else + t = (struct ip6_tnl *) dev->priv; + + memcpy(&p, &t->parms, sizeof (p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof (p))) { + err = -EFAULT; + } + break; + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + create = (cmd == SIOCADDTUNNEL); + if (!capable(CAP_NET_ADMIN)) + break; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof (p))) { + err = -EFAULT; + break; + } + if (!create && dev != ip6ip6_fb_tnl_dev) { + t = (struct ip6_tnl *) dev->priv; + } + if (!t && (err = ip6ip6_tnl_locate(&p, &t, create))) { + break; + } + if (cmd == SIOCCHGTUNNEL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + ip6ip6_tnl_unlink(t); + err = ip6ip6_tnl_change(t, &p); + ip6ip6_tnl_link(t); + netdev_state_change(dev); + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, + &t->parms, sizeof (p))) { + err = -EFAULT; + } else { + err = 0; + } + break; + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + + if (dev == ip6ip6_fb_tnl_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, + sizeof (p))) { + err = -EFAULT; + break; + } + err = ip6ip6_tnl_locate(&p, &t, 0); + if (err) + break; + if (t == ip6ip6_fb_tnl_dev->priv) { + err = -EPERM; + break; + } + } else { + t = (struct ip6_tnl *) dev->priv; + } + err = unregister_netdevice(t->dev); + break; + default: + err = -EINVAL; + } + return err; +} + +/** + * ip6ip6_tnl_get_stats - return the stats for tunnel device + * @dev: virtual device associated with tunnel + * + * Return: stats for device + **/ + +static struct net_device_stats * +ip6ip6_tnl_get_stats(struct net_device *dev) +{ + return &(((struct ip6_tnl *) dev->priv)->stat); +} + +/** + * ip6ip6_tnl_change_mtu - change mtu manually for tunnel device + * @dev: virtual device associated with tunnel + * @new_mtu: the new mtu + * + * Return: + * 0 on success, + * %-EINVAL if mtu too small + **/ + +static int +ip6ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU) { + return -EINVAL; + } + dev->mtu = new_mtu; + return 0; +} + +/** + * ip6ip6_tnl_dev_setup - setup virtual tunnel device + * @dev: virtual device associated with tunnel + * + * Description: + * Initialize function pointers and device parameters + **/ + +static void ip6ip6_tnl_dev_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ip6ip6_tnl_dev_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ip6ip6_tnl_xmit; + dev->get_stats = ip6ip6_tnl_get_stats; + dev->do_ioctl = ip6ip6_tnl_ioctl; + dev->change_mtu = ip6ip6_tnl_change_mtu; + + dev->type = ARPHRD_TUNNEL6; + dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); + dev->mtu = ETH_DATA_LEN - sizeof (struct ipv6hdr); + dev->flags |= IFF_NOARP; + 
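+ /* dev_addr and broadcast hold the tunnel's local and remote IPv6 endpoints (set up in ip6ip6_tnl_link_config()) */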
dev->addr_len = sizeof(struct in6_addr); +} + + +/** + * ip6ip6_tnl_dev_init_gen - general initializer for all tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static inline void +ip6ip6_tnl_dev_init_gen(struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + t->fl.proto = IPPROTO_IPV6; + t->dev = dev; + strcpy(t->parms.name, dev->name); +} + +/** + * ip6ip6_tnl_dev_init - initializer for all non fallback tunnel devices + * @dev: virtual device associated with tunnel + **/ + +static int +ip6ip6_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = (struct ip6_tnl *) dev->priv; + ip6ip6_tnl_dev_init_gen(dev); + ip6ip6_tnl_link_config(t); + return 0; +} + +/** + * ip6ip6_fb_tnl_dev_init - initializer for fallback tunnel device + * @dev: fallback device + * + * Return: 0 + **/ + +static int +ip6ip6_fb_tnl_dev_init(struct net_device *dev) +{ + struct ip6_tnl *t = dev->priv; + ip6ip6_tnl_dev_init_gen(dev); + dev_hold(dev); + tnls_wc[0] = t; + return 0; +} + +static struct xfrm6_tunnel ip6ip6_handler = { + .handler = ip6ip6_rcv, + .err_handler = ip6ip6_err, +}; + +/** + * ip6_tunnel_init - register protocol and reserve needed resources + * + * Return: 0 on success + **/ + +static int __init ip6_tunnel_init(void) +{ + int err; + + if (xfrm6_tunnel_register(&ip6ip6_handler) < 0) { + printk(KERN_ERR "ip6ip6 init: can't register tunnel\n"); + return -EAGAIN; + } + ip6ip6_fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0", + ip6ip6_tnl_dev_setup); + + if (!ip6ip6_fb_tnl_dev) { + err = -ENOMEM; + goto fail; + } + ip6ip6_fb_tnl_dev->init = ip6ip6_fb_tnl_dev_init; + + if ((err = register_netdev(ip6ip6_fb_tnl_dev))) { + free_netdev(ip6ip6_fb_tnl_dev); + goto fail; + } + return 0; +fail: + xfrm6_tunnel_deregister(&ip6ip6_handler); + return err; +} + +/** + * ip6_tunnel_cleanup - free resources and unregister protocol + **/ + +static void __exit ip6_tunnel_cleanup(void) +{ + if (xfrm6_tunnel_deregister(&ip6ip6_handler) < 0) + printk(KERN_INFO "ip6ip6 close: can't deregister tunnel\n"); + + unregister_netdev(ip6ip6_fb_tnl_dev); +} + +module_init(ip6_tunnel_init); +module_exit(ip6_tunnel_cleanup); diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c new file mode 100644 index 000000000000..6cde5310cd76 --- /dev/null +++ b/net/ipv6/ipcomp6.c @@ -0,0 +1,524 @@ +/* + * IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173 + * + * Copyright (C)2003 USAGI/WIDE Project + * + * Author Mitsuru KANDA + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ +/* + * [Memo] + * + * Outbound: + * The compression of IP datagram MUST be done before AH/ESP processing, + * fragmentation, and the addition of Hop-by-Hop/Routing header. + * + * Inbound: + * The decompression of IP datagram MUST be done after the reassembly, + * AH/ESP processing. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct ipcomp6_tfms { + struct list_head list; + struct crypto_tfm **tfms; + int users; +}; + +static DECLARE_MUTEX(ipcomp6_resource_sem); +static void **ipcomp6_scratches; +static int ipcomp6_scratch_users; +static LIST_HEAD(ipcomp6_tfms_list); + +static int ipcomp6_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + int err = 0; + u8 nexthdr = 0; + int hdr_len = skb->h.raw - skb->nh.raw; + unsigned char *tmp_hdr = NULL; + struct ipv6hdr *iph; + int plen, dlen; + struct ipcomp_data *ipcd = x->data; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + err = -ENOMEM; + goto out; + } + + skb->ip_summed = CHECKSUM_NONE; + + /* Remove ipcomp header and decompress original payload */ + iph = skb->nh.ipv6h; + tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); + if (!tmp_hdr) + goto out; + memcpy(tmp_hdr, iph, hdr_len); + nexthdr = *(u8 *)skb->data; + skb_pull(skb, sizeof(struct ipv6_comp_hdr)); + skb->nh.raw += sizeof(struct ipv6_comp_hdr); + memcpy(skb->nh.raw, tmp_hdr, hdr_len); + iph = skb->nh.ipv6h; + iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); + skb->h.raw = skb->data; + + /* decompression */ + plen = skb->len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->data; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_decompress(tfm, start, plen, scratch, &dlen); + if (err) { + err = -EINVAL; + goto out_put_cpu; + } + + if (dlen < (plen + sizeof(struct ipv6_comp_hdr))) { + err = -EINVAL; + goto out_put_cpu; + } + + err = pskb_expand_head(skb, 0, dlen - plen, GFP_ATOMIC); + if (err) { + goto out_put_cpu; + } + + skb_put(skb, dlen - plen); + memcpy(skb->data, scratch, dlen); + + iph = skb->nh.ipv6h; + iph->payload_len = htons(skb->len); + +out_put_cpu: + put_cpu(); +out: + if (tmp_hdr) + kfree(tmp_hdr); + if (err) + goto error_out; + return nexthdr; +error_out: + return err; +} + +static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int err; + struct ipv6hdr *top_iph; + int hdr_len; + struct ipv6_comp_hdr *ipch; + struct ipcomp_data *ipcd = x->data; + int plen, dlen; + u8 *start, *scratch; + struct crypto_tfm *tfm; + int cpu; + + hdr_len = skb->h.raw - skb->data; + + /* check whether datagram len is larger than threshold */ + if ((skb->len - hdr_len) < ipcd->threshold) { + goto out_ok; + } + + if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && + skb_linearize(skb, GFP_ATOMIC) != 0) { + goto out_ok; + } + + /* compression */ + plen = skb->len - hdr_len; + dlen = IPCOMP_SCRATCH_SIZE; + start = skb->h.raw; + + cpu = get_cpu(); + scratch = *per_cpu_ptr(ipcomp6_scratches, cpu); + tfm = *per_cpu_ptr(ipcd->tfms, cpu); + + err = crypto_comp_compress(tfm, start, plen, scratch, &dlen); + if (err || (dlen + sizeof(struct ipv6_comp_hdr)) >= plen) { + put_cpu(); + goto out_ok; + } + memcpy(start + sizeof(struct ip_comp_hdr), scratch, dlen); + put_cpu(); + pskb_trim(skb, hdr_len + dlen + sizeof(struct ip_comp_hdr)); + + /* insert ipcomp header and replace datagram */ + top_iph = (struct ipv6hdr *)skb->data; + + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + ipch = (struct ipv6_comp_hdr *)start; + ipch->nexthdr = *skb->nh.raw; + 
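+ /* the IPComp header carries the original next header value; the CPI below is the low-order 16 bits of the SPI (RFC 3173) */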
ipch->flags = 0; + ipch->cpi = htons((u16 )ntohl(x->id.spi)); + *skb->nh.raw = IPPROTO_COMP; + +out_ok: + return 0; +} + +static void ipcomp6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + u32 spi; + struct ipv6hdr *iph = (struct ipv6hdr*)skb->data; + struct ipv6_comp_hdr *ipcomph = (struct ipv6_comp_hdr*)(skb->data+offset); + struct xfrm_state *x; + + if (type != ICMPV6_DEST_UNREACH && type != ICMPV6_PKT_TOOBIG) + return; + + spi = ntohl(ntohs(ipcomph->cpi)); + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, IPPROTO_COMP, AF_INET6); + if (!x) + return; + + printk(KERN_DEBUG "pmtu discovery on SA IPCOMP/%08x/" + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + spi, NIP6(iph->daddr)); + xfrm_state_put(x); +} + +static struct xfrm_state *ipcomp6_tunnel_create(struct xfrm_state *x) +{ + struct xfrm_state *t = NULL; + + t = xfrm_state_alloc(); + if (!t) + goto out; + + t->id.proto = IPPROTO_IPV6; + t->id.spi = xfrm6_tunnel_alloc_spi((xfrm_address_t *)&x->props.saddr); + memcpy(t->id.daddr.a6, x->id.daddr.a6, sizeof(struct in6_addr)); + memcpy(&t->sel, &x->sel, sizeof(t->sel)); + t->props.family = AF_INET6; + t->props.mode = 1; + memcpy(t->props.saddr.a6, x->props.saddr.a6, sizeof(struct in6_addr)); + + t->type = xfrm_get_type(IPPROTO_IPV6, t->props.family); + if (t->type == NULL) + goto error; + + if (t->type->init_state(t, NULL)) + goto error; + + t->km.state = XFRM_STATE_VALID; + atomic_set(&t->tunnel_users, 1); + +out: + return t; + +error: + xfrm_state_put(t); + goto out; +} + +static int ipcomp6_tunnel_attach(struct xfrm_state *x) +{ + int err = 0; + struct xfrm_state *t = NULL; + u32 spi; + + spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&x->props.saddr); + if (spi) + t = xfrm_state_lookup((xfrm_address_t *)&x->id.daddr, + spi, IPPROTO_IPV6, AF_INET6); + if (!t) { + t = ipcomp6_tunnel_create(x); + if (!t) { + err = -EINVAL; + goto out; + } + xfrm_state_insert(t); + xfrm_state_hold(t); + } + x->tunnel = t; + atomic_inc(&t->tunnel_users); + +out: + return err; +} + +static void ipcomp6_free_scratches(void) +{ + int i; + void **scratches; + + if (--ipcomp6_scratch_users) + return; + + scratches = ipcomp6_scratches; + if (!scratches) + return; + + for_each_cpu(i) { + void *scratch = *per_cpu_ptr(scratches, i); + if (scratch) + vfree(scratch); + } + + free_percpu(scratches); +} + +static void **ipcomp6_alloc_scratches(void) +{ + int i; + void **scratches; + + if (ipcomp6_scratch_users++) + return ipcomp6_scratches; + + scratches = alloc_percpu(void *); + if (!scratches) + return NULL; + + ipcomp6_scratches = scratches; + + for_each_cpu(i) { + void *scratch = vmalloc(IPCOMP_SCRATCH_SIZE); + if (!scratch) + return NULL; + *per_cpu_ptr(scratches, i) = scratch; + } + + return scratches; +} + +static void ipcomp6_free_tfms(struct crypto_tfm **tfms) +{ + struct ipcomp6_tfms *pos; + int cpu; + + list_for_each_entry(pos, &ipcomp6_tfms_list, list) { + if (pos->tfms == tfms) + break; + } + + BUG_TRAP(pos); + + if (--pos->users) + return; + + list_del(&pos->list); + kfree(pos); + + if (!tfms) + return; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = *per_cpu_ptr(tfms, cpu); + if (tfm) + crypto_free_tfm(tfm); + } + free_percpu(tfms); +} + +static struct crypto_tfm **ipcomp6_alloc_tfms(const char *alg_name) +{ + struct ipcomp6_tfms *pos; + struct crypto_tfm **tfms; + int cpu; + + /* This can be any valid CPU ID so we don't need locking. 
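+ Every per-cpu tfm of an entry uses the same algorithm, so any cpu's copy yields the same algorithm name.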
*/ + cpu = smp_processor_id(); + + list_for_each_entry(pos, &ipcomp6_tfms_list, list) { + struct crypto_tfm *tfm; + + tfms = pos->tfms; + tfm = *per_cpu_ptr(tfms, cpu); + + if (!strcmp(crypto_tfm_alg_name(tfm), alg_name)) { + pos->users++; + return tfms; + } + } + + pos = kmalloc(sizeof(*pos), GFP_KERNEL); + if (!pos) + return NULL; + + pos->users = 1; + INIT_LIST_HEAD(&pos->list); + list_add(&pos->list, &ipcomp6_tfms_list); + + pos->tfms = tfms = alloc_percpu(struct crypto_tfm *); + if (!tfms) + goto error; + + for_each_cpu(cpu) { + struct crypto_tfm *tfm = crypto_alloc_tfm(alg_name, 0); + if (!tfm) + goto error; + *per_cpu_ptr(tfms, cpu) = tfm; + } + + return tfms; + +error: + ipcomp6_free_tfms(tfms); + return NULL; +} + +static void ipcomp6_free_data(struct ipcomp_data *ipcd) +{ + if (ipcd->tfms) + ipcomp6_free_tfms(ipcd->tfms); + ipcomp6_free_scratches(); +} + +static void ipcomp6_destroy(struct xfrm_state *x) +{ + struct ipcomp_data *ipcd = x->data; + if (!ipcd) + return; + xfrm_state_delete_tunnel(x); + down(&ipcomp6_resource_sem); + ipcomp6_free_data(ipcd); + up(&ipcomp6_resource_sem); + kfree(ipcd); + + xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); +} + +static int ipcomp6_init_state(struct xfrm_state *x, void *args) +{ + int err; + struct ipcomp_data *ipcd; + struct xfrm_algo_desc *calg_desc; + + err = -EINVAL; + if (!x->calg) + goto out; + + if (x->encap) + goto out; + + err = -ENOMEM; + ipcd = kmalloc(sizeof(*ipcd), GFP_KERNEL); + if (!ipcd) + goto out; + + memset(ipcd, 0, sizeof(*ipcd)); + x->props.header_len = 0; + if (x->props.mode) + x->props.header_len += sizeof(struct ipv6hdr); + + down(&ipcomp6_resource_sem); + if (!ipcomp6_alloc_scratches()) + goto error; + + ipcd->tfms = ipcomp6_alloc_tfms(x->calg->alg_name); + if (!ipcd->tfms) + goto error; + up(&ipcomp6_resource_sem); + + if (x->props.mode) { + err = ipcomp6_tunnel_attach(x); + if (err) + goto error_tunnel; + } + + calg_desc = xfrm_calg_get_byname(x->calg->alg_name, 0); + BUG_ON(!calg_desc); + ipcd->threshold = calg_desc->uinfo.comp.threshold; + x->data = ipcd; + err = 0; +out: + return err; +error_tunnel: + down(&ipcomp6_resource_sem); +error: + ipcomp6_free_data(ipcd); + up(&ipcomp6_resource_sem); + kfree(ipcd); + + goto out; +} + +static struct xfrm_type ipcomp6_type = +{ + .description = "IPCOMP6", + .owner = THIS_MODULE, + .proto = IPPROTO_COMP, + .init_state = ipcomp6_init_state, + .destructor = ipcomp6_destroy, + .input = ipcomp6_input, + .output = ipcomp6_output, +}; + +static struct inet6_protocol ipcomp6_protocol = +{ + .handler = xfrm6_rcv, + .err_handler = ipcomp6_err, + .flags = INET6_PROTO_NOPOLICY, +}; + +static int __init ipcomp6_init(void) +{ + if (xfrm_register_type(&ipcomp6_type, AF_INET6) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) { + printk(KERN_INFO "ipcomp6 init: can't add protocol\n"); + xfrm_unregister_type(&ipcomp6_type, AF_INET6); + return -EAGAIN; + } + return 0; +} + +static void __exit ipcomp6_fini(void) +{ + if (inet6_del_protocol(&ipcomp6_protocol, IPPROTO_COMP) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove protocol\n"); + if (xfrm_unregister_type(&ipcomp6_type, AF_INET6) < 0) + printk(KERN_INFO "ipv6 ipcomp close: can't remove xfrm type\n"); +} + +module_init(ipcomp6_init); +module_exit(ipcomp6_fini); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IP Payload Compression Protocol (IPComp) for IPv6 - RFC3173"); +MODULE_AUTHOR("Mitsuru KANDA "); + + diff --git 
a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c new file mode 100644 index 000000000000..279ab86be662 --- /dev/null +++ b/net/ipv6/ipv6_sockglue.c @@ -0,0 +1,704 @@ +/* + * IPv6 BSD socket options interface + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/net/ipv4/ip_sockglue.c + * + * $Id: ipv6_sockglue.c,v 1.41 2002/02/01 22:01:04 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * FIXME: Make the setsockopt code POSIX compliant: That is + * + * o Return -EINVAL for setsockopt of short lengths + * o Truncate getsockopt returns + * o Return an optlen of the truncated length if need be + * + * Changes: + * David L Stevens : + * - added multicast source filtering API for MLDv2 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +DEFINE_SNMP_STAT(struct ipstats_mib, ipv6_statistics); + +static struct packet_type ipv6_packet_type = { + .type = __constant_htons(ETH_P_IPV6), + .func = ipv6_rcv, +}; + +struct ip6_ra_chain *ip6_ra_chain; +DEFINE_RWLOCK(ip6_ra_lock); + +int ip6_ra_control(struct sock *sk, int sel, void (*destructor)(struct sock *)) +{ + struct ip6_ra_chain *ra, *new_ra, **rap; + + /* RA packet may be delivered ONLY to IPPROTO_RAW socket */ + if (sk->sk_type != SOCK_RAW || inet_sk(sk)->num != IPPROTO_RAW) + return -EINVAL; + + new_ra = (sel>=0) ? kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; + + write_lock_bh(&ip6_ra_lock); + for (rap = &ip6_ra_chain; (ra=*rap) != NULL; rap = &ra->next) { + if (ra->sk == sk) { + if (sel>=0) { + write_unlock_bh(&ip6_ra_lock); + if (new_ra) + kfree(new_ra); + return -EADDRINUSE; + } + + *rap = ra->next; + write_unlock_bh(&ip6_ra_lock); + + if (ra->destructor) + ra->destructor(sk); + sock_put(sk); + kfree(ra); + return 0; + } + } + if (new_ra == NULL) { + write_unlock_bh(&ip6_ra_lock); + return -ENOBUFS; + } + new_ra->sk = sk; + new_ra->sel = sel; + new_ra->destructor = destructor; + new_ra->next = ra; + *rap = new_ra; + sock_hold(sk); + write_unlock_bh(&ip6_ra_lock); + return 0; +} + +extern int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr); +extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); +extern int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen); + + +int ipv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + int val, valbool; + int retv = -ENOPROTOOPT; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.setsockopt(sk, level, optname, optval, optlen); + + if(level!=SOL_IPV6) + goto out; + + if (optval == NULL) + val=0; + else if (get_user(val, (int __user *) optval)) + return -EFAULT; + + valbool = (val!=0); + + lock_sock(sk); + + switch (optname) { + + case IPV6_ADDRFORM: + if (val == PF_INET) { + struct ipv6_txoptions *opt; + struct sk_buff *pktopt; + + if (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) + break; + + if (sk->sk_state != TCP_ESTABLISHED) { + retv = -ENOTCONN; + break; + } + + if 
(ipv6_only_sock(sk) || + !(ipv6_addr_type(&np->daddr) & IPV6_ADDR_MAPPED)) { + retv = -EADDRNOTAVAIL; + break; + } + + fl6_free_socklist(sk); + ipv6_sock_mc_close(sk); + + if (sk->sk_protocol == IPPROTO_TCP) { + struct tcp_sock *tp = tcp_sk(sk); + + local_bh_disable(); + sock_prot_dec_use(sk->sk_prot); + sock_prot_inc_use(&tcp_prot); + local_bh_enable(); + sk->sk_prot = &tcp_prot; + tp->af_specific = &ipv4_specific; + sk->sk_socket->ops = &inet_stream_ops; + sk->sk_family = PF_INET; + tcp_sync_mss(sk, tp->pmtu_cookie); + } else { + local_bh_disable(); + sock_prot_dec_use(sk->sk_prot); + sock_prot_inc_use(&udp_prot); + local_bh_enable(); + sk->sk_prot = &udp_prot; + sk->sk_socket->ops = &inet_dgram_ops; + sk->sk_family = PF_INET; + } + opt = xchg(&np->opt, NULL); + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + pktopt = xchg(&np->pktoptions, NULL); + if (pktopt) + kfree_skb(pktopt); + + sk->sk_destruct = inet_sock_destruct; +#ifdef INET_REFCNT_DEBUG + atomic_dec(&inet6_sock_nr); +#endif + module_put(THIS_MODULE); + retv = 0; + break; + } + goto e_inval; + + case IPV6_V6ONLY: + if (inet_sk(sk)->num) + goto e_inval; + np->ipv6only = valbool; + retv = 0; + break; + + case IPV6_PKTINFO: + np->rxopt.bits.rxinfo = valbool; + retv = 0; + break; + + case IPV6_HOPLIMIT: + np->rxopt.bits.rxhlim = valbool; + retv = 0; + break; + + case IPV6_RTHDR: + if (val < 0 || val > 2) + goto e_inval; + np->rxopt.bits.srcrt = val; + retv = 0; + break; + + case IPV6_HOPOPTS: + np->rxopt.bits.hopopts = valbool; + retv = 0; + break; + + case IPV6_DSTOPTS: + np->rxopt.bits.dstopts = valbool; + retv = 0; + break; + + case IPV6_FLOWINFO: + np->rxopt.bits.rxflow = valbool; + retv = 0; + break; + + case IPV6_PKTOPTIONS: + { + struct ipv6_txoptions *opt = NULL; + struct msghdr msg; + struct flowi fl; + int junk; + + fl.fl6_flowlabel = 0; + fl.oif = sk->sk_bound_dev_if; + + if (optlen == 0) + goto update; + + /* 1K is probably excessive + * 1K is surely not enough, 2K per standard header is 16K. 
+ */ + retv = -EINVAL; + if (optlen > 64*1024) + break; + + opt = sock_kmalloc(sk, sizeof(*opt) + optlen, GFP_KERNEL); + retv = -ENOBUFS; + if (opt == NULL) + break; + + memset(opt, 0, sizeof(*opt)); + opt->tot_len = sizeof(*opt) + optlen; + retv = -EFAULT; + if (copy_from_user(opt+1, optval, optlen)) + goto done; + + msg.msg_controllen = optlen; + msg.msg_control = (void*)(opt+1); + + retv = datagram_send_ctl(&msg, &fl, opt, &junk); + if (retv) + goto done; +update: + retv = 0; + if (sk->sk_type == SOCK_STREAM) { + if (opt) { + struct tcp_sock *tp = tcp_sk(sk); + if (!((1 << sk->sk_state) & + (TCPF_LISTEN | TCPF_CLOSE)) + && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { + tp->ext_header_len = opt->opt_flen + opt->opt_nflen; + tcp_sync_mss(sk, tp->pmtu_cookie); + } + } + opt = xchg(&np->opt, opt); + sk_dst_reset(sk); + } else { + write_lock(&sk->sk_dst_lock); + opt = xchg(&np->opt, opt); + write_unlock(&sk->sk_dst_lock); + sk_dst_reset(sk); + } + +done: + if (opt) + sock_kfree_s(sk, opt, opt->tot_len); + break; + } + case IPV6_UNICAST_HOPS: + if (val > 255 || val < -1) + goto e_inval; + np->hop_limit = val; + retv = 0; + break; + + case IPV6_MULTICAST_HOPS: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (val > 255 || val < -1) + goto e_inval; + np->mcast_hops = val; + retv = 0; + break; + + case IPV6_MULTICAST_LOOP: + np->mc_loop = valbool; + retv = 0; + break; + + case IPV6_MULTICAST_IF: + if (sk->sk_type == SOCK_STREAM) + goto e_inval; + if (sk->sk_bound_dev_if && sk->sk_bound_dev_if != val) + goto e_inval; + + if (__dev_get_by_index(val) == NULL) { + retv = -ENODEV; + break; + } + np->mcast_oif = val; + retv = 0; + break; + case IPV6_ADD_MEMBERSHIP: + case IPV6_DROP_MEMBERSHIP: + { + struct ipv6_mreq mreq; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_ADD_MEMBERSHIP) + retv = ipv6_sock_mc_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + else + retv = ipv6_sock_mc_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_multiaddr); + break; + } + case IPV6_JOIN_ANYCAST: + case IPV6_LEAVE_ANYCAST: + { + struct ipv6_mreq mreq; + + if (optlen != sizeof(struct ipv6_mreq)) + goto e_inval; + + retv = -EFAULT; + if (copy_from_user(&mreq, optval, sizeof(struct ipv6_mreq))) + break; + + if (optname == IPV6_JOIN_ANYCAST) + retv = ipv6_sock_ac_join(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + else + retv = ipv6_sock_ac_drop(sk, mreq.ipv6mr_ifindex, &mreq.ipv6mr_acaddr); + break; + } + case MCAST_JOIN_GROUP: + case MCAST_LEAVE_GROUP: + { + struct group_req greq; + struct sockaddr_in6 *psin6; + + retv = -EFAULT; + if (copy_from_user(&greq, optval, sizeof(struct group_req))) + break; + if (greq.gr_group.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + psin6 = (struct sockaddr_in6 *)&greq.gr_group; + if (optname == MCAST_JOIN_GROUP) + retv = ipv6_sock_mc_join(sk, greq.gr_interface, + &psin6->sin6_addr); + else + retv = ipv6_sock_mc_drop(sk, greq.gr_interface, + &psin6->sin6_addr); + break; + } + case MCAST_JOIN_SOURCE_GROUP: + case MCAST_LEAVE_SOURCE_GROUP: + case MCAST_BLOCK_SOURCE: + case MCAST_UNBLOCK_SOURCE: + { + struct group_source_req greqs; + int omode, add; + + if (optlen != sizeof(struct group_source_req)) + goto e_inval; + if (copy_from_user(&greqs, optval, sizeof(greqs))) { + retv = -EFAULT; + break; + } + if (greqs.gsr_group.ss_family != AF_INET6 || + greqs.gsr_source.ss_family != AF_INET6) { + retv = -EADDRNOTAVAIL; + break; + } + if (optname == MCAST_BLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 1; + 
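+ /* blocking a source adds it to the socket's EXCLUDE-mode filter */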
} else if (optname == MCAST_UNBLOCK_SOURCE) { + omode = MCAST_EXCLUDE; + add = 0; + } else if (optname == MCAST_JOIN_SOURCE_GROUP) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&greqs.gsr_group; + retv = ipv6_sock_mc_join(sk, greqs.gsr_interface, + &psin6->sin6_addr); + if (retv) + break; + omode = MCAST_INCLUDE; + add = 1; + } else /*IP_DROP_SOURCE_MEMBERSHIP */ { + omode = MCAST_INCLUDE; + add = 0; + } + retv = ip6_mc_source(add, omode, sk, &greqs); + break; + } + case MCAST_MSFILTER: + { + extern int sysctl_optmem_max; + extern int sysctl_mld_max_msf; + struct group_filter *gsf; + + if (optlen < GROUP_FILTER_SIZE(0)) + goto e_inval; + if (optlen > sysctl_optmem_max) { + retv = -ENOBUFS; + break; + } + gsf = (struct group_filter *)kmalloc(optlen,GFP_KERNEL); + if (gsf == 0) { + retv = -ENOBUFS; + break; + } + retv = -EFAULT; + if (copy_from_user(gsf, optval, optlen)) { + kfree(gsf); + break; + } + /* numsrc >= (4G-140)/128 overflow in 32 bits */ + if (gsf->gf_numsrc >= 0x1ffffffU || + gsf->gf_numsrc > sysctl_mld_max_msf) { + kfree(gsf); + retv = -ENOBUFS; + break; + } + if (GROUP_FILTER_SIZE(gsf->gf_numsrc) > optlen) { + kfree(gsf); + retv = -EINVAL; + break; + } + retv = ip6_mc_msfilter(sk, gsf); + kfree(gsf); + + break; + } + case IPV6_ROUTER_ALERT: + retv = ip6_ra_control(sk, val, NULL); + break; + case IPV6_MTU_DISCOVER: + if (val<0 || val>2) + goto e_inval; + np->pmtudisc = val; + retv = 0; + break; + case IPV6_MTU: + if (val && val < IPV6_MIN_MTU) + goto e_inval; + np->frag_size = val; + retv = 0; + break; + case IPV6_RECVERR: + np->recverr = valbool; + if (!val) + skb_queue_purge(&sk->sk_error_queue); + retv = 0; + break; + case IPV6_FLOWINFO_SEND: + np->sndflow = valbool; + retv = 0; + break; + case IPV6_FLOWLABEL_MGR: + retv = ipv6_flowlabel_opt(sk, optval, optlen); + break; + case IPV6_IPSEC_POLICY: + case IPV6_XFRM_POLICY: + retv = xfrm_user_policy(sk, optname, optval, optlen); + break; + +#ifdef CONFIG_NETFILTER + default: + retv = nf_setsockopt(sk, PF_INET6, optname, optval, + optlen); + break; +#endif + + } + release_sock(sk); + +out: + return retv; + +e_inval: + release_sock(sk); + return -EINVAL; +} + +int ipv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + int len; + int val; + + if (level == SOL_IP && sk->sk_type != SOCK_RAW) + return udp_prot.getsockopt(sk, level, optname, optval, optlen); + if(level!=SOL_IPV6) + return -ENOPROTOOPT; + if (get_user(len, optlen)) + return -EFAULT; + switch (optname) { + case IPV6_ADDRFORM: + if (sk->sk_protocol != IPPROTO_UDP && + sk->sk_protocol != IPPROTO_TCP) + return -EINVAL; + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + val = sk->sk_family; + break; + case MCAST_MSFILTER: + { + struct group_filter gsf; + int err; + + if (len < GROUP_FILTER_SIZE(0)) + return -EINVAL; + if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) + return -EFAULT; + lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, + (struct group_filter __user *)optval, optlen); + release_sock(sk); + return err; + } + + case IPV6_PKTOPTIONS: + { + struct msghdr msg; + struct sk_buff *skb; + + if (sk->sk_type != SOCK_STREAM) + return -ENOPROTOOPT; + + msg.msg_control = optval; + msg.msg_controllen = len; + msg.msg_flags = 0; + + lock_sock(sk); + skb = np->pktoptions; + if (skb) + atomic_inc(&skb->users); + release_sock(sk); + + if (skb) { + int err = datagram_recv_ctl(sk, &msg, skb); + kfree_skb(skb); + if (err) + return err; + } else { + if 
(np->rxopt.bits.rxinfo) { + struct in6_pktinfo src_info; + src_info.ipi6_ifindex = np->mcast_oif; + ipv6_addr_copy(&src_info.ipi6_addr, &np->daddr); + put_cmsg(&msg, SOL_IPV6, IPV6_PKTINFO, sizeof(src_info), &src_info); + } + if (np->rxopt.bits.rxhlim) { + int hlim = np->mcast_hops; + put_cmsg(&msg, SOL_IPV6, IPV6_HOPLIMIT, sizeof(hlim), &hlim); + } + } + len -= msg.msg_controllen; + return put_user(len, optlen); + } + case IPV6_MTU: + { + struct dst_entry *dst; + val = 0; + lock_sock(sk); + dst = sk_dst_get(sk); + if (dst) { + val = dst_mtu(dst); + dst_release(dst); + } + release_sock(sk); + if (!val) + return -ENOTCONN; + break; + } + + case IPV6_V6ONLY: + val = np->ipv6only; + break; + + case IPV6_PKTINFO: + val = np->rxopt.bits.rxinfo; + break; + + case IPV6_HOPLIMIT: + val = np->rxopt.bits.rxhlim; + break; + + case IPV6_RTHDR: + val = np->rxopt.bits.srcrt; + break; + + case IPV6_HOPOPTS: + val = np->rxopt.bits.hopopts; + break; + + case IPV6_DSTOPTS: + val = np->rxopt.bits.dstopts; + break; + + case IPV6_FLOWINFO: + val = np->rxopt.bits.rxflow; + break; + + case IPV6_UNICAST_HOPS: + val = np->hop_limit; + break; + + case IPV6_MULTICAST_HOPS: + val = np->mcast_hops; + break; + + case IPV6_MULTICAST_LOOP: + val = np->mc_loop; + break; + + case IPV6_MULTICAST_IF: + val = np->mcast_oif; + break; + + case IPV6_MTU_DISCOVER: + val = np->pmtudisc; + break; + + case IPV6_RECVERR: + val = np->recverr; + break; + + case IPV6_FLOWINFO_SEND: + val = np->sndflow; + break; + + default: +#ifdef CONFIG_NETFILTER + lock_sock(sk); + val = nf_getsockopt(sk, PF_INET6, optname, optval, + &len); + release_sock(sk); + if (val >= 0) + val = put_user(len, optlen); + return val; +#else + return -EINVAL; +#endif + } + len = min_t(unsigned int, sizeof(int), len); + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +void __init ipv6_packet_init(void) +{ + dev_add_pack(&ipv6_packet_type); +} + +void ipv6_packet_cleanup(void) +{ + dev_remove_pack(&ipv6_packet_type); +} diff --git a/net/ipv6/ipv6_syms.c b/net/ipv6/ipv6_syms.c new file mode 100644 index 000000000000..2f4c91ddc9a3 --- /dev/null +++ b/net/ipv6/ipv6_syms.c @@ -0,0 +1,41 @@ + +#include +#include +#include +#include +#include +#include +#include + +EXPORT_SYMBOL(ipv6_addr_type); +EXPORT_SYMBOL(icmpv6_send); +EXPORT_SYMBOL(icmpv6_statistics); +EXPORT_SYMBOL(icmpv6_err_convert); +EXPORT_SYMBOL(ndisc_mc_map); +EXPORT_SYMBOL(register_inet6addr_notifier); +EXPORT_SYMBOL(unregister_inet6addr_notifier); +EXPORT_SYMBOL(ip6_route_output); +#ifdef CONFIG_NETFILTER +EXPORT_SYMBOL(ip6_route_me_harder); +#endif +EXPORT_SYMBOL(addrconf_lock); +EXPORT_SYMBOL(ipv6_setsockopt); +EXPORT_SYMBOL(ipv6_getsockopt); +EXPORT_SYMBOL(inet6_register_protosw); +EXPORT_SYMBOL(inet6_unregister_protosw); +EXPORT_SYMBOL(inet6_add_protocol); +EXPORT_SYMBOL(inet6_del_protocol); +EXPORT_SYMBOL(ip6_xmit); +EXPORT_SYMBOL(inet6_release); +EXPORT_SYMBOL(inet6_bind); +EXPORT_SYMBOL(inet6_getname); +EXPORT_SYMBOL(inet6_ioctl); +EXPORT_SYMBOL(ipv6_get_saddr); +EXPORT_SYMBOL(ipv6_chk_addr); +EXPORT_SYMBOL(in6_dev_finish_destroy); +#ifdef CONFIG_XFRM +EXPORT_SYMBOL(xfrm6_rcv); +#endif +EXPORT_SYMBOL(rt6_lookup); +EXPORT_SYMBOL(fl6_sock_lookup); +EXPORT_SYMBOL(ipv6_push_nfrag_opts); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c new file mode 100644 index 000000000000..393b6e6f50a9 --- /dev/null +++ b/net/ipv6/mcast.c @@ -0,0 +1,2499 @@ +/* + * Multicast support for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque 
+ * + * $Id: mcast.c,v 1.40 2002/02/08 03:57:19 davem Exp $ + * + * Based on linux/ipv4/igmp.c and linux/ipv4/ip_sockglue.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * yoshfuji : fix format of router-alert option + * YOSHIFUJI Hideaki @USAGI: + * Fixed source address for MLD message based on + * . + * YOSHIFUJI Hideaki @USAGI: + * - Ignore Queries for invalid addresses. + * - MLD for link-local addresses. + * David L Stevens : + * - MLDv2 support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* Set to 3 to get tracing... */ +#define MCAST_DEBUG 2 + +#if MCAST_DEBUG >= 3 +#define MDBG(x) printk x +#else +#define MDBG(x) +#endif + +/* + * These header formats should be in a separate include file, but icmpv6.h + * doesn't have in6_addr defined in all cases, there is no __u128, and no + * other files reference these. + * + * +-DLS 4/14/03 + */ + +/* Multicast Listener Discovery version 2 headers */ + +struct mld2_grec { + __u8 grec_type; + __u8 grec_auxwords; + __u16 grec_nsrcs; + struct in6_addr grec_mca; + struct in6_addr grec_src[0]; +}; + +struct mld2_report { + __u8 type; + __u8 resv1; + __u16 csum; + __u16 resv2; + __u16 ngrec; + struct mld2_grec grec[0]; +}; + +struct mld2_query { + __u8 type; + __u8 code; + __u16 csum; + __u16 mrc; + __u16 resv1; + struct in6_addr mca; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 qrv:3, + suppress:1, + resv2:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 resv2:4, + suppress:1, + qrv:3; +#else +#error "Please fix " +#endif + __u8 qqic; + __u16 nsrcs; + struct in6_addr srcs[0]; +}; + +static struct in6_addr mld2_all_mcr = MLD2_ALL_MCR_INIT; + +/* Big mc list lock for all the sockets */ +static DEFINE_RWLOCK(ipv6_sk_mc_lock); + +static struct socket *igmp6_socket; + +int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr); + +static void igmp6_join_group(struct ifmcaddr6 *ma); +static void igmp6_leave_group(struct ifmcaddr6 *ma); +static void igmp6_timer_handler(unsigned long data); + +static void mld_gq_timer_expire(unsigned long data); +static void mld_ifc_timer_expire(unsigned long data); +static void mld_ifc_event(struct inet6_dev *idev); +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *pmc); +static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *addr); +static void mld_clear_delrec(struct inet6_dev *idev); +static int sf_setstate(struct ifmcaddr6 *pmc); +static void sf_markstate(struct ifmcaddr6 *pmc); +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc); +static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta); +static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta); +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev); + + +#define IGMP6_UNSOLICITED_IVAL (10*HZ) +#define MLD_QRV_DEFAULT 2 + +#define MLD_V1_SEEN(idev) (ipv6_devconf.force_mld_version == 1 || \ + (idev)->cnf.force_mld_version == 
1 || \ + ((idev)->mc_v1_seen && \ + time_before(jiffies, (idev)->mc_v1_seen))) + +#define MLDV2_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) +#define MLDV2_EXP(thresh, nbmant, nbexp, value) \ + ((value) < (thresh) ? (value) : \ + ((MLDV2_MASK(value, nbmant) | (1<<(nbmant+nbexp))) << \ + (MLDV2_MASK((value) >> (nbmant), nbexp) + (nbexp)))) + +#define MLDV2_QQIC(value) MLDV2_EXP(0x80, 4, 3, value) +#define MLDV2_MRC(value) MLDV2_EXP(0x8000, 12, 3, value) + +#define IPV6_MLD_MAX_MSF 10 + +int sysctl_mld_max_msf = IPV6_MLD_MAX_MSF; + +/* + * socket join on multicast group + */ + +int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct net_device *dev = NULL; + struct ipv6_mc_socklist *mc_lst; + struct ipv6_pinfo *np = inet6_sk(sk); + int err; + + if (!ipv6_addr_is_multicast(addr)) + return -EINVAL; + + mc_lst = sock_kmalloc(sk, sizeof(struct ipv6_mc_socklist), GFP_KERNEL); + + if (mc_lst == NULL) + return -ENOMEM; + + mc_lst->next = NULL; + ipv6_addr_copy(&mc_lst->addr, addr); + + if (ifindex == 0) { + struct rt6_info *rt; + rt = rt6_lookup(addr, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dev_hold(dev); + dst_release(&rt->u.dst); + } + } else + dev = dev_get_by_index(ifindex); + + if (dev == NULL) { + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return -ENODEV; + } + + mc_lst->ifindex = dev->ifindex; + mc_lst->sfmode = MCAST_EXCLUDE; + mc_lst->sflist = NULL; + + /* + * now add/increase the group membership on the device + */ + + err = ipv6_dev_mc_inc(dev, addr); + + if (err) { + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + dev_put(dev); + return err; + } + + write_lock_bh(&ipv6_sk_mc_lock); + mc_lst->next = np->ipv6_mc_list; + np->ipv6_mc_list = mc_lst; + write_unlock_bh(&ipv6_sk_mc_lock); + + dev_put(dev); + + return 0; +} + +/* + * socket leave on multicast group + */ +int ipv6_sock_mc_drop(struct sock *sk, int ifindex, struct in6_addr *addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst, **lnk; + + write_lock_bh(&ipv6_sk_mc_lock); + for (lnk = &np->ipv6_mc_list; (mc_lst = *lnk) !=NULL ; lnk = &mc_lst->next) { + if ((ifindex == 0 || mc_lst->ifindex == ifindex) && + ipv6_addr_equal(&mc_lst->addr, addr)) { + struct net_device *dev; + + *lnk = mc_lst->next; + write_unlock_bh(&ipv6_sk_mc_lock); + + if ((dev = dev_get_by_index(mc_lst->ifindex)) != NULL) { + struct inet6_dev *idev = in6_dev_get(dev); + + if (idev) { + (void) ip6_mc_leave_src(sk,mc_lst,idev); + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + in6_dev_put(idev); + } + dev_put(dev); + } + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + return 0; + } + } + write_unlock_bh(&ipv6_sk_mc_lock); + + return -ENOENT; +} + +static struct inet6_dev *ip6_mc_find_dev(struct in6_addr *group, int ifindex) +{ + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + + if (ifindex == 0) { + struct rt6_info *rt; + + rt = rt6_lookup(group, NULL, 0, 0); + if (rt) { + dev = rt->rt6i_dev; + dev_hold(dev); + dst_release(&rt->u.dst); + } + } else + dev = dev_get_by_index(ifindex); + + if (!dev) + return NULL; + idev = in6_dev_get(dev); + if (!idev) { + dev_put(dev); + return NULL; + } + read_lock_bh(&idev->lock); + if (idev->dead) { + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return NULL; + } + return idev; +} + +void ipv6_sock_mc_close(struct sock *sk) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc_lst; + + write_lock_bh(&ipv6_sk_mc_lock); + while ((mc_lst = np->ipv6_mc_list) != NULL) { + struct net_device *dev; + + 
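+ /* unlink the entry under the socket lock, then drop the lock before leaving the group on its device */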
np->ipv6_mc_list = mc_lst->next; + write_unlock_bh(&ipv6_sk_mc_lock); + + dev = dev_get_by_index(mc_lst->ifindex); + if (dev) { + struct inet6_dev *idev = in6_dev_get(dev); + + if (idev) { + (void) ip6_mc_leave_src(sk, mc_lst, idev); + __ipv6_dev_mc_dec(idev, &mc_lst->addr); + in6_dev_put(idev); + } + dev_put(dev); + } + + sock_kfree_s(sk, mc_lst, sizeof(*mc_lst)); + + write_lock_bh(&ipv6_sk_mc_lock); + } + write_unlock_bh(&ipv6_sk_mc_lock); +} + +int ip6_mc_source(int add, int omode, struct sock *sk, + struct group_source_req *pgsr) +{ + struct in6_addr *source, *group; + struct ipv6_mc_socklist *pmc; + struct net_device *dev; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + int i, j, rv; + int err; + + if (pgsr->gsr_group.ss_family != AF_INET6 || + pgsr->gsr_source.ss_family != AF_INET6) + return -EINVAL; + + source = &((struct sockaddr_in6 *)&pgsr->gsr_source)->sin6_addr; + group = &((struct sockaddr_in6 *)&pgsr->gsr_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + idev = ip6_mc_find_dev(group, pgsr->gsr_interface); + if (!idev) + return -ENODEV; + dev = idev->dev; + + err = -EADDRNOTAVAIL; + + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + if (pgsr->gsr_interface && pmc->ifindex != pgsr->gsr_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + /* if a source filter was set, must be the same mode as before */ + if (pmc->sflist) { + if (pmc->sfmode != omode) + goto done; + } else if (pmc->sfmode != omode) { + /* allow mode switches for empty-set filters */ + ip6_mc_add_src(idev, group, omode, 0, NULL, 0); + ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sfmode = omode; + } + + psl = pmc->sflist; + if (!add) { + if (!psl) + goto done; + rv = !0; + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, + sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv) /* source not found */ + goto done; + + /* update the interface filter */ + ip6_mc_del_src(idev, group, omode, 1, source, 1); + + for (j=i+1; j<psl->sl_count; j++) + psl->sl_addr[j-1] = psl->sl_addr[j]; + psl->sl_count--; + err = 0; + goto done; + } + /* else, add a new source to the filter */ + + if (psl && psl->sl_count >= sysctl_mld_max_msf) { + err = -ENOBUFS; + goto done; + } + if (!psl || psl->sl_count == psl->sl_max) { + struct ip6_sf_socklist *newpsl; + int count = IP6_SFBLOCK; + + if (psl) + count += psl->sl_max; + newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, + IP6_SFLSIZE(count), GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = count; + newpsl->sl_count = count - IP6_SFBLOCK; + if (psl) { + for (i=0; i<psl->sl_count; i++) + newpsl->sl_addr[i] = psl->sl_addr[i]; + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } + pmc->sflist = psl = newpsl; + } + rv = 1; /* > 0 for insert logic below if sl_count is 0 */ + for (i=0; i<psl->sl_count; i++) { + rv = memcmp(&psl->sl_addr[i], source, sizeof(struct in6_addr)); + if (rv == 0) + break; + } + if (rv == 0) /* address already there is an error */ + goto done; + for (j=psl->sl_count-1; j>=i; j--) + psl->sl_addr[j+1] = psl->sl_addr[j]; + psl->sl_addr[i] = *source; + psl->sl_count++; + err = 0; + /* update the interface list */ + ip6_mc_add_src(idev, group, omode, 1, source, 1); +done: + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return err; +} + +int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf) +{ + struct in6_addr
*group; + struct ipv6_mc_socklist *pmc; + struct net_device *dev; + struct inet6_dev *idev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *newpsl, *psl; + int i, err; + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + if (gsf->gf_fmode != MCAST_INCLUDE && + gsf->gf_fmode != MCAST_EXCLUDE) + return -EINVAL; + + idev = ip6_mc_find_dev(group, gsf->gf_interface); + + if (!idev) + return -ENODEV; + dev = idev->dev; + err = -EADDRNOTAVAIL; + + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(&pmc->addr, group)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + if (gsf->gf_numsrc) { + newpsl = (struct ip6_sf_socklist *)sock_kmalloc(sk, + IP6_SFLSIZE(gsf->gf_numsrc), GFP_ATOMIC); + if (!newpsl) { + err = -ENOBUFS; + goto done; + } + newpsl->sl_max = newpsl->sl_count = gsf->gf_numsrc; + for (i=0; i<newpsl->sl_count; ++i) { + struct sockaddr_in6 *psin6; + + psin6 = (struct sockaddr_in6 *)&gsf->gf_slist[i]; + newpsl->sl_addr[i] = psin6->sin6_addr; + } + err = ip6_mc_add_src(idev, group, gsf->gf_fmode, + newpsl->sl_count, newpsl->sl_addr, 0); + if (err) { + sock_kfree_s(sk, newpsl, IP6_SFLSIZE(newpsl->sl_max)); + goto done; + } + } else + newpsl = NULL; + psl = pmc->sflist; + if (psl) { + (void) ip6_mc_del_src(idev, group, pmc->sfmode, + psl->sl_count, psl->sl_addr, 0); + sock_kfree_s(sk, psl, IP6_SFLSIZE(psl->sl_max)); + } else + (void) ip6_mc_del_src(idev, group, pmc->sfmode, 0, NULL, 0); + pmc->sflist = newpsl; + pmc->sfmode = gsf->gf_fmode; +done: + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return err; +} + +int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, + struct group_filter __user *optval, int __user *optlen) +{ + int err, i, count, copycount; + struct in6_addr *group; + struct ipv6_mc_socklist *pmc; + struct inet6_dev *idev; + struct net_device *dev; + struct ipv6_pinfo *inet6 = inet6_sk(sk); + struct ip6_sf_socklist *psl; + + group = &((struct sockaddr_in6 *)&gsf->gf_group)->sin6_addr; + + if (!ipv6_addr_is_multicast(group)) + return -EINVAL; + + idev = ip6_mc_find_dev(group, gsf->gf_interface); + + if (!idev) + return -ENODEV; + + dev = idev->dev; + + err = -EADDRNOTAVAIL; + + for (pmc=inet6->ipv6_mc_list; pmc; pmc=pmc->next) { + if (pmc->ifindex != gsf->gf_interface) + continue; + if (ipv6_addr_equal(group, &pmc->addr)) + break; + } + if (!pmc) /* must have a prior join */ + goto done; + gsf->gf_fmode = pmc->sfmode; + psl = pmc->sflist; + count = psl ? psl->sl_count : 0; + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + + copycount = count < gsf->gf_numsrc ?
count : gsf->gf_numsrc; + gsf->gf_numsrc = count; + if (put_user(GROUP_FILTER_SIZE(copycount), optlen) || + copy_to_user(optval, gsf, GROUP_FILTER_SIZE(0))) { + return -EFAULT; + } + for (i=0; i<copycount; i++) { + struct sockaddr_in6 *psin6; + struct sockaddr_storage ss; + + psin6 = (struct sockaddr_in6 *)&ss; + memset(&ss, 0, sizeof(ss)); + psin6->sin6_family = AF_INET6; + psin6->sin6_addr = psl->sl_addr[i]; + if (copy_to_user(&optval->gf_slist[i], &ss, sizeof(ss))) + return -EFAULT; + } + return 0; +done: + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + dev_put(dev); + return err; +} + +int inet6_mc_check(struct sock *sk, struct in6_addr *mc_addr, + struct in6_addr *src_addr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct ipv6_mc_socklist *mc; + struct ip6_sf_socklist *psl; + int rv = 1; + + read_lock(&ipv6_sk_mc_lock); + for (mc = np->ipv6_mc_list; mc; mc = mc->next) { + if (ipv6_addr_equal(&mc->addr, mc_addr)) + break; + } + if (!mc) { + read_unlock(&ipv6_sk_mc_lock); + return 1; + } + psl = mc->sflist; + if (!psl) { + rv = mc->sfmode == MCAST_EXCLUDE; + } else { + int i; + + for (i=0; i<psl->sl_count; i++) { + if (ipv6_addr_equal(&psl->sl_addr[i], src_addr)) + break; + } + if (mc->sfmode == MCAST_INCLUDE && i >= psl->sl_count) + rv = 0; + if (mc->sfmode == MCAST_EXCLUDE && i < psl->sl_count) + rv = 0; + } + read_unlock(&ipv6_sk_mc_lock); + + return rv; +} + +static void ma_put(struct ifmcaddr6 *mc) +{ + if (atomic_dec_and_test(&mc->mca_refcnt)) { + in6_dev_put(mc->idev); + kfree(mc); + } +} + +static void igmp6_group_added(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (!(mc->mca_flags&MAF_LOADED)) { + mc->mca_flags |= MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_add(dev, buf, dev->addr_len, 0); + } + spin_unlock_bh(&mc->mca_lock); + + if (!(dev->flags & IFF_UP) || (mc->mca_flags & MAF_NOREPORT)) + return; + + if (MLD_V1_SEEN(mc->idev)) { + igmp6_join_group(mc); + return; + } + /* else v2 */ + + mc->mca_crcount = mc->idev->mc_qrv; + mld_ifc_event(mc->idev); +} + +static void igmp6_group_dropped(struct ifmcaddr6 *mc) +{ + struct net_device *dev = mc->idev->dev; + char buf[MAX_ADDR_LEN]; + + spin_lock_bh(&mc->mca_lock); + if (mc->mca_flags&MAF_LOADED) { + mc->mca_flags &= ~MAF_LOADED; + if (ndisc_mc_map(&mc->mca_addr, buf, dev, 0) == 0) + dev_mc_delete(dev, buf, dev->addr_len, 0); + } + + if (mc->mca_flags & MAF_NOREPORT) + goto done; + spin_unlock_bh(&mc->mca_lock); + + if (!mc->idev->dead) + igmp6_leave_group(mc); + + spin_lock_bh(&mc->mca_lock); + if (del_timer(&mc->mca_timer)) + atomic_dec(&mc->mca_refcnt); +done: + ip6_mc_clear_src(mc); + spin_unlock_bh(&mc->mca_lock); +} + +/* + * deleted ifmcaddr6 manipulation + */ +static void mld_add_delrec(struct inet6_dev *idev, struct ifmcaddr6 *im) +{ + struct ifmcaddr6 *pmc; + + /* this is an "ifmcaddr6" for convenience; only the fields below + * are actually used. In particular, the refcnt and users are not + * used for management of the delete list. Using the same structure + * for deleted items allows change reports to use common code with + * non-deleted or query-response MCA's.
+ */ + pmc = (struct ifmcaddr6 *)kmalloc(sizeof(*pmc), GFP_ATOMIC); + if (!pmc) + return; + memset(pmc, 0, sizeof(*pmc)); + spin_lock_bh(&im->mca_lock); + spin_lock_init(&pmc->mca_lock); + pmc->idev = im->idev; + in6_dev_hold(idev); + pmc->mca_addr = im->mca_addr; + pmc->mca_crcount = idev->mc_qrv; + pmc->mca_sfmode = im->mca_sfmode; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + struct ip6_sf_list *psf; + + pmc->mca_tomb = im->mca_tomb; + pmc->mca_sources = im->mca_sources; + im->mca_tomb = im->mca_sources = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + psf->sf_crcount = pmc->mca_crcount; + } + spin_unlock_bh(&im->mca_lock); + + write_lock_bh(&idev->mc_lock); + pmc->next = idev->mc_tomb; + idev->mc_tomb = pmc; + write_unlock_bh(&idev->mc_lock); +} + +static void mld_del_delrec(struct inet6_dev *idev, struct in6_addr *pmca) +{ + struct ifmcaddr6 *pmc, *pmc_prev; + struct ip6_sf_list *psf, *psf_next; + + write_lock_bh(&idev->mc_lock); + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(&pmc->mca_addr, pmca)) + break; + pmc_prev = pmc; + } + if (pmc) { + if (pmc_prev) + pmc_prev->next = pmc->next; + else + idev->mc_tomb = pmc->next; + } + write_unlock_bh(&idev->mc_lock); + if (pmc) { + for (psf=pmc->mca_tomb; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + in6_dev_put(pmc->idev); + kfree(pmc); + } +} + +static void mld_clear_delrec(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *nextpmc; + + write_lock_bh(&idev->mc_lock); + pmc = idev->mc_tomb; + idev->mc_tomb = NULL; + write_unlock_bh(&idev->mc_lock); + + for (; pmc; pmc = nextpmc) { + nextpmc = pmc->next; + ip6_mc_clear_src(pmc); + in6_dev_put(pmc->idev); + kfree(pmc); + } + + /* clear dead sources, too */ + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + struct ip6_sf_list *psf, *psf_next; + + spin_lock_bh(&pmc->mca_lock); + psf = pmc->mca_tomb; + pmc->mca_tomb = NULL; + spin_unlock_bh(&pmc->mca_lock); + for (; psf; psf=psf_next) { + psf_next = psf->sf_next; + kfree(psf); + } + } + read_unlock_bh(&idev->lock); +} + + +/* + * device multicast group inc (add if not found) + */ +int ipv6_dev_mc_inc(struct net_device *dev, struct in6_addr *addr) +{ + struct ifmcaddr6 *mc; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); + + if (idev == NULL) + return -EINVAL; + + write_lock_bh(&idev->lock); + if (idev->dead) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENODEV; + } + + for (mc = idev->mc_list; mc; mc = mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, addr)) { + mc->mca_users++; + write_unlock_bh(&idev->lock); + ip6_mc_add_src(idev, &mc->mca_addr, MCAST_EXCLUDE, 0, + NULL, 0); + in6_dev_put(idev); + return 0; + } + } + + /* + * not found: create a new one. 
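+ * The new entry starts with two references: one for its place on idev->mc_list and one that is dropped by the ma_put() below.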
+ */ + + mc = kmalloc(sizeof(struct ifmcaddr6), GFP_ATOMIC); + + if (mc == NULL) { + write_unlock_bh(&idev->lock); + in6_dev_put(idev); + return -ENOMEM; + } + + memset(mc, 0, sizeof(struct ifmcaddr6)); + init_timer(&mc->mca_timer); + mc->mca_timer.function = igmp6_timer_handler; + mc->mca_timer.data = (unsigned long) mc; + + ipv6_addr_copy(&mc->mca_addr, addr); + mc->idev = idev; + mc->mca_users = 1; + /* mca_stamp should be updated upon changes */ + mc->mca_cstamp = mc->mca_tstamp = jiffies; + atomic_set(&mc->mca_refcnt, 2); + spin_lock_init(&mc->mca_lock); + + /* initial mode is (EX, empty) */ + mc->mca_sfmode = MCAST_EXCLUDE; + mc->mca_sfcount[MCAST_EXCLUDE] = 1; + + if (ipv6_addr_is_ll_all_nodes(&mc->mca_addr) || + IPV6_ADDR_MC_SCOPE(&mc->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + mc->mca_flags |= MAF_NOREPORT; + + mc->next = idev->mc_list; + idev->mc_list = mc; + write_unlock_bh(&idev->lock); + + mld_del_delrec(idev, &mc->mca_addr); + igmp6_group_added(mc); + ma_put(mc); + return 0; +} + +/* + * device multicast group del + */ +int __ipv6_dev_mc_dec(struct inet6_dev *idev, struct in6_addr *addr) +{ + struct ifmcaddr6 *ma, **map; + + write_lock_bh(&idev->lock); + for (map = &idev->mc_list; (ma=*map) != NULL; map = &ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addr)) { + if (--ma->mca_users == 0) { + *map = ma->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(ma); + + ma_put(ma); + return 0; + } + write_unlock_bh(&idev->lock); + return 0; + } + } + write_unlock_bh(&idev->lock); + + return -ENOENT; +} + +int ipv6_dev_mc_dec(struct net_device *dev, struct in6_addr *addr) +{ + struct inet6_dev *idev = in6_dev_get(dev); + int err; + + if (!idev) + return -ENODEV; + + err = __ipv6_dev_mc_dec(idev, addr); + + in6_dev_put(idev); + + return err; +} + +/* + * identify MLD packets for MLD filter exceptions + */ +int ipv6_is_mld(struct sk_buff *skb, int nexthdr) +{ + struct icmp6hdr *pic; + + if (nexthdr != IPPROTO_ICMPV6) + return 0; + + if (!pskb_may_pull(skb, sizeof(struct icmp6hdr))) + return 0; + + pic = (struct icmp6hdr *)skb->h.raw; + + switch (pic->icmp6_type) { + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + case ICMPV6_MLD2_REPORT: + return 1; + default: + break; + } + return 0; +} + +/* + * check if the interface/address pair is valid + */ +int ipv6_chk_mcast_addr(struct net_device *dev, struct in6_addr *group, + struct in6_addr *src_addr) +{ + struct inet6_dev *idev; + struct ifmcaddr6 *mc; + int rv = 0; + + idev = in6_dev_get(dev); + if (idev) { + read_lock_bh(&idev->lock); + for (mc = idev->mc_list; mc; mc=mc->next) { + if (ipv6_addr_equal(&mc->mca_addr, group)) + break; + } + if (mc) { + if (src_addr && !ipv6_addr_any(src_addr)) { + struct ip6_sf_list *psf; + + spin_lock_bh(&mc->mca_lock); + for (psf=mc->mca_sources;psf;psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, src_addr)) + break; + } + if (psf) + rv = psf->sf_count[MCAST_INCLUDE] || + psf->sf_count[MCAST_EXCLUDE] != + mc->mca_sfcount[MCAST_EXCLUDE]; + else + rv = mc->mca_sfcount[MCAST_EXCLUDE] !=0; + spin_unlock_bh(&mc->mca_lock); + } else + rv = 1; /* don't filter unspecified source */ + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + return rv; +} + +static void mld_gq_start_timer(struct inet6_dev *idev) +{ + int tv = net_random() % idev->mc_maxdelay; + + idev->mc_gq_running = 1; + if (!mod_timer(&idev->mc_gq_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +static void mld_ifc_start_timer(struct inet6_dev *idev, int delay) +{ + int tv = net_random() % 
delay; + + if (!mod_timer(&idev->mc_ifc_timer, jiffies+tv+2)) + in6_dev_hold(idev); +} + +/* + * IGMP handling (alias multicast ICMPv6 messages) + */ + +static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) +{ + unsigned long delay = resptime; + + /* Do not start timer for these addresses */ + if (ipv6_addr_is_ll_all_nodes(&ma->mca_addr) || + IPV6_ADDR_MC_SCOPE(&ma->mca_addr) < IPV6_ADDR_SCOPE_LINKLOCAL) + return; + + if (del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (delay >= resptime) { + if (resptime) + delay = net_random() % resptime; + else + delay = 1; + } + ma->mca_timer.expires = jiffies + delay; + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING; +} + +static void mld_marksources(struct ifmcaddr6 *pmc, int nsrcs, + struct in6_addr *srcs) +{ + struct ip6_sf_list *psf; + int i, scount; + + scount = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (scount == nsrcs) + break; + for (i=0; i<nsrcs; i++) + if (ipv6_addr_equal(&srcs[i], &psf->sf_addr)) { + psf->sf_gsresp = 1; + scount++; + break; + } + } +} + +int igmp6_event_query(struct sk_buff *skb) +{ + struct mld2_query *mlh2 = (struct mld2_query *) skb->h.raw; + struct ifmcaddr6 *ma; + struct in6_addr *group; + unsigned long max_delay; + struct inet6_dev *idev; + struct icmp6hdr *hdr; + int group_type; + int mark = 0; + int len; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + return -EINVAL; + + /* compute payload length excluding extension headers */ + len = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); + len -= (char *)skb->h.raw - (char *)skb->nh.ipv6h; + + /* Drop queries with not link local source */ + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr)&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + idev = in6_dev_get(skb->dev); + + if (idev == NULL) + return 0; + + hdr = (struct icmp6hdr *) skb->h.raw; + group = (struct in6_addr *) (hdr + 1); + group_type = ipv6_addr_type(group); + + if (group_type != IPV6_ADDR_ANY && + !(group_type&IPV6_ADDR_MULTICAST)) { + in6_dev_put(idev); + return -EINVAL; + } + + if (len == 24) { + int switchback; + /* MLDv1 router present */ + + /* Translate milliseconds to jiffies */ + max_delay = (ntohs(hdr->icmp6_maxdelay)*HZ)/1000; + + switchback = (idev->mc_qrv + 1) * max_delay; + idev->mc_v1_seen = jiffies + switchback; + + /* cancel the interface change timer */ + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + /* clear deleted report items */ + mld_clear_delrec(idev); + } else if (len >= 28) { + max_delay = (MLDV2_MRC(ntohs(mlh2->mrc))*HZ)/1000; + if (!max_delay) + max_delay = 1; + idev->mc_maxdelay = max_delay; + if (mlh2->qrv) + idev->mc_qrv = mlh2->qrv; + if (group_type == IPV6_ADDR_ANY) { /* general query */ + if (mlh2->nsrcs) { + in6_dev_put(idev); + return -EINVAL; /* no sources allowed */ + } + mld_gq_start_timer(idev); + in6_dev_put(idev); + return 0; + } + /* mark sources to include, if group & source-specific */ + mark = mlh2->nsrcs != 0; + } else { + in6_dev_put(idev); + return -EINVAL; + } + + read_lock_bh(&idev->lock); + if (group_type == IPV6_ADDR_ANY) { + for (ma = idev->mc_list; ma; ma=ma->next) { + spin_lock_bh(&ma->mca_lock); + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + } + } else { + for (ma = idev->mc_list; ma; ma=ma->next) { + if (group_type != IPV6_ADDR_ANY && + !ipv6_addr_equal(group, &ma->mca_addr)) + continue; + spin_lock_bh(&ma->mca_lock); + if (ma->mca_flags &
MAF_TIMER_RUNNING) { + /* gsquery <- gsquery && mark */ + if (!mark) + ma->mca_flags &= ~MAF_GSQUERY; + } else { + /* gsquery <- mark */ + if (mark) + ma->mca_flags |= MAF_GSQUERY; + else + ma->mca_flags &= ~MAF_GSQUERY; + } + if (ma->mca_flags & MAF_GSQUERY) + mld_marksources(ma, ntohs(mlh2->nsrcs), + mlh2->srcs); + igmp6_group_queried(ma, max_delay); + spin_unlock_bh(&ma->mca_lock); + if (group_type != IPV6_ADDR_ANY) + break; + } + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + + return 0; +} + + +int igmp6_event_report(struct sk_buff *skb) +{ + struct ifmcaddr6 *ma; + struct in6_addr *addrp; + struct inet6_dev *idev; + struct icmp6hdr *hdr; + int addr_type; + + /* Our own report looped back. Ignore it. */ + if (skb->pkt_type == PACKET_LOOPBACK) + return 0; + + if (!pskb_may_pull(skb, sizeof(struct in6_addr))) + return -EINVAL; + + hdr = (struct icmp6hdr*) skb->h.raw; + + /* Drop reports with not link local source */ + addr_type = ipv6_addr_type(&skb->nh.ipv6h->saddr); + if (addr_type != IPV6_ADDR_ANY && + !(addr_type&IPV6_ADDR_LINKLOCAL)) + return -EINVAL; + + addrp = (struct in6_addr *) (hdr + 1); + + idev = in6_dev_get(skb->dev); + if (idev == NULL) + return -ENODEV; + + /* + * Cancel the timer for this group + */ + + read_lock_bh(&idev->lock); + for (ma = idev->mc_list; ma; ma=ma->next) { + if (ipv6_addr_equal(&ma->mca_addr, addrp)) { + spin_lock(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) + atomic_dec(&ma->mca_refcnt); + ma->mca_flags &= ~(MAF_LAST_REPORTER|MAF_TIMER_RUNNING); + spin_unlock(&ma->mca_lock); + break; + } + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + return 0; +} + +static int is_in(struct ifmcaddr6 *pmc, struct ip6_sf_list *psf, int type, + int gdeleted, int sdeleted) +{ + switch (type) { + case MLD2_MODE_IS_INCLUDE: + case MLD2_MODE_IS_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + return !((pmc->mca_flags & MAF_GSQUERY) && !psf->sf_gsresp); + case MLD2_CHANGE_TO_INCLUDE: + if (gdeleted || sdeleted) + return 0; + return psf->sf_count[MCAST_INCLUDE] != 0; + case MLD2_CHANGE_TO_EXCLUDE: + if (gdeleted || sdeleted) + return 0; + if (pmc->mca_sfcount[MCAST_EXCLUDE] == 0 || + psf->sf_count[MCAST_INCLUDE]) + return 0; + return pmc->mca_sfcount[MCAST_EXCLUDE] == + psf->sf_count[MCAST_EXCLUDE]; + case MLD2_ALLOW_NEW_SOURCES: + if (gdeleted || !psf->sf_crcount) + return 0; + return (pmc->mca_sfmode == MCAST_INCLUDE) ^ sdeleted; + case MLD2_BLOCK_OLD_SOURCES: + if (pmc->mca_sfmode == MCAST_INCLUDE) + return gdeleted || (psf->sf_crcount && sdeleted); + return psf->sf_crcount && !gdeleted && !sdeleted; + } + return 0; +} + +static int +mld_scount(struct ifmcaddr6 *pmc, int type, int gdeleted, int sdeleted) +{ + struct ip6_sf_list *psf; + int scount = 0; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) + continue; + scount++; + } + return scount; +} + +static struct sk_buff *mld_newpack(struct net_device *dev, int size) +{ + struct sock *sk = igmp6_socket->sk; + struct sk_buff *skb; + struct mld2_report *pmr; + struct in6_addr addr_buf; + int err; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + + /* we assume size > sizeof(ra) here */ + skb = sock_alloc_send_skb(sk, size + LL_RESERVED_SPACE(dev), 1, &err); + + if (skb == 0) + return NULL; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + if (dev->hard_header) { + unsigned char ha[MAX_ADDR_LEN]; + + ndisc_mc_map(&mld2_all_mcr, ha, dev, 1); + if (dev->hard_header(skb, dev, ETH_P_IPV6,ha,NULL,size) < 0) { + 
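+ /* could not build the link-layer header; drop the skb */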
kfree_skb(skb); + return NULL; + } + } + + if (ipv6_get_lladdr(dev, &addr_buf)) { + /* : + * use unspecified address as the source address + * when a valid link-local address is not available. + */ + memset(&addr_buf, 0, sizeof(addr_buf)); + } + + ip6_nd_hdr(sk, skb, dev, &addr_buf, &mld2_all_mcr, NEXTHDR_HOP, 0); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + pmr =(struct mld2_report *)skb_put(skb, sizeof(*pmr)); + skb->h.raw = (unsigned char *)pmr; + pmr->type = ICMPV6_MLD2_REPORT; + pmr->resv1 = 0; + pmr->csum = 0; + pmr->resv2 = 0; + pmr->ngrec = 0; + return skb; +} + +static void mld_sendpack(struct sk_buff *skb) +{ + struct ipv6hdr *pip6 = skb->nh.ipv6h; + struct mld2_report *pmr = (struct mld2_report *)skb->h.raw; + int payload_len, mldlen; + struct inet6_dev *idev = in6_dev_get(skb->dev); + int err; + + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + payload_len = skb->tail - (unsigned char *)skb->nh.ipv6h - + sizeof(struct ipv6hdr); + mldlen = skb->tail - skb->h.raw; + pip6->payload_len = htons(payload_len); + + pmr->csum = csum_ipv6_magic(&pip6->saddr, &pip6->daddr, mldlen, + IPPROTO_ICMPV6, csum_partial(skb->h.raw, mldlen, 0)); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); + if (!err) { + ICMP6_INC_STATS(idev,ICMP6_MIB_OUTMSGS); + IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + } else + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +static int grec_size(struct ifmcaddr6 *pmc, int type, int gdel, int sdel) +{ + return sizeof(struct mld2_grec) + 4*mld_scount(pmc,type,gdel,sdel); +} + +static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, struct mld2_grec **ppgr) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr; + + if (!skb) + skb = mld_newpack(dev, dev->mtu); + if (!skb) + return NULL; + pgr = (struct mld2_grec *)skb_put(skb, sizeof(struct mld2_grec)); + pgr->grec_type = type; + pgr->grec_auxwords = 0; + pgr->grec_nsrcs = 0; + pgr->grec_mca = pmc->mca_addr; /* structure copy */ + pmr = (struct mld2_report *)skb->h.raw; + pmr->ngrec = htons(ntohs(pmr->ngrec)+1); + *ppgr = pgr; + return skb; +} + +#define AVAILABLE(skb) ((skb) ? ((skb)->dev ? (skb)->dev->mtu - (skb)->len : \ + skb_tailroom(skb)) : 0) + +static struct sk_buff *add_grec(struct sk_buff *skb, struct ifmcaddr6 *pmc, + int type, int gdeleted, int sdeleted) +{ + struct net_device *dev = pmc->idev->dev; + struct mld2_report *pmr; + struct mld2_grec *pgr = NULL; + struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list; + int scount, first, isquery, truncate; + + if (pmc->mca_flags & MAF_NOREPORT) + return skb; + + isquery = type == MLD2_MODE_IS_INCLUDE || + type == MLD2_MODE_IS_EXCLUDE; + truncate = type == MLD2_MODE_IS_EXCLUDE || + type == MLD2_CHANGE_TO_EXCLUDE; + + psf_list = sdeleted ? &pmc->mca_tomb : &pmc->mca_sources; + + if (!*psf_list) { + if (type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) + return skb; + if (pmc->mca_crcount || isquery) { + /* make sure we have room for group header and at + * least one source. + */ + if (skb && AVAILABLE(skb) < sizeof(struct mld2_grec)+ + sizeof(struct in6_addr)) { + mld_sendpack(skb); + skb = NULL; /* add_grhead will get a new one */ + } + skb = add_grhead(skb, pmc, type, &pgr); + } + return skb; + } + pmr = skb ? 
(struct mld2_report *)skb->h.raw : NULL; + + /* EX and TO_EX get a fresh packet, if needed */ + if (truncate) { + if (pmr && pmr->ngrec && + AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) { + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + } + } + first = 1; + scount = 0; + psf_prev = NULL; + for (psf=*psf_list; psf; psf=psf_next) { + struct in6_addr *psrc; + + psf_next = psf->sf_next; + + if (!is_in(pmc, psf, type, gdeleted, sdeleted)) { + psf_prev = psf; + continue; + } + + /* clear marks on query responses */ + if (isquery) + psf->sf_gsresp = 0; + + if (AVAILABLE(skb) < sizeof(*psrc) + + first*sizeof(struct mld2_grec)) { + if (truncate && !first) + break; /* truncate these */ + if (pgr) + pgr->grec_nsrcs = htons(scount); + if (skb) + mld_sendpack(skb); + skb = mld_newpack(dev, dev->mtu); + first = 1; + scount = 0; + } + if (first) { + skb = add_grhead(skb, pmc, type, &pgr); + first = 0; + } + psrc = (struct in6_addr *)skb_put(skb, sizeof(*psrc)); + *psrc = psf->sf_addr; + scount++; + if ((type == MLD2_ALLOW_NEW_SOURCES || + type == MLD2_BLOCK_OLD_SOURCES) && psf->sf_crcount) { + psf->sf_crcount--; + if ((sdeleted || gdeleted) && psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *psf_list = psf->sf_next; + kfree(psf); + continue; + } + } + psf_prev = psf; + } + if (pgr) + pgr->grec_nsrcs = htons(scount); + + if (isquery) + pmc->mca_flags &= ~MAF_GSQUERY; /* clear query state */ + return skb; +} + +static void mld_send_report(struct inet6_dev *idev, struct ifmcaddr6 *pmc) +{ + struct sk_buff *skb = NULL; + int type; + + if (!pmc) { + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (pmc->mca_flags & MAF_NOREPORT) + continue; + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + } else { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + type = MLD2_MODE_IS_EXCLUDE; + else + type = MLD2_MODE_IS_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + spin_unlock_bh(&pmc->mca_lock); + } + if (skb) + mld_sendpack(skb); +} + +/* + * remove zero-count source records from a source filter list + */ +static void mld_clear_zeros(struct ip6_sf_list **ppsf) +{ + struct ip6_sf_list *psf_prev, *psf_next, *psf; + + psf_prev = NULL; + for (psf=*ppsf; psf; psf = psf_next) { + psf_next = psf->sf_next; + if (psf->sf_crcount == 0) { + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + *ppsf = psf->sf_next; + kfree(psf); + } else + psf_prev = psf; + } +} + +static void mld_send_cr(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc, *pmc_prev, *pmc_next; + struct sk_buff *skb = NULL; + int type, dtype; + + read_lock_bh(&idev->lock); + write_lock_bh(&idev->mc_lock); + + /* deleted MCA's */ + pmc_prev = NULL; + for (pmc=idev->mc_tomb; pmc; pmc=pmc_next) { + pmc_next = pmc->next; + if (pmc->mca_sfmode == MCAST_INCLUDE) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + skb = add_grec(skb, pmc, type, 1, 0); + skb = add_grec(skb, pmc, dtype, 1, 1); + } + if (pmc->mca_crcount) { + pmc->mca_crcount--; + if (pmc->mca_sfmode == MCAST_EXCLUDE) { + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 1, 0); + } + if (pmc->mca_crcount == 0) { + mld_clear_zeros(&pmc->mca_tomb); + mld_clear_zeros(&pmc->mca_sources); + } + } + if (pmc->mca_crcount == 0 && !pmc->mca_tomb && + 
!pmc->mca_sources) { + if (pmc_prev) + pmc_prev->next = pmc_next; + else + idev->mc_tomb = pmc_next; + in6_dev_put(pmc->idev); + kfree(pmc); + } else + pmc_prev = pmc; + } + write_unlock_bh(&idev->mc_lock); + + /* change recs */ + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + spin_lock_bh(&pmc->mca_lock); + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + type = MLD2_BLOCK_OLD_SOURCES; + dtype = MLD2_ALLOW_NEW_SOURCES; + } else { + type = MLD2_ALLOW_NEW_SOURCES; + dtype = MLD2_BLOCK_OLD_SOURCES; + } + skb = add_grec(skb, pmc, type, 0, 0); + skb = add_grec(skb, pmc, dtype, 0, 1); /* deleted sources */ + + /* filter mode changes */ + if (pmc->mca_crcount) { + pmc->mca_crcount--; + if (pmc->mca_sfmode == MCAST_EXCLUDE) + type = MLD2_CHANGE_TO_EXCLUDE; + else + type = MLD2_CHANGE_TO_INCLUDE; + skb = add_grec(skb, pmc, type, 0, 0); + } + spin_unlock_bh(&pmc->mca_lock); + } + read_unlock_bh(&idev->lock); + if (!skb) + return; + (void) mld_sendpack(skb); +} + +static void igmp6_send(struct in6_addr *addr, struct net_device *dev, int type) +{ + struct sock *sk = igmp6_socket->sk; + struct inet6_dev *idev; + struct sk_buff *skb; + struct icmp6hdr *hdr; + struct in6_addr *snd_addr; + struct in6_addr *addrp; + struct in6_addr addr_buf; + struct in6_addr all_routers; + int err, len, payload_len, full_len; + u8 ra[8] = { IPPROTO_ICMPV6, 0, + IPV6_TLV_ROUTERALERT, 2, 0, 0, + IPV6_TLV_PADN, 0 }; + + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + snd_addr = addr; + if (type == ICMPV6_MGM_REDUCTION) { + snd_addr = &all_routers; + ipv6_addr_all_routers(&all_routers); + } + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + payload_len = len + sizeof(ra); + full_len = sizeof(struct ipv6hdr) + payload_len; + + skb = sock_alloc_send_skb(sk, LL_RESERVED_SPACE(dev) + full_len, 1, &err); + + if (skb == NULL) { + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + if (dev->hard_header) { + unsigned char ha[MAX_ADDR_LEN]; + ndisc_mc_map(snd_addr, ha, dev, 1); + if (dev->hard_header(skb, dev, ETH_P_IPV6, ha, NULL, full_len) < 0) + goto out; + } + + if (ipv6_get_lladdr(dev, &addr_buf)) { + /* : + * use unspecified address as the source address + * when a valid link-local address is not available. 
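+ * (for example while the link-local address is still tentative during DAD)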
+ */ + memset(&addr_buf, 0, sizeof(addr_buf)); + } + + ip6_nd_hdr(sk, skb, dev, &addr_buf, snd_addr, NEXTHDR_HOP, payload_len); + + memcpy(skb_put(skb, sizeof(ra)), ra, sizeof(ra)); + + hdr = (struct icmp6hdr *) skb_put(skb, sizeof(struct icmp6hdr)); + memset(hdr, 0, sizeof(struct icmp6hdr)); + hdr->icmp6_type = type; + + addrp = (struct in6_addr *) skb_put(skb, sizeof(struct in6_addr)); + ipv6_addr_copy(addrp, addr); + + hdr->icmp6_cksum = csum_ipv6_magic(&addr_buf, snd_addr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) hdr, len, 0)); + + idev = in6_dev_get(skb->dev); + + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, skb->dev, + dev_queue_xmit); + if (!err) { + if (type == ICMPV6_MGM_REDUCTION) + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBREDUCTIONS); + else + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTGROUPMEMBRESPONSES); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + IP6_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS); + } else + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + + if (likely(idev != NULL)) + in6_dev_put(idev); + return; + +out: + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + kfree_skb(skb); +} + +static int ip6_mc_del1_src(struct ifmcaddr6 *pmc, int sfmode, + struct in6_addr *psfsrc) +{ + struct ip6_sf_list *psf, *psf_prev; + int rv = 0; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf || psf->sf_count[sfmode] == 0) { + /* source filter not found, or count wrong => bug */ + return -ESRCH; + } + psf->sf_count[sfmode]--; + if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) { + struct inet6_dev *idev = pmc->idev; + + /* no more filters for this source */ + if (psf_prev) + psf_prev->sf_next = psf->sf_next; + else + pmc->mca_sources = psf->sf_next; + if (psf->sf_oldin && !(pmc->mca_flags & MAF_NOREPORT) && + !MLD_V1_SEEN(idev)) { + psf->sf_crcount = idev->mc_qrv; + psf->sf_next = pmc->mca_tomb; + pmc->mca_tomb = psf; + rv = 1; + } else + kfree(psf); + } + return rv; +} + +static int ip6_mc_del_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int changerec = 0; + int i, err; + + if (!idev) + return -ENODEV; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found?? 
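+ * sources are only removed for groups this interface has joined, so this is a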
bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + sf_markstate(pmc); + if (!delta) { + if (!pmc->mca_sfcount[sfmode]) { + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return -EINVAL; + } + pmc->mca_sfcount[sfmode]--; + } + err = 0; + for (i=0; i<sfcount; i++) { + int rv = ip6_mc_del1_src(pmc, sfmode, &psfsrc[i]); + + changerec |= rv > 0; + if (!err && rv < 0) + err = rv; + } + if (pmc->mca_sfmode == MCAST_EXCLUDE && + pmc->mca_sfcount[MCAST_EXCLUDE] == 0 && + pmc->mca_sfcount[MCAST_INCLUDE]) { + struct ip6_sf_list *psf; + + /* filter mode change */ + pmc->mca_sfmode = MCAST_INCLUDE; + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(pmc->idev); + } else if (sf_setstate(pmc) || changerec) + mld_ifc_event(pmc->idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +/* + * Add multicast single-source filter to the interface list + */ +static int ip6_mc_add1_src(struct ifmcaddr6 *pmc, int sfmode, + struct in6_addr *psfsrc, int delta) +{ + struct ip6_sf_list *psf, *psf_prev; + + psf_prev = NULL; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (ipv6_addr_equal(&psf->sf_addr, psfsrc)) + break; + psf_prev = psf; + } + if (!psf) { + psf = (struct ip6_sf_list *)kmalloc(sizeof(*psf), GFP_ATOMIC); + if (!psf) + return -ENOBUFS; + memset(psf, 0, sizeof(*psf)); + psf->sf_addr = *psfsrc; + if (psf_prev) { + psf_prev->sf_next = psf; + } else + pmc->mca_sources = psf; + } + psf->sf_count[sfmode]++; + return 0; +} + +static void sf_markstate(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf; + int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + psf->sf_oldin = mca_xcount == + psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + psf->sf_oldin = psf->sf_count[MCAST_INCLUDE] != 0; +} + +static int sf_setstate(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf; + int mca_xcount = pmc->mca_sfcount[MCAST_EXCLUDE]; + int qrv = pmc->idev->mc_qrv; + int new_in, rv; + + rv = 0; + for (psf=pmc->mca_sources; psf; psf=psf->sf_next) { + if (pmc->mca_sfcount[MCAST_EXCLUDE]) { + new_in = mca_xcount == psf->sf_count[MCAST_EXCLUDE] && + !psf->sf_count[MCAST_INCLUDE]; + } else + new_in = psf->sf_count[MCAST_INCLUDE] != 0; + if (new_in != psf->sf_oldin) { + psf->sf_crcount = qrv; + rv++; + } + } + return rv; +} + +/* + * Add multicast source filter list to the interface list + */ +static int ip6_mc_add_src(struct inet6_dev *idev, struct in6_addr *pmca, + int sfmode, int sfcount, struct in6_addr *psfsrc, + int delta) +{ + struct ifmcaddr6 *pmc; + int isexclude; + int i, err; + + if (!idev) + return -ENODEV; + read_lock_bh(&idev->lock); + for (pmc=idev->mc_list; pmc; pmc=pmc->next) { + if (ipv6_addr_equal(pmca, &pmc->mca_addr)) + break; + } + if (!pmc) { + /* MCA not found??
bug */ + read_unlock_bh(&idev->lock); + return -ESRCH; + } + spin_lock_bh(&pmc->mca_lock); + + sf_markstate(pmc); + isexclude = pmc->mca_sfmode == MCAST_EXCLUDE; + if (!delta) + pmc->mca_sfcount[sfmode]++; + err = 0; + for (i=0; imca_sfcount[sfmode]--; + for (j=0; jmca_sfcount[MCAST_EXCLUDE] != 0)) { + struct inet6_dev *idev = pmc->idev; + struct ip6_sf_list *psf; + + /* filter mode change */ + if (pmc->mca_sfcount[MCAST_EXCLUDE]) + pmc->mca_sfmode = MCAST_EXCLUDE; + else if (pmc->mca_sfcount[MCAST_INCLUDE]) + pmc->mca_sfmode = MCAST_INCLUDE; + /* else no filters; keep old mode for reports */ + + pmc->mca_crcount = idev->mc_qrv; + idev->mc_ifc_count = pmc->mca_crcount; + for (psf=pmc->mca_sources; psf; psf = psf->sf_next) + psf->sf_crcount = 0; + mld_ifc_event(idev); + } else if (sf_setstate(pmc)) + mld_ifc_event(idev); + spin_unlock_bh(&pmc->mca_lock); + read_unlock_bh(&idev->lock); + return err; +} + +static void ip6_mc_clear_src(struct ifmcaddr6 *pmc) +{ + struct ip6_sf_list *psf, *nextpsf; + + for (psf=pmc->mca_tomb; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->mca_tomb = NULL; + for (psf=pmc->mca_sources; psf; psf=nextpsf) { + nextpsf = psf->sf_next; + kfree(psf); + } + pmc->mca_sources = NULL; + pmc->mca_sfmode = MCAST_EXCLUDE; + pmc->mca_sfcount[MCAST_EXCLUDE] = 0; + pmc->mca_sfcount[MCAST_EXCLUDE] = 1; +} + + +static void igmp6_join_group(struct ifmcaddr6 *ma) +{ + unsigned long delay; + + if (ma->mca_flags & MAF_NOREPORT) + return; + + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); + + delay = net_random() % IGMP6_UNSOLICITED_IVAL; + + spin_lock_bh(&ma->mca_lock); + if (del_timer(&ma->mca_timer)) { + atomic_dec(&ma->mca_refcnt); + delay = ma->mca_timer.expires - jiffies; + } + + if (!mod_timer(&ma->mca_timer, jiffies + delay)) + atomic_inc(&ma->mca_refcnt); + ma->mca_flags |= MAF_TIMER_RUNNING | MAF_LAST_REPORTER; + spin_unlock_bh(&ma->mca_lock); +} + +static int ip6_mc_leave_src(struct sock *sk, struct ipv6_mc_socklist *iml, + struct inet6_dev *idev) +{ + int err; + + if (iml->sflist == 0) { + /* any-source empty exclude case */ + return ip6_mc_del_src(idev, &iml->addr, iml->sfmode, 0, NULL, 0); + } + err = ip6_mc_del_src(idev, &iml->addr, iml->sfmode, + iml->sflist->sl_count, iml->sflist->sl_addr, 0); + sock_kfree_s(sk, iml->sflist, IP6_SFLSIZE(iml->sflist->sl_max)); + iml->sflist = NULL; + return err; +} + +static void igmp6_leave_group(struct ifmcaddr6 *ma) +{ + if (MLD_V1_SEEN(ma->idev)) { + if (ma->mca_flags & MAF_LAST_REPORTER) + igmp6_send(&ma->mca_addr, ma->idev->dev, + ICMPV6_MGM_REDUCTION); + } else { + mld_add_delrec(ma->idev, ma); + mld_ifc_event(ma->idev); + } +} + +static void mld_gq_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + idev->mc_gq_running = 0; + mld_send_report(idev, NULL); + __in6_dev_put(idev); +} + +static void mld_ifc_timer_expire(unsigned long data) +{ + struct inet6_dev *idev = (struct inet6_dev *)data; + + mld_send_cr(idev); + if (idev->mc_ifc_count) { + idev->mc_ifc_count--; + if (idev->mc_ifc_count) + mld_ifc_start_timer(idev, idev->mc_maxdelay); + } + __in6_dev_put(idev); +} + +static void mld_ifc_event(struct inet6_dev *idev) +{ + if (MLD_V1_SEEN(idev)) + return; + idev->mc_ifc_count = idev->mc_qrv; + mld_ifc_start_timer(idev, 1); +} + + +static void igmp6_timer_handler(unsigned long data) +{ + struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; + + if (MLD_V1_SEEN(ma->idev)) + igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); + else + 
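+ /* MLDv2: send a current-state report for this group only */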
mld_send_report(ma->idev, ma); + + spin_lock(&ma->mca_lock); + ma->mca_flags |= MAF_LAST_REPORTER; + ma->mca_flags &= ~MAF_TIMER_RUNNING; + spin_unlock(&ma->mca_lock); + ma_put(ma); +} + +/* Device going down */ + +void ipv6_mc_down(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Withdraw multicast list */ + + read_lock_bh(&idev->lock); + idev->mc_ifc_count = 0; + if (del_timer(&idev->mc_ifc_timer)) + __in6_dev_put(idev); + idev->mc_gq_running = 0; + if (del_timer(&idev->mc_gq_timer)) + __in6_dev_put(idev); + + for (i = idev->mc_list; i; i=i->next) + igmp6_group_dropped(i); + read_unlock_bh(&idev->lock); + + mld_clear_delrec(idev); +} + + +/* Device going up */ + +void ipv6_mc_up(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + + /* Install multicast list, except for all-nodes (already installed) */ + + read_lock_bh(&idev->lock); + for (i = idev->mc_list; i; i=i->next) + igmp6_group_added(i); + read_unlock_bh(&idev->lock); +} + +/* IPv6 device initialization. */ + +void ipv6_mc_init_dev(struct inet6_dev *idev) +{ + struct in6_addr maddr; + + write_lock_bh(&idev->lock); + rwlock_init(&idev->mc_lock); + idev->mc_gq_running = 0; + init_timer(&idev->mc_gq_timer); + idev->mc_gq_timer.data = (unsigned long) idev; + idev->mc_gq_timer.function = &mld_gq_timer_expire; + idev->mc_tomb = NULL; + idev->mc_ifc_count = 0; + init_timer(&idev->mc_ifc_timer); + idev->mc_ifc_timer.data = (unsigned long) idev; + idev->mc_ifc_timer.function = &mld_ifc_timer_expire; + idev->mc_qrv = MLD_QRV_DEFAULT; + idev->mc_maxdelay = IGMP6_UNSOLICITED_IVAL; + idev->mc_v1_seen = 0; + write_unlock_bh(&idev->lock); + + /* Add all-nodes address. */ + ipv6_addr_all_nodes(&maddr); + ipv6_dev_mc_inc(idev->dev, &maddr); +} + +/* + * Device is about to be destroyed: clean up. + */ + +void ipv6_mc_destroy_dev(struct inet6_dev *idev) +{ + struct ifmcaddr6 *i; + struct in6_addr maddr; + + /* Deactivate timers */ + ipv6_mc_down(idev); + + /* Delete all-nodes address. */ + ipv6_addr_all_nodes(&maddr); + + /* We cannot call ipv6_dev_mc_dec() directly, our caller in + * addrconf.c has NULL'd out dev->ip6_ptr so in6_dev_get() will + * fail. 
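+ * Call __ipv6_dev_mc_dec() directly with the idev reference we still hold.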
+ */ + __ipv6_dev_mc_dec(idev, &maddr); + + if (idev->cnf.forwarding) { + ipv6_addr_all_routers(&maddr); + __ipv6_dev_mc_dec(idev, &maddr); + } + + write_lock_bh(&idev->lock); + while ((i = idev->mc_list) != NULL) { + idev->mc_list = i->next; + write_unlock_bh(&idev->lock); + + igmp6_group_dropped(i); + ma_put(i); + + write_lock_bh(&idev->lock); + } + write_unlock_bh(&idev->lock); +} + +#ifdef CONFIG_PROC_FS +struct igmp6_mc_iter_state { + struct net_device *dev; + struct inet6_dev *idev; +}; + +#define igmp6_mc_seq_private(seq) ((struct igmp6_mc_iter_state *)(seq)->private) + +static inline struct ifmcaddr6 *igmp6_mc_get_first(struct seq_file *seq) +{ + struct ifmcaddr6 *im = NULL; + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL; + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; + idev = in6_dev_get(state->dev); + if (!idev) + continue; + read_lock_bh(&idev->lock); + im = idev->mc_list; + if (im) { + state->idev = idev; + break; + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + return im; +} + +static struct ifmcaddr6 *igmp6_mc_get_next(struct seq_file *seq, struct ifmcaddr6 *im) +{ + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + im = im->next; + while (!im) { + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + break; + } + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + im = state->idev->mc_list; + } + return im; +} + +static struct ifmcaddr6 *igmp6_mc_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ifmcaddr6 *im = igmp6_mc_get_first(seq); + if (im) + while (pos && (im = igmp6_mc_get_next(seq, im)) != NULL) + --pos; + return pos ? NULL : im; +} + +static void *igmp6_mc_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return igmp6_mc_get_idx(seq, *pos); +} + +static void *igmp6_mc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ifmcaddr6 *im; + im = igmp6_mc_get_next(seq, v); + ++*pos; + return im; +} + +static void igmp6_mc_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + state->idev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp6_mc_seq_show(struct seq_file *seq, void *v) +{ + struct ifmcaddr6 *im = (struct ifmcaddr6 *)v; + struct igmp6_mc_iter_state *state = igmp6_mc_seq_private(seq); + + seq_printf(seq, + "%-4d %-15s %04x%04x%04x%04x%04x%04x%04x%04x %5d %08X %ld\n", + state->dev->ifindex, state->dev->name, + NIP6(im->mca_addr), + im->mca_users, im->mca_flags, + (im->mca_flags&MAF_TIMER_RUNNING) ? 
+ jiffies_to_clock_t(im->mca_timer.expires-jiffies) : 0); + return 0; +} + +static struct seq_operations igmp6_mc_seq_ops = { + .start = igmp6_mc_seq_start, + .next = igmp6_mc_seq_next, + .stop = igmp6_mc_seq_stop, + .show = igmp6_mc_seq_show, +}; + +static int igmp6_mc_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp6_mc_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &igmp6_mc_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp6_mc_seq_fops = { + .owner = THIS_MODULE, + .open = igmp6_mc_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +struct igmp6_mcf_iter_state { + struct net_device *dev; + struct inet6_dev *idev; + struct ifmcaddr6 *im; +}; + +#define igmp6_mcf_seq_private(seq) ((struct igmp6_mcf_iter_state *)(seq)->private) + +static inline struct ip6_sf_list *igmp6_mcf_get_first(struct seq_file *seq) +{ + struct ip6_sf_list *psf = NULL; + struct ifmcaddr6 *im = NULL; + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + for (state->dev = dev_base, state->idev = NULL, state->im = NULL; + state->dev; + state->dev = state->dev->next) { + struct inet6_dev *idev; + idev = in6_dev_get(state->dev); + if (unlikely(idev == NULL)) + continue; + read_lock_bh(&idev->lock); + im = idev->mc_list; + if (likely(im != NULL)) { + spin_lock_bh(&im->mca_lock); + psf = im->mca_sources; + if (likely(psf != NULL)) { + state->im = im; + state->idev = idev; + break; + } + spin_unlock_bh(&im->mca_lock); + } + read_unlock_bh(&idev->lock); + in6_dev_put(idev); + } + return psf; +} + +static struct ip6_sf_list *igmp6_mcf_get_next(struct seq_file *seq, struct ip6_sf_list *psf) +{ + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + psf = psf->sf_next; + while (!psf) { + spin_unlock_bh(&state->im->mca_lock); + state->im = state->im->next; + while (!state->im) { + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + } + state->dev = state->dev->next; + if (!state->dev) { + state->idev = NULL; + goto out; + } + state->idev = in6_dev_get(state->dev); + if (!state->idev) + continue; + read_lock_bh(&state->idev->lock); + state->im = state->idev->mc_list; + } + if (!state->im) + break; + spin_lock_bh(&state->im->mca_lock); + psf = state->im->mca_sources; + } +out: + return psf; +} + +static struct ip6_sf_list *igmp6_mcf_get_idx(struct seq_file *seq, loff_t pos) +{ + struct ip6_sf_list *psf = igmp6_mcf_get_first(seq); + if (psf) + while (pos && (psf = igmp6_mcf_get_next(seq, psf)) != NULL) + --pos; + return pos ? NULL : psf; +} + +static void *igmp6_mcf_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&dev_base_lock); + return *pos ? 
igmp6_mcf_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *igmp6_mcf_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ip6_sf_list *psf; + if (v == SEQ_START_TOKEN) + psf = igmp6_mcf_get_first(seq); + else + psf = igmp6_mcf_get_next(seq, v); + ++*pos; + return psf; +} + +static void igmp6_mcf_seq_stop(struct seq_file *seq, void *v) +{ + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + if (likely(state->im != NULL)) { + spin_unlock_bh(&state->im->mca_lock); + state->im = NULL; + } + if (likely(state->idev != NULL)) { + read_unlock_bh(&state->idev->lock); + in6_dev_put(state->idev); + state->idev = NULL; + } + state->dev = NULL; + read_unlock(&dev_base_lock); +} + +static int igmp6_mcf_seq_show(struct seq_file *seq, void *v) +{ + struct ip6_sf_list *psf = (struct ip6_sf_list *)v; + struct igmp6_mcf_iter_state *state = igmp6_mcf_seq_private(seq); + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, + "%3s %6s " + "%32s %32s %6s %6s\n", "Idx", + "Device", "Multicast Address", + "Source Address", "INC", "EXC"); + } else { + seq_printf(seq, + "%3d %6.6s " + "%04x%04x%04x%04x%04x%04x%04x%04x " + "%04x%04x%04x%04x%04x%04x%04x%04x " + "%6lu %6lu\n", + state->dev->ifindex, state->dev->name, + NIP6(state->im->mca_addr), + NIP6(psf->sf_addr), + psf->sf_count[MCAST_INCLUDE], + psf->sf_count[MCAST_EXCLUDE]); + } + return 0; +} + +static struct seq_operations igmp6_mcf_seq_ops = { + .start = igmp6_mcf_seq_start, + .next = igmp6_mcf_seq_next, + .stop = igmp6_mcf_seq_stop, + .show = igmp6_mcf_seq_show, +}; + +static int igmp6_mcf_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct igmp6_mcf_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + rc = seq_open(file, &igmp6_mcf_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations igmp6_mcf_seq_fops = { + .owner = THIS_MODULE, + .open = igmp6_mcf_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; +#endif + +int __init igmp6_init(struct net_proto_family *ops) +{ + struct ipv6_pinfo *np; + struct sock *sk; + int err; + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &igmp6_socket); + if (err < 0) { + printk(KERN_ERR + "Failed to initialize the IGMP6 control socket (err %d).\n", + err); + igmp6_socket = NULL; /* For safety. 
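+ * nothing should use a socket pointer left over from a failed init.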
*/ + return err; + } + + sk = igmp6_socket->sk; + sk->sk_allocation = GFP_ATOMIC; + sk->sk_prot->unhash(sk); + + np = inet6_sk(sk); + np->hop_limit = 1; + +#ifdef CONFIG_PROC_FS + proc_net_fops_create("igmp6", S_IRUGO, &igmp6_mc_seq_fops); + proc_net_fops_create("mcfilter6", S_IRUGO, &igmp6_mcf_seq_fops); +#endif + + return 0; +} + +void igmp6_cleanup(void) +{ + sock_release(igmp6_socket); + igmp6_socket = NULL; /* for safety */ + +#ifdef CONFIG_PROC_FS + proc_net_remove("mcfilter6"); + proc_net_remove("igmp6"); +#endif +} diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c new file mode 100644 index 000000000000..7c291f4e9edc --- /dev/null +++ b/net/ipv6/ndisc.c @@ -0,0 +1,1690 @@ +/* + * Neighbour Discovery for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Mike Shaver + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Lars Fenneberg : fixed MTU setting on receipt + * of an RA. + * + * Janos Farkas : kmalloc failure checks + * Alexey Kuznetsov : state machine reworked + * and moved to net/core. + * Pekka Savola : RFC2461 validation + * YOSHIFUJI Hideaki @USAGI : Verify ND options properly + */ + +/* Set to 3 to get tracing... */ +#define ND_DEBUG 1 + +#define ND_PRINTK(fmt, args...) do { if (net_ratelimit()) { printk(fmt, ## args); } } while(0) +#define ND_NOPRINTK(x...) do { ; } while(0) +#define ND_PRINTK0 ND_PRINTK +#define ND_PRINTK1 ND_NOPRINTK +#define ND_PRINTK2 ND_NOPRINTK +#define ND_PRINTK3 ND_NOPRINTK +#if ND_DEBUG >= 1 +#undef ND_PRINTK1 +#define ND_PRINTK1 ND_PRINTK +#endif +#if ND_DEBUG >= 2 +#undef ND_PRINTK2 +#define ND_PRINTK2 ND_PRINTK +#endif +#if ND_DEBUG >= 3 +#undef ND_PRINTK3 +#define ND_PRINTK3 ND_PRINTK +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_SYSCTL +#include +#endif + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static struct socket *ndisc_socket; + +static u32 ndisc_hash(const void *pkey, const struct net_device *dev); +static int ndisc_constructor(struct neighbour *neigh); +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); +static int pndisc_constructor(struct pneigh_entry *n); +static void pndisc_destructor(struct pneigh_entry *n); +static void pndisc_redo(struct sk_buff *skb); + +static struct neigh_ops ndisc_generic_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_connected_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +static struct neigh_ops ndisc_hh_ops = { + .family = AF_INET6, + .solicit = ndisc_solicit, + .error_report = ndisc_error_report, + .output = neigh_resolve_output, + .connected_output = neigh_resolve_output, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + + +static struct neigh_ops ndisc_direct_ops = { + .family = AF_INET6, + .output = dev_queue_xmit, + .connected_output = dev_queue_xmit, + .hh_output = dev_queue_xmit, + .queue_xmit = dev_queue_xmit, +}; + +struct neigh_table nd_tbl = { + 
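+ /* IPv6 neighbour (ND) cache: each entry stores its in6_addr key right behind struct neighbour, per entry_size/key_len below. */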
.family = AF_INET6, + .entry_size = sizeof(struct neighbour) + sizeof(struct in6_addr), + .key_len = sizeof(struct in6_addr), + .hash = ndisc_hash, + .constructor = ndisc_constructor, + .pconstructor = pndisc_constructor, + .pdestructor = pndisc_destructor, + .proxy_redo = pndisc_redo, + .id = "ndisc_cache", + .parms = { + .tbl = &nd_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + }, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, +}; + +/* ND options */ +struct ndisc_options { + struct nd_opt_hdr *nd_opt_array[__ND_OPT_MAX]; +}; + +#define nd_opts_src_lladdr nd_opt_array[ND_OPT_SOURCE_LL_ADDR] +#define nd_opts_tgt_lladdr nd_opt_array[ND_OPT_TARGET_LL_ADDR] +#define nd_opts_pi nd_opt_array[ND_OPT_PREFIX_INFO] +#define nd_opts_pi_end nd_opt_array[__ND_OPT_PREFIX_INFO_END] +#define nd_opts_rh nd_opt_array[ND_OPT_REDIRECT_HDR] +#define nd_opts_mtu nd_opt_array[ND_OPT_MTU] + +#define NDISC_OPT_SPACE(len) (((len)+2+7)&~7) + +/* + * Return the padding between the option length and the start of the + * link addr. Currently only IP-over-InfiniBand needs this, although + * if RFC 3831 IPv6-over-Fibre Channel is ever implemented it may + * also need a pad of 2. + */ +static int ndisc_addr_option_pad(unsigned short type) +{ + switch (type) { + case ARPHRD_INFINIBAND: return 2; + default: return 0; + } +} + +static inline int ndisc_opt_addr_space(struct net_device *dev) +{ + return NDISC_OPT_SPACE(dev->addr_len + ndisc_addr_option_pad(dev->type)); +} + +static u8 *ndisc_fill_addr_option(u8 *opt, int type, void *data, int data_len, + unsigned short addr_type) +{ + int space = NDISC_OPT_SPACE(data_len); + int pad = ndisc_addr_option_pad(addr_type); + + opt[0] = type; + opt[1] = space>>3; + + memset(opt + 2, 0, pad); + opt += pad; + space -= pad; + + memcpy(opt+2, data, data_len); + data_len += 2; + opt += data_len; + if ((space -= data_len) > 0) + memset(opt, 0, space); + return opt + space; +} + +static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, + struct nd_opt_hdr *end) +{ + int type; + if (!cur || !end || cur >= end) + return NULL; + type = cur->nd_opt_type; + do { + cur = ((void *)cur) + (cur->nd_opt_len << 3); + } while(cur < end && cur->nd_opt_type != type); + return (cur <= end && cur->nd_opt_type == type ? 
cur : NULL); +} + +static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, + struct ndisc_options *ndopts) +{ + struct nd_opt_hdr *nd_opt = (struct nd_opt_hdr *)opt; + + if (!nd_opt || opt_len < 0 || !ndopts) + return NULL; + memset(ndopts, 0, sizeof(*ndopts)); + while (opt_len) { + int l; + if (opt_len < sizeof(struct nd_opt_hdr)) + return NULL; + l = nd_opt->nd_opt_len << 3; + if (opt_len < l || l == 0) + return NULL; + switch (nd_opt->nd_opt_type) { + case ND_OPT_SOURCE_LL_ADDR: + case ND_OPT_TARGET_LL_ADDR: + case ND_OPT_MTU: + case ND_OPT_REDIRECT_HDR: + if (ndopts->nd_opt_array[nd_opt->nd_opt_type]) { + ND_PRINTK2(KERN_WARNING + "%s(): duplicated ND6 option found: type=%d\n", + __FUNCTION__, + nd_opt->nd_opt_type); + } else { + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + } + break; + case ND_OPT_PREFIX_INFO: + ndopts->nd_opts_pi_end = nd_opt; + if (ndopts->nd_opt_array[nd_opt->nd_opt_type] == 0) + ndopts->nd_opt_array[nd_opt->nd_opt_type] = nd_opt; + break; + default: + /* + * Unknown options must be silently ignored, + * to accommodate future extension to the protocol. + */ + ND_PRINTK2(KERN_NOTICE + "%s(): ignored unsupported option; type=%d, len=%d\n", + __FUNCTION__, + nd_opt->nd_opt_type, nd_opt->nd_opt_len); + } + opt_len -= l; + nd_opt = ((void *)nd_opt) + l; + } + return ndopts; +} + +static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, + struct net_device *dev) +{ + u8 *lladdr = (u8 *)(p + 1); + int lladdrlen = p->nd_opt_len << 3; + int prepad = ndisc_addr_option_pad(dev->type); + if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) + return NULL; + return (lladdr + prepad); +} + +int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir) +{ + switch (dev->type) { + case ARPHRD_ETHER: + case ARPHRD_IEEE802: /* Not sure. Check it later. --ANK */ + case ARPHRD_FDDI: + ipv6_eth_mc_map(addr, buf); + return 0; + case ARPHRD_IEEE802_TR: + ipv6_tr_mc_map(addr,buf); + return 0; + case ARPHRD_ARCNET: + ipv6_arcnet_mc_map(addr, buf); + return 0; + case ARPHRD_INFINIBAND: + ipv6_ib_mc_map(addr, buf); + return 0; + default: + if (dir) { + memcpy(buf, dev->broadcast, dev->addr_len); + return 0; + } + } + return -EINVAL; +} + +static u32 ndisc_hash(const void *pkey, const struct net_device *dev) +{ + const u32 *p32 = pkey; + u32 addr_hash, i; + + addr_hash = 0; + for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) + addr_hash ^= *p32++; + + return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); +} + +static int ndisc_constructor(struct neighbour *neigh) +{ + struct in6_addr *addr = (struct in6_addr*)&neigh->primary_key; + struct net_device *dev = neigh->dev; + struct inet6_dev *in6_dev; + struct neigh_parms *parms; + int is_multicast = ipv6_addr_is_multicast(addr); + + rcu_read_lock(); + in6_dev = in6_dev_get(dev); + if (in6_dev == NULL) { + rcu_read_unlock(); + return -EINVAL; + } + + parms = in6_dev->nd_parms; + __neigh_parms_put(neigh->parms); + neigh->parms = neigh_parms_clone(parms); + rcu_read_unlock(); + + neigh->type = is_multicast ? 
RTN_MULTICAST : RTN_UNICAST; + if (dev->hard_header == NULL) { + neigh->nud_state = NUD_NOARP; + neigh->ops = &ndisc_direct_ops; + neigh->output = neigh->ops->queue_xmit; + } else { + if (is_multicast) { + neigh->nud_state = NUD_NOARP; + ndisc_mc_map(addr, neigh->ha, dev, 1); + } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->dev_addr, dev->addr_len); + if (dev->flags&IFF_LOOPBACK) + neigh->type = RTN_LOCAL; + } else if (dev->flags&IFF_POINTOPOINT) { + neigh->nud_state = NUD_NOARP; + memcpy(neigh->ha, dev->broadcast, dev->addr_len); + } + if (dev->hard_header_cache) + neigh->ops = &ndisc_hh_ops; + else + neigh->ops = &ndisc_generic_ops; + if (neigh->nud_state&NUD_VALID) + neigh->output = neigh->ops->connected_output; + else + neigh->output = neigh->ops->output; + } + in6_dev_put(in6_dev); + return 0; +} + +static int pndisc_constructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return -EINVAL; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_inc(dev, &maddr); + return 0; +} + +static void pndisc_destructor(struct pneigh_entry *n) +{ + struct in6_addr *addr = (struct in6_addr*)&n->key; + struct in6_addr maddr; + struct net_device *dev = n->dev; + + if (dev == NULL || __in6_dev_get(dev) == NULL) + return; + addrconf_addr_solict_mult(addr, &maddr); + ipv6_dev_mc_dec(dev, &maddr); +} + +/* + * Send a Neighbour Advertisement + */ + +static inline void ndisc_flow_init(struct flowi *fl, u8 type, + struct in6_addr *saddr, struct in6_addr *daddr) +{ + memset(fl, 0, sizeof(*fl)); + ipv6_addr_copy(&fl->fl6_src, saddr); + ipv6_addr_copy(&fl->fl6_dst, daddr); + fl->proto = IPPROTO_ICMPV6; + fl->fl_icmp_type = type; + fl->fl_icmp_code = 0; +} + +static void ndisc_send_na(struct net_device *dev, struct neighbour *neigh, + struct in6_addr *daddr, struct in6_addr *solicited_addr, + int router, int solicited, int override, int inc_opt) +{ + struct in6_addr tmpaddr; + struct inet6_ifaddr *ifp; + struct inet6_dev *idev; + struct flowi fl; + struct dst_entry* dst; + struct sock *sk = ndisc_socket->sk; + struct in6_addr *src_addr; + struct nd_msg *msg; + int len; + struct sk_buff *skb; + int err; + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + + /* for anycast or proxy, solicited_addr != src_addr */ + ifp = ipv6_get_ifaddr(solicited_addr, dev, 1); + if (ifp) { + src_addr = solicited_addr; + in6_ifa_put(ifp); + } else { + if (ipv6_dev_get_saddr(dev, daddr, &tmpaddr)) + return; + src_addr = &tmpaddr; + } + + ndisc_flow_init(&fl, NDISC_NEIGHBOUR_ADVERTISEMENT, src_addr, daddr); + + dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + + if (inc_opt) { + if (dev->addr_len) + len += ndisc_opt_addr_space(dev); + else + inc_opt = 0; + } + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + + if (skb == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NA: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, src_addr, daddr, IPPROTO_ICMPV6, len); + + msg = (struct nd_msg *)skb_put(skb, len); + skb->h.raw = (unsigned char*)msg; + + msg->icmph.icmp6_type = NDISC_NEIGHBOUR_ADVERTISEMENT; + msg->icmph.icmp6_code = 0; + msg->icmph.icmp6_cksum = 0; + 
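+ /* advertisement flags: router, solicited and override as requested by the caller */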
+ msg->icmph.icmp6_unused = 0; + msg->icmph.icmp6_router = router; + msg->icmph.icmp6_solicited = solicited; + msg->icmph.icmp6_override = !!override; + + /* Set the target address. */ + ipv6_addr_copy(&msg->target, solicited_addr); + + if (inc_opt) + ndisc_fill_addr_option(msg->opt, ND_OPT_TARGET_LL_ADDR, dev->dev_addr, + dev->addr_len, dev->type); + + /* checksum */ + msg->icmph.icmp6_cksum = csum_ipv6_magic(src_addr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) msg, + len, 0)); + + skb->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +void ndisc_send_ns(struct net_device *dev, struct neighbour *neigh, + struct in6_addr *solicit, + struct in6_addr *daddr, struct in6_addr *saddr) +{ + struct flowi fl; + struct dst_entry* dst; + struct inet6_dev *idev; + struct sock *sk = ndisc_socket->sk; + struct sk_buff *skb; + struct nd_msg *msg; + struct in6_addr addr_buf; + int len; + int err; + int send_llinfo; + + if (saddr == NULL) { + if (ipv6_get_lladdr(dev, &addr_buf)) + return; + saddr = &addr_buf; + } + + ndisc_flow_init(&fl, NDISC_NEIGHBOUR_SOLICITATION, saddr, daddr); + + dst = ndisc_dst_alloc(dev, neigh, daddr, ip6_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + + len = sizeof(struct icmp6hdr) + sizeof(struct in6_addr); + send_llinfo = dev->addr_len && !ipv6_addr_any(saddr); + if (send_llinfo) + len += ndisc_opt_addr_space(dev); + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + if (skb == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NA: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + + msg = (struct nd_msg *)skb_put(skb, len); + skb->h.raw = (unsigned char*)msg; + msg->icmph.icmp6_type = NDISC_NEIGHBOUR_SOLICITATION; + msg->icmph.icmp6_code = 0; + msg->icmph.icmp6_cksum = 0; + msg->icmph.icmp6_unused = 0; + + /* Set the target address. */ + ipv6_addr_copy(&msg->target, solicit); + + if (send_llinfo) + ndisc_fill_addr_option(msg->opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, + dev->addr_len, dev->type); + + /* checksum */ + msg->icmph.icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, + daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) msg, + len, 0)); + /* send it! 
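+ * through the netfilter NF_IP6_LOCAL_OUT hook and dst_output()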
*/ + skb->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTNEIGHBORSOLICITS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +void ndisc_send_rs(struct net_device *dev, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + struct flowi fl; + struct dst_entry* dst; + struct inet6_dev *idev; + struct sock *sk = ndisc_socket->sk; + struct sk_buff *skb; + struct icmp6hdr *hdr; + __u8 * opt; + int len; + int err; + + ndisc_flow_init(&fl, NDISC_ROUTER_SOLICITATION, saddr, daddr); + + dst = ndisc_dst_alloc(dev, NULL, daddr, ip6_output); + if (!dst) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err < 0) { + dst_release(dst); + return; + } + + len = sizeof(struct icmp6hdr); + if (dev->addr_len) + len += ndisc_opt_addr_space(dev); + + skb = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + if (skb == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RS: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, skb, dev, saddr, daddr, IPPROTO_ICMPV6, len); + + hdr = (struct icmp6hdr *)skb_put(skb, len); + skb->h.raw = (unsigned char*)hdr; + hdr->icmp6_type = NDISC_ROUTER_SOLICITATION; + hdr->icmp6_code = 0; + hdr->icmp6_cksum = 0; + hdr->icmp6_unused = 0; + + opt = (u8*) (hdr + 1); + + if (dev->addr_len) + ndisc_fill_addr_option(opt, ND_OPT_SOURCE_LL_ADDR, dev->dev_addr, + dev->addr_len, dev->type); + + /* checksum */ + hdr->icmp6_cksum = csum_ipv6_magic(&skb->nh.ipv6h->saddr, daddr, len, + IPPROTO_ICMPV6, + csum_partial((__u8 *) hdr, len, 0)); + + /* send it! 
*/ + skb->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTROUTERSOLICITS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + + +static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb) +{ + /* + * "The sender MUST return an ICMP + * destination unreachable" + */ + dst_link_failure(skb); + kfree_skb(skb); +} + +/* Called with locked neigh: either read or both */ + +static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb) +{ + struct in6_addr *saddr = NULL; + struct in6_addr mcaddr; + struct net_device *dev = neigh->dev; + struct in6_addr *target = (struct in6_addr *)&neigh->primary_key; + int probes = atomic_read(&neigh->probes); + + if (skb && ipv6_chk_addr(&skb->nh.ipv6h->saddr, dev, 1)) + saddr = &skb->nh.ipv6h->saddr; + + if ((probes -= neigh->parms->ucast_probes) < 0) { + if (!(neigh->nud_state & NUD_VALID)) { + ND_PRINTK1(KERN_DEBUG + "%s(): trying to ucast probe in NUD_INVALID: " + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + __FUNCTION__, + NIP6(*target)); + } + ndisc_send_ns(dev, neigh, target, target, saddr); + } else if ((probes -= neigh->parms->app_probes) < 0) { +#ifdef CONFIG_ARPD + neigh_app_ns(neigh); +#endif + } else { + addrconf_addr_solict_mult(target, &mcaddr); + ndisc_send_ns(dev, NULL, target, &mcaddr, saddr); + } +} + +static void ndisc_recv_ns(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb->h.raw; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - msg->opt; + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct inet6_dev *idev = NULL; + struct neighbour *neigh; + int dad = ipv6_addr_any(saddr); + int inc; + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: multicast target address"); + return; + } + + /* + * RFC2461 7.1.1: + * DAD has to be destined for solicited node multicast address. + */ + if (dad && + !(daddr->s6_addr32[0] == htonl(0xff020000) && + daddr->s6_addr32[1] == htonl(0x00000000) && + daddr->s6_addr32[2] == htonl(0x00000001) && + daddr->s6_addr [12] == 0xff )) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (wrong destination)\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND options\n"); + return; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid link-layer address length\n"); + return; + } + + /* RFC2461 7.1.1: + * If the IP source address is the unspecified address, + * there MUST NOT be source link-layer address option + * in the message. + */ + if (dad) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: bad DAD packet (link-layer address option)\n"); + return; + } + } + + inc = ipv6_addr_is_multicast(daddr); + + if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1)) != NULL) { + if (ifp->flags & IFA_F_TENTATIVE) { + /* Address is tentative. If the source + is unspecified address, it is someone + does DAD, otherwise we ignore solicitations + until DAD timer expires. 
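   A DAD probe for one of our own tentative addresses normally means an
   address collision, but token ring hardware can loop our own frame
   back to us, so the source MAC below is compared with dev_addr before
   the address is declared failed.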
+ */ + if (!dad) + goto out; + if (dev->type == ARPHRD_IEEE802_TR) { + unsigned char *sadr = skb->mac.raw; + if (((sadr[8] ^ dev->dev_addr[0]) & 0x7f) == 0 && + sadr[9] == dev->dev_addr[1] && + sadr[10] == dev->dev_addr[2] && + sadr[11] == dev->dev_addr[3] && + sadr[12] == dev->dev_addr[4] && + sadr[13] == dev->dev_addr[5]) { + /* looped-back to us */ + goto out; + } + } + addrconf_dad_failure(ifp); + return; + } + + idev = ifp->idev; + } else { + idev = in6_dev_get(dev); + if (!idev) { + /* XXX: count this drop? */ + return; + } + + if (ipv6_chk_acast_addr(dev, &msg->target) || + (idev->cnf.forwarding && + pneigh_lookup(&nd_tbl, &msg->target, dev, 0))) { + if (skb->stamp.tv_sec != LOCALLY_ENQUEUED && + skb->pkt_type != PACKET_HOST && + inc != 0 && + idev->nd_parms->proxy_delay != 0) { + /* + * for anycast or proxy, + * sender should delay its response + * by a random time between 0 and + * MAX_ANYCAST_DELAY_TIME seconds. + * (RFC2461) -- yoshfuji + */ + struct sk_buff *n = skb_clone(skb, GFP_ATOMIC); + if (n) + pneigh_enqueue(&nd_tbl, idev->nd_parms, n); + goto out; + } + } else + goto out; + } + + if (dad) { + struct in6_addr maddr; + + ipv6_addr_all_nodes(&maddr); + ndisc_send_na(dev, NULL, &maddr, &msg->target, + idev->cnf.forwarding, 0, (ifp != NULL), 1); + goto out; + } + + if (inc) + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_mcast); + else + NEIGH_CACHE_STAT_INC(&nd_tbl, rcv_probes_ucast); + + /* + * update / create cache entry + * for the source address + */ + neigh = __neigh_lookup(&nd_tbl, saddr, dev, + !inc || lladdr || !dev->addr_len); + if (neigh) + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE); + if (neigh || !dev->hard_header) { + ndisc_send_na(dev, neigh, saddr, &msg->target, + idev->cnf.forwarding, + 1, (ifp != NULL && inc), inc); + if (neigh) + neigh_release(neigh); + } + +out: + if (ifp) + in6_ifa_put(ifp); + else + in6_dev_put(idev); + + return; +} + +static void ndisc_recv_na(struct sk_buff *skb) +{ + struct nd_msg *msg = (struct nd_msg *)skb->h.raw; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct in6_addr *daddr = &skb->nh.ipv6h->daddr; + u8 *lladdr = NULL; + u32 ndoptlen = skb->tail - msg->opt; + struct ndisc_options ndopts; + struct net_device *dev = skb->dev; + struct inet6_ifaddr *ifp; + struct neighbour *neigh; + + if (skb->len < sizeof(struct nd_msg)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: packet too short\n"); + return; + } + + if (ipv6_addr_is_multicast(&msg->target)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: target address is multicast.\n"); + return; + } + + if (ipv6_addr_is_multicast(daddr) && + msg->icmph.icmp6_solicited) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: solicited NA is multicasted.\n"); + return; + } + + if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NS: invalid ND option\n"); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NA: invalid link-layer address length\n"); + return; + } + } + if ((ifp = ipv6_get_ifaddr(&msg->target, dev, 1))) { + if (ifp->flags & IFA_F_TENTATIVE) { + addrconf_dad_failure(ifp); + return; + } + /* What should we make now? The advertisement + is invalid, but ndisc specs say nothing + about it. 
It could be misconfiguration, or + an smart proxy agent tries to help us :-) + */ + ND_PRINTK1(KERN_WARNING + "ICMPv6 NA: someone advertises our address on %s!\n", + ifp->idev->dev->name); + in6_ifa_put(ifp); + return; + } + neigh = neigh_lookup(&nd_tbl, &msg->target, dev); + + if (neigh) { + u8 old_flags = neigh->flags; + + if (neigh->nud_state & NUD_FAILED) + goto out; + + neigh_update(neigh, lladdr, + msg->icmph.icmp6_solicited ? NUD_REACHABLE : NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + (msg->icmph.icmp6_override ? NEIGH_UPDATE_F_OVERRIDE : 0)| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + (msg->icmph.icmp6_router ? NEIGH_UPDATE_F_ISROUTER : 0)); + + if ((old_flags & ~neigh->flags) & NTF_ROUTER) { + /* + * Change: router to host + */ + struct rt6_info *rt; + rt = rt6_get_dflt_router(saddr, dev); + if (rt) + ip6_del_rt(rt, NULL, NULL); + } + +out: + neigh_release(neigh); + } +} + +static void ndisc_recv_rs(struct sk_buff *skb) +{ + struct rs_msg *rs_msg = (struct rs_msg *) skb->h.raw; + unsigned long ndoptlen = skb->len - sizeof(*rs_msg); + struct neighbour *neigh; + struct inet6_dev *idev; + struct in6_addr *saddr = &skb->nh.ipv6h->saddr; + struct ndisc_options ndopts; + u8 *lladdr = NULL; + + if (skb->len < sizeof(*rs_msg)) + return; + + idev = in6_dev_get(skb->dev); + if (!idev) { + if (net_ratelimit()) + ND_PRINTK1("ICMP6 RS: can't find in6 device\n"); + return; + } + + /* Don't accept RS if we're not in router mode */ + if (!idev->cnf.forwarding) + goto out; + + /* + * Don't update NCE if src = ::; + * this implies that the source node has no ip address assigned yet. + */ + if (ipv6_addr_any(saddr)) + goto out; + + /* Parse ND options */ + if (!ndisc_parse_options(rs_msg->opt, ndoptlen, &ndopts)) { + if (net_ratelimit()) + ND_PRINTK2("ICMP6 NS: invalid ND option, ignored\n"); + goto out; + } + + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) + goto out; + } + + neigh = __neigh_lookup(&nd_tbl, saddr, skb->dev, 1); + if (neigh) { + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER); + neigh_release(neigh); + } +out: + in6_dev_put(idev); +} + +static void ndisc_router_discovery(struct sk_buff *skb) +{ + struct ra_msg *ra_msg = (struct ra_msg *) skb->h.raw; + struct neighbour *neigh = NULL; + struct inet6_dev *in6_dev; + struct rt6_info *rt; + int lifetime; + struct ndisc_options ndopts; + int optlen; + + __u8 * opt = (__u8 *)(ra_msg + 1); + + optlen = (skb->tail - skb->h.raw) - sizeof(struct ra_msg); + + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: source address is not link-local.\n"); + return; + } + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: packet too short\n"); + return; + } + + /* + * set the RA_RECV flag in the interface + */ + + in6_dev = in6_dev_get(skb->dev); + if (in6_dev == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: can't find inet6 device for %s.\n", + skb->dev->name); + return; + } + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) { + in6_dev_put(in6_dev); + return; + } + + if (!ndisc_parse_options(opt, optlen, &ndopts)) { + in6_dev_put(in6_dev); + ND_PRINTK2(KERN_WARNING + "ICMP6 RA: invalid ND options\n"); + return; + } + + if (in6_dev->if_flags & IF_RS_SENT) { + /* + * flag that an RA was received after an RS was sent + * out on this interface. 
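 * The flag lets the address autoconfiguration code stop retransmitting
 * router solicitations once an answer has arrived.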
+ */ + in6_dev->if_flags |= IF_RA_RCVD; + } + + /* + * Remember the managed/otherconf flags from most recently + * received RA message (RFC 2462) -- yoshfuji + */ + in6_dev->if_flags = (in6_dev->if_flags & ~(IF_RA_MANAGED | + IF_RA_OTHERCONF)) | + (ra_msg->icmph.icmp6_addrconf_managed ? + IF_RA_MANAGED : 0) | + (ra_msg->icmph.icmp6_addrconf_other ? + IF_RA_OTHERCONF : 0); + + lifetime = ntohs(ra_msg->icmph.icmp6_rt_lifetime); + + rt = rt6_get_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); + + if (rt) + neigh = rt->rt6i_nexthop; + + if (rt && lifetime == 0) { + neigh_clone(neigh); + ip6_del_rt(rt, NULL, NULL); + rt = NULL; + } + + if (rt == NULL && lifetime) { + ND_PRINTK3(KERN_DEBUG + "ICMPv6 RA: adding default router.\n"); + + rt = rt6_add_dflt_router(&skb->nh.ipv6h->saddr, skb->dev); + if (rt == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() failed to add default route.\n", + __FUNCTION__); + in6_dev_put(in6_dev); + return; + } + + neigh = rt->rt6i_nexthop; + if (neigh == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 RA: %s() got default router without neighbour.\n", + __FUNCTION__); + dst_release(&rt->u.dst); + in6_dev_put(in6_dev); + return; + } + neigh->flags |= NTF_ROUTER; + } + + if (rt) + rt->rt6i_expires = jiffies + (HZ * lifetime); + + if (ra_msg->icmph.icmp6_hop_limit) { + in6_dev->cnf.hop_limit = ra_msg->icmph.icmp6_hop_limit; + if (rt) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = ra_msg->icmph.icmp6_hop_limit; + } + + /* + * Update Reachable Time and Retrans Timer + */ + + if (in6_dev->nd_parms) { + unsigned long rtime = ntohl(ra_msg->retrans_timer); + + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) { + rtime = (rtime*HZ)/1000; + if (rtime < HZ/10) + rtime = HZ/10; + in6_dev->nd_parms->retrans_time = rtime; + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + + rtime = ntohl(ra_msg->reachable_time); + if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/(3*HZ)) { + rtime = (rtime*HZ)/1000; + + if (rtime < HZ/10) + rtime = HZ/10; + + if (rtime != in6_dev->nd_parms->base_reachable_time) { + in6_dev->nd_parms->base_reachable_time = rtime; + in6_dev->nd_parms->gc_staletime = 3 * rtime; + in6_dev->nd_parms->reachable_time = neigh_rand_reach_time(rtime); + in6_dev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, in6_dev); + } + } + } + + /* + * Process options. 
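 * A source link-layer address option updates the router's neighbour
 * entry, prefix-information options are handed to addrconf_prefix_rcv(),
 * and an MTU option may update the interface and route MTU.  A target
 * link-layer address or redirected-header option does not belong in an
 * RA and is reported as invalid.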
+ */ + + if (!neigh) + neigh = __neigh_lookup(&nd_tbl, &skb->nh.ipv6h->saddr, + skb->dev, 1); + if (neigh) { + u8 *lladdr = NULL; + if (ndopts.nd_opts_src_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_src_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid link-layer address length\n"); + goto out; + } + } + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER); + } + + if (ndopts.nd_opts_pi) { + struct nd_opt_hdr *p; + for (p = ndopts.nd_opts_pi; + p; + p = ndisc_next_option(p, ndopts.nd_opts_pi_end)) { + addrconf_prefix_rcv(skb->dev, (u8*)p, (p->nd_opt_len) << 3); + } + } + + if (ndopts.nd_opts_mtu) { + u32 mtu; + + memcpy(&mtu, ((u8*)(ndopts.nd_opts_mtu+1))+2, sizeof(mtu)); + mtu = ntohl(mtu); + + if (mtu < IPV6_MIN_MTU || mtu > skb->dev->mtu) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid mtu: %d\n", + mtu); + } else if (in6_dev->cnf.mtu6 != mtu) { + in6_dev->cnf.mtu6 = mtu; + + if (rt) + rt->u.dst.metrics[RTAX_MTU-1] = mtu; + + rt6_mtu_change(skb->dev, mtu); + } + } + + if (ndopts.nd_opts_tgt_lladdr || ndopts.nd_opts_rh) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 RA: invalid RA options"); + } +out: + if (rt) + dst_release(&rt->u.dst); + else if (neigh) + neigh_release(neigh); + in6_dev_put(in6_dev); +} + +static void ndisc_redirect_rcv(struct sk_buff *skb) +{ + struct inet6_dev *in6_dev; + struct icmp6hdr *icmph; + struct in6_addr *dest; + struct in6_addr *target; /* new first hop to destination */ + struct neighbour *neigh; + int on_link = 0; + struct ndisc_options ndopts; + int optlen; + u8 *lladdr = NULL; + + if (!(ipv6_addr_type(&skb->nh.ipv6h->saddr) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: source address is not link-local.\n"); + return; + } + + optlen = skb->tail - skb->h.raw; + optlen -= sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + + if (optlen < 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: packet too short\n"); + return; + } + + icmph = (struct icmp6hdr *) skb->h.raw; + target = (struct in6_addr *) (icmph + 1); + dest = target + 1; + + if (ipv6_addr_is_multicast(dest)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination address is multicast.\n"); + return; + } + + if (ipv6_addr_equal(dest, target)) { + on_link = 1; + } else if (!(ipv6_addr_type(target) & IPV6_ADDR_LINKLOCAL)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: target address is not link-local.\n"); + return; + } + + in6_dev = in6_dev_get(skb->dev); + if (!in6_dev) + return; + if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects) { + in6_dev_put(in6_dev); + return; + } + + /* RFC2461 8.1: + * The IP source address of the Redirect MUST be the same as the current + * first-hop router for the specified ICMP Destination Address. 
+ */ + + if (!ndisc_parse_options((u8*)(dest + 1), optlen, &ndopts)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid ND options\n"); + in6_dev_put(in6_dev); + return; + } + if (ndopts.nd_opts_tgt_lladdr) { + lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr, + skb->dev); + if (!lladdr) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: invalid link-layer address length\n"); + in6_dev_put(in6_dev); + return; + } + } + + neigh = __neigh_lookup(&nd_tbl, target, skb->dev, 1); + if (neigh) { + rt6_redirect(dest, &skb->nh.ipv6h->saddr, neigh, lladdr, + on_link); + neigh_release(neigh); + } + in6_dev_put(in6_dev); +} + +void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, + struct in6_addr *target) +{ + struct sock *sk = ndisc_socket->sk; + int len = sizeof(struct icmp6hdr) + 2 * sizeof(struct in6_addr); + struct sk_buff *buff; + struct icmp6hdr *icmph; + struct in6_addr saddr_buf; + struct in6_addr *addrp; + struct net_device *dev; + struct rt6_info *rt; + struct dst_entry *dst; + struct inet6_dev *idev; + struct flowi fl; + u8 *opt; + int rd_len; + int err; + int hlen; + u8 ha_buf[MAX_ADDR_LEN], *ha = NULL; + + dev = skb->dev; + + if (ipv6_get_lladdr(dev, &saddr_buf)) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: no link-local address on %s\n", + dev->name); + return; + } + + ndisc_flow_init(&fl, NDISC_REDIRECT, &saddr_buf, &skb->nh.ipv6h->saddr); + + dst = ip6_route_output(NULL, &fl); + if (dst == NULL) + return; + + err = xfrm_lookup(&dst, &fl, NULL, 0); + if (err) { + dst_release(dst); + return; + } + + rt = (struct rt6_info *) dst; + + if (rt->rt6i_flags & RTF_GATEWAY) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 Redirect: destination is not a neighbour.\n"); + dst_release(dst); + return; + } + if (!xrlim_allow(dst, 1*HZ)) { + dst_release(dst); + return; + } + + if (dev->addr_len) { + read_lock_bh(&neigh->lock); + if (neigh->nud_state & NUD_VALID) { + memcpy(ha_buf, neigh->ha, dev->addr_len); + read_unlock_bh(&neigh->lock); + ha = ha_buf; + len += ndisc_opt_addr_space(dev); + } else + read_unlock_bh(&neigh->lock); + } + + rd_len = min_t(unsigned int, + IPV6_MIN_MTU-sizeof(struct ipv6hdr)-len, skb->len + 8); + rd_len &= ~0x7; + len += rd_len; + + buff = sock_alloc_send_skb(sk, MAX_HEADER + len + LL_RESERVED_SPACE(dev), + 1, &err); + if (buff == NULL) { + ND_PRINTK0(KERN_ERR + "ICMPv6 Redirect: %s() failed to allocate an skb.\n", + __FUNCTION__); + dst_release(dst); + return; + } + + hlen = 0; + + skb_reserve(buff, LL_RESERVED_SPACE(dev)); + ip6_nd_hdr(sk, buff, dev, &saddr_buf, &skb->nh.ipv6h->saddr, + IPPROTO_ICMPV6, len); + + icmph = (struct icmp6hdr *)skb_put(buff, len); + buff->h.raw = (unsigned char*)icmph; + + memset(icmph, 0, sizeof(struct icmp6hdr)); + icmph->icmp6_type = NDISC_REDIRECT; + + /* + * copy target and destination addresses + */ + + addrp = (struct in6_addr *)(icmph + 1); + ipv6_addr_copy(addrp, target); + addrp++; + ipv6_addr_copy(addrp, &skb->nh.ipv6h->daddr); + + opt = (u8*) (addrp + 1); + + /* + * include target_address option + */ + + if (ha) + opt = ndisc_fill_addr_option(opt, ND_OPT_TARGET_LL_ADDR, ha, + dev->addr_len, dev->type); + + /* + * build redirect option and copy skb over to the new packet. 
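 * The redirected-header option is a type byte, a length byte counted in
 * units of 8 octets, six reserved bytes, and then as much of the packet
 * that triggered the redirect as fits; rd_len was already rounded down
 * to a multiple of 8 above.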
+ */ + + memset(opt, 0, 8); + *(opt++) = ND_OPT_REDIRECT_HDR; + *(opt++) = (rd_len >> 3); + opt += 6; + + memcpy(opt, skb->nh.ipv6h, rd_len - 8); + + icmph->icmp6_cksum = csum_ipv6_magic(&saddr_buf, &skb->nh.ipv6h->saddr, + len, IPPROTO_ICMPV6, + csum_partial((u8 *) icmph, len, 0)); + + buff->dst = dst; + idev = in6_dev_get(dst->dev); + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, buff, NULL, dst->dev, dst_output); + if (!err) { + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTREDIRECTS); + ICMP6_INC_STATS(idev, ICMP6_MIB_OUTMSGS); + } + + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +static void pndisc_redo(struct sk_buff *skb) +{ + ndisc_rcv(skb); + kfree_skb(skb); +} + +int ndisc_rcv(struct sk_buff *skb) +{ + struct nd_msg *msg; + + if (!pskb_may_pull(skb, skb->len)) + return 0; + + msg = (struct nd_msg *) skb->h.raw; + + __skb_push(skb, skb->data-skb->h.raw); + + if (skb->nh.ipv6h->hop_limit != 255) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid hop-limit: %d\n", + skb->nh.ipv6h->hop_limit); + return 0; + } + + if (msg->icmph.icmp6_code != 0) { + ND_PRINTK2(KERN_WARNING + "ICMPv6 NDISC: invalid ICMPv6 code: %d\n", + msg->icmph.icmp6_code); + return 0; + } + + switch (msg->icmph.icmp6_type) { + case NDISC_NEIGHBOUR_SOLICITATION: + ndisc_recv_ns(skb); + break; + + case NDISC_NEIGHBOUR_ADVERTISEMENT: + ndisc_recv_na(skb); + break; + + case NDISC_ROUTER_SOLICITATION: + ndisc_recv_rs(skb); + break; + + case NDISC_ROUTER_ADVERTISEMENT: + ndisc_router_discovery(skb); + break; + + case NDISC_REDIRECT: + ndisc_redirect_rcv(skb); + break; + }; + + return 0; +} + +static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + switch (event) { + case NETDEV_CHANGEADDR: + neigh_changeaddr(&nd_tbl, dev); + fib6_run_gc(~0UL); + break; + case NETDEV_DOWN: + neigh_ifdown(&nd_tbl, dev); + fib6_run_gc(~0UL); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block ndisc_netdev_notifier = { + .notifier_call = ndisc_netdev_event, +}; + +#ifdef CONFIG_SYSCTL +static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, + const char *func, const char *dev_name) +{ + static char warncomm[TASK_COMM_LEN]; + static int warned; + if (strcmp(warncomm, current->comm) && warned < 5) { + strcpy(warncomm, current->comm); + printk(KERN_WARNING + "process `%s' is using deprecated sysctl (%s) " + "net.ipv6.neigh.%s.%s; " + "Use net.ipv6.neigh.%s.%s_ms " + "instead.\n", + warncomm, func, + dev_name, ctl->procname, + dev_name, ctl->procname); + warned++; + } +} + +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct net_device *dev = ctl->extra1; + struct inet6_dev *idev; + int ret; + + if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME) + ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? 
dev->name : "default"); + + switch (ctl->ctl_name) { + case NET_NEIGH_RETRANS_TIME: + ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + break; + case NET_NEIGH_REACHABLE_TIME: + ret = proc_dointvec_jiffies(ctl, write, + filp, buffer, lenp, ppos); + break; + case NET_NEIGH_RETRANS_TIME_MS: + case NET_NEIGH_REACHABLE_TIME_MS: + ret = proc_dointvec_ms_jiffies(ctl, write, + filp, buffer, lenp, ppos); + break; + default: + ret = -1; + } + + if (write && ret == 0 && dev && (idev = in6_dev_get(dev)) != NULL) { + if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS) + idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + in6_dev_put(idev); + } + return ret; +} + +static int ndisc_ifinfo_sysctl_strategy(ctl_table *ctl, int __user *name, + int nlen, void __user *oldval, + size_t __user *oldlenp, + void __user *newval, size_t newlen, + void **context) +{ + struct net_device *dev = ctl->extra1; + struct inet6_dev *idev; + int ret; + + if (ctl->ctl_name == NET_NEIGH_RETRANS_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME) + ndisc_warn_deprecated_sysctl(ctl, "procfs", dev ? dev->name : "default"); + + switch (ctl->ctl_name) { + case NET_NEIGH_REACHABLE_TIME: + ret = sysctl_jiffies(ctl, name, nlen, + oldval, oldlenp, newval, newlen, + context); + break; + case NET_NEIGH_RETRANS_TIME_MS: + case NET_NEIGH_REACHABLE_TIME_MS: + ret = sysctl_ms_jiffies(ctl, name, nlen, + oldval, oldlenp, newval, newlen, + context); + break; + default: + ret = 0; + } + + if (newval && newlen && ret > 0 && + dev && (idev = in6_dev_get(dev)) != NULL) { + if (ctl->ctl_name == NET_NEIGH_REACHABLE_TIME || + ctl->ctl_name == NET_NEIGH_REACHABLE_TIME_MS) + idev->nd_parms->reachable_time = neigh_rand_reach_time(idev->nd_parms->base_reachable_time); + idev->tstamp = jiffies; + inet6_ifinfo_notify(RTM_NEWLINK, idev); + in6_dev_put(idev); + } + + return ret; +} + +#endif + +int __init ndisc_init(struct net_proto_family *ops) +{ + struct ipv6_pinfo *np; + struct sock *sk; + int err; + + err = sock_create_kern(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &ndisc_socket); + if (err < 0) { + ND_PRINTK0(KERN_ERR + "ICMPv6 NDISC: Failed to initialize the control socket (err %d).\n", + err); + ndisc_socket = NULL; /* For safety. */ + return err; + } + + sk = ndisc_socket->sk; + np = inet6_sk(sk); + sk->sk_allocation = GFP_ATOMIC; + np->hop_limit = 255; + /* Do not loopback ndisc messages */ + np->mc_loop = 0; + sk->sk_prot->unhash(sk); + + /* + * Initialize the neighbour table + */ + + neigh_table_init(&nd_tbl); + +#ifdef CONFIG_SYSCTL + neigh_sysctl_register(NULL, &nd_tbl.parms, NET_IPV6, NET_IPV6_NEIGH, + "ipv6", + &ndisc_ifinfo_sysctl_change, + &ndisc_ifinfo_sysctl_strategy); +#endif + + register_netdevice_notifier(&ndisc_netdev_notifier); + return 0; +} + +void ndisc_cleanup(void) +{ +#ifdef CONFIG_SYSCTL + neigh_sysctl_unregister(&nd_tbl.parms); +#endif + neigh_table_clear(&nd_tbl); + sock_release(ndisc_socket); + ndisc_socket = NULL; /* For safety. 
*/ +} diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig new file mode 100644 index 000000000000..77ec704c9ee3 --- /dev/null +++ b/net/ipv6/netfilter/Kconfig @@ -0,0 +1,242 @@ +# +# IP netfilter configuration +# + +menu "IPv6: Netfilter Configuration (EXPERIMENTAL)" + depends on INET && IPV6 && NETFILTER && EXPERIMENTAL + +#tristate 'Connection tracking (required for masq/NAT)' CONFIG_IP6_NF_CONNTRACK +#if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then +# dep_tristate ' FTP protocol support' CONFIG_IP6_NF_FTP $CONFIG_IP6_NF_CONNTRACK +#fi +config IP6_NF_QUEUE + tristate "Userspace queueing via NETLINK" + ---help--- + + This option adds a queue handler to the kernel for IPv6 + packets which lets us to receive the filtered packets + with QUEUE target using libiptc as we can do with + the IPv4 now. + + (C) Fernando Anton 2001 + IPv64 Project - Work based in IPv64 draft by Arturo Azcorra. + Universidad Carlos III de Madrid + Universidad Politecnica de Alcala de Henares + email: . + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_IPTABLES + tristate "IP6 tables support (required for filtering/masq/NAT)" + help + ip6tables is a general, extensible packet identification framework. + Currently only the packet filtering and packet mangling subsystem + for IPv6 use this, but connection tracking is going to follow. + Say 'Y' or 'M' here if you want to use either of those. + + To compile it as a module, choose M here. If unsure, say N. + +# The simple matches. +config IP6_NF_MATCH_LIMIT + tristate "limit match support" + depends on IP6_NF_IPTABLES + help + limit matching allows you to control the rate at which a rule can be + matched: mainly useful in combination with the LOG target ("LOG + target support", below) and to avoid some Denial of Service attacks. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_MAC + tristate "MAC address match support" + depends on IP6_NF_IPTABLES + help + mac matching allows you to match packets based on the source + Ethernet address of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_RT + tristate "Routing header match support" + depends on IP6_NF_IPTABLES + help + rt matching allows you to match packets based on the routing + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_OPTS + tristate "Hop-by-hop and Dst opts header match support" + depends on IP6_NF_IPTABLES + help + This allows one to match packets based on the hop-by-hop + and destination options headers of a packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_FRAG + tristate "Fragmentation header match support" + depends on IP6_NF_IPTABLES + help + frag matching allows you to match packets based on the fragmentation + header of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_HL + tristate "HL match support" + depends on IP6_NF_IPTABLES + help + HL matching allows you to match packets based on the hop + limit of the packet. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_MULTIPORT + tristate "Multiple port match support" + depends on IP6_NF_IPTABLES + help + Multiport matching allows you to match TCP or UDP packets based on + a series of source or destination ports: normally a rule can only + match a single range of ports. + + To compile it as a module, choose M here. If unsure, say N. 
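Each match listed above is a small module driven by the same hook: ip6_tables.c walks a rule's matches and calls their match() callback with the packet, the in/out devices, the fragment and protocol-header offsets worked out by ip6_packet_match(), and a hotdrop flag (see do_match() later in this patch). As a rough sketch only, assuming the ip6t_register_match()/struct ip6t_match interface from the ip6_tables header (not part of this hunk) and an invented "minhl" match that mirrors what the HL option above describes, such an extension looks something like this:

#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/ipv6.h>
#include <linux/netfilter_ipv6/ip6_tables.h>

/* Hypothetical per-rule data copied in from userspace. */
struct ip6t_minhl_info {
	u_int8_t min_hop_limit;
};

/* Same callback shape that do_match() in ip6_tables.c invokes. */
static int
minhl_match(const struct sk_buff *skb,
	    const struct net_device *in,
	    const struct net_device *out,
	    const void *matchinfo,
	    int offset,
	    unsigned int protoff,
	    int *hotdrop)
{
	const struct ip6t_minhl_info *info = matchinfo;

	/* Match when the hop limit is at least the configured floor. */
	return skb->nh.ipv6h->hop_limit >= info->min_hop_limit;
}

static struct ip6t_match minhl_match_ops = {
	.name	= "minhl",
	.match	= &minhl_match,
	.me	= THIS_MODULE,
};

static int __init init(void)
{
	return ip6t_register_match(&minhl_match_ops);
}

static void __exit fini(void)
{
	ip6t_unregister_match(&minhl_match_ops);
}

module_init(init);
module_exit(fini);
MODULE_LICENSE("GPL");

A real extension would also provide a checkentry() hook to validate the size of the userspace-supplied info block before accepting the rule.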
+ +config IP6_NF_MATCH_OWNER + tristate "Owner match support" + depends on IP6_NF_IPTABLES + help + Packet owner matching allows you to match locally-generated packets + based on who created them: the user, group, process or session. + + To compile it as a module, choose M here. If unsure, say N. + +# dep_tristate ' MAC address match support' CONFIG_IP6_NF_MATCH_MAC $CONFIG_IP6_NF_IPTABLES +config IP6_NF_MATCH_MARK + tristate "netfilter MARK match support" + depends on IP6_NF_IPTABLES + help + Netfilter mark matching allows you to match packets based on the + `nfmark' value in the packet. This can be set by the MARK target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_IPV6HEADER + tristate "IPv6 Extension Headers Match" + depends on IP6_NF_IPTABLES + help + This module allows one to match packets based upon + the IPv6 extension headers. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_AHESP + tristate "AH/ESP match support" + depends on IP6_NF_IPTABLES + help + This module allows one to match AH and ESP packets. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_LENGTH + tristate "Packet Length match support" + depends on IP6_NF_IPTABLES + help + This option allows you to match the length of a packet against a + specific value or range of values. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_EUI64 + tristate "EUI64 address check" + depends on IP6_NF_IPTABLES + help + This module checks the IPv6 source address by comparing its + last 64 bits with the EUI-64 identifier derived from the + packet's source MAC address. + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_MATCH_PHYSDEV + tristate "Physdev match support" + depends on IP6_NF_IPTABLES && BRIDGE_NETFILTER + help + Physdev packet matching matches against the physical bridge ports + the IP packet arrived on or will leave by. + + To compile it as a module, choose M here. If unsure, say N. + +# dep_tristate ' Multiple port match support' CONFIG_IP6_NF_MATCH_MULTIPORT $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' TOS match support' CONFIG_IP6_NF_MATCH_TOS $CONFIG_IP6_NF_IPTABLES +# if [ "$CONFIG_IP6_NF_CONNTRACK" != "n" ]; then +# dep_tristate ' Connection state match support' CONFIG_IP6_NF_MATCH_STATE $CONFIG_IP6_NF_CONNTRACK $CONFIG_IP6_NF_IPTABLES +# fi +# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# dep_tristate ' Unclean match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_UNCLEAN $CONFIG_IP6_NF_IPTABLES +# dep_tristate ' Owner match support (EXPERIMENTAL)' CONFIG_IP6_NF_MATCH_OWNER $CONFIG_IP6_NF_IPTABLES +# fi +# The targets +config IP6_NF_FILTER + tristate "Packet filtering" + depends on IP6_NF_IPTABLES + help + Packet filtering defines a table `filter', which has a series of + rules for simple packet filtering at local input, forwarding and + local output. See the man page for iptables(8). + + To compile it as a module, choose M here. If unsure, say N. + +config IP6_NF_TARGET_LOG + tristate "LOG target support" + depends on IP6_NF_FILTER + help + This option adds a `LOG' target, which allows you to create rules in + any iptables table that record the packet header to the syslog. + + To compile it as a module, choose M here. If unsure, say N.
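The EUI64 check above compares the low 64 bits of the IPv6 source address with an EUI-64 style identifier built from the frame's source MAC address. As a standalone illustration of that derivation (the textbook RFC 2464 construction; the in-kernel match may differ in small details, and these helper names are invented for the example):

#include <stdint.h>
#include <string.h>

/*
 * Build the EUI-64 interface identifier for a 48-bit MAC address:
 * insert 0xFF 0xFE between the OUI and the NIC-specific half and flip
 * the universal/local bit of the first octet (RFC 2464).
 */
static void mac_to_eui64(const uint8_t mac[6], uint8_t eui64[8])
{
	eui64[0] = mac[0] ^ 0x02;	/* flip the universal/local bit */
	eui64[1] = mac[1];
	eui64[2] = mac[2];
	eui64[3] = 0xFF;
	eui64[4] = 0xFE;
	eui64[5] = mac[3];
	eui64[6] = mac[4];
	eui64[7] = mac[5];
}

/* The match then compares these 8 bytes with the last 64 bits
   (bytes 8..15) of the packet's IPv6 source address. */
static int src_addr_matches_mac(const uint8_t saddr[16], const uint8_t mac[6])
{
	uint8_t eui64[8];

	mac_to_eui64(mac, eui64);
	return memcmp(saddr + 8, eui64, 8) == 0;
}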
+ +# if [ "$CONFIG_IP6_NF_FILTER" != "n" ]; then +# dep_tristate ' REJECT target support' CONFIG_IP6_NF_TARGET_REJECT $CONFIG_IP6_NF_FILTER +# if [ "$CONFIG_EXPERIMENTAL" = "y" ]; then +# dep_tristate ' MIRROR target support (EXPERIMENTAL)' CONFIG_IP6_NF_TARGET_MIRROR $CONFIG_IP6_NF_FILTER +# fi +# fi +config IP6_NF_MANGLE + tristate "Packet mangling" + depends on IP6_NF_IPTABLES + help + This option adds a `mangle' table to iptables: see the man page for + iptables(8). This table is used for various packet alterations + which can affect how the packet is routed. + + To compile it as a module, choose M here. If unsure, say N. + +# dep_tristate ' TOS target support' CONFIG_IP6_NF_TARGET_TOS $CONFIG_IP_NF_MANGLE +config IP6_NF_TARGET_MARK + tristate "MARK target support" + depends on IP6_NF_MANGLE + help + This option adds a `MARK' target, which allows you to create rules + in the `mangle' table which alter the netfilter mark (nfmark) field + associated with the packet prior to routing. This can change + the routing method (see `Use netfilter MARK value as routing + key') and can also be used by other subsystems to change their + behavior. + + To compile it as a module, choose M here. If unsure, say N. + +#dep_tristate ' LOG target support' CONFIG_IP6_NF_TARGET_LOG $CONFIG_IP6_NF_IPTABLES +config IP6_NF_RAW + tristate 'raw table support (required for TRACE)' + depends on IP6_NF_IPTABLES + help + This option adds a `raw' table to ip6tables. This table is the very + first in the netfilter framework and hooks in at the PREROUTING + and OUTPUT chains. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + +endmenu + diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile new file mode 100644 index 000000000000..2e51714953b6 --- /dev/null +++ b/net/ipv6/netfilter/Makefile @@ -0,0 +1,26 @@ +# +# Makefile for the netfilter modules on top of IPv6. +# + +# Link order matters here. +obj-$(CONFIG_IP6_NF_IPTABLES) += ip6_tables.o +obj-$(CONFIG_IP6_NF_MATCH_LIMIT) += ip6t_limit.o +obj-$(CONFIG_IP6_NF_MATCH_MARK) += ip6t_mark.o +obj-$(CONFIG_IP6_NF_MATCH_LENGTH) += ip6t_length.o +obj-$(CONFIG_IP6_NF_MATCH_MAC) += ip6t_mac.o +obj-$(CONFIG_IP6_NF_MATCH_RT) += ip6t_rt.o +obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_hbh.o ip6t_dst.o +obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o +obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o +obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o +obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o +obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o +obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o +obj-$(CONFIG_IP6_NF_MATCH_PHYSDEV) += ip6t_physdev.o +obj-$(CONFIG_IP6_NF_FILTER) += ip6table_filter.o +obj-$(CONFIG_IP6_NF_MANGLE) += ip6table_mangle.o +obj-$(CONFIG_IP6_NF_TARGET_MARK) += ip6t_MARK.o +obj-$(CONFIG_IP6_NF_QUEUE) += ip6_queue.o +obj-$(CONFIG_IP6_NF_TARGET_LOG) += ip6t_LOG.o +obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o +obj-$(CONFIG_IP6_NF_MATCH_HL) += ip6t_hl.o diff --git a/net/ipv6/netfilter/ip6_queue.c b/net/ipv6/netfilter/ip6_queue.c new file mode 100644 index 000000000000..c54830b89593 --- /dev/null +++ b/net/ipv6/netfilter/ip6_queue.c @@ -0,0 +1,741 @@ +/* + * This is a module which is used for queueing IPv6 packets and + * communicating with userspace via netlink. + * + * (C) 2001 Fernando Anton, this code is GPL. + * IPv64 Project - Work based in IPv64 draft by Arturo Azcorra.
+ * Universidad Carlos III de Madrid - Leganes (Madrid) - Spain + * Universidad Politecnica de Alcala de Henares - Alcala de H. (Madrid) - Spain + * email: fanton@it.uc3m.es + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 2001-11-06: First try. Working with ip_queue.c for IPv4 and trying + * to adapt it to IPv6 + * HEAVILY based in ipqueue.c by James Morris. It's just + * a little modified version of it, so he's nearly the + * real coder of this. + * Few changes needed, mainly the hard_routing code and + * the netlink socket protocol (we're NETLINK_IP6_FW). + * 2002-06-25: Code cleanup. [JM: ported cleanup over from ip_queue.c] + * 2005-02-04: Added /proc counter for dropped packets; fixed so + * packets aren't delivered to user space if they're going + * to be dropped. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define IPQ_QMAX_DEFAULT 1024 +#define IPQ_PROC_FS_NAME "ip6_queue" +#define NET_IPQ_QMAX 2088 +#define NET_IPQ_QMAX_NAME "ip6_queue_maxlen" + +struct ipq_rt_info { + struct in6_addr daddr; + struct in6_addr saddr; +}; + +struct ipq_queue_entry { + struct list_head list; + struct nf_info *info; + struct sk_buff *skb; + struct ipq_rt_info rt_info; +}; + +typedef int (*ipq_cmpfn)(struct ipq_queue_entry *, unsigned long); + +static unsigned char copy_mode = IPQ_COPY_NONE; +static unsigned int queue_maxlen = IPQ_QMAX_DEFAULT; +static DEFINE_RWLOCK(queue_lock); +static int peer_pid; +static unsigned int copy_range; +static unsigned int queue_total; +static unsigned int queue_dropped = 0; +static unsigned int queue_user_dropped = 0; +static struct sock *ipqnl; +static LIST_HEAD(queue_list); +static DECLARE_MUTEX(ipqnl_sem); + +static void +ipq_issue_verdict(struct ipq_queue_entry *entry, int verdict) +{ + nf_reinject(entry->skb, entry->info, verdict); + kfree(entry); +} + +static inline void +__ipq_enqueue_entry(struct ipq_queue_entry *entry) +{ + list_add(&entry->list, &queue_list); + queue_total++; +} + +/* + * Find and return a queued entry matched by cmpfn, or return the last + * entry if cmpfn is NULL. 
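 * Callers hold queue_lock.  cmpfn receives the opaque data argument,
 * e.g. a packet id in id_cmp() or an interface index in dev_cmp()
 * further down.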
+ */ +static inline struct ipq_queue_entry * +__ipq_find_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct list_head *p; + + list_for_each_prev(p, &queue_list) { + struct ipq_queue_entry *entry = (struct ipq_queue_entry *)p; + + if (!cmpfn || cmpfn(entry, data)) + return entry; + } + return NULL; +} + +static inline void +__ipq_dequeue_entry(struct ipq_queue_entry *entry) +{ + list_del(&entry->list); + queue_total--; +} + +static inline struct ipq_queue_entry * +__ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + entry = __ipq_find_entry(cmpfn, data); + if (entry == NULL) + return NULL; + + __ipq_dequeue_entry(entry); + return entry; +} + + +static inline void +__ipq_flush(int verdict) +{ + struct ipq_queue_entry *entry; + + while ((entry = __ipq_find_dequeue_entry(NULL, 0))) + ipq_issue_verdict(entry, verdict); +} + +static inline int +__ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status = 0; + + switch(mode) { + case IPQ_COPY_NONE: + case IPQ_COPY_META: + copy_mode = mode; + copy_range = 0; + break; + + case IPQ_COPY_PACKET: + copy_mode = mode; + copy_range = range; + if (copy_range > 0xFFFF) + copy_range = 0xFFFF; + break; + + default: + status = -EINVAL; + + } + return status; +} + +static inline void +__ipq_reset(void) +{ + peer_pid = 0; + net_disable_timestamp(); + __ipq_set_mode(IPQ_COPY_NONE, 0); + __ipq_flush(NF_DROP); +} + +static struct ipq_queue_entry * +ipq_find_dequeue_entry(ipq_cmpfn cmpfn, unsigned long data) +{ + struct ipq_queue_entry *entry; + + write_lock_bh(&queue_lock); + entry = __ipq_find_dequeue_entry(cmpfn, data); + write_unlock_bh(&queue_lock); + return entry; +} + +static void +ipq_flush(int verdict) +{ + write_lock_bh(&queue_lock); + __ipq_flush(verdict); + write_unlock_bh(&queue_lock); +} + +static struct sk_buff * +ipq_build_packet_message(struct ipq_queue_entry *entry, int *errp) +{ + unsigned char *old_tail; + size_t size = 0; + size_t data_len = 0; + struct sk_buff *skb; + struct ipq_packet_msg *pmsg; + struct nlmsghdr *nlh; + + read_lock_bh(&queue_lock); + + switch (copy_mode) { + case IPQ_COPY_META: + case IPQ_COPY_NONE: + size = NLMSG_SPACE(sizeof(*pmsg)); + data_len = 0; + break; + + case IPQ_COPY_PACKET: + if (copy_range == 0 || copy_range > entry->skb->len) + data_len = entry->skb->len; + else + data_len = copy_range; + + size = NLMSG_SPACE(sizeof(*pmsg) + data_len); + break; + + default: + *errp = -EINVAL; + read_unlock_bh(&queue_lock); + return NULL; + } + + read_unlock_bh(&queue_lock); + + skb = alloc_skb(size, GFP_ATOMIC); + if (!skb) + goto nlmsg_failure; + + old_tail= skb->tail; + nlh = NLMSG_PUT(skb, 0, 0, IPQM_PACKET, size - sizeof(*nlh)); + pmsg = NLMSG_DATA(nlh); + memset(pmsg, 0, sizeof(*pmsg)); + + pmsg->packet_id = (unsigned long )entry; + pmsg->data_len = data_len; + pmsg->timestamp_sec = entry->skb->stamp.tv_sec; + pmsg->timestamp_usec = entry->skb->stamp.tv_usec; + pmsg->mark = entry->skb->nfmark; + pmsg->hook = entry->info->hook; + pmsg->hw_protocol = entry->skb->protocol; + + if (entry->info->indev) + strcpy(pmsg->indev_name, entry->info->indev->name); + else + pmsg->indev_name[0] = '\0'; + + if (entry->info->outdev) + strcpy(pmsg->outdev_name, entry->info->outdev->name); + else + pmsg->outdev_name[0] = '\0'; + + if (entry->info->indev && entry->skb->dev) { + pmsg->hw_type = entry->skb->dev->type; + if (entry->skb->dev->hard_header_parse) + pmsg->hw_addrlen = + entry->skb->dev->hard_header_parse(entry->skb, + pmsg->hw_addr); + } + + if (data_len) + if 
(skb_copy_bits(entry->skb, 0, pmsg->payload, data_len)) + BUG(); + + nlh->nlmsg_len = skb->tail - old_tail; + return skb; + +nlmsg_failure: + if (skb) + kfree_skb(skb); + *errp = -EINVAL; + printk(KERN_ERR "ip6_queue: error creating packet message\n"); + return NULL; +} + +static int +ipq_enqueue_packet(struct sk_buff *skb, struct nf_info *info, void *data) +{ + int status = -EINVAL; + struct sk_buff *nskb; + struct ipq_queue_entry *entry; + + if (copy_mode == IPQ_COPY_NONE) + return -EAGAIN; + + entry = kmalloc(sizeof(*entry), GFP_ATOMIC); + if (entry == NULL) { + printk(KERN_ERR "ip6_queue: OOM in ipq_enqueue_packet()\n"); + return -ENOMEM; + } + + entry->info = info; + entry->skb = skb; + + if (entry->info->hook == NF_IP_LOCAL_OUT) { + struct ipv6hdr *iph = skb->nh.ipv6h; + + entry->rt_info.daddr = iph->daddr; + entry->rt_info.saddr = iph->saddr; + } + + nskb = ipq_build_packet_message(entry, &status); + if (nskb == NULL) + goto err_out_free; + + write_lock_bh(&queue_lock); + + if (!peer_pid) + goto err_out_free_nskb; + + if (queue_total >= queue_maxlen) { + queue_dropped++; + status = -ENOSPC; + if (net_ratelimit()) + printk (KERN_WARNING "ip6_queue: fill at %d entries, " + "dropping packet(s). Dropped: %d\n", queue_total, + queue_dropped); + goto err_out_free_nskb; + } + + /* netlink_unicast will either free the nskb or attach it to a socket */ + status = netlink_unicast(ipqnl, nskb, peer_pid, MSG_DONTWAIT); + if (status < 0) { + queue_user_dropped++; + goto err_out_unlock; + } + + __ipq_enqueue_entry(entry); + + write_unlock_bh(&queue_lock); + return status; + +err_out_free_nskb: + kfree_skb(nskb); + +err_out_unlock: + write_unlock_bh(&queue_lock); + +err_out_free: + kfree(entry); + return status; +} + +static int +ipq_mangle_ipv6(ipq_verdict_msg_t *v, struct ipq_queue_entry *e) +{ + int diff; + struct ipv6hdr *user_iph = (struct ipv6hdr *)v->payload; + + if (v->data_len < sizeof(*user_iph)) + return 0; + diff = v->data_len - e->skb->len; + if (diff < 0) + skb_trim(e->skb, v->data_len); + else if (diff > 0) { + if (v->data_len > 0xFFFF) + return -EINVAL; + if (diff > skb_tailroom(e->skb)) { + struct sk_buff *newskb; + + newskb = skb_copy_expand(e->skb, + skb_headroom(e->skb), + diff, + GFP_ATOMIC); + if (newskb == NULL) { + printk(KERN_WARNING "ip6_queue: OOM " + "in mangle, dropping packet\n"); + return -ENOMEM; + } + if (e->skb->sk) + skb_set_owner_w(newskb, e->skb->sk); + kfree_skb(e->skb); + e->skb = newskb; + } + skb_put(e->skb, diff); + } + if (!skb_ip_make_writable(&e->skb, v->data_len)) + return -ENOMEM; + memcpy(e->skb->data, v->payload, v->data_len); + e->skb->nfcache |= NFC_ALTERED; + + /* + * Extra routing may needed on local out, as the QUEUE target never + * returns control to the table. 
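 * If the verdict payload rewrote the source or destination address, the
 * packet has to be re-routed before it is reinjected; comparing against
 * the addresses saved at enqueue time detects that.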
+ * Not a nice way to cmp, but works + */ + if (e->info->hook == NF_IP_LOCAL_OUT) { + struct ipv6hdr *iph = e->skb->nh.ipv6h; + if (!ipv6_addr_equal(&iph->daddr, &e->rt_info.daddr) || + !ipv6_addr_equal(&iph->saddr, &e->rt_info.saddr)) + return ip6_route_me_harder(e->skb); + } + return 0; +} + +static inline int +id_cmp(struct ipq_queue_entry *e, unsigned long id) +{ + return (id == (unsigned long )e); +} + +static int +ipq_set_verdict(struct ipq_verdict_msg *vmsg, unsigned int len) +{ + struct ipq_queue_entry *entry; + + if (vmsg->value > NF_MAX_VERDICT) + return -EINVAL; + + entry = ipq_find_dequeue_entry(id_cmp, vmsg->id); + if (entry == NULL) + return -ENOENT; + else { + int verdict = vmsg->value; + + if (vmsg->data_len && vmsg->data_len == len) + if (ipq_mangle_ipv6(vmsg, entry) < 0) + verdict = NF_DROP; + + ipq_issue_verdict(entry, verdict); + return 0; + } +} + +static int +ipq_set_mode(unsigned char mode, unsigned int range) +{ + int status; + + write_lock_bh(&queue_lock); + status = __ipq_set_mode(mode, range); + write_unlock_bh(&queue_lock); + return status; +} + +static int +ipq_receive_peer(struct ipq_peer_msg *pmsg, + unsigned char type, unsigned int len) +{ + int status = 0; + + if (len < sizeof(*pmsg)) + return -EINVAL; + + switch (type) { + case IPQM_MODE: + status = ipq_set_mode(pmsg->msg.mode.value, + pmsg->msg.mode.range); + break; + + case IPQM_VERDICT: + if (pmsg->msg.verdict.value > NF_MAX_VERDICT) + status = -EINVAL; + else + status = ipq_set_verdict(&pmsg->msg.verdict, + len - sizeof(*pmsg)); + break; + default: + status = -EINVAL; + } + return status; +} + +static int +dev_cmp(struct ipq_queue_entry *entry, unsigned long ifindex) +{ + if (entry->info->indev) + if (entry->info->indev->ifindex == ifindex) + return 1; + + if (entry->info->outdev) + if (entry->info->outdev->ifindex == ifindex) + return 1; + + return 0; +} + +static void +ipq_dev_drop(int ifindex) +{ + struct ipq_queue_entry *entry; + + while ((entry = ipq_find_dequeue_entry(dev_cmp, ifindex)) != NULL) + ipq_issue_verdict(entry, NF_DROP); +} + +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) + +static inline void +ipq_rcv_skb(struct sk_buff *skb) +{ + int status, type, pid, flags, nlmsglen, skblen; + struct nlmsghdr *nlh; + + skblen = skb->len; + if (skblen < sizeof(*nlh)) + return; + + nlh = (struct nlmsghdr *)skb->data; + nlmsglen = nlh->nlmsg_len; + if (nlmsglen < sizeof(*nlh) || skblen < nlmsglen) + return; + + pid = nlh->nlmsg_pid; + flags = nlh->nlmsg_flags; + + if(pid <= 0 || !(flags & NLM_F_REQUEST) || flags & NLM_F_MULTI) + RCV_SKB_FAIL(-EINVAL); + + if (flags & MSG_TRUNC) + RCV_SKB_FAIL(-ECOMM); + + type = nlh->nlmsg_type; + if (type < NLMSG_NOOP || type >= IPQM_MAX) + RCV_SKB_FAIL(-EINVAL); + + if (type <= IPQM_BASE) + return; + + if (security_netlink_recv(skb)) + RCV_SKB_FAIL(-EPERM); + + write_lock_bh(&queue_lock); + + if (peer_pid) { + if (peer_pid != pid) { + write_unlock_bh(&queue_lock); + RCV_SKB_FAIL(-EBUSY); + } + } else { + net_enable_timestamp(); + peer_pid = pid; + } + + write_unlock_bh(&queue_lock); + + status = ipq_receive_peer(NLMSG_DATA(nlh), type, + skblen - NLMSG_LENGTH(0)); + if (status < 0) + RCV_SKB_FAIL(status); + + if (flags & NLM_F_ACK) + netlink_ack(skb, nlh, 0); + return; +} + +static void +ipq_rcv_sk(struct sock *sk, int len) +{ + do { + struct sk_buff *skb; + + if (down_trylock(&ipqnl_sem)) + return; + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + ipq_rcv_skb(skb); + kfree_skb(skb); + } + + up(&ipqnl_sem); + + } 
while (ipqnl && ipqnl->sk_receive_queue.qlen); +} + +static int +ipq_rcv_dev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + + /* Drop any packets associated with the downed device */ + if (event == NETDEV_DOWN) + ipq_dev_drop(dev->ifindex); + return NOTIFY_DONE; +} + +static struct notifier_block ipq_dev_notifier = { + .notifier_call = ipq_rcv_dev_event, +}; + +static int +ipq_rcv_nl_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct netlink_notify *n = ptr; + + if (event == NETLINK_URELEASE && + n->protocol == NETLINK_IP6_FW && n->pid) { + write_lock_bh(&queue_lock); + if (n->pid == peer_pid) + __ipq_reset(); + write_unlock_bh(&queue_lock); + } + return NOTIFY_DONE; +} + +static struct notifier_block ipq_nl_notifier = { + .notifier_call = ipq_rcv_nl_event, +}; + +static struct ctl_table_header *ipq_sysctl_header; + +static ctl_table ipq_table[] = { + { + .ctl_name = NET_IPQ_QMAX, + .procname = NET_IPQ_QMAX_NAME, + .data = &queue_maxlen, + .maxlen = sizeof(queue_maxlen), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_dir_table[] = { + { + .ctl_name = NET_IPV6, + .procname = "ipv6", + .mode = 0555, + .child = ipq_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipq_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipq_dir_table + }, + { .ctl_name = 0 } +}; + +static int +ipq_get_info(char *buffer, char **start, off_t offset, int length) +{ + int len; + + read_lock_bh(&queue_lock); + + len = sprintf(buffer, + "Peer PID : %d\n" + "Copy mode : %hu\n" + "Copy range : %u\n" + "Queue length : %u\n" + "Queue max. length : %u\n" + "Queue dropped : %u\n" + "Netfilter dropped : %u\n", + peer_pid, + copy_mode, + copy_range, + queue_total, + queue_maxlen, + queue_dropped, + queue_user_dropped); + + read_unlock_bh(&queue_lock); + + *start = buffer + offset; + len -= offset; + if (len > length) + len = length; + else if (len < 0) + len = 0; + return len; +} + +static int +init_or_cleanup(int init) +{ + int status = -ENOMEM; + struct proc_dir_entry *proc; + + if (!init) + goto cleanup; + + netlink_register_notifier(&ipq_nl_notifier); + ipqnl = netlink_kernel_create(NETLINK_IP6_FW, ipq_rcv_sk); + if (ipqnl == NULL) { + printk(KERN_ERR "ip6_queue: failed to create netlink socket\n"); + goto cleanup_netlink_notifier; + } + + proc = proc_net_create(IPQ_PROC_FS_NAME, 0, ipq_get_info); + if (proc) + proc->owner = THIS_MODULE; + else { + printk(KERN_ERR "ip6_queue: failed to create proc entry\n"); + goto cleanup_ipqnl; + } + + register_netdevice_notifier(&ipq_dev_notifier); + ipq_sysctl_header = register_sysctl_table(ipq_root_table, 0); + + status = nf_register_queue_handler(PF_INET6, ipq_enqueue_packet, NULL); + if (status < 0) { + printk(KERN_ERR "ip6_queue: failed to register queue handler\n"); + goto cleanup_sysctl; + } + return status; + +cleanup: + nf_unregister_queue_handler(PF_INET6); + synchronize_net(); + ipq_flush(NF_DROP); + +cleanup_sysctl: + unregister_sysctl_table(ipq_sysctl_header); + unregister_netdevice_notifier(&ipq_dev_notifier); + proc_net_remove(IPQ_PROC_FS_NAME); + +cleanup_ipqnl: + sock_release(ipqnl->sk_socket); + down(&ipqnl_sem); + up(&ipqnl_sem); + +cleanup_netlink_notifier: + netlink_unregister_notifier(&ipq_nl_notifier); + return status; +} + +static int __init init(void) +{ + + return init_or_cleanup(1); +} + +static void __exit fini(void) +{ + init_or_cleanup(0); +} + +MODULE_DESCRIPTION("IPv6 
packet queue handler"); +MODULE_LICENSE("GPL"); + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c new file mode 100644 index 000000000000..c735276fdd5f --- /dev/null +++ b/net/ipv6/netfilter/ip6_tables.c @@ -0,0 +1,1970 @@ +/* + * Packet matching code. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2002 Netfilter core team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * 19 Jan 2002 Harald Welte + * - increase module usage count as soon as we have rules inside + * a table + * 06 Jun 2002 Andras Kis-Szabo + * - new extension header parser code + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("IPv6 packet filter"); + +#define IPV6_HDR_LEN (sizeof(struct ipv6hdr)) +#define IPV6_OPTHDR_LEN (sizeof(struct ipv6_opt_hdr)) + +/*#define DEBUG_IP_FIREWALL*/ +/*#define DEBUG_ALLOW_ALL*/ /* Useful for remote debugging */ +/*#define DEBUG_IP_FIREWALL_USER*/ + +#ifdef DEBUG_IP_FIREWALL +#define dprintf(format, args...) printk(format , ## args) +#else +#define dprintf(format, args...) +#endif + +#ifdef DEBUG_IP_FIREWALL_USER +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +#ifdef CONFIG_NETFILTER_DEBUG +#define IP_NF_ASSERT(x) \ +do { \ + if (!(x)) \ + printk("IP_NF_ASSERT: %s:%s:%u\n", \ + __FUNCTION__, __FILE__, __LINE__); \ +} while(0) +#else +#define IP_NF_ASSERT(x) +#endif +#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1)) + +static DECLARE_MUTEX(ip6t_mutex); + +/* Must have mutex */ +#define ASSERT_READ_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) +#define ASSERT_WRITE_LOCK(x) IP_NF_ASSERT(down_trylock(&ip6t_mutex) != 0) +#include +#include + +#if 0 +/* All the better to debug you with... */ +#define static +#define inline +#endif + +/* Locking is simple: we assume at worst case there will be one packet + in user context and one from bottom halves (or soft irq if Alexey's + softnet patch was applied). + + We keep a set of rules for each CPU, so we can avoid write-locking + them; doing a readlock_bh() stops packets coming through if we're + in user context. + + To be cache friendly on SMP, we arrange them like so: + [ n-entries ] + ... cache-align padding ... + [ n-entries ] + + Hence the start of any table is given by get_table() below. */ + +/* The table itself */ +struct ip6t_table_info +{ + /* Size per table */ + unsigned int size; + /* Number of entries: FIXME. --RR */ + unsigned int number; + /* Initial number of entries. 
Needed for module usage count */ + unsigned int initial_entries; + + /* Entry points and underflows */ + unsigned int hook_entry[NF_IP6_NUMHOOKS]; + unsigned int underflow[NF_IP6_NUMHOOKS]; + + /* ip6t_entry tables: one per CPU */ + char entries[0] ____cacheline_aligned; +}; + +static LIST_HEAD(ip6t_target); +static LIST_HEAD(ip6t_match); +static LIST_HEAD(ip6t_tables); +#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) + +#ifdef CONFIG_SMP +#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) +#else +#define TABLE_OFFSET(t,p) 0 +#endif + +#if 0 +#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) +#define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) +#define up(x) do { printk("UP:%u:" #x "\n", __LINE__); up(x); } while(0) +#endif + +static int ip6_masked_addrcmp(struct in6_addr addr1, struct in6_addr mask, + struct in6_addr addr2) +{ + int i; + for( i = 0; i < 16; i++){ + if((addr1.s6_addr[i] & mask.s6_addr[i]) != + (addr2.s6_addr[i] & mask.s6_addr[i])) + return 1; + } + return 0; +} + +/* Check for an extension */ +int +ip6t_ext_hdr(u8 nexthdr) +{ + return ( (nexthdr == IPPROTO_HOPOPTS) || + (nexthdr == IPPROTO_ROUTING) || + (nexthdr == IPPROTO_FRAGMENT) || + (nexthdr == IPPROTO_ESP) || + (nexthdr == IPPROTO_AH) || + (nexthdr == IPPROTO_NONE) || + (nexthdr == IPPROTO_DSTOPTS) ); +} + +/* Returns whether matches rule or not. */ +static inline int +ip6_packet_match(const struct sk_buff *skb, + const char *indev, + const char *outdev, + const struct ip6t_ip6 *ip6info, + unsigned int *protoff, + int *fragoff) +{ + size_t i; + unsigned long ret; + const struct ipv6hdr *ipv6 = skb->nh.ipv6h; + +#define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg)) + + if (FWINV(ip6_masked_addrcmp(ipv6->saddr,ip6info->smsk,ip6info->src), + IP6T_INV_SRCIP) + || FWINV(ip6_masked_addrcmp(ipv6->daddr,ip6info->dmsk,ip6info->dst), + IP6T_INV_DSTIP)) { + dprintf("Source or dest mismatch.\n"); +/* + dprintf("SRC: %u. Mask: %u. Target: %u.%s\n", ip->saddr, + ipinfo->smsk.s_addr, ipinfo->src.s_addr, + ipinfo->invflags & IP6T_INV_SRCIP ? " (INV)" : ""); + dprintf("DST: %u. Mask: %u. Target: %u.%s\n", ip->daddr, + ipinfo->dmsk.s_addr, ipinfo->dst.s_addr, + ipinfo->invflags & IP6T_INV_DSTIP ? " (INV)" : "");*/ + return 0; + } + + /* Look for ifname matches; this should unroll nicely. */ + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)indev)[i] + ^ ((const unsigned long *)ip6info->iniface)[i]) + & ((const unsigned long *)ip6info->iniface_mask)[i]; + } + + if (FWINV(ret != 0, IP6T_INV_VIA_IN)) { + dprintf("VIA in mismatch (%s vs %s).%s\n", + indev, ip6info->iniface, + ip6info->invflags&IP6T_INV_VIA_IN ?" (INV)":""); + return 0; + } + + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned long); i++) { + ret |= (((const unsigned long *)outdev)[i] + ^ ((const unsigned long *)ip6info->outiface)[i]) + & ((const unsigned long *)ip6info->outiface_mask)[i]; + } + + if (FWINV(ret != 0, IP6T_INV_VIA_OUT)) { + dprintf("VIA out mismatch (%s vs %s).%s\n", + outdev, ip6info->outiface, + ip6info->invflags&IP6T_INV_VIA_OUT ?" (INV)":""); + return 0; + } + +/* ... might want to do something with class and flowlabel here ... 
*/ + + /* look for the desired protocol header */ + if((ip6info->flags & IP6T_F_PROTO)) { + u_int8_t currenthdr = ipv6->nexthdr; + struct ipv6_opt_hdr _hdr, *hp; + u_int16_t ptr; /* Header offset in skb */ + u_int16_t hdrlen; /* Header */ + u_int16_t _fragoff = 0, *fp = NULL; + + ptr = IPV6_HDR_LEN; + + while (ip6t_ext_hdr(currenthdr)) { + /* Is there enough space for the next ext header? */ + if (skb->len - ptr < IPV6_OPTHDR_LEN) + return 0; + + /* NONE or ESP: there isn't protocol part */ + /* If we want to count these packets in '-p all', + * we will change the return 0 to 1*/ + if ((currenthdr == IPPROTO_NONE) || + (currenthdr == IPPROTO_ESP)) + break; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Size calculation */ + if (currenthdr == IPPROTO_FRAGMENT) { + fp = skb_header_pointer(skb, + ptr+offsetof(struct frag_hdr, + frag_off), + sizeof(_fragoff), + &_fragoff); + if (fp == NULL) + return 0; + + _fragoff = ntohs(*fp) & ~0x7; + hdrlen = 8; + } else if (currenthdr == IPPROTO_AH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + /* ptr is too large */ + if ( ptr > skb->len ) + return 0; + if (_fragoff) { + if (ip6t_ext_hdr(currenthdr)) + return 0; + break; + } + } + + *protoff = ptr; + *fragoff = _fragoff; + + /* currenthdr contains the protocol header */ + + dprintf("Packet protocol %hi ?= %s%hi.\n", + currenthdr, + ip6info->invflags & IP6T_INV_PROTO ? "!":"", + ip6info->proto); + + if (ip6info->proto == currenthdr) { + if(ip6info->invflags & IP6T_INV_PROTO) { + return 0; + } + return 1; + } + + /* We need match for the '-p all', too! */ + if ((ip6info->proto != 0) && + !(ip6info->invflags & IP6T_INV_PROTO)) + return 0; + } + return 1; +} + +/* should be ip6 safe */ +static inline int +ip6_checkentry(const struct ip6t_ip6 *ipv6) +{ + if (ipv6->flags & ~IP6T_F_MASK) { + duprintf("Unknown flag bits set: %08X\n", + ipv6->flags & ~IP6T_F_MASK); + return 0; + } + if (ipv6->invflags & ~IP6T_INV_MASK) { + duprintf("Unknown invflag bits set: %08X\n", + ipv6->invflags & ~IP6T_INV_MASK); + return 0; + } + return 1; +} + +static unsigned int +ip6t_error(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + if (net_ratelimit()) + printk("ip6_tables: error: `%s'\n", (char *)targinfo); + + return NF_DROP; +} + +static inline +int do_match(struct ip6t_entry_match *m, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int offset, + unsigned int protoff, + int *hotdrop) +{ + /* Stop iteration if it doesn't match */ + if (!m->u.kernel.match->match(skb, in, out, m->data, + offset, protoff, hotdrop)) + return 1; + else + return 0; +} + +static inline struct ip6t_entry * +get_entry(void *base, unsigned int offset) +{ + return (struct ip6t_entry *)(base + offset); +} + +/* Returns one of the generic firewall policies, like NF_ACCEPT. */ +unsigned int +ip6t_do_table(struct sk_buff **pskb, + unsigned int hook, + const struct net_device *in, + const struct net_device *out, + struct ip6t_table *table, + void *userdata) +{ + static const char nulldevname[IFNAMSIZ]; + int offset = 0; + unsigned int protoff = 0; + int hotdrop = 0; + /* Initializing verdict to NF_DROP keeps gcc happy. */ + unsigned int verdict = NF_DROP; + const char *indev, *outdev; + void *table_base; + struct ip6t_entry *e, *back; + + /* Initialization */ + indev = in ? 
in->name : nulldevname; + outdev = out ? out->name : nulldevname; + + /* We handle fragments by dealing with the first fragment as + * if it was a normal packet. All other fragments are treated + * normally, except that they will NEVER match rules that ask + * things we don't know, ie. tcp syn flag or ports). If the + * rule is also a fragment-specific rule, non-fragments won't + * match it. */ + + read_lock_bh(&table->lock); + IP_NF_ASSERT(table->valid_hooks & (1 << hook)); + table_base = (void *)table->private->entries + + TABLE_OFFSET(table->private, smp_processor_id()); + e = get_entry(table_base, table->private->hook_entry[hook]); + +#ifdef CONFIG_NETFILTER_DEBUG + /* Check noone else using our table */ + if (((struct ip6t_entry *)table_base)->comefrom != 0xdead57ac + && ((struct ip6t_entry *)table_base)->comefrom != 0xeeeeeeec) { + printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n", + smp_processor_id(), + table->name, + &((struct ip6t_entry *)table_base)->comefrom, + ((struct ip6t_entry *)table_base)->comefrom); + } + ((struct ip6t_entry *)table_base)->comefrom = 0x57acc001; +#endif + + /* For return from builtin chain */ + back = get_entry(table_base, table->private->underflow[hook]); + + do { + IP_NF_ASSERT(e); + IP_NF_ASSERT(back); + (*pskb)->nfcache |= e->nfcache; + if (ip6_packet_match(*pskb, indev, outdev, &e->ipv6, + &protoff, &offset)) { + struct ip6t_entry_target *t; + + if (IP6T_MATCH_ITERATE(e, do_match, + *pskb, in, out, + offset, protoff, &hotdrop) != 0) + goto no_match; + + ADD_COUNTER(e->counters, + ntohs((*pskb)->nh.ipv6h->payload_len) + + IPV6_HDR_LEN, + 1); + + t = ip6t_get_target(e); + IP_NF_ASSERT(t->u.kernel.target); + /* Standard target? */ + if (!t->u.kernel.target->target) { + int v; + + v = ((struct ip6t_standard_target *)t)->verdict; + if (v < 0) { + /* Pop from stack? */ + if (v != IP6T_RETURN) { + verdict = (unsigned)(-v) - 1; + break; + } + e = back; + back = get_entry(table_base, + back->comefrom); + continue; + } + if (table_base + v + != (void *)e + e->next_offset) { + /* Save old back ptr in next entry */ + struct ip6t_entry *next + = (void *)e + e->next_offset; + next->comefrom + = (void *)back - table_base; + /* set back pointer to next entry */ + back = next; + } + + e = get_entry(table_base, v); + } else { + /* Targets which reenter must return + abs. 
verdicts */ +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ip6t_entry *)table_base)->comefrom + = 0xeeeeeeec; +#endif + verdict = t->u.kernel.target->target(pskb, + in, out, + hook, + t->data, + userdata); + +#ifdef CONFIG_NETFILTER_DEBUG + if (((struct ip6t_entry *)table_base)->comefrom + != 0xeeeeeeec + && verdict == IP6T_CONTINUE) { + printk("Target %s reentered!\n", + t->u.kernel.target->name); + verdict = NF_DROP; + } + ((struct ip6t_entry *)table_base)->comefrom + = 0x57acc001; +#endif + if (verdict == IP6T_CONTINUE) + e = (void *)e + e->next_offset; + else + /* Verdict */ + break; + } + } else { + + no_match: + e = (void *)e + e->next_offset; + } + } while (!hotdrop); + +#ifdef CONFIG_NETFILTER_DEBUG + ((struct ip6t_entry *)table_base)->comefrom = 0xdead57ac; +#endif + read_unlock_bh(&table->lock); + +#ifdef DEBUG_ALLOW_ALL + return NF_ACCEPT; +#else + if (hotdrop) + return NF_DROP; + else return verdict; +#endif +} + +/* If it succeeds, returns element and locks mutex */ +static inline void * +find_inlist_lock_noload(struct list_head *head, + const char *name, + int *error, + struct semaphore *mutex) +{ + void *ret; + +#if 1 + duprintf("find_inlist: searching for `%s' in %s.\n", + name, head == &ip6t_target ? "ip6t_target" + : head == &ip6t_match ? "ip6t_match" + : head == &ip6t_tables ? "ip6t_tables" : "UNKNOWN"); +#endif + + *error = down_interruptible(mutex); + if (*error != 0) + return NULL; + + ret = list_named_find(head, name); + if (!ret) { + *error = -ENOENT; + up(mutex); + } + return ret; +} + +#ifndef CONFIG_KMOD +#define find_inlist_lock(h,n,p,e,m) find_inlist_lock_noload((h),(n),(e),(m)) +#else +static void * +find_inlist_lock(struct list_head *head, + const char *name, + const char *prefix, + int *error, + struct semaphore *mutex) +{ + void *ret; + + ret = find_inlist_lock_noload(head, name, error, mutex); + if (!ret) { + duprintf("find_inlist: loading `%s%s'.\n", prefix, name); + request_module("%s%s", prefix, name); + ret = find_inlist_lock_noload(head, name, error, mutex); + } + + return ret; +} +#endif + +static inline struct ip6t_table * +ip6t_find_table_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_tables, name, "ip6table_", error, mutex); +} + +static inline struct ip6t_match * +find_match_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_match, name, "ip6t_", error, mutex); +} + +static struct ip6t_target * +ip6t_find_target_lock(const char *name, int *error, struct semaphore *mutex) +{ + return find_inlist_lock(&ip6t_target, name, "ip6t_", error, mutex); +} + +/* All zeroes == unconditional rule. */ +static inline int +unconditional(const struct ip6t_ip6 *ipv6) +{ + unsigned int i; + + for (i = 0; i < sizeof(*ipv6); i++) + if (((char *)ipv6)[i]) + break; + + return (i == sizeof(*ipv6)); +} + +/* Figures out from what hook each rule can be called: returns 0 if + there are loops. Puts hook bitmask in comefrom. */ +static int +mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +{ + unsigned int hook; + + /* No recursion; use packet counter to save back ptrs (reset + to 0 as we leave), and comefrom to save source hook bitmask */ + for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { + unsigned int pos = newinfo->hook_entry[hook]; + struct ip6t_entry *e + = (struct ip6t_entry *)(newinfo->entries + pos); + + if (!(valid_hooks & (1 << hook))) + continue; + + /* Set initial back pointer. 
*/ + e->counters.pcnt = pos; + + for (;;) { + struct ip6t_standard_target *t + = (void *)ip6t_get_target(e); + + if (e->comefrom & (1 << NF_IP6_NUMHOOKS)) { + printk("iptables: loop hook %u pos %u %08X.\n", + hook, pos, e->comefrom); + return 0; + } + e->comefrom + |= ((1 << hook) | (1 << NF_IP6_NUMHOOKS)); + + /* Unconditional return/END. */ + if (e->target_offset == sizeof(struct ip6t_entry) + && (strcmp(t->target.u.user.name, + IP6T_STANDARD_TARGET) == 0) + && t->verdict < 0 + && unconditional(&e->ipv6)) { + unsigned int oldpos, size; + + /* Return: backtrack through the last + big jump. */ + do { + e->comefrom ^= (1<comefrom + & (1 << NF_IP6_NUMHOOKS)) { + duprintf("Back unset " + "on hook %u " + "rule %u\n", + hook, pos); + } +#endif + oldpos = pos; + pos = e->counters.pcnt; + e->counters.pcnt = 0; + + /* We're at the start. */ + if (pos == oldpos) + goto next; + + e = (struct ip6t_entry *) + (newinfo->entries + pos); + } while (oldpos == pos + e->next_offset); + + /* Move along one */ + size = e->next_offset; + e = (struct ip6t_entry *) + (newinfo->entries + pos + size); + e->counters.pcnt = pos; + pos += size; + } else { + int newpos = t->verdict; + + if (strcmp(t->target.u.user.name, + IP6T_STANDARD_TARGET) == 0 + && newpos >= 0) { + /* This a jump; chase it. */ + duprintf("Jump rule %u -> %u\n", + pos, newpos); + } else { + /* ... this is a fallthru */ + newpos = pos + e->next_offset; + } + e = (struct ip6t_entry *) + (newinfo->entries + newpos); + e->counters.pcnt = pos; + pos = newpos; + } + } + next: + duprintf("Finished chain %u\n", hook); + } + return 1; +} + +static inline int +cleanup_match(struct ip6t_entry_match *m, unsigned int *i) +{ + if (i && (*i)-- == 0) + return 1; + + if (m->u.kernel.match->destroy) + m->u.kernel.match->destroy(m->data, + m->u.match_size - sizeof(*m)); + module_put(m->u.kernel.match->me); + return 0; +} + +static inline int +standard_check(const struct ip6t_entry_target *t, + unsigned int max_offset) +{ + struct ip6t_standard_target *targ = (void *)t; + + /* Check standard info. 
*/ + if (t->u.target_size + != IP6T_ALIGN(sizeof(struct ip6t_standard_target))) { + duprintf("standard_check: target size %u != %u\n", + t->u.target_size, + IP6T_ALIGN(sizeof(struct ip6t_standard_target))); + return 0; + } + + if (targ->verdict >= 0 + && targ->verdict > max_offset - sizeof(struct ip6t_entry)) { + duprintf("ip6t_standard_check: bad verdict (%i)\n", + targ->verdict); + return 0; + } + + if (targ->verdict < -NF_MAX_VERDICT - 1) { + duprintf("ip6t_standard_check: bad negative verdict (%i)\n", + targ->verdict); + return 0; + } + return 1; +} + +static inline int +check_match(struct ip6t_entry_match *m, + const char *name, + const struct ip6t_ip6 *ipv6, + unsigned int hookmask, + unsigned int *i) +{ + int ret; + struct ip6t_match *match; + + match = find_match_lock(m->u.user.name, &ret, &ip6t_mutex); + if (!match) { + // duprintf("check_match: `%s' not found\n", m->u.name); + return ret; + } + if (!try_module_get(match->me)) { + up(&ip6t_mutex); + return -ENOENT; + } + m->u.kernel.match = match; + up(&ip6t_mutex); + + if (m->u.kernel.match->checkentry + && !m->u.kernel.match->checkentry(name, ipv6, m->data, + m->u.match_size - sizeof(*m), + hookmask)) { + module_put(m->u.kernel.match->me); + duprintf("ip_tables: check failed for `%s'.\n", + m->u.kernel.match->name); + return -EINVAL; + } + + (*i)++; + return 0; +} + +static struct ip6t_target ip6t_standard_target; + +static inline int +check_entry(struct ip6t_entry *e, const char *name, unsigned int size, + unsigned int *i) +{ + struct ip6t_entry_target *t; + struct ip6t_target *target; + int ret; + unsigned int j; + + if (!ip6_checkentry(&e->ipv6)) { + duprintf("ip_tables: ip check failed %p %s.\n", e, name); + return -EINVAL; + } + + j = 0; + ret = IP6T_MATCH_ITERATE(e, check_match, name, &e->ipv6, e->comefrom, &j); + if (ret != 0) + goto cleanup_matches; + + t = ip6t_get_target(e); + target = ip6t_find_target_lock(t->u.user.name, &ret, &ip6t_mutex); + if (!target) { + duprintf("check_entry: `%s' not found\n", t->u.user.name); + goto cleanup_matches; + } + if (!try_module_get(target->me)) { + up(&ip6t_mutex); + ret = -ENOENT; + goto cleanup_matches; + } + t->u.kernel.target = target; + up(&ip6t_mutex); + if (!t->u.kernel.target) { + ret = -EBUSY; + goto cleanup_matches; + } + if (t->u.kernel.target == &ip6t_standard_target) { + if (!standard_check(t, size)) { + ret = -EINVAL; + goto cleanup_matches; + } + } else if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(name, e, t->data, + t->u.target_size + - sizeof(*t), + e->comefrom)) { + module_put(t->u.kernel.target->me); + duprintf("ip_tables: check failed for `%s'.\n", + t->u.kernel.target->name); + ret = -EINVAL; + goto cleanup_matches; + } + + (*i)++; + return 0; + + cleanup_matches: + IP6T_MATCH_ITERATE(e, cleanup_match, &j); + return ret; +} + +static inline int +check_entry_size_and_hooks(struct ip6t_entry *e, + struct ip6t_table_info *newinfo, + unsigned char *base, + unsigned char *limit, + const unsigned int *hook_entries, + const unsigned int *underflows, + unsigned int *i) +{ + unsigned int h; + + if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 + || (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) { + duprintf("Bad offset %p\n", e); + return -EINVAL; + } + + if (e->next_offset + < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) { + duprintf("checking: element %p size %u\n", + e, e->next_offset); + return -EINVAL; + } + + /* Check hooks & underflows */ + for (h = 0; h < NF_IP6_NUMHOOKS; h++) { + if ((unsigned char 
*)e - base == hook_entries[h]) + newinfo->hook_entry[h] = hook_entries[h]; + if ((unsigned char *)e - base == underflows[h]) + newinfo->underflow[h] = underflows[h]; + } + + /* FIXME: underflows must be unconditional, standard verdicts + < 0 (not IP6T_RETURN). --RR */ + + /* Clear counters and comefrom */ + e->counters = ((struct ip6t_counters) { 0, 0 }); + e->comefrom = 0; + + (*i)++; + return 0; +} + +static inline int +cleanup_entry(struct ip6t_entry *e, unsigned int *i) +{ + struct ip6t_entry_target *t; + + if (i && (*i)-- == 0) + return 1; + + /* Cleanup all matches */ + IP6T_MATCH_ITERATE(e, cleanup_match, NULL); + t = ip6t_get_target(e); + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); + return 0; +} + +/* Checks and translates the user-supplied table segment (held in + newinfo) */ +static int +translate_table(const char *name, + unsigned int valid_hooks, + struct ip6t_table_info *newinfo, + unsigned int size, + unsigned int number, + const unsigned int *hook_entries, + const unsigned int *underflows) +{ + unsigned int i; + int ret; + + newinfo->size = size; + newinfo->number = number; + + /* Init all hooks to impossible value. */ + for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + newinfo->hook_entry[i] = 0xFFFFFFFF; + newinfo->underflow[i] = 0xFFFFFFFF; + } + + duprintf("translate_table: size %u\n", newinfo->size); + i = 0; + /* Walk through entries, checking offsets. */ + ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry_size_and_hooks, + newinfo, + newinfo->entries, + newinfo->entries + size, + hook_entries, underflows, &i); + if (ret != 0) + return ret; + + if (i != number) { + duprintf("translate_table: %u not %u entries\n", + i, number); + return -EINVAL; + } + + /* Check hooks all assigned */ + for (i = 0; i < NF_IP6_NUMHOOKS; i++) { + /* Only hooks which are valid */ + if (!(valid_hooks & (1 << i))) + continue; + if (newinfo->hook_entry[i] == 0xFFFFFFFF) { + duprintf("Invalid hook entry %u %u\n", + i, hook_entries[i]); + return -EINVAL; + } + if (newinfo->underflow[i] == 0xFFFFFFFF) { + duprintf("Invalid underflow %u %u\n", + i, underflows[i]); + return -EINVAL; + } + } + + if (!mark_source_chains(newinfo, valid_hooks)) + return -ELOOP; + + /* Finally, each sanity check must pass */ + i = 0; + ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + check_entry, name, size, &i); + + if (ret != 0) { + IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + cleanup_entry, &i); + return ret; + } + + /* And one copy for every other CPU */ + for (i = 1; i < num_possible_cpus(); i++) { + memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i, + newinfo->entries, + SMP_ALIGN(newinfo->size)); + } + + return ret; +} + +static struct ip6t_table_info * +replace_table(struct ip6t_table *table, + unsigned int num_counters, + struct ip6t_table_info *newinfo, + int *error) +{ + struct ip6t_table_info *oldinfo; + +#ifdef CONFIG_NETFILTER_DEBUG + { + struct ip6t_entry *table_base; + unsigned int i; + + for (i = 0; i < num_possible_cpus(); i++) { + table_base = + (void *)newinfo->entries + + TABLE_OFFSET(newinfo, i); + + table_base->comefrom = 0xdead57ac; + } + } +#endif + + /* Do the substitution. */ + write_lock_bh(&table->lock); + /* Check inside lock: is the old number correct? 
*/ + if (num_counters != table->private->number) { + duprintf("num_counters != table->private->number (%u/%u)\n", + num_counters, table->private->number); + write_unlock_bh(&table->lock); + *error = -EAGAIN; + return NULL; + } + oldinfo = table->private; + table->private = newinfo; + newinfo->initial_entries = oldinfo->initial_entries; + write_unlock_bh(&table->lock); + + return oldinfo; +} + +/* Gets counters. */ +static inline int +add_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + +static void +get_counters(const struct ip6t_table_info *t, + struct ip6t_counters counters[]) +{ + unsigned int cpu; + unsigned int i; + + for (cpu = 0; cpu < num_possible_cpus(); cpu++) { + i = 0; + IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + t->size, + add_entry_to_counter, + counters, + &i); + } +} + +static int +copy_entries_to_user(unsigned int total_size, + struct ip6t_table *table, + void __user *userptr) +{ + unsigned int off, num, countersize; + struct ip6t_entry *e; + struct ip6t_counters *counters; + int ret = 0; + + /* We need atomic snapshot of counters: rest doesn't change + (other than comefrom, which userspace doesn't care + about). */ + countersize = sizeof(struct ip6t_counters) * table->private->number; + counters = vmalloc(countersize); + + if (counters == NULL) + return -ENOMEM; + + /* First, sum counters... */ + memset(counters, 0, countersize); + write_lock_bh(&table->lock); + get_counters(table->private, counters); + write_unlock_bh(&table->lock); + + /* ... then copy entire thing from CPU 0... */ + if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + ret = -EFAULT; + goto free_counters; + } + + /* FIXME: use iterator macros --RR */ + /* ... 
then go back and fix counters and names */ + for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ + unsigned int i; + struct ip6t_entry_match *m; + struct ip6t_entry_target *t; + + e = (struct ip6t_entry *)(table->private->entries + off); + if (copy_to_user(userptr + off + + offsetof(struct ip6t_entry, counters), + &counters[num], + sizeof(counters[num])) != 0) { + ret = -EFAULT; + goto free_counters; + } + + for (i = sizeof(struct ip6t_entry); + i < e->target_offset; + i += m->u.match_size) { + m = (void *)e + i; + + if (copy_to_user(userptr + off + i + + offsetof(struct ip6t_entry_match, + u.user.name), + m->u.kernel.match->name, + strlen(m->u.kernel.match->name)+1) + != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + t = ip6t_get_target(e); + if (copy_to_user(userptr + off + e->target_offset + + offsetof(struct ip6t_entry_target, + u.user.name), + t->u.kernel.target->name, + strlen(t->u.kernel.target->name)+1) != 0) { + ret = -EFAULT; + goto free_counters; + } + } + + free_counters: + vfree(counters); + return ret; +} + +static int +get_entries(const struct ip6t_get_entries *entries, + struct ip6t_get_entries __user *uptr) +{ + int ret; + struct ip6t_table *t; + + t = ip6t_find_table_lock(entries->name, &ret, &ip6t_mutex); + if (t) { + duprintf("t->private->number = %u\n", + t->private->number); + if (entries->size == t->private->size) + ret = copy_entries_to_user(t->private->size, + t, uptr->entrytable); + else { + duprintf("get_entries: I've got %u not %u!\n", + t->private->size, + entries->size); + ret = -EINVAL; + } + up(&ip6t_mutex); + } else + duprintf("get_entries: Can't find %s!\n", + entries->name); + + return ret; +} + +static int +do_replace(void __user *user, unsigned int len) +{ + int ret; + struct ip6t_replace tmp; + struct ip6t_table *t; + struct ip6t_table_info *newinfo, *oldinfo; + struct ip6t_counters *counters; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ + if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) + return -ENOMEM; + + newinfo = vmalloc(sizeof(struct ip6t_table_info) + + SMP_ALIGN(tmp.size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + if (copy_from_user(newinfo->entries, user + sizeof(tmp), + tmp.size) != 0) { + ret = -EFAULT; + goto free_newinfo; + } + + counters = vmalloc(tmp.num_counters * sizeof(struct ip6t_counters)); + if (!counters) { + ret = -ENOMEM; + goto free_newinfo; + } + memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); + + ret = translate_table(tmp.name, tmp.valid_hooks, + newinfo, tmp.size, tmp.num_entries, + tmp.hook_entry, tmp.underflow); + if (ret != 0) + goto free_newinfo_counters; + + duprintf("ip_tables: Translated table\n"); + + t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex); + if (!t) + goto free_newinfo_counters_untrans; + + /* You lied! 
*/ + if (tmp.valid_hooks != t->valid_hooks) { + duprintf("Valid hook crap: %08X vs %08X\n", + tmp.valid_hooks, t->valid_hooks); + ret = -EINVAL; + goto free_newinfo_counters_untrans_unlock; + } + + /* Get a reference in advance, we're not allowed fail later */ + if (!try_module_get(t->me)) { + ret = -EBUSY; + goto free_newinfo_counters_untrans_unlock; + } + + oldinfo = replace_table(t, tmp.num_counters, newinfo, &ret); + if (!oldinfo) + goto put_module; + + /* Update module usage count based on number of rules */ + duprintf("do_replace: oldnum=%u, initnum=%u, newnum=%u\n", + oldinfo->number, oldinfo->initial_entries, newinfo->number); + if ((oldinfo->number > oldinfo->initial_entries) || + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + if ((oldinfo->number > oldinfo->initial_entries) && + (newinfo->number <= oldinfo->initial_entries)) + module_put(t->me); + + /* Get the old counters. */ + get_counters(oldinfo, counters); + /* Decrease module usage counts and free resource */ + IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); + vfree(oldinfo); + /* Silent error: too late now. */ + if (copy_to_user(tmp.counters, counters, + sizeof(struct ip6t_counters) * tmp.num_counters) != 0) + ret = -EFAULT; + vfree(counters); + up(&ip6t_mutex); + return ret; + + put_module: + module_put(t->me); + free_newinfo_counters_untrans_unlock: + up(&ip6t_mutex); + free_newinfo_counters_untrans: + IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + free_newinfo_counters: + vfree(counters); + free_newinfo: + vfree(newinfo); + return ret; +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static inline int +add_counter_to_entry(struct ip6t_entry *e, + const struct ip6t_counters addme[], + unsigned int *i) +{ +#if 0 + duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", + *i, + (long unsigned int)e->counters.pcnt, + (long unsigned int)e->counters.bcnt, + (long unsigned int)addme[*i].pcnt, + (long unsigned int)addme[*i].bcnt); +#endif + + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +static int +do_add_counters(void __user *user, unsigned int len) +{ + unsigned int i; + struct ip6t_counters_info tmp, *paddc; + struct ip6t_table *t; + int ret; + + if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) + return -EFAULT; + + if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ip6t_counters)) + return -EINVAL; + + paddc = vmalloc(len); + if (!paddc) + return -ENOMEM; + + if (copy_from_user(paddc, user, len) != 0) { + ret = -EFAULT; + goto free; + } + + t = ip6t_find_table_lock(tmp.name, &ret, &ip6t_mutex); + if (!t) + goto free; + + write_lock_bh(&t->lock); + if (t->private->number != paddc->num_counters) { + ret = -EINVAL; + goto unlock_up_free; + } + + i = 0; + IP6T_ENTRY_ITERATE(t->private->entries, + t->private->size, + add_counter_to_entry, + paddc->counters, + &i); + unlock_up_free: + write_unlock_bh(&t->lock); + up(&ip6t_mutex); + free: + vfree(paddc); + + return ret; +} + +static int +do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_SET_REPLACE: + ret = do_replace(user, len); + break; + + case IP6T_SO_SET_ADD_COUNTERS: + ret = do_add_counters(user, len); + break; + + default: + duprintf("do_ip6t_set_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +static int +do_ip6t_get_ctl(struct sock *sk, int 
cmd, void __user *user, int *len) +{ + int ret; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + switch (cmd) { + case IP6T_SO_GET_INFO: { + char name[IP6T_TABLE_MAXNAMELEN]; + struct ip6t_table *t; + + if (*len != sizeof(struct ip6t_getinfo)) { + duprintf("length %u != %u\n", *len, + sizeof(struct ip6t_getinfo)); + ret = -EINVAL; + break; + } + + if (copy_from_user(name, user, sizeof(name)) != 0) { + ret = -EFAULT; + break; + } + name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; + t = ip6t_find_table_lock(name, &ret, &ip6t_mutex); + if (t) { + struct ip6t_getinfo info; + + info.valid_hooks = t->valid_hooks; + memcpy(info.hook_entry, t->private->hook_entry, + sizeof(info.hook_entry)); + memcpy(info.underflow, t->private->underflow, + sizeof(info.underflow)); + info.num_entries = t->private->number; + info.size = t->private->size; + memcpy(info.name, name, sizeof(info.name)); + + if (copy_to_user(user, &info, *len) != 0) + ret = -EFAULT; + else + ret = 0; + + up(&ip6t_mutex); + } + } + break; + + case IP6T_SO_GET_ENTRIES: { + struct ip6t_get_entries get; + + if (*len < sizeof(get)) { + duprintf("get_entries: %u < %u\n", *len, sizeof(get)); + ret = -EINVAL; + } else if (copy_from_user(&get, user, sizeof(get)) != 0) { + ret = -EFAULT; + } else if (*len != sizeof(struct ip6t_get_entries) + get.size) { + duprintf("get_entries: %u != %u\n", *len, + sizeof(struct ip6t_get_entries) + get.size); + ret = -EINVAL; + } else + ret = get_entries(&get, user); + break; + } + + default: + duprintf("do_ip6t_get_ctl: unknown request %i\n", cmd); + ret = -EINVAL; + } + + return ret; +} + +/* Registration hooks for targets. */ +int +ip6t_register_target(struct ip6t_target *target) +{ + int ret; + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&ip6t_target, target)) { + duprintf("ip6t_register_target: `%s' already in list!\n", + target->name); + ret = -EINVAL; + } + up(&ip6t_mutex); + return ret; +} + +void +ip6t_unregister_target(struct ip6t_target *target) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_target, target); + up(&ip6t_mutex); +} + +int +ip6t_register_match(struct ip6t_match *match) +{ + int ret; + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) + return ret; + + if (!list_named_insert(&ip6t_match, match)) { + duprintf("ip6t_register_match: `%s' already in list!\n", + match->name); + ret = -EINVAL; + } + up(&ip6t_mutex); + + return ret; +} + +void +ip6t_unregister_match(struct ip6t_match *match) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_match, match); + up(&ip6t_mutex); +} + +int ip6t_register_table(struct ip6t_table *table, + const struct ip6t_replace *repl) +{ + int ret; + struct ip6t_table_info *newinfo; + static struct ip6t_table_info bootstrap + = { 0, 0, 0, { 0 }, { 0 }, { } }; + + newinfo = vmalloc(sizeof(struct ip6t_table_info) + + SMP_ALIGN(repl->size) * num_possible_cpus()); + if (!newinfo) + return -ENOMEM; + + memcpy(newinfo->entries, repl->entries, repl->size); + + ret = translate_table(table->name, table->valid_hooks, + newinfo, repl->size, + repl->num_entries, + repl->hook_entry, + repl->underflow); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + ret = down_interruptible(&ip6t_mutex); + if (ret != 0) { + vfree(newinfo); + return ret; + } + + /* Don't autoload: we'd eat our tail... */ + if (list_named_find(&ip6t_tables, table->name)) { + ret = -EEXIST; + goto free_unlock; + } + + /* Simplifies replace_table code. 
*/ + table->private = &bootstrap; + if (!replace_table(table, 0, newinfo, &ret)) + goto free_unlock; + + duprintf("table->private->number = %u\n", + table->private->number); + + /* save number of initial entries */ + table->private->initial_entries = table->private->number; + + rwlock_init(&table->lock); + list_prepend(&ip6t_tables, table); + + unlock: + up(&ip6t_mutex); + return ret; + + free_unlock: + vfree(newinfo); + goto unlock; +} + +void ip6t_unregister_table(struct ip6t_table *table) +{ + down(&ip6t_mutex); + LIST_DELETE(&ip6t_tables, table); + up(&ip6t_mutex); + + /* Decrease module usage counts and free resources */ + IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + cleanup_entry, NULL); + vfree(table->private); +} + +/* Returns 1 if the port is matched by the range, 0 otherwise */ +static inline int +port_match(u_int16_t min, u_int16_t max, u_int16_t port, int invert) +{ + int ret; + + ret = (port >= min && port <= max) ^ invert; + return ret; +} + +static int +tcp_find_option(u_int8_t option, + const struct sk_buff *skb, + unsigned int tcpoff, + unsigned int optlen, + int invert, + int *hotdrop) +{ + /* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */ + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + + duprintf("tcp_match: finding option\n"); + if (!optlen) + return invert; + /* If we don't have the whole header, drop packet. */ + op = skb_header_pointer(skb, tcpoff + sizeof(struct tcphdr), optlen, + _opt); + if (op == NULL) { + *hotdrop = 1; + return 0; + } + + for (i = 0; i < optlen; ) { + if (op[i] == option) return !invert; + if (op[i] < 2) i++; + else i += op[i+1]?:1; + } + + return invert; +} + +static int +tcp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct tcphdr _tcph, *th; + const struct ip6t_tcp *tcpinfo = matchinfo; + + if (offset) { + /* To quote Alan: + + Don't allow a fragment of TCP 8 bytes in. Nobody normal + causes this. Its a cracker trying to break in by doing a + flag overwrite to pass the direction checks. + */ + if (offset == 1) { + duprintf("Dropping evil TCP offset=1 frag.\n"); + *hotdrop = 1; + } + /* Must not be a fragment. */ + return 0; + } + +#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg)) + + th = skb_header_pointer(skb, protoff, sizeof(_tcph), &_tcph); + if (th == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil TCP offset=0 tinygram.\n"); + *hotdrop = 1; + return 0; + } + + if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1], + ntohs(th->source), + !!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT))) + return 0; + if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1], + ntohs(th->dest), + !!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT))) + return 0; + if (!FWINVTCP((((unsigned char *)th)[13] & tcpinfo->flg_mask) + == tcpinfo->flg_cmp, + IP6T_TCP_INV_FLAGS)) + return 0; + if (tcpinfo->option) { + if (th->doff * 4 < sizeof(_tcph)) { + *hotdrop = 1; + return 0; + } + if (!tcp_find_option(tcpinfo->option, skb, protoff, + th->doff*4 - sizeof(*th), + tcpinfo->invflags & IP6T_TCP_INV_OPTION, + hotdrop)) + return 0; + } + return 1; +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +tcp_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_tcp *tcpinfo = matchinfo; + + /* Must specify proto == TCP, and no unknown invflags */ + return ipv6->proto == IPPROTO_TCP + && !(ipv6->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_tcp)) + && !(tcpinfo->invflags & ~IP6T_TCP_INV_MASK); +} + +static int +udp_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct udphdr _udph, *uh; + const struct ip6t_udp *udpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + uh = skb_header_pointer(skb, protoff, sizeof(_udph), &_udph); + if (uh == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil UDP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return port_match(udpinfo->spts[0], udpinfo->spts[1], + ntohs(uh->source), + !!(udpinfo->invflags & IP6T_UDP_INV_SRCPT)) + && port_match(udpinfo->dpts[0], udpinfo->dpts[1], + ntohs(uh->dest), + !!(udpinfo->invflags & IP6T_UDP_INV_DSTPT)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +udp_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_udp *udpinfo = matchinfo; + + /* Must specify proto == UDP, and no unknown invflags */ + if (ipv6->proto != IPPROTO_UDP || (ipv6->invflags & IP6T_INV_PROTO)) { + duprintf("ip6t_udp: Protocol %u != %u\n", ipv6->proto, + IPPROTO_UDP); + return 0; + } + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_udp))) { + duprintf("ip6t_udp: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_udp))); + return 0; + } + if (udpinfo->invflags & ~IP6T_UDP_INV_MASK) { + duprintf("ip6t_udp: unknown flags %X\n", + udpinfo->invflags); + return 0; + } + + return 1; +} + +/* Returns 1 if the type and code is matched by the range, 0 otherwise */ +static inline int +icmp6_type_code_match(u_int8_t test_type, u_int8_t min_code, u_int8_t max_code, + u_int8_t type, u_int8_t code, + int invert) +{ + return (type == test_type && code >= min_code && code <= max_code) + ^ invert; +} + +static int +icmp6_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct icmp6hdr _icmp, *ic; + const struct ip6t_icmp *icmpinfo = matchinfo; + + /* Must not be a fragment. */ + if (offset) + return 0; + + ic = skb_header_pointer(skb, protoff, sizeof(_icmp), &_icmp); + if (ic == NULL) { + /* We've been asked to examine this packet, and we + can't. Hence, no choice but to drop. */ + duprintf("Dropping evil ICMP tinygram.\n"); + *hotdrop = 1; + return 0; + } + + return icmp6_type_code_match(icmpinfo->type, + icmpinfo->code[0], + icmpinfo->code[1], + ic->icmp6_type, ic->icmp6_code, + !!(icmpinfo->invflags&IP6T_ICMP_INV)); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +icmp6_checkentry(const char *tablename, + const struct ip6t_ip6 *ipv6, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_icmp *icmpinfo = matchinfo; + + /* Must specify proto == ICMP, and no unknown invflags */ + return ipv6->proto == IPPROTO_ICMPV6 + && !(ipv6->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_icmp)) + && !(icmpinfo->invflags & ~IP6T_ICMP_INV); +} + +/* The built-in targets: standard (NULL) and error. */ +static struct ip6t_target ip6t_standard_target = { + .name = IP6T_STANDARD_TARGET, +}; + +static struct ip6t_target ip6t_error_target = { + .name = IP6T_ERROR_TARGET, + .target = ip6t_error, +}; + +static struct nf_sockopt_ops ip6t_sockopts = { + .pf = PF_INET6, + .set_optmin = IP6T_BASE_CTL, + .set_optmax = IP6T_SO_SET_MAX+1, + .set = do_ip6t_set_ctl, + .get_optmin = IP6T_BASE_CTL, + .get_optmax = IP6T_SO_GET_MAX+1, + .get = do_ip6t_get_ctl, +}; + +static struct ip6t_match tcp_matchstruct = { + .name = "tcp", + .match = &tcp_match, + .checkentry = &tcp_checkentry, +}; + +static struct ip6t_match udp_matchstruct = { + .name = "udp", + .match = &udp_match, + .checkentry = &udp_checkentry, +}; + +static struct ip6t_match icmp6_matchstruct = { + .name = "icmp6", + .match = &icmp6_match, + .checkentry = &icmp6_checkentry, +}; + +#ifdef CONFIG_PROC_FS +static inline int print_name(const char *i, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if ((*count)++ >= start_offset) { + unsigned int namelen; + + namelen = sprintf(buffer + *pos, "%s\n", + i + sizeof(struct list_head)); + if (*pos + namelen > length) { + /* Stop iterating */ + return 1; + } + *pos += namelen; + } + return 0; +} + +static inline int print_target(const struct ip6t_target *t, + off_t start_offset, char *buffer, int length, + off_t *pos, unsigned int *count) +{ + if (t == &ip6t_standard_target || t == &ip6t_error_target) + return 0; + return print_name((char *)t, start_offset, buffer, length, pos, count); +} + +static int ip6t_get_tables(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_tables, print_name, char *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + /* `start' hack - see fs/proc/generic.c line ~105 */ + *start=(char *)((unsigned long)count-offset); + return pos; +} + +static int ip6t_get_targets(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_target, print_target, struct ip6t_target *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static int ip6t_get_matches(char *buffer, char **start, off_t offset, int length) +{ + off_t pos = 0; + unsigned int count = 0; + + if (down_interruptible(&ip6t_mutex) != 0) + return 0; + + LIST_FIND(&ip6t_match, print_name, char *, + offset, buffer, length, &pos, &count); + + up(&ip6t_mutex); + + *start = (char *)((unsigned long)count - offset); + return pos; +} + +static struct { char *name; get_info_t *get_info; } ip6t_proc_entry[] = +{ { "ip6_tables_names", ip6t_get_tables }, + { "ip6_tables_targets", ip6t_get_targets }, + { "ip6_tables_matches", ip6t_get_matches }, + { NULL, NULL} }; +#endif /*CONFIG_PROC_FS*/ + +static int __init init(void) +{ + int ret; + + /* Noone else will be 
downing sem now, so we won't sleep */ + down(&ip6t_mutex); + list_append(&ip6t_target, &ip6t_standard_target); + list_append(&ip6t_target, &ip6t_error_target); + list_append(&ip6t_match, &tcp_matchstruct); + list_append(&ip6t_match, &udp_matchstruct); + list_append(&ip6t_match, &icmp6_matchstruct); + up(&ip6t_mutex); + + /* Register setsockopt */ + ret = nf_register_sockopt(&ip6t_sockopts); + if (ret < 0) { + duprintf("Unable to register sockopts.\n"); + return ret; + } + +#ifdef CONFIG_PROC_FS + { + struct proc_dir_entry *proc; + int i; + + for (i = 0; ip6t_proc_entry[i].name; i++) { + proc = proc_net_create(ip6t_proc_entry[i].name, 0, + ip6t_proc_entry[i].get_info); + if (!proc) { + while (--i >= 0) + proc_net_remove(ip6t_proc_entry[i].name); + nf_unregister_sockopt(&ip6t_sockopts); + return -ENOMEM; + } + proc->owner = THIS_MODULE; + } + } +#endif + + printk("ip6_tables: (C) 2000-2002 Netfilter core team\n"); + return 0; +} + +static void __exit fini(void) +{ + nf_unregister_sockopt(&ip6t_sockopts); +#ifdef CONFIG_PROC_FS + { + int i; + for (i = 0; ip6t_proc_entry[i].name; i++) + proc_net_remove(ip6t_proc_entry[i].name); + } +#endif +} + +EXPORT_SYMBOL(ip6t_register_table); +EXPORT_SYMBOL(ip6t_unregister_table); +EXPORT_SYMBOL(ip6t_do_table); +EXPORT_SYMBOL(ip6t_register_match); +EXPORT_SYMBOL(ip6t_unregister_match); +EXPORT_SYMBOL(ip6t_register_target); +EXPORT_SYMBOL(ip6t_unregister_target); +EXPORT_SYMBOL(ip6t_ext_hdr); + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c new file mode 100644 index 000000000000..bfc3d0185d19 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -0,0 +1,509 @@ +/* + * This is a module which is used for logging packets. + */ + +/* (C) 2001 Jan Rekorajski + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +MODULE_AUTHOR("Jan Rekorajski "); +MODULE_DESCRIPTION("IP6 tables LOG target module"); +MODULE_LICENSE("GPL"); + +static unsigned int nflog = 1; +module_param(nflog, int, 0400); +MODULE_PARM_DESC(nflog, "register as internal netfilter logging module"); + +struct in_device; +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) 
+#endif + +/* Use lock to serialize, so printks don't overlap */ +static DEFINE_SPINLOCK(log_lock); + +/* One level of recursion won't kill us */ +static void dump_packet(const struct ip6t_log_info *info, + const struct sk_buff *skb, unsigned int ip6hoff, + int recurse) +{ + u_int8_t currenthdr; + int fragment; + struct ipv6hdr _ip6h, *ih; + unsigned int ptr; + unsigned int hdrlen = 0; + + ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); + if (ih == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000" */ + printk("SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->saddr)); + printk("DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", NIP6(ih->daddr)); + + /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ + printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + ntohs(ih->payload_len) + sizeof(struct ipv6hdr), + (ntohl(*(u_int32_t *)ih) & 0x0ff00000) >> 20, + ih->hop_limit, + (ntohl(*(u_int32_t *)ih) & 0x000fffff)); + + fragment = 0; + ptr = ip6hoff + sizeof(struct ipv6hdr); + currenthdr = ih->nexthdr; + while (currenthdr != NEXTHDR_NONE && ip6t_ext_hdr(currenthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + if (hp == NULL) { + printk("TRUNCATED"); + return; + } + + /* Max length: 48 "OPT (...) " */ + if (info->logflags & IP6T_LOG_IPOPT) + printk("OPT ( "); + + switch (currenthdr) { + case IPPROTO_FRAGMENT: { + struct frag_hdr _fhdr, *fh; + + printk("FRAG:"); + fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), + &_fhdr); + if (fh == NULL) { + printk("TRUNCATED "); + return; + } + + /* Max length: 6 "65535 " */ + printk("%u ", ntohs(fh->frag_off) & 0xFFF8); + + /* Max length: 11 "INCOMPLETE " */ + if (fh->frag_off & htons(0x0001)) + printk("INCOMPLETE "); + + printk("ID:%08x ", ntohl(fh->identification)); + + if (ntohs(fh->frag_off) & 0xFFF8) + fragment = 1; + + hdrlen = 8; + + break; + } + case IPPROTO_DSTOPTS: + case IPPROTO_ROUTING: + case IPPROTO_HOPOPTS: + if (fragment) { + if (info->logflags & IP6T_LOG_IPOPT) + printk(")"); + return; + } + hdrlen = ipv6_optlen(hp); + break; + /* Max Length */ + case IPPROTO_AH: + if (info->logflags & IP6T_LOG_IPOPT) { + struct ip_auth_hdr _ahdr, *ah; + + /* Max length: 3 "AH " */ + printk("AH "); + + if (fragment) { + printk(")"); + return; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ahdr), + &_ahdr); + if (ah == NULL) { + /* + * Max length: 26 "INCOMPLETE [65535 + * bytes] )" + */ + printk("INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 15 "SPI=0xF1234567 */ + printk("SPI=0x%x ", ntohl(ah->spi)); + + } + + hdrlen = (hp->hdrlen+2)<<2; + break; + case IPPROTO_ESP: + if (info->logflags & IP6T_LOG_IPOPT) { + struct ip_esp_hdr _esph, *eh; + + /* Max length: 4 "ESP " */ + printk("ESP "); + + if (fragment) { + printk(")"); + return; + } + + /* + * Max length: 26 "INCOMPLETE [65535 bytes] )" + */ + eh = skb_header_pointer(skb, ptr, sizeof(_esph), + &_esph); + if (eh == NULL) { + printk("INCOMPLETE [%u bytes] )", + skb->len - ptr); + return; + } + + /* Length: 16 "SPI=0xF1234567 )" */ + printk("SPI=0x%x )", ntohl(eh->spi) ); + + } + return; + default: + /* Max length: 20 "Unknown Ext Hdr 255" */ + printk("Unknown Ext Hdr %u", currenthdr); + return; + } + if (info->logflags & IP6T_LOG_IPOPT) + printk(") "); + + currenthdr = hp->nexthdr; + ptr += hdrlen; + } + + switch (currenthdr) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + /* Max length: 
10 "PROTO=TCP " */ + printk("PROTO=TCP "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); + if (th == NULL) { + printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u ", + ntohs(th->source), ntohs(th->dest)); + /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ + if (info->logflags & IP6T_LOG_TCPSEQ) + printk("SEQ=%u ACK=%u ", + ntohl(th->seq), ntohl(th->ack_seq)); + /* Max length: 13 "WINDOW=65535 " */ + printk("WINDOW=%u ", ntohs(th->window)); + /* Max length: 9 "RES=0x3C " */ + printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ + if (th->cwr) + printk("CWR "); + if (th->ece) + printk("ECE "); + if (th->urg) + printk("URG "); + if (th->ack) + printk("ACK "); + if (th->psh) + printk("PSH "); + if (th->rst) + printk("RST "); + if (th->syn) + printk("SYN "); + if (th->fin) + printk("FIN "); + /* Max length: 11 "URGP=65535 " */ + printk("URGP=%u ", ntohs(th->urg_ptr)); + + if ((info->logflags & IP6T_LOG_TCPOPT) + && th->doff * 4 > sizeof(struct tcphdr)) { + u_int8_t _opt[60 - sizeof(struct tcphdr)], *op; + unsigned int i; + unsigned int optsize = th->doff * 4 + - sizeof(struct tcphdr); + + op = skb_header_pointer(skb, + ptr + sizeof(struct tcphdr), + optsize, _opt); + if (op == NULL) { + printk("OPT (TRUNCATED)"); + return; + } + + /* Max length: 127 "OPT (" 15*4*2chars ") " */ + printk("OPT ("); + for (i =0; i < optsize; i++) + printk("%02X", op[i]); + printk(") "); + } + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + /* Max length: 10 "PROTO=UDP " */ + printk("PROTO=UDP "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); + if (uh == NULL) { + printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 20 "SPT=65535 DPT=65535 " */ + printk("SPT=%u DPT=%u LEN=%u ", + ntohs(uh->source), ntohs(uh->dest), + ntohs(uh->len)); + break; + } + case IPPROTO_ICMPV6: { + struct icmp6hdr _icmp6h, *ic; + + /* Max length: 13 "PROTO=ICMPv6 " */ + printk("PROTO=ICMPv6 "); + + if (fragment) + break; + + /* Max length: 25 "INCOMPLETE [65535 bytes] " */ + ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); + if (ic == NULL) { + printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + return; + } + + /* Max length: 18 "TYPE=255 CODE=255 " */ + printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + + switch (ic->icmp6_type) { + case ICMPV6_ECHO_REQUEST: + case ICMPV6_ECHO_REPLY: + /* Max length: 19 "ID=65535 SEQ=65535 " */ + printk("ID=%u SEQ=%u ", + ntohs(ic->icmp6_identifier), + ntohs(ic->icmp6_sequence)); + break; + case ICMPV6_MGM_QUERY: + case ICMPV6_MGM_REPORT: + case ICMPV6_MGM_REDUCTION: + break; + + case ICMPV6_PARAMPROB: + /* Max length: 17 "POINTER=ffffffff " */ + printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); + /* Fall through */ + case ICMPV6_DEST_UNREACH: + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + /* Max length: 3+maxlen */ + if (recurse) { + printk("["); + dump_packet(info, skb, ptr + sizeof(_icmp6h), + 0); + printk("] "); + } + + /* Max length: 10 "MTU=65535 " */ + if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) + printk("MTU=%u ", ntohl(ic->icmp6_mtu)); + } + break; + } + /* Max length: 10 "PROTO=255 " */ + default: + printk("PROTO=%u ", currenthdr); + } + + /* Max length: 15 
"UID=4294967295 " */ + if ((info->logflags & IP6T_LOG_UID) && recurse && skb->sk) { + read_lock_bh(&skb->sk->sk_callback_lock); + if (skb->sk->sk_socket && skb->sk->sk_socket->file) + printk("UID=%u ", skb->sk->sk_socket->file->f_uid); + read_unlock_bh(&skb->sk->sk_callback_lock); + } +} + +static void +ip6t_log_packet(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const struct ip6t_log_info *loginfo, + const char *level_string, + const char *prefix) +{ + struct ipv6hdr *ipv6h = skb->nh.ipv6h; + + spin_lock_bh(&log_lock); + printk(level_string); + printk("%sIN=%s OUT=%s ", + prefix == NULL ? loginfo->prefix : prefix, + in ? in->name : "", + out ? out->name : ""); + if (in && !out) { + /* MAC logging for input chain only. */ + printk("MAC="); + if (skb->dev && skb->dev->hard_header_len && skb->mac.raw != (void*)ipv6h) { + if (skb->dev->type != ARPHRD_SIT){ + int i; + unsigned char *p = skb->mac.raw; + for (i = 0; i < skb->dev->hard_header_len; i++,p++) + printk("%02x%c", *p, + i==skb->dev->hard_header_len - 1 + ? ' ':':'); + } else { + int i; + unsigned char *p = skb->mac.raw; + if ( p - (ETH_ALEN*2+2) > skb->head ){ + p -= (ETH_ALEN+2); + for (i = 0; i < (ETH_ALEN); i++,p++) + printk("%02x%s", *p, + i == ETH_ALEN-1 ? "->" : ":"); + p -= (ETH_ALEN*2); + for (i = 0; i < (ETH_ALEN); i++,p++) + printk("%02x%c", *p, + i == ETH_ALEN-1 ? ' ' : ':'); + } + + if ((skb->dev->addr_len == 4) && + skb->dev->hard_header_len > 20){ + printk("TUNNEL="); + p = skb->mac.raw + 12; + for (i = 0; i < 4; i++,p++) + printk("%3d%s", *p, + i == 3 ? "->" : "."); + for (i = 0; i < 4; i++,p++) + printk("%3d%c", *p, + i == 3 ? ' ' : '.'); + } + } + } else + printk(" "); + } + + dump_packet(loginfo, skb, (u8*)skb->nh.ipv6h - skb->data, 1); + printk("\n"); + spin_unlock_bh(&log_lock); +} + +static unsigned int +ip6t_log_target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_log_info *loginfo = targinfo; + char level_string[4] = "< >"; + + level_string[1] = '0' + (loginfo->level % 8); + ip6t_log_packet(hooknum, *pskb, in, out, loginfo, level_string, NULL); + + return IP6T_CONTINUE; +} + +static void +ip6t_logfn(unsigned int hooknum, + const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const char *prefix) +{ + struct ip6t_log_info loginfo = { + .level = 0, + .logflags = IP6T_LOG_MASK, + .prefix = "" + }; + + ip6t_log_packet(hooknum, skb, in, out, &loginfo, KERN_WARNING, prefix); +} + +static int ip6t_log_checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + const struct ip6t_log_info *loginfo = targinfo; + + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_log_info))) { + DEBUGP("LOG: targinfosize %u != %u\n", + targinfosize, IP6T_ALIGN(sizeof(struct ip6t_log_info))); + return 0; + } + + if (loginfo->level >= 8) { + DEBUGP("LOG: level %u >= 8\n", loginfo->level); + return 0; + } + + if (loginfo->prefix[sizeof(loginfo->prefix)-1] != '\0') { + DEBUGP("LOG: prefix term %i\n", + loginfo->prefix[sizeof(loginfo->prefix)-1]); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_log_reg = { + .name = "LOG", + .target = ip6t_log_target, + .checkentry = ip6t_log_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ip6t_register_target(&ip6t_log_reg)) + return -EINVAL; + if 
(nflog) + nf_log_register(PF_INET6, &ip6t_logfn); + + return 0; +} + +static void __exit fini(void) +{ + if (nflog) + nf_log_unregister(PF_INET6, &ip6t_logfn); + ip6t_unregister_target(&ip6t_log_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_MARK.c b/net/ipv6/netfilter/ip6t_MARK.c new file mode 100644 index 000000000000..d09ceb05013a --- /dev/null +++ b/net/ipv6/netfilter/ip6t_MARK.c @@ -0,0 +1,78 @@ +/* This is a module which is used for setting the NFMARK field of an skb. */ + +/* (C) 1999-2001 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ip6t_mark_target_info *markinfo = targinfo; + + if((*pskb)->nfmark != markinfo->mark) { + (*pskb)->nfmark = markinfo->mark; + (*pskb)->nfcache |= NFC_ALTERED; + } + return IP6T_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ip6t_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))) { + printk(KERN_WARNING "MARK: targinfosize %u != %Zu\n", + targinfosize, + IP6T_ALIGN(sizeof(struct ip6t_mark_target_info))); + return 0; + } + + if (strcmp(tablename, "mangle") != 0) { + printk(KERN_WARNING "MARK: can only be called from \"mangle\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static struct ip6t_target ip6t_mark_reg += { { NULL, NULL }, "MARK", target, checkentry, NULL, THIS_MODULE }; + +static int __init init(void) +{ + printk(KERN_DEBUG "registering ipv6 mark target\n"); + if (ip6t_register_target(&ip6t_mark_reg)) + return -EINVAL; + + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_target(&ip6t_mark_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c new file mode 100644 index 000000000000..d5b94f142bba --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -0,0 +1,208 @@ +/* Kernel module to match AH parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 AH match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + DEBUGP("ah spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r = (spi >= min && spi <= max) ^ invert; + DEBUGP(" result %s\n",r? 
"PASS\n" : "FAILED\n"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ip_auth_hdr *ah = NULL, _ah; + const struct ip6t_ah *ahinfo = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + + /*DEBUGP("IPv6 AH entered\n");*/ + /* if (opt->auth == 0) return 0; + * It does not filled on output */ + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_ah header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) + break; + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) + break; + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) + hdrlen = 8; + else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* AH -> evaluate */ + if (nexthdr == NEXTHDR_AUTH) { + temp |= MASK_AH; + break; + } + + + /* set the flag */ + switch (nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_ah match: unknown nextheader %u\n",nexthdr); + return 0; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) { + DEBUGP("ipv6_ah: new pointer too large! \n"); + break; + } + } + + /* AH header not found */ + if (temp != MASK_AH) + return 0; + + if (len < sizeof(struct ip_auth_hdr)){ + *hotdrop = 1; + return 0; + } + + ah = skb_header_pointer(skb, ptr, sizeof(_ah), &_ah); + BUG_ON(ah == NULL); + + DEBUGP("IPv6 AH LEN %u %u ", hdrlen, ah->hdrlen); + DEBUGP("RES %04X ", ah->reserved); + DEBUGP("SPI %u %08X\n", ntohl(ah->spi), ntohl(ah->spi)); + + DEBUGP("IPv6 AH spi %02X ", + (spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI)))); + DEBUGP("len %02X %04X %02X ", + ahinfo->hdrlen, hdrlen, + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN))); + DEBUGP("res %02X %04X %02X\n", + ahinfo->hdrres, ah->reserved, + !(ahinfo->hdrres && ah->reserved)); + + return (ah != NULL) + && + (spi_match(ahinfo->spis[0], ahinfo->spis[1], + ntohl(ah->spi), + !!(ahinfo->invflags & IP6T_AH_INV_SPI))) + && + (!ahinfo->hdrlen || + (ahinfo->hdrlen == hdrlen) ^ + !!(ahinfo->invflags & IP6T_AH_INV_LEN)) + && + !(ahinfo->hdrres && ah->reserved); +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_ah *ahinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_ah))) { + DEBUGP("ip6t_ah: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_ah))); + return 0; + } + if (ahinfo->invflags & ~IP6T_AH_INV_MASK) { + DEBUGP("ip6t_ah: unknown flags %X\n", ahinfo->invflags); + return 0; + } + return 1; +} + +static struct ip6t_match ah_match = { + .name = "ah", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&ah_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&ah_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_dst.c b/net/ipv6/netfilter/ip6t_dst.c new file mode 100644 index 000000000000..540925e4a7a8 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_dst.c @@ -0,0 +1,298 @@ +/* Kernel module to match Hop-by-Hop and Destination parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define HOPBYHOP 0 + +MODULE_LICENSE("GPL"); +#if HOPBYHOP +MODULE_DESCRIPTION("IPv6 HbH match"); +#else +MODULE_DESCRIPTION("IPv6 DST match"); +#endif +MODULE_AUTHOR("Andras Kis-Szabo "); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* + * (Type & 0xC0) >> 6 + * 0 -> ignorable + * 1 -> must drop the packet + * 2 -> send ICMP PARM PROB regardless and drop packet + * 3 -> Send ICMP if not a multicast address and drop packet + * (Type & 0x20) >> 5 + * 0 -> invariant + * 1 -> can change the routing + * (Type & 0x1F) Type + * 0 -> Pad1 (only 1 byte!) + * 1 -> PadN LENGTH info (total length = length + 2) + * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) + * 5 -> RTALERT 2 x x + */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ipv6_opt_hdr _optsh, *oh; + const struct ip6t_opts *optinfo = matchinfo; + unsigned int temp; + unsigned int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int ret = 0; + u8 _opttype, *tp = NULL; + u8 _optlen, *lp = NULL; + unsigned int optlen; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_opts header iteration \n"); + + /* Is there enough space for the next ext header? 
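The comment block above spells out how the high-order bits of an option type byte are interpreted (action on an unrecognised option, whether the option may change en route, and the low-order type value). As a quick illustration, this user-space snippet decodes a few well-known option types the same way; the function name and output format are illustrative, not the kernel's:

#include <stdint.h>
#include <stdio.h>

/* Decode the fields of an IPv6 option type byte as described in the
 * comment above. */
static void decode_opt_type(uint8_t type)
{
    unsigned action = (type & 0xC0) >> 6; /* what to do if unrecognised   */
    unsigned change = (type & 0x20) >> 5; /* may change en route?         */
    unsigned low    =  type & 0x1F;       /* low-order 5 bits of the type */

    printf("type 0x%02x: action=%u change_en_route=%u low=%u\n",
           (unsigned)type, action, change, low);
}

int main(void)
{
    decode_opt_type(0x00); /* Pad1          */
    decode_opt_type(0x01); /* PadN          */
    decode_opt_type(0xC2); /* Jumbo payload */
    decode_opt_type(0x05); /* Router alert  */
    return 0;
}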
*/ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* OPTS -> evaluate */ +#if HOPBYHOP + if (nexthdr == NEXTHDR_HOP) { + temp |= MASK_HOPOPTS; +#else + if (nexthdr == NEXTHDR_DEST) { + temp |= MASK_DSTOPTS; +#endif + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_opts match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_opts: new pointer is too large! \n"); + break; + } + } + + /* OPTIONS header not found */ +#if HOPBYHOP + if ( temp != MASK_HOPOPTS ) return 0; +#else + if ( temp != MASK_DSTOPTS ) return 0; +#endif + + if (len < (int)sizeof(struct ipv6_opt_hdr)){ + *hotdrop = 1; + return 0; + } + + if (len < hdrlen){ + /* Packet smaller than it's length field */ + return 0; + } + + oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); + BUG_ON(oh == NULL); + + DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); + + DEBUGP("len %02X %04X %02X ", + optinfo->hdrlen, hdrlen, + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); + + ret = (oh != NULL) + && + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); + + ptr += 2; + hdrlen -= 2; + if ( !(optinfo->flags & IP6T_OPTS_OPTS) ){ + return ret; + } else if (optinfo->flags & IP6T_OPTS_NSTRICT) { + DEBUGP("Not strict - not implemented"); + } else { + DEBUGP("Strict "); + DEBUGP("#%d ",optinfo->optsnr); + for(temp=0; tempoptsnr; temp++){ + /* type field exists ? */ + if (hdrlen < 1) + break; + tp = skb_header_pointer(skb, ptr, sizeof(_opttype), + &_opttype); + if (tp == NULL) + break; + + /* Type check */ + if (*tp != (optinfo->opts[temp] & 0xFF00)>>8){ + DEBUGP("Tbad %02X %02X\n", + *tp, + (optinfo->opts[temp] & 0xFF00)>>8); + return 0; + } else { + DEBUGP("Tok "); + } + /* Length check */ + if (*tp) { + u16 spec_len; + + /* length field exists ? */ + if (hdrlen < 2) + break; + lp = skb_header_pointer(skb, ptr + 1, + sizeof(_optlen), + &_optlen); + if (lp == NULL) + break; + spec_len = optinfo->opts[temp] & 0x00FF; + + if (spec_len != 0x00FF && spec_len != *lp) { + DEBUGP("Lbad %02X %04X\n", *lp, + spec_len); + return 0; + } + DEBUGP("Lok "); + optlen = *lp + 2; + } else { + DEBUGP("Pad1\n"); + optlen = 1; + } + + /* Step to the next */ + DEBUGP("len%04X \n", optlen); + + if ((ptr > skb->len - optlen || hdrlen < optlen) && + (temp < optinfo->optsnr - 1)) { + DEBUGP("new pointer is too large! \n"); + break; + } + ptr += optlen; + hdrlen -= optlen; + } + if (temp == optinfo->optsnr) + return ret; + else return 0; + } + + return 0; +} + +/* Called when user tries to insert an entry of this type. 
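The strict branch above steps through the option area as a sequence of type/length/value entries, treating Pad1 as a single byte with no length field. A simplified user-space walk over the same layout, assuming an already-extracted option buffer rather than an skb:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Walk the option area of a Hop-by-Hop/Destination Options header:
 * Pad1 is a single zero byte, every other option is type, length, data. */
static void walk_options(const uint8_t *opt, size_t len)
{
    size_t off = 0;

    while (off < len) {
        uint8_t type = opt[off];

        if (type == 0) {              /* Pad1: one byte, no length field */
            printf("Pad1\n");
            off += 1;
            continue;
        }
        if (off + 2 > len)            /* truncated TLV */
            break;
        printf("opt type=%u len=%u\n", (unsigned)type, (unsigned)opt[off + 1]);
        off += 2 + opt[off + 1];      /* type + length + data */
    }
}

int main(void)
{
    /* A PadN covering 2 bytes, then a router-alert option (type 5, len 2). */
    const uint8_t opts[] = { 0x01, 0x00, 0x05, 0x02, 0x00, 0x00 };
    walk_options(opts, sizeof(opts));
    return 0;
}

Each requested option in the rule is encoded as (type << 8) | length, with a length byte of 0xFF meaning "any length", which is what the spec_len test above checks.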
*/ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_opts *optsinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_opts))) { + DEBUGP("ip6t_opts: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_opts))); + return 0; + } + if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { + DEBUGP("ip6t_opts: unknown flags %X\n", + optsinfo->invflags); + return 0; + } + + return 1; +} + +static struct ip6t_match opts_match = { +#if HOPBYHOP + .name = "hbh", +#else + .name = "dst", +#endif + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&opts_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&opts_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c new file mode 100644 index 000000000000..e39dd236fd8e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -0,0 +1,181 @@ +/* Kernel module to match ESP parameters. */ +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + + +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 ESP match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the spi is matched by the range, 0 otherwise */ +static inline int +spi_match(u_int32_t min, u_int32_t max, u_int32_t spi, int invert) +{ + int r=0; + DEBUGP("esp spi_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,spi,max); + r=(spi >= min && spi <= max) ^ invert; + DEBUGP(" result %s\n",r? "PASS\n" : "FAILED\n"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ip_esp_hdr _esp, *eh = NULL; + const struct ip6t_esp *espinfo = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + + /* Make sure this isn't an evil packet */ + /*DEBUGP("ipv6_esp entered \n");*/ + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + DEBUGP("ipv6_esp header iteration \n"); + + /* Is there enough space for the next ext header? 
*/ + if (len < sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) + break; + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + temp |= MASK_ESP; + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) + hdrlen = 8; + else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* set the flag */ + switch (nexthdr) { + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_esp match: unknown nextheader %u\n",nexthdr); + return 0; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) { + DEBUGP("ipv6_esp: new pointer too large! \n"); + break; + } + } + + /* ESP header not found */ + if (temp != MASK_ESP) + return 0; + + if (len < sizeof(struct ip_esp_hdr)) { + *hotdrop = 1; + return 0; + } + + eh = skb_header_pointer(skb, ptr, sizeof(_esp), &_esp); + BUG_ON(eh == NULL); + + DEBUGP("IPv6 ESP SPI %u %08X\n", ntohl(eh->spi), ntohl(eh->spi)); + + return (eh != NULL) + && spi_match(espinfo->spis[0], espinfo->spis[1], + ntohl(eh->spi), + !!(espinfo->invflags & IP6T_ESP_INV_SPI)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_esp *espinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_esp))) { + DEBUGP("ip6t_esp: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_esp))); + return 0; + } + if (espinfo->invflags & ~IP6T_ESP_INV_MASK) { + DEBUGP("ip6t_esp: unknown flags %X\n", + espinfo->invflags); + return 0; + } + return 1; +} + +static struct ip6t_match esp_match = { + .name = "esp", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&esp_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&esp_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_eui64.c b/net/ipv6/netfilter/ip6t_eui64.c new file mode 100644 index 000000000000..616c2cbcd54d --- /dev/null +++ b/net/ipv6/netfilter/ip6t_eui64.c @@ -0,0 +1,101 @@ +/* Kernel module to match EUI64 address parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include + +MODULE_DESCRIPTION("IPv6 EUI64 address checking match"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + + unsigned char eui64[8]; + int i=0; + + if ( !(skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data) + && offset != 0) { + *hotdrop = 1; + return 0; + } + + memset(eui64, 0, sizeof(eui64)); + + if (eth_hdr(skb)->h_proto == ntohs(ETH_P_IPV6)) { + if (skb->nh.ipv6h->version == 0x6) { + memcpy(eui64, eth_hdr(skb)->h_source, 3); + memcpy(eui64 + 5, eth_hdr(skb)->h_source + 3, 3); + eui64[3]=0xff; + eui64[4]=0xfe; + eui64[0] |= 0x02; + + i=0; + while ((skb->nh.ipv6h->saddr.s6_addr[8+i] == + eui64[i]) && (i<8)) i++; + + if ( i == 8 ) + return 1; + } + } + + return 0; +} + +static int +ip6t_eui64_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) | + (1 << NF_IP6_FORWARD))) { + printk("ip6t_eui64: only valid for PRE_ROUTING, LOCAL_IN or FORWARD.\n"); + return 0; + } + + if (matchsize != IP6T_ALIGN(sizeof(int))) + return 0; + + return 1; +} + +static struct ip6t_match eui64_match = { + .name = "eui64", + .match = &match, + .checkentry = &ip6t_eui64_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&eui64_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&eui64_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_frag.c b/net/ipv6/netfilter/ip6t_frag.c new file mode 100644 index 000000000000..4bfa30a9bc80 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_frag.c @@ -0,0 +1,229 @@ +/* Kernel module to match FRAG parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 FRAG match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline int +id_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert) +{ + int r=0; + DEBUGP("frag id_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,id,max); + r=(id >= min && id <= max) ^ invert; + DEBUGP(" result %s\n",r? 
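The eui64 match above rebuilds the interface identifier that stateless autoconfiguration derives from the Ethernet source address (split the MAC, insert ff:fe in the middle, set the universal/local bit) and compares it with the low 64 bits of the IPv6 source address. A user-space sketch of just the address construction, with a made-up example MAC:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Expand a 48-bit MAC into the EUI-64 interface identifier, using the same
 * byte layout the eui64 match builds above. */
static void mac_to_eui64(const uint8_t mac[6], uint8_t eui[8])
{
    memcpy(eui, mac, 3);          /* upper half of the MAC  */
    eui[3] = 0xff;
    eui[4] = 0xfe;
    memcpy(eui + 5, mac + 3, 3);  /* lower half of the MAC  */
    eui[0] |= 0x02;               /* universal/local bit, set as the match does */
}

int main(void)
{
    const uint8_t mac[6] = { 0x00, 0x0a, 0x95, 0x9d, 0x68, 0x16 };
    uint8_t eui[8];
    int i;

    mac_to_eui64(mac, eui);
    for (i = 0; i < 8; i++)
        printf("%02x%s", (unsigned)eui[i], i == 7 ? "\n" : ":");
    return 0;
}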
"PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct frag_hdr _frag, *fh = NULL; + const struct ip6t_frag *fraginfo = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_frag header iteration \n"); + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* FRAG -> evaluate */ + if (nexthdr == NEXTHDR_FRAGMENT) { + temp |= MASK_FRAGMENT; + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_frag match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_frag: new pointer too large! \n"); + break; + } + } + + /* FRAG header not found */ + if ( temp != MASK_FRAGMENT ) return 0; + + if (len < sizeof(struct frag_hdr)){ + *hotdrop = 1; + return 0; + } + + fh = skb_header_pointer(skb, ptr, sizeof(_frag), &_frag); + BUG_ON(fh == NULL); + + DEBUGP("INFO %04X ", fh->frag_off); + DEBUGP("OFFSET %04X ", ntohs(fh->frag_off) & ~0x7); + DEBUGP("RES %02X %04X", fh->reserved, ntohs(fh->frag_off) & 0x6); + DEBUGP("MF %04X ", fh->frag_off & htons(IP6_MF)); + DEBUGP("ID %u %08X\n", ntohl(fh->identification), + ntohl(fh->identification)); + + DEBUGP("IPv6 FRAG id %02X ", + (id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS)))); + DEBUGP("res %02X %02X%04X %02X ", + (fraginfo->flags & IP6T_FRAG_RES), fh->reserved, + ntohs(fh->frag_off) & 0x6, + !((fraginfo->flags & IP6T_FRAG_RES) + && (fh->reserved || (ntohs(fh->frag_off) & 0x06)))); + DEBUGP("first %02X %02X %02X ", + (fraginfo->flags & IP6T_FRAG_FST), + ntohs(fh->frag_off) & ~0x7, + !((fraginfo->flags & IP6T_FRAG_FST) + && (ntohs(fh->frag_off) & ~0x7))); + DEBUGP("mf %02X %02X %02X ", + (fraginfo->flags & IP6T_FRAG_MF), + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_MF) + && !((ntohs(fh->frag_off) & IP6_MF)))); + DEBUGP("last %02X %02X %02X\n", + (fraginfo->flags & IP6T_FRAG_NMF), + ntohs(fh->frag_off) & IP6_MF, + !((fraginfo->flags & IP6T_FRAG_NMF) + && (ntohs(fh->frag_off) & IP6_MF))); + + return (fh != NULL) + && + (id_match(fraginfo->ids[0], fraginfo->ids[1], + ntohl(fh->identification), + !!(fraginfo->invflags & IP6T_FRAG_INV_IDS))) + && + !((fraginfo->flags & IP6T_FRAG_RES) + && (fh->reserved || (ntohs(fh->frag_off) & 0x6))) + && + !((fraginfo->flags & IP6T_FRAG_FST) + && (ntohs(fh->frag_off) & ~0x7)) + && + !((fraginfo->flags & 
IP6T_FRAG_MF) + && !(ntohs(fh->frag_off) & IP6_MF)) + && + !((fraginfo->flags & IP6T_FRAG_NMF) + && (ntohs(fh->frag_off) & IP6_MF)); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_frag *fraginfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_frag))) { + DEBUGP("ip6t_frag: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_frag))); + return 0; + } + if (fraginfo->invflags & ~IP6T_FRAG_INV_MASK) { + DEBUGP("ip6t_frag: unknown flags %X\n", + fraginfo->invflags); + return 0; + } + + return 1; +} + +static struct ip6t_match frag_match = { + .name = "frag", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&frag_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&frag_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_hbh.c b/net/ipv6/netfilter/ip6t_hbh.c new file mode 100644 index 000000000000..27f3650d127e --- /dev/null +++ b/net/ipv6/netfilter/ip6t_hbh.c @@ -0,0 +1,298 @@ +/* Kernel module to match Hop-by-Hop and Destination parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define HOPBYHOP 1 + +MODULE_LICENSE("GPL"); +#if HOPBYHOP +MODULE_DESCRIPTION("IPv6 HbH match"); +#else +MODULE_DESCRIPTION("IPv6 DST match"); +#endif +MODULE_AUTHOR("Andras Kis-Szabo "); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* + * (Type & 0xC0) >> 6 + * 0 -> ignorable + * 1 -> must drop the packet + * 2 -> send ICMP PARM PROB regardless and drop packet + * 3 -> Send ICMP if not a multicast address and drop packet + * (Type & 0x20) >> 5 + * 0 -> invariant + * 1 -> can change the routing + * (Type & 0x1F) Type + * 0 -> Pad1 (only 1 byte!) + * 1 -> PadN LENGTH info (total length = length + 2) + * C0 | 2 -> JUMBO 4 x x x x ( xxxx > 64k ) + * 5 -> RTALERT 2 x x + */ + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ipv6_opt_hdr _optsh, *oh; + const struct ip6t_opts *optinfo = matchinfo; + unsigned int temp; + unsigned int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int ret = 0; + u8 _opttype, *tp = NULL; + u8 _optlen, *lp = NULL; + unsigned int optlen; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_opts header iteration \n"); + + /* Is there enough space for the next ext header? 
*/ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* OPTS -> evaluate */ +#if HOPBYHOP + if (nexthdr == NEXTHDR_HOP) { + temp |= MASK_HOPOPTS; +#else + if (nexthdr == NEXTHDR_DEST) { + temp |= MASK_DSTOPTS; +#endif + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_opts match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_opts: new pointer is too large! \n"); + break; + } + } + + /* OPTIONS header not found */ +#if HOPBYHOP + if ( temp != MASK_HOPOPTS ) return 0; +#else + if ( temp != MASK_DSTOPTS ) return 0; +#endif + + if (len < (int)sizeof(struct ipv6_opt_hdr)){ + *hotdrop = 1; + return 0; + } + + if (len < hdrlen){ + /* Packet smaller than it's length field */ + return 0; + } + + oh = skb_header_pointer(skb, ptr, sizeof(_optsh), &_optsh); + BUG_ON(oh == NULL); + + DEBUGP("IPv6 OPTS LEN %u %u ", hdrlen, oh->hdrlen); + + DEBUGP("len %02X %04X %02X ", + optinfo->hdrlen, hdrlen, + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN)))); + + ret = (oh != NULL) + && + (!(optinfo->flags & IP6T_OPTS_LEN) || + ((optinfo->hdrlen == hdrlen) ^ + !!(optinfo->invflags & IP6T_OPTS_INV_LEN))); + + ptr += 2; + hdrlen -= 2; + if ( !(optinfo->flags & IP6T_OPTS_OPTS) ){ + return ret; + } else if (optinfo->flags & IP6T_OPTS_NSTRICT) { + DEBUGP("Not strict - not implemented"); + } else { + DEBUGP("Strict "); + DEBUGP("#%d ",optinfo->optsnr); + for(temp=0; tempoptsnr; temp++){ + /* type field exists ? */ + if (hdrlen < 1) + break; + tp = skb_header_pointer(skb, ptr, sizeof(_opttype), + &_opttype); + if (tp == NULL) + break; + + /* Type check */ + if (*tp != (optinfo->opts[temp] & 0xFF00)>>8){ + DEBUGP("Tbad %02X %02X\n", + *tp, + (optinfo->opts[temp] & 0xFF00)>>8); + return 0; + } else { + DEBUGP("Tok "); + } + /* Length check */ + if (*tp) { + u16 spec_len; + + /* length field exists ? */ + if (hdrlen < 2) + break; + lp = skb_header_pointer(skb, ptr + 1, + sizeof(_optlen), + &_optlen); + if (lp == NULL) + break; + spec_len = optinfo->opts[temp] & 0x00FF; + + if (spec_len != 0x00FF && spec_len != *lp) { + DEBUGP("Lbad %02X %04X\n", *lp, + spec_len); + return 0; + } + DEBUGP("Lok "); + optlen = *lp + 2; + } else { + DEBUGP("Pad1\n"); + optlen = 1; + } + + /* Step to the next */ + DEBUGP("len%04X \n", optlen); + + if ((ptr > skb->len - optlen || hdrlen < optlen) && + (temp < optinfo->optsnr - 1)) { + DEBUGP("new pointer is too large! \n"); + break; + } + ptr += optlen; + hdrlen -= optlen; + } + if (temp == optinfo->optsnr) + return ret; + else return 0; + } + + return 0; +} + +/* Called when user tries to insert an entry of this type. 
*/ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_opts *optsinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_opts))) { + DEBUGP("ip6t_opts: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_opts))); + return 0; + } + if (optsinfo->invflags & ~IP6T_OPTS_INV_MASK) { + DEBUGP("ip6t_opts: unknown flags %X\n", + optsinfo->invflags); + return 0; + } + + return 1; +} + +static struct ip6t_match opts_match = { +#if HOPBYHOP + .name = "hbh", +#else + .name = "dst", +#endif + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&opts_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&opts_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6t_hl.c b/net/ipv6/netfilter/ip6t_hl.c new file mode 100644 index 000000000000..0beaff5471dd --- /dev/null +++ b/net/ipv6/netfilter/ip6t_hl.c @@ -0,0 +1,80 @@ +/* Hop Limit matching module */ + +/* (C) 2001-2002 Maciej Soltysiak + * Based on HW's ttl module + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +#include +#include + +MODULE_AUTHOR("Maciej Soltysiak "); +MODULE_DESCRIPTION("IP tables Hop Limit matching module"); +MODULE_LICENSE("GPL"); + +static int match(const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const void *matchinfo, + int offset, unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_hl_info *info = matchinfo; + const struct ipv6hdr *ip6h = skb->nh.ipv6h; + + switch (info->mode) { + case IP6T_HL_EQ: + return (ip6h->hop_limit == info->hop_limit); + break; + case IP6T_HL_NE: + return (!(ip6h->hop_limit == info->hop_limit)); + break; + case IP6T_HL_LT: + return (ip6h->hop_limit < info->hop_limit); + break; + case IP6T_HL_GT: + return (ip6h->hop_limit > info->hop_limit); + break; + default: + printk(KERN_WARNING "ip6t_hl: unknown mode %d\n", + info->mode); + return 0; + } + + return 0; +} + +static int checkentry(const char *tablename, const struct ip6t_ip6 *ip, + void *matchinfo, unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_hl_info))) + return 0; + + return 1; +} + +static struct ip6t_match hl_match = { + .name = "hl", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&hl_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&hl_match); + +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_ipv6header.c b/net/ipv6/netfilter/ip6t_ipv6header.c new file mode 100644 index 000000000000..32e67f05845b --- /dev/null +++ b/net/ipv6/netfilter/ip6t_ipv6header.c @@ -0,0 +1,167 @@ +/* ipv6header match - matches IPv6 packets based + on whether they contain certain headers */ + +/* Original idea: Brad Chapman + * Rewritten by: Andras Kis-Szabo */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
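ip6t_hl above selects one of four comparisons on the Hop Limit field. The decision table is compact enough to restate as a plain C helper; the HL_* names here are illustrative stand-ins for the IP6T_HL_* constants:

#include <stdint.h>
#include <stdio.h>

enum hl_mode { HL_EQ, HL_NE, HL_LT, HL_GT };

/* Compare a packet's hop limit against the rule's value according to the
 * configured mode, mirroring the switch in the hl match. */
static int hl_matches(enum hl_mode mode, uint8_t pkt_hl, uint8_t rule_hl)
{
    switch (mode) {
    case HL_EQ: return pkt_hl == rule_hl;
    case HL_NE: return pkt_hl != rule_hl;
    case HL_LT: return pkt_hl <  rule_hl;
    case HL_GT: return pkt_hl >  rule_hl;
    default:    return 0;
    }
}

int main(void)
{
    printf("%d %d\n", hl_matches(HL_LT, 1, 2), hl_matches(HL_GT, 64, 255));
    return 0;
}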
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 headers match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +static int +ipv6header_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_ipv6header_info *info = matchinfo; + unsigned int temp; + int len; + u8 nexthdr; + unsigned int ptr; + + /* Make sure this isn't an evil packet */ + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + int hdrlen; + + /* Is there enough space for the next ext header? */ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + temp |= MASK_NONE; + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + temp |= MASK_ESP; + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + temp |= MASK_HOPOPTS; + break; + case NEXTHDR_ROUTING: + temp |= MASK_ROUTING; + break; + case NEXTHDR_FRAGMENT: + temp |= MASK_FRAGMENT; + break; + case NEXTHDR_AUTH: + temp |= MASK_AH; + break; + case NEXTHDR_DEST: + temp |= MASK_DSTOPTS; + break; + default: + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if (ptr > skb->len) + break; + } + + if ( (nexthdr != NEXTHDR_NONE ) && (nexthdr != NEXTHDR_ESP) ) + temp |= MASK_PROTO; + + if (info->modeflag) + return !((temp ^ info->matchflags ^ info->invflags) + & info->matchflags); + else { + if (info->invflags) + return temp != info->matchflags; + else + return temp == info->matchflags; + } +} + +static int +ipv6header_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_ipv6header_info *info = matchinfo; + + /* Check for obvious errors */ + /* This match is valid in all hooks! 
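ipv6header_match() above accumulates one bit per extension header it finds and then compares that set against the rule in either soft or hard mode. The two tests can be tried out in isolation like this; the H_* bits are hypothetical values standing in for the MASK_* constants, only their role matters:

#include <stdio.h>

#define H_HOP   0x01
#define H_ROUTE 0x02
#define H_FRAG  0x04
#define H_AH    0x08
#define H_DST   0x10
#define H_NONE  0x20
#define H_ESP   0x40
#define H_PROTO 0x80  /* "some upper-layer protocol follows" */

/* Soft mode: every requested, non-inverted header must be present and
 * every requested, inverted header absent; extra headers are ignored.
 * This is the (present ^ match ^ inv) & match test above. */
static int soft_match(unsigned present, unsigned match, unsigned inv)
{
    return !((present ^ match ^ inv) & match);
}

/* Hard mode: the set of present headers must equal the requested set
 * exactly, or differ from it when any inversion is requested. */
static int hard_match(unsigned present, unsigned match, unsigned inv)
{
    return inv ? present != match : present == match;
}

int main(void)
{
    unsigned present = H_HOP | H_FRAG | H_PROTO;

    /* "has a fragment header": soft mode ignores the extra headers. */
    printf("soft: %d hard: %d\n",
           soft_match(present, H_FRAG, 0),
           hard_match(present, H_FRAG, 0));
    return 0;
}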
*/ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_ipv6header_info))) + return 0; + + /* invflags is 0 or 0xff in hard mode */ + if ((!info->modeflag) && info->invflags != 0x00 + && info->invflags != 0xFF) + return 0; + + return 1; +} + +static struct ip6t_match ip6t_ipv6header_match = { + .name = "ipv6header", + .match = &ipv6header_match, + .checkentry = &ipv6header_checkentry, + .destroy = NULL, + .me = THIS_MODULE, +}; + +static int __init ipv6header_init(void) +{ + return ip6t_register_match(&ip6t_ipv6header_match); +} + +static void __exit ipv6header_exit(void) +{ + ip6t_unregister_match(&ip6t_ipv6header_match); +} + +module_init(ipv6header_init); +module_exit(ipv6header_exit); + diff --git a/net/ipv6/netfilter/ip6t_length.c b/net/ipv6/netfilter/ip6t_length.c new file mode 100644 index 000000000000..e0537d3811d5 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_length.c @@ -0,0 +1,66 @@ +/* Length Match - IPv6 Port */ + +/* (C) 1999-2001 James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("IPv6 packet length match"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_length_info *info = matchinfo; + u_int16_t pktlen = ntohs(skb->nh.ipv6h->payload_len) + sizeof(struct ipv6hdr); + + return (pktlen >= info->min && pktlen <= info->max) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_length_info))) + return 0; + + return 1; +} + +static struct ip6t_match length_match = { + .name = "length", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&length_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&length_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_limit.c b/net/ipv6/netfilter/ip6t_limit.c new file mode 100644 index 000000000000..fb782f610be2 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_limit.c @@ -0,0 +1,147 @@ +/* Kernel module to control the rate + * + * 2 September 1999: Changed from the target RATE to the match + * `limit', removed logging. Did I mention that + * Alexey is a fucking genius? + * Rusty Russell (rusty@rustcorp.com.au). */ + +/* (C) 1999 Jérôme de Vivie + * (C) 1999 Hervé Eychenne + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herve Eychenne "); +MODULE_DESCRIPTION("rate limiting within ip6tables"); + +/* The algorithm used is the Simple Token Bucket Filter (TBF) + * see net/sched/sch_tbf.c in the linux source tree + */ + +static DEFINE_SPINLOCK(limit_lock); + +/* Rusty: This is my (non-mathematically-inclined) understanding of + this algorithm. 
The `average rate' in jiffies becomes your initial + amount of credit `credit' and the most credit you can ever have + `credit_cap'. The `peak rate' becomes the cost of passing the + test, `cost'. + + `prev' tracks the last packet hit: you gain one credit per jiffy. + If you get credit balance more than this, the extra credit is + discarded. Every time the match passes, you lose `cost' credits; + if you don't have that many, the test fails. + + See Alexey's formal explanation in net/sched/sch_tbf.c. + + To avoid underflow, we multiply by 128 (ie. you get 128 credits per + jiffy). Hence a cost of 2^32-1, means one pass per 32768 seconds + at 1024HZ (or one every 9 hours). A cost of 1 means 12800 passes + per second at 100HZ. */ + +#define CREDITS_PER_JIFFY 128 + +static int +ip6t_limit_match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ip6t_rateinfo *r = ((struct ip6t_rateinfo *)matchinfo)->master; + unsigned long now = jiffies; + + spin_lock_bh(&limit_lock); + r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY; + if (r->credit > r->credit_cap) + r->credit = r->credit_cap; + + if (r->credit >= r->cost) { + /* We're not limited. */ + r->credit -= r->cost; + spin_unlock_bh(&limit_lock); + return 1; + } + + spin_unlock_bh(&limit_lock); + return 0; +} + +/* Precision saver. */ +static u_int32_t +user2credits(u_int32_t user) +{ + /* If multiplying would overflow... */ + if (user > 0xFFFFFFFF / (HZ*CREDITS_PER_JIFFY)) + /* Divide first. */ + return (user / IP6T_LIMIT_SCALE) * HZ * CREDITS_PER_JIFFY; + + return (user * HZ * CREDITS_PER_JIFFY) / IP6T_LIMIT_SCALE; +} + +static int +ip6t_limit_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + struct ip6t_rateinfo *r = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_rateinfo))) + return 0; + + /* Check for overflow. */ + if (r->burst == 0 + || user2credits(r->avg * r->burst) < user2credits(r->avg)) { + printk("Call rusty: overflow in ip6t_limit: %u/%u\n", + r->avg, r->burst); + return 0; + } + + /* User avg in seconds * IP6T_LIMIT_SCALE: convert to jiffies * + 128. */ + r->prev = jiffies; + r->credit = user2credits(r->avg * r->burst); /* Credits full. */ + r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */ + r->cost = user2credits(r->avg); + + /* For SMP, we only want to use one set of counters. */ + r->master = r; + + return 1; +} + +static struct ip6t_match ip6t_limit_reg = { + .name = "limit", + .match = ip6t_limit_match, + .checkentry = ip6t_limit_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + if (ip6t_register_match(&ip6t_limit_reg)) + return -EINVAL; + return 0; +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&ip6t_limit_reg); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_mac.c b/net/ipv6/netfilter/ip6t_mac.c new file mode 100644 index 000000000000..526d43e37234 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_mac.c @@ -0,0 +1,80 @@ +/* Kernel module to match MAC address parameters. */ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
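Rusty's comment above describes the token bucket behind the limit match: credit accrues with time up to a cap, and every accepted packet spends a fixed cost. A user-space sketch of the same bookkeeping, with abstract ticks instead of jiffies and without the 128x scaling or the spinlock:

#include <stdint.h>
#include <stdio.h>

struct bucket {
    uint64_t credit;      /* current credit                */
    uint64_t credit_cap;  /* burst ceiling                 */
    uint64_t cost;        /* credit spent per accepted hit */
    uint64_t prev;        /* tick of the last refill       */
};

/* One credit per elapsed tick, capped at the burst; a packet passes only
 * if it can pay the full cost. */
static int bucket_allow(struct bucket *b, uint64_t now)
{
    b->credit += now - b->prev;
    b->prev = now;
    if (b->credit > b->credit_cap)
        b->credit = b->credit_cap;   /* discard credit beyond the burst */

    if (b->credit >= b->cost) {
        b->credit -= b->cost;        /* pay for this packet */
        return 1;                    /* not limited         */
    }
    return 0;                        /* rate exceeded       */
}

int main(void)
{
    /* Allow roughly one packet per 10 ticks, with a burst of 3. */
    struct bucket b = { .credit = 30, .credit_cap = 30, .cost = 10, .prev = 0 };
    uint64_t t;

    for (t = 0; t < 25; t++)
        printf("t=%2llu %s\n", (unsigned long long)t,
               bucket_allow(&b, t) ? "pass" : "drop");
    return 0;
}

The 128 credits per jiffy in the kernel version exist purely to keep the integer arithmetic from underflowing, as the comment notes.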
+ */ + +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MAC address matching module for IPv6"); +MODULE_AUTHOR("Netfilter Core Teaam "); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_mac_info *info = matchinfo; + + /* Is mac pointer valid? */ + return (skb->mac.raw >= skb->head + && (skb->mac.raw + ETH_HLEN) <= skb->data + /* If so, compare... */ + && ((memcmp(eth_hdr(skb)->h_source, info->srcaddr, ETH_ALEN) + == 0) ^ info->invert)); +} + +static int +ip6t_mac_checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_IN) + | (1 << NF_IP6_FORWARD))) { + printk("ip6t_mac: only valid for PRE_ROUTING, LOCAL_IN or" + " FORWARD\n"); + return 0; + } + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mac_info))) + return 0; + + return 1; +} + +static struct ip6t_match mac_match = { + .name = "mac", + .match = &match, + .checkentry = &ip6t_mac_checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&mac_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&mac_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_mark.c b/net/ipv6/netfilter/ip6t_mark.c new file mode 100644 index 000000000000..affc3de364fc --- /dev/null +++ b/net/ipv6/netfilter/ip6t_mark.c @@ -0,0 +1,66 @@ +/* Kernel module to match NFMARK values. */ + +/* (C) 1999-2001 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("ip6tables mark match"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_mark_info *info = matchinfo; + + return ((skb->nfmark & info->mask) == info->mark) ^ info->invert; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_mark_info))) + return 0; + + return 1; +} + +static struct ip6t_match mark_match = { + .name = "mark", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&mark_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&mark_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_multiport.c b/net/ipv6/netfilter/ip6t_multiport.c new file mode 100644 index 000000000000..6e3246153fa3 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_multiport.c @@ -0,0 +1,125 @@ +/* Kernel module to match one of a list of TCP/UDP ports: ports are in + the same place so we can treat them as equal. 
*/ + +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("ip6tables match for multiple ports"); + +#if 0 +#define duprintf(format, args...) printk(format , ## args) +#else +#define duprintf(format, args...) +#endif + +/* Returns 1 if the port is matched by the test, 0 otherwise. */ +static inline int +ports_match(const u_int16_t *portlist, enum ip6t_multiport_flags flags, + u_int8_t count, u_int16_t src, u_int16_t dst) +{ + unsigned int i; + for (i=0; iports, + multiinfo->flags, multiinfo->count, + ntohs(pptr[0]), ntohs(pptr[1])); +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_multiport *multiinfo = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_multiport))) + return 0; + + /* Must specify proto == TCP/UDP, no unknown flags or bad count */ + return (ip->proto == IPPROTO_TCP || ip->proto == IPPROTO_UDP) + && !(ip->invflags & IP6T_INV_PROTO) + && matchsize == IP6T_ALIGN(sizeof(struct ip6t_multiport)) + && (multiinfo->flags == IP6T_MULTIPORT_SOURCE + || multiinfo->flags == IP6T_MULTIPORT_DESTINATION + || multiinfo->flags == IP6T_MULTIPORT_EITHER) + && multiinfo->count <= IP6T_MULTI_PORTS; +} + +static struct ip6t_match multiport_match = { + .name = "multiport", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&multiport_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&multiport_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_owner.c b/net/ipv6/netfilter/ip6t_owner.c new file mode 100644 index 000000000000..ab0e32d3de46 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_owner.c @@ -0,0 +1,174 @@ +/* Kernel module to match various things tied to sockets associated with + locally generated outgoing packets. */ + +/* (C) 2000-2001 Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
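The multiport match above accepts a packet when its source port, destination port, or either one (depending on the rule's flag) appears in a short list of ports. A user-space sketch of that behaviour, with invented MP_* names standing in for the IP6T_MULTIPORT_* flags:

#include <stdint.h>
#include <stdio.h>

enum mp_flags { MP_SOURCE, MP_DESTINATION, MP_EITHER };

/* Return 1 if the packet's source or destination port, as selected by
 * `flags', appears in the rule's port list. */
static int port_list_match(const uint16_t *list, unsigned count,
                           enum mp_flags flags, uint16_t src, uint16_t dst)
{
    unsigned i;

    for (i = 0; i < count; i++) {
        if (flags != MP_DESTINATION && list[i] == src)
            return 1;
        if (flags != MP_SOURCE && list[i] == dst)
            return 1;
    }
    return 0;
}

int main(void)
{
    const uint16_t ports[] = { 22, 80, 443 };

    printf("%d\n", port_list_match(ports, 3, MP_DESTINATION, 1025, 80)); /* 1 */
    printf("%d\n", port_list_match(ports, 3, MP_SOURCE, 1025, 80));      /* 0 */
    return 0;
}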
+ */ + +#include +#include +#include +#include + +#include +#include + +MODULE_AUTHOR("Marc Boucher "); +MODULE_DESCRIPTION("IP6 tables owner matching module"); +MODULE_LICENSE("GPL"); + +static int +match_pid(const struct sk_buff *skb, pid_t pid) +{ + struct task_struct *p; + struct files_struct *files; + int i; + + read_lock(&tasklist_lock); + p = find_task_by_pid(pid); + if (!p) + goto out; + task_lock(p); + files = p->files; + if(files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == skb->sk->sk_socket->file) { + spin_unlock(&files->file_lock); + task_unlock(p); + read_unlock(&tasklist_lock); + return 1; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); +out: + read_unlock(&tasklist_lock); + return 0; +} + +static int +match_sid(const struct sk_buff *skb, pid_t sid) +{ + struct task_struct *g, *p; + struct file *file = skb->sk->sk_socket->file; + int i, found=0; + + read_lock(&tasklist_lock); + do_each_thread(g, p) { + struct files_struct *files; + if (p->signal->session != sid) + continue; + + task_lock(p); + files = p->files; + if (files) { + spin_lock(&files->file_lock); + for (i=0; i < files->max_fds; i++) { + if (fcheck_files(files, i) == file) { + found = 1; + break; + } + } + spin_unlock(&files->file_lock); + } + task_unlock(p); + if (found) + goto out; + } while_each_thread(g, p); +out: + read_unlock(&tasklist_lock); + + return found; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + const struct ip6t_owner_info *info = matchinfo; + + if (!skb->sk || !skb->sk->sk_socket || !skb->sk->sk_socket->file) + return 0; + + if(info->match & IP6T_OWNER_UID) { + if((skb->sk->sk_socket->file->f_uid != info->uid) ^ + !!(info->invert & IP6T_OWNER_UID)) + return 0; + } + + if(info->match & IP6T_OWNER_GID) { + if((skb->sk->sk_socket->file->f_gid != info->gid) ^ + !!(info->invert & IP6T_OWNER_GID)) + return 0; + } + + if(info->match & IP6T_OWNER_PID) { + if (!match_pid(skb, info->pid) ^ + !!(info->invert & IP6T_OWNER_PID)) + return 0; + } + + if(info->match & IP6T_OWNER_SID) { + if (!match_sid(skb, info->sid) ^ + !!(info->invert & IP6T_OWNER_SID)) + return 0; + } + + return 1; +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (hook_mask + & ~((1 << NF_IP6_LOCAL_OUT) | (1 << NF_IP6_POST_ROUTING))) { + printk("ip6t_owner: only valid for LOCAL_OUT or POST_ROUTING.\n"); + return 0; + } + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_owner_info))) + return 0; +#ifdef CONFIG_SMP + /* files->file_lock can not be used in a BH */ + if (((struct ip6t_owner_info *)matchinfo)->match + & (IP6T_OWNER_PID|IP6T_OWNER_SID)) { + printk("ip6t_owner: pid and sid matching is broken on SMP.\n"); + return 0; + } +#endif + return 1; +} + +static struct ip6t_match owner_match = { + .name = "owner", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&owner_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&owner_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_physdev.c b/net/ipv6/netfilter/ip6t_physdev.c new file mode 100644 index 000000000000..71515c86ece1 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_physdev.c @@ -0,0 +1,135 @@ +/* Kernel module to 
match the bridge port in and + * out device for IP packets coming into contact with a bridge. */ + +/* (C) 2001-2003 Bart De Schuymer + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#define MATCH 1 +#define NOMATCH 0 + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Bart De Schuymer "); +MODULE_DESCRIPTION("iptables bridge physical device match module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + int i; + static const char nulldevname[IFNAMSIZ]; + const struct ip6t_physdev_info *info = matchinfo; + unsigned int ret; + const char *indev, *outdev; + struct nf_bridge_info *nf_bridge; + + /* Not a bridged IP packet or no info available yet: + * LOCAL_OUT/mangle and LOCAL_OUT/nat don't know if + * the destination device will be a bridge. */ + if (!(nf_bridge = skb->nf_bridge)) { + /* Return MATCH if the invert flags of the used options are on */ + if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) && + !(info->invert & IP6T_PHYSDEV_OP_BRIDGED)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN) && + !(info->invert & IP6T_PHYSDEV_OP_ISIN)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_ISOUT) && + !(info->invert & IP6T_PHYSDEV_OP_ISOUT)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_IN) && + !(info->invert & IP6T_PHYSDEV_OP_IN)) + return NOMATCH; + if ((info->bitmask & IP6T_PHYSDEV_OP_OUT) && + !(info->invert & IP6T_PHYSDEV_OP_OUT)) + return NOMATCH; + return MATCH; + } + + /* This only makes sense in the FORWARD and POSTROUTING chains */ + if ((info->bitmask & IP6T_PHYSDEV_OP_BRIDGED) && + (!!(nf_bridge->mask & BRNF_BRIDGED) ^ + !(info->invert & IP6T_PHYSDEV_OP_BRIDGED))) + return NOMATCH; + + if ((info->bitmask & IP6T_PHYSDEV_OP_ISIN && + (!nf_bridge->physindev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISIN))) || + (info->bitmask & IP6T_PHYSDEV_OP_ISOUT && + (!nf_bridge->physoutdev ^ !!(info->invert & IP6T_PHYSDEV_OP_ISOUT)))) + return NOMATCH; + + if (!(info->bitmask & IP6T_PHYSDEV_OP_IN)) + goto match_outdev; + indev = nf_bridge->physindev ? nf_bridge->physindev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)indev)[i] + ^ ((const unsigned int *)info->physindev)[i]) + & ((const unsigned int *)info->in_mask)[i]; + } + + if ((ret == 0) ^ !(info->invert & IP6T_PHYSDEV_OP_IN)) + return NOMATCH; + +match_outdev: + if (!(info->bitmask & IP6T_PHYSDEV_OP_OUT)) + return MATCH; + outdev = nf_bridge->physoutdev ? 
+ nf_bridge->physoutdev->name : nulldevname; + for (i = 0, ret = 0; i < IFNAMSIZ/sizeof(unsigned int); i++) { + ret |= (((const unsigned int *)outdev)[i] + ^ ((const unsigned int *)info->physoutdev)[i]) + & ((const unsigned int *)info->out_mask)[i]; + } + + return (ret != 0) ^ !(info->invert & IP6T_PHYSDEV_OP_OUT); +} + +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + const struct ip6t_physdev_info *info = matchinfo; + + if (matchsize != IP6T_ALIGN(sizeof(struct ip6t_physdev_info))) + return 0; + if (!(info->bitmask & IP6T_PHYSDEV_OP_MASK) || + info->bitmask & ~IP6T_PHYSDEV_OP_MASK) + return 0; + return 1; +} + +static struct ip6t_match physdev_match = { + .name = "physdev", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&physdev_match); +} + +static void __exit fini(void) +{ + ip6t_unregister_match(&physdev_match); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6t_rt.c b/net/ipv6/netfilter/ip6t_rt.c new file mode 100644 index 000000000000..a9526b773d28 --- /dev/null +++ b/net/ipv6/netfilter/ip6t_rt.c @@ -0,0 +1,301 @@ +/* Kernel module to match ROUTING parameters. */ + +/* (C) 2001-2002 Andras Kis-Szabo + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("IPv6 RT match"); +MODULE_AUTHOR("Andras Kis-Szabo "); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +/* Returns 1 if the id is matched by the range, 0 otherwise */ +static inline int +segsleft_match(u_int32_t min, u_int32_t max, u_int32_t id, int invert) +{ + int r=0; + DEBUGP("rt segsleft_match:%c 0x%x <= 0x%x <= 0x%x",invert? '!':' ', + min,id,max); + r=(id >= min && id <= max) ^ invert; + DEBUGP(" result %s\n",r? "PASS" : "FAILED"); + return r; +} + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + unsigned int protoff, + int *hotdrop) +{ + struct ipv6_rt_hdr _route, *rh = NULL; + const struct ip6t_rt *rtinfo = matchinfo; + unsigned int temp; + unsigned int len; + u8 nexthdr; + unsigned int ptr; + unsigned int hdrlen = 0; + unsigned int ret = 0; + struct in6_addr *ap, _addr; + + /* type of the 1st exthdr */ + nexthdr = skb->nh.ipv6h->nexthdr; + /* pointer to the 1st exthdr */ + ptr = sizeof(struct ipv6hdr); + /* available length */ + len = skb->len - ptr; + temp = 0; + + while (ip6t_ext_hdr(nexthdr)) { + struct ipv6_opt_hdr _hdr, *hp; + + DEBUGP("ipv6_rt header iteration \n"); + + /* Is there enough space for the next ext header? 
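The loops above compare bridge port names against the rule one unsigned int at a time, masking out the bytes the user left as a wildcard. The same idea at byte granularity, as a stand-alone sketch (IFNAMSIZ is redefined locally; the kernel walks machine words for speed, and the exact mask contents are built by userspace, so the example mask here is an assumption about a prefix wildcard):

#include <stdio.h>
#include <string.h>

#define IFNAMSIZ 16

/* Compare a device name against a pattern under a per-byte mask: bytes
 * where the mask is zero are "don't care". */
static int masked_name_match(const char name[IFNAMSIZ],
                             const char pattern[IFNAMSIZ],
                             const unsigned char mask[IFNAMSIZ])
{
    unsigned diff = 0;
    int i;

    for (i = 0; i < IFNAMSIZ; i++)
        diff |= (unsigned char)(name[i] ^ pattern[i]) & mask[i];
    return diff == 0;
}

int main(void)
{
    char name[IFNAMSIZ]          = "eth3";
    char pattern[IFNAMSIZ]       = "eth";                 /* e.g. "eth+"      */
    unsigned char mask[IFNAMSIZ] = { 0xff, 0xff, 0xff };  /* rest 0: wildcard */

    printf("%d\n", masked_name_match(name, pattern, mask)); /* 1 */
    return 0;
}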
*/ + if (len < (int)sizeof(struct ipv6_opt_hdr)) + return 0; + /* No more exthdr -> evaluate */ + if (nexthdr == NEXTHDR_NONE) { + break; + } + /* ESP -> evaluate */ + if (nexthdr == NEXTHDR_ESP) { + break; + } + + hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); + BUG_ON(hp == NULL); + + /* Calculate the header length */ + if (nexthdr == NEXTHDR_FRAGMENT) { + hdrlen = 8; + } else if (nexthdr == NEXTHDR_AUTH) + hdrlen = (hp->hdrlen+2)<<2; + else + hdrlen = ipv6_optlen(hp); + + /* ROUTING -> evaluate */ + if (nexthdr == NEXTHDR_ROUTING) { + temp |= MASK_ROUTING; + break; + } + + + /* set the flag */ + switch (nexthdr){ + case NEXTHDR_HOP: + case NEXTHDR_ROUTING: + case NEXTHDR_FRAGMENT: + case NEXTHDR_AUTH: + case NEXTHDR_DEST: + break; + default: + DEBUGP("ipv6_rt match: unknown nextheader %u\n",nexthdr); + return 0; + break; + } + + nexthdr = hp->nexthdr; + len -= hdrlen; + ptr += hdrlen; + if ( ptr > skb->len ) { + DEBUGP("ipv6_rt: new pointer is too large! \n"); + break; + } + } + + /* ROUTING header not found */ + if ( temp != MASK_ROUTING ) return 0; + + if (len < (int)sizeof(struct ipv6_rt_hdr)){ + *hotdrop = 1; + return 0; + } + + if (len < hdrlen){ + /* Pcket smaller than its length field */ + return 0; + } + + rh = skb_header_pointer(skb, ptr, sizeof(_route), &_route); + BUG_ON(rh == NULL); + + DEBUGP("IPv6 RT LEN %u %u ", hdrlen, rh->hdrlen); + DEBUGP("TYPE %04X ", rh->type); + DEBUGP("SGS_LEFT %u %02X\n", rh->segments_left, rh->segments_left); + + DEBUGP("IPv6 RT segsleft %02X ", + (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS)))); + DEBUGP("type %02X %02X %02X ", + rtinfo->rt_type, rh->type, + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP)))); + DEBUGP("len %02X %04X %02X ", + rtinfo->hdrlen, hdrlen, + (!(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN)))); + DEBUGP("res %02X %02X %02X ", + (rtinfo->flags & IP6T_RT_RES), ((struct rt0_hdr *)rh)->bitmap, + !((rtinfo->flags & IP6T_RT_RES) && (((struct rt0_hdr *)rh)->bitmap))); + + ret = (rh != NULL) + && + (segsleft_match(rtinfo->segsleft[0], rtinfo->segsleft[1], + rh->segments_left, + !!(rtinfo->invflags & IP6T_RT_INV_SGS))) + && + (!(rtinfo->flags & IP6T_RT_LEN) || + ((rtinfo->hdrlen == hdrlen) ^ + !!(rtinfo->invflags & IP6T_RT_INV_LEN))) + && + (!(rtinfo->flags & IP6T_RT_TYP) || + ((rtinfo->rt_type == rh->type) ^ + !!(rtinfo->invflags & IP6T_RT_INV_TYP))); + + if (ret && (rtinfo->flags & IP6T_RT_RES)) { + u_int32_t *bp, _bitmap; + bp = skb_header_pointer(skb, + ptr + offsetof(struct rt0_hdr, bitmap), + sizeof(_bitmap), &_bitmap); + + ret = (*bp == 0); + } + + DEBUGP("#%d ",rtinfo->addrnr); + if ( !(rtinfo->flags & IP6T_RT_FST) ){ + return ret; + } else if (rtinfo->flags & IP6T_RT_FST_NSTRICT) { + DEBUGP("Not strict "); + if ( rtinfo->addrnr > (unsigned int)((hdrlen-8)/16) ){ + DEBUGP("There isn't enough space\n"); + return 0; + } else { + unsigned int i = 0; + + DEBUGP("#%d ",rtinfo->addrnr); + for(temp=0; temp<(unsigned int)((hdrlen-8)/16); temp++){ + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + + BUG_ON(ap == NULL); + + if (ipv6_addr_equal(ap, &rtinfo->addrs[i])) { + DEBUGP("i=%d temp=%d;\n",i,temp); + i++; + } + if (i==rtinfo->addrnr) break; + } + DEBUGP("i=%d #%d\n", i, rtinfo->addrnr); + if (i == rtinfo->addrnr) + return ret; + else return 0; + } + } else 
{ + DEBUGP("Strict "); + if ( rtinfo->addrnr > (unsigned int)((hdrlen-8)/16) ){ + DEBUGP("There isn't enough space\n"); + return 0; + } else { + DEBUGP("#%d ",rtinfo->addrnr); + for(temp=0; tempaddrnr; temp++){ + ap = skb_header_pointer(skb, + ptr + + sizeof(struct rt0_hdr) + + temp * sizeof(_addr), + sizeof(_addr), + &_addr); + BUG_ON(ap == NULL); + + if (!ipv6_addr_equal(ap, &rtinfo->addrs[temp])) + break; + } + DEBUGP("temp=%d #%d\n", temp, rtinfo->addrnr); + if ((temp == rtinfo->addrnr) && (temp == (unsigned int)((hdrlen-8)/16))) + return ret; + else return 0; + } + } + + return 0; +} + +/* Called when user tries to insert an entry of this type. */ +static int +checkentry(const char *tablename, + const struct ip6t_ip6 *ip, + void *matchinfo, + unsigned int matchinfosize, + unsigned int hook_mask) +{ + const struct ip6t_rt *rtinfo = matchinfo; + + if (matchinfosize != IP6T_ALIGN(sizeof(struct ip6t_rt))) { + DEBUGP("ip6t_rt: matchsize %u != %u\n", + matchinfosize, IP6T_ALIGN(sizeof(struct ip6t_rt))); + return 0; + } + if (rtinfo->invflags & ~IP6T_RT_INV_MASK) { + DEBUGP("ip6t_rt: unknown flags %X\n", + rtinfo->invflags); + return 0; + } + if ( (rtinfo->flags & (IP6T_RT_RES|IP6T_RT_FST_MASK)) && + (!(rtinfo->flags & IP6T_RT_TYP) || + (rtinfo->rt_type != 0) || + (rtinfo->invflags & IP6T_RT_INV_TYP)) ) { + DEBUGP("`--rt-type 0' required before `--rt-0-*'"); + return 0; + } + + return 1; +} + +static struct ip6t_match rt_match = { + .name = "rt", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ip6t_register_match(&rt_match); +} + +static void __exit cleanup(void) +{ + ip6t_unregister_match(&rt_match); +} + +module_init(init); +module_exit(cleanup); diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c new file mode 100644 index 000000000000..4c0028671c20 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_filter.c @@ -0,0 +1,214 @@ +/* + * This is the 1999 rewrite of IP Firewalling, aiming for kernel 2.3.x. + * + * Copyright (C) 1999 Paul `Rusty' Russell & Michael J. Neuling + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("ip6tables filter table"); + +#define FILTER_VALID_HOOKS ((1 << NF_IP6_LOCAL_IN) | (1 << NF_IP6_FORWARD) | (1 << NF_IP6_LOCAL_OUT)) + +/* Standard entry. 
*/ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[3]; + struct ip6t_error term; +} initial_table __initdata += { { "filter", FILTER_VALID_HOOKS, 4, + sizeof(struct ip6t_standard) * 3 + sizeof(struct ip6t_error), + { [NF_IP6_LOCAL_IN] = 0, + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard), + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 }, + { [NF_IP6_LOCAL_IN] = 0, + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard), + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 2 }, + 0, NULL, { } }, + { + /* LOCAL_IN */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_error), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ip6t_table packet_filter = { + .name = "filter", + .valid_hooks = FILTER_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. */ +static unsigned int +ip6t_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static unsigned int +ip6t_local_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ +#if 0 + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + return ip6t_do_table(pskb, hook, in, out, &packet_filter, NULL); +} + +static struct nf_hook_ops ip6t_ops[] = { + { + .hook = ip6t_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_FILTER, + }, + { + .hook = ip6t_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_FORWARD, + .priority = NF_IP6_PRI_FILTER, + }, + { + .hook = ip6t_local_out_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_FILTER, + }, +}; + +/* Default to forward because I got too much mail already. 
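   The "forward" module parameter below becomes the policy of the FORWARD
   chain.  Standard targets store verdicts as negative numbers so that
   non-negative values stay available as jump offsets, which is why the
   table initializer uses -NF_ACCEPT - 1 and init() writes -forward - 1.
   A minimal sketch of the convention (encode_verdict is an illustrative
   name only):

	static inline int encode_verdict(unsigned int verdict)
	{
		return -(int)verdict - 1;  // NF_DROP (0) -> -1, NF_ACCEPT (1) -> -2
	}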
*/ +static int forward = NF_ACCEPT; +module_param(forward, bool, 0000); + +static int __init init(void) +{ + int ret; + + if (forward < 0 || forward > NF_MAX_VERDICT) { + printk("iptables forward must be 0 or 1\n"); + return -EINVAL; + } + + /* Entry 1 is the FORWARD hook */ + initial_table.entries[1].target.verdict = -forward - 1; + + /* Register table */ + ret = ip6t_register_table(&packet_filter, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + return ret; + + cleanup_hook1: + nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_filter); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_filter); +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c new file mode 100644 index 000000000000..85c1e6eada19 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_mangle.c @@ -0,0 +1,287 @@ +/* + * IPv6 packet mangling table, a port of the IPv4 mangle table to IPv6 + * + * Copyright (C) 2000-2001 by Harald Welte + * Copyright (C) 2000-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Extended to all five netfilter hooks by Brad Chapman & Harald Welte + */ +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Netfilter Core Team "); +MODULE_DESCRIPTION("ip6tables mangle table"); + +#define MANGLE_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | \ + (1 << NF_IP6_LOCAL_IN) | \ + (1 << NF_IP6_FORWARD) | \ + (1 << NF_IP6_LOCAL_OUT) | \ + (1 << NF_IP6_POST_ROUTING)) + +#if 0 +#define DEBUGP(x, args...) printk(KERN_DEBUG x, ## args) +#else +#define DEBUGP(x, args...) +#endif + +/* Standard entry. 
*/ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[5]; + struct ip6t_error term; +} initial_table __initdata += { { "mangle", MANGLE_VALID_HOOKS, 6, + sizeof(struct ip6t_standard) * 5 + sizeof(struct ip6t_error), + { [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard), + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, + [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4}, + { [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_IN] = sizeof(struct ip6t_standard), + [NF_IP6_FORWARD] = sizeof(struct ip6t_standard) * 2, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) * 3, + [NF_IP6_POST_ROUTING] = sizeof(struct ip6t_standard) * 4}, + 0, NULL, { } }, + { + /* PRE_ROUTING */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_IN */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* FORWARD */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* LOCAL_OUT */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } }, + /* POST_ROUTING */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_standard), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_standard_target)), "" } }, { } }, + -NF_ACCEPT - 1 } } + }, + /* ERROR */ + { { { { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, { { { 0 } } }, "", "", { 0 }, { 0 }, 0, 0, 0 }, + 0, + sizeof(struct ip6t_entry), + sizeof(struct ip6t_error), + 0, { 0, 0 }, { } }, + { { { { IP6T_ALIGN(sizeof(struct ip6t_error_target)), IP6T_ERROR_TARGET } }, + { } }, + "ERROR" + } + } +}; + +static struct ip6t_table packet_mangler = { + .name = "mangle", + .valid_hooks = MANGLE_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE, +}; + +/* The work comes in here from netfilter.c. 
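   ip6t_route_hook() below simply runs the packet through the mangle table;
   ip6t_local_hook() additionally snapshots the fields that can influence
   routing (addresses, nfmark, hop limit) before the table runs and compares
   them afterwards to detect when a rule changed something that would
   require re-routing (here it only reports this via DEBUGP).  A minimal
   sketch of that compare, using an illustrative struct not present in this
   file:

	struct route_key {
		struct in6_addr saddr, daddr;
		unsigned long   nfmark;
		u_int8_t        hop_limit;
	};

	static int route_key_changed(const struct route_key *old,
				     const struct sk_buff *skb)
	{
		return memcmp(&old->saddr, &skb->nh.ipv6h->saddr, sizeof(old->saddr)) ||
		       memcmp(&old->daddr, &skb->nh.ipv6h->daddr, sizeof(old->daddr)) ||
		       old->nfmark != skb->nfmark ||
		       old->hop_limit != skb->nh.ipv6h->hop_limit;
	}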
*/ +static unsigned int +ip6t_route_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); +} + +static unsigned int +ip6t_local_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + + unsigned long nfmark; + unsigned int ret; + struct in6_addr saddr, daddr; + u_int8_t hop_limit; + u_int32_t flowlabel; + +#if 0 + /* root is playing with raw sockets. */ + if ((*pskb)->len < sizeof(struct iphdr) + || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr)) { + if (net_ratelimit()) + printk("ip6t_hook: happy cracking.\n"); + return NF_ACCEPT; + } +#endif + + /* save source/dest address, nfmark, hoplimit, flowlabel, priority, */ + memcpy(&saddr, &(*pskb)->nh.ipv6h->saddr, sizeof(saddr)); + memcpy(&daddr, &(*pskb)->nh.ipv6h->daddr, sizeof(daddr)); + nfmark = (*pskb)->nfmark; + hop_limit = (*pskb)->nh.ipv6h->hop_limit; + + /* flowlabel and prio (includes version, which shouldn't change either */ + flowlabel = *((u_int32_t *) (*pskb)->nh.ipv6h); + + ret = ip6t_do_table(pskb, hook, in, out, &packet_mangler, NULL); + + if (ret != NF_DROP && ret != NF_STOLEN + && (memcmp(&(*pskb)->nh.ipv6h->saddr, &saddr, sizeof(saddr)) + || memcmp(&(*pskb)->nh.ipv6h->daddr, &daddr, sizeof(daddr)) + || (*pskb)->nfmark != nfmark + || (*pskb)->nh.ipv6h->hop_limit != hop_limit)) { + + /* something which could affect routing has changed */ + + DEBUGP("ip6table_mangle: we'd need to re-route a packet\n"); + } + + return ret; +} + +static struct nf_hook_ops ip6t_ops[] = { + { + .hook = ip6t_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_IN, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_FORWARD, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_local_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_MANGLE, + }, + { + .hook = ip6t_route_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_POST_ROUTING, + .priority = NF_IP6_PRI_MANGLE, + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ip6t_register_table(&packet_mangler, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + ret = nf_register_hook(&ip6t_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + ret = nf_register_hook(&ip6t_ops[3]); + if (ret < 0) + goto cleanup_hook2; + + ret = nf_register_hook(&ip6t_ops[4]); + if (ret < 0) + goto cleanup_hook3; + + return ret; + + cleanup_hook3: + nf_unregister_hook(&ip6t_ops[3]); + cleanup_hook2: + nf_unregister_hook(&ip6t_ops[2]); + cleanup_hook1: + nf_unregister_hook(&ip6t_ops[1]); + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_mangler); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_mangler); +} + +module_init(init); 
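/*
 * init() above registers the table first and then each hook in order,
 * unwinding through the cleanup_hook* labels when a later registration
 * fails; fini() tears everything down unconditionally.  The same unwind
 * can be written as a loop over the ops array -- a minimal sketch only,
 * the helper name is illustrative and not used by this patch:
 */
static int register_hooks(struct nf_hook_ops *ops, unsigned int n)
{
	unsigned int i;
	int ret = 0;

	for (i = 0; i < n; i++) {
		ret = nf_register_hook(&ops[i]);
		if (ret < 0)
			break;
	}
	if (ret < 0)
		while (i-- > 0)		/* undo the ones that succeeded */
			nf_unregister_hook(&ops[i]);
	return ret;
}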
+module_exit(fini); diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c new file mode 100644 index 000000000000..71407beaf790 --- /dev/null +++ b/net/ipv6/netfilter/ip6table_raw.c @@ -0,0 +1,182 @@ +/* + * IPv6 raw table, a port of the IPv4 raw table to IPv6 + * + * Copyright (C) 2003 Jozsef Kadlecsik + */ +#include +#include + +#define RAW_VALID_HOOKS ((1 << NF_IP6_PRE_ROUTING) | (1 << NF_IP6_LOCAL_OUT)) + +#if 0 +#define DEBUGP(x, args...) printk(KERN_DEBUG x, ## args) +#else +#define DEBUGP(x, args...) +#endif + +/* Standard entry. */ +struct ip6t_standard +{ + struct ip6t_entry entry; + struct ip6t_standard_target target; +}; + +struct ip6t_error_target +{ + struct ip6t_entry_target target; + char errorname[IP6T_FUNCTION_MAXNAMELEN]; +}; + +struct ip6t_error +{ + struct ip6t_entry entry; + struct ip6t_error_target target; +}; + +static struct +{ + struct ip6t_replace repl; + struct ip6t_standard entries[2]; + struct ip6t_error term; +} initial_table __initdata = { + .repl = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .num_entries = 3, + .size = sizeof(struct ip6t_standard) * 2 + sizeof(struct ip6t_error), + .hook_entry = { + [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) + }, + .underflow = { + [NF_IP6_PRE_ROUTING] = 0, + [NF_IP6_LOCAL_OUT] = sizeof(struct ip6t_standard) + }, + }, + .entries = { + /* PRE_ROUTING */ + { + .entry = { + .target_offset = sizeof(struct ip6t_entry), + .next_offset = sizeof(struct ip6t_standard), + }, + .target = { + .target = { + .u = { + .target_size = IP6T_ALIGN(sizeof(struct ip6t_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + + /* LOCAL_OUT */ + { + .entry = { + .target_offset = sizeof(struct ip6t_entry), + .next_offset = sizeof(struct ip6t_standard), + }, + .target = { + .target = { + .u = { + .target_size = IP6T_ALIGN(sizeof(struct ip6t_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + }, + /* ERROR */ + .term = { + .entry = { + .target_offset = sizeof(struct ip6t_entry), + .next_offset = sizeof(struct ip6t_error), + }, + .target = { + .target = { + .u = { + .user = { + .target_size = IP6T_ALIGN(sizeof(struct ip6t_error_target)), + .name = IP6T_ERROR_TARGET, + }, + }, + }, + .errorname = "ERROR", + }, + } +}; + +static struct ip6t_table packet_raw = { + .name = "raw", + .valid_hooks = RAW_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. 
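   Note that the raw table registers its hooks at NF_IP6_PRI_FIRST, while
   the filter and mangle tables use NF_IP6_PRI_FILTER and NF_IP6_PRI_MANGLE;
   hooks on the same hook point run in ascending priority order, so raw sees
   packets before the other tables.  A sketch of the relative ordering, with
   numeric values assumed from the ip6tables priority enum rather than shown
   in this file:

	enum {
		EX_PRI_RAW    = INT_MIN,   // NF_IP6_PRI_FIRST
		EX_PRI_MANGLE = -150,      // NF_IP6_PRI_MANGLE (assumed)
		EX_PRI_FILTER = 0,         // NF_IP6_PRI_FILTER (assumed)
	};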
*/ +static unsigned int +ip6t_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip6t_do_table(pskb, hook, in, out, &packet_raw, NULL); +} + +static struct nf_hook_ops ip6t_ops[] = { + { + .hook = ip6t_hook, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_FIRST + }, + { + .hook = ip6t_hook, + .pf = PF_INET6, + .hooknum = NF_IP6_LOCAL_OUT, + .priority = NF_IP6_PRI_FIRST + }, +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ip6t_register_table(&packet_raw, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ip6t_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ip6t_ops[1]); + if (ret < 0) + goto cleanup_hook0; + + return ret; + + cleanup_hook0: + nf_unregister_hook(&ip6t_ops[0]); + cleanup_table: + ip6t_unregister_table(&packet_raw); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ip6t_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ip6t_ops[i]); + + ip6t_unregister_table(&packet_raw); +} + +module_init(init); +module_exit(fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c new file mode 100644 index 000000000000..334a5967831e --- /dev/null +++ b/net/ipv6/proc.c @@ -0,0 +1,303 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * This file implements the various access functions for the + * PROC file system. This is very similar to the IPv4 version, + * except it reports the sockets in the INET6 address family. + * + * Version: $Id: proc.c,v 1.17 2002/02/01 22:01:04 davem Exp $ + * + * Authors: David S. Miller (davem@caip.rutgers.edu) + * YOSHIFUJI Hideaki + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +static struct proc_dir_entry *proc_net_devsnmp6; + +static int fold_prot_inuse(struct proto *proto) +{ + int res = 0; + int cpu; + + for (cpu=0; cpustats[cpu].inuse; + + return res; +} + +static int sockstat6_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "TCP6: inuse %d\n", + fold_prot_inuse(&tcpv6_prot)); + seq_printf(seq, "UDP6: inuse %d\n", + fold_prot_inuse(&udpv6_prot)); + seq_printf(seq, "RAW6: inuse %d\n", + fold_prot_inuse(&rawv6_prot)); + seq_printf(seq, "FRAG6: inuse %d memory %d\n", + ip6_frag_nqueues, atomic_read(&ip6_frag_mem)); + return 0; +} + +static struct snmp_mib snmp6_ipstats_list[] = { +/* ipv6 mib according to RFC 2465 */ + SNMP_MIB_ITEM("Ip6InReceives", IPSTATS_MIB_INRECEIVES), + SNMP_MIB_ITEM("Ip6InHdrErrors", IPSTATS_MIB_INHDRERRORS), + SNMP_MIB_ITEM("Ip6InTooBigErrors", IPSTATS_MIB_INTOOBIGERRORS), + SNMP_MIB_ITEM("Ip6InNoRoutes", IPSTATS_MIB_INNOROUTES), + SNMP_MIB_ITEM("Ip6InAddrErrors", IPSTATS_MIB_INADDRERRORS), + SNMP_MIB_ITEM("Ip6InUnknownProtos", IPSTATS_MIB_INUNKNOWNPROTOS), + SNMP_MIB_ITEM("Ip6InTruncatedPkts", IPSTATS_MIB_INTRUNCATEDPKTS), + SNMP_MIB_ITEM("Ip6InDiscards", IPSTATS_MIB_INDISCARDS), + SNMP_MIB_ITEM("Ip6InDelivers", IPSTATS_MIB_INDELIVERS), + SNMP_MIB_ITEM("Ip6OutForwDatagrams", IPSTATS_MIB_OUTFORWDATAGRAMS), + SNMP_MIB_ITEM("Ip6OutRequests", IPSTATS_MIB_OUTREQUESTS), + SNMP_MIB_ITEM("Ip6OutDiscards", IPSTATS_MIB_OUTDISCARDS), + SNMP_MIB_ITEM("Ip6OutNoRoutes", IPSTATS_MIB_OUTNOROUTES), + SNMP_MIB_ITEM("Ip6ReasmTimeout", IPSTATS_MIB_REASMTIMEOUT), + SNMP_MIB_ITEM("Ip6ReasmReqds", IPSTATS_MIB_REASMREQDS), + SNMP_MIB_ITEM("Ip6ReasmOKs", IPSTATS_MIB_REASMOKS), + SNMP_MIB_ITEM("Ip6ReasmFails", IPSTATS_MIB_REASMFAILS), + SNMP_MIB_ITEM("Ip6FragOKs", IPSTATS_MIB_FRAGOKS), + SNMP_MIB_ITEM("Ip6FragFails", IPSTATS_MIB_FRAGFAILS), + SNMP_MIB_ITEM("Ip6FragCreates", IPSTATS_MIB_FRAGCREATES), + SNMP_MIB_ITEM("Ip6InMcastPkts", IPSTATS_MIB_INMCASTPKTS), + SNMP_MIB_ITEM("Ip6OutMcastPkts", IPSTATS_MIB_OUTMCASTPKTS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp6_icmp6_list[] = { +/* icmpv6 mib according to RFC 2466 + + Exceptions: {In|Out}AdminProhibs are removed, because I see + no good reasons to account them separately + of another dest.unreachs. + OutErrs is zero identically. + OutEchos too. + OutRouterAdvertisements too. + OutGroupMembQueries too. 
+ */ + SNMP_MIB_ITEM("Icmp6InMsgs", ICMP6_MIB_INMSGS), + SNMP_MIB_ITEM("Icmp6InErrors", ICMP6_MIB_INERRORS), + SNMP_MIB_ITEM("Icmp6InDestUnreachs", ICMP6_MIB_INDESTUNREACHS), + SNMP_MIB_ITEM("Icmp6InPktTooBigs", ICMP6_MIB_INPKTTOOBIGS), + SNMP_MIB_ITEM("Icmp6InTimeExcds", ICMP6_MIB_INTIMEEXCDS), + SNMP_MIB_ITEM("Icmp6InParmProblems", ICMP6_MIB_INPARMPROBLEMS), + SNMP_MIB_ITEM("Icmp6InEchos", ICMP6_MIB_INECHOS), + SNMP_MIB_ITEM("Icmp6InEchoReplies", ICMP6_MIB_INECHOREPLIES), + SNMP_MIB_ITEM("Icmp6InGroupMembQueries", ICMP6_MIB_INGROUPMEMBQUERIES), + SNMP_MIB_ITEM("Icmp6InGroupMembResponses", ICMP6_MIB_INGROUPMEMBRESPONSES), + SNMP_MIB_ITEM("Icmp6InGroupMembReductions", ICMP6_MIB_INGROUPMEMBREDUCTIONS), + SNMP_MIB_ITEM("Icmp6InRouterSolicits", ICMP6_MIB_INROUTERSOLICITS), + SNMP_MIB_ITEM("Icmp6InRouterAdvertisements", ICMP6_MIB_INROUTERADVERTISEMENTS), + SNMP_MIB_ITEM("Icmp6InNeighborSolicits", ICMP6_MIB_INNEIGHBORSOLICITS), + SNMP_MIB_ITEM("Icmp6InNeighborAdvertisements", ICMP6_MIB_INNEIGHBORADVERTISEMENTS), + SNMP_MIB_ITEM("Icmp6InRedirects", ICMP6_MIB_INREDIRECTS), + SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), + SNMP_MIB_ITEM("Icmp6OutDestUnreachs", ICMP6_MIB_OUTDESTUNREACHS), + SNMP_MIB_ITEM("Icmp6OutPktTooBigs", ICMP6_MIB_OUTPKTTOOBIGS), + SNMP_MIB_ITEM("Icmp6OutTimeExcds", ICMP6_MIB_OUTTIMEEXCDS), + SNMP_MIB_ITEM("Icmp6OutParmProblems", ICMP6_MIB_OUTPARMPROBLEMS), + SNMP_MIB_ITEM("Icmp6OutEchoReplies", ICMP6_MIB_OUTECHOREPLIES), + SNMP_MIB_ITEM("Icmp6OutRouterSolicits", ICMP6_MIB_OUTROUTERSOLICITS), + SNMP_MIB_ITEM("Icmp6OutNeighborSolicits", ICMP6_MIB_OUTNEIGHBORSOLICITS), + SNMP_MIB_ITEM("Icmp6OutNeighborAdvertisements", ICMP6_MIB_OUTNEIGHBORADVERTISEMENTS), + SNMP_MIB_ITEM("Icmp6OutRedirects", ICMP6_MIB_OUTREDIRECTS), + SNMP_MIB_ITEM("Icmp6OutGroupMembResponses", ICMP6_MIB_OUTGROUPMEMBRESPONSES), + SNMP_MIB_ITEM("Icmp6OutGroupMembReductions", ICMP6_MIB_OUTGROUPMEMBREDUCTIONS), + SNMP_MIB_SENTINEL +}; + +static struct snmp_mib snmp6_udp6_list[] = { + SNMP_MIB_ITEM("Udp6InDatagrams", UDP_MIB_INDATAGRAMS), + SNMP_MIB_ITEM("Udp6NoPorts", UDP_MIB_NOPORTS), + SNMP_MIB_ITEM("Udp6InErrors", UDP_MIB_INERRORS), + SNMP_MIB_ITEM("Udp6OutDatagrams", UDP_MIB_OUTDATAGRAMS), + SNMP_MIB_SENTINEL +}; + +static unsigned long +fold_field(void *mib[], int offt) +{ + unsigned long res = 0; + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + res += *(((unsigned long *)per_cpu_ptr(mib[0], i)) + offt); + res += *(((unsigned long *)per_cpu_ptr(mib[1], i)) + offt); + } + return res; +} + +static inline void +snmp6_seq_show_item(struct seq_file *seq, void **mib, struct snmp_mib *itemlist) +{ + int i; + for (i=0; itemlist[i].name; i++) + seq_printf(seq, "%-32s\t%lu\n", itemlist[i].name, + fold_field(mib, itemlist[i].entry)); +} + +static int snmp6_seq_show(struct seq_file *seq, void *v) +{ + struct inet6_dev *idev = (struct inet6_dev *)seq->private; + + if (idev) { + seq_printf(seq, "%-32s\t%u\n", "ifIndex", idev->dev->ifindex); + snmp6_seq_show_item(seq, (void **)idev->stats.icmpv6, snmp6_icmp6_list); + } else { + snmp6_seq_show_item(seq, (void **)ipv6_statistics, snmp6_ipstats_list); + snmp6_seq_show_item(seq, (void **)icmpv6_statistics, snmp6_icmp6_list); + snmp6_seq_show_item(seq, (void **)udp_stats_in6, snmp6_udp6_list); + } + return 0; +} + +static int sockstat6_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sockstat6_seq_show, NULL); +} + +static struct file_operations sockstat6_seq_fops = { + .owner = THIS_MODULE, + .open = 
sockstat6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int snmp6_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, snmp6_seq_show, PDE(inode)->data); +} + +static struct file_operations snmp6_seq_fops = { + .owner = THIS_MODULE, + .open = snmp6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int snmp6_register_dev(struct inet6_dev *idev) +{ + struct proc_dir_entry *p; + + if (!idev || !idev->dev) + return -EINVAL; + + if (!proc_net_devsnmp6) + return -ENOENT; + + p = create_proc_entry(idev->dev->name, S_IRUGO, proc_net_devsnmp6); + if (!p) + return -ENOMEM; + + p->data = idev; + p->proc_fops = &snmp6_seq_fops; + + idev->stats.proc_dir_entry = p; + return 0; +} + +int snmp6_unregister_dev(struct inet6_dev *idev) +{ + if (!proc_net_devsnmp6) + return -ENOENT; + if (!idev || !idev->stats.proc_dir_entry) + return -EINVAL; + remove_proc_entry(idev->stats.proc_dir_entry->name, + proc_net_devsnmp6); + return 0; +} + +int __init ipv6_misc_proc_init(void) +{ + int rc = 0; + + if (!proc_net_fops_create("snmp6", S_IRUGO, &snmp6_seq_fops)) + goto proc_snmp6_fail; + + proc_net_devsnmp6 = proc_mkdir("dev_snmp6", proc_net); + if (!proc_net_devsnmp6) + goto proc_dev_snmp6_fail; + + if (!proc_net_fops_create("sockstat6", S_IRUGO, &sockstat6_seq_fops)) + goto proc_sockstat6_fail; +out: + return rc; + +proc_sockstat6_fail: + proc_net_remove("dev_snmp6"); +proc_dev_snmp6_fail: + proc_net_remove("snmp6"); +proc_snmp6_fail: + rc = -ENOMEM; + goto out; +} + +void ipv6_misc_proc_exit(void) +{ + proc_net_remove("sockstat6"); + proc_net_remove("dev_snmp6"); + proc_net_remove("snmp6"); +} + +#else /* CONFIG_PROC_FS */ + + +int snmp6_register_dev(struct inet6_dev *idev) +{ + return 0; +} + +int snmp6_unregister_dev(struct inet6_dev *idev) +{ + return 0; +} +#endif /* CONFIG_PROC_FS */ + +int snmp6_alloc_dev(struct inet6_dev *idev) +{ + int err = -ENOMEM; + + if (!idev || !idev->dev) + return -EINVAL; + + if (snmp6_mib_init((void **)idev->stats.icmpv6, sizeof(struct icmpv6_mib), + __alignof__(struct icmpv6_mib)) < 0) + goto err_icmp; + + return 0; + +err_icmp: + return err; +} + +int snmp6_free_dev(struct inet6_dev *idev) +{ + snmp6_mib_free((void **)idev->stats.icmpv6); + return 0; +} + + diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c new file mode 100644 index 000000000000..52c1d58b6ca6 --- /dev/null +++ b/net/ipv6/protocol.c @@ -0,0 +1,86 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PF_INET6 protocol dispatch tables. + * + * Version: $Id: protocol.c,v 1.10 2001/05/18 02:25:49 davem Exp $ + * + * Authors: Pedro Roque + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Changes: + * + * Vince Laviano (vince@cs.stanford.edu) 16 May 2001 + * - Removed unused variable 'inet6_protocol_base' + * - Modified inet6_del_protocol() to correctly maintain copy bit. 
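   The dispatch table that follows is indexed by a trivial hash,
   protocol & (MAX_INET_PROTOS - 1); since MAX_INET_PROTOS is a power of
   two covering the full 8-bit protocol space, the mask is effectively the
   identity and each next-header value gets its own slot.  A sketch of the
   helper this amounts to (proto_hash is an illustrative name, the file
   open-codes the expression):

	static inline int proto_hash(unsigned char protocol)
	{
		return protocol & (MAX_INET_PROTOS - 1);  // 0..255 map to themselves
	}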
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; +static DEFINE_SPINLOCK(inet6_proto_lock); + + +int inet6_add_protocol(struct inet6_protocol *prot, unsigned char protocol) +{ + int ret, hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet6_proto_lock); + + if (inet6_protos[hash]) { + ret = -1; + } else { + inet6_protos[hash] = prot; + ret = 0; + } + + spin_unlock_bh(&inet6_proto_lock); + + return ret; +} + +/* + * Remove a protocol from the hash tables. + */ + +int inet6_del_protocol(struct inet6_protocol *prot, unsigned char protocol) +{ + int ret, hash = protocol & (MAX_INET_PROTOS - 1); + + spin_lock_bh(&inet6_proto_lock); + + if (inet6_protos[hash] != prot) { + ret = -1; + } else { + inet6_protos[hash] = NULL; + ret = 0; + } + + spin_unlock_bh(&inet6_proto_lock); + + synchronize_net(); + + return ret; +} diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c new file mode 100644 index 000000000000..5488ad0de4f6 --- /dev/null +++ b/net/ipv6/raw.c @@ -0,0 +1,1157 @@ +/* + * RAW sockets for IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Adapted from linux/net/ipv4/raw.c + * + * $Id: raw.c,v 1.51 2002/02/01 22:01:04 davem Exp $ + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI,H.@USAGI : raw checksum (RFC2292(bis) compliance) + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +struct hlist_head raw_v6_htable[RAWV6_HTABLE_SIZE]; +DEFINE_RWLOCK(raw_v6_lock); + +static void raw_v6_hash(struct sock *sk) +{ + struct hlist_head *list = &raw_v6_htable[inet_sk(sk)->num & + (RAWV6_HTABLE_SIZE - 1)]; + + write_lock_bh(&raw_v6_lock); + sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock_bh(&raw_v6_lock); +} + +static void raw_v6_unhash(struct sock *sk) +{ + write_lock_bh(&raw_v6_lock); + if (sk_del_node_init(sk)) + sock_prot_dec_use(sk->sk_prot); + write_unlock_bh(&raw_v6_lock); +} + + +/* Grumble... icmp and ip_input want to get at this... 
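   __raw_v6_lookup() walks one hash chain and matches on protocol number,
   the socket's bound local address, its connected peer address and, for
   multicast, group membership.  Its caller ipv6_raw_deliver() picks the
   addresses straight out of the IPv6 header and relies on daddr
   immediately following saddr in struct ipv6hdr, so "daddr = saddr + 1"
   is equivalent to the more explicit form sketched here:

	struct in6_addr *saddr = &skb->nh.ipv6h->saddr;
	struct in6_addr *daddr = &skb->nh.ipv6h->daddr;   // same as saddr + 1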
*/ +struct sock *__raw_v6_lookup(struct sock *sk, unsigned short num, + struct in6_addr *loc_addr, struct in6_addr *rmt_addr) +{ + struct hlist_node *node; + int is_multicast = ipv6_addr_is_multicast(loc_addr); + + sk_for_each_from(sk, node) + if (inet_sk(sk)->num == num) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + goto found; + if (is_multicast && + inet6_mc_check(sk, loc_addr, rmt_addr)) + goto found; + continue; + } + goto found; + } + sk = NULL; +found: + return sk; +} + +/* + * 0 - deliver + * 1 - block + */ +static __inline__ int icmpv6_filter(struct sock *sk, struct sk_buff *skb) +{ + struct icmp6hdr *icmph; + struct raw6_sock *rp = raw6_sk(sk); + + if (pskb_may_pull(skb, sizeof(struct icmp6hdr))) { + __u32 *data = &rp->filter.data[0]; + int bit_nr; + + icmph = (struct icmp6hdr *) skb->data; + bit_nr = icmph->icmp6_type; + + return (data[bit_nr >> 5] & (1 << (bit_nr & 31))) != 0; + } + return 0; +} + +/* + * demultiplex raw sockets. + * (should consider queueing the skb in the sock receive_queue + * without calling rawv6.c) + * + * Caller owns SKB so we must make clones. + */ +void ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) +{ + struct in6_addr *saddr; + struct in6_addr *daddr; + struct sock *sk; + __u8 hash; + + saddr = &skb->nh.ipv6h->saddr; + daddr = saddr + 1; + + hash = nexthdr & (MAX_INET_PROTOS - 1); + + read_lock(&raw_v6_lock); + sk = sk_head(&raw_v6_htable[hash]); + + /* + * The first socket found will be delivered after + * delivery to transport protocols. + */ + + if (sk == NULL) + goto out; + + sk = __raw_v6_lookup(sk, nexthdr, daddr, saddr); + + while (sk) { + if (nexthdr != IPPROTO_ICMPV6 || !icmpv6_filter(sk, skb)) { + struct sk_buff *clone = skb_clone(skb, GFP_ATOMIC); + + /* Not releasing hash table! */ + if (clone) + rawv6_rcv(sk, clone); + } + sk = __raw_v6_lookup(sk_next(sk), nexthdr, daddr, saddr); + } +out: + read_unlock(&raw_v6_lock); +} + +/* This cleans up af_inet6 a bit. -DaveM */ +static int rawv6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *addr = (struct sockaddr_in6 *) uaddr; + __u32 v4addr = 0; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + addr_type = ipv6_addr_type(&addr->sin6_addr); + + /* Raw sockets are IPv6 only */ + if (addr_type == IPV6_ADDR_MAPPED) + return(-EADDRNOTAVAIL); + + lock_sock(sk); + + err = -EINVAL; + if (sk->sk_state != TCP_CLOSE) + goto out; + + /* Check if the address belongs to the host. */ + if (addr_type != IPV6_ADDR_ANY) { + struct net_device *dev = NULL; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + addr->sin6_scope_id) { + /* Override any existing binding, if another + * one is supplied by user. + */ + sk->sk_bound_dev_if = addr->sin6_scope_id; + } + + /* Binding to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + goto out; + + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) { + err = -ENODEV; + goto out; + } + } + + /* ipv4 addr of the socket is invalid. Only the + * unspecified and mapped address have a v4 equivalent. 
+ */ + v4addr = LOOPBACK4_IPV6; + if (!(addr_type & IPV6_ADDR_MULTICAST)) { + err = -EADDRNOTAVAIL; + if (!ipv6_chk_addr(&addr->sin6_addr, dev, 0)) { + if (dev) + dev_put(dev); + goto out; + } + } + if (dev) + dev_put(dev); + } + + inet->rcv_saddr = inet->saddr = v4addr; + ipv6_addr_copy(&np->rcv_saddr, &addr->sin6_addr); + if (!(addr_type & IPV6_ADDR_MULTICAST)) + ipv6_addr_copy(&np->saddr, &addr->sin6_addr); + err = 0; +out: + release_sock(sk); + return err; +} + +void rawv6_err(struct sock *sk, struct sk_buff *skb, + struct inet6_skb_parm *opt, + int type, int code, int offset, u32 info) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + int err; + int harderr; + + /* Report error on raw socket, if: + 1. User requested recverr. + 2. Socket is connected (otherwise the error indication + is useless without recverr and error is hard. + */ + if (!np->recverr && sk->sk_state != TCP_ESTABLISHED) + return; + + harderr = icmpv6_err_convert(type, code, &err); + if (type == ICMPV6_PKT_TOOBIG) + harderr = (np->pmtudisc == IPV6_PMTUDISC_DO); + + if (np->recverr) { + u8 *payload = skb->data; + if (!inet->hdrincl) + payload += offset; + ipv6_icmp_error(sk, skb, err, 0, ntohl(info), payload); + } + + if (np->recverr || harderr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } +} + +static inline int rawv6_rcv_skb(struct sock * sk, struct sk_buff * skb) +{ + if ((raw6_sk(sk)->checksum || sk->sk_filter) && + skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + /* Charge it to the socket. */ + if (sock_queue_rcv_skb(sk,skb)<0) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; + } + + return 0; +} + +/* + * This is next to useless... + * if we demultiplex in network layer we don't need the extra call + * just to queue the skb... + * maybe we could have the network decide upon a hint if it + * should call raw_rcv for demultiplexing + */ +int rawv6_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct inet_sock *inet = inet_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } + + if (!rp->checksum) + skb->ip_summed = CHECKSUM_UNNECESSARY; + + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, skb->csum)) { + LIMIT_NETDEBUG( + printk(KERN_DEBUG "raw v6 hw csum failure.\n")); + skb->ip_summed = CHECKSUM_NONE; + } + } + if (skb->ip_summed == CHECKSUM_NONE) + skb->csum = ~csum_ipv6_magic(&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, + skb->len, inet->num, 0); + } + + if (inet->hdrincl) { + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + /* FIXME: increment a raw6 drops counter here */ + kfree_skb(skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + rawv6_rcv_skb(sk, skb); + return 0; +} + + +/* + * This should be easy, if there is something there + * we return it, otherwise we block. 
+ */ + +static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)msg->msg_name; + struct sk_buff *skb; + size_t copied; + int err; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (addr_len) + *addr_len=sizeof(*sin6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len; + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + /* Copy the address. */ + if (sin6) { + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + sock_recv_timestamp(msg, sk, skb); + + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + /* Clear queue. */ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + /* Error for blocking case is chosen to masquerade + as some normal condition. + */ + err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; + /* FIXME: increment a raw6 drops counter here */ + goto out_free; +} + +static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, + struct raw6_sock *rp, int len) +{ + struct sk_buff *skb; + int err = 0; + u16 *csum; + u32 tmp_csum; + + if (!rp->checksum) + goto send; + + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + if (rp->offset + 1 < len) + csum = (u16 *)(skb->h.raw + rp->offset); + else { + err = -EINVAL; + goto out; + } + + /* should be check HW csum miyazawa */ + if (skb_queue_len(&sk->sk_write_queue) == 1) { + /* + * Only one fragment on the socket. 
+ */ + tmp_csum = skb->csum; + } else { + tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + } + + /* in case cksum was not initialized */ + if (unlikely(*csum)) + tmp_csum = csum_sub(tmp_csum, *csum); + + *csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + len, fl->proto, tmp_csum); + + if (*csum == 0) + *csum = -1; +send: + err = ip6_push_pending_frames(sk); +out: + return err; +} + +static int rawv6_send_hdrinc(struct sock *sk, void *from, int length, + struct flowi *fl, struct rt6_info *rt, + unsigned int flags) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6hdr *iph; + struct sk_buff *skb; + unsigned int hh_len; + int err; + + if (length > rt->u.dst.dev->mtu) { + ipv6_local_error(sk, EMSGSIZE, fl, rt->u.dst.dev->mtu); + return -EMSGSIZE; + } + if (flags&MSG_PROBE) + goto out; + + hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); + + skb = sock_alloc_send_skb(sk, length+hh_len+15, + flags&MSG_DONTWAIT, &err); + if (skb == NULL) + goto error; + skb_reserve(skb, hh_len); + + skb->priority = sk->sk_priority; + skb->dst = dst_clone(&rt->u.dst); + + skb->nh.ipv6h = iph = (struct ipv6hdr *)skb_put(skb, length); + + skb->ip_summed = CHECKSUM_NONE; + + skb->h.raw = skb->nh.raw; + err = memcpy_fromiovecend((void *)iph, from, 0, length); + if (err) + goto error_fault; + + IP6_INC_STATS(IPSTATS_MIB_OUTREQUESTS); + err = NF_HOOK(PF_INET6, NF_IP6_LOCAL_OUT, skb, NULL, rt->u.dst.dev, + dst_output); + if (err > 0) + err = inet->recverr ? net_xmit_errno(err) : 0; + if (err) + goto error; +out: + return 0; + +error_fault: + err = -EFAULT; + kfree_skb(skb); +error: + IP6_INC_STATS(IPSTATS_MIB_OUTDISCARDS); + return err; +} + +static void rawv6_probe_proto_opt(struct flowi *fl, struct msghdr *msg) +{ + struct iovec *iov; + u8 __user *type = NULL; + u8 __user *code = NULL; + int probed = 0; + int i; + + if (!msg->msg_iov) + return; + + for (i = 0; i < msg->msg_iovlen; i++) { + iov = &msg->msg_iov[i]; + if (!iov) + continue; + + switch (fl->proto) { + case IPPROTO_ICMPV6: + /* check if one-byte field is readable or not. */ + if (iov->iov_base && iov->iov_len < 1) + break; + + if (!type) { + type = iov->iov_base; + /* check if code field is readable or not. */ + if (iov->iov_len > 1) + code = type + 1; + } else if (!code) + code = iov->iov_base; + + if (type && code) { + get_user(fl->fl_icmp_type, type); + __get_user(fl->fl_icmp_code, code); + probed = 1; + } + break; + default: + probed = 1; + break; + } + if (probed) + break; + } +} + +static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct sockaddr_in6 * sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p = NULL, final; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct raw6_sock *rp = raw6_sk(sk); + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct dst_entry *dst = NULL; + struct flowi fl; + int addr_len = msg->msg_namelen; + int hlimit = -1; + u16 proto; + int err; + + /* Rough check on arithmetic overflow, + better check is made in ip6_build_xmit + */ + if (len < 0) + return -EMSGSIZE; + + /* Mirror BSD error message compatibility */ + if (msg->msg_flags & MSG_OOB) + return -EOPNOTSUPP; + + /* + * Get and verify the address. 
+ */ + memset(&fl, 0, sizeof(fl)); + + if (sin6) { + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (sin6->sin6_family && sin6->sin6_family != AF_INET6) + return(-EAFNOSUPPORT); + + /* port is the proto value [0..255] carried in nexthdr */ + proto = ntohs(sin6->sin6_port); + + if (!proto) + proto = inet->num; + else if (proto != inet->num) + return(-EINVAL); + + if (proto > 255) + return(-EINVAL); + + daddr = &sin6->sin6_addr; + if (np->sndflow) { + fl.fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. + */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl.oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + proto = inet->num; + daddr = &np->daddr; + fl.fl6_flowlabel = np->flow_label; + } + + if (ipv6_addr_any(daddr)) { + /* + * unspecified destination address + * treated as error... is this correct ? + */ + fl6_sock_release(flowlabel); + return(-EINVAL); + } + + if (fl.oif == 0) + fl.oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(struct ipv6_txoptions); + + err = datagram_send_ctl(msg, &fl, opt, &hlimit); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + + fl.proto = proto; + rawv6_probe_proto_opt(&fl, msg); + + ipv6_addr_copy(&fl.fl6_dst, daddr); + if (ipv6_addr_any(&fl.fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + + /* merge ip6_build_xmit from ip6_output */ + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + if (!fl.oif && ipv6_addr_is_multicast(&fl.fl6_dst)) + fl.oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto out; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + dst_release(dst); + goto out; + } + + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl.fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; + +back_from_confirm: + if (inet->hdrincl) { + err = rawv6_send_hdrinc(sk, msg->msg_iov, len, &fl, (struct rt6_info*)dst, msg->msg_flags); + } else { + lock_sock(sk); + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, len, 0, + hlimit, opt, &fl, (struct rt6_info*)dst, msg->msg_flags); + + if (err) + ip6_flush_pending_frames(sk); + else if (!(msg->msg_flags & MSG_MORE)) + err = rawv6_push_pending_frames(sk, &fl, rp, len); + } +done: + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl.fl6_dst, 
&np->daddr) ? + &np->daddr : NULL); + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + + release_sock(sk); +out: + fl6_sock_release(flowlabel); + return err<0?err:len; +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags & MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto done; +} + +static int rawv6_seticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + switch (optname) { + case ICMPV6_FILTER: + if (optlen > sizeof(struct icmp6_filter)) + optlen = sizeof(struct icmp6_filter); + if (copy_from_user(&raw6_sk(sk)->filter, optval, optlen)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + }; + + return 0; +} + +static int rawv6_geticmpfilter(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int len; + + switch (optname) { + case ICMPV6_FILTER: + if (get_user(len, optlen)) + return -EFAULT; + if (len < 0) + return -EINVAL; + if (len > sizeof(struct icmp6_filter)) + len = sizeof(struct icmp6_filter); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &raw6_sk(sk)->filter, len)) + return -EFAULT; + return 0; + default: + return -ENOPROTOOPT; + }; + + return 0; +} + + +static int rawv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val; + + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_seticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_setsockopt(sk, level, optname, optval, + optlen); + }; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + /* You may get strange result with a positive odd offset; + RFC2292bis agrees with me. 
*/ + if (val > 0 && (val&1)) + return(-EINVAL); + if (val < 0) { + rp->checksum = 0; + } else { + rp->checksum = 1; + rp->offset = val; + } + + return 0; + break; + + default: + return(-ENOPROTOOPT); + } +} + +static int rawv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct raw6_sock *rp = raw6_sk(sk); + int val, len; + + switch(level) { + case SOL_RAW: + break; + + case SOL_ICMPV6: + if (inet_sk(sk)->num != IPPROTO_ICMPV6) + return -EOPNOTSUPP; + return rawv6_geticmpfilter(sk, level, optname, optval, + optlen); + case SOL_IPV6: + if (optname == IPV6_CHECKSUM) + break; + default: + return ipv6_getsockopt(sk, level, optname, optval, + optlen); + }; + + if (get_user(len,optlen)) + return -EFAULT; + + switch (optname) { + case IPV6_CHECKSUM: + if (rp->checksum == 0) + val = -1; + else + val = rp->offset; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, sizeof(int), len); + + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval,&val,len)) + return -EFAULT; + return 0; +} + +static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + switch(cmd) { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_irq(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) + amount = skb->tail - skb->h.raw; + spin_unlock_irq(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + + default: + return -ENOIOCTLCMD; + } +} + +static void rawv6_close(struct sock *sk, long timeout) +{ + if (inet_sk(sk)->num == IPPROTO_RAW) + ip6_ra_control(sk, -1, NULL); + + sk_common_release(sk); +} + +static int rawv6_init_sk(struct sock *sk) +{ + if (inet_sk(sk)->num == IPPROTO_ICMPV6) { + struct raw6_sock *rp = raw6_sk(sk); + rp->checksum = 1; + rp->offset = 2; + } + return(0); +} + +struct proto rawv6_prot = { + .name = "RAWv6", + .owner = THIS_MODULE, + .close = rawv6_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = rawv6_ioctl, + .init = rawv6_init_sk, + .destroy = inet6_destroy_sock, + .setsockopt = rawv6_setsockopt, + .getsockopt = rawv6_getsockopt, + .sendmsg = rawv6_sendmsg, + .recvmsg = rawv6_recvmsg, + .bind = rawv6_bind, + .backlog_rcv = rawv6_rcv_skb, + .hash = raw_v6_hash, + .unhash = raw_v6_unhash, + .obj_size = sizeof(struct raw6_sock), +}; + +#ifdef CONFIG_PROC_FS +struct raw6_iter_state { + int bucket; +}; + +#define raw6_seq_private(seq) ((struct raw6_iter_state *)(seq)->private) + +static struct sock *raw6_get_first(struct seq_file *seq) +{ + struct sock *sk; + struct hlist_node *node; + struct raw6_iter_state* state = raw6_seq_private(seq); + + for (state->bucket = 0; state->bucket < RAWV6_HTABLE_SIZE; ++state->bucket) + sk_for_each(sk, node, &raw_v6_htable[state->bucket]) + if (sk->sk_family == PF_INET6) + goto out; + sk = NULL; +out: + return sk; +} + +static struct sock *raw6_get_next(struct seq_file *seq, struct sock *sk) +{ + struct raw6_iter_state* state = raw6_seq_private(seq); + + do { + sk = sk_next(sk); +try_again: + ; + } while (sk && sk->sk_family != PF_INET6); + + if (!sk && ++state->bucket < RAWV6_HTABLE_SIZE) { + sk = sk_head(&raw_v6_htable[state->bucket]); + goto try_again; + } + return sk; +} + +static struct sock *raw6_get_idx(struct seq_file *seq, loff_t pos) +{ + struct sock *sk = raw6_get_first(seq); + if (sk) + while (pos && (sk = 
raw6_get_next(seq, sk)) != NULL) + --pos; + return pos ? NULL : sk; +} + +static void *raw6_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&raw_v6_lock); + return *pos ? raw6_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *raw6_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *sk; + + if (v == SEQ_START_TOKEN) + sk = raw6_get_first(seq); + else + sk = raw6_get_next(seq, v); + ++*pos; + return sk; +} + +static void raw6_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&raw_v6_lock); +} + +static void raw6_sock_seq_show(struct seq_file *seq, struct sock *sp, int i) +{ + struct ipv6_pinfo *np = inet6_sk(sp); + struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = 0; + srcp = inet_sk(sp)->num; + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); +} + +static int raw6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + else + raw6_sock_seq_show(seq, v, raw6_seq_private(seq)->bucket); + return 0; +} + +static struct seq_operations raw6_seq_ops = { + .start = raw6_seq_start, + .next = raw6_seq_next, + .stop = raw6_seq_stop, + .show = raw6_seq_show, +}; + +static int raw6_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct raw6_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + goto out; + rc = seq_open(file, &raw6_seq_ops); + if (rc) + goto out_kfree; + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +static struct file_operations raw6_seq_fops = { + .owner = THIS_MODULE, + .open = raw6_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +int __init raw6_proc_init(void) +{ + if (!proc_net_fops_create("raw6", S_IRUGO, &raw6_seq_fops)) + return -ENOMEM; + return 0; +} + +void raw6_proc_exit(void) +{ + proc_net_remove("raw6"); +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c new file mode 100644 index 000000000000..59e7c6317872 --- /dev/null +++ b/net/ipv6/reassembly.c @@ -0,0 +1,771 @@ +/* + * IPv6 fragment reassembly + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * $Id: reassembly.c,v 1.26 2001/03/07 22:00:57 davem Exp $ + * + * Based on: net/ipv4/ip_fragment.c + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * Fixes: + * Andi Kleen Make it work with multiple hosts. + * More RFC compliance. + * + * Horst von Brand Add missing #include + * Alexey Kuznetsov SMP races, threading, cleanup. + * Patrick McHardy LRU queue of frag heads for evictor. + * Mitsuru KANDA @USAGI Register inet6_protocol{}. + * David Stevens and + * YOSHIFUJI,H. 
@USAGI Always remove fragment header to + * calculate ICV correctly. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +int sysctl_ip6frag_high_thresh = 256*1024; +int sysctl_ip6frag_low_thresh = 192*1024; + +int sysctl_ip6frag_time = IPV6_FRAG_TIMEOUT; + +struct ip6frag_skb_cb +{ + struct inet6_skb_parm h; + int offset; +}; + +#define FRAG6_CB(skb) ((struct ip6frag_skb_cb*)((skb)->cb)) + + +/* + * Equivalent of ipv4 struct ipq + */ + +struct frag_queue +{ + struct frag_queue *next; + struct list_head lru_list; /* lru list member */ + + __u32 id; /* fragment id */ + struct in6_addr saddr; + struct in6_addr daddr; + + spinlock_t lock; + atomic_t refcnt; + struct timer_list timer; /* expire timer */ + struct sk_buff *fragments; + int len; + int meat; + int iif; + struct timeval stamp; + unsigned int csum; + __u8 last_in; /* has first/last segment arrived? */ +#define COMPLETE 4 +#define FIRST_IN 2 +#define LAST_IN 1 + __u16 nhoffset; + struct frag_queue **pprev; +}; + +/* Hash table. */ + +#define IP6Q_HASHSZ 64 + +static struct frag_queue *ip6_frag_hash[IP6Q_HASHSZ]; +static DEFINE_RWLOCK(ip6_frag_lock); +static u32 ip6_frag_hash_rnd; +static LIST_HEAD(ip6_frag_lru_list); +int ip6_frag_nqueues = 0; + +static __inline__ void __fq_unlink(struct frag_queue *fq) +{ + if(fq->next) + fq->next->pprev = fq->pprev; + *fq->pprev = fq->next; + list_del(&fq->lru_list); + ip6_frag_nqueues--; +} + +static __inline__ void fq_unlink(struct frag_queue *fq) +{ + write_lock(&ip6_frag_lock); + __fq_unlink(fq); + write_unlock(&ip6_frag_lock); +} + +static unsigned int ip6qhashfn(u32 id, struct in6_addr *saddr, + struct in6_addr *daddr) +{ + u32 a, b, c; + + a = saddr->s6_addr32[0]; + b = saddr->s6_addr32[1]; + c = saddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += ip6_frag_hash_rnd; + __jhash_mix(a, b, c); + + a += saddr->s6_addr32[3]; + b += daddr->s6_addr32[0]; + c += daddr->s6_addr32[1]; + __jhash_mix(a, b, c); + + a += daddr->s6_addr32[2]; + b += daddr->s6_addr32[3]; + c += id; + __jhash_mix(a, b, c); + + return c & (IP6Q_HASHSZ - 1); +} + +static struct timer_list ip6_frag_secret_timer; +int sysctl_ip6frag_secret_interval = 10 * 60 * HZ; + +static void ip6_frag_secret_rebuild(unsigned long dummy) +{ + unsigned long now = jiffies; + int i; + + write_lock(&ip6_frag_lock); + get_random_bytes(&ip6_frag_hash_rnd, sizeof(u32)); + for (i = 0; i < IP6Q_HASHSZ; i++) { + struct frag_queue *q; + + q = ip6_frag_hash[i]; + while (q) { + struct frag_queue *next = q->next; + unsigned int hval = ip6qhashfn(q->id, + &q->saddr, + &q->daddr); + + if (hval != i) { + /* Unlink. */ + if (q->next) + q->next->pprev = q->pprev; + *q->pprev = q->next; + + /* Relink to new hash chain. */ + if ((q->next = ip6_frag_hash[hval]) != NULL) + q->next->pprev = &q->next; + ip6_frag_hash[hval] = q; + q->pprev = &ip6_frag_hash[hval]; + } + + q = next; + } + } + write_unlock(&ip6_frag_lock); + + mod_timer(&ip6_frag_secret_timer, now + sysctl_ip6frag_secret_interval); +} + +atomic_t ip6_frag_mem = ATOMIC_INIT(0); + +/* Memory Tracking Functions. 
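   All fragment state is charged to the global atomic counter ip6_frag_mem:
   queued skbs add their truesize and each struct frag_queue adds its own
   size, and the helpers below subtract the same amounts on free.  The
   receive path is expected to trigger the evictor once usage passes the
   high threshold, which then frees least-recently-used queues until usage
   drops back under the low threshold -- roughly:

	if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh)
		ip6_evictor();   // reclaims down to sysctl_ip6frag_low_thresh

   (The actual check sits in the fragment receive handler later in this
   file; the two-line form above is only a sketch of the hysteresis.)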
*/ +static inline void frag_kfree_skb(struct sk_buff *skb, int *work) +{ + if (work) + *work -= skb->truesize; + atomic_sub(skb->truesize, &ip6_frag_mem); + kfree_skb(skb); +} + +static inline void frag_free_queue(struct frag_queue *fq, int *work) +{ + if (work) + *work -= sizeof(struct frag_queue); + atomic_sub(sizeof(struct frag_queue), &ip6_frag_mem); + kfree(fq); +} + +static inline struct frag_queue *frag_alloc_queue(void) +{ + struct frag_queue *fq = kmalloc(sizeof(struct frag_queue), GFP_ATOMIC); + + if(!fq) + return NULL; + atomic_add(sizeof(struct frag_queue), &ip6_frag_mem); + return fq; +} + +/* Destruction primitives. */ + +/* Complete destruction of fq. */ +static void ip6_frag_destroy(struct frag_queue *fq, int *work) +{ + struct sk_buff *fp; + + BUG_TRAP(fq->last_in&COMPLETE); + BUG_TRAP(del_timer(&fq->timer) == 0); + + /* Release all fragment data. */ + fp = fq->fragments; + while (fp) { + struct sk_buff *xp = fp->next; + + frag_kfree_skb(fp, work); + fp = xp; + } + + frag_free_queue(fq, work); +} + +static __inline__ void fq_put(struct frag_queue *fq, int *work) +{ + if (atomic_dec_and_test(&fq->refcnt)) + ip6_frag_destroy(fq, work); +} + +/* Kill fq entry. It is not destroyed immediately, + * because caller (and someone more) holds reference count. + */ +static __inline__ void fq_kill(struct frag_queue *fq) +{ + if (del_timer(&fq->timer)) + atomic_dec(&fq->refcnt); + + if (!(fq->last_in & COMPLETE)) { + fq_unlink(fq); + atomic_dec(&fq->refcnt); + fq->last_in |= COMPLETE; + } +} + +static void ip6_evictor(void) +{ + struct frag_queue *fq; + struct list_head *tmp; + int work; + + work = atomic_read(&ip6_frag_mem) - sysctl_ip6frag_low_thresh; + if (work <= 0) + return; + + while(work > 0) { + read_lock(&ip6_frag_lock); + if (list_empty(&ip6_frag_lru_list)) { + read_unlock(&ip6_frag_lock); + return; + } + tmp = ip6_frag_lru_list.next; + fq = list_entry(tmp, struct frag_queue, lru_list); + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + + spin_lock(&fq->lock); + if (!(fq->last_in&COMPLETE)) + fq_kill(fq); + spin_unlock(&fq->lock); + + fq_put(fq, &work); + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } +} + +static void ip6_frag_expire(unsigned long data) +{ + struct frag_queue *fq = (struct frag_queue *) data; + + spin_lock(&fq->lock); + + if (fq->last_in & COMPLETE) + goto out; + + fq_kill(fq); + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + + /* Send error only if the first segment arrived. */ + if (fq->last_in&FIRST_IN && fq->fragments) { + struct net_device *dev = dev_get_by_index(fq->iif); + + /* + But use as source device on which LAST ARRIVED + segment was received. And do not use fq->dev + pointer directly, device might already disappeared. + */ + if (dev) { + fq->fragments->dev = dev; + icmpv6_send(fq->fragments, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0, + dev); + dev_put(dev); + } + } +out: + spin_unlock(&fq->lock); + fq_put(fq, NULL); +} + +/* Creation primitives. 
*/ + + +static struct frag_queue *ip6_frag_intern(unsigned int hash, + struct frag_queue *fq_in) +{ + struct frag_queue *fq; + + write_lock(&ip6_frag_lock); +#ifdef CONFIG_SMP + for (fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + if (fq->id == fq_in->id && + ipv6_addr_equal(&fq_in->saddr, &fq->saddr) && + ipv6_addr_equal(&fq_in->daddr, &fq->daddr)) { + atomic_inc(&fq->refcnt); + write_unlock(&ip6_frag_lock); + fq_in->last_in |= COMPLETE; + fq_put(fq_in, NULL); + return fq; + } + } +#endif + fq = fq_in; + + if (!mod_timer(&fq->timer, jiffies + sysctl_ip6frag_time)) + atomic_inc(&fq->refcnt); + + atomic_inc(&fq->refcnt); + if((fq->next = ip6_frag_hash[hash]) != NULL) + fq->next->pprev = &fq->next; + ip6_frag_hash[hash] = fq; + fq->pprev = &ip6_frag_hash[hash]; + INIT_LIST_HEAD(&fq->lru_list); + list_add_tail(&fq->lru_list, &ip6_frag_lru_list); + ip6_frag_nqueues++; + write_unlock(&ip6_frag_lock); + return fq; +} + + +static struct frag_queue * +ip6_frag_create(unsigned int hash, u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct frag_queue *fq; + + if ((fq = frag_alloc_queue()) == NULL) + goto oom; + + memset(fq, 0, sizeof(struct frag_queue)); + + fq->id = id; + ipv6_addr_copy(&fq->saddr, src); + ipv6_addr_copy(&fq->daddr, dst); + + init_timer(&fq->timer); + fq->timer.function = ip6_frag_expire; + fq->timer.data = (long) fq; + spin_lock_init(&fq->lock); + atomic_set(&fq->refcnt, 1); + + return ip6_frag_intern(hash, fq); + +oom: + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return NULL; +} + +static __inline__ struct frag_queue * +fq_find(u32 id, struct in6_addr *src, struct in6_addr *dst) +{ + struct frag_queue *fq; + unsigned int hash = ip6qhashfn(id, src, dst); + + read_lock(&ip6_frag_lock); + for(fq = ip6_frag_hash[hash]; fq; fq = fq->next) { + if (fq->id == id && + ipv6_addr_equal(src, &fq->saddr) && + ipv6_addr_equal(dst, &fq->daddr)) { + atomic_inc(&fq->refcnt); + read_unlock(&ip6_frag_lock); + return fq; + } + } + read_unlock(&ip6_frag_lock); + + return ip6_frag_create(hash, id, src, dst); +} + + +static void ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, + struct frag_hdr *fhdr, int nhoff) +{ + struct sk_buff *prev, *next; + int offset, end; + + if (fq->last_in & COMPLETE) + goto err; + + offset = ntohs(fhdr->frag_off) & ~0x7; + end = offset + (ntohs(skb->nh.ipv6h->payload_len) - + ((u8 *) (fhdr + 1) - (u8 *) (skb->nh.ipv6h + 1))); + + if ((unsigned int)end > IPV6_MAXPLEN) { + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb,ICMPV6_HDR_FIELD, (u8*)&fhdr->frag_off - skb->nh.raw); + return; + } + + if (skb->ip_summed == CHECKSUM_HW) + skb->csum = csum_sub(skb->csum, + csum_partial(skb->nh.raw, (u8*)(fhdr+1)-skb->nh.raw, 0)); + + /* Is this the final fragment? */ + if (!(fhdr->frag_off & htons(IP6_MF))) { + /* If we already have some bits beyond end + * or have different end, the segment is corrupted. + */ + if (end < fq->len || + ((fq->last_in & LAST_IN) && end != fq->len)) + goto err; + fq->last_in |= LAST_IN; + fq->len = end; + } else { + /* Check if the fragment is rounded to 8 bytes. + * Required by the RFC. + */ + if (end & 0x7) { + /* RFC2460 says always send parameter problem in + * this case. -DaveM + */ + IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, + offsetof(struct ipv6hdr, payload_len)); + return; + } + if (end > fq->len) { + /* Some bits beyond end -> corruption. 
*/ + if (fq->last_in & LAST_IN) + goto err; + fq->len = end; + } + } + + if (end == offset) + goto err; + + /* Point into the IP datagram 'data' part. */ + if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) + goto err; + if (end-offset < skb->len) { + if (pskb_trim(skb, end - offset)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + + /* Find out which fragments are in front and at the back of us + * in the chain of fragments so far. We must know where to put + * this fragment, right? + */ + prev = NULL; + for(next = fq->fragments; next != NULL; next = next->next) { + if (FRAG6_CB(next)->offset >= offset) + break; /* bingo! */ + prev = next; + } + + /* We found where to put this one. Check for overlap with + * preceding fragment, and, if needed, align things so that + * any overlaps are eliminated. + */ + if (prev) { + int i = (FRAG6_CB(prev)->offset + prev->len) - offset; + + if (i > 0) { + offset += i; + if (end <= offset) + goto err; + if (!pskb_pull(skb, i)) + goto err; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->ip_summed = CHECKSUM_NONE; + } + } + + /* Look for overlap with succeeding segments. + * If we can merge fragments, do it. + */ + while (next && FRAG6_CB(next)->offset < end) { + int i = end - FRAG6_CB(next)->offset; /* overlap is 'i' bytes */ + + if (i < next->len) { + /* Eat head of the next overlapped fragment + * and leave the loop. The next ones cannot overlap. + */ + if (!pskb_pull(next, i)) + goto err; + FRAG6_CB(next)->offset += i; /* next fragment */ + fq->meat -= i; + if (next->ip_summed != CHECKSUM_UNNECESSARY) + next->ip_summed = CHECKSUM_NONE; + break; + } else { + struct sk_buff *free_it = next; + + /* Old fragment is completely overridden with + * new one drop it. + */ + next = next->next; + + if (prev) + prev->next = next; + else + fq->fragments = next; + + fq->meat -= free_it->len; + frag_kfree_skb(free_it, NULL); + } + } + + FRAG6_CB(skb)->offset = offset; + + /* Insert this fragment in the chain of fragments. */ + skb->next = next; + if (prev) + prev->next = skb; + else + fq->fragments = skb; + + if (skb->dev) + fq->iif = skb->dev->ifindex; + skb->dev = NULL; + fq->stamp = skb->stamp; + fq->meat += skb->len; + atomic_add(skb->truesize, &ip6_frag_mem); + + /* The first fragment. + * nhoffset is obtained from the first fragment, of course. + */ + if (offset == 0) { + fq->nhoffset = nhoff; + fq->last_in |= FIRST_IN; + } + write_lock(&ip6_frag_lock); + list_move_tail(&fq->lru_list, &ip6_frag_lru_list); + write_unlock(&ip6_frag_lock); + return; + +err: + IP6_INC_STATS(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); +} + +/* + * Check if this packet is complete. + * Returns NULL on failure by any reason, and pointer + * to current nexthdr field in reassembled frame. + * + * It is called with locked fq, and caller must check that + * queue is eligible for reassembly i.e. it is not COMPLETE, + * the last and the first frames arrived and all the bits are here. + */ +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in, + unsigned int *nhoffp, + struct net_device *dev) +{ + struct sk_buff *fp, *head = fq->fragments; + int payload_len; + unsigned int nhoff; + + fq_kill(fq); + + BUG_TRAP(head != NULL); + BUG_TRAP(FRAG6_CB(head)->offset == 0); + + /* Unfragmented part is taken from the first segment. 
*/ + payload_len = (head->data - head->nh.raw) - sizeof(struct ipv6hdr) + fq->len - sizeof(struct frag_hdr); + if (payload_len > IPV6_MAXPLEN) + goto out_oversize; + + /* Head of list must not be cloned. */ + if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) + goto out_oom; + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. */ + if (skb_shinfo(head)->frag_list) { + struct sk_buff *clone; + int i, plen = 0; + + if ((clone = alloc_skb(0, GFP_ATOMIC)) == NULL) + goto out_oom; + clone->next = head->next; + head->next = clone; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_shinfo(head)->frag_list = NULL; + for (i=0; inr_frags; i++) + plen += skb_shinfo(head)->frags[i].size; + clone->len = clone->data_len = head->data_len - plen; + head->data_len -= clone->len; + head->len -= clone->len; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + atomic_add(clone->truesize, &ip6_frag_mem); + } + + /* We have to remove fragment header from datagram and to relocate + * header in order to calculate ICV correctly. */ + nhoff = fq->nhoffset; + head->nh.raw[nhoff] = head->h.raw[0]; + memmove(head->head + sizeof(struct frag_hdr), head->head, + (head->data - head->head) - sizeof(struct frag_hdr)); + head->mac.raw += sizeof(struct frag_hdr); + head->nh.raw += sizeof(struct frag_hdr); + + skb_shinfo(head)->frag_list = head->next; + head->h.raw = head->data; + skb_push(head, head->data - head->nh.raw); + atomic_sub(head->truesize, &ip6_frag_mem); + + for (fp=head->next; fp; fp = fp->next) { + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + atomic_sub(fp->truesize, &ip6_frag_mem); + } + + head->next = NULL; + head->dev = dev; + head->stamp = fq->stamp; + head->nh.ipv6h->payload_len = htons(payload_len); + + *skb_in = head; + + /* Yes, and fold redundant checksum back. 8) */ + if (head->ip_summed == CHECKSUM_HW) + head->csum = csum_partial(head->nh.raw, head->h.raw-head->nh.raw, head->csum); + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + fq->fragments = NULL; + *nhoffp = nhoff; + return 1; + +out_oversize: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: payload len = %d\n", payload_len); + goto out_fail; +out_oom: + if (net_ratelimit()) + printk(KERN_DEBUG "ip6_frag_reasm: no memory for reassembly\n"); +out_fail: + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + return -1; +} + +static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp) +{ + struct sk_buff *skb = *skbp; + struct net_device *dev = skb->dev; + struct frag_hdr *fhdr; + struct frag_queue *fq; + struct ipv6hdr *hdr; + + hdr = skb->nh.ipv6h; + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + + /* Jumbo payload inhibits frag. 
header */ + if (hdr->payload_len==0) { + IP6_INC_STATS(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw); + return -1; + } + if (!pskb_may_pull(skb, (skb->h.raw-skb->data)+sizeof(struct frag_hdr))) { + IP6_INC_STATS(IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, skb->h.raw-skb->nh.raw); + return -1; + } + + hdr = skb->nh.ipv6h; + fhdr = (struct frag_hdr *)skb->h.raw; + + if (!(fhdr->frag_off & htons(0xFFF9))) { + /* It is not a fragmented frame */ + skb->h.raw += sizeof(struct frag_hdr); + IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + + *nhoffp = (u8*)fhdr - skb->nh.raw; + return 1; + } + + if (atomic_read(&ip6_frag_mem) > sysctl_ip6frag_high_thresh) + ip6_evictor(); + + if ((fq = fq_find(fhdr->identification, &hdr->saddr, &hdr->daddr)) != NULL) { + int ret = -1; + + spin_lock(&fq->lock); + + ip6_frag_queue(fq, skb, fhdr, *nhoffp); + + if (fq->last_in == (FIRST_IN|LAST_IN) && + fq->meat == fq->len) + ret = ip6_frag_reasm(fq, skbp, nhoffp, dev); + + spin_unlock(&fq->lock); + fq_put(fq, NULL); + return ret; + } + + IP6_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + kfree_skb(skb); + return -1; +} + +static struct inet6_protocol frag_protocol = +{ + .handler = ipv6_frag_rcv, + .flags = INET6_PROTO_NOPOLICY, +}; + +void __init ipv6_frag_init(void) +{ + if (inet6_add_protocol(&frag_protocol, IPPROTO_FRAGMENT) < 0) + printk(KERN_ERR "ipv6_frag_init: Could not register protocol\n"); + + ip6_frag_hash_rnd = (u32) ((num_physpages ^ (num_physpages>>7)) ^ + (jiffies ^ (jiffies >> 6))); + + init_timer(&ip6_frag_secret_timer); + ip6_frag_secret_timer.function = ip6_frag_secret_rebuild; + ip6_frag_secret_timer.expires = jiffies + sysctl_ip6frag_secret_interval; + add_timer(&ip6_frag_secret_timer); +} diff --git a/net/ipv6/route.c b/net/ipv6/route.c new file mode 100644 index 000000000000..183802902c02 --- /dev/null +++ b/net/ipv6/route.c @@ -0,0 +1,2131 @@ +/* + * Linux INET6 implementation + * FIB front-end. + * + * Authors: + * Pedro Roque + * + * $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* Changes: + * + * YOSHIFUJI Hideaki @USAGI + * reworked default router selection. + * - respect outgoing interface + * - select from (probably) reachable routers (i.e. + * routers in REACHABLE, STALE, DELAY or PROBE states). + * - always select the same router if it is (probably) + * reachable. otherwise, round-robin the list. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +#include +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SYSCTL +#include +#endif + +/* Set to 3 to get tracing. */ +#define RT6_DEBUG 2 + +#if RT6_DEBUG >= 3 +#define RDBG(x) printk x +#define RT6_TRACE(x...) printk(KERN_DEBUG x) +#else +#define RDBG(x) +#define RT6_TRACE(x...) 
do { ; } while (0) +#endif + + +static int ip6_rt_max_size = 4096; +static int ip6_rt_gc_min_interval = HZ / 2; +static int ip6_rt_gc_timeout = 60*HZ; +int ip6_rt_gc_interval = 30*HZ; +static int ip6_rt_gc_elasticity = 9; +static int ip6_rt_mtu_expires = 10*60*HZ; +static int ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40; + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort); +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie); +static struct dst_entry *ip6_negative_advice(struct dst_entry *); +static void ip6_dst_destroy(struct dst_entry *); +static void ip6_dst_ifdown(struct dst_entry *, + struct net_device *dev, int how); +static int ip6_dst_gc(void); + +static int ip6_pkt_discard(struct sk_buff *skb); +static int ip6_pkt_discard_out(struct sk_buff *skb); +static void ip6_link_failure(struct sk_buff *skb); +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu); + +static struct dst_ops ip6_dst_ops = { + .family = AF_INET6, + .protocol = __constant_htons(ETH_P_IPV6), + .gc = ip6_dst_gc, + .gc_thresh = 1024, + .check = ip6_dst_check, + .destroy = ip6_dst_destroy, + .ifdown = ip6_dst_ifdown, + .negative_advice = ip6_negative_advice, + .link_failure = ip6_link_failure, + .update_pmtu = ip6_rt_update_pmtu, + .entry_size = sizeof(struct rt6_info), +}; + +struct rt6_info ip6_null_entry = { + .u = { + .dst = { + .__refcnt = ATOMIC_INIT(1), + .__use = 1, + .dev = &loopback_dev, + .obsolete = -1, + .error = -ENETUNREACH, + .metrics = { [RTAX_HOPLIMIT - 1] = 255, }, + .input = ip6_pkt_discard, + .output = ip6_pkt_discard_out, + .ops = &ip6_dst_ops, + .path = (struct dst_entry*)&ip6_null_entry, + } + }, + .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP), + .rt6i_metric = ~(u32) 0, + .rt6i_ref = ATOMIC_INIT(1), +}; + +struct fib6_node ip6_routing_table = { + .leaf = &ip6_null_entry, + .fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO, +}; + +/* Protects all the ip6 fib */ + +DEFINE_RWLOCK(rt6_lock); + + +/* allocate dst with ip6_dst_ops */ +static __inline__ struct rt6_info *ip6_dst_alloc(void) +{ + return (struct rt6_info *)dst_alloc(&ip6_dst_ops); +} + +static void ip6_dst_destroy(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + + if (idev != NULL) { + rt->rt6i_idev = NULL; + in6_dev_put(idev); + } +} + +static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int how) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + struct inet6_dev *idev = rt->rt6i_idev; + + if (dev != &loopback_dev && idev != NULL && idev->dev == dev) { + struct inet6_dev *loopback_idev = in6_dev_get(&loopback_dev); + if (loopback_idev != NULL) { + rt->rt6i_idev = loopback_idev; + in6_dev_put(idev); + } + } +} + +static __inline__ int rt6_check_expired(const struct rt6_info *rt) +{ + return (rt->rt6i_flags & RTF_EXPIRES && + time_after(jiffies, rt->rt6i_expires)); +} + +/* + * Route lookup. Any rt6_lock is implied. 
+ */ + +static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt, + int oif, + int strict) +{ + struct rt6_info *local = NULL; + struct rt6_info *sprt; + + if (oif) { + for (sprt = rt; sprt; sprt = sprt->u.next) { + struct net_device *dev = sprt->rt6i_dev; + if (dev->ifindex == oif) + return sprt; + if (dev->flags & IFF_LOOPBACK) { + if (sprt->rt6i_idev == NULL || + sprt->rt6i_idev->dev->ifindex != oif) { + if (strict && oif) + continue; + if (local && (!oif || + local->rt6i_idev->dev->ifindex == oif)) + continue; + } + local = sprt; + } + } + + if (local) + return local; + + if (strict) + return &ip6_null_entry; + } + return rt; +} + +/* + * pointer to the last default router chosen. BH is disabled locally. + */ +static struct rt6_info *rt6_dflt_pointer; +static DEFINE_SPINLOCK(rt6_dflt_lock); + +void rt6_reset_dflt_pointer(struct rt6_info *rt) +{ + spin_lock_bh(&rt6_dflt_lock); + if (rt == NULL || rt == rt6_dflt_pointer) { + RT6_TRACE("reset default router: %p->NULL\n", rt6_dflt_pointer); + rt6_dflt_pointer = NULL; + } + spin_unlock_bh(&rt6_dflt_lock); +} + +/* Default Router Selection (RFC 2461 6.3.6) */ +static struct rt6_info *rt6_best_dflt(struct rt6_info *rt, int oif) +{ + struct rt6_info *match = NULL; + struct rt6_info *sprt; + int mpri = 0; + + for (sprt = rt; sprt; sprt = sprt->u.next) { + struct neighbour *neigh; + int m = 0; + + if (!oif || + (sprt->rt6i_dev && + sprt->rt6i_dev->ifindex == oif)) + m += 8; + + if (rt6_check_expired(sprt)) + continue; + + if (sprt == rt6_dflt_pointer) + m += 4; + + if ((neigh = sprt->rt6i_nexthop) != NULL) { + read_lock_bh(&neigh->lock); + switch (neigh->nud_state) { + case NUD_REACHABLE: + m += 3; + break; + + case NUD_STALE: + case NUD_DELAY: + case NUD_PROBE: + m += 2; + break; + + case NUD_NOARP: + case NUD_PERMANENT: + m += 1; + break; + + case NUD_INCOMPLETE: + default: + read_unlock_bh(&neigh->lock); + continue; + } + read_unlock_bh(&neigh->lock); + } else { + continue; + } + + if (m > mpri || m >= 12) { + match = sprt; + mpri = m; + if (m >= 12) { + /* we choose the last default router if it + * is in (probably) reachable state. + * If route changed, we should do pmtu + * discovery. --yoshfuji + */ + break; + } + } + } + + spin_lock(&rt6_dflt_lock); + if (!match) { + /* + * No default routers are known to be reachable. + * SHOULD round robin + */ + if (rt6_dflt_pointer) { + for (sprt = rt6_dflt_pointer->u.next; + sprt; sprt = sprt->u.next) { + if (sprt->u.dst.obsolete <= 0 && + sprt->u.dst.error == 0 && + !rt6_check_expired(sprt)) { + match = sprt; + break; + } + } + for (sprt = rt; + !match && sprt; + sprt = sprt->u.next) { + if (sprt->u.dst.obsolete <= 0 && + sprt->u.dst.error == 0 && + !rt6_check_expired(sprt)) { + match = sprt; + break; + } + if (sprt == rt6_dflt_pointer) + break; + } + } + } + + if (match) { + if (rt6_dflt_pointer != match) + RT6_TRACE("changed default router: %p->%p\n", + rt6_dflt_pointer, match); + rt6_dflt_pointer = match; + } + spin_unlock(&rt6_dflt_lock); + + if (!match) { + /* + * Last Resort: if no default routers found, + * use addrconf default route. + * We don't record this route. + */ + for (sprt = ip6_routing_table.leaf; + sprt; sprt = sprt->u.next) { + if (!rt6_check_expired(sprt) && + (sprt->rt6i_flags & RTF_DEFAULT) && + (!oif || + (sprt->rt6i_dev && + sprt->rt6i_dev->ifindex == oif))) { + match = sprt; + break; + } + } + if (!match) { + /* no default route. give up. 
*/ + match = &ip6_null_entry; + } + } + + return match; +} + +struct rt6_info *rt6_lookup(struct in6_addr *daddr, struct in6_addr *saddr, + int oif, int strict) +{ + struct fib6_node *fn; + struct rt6_info *rt; + + read_lock_bh(&rt6_lock); + fn = fib6_lookup(&ip6_routing_table, daddr, saddr); + rt = rt6_device_match(fn->leaf, oif, strict); + dst_hold(&rt->u.dst); + rt->u.dst.__use++; + read_unlock_bh(&rt6_lock); + + rt->u.dst.lastuse = jiffies; + if (rt->u.dst.error == 0) + return rt; + dst_release(&rt->u.dst); + return NULL; +} + +/* ip6_ins_rt is called with FREE rt6_lock. + It takes new route entry, the addition fails by any reason the + route is freed. In any case, if caller does not hold it, it may + be destroyed. + */ + +int ip6_ins_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + int err; + + write_lock_bh(&rt6_lock); + err = fib6_add(&ip6_routing_table, rt, nlh, _rtattr); + write_unlock_bh(&rt6_lock); + + return err; +} + +/* No rt6_lock! If COW failed, the function returns dead route entry + with dst->error set to errno value. + */ + +static struct rt6_info *rt6_cow(struct rt6_info *ort, struct in6_addr *daddr, + struct in6_addr *saddr) +{ + int err; + struct rt6_info *rt; + + /* + * Clone the route. + */ + + rt = ip6_rt_copy(ort); + + if (rt) { + ipv6_addr_copy(&rt->rt6i_dst.addr, daddr); + + if (!(rt->rt6i_flags&RTF_GATEWAY)) + ipv6_addr_copy(&rt->rt6i_gateway, daddr); + + rt->rt6i_dst.plen = 128; + rt->rt6i_flags |= RTF_CACHE; + rt->u.dst.flags |= DST_HOST; + +#ifdef CONFIG_IPV6_SUBTREES + if (rt->rt6i_src.plen && saddr) { + ipv6_addr_copy(&rt->rt6i_src.addr, saddr); + rt->rt6i_src.plen = 128; + } +#endif + + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + + dst_hold(&rt->u.dst); + + err = ip6_ins_rt(rt, NULL, NULL); + if (err == 0) + return rt; + + rt->u.dst.error = err; + + return rt; + } + dst_hold(&ip6_null_entry.u.dst); + return &ip6_null_entry; +} + +#define BACKTRACK() \ +if (rt == &ip6_null_entry && strict) { \ + while ((fn = fn->parent) != NULL) { \ + if (fn->fn_flags & RTN_ROOT) { \ + dst_hold(&rt->u.dst); \ + goto out; \ + } \ + if (fn->fn_flags & RTN_RTINFO) \ + goto restart; \ + } \ +} + + +void ip6_route_input(struct sk_buff *skb) +{ + struct fib6_node *fn; + struct rt6_info *rt; + int strict; + int attempts = 3; + + strict = ipv6_addr_type(&skb->nh.ipv6h->daddr) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); + +relookup: + read_lock_bh(&rt6_lock); + + fn = fib6_lookup(&ip6_routing_table, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr); + +restart: + rt = fn->leaf; + + if ((rt->rt6i_flags & RTF_CACHE)) { + rt = rt6_device_match(rt, skb->dev->ifindex, strict); + BACKTRACK(); + dst_hold(&rt->u.dst); + goto out; + } + + rt = rt6_device_match(rt, skb->dev->ifindex, 0); + BACKTRACK(); + + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + struct rt6_info *nrt; + dst_hold(&rt->u.dst); + read_unlock_bh(&rt6_lock); + + nrt = rt6_cow(rt, &skb->nh.ipv6h->daddr, + &skb->nh.ipv6h->saddr); + + dst_release(&rt->u.dst); + rt = nrt; + + if (rt->u.dst.error != -EEXIST || --attempts <= 0) + goto out2; + + /* Race condition! In the gap, when rt6_lock was + released someone could insert this route. Relookup. 
+ */ + dst_release(&rt->u.dst); + goto relookup; + } + dst_hold(&rt->u.dst); + +out: + read_unlock_bh(&rt6_lock); +out2: + rt->u.dst.lastuse = jiffies; + rt->u.dst.__use++; + skb->dst = (struct dst_entry *) rt; +} + +struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl) +{ + struct fib6_node *fn; + struct rt6_info *rt; + int strict; + int attempts = 3; + + strict = ipv6_addr_type(&fl->fl6_dst) & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LINKLOCAL); + +relookup: + read_lock_bh(&rt6_lock); + + fn = fib6_lookup(&ip6_routing_table, &fl->fl6_dst, &fl->fl6_src); + +restart: + rt = fn->leaf; + + if ((rt->rt6i_flags & RTF_CACHE)) { + rt = rt6_device_match(rt, fl->oif, strict); + BACKTRACK(); + dst_hold(&rt->u.dst); + goto out; + } + if (rt->rt6i_flags & RTF_DEFAULT) { + if (rt->rt6i_metric >= IP6_RT_PRIO_ADDRCONF) + rt = rt6_best_dflt(rt, fl->oif); + } else { + rt = rt6_device_match(rt, fl->oif, strict); + BACKTRACK(); + } + + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + struct rt6_info *nrt; + dst_hold(&rt->u.dst); + read_unlock_bh(&rt6_lock); + + nrt = rt6_cow(rt, &fl->fl6_dst, &fl->fl6_src); + + dst_release(&rt->u.dst); + rt = nrt; + + if (rt->u.dst.error != -EEXIST || --attempts <= 0) + goto out2; + + /* Race condition! In the gap, when rt6_lock was + released someone could insert this route. Relookup. + */ + dst_release(&rt->u.dst); + goto relookup; + } + dst_hold(&rt->u.dst); + +out: + read_unlock_bh(&rt6_lock); +out2: + rt->u.dst.lastuse = jiffies; + rt->u.dst.__use++; + return &rt->u.dst; +} + + +/* + * Destination cache support functions + */ + +static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie) +{ + struct rt6_info *rt; + + rt = (struct rt6_info *) dst; + + if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie)) + return dst; + + return NULL; +} + +static struct dst_entry *ip6_negative_advice(struct dst_entry *dst) +{ + struct rt6_info *rt = (struct rt6_info *) dst; + + if (rt) { + if (rt->rt6i_flags & RTF_CACHE) + ip6_del_rt(rt, NULL, NULL); + else + dst_release(dst); + } + return NULL; +} + +static void ip6_link_failure(struct sk_buff *skb) +{ + struct rt6_info *rt; + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev); + + rt = (struct rt6_info *) skb->dst; + if (rt) { + if (rt->rt6i_flags&RTF_CACHE) { + dst_set_expires(&rt->u.dst, 0); + rt->rt6i_flags |= RTF_EXPIRES; + } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) + rt->rt6i_node->fn_sernum = -1; + } +} + +static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct rt6_info *rt6 = (struct rt6_info*)dst; + + if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) { + rt6->rt6i_flags |= RTF_MODIFIED; + if (mtu < IPV6_MIN_MTU) { + mtu = IPV6_MIN_MTU; + dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + } + dst->metrics[RTAX_MTU-1] = mtu; + } +} + +/* Protected by rt6_lock. */ +static struct dst_entry *ndisc_dst_gc_list; +static int ipv6_get_mtu(struct net_device *dev); + +static inline unsigned int ipv6_advmss(unsigned int mtu) +{ + mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr); + + if (mtu < ip6_rt_min_advmss) + mtu = ip6_rt_min_advmss; + + /* + * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and + * corresponding MSS is IPV6_MAXPLEN - tcp_header_size. 
+ * IPV6_MAXPLEN is also valid and means: "any MSS, + * rely only on pmtu discovery" + */ + if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr)) + mtu = IPV6_MAXPLEN; + return mtu; +} + +struct dst_entry *ndisc_dst_alloc(struct net_device *dev, + struct neighbour *neigh, + struct in6_addr *addr, + int (*output)(struct sk_buff *)) +{ + struct rt6_info *rt; + struct inet6_dev *idev = in6_dev_get(dev); + + if (unlikely(idev == NULL)) + return NULL; + + rt = ip6_dst_alloc(); + if (unlikely(rt == NULL)) { + in6_dev_put(idev); + goto out; + } + + dev_hold(dev); + if (neigh) + neigh_hold(neigh); + else + neigh = ndisc_get_neigh(dev, addr); + + rt->rt6i_dev = dev; + rt->rt6i_idev = idev; + rt->rt6i_nexthop = neigh; + atomic_set(&rt->u.dst.__refcnt, 1); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255; + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); + rt->u.dst.output = output; + +#if 0 /* there's no chance to use these for ndisc */ + rt->u.dst.flags = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST + ? DST_HOST + : 0; + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.plen = 128; +#endif + + write_lock_bh(&rt6_lock); + rt->u.dst.next = ndisc_dst_gc_list; + ndisc_dst_gc_list = &rt->u.dst; + write_unlock_bh(&rt6_lock); + + fib6_force_start_gc(); + +out: + return (struct dst_entry *)rt; +} + +int ndisc_dst_gc(int *more) +{ + struct dst_entry *dst, *next, **pprev; + int freed; + + next = NULL; + pprev = &ndisc_dst_gc_list; + freed = 0; + while ((dst = *pprev) != NULL) { + if (!atomic_read(&dst->__refcnt)) { + *pprev = dst->next; + dst_free(dst); + freed++; + } else { + pprev = &dst->next; + (*more)++; + } + } + + return freed; +} + +static int ip6_dst_gc(void) +{ + static unsigned expire = 30*HZ; + static unsigned long last_gc; + unsigned long now = jiffies; + + if (time_after(last_gc + ip6_rt_gc_min_interval, now) && + atomic_read(&ip6_dst_ops.entries) <= ip6_rt_max_size) + goto out; + + expire++; + fib6_run_gc(expire); + last_gc = now; + if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh) + expire = ip6_rt_gc_timeout>>1; + +out: + expire -= expire>>ip6_rt_gc_elasticity; + return (atomic_read(&ip6_dst_ops.entries) > ip6_rt_max_size); +} + +/* Clean host part of a prefix. Not necessary in radix tree, + but results in cleaner routing tables. + + Remove it only when all the things will work! 
+ */ + +static int ipv6_get_mtu(struct net_device *dev) +{ + int mtu = IPV6_MIN_MTU; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); + if (idev) { + mtu = idev->cnf.mtu6; + in6_dev_put(idev); + } + return mtu; +} + +int ipv6_get_hoplimit(struct net_device *dev) +{ + int hoplimit = ipv6_devconf.hop_limit; + struct inet6_dev *idev; + + idev = in6_dev_get(dev); + if (idev) { + hoplimit = idev->cnf.hop_limit; + in6_dev_put(idev); + } + return hoplimit; +} + +/* + * + */ + +int ip6_route_add(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) +{ + int err; + struct rtmsg *r; + struct rtattr **rta; + struct rt6_info *rt = NULL; + struct net_device *dev = NULL; + struct inet6_dev *idev = NULL; + int addr_type; + + rta = (struct rtattr **) _rtattr; + + if (rtmsg->rtmsg_dst_len > 128 || rtmsg->rtmsg_src_len > 128) + return -EINVAL; +#ifndef CONFIG_IPV6_SUBTREES + if (rtmsg->rtmsg_src_len) + return -EINVAL; +#endif + if (rtmsg->rtmsg_ifindex) { + err = -ENODEV; + dev = dev_get_by_index(rtmsg->rtmsg_ifindex); + if (!dev) + goto out; + idev = in6_dev_get(dev); + if (!idev) + goto out; + } + + if (rtmsg->rtmsg_metric == 0) + rtmsg->rtmsg_metric = IP6_RT_PRIO_USER; + + rt = ip6_dst_alloc(); + + if (rt == NULL) { + err = -ENOMEM; + goto out; + } + + rt->u.dst.obsolete = -1; + rt->rt6i_expires = clock_t_to_jiffies(rtmsg->rtmsg_info); + if (nlh && (r = NLMSG_DATA(nlh))) { + rt->rt6i_protocol = r->rtm_protocol; + } else { + rt->rt6i_protocol = RTPROT_BOOT; + } + + addr_type = ipv6_addr_type(&rtmsg->rtmsg_dst); + + if (addr_type & IPV6_ADDR_MULTICAST) + rt->u.dst.input = ip6_mc_input; + else + rt->u.dst.input = ip6_forward; + + rt->u.dst.output = ip6_output; + + ipv6_addr_prefix(&rt->rt6i_dst.addr, + &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len); + rt->rt6i_dst.plen = rtmsg->rtmsg_dst_len; + if (rt->rt6i_dst.plen == 128) + rt->u.dst.flags = DST_HOST; + +#ifdef CONFIG_IPV6_SUBTREES + ipv6_addr_prefix(&rt->rt6i_src.addr, + &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len); + rt->rt6i_src.plen = rtmsg->rtmsg_src_len; +#endif + + rt->rt6i_metric = rtmsg->rtmsg_metric; + + /* We cannot add true routes via loopback here, + they would result in kernel looping; promote them to reject routes + */ + if ((rtmsg->rtmsg_flags&RTF_REJECT) || + (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + /* hold loopback dev/idev if we haven't done so. */ + if (dev != &loopback_dev) { + if (dev) { + dev_put(dev); + in6_dev_put(idev); + } + dev = &loopback_dev; + dev_hold(dev); + idev = in6_dev_get(dev); + if (!idev) { + err = -ENODEV; + goto out; + } + } + rt->u.dst.output = ip6_pkt_discard_out; + rt->u.dst.input = ip6_pkt_discard; + rt->u.dst.error = -ENETUNREACH; + rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP; + goto install_route; + } + + if (rtmsg->rtmsg_flags & RTF_GATEWAY) { + struct in6_addr *gw_addr; + int gwa_type; + + gw_addr = &rtmsg->rtmsg_gateway; + ipv6_addr_copy(&rt->rt6i_gateway, &rtmsg->rtmsg_gateway); + gwa_type = ipv6_addr_type(gw_addr); + + if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) { + struct rt6_info *grt; + + /* IPv6 strictly inhibits using not link-local + addresses as nexthop address. + Otherwise, router will not able to send redirects. + It is very good, but in some (rare!) circumstances + (SIT, PtP, NBMA NOARP links) it is handy to allow + some exceptions. 
--ANK + */ + err = -EINVAL; + if (!(gwa_type&IPV6_ADDR_UNICAST)) + goto out; + + grt = rt6_lookup(gw_addr, NULL, rtmsg->rtmsg_ifindex, 1); + + err = -EHOSTUNREACH; + if (grt == NULL) + goto out; + if (dev) { + if (dev != grt->rt6i_dev) { + dst_release(&grt->u.dst); + goto out; + } + } else { + dev = grt->rt6i_dev; + idev = grt->rt6i_idev; + dev_hold(dev); + in6_dev_hold(grt->rt6i_idev); + } + if (!(grt->rt6i_flags&RTF_GATEWAY)) + err = 0; + dst_release(&grt->u.dst); + + if (err) + goto out; + } + err = -EINVAL; + if (dev == NULL || (dev->flags&IFF_LOOPBACK)) + goto out; + } + + err = -ENODEV; + if (dev == NULL) + goto out; + + if (rtmsg->rtmsg_flags & (RTF_GATEWAY|RTF_NONEXTHOP)) { + rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev); + if (IS_ERR(rt->rt6i_nexthop)) { + err = PTR_ERR(rt->rt6i_nexthop); + rt->rt6i_nexthop = NULL; + goto out; + } + } + + rt->rt6i_flags = rtmsg->rtmsg_flags; + +install_route: + if (rta && rta[RTA_METRICS-1]) { + int attrlen = RTA_PAYLOAD(rta[RTA_METRICS-1]); + struct rtattr *attr = RTA_DATA(rta[RTA_METRICS-1]); + + while (RTA_OK(attr, attrlen)) { + unsigned flavor = attr->rta_type; + if (flavor) { + if (flavor > RTAX_MAX) { + err = -EINVAL; + goto out; + } + rt->u.dst.metrics[flavor-1] = + *(u32 *)RTA_DATA(attr); + } + attr = RTA_NEXT(attr, attrlen); + } + } + + if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; + if (!rt->u.dst.metrics[RTAX_MTU-1]) + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev); + if (!rt->u.dst.metrics[RTAX_ADVMSS-1]) + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); + rt->u.dst.dev = dev; + rt->rt6i_idev = idev; + return ip6_ins_rt(rt, nlh, _rtattr); + +out: + if (dev) + dev_put(dev); + if (idev) + in6_dev_put(idev); + if (rt) + dst_free((struct dst_entry *) rt); + return err; +} + +int ip6_del_rt(struct rt6_info *rt, struct nlmsghdr *nlh, void *_rtattr) +{ + int err; + + write_lock_bh(&rt6_lock); + + rt6_reset_dflt_pointer(NULL); + + err = fib6_del(rt, nlh, _rtattr); + dst_release(&rt->u.dst); + + write_unlock_bh(&rt6_lock); + + return err; +} + +static int ip6_route_del(struct in6_rtmsg *rtmsg, struct nlmsghdr *nlh, void *_rtattr) +{ + struct fib6_node *fn; + struct rt6_info *rt; + int err = -ESRCH; + + read_lock_bh(&rt6_lock); + + fn = fib6_locate(&ip6_routing_table, + &rtmsg->rtmsg_dst, rtmsg->rtmsg_dst_len, + &rtmsg->rtmsg_src, rtmsg->rtmsg_src_len); + + if (fn) { + for (rt = fn->leaf; rt; rt = rt->u.next) { + if (rtmsg->rtmsg_ifindex && + (rt->rt6i_dev == NULL || + rt->rt6i_dev->ifindex != rtmsg->rtmsg_ifindex)) + continue; + if (rtmsg->rtmsg_flags&RTF_GATEWAY && + !ipv6_addr_equal(&rtmsg->rtmsg_gateway, &rt->rt6i_gateway)) + continue; + if (rtmsg->rtmsg_metric && + rtmsg->rtmsg_metric != rt->rt6i_metric) + continue; + dst_hold(&rt->u.dst); + read_unlock_bh(&rt6_lock); + + return ip6_del_rt(rt, nlh, _rtattr); + } + } + read_unlock_bh(&rt6_lock); + + return err; +} + +/* + * Handle redirects + */ +void rt6_redirect(struct in6_addr *dest, struct in6_addr *saddr, + struct neighbour *neigh, u8 *lladdr, int on_link) +{ + struct rt6_info *rt, *nrt; + + /* Locate old route to this destination. */ + rt = rt6_lookup(dest, NULL, neigh->dev->ifindex, 1); + + if (rt == NULL) + return; + + if (neigh->dev != rt->rt6i_dev) + goto out; + + /* + * Current route is on-link; redirect is always invalid. + * + * Seems, previous statement is not true. It could + * be node, which looks for us as on-link (f.e. 
proxy ndisc) + * But then router serving it might decide, that we should + * know truth 8)8) --ANK (980726). + */ + if (!(rt->rt6i_flags&RTF_GATEWAY)) + goto out; + + /* + * RFC 2461 specifies that redirects should only be + * accepted if they come from the nexthop to the target. + * Due to the way default routers are chosen, this notion + * is a bit fuzzy and one might need to check all default + * routers. + */ + if (!ipv6_addr_equal(saddr, &rt->rt6i_gateway)) { + if (rt->rt6i_flags & RTF_DEFAULT) { + struct rt6_info *rt1; + + read_lock(&rt6_lock); + for (rt1 = ip6_routing_table.leaf; rt1; rt1 = rt1->u.next) { + if (ipv6_addr_equal(saddr, &rt1->rt6i_gateway)) { + dst_hold(&rt1->u.dst); + dst_release(&rt->u.dst); + read_unlock(&rt6_lock); + rt = rt1; + goto source_ok; + } + } + read_unlock(&rt6_lock); + } + if (net_ratelimit()) + printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop " + "for redirect target\n"); + goto out; + } + +source_ok: + + /* + * We have finally decided to accept it. + */ + + neigh_update(neigh, lladdr, NUD_STALE, + NEIGH_UPDATE_F_WEAK_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE| + (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER| + NEIGH_UPDATE_F_ISROUTER)) + ); + + /* + * Redirect received -> path was valid. + * Look, redirects are sent only in response to data packets, + * so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->u.dst); + + /* Duplicate redirect: silently ignore. */ + if (neigh == rt->u.dst.neighbour) + goto out; + + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + + nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE; + if (on_link) + nrt->rt6i_flags &= ~RTF_GATEWAY; + + ipv6_addr_copy(&nrt->rt6i_dst.addr, dest); + nrt->rt6i_dst.plen = 128; + nrt->u.dst.flags |= DST_HOST; + + ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key); + nrt->rt6i_nexthop = neigh_clone(neigh); + /* Reset pmtu, it may be better */ + nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev); + nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst)); + + if (ip6_ins_rt(nrt, NULL, NULL)) + goto out; + + if (rt->rt6i_flags&RTF_CACHE) { + ip6_del_rt(rt, NULL, NULL); + return; + } + +out: + dst_release(&rt->u.dst); + return; +} + +/* + * Handle ICMP "packet too big" messages + * i.e. Path MTU discovery + */ + +void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr, + struct net_device *dev, u32 pmtu) +{ + struct rt6_info *rt, *nrt; + int allfrag = 0; + + rt = rt6_lookup(daddr, saddr, dev->ifindex, 0); + if (rt == NULL) + return; + + if (pmtu >= dst_mtu(&rt->u.dst)) + goto out; + + if (pmtu < IPV6_MIN_MTU) { + /* + * According to RFC2460, PMTU is set to the IPv6 Minimum Link + * MTU (1280) and a fragment header should always be included + * after a node receiving Too Big message reporting PMTU is + * less than the IPv6 Minimum Link MTU. + */ + pmtu = IPV6_MIN_MTU; + allfrag = 1; + } + + /* New mtu received -> path was valid. + They are sent only in response to data packets, + so that this nexthop apparently is reachable. --ANK + */ + dst_confirm(&rt->u.dst); + + /* Host route. If it is static, it would be better + not to override it, but add new one, so that + when cache entry will expire old pmtu + would return automatically. 
+ */ + if (rt->rt6i_flags & RTF_CACHE) { + rt->u.dst.metrics[RTAX_MTU-1] = pmtu; + if (allfrag) + rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + dst_set_expires(&rt->u.dst, ip6_rt_mtu_expires); + rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES; + goto out; + } + + /* Network route. + Two cases are possible: + 1. It is connected route. Action: COW + 2. It is gatewayed route or NONEXTHOP route. Action: clone it. + */ + if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP)) { + nrt = rt6_cow(rt, daddr, saddr); + if (!nrt->u.dst.error) { + nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; + if (allfrag) + nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + /* According to RFC 1981, detecting PMTU increase shouldn't be + happened within 5 mins, the recommended timer is 10 mins. + Here this route expiration time is set to ip6_rt_mtu_expires + which is 10 mins. After 10 mins the decreased pmtu is expired + and detecting PMTU increase will be automatically happened. + */ + dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES; + } + dst_release(&nrt->u.dst); + } else { + nrt = ip6_rt_copy(rt); + if (nrt == NULL) + goto out; + ipv6_addr_copy(&nrt->rt6i_dst.addr, daddr); + nrt->rt6i_dst.plen = 128; + nrt->u.dst.flags |= DST_HOST; + nrt->rt6i_nexthop = neigh_clone(rt->rt6i_nexthop); + dst_set_expires(&nrt->u.dst, ip6_rt_mtu_expires); + nrt->rt6i_flags |= RTF_DYNAMIC|RTF_CACHE|RTF_EXPIRES; + nrt->u.dst.metrics[RTAX_MTU-1] = pmtu; + if (allfrag) + nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG; + ip6_ins_rt(nrt, NULL, NULL); + } + +out: + dst_release(&rt->u.dst); +} + +/* + * Misc support functions + */ + +static struct rt6_info * ip6_rt_copy(struct rt6_info *ort) +{ + struct rt6_info *rt = ip6_dst_alloc(); + + if (rt) { + rt->u.dst.input = ort->u.dst.input; + rt->u.dst.output = ort->u.dst.output; + + memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); + rt->u.dst.dev = ort->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + rt->rt6i_idev = ort->rt6i_idev; + if (rt->rt6i_idev) + in6_dev_hold(rt->rt6i_idev); + rt->u.dst.lastuse = jiffies; + rt->rt6i_expires = 0; + + ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway); + rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES; + rt->rt6i_metric = 0; + + memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key)); +#ifdef CONFIG_IPV6_SUBTREES + memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key)); +#endif + } + return rt; +} + +struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev) +{ + struct rt6_info *rt; + struct fib6_node *fn; + + fn = &ip6_routing_table; + + write_lock_bh(&rt6_lock); + for (rt = fn->leaf; rt; rt=rt->u.next) { + if (dev == rt->rt6i_dev && + ipv6_addr_equal(&rt->rt6i_gateway, addr)) + break; + } + if (rt) + dst_hold(&rt->u.dst); + write_unlock_bh(&rt6_lock); + return rt; +} + +struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr, + struct net_device *dev) +{ + struct in6_rtmsg rtmsg; + + memset(&rtmsg, 0, sizeof(struct in6_rtmsg)); + rtmsg.rtmsg_type = RTMSG_NEWROUTE; + ipv6_addr_copy(&rtmsg.rtmsg_gateway, gwaddr); + rtmsg.rtmsg_metric = 1024; + rtmsg.rtmsg_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT | RTF_UP | RTF_EXPIRES; + + rtmsg.rtmsg_ifindex = dev->ifindex; + + ip6_route_add(&rtmsg, NULL, NULL); + return rt6_get_dflt_router(gwaddr, dev); +} + +void rt6_purge_dflt_routers(void) +{ + struct rt6_info *rt; + +restart: + read_lock_bh(&rt6_lock); + for (rt = ip6_routing_table.leaf; rt; rt = 
rt->u.next) { + if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) { + dst_hold(&rt->u.dst); + + rt6_reset_dflt_pointer(NULL); + + read_unlock_bh(&rt6_lock); + + ip6_del_rt(rt, NULL, NULL); + + goto restart; + } + } + read_unlock_bh(&rt6_lock); +} + +int ipv6_route_ioctl(unsigned int cmd, void __user *arg) +{ + struct in6_rtmsg rtmsg; + int err; + + switch(cmd) { + case SIOCADDRT: /* Add a route */ + case SIOCDELRT: /* Delete a route */ + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + err = copy_from_user(&rtmsg, arg, + sizeof(struct in6_rtmsg)); + if (err) + return -EFAULT; + + rtnl_lock(); + switch (cmd) { + case SIOCADDRT: + err = ip6_route_add(&rtmsg, NULL, NULL); + break; + case SIOCDELRT: + err = ip6_route_del(&rtmsg, NULL, NULL); + break; + default: + err = -EINVAL; + } + rtnl_unlock(); + + return err; + }; + + return -EINVAL; +} + +/* + * Drop the packet on the floor + */ + +int ip6_pkt_discard(struct sk_buff *skb) +{ + IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES); + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_NOROUTE, 0, skb->dev); + kfree_skb(skb); + return 0; +} + +int ip6_pkt_discard_out(struct sk_buff *skb) +{ + skb->dev = skb->dst->dev; + return ip6_pkt_discard(skb); +} + +/* + * Allocate a dst for local (unicast / anycast) address. + */ + +struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev, + const struct in6_addr *addr, + int anycast) +{ + struct rt6_info *rt = ip6_dst_alloc(); + + if (rt == NULL) + return ERR_PTR(-ENOMEM); + + dev_hold(&loopback_dev); + in6_dev_hold(idev); + + rt->u.dst.flags = DST_HOST; + rt->u.dst.input = ip6_input; + rt->u.dst.output = ip6_output; + rt->rt6i_dev = &loopback_dev; + rt->rt6i_idev = idev; + rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev); + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst)); + rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1; + rt->u.dst.obsolete = -1; + + rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP; + if (!anycast) + rt->rt6i_flags |= RTF_LOCAL; + rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway); + if (rt->rt6i_nexthop == NULL) { + dst_free((struct dst_entry *) rt); + return ERR_PTR(-ENOMEM); + } + + ipv6_addr_copy(&rt->rt6i_dst.addr, addr); + rt->rt6i_dst.plen = 128; + + atomic_set(&rt->u.dst.__refcnt, 1); + + return rt; +} + +static int fib6_ifdown(struct rt6_info *rt, void *arg) +{ + if (((void*)rt->rt6i_dev == arg || arg == NULL) && + rt != &ip6_null_entry) { + RT6_TRACE("deleted by ifdown %p\n", rt); + return -1; + } + return 0; +} + +void rt6_ifdown(struct net_device *dev) +{ + write_lock_bh(&rt6_lock); + fib6_clean_tree(&ip6_routing_table, fib6_ifdown, 0, dev); + write_unlock_bh(&rt6_lock); +} + +struct rt6_mtu_change_arg +{ + struct net_device *dev; + unsigned mtu; +}; + +static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg; + struct inet6_dev *idev; + + /* In IPv6 pmtu discovery is not optional, + so that RTAX_MTU lock cannot disable it. + We still use this lock to block changes + caused by addrconf/ndisc. + */ + + idev = __in6_dev_get(arg->dev); + if (idev == NULL) + return 0; + + /* For administrative MTU increase, there is no way to discover + IPv6 PMTU increase, so PMTU increase should be updated here. + Since RFC 1981 doesn't include administrative MTU increase + update PMTU increase is a MUST. (i.e. 
jumbo frame) + */ + /* + If new MTU is less than route PMTU, this new MTU will be the + lowest MTU in the path, update the route PMTU to reflect PMTU + decreases; if new MTU is greater than route PMTU, and the + old MTU is the lowest MTU in the path, update the route PMTU + to reflect the increase. In this case if the other nodes' MTU + also have the lowest MTU, TOO BIG MESSAGE will be lead to + PMTU discouvery. + */ + if (rt->rt6i_dev == arg->dev && + !dst_metric_locked(&rt->u.dst, RTAX_MTU) && + (dst_mtu(&rt->u.dst) > arg->mtu || + (dst_mtu(&rt->u.dst) < arg->mtu && + dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) + rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu; + rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu); + return 0; +} + +void rt6_mtu_change(struct net_device *dev, unsigned mtu) +{ + struct rt6_mtu_change_arg arg; + + arg.dev = dev; + arg.mtu = mtu; + read_lock_bh(&rt6_lock); + fib6_clean_tree(&ip6_routing_table, rt6_mtu_change_route, 0, &arg); + read_unlock_bh(&rt6_lock); +} + +static int inet6_rtm_to_rtmsg(struct rtmsg *r, struct rtattr **rta, + struct in6_rtmsg *rtmsg) +{ + memset(rtmsg, 0, sizeof(*rtmsg)); + + rtmsg->rtmsg_dst_len = r->rtm_dst_len; + rtmsg->rtmsg_src_len = r->rtm_src_len; + rtmsg->rtmsg_flags = RTF_UP; + if (r->rtm_type == RTN_UNREACHABLE) + rtmsg->rtmsg_flags |= RTF_REJECT; + + if (rta[RTA_GATEWAY-1]) { + if (rta[RTA_GATEWAY-1]->rta_len != RTA_LENGTH(16)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_gateway, RTA_DATA(rta[RTA_GATEWAY-1]), 16); + rtmsg->rtmsg_flags |= RTF_GATEWAY; + } + if (rta[RTA_DST-1]) { + if (RTA_PAYLOAD(rta[RTA_DST-1]) < ((r->rtm_dst_len+7)>>3)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_dst, RTA_DATA(rta[RTA_DST-1]), ((r->rtm_dst_len+7)>>3)); + } + if (rta[RTA_SRC-1]) { + if (RTA_PAYLOAD(rta[RTA_SRC-1]) < ((r->rtm_src_len+7)>>3)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_src, RTA_DATA(rta[RTA_SRC-1]), ((r->rtm_src_len+7)>>3)); + } + if (rta[RTA_OIF-1]) { + if (rta[RTA_OIF-1]->rta_len != RTA_LENGTH(sizeof(int))) + return -EINVAL; + memcpy(&rtmsg->rtmsg_ifindex, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + } + if (rta[RTA_PRIORITY-1]) { + if (rta[RTA_PRIORITY-1]->rta_len != RTA_LENGTH(4)) + return -EINVAL; + memcpy(&rtmsg->rtmsg_metric, RTA_DATA(rta[RTA_PRIORITY-1]), 4); + } + return 0; +} + +int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtmsg *r = NLMSG_DATA(nlh); + struct in6_rtmsg rtmsg; + + if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) + return -EINVAL; + return ip6_route_del(&rtmsg, nlh, arg); +} + +int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtmsg *r = NLMSG_DATA(nlh); + struct in6_rtmsg rtmsg; + + if (inet6_rtm_to_rtmsg(r, arg, &rtmsg)) + return -EINVAL; + return ip6_route_add(&rtmsg, nlh, arg); +} + +struct rt6_rtnl_dump_arg +{ + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt, + struct in6_addr *dst, + struct in6_addr *src, + int iif, + int type, u32 pid, u32 seq, + struct nlmsghdr *in_nlh, int prefix) +{ + struct rtmsg *rtm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rta_cacheinfo ci; + + if (prefix) { /* user wants prefix routes only */ + if (!(rt->rt6i_flags & RTF_PREFIX_RT)) { + /* success since this is not a prefix route */ + return 1; + } + } + + if (!pid && in_nlh) { + pid = in_nlh->nlmsg_pid; + } + + nlh = NLMSG_PUT(skb, pid, seq, type, sizeof(*rtm)); + rtm = NLMSG_DATA(nlh); + rtm->rtm_family = AF_INET6; + rtm->rtm_dst_len = rt->rt6i_dst.plen; + 
rtm->rtm_src_len = rt->rt6i_src.plen; + rtm->rtm_tos = 0; + rtm->rtm_table = RT_TABLE_MAIN; + if (rt->rt6i_flags&RTF_REJECT) + rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) + rtm->rtm_type = RTN_LOCAL; + else + rtm->rtm_type = RTN_UNICAST; + rtm->rtm_flags = 0; + rtm->rtm_scope = RT_SCOPE_UNIVERSE; + rtm->rtm_protocol = rt->rt6i_protocol; + if (rt->rt6i_flags&RTF_DYNAMIC) + rtm->rtm_protocol = RTPROT_REDIRECT; + else if (rt->rt6i_flags & RTF_ADDRCONF) + rtm->rtm_protocol = RTPROT_KERNEL; + else if (rt->rt6i_flags&RTF_DEFAULT) + rtm->rtm_protocol = RTPROT_RA; + + if (rt->rt6i_flags&RTF_CACHE) + rtm->rtm_flags |= RTM_F_CLONED; + + if (dst) { + RTA_PUT(skb, RTA_DST, 16, dst); + rtm->rtm_dst_len = 128; + } else if (rtm->rtm_dst_len) + RTA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr); +#ifdef CONFIG_IPV6_SUBTREES + if (src) { + RTA_PUT(skb, RTA_SRC, 16, src); + rtm->rtm_src_len = 128; + } else if (rtm->rtm_src_len) + RTA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr); +#endif + if (iif) + RTA_PUT(skb, RTA_IIF, 4, &iif); + else if (dst) { + struct in6_addr saddr_buf; + if (ipv6_get_saddr(&rt->u.dst, dst, &saddr_buf) == 0) + RTA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf); + } + if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) + goto rtattr_failure; + if (rt->u.dst.neighbour) + RTA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key); + if (rt->u.dst.dev) + RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->rt6i_dev->ifindex); + RTA_PUT(skb, RTA_PRIORITY, 4, &rt->rt6i_metric); + ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); + if (rt->rt6i_expires) + ci.rta_expires = jiffies_to_clock_t(rt->rt6i_expires - jiffies); + else + ci.rta_expires = 0; + ci.rta_used = rt->u.dst.__use; + ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); + ci.rta_error = rt->u.dst.error; + ci.rta_id = 0; + ci.rta_ts = 0; + ci.rta_tsage = 0; + RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int rt6_dump_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg; + int prefix; + + if (arg->cb->nlh->nlmsg_len >= NLMSG_LENGTH(sizeof(struct rtmsg))) { + struct rtmsg *rtm = NLMSG_DATA(arg->cb->nlh); + prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0; + } else + prefix = 0; + + return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE, + NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq, + NULL, prefix); +} + +static int fib6_dump_node(struct fib6_walker_t *w) +{ + int res; + struct rt6_info *rt; + + for (rt = w->leaf; rt; rt = rt->u.next) { + res = rt6_dump_route(rt, w->args); + if (res < 0) { + /* Frame is full, suspend walking */ + w->leaf = rt; + return 1; + } + BUG_TRAP(res!=0); + } + w->leaf = NULL; + return 0; +} + +static void fib6_dump_end(struct netlink_callback *cb) +{ + struct fib6_walker_t *w = (void*)cb->args[0]; + + if (w) { + cb->args[0] = 0; + fib6_walker_unlink(w); + kfree(w); + } + if (cb->args[1]) { + cb->done = (void*)cb->args[1]; + cb->args[1] = 0; + } +} + +static int fib6_dump_done(struct netlink_callback *cb) +{ + fib6_dump_end(cb); + return cb->done(cb); +} + +int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct rt6_rtnl_dump_arg arg; + struct fib6_walker_t *w; + int res; + + arg.skb = skb; + arg.cb = cb; + + w = (void*)cb->args[0]; + if (w == NULL) { + /* New dump: + * + * 1. hook callback destructor. 
+ */ + cb->args[1] = (long)cb->done; + cb->done = fib6_dump_done; + + /* + * 2. allocate and initialize walker. + */ + w = kmalloc(sizeof(*w), GFP_ATOMIC); + if (w == NULL) + return -ENOMEM; + RT6_TRACE("dump<%p", w); + memset(w, 0, sizeof(*w)); + w->root = &ip6_routing_table; + w->func = fib6_dump_node; + w->args = &arg; + cb->args[0] = (long)w; + read_lock_bh(&rt6_lock); + res = fib6_walk(w); + read_unlock_bh(&rt6_lock); + } else { + w->args = &arg; + read_lock_bh(&rt6_lock); + res = fib6_walk_continue(w); + read_unlock_bh(&rt6_lock); + } +#if RT6_DEBUG >= 3 + if (res <= 0 && skb->len == 0) + RT6_TRACE("%p>dump end\n", w); +#endif + res = res < 0 ? res : skb->len; + /* res < 0 is an error. (really, impossible) + res == 0 means that dump is complete, but skb still can contain data. + res > 0 dump is not complete, but frame is full. + */ + /* Destroy walker, if dump of this table is complete. */ + if (res <= 0) + fib6_dump_end(cb); + return res; +} + +int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) +{ + struct rtattr **rta = arg; + int iif = 0; + int err = -ENOBUFS; + struct sk_buff *skb; + struct flowi fl; + struct rt6_info *rt; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (skb == NULL) + goto out; + + /* Reserve room for dummy headers, this skb can pass + through good chunk of routing engine. + */ + skb->mac.raw = skb->data; + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr)); + + memset(&fl, 0, sizeof(fl)); + if (rta[RTA_SRC-1]) + ipv6_addr_copy(&fl.fl6_src, + (struct in6_addr*)RTA_DATA(rta[RTA_SRC-1])); + if (rta[RTA_DST-1]) + ipv6_addr_copy(&fl.fl6_dst, + (struct in6_addr*)RTA_DATA(rta[RTA_DST-1])); + + if (rta[RTA_IIF-1]) + memcpy(&iif, RTA_DATA(rta[RTA_IIF-1]), sizeof(int)); + + if (iif) { + struct net_device *dev; + dev = __dev_get_by_index(iif); + if (!dev) { + err = -ENODEV; + goto out_free; + } + } + + fl.oif = 0; + if (rta[RTA_OIF-1]) + memcpy(&fl.oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); + + rt = (struct rt6_info*)ip6_route_output(NULL, &fl); + + skb->dst = &rt->u.dst; + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + err = rt6_fill_node(skb, rt, + &fl.fl6_dst, &fl.fl6_src, + iif, + RTM_NEWROUTE, NETLINK_CB(in_skb).pid, + nlh->nlmsg_seq, nlh, 0); + if (err < 0) { + err = -EMSGSIZE; + goto out_free; + } + + err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); + if (err > 0) + err = 0; +out: + return err; +out_free: + kfree_skb(skb); + goto out; +} + +void inet6_rt_notify(int event, struct rt6_info *rt, struct nlmsghdr *nlh) +{ + struct sk_buff *skb; + int size = NLMSG_SPACE(sizeof(struct rtmsg)+256); + + skb = alloc_skb(size, gfp_any()); + if (!skb) { + netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, ENOBUFS); + return; + } + if (rt6_fill_node(skb, rt, NULL, NULL, 0, event, 0, 0, nlh, 0) < 0) { + kfree_skb(skb); + netlink_set_err(rtnl, 0, RTMGRP_IPV6_ROUTE, EINVAL); + return; + } + NETLINK_CB(skb).dst_groups = RTMGRP_IPV6_ROUTE; + netlink_broadcast(rtnl, skb, 0, RTMGRP_IPV6_ROUTE, gfp_any()); +} + +/* + * /proc + */ + +#ifdef CONFIG_PROC_FS + +#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1) + +struct rt6_proc_arg +{ + char *buffer; + int offset; + int length; + int skip; + int len; +}; + +static int rt6_info_route(struct rt6_info *rt, void *p_arg) +{ + struct rt6_proc_arg *arg = (struct rt6_proc_arg *) p_arg; + int i; + + if (arg->skip < arg->offset / RT6_INFO_LEN) { + arg->skip++; + return 0; + } + + if (arg->len >= arg->length) + return 0; + + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, 
"%02x", + rt->rt6i_dst.addr.s6_addr[i]); + arg->len += 2; + } + arg->len += sprintf(arg->buffer + arg->len, " %02x ", + rt->rt6i_dst.plen); + +#ifdef CONFIG_IPV6_SUBTREES + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_src.addr.s6_addr[i]); + arg->len += 2; + } + arg->len += sprintf(arg->buffer + arg->len, " %02x ", + rt->rt6i_src.plen); +#else + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000 00 "); + arg->len += 36; +#endif + + if (rt->rt6i_nexthop) { + for (i=0; i<16; i++) { + sprintf(arg->buffer + arg->len, "%02x", + rt->rt6i_nexthop->primary_key[i]); + arg->len += 2; + } + } else { + sprintf(arg->buffer + arg->len, + "00000000000000000000000000000000"); + arg->len += 32; + } + arg->len += sprintf(arg->buffer + arg->len, + " %08x %08x %08x %08x %8s\n", + rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt), + rt->u.dst.__use, rt->rt6i_flags, + rt->rt6i_dev ? rt->rt6i_dev->name : ""); + return 0; +} + +static int rt6_proc_info(char *buffer, char **start, off_t offset, int length) +{ + struct rt6_proc_arg arg; + arg.buffer = buffer; + arg.offset = offset; + arg.length = length; + arg.skip = 0; + arg.len = 0; + + read_lock_bh(&rt6_lock); + fib6_clean_tree(&ip6_routing_table, rt6_info_route, 0, &arg); + read_unlock_bh(&rt6_lock); + + *start = buffer; + if (offset) + *start += offset % RT6_INFO_LEN; + + arg.len -= offset % RT6_INFO_LEN; + + if (arg.len > length) + arg.len = length; + if (arg.len < 0) + arg.len = 0; + + return arg.len; +} + +extern struct rt6_statistics rt6_stats; + +static int rt6_stats_seq_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n", + rt6_stats.fib_nodes, rt6_stats.fib_route_nodes, + rt6_stats.fib_rt_alloc, rt6_stats.fib_rt_entries, + rt6_stats.fib_rt_cache, + atomic_read(&ip6_dst_ops.entries), + rt6_stats.fib_discarded_routes); + + return 0; +} + +static int rt6_stats_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt6_stats_seq_show, NULL); +} + +static struct file_operations rt6_stats_seq_fops = { + .owner = THIS_MODULE, + .open = rt6_stats_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif /* CONFIG_PROC_FS */ + +#ifdef CONFIG_SYSCTL + +static int flush_delay; + +static +int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + if (write) { + proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + fib6_run_gc(flush_delay <= 0 ? 
~0UL : (unsigned long)flush_delay); + return 0; + } else + return -EINVAL; +} + +ctl_table ipv6_route_table[] = { + { + .ctl_name = NET_IPV6_ROUTE_FLUSH, + .procname = "flush", + .data = &flush_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &ipv6_sysctl_rtcache_flush + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_THRESH, + .procname = "gc_thresh", + .data = &ip6_dst_ops.gc_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ROUTE_MAX_SIZE, + .procname = "max_size", + .data = &ip6_rt_max_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL, + .procname = "gc_min_interval", + .data = &ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_TIMEOUT, + .procname = "gc_timeout", + .data = &ip6_rt_gc_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_INTERVAL, + .procname = "gc_interval", + .data = &ip6_rt_gc_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_ELASTICITY, + .procname = "gc_elasticity", + .data = &ip6_rt_gc_elasticity, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_MTU_EXPIRES, + .procname = "mtu_expires", + .data = &ip6_rt_mtu_expires, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_MIN_ADVMSS, + .procname = "min_adv_mss", + .data = &ip6_rt_min_advmss, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS, + .procname = "gc_min_interval_ms", + .data = &ip6_rt_gc_min_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_ms_jiffies, + .strategy = &sysctl_ms_jiffies, + }, + { .ctl_name = 0 } +}; + +#endif + +void __init ip6_route_init(void) +{ + struct proc_dir_entry *p; + + ip6_dst_ops.kmem_cachep = kmem_cache_create("ip6_dst_cache", + sizeof(struct rt6_info), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!ip6_dst_ops.kmem_cachep) + panic("cannot create ip6_dst_cache"); + + fib6_init(); +#ifdef CONFIG_PROC_FS + p = proc_net_create("ipv6_route", 0, rt6_proc_info); + if (p) + p->owner = THIS_MODULE; + + proc_net_fops_create("rt6_stats", S_IRUGO, &rt6_stats_seq_fops); +#endif +#ifdef CONFIG_XFRM + xfrm6_init(); +#endif +} + +void ip6_route_cleanup(void) +{ +#ifdef CONFIG_PROC_FS + proc_net_remove("ipv6_route"); + proc_net_remove("rt6_stats"); +#endif +#ifdef CONFIG_XFRM + xfrm6_fini(); +#endif + rt6_ifdown(NULL); + fib6_gc_cleanup(); + kmem_cache_destroy(ip6_dst_ops.kmem_cachep); +} diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c new file mode 100644 index 000000000000..b788f55e139b --- /dev/null +++ b/net/ipv6/sit.c @@ -0,0 +1,833 @@ +/* + * IPv6 over IPv4 tunnel device - Simple Internet Transition (SIT) + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * Alexey Kuznetsov + * + * $Id: sit.c,v 1.53 2001/09/25 05:09:53 davem Exp $ + * + * This program is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Changes: + * Roger Venning : 6to4 support + * Nate Thompson : 6to4 support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + This version of net/ipv6/sit.c is cloned of net/ipv4/ip_gre.c + + For comments look at net/ipv4/ip_gre.c --ANK + */ + +#define HASH_SIZE 16 +#define HASH(addr) ((addr^(addr>>4))&0xF) + +static int ipip6_fb_tunnel_init(struct net_device *dev); +static int ipip6_tunnel_init(struct net_device *dev); +static void ipip6_tunnel_setup(struct net_device *dev); + +static struct net_device *ipip6_fb_tunnel_dev; + +static struct ip_tunnel *tunnels_r_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_r[HASH_SIZE]; +static struct ip_tunnel *tunnels_l[HASH_SIZE]; +static struct ip_tunnel *tunnels_wc[1]; +static struct ip_tunnel **tunnels[4] = { tunnels_wc, tunnels_l, tunnels_r, tunnels_r_l }; + +static DEFINE_RWLOCK(ipip6_lock); + +static struct ip_tunnel * ipip6_tunnel_lookup(u32 remote, u32 local) +{ + unsigned h0 = HASH(remote); + unsigned h1 = HASH(local); + struct ip_tunnel *t; + + for (t = tunnels_r_l[h0^h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && + remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_r[h0]; t; t = t->next) { + if (remote == t->parms.iph.daddr && (t->dev->flags&IFF_UP)) + return t; + } + for (t = tunnels_l[h1]; t; t = t->next) { + if (local == t->parms.iph.saddr && (t->dev->flags&IFF_UP)) + return t; + } + if ((t = tunnels_wc[0]) != NULL && (t->dev->flags&IFF_UP)) + return t; + return NULL; +} + +static struct ip_tunnel ** ipip6_bucket(struct ip_tunnel *t) +{ + u32 remote = t->parms.iph.daddr; + u32 local = t->parms.iph.saddr; + unsigned h = 0; + int prio = 0; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + return &tunnels[prio][h]; +} + +static void ipip6_tunnel_unlink(struct ip_tunnel *t) +{ + struct ip_tunnel **tp; + + for (tp = ipip6_bucket(t); *tp; tp = &(*tp)->next) { + if (t == *tp) { + write_lock_bh(&ipip6_lock); + *tp = t->next; + write_unlock_bh(&ipip6_lock); + break; + } + } +} + +static void ipip6_tunnel_link(struct ip_tunnel *t) +{ + struct ip_tunnel **tp = ipip6_bucket(t); + + t->next = *tp; + write_lock_bh(&ipip6_lock); + *tp = t; + write_unlock_bh(&ipip6_lock); +} + +static struct ip_tunnel * ipip6_tunnel_locate(struct ip_tunnel_parm *parms, int create) +{ + u32 remote = parms->iph.daddr; + u32 local = parms->iph.saddr; + struct ip_tunnel *t, **tp, *nt; + struct net_device *dev; + unsigned h = 0; + int prio = 0; + char name[IFNAMSIZ]; + + if (remote) { + prio |= 2; + h ^= HASH(remote); + } + if (local) { + prio |= 1; + h ^= HASH(local); + } + for (tp = &tunnels[prio][h]; (t = *tp) != NULL; tp = &t->next) { + if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) + return t; + } + if (!create) + goto failed; + + if (parms->name[0]) + strlcpy(name, parms->name, IFNAMSIZ); + else { + int i; + for (i=1; i<100; i++) { + sprintf(name, "sit%d", i); + if (__dev_get_by_name(name) == NULL) + break; + } + if (i==100) + goto failed; + } + + dev = alloc_netdev(sizeof(*t), name, 
ipip6_tunnel_setup); + if (dev == NULL) + return NULL; + + nt = dev->priv; + dev->init = ipip6_tunnel_init; + nt->parms = *parms; + + if (register_netdevice(dev) < 0) { + free_netdev(dev); + goto failed; + } + + dev_hold(dev); + + ipip6_tunnel_link(nt); + /* Do not decrement MOD_USE_COUNT here. */ + return nt; + +failed: + return NULL; +} + +static void ipip6_tunnel_uninit(struct net_device *dev) +{ + if (dev == ipip6_fb_tunnel_dev) { + write_lock_bh(&ipip6_lock); + tunnels_wc[0] = NULL; + write_unlock_bh(&ipip6_lock); + dev_put(dev); + } else { + ipip6_tunnel_unlink((struct ip_tunnel*)dev->priv); + dev_put(dev); + } +} + + +static void ipip6_err(struct sk_buff *skb, u32 info) +{ +#ifndef I_WISH_WORLD_WERE_PERFECT + +/* It is not :-( All the routers (except for Linux) return only + 8 bytes of packet payload. It means, that precise relaying of + ICMP in the real Internet is absolutely infeasible. + */ + struct iphdr *iph = (struct iphdr*)skb->data; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct ip_tunnel *t; + + switch (type) { + default: + case ICMP_PARAMETERPROB: + return; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Soft state for pmtu is maintained by IP core. */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe they are just ether pollution. --ANK + */ + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + break; + } + + read_lock(&ipip6_lock); + t = ipip6_tunnel_lookup(iph->daddr, iph->saddr); + if (t == NULL || t->parms.iph.daddr == 0) + goto out; + if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED) + goto out; + + if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO) + t->err_count++; + else + t->err_count = 1; + t->err_time = jiffies; +out: + read_unlock(&ipip6_lock); + return; +#else + struct iphdr *iph = (struct iphdr*)dp; + int hlen = iph->ihl<<2; + struct ipv6hdr *iph6; + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + int rel_type = 0; + int rel_code = 0; + int rel_info = 0; + struct sk_buff *skb2; + struct rt6_info *rt6i; + + if (len < hlen + sizeof(struct ipv6hdr)) + return; + iph6 = (struct ipv6hdr*)(dp + hlen); + + switch (type) { + default: + return; + case ICMP_PARAMETERPROB: + if (skb->h.icmph->un.gateway < hlen) + return; + + /* So... This guy found something strange INSIDE encapsulated + packet. Well, he is fool, but what can we do ? + */ + rel_type = ICMPV6_PARAMPROB; + rel_info = skb->h.icmph->un.gateway - hlen; + break; + + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_SR_FAILED: + case ICMP_PORT_UNREACH: + /* Impossible event. */ + return; + case ICMP_FRAG_NEEDED: + /* Too complicated case ... */ + return; + default: + /* All others are translated to HOST_UNREACH. + rfc2003 contains "deep thoughts" about NET_UNREACH, + I believe, it is just ether pollution. 
--ANK + */ + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + break; + } + break; + case ICMP_TIME_EXCEEDED: + if (code != ICMP_EXC_TTL) + return; + rel_type = ICMPV6_TIME_EXCEED; + rel_code = ICMPV6_EXC_HOPLIMIT; + break; + } + + /* Prepare fake skb to feed it to icmpv6_send */ + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + return; + dst_release(skb2->dst); + skb2->dst = NULL; + skb_pull(skb2, skb->data - (u8*)iph6); + skb2->nh.raw = skb2->data; + + /* Try to guess incoming interface */ + rt6i = rt6_lookup(&iph6->saddr, NULL, NULL, 0); + if (rt6i && rt6i->rt6i_dev) { + skb2->dev = rt6i->rt6i_dev; + + rt6i = rt6_lookup(&iph6->daddr, &iph6->saddr, NULL, 0); + + if (rt6i && rt6i->rt6i_dev && rt6i->rt6i_dev->type == ARPHRD_SIT) { + struct ip_tunnel * t = (struct ip_tunnel*)rt6i->rt6i_dev->priv; + if (rel_type == ICMPV6_TIME_EXCEED && t->parms.iph.ttl) { + rel_type = ICMPV6_DEST_UNREACH; + rel_code = ICMPV6_ADDR_UNREACH; + } + icmpv6_send(skb2, rel_type, rel_code, rel_info, skb2->dev); + } + } + kfree_skb(skb2); + return; +#endif +} + +static inline void ipip6_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb) +{ + if (INET_ECN_is_ce(iph->tos)) + IP6_ECN_set_ce(skb->nh.ipv6h); +} + +static int ipip6_rcv(struct sk_buff *skb) +{ + struct iphdr *iph; + struct ip_tunnel *tunnel; + + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + iph = skb->nh.iph; + + read_lock(&ipip6_lock); + if ((tunnel = ipip6_tunnel_lookup(iph->saddr, iph->daddr)) != NULL) { + secpath_reset(skb); + skb->mac.raw = skb->nh.raw; + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + skb->protocol = htons(ETH_P_IPV6); + skb->pkt_type = PACKET_HOST; + tunnel->stat.rx_packets++; + tunnel->stat.rx_bytes += skb->len; + skb->dev = tunnel->dev; + dst_release(skb->dst); + skb->dst = NULL; + nf_reset(skb); + ipip6_ecn_decapsulate(iph, skb); + netif_rx(skb); + read_unlock(&ipip6_lock); + return 0; + } + + icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PROT_UNREACH, 0); + kfree_skb(skb); + read_unlock(&ipip6_lock); +out: + return 0; +} + +/* Returns the embedded IPv4 address if the IPv6 address + comes from 6to4 (RFC 3056) addr space */ + +static inline u32 try_6to4(struct in6_addr *v6dst) +{ + u32 dst = 0; + + if (v6dst->s6_addr16[0] == htons(0x2002)) { + /* 6to4 v6 addr has 16 bits prefix, 32 v4addr, 16 SLA, ... */ + memcpy(&dst, &v6dst->s6_addr16[1], 4); + } + return dst; +} + +/* + * This function assumes it is being called from dev_queue_xmit() + * and that skb is filled properly by that function. 
+ */ + +static int ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ip_tunnel *tunnel = (struct ip_tunnel*)dev->priv; + struct net_device_stats *stats = &tunnel->stat; + struct iphdr *tiph = &tunnel->parms.iph; + struct ipv6hdr *iph6 = skb->nh.ipv6h; + u8 tos = tunnel->parms.iph.tos; + struct rtable *rt; /* Route to the other host */ + struct net_device *tdev; /* Device to other host */ + struct iphdr *iph; /* Our new IP header */ + int max_headroom; /* The extra header space needed */ + u32 dst = tiph->daddr; + int mtu; + struct in6_addr *addr6; + int addr_type; + + if (tunnel->recursion++) { + tunnel->stat.collisions++; + goto tx_error; + } + + if (skb->protocol != htons(ETH_P_IPV6)) + goto tx_error; + + if (!dst) + dst = try_6to4(&iph6->daddr); + + if (!dst) { + struct neighbour *neigh = NULL; + + if (skb->dst) + neigh = skb->dst->neighbour; + + if (neigh == NULL) { + if (net_ratelimit()) + printk(KERN_DEBUG "sit: nexthop == NULL\n"); + goto tx_error; + } + + addr6 = (struct in6_addr*)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); + + if (addr_type == IPV6_ADDR_ANY) { + addr6 = &skb->nh.ipv6h->daddr; + addr_type = ipv6_addr_type(addr6); + } + + if ((addr_type & IPV6_ADDR_COMPATv4) == 0) + goto tx_error_icmp; + + dst = addr6->s6_addr32[3]; + } + + { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; + if (ip_route_output_key(&rt, &fl)) { + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + } + if (rt->rt_type != RTN_UNICAST) { + ip_rt_put(rt); + tunnel->stat.tx_carrier_errors++; + goto tx_error_icmp; + } + tdev = rt->u.dst.dev; + + if (tdev == dev) { + ip_rt_put(rt); + tunnel->stat.collisions++; + goto tx_error; + } + + if (tiph->frag_off) + mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr); + else + mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu; + + if (mtu < 68) { + tunnel->stat.collisions++; + ip_rt_put(rt); + goto tx_error; + } + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + if (tunnel->parms.iph.daddr && skb->dst) + skb->dst->ops->update_pmtu(skb->dst, mtu); + + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev); + ip_rt_put(rt); + goto tx_error; + } + + if (tunnel->err_count > 0) { + if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) { + tunnel->err_count--; + dst_link_failure(skb); + } else + tunnel->err_count = 0; + } + + /* + * Okay, now see if we can stuff it in the buffer as-is. + */ + max_headroom = LL_RESERVED_SPACE(tdev)+sizeof(struct iphdr); + + if (skb_headroom(skb) < max_headroom || skb_cloned(skb) || skb_shared(skb)) { + struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); + if (!new_skb) { + ip_rt_put(rt); + stats->tx_dropped++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; + } + if (skb->sk) + skb_set_owner_w(new_skb, skb->sk); + dev_kfree_skb(skb); + skb = new_skb; + iph6 = skb->nh.ipv6h; + } + + skb->h.raw = skb->nh.raw; + skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); + memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); + dst_release(skb->dst); + skb->dst = &rt->u.dst; + + /* + * Push down and install the IPIP header. 
+ */ + + iph = skb->nh.iph; + iph->version = 4; + iph->ihl = sizeof(struct iphdr)>>2; + if (mtu > IPV6_MIN_MTU) + iph->frag_off = htons(IP_DF); + else + iph->frag_off = 0; + + iph->protocol = IPPROTO_IPV6; + iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6)); + iph->daddr = rt->rt_dst; + iph->saddr = rt->rt_src; + + if ((iph->ttl = tiph->ttl) == 0) + iph->ttl = iph6->hop_limit; + + nf_reset(skb); + + IPTUNNEL_XMIT(); + tunnel->recursion--; + return 0; + +tx_error_icmp: + dst_link_failure(skb); +tx_error: + stats->tx_errors++; + dev_kfree_skb(skb); + tunnel->recursion--; + return 0; +} + +static int +ipip6_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) +{ + int err = 0; + struct ip_tunnel_parm p; + struct ip_tunnel *t; + + switch (cmd) { + case SIOCGETTUNNEL: + t = NULL; + if (dev == ipip6_fb_tunnel_dev) { + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) { + err = -EFAULT; + break; + } + t = ipip6_tunnel_locate(&p, 0); + } + if (t == NULL) + t = (struct ip_tunnel*)dev->priv; + memcpy(&p, &t->parms, sizeof(p)); + if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p))) + err = -EFAULT; + break; + + case SIOCADDTUNNEL: + case SIOCCHGTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + + err = -EINVAL; + if (p.iph.version != 4 || p.iph.protocol != IPPROTO_IPV6 || + p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF))) + goto done; + if (p.iph.ttl) + p.iph.frag_off |= htons(IP_DF); + + t = ipip6_tunnel_locate(&p, cmd == SIOCADDTUNNEL); + + if (dev != ipip6_fb_tunnel_dev && cmd == SIOCCHGTUNNEL) { + if (t != NULL) { + if (t->dev != dev) { + err = -EEXIST; + break; + } + } else { + if (((dev->flags&IFF_POINTOPOINT) && !p.iph.daddr) || + (!(dev->flags&IFF_POINTOPOINT) && p.iph.daddr)) { + err = -EINVAL; + break; + } + t = (struct ip_tunnel*)dev->priv; + ipip6_tunnel_unlink(t); + t->parms.iph.saddr = p.iph.saddr; + t->parms.iph.daddr = p.iph.daddr; + memcpy(dev->dev_addr, &p.iph.saddr, 4); + memcpy(dev->broadcast, &p.iph.daddr, 4); + ipip6_tunnel_link(t); + netdev_state_change(dev); + } + } + + if (t) { + err = 0; + if (cmd == SIOCCHGTUNNEL) { + t->parms.iph.ttl = p.iph.ttl; + t->parms.iph.tos = p.iph.tos; + } + if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p))) + err = -EFAULT; + } else + err = (cmd == SIOCADDTUNNEL ? 
-ENOBUFS : -ENOENT); + break; + + case SIOCDELTUNNEL: + err = -EPERM; + if (!capable(CAP_NET_ADMIN)) + goto done; + + if (dev == ipip6_fb_tunnel_dev) { + err = -EFAULT; + if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) + goto done; + err = -ENOENT; + if ((t = ipip6_tunnel_locate(&p, 0)) == NULL) + goto done; + err = -EPERM; + if (t == ipip6_fb_tunnel_dev->priv) + goto done; + dev = t->dev; + } + err = unregister_netdevice(dev); + break; + + default: + err = -EINVAL; + } + +done: + return err; +} + +static struct net_device_stats *ipip6_tunnel_get_stats(struct net_device *dev) +{ + return &(((struct ip_tunnel*)dev->priv)->stat); +} + +static int ipip6_tunnel_change_mtu(struct net_device *dev, int new_mtu) +{ + if (new_mtu < IPV6_MIN_MTU || new_mtu > 0xFFF8 - sizeof(struct iphdr)) + return -EINVAL; + dev->mtu = new_mtu; + return 0; +} + +static void ipip6_tunnel_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->uninit = ipip6_tunnel_uninit; + dev->destructor = free_netdev; + dev->hard_start_xmit = ipip6_tunnel_xmit; + dev->get_stats = ipip6_tunnel_get_stats; + dev->do_ioctl = ipip6_tunnel_ioctl; + dev->change_mtu = ipip6_tunnel_change_mtu; + + dev->type = ARPHRD_SIT; + dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); + dev->mtu = 1500 - sizeof(struct iphdr); + dev->flags = IFF_NOARP; + dev->iflink = 0; + dev->addr_len = 4; +} + +static int ipip6_tunnel_init(struct net_device *dev) +{ + struct net_device *tdev = NULL; + struct ip_tunnel *tunnel; + struct iphdr *iph; + + tunnel = (struct ip_tunnel*)dev->priv; + iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4); + memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); + + if (iph->daddr) { + struct flowi fl = { .nl_u = { .ip4_u = + { .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) } }, + .oif = tunnel->parms.link, + .proto = IPPROTO_IPV6 }; + struct rtable *rt; + if (!ip_route_output_key(&rt, &fl)) { + tdev = rt->u.dst.dev; + ip_rt_put(rt); + } + dev->flags |= IFF_POINTOPOINT; + } + + if (!tdev && tunnel->parms.link) + tdev = __dev_get_by_index(tunnel->parms.link); + + if (tdev) { + dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); + dev->mtu = tdev->mtu - sizeof(struct iphdr); + if (dev->mtu < IPV6_MIN_MTU) + dev->mtu = IPV6_MIN_MTU; + } + dev->iflink = tunnel->parms.link; + + return 0; +} + +int __init ipip6_fb_tunnel_init(struct net_device *dev) +{ + struct ip_tunnel *tunnel = dev->priv; + struct iphdr *iph = &tunnel->parms.iph; + + tunnel->dev = dev; + strcpy(tunnel->parms.name, dev->name); + + iph->version = 4; + iph->protocol = IPPROTO_IPV6; + iph->ihl = 5; + iph->ttl = 64; + + dev_hold(dev); + tunnels_wc[0] = tunnel; + return 0; +} + +static struct net_protocol sit_protocol = { + .handler = ipip6_rcv, + .err_handler = ipip6_err, +}; + +void __exit sit_cleanup(void) +{ + inet_del_protocol(&sit_protocol, IPPROTO_IPV6); + unregister_netdev(ipip6_fb_tunnel_dev); +} + +int __init sit_init(void) +{ + int err; + + printk(KERN_INFO "IPv6 over IPv4 tunneling driver\n"); + + if (inet_add_protocol(&sit_protocol, IPPROTO_IPV6) < 0) { + printk(KERN_INFO "sit init: Can't add protocol\n"); + return -EAGAIN; + } + + ipip6_fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0", + ipip6_tunnel_setup); + if (!ipip6_fb_tunnel_dev) { + err = -ENOMEM; + goto err1; + } + + ipip6_fb_tunnel_dev->init = ipip6_fb_tunnel_init; + + if ((err = register_netdev(ipip6_fb_tunnel_dev))) + goto 
err2; + + out: + return err; + err2: + free_netdev(ipip6_fb_tunnel_dev); + err1: + inet_del_protocol(&sit_protocol, IPPROTO_IPV6); + goto out; +} diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c new file mode 100644 index 000000000000..3a18e0e6ffed --- /dev/null +++ b/net/ipv6/sysctl_net_ipv6.c @@ -0,0 +1,125 @@ +/* + * sysctl_net_ipv6.c: sysctl interface to net IPV6 subsystem. + * + * Changes: + * YOSHIFUJI Hideaki @USAGI: added icmp sysctl table. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +extern ctl_table ipv6_route_table[]; +extern ctl_table ipv6_icmp_table[]; + +#ifdef CONFIG_SYSCTL + +static ctl_table ipv6_table[] = { + { + .ctl_name = NET_IPV6_ROUTE, + .procname = "route", + .maxlen = 0, + .mode = 0555, + .child = ipv6_route_table + }, + { + .ctl_name = NET_IPV6_ICMP, + .procname = "icmp", + .maxlen = 0, + .mode = 0555, + .child = ipv6_icmp_table + }, + { + .ctl_name = NET_IPV6_BINDV6ONLY, + .procname = "bindv6only", + .data = &sysctl_ipv6_bindv6only, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_HIGH_THRESH, + .procname = "ip6frag_high_thresh", + .data = &sysctl_ip6frag_high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_LOW_THRESH, + .procname = "ip6frag_low_thresh", + .data = &sysctl_ip6frag_low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_IPV6_IP6FRAG_TIME, + .procname = "ip6frag_time", + .data = &sysctl_ip6frag_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies, + }, + { + .ctl_name = NET_IPV6_IP6FRAG_SECRET_INTERVAL, + .procname = "ip6frag_secret_interval", + .data = &sysctl_ip6frag_secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_jiffies, + .strategy = &sysctl_jiffies + }, + { + .ctl_name = NET_IPV6_MLD_MAX_MSF, + .procname = "mld_max_msf", + .data = &sysctl_mld_max_msf, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *ipv6_sysctl_header; + +static ctl_table ipv6_net_table[] = { + { + .ctl_name = NET_IPV6, + .procname = "ipv6", + .mode = 0555, + .child = ipv6_table + }, + { .ctl_name = 0 } +}; + +static ctl_table ipv6_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipv6_net_table + }, + { .ctl_name = 0 } +}; + +void ipv6_sysctl_register(void) +{ + ipv6_sysctl_header = register_sysctl_table(ipv6_root_table, 0); +} + +void ipv6_sysctl_unregister(void) +{ + unregister_sysctl_table(ipv6_sysctl_header); +} + +#endif /* CONFIG_SYSCTL */ + + + diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c new file mode 100644 index 000000000000..4760c85e19db --- /dev/null +++ b/net/ipv6/tcp_ipv6.c @@ -0,0 +1,2265 @@ +/* + * TCP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * $Id: tcp_ipv6.c,v 1.144 2002/02/01 22:01:04 davem Exp $ + * + * Based on: + * linux/net/ipv4/tcp.c + * linux/net/ipv4/tcp_input.c + * linux/net/ipv4/tcp_output.c + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/tcp6 to seq_file. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +static void tcp_v6_send_reset(struct sk_buff *skb); +static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req); +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb); + +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); +static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); + +static struct tcp_func ipv6_mapped; +static struct tcp_func ipv6_specific; + +/* I have no idea if this is a good hash for v6 or not. -DaveM */ +static __inline__ int tcp_v6_hashfn(struct in6_addr *laddr, u16 lport, + struct in6_addr *faddr, u16 fport) +{ + int hashent = (lport ^ fport); + + hashent ^= (laddr->s6_addr32[3] ^ faddr->s6_addr32[3]); + hashent ^= hashent>>16; + hashent ^= hashent>>8; + return (hashent & (tcp_ehash_size - 1)); +} + +static __inline__ int tcp_v6_sk_hashfn(struct sock *sk) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *laddr = &np->rcv_saddr; + struct in6_addr *faddr = &np->daddr; + __u16 lport = inet->num; + __u16 fport = inet->dport; + return tcp_v6_hashfn(laddr, lport, faddr, fport); +} + +static inline int tcp_v6_bind_conflict(struct sock *sk, + struct tcp_bind_bucket *tb) +{ + struct sock *sk2; + struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +/* Grrr, addr_type already calculated by caller, but I don't want + * to add some silly "cookie" argument to this method just for that. + * But it doesn't matter, the recalculation is in the rarest path + * this function ever takes. + */ +static int tcp_v6_get_port(struct sock *sk, unsigned short snum) +{ + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + struct hlist_node *node; + int ret; + + local_bh_disable(); + if (snum == 0) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + + spin_lock(&tcp_portalloc_lock); + rover = tcp_port_rover; + do { rover++; + if ((rover < low) || (rover > high)) + rover = low; + head = &tcp_bhash[tcp_bhashfn(rover)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == rover) + goto next; + break; + next: + spin_unlock(&head->lock); + } while (--remaining > 0); + tcp_port_rover = rover; + spin_unlock(&tcp_portalloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. 
*/ + snum = rover; + } else { + head = &tcp_bhash[tcp_bhashfn(snum)]; + spin_lock(&head->lock); + tb_for_each(tb, node, &head->chain) + if (tb->port == snum) + goto tb_found; + } + tb = NULL; + goto tb_not_found; +tb_found: + if (tb && !hlist_empty(&tb->owners)) { + if (tb->fastreuse > 0 && sk->sk_reuse && + sk->sk_state != TCP_LISTEN) { + goto success; + } else { + ret = 1; + if (tcp_v6_bind_conflict(sk, tb)) + goto fail_unlock; + } + } +tb_not_found: + ret = 1; + if (!tb && (tb = tcp_bucket_create(head, snum)) == NULL) + goto fail_unlock; + if (hlist_empty(&tb->owners)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) + tb->fastreuse = 1; + else + tb->fastreuse = 0; + } else if (tb->fastreuse && + (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) + tb->fastreuse = 0; + +success: + if (!tcp_sk(sk)->bind_hash) + tcp_bind_hash(sk, tb, snum); + BUG_TRAP(tcp_sk(sk)->bind_hash == tb); + ret = 0; + +fail_unlock: + spin_unlock(&head->lock); +fail: + local_bh_enable(); + return ret; +} + +static __inline__ void __tcp_v6_hash(struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + list = &tcp_listening_hash[tcp_sk_listen_hashfn(sk)]; + lock = &tcp_lhash_lock; + tcp_listen_wlock(); + } else { + sk->sk_hashent = tcp_v6_sk_hashfn(sk); + list = &tcp_ehash[sk->sk_hashent].chain; + lock = &tcp_ehash[sk->sk_hashent].lock; + write_lock(lock); + } + + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); +} + + +static void tcp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != TCP_CLOSE) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->af_specific == &ipv6_mapped) { + tcp_prot.hash(sk); + return; + } + local_bh_disable(); + __tcp_v6_hash(sk); + local_bh_enable(); + } +} + +static struct sock *tcp_v6_lookup_listener(struct in6_addr *daddr, unsigned short hnum, int dif) +{ + struct sock *sk; + struct hlist_node *node; + struct sock *result = NULL; + int score, hiscore; + + hiscore=0; + read_lock(&tcp_lhash_lock); + sk_for_each(sk, node, &tcp_listening_hash[tcp_lhashfn(hnum)]) { + if (inet_sk(sk)->num == hnum && sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + score = 1; + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score++; + } + if (score == 3) { + result = sk; + break; + } + if (score > hiscore) { + hiscore = score; + result = sk; + } + } + } + if (result) + sock_hold(result); + read_unlock(&tcp_lhash_lock); + return result; +} + +/* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so + * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM + * + * The sockhash lock must be held as a reader here. + */ + +static inline struct sock *__tcp_v6_lookup_established(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 hnum, + int dif) +{ + struct tcp_ehash_bucket *head; + struct sock *sk; + struct hlist_node *node; + __u32 ports = TCP_COMBINED_PORTS(sport, hnum); + int hash; + + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + hash = tcp_v6_hashfn(daddr, hnum, saddr, sport); + head = &tcp_ehash[hash]; + read_lock(&head->lock); + sk_for_each(sk, node, &head->chain) { + /* For IPV6 do the cheaper port and family tests first. */ + if(TCP_IPV6_MATCH(sk, saddr, daddr, ports, dif)) + goto hit; /* You sunk my battleship! 
*/ + } + /* Must check for a TIME_WAIT'er before going to listener hash. */ + sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + /* FIXME: acme: check this... */ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk->sk_family == PF_INET6) { + if(ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) + goto hit; + } + } + read_unlock(&head->lock); + return NULL; + +hit: + sock_hold(sk); + read_unlock(&head->lock); + return sk; +} + + +static inline struct sock *__tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 hnum, + int dif) +{ + struct sock *sk; + + sk = __tcp_v6_lookup_established(saddr, sport, daddr, hnum, dif); + + if (sk) + return sk; + + return tcp_v6_lookup_listener(daddr, hnum, dif); +} + +inline struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, + int dif) +{ + struct sock *sk; + + local_bh_disable(); + sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif); + local_bh_enable(); + + return sk; +} + +EXPORT_SYMBOL_GPL(tcp_v6_lookup); + + +/* + * Open request hash tables. + */ + +static u32 tcp_v6_synq_hash(struct in6_addr *raddr, u16 rport, u32 rnd) +{ + u32 a, b, c; + + a = raddr->s6_addr32[0]; + b = raddr->s6_addr32[1]; + c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32) rport; + __jhash_mix(a, b, c); + + return c & (TCP_SYNQ_HSIZE - 1); +} + +static struct open_request *tcp_v6_search_req(struct tcp_sock *tp, + struct open_request ***prevp, + __u16 rport, + struct in6_addr *raddr, + struct in6_addr *laddr, + int iif) +{ + struct tcp_listen_opt *lopt = tp->listen_opt; + struct open_request *req, **prev; + + for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + if (req->rmt_port == rport && + req->class->family == AF_INET6 && + ipv6_addr_equal(&req->af.v6_req.rmt_addr, raddr) && + ipv6_addr_equal(&req->af.v6_req.loc_addr, laddr) && + (!req->af.v6_req.iif || req->af.v6_req.iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base); +} + +static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + if (skb->protocol == htons(ETH_P_IPV6)) { + return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, + skb->nh.ipv6h->saddr.s6_addr32, + skb->h.th->dest, + skb->h.th->source); + } else { + return secure_tcp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + skb->h.th->dest, + skb->h.th->source); + } +} + +static int __tcp_v6_check_established(struct sock *sk, __u16 lport, + struct tcp_tw_bucket **twp) +{ + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct in6_addr *daddr = &np->rcv_saddr; + struct in6_addr *saddr = &np->daddr; + int dif = sk->sk_bound_dev_if; + u32 ports = TCP_COMBINED_PORTS(inet->dport, lport); + int hash = tcp_v6_hashfn(daddr, inet->num, saddr, inet->dport); + struct tcp_ehash_bucket *head = &tcp_ehash[hash]; + struct sock *sk2; + struct hlist_node *node; + struct tcp_tw_bucket *tw; + + write_lock(&head->lock); + + /* Check 
TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { + tw = (struct tcp_tw_bucket*)sk2; + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tw->tw_ts_recent_stamp && + (!twp || (sysctl_tcp_tw_reuse && + xtime.tv_sec - + tw->tw_ts_recent_stamp > 1))) { + /* See comment in tcp_ipv4.c */ + tp->write_seq = tw->tw_snd_nxt + 65535 + 2; + if (!tp->write_seq) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp; + sock_hold(sk2); + goto unique; + } else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if(TCP_IPV6_MATCH(sk2, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hashent = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + tcp_tw_deschedule(tw); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + tcp_tw_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 tcpv6_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + + return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +static int tcp_v6_hash_connect(struct sock *sk) +{ + unsigned short snum = inet_sk(sk)->num; + struct tcp_bind_hashbucket *head; + struct tcp_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + tcpv6_port_offset(sk); + struct hlist_node *node; + struct tcp_tw_bucket *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &tcp_bhash[tcp_bhashfn(port)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + tb_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__tcp_v6_check_established(sk, + port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = tcp_bucket_create(head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + tcp_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __tcp_v6_hash(sk); + } + spin_unlock(&head->lock); + + if (tw) { + tcp_tw_deschedule(tw); + tcp_tw_put(tw); + } + + ret = 0; + goto out; + } + + head = &tcp_bhash[tcp_bhashfn(snum)]; + tb = tcp_sk(sk)->bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __tcp_v6_hash(sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... 
Walk to established hash table */ + ret = __tcp_v6_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +static __inline__ int tcp_v6_iif(struct sk_buff *skb) +{ + return IP6CB(skb)->iif; +} + +static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct flowi fl; + struct dst_entry *dst; + int addr_type; + int err; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return(-EAFNOSUPPORT); + + memset(&fl, 0, sizeof(fl)); + + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo&IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl.fl6_flowlabel); + if (fl.fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if(ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type&IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + if (tp->rx_opt.ts_recent_stamp && + !ipv6_addr_equal(&np->daddr, &usin->sin6_addr)) { + tp->rx_opt.ts_recent = 0; + tp->rx_opt.ts_recent_stamp = 0; + tp->write_seq = 0; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl.fl6_flowlabel; + + /* + * TCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = tp->ext_header_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + tp->af_specific = &ipv6_mapped; + sk->sk_backlog_rcv = tcp_v4_do_rcv; + + err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + tp->ext_header_len = exthdrlen; + tp->af_specific = &ipv6_specific; + sk->sk_backlog_rcv = tcp_v6_do_rcv; + goto failure; + } else { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), + inet->saddr); + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), + inet->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, + (saddr ? 
saddr : &np->saddr)); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto failure; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + dst_release(dst); + goto failure; + } + + if (saddr == NULL) { + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->rcv_saddr = LOOPBACK4_IPV6; + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + tp->ext_header_len = 0; + if (np->opt) + tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + + tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + + inet->dport = usin->sin6_port; + + tcp_set_state(sk, TCP_SYN_SENT); + err = tcp_v6_hash_connect(sk); + if (err) + goto late_failure; + + if (!tp->write_seq) + tp->write_seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->sport, + inet->dport); + + err = tcp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + tcp_set_state(sk, TCP_CLOSE); + __sk_dst_reset(sk); +failure: + inet->dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; + struct tcphdr *th = (struct tcphdr *)(skb->data+offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + struct tcp_sock *tp; + __u32 seq; + + sk = tcp_v6_lookup(&hdr->daddr, th->dest, &hdr->saddr, th->source, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == TCP_TIME_WAIT) { + tcp_tw_put((struct tcp_tw_bucket*)sk); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == TCP_CLOSE) + goto out; + + tp = tcp_sk(sk); + seq = ntohl(th->seq); + if (sk->sk_state != TCP_LISTEN && + !between(seq, tp->snd_una, tp->snd_nxt)) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi fl; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. 
+ */ + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->sk_err_soft = -err; + goto out; + } + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + goto out; + } + + } else + dst_hold(dst); + + if (tp->pmtu_cookie > dst_mtu(dst)) { + tcp_sync_mss(sk, dst_mtu(dst)); + tcp_simple_retransmit(sk); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + /* Might be for an open_request */ + switch (sk->sk_state) { + struct open_request *req, **prev; + case TCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = tcp_v6_search_req(tp, &prev, th->dest, &hdr->daddr, + &hdr->saddr, tcp_v6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + BUG_TRAP(req->sk == NULL); + + if (seq != req->snt_isn) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + tcp_synq_drop(sk, req, prev); + goto out; + + case TCP_SYN_SENT: + case TCP_SYN_RECV: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + sk->sk_error_report(sk); /* Wake people up to see the error (see connect in sock.c) */ + + tcp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int tcp_v6_send_synack(struct sock *sk, struct open_request *req, + struct dst_entry *dst) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff * skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr * final_p = NULL, final; + struct flowi fl; + int err = -1; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); + fl.fl6_flowlabel = 0; + fl.oif = req->af.v6_req.iif; + fl.fl_ip_dport = req->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (dst == NULL) { + opt = np->opt; + if (opt == NULL && + np->rxopt.bits.srcrt == 2 && + req->af.v6_req.pktopts) { + struct sk_buff *pktopts = req->af.v6_req.pktopts; + struct inet6_skb_parm *rxopt = IP6CB(pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(pktopts->nh.raw + rxopt->srcrt)); + } + + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto done; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto done; + } + + skb = tcp_make_synack(sk, dst, req); + if (skb) { + struct tcphdr *th = skb->h.th; + + th->check = tcp_v6_check(th, skb->len, + &req->af.v6_req.loc_addr, &req->af.v6_req.rmt_addr, + csum_partial((char *)th, skb->len, skb->csum)); + + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + err = ip6_xmit(sk, skb, &fl, opt, 0); + if (err == NET_XMIT_CN) + err = 0; + } + +done: + dst_release(dst); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return err; +} + 
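The SYN-ACK path above ends by filling th->check with tcp_v6_check(), i.e. csum_ipv6_magic() over the IPv6 pseudo-header combined with csum_partial() over the TCP segment, and tcp_v6_send_check() further down does the same for regular transmits. As a rough, standalone illustration of what that pair computes, here is a user-space sketch of the RFC 2460 pseudo-header checksum. The helper names (sum16, tcp6_checksum) and the portable byte-wise summation are assumptions of this sketch, not code from the patch; the kernel uses per-architecture optimized versions of these routines.

/*
 * Illustrative sketch only (not kernel code): one's-complement checksum
 * over the IPv6 pseudo-header (RFC 2460, section 8.1) plus the TCP segment.
 */
#include <stdint.h>
#include <stddef.h>
#include <netinet/in.h>
#include <arpa/inet.h>

/* Accumulate 16-bit big-endian words into a 32-bit one's-complement sum. */
static uint32_t sum16(const void *data, size_t len, uint32_t sum)
{
	const uint8_t *p = data;

	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte, zero-padded on the right */
		sum += (uint32_t)p[0] << 8;
	return sum;
}

uint16_t tcp6_checksum(const struct in6_addr *saddr,
		       const struct in6_addr *daddr,
		       const void *tcp_segment, uint32_t tcp_len)
{
	uint32_t sum = 0;
	uint32_t len_be = htonl(tcp_len);		/* upper-layer packet length */
	uint32_t nexthdr_be = htonl(IPPROTO_TCP);	/* 3 zero bytes + next header (6) */

	/* Pseudo-header: source address, destination address,
	 * length, zero padding and next-header byte. */
	sum = sum16(saddr, sizeof(*saddr), sum);
	sum = sum16(daddr, sizeof(*daddr), sum);
	sum = sum16(&len_be, sizeof(len_be), sum);
	sum = sum16(&nexthdr_be, sizeof(nexthdr_be), sum);

	/* TCP header (with its checksum field zeroed) plus payload. */
	sum = sum16(tcp_segment, tcp_len, sum);

	/* Fold the carries and take the one's complement. */
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

With the TCP header's check field zeroed before summing, htons(tcp6_checksum(...)) is the value that would end up on the wire in that field, matching what the csum_partial()/csum_ipv6_magic() combination above produces in-kernel.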
+static void tcp_v6_or_free(struct open_request *req) +{ + if (req->af.v6_req.pktopts) + kfree_skb(req->af.v6_req.pktopts); +} + +static struct or_calltable or_ipv6 = { + .family = AF_INET6, + .rtx_syn_ack = tcp_v6_send_synack, + .send_ack = tcp_v6_or_send_ack, + .destructor = tcp_v6_or_free, + .send_reset = tcp_v6_send_reset +}; + +static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && np->rxopt.bits.hopopts) || + ((IPV6_FLOWINFO_MASK&*(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && np->rxopt.bits.srcrt) || + ((opt->dst1 || opt->dst0) && np->rxopt.bits.dstopts)) + return 1; + } + return 0; +} + + +static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, + struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + + if (skb->ip_summed == CHECKSUM_HW) { + th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); + skb->csum = offsetof(struct tcphdr, check); + } else { + th->check = csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, + csum_partial((char *)th, th->doff<<2, + skb->csum)); + } +} + + +static void tcp_v6_send_reset(struct sk_buff *skb) +{ + struct tcphdr *th = skb->h.th, *t1; + struct sk_buff *buff; + struct flowi fl; + + if (th->rst) + return; + + if (!ipv6_unicast_destination(skb)) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + sizeof(struct tcphdr), + GFP_ATOMIC); + if (buff == NULL) + return; + + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + sizeof(struct tcphdr)); + + t1 = (struct tcphdr *) skb_push(buff,sizeof(struct tcphdr)); + + /* Swap the send and the receive. */ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = sizeof(*t1)/4; + t1->rst = 1; + + if(th->ack) { + t1->seq = th->ack_seq; + } else { + t1->ack = 1; + t1->ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin + + skb->len - (th->doff<<2)); + } + + buff->csum = csum_partial((char *)t1, sizeof(*t1), 0); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); + + t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + sizeof(*t1), IPPROTO_TCP, + buff->csum); + + fl.proto = IPPROTO_TCP; + fl.oif = tcp_v6_iif(skb); + fl.fl_ip_dport = t1->dest; + fl.fl_ip_sport = t1->source; + + /* sk = NULL, but it is safe for now. RST socket required. */ + if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { + + if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) { + dst_release(buff->dst); + return; + } + + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + TCP_INC_STATS_BH(TCP_MIB_OUTRSTS); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_send_ack(struct sk_buff *skb, u32 seq, u32 ack, u32 win, u32 ts) +{ + struct tcphdr *th = skb->h.th, *t1; + struct sk_buff *buff; + struct flowi fl; + int tot_len = sizeof(struct tcphdr); + + if (ts) + tot_len += 3*4; + + buff = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + tot_len, + GFP_ATOMIC); + if (buff == NULL) + return; + + skb_reserve(buff, MAX_HEADER + sizeof(struct ipv6hdr) + tot_len); + + t1 = (struct tcphdr *) skb_push(buff,tot_len); + + /* Swap the send and the receive. 
*/ + memset(t1, 0, sizeof(*t1)); + t1->dest = th->source; + t1->source = th->dest; + t1->doff = tot_len/4; + t1->seq = htonl(seq); + t1->ack_seq = htonl(ack); + t1->ack = 1; + t1->window = htons(win); + + if (ts) { + u32 *ptr = (u32*)(t1 + 1); + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tcp_time_stamp); + *ptr = htonl(ts); + } + + buff->csum = csum_partial((char *)t1, tot_len, 0); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &skb->nh.ipv6h->daddr); + + t1->check = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + tot_len, IPPROTO_TCP, + buff->csum); + + fl.proto = IPPROTO_TCP; + fl.oif = tcp_v6_iif(skb); + fl.fl_ip_dport = t1->dest; + fl.fl_ip_sport = t1->source; + + if (!ip6_dst_lookup(NULL, &buff->dst, &fl)) { + if ((xfrm_lookup(&buff->dst, &fl, NULL, 0)) < 0) { + dst_release(buff->dst); + return; + } + ip6_xmit(NULL, buff, &fl, NULL, 0); + TCP_INC_STATS_BH(TCP_MIB_OUTSEGS); + return; + } + + kfree_skb(buff); +} + +static void tcp_v6_timewait_ack(struct sock *sk, struct sk_buff *skb) +{ + struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk; + + tcp_v6_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt, + tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent); + + tcp_tw_put(tw); +} + +static void tcp_v6_or_send_ack(struct sk_buff *skb, struct open_request *req) +{ + tcp_v6_send_ack(skb, req->snt_isn+1, req->rcv_isn+1, req->rcv_wnd, req->ts_recent); +} + + +static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + struct open_request *req, **prev; + struct tcphdr *th = skb->h.th; + struct tcp_sock *tp = tcp_sk(sk); + struct sock *nsk; + + /* Find possible connection requests. */ + req = tcp_v6_search_req(tp, &prev, th->source, &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, tcp_v6_iif(skb)); + if (req) + return tcp_check_req(sk, skb, req, prev); + + nsk = __tcp_v6_lookup_established(&skb->nh.ipv6h->saddr, + th->source, + &skb->nh.ipv6h->daddr, + ntohs(th->dest), + tcp_v6_iif(skb)); + + if (nsk) { + if (nsk->sk_state != TCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + tcp_tw_put((struct tcp_tw_bucket*)nsk); + return NULL; + } + +#if 0 /*def CONFIG_SYN_COOKIES*/ + if (!th->rst && !th->syn && th->ack) + sk = cookie_v6_check(sk, skb, &(IPCB(skb)->opt)); +#endif + return sk; +} + +static void tcp_v6_synq_add(struct sock *sk, struct open_request *req) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct tcp_listen_opt *lopt = tp->listen_opt; + u32 h = tcp_v6_synq_hash(&req->af.v6_req.rmt_addr, req->rmt_port, lopt->hash_rnd); + + req->sk = NULL; + req->expires = jiffies + TCP_TIMEOUT_INIT; + req->retrans = 0; + req->dl_next = lopt->syn_table[h]; + + write_lock(&tp->syn_wait_lock); + lopt->syn_table[h] = req; + write_unlock(&tp->syn_wait_lock); + + tcp_synq_added(sk); +} + + +/* FIXME: this is substantially similar to the ipv4 code. + * Can some kind of merge be done? -- erics + */ +static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_options_received tmp_opt; + struct tcp_sock *tp = tcp_sk(sk); + struct open_request *req = NULL; + __u32 isn = TCP_SKB_CB(skb)->when; + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + /* + * There are no SYN attacks on IPv6, yet... 
+ */ + if (tcp_synq_is_full(sk) && !isn) { + if (net_ratelimit()) + printk(KERN_INFO "TCPv6: dropping request, synflood is possible\n"); + goto drop; + } + + if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1) + goto drop; + + req = tcp_openreq_alloc(); + if (req == NULL) + goto drop; + + tcp_clear_options(&tmp_opt); + tmp_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); + tmp_opt.user_mss = tp->rx_opt.user_mss; + + tcp_parse_options(skb, &tmp_opt, 0); + + tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; + tcp_openreq_init(req, &tmp_opt, skb); + + req->class = &or_ipv6; + ipv6_addr_copy(&req->af.v6_req.rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&req->af.v6_req.loc_addr, &skb->nh.ipv6h->daddr); + TCP_ECN_create_request(req, skb->h.th); + req->af.v6_req.pktopts = NULL; + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || + np->rxopt.bits.rxhlim) { + atomic_inc(&skb->users); + req->af.v6_req.pktopts = skb; + } + req->af.v6_req.iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&req->af.v6_req.rmt_addr) & IPV6_ADDR_LINKLOCAL) + req->af.v6_req.iif = tcp_v6_iif(skb); + + if (isn == 0) + isn = tcp_v6_init_sequence(sk,skb); + + req->snt_isn = isn; + + if (tcp_v6_send_synack(sk, req, NULL)) + goto drop; + + tcp_v6_synq_add(sk, req); + + return 0; + +drop: + if (req) + tcp_openreq_free(req); + + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + return 0; /* don't send reset */ +} + +static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, + struct open_request *req, + struct dst_entry *dst) +{ + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct tcp6_sock *newtcp6sk; + struct inet_sock *newinet; + struct tcp_sock *newtp; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = tcp_v4_syn_recv_sock(sk, skb, req, dst); + + if (newsk == NULL) + return NULL; + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + newtp = tcp_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + newinet->daddr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + newinet->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + newtp->af_specific = &ipv6_mapped; + newsk->sk_backlog_rcv = tcp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Charge newly allocated IPv6 socket. Though it is mapped, + * it is IPv6 yet. + */ +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); +#endif + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 af_tcp.af_specific. + Sync it now. 
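On the ipv6_addr_set() calls above: they synthesize IPv4-mapped addresses of the form ::ffff:a.b.c.d (RFC 2373) so the accepted IPv4 connection can live behind an AF_INET6 socket. A small self-contained illustration of the same mapping (userspace sketch, helper name made up):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>
	#include <arpa/inet.h>

	// Build ::ffff:a.b.c.d from an IPv4 address given in network byte order.
	static void map_v4_to_v6(struct in6_addr *v6, uint32_t v4_be)
	{
		memset(v6, 0, sizeof(*v6));
		v6->s6_addr[10] = 0xff;
		v6->s6_addr[11] = 0xff;
		memcpy(&v6->s6_addr[12], &v4_be, 4);
	}

	int main(void)
	{
		struct in6_addr a;
		char buf[INET6_ADDRSTRLEN];

		map_v4_to_v6(&a, inet_addr("192.0.2.1"));
		printf("%s\n", inet_ntop(AF_INET6, &a, buf, sizeof(buf)));
		return 0;               // prints ::ffff:192.0.2.1
	}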
+ */ + tcp_sync_mss(newsk, newtp->pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (np->rxopt.bits.srcrt == 2 && + opt == NULL && req->af.v6_req.pktopts) { + struct inet6_skb_parm *rxopt = IP6CB(req->af.v6_req.pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, (struct ipv6_rt_hdr*)(req->af.v6_req.pktopts->nh.raw+rxopt->srcrt)); + } + + if (dst == NULL) { + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &req->af.v6_req.rmt_addr); + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + ipv6_addr_copy(&fl.fl6_src, &req->af.v6_req.loc_addr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = req->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out; + } + + newsk = tcp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out; + + /* Charge newly allocated IPv6 socket */ +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); +#endif + + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + newtcp6sk = (struct tcp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newtcp6sk->inet6; + + newtp = tcp_sk(newsk); + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &req->af.v6_req.rmt_addr); + ipv6_addr_copy(&newnp->saddr, &req->af.v6_req.loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &req->af.v6_req.loc_addr); + newsk->sk_bound_dev_if = req->af.v6_req.iif; + + /* Now IPv6 options... + + First: no IPv4 options. + */ + newinet->opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (req->af.v6_req.pktopts) { + newnp->pktoptions = skb_clone(req->af.v6_req.pktopts, + GFP_ATOMIC); + kfree_skb(req->af.v6_req.pktopts); + req->af.v6_req.pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = tcp_v6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. 
+ */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + newtp->ext_header_len = 0; + if (newnp->opt) + newtp->ext_header_len = newnp->opt->opt_nflen + + newnp->opt->opt_flen; + + tcp_sync_mss(newsk, dst_mtu(dst)); + newtp->advmss = dst_metric(dst, RTAX_ADVMSS); + tcp_initialize_rcv_mss(newsk); + + newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + + __tcp_v6_hash(newsk); + tcp_inherit_port(sk, newsk); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +out: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return NULL; +} + +static int tcp_v6_checksum_init(struct sk_buff *skb) +{ + if (skb->ip_summed == CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (!tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,skb->csum)) + return 0; + LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v6 csum failed\n")); + } + if (skb->len <= 76) { + if (tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,skb_checksum(skb, 0, skb->len, 0))) + return -1; + skb->ip_summed = CHECKSUM_UNNECESSARY; + } else { + skb->csum = ~tcp_v6_check(skb->h.th,skb->len,&skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr,0); + } + return 0; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct tcp_sock *tp; + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, tcp_rcv_established and rcv_established + handle them correctly, but it is not case with + tcp_v6_hnd_req and tcp_v6_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return tcp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb, 0)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ + TCP_CHECK_TIMER(sk); + if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; + return 0; + } + + if (skb->len < (skb->h.th->doff<<2) || tcp_checksum_complete(skb)) + goto csum_err; + + if (sk->sk_state == TCP_LISTEN) { + struct sock *nsk = tcp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. 
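The opt_skb cloning above, and the np->pktoptions latching it feeds further down, is what the RFC 2292-era IPV6_PKTOPTIONS socket option exposes for TCP: the ancillary data of the most recently received segment can be read back with getsockopt(). A rough userspace sketch under that assumption (constant availability varies; IPV6_PKTOPTIONS may only be visible via <linux/in6.h>, and later RFC 3542 kernels use the IPV6_RECV* names to request the data):

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>

	// Request hop-limit reporting, then read back the latched control data
	// and return the hop limit seen on the last segment (or -1).
	static int last_segment_hoplimit(int fd, unsigned char *buf, socklen_t len)
	{
		int on = 1;
		struct msghdr msg;
		struct cmsghdr *cm;

		setsockopt(fd, IPPROTO_IPV6, IPV6_HOPLIMIT, &on, sizeof(on));
		if (getsockopt(fd, IPPROTO_IPV6, IPV6_PKTOPTIONS, buf, &len) < 0)
			return -1;

		memset(&msg, 0, sizeof(msg));
		msg.msg_control = buf;          // cmsg-formatted blob from the kernel
		msg.msg_controllen = len;
		for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
			if (cm->cmsg_level == IPPROTO_IPV6 &&
			    cm->cmsg_type == IPV6_HOPLIMIT)
				return *(int *)CMSG_DATA(cm);
		return -1;
	}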
+ */ + if(nsk != sk) { + if (tcp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } + + TCP_CHECK_TIMER(sk); + if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len)) + goto reset; + TCP_CHECK_TIMER(sk); + if (opt_skb) + goto ipv6_pktoptions; + return 0; + +reset: + tcp_v6_send_reset(skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +csum_err: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + goto discard; + + +ipv6_pktoptions: + /* Do you ask, what is it? + + 1. skb was enqueued by tcp. + 2. skb is added to tail of read queue, rather than out of order. + 3. socket is not in passive state. + 4. Finally, it really contains options, which user wants to receive. + */ + tp = tcp_sk(sk); + if (TCP_SKB_CB(opt_skb)->end_seq == tp->rcv_nxt && + !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { + if (np->rxopt.bits.rxinfo) + np->mcast_oif = tcp_v6_iif(opt_skb); + if (np->rxopt.bits.rxhlim) + np->mcast_hops = opt_skb->nh.ipv6h->hop_limit; + if (ipv6_opt_accepted(sk, opt_skb)) { + skb_set_owner_r(opt_skb, sk); + opt_skb = xchg(&np->pktoptions, opt_skb); + } else { + __kfree_skb(opt_skb); + opt_skb = xchg(&np->pktoptions, NULL); + } + } + + if (opt_skb) + kfree_skb(opt_skb); + return 0; +} + +static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct tcphdr *th; + struct sock *sk; + int ret; + + if (skb->pkt_type != PACKET_HOST) + goto discard_it; + + /* + * Count it even if it's bad. + */ + TCP_INC_STATS_BH(TCP_MIB_INSEGS); + + if (!pskb_may_pull(skb, sizeof(struct tcphdr))) + goto discard_it; + + th = skb->h.th; + + if (th->doff < sizeof(struct tcphdr)/4) + goto bad_packet; + if (!pskb_may_pull(skb, th->doff*4)) + goto discard_it; + + if ((skb->ip_summed != CHECKSUM_UNNECESSARY && + tcp_v6_checksum_init(skb) < 0)) + goto bad_packet; + + th = skb->h.th; + TCP_SKB_CB(skb)->seq = ntohl(th->seq); + TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin + + skb->len - th->doff*4); + TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq); + TCP_SKB_CB(skb)->when = 0; + TCP_SKB_CB(skb)->flags = ipv6_get_dsfield(skb->nh.ipv6h); + TCP_SKB_CB(skb)->sacked = 0; + + sk = __tcp_v6_lookup(&skb->nh.ipv6h->saddr, th->source, + &skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + + if (!sk) + goto no_tcp_socket; + +process: + if (sk->sk_state == TCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + ret = 0; + if (!sock_owned_by_user(sk)) { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + return ret ? 
-1 : 0; + +no_tcp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { +bad_packet: + TCP_INC_STATS_BH(TCP_MIB_INERRS); + } else { + tcp_v6_send_reset(skb); + } + +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) { + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + + if (skb->len < (th->doff<<2) || tcp_checksum_complete(skb)) { + TCP_INC_STATS_BH(TCP_MIB_INERRS); + tcp_tw_put((struct tcp_tw_bucket *) sk); + goto discard_it; + } + + switch(tcp_timewait_state_process((struct tcp_tw_bucket *)sk, + skb, th, skb->len)) { + case TCP_TW_SYN: + { + struct sock *sk2; + + sk2 = tcp_v6_lookup_listener(&skb->nh.ipv6h->daddr, ntohs(th->dest), tcp_v6_iif(skb)); + if (sk2 != NULL) { + tcp_tw_deschedule((struct tcp_tw_bucket *)sk); + tcp_tw_put((struct tcp_tw_bucket *)sk); + sk = sk2; + goto process; + } + /* Fall through to ACK */ + } + case TCP_TW_ACK: + tcp_v6_timewait_ack(sk, skb); + break; + case TCP_TW_RST: + goto no_tcp_socket; + case TCP_TW_SUCCESS:; + } + goto discard_it; +} + +static int tcp_v6_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + dst_release(dst); + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_TCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + dst_release(dst); + return err; + } 
+ + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +static int tcp_v6_remember_stamp(struct sock *sk) +{ + /* Alas, not yet... */ + return 0; +} + +static struct tcp_func ipv6_specific = { + .queue_xmit = tcp_v6_xmit, + .send_check = tcp_v6_send_check, + .rebuild_header = tcp_v6_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .remember_stamp = tcp_v6_remember_stamp, + .net_header_len = sizeof(struct ipv6hdr), + + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = v6_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* + * TCP over IPv4 via INET6 API + */ + +static struct tcp_func ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = tcp_v4_send_check, + .rebuild_header = tcp_v4_rebuild_header, + .conn_request = tcp_v6_conn_request, + .syn_recv_sock = tcp_v6_syn_recv_sock, + .remember_stamp = tcp_v4_remember_stamp, + .net_header_len = sizeof(struct iphdr), + + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = v6_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + + + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int tcp_v6_init_sock(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + skb_queue_head_init(&tp->out_of_order_queue); + tcp_init_xmit_timers(sk); + tcp_prequeue_init(tp); + + tp->rto = TCP_TIMEOUT_INIT; + tp->mdev = TCP_TIMEOUT_INIT; + + /* So many TCP implementations out there (incorrectly) count the + * initial SYN frame in their delayed-ACK and congestion control + * algorithms that we must have the following bandaid to talk + * efficiently to them. -DaveM + */ + tp->snd_cwnd = 2; + + /* See draft-stevens-tcpca-spec-01 for discussion of the + * initialization of these values. + */ + tp->snd_ssthresh = 0x7fffffff; + tp->snd_cwnd_clamp = ~0; + tp->mss_cache_std = tp->mss_cache = 536; + + tp->reordering = sysctl_tcp_reordering; + + sk->sk_state = TCP_CLOSE; + + tp->af_specific = &ipv6_specific; + + sk->sk_write_space = sk_stream_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + sk->sk_sndbuf = sysctl_tcp_wmem[1]; + sk->sk_rcvbuf = sysctl_tcp_rmem[1]; + + atomic_inc(&tcp_sockets_allocated); + + return 0; +} + +static int tcp_v6_destroy_sock(struct sock *sk) +{ + extern int tcp_v4_destroy_sock(struct sock *sk); + + tcp_v4_destroy_sock(sk); + return inet6_destroy_sock(sk); +} + +/* Proc filesystem TCPv6 sock list dumping. 
*/ +static void get_openreq6(struct seq_file *seq, + struct sock *sk, struct open_request *req, int i, int uid) +{ + struct in6_addr *dest, *src; + int ttd = req->expires - jiffies; + + if (ttd < 0) + ttd = 0; + + src = &req->af.v6_req.loc_addr; + dest = &req->af.v6_req.rmt_addr; + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], + ntohs(inet_sk(sk)->sport), + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], + ntohs(req->rmt_port), + TCP_SYN_RECV, + 0,0, /* could print option size, but that is af dependent. */ + 1, /* timers active (only the expire timer) */ + jiffies_to_clock_t(ttd), + req->retrans, + uid, + 0, /* non standard timer */ + 0, /* open_requests have no inode */ + 0, req); +} + +static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i) +{ + struct in6_addr *dest, *src; + __u16 destp, srcp; + int timer_active; + unsigned long timer_expires; + struct inet_sock *inet = inet_sk(sp); + struct tcp_sock *tp = tcp_sk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->dport); + srcp = ntohs(inet->sport); + if (tp->pending == TCP_TIME_RETRANS) { + timer_active = 1; + timer_expires = tp->timeout; + } else if (tp->pending == TCP_TIME_PROBE0) { + timer_active = 4; + timer_expires = tp->timeout; + } else if (timer_pending(&sp->sk_timer)) { + timer_active = 2; + timer_expires = sp->sk_timer.expires; + } else { + timer_active = 0; + timer_expires = jiffies; + } + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p %u %u %u %u %d\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + tp->write_seq-tp->snd_una, tp->rcv_nxt-tp->copied_seq, + timer_active, + jiffies_to_clock_t(timer_expires - jiffies), + tp->retransmits, + sock_i_uid(sp), + tp->probes_out, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp, + tp->rto, tp->ack.ato, (tp->ack.quick<<1)|tp->ack.pingpong, + tp->snd_cwnd, tp->snd_ssthresh>=0xFFFF?-1:tp->snd_ssthresh + ); +} + +static void get_timewait6_sock(struct seq_file *seq, + struct tcp_tw_bucket *tw, int i) +{ + struct in6_addr *dest, *src; + __u16 destp, srcp; + int ttd = tw->tw_ttd - jiffies; + + if (ttd < 0) + ttd = 0; + + dest = &tw->tw_v6_daddr; + src = &tw->tw_v6_rcv_saddr; + destp = ntohs(tw->tw_dport); + srcp = ntohs(tw->tw_sport); + + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", + i, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + tw->tw_substate, 0, 0, + 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0, + atomic_read(&tw->tw_refcnt), tw); +} + +#ifdef CONFIG_PROC_FS +static int tcp6_seq_show(struct seq_file *seq, void *v) +{ + struct tcp_iter_state *st; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + goto out; + } + st = seq->private; + + switch (st->state) { + case TCP_SEQ_STATE_LISTENING: + case TCP_SEQ_STATE_ESTABLISHED: + get_tcp6_sock(seq, v, st->num); + break; + 
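/*
 * The seq_printf() format in get_tcp6_sock()/get_openreq6() above defines the
 * /proc/net/tcp6 line layout: each IPv6 address is printed as four %08X
 * groups taken directly from s6_addr32[] (i.e. in the kernel's in-memory word
 * order, not canonical textual order), followed by the port in hex. A rough
 * userspace parser for those columns, as an illustrative sketch only:
 *
 *	#include <stdio.h>
 *
 *	static int parse_tcp6_line(const char *line,
 *				   unsigned local[4], unsigned *lport,
 *				   unsigned remote[4], unsigned *rport,
 *				   unsigned *state)
 *	{
 *		return sscanf(line,
 *			" %*d: %8x%8x%8x%8x:%4x %8x%8x%8x%8x:%4x %2x",
 *			&local[0], &local[1], &local[2], &local[3], lport,
 *			&remote[0], &remote[1], &remote[2], &remote[3], rport,
 *			state) == 11;
 *	}
 */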
case TCP_SEQ_STATE_OPENREQ: + get_openreq6(seq, st->syn_wait_sk, v, st->num, st->uid); + break; + case TCP_SEQ_STATE_TIME_WAIT: + get_timewait6_sock(seq, v, st->num); + break; + } +out: + return 0; +} + +static struct file_operations tcp6_seq_fops; +static struct tcp_seq_afinfo tcp6_seq_afinfo = { + .owner = THIS_MODULE, + .name = "tcp6", + .family = AF_INET6, + .seq_show = tcp6_seq_show, + .seq_fops = &tcp6_seq_fops, +}; + +int __init tcp6_proc_init(void) +{ + return tcp_proc_register(&tcp6_seq_afinfo); +} + +void tcp6_proc_exit(void) +{ + tcp_proc_unregister(&tcp6_seq_afinfo); +} +#endif + +struct proto tcpv6_prot = { + .name = "TCPv6", + .owner = THIS_MODULE, + .close = tcp_close, + .connect = tcp_v6_connect, + .disconnect = tcp_disconnect, + .accept = tcp_accept, + .ioctl = tcp_ioctl, + .init = tcp_v6_init_sock, + .destroy = tcp_v6_destroy_sock, + .shutdown = tcp_shutdown, + .setsockopt = tcp_setsockopt, + .getsockopt = tcp_getsockopt, + .sendmsg = tcp_sendmsg, + .recvmsg = tcp_recvmsg, + .backlog_rcv = tcp_v6_do_rcv, + .hash = tcp_v6_hash, + .unhash = tcp_unhash, + .get_port = tcp_v6_get_port, + .enter_memory_pressure = tcp_enter_memory_pressure, + .sockets_allocated = &tcp_sockets_allocated, + .memory_allocated = &tcp_memory_allocated, + .memory_pressure = &tcp_memory_pressure, + .sysctl_mem = sysctl_tcp_mem, + .sysctl_wmem = sysctl_tcp_wmem, + .sysctl_rmem = sysctl_tcp_rmem, + .max_header = MAX_TCP_HEADER, + .obj_size = sizeof(struct tcp6_sock), +}; + +static struct inet6_protocol tcpv6_protocol = { + .handler = tcp_v6_rcv, + .err_handler = tcp_v6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +extern struct proto_ops inet6_stream_ops; + +static struct inet_protosw tcpv6_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_TCP, + .prot = &tcpv6_prot, + .ops = &inet6_stream_ops, + .capability = -1, + .no_check = 0, + .flags = INET_PROTOSW_PERMANENT, +}; + +void __init tcpv6_init(void) +{ + /* register inet6 protocol */ + if (inet6_add_protocol(&tcpv6_protocol, IPPROTO_TCP) < 0) + printk(KERN_ERR "tcpv6_init: Could not register protocol\n"); + inet6_register_protosw(&tcpv6_protosw); +} diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c new file mode 100644 index 000000000000..e251d0ba4f39 --- /dev/null +++ b/net/ipv6/udp.c @@ -0,0 +1,1075 @@ +/* + * UDP over IPv6 + * Linux INET6 implementation + * + * Authors: + * Pedro Roque + * + * Based on linux/ipv4/udp.c + * + * $Id: udp.c,v 1.65 2002/02/01 22:01:04 davem Exp $ + * + * Fixes: + * Hideaki YOSHIFUJI : sin6_scope_id support + * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which + * Alexey Kuznetsov allow both IPv4 and IPv6 sockets to bind + * a single port at the same time. + * Kazunori MIYAZAWA @USAGI: change process style to use ip6_append_data + * YOSHIFUJI Hideaki @USAGI: convert /proc/net/udp6 to seq_file. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +DEFINE_SNMP_STAT(struct udp_mib, udp_stats_in6); + +/* Grrr, addr_type already calculated by caller, but I don't want + * to add some silly "cookie" argument to this method just for that. + */ +static int udp_v6_get_port(struct sock *sk, unsigned short snum) +{ + struct sock *sk2; + struct hlist_node *node; + + write_lock_bh(&udp_hash_lock); + if (snum == 0) { + int best_size_so_far, best, result, i; + + if (udp_port_rover > sysctl_local_port_range[1] || + udp_port_rover < sysctl_local_port_range[0]) + udp_port_rover = sysctl_local_port_range[0]; + best_size_so_far = 32767; + best = result = udp_port_rover; + for (i = 0; i < UDP_HTABLE_SIZE; i++, result++) { + int size; + struct hlist_head *list; + + list = &udp_hash[result & (UDP_HTABLE_SIZE - 1)]; + if (hlist_empty(list)) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + goto gotit; + } + size = 0; + sk_for_each(sk2, node, list) + if (++size >= best_size_so_far) + goto next; + best_size_so_far = size; + best = result; + next:; + } + result = best; + for(;; result += UDP_HTABLE_SIZE) { + if (result > sysctl_local_port_range[1]) + result = sysctl_local_port_range[0] + + ((result - sysctl_local_port_range[0]) & + (UDP_HTABLE_SIZE - 1)); + if (!udp_lport_inuse(result)) + break; + } +gotit: + udp_port_rover = snum = result; + } else { + sk_for_each(sk2, node, + &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]) { + if (inet_sk(sk2)->num == snum && + sk2 != sk && + (!sk2->sk_bound_dev_if || + !sk->sk_bound_dev_if || + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && + (!sk2->sk_reuse || !sk->sk_reuse) && + ipv6_rcv_saddr_equal(sk, sk2)) + goto fail; + } + } + + inet_sk(sk)->num = snum; + if (sk_unhashed(sk)) { + sk_add_node(sk, &udp_hash[snum & (UDP_HTABLE_SIZE - 1)]); + sock_prot_inc_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); + return 0; + +fail: + write_unlock_bh(&udp_hash_lock); + return 1; +} + +static void udp_v6_hash(struct sock *sk) +{ + BUG(); +} + +static void udp_v6_unhash(struct sock *sk) +{ + write_lock_bh(&udp_hash_lock); + if (sk_del_node_init(sk)) { + inet_sk(sk)->num = 0; + sock_prot_dec_use(sk->sk_prot); + } + write_unlock_bh(&udp_hash_lock); +} + +static struct sock *udp_v6_lookup(struct in6_addr *saddr, u16 sport, + struct in6_addr *daddr, u16 dport, int dif) +{ + struct sock *sk, *result = NULL; + struct hlist_node *node; + unsigned short hnum = ntohs(dport); + int badness = -1; + + read_lock(&udp_hash_lock); + sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { + struct inet_sock *inet = inet_sk(sk); + + if (inet->num == hnum && sk->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(sk); + int score = 0; + if (inet->dport) { + if (inet->dport != sport) + continue; + score++; + } + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (!ipv6_addr_equal(&np->rcv_saddr, daddr)) + continue; + score++; + } + if (!ipv6_addr_any(&np->daddr)) { + if (!ipv6_addr_equal(&np->daddr, saddr)) + continue; + score++; + } + if (sk->sk_bound_dev_if) { + if (sk->sk_bound_dev_if != dif) + continue; + score++; + } + if(score == 4) { + result = sk; + break; + } else if(score > badness) { + result = sk; + badness = 
score; + } + } + } + if (result) + sock_hold(result); + read_unlock(&udp_hash_lock); + return result; +} + +/* + * + */ + +static void udpv6_close(struct sock *sk, long timeout) +{ + sk_common_release(sk); +} + +/* + * This should be easy, if there is something there we + * return it, otherwise we block. + */ + +static int udpv6_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, + int noblock, int flags, int *addr_len) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct sk_buff *skb; + size_t copied; + int err; + + if (addr_len) + *addr_len=sizeof(struct sockaddr_in6); + + if (flags & MSG_ERRQUEUE) + return ipv6_recv_error(sk, msg, len); + +try_again: + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + copied = skb->len - sizeof(struct udphdr); + if (copied > len) { + copied = len; + msg->msg_flags |= MSG_TRUNC; + } + + if (skb->ip_summed==CHECKSUM_UNNECESSARY) { + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else if (msg->msg_flags&MSG_TRUNC) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + goto csum_copy_err; + err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov, + copied); + } else { + err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); + if (err == -EINVAL) + goto csum_copy_err; + } + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + /* Copy the address. */ + if (msg->msg_name) { + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *) msg->msg_name; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = skb->h.uh->source; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + + if (skb->protocol == htons(ETH_P_IP)) + ipv6_addr_set(&sin6->sin6_addr, 0, 0, + htonl(0xffff), skb->nh.iph->saddr); + else { + ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = IP6CB(skb)->iif; + } + + } + if (skb->protocol == htons(ETH_P_IP)) { + if (inet->cmsg_flags) + ip_cmsg_recv(msg, skb); + } else { + if (np->rxopt.all) + datagram_recv_ctl(sk, msg, skb); + } + + err = copied; + if (flags & MSG_TRUNC) + err = skb->len - sizeof(struct udphdr); + +out_free: + skb_free_datagram(sk, skb); +out: + return err; + +csum_copy_err: + /* Clear queue. 
*/ + if (flags&MSG_PEEK) { + int clear = 0; + spin_lock_irq(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + clear = 1; + } + spin_unlock_irq(&sk->sk_receive_queue.lock); + if (clear) + kfree_skb(skb); + } + + skb_free_datagram(sk, skb); + + if (flags & MSG_DONTWAIT) { + UDP6_INC_STATS_USER(UDP_MIB_INERRORS); + return -EAGAIN; + } + goto try_again; +} + +static void udpv6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6_pinfo *np; + struct ipv6hdr *hdr = (struct ipv6hdr*)skb->data; + struct net_device *dev = skb->dev; + struct in6_addr *saddr = &hdr->saddr; + struct in6_addr *daddr = &hdr->daddr; + struct udphdr *uh = (struct udphdr*)(skb->data+offset); + struct sock *sk; + int err; + + sk = udp_v6_lookup(daddr, uh->dest, saddr, uh->source, dev->ifindex); + + if (sk == NULL) + return; + + np = inet6_sk(sk); + + if (!icmpv6_err_convert(type, code, &err) && !np->recverr) + goto out; + + if (sk->sk_state != TCP_ESTABLISHED && !np->recverr) + goto out; + + if (np->recverr) + ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1)); + + sk->sk_err = err; + sk->sk_error_report(sk); +out: + sock_put(sk); +} + +static inline int udpv6_queue_rcv_skb(struct sock * sk, struct sk_buff *skb) +{ + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + kfree_skb(skb); + return -1; + } + + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + + if (sock_queue_rcv_skb(sk,skb)<0) { + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return 0; + } + UDP6_INC_STATS_BH(UDP_MIB_INDATAGRAMS); + return 0; +} + +static struct sock *udp_v6_mcast_next(struct sock *sk, + u16 loc_port, struct in6_addr *loc_addr, + u16 rmt_port, struct in6_addr *rmt_addr, + int dif) +{ + struct hlist_node *node; + struct sock *s = sk; + unsigned short num = ntohs(loc_port); + + sk_for_each_from(s, node) { + struct inet_sock *inet = inet_sk(s); + + if (inet->num == num && s->sk_family == PF_INET6) { + struct ipv6_pinfo *np = inet6_sk(s); + if (inet->dport) { + if (inet->dport != rmt_port) + continue; + } + if (!ipv6_addr_any(&np->daddr) && + !ipv6_addr_equal(&np->daddr, rmt_addr)) + continue; + + if (s->sk_bound_dev_if && s->sk_bound_dev_if != dif) + continue; + + if (!ipv6_addr_any(&np->rcv_saddr)) { + if (ipv6_addr_equal(&np->rcv_saddr, loc_addr)) + return s; + continue; + } + if(!inet6_mc_check(s, loc_addr, rmt_addr)) + continue; + return s; + } + } + return NULL; +} + +/* + * Note: called only from the BH handler context, + * so we don't need to lock the hashes. 
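udpv6_mcast_deliver() just below hands a clone of the skb to every socket that udp_v6_mcast_next() matches, so several receivers bound to the same port can all see the datagram. For context, a minimal sketch of how such a receiver is set up from userspace (group address and names here are only examples; error handling trimmed):

	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>

	// Bind a UDP socket to a port and join ff15::1 on a given interface.
	static int join_group(int fd, unsigned int ifindex, unsigned short port)
	{
		struct sockaddr_in6 local;
		struct ipv6_mreq mreq;

		memset(&local, 0, sizeof(local));
		local.sin6_family = AF_INET6;       // in6addr_any via the memset
		local.sin6_port = htons(port);
		if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0)
			return -1;

		memset(&mreq, 0, sizeof(mreq));
		inet_pton(AF_INET6, "ff15::1", &mreq.ipv6mr_multiaddr);
		mreq.ipv6mr_interface = ifindex;
		return setsockopt(fd, IPPROTO_IPV6, IPV6_JOIN_GROUP,
				  &mreq, sizeof(mreq));
	}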
+ */ +static void udpv6_mcast_deliver(struct udphdr *uh, + struct in6_addr *saddr, struct in6_addr *daddr, + struct sk_buff *skb) +{ + struct sock *sk, *sk2; + int dif; + + read_lock(&udp_hash_lock); + sk = sk_head(&udp_hash[ntohs(uh->dest) & (UDP_HTABLE_SIZE - 1)]); + dif = skb->dev->ifindex; + sk = udp_v6_mcast_next(sk, uh->dest, daddr, uh->source, saddr, dif); + if (!sk) { + kfree_skb(skb); + goto out; + } + + sk2 = sk; + while ((sk2 = udp_v6_mcast_next(sk_next(sk2), uh->dest, daddr, + uh->source, saddr, dif))) { + struct sk_buff *buff = skb_clone(skb, GFP_ATOMIC); + if (buff) + udpv6_queue_rcv_skb(sk2, buff); + } + udpv6_queue_rcv_skb(sk, skb); +out: + read_unlock(&udp_hash_lock); +} + +static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct sock *sk; + struct udphdr *uh; + struct net_device *dev = skb->dev; + struct in6_addr *saddr, *daddr; + u32 ulen = 0; + + if (!pskb_may_pull(skb, sizeof(struct udphdr))) + goto short_packet; + + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + uh = skb->h.uh; + + ulen = ntohs(uh->len); + + /* Check for jumbo payload */ + if (ulen == 0) + ulen = skb->len; + + if (ulen > skb->len || ulen < sizeof(*uh)) + goto short_packet; + + if (uh->check == 0) { + /* RFC 2460 section 8.1 says that we SHOULD log + this error. Well, it is reasonable. + */ + LIMIT_NETDEBUG( + printk(KERN_INFO "IPv6: udp checksum is 0\n")); + goto discard; + } + + if (ulen < skb->len) { + if (__pskb_trim(skb, ulen)) + goto discard; + saddr = &skb->nh.ipv6h->saddr; + daddr = &skb->nh.ipv6h->daddr; + uh = skb->h.uh; + } + + if (skb->ip_summed==CHECKSUM_HW) { + skb->ip_summed = CHECKSUM_UNNECESSARY; + if (csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, skb->csum)) { + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp v6 hw csum failure.\n")); + skb->ip_summed = CHECKSUM_NONE; + } + } + if (skb->ip_summed != CHECKSUM_UNNECESSARY) + skb->csum = ~csum_ipv6_magic(saddr, daddr, ulen, IPPROTO_UDP, 0); + + /* + * Multicast receive code + */ + if (ipv6_addr_is_multicast(daddr)) { + udpv6_mcast_deliver(uh, saddr, daddr, skb); + return 0; + } + + /* Unicast */ + + /* + * check socket cache ... must talk to Alan about his plans + * for sock caches... i'll skip this for now. + */ + sk = udp_v6_lookup(saddr, uh->source, daddr, uh->dest, dev->ifindex); + + if (sk == NULL) { + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard; + + if (skb->ip_summed != CHECKSUM_UNNECESSARY && + (unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) + goto discard; + UDP6_INC_STATS_BH(UDP_MIB_NOPORTS); + + icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0, dev); + + kfree_skb(skb); + return(0); + } + + /* deliver */ + + udpv6_queue_rcv_skb(sk, skb); + sock_put(sk); + return(0); + +short_packet: + if (net_ratelimit()) + printk(KERN_DEBUG "UDP: short packet: %d/%u\n", ulen, skb->len); + +discard: + UDP6_INC_STATS_BH(UDP_MIB_INERRORS); + kfree_skb(skb); + return(0); +} +/* + * Throw away all pending data and cancel the corking. Socket is locked. 
+ */ +static void udp_v6_flush_pending_frames(struct sock *sk) +{ + struct udp_sock *up = udp_sk(sk); + + if (up->pending) { + up->len = 0; + up->pending = 0; + ip6_flush_pending_frames(sk); + } +} + +/* + * Sending + */ + +static int udp_v6_push_pending_frames(struct sock *sk, struct udp_sock *up) +{ + struct sk_buff *skb; + struct udphdr *uh; + struct inet_sock *inet = inet_sk(sk); + struct flowi *fl = &inet->cork.fl; + int err = 0; + + /* Grab the skbuff where UDP header space exists. */ + if ((skb = skb_peek(&sk->sk_write_queue)) == NULL) + goto out; + + /* + * Create a UDP header + */ + uh = skb->h.uh; + uh->source = fl->fl_ip_sport; + uh->dest = fl->fl_ip_dport; + uh->len = htons(up->len); + uh->check = 0; + + if (sk->sk_no_check == UDP_CSUM_NOXMIT) { + skb->ip_summed = CHECKSUM_NONE; + goto send; + } + + if (skb_queue_len(&sk->sk_write_queue) == 1) { + skb->csum = csum_partial((char *)uh, + sizeof(struct udphdr), skb->csum); + uh->check = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + up->len, fl->proto, skb->csum); + } else { + u32 tmp_csum = 0; + + skb_queue_walk(&sk->sk_write_queue, skb) { + tmp_csum = csum_add(tmp_csum, skb->csum); + } + tmp_csum = csum_partial((char *)uh, + sizeof(struct udphdr), tmp_csum); + tmp_csum = csum_ipv6_magic(&fl->fl6_src, + &fl->fl6_dst, + up->len, fl->proto, tmp_csum); + uh->check = tmp_csum; + + } + if (uh->check == 0) + uh->check = -1; + +send: + err = ip6_push_pending_frames(sk); +out: + up->len = 0; + up->pending = 0; + return err; +} + +static int udpv6_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len) +{ + struct ipv6_txoptions opt_space; + struct udp_sock *up = udp_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) msg->msg_name; + struct in6_addr *daddr, *final_p = NULL, final; + struct ipv6_txoptions *opt = NULL; + struct ip6_flowlabel *flowlabel = NULL; + struct flowi *fl = &inet->cork.fl; + struct dst_entry *dst; + int addr_len = msg->msg_namelen; + int ulen = len; + int hlimit = -1; + int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + int err; + + /* destination address check */ + if (sin6) { + if (addr_len < offsetof(struct sockaddr, sa_data)) + return -EINVAL; + + switch (sin6->sin6_family) { + case AF_INET6: + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + daddr = &sin6->sin6_addr; + break; + case AF_INET: + goto do_udp_sendmsg; + case AF_UNSPEC: + msg->msg_name = sin6 = NULL; + msg->msg_namelen = addr_len = 0; + daddr = NULL; + break; + default: + return -EINVAL; + } + } else if (!up->pending) { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + daddr = &np->daddr; + } else + daddr = NULL; + + if (daddr) { + if (ipv6_addr_type(daddr) == IPV6_ADDR_MAPPED) { + struct sockaddr_in sin; + sin.sin_family = AF_INET; + sin.sin_port = sin6 ? sin6->sin6_port : inet->dport; + sin.sin_addr.s_addr = daddr->s6_addr32[3]; + msg->msg_name = &sin; + msg->msg_namelen = sizeof(sin); +do_udp_sendmsg: + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + return udp_sendmsg(iocb, sk, msg, len); + } + } + + if (up->pending == AF_INET) + return udp_sendmsg(iocb, sk, msg, len); + + /* Rough check on arithmetic overflow, + better check is made in ip6_build_xmit + */ + if (len > INT_MAX - sizeof(struct udphdr)) + return -EMSGSIZE; + + if (up->pending) { + /* + * There are pending frames. + * The socket lock must be held while it's corked. 
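The pending/corking checks around here are driven from userspace by either UDP_CORK or MSG_MORE; only the final, uncorking operation lets udp_v6_push_pending_frames() build the UDP header and put one datagram on the wire. A minimal sketch of the corking side, assuming a connected socket (error handling omitted; UDP_CORK may need <linux/udp.h> on older libcs):

	#include <stddef.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/udp.h>

	// Two send() calls, one datagram on the wire.
	static void send_in_pieces(int fd, const char *a, size_t alen,
				   const char *b, size_t blen)
	{
		int on = 1, off = 0;

		setsockopt(fd, IPPROTO_UDP, UDP_CORK, &on, sizeof(on));
		send(fd, a, alen, 0);           // queued by ip6_append_data()
		send(fd, b, blen, 0);           // appended to the same datagram
		setsockopt(fd, IPPROTO_UDP, UDP_CORK, &off, sizeof(off));
	}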
+ */ + lock_sock(sk); + if (likely(up->pending)) { + if (unlikely(up->pending != AF_INET6)) { + release_sock(sk); + return -EAFNOSUPPORT; + } + dst = NULL; + goto do_append_data; + } + release_sock(sk); + } + ulen += sizeof(struct udphdr); + + memset(fl, 0, sizeof(*fl)); + + if (sin6) { + if (sin6->sin6_port == 0) + return -EINVAL; + + fl->fl_ip_dport = sin6->sin6_port; + daddr = &sin6->sin6_addr; + + if (np->sndflow) { + fl->fl6_flowlabel = sin6->sin6_flowinfo&IPV6_FLOWINFO_MASK; + if (fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + daddr = &flowlabel->dst; + } + } + + /* + * Otherwise it will be difficult to maintain + * sk->sk_dst_cache. + */ + if (sk->sk_state == TCP_ESTABLISHED && + ipv6_addr_equal(daddr, &np->daddr)) + daddr = &np->daddr; + + if (addr_len >= sizeof(struct sockaddr_in6) && + sin6->sin6_scope_id && + ipv6_addr_type(daddr)&IPV6_ADDR_LINKLOCAL) + fl->oif = sin6->sin6_scope_id; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -EDESTADDRREQ; + + fl->fl_ip_dport = inet->dport; + daddr = &np->daddr; + fl->fl6_flowlabel = np->flow_label; + } + + if (!fl->oif) + fl->oif = sk->sk_bound_dev_if; + + if (msg->msg_controllen) { + opt = &opt_space; + memset(opt, 0, sizeof(struct ipv6_txoptions)); + opt->tot_len = sizeof(*opt); + + err = datagram_send_ctl(msg, fl, opt, &hlimit); + if (err < 0) { + fl6_sock_release(flowlabel); + return err; + } + if ((fl->fl6_flowlabel&IPV6_FLOWLABEL_MASK) && !flowlabel) { + flowlabel = fl6_sock_lookup(sk, fl->fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + } + if (!(opt->opt_nflen|opt->opt_flen)) + opt = NULL; + } + if (opt == NULL) + opt = np->opt; + if (flowlabel) + opt = fl6_merge_options(&opt_space, flowlabel, opt); + + fl->proto = IPPROTO_UDP; + ipv6_addr_copy(&fl->fl6_dst, daddr); + if (ipv6_addr_any(&fl->fl6_src) && !ipv6_addr_any(&np->saddr)) + ipv6_addr_copy(&fl->fl6_src, &np->saddr); + fl->fl_ip_sport = inet->sport; + + /* merge ip6_build_xmit from ip6_output */ + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl->fl6_dst); + ipv6_addr_copy(&fl->fl6_dst, rt0->addr); + final_p = &final; + } + + if (!fl->oif && ipv6_addr_is_multicast(&fl->fl6_dst)) + fl->oif = np->mcast_oif; + + err = ip6_dst_lookup(sk, &dst, fl); + if (err) + goto out; + if (final_p) + ipv6_addr_copy(&fl->fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, fl, sk, 0)) < 0) { + dst_release(dst); + goto out; + } + + if (hlimit < 0) { + if (ipv6_addr_is_multicast(&fl->fl6_dst)) + hlimit = np->mcast_hops; + else + hlimit = np->hop_limit; + if (hlimit < 0) + hlimit = dst_metric(dst, RTAX_HOPLIMIT); + if (hlimit < 0) + hlimit = ipv6_get_hoplimit(dst->dev); + } + + if (msg->msg_flags&MSG_CONFIRM) + goto do_confirm; +back_from_confirm: + + lock_sock(sk); + if (unlikely(up->pending)) { + /* The socket is already corked while preparing it. */ + /* ... which is an evident application bug. --ANK */ + release_sock(sk); + + LIMIT_NETDEBUG(printk(KERN_DEBUG "udp cork app bug 2\n")); + err = -EINVAL; + goto out; + } + + up->pending = AF_INET6; + +do_append_data: + up->len += ulen; + err = ip6_append_data(sk, ip_generic_getfrag, msg->msg_iov, ulen, sizeof(struct udphdr), + hlimit, opt, fl, (struct rt6_info*)dst, + corkreq ? 
msg->msg_flags|MSG_MORE : msg->msg_flags); + if (err) + udp_v6_flush_pending_frames(sk); + else if (!corkreq) + err = udp_v6_push_pending_frames(sk, up); + + if (dst) + ip6_dst_store(sk, dst, + ipv6_addr_equal(&fl->fl6_dst, &np->daddr) ? + &np->daddr : NULL); + if (err > 0) + err = np->recverr ? net_xmit_errno(err) : 0; + release_sock(sk); +out: + fl6_sock_release(flowlabel); + if (!err) { + UDP6_INC_STATS_USER(UDP_MIB_OUTDATAGRAMS); + return len; + } + return err; + +do_confirm: + dst_confirm(dst); + if (!(msg->msg_flags&MSG_PROBE) || len) + goto back_from_confirm; + err = 0; + goto out; +} + +static int udpv6_destroy_sock(struct sock *sk) +{ + lock_sock(sk); + udp_v6_flush_pending_frames(sk); + release_sock(sk); + + inet6_destroy_sock(sk); + + return 0; +} + +/* + * Socket option code for UDP + */ +static int udpv6_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val; + int err = 0; + + if (level != SOL_UDP) + return ipv6_setsockopt(sk, level, optname, optval, optlen); + + if(optlencorkflag = 1; + } else { + up->corkflag = 0; + lock_sock(sk); + udp_v6_push_pending_frames(sk, up); + release_sock(sk); + } + break; + + case UDP_ENCAP: + switch (val) { + case 0: + up->encap_type = val; + break; + default: + err = -ENOPROTOOPT; + break; + } + break; + + default: + err = -ENOPROTOOPT; + break; + }; + + return err; +} + +static int udpv6_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct udp_sock *up = udp_sk(sk); + int val, len; + + if (level != SOL_UDP) + return ipv6_getsockopt(sk, level, optname, optval, optlen); + + if(get_user(len,optlen)) + return -EFAULT; + + len = min_t(unsigned int, len, sizeof(int)); + + if(len < 0) + return -EINVAL; + + switch(optname) { + case UDP_CORK: + val = up->corkflag; + break; + + case UDP_ENCAP: + val = up->encap_type; + break; + + default: + return -ENOPROTOOPT; + }; + + if(put_user(len, optlen)) + return -EFAULT; + if(copy_to_user(optval, &val,len)) + return -EFAULT; + return 0; +} + +static struct inet6_protocol udpv6_protocol = { + .handler = udpv6_rcv, + .err_handler = udpv6_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +/* ------------------------------------------------------------------------ */ +#ifdef CONFIG_PROC_FS + +static void udp6_sock_seq_show(struct seq_file *seq, struct sock *sp, int bucket) +{ + struct inet_sock *inet = inet_sk(sp); + struct ipv6_pinfo *np = inet6_sk(sp); + struct in6_addr *dest, *src; + __u16 destp, srcp; + + dest = &np->daddr; + src = &np->rcv_saddr; + destp = ntohs(inet->dport); + srcp = ntohs(inet->sport); + seq_printf(seq, + "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " + "%02X %08X:%08X %02X:%08lX %08X %5d %8d %lu %d %p\n", + bucket, + src->s6_addr32[0], src->s6_addr32[1], + src->s6_addr32[2], src->s6_addr32[3], srcp, + dest->s6_addr32[0], dest->s6_addr32[1], + dest->s6_addr32[2], dest->s6_addr32[3], destp, + sp->sk_state, + atomic_read(&sp->sk_wmem_alloc), + atomic_read(&sp->sk_rmem_alloc), + 0, 0L, 0, + sock_i_uid(sp), 0, + sock_i_ino(sp), + atomic_read(&sp->sk_refcnt), sp); +} + +static int udp6_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_printf(seq, + " sl " + "local_address " + "remote_address " + "st tx_queue rx_queue tr tm->when retrnsmt" + " uid timeout inode\n"); + else + udp6_sock_seq_show(seq, v, ((struct udp_iter_state *)seq->private)->bucket); + return 0; +} + +static struct file_operations udp6_seq_fops; +static 
struct udp_seq_afinfo udp6_seq_afinfo = { + .owner = THIS_MODULE, + .name = "udp6", + .family = AF_INET6, + .seq_show = udp6_seq_show, + .seq_fops = &udp6_seq_fops, +}; + +int __init udp6_proc_init(void) +{ + return udp_proc_register(&udp6_seq_afinfo); +} + +void udp6_proc_exit(void) { + udp_proc_unregister(&udp6_seq_afinfo); +} +#endif /* CONFIG_PROC_FS */ + +/* ------------------------------------------------------------------------ */ + +struct proto udpv6_prot = { + .name = "UDPv6", + .owner = THIS_MODULE, + .close = udpv6_close, + .connect = ip6_datagram_connect, + .disconnect = udp_disconnect, + .ioctl = udp_ioctl, + .destroy = udpv6_destroy_sock, + .setsockopt = udpv6_setsockopt, + .getsockopt = udpv6_getsockopt, + .sendmsg = udpv6_sendmsg, + .recvmsg = udpv6_recvmsg, + .backlog_rcv = udpv6_queue_rcv_skb, + .hash = udp_v6_hash, + .unhash = udp_v6_unhash, + .get_port = udp_v6_get_port, + .obj_size = sizeof(struct udp6_sock), +}; + +extern struct proto_ops inet6_dgram_ops; + +static struct inet_protosw udpv6_protosw = { + .type = SOCK_DGRAM, + .protocol = IPPROTO_UDP, + .prot = &udpv6_prot, + .ops = &inet6_dgram_ops, + .capability =-1, + .no_check = UDP_CSUM_DEFAULT, + .flags = INET_PROTOSW_PERMANENT, +}; + + +void __init udpv6_init(void) +{ + if (inet6_add_protocol(&udpv6_protocol, IPPROTO_UDP) < 0) + printk(KERN_ERR "udpv6_init: Could not register protocol\n"); + inet6_register_protosw(&udpv6_protosw); +} diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c new file mode 100644 index 000000000000..28c29d78338e --- /dev/null +++ b/net/ipv6/xfrm6_input.c @@ -0,0 +1,150 @@ +/* + * xfrm6_input.c: based on net/ipv4/xfrm4_input.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * YOSHIFUJI Hideaki @USAGI + * IPv6 support + */ + +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi) +{ + struct sk_buff *skb = *pskb; + int err; + u32 seq; + struct sec_decap_state xfrm_vec[XFRM_MAX_DEPTH]; + struct xfrm_state *x; + int xfrm_nr = 0; + int decaps = 0; + int nexthdr; + unsigned int nhoff; + + nhoff = *nhoffp; + nexthdr = skb->nh.raw[nhoff]; + + seq = 0; + if (!spi && (err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) + goto drop; + + do { + struct ipv6hdr *iph = skb->nh.ipv6h; + + if (xfrm_nr == XFRM_MAX_DEPTH) + goto drop; + + x = xfrm_state_lookup((xfrm_address_t *)&iph->daddr, spi, nexthdr, AF_INET6); + if (x == NULL) + goto drop; + spin_lock(&x->lock); + if (unlikely(x->km.state != XFRM_STATE_VALID)) + goto drop_unlock; + + if (x->props.replay_window && xfrm_replay_check(x, seq)) + goto drop_unlock; + + if (xfrm_state_check_expire(x)) + goto drop_unlock; + + nexthdr = x->type->input(x, &(xfrm_vec[xfrm_nr].decap), skb); + if (nexthdr <= 0) + goto drop_unlock; + + skb->nh.raw[nhoff] = nexthdr; + + if (x->props.replay_window) + xfrm_replay_advance(x, seq); + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock(&x->lock); + + xfrm_vec[xfrm_nr++].xvec = x; + + if (x->props.mode) { /* XXX */ + if (nexthdr != IPPROTO_IPV6) + goto drop; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto drop; + if (skb_cloned(skb) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto drop; + if (x->props.flags 
& XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + decaps = 1; + break; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) < 0) + goto drop; + } while (!err); + + /* Allocate new secpath or COW existing one. */ + if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) { + struct sec_path *sp; + sp = secpath_dup(skb->sp); + if (!sp) + goto drop; + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + } + + if (xfrm_nr + skb->sp->len > XFRM_MAX_DEPTH) + goto drop; + + memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state)); + skb->sp->len += xfrm_nr; + skb->ip_summed = CHECKSUM_NONE; + + if (decaps) { + if (!(skb->dev->flags&IFF_LOOPBACK)) { + dst_release(skb->dst); + skb->dst = NULL; + } + netif_rx(skb); + return -1; + } else { + return 1; + } + +drop_unlock: + spin_unlock(&x->lock); + xfrm_state_put(x); +drop: + while (--xfrm_nr >= 0) + xfrm_state_put(xfrm_vec[xfrm_nr].xvec); + kfree_skb(skb); + return -1; +} + +EXPORT_SYMBOL(xfrm6_rcv_spi); + +int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + return xfrm6_rcv_spi(pskb, nhoffp, 0); +} diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c new file mode 100644 index 000000000000..601a148f60f3 --- /dev/null +++ b/net/ipv6/xfrm6_output.c @@ -0,0 +1,143 @@ +/* + * xfrm6_output.c - Common IPsec encapsulation code for IPv6. + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004 Herbert Xu + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * In transport mode, the IP header and mutable extension headers will be moved + * forward to make space for the encapsulation header. + * + * In tunnel mode, the top IP header will be constructed per RFC 2401. + * The following fields in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. 
+ */ +static void xfrm6_encap(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph, *top_iph; + int dsfield; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + if (!x->props.mode) { + u8 *prevhdr; + int hdr_len; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + return; + } + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + dsfield = ipv6_get_dsfield(top_iph); + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->nexthdr = IPPROTO_IPV6; + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); +} + +static int xfrm6_tunnel_check_size(struct sk_buff *skb) +{ + int mtu, ret = 0; + struct dst_entry *dst = skb->dst; + + mtu = dst_mtu(dst); + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + + if (skb->len > mtu) { + icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev); + ret = -EMSGSIZE; + } + + return ret; +} + +int xfrm6_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + int err; + + if (skb->ip_summed == CHECKSUM_HW) { + err = skb_checksum_help(skb, 0); + if (err) + goto error_nolock; + } + + if (x->props.mode) { + err = xfrm6_tunnel_check_size(skb); + if (err) + goto error_nolock; + } + + spin_lock_bh(&x->lock); + err = xfrm_state_check(x, skb); + if (err) + goto error; + + xfrm6_encap(skb); + + err = x->type->output(x, skb); + if (err) + goto error; + + x->curlft.bytes += skb->len; + x->curlft.packets++; + + spin_unlock_bh(&x->lock); + + skb->nh.raw = skb->data; + + if (!(skb->dst = dst_pop(dst))) { + err = -EHOSTUNREACH; + goto error_nolock; + } + err = NET_XMIT_BYPASS; + +out_exit: + return err; +error: + spin_unlock_bh(&x->lock); +error_nolock: + kfree_skb(skb); + goto out_exit; +} diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c new file mode 100644 index 000000000000..8a4f37de4d2d --- /dev/null +++ b/net/ipv6/xfrm6_policy.c @@ -0,0 +1,295 @@ +/* + * xfrm6_policy.c: based on xfrm4_policy.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki + * Split up af-specific portion + * + */ + +#include +#include +#include +#include +#include + +static struct dst_ops xfrm6_dst_ops; +static struct xfrm_policy_afinfo xfrm6_policy_afinfo; + +static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; + +static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) +{ + int err = 0; + *dst = (struct xfrm_dst*)ip6_route_output(NULL, fl); + if (!*dst) + err = -ENETUNREACH; + return err; +} + +static struct dst_entry * +__xfrm6_find_bundle(struct flowi *fl, struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + /* Still not clear if we should set fl->fl6_{src,dst}... 
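The bundle lookup below only compares the leading rt6i_dst.plen / rt6i_src.plen bits of the flow addresses, via ipv6_addr_prefix() plus ipv6_addr_equal(). Bit for bit, that is the same test as this small standalone sketch (userspace, not the kernel helpers):

	#include <stdint.h>
	#include <string.h>
	#include <netinet/in.h>

	// Return 1 when the first plen bits of a and b are identical.
	static int ipv6_prefix_matches(const struct in6_addr *a,
				       const struct in6_addr *b, int plen)
	{
		int bytes = plen / 8, bits = plen % 8;

		if (bytes && memcmp(a->s6_addr, b->s6_addr, bytes))
			return 0;
		if (!bits)
			return 1;
		return !((a->s6_addr[bytes] ^ b->s6_addr[bytes]) &
			 (uint8_t)(0xff << (8 - bits)));
	}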
*/ + read_lock_bh(&policy->lock); + for (dst = policy->bundles; dst; dst = dst->next) { + struct xfrm_dst *xdst = (struct xfrm_dst*)dst; + struct in6_addr fl_dst_prefix, fl_src_prefix; + + ipv6_addr_prefix(&fl_dst_prefix, + &fl->fl6_dst, + xdst->u.rt6.rt6i_dst.plen); + ipv6_addr_prefix(&fl_src_prefix, + &fl->fl6_src, + xdst->u.rt6.rt6i_src.plen); + if (ipv6_addr_equal(&xdst->u.rt6.rt6i_dst.addr, &fl_dst_prefix) && + ipv6_addr_equal(&xdst->u.rt6.rt6i_src.addr, &fl_src_prefix) && + xfrm_bundle_ok(xdst, fl, AF_INET6)) { + dst_clone(dst); + break; + } + } + read_unlock_bh(&policy->lock); + return dst; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static int +__xfrm6_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p) +{ + struct dst_entry *dst, *dst_prev; + struct rt6_info *rt0 = (struct rt6_info*)(*dst_p); + struct rt6_info *rt = rt0; + struct in6_addr *remote = &fl->fl6_dst; + struct in6_addr *local = &fl->fl6_src; + struct flowi fl_tunnel = { + .nl_u = { + .ip6_u = { + .saddr = *local, + .daddr = *remote + } + } + }; + int i; + int err = 0; + int header_len = 0; + int trailer_len = 0; + + dst = dst_prev = NULL; + dst_hold(&rt->u.dst); + + for (i = 0; i < nx; i++) { + struct dst_entry *dst1 = dst_alloc(&xfrm6_dst_ops); + struct xfrm_dst *xdst; + int tunnel = 0; + + if (unlikely(dst1 == NULL)) { + err = -ENOBUFS; + dst_release(&rt->u.dst); + goto error; + } + + if (!dst) + dst = dst1; + else { + dst_prev->child = dst1; + dst1->flags |= DST_NOHASH; + dst_clone(dst1); + } + + xdst = (struct xfrm_dst *)dst1; + xdst->route = &rt->u.dst; + + dst1->next = dst_prev; + dst_prev = dst1; + if (xfrm[i]->props.mode) { + remote = (struct in6_addr*)&xfrm[i]->id.daddr; + local = (struct in6_addr*)&xfrm[i]->props.saddr; + tunnel = 1; + } + header_len += xfrm[i]->props.header_len; + trailer_len += xfrm[i]->props.trailer_len; + + if (tunnel) { + ipv6_addr_copy(&fl_tunnel.fl6_dst, remote); + ipv6_addr_copy(&fl_tunnel.fl6_src, local); + err = xfrm_dst_lookup((struct xfrm_dst **) &rt, + &fl_tunnel, AF_INET6); + if (err) + goto error; + } else + dst_hold(&rt->u.dst); + } + + dst_prev->child = &rt->u.dst; + dst->path = &rt->u.dst; + + *dst_p = dst; + dst = dst_prev; + + dst_prev = *dst_p; + i = 0; + for (; dst_prev != &rt->u.dst; dst_prev = dst_prev->child) { + struct xfrm_dst *x = (struct xfrm_dst*)dst_prev; + + dst_prev->xfrm = xfrm[i++]; + dst_prev->dev = rt->u.dst.dev; + if (rt->u.dst.dev) + dev_hold(rt->u.dst.dev); + dst_prev->obsolete = -1; + dst_prev->flags |= DST_HOST; + dst_prev->lastuse = jiffies; + dst_prev->header_len = header_len; + dst_prev->trailer_len = trailer_len; + memcpy(&dst_prev->metrics, &x->route->metrics, sizeof(dst_prev->metrics)); + + /* Copy neighbour for reachability confirmation */ + dst_prev->neighbour = neigh_clone(rt->u.dst.neighbour); + dst_prev->input = rt->u.dst.input; + dst_prev->output = xfrm6_output; + /* Sheit... I remember I did this right. 
Apparently, + * it was magically lost, so this code needs audit */ + x->u.rt6.rt6i_flags = rt0->rt6i_flags&(RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL); + x->u.rt6.rt6i_metric = rt0->rt6i_metric; + x->u.rt6.rt6i_node = rt0->rt6i_node; + x->u.rt6.rt6i_gateway = rt0->rt6i_gateway; + memcpy(&x->u.rt6.rt6i_gateway, &rt0->rt6i_gateway, sizeof(x->u.rt6.rt6i_gateway)); + x->u.rt6.rt6i_dst = rt0->rt6i_dst; + x->u.rt6.rt6i_src = rt0->rt6i_src; + header_len -= x->u.dst.xfrm->props.header_len; + trailer_len -= x->u.dst.xfrm->props.trailer_len; + } + + xfrm_init_pmtu(dst); + return 0; + +error: + if (dst) + dst_free(dst); + return err; +} + +static inline void +_decode_session6(struct sk_buff *skb, struct flowi *fl) +{ + u16 offset = sizeof(struct ipv6hdr); + struct ipv6hdr *hdr = skb->nh.ipv6h; + struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + u8 nexthdr = skb->nh.ipv6h->nexthdr; + + memset(fl, 0, sizeof(struct flowi)); + ipv6_addr_copy(&fl->fl6_dst, &hdr->daddr); + ipv6_addr_copy(&fl->fl6_src, &hdr->saddr); + + while (pskb_may_pull(skb, skb->nh.raw + offset + 1 - skb->data)) { + switch (nexthdr) { + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + offset += ipv6_optlen(exthdr); + nexthdr = exthdr->nexthdr; + exthdr = (struct ipv6_opt_hdr*)(skb->nh.raw + offset); + break; + + case IPPROTO_UDP: + case IPPROTO_TCP: + case IPPROTO_SCTP: + if (pskb_may_pull(skb, skb->nh.raw + offset + 4 - skb->data)) { + u16 *ports = (u16 *)exthdr; + + fl->fl_ip_sport = ports[0]; + fl->fl_ip_dport = ports[1]; + } + fl->proto = nexthdr; + return; + + case IPPROTO_ICMPV6: + if (pskb_may_pull(skb, skb->nh.raw + offset + 2 - skb->data)) { + u8 *icmp = (u8 *)exthdr; + + fl->fl_icmp_type = icmp[0]; + fl->fl_icmp_code = icmp[1]; + } + fl->proto = nexthdr; + return; + + /* XXX Why are there these headers? 
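+ *
+ * The loop in _decode_session6() above is the usual IPv6 extension-header
+ * walk: skip chained hop-by-hop/routing/destination headers until an
+ * upper-layer protocol is reached. A simplified, stand-alone sketch (the
+ * sketch_* name is invented, and bounds checking is reduced to a length
+ * test) would be:
+ *
+ *	static u8 sketch_upper_proto(struct ipv6hdr *hdr, u8 *pkt_end)
+ *	{
+ *		u8 nexthdr = hdr->nexthdr;
+ *		u8 *p = (u8 *)(hdr + 1);
+ *
+ *		while (nexthdr == NEXTHDR_HOP ||
+ *		       nexthdr == NEXTHDR_ROUTING ||
+ *		       nexthdr == NEXTHDR_DEST) {
+ *			struct ipv6_opt_hdr *exthdr = (struct ipv6_opt_hdr *)p;
+ *
+ *			if (p + sizeof(*exthdr) > pkt_end)
+ *				break;
+ *			nexthdr = exthdr->nexthdr;
+ *			p += ipv6_optlen(exthdr);
+ *		}
+ *		return nexthdr;
+ *	}
+ *
+ * The real function additionally records ports or ICMPv6 type/code into
+ * the flow so that policies can match on them.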
*/ + case IPPROTO_AH: + case IPPROTO_ESP: + case IPPROTO_COMP: + default: + fl->fl_ipsec_spi = 0; + fl->proto = nexthdr; + return; + }; + } +} + +static inline int xfrm6_garbage_collect(void) +{ + read_lock(&xfrm6_policy_afinfo.lock); + xfrm6_policy_afinfo.garbage_collect(); + read_unlock(&xfrm6_policy_afinfo.lock); + return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); +} + +static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct dst_entry *path = xdst->route; + + path->ops->update_pmtu(path, mtu); +} + +static struct dst_ops xfrm6_dst_ops = { + .family = AF_INET6, + .protocol = __constant_htons(ETH_P_IPV6), + .gc = xfrm6_garbage_collect, + .update_pmtu = xfrm6_update_pmtu, + .gc_thresh = 1024, + .entry_size = sizeof(struct xfrm_dst), +}; + +static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { + .family = AF_INET6, + .lock = RW_LOCK_UNLOCKED, + .type_map = &xfrm6_type_map, + .dst_ops = &xfrm6_dst_ops, + .dst_lookup = xfrm6_dst_lookup, + .find_bundle = __xfrm6_find_bundle, + .bundle_create = __xfrm6_bundle_create, + .decode_session = _decode_session6, +}; + +static void __init xfrm6_policy_init(void) +{ + xfrm_policy_register_afinfo(&xfrm6_policy_afinfo); +} + +static void xfrm6_policy_fini(void) +{ + xfrm_policy_unregister_afinfo(&xfrm6_policy_afinfo); +} + +void __init xfrm6_init(void) +{ + xfrm6_policy_init(); + xfrm6_state_init(); +} + +void xfrm6_fini(void) +{ + //xfrm6_input_fini(); + xfrm6_policy_fini(); + xfrm6_state_fini(); +} diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c new file mode 100644 index 000000000000..bf0d0abc3871 --- /dev/null +++ b/net/ipv6/xfrm6_state.c @@ -0,0 +1,136 @@ +/* + * xfrm6_state.c: based on xfrm4_state.c + * + * Authors: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include +#include + +static struct xfrm_state_afinfo xfrm6_state_afinfo; + +static void +__xfrm6_init_tempsel(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr) +{ + /* Initialize temporary selector matching only + * to current session. 
*/ + ipv6_addr_copy((struct in6_addr *)&x->sel.daddr, &fl->fl6_dst); + ipv6_addr_copy((struct in6_addr *)&x->sel.saddr, &fl->fl6_src); + x->sel.dport = xfrm_flowi_dport(fl); + x->sel.dport_mask = ~0; + x->sel.sport = xfrm_flowi_sport(fl); + x->sel.sport_mask = ~0; + x->sel.prefixlen_d = 128; + x->sel.prefixlen_s = 128; + x->sel.proto = fl->proto; + x->sel.ifindex = fl->oif; + x->id = tmpl->id; + if (ipv6_addr_any((struct in6_addr*)&x->id.daddr)) + memcpy(&x->id.daddr, daddr, sizeof(x->sel.daddr)); + memcpy(&x->props.saddr, &tmpl->saddr, sizeof(x->props.saddr)); + if (ipv6_addr_any((struct in6_addr*)&x->props.saddr)) + memcpy(&x->props.saddr, saddr, sizeof(x->props.saddr)); + x->props.mode = tmpl->mode; + x->props.reqid = tmpl->reqid; + x->props.family = AF_INET6; +} + +static struct xfrm_state * +__xfrm6_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto) +{ + unsigned h = __xfrm6_spi_hash(daddr, spi, proto); + struct xfrm_state *x; + + list_for_each_entry(x, xfrm6_state_afinfo.state_byspi+h, byspi) { + if (x->props.family == AF_INET6 && + spi == x->id.spi && + ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) && + proto == x->id.proto) { + xfrm_state_hold(x); + return x; + } + } + return NULL; +} + +static struct xfrm_state * +__xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create) +{ + struct xfrm_state *x, *x0; + unsigned h = __xfrm6_dst_hash(daddr); + + x0 = NULL; + + list_for_each_entry(x, xfrm6_state_afinfo.state_bydst+h, bydst) { + if (x->props.family == AF_INET6 && + ipv6_addr_equal((struct in6_addr *)daddr, (struct in6_addr *)x->id.daddr.a6) && + mode == x->props.mode && + proto == x->id.proto && + ipv6_addr_equal((struct in6_addr *)saddr, (struct in6_addr *)x->props.saddr.a6) && + reqid == x->props.reqid && + x->km.state == XFRM_STATE_ACQ && + !x->id.spi) { + x0 = x; + break; + } + } + if (!x0 && create && (x0 = xfrm_state_alloc()) != NULL) { + ipv6_addr_copy((struct in6_addr *)x0->sel.daddr.a6, + (struct in6_addr *)daddr); + ipv6_addr_copy((struct in6_addr *)x0->sel.saddr.a6, + (struct in6_addr *)saddr); + x0->sel.prefixlen_d = 128; + x0->sel.prefixlen_s = 128; + ipv6_addr_copy((struct in6_addr *)x0->props.saddr.a6, + (struct in6_addr *)saddr); + x0->km.state = XFRM_STATE_ACQ; + ipv6_addr_copy((struct in6_addr *)x0->id.daddr.a6, + (struct in6_addr *)daddr); + x0->id.proto = proto; + x0->props.family = AF_INET6; + x0->props.mode = mode; + x0->props.reqid = reqid; + x0->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x0); + x0->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; + add_timer(&x0->timer); + xfrm_state_hold(x0); + list_add_tail(&x0->bydst, xfrm6_state_afinfo.state_bydst+h); + wake_up(&km_waitq); + } + if (x0) + xfrm_state_hold(x0); + return x0; +} + +static struct xfrm_state_afinfo xfrm6_state_afinfo = { + .family = AF_INET6, + .lock = RW_LOCK_UNLOCKED, + .init_tempsel = __xfrm6_init_tempsel, + .state_lookup = __xfrm6_state_lookup, + .find_acq = __xfrm6_find_acq, +}; + +void __init xfrm6_state_init(void) +{ + xfrm_state_register_afinfo(&xfrm6_state_afinfo); +} + +void xfrm6_state_fini(void) +{ + xfrm_state_unregister_afinfo(&xfrm6_state_afinfo); +} + diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c new file mode 100644 index 000000000000..ffcadd68b951 --- /dev/null +++ b/net/ipv6/xfrm6_tunnel.c @@ -0,0 +1,543 @@ +/* + * Copyright (C)2003,2004 USAGI/WIDE Project + * + * This program is free software; you can redistribute it and/or modify + * it under the 
terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Authors Mitsuru KANDA + * YOSHIFUJI Hideaki + * + * Based on net/ipv4/xfrm4_tunnel.c + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG +# define X6TDEBUG 3 +#else +# define X6TDEBUG 1 +#endif + +#define X6TPRINTK(fmt, args...) printk(fmt, ## args) +#define X6TNOPRINTK(fmt, args...) do { ; } while(0) + +#if X6TDEBUG >= 1 +# define X6TPRINTK1 X6TPRINTK +#else +# define X6TPRINTK1 X6TNOPRINTK +#endif + +#if X6TDEBUG >= 3 +# define X6TPRINTK3 X6TPRINTK +#else +# define X6TPRINTK3 X6TNOPRINTK +#endif + +/* + * xfrm_tunnel_spi things are for allocating unique id ("spi") + * per xfrm_address_t. + */ +struct xfrm6_tunnel_spi { + struct hlist_node list_byaddr; + struct hlist_node list_byspi; + xfrm_address_t addr; + u32 spi; + atomic_t refcnt; +#ifdef XFRM6_TUNNEL_SPI_MAGIC + u32 magic; +#endif +}; + +#ifdef CONFIG_IPV6_XFRM6_TUNNEL_DEBUG +# define XFRM6_TUNNEL_SPI_MAGIC 0xdeadbeef +#endif + +static DEFINE_RWLOCK(xfrm6_tunnel_spi_lock); + +static u32 xfrm6_tunnel_spi; + +#define XFRM6_TUNNEL_SPI_MIN 1 +#define XFRM6_TUNNEL_SPI_MAX 0xffffffff + +static kmem_cache_t *xfrm6_tunnel_spi_kmem; + +#define XFRM6_TUNNEL_SPI_BYADDR_HSIZE 256 +#define XFRM6_TUNNEL_SPI_BYSPI_HSIZE 256 + +static struct hlist_head xfrm6_tunnel_spi_byaddr[XFRM6_TUNNEL_SPI_BYADDR_HSIZE]; +static struct hlist_head xfrm6_tunnel_spi_byspi[XFRM6_TUNNEL_SPI_BYSPI_HSIZE]; + +#ifdef XFRM6_TUNNEL_SPI_MAGIC +static int x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, + const char *name) +{ + if (unlikely(x6spi->magic != XFRM6_TUNNEL_SPI_MAGIC)) { + X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " + "at %p has corrupted magic %08x " + "(should be %08x)\n", + name, x6spi, x6spi->magic, XFRM6_TUNNEL_SPI_MAGIC); + return -1; + } + return 0; +} +#else +static int inline x6spi_check_magic(const struct xfrm6_tunnel_spi *x6spi, + const char *name) +{ + return 0; +} +#endif + +#define X6SPI_CHECK_MAGIC(x6spi) x6spi_check_magic((x6spi), __FUNCTION__) + + +static unsigned inline xfrm6_tunnel_spi_hash_byaddr(xfrm_address_t *addr) +{ + unsigned h; + + X6TPRINTK3(KERN_DEBUG "%s(addr=%p)\n", __FUNCTION__, addr); + + h = addr->a6[0] ^ addr->a6[1] ^ addr->a6[2] ^ addr->a6[3]; + h ^= h >> 16; + h ^= h >> 8; + h &= XFRM6_TUNNEL_SPI_BYADDR_HSIZE - 1; + + X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, h); + + return h; +} + +static unsigned inline xfrm6_tunnel_spi_hash_byspi(u32 spi) +{ + return spi % XFRM6_TUNNEL_SPI_BYSPI_HSIZE; +} + + +static int xfrm6_tunnel_spi_init(void) +{ + int i; + + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + xfrm6_tunnel_spi = 0; + xfrm6_tunnel_spi_kmem = kmem_cache_create("xfrm6_tunnel_spi", + sizeof(struct xfrm6_tunnel_spi), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!xfrm6_tunnel_spi_kmem) { + X6TPRINTK1(KERN_ERR + "%s(): failed to allocate xfrm6_tunnel_spi_kmem\n", + __FUNCTION__); + 
return -ENOMEM; + } + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byaddr[i]); + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) + INIT_HLIST_HEAD(&xfrm6_tunnel_spi_byspi[i]); + return 0; +} + +static void xfrm6_tunnel_spi_fini(void) +{ + int i; + + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + for (i = 0; i < XFRM6_TUNNEL_SPI_BYADDR_HSIZE; i++) { + if (!hlist_empty(&xfrm6_tunnel_spi_byaddr[i])) + goto err; + } + for (i = 0; i < XFRM6_TUNNEL_SPI_BYSPI_HSIZE; i++) { + if (!hlist_empty(&xfrm6_tunnel_spi_byspi[i])) + goto err; + } + kmem_cache_destroy(xfrm6_tunnel_spi_kmem); + xfrm6_tunnel_spi_kmem = NULL; + return; +err: + X6TPRINTK1(KERN_ERR "%s(): table is not empty\n", __FUNCTION__); + return; +} + +static struct xfrm6_tunnel_spi *__xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + hlist_for_each_entry(x6spi, pos, + &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + X6SPI_CHECK_MAGIC(x6spi); + X6TPRINTK3(KERN_DEBUG "%s() = %p(%u)\n", __FUNCTION__, x6spi, x6spi->spi); + return x6spi; + } + } + + X6TPRINTK3(KERN_DEBUG "%s() = NULL(0)\n", __FUNCTION__); + return NULL; +} + +u32 xfrm6_tunnel_spi_lookup(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + read_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(saddr); + spi = x6spi ? x6spi->spi : 0; + read_unlock_bh(&xfrm6_tunnel_spi_lock); + return spi; +} + +EXPORT_SYMBOL(xfrm6_tunnel_spi_lookup); + +static u32 __xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +{ + u32 spi; + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos; + unsigned index; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + if (xfrm6_tunnel_spi < XFRM6_TUNNEL_SPI_MIN || + xfrm6_tunnel_spi >= XFRM6_TUNNEL_SPI_MAX) + xfrm6_tunnel_spi = XFRM6_TUNNEL_SPI_MIN; + else + xfrm6_tunnel_spi++; + + for (spi = xfrm6_tunnel_spi; spi <= XFRM6_TUNNEL_SPI_MAX; spi++) { + index = xfrm6_tunnel_spi_hash_byspi(spi); + hlist_for_each_entry(x6spi, pos, + &xfrm6_tunnel_spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + goto try_next_1; + } + xfrm6_tunnel_spi = spi; + goto alloc_spi; +try_next_1:; + } + for (spi = XFRM6_TUNNEL_SPI_MIN; spi < xfrm6_tunnel_spi; spi++) { + index = xfrm6_tunnel_spi_hash_byspi(spi); + hlist_for_each_entry(x6spi, pos, + &xfrm6_tunnel_spi_byspi[index], + list_byspi) { + if (x6spi->spi == spi) + goto try_next_2; + } + xfrm6_tunnel_spi = spi; + goto alloc_spi; +try_next_2:; + } + spi = 0; + goto out; +alloc_spi: + X6TPRINTK3(KERN_DEBUG "%s(): allocate new spi for " + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + __FUNCTION__, + NIP6(*(struct in6_addr *)saddr)); + x6spi = kmem_cache_alloc(xfrm6_tunnel_spi_kmem, SLAB_ATOMIC); + if (!x6spi) { + X6TPRINTK1(KERN_ERR "%s(): kmem_cache_alloc() failed\n", + __FUNCTION__); + goto out; + } +#ifdef XFRM6_TUNNEL_SPI_MAGIC + x6spi->magic = XFRM6_TUNNEL_SPI_MAGIC; +#endif + memcpy(&x6spi->addr, saddr, sizeof(x6spi->addr)); + x6spi->spi = spi; + atomic_set(&x6spi->refcnt, 1); + + hlist_add_head(&x6spi->list_byspi, &xfrm6_tunnel_spi_byspi[index]); + + index = xfrm6_tunnel_spi_hash_byaddr(saddr); + hlist_add_head(&x6spi->list_byaddr, &xfrm6_tunnel_spi_byaddr[index]); + X6SPI_CHECK_MAGIC(x6spi); +out: + X6TPRINTK3(KERN_DEBUG 
"%s() = %u\n", __FUNCTION__, spi); + return spi; +} + +u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + u32 spi; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + write_lock_bh(&xfrm6_tunnel_spi_lock); + x6spi = __xfrm6_tunnel_spi_lookup(saddr); + if (x6spi) { + atomic_inc(&x6spi->refcnt); + spi = x6spi->spi; + } else + spi = __xfrm6_tunnel_alloc_spi(saddr); + write_unlock_bh(&xfrm6_tunnel_spi_lock); + + X6TPRINTK3(KERN_DEBUG "%s() = %u\n", __FUNCTION__, spi); + + return spi; +} + +EXPORT_SYMBOL(xfrm6_tunnel_alloc_spi); + +void xfrm6_tunnel_free_spi(xfrm_address_t *saddr) +{ + struct xfrm6_tunnel_spi *x6spi; + struct hlist_node *pos, *n; + + X6TPRINTK3(KERN_DEBUG "%s(saddr=%p)\n", __FUNCTION__, saddr); + + write_lock_bh(&xfrm6_tunnel_spi_lock); + + hlist_for_each_entry_safe(x6spi, pos, n, + &xfrm6_tunnel_spi_byaddr[xfrm6_tunnel_spi_hash_byaddr(saddr)], + list_byaddr) + { + if (memcmp(&x6spi->addr, saddr, sizeof(x6spi->addr)) == 0) { + X6TPRINTK3(KERN_DEBUG "%s(): x6spi object " + "for %04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " + "found at %p\n", + __FUNCTION__, + NIP6(*(struct in6_addr *)saddr), + x6spi); + X6SPI_CHECK_MAGIC(x6spi); + if (atomic_dec_and_test(&x6spi->refcnt)) { + hlist_del(&x6spi->list_byaddr); + hlist_del(&x6spi->list_byspi); + kmem_cache_free(xfrm6_tunnel_spi_kmem, x6spi); + break; + } + } + } + write_unlock_bh(&xfrm6_tunnel_spi_lock); +} + +EXPORT_SYMBOL(xfrm6_tunnel_free_spi); + +static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ipv6hdr *top_iph; + + top_iph = (struct ipv6hdr *)skb->data; + top_iph->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); + + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct xfrm_decap_state *decap, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm6_tunnel *xfrm6_tunnel_handler; +static DECLARE_MUTEX(xfrm6_tunnel_sem); + +int xfrm6_tunnel_register(struct xfrm6_tunnel *handler) +{ + int ret; + + down(&xfrm6_tunnel_sem); + ret = 0; + if (xfrm6_tunnel_handler != NULL) + ret = -EINVAL; + if (!ret) + xfrm6_tunnel_handler = handler; + up(&xfrm6_tunnel_sem); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_register); + +int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler) +{ + int ret; + + down(&xfrm6_tunnel_sem); + ret = 0; + if (xfrm6_tunnel_handler != handler) + ret = -EINVAL; + if (!ret) + xfrm6_tunnel_handler = NULL; + up(&xfrm6_tunnel_sem); + + synchronize_net(); + + return ret; +} + +EXPORT_SYMBOL(xfrm6_tunnel_deregister); + +static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + struct sk_buff *skb = *pskb; + struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; + struct ipv6hdr *iph = skb->nh.ipv6h; + u32 spi; + + /* device-like_ip6ip6_handler() */ + if (handler && handler->handler(pskb, nhoffp) == 0) + return 0; + + spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr); + return xfrm6_rcv_spi(pskb, nhoffp, spi); +} + +static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct xfrm6_tunnel *handler = xfrm6_tunnel_handler; + + /* call here first for device-like ip6ip6 err handling */ + if (handler) { + handler->err_handler(skb, opt, type, code, offset, info); + return; + } + + /* xfrm6_tunnel native err handling */ + switch (type) { + case ICMPV6_DEST_UNREACH: + switch (code) { + case ICMPV6_NOROUTE: + case ICMPV6_ADM_PROHIBITED: + case ICMPV6_NOT_NEIGHBOUR: + case ICMPV6_ADDR_UNREACH: + case 
ICMPV6_PORT_UNREACH: + default: + X6TPRINTK3(KERN_DEBUG + "xfrm6_tunnel: Destination Unreach.\n"); + break; + } + break; + case ICMPV6_PKT_TOOBIG: + X6TPRINTK3(KERN_DEBUG + "xfrm6_tunnel: Packet Too Big.\n"); + break; + case ICMPV6_TIME_EXCEED: + switch (code) { + case ICMPV6_EXC_HOPLIMIT: + X6TPRINTK3(KERN_DEBUG + "xfrm6_tunnel: Too small Hoplimit.\n"); + break; + case ICMPV6_EXC_FRAGTIME: + default: + break; + } + break; + case ICMPV6_PARAMPROB: + switch (code) { + case ICMPV6_HDR_FIELD: break; + case ICMPV6_UNK_NEXTHDR: break; + case ICMPV6_UNK_OPTION: break; + } + break; + default: + break; + } + return; +} + +static int xfrm6_tunnel_init_state(struct xfrm_state *x, void *args) +{ + if (!x->props.mode) + return -EINVAL; + + if (x->encap) + return -EINVAL; + + x->props.header_len = sizeof(struct ipv6hdr); + + return 0; +} + +static void xfrm6_tunnel_destroy(struct xfrm_state *x) +{ + xfrm6_tunnel_free_spi((xfrm_address_t *)&x->props.saddr); +} + +static struct xfrm_type xfrm6_tunnel_type = { + .description = "IP6IP6", + .owner = THIS_MODULE, + .proto = IPPROTO_IPV6, + .init_state = xfrm6_tunnel_init_state, + .destructor = xfrm6_tunnel_destroy, + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, +}; + +static struct inet6_protocol xfrm6_tunnel_protocol = { + .handler = xfrm6_tunnel_rcv, + .err_handler = xfrm6_tunnel_err, + .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, +}; + +static int __init xfrm6_tunnel_init(void) +{ + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + if (xfrm_register_type(&xfrm6_tunnel_type, AF_INET6) < 0) { + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel init: can't add xfrm type\n"); + return -EAGAIN; + } + if (inet6_add_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0) { + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel init(): can't add protocol\n"); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + return -EAGAIN; + } + if (xfrm6_tunnel_spi_init() < 0) { + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel init: failed to initialize spi\n"); + inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6); + xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6); + return -EAGAIN; + } + return 0; +} + +static void __exit xfrm6_tunnel_fini(void) +{ + X6TPRINTK3(KERN_DEBUG "%s()\n", __FUNCTION__); + + xfrm6_tunnel_spi_fini(); + if (inet6_del_protocol(&xfrm6_tunnel_protocol, IPPROTO_IPV6) < 0) + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel close: can't remove protocol\n"); + if (xfrm_unregister_type(&xfrm6_tunnel_type, AF_INET6) < 0) + X6TPRINTK1(KERN_ERR + "xfrm6_tunnel close: can't remove xfrm type\n"); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_fini); +MODULE_LICENSE("GPL"); diff --git a/net/ipx/ChangeLog b/net/ipx/ChangeLog new file mode 100644 index 000000000000..3b29763751a3 --- /dev/null +++ b/net/ipx/ChangeLog @@ -0,0 +1,101 @@ + Revision 0.21: Uses the new generic socket option code. + + Revision 0.22: Gcc clean ups and drop out device registration. Use the + new multi-protocol edition of hard_header + + Revision 0.23: IPX /proc by Mark Evans. Adding a route will + will overwrite any existing route to the same network. + + Revision 0.24: Supports new /proc with no 4K limit + + Revision 0.25: Add ephemeral sockets, passive local network + identification, support for local net 0 and + multiple datalinks + + Revision 0.26: Device drop kills IPX routes via it. 
(needed for module) + + Revision 0.27: Autobind + + Revision 0.28: Small fix for multiple local networks + + Revision 0.29: Assorted major errors removed + Small correction to promisc mode error fix + Asynchronous I/O support. Changed to use notifiers + and the newer packet_type stuff. Assorted major + fixes + + Revision 0.30: Moved to net/ipx/... + Don't set address length on recvfrom that errors. + Incorrect verify_area. + + Revision 0.31: New sk_buffs. This still needs a lot of + testing. + + Revision 0.32: Using sock_alloc_send_skb, firewall hooks. + Supports sendmsg/recvmsg + + Revision 0.33: Internal network support, routing changes, uses a + protocol private area for ipx data. + + Revision 0.34: Module support. + + Revision 0.35: Checksum support. , hooked in by + Handles WIN95 discovery packets + + Revision 0.36: Internal bump up for 2.1 + + Revision 0.37: Began adding POSIXisms. + + Revision 0.38: Asynchronous socket stuff made current. + + Revision 0.39: SPX interfaces + + Revision 0.40: Tiny SIOCGSTAMP fix (chris@cybernet.co.nz) + + Revision 0.41: 802.2TR removed (p.norton@computer.org) + Fixed connecting to primary net, + Automatic binding on send & receive, + Martijn van Oosterhout + + Revision 042: Multithreading - use spinlocks and refcounting to + protect some structures: ipx_interface sock list, list + of ipx interfaces, etc. + Bugfixes - do refcounting on net_devices, check function + results, etc. Thanks to davem and freitag for + suggestions and guidance. + Arnaldo Carvalho de Melo , + November, 2000 + + Revision 043: Shared SKBs, don't mangle packets, some cleanups + Arnaldo Carvalho de Melo , + December, 2000 + + Revision 044: Call ipxitf_hold on NETDEV_UP - acme + + Revision 045: fix PPROP routing bug - acme + + Revision 046: Further fixes to PPROP, ipxitf_create_internal was + doing an unneeded MOD_INC_USE_COUNT, implement + sysctl for ipx_pprop_broacasting, fix the ipx sysctl + handling, making it dynamic, some cleanups, thanks to + Petr Vandrovec for review and good suggestions. (acme) + + Revision 047: Cleanups, CodingStyle changes, move the ncp connection + hack out of line - acme + + Revision 048: Use sk->protinfo to store the pointer to IPX private + area, remove af_ipx from sk->protinfo and move ipx_opt + to include/net/ipx.h, use IPX_SK like DecNET, etc - acme + + Revision 049: SPX support dropped, see comment in ipx_create - acme + + Revision 050: Use seq_file for proc stuff, moving it to ipx_proc.c - acme + +Other fixes: + + Protect the module by a MOD_INC_USE_COUNT/MOD_DEC_USE_COUNT pair. Also, now + usage count is managed this way: + -Count one if the auto_interface mode is on + -Count one per configured interface + + Jacques Gelinas (jacques@solucorp.qc.ca) diff --git a/net/ipx/Kconfig b/net/ipx/Kconfig new file mode 100644 index 000000000000..a16237c0e783 --- /dev/null +++ b/net/ipx/Kconfig @@ -0,0 +1,31 @@ +# +# IPX configuration +# +config IPX_INTERN + bool "IPX: Full internal IPX network" + depends on IPX + ---help--- + Every IPX network has an address that identifies it. Sometimes it is + useful to give an IPX "network" address to your Linux box as well + (for example if your box is acting as a file server for different + IPX networks: it will then be accessible from everywhere using the + same address). The way this is done is to create a virtual internal + "network" inside your box and to assign an IPX address to this + network. Say Y here if you want to do this; read the IPX-HOWTO at + for details. 
+ + The full internal IPX network enables you to allocate sockets on + different virtual nodes of the internal network. This is done by + evaluating the field sipx_node of the socket address given to the + bind call. So applications should always initialize the node field + to 0 when binding a socket on the primary network. In this case the + socket is assigned the default node that has been given to the + kernel when the internal network was created. By enabling the full + internal IPX network the cross-forwarding of packets targeted at + 'special' sockets to sockets listening on the primary network is + disabled. This might break existing applications, especially RIP/SAP + daemons. A RIP/SAP daemon that works well with the full internal net + can be found on . + + If you don't know what you are doing, say N. + diff --git a/net/ipx/Makefile b/net/ipx/Makefile new file mode 100644 index 000000000000..4b95e3ea0f8b --- /dev/null +++ b/net/ipx/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux IPX layer. +# + +obj-$(CONFIG_IPX) += ipx.o + +ipx-y := af_ipx.o ipx_route.o ipx_proc.o +ipx-$(CONFIG_SYSCTL) += sysctl_net_ipx.o diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c new file mode 100644 index 000000000000..5a27e5df5886 --- /dev/null +++ b/net/ipx/af_ipx.c @@ -0,0 +1,2024 @@ +/* + * Implements an IPX socket layer. + * + * This code is derived from work by + * Ross Biro : Writing the original IP stack + * Fred Van Kempen : Tidying up the TCP/IP + * + * Many thanks go to Keith Baker, Institute For Industrial Information + * Technology Ltd, Swansea University for allowing me to work on this + * in my own time even though it was in some ways related to commercial + * work I am currently employed to do there. + * + * All the material in this file is subject to the Gnu license version 2. + * Neither Alan Cox nor the Swansea University Computer Society admit + * liability nor provide warranty for any of this software. This material + * is provided as is and at no charge. + * + * Portions Copyright (c) 2000-2003 Conectiva, Inc. + * Neither Arnaldo Carvalho de Melo nor Conectiva, Inc. admit liability nor + * provide warranty for any of this software. This material is provided + * "AS-IS" and at no charge. + * + * Portions Copyright (c) 1995 Caldera, Inc. + * Neither Greg Page nor Caldera, Inc. admit liability nor provide + * warranty for any of this software. This material is provided + * "AS-IS" and at no charge. + * + * See net/ipx/ChangeLog. 
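+ *
+ * A note tied to the IPX_INTERN help text above: with sipx_node left as
+ * zero, binding on the primary (internal) network from user space looks
+ * roughly like the following (assuming the sockaddr_ipx definition from
+ * netipx/ipx.h; this is an illustration, not code from this file):
+ *
+ *	int s = socket(AF_IPX, SOCK_DGRAM, 0);
+ *	struct sockaddr_ipx addr;
+ *
+ *	memset(&addr, 0, sizeof(addr));
+ *	addr.sipx_family = AF_IPX;
+ *	bind(s, (struct sockaddr *)&addr, sizeof(addr));
+ *
+ * A zero sipx_network selects the primary network, a zero sipx_port asks
+ * the kernel for an ephemeral socket number, and the all-zero node gets
+ * the default node of the internal network.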
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#ifdef CONFIG_SYSCTL +extern void ipx_register_sysctl(void); +extern void ipx_unregister_sysctl(void); +#else +#define ipx_register_sysctl() +#define ipx_unregister_sysctl() +#endif + +/* Configuration Variables */ +static unsigned char ipxcfg_max_hops = 16; +static char ipxcfg_auto_select_primary; +static char ipxcfg_auto_create_interfaces; +int sysctl_ipx_pprop_broadcasting = 1; + +/* Global Variables */ +static struct datalink_proto *p8022_datalink; +static struct datalink_proto *pEII_datalink; +static struct datalink_proto *p8023_datalink; +static struct datalink_proto *pSNAP_datalink; + +static struct proto_ops ipx_dgram_ops; + +LIST_HEAD(ipx_interfaces); +DEFINE_SPINLOCK(ipx_interfaces_lock); + +struct ipx_interface *ipx_primary_net; +struct ipx_interface *ipx_internal_net; + +extern int ipxrtr_add_route(__u32 network, struct ipx_interface *intrfc, + unsigned char *node); +extern void ipxrtr_del_routes(struct ipx_interface *intrfc); +extern int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, + struct iovec *iov, int len, int noblock); +extern int ipxrtr_route_skb(struct sk_buff *skb); +extern struct ipx_route *ipxrtr_lookup(__u32 net); +extern int ipxrtr_ioctl(unsigned int cmd, void __user *arg); + +#undef IPX_REFCNT_DEBUG +#ifdef IPX_REFCNT_DEBUG +atomic_t ipx_sock_nr; +#endif + +struct ipx_interface *ipx_interfaces_head(void) +{ + struct ipx_interface *rc = NULL; + + if (!list_empty(&ipx_interfaces)) + rc = list_entry(ipx_interfaces.next, + struct ipx_interface, node); + return rc; +} + +static void ipxcfg_set_auto_select(char val) +{ + ipxcfg_auto_select_primary = val; + if (val && !ipx_primary_net) + ipx_primary_net = ipx_interfaces_head(); +} + +static int ipxcfg_get_config_data(struct ipx_config_data __user *arg) +{ + struct ipx_config_data vals; + + vals.ipxcfg_auto_create_interfaces = ipxcfg_auto_create_interfaces; + vals.ipxcfg_auto_select_primary = ipxcfg_auto_select_primary; + + return copy_to_user(arg, &vals, sizeof(vals)) ? -EFAULT : 0; +} + +/* + * Note: Sockets may not be removed _during_ an interrupt or inet_bh + * handler using this technique. They can be added although we do not + * use this facility. + */ + +static void ipx_remove_socket(struct sock *sk) +{ + /* Determine interface with which socket is associated */ + struct ipx_interface *intrfc = ipx_sk(sk)->intrfc; + + if (!intrfc) + goto out; + + ipxitf_hold(intrfc); + spin_lock_bh(&intrfc->if_sklist_lock); + sk_del_node_init(sk); + spin_unlock_bh(&intrfc->if_sklist_lock); + ipxitf_put(intrfc); +out: + return; +} + +static void ipx_destroy_socket(struct sock *sk) +{ + ipx_remove_socket(sk); + skb_queue_purge(&sk->sk_receive_queue); +#ifdef IPX_REFCNT_DEBUG + atomic_dec(&ipx_sock_nr); + printk(KERN_DEBUG "IPX socket %p released, %d are still alive\n", sk, + atomic_read(&ipx_sock_nr)); + if (atomic_read(&sk->sk_refcnt) != 1) + printk(KERN_DEBUG "Destruction sock ipx %p delayed, cnt=%d\n", + sk, atomic_read(&sk->sk_refcnt)); +#endif + sock_put(sk); +} + +/* + * The following code is used to support IPX Interfaces (IPXITF). An + * IPX interface is defined by a physical device and a frame type. 
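+ *
+ * Because one physical device can carry several IPX frame types at the
+ * same time, the interface list is effectively keyed by the pair
+ * (device, datalink type). As a rough sketch (the sketch_* name is
+ * invented), a lookup is simply:
+ *
+ *	static struct ipx_interface *sketch_find(struct net_device *dev,
+ *						 unsigned short dlink_type)
+ *	{
+ *		struct ipx_interface *i;
+ *
+ *		list_for_each_entry(i, &ipx_interfaces, node)
+ *			if (i->if_dev == dev && i->if_dlink_type == dlink_type)
+ *				return i;
+ *		return NULL;
+ *	}
+ *
+ * which is what __ipxitf_find_using_phys() below does under
+ * ipx_interfaces_lock; the ipxitf_find_using_phys() wrapper takes a
+ * reference before dropping the lock.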
+ */ + +/* ipxitf_clear_primary_net has to be called with ipx_interfaces_lock held */ + +static void ipxitf_clear_primary_net(void) +{ + ipx_primary_net = NULL; + if (ipxcfg_auto_select_primary) + ipx_primary_net = ipx_interfaces_head(); +} + +static struct ipx_interface *__ipxitf_find_using_phys(struct net_device *dev, + unsigned short datalink) +{ + struct ipx_interface *i; + + list_for_each_entry(i, &ipx_interfaces, node) + if (i->if_dev == dev && i->if_dlink_type == datalink) + goto out; + i = NULL; +out: + return i; +} + +static struct ipx_interface *ipxitf_find_using_phys(struct net_device *dev, + unsigned short datalink) +{ + struct ipx_interface *i; + + spin_lock_bh(&ipx_interfaces_lock); + i = __ipxitf_find_using_phys(dev, datalink); + if (i) + ipxitf_hold(i); + spin_unlock_bh(&ipx_interfaces_lock); + return i; +} + +struct ipx_interface *ipxitf_find_using_net(__u32 net) +{ + struct ipx_interface *i; + + spin_lock_bh(&ipx_interfaces_lock); + if (net) { + list_for_each_entry(i, &ipx_interfaces, node) + if (i->if_netnum == net) + goto hold; + i = NULL; + goto unlock; + } + + i = ipx_primary_net; + if (i) +hold: + ipxitf_hold(i); +unlock: + spin_unlock_bh(&ipx_interfaces_lock); + return i; +} + +/* Sockets are bound to a particular IPX interface. */ +static void ipxitf_insert_socket(struct ipx_interface *intrfc, struct sock *sk) +{ + ipxitf_hold(intrfc); + spin_lock_bh(&intrfc->if_sklist_lock); + ipx_sk(sk)->intrfc = intrfc; + sk_add_node(sk, &intrfc->if_sklist); + spin_unlock_bh(&intrfc->if_sklist_lock); + ipxitf_put(intrfc); +} + +/* caller must hold intrfc->if_sklist_lock */ +static struct sock *__ipxitf_find_socket(struct ipx_interface *intrfc, + unsigned short port) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &intrfc->if_sklist) + if (ipx_sk(s)->port == port) + goto found; + s = NULL; +found: + return s; +} + +/* caller must hold a reference to intrfc */ +static struct sock *ipxitf_find_socket(struct ipx_interface *intrfc, + unsigned short port) +{ + struct sock *s; + + spin_lock_bh(&intrfc->if_sklist_lock); + s = __ipxitf_find_socket(intrfc, port); + if (s) + sock_hold(s); + spin_unlock_bh(&intrfc->if_sklist_lock); + + return s; +} + +#ifdef CONFIG_IPX_INTERN +static struct sock *ipxitf_find_internal_socket(struct ipx_interface *intrfc, + unsigned char *ipx_node, + unsigned short port) +{ + struct sock *s; + struct hlist_node *node; + + ipxitf_hold(intrfc); + spin_lock_bh(&intrfc->if_sklist_lock); + + sk_for_each(s, node, &intrfc->if_sklist) { + struct ipx_sock *ipxs = ipx_sk(s); + + if (ipxs->port == port && + !memcmp(ipx_node, ipxs->node, IPX_NODE_LEN)) + goto found; + } + s = NULL; +found: + spin_unlock_bh(&intrfc->if_sklist_lock); + ipxitf_put(intrfc); + return s; +} +#endif + +static void __ipxitf_down(struct ipx_interface *intrfc) +{ + struct sock *s; + struct hlist_node *node, *t; + + /* Delete all routes associated with this interface */ + ipxrtr_del_routes(intrfc); + + spin_lock_bh(&intrfc->if_sklist_lock); + /* error sockets */ + sk_for_each_safe(s, node, t, &intrfc->if_sklist) { + struct ipx_sock *ipxs = ipx_sk(s); + + s->sk_err = ENOLINK; + s->sk_error_report(s); + ipxs->intrfc = NULL; + ipxs->port = 0; + sock_set_flag(s, SOCK_ZAPPED); /* Indicates it is no longer bound */ + sk_del_node_init(s); + } + INIT_HLIST_HEAD(&intrfc->if_sklist); + spin_unlock_bh(&intrfc->if_sklist_lock); + + /* remove this interface from list */ + list_del(&intrfc->node); + + /* remove this interface from *special* networks */ + if (intrfc == ipx_primary_net) + 
ipxitf_clear_primary_net(); + if (intrfc == ipx_internal_net) + ipx_internal_net = NULL; + + if (intrfc->if_dev) + dev_put(intrfc->if_dev); + kfree(intrfc); +} + +void ipxitf_down(struct ipx_interface *intrfc) +{ + spin_lock_bh(&ipx_interfaces_lock); + __ipxitf_down(intrfc); + spin_unlock_bh(&ipx_interfaces_lock); +} + +static __inline__ void __ipxitf_put(struct ipx_interface *intrfc) +{ + if (atomic_dec_and_test(&intrfc->refcnt)) + __ipxitf_down(intrfc); +} + +static int ipxitf_device_event(struct notifier_block *notifier, + unsigned long event, void *ptr) +{ + struct net_device *dev = ptr; + struct ipx_interface *i, *tmp; + + if (event != NETDEV_DOWN && event != NETDEV_UP) + goto out; + + spin_lock_bh(&ipx_interfaces_lock); + list_for_each_entry_safe(i, tmp, &ipx_interfaces, node) + if (i->if_dev == dev) { + if (event == NETDEV_UP) + ipxitf_hold(i); + else + __ipxitf_put(i); + } + spin_unlock_bh(&ipx_interfaces_lock); +out: + return NOTIFY_DONE; +} + + +static __exit void ipxitf_cleanup(void) +{ + struct ipx_interface *i, *tmp; + + spin_lock_bh(&ipx_interfaces_lock); + list_for_each_entry_safe(i, tmp, &ipx_interfaces, node) + __ipxitf_put(i); + spin_unlock_bh(&ipx_interfaces_lock); +} + +static void ipxitf_def_skb_handler(struct sock *sock, struct sk_buff *skb) +{ + if (sock_queue_rcv_skb(sock, skb) < 0) + kfree_skb(skb); +} + +/* + * On input skb->sk is NULL. Nobody is charged for the memory. + */ + +/* caller must hold a reference to intrfc */ + +#ifdef CONFIG_IPX_INTERN +static int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + int is_broadcast = !memcmp(ipx->ipx_dest.node, ipx_broadcast_node, + IPX_NODE_LEN); + struct sock *s; + struct hlist_node *node; + int rc; + + spin_lock_bh(&intrfc->if_sklist_lock); + + sk_for_each(s, node, &intrfc->if_sklist) { + struct ipx_sock *ipxs = ipx_sk(s); + + if (ipxs->port == ipx->ipx_dest.sock && + (is_broadcast || !memcmp(ipx->ipx_dest.node, + ipxs->node, IPX_NODE_LEN))) { + /* We found a socket to which to send */ + struct sk_buff *skb1; + + if (copy) { + skb1 = skb_clone(skb, GFP_ATOMIC); + rc = -ENOMEM; + if (!skb1) + goto out; + } else { + skb1 = skb; + copy = 1; /* skb may only be used once */ + } + ipxitf_def_skb_handler(s, skb1); + + /* On an external interface, one socket can listen */ + if (intrfc != ipx_internal_net) + break; + } + } + + /* skb was solely for us, and we did not make a copy, so free it. */ + if (!copy) + kfree_skb(skb); + + rc = 0; +out: + spin_unlock_bh(&intrfc->if_sklist_lock); + return rc; +} +#else +static struct sock *ncp_connection_hack(struct ipx_interface *intrfc, + struct ipxhdr *ipx) +{ + /* The packet's target is a NCP connection handler. We want to hand it + * to the correct socket directly within the kernel, so that the + * mars_nwe packet distribution process does not have to do it. Here we + * only care about NCP and BURST packets. + * + * You might call this a hack, but believe me, you do not want a + * complete NCP layer in the kernel, and this is VERY fast as well. 
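+ *
+ * One more note on the CONFIG_IPX_INTERN variant of ipxitf_demux_socket()
+ * above: when the caller does not need the buffer afterwards (copy == 0),
+ * it hands the original skb to the first matching socket, clones it for
+ * any further listeners, and frees it only if nobody wanted it. In
+ * isolation the pattern looks roughly like this (sketch_* names invented,
+ * error handling trimmed):
+ *
+ *	static void sketch_demux(struct sock **socks, int n, struct sk_buff *skb)
+ *	{
+ *		int used_original = 0, i;
+ *
+ *		for (i = 0; i < n; i++) {
+ *			struct sk_buff *part = used_original ?
+ *					skb_clone(skb, GFP_ATOMIC) : skb;
+ *
+ *			used_original = 1;
+ *			if (part && sock_queue_rcv_skb(socks[i], part) < 0)
+ *				kfree_skb(part);
+ *		}
+ *		if (!used_original)
+ *			kfree_skb(skb);
+ *	}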
*/ + struct sock *sk = NULL; + int connection = 0; + u8 *ncphdr = (u8 *)(ipx + 1); + + if (*ncphdr == 0x22 && *(ncphdr + 1) == 0x22) /* NCP request */ + connection = (((int) *(ncphdr + 5)) << 8) | (int) *(ncphdr + 3); + else if (*ncphdr == 0x77 && *(ncphdr + 1) == 0x77) /* BURST packet */ + connection = (((int) *(ncphdr + 9)) << 8) | (int) *(ncphdr + 8); + + if (connection) { + struct hlist_node *node; + /* Now we have to look for a special NCP connection handling + * socket. Only these sockets have ipx_ncp_conn != 0, set by + * SIOCIPXNCPCONN. */ + spin_lock_bh(&intrfc->if_sklist_lock); + sk_for_each(sk, node, &intrfc->if_sklist) + if (ipx_sk(sk)->ipx_ncp_conn == connection) { + sock_hold(sk); + goto found; + } + sk = NULL; + found: + spin_unlock_bh(&intrfc->if_sklist_lock); + } + return sk; +} + +static int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + struct sock *sock1 = NULL, *sock2 = NULL; + struct sk_buff *skb1 = NULL, *skb2 = NULL; + int rc; + + if (intrfc == ipx_primary_net && ntohs(ipx->ipx_dest.sock) == 0x451) + sock1 = ncp_connection_hack(intrfc, ipx); + if (!sock1) + /* No special socket found, forward the packet the normal way */ + sock1 = ipxitf_find_socket(intrfc, ipx->ipx_dest.sock); + + /* + * We need to check if there is a primary net and if + * this is addressed to one of the *SPECIAL* sockets because + * these need to be propagated to the primary net. + * The *SPECIAL* socket list contains: 0x452(SAP), 0x453(RIP) and + * 0x456(Diagnostic). + */ + + if (ipx_primary_net && intrfc != ipx_primary_net) { + const int dsock = ntohs(ipx->ipx_dest.sock); + + if (dsock == 0x452 || dsock == 0x453 || dsock == 0x456) + /* The appropriate thing to do here is to dup the + * packet and route to the primary net interface via + * ipxitf_send; however, we'll cheat and just demux it + * here. */ + sock2 = ipxitf_find_socket(ipx_primary_net, + ipx->ipx_dest.sock); + } + + /* + * If there is nothing to do return. The kfree will cancel any charging. + */ + rc = 0; + if (!sock1 && !sock2) { + if (!copy) + kfree_skb(skb); + goto out; + } + + /* + * This next segment of code is a little awkward, but it sets it up + * so that the appropriate number of copies of the SKB are made and + * that skb1 and skb2 point to it (them) so that it (they) can be + * demuxed to sock1 and/or sock2. If we are unable to make enough + * copies, we do as much as is possible. + */ + + if (copy) + skb1 = skb_clone(skb, GFP_ATOMIC); + else + skb1 = skb; + + rc = -ENOMEM; + if (!skb1) + goto out_put; + + /* Do we need 2 SKBs? 
*/ + if (sock1 && sock2) + skb2 = skb_clone(skb1, GFP_ATOMIC); + else + skb2 = skb1; + + if (sock1) + ipxitf_def_skb_handler(sock1, skb1); + + if (!skb2) + goto out_put; + + if (sock2) + ipxitf_def_skb_handler(sock2, skb2); + + rc = 0; +out_put: + if (sock1) + sock_put(sock1); + if (sock2) + sock_put(sock2); +out: + return rc; +} +#endif /* CONFIG_IPX_INTERN */ + +static struct sk_buff *ipxitf_adjust_skbuff(struct ipx_interface *intrfc, + struct sk_buff *skb) +{ + struct sk_buff *skb2; + int in_offset = (unsigned char *)ipx_hdr(skb) - skb->head; + int out_offset = intrfc->if_ipx_offset; + int len; + + /* Hopefully, most cases */ + if (in_offset >= out_offset) + return skb; + + /* Need new SKB */ + len = skb->len + out_offset; + skb2 = alloc_skb(len, GFP_ATOMIC); + if (skb2) { + skb_reserve(skb2, out_offset); + skb2->nh.raw = skb2->h.raw = skb_put(skb2, skb->len); + memcpy(ipx_hdr(skb2), ipx_hdr(skb), skb->len); + memcpy(skb2->cb, skb->cb, sizeof(skb->cb)); + } + kfree_skb(skb); + return skb2; +} + +/* caller must hold a reference to intrfc and the skb has to be unshared */ +int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, char *node) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + struct net_device *dev = intrfc->if_dev; + struct datalink_proto *dl = intrfc->if_dlink; + char dest_node[IPX_NODE_LEN]; + int send_to_wire = 1; + int addr_len; + + ipx->ipx_tctrl = IPX_SKB_CB(skb)->ipx_tctrl; + ipx->ipx_dest.net = IPX_SKB_CB(skb)->ipx_dest_net; + ipx->ipx_source.net = IPX_SKB_CB(skb)->ipx_source_net; + + /* see if we need to include the netnum in the route list */ + if (IPX_SKB_CB(skb)->last_hop.index >= 0) { + u32 *last_hop = (u32 *)(((u8 *) skb->data) + + sizeof(struct ipxhdr) + + IPX_SKB_CB(skb)->last_hop.index * + sizeof(u32)); + *last_hop = IPX_SKB_CB(skb)->last_hop.netnum; + IPX_SKB_CB(skb)->last_hop.index = -1; + } + + /* + * We need to know how many skbuffs it will take to send out this + * packet to avoid unnecessary copies. + */ + + if (!dl || !dev || dev->flags & IFF_LOOPBACK) + send_to_wire = 0; /* No non looped */ + + /* + * See if this should be demuxed to sockets on this interface + * + * We want to ensure the original was eaten or that we only use + * up clones. + */ + + if (ipx->ipx_dest.net == intrfc->if_netnum) { + /* + * To our own node, loop and free the original. + * The internal net will receive on all node address. + */ + if (intrfc == ipx_internal_net || + !memcmp(intrfc->if_node, node, IPX_NODE_LEN)) { + /* Don't charge sender */ + skb_orphan(skb); + + /* Will charge receiver */ + return ipxitf_demux_socket(intrfc, skb, 0); + } + + /* Broadcast, loop and possibly keep to send on. */ + if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) { + if (!send_to_wire) + skb_orphan(skb); + ipxitf_demux_socket(intrfc, skb, send_to_wire); + if (!send_to_wire) + goto out; + } + } + + /* + * If the originating net is not equal to our net; this is routed + * We are still charging the sender. Which is right - the driver + * free will handle this fairly. 
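+ *
+ * A side note on ipxitf_adjust_skbuff() above: it only reallocates when
+ * the buffer lacks headroom for this interface's link-layer headers, i.e.
+ * the test reduces to the following sketch (the sketch_* name is
+ * invented):
+ *
+ *	static int sketch_needs_new_skb(struct sk_buff *skb,
+ *					struct ipx_interface *intrfc)
+ *	{
+ *		int in_offset = (unsigned char *)ipx_hdr(skb) - skb->head;
+ *
+ *		return in_offset < intrfc->if_ipx_offset;
+ *	}
+ *
+ * When a copy is needed, if_ipx_offset bytes are reserved up front so the
+ * datalink layer can push its header later without another reallocation.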
+ */ + if (ipx->ipx_source.net != intrfc->if_netnum) { + /* + * Unshare the buffer before modifying the count in + * case it's a flood or tcpdump + */ + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + if (++ipx->ipx_tctrl > ipxcfg_max_hops) + send_to_wire = 0; + } + + if (!send_to_wire) { + kfree_skb(skb); + goto out; + } + + /* Determine the appropriate hardware address */ + addr_len = dev->addr_len; + if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN)) + memcpy(dest_node, dev->broadcast, addr_len); + else + memcpy(dest_node, &(node[IPX_NODE_LEN-addr_len]), addr_len); + + /* Make any compensation for differing physical/data link size */ + skb = ipxitf_adjust_skbuff(intrfc, skb); + if (!skb) + goto out; + + /* set up data link and physical headers */ + skb->dev = dev; + skb->protocol = htons(ETH_P_IPX); + + /* Send it out */ + dl->request(dl, skb, dest_node); +out: + return 0; +} + +static int ipxitf_add_local_route(struct ipx_interface *intrfc) +{ + return ipxrtr_add_route(intrfc->if_netnum, intrfc, NULL); +} + +static void ipxitf_discover_netnum(struct ipx_interface *intrfc, + struct sk_buff *skb); +static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb); + +static int ipxitf_rcv(struct ipx_interface *intrfc, struct sk_buff *skb) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + int rc = 0; + + ipxitf_hold(intrfc); + + /* See if we should update our network number */ + if (!intrfc->if_netnum) /* net number of intrfc not known yet */ + ipxitf_discover_netnum(intrfc, skb); + + IPX_SKB_CB(skb)->last_hop.index = -1; + if (ipx->ipx_type == IPX_TYPE_PPROP) { + rc = ipxitf_pprop(intrfc, skb); + if (rc) + goto out_free_skb; + } + + /* local processing follows */ + if (!IPX_SKB_CB(skb)->ipx_dest_net) + IPX_SKB_CB(skb)->ipx_dest_net = intrfc->if_netnum; + if (!IPX_SKB_CB(skb)->ipx_source_net) + IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum; + + /* it doesn't make sense to route a pprop packet, there's no meaning + * in the ipx_dest_net for such packets */ + if (ipx->ipx_type != IPX_TYPE_PPROP && + intrfc->if_netnum != IPX_SKB_CB(skb)->ipx_dest_net) { + /* We only route point-to-point packets. */ + if (skb->pkt_type == PACKET_HOST) { + skb = skb_unshare(skb, GFP_ATOMIC); + if (skb) + rc = ipxrtr_route_skb(skb); + goto out_intrfc; + } + + goto out_free_skb; + } + + /* see if we should keep it */ + if (!memcmp(ipx_broadcast_node, ipx->ipx_dest.node, IPX_NODE_LEN) || + !memcmp(intrfc->if_node, ipx->ipx_dest.node, IPX_NODE_LEN)) { + rc = ipxitf_demux_socket(intrfc, skb, 0); + goto out_intrfc; + } + + /* we couldn't pawn it off so unload it */ +out_free_skb: + kfree_skb(skb); +out_intrfc: + ipxitf_put(intrfc); + return rc; +} + +static void ipxitf_discover_netnum(struct ipx_interface *intrfc, + struct sk_buff *skb) +{ + const struct ipx_cb *cb = IPX_SKB_CB(skb); + + /* see if this is an intra packet: source_net == dest_net */ + if (cb->ipx_source_net == cb->ipx_dest_net && cb->ipx_source_net) { + struct ipx_interface *i = + ipxitf_find_using_net(cb->ipx_source_net); + /* NB: NetWare servers lie about their hop count so we + * dropped the test based on it. This is the best way + * to determine this is a 0 hop count packet. 
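+ *
+ * Note how ipxitf_send() above maps the 6-byte IPX node onto the device's
+ * hardware address by taking its trailing dev->addr_len bytes, with the
+ * broadcast node mapped to the device broadcast address:
+ *
+ *	if (!memcmp(ipx_broadcast_node, node, IPX_NODE_LEN))
+ *		memcpy(dest_node, dev->broadcast, dev->addr_len);
+ *	else
+ *		memcpy(dest_node, node + IPX_NODE_LEN - dev->addr_len,
+ *		       dev->addr_len);
+ *
+ * so a 6-byte Ethernet address fills the whole node, while shorter
+ * hardware addresses use only its tail.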
*/ + if (!i) { + intrfc->if_netnum = cb->ipx_source_net; + ipxitf_add_local_route(intrfc); + } else { + printk(KERN_WARNING "IPX: Network number collision " + "%lx\n %s %s and %s %s\n", + (unsigned long) htonl(cb->ipx_source_net), + ipx_device_name(i), + ipx_frame_name(i->if_dlink_type), + ipx_device_name(intrfc), + ipx_frame_name(intrfc->if_dlink_type)); + ipxitf_put(i); + } + } +} + +/** + * ipxitf_pprop - Process packet propagation IPX packet type 0x14, used for + * NetBIOS broadcasts + * @intrfc: IPX interface receiving this packet + * @skb: Received packet + * + * Checks if packet is valid: if its more than %IPX_MAX_PPROP_HOPS hops or if it + * is smaller than a IPX header + the room for %IPX_MAX_PPROP_HOPS hops we drop + * it, not even processing it locally, if it has exact %IPX_MAX_PPROP_HOPS we + * don't broadcast it, but process it locally. See chapter 5 of Novell's "IPX + * RIP and SAP Router Specification", Part Number 107-000029-001. + * + * If it is valid, check if we have pprop broadcasting enabled by the user, + * if not, just return zero for local processing. + * + * If it is enabled check the packet and don't broadcast it if we have already + * seen this packet. + * + * Broadcast: send it to the interfaces that aren't on the packet visited nets + * array, just after the IPX header. + * + * Returns -EINVAL for invalid packets, so that the calling function drops + * the packet without local processing. 0 if packet is to be locally processed. + */ +static int ipxitf_pprop(struct ipx_interface *intrfc, struct sk_buff *skb) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + int i, rc = -EINVAL; + struct ipx_interface *ifcs; + char *c; + u32 *l; + + /* Illegal packet - too many hops or too short */ + /* We decide to throw it away: no broadcasting, no local processing. + * NetBIOS unaware implementations route them as normal packets - + * tctrl <= 15, any data payload... */ + if (IPX_SKB_CB(skb)->ipx_tctrl > IPX_MAX_PPROP_HOPS || + ntohs(ipx->ipx_pktsize) < sizeof(struct ipxhdr) + + IPX_MAX_PPROP_HOPS * sizeof(u32)) + goto out; + /* are we broadcasting this damn thing? */ + rc = 0; + if (!sysctl_ipx_pprop_broadcasting) + goto out; + /* We do broadcast packet on the IPX_MAX_PPROP_HOPS hop, but we + * process it locally. All previous hops broadcasted it, and process it + * locally. */ + if (IPX_SKB_CB(skb)->ipx_tctrl == IPX_MAX_PPROP_HOPS) + goto out; + + c = ((u8 *) ipx) + sizeof(struct ipxhdr); + l = (u32 *) c; + + /* Don't broadcast packet if already seen this net */ + for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++) + if (*l++ == intrfc->if_netnum) + goto out; + + /* < IPX_MAX_PPROP_HOPS hops && input interface not in list. Save the + * position where we will insert recvd netnum into list, later on, + * in ipxitf_send */ + IPX_SKB_CB(skb)->last_hop.index = i; + IPX_SKB_CB(skb)->last_hop.netnum = intrfc->if_netnum; + /* xmit on all other interfaces... 
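+ *
+ * The words that follow the IPX header in a type 0x14 packet are the list
+ * of networks the broadcast has already visited, so "have we seen it"
+ * reduces to the following sketch (the sketch_* name is invented):
+ *
+ *	static int sketch_already_visited(struct ipxhdr *ipx, int hops,
+ *					  __u32 netnum)
+ *	{
+ *		__u32 *nets = (__u32 *)(ipx + 1);
+ *		int i;
+ *
+ *		for (i = 0; i < hops; i++)
+ *			if (nets[i] == netnum)
+ *				return 1;
+ *		return 0;
+ *	}
+ *
+ * When the packet is re-broadcast, ipxitf_send() later writes our own
+ * network number into slot last_hop.index of that same list.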
*/ + spin_lock_bh(&ipx_interfaces_lock); + list_for_each_entry(ifcs, &ipx_interfaces, node) { + /* Except unconfigured interfaces */ + if (!ifcs->if_netnum) + continue; + + /* That aren't in the list */ + if (ifcs == intrfc) + continue; + l = (__u32 *) c; + /* don't consider the last entry in the packet list, + * it is our netnum, and it is not there yet */ + for (i = 0; i < IPX_SKB_CB(skb)->ipx_tctrl; i++) + if (ifcs->if_netnum == *l++) + break; + if (i == IPX_SKB_CB(skb)->ipx_tctrl) { + struct sk_buff *s = skb_copy(skb, GFP_ATOMIC); + + if (s) { + IPX_SKB_CB(s)->ipx_dest_net = ifcs->if_netnum; + ipxrtr_route_skb(s); + } + } + } + spin_unlock_bh(&ipx_interfaces_lock); +out: + return rc; +} + +static void ipxitf_insert(struct ipx_interface *intrfc) +{ + spin_lock_bh(&ipx_interfaces_lock); + list_add_tail(&intrfc->node, &ipx_interfaces); + spin_unlock_bh(&ipx_interfaces_lock); + + if (ipxcfg_auto_select_primary && !ipx_primary_net) + ipx_primary_net = intrfc; +} + +static struct ipx_interface *ipxitf_alloc(struct net_device *dev, __u32 netnum, + unsigned short dlink_type, + struct datalink_proto *dlink, + unsigned char internal, + int ipx_offset) +{ + struct ipx_interface *intrfc = kmalloc(sizeof(*intrfc), GFP_ATOMIC); + + if (intrfc) { + intrfc->if_dev = dev; + intrfc->if_netnum = netnum; + intrfc->if_dlink_type = dlink_type; + intrfc->if_dlink = dlink; + intrfc->if_internal = internal; + intrfc->if_ipx_offset = ipx_offset; + intrfc->if_sknum = IPX_MIN_EPHEMERAL_SOCKET; + INIT_HLIST_HEAD(&intrfc->if_sklist); + atomic_set(&intrfc->refcnt, 1); + spin_lock_init(&intrfc->if_sklist_lock); + } + + return intrfc; +} + +static int ipxitf_create_internal(struct ipx_interface_definition *idef) +{ + struct ipx_interface *intrfc; + int rc = -EEXIST; + + /* Only one primary network allowed */ + if (ipx_primary_net) + goto out; + + /* Must have a valid network number */ + rc = -EADDRNOTAVAIL; + if (!idef->ipx_network) + goto out; + intrfc = ipxitf_find_using_net(idef->ipx_network); + rc = -EADDRINUSE; + if (intrfc) { + ipxitf_put(intrfc); + goto out; + } + intrfc = ipxitf_alloc(NULL, idef->ipx_network, 0, NULL, 1, 0); + rc = -EAGAIN; + if (!intrfc) + goto out; + memcpy((char *)&(intrfc->if_node), idef->ipx_node, IPX_NODE_LEN); + ipx_internal_net = ipx_primary_net = intrfc; + ipxitf_hold(intrfc); + ipxitf_insert(intrfc); + + rc = ipxitf_add_local_route(intrfc); + ipxitf_put(intrfc); +out: + return rc; +} + +static int ipx_map_frame_type(unsigned char type) +{ + int rc = 0; + + switch (type) { + case IPX_FRAME_ETHERII: rc = htons(ETH_P_IPX); break; + case IPX_FRAME_8022: rc = htons(ETH_P_802_2); break; + case IPX_FRAME_SNAP: rc = htons(ETH_P_SNAP); break; + case IPX_FRAME_8023: rc = htons(ETH_P_802_3); break; + } + + return rc; +} + +static int ipxitf_create(struct ipx_interface_definition *idef) +{ + struct net_device *dev; + unsigned short dlink_type = 0; + struct datalink_proto *datalink = NULL; + struct ipx_interface *intrfc; + int rc; + + if (idef->ipx_special == IPX_INTERNAL) { + rc = ipxitf_create_internal(idef); + goto out; + } + + rc = -EEXIST; + if (idef->ipx_special == IPX_PRIMARY && ipx_primary_net) + goto out; + + intrfc = ipxitf_find_using_net(idef->ipx_network); + rc = -EADDRINUSE; + if (idef->ipx_network && intrfc) { + ipxitf_put(intrfc); + goto out; + } + + if (intrfc) + ipxitf_put(intrfc); + + dev = dev_get_by_name(idef->ipx_device); + rc = -ENODEV; + if (!dev) + goto out; + + switch (idef->ipx_dlink_type) { + case IPX_FRAME_TR_8022: + printk(KERN_WARNING "IPX frame type 802.2TR is " + 
"obsolete Use 802.2 instead.\n"); + /* fall through */ + case IPX_FRAME_8022: + dlink_type = htons(ETH_P_802_2); + datalink = p8022_datalink; + break; + case IPX_FRAME_ETHERII: + if (dev->type != ARPHRD_IEEE802) { + dlink_type = htons(ETH_P_IPX); + datalink = pEII_datalink; + break; + } else + printk(KERN_WARNING "IPX frame type EtherII over " + "token-ring is obsolete. Use SNAP " + "instead.\n"); + /* fall through */ + case IPX_FRAME_SNAP: + dlink_type = htons(ETH_P_SNAP); + datalink = pSNAP_datalink; + break; + case IPX_FRAME_8023: + dlink_type = htons(ETH_P_802_3); + datalink = p8023_datalink; + break; + case IPX_FRAME_NONE: + default: + rc = -EPROTONOSUPPORT; + goto out_dev; + } + + rc = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_dev; + + /* Check addresses are suitable */ + rc = -EINVAL; + if (dev->addr_len > IPX_NODE_LEN) + goto out_dev; + + intrfc = ipxitf_find_using_phys(dev, dlink_type); + if (!intrfc) { + /* Ok now create */ + intrfc = ipxitf_alloc(dev, idef->ipx_network, dlink_type, + datalink, 0, dev->hard_header_len + + datalink->header_length); + rc = -EAGAIN; + if (!intrfc) + goto out_dev; + /* Setup primary if necessary */ + if (idef->ipx_special == IPX_PRIMARY) + ipx_primary_net = intrfc; + if (!memcmp(idef->ipx_node, "\000\000\000\000\000\000", + IPX_NODE_LEN)) { + memset(intrfc->if_node, 0, IPX_NODE_LEN); + memcpy(intrfc->if_node + IPX_NODE_LEN - dev->addr_len, + dev->dev_addr, dev->addr_len); + } else + memcpy(intrfc->if_node, idef->ipx_node, IPX_NODE_LEN); + ipxitf_hold(intrfc); + ipxitf_insert(intrfc); + } + + + /* If the network number is known, add a route */ + rc = 0; + if (!intrfc->if_netnum) + goto out_intrfc; + + rc = ipxitf_add_local_route(intrfc); +out_intrfc: + ipxitf_put(intrfc); + goto out; +out_dev: + dev_put(dev); +out: + return rc; +} + +static int ipxitf_delete(struct ipx_interface_definition *idef) +{ + struct net_device *dev = NULL; + unsigned short dlink_type = 0; + struct ipx_interface *intrfc; + int rc = 0; + + spin_lock_bh(&ipx_interfaces_lock); + if (idef->ipx_special == IPX_INTERNAL) { + if (ipx_internal_net) { + __ipxitf_put(ipx_internal_net); + goto out; + } + rc = -ENOENT; + goto out; + } + + dlink_type = ipx_map_frame_type(idef->ipx_dlink_type); + rc = -EPROTONOSUPPORT; + if (!dlink_type) + goto out; + + dev = __dev_get_by_name(idef->ipx_device); + rc = -ENODEV; + if (!dev) + goto out; + + intrfc = __ipxitf_find_using_phys(dev, dlink_type); + rc = -EINVAL; + if (!intrfc) + goto out; + __ipxitf_put(intrfc); + + rc = 0; +out: + spin_unlock_bh(&ipx_interfaces_lock); + return rc; +} + +static struct ipx_interface *ipxitf_auto_create(struct net_device *dev, + unsigned short dlink_type) +{ + struct ipx_interface *intrfc = NULL; + struct datalink_proto *datalink; + + if (!dev) + goto out; + + /* Check addresses are suitable */ + if (dev->addr_len > IPX_NODE_LEN) + goto out; + + switch (htons(dlink_type)) { + case ETH_P_IPX: datalink = pEII_datalink; break; + case ETH_P_802_2: datalink = p8022_datalink; break; + case ETH_P_SNAP: datalink = pSNAP_datalink; break; + case ETH_P_802_3: datalink = p8023_datalink; break; + default: goto out; + } + + intrfc = ipxitf_alloc(dev, 0, dlink_type, datalink, 0, + dev->hard_header_len + datalink->header_length); + + if (intrfc) { + memset(intrfc->if_node, 0, IPX_NODE_LEN); + memcpy((char *)&(intrfc->if_node[IPX_NODE_LEN-dev->addr_len]), + dev->dev_addr, dev->addr_len); + spin_lock_init(&intrfc->if_sklist_lock); + atomic_set(&intrfc->refcnt, 1); + ipxitf_insert(intrfc); + dev_hold(dev); + } + +out: + 
return intrfc; +} + +static int ipxitf_ioctl(unsigned int cmd, void __user *arg) +{ + int rc = -EINVAL; + struct ifreq ifr; + int val; + + switch (cmd) { + case SIOCSIFADDR: { + struct sockaddr_ipx *sipx; + struct ipx_interface_definition f; + + rc = -EFAULT; + if (copy_from_user(&ifr, arg, sizeof(ifr))) + break; + sipx = (struct sockaddr_ipx *)&ifr.ifr_addr; + rc = -EINVAL; + if (sipx->sipx_family != AF_IPX) + break; + f.ipx_network = sipx->sipx_network; + memcpy(f.ipx_device, ifr.ifr_name, + sizeof(f.ipx_device)); + memcpy(f.ipx_node, sipx->sipx_node, IPX_NODE_LEN); + f.ipx_dlink_type = sipx->sipx_type; + f.ipx_special = sipx->sipx_special; + + if (sipx->sipx_action == IPX_DLTITF) + rc = ipxitf_delete(&f); + else + rc = ipxitf_create(&f); + break; + } + case SIOCGIFADDR: { + struct sockaddr_ipx *sipx; + struct ipx_interface *ipxif; + struct net_device *dev; + + rc = -EFAULT; + if (copy_from_user(&ifr, arg, sizeof(ifr))) + break; + sipx = (struct sockaddr_ipx *)&ifr.ifr_addr; + dev = __dev_get_by_name(ifr.ifr_name); + rc = -ENODEV; + if (!dev) + break; + ipxif = ipxitf_find_using_phys(dev, + ipx_map_frame_type(sipx->sipx_type)); + rc = -EADDRNOTAVAIL; + if (!ipxif) + break; + + sipx->sipx_family = AF_IPX; + sipx->sipx_network = ipxif->if_netnum; + memcpy(sipx->sipx_node, ipxif->if_node, + sizeof(sipx->sipx_node)); + rc = -EFAULT; + if (copy_to_user(arg, &ifr, sizeof(ifr))) + break; + ipxitf_put(ipxif); + rc = 0; + break; + } + case SIOCAIPXITFCRT: + rc = -EFAULT; + if (get_user(val, (unsigned char __user *) arg)) + break; + rc = 0; + ipxcfg_auto_create_interfaces = val; + break; + case SIOCAIPXPRISLT: + rc = -EFAULT; + if (get_user(val, (unsigned char __user *) arg)) + break; + rc = 0; + ipxcfg_set_auto_select(val); + break; + } + + return rc; +} + +/* + * Checksum routine for IPX + */ + +/* Note: We assume ipx_tctrl==0 and htons(length)==ipx_pktsize */ +/* This functions should *not* mess with packet contents */ + +__u16 ipx_cksum(struct ipxhdr *packet, int length) +{ + /* + * NOTE: sum is a net byte order quantity, which optimizes the + * loop. This only works on big and little endian machines. (I + * don't know of a machine that isn't.) + */ + /* start at ipx_dest - We skip the checksum field and start with + * ipx_type before the loop, not considering ipx_tctrl in the calc */ + __u16 *p = (__u16 *)&packet->ipx_dest; + __u32 i = (length >> 1) - 1; /* Number of complete words */ + __u32 sum = packet->ipx_type << sizeof(packet->ipx_tctrl); + + /* Loop through all complete words except the checksum field, + * ipx_type (accounted above) and ipx_tctrl (not used in the cksum) */ + while (--i) + sum += *p++; + + /* Add on the last part word if it exists */ + if (packet->ipx_pktsize & htons(1)) + sum += ntohs(0xff00) & *p; + + /* Do final fixup */ + sum = (sum & 0xffff) + (sum >> 16); + + /* It's a pity there's no concept of carry in C */ + if (sum >= 0x10000) + sum++; + + return ~sum; +} + +const char *ipx_frame_name(unsigned short frame) +{ + char* rc = "None"; + + switch (ntohs(frame)) { + case ETH_P_IPX: rc = "EtherII"; break; + case ETH_P_802_2: rc = "802.2"; break; + case ETH_P_SNAP: rc = "SNAP"; break; + case ETH_P_802_3: rc = "802.3"; break; + case ETH_P_TR_802_2: rc = "802.2TR"; break; + } + + return rc; +} + +const char *ipx_device_name(struct ipx_interface *intrfc) +{ + return intrfc->if_internal ? "Internal" : + intrfc->if_dev ? intrfc->if_dev->name : "Unknown"; +} + +/* Handling for system calls applied via the various interfaces to an IPX + * socket object. 
*/ + +static int ipx_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + int opt; + int rc = -EINVAL; + + if (optlen != sizeof(int)) + goto out; + + rc = -EFAULT; + if (get_user(opt, (unsigned int __user *)optval)) + goto out; + + rc = -ENOPROTOOPT; + if (!(level == SOL_IPX && optname == IPX_TYPE)) + goto out; + + ipx_sk(sk)->type = opt; + rc = 0; +out: + return rc; +} + +static int ipx_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int val = 0; + int len; + int rc = -ENOPROTOOPT; + + if (!(level == SOL_IPX && optname == IPX_TYPE)) + goto out; + + val = ipx_sk(sk)->type; + + rc = -EFAULT; + if (get_user(len, optlen)) + goto out; + + len = min_t(unsigned int, len, sizeof(int)); + rc = -EINVAL; + if(len < 0) + goto out; + + rc = -EFAULT; + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + goto out; + + rc = 0; +out: + return rc; +} + +static struct proto ipx_proto = { + .name = "IPX", + .owner = THIS_MODULE, + .obj_size = sizeof(struct ipx_sock), +}; + +static int ipx_create(struct socket *sock, int protocol) +{ + int rc = -ESOCKTNOSUPPORT; + struct sock *sk; + + /* + * SPX support is not anymore in the kernel sources. If you want to + * ressurrect it, completing it and making it understand shared skbs, + * be fully multithreaded, etc, grab the sources in an early 2.5 kernel + * tree. + */ + if (sock->type != SOCK_DGRAM) + goto out; + + rc = -ENOMEM; + sk = sk_alloc(PF_IPX, GFP_KERNEL, &ipx_proto, 1); + if (!sk) + goto out; +#ifdef IPX_REFCNT_DEBUG + atomic_inc(&ipx_sock_nr); + printk(KERN_DEBUG "IPX socket %p created, now we have %d alive\n", sk, + atomic_read(&ipx_sock_nr)); +#endif + sock_init_data(sock, sk); + sk->sk_no_check = 1; /* Checksum off by default */ + sock->ops = &ipx_dgram_ops; + rc = 0; +out: + return rc; +} + +static int ipx_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + goto out; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + + sock_set_flag(sk, SOCK_DEAD); + sock->sk = NULL; + ipx_destroy_socket(sk); +out: + return 0; +} + +/* caller must hold a reference to intrfc */ + +static unsigned short ipx_first_free_socketnum(struct ipx_interface *intrfc) +{ + unsigned short socketNum = intrfc->if_sknum; + + spin_lock_bh(&intrfc->if_sklist_lock); + + if (socketNum < IPX_MIN_EPHEMERAL_SOCKET) + socketNum = IPX_MIN_EPHEMERAL_SOCKET; + + while (__ipxitf_find_socket(intrfc, ntohs(socketNum))) + if (socketNum > IPX_MAX_EPHEMERAL_SOCKET) + socketNum = IPX_MIN_EPHEMERAL_SOCKET; + else + socketNum++; + + spin_unlock_bh(&intrfc->if_sklist_lock); + intrfc->if_sknum = socketNum; + + return ntohs(socketNum); +} + +static int ipx_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct ipx_interface *intrfc; + struct sockaddr_ipx *addr = (struct sockaddr_ipx *)uaddr; + int rc = -EINVAL; + + if (!sock_flag(sk, SOCK_ZAPPED) || addr_len != sizeof(struct sockaddr_ipx)) + goto out; + + intrfc = ipxitf_find_using_net(addr->sipx_network); + rc = -EADDRNOTAVAIL; + if (!intrfc) + goto out; + + if (!addr->sipx_port) { + addr->sipx_port = ipx_first_free_socketnum(intrfc); + rc = -EINVAL; + if (!addr->sipx_port) + goto out_put; + } + + /* protect IPX system stuff like routing/sap */ + rc = -EACCES; + if (ntohs(addr->sipx_port) < IPX_MIN_EPHEMERAL_SOCKET && + !capable(CAP_NET_ADMIN)) + goto out_put; + + 
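/*
 * Editor's sketch -- not part of this patch.  The ipx_create()/ipx_setsockopt()
 * and autobind logic above, seen from user space: sipx_port == 0 makes the
 * kernel pick a dynamic socket number via ipx_first_free_socketnum() (from
 * IPX_MIN_EPHEMERAL_SOCKET upwards).  SOL_IPX, IPX_TYPE and sockaddr_ipx are
 * assumed to come from <netipx/ipx.h>; the network number is a placeholder
 * that must name an already configured interface.
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netipx/ipx.h>

int ipx_open_bound_socket(void)
{
	struct sockaddr_ipx addr;
	int type = 4;				/* example IPX packet type */
	int fd = socket(AF_IPX, SOCK_DGRAM, 0);	/* only SOCK_DGRAM exists  */

	if (fd < 0)
		return -1;

	/* Stored in ipx_sk(sk)->type and copied into ipx_type on each send */
	if (setsockopt(fd, SOL_IPX, IPX_TYPE, &type, sizeof(type)) < 0)
		goto err;

	memset(&addr, 0, sizeof(addr));
	addr.sipx_family  = AF_IPX;
	addr.sipx_network = htonl(0x00A00001);	/* placeholder net number  */
	addr.sipx_port    = 0;			/* 0 => dynamic socket num */

	if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
		goto err;
	return fd;
err:
	close(fd);
	return -1;
}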
ipxs->port = addr->sipx_port; + +#ifdef CONFIG_IPX_INTERN + if (intrfc == ipx_internal_net) { + /* The source address is to be set explicitly if the + * socket is to be bound on the internal network. If a + * node number 0 was specified, the default is used. + */ + + rc = -EINVAL; + if (!memcmp(addr->sipx_node, ipx_broadcast_node, IPX_NODE_LEN)) + goto out_put; + if (!memcmp(addr->sipx_node, ipx_this_node, IPX_NODE_LEN)) + memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN); + else + memcpy(ipxs->node, addr->sipx_node, IPX_NODE_LEN); + + rc = -EADDRINUSE; + if (ipxitf_find_internal_socket(intrfc, ipxs->node, + ipxs->port)) { + SOCK_DEBUG(sk, + "IPX: bind failed because port %X in use.\n", + ntohs((int)addr->sipx_port)); + goto out_put; + } + } else { + /* Source addresses are easy. It must be our + * network:node pair for an interface routed to IPX + * with the ipx routing ioctl() + */ + + memcpy(ipxs->node, intrfc->if_node, IPX_NODE_LEN); + + rc = -EADDRINUSE; + if (ipxitf_find_socket(intrfc, addr->sipx_port)) { + SOCK_DEBUG(sk, + "IPX: bind failed because port %X in use.\n", + ntohs((int)addr->sipx_port)); + goto out_put; + } + } + +#else /* !def CONFIG_IPX_INTERN */ + + /* Source addresses are easy. It must be our network:node pair for + an interface routed to IPX with the ipx routing ioctl() */ + + rc = -EADDRINUSE; + if (ipxitf_find_socket(intrfc, addr->sipx_port)) { + SOCK_DEBUG(sk, "IPX: bind failed because port %X in use.\n", + ntohs((int)addr->sipx_port)); + goto out_put; + } + +#endif /* CONFIG_IPX_INTERN */ + + ipxitf_insert_socket(intrfc, sk); + sock_reset_flag(sk, SOCK_ZAPPED); + + rc = 0; +out_put: + ipxitf_put(intrfc); +out: + return rc; +} + +static int ipx_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct sockaddr_ipx *addr; + int rc = -EINVAL; + struct ipx_route *rt; + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(*addr)) + goto out; + addr = (struct sockaddr_ipx *)uaddr; + + /* put the autobinding in */ + if (!ipxs->port) { + struct sockaddr_ipx uaddr; + + uaddr.sipx_port = 0; + uaddr.sipx_network = 0; + +#ifdef CONFIG_IPX_INTERN + rc = -ENETDOWN; + if (!ipxs->intrfc) + goto out; /* Someone zonked the iface */ + memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, + IPX_NODE_LEN); +#endif /* CONFIG_IPX_INTERN */ + + rc = ipx_bind(sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); + if (rc) + goto out; + } + + /* We can either connect to primary network or somewhere + * we can route to */ + rt = ipxrtr_lookup(addr->sipx_network); + rc = -ENETUNREACH; + if (!rt && !(!addr->sipx_network && ipx_primary_net)) + goto out; + + ipxs->dest_addr.net = addr->sipx_network; + ipxs->dest_addr.sock = addr->sipx_port; + memcpy(ipxs->dest_addr.node, addr->sipx_node, IPX_NODE_LEN); + ipxs->type = addr->sipx_type; + + if (sock->type == SOCK_DGRAM) { + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + } + + if (rt) + ipxrtr_put(rt); + rc = 0; +out: + return rc; +} + + +static int ipx_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct ipx_address *addr; + struct sockaddr_ipx sipx; + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + int rc; + + *uaddr_len = sizeof(struct sockaddr_ipx); + + if (peer) { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + addr = &ipxs->dest_addr; + sipx.sipx_network = addr->net; + sipx.sipx_port = addr->sock; + 
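/*
 * Editor's sketch -- not part of this patch.  ipx_connect() above only records
 * the peer and marks a SOCK_DGRAM socket TCP_ESTABLISHED; the data path is
 * ipx_sendmsg()/ipx_recvmsg() further below, which also autobind an unbound
 * socket.  A hedged unconnected sendto()/recvfrom() round trip; all addresses
 * are placeholders, and a route to the destination net (or a primary net)
 * must exist or ipxrtr_route_packet() returns -ENETUNREACH.
 */
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netipx/ipx.h>

int ipx_echo_once(int fd)	/* fd: an AF_IPX SOCK_DGRAM socket */
{
	struct sockaddr_ipx dst, src;
	socklen_t srclen = sizeof(src);
	char buf[1024];
	ssize_t n;

	memset(&dst, 0, sizeof(dst));
	dst.sipx_family  = AF_IPX;
	dst.sipx_network = htonl(0x00A00001);			/* placeholder */
	memcpy(dst.sipx_node, "\x00\x11\x22\x33\x44\x55", IPX_NODE_LEN);
	dst.sipx_port    = htons(0x4567);			/* placeholder */

	if (sendto(fd, "ping", 4, 0,
		   (struct sockaddr *)&dst, sizeof(dst)) < 0)
		return -1;

	/* src is filled from the IPX header, as ipx_recvmsg() shows below */
	n = recvfrom(fd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&src, &srclen);
	return n < 0 ? -1 : 0;
}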
memcpy(sipx.sipx_node, addr->node, IPX_NODE_LEN); + } else { + if (ipxs->intrfc) { + sipx.sipx_network = ipxs->intrfc->if_netnum; +#ifdef CONFIG_IPX_INTERN + memcpy(sipx.sipx_node, ipxs->node, IPX_NODE_LEN); +#else + memcpy(sipx.sipx_node, ipxs->intrfc->if_node, + IPX_NODE_LEN); +#endif /* CONFIG_IPX_INTERN */ + + } else { + sipx.sipx_network = 0; + memset(sipx.sipx_node, '\0', IPX_NODE_LEN); + } + + sipx.sipx_port = ipxs->port; + } + + sipx.sipx_family = AF_IPX; + sipx.sipx_type = ipxs->type; + sipx.sipx_zero = 0; + memcpy(uaddr, &sipx, sizeof(sipx)); + + rc = 0; +out: + return rc; +} + +static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + /* NULL here for pt means the packet was looped back */ + struct ipx_interface *intrfc; + struct ipxhdr *ipx; + u16 ipx_pktsize; + int rc = 0; + + /* Not ours */ + if (skb->pkt_type == PACKET_OTHERHOST) + goto drop; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto out; + + ipx = ipx_hdr(skb); + ipx_pktsize = ntohs(ipx->ipx_pktsize); + + /* Too small or invalid header? */ + if (ipx_pktsize < sizeof(struct ipxhdr) || ipx_pktsize > skb->len) + goto drop; + + if (ipx->ipx_checksum != IPX_NO_CHECKSUM && + ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize)) + goto drop; + + IPX_SKB_CB(skb)->ipx_tctrl = ipx->ipx_tctrl; + IPX_SKB_CB(skb)->ipx_dest_net = ipx->ipx_dest.net; + IPX_SKB_CB(skb)->ipx_source_net = ipx->ipx_source.net; + + /* Determine what local ipx endpoint this is */ + intrfc = ipxitf_find_using_phys(dev, pt->type); + if (!intrfc) { + if (ipxcfg_auto_create_interfaces && + ntohl(IPX_SKB_CB(skb)->ipx_dest_net)) { + intrfc = ipxitf_auto_create(dev, pt->type); + if (intrfc) + ipxitf_hold(intrfc); + } + + if (!intrfc) /* Not one of ours */ + /* or invalid packet for auto creation */ + goto drop; + } + + rc = ipxitf_rcv(intrfc, skb); + ipxitf_put(intrfc); + goto out; +drop: + kfree_skb(skb); +out: + return rc; +} + +static int ipx_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct sockaddr_ipx *usipx = (struct sockaddr_ipx *)msg->msg_name; + struct sockaddr_ipx local_sipx; + int rc = -EINVAL; + int flags = msg->msg_flags; + + /* Socket gets bound below anyway */ +/* if (sk->sk_zapped) + return -EIO; */ /* Socket not bound */ + if (flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + goto out; + + /* Max possible packet size limited by 16 bit pktsize in header */ + if (len >= 65535 - sizeof(struct ipxhdr)) + goto out; + + if (usipx) { + if (!ipxs->port) { + struct sockaddr_ipx uaddr; + + uaddr.sipx_port = 0; + uaddr.sipx_network = 0; +#ifdef CONFIG_IPX_INTERN + rc = -ENETDOWN; + if (!ipxs->intrfc) + goto out; /* Someone zonked the iface */ + memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, + IPX_NODE_LEN); +#endif + rc = ipx_bind(sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); + if (rc) + goto out; + } + + rc = -EINVAL; + if (msg->msg_namelen < sizeof(*usipx) || + usipx->sipx_family != AF_IPX) + goto out; + } else { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + usipx = &local_sipx; + usipx->sipx_family = AF_IPX; + usipx->sipx_type = ipxs->type; + usipx->sipx_port = ipxs->dest_addr.sock; + usipx->sipx_network = ipxs->dest_addr.net; + memcpy(usipx->sipx_node, ipxs->dest_addr.node, IPX_NODE_LEN); + } + + rc = ipxrtr_route_packet(sk, usipx, msg->msg_iov, len, + flags & MSG_DONTWAIT); + if (rc >= 0) + rc = len; +out: + return rc; +} + + +static int 
ipx_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct ipx_sock *ipxs = ipx_sk(sk); + struct sockaddr_ipx *sipx = (struct sockaddr_ipx *)msg->msg_name; + struct ipxhdr *ipx = NULL; + struct sk_buff *skb; + int copied, rc; + + /* put the autobinding in */ + if (!ipxs->port) { + struct sockaddr_ipx uaddr; + + uaddr.sipx_port = 0; + uaddr.sipx_network = 0; + +#ifdef CONFIG_IPX_INTERN + rc = -ENETDOWN; + if (!ipxs->intrfc) + goto out; /* Someone zonked the iface */ + memcpy(uaddr.sipx_node, ipxs->intrfc->if_node, IPX_NODE_LEN); +#endif /* CONFIG_IPX_INTERN */ + + rc = ipx_bind(sock, (struct sockaddr *)&uaddr, + sizeof(struct sockaddr_ipx)); + if (rc) + goto out; + } + + rc = -ENOTCONN; + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &rc); + if (!skb) + goto out; + + ipx = ipx_hdr(skb); + copied = ntohs(ipx->ipx_pktsize) - sizeof(struct ipxhdr); + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, + copied); + if (rc) + goto out_free; + if (skb->stamp.tv_sec) + sk->sk_stamp = skb->stamp; + + msg->msg_namelen = sizeof(*sipx); + + if (sipx) { + sipx->sipx_family = AF_IPX; + sipx->sipx_port = ipx->ipx_source.sock; + memcpy(sipx->sipx_node, ipx->ipx_source.node, IPX_NODE_LEN); + sipx->sipx_network = IPX_SKB_CB(skb)->ipx_source_net; + sipx->sipx_type = ipx->ipx_type; + sipx->sipx_zero = 0; + } + rc = copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return rc; +} + + +static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + int rc = 0; + long amount = 0; + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case TIOCOUTQ: + amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + rc = put_user(amount, (int __user *)argp); + break; + case TIOCINQ: { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + /* These two are safe on a single CPU system as only + * user tasks fiddle here */ + if (skb) + amount = skb->len - sizeof(struct ipxhdr); + rc = put_user(amount, (int __user *)argp); + break; + } + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (capable(CAP_NET_ADMIN)) + rc = ipxrtr_ioctl(cmd, argp); + break; + case SIOCSIFADDR: + case SIOCAIPXITFCRT: + case SIOCAIPXPRISLT: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + case SIOCGIFADDR: + rc = ipxitf_ioctl(cmd, argp); + break; + case SIOCIPXCFGDATA: + rc = ipxcfg_get_config_data(argp); + break; + case SIOCIPXNCPCONN: + /* + * This socket wants to take care of the NCP connection + * handed to us in arg. 
+ */ + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = get_user(ipx_sk(sk)->ipx_ncp_conn, + (const unsigned short __user *)argp); + break; + case SIOCGSTAMP: + rc = -EINVAL; + if (sk) + rc = sock_get_timestamp(sk, argp); + break; + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + rc = -EINVAL; + break; + default: + rc = dev_ioctl(cmd, argp); + break; + } + + return rc; +} + +/* + * Socket family declarations + */ + +static struct net_proto_family ipx_family_ops = { + .family = PF_IPX, + .create = ipx_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { + .family = PF_IPX, + .owner = THIS_MODULE, + .release = ipx_release, + .bind = ipx_bind, + .connect = ipx_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = ipx_getname, + .poll = datagram_poll, + .ioctl = ipx_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, /* FIXME: support shutdown */ + .setsockopt = ipx_setsockopt, + .getsockopt = ipx_getsockopt, + .sendmsg = ipx_sendmsg, + .recvmsg = ipx_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#include +SOCKOPS_WRAP(ipx_dgram, PF_IPX); + +static struct packet_type ipx_8023_packet_type = { + .type = __constant_htons(ETH_P_802_3), + .func = ipx_rcv, +}; + +static struct packet_type ipx_dix_packet_type = { + .type = __constant_htons(ETH_P_IPX), + .func = ipx_rcv, +}; + +static struct notifier_block ipx_dev_notifier = { + .notifier_call = ipxitf_device_event, +}; + +extern struct datalink_proto *make_EII_client(void); +extern struct datalink_proto *make_8023_client(void); +extern void destroy_EII_client(struct datalink_proto *); +extern void destroy_8023_client(struct datalink_proto *); + +static unsigned char ipx_8022_type = 0xE0; +static unsigned char ipx_snap_id[5] = { 0x0, 0x0, 0x0, 0x81, 0x37 }; +static char ipx_EII_err_msg[] __initdata = + KERN_CRIT "IPX: Unable to register with Ethernet II\n"; +static char ipx_8023_err_msg[] __initdata = + KERN_CRIT "IPX: Unable to register with 802.3\n"; +static char ipx_llc_err_msg[] __initdata = + KERN_CRIT "IPX: Unable to register with 802.2\n"; +static char ipx_snap_err_msg[] __initdata = + KERN_CRIT "IPX: Unable to register with SNAP\n"; + +static int __init ipx_init(void) +{ + int rc = proto_register(&ipx_proto, 1); + + if (rc != 0) + goto out; + + sock_register(&ipx_family_ops); + + pEII_datalink = make_EII_client(); + if (pEII_datalink) + dev_add_pack(&ipx_dix_packet_type); + else + printk(ipx_EII_err_msg); + + p8023_datalink = make_8023_client(); + if (p8023_datalink) + dev_add_pack(&ipx_8023_packet_type); + else + printk(ipx_8023_err_msg); + + p8022_datalink = register_8022_client(ipx_8022_type, ipx_rcv); + if (!p8022_datalink) + printk(ipx_llc_err_msg); + + pSNAP_datalink = register_snap_client(ipx_snap_id, ipx_rcv); + if (!pSNAP_datalink) + printk(ipx_snap_err_msg); + + register_netdevice_notifier(&ipx_dev_notifier); + ipx_register_sysctl(); + ipx_proc_init(); +out: + return rc; +} + +static void __exit ipx_proto_finito(void) +{ + ipx_proc_exit(); + ipx_unregister_sysctl(); + + unregister_netdevice_notifier(&ipx_dev_notifier); + + ipxitf_cleanup(); + + unregister_snap_client(pSNAP_datalink); + pSNAP_datalink = NULL; + + unregister_8022_client(p8022_datalink); + p8022_datalink = NULL; + + dev_remove_pack(&ipx_8023_packet_type); + destroy_8023_client(p8023_datalink); + p8023_datalink = NULL; + + 
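/*
 * Editor's sketch -- not part of this patch.  ipx_init() above ends by calling
 * ipx_proc_init(), implemented in net/ipx/ipx_proc.c just below, which creates
 * /proc/net/ipx/{interface,route,socket}.  A trivial reader for eyeballing the
 * seq_file output produced by the ipx_seq_*_show() routines:
 */
#include <stdio.h>

int ipx_dump_interfaces(void)
{
	char line[256];
	FILE *f = fopen("/proc/net/ipx/interface", "r");

	if (!f)		/* IPX not loaded, or CONFIG_PROC_FS disabled */
		return -1;

	/* First line is the header emitted by ipx_seq_interface_show() */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);

	fclose(f);
	return 0;
}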
dev_remove_pack(&ipx_dix_packet_type); + destroy_EII_client(pEII_datalink); + pEII_datalink = NULL; + + proto_unregister(&ipx_proto); + sock_unregister(ipx_family_ops.family); +} + +module_init(ipx_init); +module_exit(ipx_proto_finito); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_IPX); diff --git a/net/ipx/ipx_proc.c b/net/ipx/ipx_proc.c new file mode 100644 index 000000000000..b6761913445a --- /dev/null +++ b/net/ipx/ipx_proc.c @@ -0,0 +1,408 @@ +/* + * IPX proc routines + * + * Copyright(C) Arnaldo Carvalho de Melo , 2002 + */ + +#include +#include +#ifdef CONFIG_PROC_FS +#include +#include +#include +#include +#include + +static __inline__ struct ipx_interface *ipx_get_interface_idx(loff_t pos) +{ + struct ipx_interface *i; + + list_for_each_entry(i, &ipx_interfaces, node) + if (!pos--) + goto out; + i = NULL; +out: + return i; +} + +static struct ipx_interface *ipx_interfaces_next(struct ipx_interface *i) +{ + struct ipx_interface *rc = NULL; + + if (i->node.next != &ipx_interfaces) + rc = list_entry(i->node.next, struct ipx_interface, node); + return rc; +} + +static void *ipx_seq_interface_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + spin_lock_bh(&ipx_interfaces_lock); + return l ? ipx_get_interface_idx(--l) : SEQ_START_TOKEN; +} + +static void *ipx_seq_interface_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipx_interface *i; + + ++*pos; + if (v == SEQ_START_TOKEN) + i = ipx_interfaces_head(); + else + i = ipx_interfaces_next(v); + return i; +} + +static void ipx_seq_interface_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&ipx_interfaces_lock); +} + +static int ipx_seq_interface_show(struct seq_file *seq, void *v) +{ + struct ipx_interface *i; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Network Node_Address Primary Device " + "Frame_Type"); +#ifdef IPX_REFCNT_DEBUG + seq_puts(seq, " refcnt"); +#endif + seq_puts(seq, "\n"); + goto out; + } + + i = v; + seq_printf(seq, "%08lX ", (unsigned long int)ntohl(i->if_netnum)); + seq_printf(seq, "%02X%02X%02X%02X%02X%02X ", + i->if_node[0], i->if_node[1], i->if_node[2], + i->if_node[3], i->if_node[4], i->if_node[5]); + seq_printf(seq, "%-9s", i == ipx_primary_net ? "Yes" : "No"); + seq_printf(seq, "%-11s", ipx_device_name(i)); + seq_printf(seq, "%-9s", ipx_frame_name(i->if_dlink_type)); +#ifdef IPX_REFCNT_DEBUG + seq_printf(seq, "%6d", atomic_read(&i->refcnt)); +#endif + seq_puts(seq, "\n"); +out: + return 0; +} + +static struct ipx_route *ipx_routes_head(void) +{ + struct ipx_route *rc = NULL; + + if (!list_empty(&ipx_routes)) + rc = list_entry(ipx_routes.next, struct ipx_route, node); + return rc; +} + +static struct ipx_route *ipx_routes_next(struct ipx_route *r) +{ + struct ipx_route *rc = NULL; + + if (r->node.next != &ipx_routes) + rc = list_entry(r->node.next, struct ipx_route, node); + return rc; +} + +static __inline__ struct ipx_route *ipx_get_route_idx(loff_t pos) +{ + struct ipx_route *r; + + list_for_each_entry(r, &ipx_routes, node) + if (!pos--) + goto out; + r = NULL; +out: + return r; +} + +static void *ipx_seq_route_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + read_lock_bh(&ipx_routes_lock); + return l ? 
ipx_get_route_idx(--l) : SEQ_START_TOKEN; +} + +static void *ipx_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct ipx_route *r; + + ++*pos; + if (v == SEQ_START_TOKEN) + r = ipx_routes_head(); + else + r = ipx_routes_next(v); + return r; +} + +static void ipx_seq_route_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&ipx_routes_lock); +} + +static int ipx_seq_route_show(struct seq_file *seq, void *v) +{ + struct ipx_route *rt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Network Router_Net Router_Node\n"); + goto out; + } + rt = v; + seq_printf(seq, "%08lX ", (unsigned long int)ntohl(rt->ir_net)); + if (rt->ir_routed) + seq_printf(seq, "%08lX %02X%02X%02X%02X%02X%02X\n", + (long unsigned int)ntohl(rt->ir_intrfc->if_netnum), + rt->ir_router_node[0], rt->ir_router_node[1], + rt->ir_router_node[2], rt->ir_router_node[3], + rt->ir_router_node[4], rt->ir_router_node[5]); + else + seq_puts(seq, "Directly Connected\n"); +out: + return 0; +} + +static __inline__ struct sock *ipx_get_socket_idx(loff_t pos) +{ + struct sock *s = NULL; + struct hlist_node *node; + struct ipx_interface *i; + + list_for_each_entry(i, &ipx_interfaces, node) { + spin_lock_bh(&i->if_sklist_lock); + sk_for_each(s, node, &i->if_sklist) { + if (!pos) + break; + --pos; + } + spin_unlock_bh(&i->if_sklist_lock); + if (!pos) { + if (node) + goto found; + break; + } + } + s = NULL; +found: + return s; +} + +static void *ipx_seq_socket_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + spin_lock_bh(&ipx_interfaces_lock); + return l ? ipx_get_socket_idx(--l) : SEQ_START_TOKEN; +} + +static void *ipx_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock* sk, *next; + struct ipx_interface *i; + struct ipx_sock *ipxs; + + ++*pos; + if (v == SEQ_START_TOKEN) { + sk = NULL; + i = ipx_interfaces_head(); + if (!i) + goto out; + sk = sk_head(&i->if_sklist); + if (sk) + spin_lock_bh(&i->if_sklist_lock); + goto out; + } + sk = v; + next = sk_next(sk); + if (next) { + sk = next; + goto out; + } + ipxs = ipx_sk(sk); + i = ipxs->intrfc; + spin_unlock_bh(&i->if_sklist_lock); + sk = NULL; + for (;;) { + i = ipx_interfaces_next(i); + if (!i) + break; + spin_lock_bh(&i->if_sklist_lock); + if (!hlist_empty(&i->if_sklist)) { + sk = sk_head(&i->if_sklist); + break; + } + spin_unlock_bh(&i->if_sklist_lock); + } +out: + return sk; +} + +static int ipx_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock *s; + struct ipx_sock *ipxs; + + if (v == SEQ_START_TOKEN) { +#ifdef CONFIG_IPX_INTERN + seq_puts(seq, "Local_Address " + "Remote_Address Tx_Queue " + "Rx_Queue State Uid\n"); +#else + seq_puts(seq, "Local_Address Remote_Address " + "Tx_Queue Rx_Queue State Uid\n"); +#endif + goto out; + } + + s = v; + ipxs = ipx_sk(s); +#ifdef CONFIG_IPX_INTERN + seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", + (unsigned long)htonl(ipxs->intrfc->if_netnum), + ipxs->node[0], ipxs->node[1], ipxs->node[2], ipxs->node[3], + ipxs->node[4], ipxs->node[5], htons(ipxs->port)); +#else + seq_printf(seq, "%08lX:%04X ", (unsigned long) htonl(ipxs->intrfc->if_netnum), + htons(ipxs->port)); +#endif /* CONFIG_IPX_INTERN */ + if (s->sk_state != TCP_ESTABLISHED) + seq_printf(seq, "%-28s", "Not_Connected"); + else { + seq_printf(seq, "%08lX:%02X%02X%02X%02X%02X%02X:%04X ", + (unsigned long)htonl(ipxs->dest_addr.net), + ipxs->dest_addr.node[0], ipxs->dest_addr.node[1], + ipxs->dest_addr.node[2], ipxs->dest_addr.node[3], + ipxs->dest_addr.node[4], ipxs->dest_addr.node[5], + 
htons(ipxs->dest_addr.sock)); + } + + seq_printf(seq, "%08X %08X %02X %03d\n", + atomic_read(&s->sk_wmem_alloc), + atomic_read(&s->sk_rmem_alloc), + s->sk_state, SOCK_INODE(s->sk_socket)->i_uid); +out: + return 0; +} + +static struct seq_operations ipx_seq_interface_ops = { + .start = ipx_seq_interface_start, + .next = ipx_seq_interface_next, + .stop = ipx_seq_interface_stop, + .show = ipx_seq_interface_show, +}; + +static struct seq_operations ipx_seq_route_ops = { + .start = ipx_seq_route_start, + .next = ipx_seq_route_next, + .stop = ipx_seq_route_stop, + .show = ipx_seq_route_show, +}; + +static struct seq_operations ipx_seq_socket_ops = { + .start = ipx_seq_socket_start, + .next = ipx_seq_socket_next, + .stop = ipx_seq_interface_stop, + .show = ipx_seq_socket_show, +}; + +static int ipx_seq_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ipx_seq_route_ops); +} + +static int ipx_seq_interface_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ipx_seq_interface_ops); +} + +static int ipx_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ipx_seq_socket_ops); +} + +static struct file_operations ipx_seq_interface_fops = { + .owner = THIS_MODULE, + .open = ipx_seq_interface_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations ipx_seq_route_fops = { + .owner = THIS_MODULE, + .open = ipx_seq_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations ipx_seq_socket_fops = { + .owner = THIS_MODULE, + .open = ipx_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *ipx_proc_dir; + +int __init ipx_proc_init(void) +{ + struct proc_dir_entry *p; + int rc = -ENOMEM; + + ipx_proc_dir = proc_mkdir("ipx", proc_net); + + if (!ipx_proc_dir) + goto out; + p = create_proc_entry("interface", S_IRUGO, ipx_proc_dir); + if (!p) + goto out_interface; + + p->proc_fops = &ipx_seq_interface_fops; + p = create_proc_entry("route", S_IRUGO, ipx_proc_dir); + if (!p) + goto out_route; + + p->proc_fops = &ipx_seq_route_fops; + p = create_proc_entry("socket", S_IRUGO, ipx_proc_dir); + if (!p) + goto out_socket; + + p->proc_fops = &ipx_seq_socket_fops; + + rc = 0; +out: + return rc; +out_socket: + remove_proc_entry("route", ipx_proc_dir); +out_route: + remove_proc_entry("interface", ipx_proc_dir); +out_interface: + remove_proc_entry("ipx", proc_net); + goto out; +} + +void __exit ipx_proc_exit(void) +{ + remove_proc_entry("interface", ipx_proc_dir); + remove_proc_entry("route", ipx_proc_dir); + remove_proc_entry("socket", ipx_proc_dir); + remove_proc_entry("ipx", proc_net); +} + +#else /* CONFIG_PROC_FS */ + +int __init ipx_proc_init(void) +{ + return 0; +} + +void __exit ipx_proc_exit(void) +{ +} + +#endif /* CONFIG_PROC_FS */ diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c new file mode 100644 index 000000000000..67774448efd9 --- /dev/null +++ b/net/ipx/ipx_route.c @@ -0,0 +1,293 @@ +/* + * Implements the IPX routing routines. + * Code moved from af_ipx.c. + * + * Arnaldo Carvalho de Melo , 2003 + * + * See net/ipx/ChangeLog. 
+ */ + +#include +#include +#include +#include + +#include +#include + +LIST_HEAD(ipx_routes); +DEFINE_RWLOCK(ipx_routes_lock); + +extern struct ipx_interface *ipx_internal_net; + +extern __u16 ipx_cksum(struct ipxhdr *packet, int length); +extern struct ipx_interface *ipxitf_find_using_net(__u32 net); +extern int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy); +extern int ipxitf_demux_socket(struct ipx_interface *intrfc, + struct sk_buff *skb, int copy); +extern int ipxitf_send(struct ipx_interface *intrfc, struct sk_buff *skb, + char *node); +extern struct ipx_interface *ipxitf_find_using_net(__u32 net); + +struct ipx_route *ipxrtr_lookup(__u32 net) +{ + struct ipx_route *r; + + read_lock_bh(&ipx_routes_lock); + list_for_each_entry(r, &ipx_routes, node) + if (r->ir_net == net) { + ipxrtr_hold(r); + goto unlock; + } + r = NULL; +unlock: + read_unlock_bh(&ipx_routes_lock); + return r; +} + +/* + * Caller must hold a reference to intrfc + */ +int ipxrtr_add_route(__u32 network, struct ipx_interface *intrfc, + unsigned char *node) +{ + struct ipx_route *rt; + int rc; + + /* Get a route structure; either existing or create */ + rt = ipxrtr_lookup(network); + if (!rt) { + rt = kmalloc(sizeof(*rt), GFP_ATOMIC); + rc = -EAGAIN; + if (!rt) + goto out; + + atomic_set(&rt->refcnt, 1); + ipxrtr_hold(rt); + write_lock_bh(&ipx_routes_lock); + list_add(&rt->node, &ipx_routes); + write_unlock_bh(&ipx_routes_lock); + } else { + rc = -EEXIST; + if (intrfc == ipx_internal_net) + goto out_put; + } + + rt->ir_net = network; + rt->ir_intrfc = intrfc; + if (!node) { + memset(rt->ir_router_node, '\0', IPX_NODE_LEN); + rt->ir_routed = 0; + } else { + memcpy(rt->ir_router_node, node, IPX_NODE_LEN); + rt->ir_routed = 1; + } + + rc = 0; +out_put: + ipxrtr_put(rt); +out: + return rc; +} + +void ipxrtr_del_routes(struct ipx_interface *intrfc) +{ + struct ipx_route *r, *tmp; + + write_lock_bh(&ipx_routes_lock); + list_for_each_entry_safe(r, tmp, &ipx_routes, node) + if (r->ir_intrfc == intrfc) { + list_del(&r->node); + ipxrtr_put(r); + } + write_unlock_bh(&ipx_routes_lock); +} + +static int ipxrtr_create(struct ipx_route_definition *rd) +{ + struct ipx_interface *intrfc; + int rc = -ENETUNREACH; + + /* Find the appropriate interface */ + intrfc = ipxitf_find_using_net(rd->ipx_router_network); + if (!intrfc) + goto out; + rc = ipxrtr_add_route(rd->ipx_network, intrfc, rd->ipx_router_node); + ipxitf_put(intrfc); +out: + return rc; +} + +static int ipxrtr_delete(long net) +{ + struct ipx_route *r, *tmp; + int rc; + + write_lock_bh(&ipx_routes_lock); + list_for_each_entry_safe(r, tmp, &ipx_routes, node) + if (r->ir_net == net) { + /* Directly connected; can't lose route */ + rc = -EPERM; + if (!r->ir_routed) + goto out; + list_del(&r->node); + ipxrtr_put(r); + rc = 0; + goto out; + } + rc = -ENOENT; +out: + write_unlock_bh(&ipx_routes_lock); + return rc; +} + +/* + * The skb has to be unshared, we'll end up calling ipxitf_send, that'll + * modify the packet + */ +int ipxrtr_route_skb(struct sk_buff *skb) +{ + struct ipxhdr *ipx = ipx_hdr(skb); + struct ipx_route *r = ipxrtr_lookup(IPX_SKB_CB(skb)->ipx_dest_net); + + if (!r) { /* no known route */ + kfree_skb(skb); + return 0; + } + + ipxitf_hold(r->ir_intrfc); + ipxitf_send(r->ir_intrfc, skb, r->ir_routed ? + r->ir_router_node : ipx->ipx_dest.node); + ipxitf_put(r->ir_intrfc); + ipxrtr_put(r); + + return 0; +} + +/* + * Route an outgoing frame from a socket. 
+ */ +int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, + struct iovec *iov, size_t len, int noblock) +{ + struct sk_buff *skb; + struct ipx_sock *ipxs = ipx_sk(sk); + struct ipx_interface *intrfc; + struct ipxhdr *ipx; + size_t size; + int ipx_offset; + struct ipx_route *rt = NULL; + int rc; + + /* Find the appropriate interface on which to send packet */ + if (!usipx->sipx_network && ipx_primary_net) { + usipx->sipx_network = ipx_primary_net->if_netnum; + intrfc = ipx_primary_net; + } else { + rt = ipxrtr_lookup(usipx->sipx_network); + rc = -ENETUNREACH; + if (!rt) + goto out; + intrfc = rt->ir_intrfc; + } + + ipxitf_hold(intrfc); + ipx_offset = intrfc->if_ipx_offset; + size = sizeof(struct ipxhdr) + len + ipx_offset; + + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + if (!skb) + goto out_put; + + skb_reserve(skb, ipx_offset); + skb->sk = sk; + + /* Fill in IPX header */ + skb->h.raw = skb->nh.raw = skb_put(skb, sizeof(struct ipxhdr)); + ipx = ipx_hdr(skb); + ipx->ipx_pktsize = htons(len + sizeof(struct ipxhdr)); + IPX_SKB_CB(skb)->ipx_tctrl = 0; + ipx->ipx_type = usipx->sipx_type; + + IPX_SKB_CB(skb)->last_hop.index = -1; +#ifdef CONFIG_IPX_INTERN + IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum; + memcpy(ipx->ipx_source.node, ipxs->node, IPX_NODE_LEN); +#else + rc = ntohs(ipxs->port); + if (rc == 0x453 || rc == 0x452) { + /* RIP/SAP special handling for mars_nwe */ + IPX_SKB_CB(skb)->ipx_source_net = intrfc->if_netnum; + memcpy(ipx->ipx_source.node, intrfc->if_node, IPX_NODE_LEN); + } else { + IPX_SKB_CB(skb)->ipx_source_net = ipxs->intrfc->if_netnum; + memcpy(ipx->ipx_source.node, ipxs->intrfc->if_node, + IPX_NODE_LEN); + } +#endif /* CONFIG_IPX_INTERN */ + ipx->ipx_source.sock = ipxs->port; + IPX_SKB_CB(skb)->ipx_dest_net = usipx->sipx_network; + memcpy(ipx->ipx_dest.node, usipx->sipx_node, IPX_NODE_LEN); + ipx->ipx_dest.sock = usipx->sipx_port; + + rc = memcpy_fromiovec(skb_put(skb, len), iov, len); + if (rc) { + kfree_skb(skb); + goto out_put; + } + + /* Apply checksum. Not allowed on 802.3 links. */ + if (sk->sk_no_check || intrfc->if_dlink_type == IPX_FRAME_8023) + ipx->ipx_checksum = 0xFFFF; + else + ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); + + rc = ipxitf_send(intrfc, skb, (rt && rt->ir_routed) ? 
+ rt->ir_router_node : ipx->ipx_dest.node); +out_put: + ipxitf_put(intrfc); + if (rt) + ipxrtr_put(rt); +out: + return rc; +} + +/* + * We use a normal struct rtentry for route handling + */ +int ipxrtr_ioctl(unsigned int cmd, void __user *arg) +{ + struct rtentry rt; /* Use these to behave like 'other' stacks */ + struct sockaddr_ipx *sg, *st; + int rc = -EFAULT; + + if (copy_from_user(&rt, arg, sizeof(rt))) + goto out; + + sg = (struct sockaddr_ipx *)&rt.rt_gateway; + st = (struct sockaddr_ipx *)&rt.rt_dst; + + rc = -EINVAL; + if (!(rt.rt_flags & RTF_GATEWAY) || /* Direct routes are fixed */ + sg->sipx_family != AF_IPX || + st->sipx_family != AF_IPX) + goto out; + + switch (cmd) { + case SIOCDELRT: + rc = ipxrtr_delete(st->sipx_network); + break; + case SIOCADDRT: { + struct ipx_route_definition f; + f.ipx_network = st->sipx_network; + f.ipx_router_network = sg->sipx_network; + memcpy(f.ipx_router_node, sg->sipx_node, IPX_NODE_LEN); + rc = ipxrtr_create(&f); + break; + } + } + +out: + return rc; +} diff --git a/net/ipx/sysctl_net_ipx.c b/net/ipx/sysctl_net_ipx.c new file mode 100644 index 000000000000..510eda96d10a --- /dev/null +++ b/net/ipx/sysctl_net_ipx.c @@ -0,0 +1,62 @@ +/* -*- linux-c -*- + * sysctl_net_ipx.c: sysctl interface to net IPX subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/ipx directory entry (empty =) ). [MS] + * Added /proc/sys/net/ipx/ipx_pprop_broadcasting - acme March 4, 2001 + */ + +#include +#include +#include + +#ifndef CONFIG_SYSCTL +#error This file should not be compiled without CONFIG_SYSCTL defined +#endif + +/* From af_ipx.c */ +extern int sysctl_ipx_pprop_broadcasting; + +static struct ctl_table ipx_table[] = { + { + .ctl_name = NET_IPX_PPROP_BROADCASTING, + .procname = "ipx_pprop_broadcasting", + .data = &sysctl_ipx_pprop_broadcasting, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { 0 }, +}; + +static struct ctl_table ipx_dir_table[] = { + { + .ctl_name = NET_IPX, + .procname = "ipx", + .mode = 0555, + .child = ipx_table, + }, + { 0 }, +}; + +static struct ctl_table ipx_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = ipx_dir_table, + }, + { 0 }, +}; + +static struct ctl_table_header *ipx_table_header; + +void ipx_register_sysctl(void) +{ + ipx_table_header = register_sysctl_table(ipx_root_table, 1); +} + +void ipx_unregister_sysctl(void) +{ + unregister_sysctl_table(ipx_table_header); +} diff --git a/net/irda/Kconfig b/net/irda/Kconfig new file mode 100644 index 000000000000..9efb17ba48ac --- /dev/null +++ b/net/irda/Kconfig @@ -0,0 +1,96 @@ +# +# IrDA protocol configuration +# + +menuconfig IRDA + depends on NET + tristate "IrDA (infrared) subsystem support" + select CRC_CCITT + ---help--- + Say Y here if you want to build support for the IrDA (TM) protocols. + The Infrared Data Associations (tm) specifies standards for wireless + infrared communication and is supported by most laptops and PDA's. + + To use Linux support for the IrDA (tm) protocols, you will also need + some user-space utilities like irattach. For more information, see + the file . You also want to + read the IR-HOWTO, available at + . + + If you want to exchange bits of data (vCal, vCard) with a PDA, you + will need to install some OBEX application, such as OpenObex : + + + To compile this support as a module, choose M here: the module will + be called irda. 
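/*
 * Editor's sketch -- not part of this patch, and unrelated to the IrDA Kconfig
 * text right above.  The ipxrtr_ioctl() handler shown earlier in this hunk
 * reuses the ordinary struct rtentry, overlaying sockaddr_ipx on rt_dst and
 * rt_gateway and insisting on RTF_GATEWAY.  A hedged example of adding a route
 * via SIOCADDRT on an AF_IPX socket (CAP_NET_ADMIN required); the network
 * numbers and the router node are placeholders:
 */
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <arpa/inet.h>
#include <net/route.h>
#include <netipx/ipx.h>

int ipx_add_route_example(void)
{
	struct rtentry rt;
	struct sockaddr_ipx *dst = (struct sockaddr_ipx *)&rt.rt_dst;
	struct sockaddr_ipx *gw  = (struct sockaddr_ipx *)&rt.rt_gateway;
	int fd = socket(AF_IPX, SOCK_DGRAM, 0);
	int rc = -1;

	if (fd < 0)
		return -1;

	memset(&rt, 0, sizeof(rt));
	rt.rt_flags = RTF_GATEWAY;		/* direct routes are refused */

	dst->sipx_family  = AF_IPX;
	dst->sipx_network = htonl(0x00B00001);	/* net to be reached        */

	gw->sipx_family   = AF_IPX;
	gw->sipx_network  = htonl(0x00A00001);	/* net the router sits on   */
	memcpy(gw->sipx_node, "\x00\x11\x22\x33\x44\x55", IPX_NODE_LEN);

	if (ioctl(fd, SIOCADDRT, &rt) == 0)
		rc = 0;
	close(fd);
	return rc;
}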
+ +comment "IrDA protocols" + depends on IRDA + +source "net/irda/irlan/Kconfig" + +source "net/irda/irnet/Kconfig" + +source "net/irda/ircomm/Kconfig" + +config IRDA_ULTRA + bool "Ultra (connectionless) protocol" + depends on IRDA + help + Say Y here to support the connectionless Ultra IRDA protocol. + Ultra allows to exchange data over IrDA with really simple devices + (watch, beacon) without the overhead of the IrDA protocol (no handshaking, + no management frames, simple fixed header). + Ultra is available as a special socket : socket(AF_IRDA, SOCK_DGRAM, 1); + +comment "IrDA options" + depends on IRDA + +config IRDA_CACHE_LAST_LSAP + bool "Cache last LSAP" + depends on IRDA + help + Say Y here if you want IrLMP to cache the last LSAP used. This + makes sense since most frames will be sent/received on the same + connection. Enabling this option will save a hash-lookup per frame. + + If unsure, say Y. + +config IRDA_FAST_RR + bool "Fast RRs (low latency)" + depends on IRDA + ---help--- + Say Y here is you want IrLAP to send fast RR (Receive Ready) frames + when acting as a primary station. + Disabling this option will make latency over IrDA very bad. Enabling + this option will make the IrDA stack send more packet than strictly + necessary, thus reduce your battery life (but not that much). + + Fast RR will make IrLAP send out a RR frame immediately when + receiving a frame if its own transmit queue is currently empty. This + will give a lot of speed improvement when receiving much data since + the secondary station will not have to wait the max. turn around + time (usually 500ms) before it is allowed to transmit the next time. + If the transmit queue of the secondary is also empty, the primary will + start backing-off before sending another RR frame, waiting longer + each time until the back-off reaches the max. turn around time. + This back-off increase in controlled via + /proc/sys/net/irda/fast_poll_increase + + If unsure, say Y. + +config IRDA_DEBUG + bool "Debug information" + depends on IRDA + help + Say Y here if you want the IrDA subsystem to write debug information + to your syslog. You can change the debug level in + /proc/sys/net/irda/debug . + When this option is enabled, the IrDA also perform many extra internal + verifications which will usually prevent the kernel to crash in case of + bugs. + + If unsure, say Y (since it makes it easier to find the bugs). + +source "drivers/net/irda/Kconfig" + diff --git a/net/irda/Makefile b/net/irda/Makefile new file mode 100644 index 000000000000..d1366c2a39cb --- /dev/null +++ b/net/irda/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for the Linux IrDA protocol layer. 
+# + +obj-$(CONFIG_IRDA) += irda.o +obj-$(CONFIG_IRLAN) += irlan/ +obj-$(CONFIG_IRNET) += irnet/ +obj-$(CONFIG_IRCOMM) += ircomm/ + +irda-y := iriap.o iriap_event.o irlmp.o irlmp_event.o irlmp_frame.o \ + irlap.o irlap_event.o irlap_frame.o timer.o qos.o irqueue.o \ + irttp.o irda_device.o irias_object.o wrapper.o af_irda.o \ + discovery.o parameters.o irmod.o +irda-$(CONFIG_PROC_FS) += irproc.o +irda-$(CONFIG_SYSCTL) += irsysctl.o diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c new file mode 100644 index 000000000000..92c6e8d4e731 --- /dev/null +++ b/net/irda/af_irda.c @@ -0,0 +1,2586 @@ +/********************************************************************* + * + * Filename: af_irda.c + * Version: 0.9 + * Description: IrDA sockets implementation + * Status: Stable + * Author: Dag Brattli + * Created at: Sun May 31 10:12:43 1998 + * Modified at: Sat Dec 25 21:10:23 1999 + * Modified by: Dag Brattli + * Sources: af_netroom.c, af_ax25.c, af_rose.c, af_x25.c etc. + * + * Copyright (c) 1999 Dag Brattli + * Copyright (c) 1999-2003 Jean Tourrilhes + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + * Linux-IrDA now supports four different types of IrDA sockets: + * + * o SOCK_STREAM: TinyTP connections with SAR disabled. The + * max SDU size is 0 for conn. of this type + * o SOCK_SEQPACKET: TinyTP connections with SAR enabled. TTP may + * fragment the messages, but will preserve + * the message boundaries + * o SOCK_DGRAM: IRDAPROTO_UNITDATA: TinyTP connections with Unitdata + * (unreliable) transfers + * IRDAPROTO_ULTRA: Connectionless and unreliable data + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include /* TIOCOUTQ, TIOCINQ */ +#include + +#include +#include + +#include + +static int irda_create(struct socket *sock, int protocol); + +static struct proto_ops irda_stream_ops; +static struct proto_ops irda_seqpacket_ops; +static struct proto_ops irda_dgram_ops; + +#ifdef CONFIG_IRDA_ULTRA +static struct proto_ops irda_ultra_ops; +#define ULTRA_MAX_DATA 382 +#endif /* CONFIG_IRDA_ULTRA */ + +#define IRDA_MAX_HEADER (TTP_MAX_HEADER) + +/* + * Function irda_data_indication (instance, sap, skb) + * + * Received some data from TinyTP. 
Just queue it on the receive queue + * + */ +static int irda_data_indication(void *instance, void *sap, struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + int err; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + self = instance; + sk = instance; + IRDA_ASSERT(sk != NULL, return -1;); + + err = sock_queue_rcv_skb(sk, skb); + if (err) { + IRDA_DEBUG(1, "%s(), error: no more mem!\n", __FUNCTION__); + self->rx_flow = FLOW_STOP; + + /* When we return error, TTP will need to requeue the skb */ + return err; + } + + return 0; +} + +/* + * Function irda_disconnect_indication (instance, sap, reason, skb) + * + * Connection has been closed. Check reason to find out why + * + */ +static void irda_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + + self = instance; + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + /* Don't care about it, but let's not leak it */ + if(skb) + dev_kfree_skb(skb); + + sk = instance; + if (sk == NULL) { + IRDA_DEBUG(0, "%s(%p) : BUG : sk is NULL\n", + __FUNCTION__, self); + return; + } + + /* Prevent race conditions with irda_release() and irda_shutdown() */ + if (!sock_flag(sk, SOCK_DEAD) && sk->sk_state != TCP_CLOSE) { + sk->sk_state = TCP_CLOSE; + sk->sk_err = ECONNRESET; + sk->sk_shutdown |= SEND_SHUTDOWN; + + sk->sk_state_change(sk); + /* Uh-oh... Should use sock_orphan ? */ + sock_set_flag(sk, SOCK_DEAD); + + /* Close our TSAP. + * If we leave it open, IrLMP put it back into the list of + * unconnected LSAPs. The problem is that any incoming request + * can then be matched to this socket (and it will be, because + * it is at the head of the list). This would prevent any + * listening socket waiting on the same TSAP to get those + * requests. Some apps forget to close sockets, or hang to it + * a bit too long, so we may stay in this dead state long + * enough to be noticed... + * Note : all socket function do check sk->sk_state, so we are + * safe... + * Jean II + */ + if (self->tsap) { + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + } + + /* Note : once we are there, there is not much you want to do + * with the socket anymore, apart from closing it. + * For example, bind() and connect() won't reset sk->sk_err, + * sk->sk_shutdown and sk->sk_flags to valid values... + * Jean II + */ +} + +/* + * Function irda_connect_confirm (instance, sap, qos, max_sdu_size, skb) + * + * Connections has been confirmed by the remote device + * + */ +static void irda_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, __u8 max_header_size, + struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + + self = instance; + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + sk = instance; + if (sk == NULL) { + dev_kfree_skb(skb); + return; + } + + dev_kfree_skb(skb); + // Should be ??? 
skb_queue_tail(&sk->sk_receive_queue, skb); + + /* How much header space do we need to reserve */ + self->max_header_size = max_header_size; + + /* IrTTP max SDU size in transmit direction */ + self->max_sdu_size_tx = max_sdu_size; + + /* Find out what the largest chunk of data that we can transmit is */ + switch (sk->sk_type) { + case SOCK_STREAM: + if (max_sdu_size != 0) { + IRDA_ERROR("%s: max_sdu_size must be 0\n", + __FUNCTION__); + return; + } + self->max_data_size = irttp_get_max_seg_size(self->tsap); + break; + case SOCK_SEQPACKET: + if (max_sdu_size == 0) { + IRDA_ERROR("%s: max_sdu_size cannot be 0\n", + __FUNCTION__); + return; + } + self->max_data_size = max_sdu_size; + break; + default: + self->max_data_size = irttp_get_max_seg_size(self->tsap); + }; + + IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __FUNCTION__, + self->max_data_size); + + memcpy(&self->qos_tx, qos, sizeof(struct qos_info)); + + /* We are now connected! */ + sk->sk_state = TCP_ESTABLISHED; + sk->sk_state_change(sk); +} + +/* + * Function irda_connect_indication(instance, sap, qos, max_sdu_size, userdata) + * + * Incoming connection + * + */ +static void irda_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 max_header_size, struct sk_buff *skb) +{ + struct irda_sock *self; + struct sock *sk; + + self = instance; + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + sk = instance; + if (sk == NULL) { + dev_kfree_skb(skb); + return; + } + + /* How much header space do we need to reserve */ + self->max_header_size = max_header_size; + + /* IrTTP max SDU size in transmit direction */ + self->max_sdu_size_tx = max_sdu_size; + + /* Find out what the largest chunk of data that we can transmit is */ + switch (sk->sk_type) { + case SOCK_STREAM: + if (max_sdu_size != 0) { + IRDA_ERROR("%s: max_sdu_size must be 0\n", + __FUNCTION__); + kfree_skb(skb); + return; + } + self->max_data_size = irttp_get_max_seg_size(self->tsap); + break; + case SOCK_SEQPACKET: + if (max_sdu_size == 0) { + IRDA_ERROR("%s: max_sdu_size cannot be 0\n", + __FUNCTION__); + kfree_skb(skb); + return; + } + self->max_data_size = max_sdu_size; + break; + default: + self->max_data_size = irttp_get_max_seg_size(self->tsap); + }; + + IRDA_DEBUG(2, "%s(), max_data_size=%d\n", __FUNCTION__, + self->max_data_size); + + memcpy(&self->qos_tx, qos, sizeof(struct qos_info)); + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_state_change(sk); +} + +/* + * Function irda_connect_response (handle) + * + * Accept incoming connection + * + */ +static void irda_connect_response(struct irda_sock *self) +{ + struct sk_buff *skb; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + + skb = dev_alloc_skb(64); + if (skb == NULL) { + IRDA_DEBUG(0, "%s() Unable to allocate sk_buff!\n", + __FUNCTION__); + return; + } + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(skb, IRDA_MAX_HEADER); + + irttp_connect_response(self->tsap, self->max_sdu_size_rx, skb); +} + +/* + * Function irda_flow_indication (instance, sap, flow) + * + * Used by TinyTP to tell us if it can accept more data or not + * + */ +static void irda_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) +{ + struct irda_sock *self; + struct sock *sk; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + self = instance; + sk = instance; + IRDA_ASSERT(sk != NULL, return;); + + switch (flow) { + case FLOW_STOP: + IRDA_DEBUG(1, "%s(), IrTTP wants us to slow down\n", + __FUNCTION__); + self->tx_flow = flow; + break; + 
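/*
 * Editor's sketch -- not part of this patch.  The indication/confirm callbacks
 * above size max_data_size differently per socket type, matching the four
 * flavors listed in this file's header comment.  A hedged user-space view of
 * opening them; the IRDAPROTO_* names are assumed to come from <linux/irda.h>
 * (the IRDA_ULTRA Kconfig text earlier spells the last one out as
 * socket(AF_IRDA, SOCK_DGRAM, 1)):
 */
#include <sys/socket.h>
#include <linux/irda.h>	/* assumed home of IRDAPROTO_UNITDATA/IRDAPROTO_ULTRA */

void irda_open_flavors(int fd[4])
{
	fd[0] = socket(AF_IRDA, SOCK_STREAM, 0);    /* TinyTP, SAR disabled */
	fd[1] = socket(AF_IRDA, SOCK_SEQPACKET, 0); /* TinyTP, SAR enabled  */
	fd[2] = socket(AF_IRDA, SOCK_DGRAM,
		       IRDAPROTO_UNITDATA);	    /* unreliable Unitdata  */
	fd[3] = socket(AF_IRDA, SOCK_DGRAM,
		       IRDAPROTO_ULTRA);	    /* needs CONFIG_IRDA_ULTRA */
}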
case FLOW_START: + self->tx_flow = flow; + IRDA_DEBUG(1, "%s(), IrTTP wants us to start again\n", + __FUNCTION__); + wake_up_interruptible(sk->sk_sleep); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown flow command!\n", __FUNCTION__); + /* Unknown flow command, better stop */ + self->tx_flow = flow; + break; + } +} + +/* + * Function irda_getvalue_confirm (obj_id, value, priv) + * + * Got answer from remote LM-IAS, just pass object to requester... + * + * Note : duplicate from above, but we need our own version that + * doesn't touch the dtsap_sel and save the full value structure... + */ +static void irda_getvalue_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv) +{ + struct irda_sock *self; + + self = (struct irda_sock *) priv; + if (!self) { + IRDA_WARNING("%s: lost myself!\n", __FUNCTION__); + return; + } + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* Check if request succeeded */ + if (result != IAS_SUCCESS) { + IRDA_DEBUG(1, "%s(), IAS query failed! (%d)\n", __FUNCTION__, + result); + + self->errno = result; /* We really need it later */ + + /* Wake up any processes waiting for result */ + wake_up_interruptible(&self->query_wait); + + return; + } + + /* Pass the object to the caller (so the caller must delete it) */ + self->ias_result = value; + self->errno = 0; + + /* Wake up any processes waiting for result */ + wake_up_interruptible(&self->query_wait); +} + +/* + * Function irda_selective_discovery_indication (discovery) + * + * Got a selective discovery indication from IrLMP. + * + * IrLMP is telling us that this node is new and matching our hint bit + * filter. Wake up any process waiting for answer... + */ +static void irda_selective_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv) +{ + struct irda_sock *self; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + self = (struct irda_sock *) priv; + if (!self) { + IRDA_WARNING("%s: lost myself!\n", __FUNCTION__); + return; + } + + /* Pass parameter to the caller */ + self->cachedaddr = discovery->daddr; + + /* Wake up process if its waiting for device to be discovered */ + wake_up_interruptible(&self->query_wait); +} + +/* + * Function irda_discovery_timeout (priv) + * + * Timeout in the selective discovery process + * + * We were waiting for a node to be discovered, but nothing has come up + * so far. Wake up the user and tell him that we failed... + */ +static void irda_discovery_timeout(u_long priv) +{ + struct irda_sock *self; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + self = (struct irda_sock *) priv; + IRDA_ASSERT(self != NULL, return;); + + /* Nothing for the caller */ + self->cachelog = NULL; + self->cachedaddr = 0; + self->errno = -ETIME; + + /* Wake up process if its still waiting... 
*/ + wake_up_interruptible(&self->query_wait); +} + +/* + * Function irda_open_tsap (self) + * + * Open local Transport Service Access Point (TSAP) + * + */ +static int irda_open_tsap(struct irda_sock *self, __u8 tsap_sel, char *name) +{ + notify_t notify; + + if (self->tsap) { + IRDA_WARNING("%s: busy!\n", __FUNCTION__); + return -EBUSY; + } + + /* Initialize callbacks to be used by the IrDA stack */ + irda_notify_init(¬ify); + notify.connect_confirm = irda_connect_confirm; + notify.connect_indication = irda_connect_indication; + notify.disconnect_indication = irda_disconnect_indication; + notify.data_indication = irda_data_indication; + notify.udata_indication = irda_data_indication; + notify.flow_indication = irda_flow_indication; + notify.instance = self; + strncpy(notify.name, name, NOTIFY_MAX_NAME); + + self->tsap = irttp_open_tsap(tsap_sel, DEFAULT_INITIAL_CREDIT, + ¬ify); + if (self->tsap == NULL) { + IRDA_DEBUG(0, "%s(), Unable to allocate TSAP!\n", + __FUNCTION__); + return -ENOMEM; + } + /* Remember which TSAP selector we actually got */ + self->stsap_sel = self->tsap->stsap_sel; + + return 0; +} + +/* + * Function irda_open_lsap (self) + * + * Open local Link Service Access Point (LSAP). Used for opening Ultra + * sockets + */ +#ifdef CONFIG_IRDA_ULTRA +static int irda_open_lsap(struct irda_sock *self, int pid) +{ + notify_t notify; + + if (self->lsap) { + IRDA_WARNING("%s(), busy!\n", __FUNCTION__); + return -EBUSY; + } + + /* Initialize callbacks to be used by the IrDA stack */ + irda_notify_init(¬ify); + notify.udata_indication = irda_data_indication; + notify.instance = self; + strncpy(notify.name, "Ultra", NOTIFY_MAX_NAME); + + self->lsap = irlmp_open_lsap(LSAP_CONNLESS, ¬ify, pid); + if (self->lsap == NULL) { + IRDA_DEBUG( 0, "%s(), Unable to allocate LSAP!\n", __FUNCTION__); + return -ENOMEM; + } + + return 0; +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irda_find_lsap_sel (self, name) + * + * Try to lookup LSAP selector in remote LM-IAS + * + * Basically, we start a IAP query, and then go to sleep. When the query + * return, irda_getvalue_confirm will wake us up, and we can examine the + * result of the query... + * Note that in some case, the query fail even before we go to sleep, + * creating some races... 
+ */ +static int irda_find_lsap_sel(struct irda_sock *self, char *name) +{ + IRDA_DEBUG(2, "%s(%p, %s)\n", __FUNCTION__, self, name); + + IRDA_ASSERT(self != NULL, return -1;); + + if (self->iriap) { + IRDA_WARNING("%s(): busy with a previous query\n", + __FUNCTION__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irda_getvalue_confirm); + if(self->iriap == NULL) + return -ENOMEM; + + /* Treat unexpected wakeup as disconnect */ + self->errno = -EHOSTUNREACH; + + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, self->saddr, self->daddr, + name, "IrDA:TinyTP:LsapSel"); + + /* Wait for answer, if not yet finished (or failed) */ + if (wait_event_interruptible(self->query_wait, (self->iriap==NULL))) + /* Treat signals as disconnect */ + return -EHOSTUNREACH; + + /* Check what happened */ + if (self->errno) + { + /* Requested object/attribute doesn't exist */ + if((self->errno == IAS_CLASS_UNKNOWN) || + (self->errno == IAS_ATTRIB_UNKNOWN)) + return (-EADDRNOTAVAIL); + else + return (-EHOSTUNREACH); + } + + /* Get the remote TSAP selector */ + switch (self->ias_result->type) { + case IAS_INTEGER: + IRDA_DEBUG(4, "%s() int=%d\n", + __FUNCTION__, self->ias_result->t.integer); + + if (self->ias_result->t.integer != -1) + self->dtsap_sel = self->ias_result->t.integer; + else + self->dtsap_sel = 0; + break; + default: + self->dtsap_sel = 0; + IRDA_DEBUG(0, "%s(), bad type!\n", __FUNCTION__); + break; + } + if (self->ias_result) + irias_delete_value(self->ias_result); + + if (self->dtsap_sel) + return 0; + + return -EADDRNOTAVAIL; +} + +/* + * Function irda_discover_daddr_and_lsap_sel (self, name) + * + * This try to find a device with the requested service. + * + * It basically look into the discovery log. For each address in the list, + * it queries the LM-IAS of the device to find if this device offer + * the requested service. + * If there is more than one node supporting the service, we complain + * to the user (it should move devices around). + * The, we set both the destination address and the lsap selector to point + * on the service on the unique device we have found. + * + * Note : this function fails if there is more than one device in range, + * because IrLMP doesn't disconnect the LAP when the last LSAP is closed. + * Moreover, we would need to wait the LAP disconnection... + */ +static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) +{ + discinfo_t *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + int err = -ENETUNREACH; + __u32 daddr = DEV_ADDR_ANY; /* Address we found the service on */ + __u8 dtsap_sel = 0x0; /* TSAP associated with it */ + + IRDA_DEBUG(2, "%s(), name=%s\n", __FUNCTION__, name); + + IRDA_ASSERT(self != NULL, return -1;); + + /* Ask lmp for the current discovery log + * Note : we have to use irlmp_get_discoveries(), as opposed + * to play with the cachelog directly, because while we are + * making our ias query, le log might change... */ + discoveries = irlmp_get_discoveries(&number, self->mask.word, + self->nslots); + /* Check if the we got some results */ + if (discoveries == NULL) + return -ENETUNREACH; /* No nodes discovered */ + + /* + * Now, check all discovered devices (if any), and connect + * client only about the services that the client is + * interested in... 
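+ *
+ * For each candidate we call irda_find_lsap_sel() : -EADDRNOTAVAIL
+ * just means "not on this node, try the next one", any other error
+ * aborts the scan, and finding the service on a second node turns
+ * into -ENOTUNIQ as described above.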
+ */ + for(i = 0; i < number; i++) { + /* Try the address in the log */ + self->daddr = discoveries[i].daddr; + self->saddr = 0x0; + IRDA_DEBUG(1, "%s(), trying daddr = %08x\n", + __FUNCTION__, self->daddr); + + /* Query remote LM-IAS for this service */ + err = irda_find_lsap_sel(self, name); + switch (err) { + case 0: + /* We found the requested service */ + if(daddr != DEV_ADDR_ANY) { + IRDA_DEBUG(1, "%s(), discovered service ''%s'' in two different devices !!!\n", + __FUNCTION__, name); + self->daddr = DEV_ADDR_ANY; + kfree(discoveries); + return(-ENOTUNIQ); + } + /* First time we found that one, save it ! */ + daddr = self->daddr; + dtsap_sel = self->dtsap_sel; + break; + case -EADDRNOTAVAIL: + /* Requested service simply doesn't exist on this node */ + break; + default: + /* Something bad did happen :-( */ + IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __FUNCTION__); + self->daddr = DEV_ADDR_ANY; + kfree(discoveries); + return(-EHOSTUNREACH); + break; + } + } + /* Cleanup our copy of the discovery log */ + kfree(discoveries); + + /* Check out what we found */ + if(daddr == DEV_ADDR_ANY) { + IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n", + __FUNCTION__, name); + self->daddr = DEV_ADDR_ANY; + return(-EADDRNOTAVAIL); + } + + /* Revert back to discovered device & service */ + self->daddr = daddr; + self->saddr = 0x0; + self->dtsap_sel = dtsap_sel; + + IRDA_DEBUG(1, "%s(), discovered requested service ''%s'' at address %08x\n", + __FUNCTION__, name, self->daddr); + + return 0; +} + +/* + * Function irda_getname (sock, uaddr, uaddr_len, peer) + * + * Return the our own, or peers socket address (sockaddr_irda) + * + */ +static int irda_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_irda saddr; + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + saddr.sir_family = AF_IRDA; + saddr.sir_lsap_sel = self->dtsap_sel; + saddr.sir_addr = self->daddr; + } else { + saddr.sir_family = AF_IRDA; + saddr.sir_lsap_sel = self->stsap_sel; + saddr.sir_addr = self->saddr; + } + + IRDA_DEBUG(1, "%s(), tsap_sel = %#x\n", __FUNCTION__, saddr.sir_lsap_sel); + IRDA_DEBUG(1, "%s(), addr = %08x\n", __FUNCTION__, saddr.sir_addr); + + /* uaddr_len come to us uninitialised */ + *uaddr_len = sizeof (struct sockaddr_irda); + memcpy(uaddr, &saddr, *uaddr_len); + + return 0; +} + +/* + * Function irda_listen (sock, backlog) + * + * Just move to the listen state + * + */ +static int irda_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && + (sk->sk_type != SOCK_DGRAM)) + return -EOPNOTSUPP; + + if (sk->sk_state != TCP_LISTEN) { + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + + return 0; + } + + return -EOPNOTSUPP; +} + +/* + * Function irda_bind (sock, uaddr, addr_len) + * + * Used by servers to register their well known TSAP + * + */ +static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr; + struct irda_sock *self = irda_sk(sk); + int err; + + IRDA_ASSERT(self != NULL, return -1;); + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + if (addr_len != sizeof(struct sockaddr_irda)) + return -EINVAL; + +#ifdef CONFIG_IRDA_ULTRA + /* Special care for Ultra sockets */ 
+ if ((sk->sk_type == SOCK_DGRAM) && + (sk->sk_protocol == IRDAPROTO_ULTRA)) { + self->pid = addr->sir_lsap_sel; + if (self->pid & 0x80) { + IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", __FUNCTION__); + return -EOPNOTSUPP; + } + err = irda_open_lsap(self, self->pid); + if (err < 0) + return err; + + /* Pretend we are connected */ + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + + return 0; + } +#endif /* CONFIG_IRDA_ULTRA */ + + err = irda_open_tsap(self, addr->sir_lsap_sel, addr->sir_name); + if (err < 0) + return err; + + /* Register with LM-IAS */ + self->ias_obj = irias_new_object(addr->sir_name, jiffies); + irias_add_integer_attrib(self->ias_obj, "IrDA:TinyTP:LsapSel", + self->stsap_sel, IAS_KERNEL_ATTR); + irias_insert_object(self->ias_obj); + + return 0; +} + +/* + * Function irda_accept (sock, newsock, flags) + * + * Wait for incoming connection + * + */ +static int irda_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct irda_sock *new, *self = irda_sk(sk); + struct sock *newsk; + struct sk_buff *skb; + int err; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + + err = irda_create(newsock, sk->sk_protocol); + if (err) + return err; + + if (sock->state != SS_UNCONNECTED) + return -EINVAL; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && + (sk->sk_type != SOCK_DGRAM)) + return -EOPNOTSUPP; + + if (sk->sk_state != TCP_LISTEN) + return -EINVAL; + + /* + * The read queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + + /* + * We can perform the accept only if there is incoming data + * on the listening socket. + * So, we will block the caller until we receive any data. + * If the caller was waiting on select() or poll() before + * calling us, the data is waiting for us ;-) + * Jean II + */ + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb == NULL) { + int ret = 0; + DECLARE_WAITQUEUE(waitq, current); + + /* Non blocking operation */ + if (flags & O_NONBLOCK) + return -EWOULDBLOCK; + + /* The following code is a cut'n'paste of the + * wait_event_interruptible() macro. 
+ * We don't us the macro because the condition has + * side effects : we want to make sure that only one + * skb get dequeued - Jean II */ + add_wait_queue(sk->sk_sleep, &waitq); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb != NULL) + break; + if (!signal_pending(current)) { + schedule(); + continue; + } + ret = -ERESTARTSYS; + break; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &waitq); + if(ret) + return -ERESTARTSYS; + } + + newsk = newsock->sk; + newsk->sk_state = TCP_ESTABLISHED; + + new = irda_sk(newsk); + IRDA_ASSERT(new != NULL, return -1;); + + /* Now attach up the new socket */ + new->tsap = irttp_dup(self->tsap, new); + if (!new->tsap) { + IRDA_DEBUG(0, "%s(), dup failed!\n", __FUNCTION__); + kfree_skb(skb); + return -1; + } + + new->stsap_sel = new->tsap->stsap_sel; + new->dtsap_sel = new->tsap->dtsap_sel; + new->saddr = irttp_get_saddr(new->tsap); + new->daddr = irttp_get_daddr(new->tsap); + + new->max_sdu_size_tx = self->max_sdu_size_tx; + new->max_sdu_size_rx = self->max_sdu_size_rx; + new->max_data_size = self->max_data_size; + new->max_header_size = self->max_header_size; + + memcpy(&new->qos_tx, &self->qos_tx, sizeof(struct qos_info)); + + /* Clean up the original one to keep it in listen state */ + irttp_listen(self->tsap); + + /* Wow ! What is that ? Jean II */ + skb->sk = NULL; + skb->destructor = NULL; + kfree_skb(skb); + sk->sk_ack_backlog--; + + newsock->state = SS_CONNECTED; + + irda_connect_response(new); + + return 0; +} + +/* + * Function irda_connect (sock, uaddr, addr_len, flags) + * + * Connect to a IrDA device + * + * The main difference with a "standard" connect is that with IrDA we need + * to resolve the service name into a TSAP selector (in TCP, port number + * doesn't have to be resolved). + * Because of this service name resoltion, we can offer "auto-connect", + * where we connect to a service without specifying a destination address. + * + * Note : by consulting "errno", the user space caller may learn the cause + * of the failure. 
Most of them are visible in the function, others may come + * from subroutines called and are listed here : + * o EBUSY : already processing a connect + * o EHOSTUNREACH : bad addr->sir_addr argument + * o EADDRNOTAVAIL : bad addr->sir_name argument + * o ENOTUNIQ : more than one node has addr->sir_name (auto-connect) + * o ENETUNREACH : no node found on the network (auto-connect) + */ +static int irda_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_irda *addr = (struct sockaddr_irda *) uaddr; + struct irda_sock *self = irda_sk(sk); + int err; + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + /* Don't allow connect for Ultra sockets */ + if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA)) + return -ESOCKTNOSUPPORT; + + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + return 0; /* Connect completed during a ERESTARTSYS event */ + } + + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + return -ECONNREFUSED; + } + + if (sk->sk_state == TCP_ESTABLISHED) + return -EISCONN; /* No reconnect on a seqpacket socket */ + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(struct sockaddr_irda)) + return -EINVAL; + + /* Check if user supplied any destination device address */ + if ((!addr->sir_addr) || (addr->sir_addr == DEV_ADDR_ANY)) { + /* Try to find one suitable */ + err = irda_discover_daddr_and_lsap_sel(self, addr->sir_name); + if (err) { + IRDA_DEBUG(0, "%s(), auto-connect failed!\n", __FUNCTION__); + return err; + } + } else { + /* Use the one provided by the user */ + self->daddr = addr->sir_addr; + IRDA_DEBUG(1, "%s(), daddr = %08x\n", __FUNCTION__, self->daddr); + + /* If we don't have a valid service name, we assume the + * user want to connect on a specific LSAP. Prevent + * the use of invalid LSAPs (IrLMP 1.1 p10). Jean II */ + if((addr->sir_name[0] != '\0') || + (addr->sir_lsap_sel >= 0x70)) { + /* Query remote LM-IAS using service name */ + err = irda_find_lsap_sel(self, addr->sir_name); + if (err) { + IRDA_DEBUG(0, "%s(), connect failed!\n", __FUNCTION__); + return err; + } + } else { + /* Directly connect to the remote LSAP + * specified by the sir_lsap field. + * Please use with caution, in IrDA LSAPs are + * dynamic and there is no "well-known" LSAP. 
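+ *
+ * For comparison, the usual name-based connect from user space is
+ * roughly the following sketch (assuming the sockaddr_irda layout
+ * exported to applications; "MyService" and fd are placeholders).
+ * Leaving sir_addr at zero is what selects the auto-connect path
+ * described above :
+ *
+ *   struct sockaddr_irda peer;
+ *   memset(&peer, 0, sizeof(peer));
+ *   peer.sir_family = AF_IRDA;
+ *   strncpy(peer.sir_name, "MyService", sizeof(peer.sir_name) - 1);
+ *   connect(fd, (struct sockaddr *) &peer, sizeof(peer));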
*/ + self->dtsap_sel = addr->sir_lsap_sel; + } + } + + /* Check if we have opened a local TSAP */ + if (!self->tsap) + irda_open_tsap(self, LSAP_ANY, addr->sir_name); + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + /* Connect to remote device */ + err = irttp_connect_request(self->tsap, self->dtsap_sel, + self->saddr, self->daddr, NULL, + self->max_sdu_size_rx, NULL); + if (err) { + IRDA_DEBUG(0, "%s(), connect failed!\n", __FUNCTION__); + return err; + } + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + return -EINPROGRESS; + + if (wait_event_interruptible(*(sk->sk_sleep), + (sk->sk_state != TCP_SYN_SENT))) + return -ERESTARTSYS; + + if (sk->sk_state != TCP_ESTABLISHED) { + sock->state = SS_UNCONNECTED; + return sock_error(sk); /* Always set at this point */ + } + + sock->state = SS_CONNECTED; + + /* At this point, IrLMP has assigned our source address */ + self->saddr = irttp_get_saddr(self->tsap); + + return 0; +} + +static struct proto irda_proto = { + .name = "IRDA", + .owner = THIS_MODULE, + .obj_size = sizeof(struct irda_sock), +}; + +/* + * Function irda_create (sock, protocol) + * + * Create IrDA socket + * + */ +static int irda_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct irda_sock *self; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + /* Check for valid socket type */ + switch (sock->type) { + case SOCK_STREAM: /* For TTP connections with SAR disabled */ + case SOCK_SEQPACKET: /* For TTP connections with SAR enabled */ + case SOCK_DGRAM: /* For TTP Unitdata or LMP Ultra transfers */ + break; + default: + return -ESOCKTNOSUPPORT; + } + + /* Allocate networking socket */ + sk = sk_alloc(PF_IRDA, GFP_ATOMIC, &irda_proto, 1); + if (sk == NULL) + return -ENOMEM; + + self = irda_sk(sk); + IRDA_DEBUG(2, "%s() : self is %p\n", __FUNCTION__, self); + + init_waitqueue_head(&self->query_wait); + + /* Initialise networking socket struct */ + sock_init_data(sock, sk); /* Note : set sk->sk_refcnt to 1 */ + sk->sk_family = PF_IRDA; + sk->sk_protocol = protocol; + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &irda_stream_ops; + self->max_sdu_size_rx = TTP_SAR_DISABLE; + break; + case SOCK_SEQPACKET: + sock->ops = &irda_seqpacket_ops; + self->max_sdu_size_rx = TTP_SAR_UNBOUND; + break; + case SOCK_DGRAM: + switch (protocol) { +#ifdef CONFIG_IRDA_ULTRA + case IRDAPROTO_ULTRA: + sock->ops = &irda_ultra_ops; + /* Initialise now, because we may send on unbound + * sockets. Jean II */ + self->max_data_size = ULTRA_MAX_DATA - LMP_PID_HEADER; + self->max_header_size = IRDA_MAX_HEADER + LMP_PID_HEADER; + break; +#endif /* CONFIG_IRDA_ULTRA */ + case IRDAPROTO_UNITDATA: + sock->ops = &irda_dgram_ops; + /* We let Unitdata conn. be like seqpack conn. 
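+ * (i.e. max_sdu_size_rx is left unbounded below, so reassembly on
+ * receive behaves as it does for SOCK_SEQPACKET)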
*/ + self->max_sdu_size_rx = TTP_SAR_UNBOUND; + break; + default: + IRDA_ERROR("%s: protocol not supported!\n", + __FUNCTION__); + return -ESOCKTNOSUPPORT; + } + break; + default: + return -ESOCKTNOSUPPORT; + } + + /* Register as a client with IrLMP */ + self->ckey = irlmp_register_client(0, NULL, NULL, NULL); + self->mask.word = 0xffff; + self->rx_flow = self->tx_flow = FLOW_START; + self->nslots = DISCOVERY_DEFAULT_SLOTS; + self->daddr = DEV_ADDR_ANY; /* Until we get connected */ + self->saddr = 0x0; /* so IrLMP assign us any link */ + return 0; +} + +/* + * Function irda_destroy_socket (self) + * + * Destroy socket + * + */ +static void irda_destroy_socket(struct irda_sock *self) +{ + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + IRDA_ASSERT(self != NULL, return;); + + /* Unregister with IrLMP */ + irlmp_unregister_client(self->ckey); + irlmp_unregister_service(self->skey); + + /* Unregister with LM-IAS */ + if (self->ias_obj) { + irias_delete_object(self->ias_obj); + self->ias_obj = NULL; + } + + if (self->iriap) { + iriap_close(self->iriap); + self->iriap = NULL; + } + + if (self->tsap) { + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } +#ifdef CONFIG_IRDA_ULTRA + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } +#endif /* CONFIG_IRDA_ULTRA */ +} + +/* + * Function irda_release (sock) + */ +static int irda_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + if (sk == NULL) + return 0; + + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + + /* Destroy IrDA socket */ + irda_destroy_socket(irda_sk(sk)); + + sock_orphan(sk); + sock->sk = NULL; + + /* Purge queues (see sock_init_data()) */ + skb_queue_purge(&sk->sk_receive_queue); + + /* Destroy networking socket if we are the last reference on it, + * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */ + sock_put(sk); + + /* Notes on socket locking and deallocation... - Jean II + * In theory we should put pairs of sock_hold() / sock_put() to + * prevent the socket to be destroyed whenever there is an + * outstanding request or outstanding incoming packet or event. + * + * 1) This may include IAS request, both in connect and getsockopt. + * Unfortunately, the situation is a bit more messy than it looks, + * because we close iriap and kfree(self) above. + * + * 2) This may include selective discovery in getsockopt. + * Same stuff as above, irlmp registration and self are gone. + * + * Probably 1 and 2 may not matter, because it's all triggered + * by a process and the socket layer already prevent the + * socket to go away while a process is holding it, through + * sockfd_put() and fput()... + * + * 3) This may include deferred TSAP closure. In particular, + * we may receive a late irda_disconnect_indication() + * Fortunately, (tsap_cb *)->close_pend should protect us + * from that. + * + * I did some testing on SMP, and it looks solid. And the socket + * memory leak is now gone... - Jean II + */ + + return 0; +} + +/* + * Function irda_sendmsg (iocb, sock, msg, len) + * + * Send message down to TinyTP. This function is used for both STREAM and + * SEQPACK services. 
This is possible since it forces the client to + * fragment the message if necessary + */ +static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct irda_sock *self; + struct sk_buff *skb; + unsigned char *asmptr; + int err; + + IRDA_DEBUG(4, "%s(), len=%zd\n", __FUNCTION__, len); + + /* Note : socket.c set MSG_EOR on SEQPACKET sockets */ + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + self = irda_sk(sk); + IRDA_ASSERT(self != NULL, return -1;); + + /* Check if IrTTP is wants us to slow down */ + + if (wait_event_interruptible(*(sk->sk_sleep), + (self->tx_flow != FLOW_STOP || sk->sk_state != TCP_ESTABLISHED))) + return -ERESTARTSYS; + + /* Check if we are still connected */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + /* Check that we don't send out to big frames */ + if (len > self->max_data_size) { + IRDA_DEBUG(2, "%s(), Chopping frame from %zd to %d bytes!\n", + __FUNCTION__, len, self->max_data_size); + len = self->max_data_size; + } + + skb = sock_alloc_send_skb(sk, len + self->max_header_size + 16, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return -ENOBUFS; + + skb_reserve(skb, self->max_header_size + 16); + + asmptr = skb->h.raw = skb_put(skb, len); + err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + if (err) { + kfree_skb(skb); + return err; + } + + /* + * Just send the message to TinyTP, and let it deal with possible + * errors. No need to duplicate all that here + */ + err = irttp_data_request(self->tsap, skb); + if (err) { + IRDA_DEBUG(0, "%s(), err=%d\n", __FUNCTION__, err); + return err; + } + /* Tell client how much data we actually sent */ + return len; +} + +/* + * Function irda_recvmsg_dgram (iocb, sock, msg, size, flags) + * + * Try to receive message and copy it to user. The frame is discarded + * after being read, regardless of how much the user actually read + */ +static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + struct sk_buff *skb; + size_t copied; + int err; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &err); + if (!skb) + return err; + + skb->h.raw = skb->data; + copied = skb->len; + + if (copied > size) { + IRDA_DEBUG(2, "%s(), Received truncated frame (%zd < %zd)!\n", + __FUNCTION__, copied, size); + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + skb_free_datagram(sk, skb); + + /* + * Check if we have previously stopped IrTTP and we know + * have more free space in our rx_queue. 
If so tell IrTTP + * to start delivering frames again before our rx_queue gets + * empty + */ + if (self->rx_flow == FLOW_STOP) { + if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) { + IRDA_DEBUG(2, "%s(), Starting IrTTP\n", __FUNCTION__); + self->rx_flow = FLOW_START; + irttp_flow_request(self->tsap, FLOW_START); + } + } + + return copied; +} + +/* + * Function irda_recvmsg_stream (iocb, sock, msg, size, flags) + */ +static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + int noblock = flags & MSG_DONTWAIT; + size_t copied = 0; + int target = 1; + DECLARE_WAITQUEUE(waitq, current); + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + + if (sock->flags & __SO_ACCEPTCON) + return(-EINVAL); + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + if (flags & MSG_WAITALL) + target = size; + + msg->msg_namelen = 0; + + do { + int chunk; + struct sk_buff *skb = skb_dequeue(&sk->sk_receive_queue); + + if (skb==NULL) { + int ret = 0; + + if (copied >= target) + break; + + /* The following code is a cut'n'paste of the + * wait_event_interruptible() macro. + * We don't us the macro because the test condition + * is messy. - Jean II */ + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + add_wait_queue(sk->sk_sleep, &waitq); + set_current_state(TASK_INTERRUPTIBLE); + + /* + * POSIX 1003.1g mandates this order. + */ + if (sk->sk_err) + ret = sock_error(sk); + else if (sk->sk_shutdown & RCV_SHUTDOWN) + ; + else if (noblock) + ret = -EAGAIN; + else if (signal_pending(current)) + ret = -ERESTARTSYS; + else if (skb_peek(&sk->sk_receive_queue) == NULL) + /* Wait process until data arrives */ + schedule(); + + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &waitq); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + + if(ret) + return(ret); + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + continue; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) { + skb_pull(skb, chunk); + + /* put the skb back if we didn't use it up.. */ + if (skb->len) { + IRDA_DEBUG(1, "%s(), back on q!\n", + __FUNCTION__); + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + + kfree_skb(skb); + } else { + IRDA_DEBUG(0, "%s() questionable!?\n", __FUNCTION__); + + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + + /* + * Check if we have previously stopped IrTTP and we know + * have more free space in our rx_queue. If so tell IrTTP + * to start delivering frames again before our rx_queue gets + * empty + */ + if (self->rx_flow == FLOW_STOP) { + if ((atomic_read(&sk->sk_rmem_alloc) << 2) <= sk->sk_rcvbuf) { + IRDA_DEBUG(2, "%s(), Starting IrTTP\n", __FUNCTION__); + self->rx_flow = FLOW_START; + irttp_flow_request(self->tsap, FLOW_START); + } + } + + return copied; +} + +/* + * Function irda_sendmsg_dgram (iocb, sock, msg, len) + * + * Send message down to TinyTP for the unreliable sequenced + * packet service... 
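+ *
+ * Unlike irda_sendmsg(), there is no flow-control wait here and no
+ * fragmentation : oversized messages are simply truncated to
+ * max_data_size before being handed to irttp_udata_request().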
+ * + */ +static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct irda_sock *self; + struct sk_buff *skb; + unsigned char *asmptr; + int err; + + IRDA_DEBUG(4, "%s(), len=%zd\n", __FUNCTION__, len); + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + self = irda_sk(sk); + IRDA_ASSERT(self != NULL, return -1;); + + /* + * Check that we don't send out to big frames. This is an unreliable + * service, so we have no fragmentation and no coalescence + */ + if (len > self->max_data_size) { + IRDA_DEBUG(0, "%s(), Warning to much data! " + "Chopping frame from %zd to %d bytes!\n", + __FUNCTION__, len, self->max_data_size); + len = self->max_data_size; + } + + skb = sock_alloc_send_skb(sk, len + self->max_header_size, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return -ENOBUFS; + + skb_reserve(skb, self->max_header_size); + + IRDA_DEBUG(4, "%s(), appending user data\n", __FUNCTION__); + asmptr = skb->h.raw = skb_put(skb, len); + err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + if (err) { + kfree_skb(skb); + return err; + } + + /* + * Just send the message to TinyTP, and let it deal with possible + * errors. No need to duplicate all that here + */ + err = irttp_udata_request(self->tsap, skb); + if (err) { + IRDA_DEBUG(0, "%s(), err=%d\n", __FUNCTION__, err); + return err; + } + return len; +} + +/* + * Function irda_sendmsg_ultra (iocb, sock, msg, len) + * + * Send message down to IrLMP for the unreliable Ultra + * packet service... + */ +#ifdef CONFIG_IRDA_ULTRA +static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct irda_sock *self; + __u8 pid = 0; + int bound = 0; + struct sk_buff *skb; + unsigned char *asmptr; + int err; + + IRDA_DEBUG(4, "%s(), len=%zd\n", __FUNCTION__, len); + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + self = irda_sk(sk); + IRDA_ASSERT(self != NULL, return -1;); + + /* Check if an address was specified with sendto. Jean II */ + if (msg->msg_name) { + struct sockaddr_irda *addr = (struct sockaddr_irda *) msg->msg_name; + /* Check address, extract pid. Jean II */ + if (msg->msg_namelen < sizeof(*addr)) + return -EINVAL; + if (addr->sir_family != AF_IRDA) + return -EINVAL; + + pid = addr->sir_lsap_sel; + if (pid & 0x80) { + IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", __FUNCTION__); + return -EOPNOTSUPP; + } + } else { + /* Check that the socket is properly bound to an Ultra + * port. Jean II */ + if ((self->lsap == NULL) || + (sk->sk_state != TCP_ESTABLISHED)) { + IRDA_DEBUG(0, "%s(), socket not bound to Ultra PID.\n", + __FUNCTION__); + return -ENOTCONN; + } + /* Use PID from socket */ + bound = 1; + } + + /* + * Check that we don't send out to big frames. This is an unreliable + * service, so we have no fragmentation and no coalescence + */ + if (len > self->max_data_size) { + IRDA_DEBUG(0, "%s(), Warning to much data! 
" + "Chopping frame from %zd to %d bytes!\n", + __FUNCTION__, len, self->max_data_size); + len = self->max_data_size; + } + + skb = sock_alloc_send_skb(sk, len + self->max_header_size, + msg->msg_flags & MSG_DONTWAIT, &err); + if (!skb) + return -ENOBUFS; + + skb_reserve(skb, self->max_header_size); + + IRDA_DEBUG(4, "%s(), appending user data\n", __FUNCTION__); + asmptr = skb->h.raw = skb_put(skb, len); + err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + if (err) { + kfree_skb(skb); + return err; + } + + err = irlmp_connless_data_request((bound ? self->lsap : NULL), + skb, pid); + if (err) { + IRDA_DEBUG(0, "%s(), err=%d\n", __FUNCTION__, err); + return err; + } + return len; +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irda_shutdown (sk, how) + */ +static int irda_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + + IRDA_ASSERT(self != NULL, return -1;); + + IRDA_DEBUG(1, "%s(%p)\n", __FUNCTION__, self); + + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + + if (self->iriap) { + iriap_close(self->iriap); + self->iriap = NULL; + } + + if (self->tsap) { + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + + /* A few cleanup so the socket look as good as new... */ + self->rx_flow = self->tx_flow = FLOW_START; /* needed ??? */ + self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */ + self->saddr = 0x0; /* so IrLMP assign us any link */ + + return 0; +} + +/* + * Function irda_poll (file, sock, wait) + */ +static unsigned int irda_poll(struct file * file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + unsigned int mask; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + /* Exceptional events? */ + if (sk->sk_err) + mask |= POLLERR; + if (sk->sk_shutdown & RCV_SHUTDOWN) { + IRDA_DEBUG(0, "%s(), POLLHUP\n", __FUNCTION__); + mask |= POLLHUP; + } + + /* Readable? 
*/ + if (!skb_queue_empty(&sk->sk_receive_queue)) { + IRDA_DEBUG(4, "Socket is readable\n"); + mask |= POLLIN | POLLRDNORM; + } + + /* Connection-based need to check for termination and startup */ + switch (sk->sk_type) { + case SOCK_STREAM: + if (sk->sk_state == TCP_CLOSE) { + IRDA_DEBUG(0, "%s(), POLLHUP\n", __FUNCTION__); + mask |= POLLHUP; + } + + if (sk->sk_state == TCP_ESTABLISHED) { + if ((self->tx_flow == FLOW_START) && + sock_writeable(sk)) + { + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + } + } + break; + case SOCK_SEQPACKET: + if ((self->tx_flow == FLOW_START) && + sock_writeable(sk)) + { + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + } + break; + case SOCK_DGRAM: + if (sock_writeable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + break; + default: + break; + } + return mask; +} + +/* + * Function irda_ioctl (sock, cmd, arg) + */ +static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + + IRDA_DEBUG(4, "%s(), cmd=%#x\n", __FUNCTION__, cmd); + + switch (cmd) { + case TIOCOUTQ: { + long amount; + amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + if (put_user(amount, (unsigned int __user *)arg)) + return -EFAULT; + return 0; + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + if (put_user(amount, (unsigned int __user *)arg)) + return -EFAULT; + return 0; + } + + case SIOCGSTAMP: + if (sk != NULL) + return sock_get_timestamp(sk, (struct timeval __user *)arg); + return -EINVAL; + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + return -EINVAL; + default: + IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); + return dev_ioctl(cmd, (void __user *) arg); + } + + /*NOTREACHED*/ + return 0; +} + +/* + * Function irda_setsockopt (sock, level, optname, optval, optlen) + * + * Set some options for the socket + * + */ +static int irda_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + struct irda_ias_set *ias_opt; + struct ias_object *ias_obj; + struct ias_attrib * ias_attr; /* Attribute in IAS object */ + int opt; + + IRDA_ASSERT(self != NULL, return -1;); + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + if (level != SOL_IRLMP) + return -ENOPROTOOPT; + + switch (optname) { + case IRLMP_IAS_SET: + /* The user want to add an attribute to an existing IAS object + * (in the IAS database) or to create a new object with this + * attribute. + * We first query IAS to know if the object exist, and then + * create the right attribute... + */ + + if (optlen != sizeof(struct irda_ias_set)) + return -EINVAL; + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) + return -ENOMEM; + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, optlen)) { + kfree(ias_opt); + return -EFAULT; + } + + /* Find the object we target. + * If the user gives us an empty string, we use the object + * associated with this socket. 
This will workaround + * duplicated class name - Jean II */ + if(ias_opt->irda_class_name[0] == '\0') { + if(self->ias_obj == NULL) { + kfree(ias_opt); + return -EINVAL; + } + ias_obj = self->ias_obj; + } else + ias_obj = irias_find_object(ias_opt->irda_class_name); + + /* Only ROOT can mess with the global IAS database. + * Users can only add attributes to the object associated + * with the socket they own - Jean II */ + if((!capable(CAP_NET_ADMIN)) && + ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { + kfree(ias_opt); + return -EPERM; + } + + /* If the object doesn't exist, create it */ + if(ias_obj == (struct ias_object *) NULL) { + /* Create a new object */ + ias_obj = irias_new_object(ias_opt->irda_class_name, + jiffies); + } + + /* Do we have the attribute already ? */ + if(irias_find_attrib(ias_obj, ias_opt->irda_attrib_name)) { + kfree(ias_opt); + return -EINVAL; + } + + /* Look at the type */ + switch(ias_opt->irda_attrib_type) { + case IAS_INTEGER: + /* Add an integer attribute */ + irias_add_integer_attrib( + ias_obj, + ias_opt->irda_attrib_name, + ias_opt->attribute.irda_attrib_int, + IAS_USER_ATTR); + break; + case IAS_OCT_SEQ: + /* Check length */ + if(ias_opt->attribute.irda_attrib_octet_seq.len > + IAS_MAX_OCTET_STRING) { + kfree(ias_opt); + return -EINVAL; + } + /* Add an octet sequence attribute */ + irias_add_octseq_attrib( + ias_obj, + ias_opt->irda_attrib_name, + ias_opt->attribute.irda_attrib_octet_seq.octet_seq, + ias_opt->attribute.irda_attrib_octet_seq.len, + IAS_USER_ATTR); + break; + case IAS_STRING: + /* Should check charset & co */ + /* Check length */ + /* The length is encoded in a __u8, and + * IAS_MAX_STRING == 256, so there is no way + * userspace can pass us a string too large. + * Jean II */ + /* NULL terminate the string (avoid troubles) */ + ias_opt->attribute.irda_attrib_string.string[ias_opt->attribute.irda_attrib_string.len] = '\0'; + /* Add a string attribute */ + irias_add_string_attrib( + ias_obj, + ias_opt->irda_attrib_name, + ias_opt->attribute.irda_attrib_string.string, + IAS_USER_ATTR); + break; + default : + kfree(ias_opt); + return -EINVAL; + } + irias_insert_object(ias_obj); + kfree(ias_opt); + break; + case IRLMP_IAS_DEL: + /* The user want to delete an object from our local IAS + * database. We just need to query the IAS, check is the + * object is not owned by the kernel and delete it. + */ + + if (optlen != sizeof(struct irda_ias_set)) + return -EINVAL; + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) + return -ENOMEM; + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, optlen)) { + kfree(ias_opt); + return -EFAULT; + } + + /* Find the object we target. + * If the user gives us an empty string, we use the object + * associated with this socket. This will workaround + * duplicated class name - Jean II */ + if(ias_opt->irda_class_name[0] == '\0') + ias_obj = self->ias_obj; + else + ias_obj = irias_find_object(ias_opt->irda_class_name); + if(ias_obj == (struct ias_object *) NULL) { + kfree(ias_opt); + return -EINVAL; + } + + /* Only ROOT can mess with the global IAS database. 
+ * Users can only del attributes from the object associated + * with the socket they own - Jean II */ + if((!capable(CAP_NET_ADMIN)) && + ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { + kfree(ias_opt); + return -EPERM; + } + + /* Find the attribute (in the object) we target */ + ias_attr = irias_find_attrib(ias_obj, + ias_opt->irda_attrib_name); + if(ias_attr == (struct ias_attrib *) NULL) { + kfree(ias_opt); + return -EINVAL; + } + + /* Check is the user space own the object */ + if(ias_attr->value->owner != IAS_USER_ATTR) { + IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __FUNCTION__); + kfree(ias_opt); + return -EPERM; + } + + /* Remove the attribute (and maybe the object) */ + irias_delete_attrib(ias_obj, ias_attr, 1); + kfree(ias_opt); + break; + case IRLMP_MAX_SDU_SIZE: + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + /* Only possible for a seqpacket service (TTP with SAR) */ + if (sk->sk_type != SOCK_SEQPACKET) { + IRDA_DEBUG(2, "%s(), setting max_sdu_size = %d\n", + __FUNCTION__, opt); + self->max_sdu_size_rx = opt; + } else { + IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n", + __FUNCTION__); + return -ENOPROTOOPT; + } + break; + case IRLMP_HINTS_SET: + if (optlen < sizeof(int)) + return -EINVAL; + + /* The input is really a (__u8 hints[2]), easier as an int */ + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + /* Unregister any old registration */ + if (self->skey) + irlmp_unregister_service(self->skey); + + self->skey = irlmp_register_service((__u16) opt); + break; + case IRLMP_HINT_MASK_SET: + /* As opposed to the previous case which set the hint bits + * that we advertise, this one set the filter we use when + * making a discovery (nodes which don't match any hint + * bit in the mask are not reported). + */ + if (optlen < sizeof(int)) + return -EINVAL; + + /* The input is really a (__u8 hints[2]), easier as an int */ + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + /* Set the new hint mask */ + self->mask.word = (__u16) opt; + /* Mask out extension bits */ + self->mask.word &= 0x7f7f; + /* Check if no bits */ + if(!self->mask.word) + self->mask.word = 0xFFFF; + + break; + default: + return -ENOPROTOOPT; + } + return 0; +} + +/* + * Function irda_extract_ias_value(ias_opt, ias_value) + * + * Translate internal IAS value structure to the user space representation + * + * The external representation of IAS values, as we exchange them with + * user space program is quite different from the internal representation, + * as stored in the IAS database (because we need a flat structure for + * crossing kernel boundary). + * This function transform the former in the latter. We also check + * that the value type is valid. 
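+ *
+ * A rough sketch of the consumer side in user space (assuming the
+ * struct irda_ias_set layout exported to applications; the class and
+ * attribute names and use_octets() are placeholders) :
+ *
+ *   struct irda_ias_set q;
+ *   socklen_t len = sizeof(q);
+ *   memset(&q, 0, sizeof(q));
+ *   q.daddr = peer_daddr;
+ *   strcpy(q.irda_class_name, "IrDA:IrCOMM");
+ *   strcpy(q.irda_attrib_name, "Parameters");
+ *   if (getsockopt(fd, SOL_IRLMP, IRLMP_IAS_QUERY, &q, &len) == 0 &&
+ *       q.irda_attrib_type == IAS_OCT_SEQ)
+ *           use_octets(q.attribute.irda_attrib_octet_seq.octet_seq,
+ *                      q.attribute.irda_attrib_octet_seq.len);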
+ */ +static int irda_extract_ias_value(struct irda_ias_set *ias_opt, + struct ias_value *ias_value) +{ + /* Look at the type */ + switch (ias_value->type) { + case IAS_INTEGER: + /* Copy the integer */ + ias_opt->attribute.irda_attrib_int = ias_value->t.integer; + break; + case IAS_OCT_SEQ: + /* Set length */ + ias_opt->attribute.irda_attrib_octet_seq.len = ias_value->len; + /* Copy over */ + memcpy(ias_opt->attribute.irda_attrib_octet_seq.octet_seq, + ias_value->t.oct_seq, ias_value->len); + break; + case IAS_STRING: + /* Set length */ + ias_opt->attribute.irda_attrib_string.len = ias_value->len; + ias_opt->attribute.irda_attrib_string.charset = ias_value->charset; + /* Copy over */ + memcpy(ias_opt->attribute.irda_attrib_string.string, + ias_value->t.string, ias_value->len); + /* NULL terminate the string (avoid troubles) */ + ias_opt->attribute.irda_attrib_string.string[ias_value->len] = '\0'; + break; + case IAS_MISSING: + default : + return -EINVAL; + } + + /* Copy type over */ + ias_opt->irda_attrib_type = ias_value->type; + + return 0; +} + +/* + * Function irda_getsockopt (sock, level, optname, optval, optlen) + */ +static int irda_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct irda_sock *self = irda_sk(sk); + struct irda_device_list list; + struct irda_device_info *discoveries; + struct irda_ias_set * ias_opt; /* IAS get/query params */ + struct ias_object * ias_obj; /* Object in IAS */ + struct ias_attrib * ias_attr; /* Attribute in IAS object */ + int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */ + int val = 0; + int len = 0; + int err; + int offset, total; + + IRDA_DEBUG(2, "%s(%p)\n", __FUNCTION__, self); + + if (level != SOL_IRLMP) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if(len < 0) + return -EINVAL; + + switch (optname) { + case IRLMP_ENUMDEVICES: + /* Ask lmp for the current discovery log */ + discoveries = irlmp_get_discoveries(&list.len, self->mask.word, + self->nslots); + /* Check if the we got some results */ + if (discoveries == NULL) + return -EAGAIN; /* Didn't find any devices */ + err = 0; + + /* Write total list length back to client */ + if (copy_to_user(optval, &list, + sizeof(struct irda_device_list) - + sizeof(struct irda_device_info))) + err = -EFAULT; + + /* Offset to first device entry */ + offset = sizeof(struct irda_device_list) - + sizeof(struct irda_device_info); + + /* Copy the list itself - watch for overflow */ + if(list.len > 2048) + { + err = -EINVAL; + goto bed; + } + total = offset + (list.len * sizeof(struct irda_device_info)); + if (total > len) + total = len; + if (copy_to_user(optval+offset, discoveries, total - offset)) + err = -EFAULT; + + /* Write total number of bytes used back to client */ + if (put_user(total, optlen)) + err = -EFAULT; +bed: + /* Free up our buffer */ + kfree(discoveries); + if (err) + return err; + break; + case IRLMP_MAX_SDU_SIZE: + val = self->max_data_size; + len = sizeof(int); + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &val, len)) + return -EFAULT; + break; + case IRLMP_IAS_GET: + /* The user want an object from our local IAS database. 
+ * We just need to query the IAS and return the value + * that we found */ + + /* Check that the user has allocated the right space for us */ + if (len != sizeof(struct irda_ias_set)) + return -EINVAL; + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) + return -ENOMEM; + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, len)) { + kfree(ias_opt); + return -EFAULT; + } + + /* Find the object we target. + * If the user gives us an empty string, we use the object + * associated with this socket. This will workaround + * duplicated class name - Jean II */ + if(ias_opt->irda_class_name[0] == '\0') + ias_obj = self->ias_obj; + else + ias_obj = irias_find_object(ias_opt->irda_class_name); + if(ias_obj == (struct ias_object *) NULL) { + kfree(ias_opt); + return -EINVAL; + } + + /* Find the attribute (in the object) we target */ + ias_attr = irias_find_attrib(ias_obj, + ias_opt->irda_attrib_name); + if(ias_attr == (struct ias_attrib *) NULL) { + kfree(ias_opt); + return -EINVAL; + } + + /* Translate from internal to user structure */ + err = irda_extract_ias_value(ias_opt, ias_attr->value); + if(err) { + kfree(ias_opt); + return err; + } + + /* Copy reply to the user */ + if (copy_to_user(optval, ias_opt, + sizeof(struct irda_ias_set))) { + kfree(ias_opt); + return -EFAULT; + } + /* Note : don't need to put optlen, we checked it */ + kfree(ias_opt); + break; + case IRLMP_IAS_QUERY: + /* The user want an object from a remote IAS database. + * We need to use IAP to query the remote database and + * then wait for the answer to come back. */ + + /* Check that the user has allocated the right space for us */ + if (len != sizeof(struct irda_ias_set)) + return -EINVAL; + + ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); + if (ias_opt == NULL) + return -ENOMEM; + + /* Copy query to the driver. */ + if (copy_from_user(ias_opt, optval, len)) { + kfree(ias_opt); + return -EFAULT; + } + + /* At this point, there are two cases... + * 1) the socket is connected - that's the easy case, we + * just query the device we are connected to... + * 2) the socket is not connected - the user doesn't want + * to connect and/or may not have a valid service name + * (so can't create a fake connection). In this case, + * we assume that the user pass us a valid destination + * address in the requesting structure... + */ + if(self->daddr != DEV_ADDR_ANY) { + /* We are connected - reuse known daddr */ + daddr = self->daddr; + } else { + /* We are not connected, we must specify a valid + * destination address */ + daddr = ias_opt->daddr; + if((!daddr) || (daddr == DEV_ADDR_ANY)) { + kfree(ias_opt); + return -EINVAL; + } + } + + /* Check that we can proceed with IAP */ + if (self->iriap) { + IRDA_WARNING("%s: busy with a previous query\n", + __FUNCTION__); + kfree(ias_opt); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irda_getvalue_confirm); + + if (self->iriap == NULL) { + kfree(ias_opt); + return -ENOMEM; + } + + /* Treat unexpected wakeup as disconnect */ + self->errno = -EHOSTUNREACH; + + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, + self->saddr, daddr, + ias_opt->irda_class_name, + ias_opt->irda_attrib_name); + + /* Wait for answer, if not yet finished (or failed) */ + if (wait_event_interruptible(self->query_wait, + (self->iriap == NULL))) { + /* pending request uses copy of ias_opt-content + * we can free it regardless! 
*/ + kfree(ias_opt); + /* Treat signals as disconnect */ + return -EHOSTUNREACH; + } + + /* Check what happened */ + if (self->errno) + { + kfree(ias_opt); + /* Requested object/attribute doesn't exist */ + if((self->errno == IAS_CLASS_UNKNOWN) || + (self->errno == IAS_ATTRIB_UNKNOWN)) + return (-EADDRNOTAVAIL); + else + return (-EHOSTUNREACH); + } + + /* Translate from internal to user structure */ + err = irda_extract_ias_value(ias_opt, self->ias_result); + if (self->ias_result) + irias_delete_value(self->ias_result); + if (err) { + kfree(ias_opt); + return err; + } + + /* Copy reply to the user */ + if (copy_to_user(optval, ias_opt, + sizeof(struct irda_ias_set))) { + kfree(ias_opt); + return -EFAULT; + } + /* Note : don't need to put optlen, we checked it */ + kfree(ias_opt); + break; + case IRLMP_WAITDEVICE: + /* This function is just another way of seeing life ;-) + * IRLMP_ENUMDEVICES assumes that you have a static network, + * and that you just want to pick one of the devices present. + * On the other hand, in here we assume that no device is + * present and that at some point in the future a device will + * come into range. When this device arrive, we just wake + * up the caller, so that he has time to connect to it before + * the device goes away... + * Note : once the node has been discovered for more than a + * few second, it won't trigger this function, unless it + * goes away and come back changes its hint bits (so we + * might call it IRLMP_WAITNEWDEVICE). + */ + + /* Check that the user is passing us an int */ + if (len != sizeof(int)) + return -EINVAL; + /* Get timeout in ms (max time we block the caller) */ + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + /* Tell IrLMP we want to be notified */ + irlmp_update_client(self->ckey, self->mask.word, + irda_selective_discovery_indication, + NULL, (void *) self); + + /* Do some discovery (and also return cached results) */ + irlmp_discovery_request(self->nslots); + + /* Wait until a node is discovered */ + if (!self->cachedaddr) { + int ret = 0; + + IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __FUNCTION__); + + /* Set watchdog timer to expire in ms. */ + self->errno = 0; + init_timer(&self->watchdog); + self->watchdog.function = irda_discovery_timeout; + self->watchdog.data = (unsigned long) self; + self->watchdog.expires = jiffies + (val * HZ/1000); + add_timer(&(self->watchdog)); + + /* Wait for IR-LMP to call us back */ + __wait_event_interruptible(self->query_wait, + (self->cachedaddr != 0 || self->errno == -ETIME), + ret); + + /* If watchdog is still activated, kill it! */ + if(timer_pending(&(self->watchdog))) + del_timer(&(self->watchdog)); + + IRDA_DEBUG(1, "%s(), ...waking up !\n", __FUNCTION__); + + if (ret != 0) + return ret; + } + else + IRDA_DEBUG(1, "%s(), found immediately !\n", + __FUNCTION__); + + /* Tell IrLMP that we have been notified */ + irlmp_update_client(self->ckey, self->mask.word, + NULL, NULL, NULL); + + /* Check if the we got some results */ + if (!self->cachedaddr) + return -EAGAIN; /* Didn't find any devices */ + daddr = self->cachedaddr; + /* Cleanup */ + self->cachedaddr = 0; + + /* We return the daddr of the device that trigger the + * wakeup. As irlmp pass us only the new devices, we + * are sure that it's not an old device. + * If the user want more details, he should query + * the whole discovery log and pick one device... 
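+ *
+ * Mechanically : the timeout passed by the caller arms the watchdog
+ * (irda_discovery_timeout), which sets errno to -ETIME, while a
+ * matching discovery fills in cachedaddr through
+ * irda_selective_discovery_indication() ; either event ends the
+ * __wait_event_interruptible() below.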
+ */ + if (put_user(daddr, (int __user *)optval)) + return -EFAULT; + + break; + default: + return -ENOPROTOOPT; + } + + return 0; +} + +static struct net_proto_family irda_family_ops = { + .family = PF_IRDA, + .create = irda_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = irda_connect, + .socketpair = sock_no_socketpair, + .accept = irda_accept, + .getname = irda_getname, + .poll = irda_poll, + .ioctl = irda_ioctl, + .listen = irda_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg, + .recvmsg = irda_recvmsg_stream, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = irda_connect, + .socketpair = sock_no_socketpair, + .accept = irda_accept, + .getname = irda_getname, + .poll = datagram_poll, + .ioctl = irda_ioctl, + .listen = irda_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg, + .recvmsg = irda_recvmsg_dgram, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = irda_connect, + .socketpair = sock_no_socketpair, + .accept = irda_accept, + .getname = irda_getname, + .poll = datagram_poll, + .ioctl = irda_ioctl, + .listen = irda_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg_dgram, + .recvmsg = irda_recvmsg_dgram, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#ifdef CONFIG_IRDA_ULTRA +static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { + .family = PF_IRDA, + .owner = THIS_MODULE, + .release = irda_release, + .bind = irda_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = irda_getname, + .poll = datagram_poll, + .ioctl = irda_ioctl, + .listen = sock_no_listen, + .shutdown = irda_shutdown, + .setsockopt = irda_setsockopt, + .getsockopt = irda_getsockopt, + .sendmsg = irda_sendmsg_ultra, + .recvmsg = irda_recvmsg_dgram, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; +#endif /* CONFIG_IRDA_ULTRA */ + +#include +SOCKOPS_WRAP(irda_stream, PF_IRDA); +SOCKOPS_WRAP(irda_seqpacket, PF_IRDA); +SOCKOPS_WRAP(irda_dgram, PF_IRDA); +#ifdef CONFIG_IRDA_ULTRA +SOCKOPS_WRAP(irda_ultra, PF_IRDA); +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irsock_init (pro) + * + * Initialize IrDA protocol + * + */ +int __init irsock_init(void) +{ + int rc = proto_register(&irda_proto, 0); + + if (rc == 0) + rc = sock_register(&irda_family_ops); + + return rc; +} + +/* + * Function irsock_cleanup (void) + * + * Remove IrDA protocol + * + */ +void __exit irsock_cleanup(void) +{ + sock_unregister(PF_IRDA); + proto_unregister(&irda_proto); +} diff --git a/net/irda/discovery.c b/net/irda/discovery.c new file mode 100644 index 000000000000..c4ba5fa1446a --- /dev/null +++ b/net/irda/discovery.c @@ -0,0 +1,419 @@ +/********************************************************************* + * + * Filename: discovery.c + * Version: 0.1 + * Description: Routines for handling discoveries at the 
IrLMP layer + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Apr 6 15:33:50 1999 + * Modified at: Sat Oct 9 17:11:31 1999 + * Modified by: Dag Brattli + * Modified at: Fri May 28 3:11 CST 1999 + * Modified by: Horst von Brand + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include + +#include +#include + +#include + +/* + * Function irlmp_add_discovery (cachelog, discovery) + * + * Add a new discovery to the cachelog, and remove any old discoveries + * from the same device + * + * Note : we try to preserve the time this device was *first* discovered + * (as opposed to the time of last discovery used for cleanup). This is + * used by clients waiting for discovery events to tell if the device + * discovered is "new" or just the same old one. They can't rely there + * on a binary flag (new/old), because not all discovery events are + * propagated to them, and they might not always listen, so they would + * miss some new devices popping up... + * Jean II + */ +void irlmp_add_discovery(hashbin_t *cachelog, discovery_t *new) +{ + discovery_t *discovery, *node; + unsigned long flags; + + /* Set time of first discovery if node is new (see below) */ + new->firststamp = new->timestamp; + + spin_lock_irqsave(&cachelog->hb_spinlock, flags); + + /* + * Remove all discoveries of devices that has previously been + * discovered on the same link with the same name (info), or the + * same daddr. We do this since some devices (mostly PDAs) change + * their device address between every discovery. + */ + discovery = (discovery_t *) hashbin_get_first(cachelog); + while (discovery != NULL ) { + node = discovery; + + /* Be sure to stay one item ahead */ + discovery = (discovery_t *) hashbin_get_next(cachelog); + + if ((node->data.saddr == new->data.saddr) && + ((node->data.daddr == new->data.daddr) || + (strcmp(node->data.info, new->data.info) == 0))) + { + /* This discovery is a previous discovery + * from the same device, so just remove it + */ + hashbin_remove_this(cachelog, (irda_queue_t *) node); + /* Check if hints bits are unchanged */ + if(u16ho(node->data.hints) == u16ho(new->data.hints)) + /* Set time of first discovery for this node */ + new->firststamp = node->firststamp; + kfree(node); + } + } + + /* Insert the new and updated version */ + hashbin_insert(cachelog, (irda_queue_t *) new, new->data.daddr, NULL); + + spin_unlock_irqrestore(&cachelog->hb_spinlock, flags); +} + +/* + * Function irlmp_add_discovery_log (cachelog, log) + * + * Merge a disovery log into the cachelog. 
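+ * Entries are pulled off the IrLAP log one by one with
+ * hashbin_remove_first() and folded into the global cachelog via
+ * irlmp_add_discovery(), after which the now empty source log is
+ * deleted.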
+ * + */ +void irlmp_add_discovery_log(hashbin_t *cachelog, hashbin_t *log) +{ + discovery_t *discovery; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + /* + * If log is missing this means that IrLAP was unable to perform the + * discovery, so restart discovery again with just the half timeout + * of the normal one. + */ + /* Well... It means that there was nobody out there - Jean II */ + if (log == NULL) { + /* irlmp_start_discovery_timer(irlmp, 150); */ + return; + } + + /* + * Locking : we are the only owner of this discovery log, so + * no need to lock it. + * We just need to lock the global log in irlmp_add_discovery(). + */ + discovery = (discovery_t *) hashbin_remove_first(log); + while (discovery != NULL) { + irlmp_add_discovery(cachelog, discovery); + + discovery = (discovery_t *) hashbin_remove_first(log); + } + + /* Delete the now empty log */ + hashbin_delete(log, (FREE_FUNC) kfree); +} + +/* + * Function irlmp_expire_discoveries (log, saddr, force) + * + * Go through all discoveries and expire all that has stayed too long + * + * Note : this assume that IrLAP won't change its saddr, which + * currently is a valid assumption... + */ +void irlmp_expire_discoveries(hashbin_t *log, __u32 saddr, int force) +{ + discovery_t * discovery; + discovery_t * curr; + unsigned long flags; + discinfo_t * buffer = NULL; + int n; /* Size of the full log */ + int i = 0; /* How many we expired */ + + IRDA_ASSERT(log != NULL, return;); + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + spin_lock_irqsave(&log->hb_spinlock, flags); + + discovery = (discovery_t *) hashbin_get_first(log); + while (discovery != NULL) { + /* Be sure to be one item ahead */ + curr = discovery; + discovery = (discovery_t *) hashbin_get_next(log); + + /* Test if it's time to expire this discovery */ + if ((curr->data.saddr == saddr) && + (force || + ((jiffies - curr->timestamp) > DISCOVERY_EXPIRE_TIMEOUT))) + { + /* Create buffer as needed. + * As this function get called a lot and most time + * we don't have anything to put in the log (we are + * quite picky), we can save a lot of overhead + * by not calling kmalloc. Jean II */ + if(buffer == NULL) { + /* Create the client specific buffer */ + n = HASHBIN_GET_SIZE(log); + buffer = kmalloc(n * sizeof(struct irda_device_info), GFP_ATOMIC); + if (buffer == NULL) { + spin_unlock_irqrestore(&log->hb_spinlock, flags); + return; + } + + } + + /* Copy discovery information */ + memcpy(&(buffer[i]), &(curr->data), + sizeof(discinfo_t)); + i++; + + /* Remove it from the log */ + curr = hashbin_remove_this(log, (irda_queue_t *) curr); + if (curr) + kfree(curr); + } + } + + /* Drop the spinlock before calling the higher layers, as + * we can't guarantee they won't call us back and create a + * deadlock. We will work on our own private data, so we + * don't care to be interupted. 
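The expiry routine copies the doomed entries into a private buffer while holding the spinlock and only calls back into the higher layers once the lock is dropped, precisely to avoid the re-entrancy deadlock described in its comment. A rough user-space model of that lock discipline, with a pthread mutex standing in for the hashbin spinlock (every identifier below is illustrative):

#include <stdio.h>
#include <pthread.h>

#define MAX_ENTRIES 16

struct entry { int id; int expired; };

static struct entry table[MAX_ENTRIES];
static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for irlmp_discovery_expiry(): must run without the lock held. */
static void notify_expiry(const struct entry *gone, int n)
{
    int i;

    for (i = 0; i < n; i++)
        printf("expired entry %d\n", gone[i].id);
}

static void expire_entries(void)
{
    struct entry snapshot[MAX_ENTRIES];
    int n = 0;
    int i;

    pthread_mutex_lock(&table_lock);
    for (i = 0; i < MAX_ENTRIES; i++) {
        if (table[i].expired) {
            snapshot[n++] = table[i];   /* copy while still protected */
            table[i].expired = 0;       /* "remove" it from the table */
            table[i].id = 0;
        }
    }
    pthread_mutex_unlock(&table_lock);

    if (n)                              /* callbacks run lock-free */
        notify_expiry(snapshot, n);
}

int main(void)
{
    table[3].id = 42;
    table[3].expired = 1;
    expire_entries();
    return 0;
}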
- Jean II */ + spin_unlock_irqrestore(&log->hb_spinlock, flags); + + if(buffer == NULL) + return; + + /* Tell IrLMP and registered clients about it */ + irlmp_discovery_expiry(buffer, i); + + /* Free up our buffer */ + kfree(buffer); +} + +#if 0 +/* + * Function irlmp_dump_discoveries (log) + * + * Print out all discoveries in log + * + */ +void irlmp_dump_discoveries(hashbin_t *log) +{ + discovery_t *discovery; + + IRDA_ASSERT(log != NULL, return;); + + discovery = (discovery_t *) hashbin_get_first(log); + while (discovery != NULL) { + IRDA_DEBUG(0, "Discovery:\n"); + IRDA_DEBUG(0, " daddr=%08x\n", discovery->data.daddr); + IRDA_DEBUG(0, " saddr=%08x\n", discovery->data.saddr); + IRDA_DEBUG(0, " nickname=%s\n", discovery->data.info); + + discovery = (discovery_t *) hashbin_get_next(log); + } +} +#endif + +/* + * Function irlmp_copy_discoveries (log, pn, mask) + * + * Copy all discoveries in a buffer + * + * This function implement a safe way for lmp clients to access the + * discovery log. The basic problem is that we don't want the log + * to change (add/remove) while the client is reading it. If the + * lmp client manipulate directly the hashbin, he is sure to get + * into troubles... + * The idea is that we copy all the current discovery log in a buffer + * which is specific to the client and pass this copy to him. As we + * do this operation with the spinlock grabbed, we are safe... + * Note : we don't want those clients to grab the spinlock, because + * we have no control on how long they will hold it... + * Note : we choose to copy the log in "struct irda_device_info" to + * save space... + * Note : the client must kfree himself() the log... + * Jean II + */ +struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn, + __u16 mask, int old_entries) +{ + discovery_t * discovery; + unsigned long flags; + discinfo_t * buffer = NULL; + int j_timeout = (sysctl_discovery_timeout * HZ); + int n; /* Size of the full log */ + int i = 0; /* How many we picked */ + + IRDA_ASSERT(pn != NULL, return NULL;); + IRDA_ASSERT(log != NULL, return NULL;); + + /* Save spin lock */ + spin_lock_irqsave(&log->hb_spinlock, flags); + + discovery = (discovery_t *) hashbin_get_first(log); + while (discovery != NULL) { + /* Mask out the ones we don't want : + * We want to match the discovery mask, and to get only + * the most recent one (unless we want old ones) */ + if ((u16ho(discovery->data.hints) & mask) && + ((old_entries) || + ((jiffies - discovery->firststamp) < j_timeout)) ) { + /* Create buffer as needed. + * As this function get called a lot and most time + * we don't have anything to put in the log (we are + * quite picky), we can save a lot of overhead + * by not calling kmalloc. 
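For reference, a hypothetical IrLMP client (not part of this patch) would consume the snapshot returned by irlmp_copy_discoveries() roughly as sketched below. Only the contract stated above is assumed: a buffer of irda_device_info entries comes back, the count lands in *pn, and the caller must kfree() it. The hint mask and the debug print are made up for the example:

/* Illustrative caller only - not part of this patch. */
static void example_list_devices(void)
{
    struct irda_device_info *log;
    int i, n;

    /* 0xffff: accept any advertised hint bits (illustrative mask) */
    log = irlmp_copy_discoveries(irlmp->cachelog, &n, 0xffff, 0);
    if (log == NULL)
        return;

    for (i = 0; i < n; i++)
        IRDA_DEBUG(1, "%s(), found %s at daddr=%08x\n",
                   __FUNCTION__, log[i].info, log[i].daddr);

    kfree(log);    /* the snapshot belongs to the caller */
}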
Jean II */ + if(buffer == NULL) { + /* Create the client specific buffer */ + n = HASHBIN_GET_SIZE(log); + buffer = kmalloc(n * sizeof(struct irda_device_info), GFP_ATOMIC); + if (buffer == NULL) { + spin_unlock_irqrestore(&log->hb_spinlock, flags); + return NULL; + } + + } + + /* Copy discovery information */ + memcpy(&(buffer[i]), &(discovery->data), + sizeof(discinfo_t)); + i++; + } + discovery = (discovery_t *) hashbin_get_next(log); + } + + spin_unlock_irqrestore(&log->hb_spinlock, flags); + + /* Get the actual number of device in the buffer and return */ + *pn = i; + return(buffer); +} + +#ifdef CONFIG_PROC_FS +static inline discovery_t *discovery_seq_idx(loff_t pos) + +{ + discovery_t *discovery; + + for (discovery = (discovery_t *) hashbin_get_first(irlmp->cachelog); + discovery != NULL; + discovery = (discovery_t *) hashbin_get_next(irlmp->cachelog)) { + if (pos-- == 0) + break; + } + + return discovery; +} + +static void *discovery_seq_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_irq(&irlmp->cachelog->hb_spinlock); + return *pos ? discovery_seq_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *discovery_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (v == SEQ_START_TOKEN) + ? (void *) hashbin_get_first(irlmp->cachelog) + : (void *) hashbin_get_next(irlmp->cachelog); +} + +static void discovery_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irlmp->cachelog->hb_spinlock); +} + +static int discovery_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "IrLMP: Discovery log:\n\n"); + else { + const discovery_t *discovery = v; + + seq_printf(seq, "nickname: %s, hint: 0x%02x%02x", + discovery->data.info, + discovery->data.hints[0], + discovery->data.hints[1]); +#if 0 + if ( discovery->data.hints[0] & HINT_PNP) + seq_puts(seq, "PnP Compatible "); + if ( discovery->data.hints[0] & HINT_PDA) + seq_puts(seq, "PDA/Palmtop "); + if ( discovery->data.hints[0] & HINT_COMPUTER) + seq_puts(seq, "Computer "); + if ( discovery->data.hints[0] & HINT_PRINTER) + seq_puts(seq, "Printer "); + if ( discovery->data.hints[0] & HINT_MODEM) + seq_puts(seq, "Modem "); + if ( discovery->data.hints[0] & HINT_FAX) + seq_puts(seq, "Fax "); + if ( discovery->data.hints[0] & HINT_LAN) + seq_puts(seq, "LAN Access "); + + if ( discovery->data.hints[1] & HINT_TELEPHONY) + seq_puts(seq, "Telephony "); + if ( discovery->data.hints[1] & HINT_FILE_SERVER) + seq_puts(seq, "File Server "); + if ( discovery->data.hints[1] & HINT_COMM) + seq_puts(seq, "IrCOMM "); + if ( discovery->data.hints[1] & HINT_OBEX) + seq_puts(seq, "IrOBEX "); +#endif + seq_printf(seq,", saddr: 0x%08x, daddr: 0x%08x\n\n", + discovery->data.saddr, + discovery->data.daddr); + + seq_putc(seq, '\n'); + } + return 0; +} + +static struct seq_operations discovery_seq_ops = { + .start = discovery_seq_start, + .next = discovery_seq_next, + .stop = discovery_seq_stop, + .show = discovery_seq_show, +}; + +static int discovery_seq_open(struct inode *inode, struct file *file) +{ + IRDA_ASSERT(irlmp != NULL, return -EINVAL;); + + return seq_open(file, &discovery_seq_ops); +} + +struct file_operations discovery_seq_fops = { + .owner = THIS_MODULE, + .open = discovery_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif diff --git a/net/irda/ircomm/Kconfig b/net/irda/ircomm/Kconfig new file mode 100644 index 000000000000..2d4c6b4a78d6 --- /dev/null +++ b/net/irda/ircomm/Kconfig @@ -0,0 +1,12 @@ +config IRCOMM + tristate "IrCOMM protocol" + 
depends on IRDA + help + Say Y here if you want to build support for the IrCOMM protocol. + To compile it as modules, choose M here: the modules will be + called ircomm and ircomm_tty. + IrCOMM implements serial port emulation, and makes it possible to + use all existing applications that understands TTY's with an + infrared link. Thus you should be able to use application like PPP, + minicom and others. + diff --git a/net/irda/ircomm/Makefile b/net/irda/ircomm/Makefile new file mode 100644 index 000000000000..48689458c086 --- /dev/null +++ b/net/irda/ircomm/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux IrDA IrCOMM protocol layer. +# + +obj-$(CONFIG_IRCOMM) += ircomm.o ircomm-tty.o + +ircomm-objs := ircomm_core.o ircomm_event.o ircomm_lmp.o ircomm_ttp.o +ircomm-tty-objs := ircomm_tty.o ircomm_tty_attach.o ircomm_tty_ioctl.o ircomm_param.o diff --git a/net/irda/ircomm/ircomm_core.c b/net/irda/ircomm/ircomm_core.c new file mode 100644 index 000000000000..286881978858 --- /dev/null +++ b/net/irda/ircomm/ircomm_core.c @@ -0,0 +1,587 @@ +/********************************************************************* + * + * Filename: ircomm_core.c + * Version: 1.0 + * Description: IrCOMM service interface + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Jun 6 20:37:34 1999 + * Modified at: Tue Dec 21 13:26:41 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static int __ircomm_close(struct ircomm_cb *self); +static void ircomm_control_indication(struct ircomm_cb *self, + struct sk_buff *skb, int clen); + +#ifdef CONFIG_PROC_FS +extern struct proc_dir_entry *proc_irda; +static int ircomm_seq_open(struct inode *, struct file *); + +static struct file_operations ircomm_proc_fops = { + .owner = THIS_MODULE, + .open = ircomm_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +hashbin_t *ircomm = NULL; + +static int __init ircomm_init(void) +{ + ircomm = hashbin_new(HB_LOCK); + if (ircomm == NULL) { + IRDA_ERROR("%s(), can't allocate hashbin!\n", __FUNCTION__); + return -ENOMEM; + } + +#ifdef CONFIG_PROC_FS + { struct proc_dir_entry *ent; + ent = create_proc_entry("ircomm", 0, proc_irda); + if (ent) + ent->proc_fops = &ircomm_proc_fops; + } +#endif /* CONFIG_PROC_FS */ + + IRDA_MESSAGE("IrCOMM protocol (Dag Brattli)\n"); + + return 0; +} + +static void __exit ircomm_cleanup(void) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + hashbin_delete(ircomm, (FREE_FUNC) __ircomm_close); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("ircomm", proc_irda); +#endif /* CONFIG_PROC_FS */ +} + +/* + * Function ircomm_open (client_notify) + * + * Start a new IrCOMM instance + * + */ +struct ircomm_cb *ircomm_open(notify_t *notify, __u8 service_type, int line) +{ + struct ircomm_cb *self = NULL; + int ret; + + IRDA_DEBUG(2, "%s(), service_type=0x%02x\n", __FUNCTION__ , + service_type); + + IRDA_ASSERT(ircomm != NULL, return NULL;); + + self = kmalloc(sizeof(struct ircomm_cb), GFP_ATOMIC); + if (self == NULL) + return NULL; + + memset(self, 0, sizeof(struct ircomm_cb)); + + self->notify = *notify; + self->magic = IRCOMM_MAGIC; + + /* Check if we should use IrLMP or IrTTP */ + if (service_type & IRCOMM_3_WIRE_RAW) { + self->flow_status = FLOW_START; + ret = ircomm_open_lsap(self); + } else + ret = ircomm_open_tsap(self); + + if (ret < 0) { + kfree(self); + return NULL; + } + + self->service_type = service_type; + self->line = line; + + hashbin_insert(ircomm, (irda_queue_t *) self, line, NULL); + + ircomm_next_state(self, IRCOMM_IDLE); + + return self; +} + +EXPORT_SYMBOL(ircomm_open); + +/* + * Function ircomm_close_instance (self) + * + * Remove IrCOMM instance + * + */ +static int __ircomm_close(struct ircomm_cb *self) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Disconnect link if any */ + ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, NULL, NULL); + + /* Remove TSAP */ + if (self->tsap) { + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + + /* Remove LSAP */ + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } + self->magic = 0; + + kfree(self); + + return 0; +} + +/* + * Function ircomm_close (self) + * + * Closes and removes the specified IrCOMM instance + * + */ +int ircomm_close(struct ircomm_cb *self) +{ + struct ircomm_cb *entry; + + IRDA_ASSERT(self != NULL, return -EIO;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EIO;); + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + entry = 
hashbin_remove(ircomm, self->line, NULL); + + IRDA_ASSERT(entry == self, return -1;); + + return __ircomm_close(self); +} + +EXPORT_SYMBOL(ircomm_close); + +/* + * Function ircomm_connect_request (self, service_type) + * + * Impl. of this function is differ from one of the reference. This + * function does discovery as well as sending connect request + * + */ +int ircomm_connect_request(struct ircomm_cb *self, __u8 dlsap_sel, + __u32 saddr, __u32 daddr, struct sk_buff *skb, + __u8 service_type) +{ + struct ircomm_info info; + int ret; + + IRDA_DEBUG(2 , "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + + self->service_type= service_type; + + info.dlsap_sel = dlsap_sel; + info.saddr = saddr; + info.daddr = daddr; + + ret = ircomm_do_event(self, IRCOMM_CONNECT_REQUEST, skb, &info); + + return ret; +} + +EXPORT_SYMBOL(ircomm_connect_request); + +/* + * Function ircomm_connect_indication (self, qos, skb) + * + * Notify user layer about the incoming connection + * + */ +void ircomm_connect_indication(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info) +{ + int clen = 0; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Check if the packet contains data on the control channel */ + if (skb->len > 0) + clen = skb->data[0]; + + /* + * If there are any data hiding in the control channel, we must + * deliver it first. The side effect is that the control channel + * will be removed from the skb + */ + if (self->notify.connect_indication) + self->notify.connect_indication(self->notify.instance, self, + info->qos, info->max_data_size, + info->max_header_size, skb); + else { + IRDA_DEBUG(0, "%s(), missing handler\n", __FUNCTION__ ); + } +} + +/* + * Function ircomm_connect_response (self, userdata, max_sdu_size) + * + * User accepts connection + * + */ +int ircomm_connect_response(struct ircomm_cb *self, struct sk_buff *userdata) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + ret = ircomm_do_event(self, IRCOMM_CONNECT_RESPONSE, userdata, NULL); + + return ret; +} + +EXPORT_SYMBOL(ircomm_connect_response); + +/* + * Function connect_confirm (self, skb) + * + * Notify user layer that the link is now connected + * + */ +void ircomm_connect_confirm(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + if (self->notify.connect_confirm ) + self->notify.connect_confirm(self->notify.instance, + self, info->qos, + info->max_data_size, + info->max_header_size, skb); + else { + IRDA_DEBUG(0, "%s(), missing handler\n", __FUNCTION__ ); + } +} + +/* + * Function ircomm_data_request (self, userdata) + * + * Send IrCOMM data to peer device + * + */ +int ircomm_data_request(struct ircomm_cb *self, struct sk_buff *skb) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -EFAULT;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;); + IRDA_ASSERT(skb != NULL, return -EFAULT;); + + ret = ircomm_do_event(self, IRCOMM_DATA_REQUEST, skb, NULL); + + return ret; +} + +EXPORT_SYMBOL(ircomm_data_request); + +/* + * Function ircomm_data_indication (self, skb) + * + * Data arrived, so deliver it to user + * + */ +void ircomm_data_indication(struct ircomm_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(skb->len > 0, return;); + + if (self->notify.data_indication) 
+ self->notify.data_indication(self->notify.instance, self, skb); + else { + IRDA_DEBUG(0, "%s(), missing handler\n", __FUNCTION__ ); + } +} + +/* + * Function ircomm_process_data (self, skb) + * + * Data arrived which may contain control channel data + * + */ +void ircomm_process_data(struct ircomm_cb *self, struct sk_buff *skb) +{ + int clen; + + IRDA_ASSERT(skb->len > 0, return;); + + clen = skb->data[0]; + + /* + * If there are any data hiding in the control channel, we must + * deliver it first. The side effect is that the control channel + * will be removed from the skb + */ + if (clen > 0) + ircomm_control_indication(self, skb, clen); + + /* Remove control channel from data channel */ + skb_pull(skb, clen+1); + + if (skb->len) + ircomm_data_indication(self, skb); + else { + IRDA_DEBUG(4, "%s(), data was control info only!\n", + __FUNCTION__ ); + } +} + +/* + * Function ircomm_control_request (self, params) + * + * Send control data to peer device + * + */ +int ircomm_control_request(struct ircomm_cb *self, struct sk_buff *skb) +{ + int ret; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -EFAULT;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EFAULT;); + IRDA_ASSERT(skb != NULL, return -EFAULT;); + + ret = ircomm_do_event(self, IRCOMM_CONTROL_REQUEST, skb, NULL); + + return ret; +} + +EXPORT_SYMBOL(ircomm_control_request); + +/* + * Function ircomm_control_indication (self, skb) + * + * Data has arrived on the control channel + * + */ +static void ircomm_control_indication(struct ircomm_cb *self, + struct sk_buff *skb, int clen) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Use udata for delivering data on the control channel */ + if (self->notify.udata_indication) { + struct sk_buff *ctrl_skb; + + /* We don't own the skb, so clone it */ + ctrl_skb = skb_clone(skb, GFP_ATOMIC); + if (!ctrl_skb) + return; + + /* Remove data channel from control channel */ + skb_trim(ctrl_skb, clen+1); + + self->notify.udata_indication(self->notify.instance, self, + ctrl_skb); + + /* Drop reference count - + * see ircomm_tty_control_indication(). 
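To make the framing concrete: every IrCOMM data frame starts with a one-byte control-channel length (clen), followed by that many bytes of control parameters and then the user payload, which is why ircomm_process_data() pulls clen+1 bytes before delivering the data. A self-contained sketch of that split on a flat buffer, in plain C; the kernel code above does the equivalent with skb_clone(), skb_trim() and skb_pull(), and the control bytes below are arbitrary example values:

#include <stdio.h>
#include <string.h>

/* Split an IrCOMM frame: buf[0] = clen, buf[1..clen] = control
 * parameters, buf[clen+1..len-1] = user data. */
static void split_frame(const unsigned char *buf, size_t len)
{
    size_t clen;

    if (len == 0)
        return;

    clen = buf[0];
    if (clen + 1 > len)          /* malformed frame */
        return;

    if (clen)
        printf("control channel: %zu byte(s)\n", clen);
    if (len - clen - 1)
        printf("user data: %zu byte(s)\n", len - clen - 1);
}

int main(void)
{
    /* Two example control bytes followed by "hi" as user data. */
    const unsigned char frame[] = { 2, 0x11, 0x01, 'h', 'i' };

    split_frame(frame, sizeof(frame));
    return 0;
}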
*/ + dev_kfree_skb(ctrl_skb); + } else { + IRDA_DEBUG(0, "%s(), missing handler\n", __FUNCTION__ ); + } +} + +/* + * Function ircomm_disconnect_request (self, userdata, priority) + * + * User layer wants to disconnect the IrCOMM connection + * + */ +int ircomm_disconnect_request(struct ircomm_cb *self, struct sk_buff *userdata) +{ + struct ircomm_info info; + int ret; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + + ret = ircomm_do_event(self, IRCOMM_DISCONNECT_REQUEST, userdata, + &info); + return ret; +} + +EXPORT_SYMBOL(ircomm_disconnect_request); + +/* + * Function disconnect_indication (self, skb) + * + * Tell user that the link has been disconnected + * + */ +void ircomm_disconnect_indication(struct ircomm_cb *self, struct sk_buff *skb, + struct ircomm_info *info) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(info != NULL, return;); + + if (self->notify.disconnect_indication) { + self->notify.disconnect_indication(self->notify.instance, self, + info->reason, skb); + } else { + IRDA_DEBUG(0, "%s(), missing handler\n", __FUNCTION__ ); + } +} + +/* + * Function ircomm_flow_request (self, flow) + * + * + * + */ +void ircomm_flow_request(struct ircomm_cb *self, LOCAL_FLOW flow) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + if (self->service_type == IRCOMM_3_WIRE_RAW) + return; + + irttp_flow_request(self->tsap, flow); +} + +EXPORT_SYMBOL(ircomm_flow_request); + +#ifdef CONFIG_PROC_FS +static void *ircomm_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct ircomm_cb *self; + loff_t off = 0; + + spin_lock_irq(&ircomm->hb_spinlock); + + for (self = (struct ircomm_cb *) hashbin_get_first(ircomm); + self != NULL; + self = (struct ircomm_cb *) hashbin_get_next(ircomm)) { + if (off++ == *pos) + break; + + } + return self; +} + +static void *ircomm_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (void *) hashbin_get_next(ircomm); +} + +static void ircomm_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&ircomm->hb_spinlock); +} + +static int ircomm_seq_show(struct seq_file *seq, void *v) +{ + const struct ircomm_cb *self = v; + + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -EINVAL; ); + + if(self->line < 0x10) + seq_printf(seq, "ircomm%d", self->line); + else + seq_printf(seq, "irlpt%d", self->line - 0x10); + + seq_printf(seq, + " state: %s, slsap_sel: %#02x, dlsap_sel: %#02x, mode:", + ircomm_state[ self->state], + self->slsap_sel, self->dlsap_sel); + + if(self->service_type & IRCOMM_3_WIRE_RAW) + seq_printf(seq, " 3-wire-raw"); + if(self->service_type & IRCOMM_3_WIRE) + seq_printf(seq, " 3-wire"); + if(self->service_type & IRCOMM_9_WIRE) + seq_printf(seq, " 9-wire"); + if(self->service_type & IRCOMM_CENTRONICS) + seq_printf(seq, " Centronics"); + seq_putc(seq, '\n'); + + return 0; +} + +static struct seq_operations ircomm_seq_ops = { + .start = ircomm_seq_start, + .next = ircomm_seq_next, + .stop = ircomm_seq_stop, + .show = ircomm_seq_show, +}; + +static int ircomm_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &ircomm_seq_ops); +} +#endif /* CONFIG_PROC_FS */ + +MODULE_AUTHOR("Dag Brattli "); +MODULE_DESCRIPTION("IrCOMM protocol"); +MODULE_LICENSE("GPL"); + +module_init(ircomm_init); +module_exit(ircomm_cleanup); diff --git a/net/irda/ircomm/ircomm_event.c b/net/irda/ircomm/ircomm_event.c new file mode 100644 
index 000000000000..01f4e801a1ba --- /dev/null +++ b/net/irda/ircomm/ircomm_event.c @@ -0,0 +1,251 @@ +/********************************************************************* + * + * Filename: ircomm_event.c + * Version: 1.0 + * Description: IrCOMM layer state machine + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Jun 6 20:33:11 1999 + * Modified at: Sun Dec 12 13:44:32 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); +static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info); + +char *ircomm_state[] = { + "IRCOMM_IDLE", + "IRCOMM_WAITI", + "IRCOMM_WAITR", + "IRCOMM_CONN", +}; + +#ifdef CONFIG_IRDA_DEBUG +static char *ircomm_event[] = { + "IRCOMM_CONNECT_REQUEST", + "IRCOMM_CONNECT_RESPONSE", + "IRCOMM_TTP_CONNECT_INDICATION", + "IRCOMM_LMP_CONNECT_INDICATION", + "IRCOMM_TTP_CONNECT_CONFIRM", + "IRCOMM_LMP_CONNECT_CONFIRM", + + "IRCOMM_LMP_DISCONNECT_INDICATION", + "IRCOMM_TTP_DISCONNECT_INDICATION", + "IRCOMM_DISCONNECT_REQUEST", + + "IRCOMM_TTP_DATA_INDICATION", + "IRCOMM_LMP_DATA_INDICATION", + "IRCOMM_DATA_REQUEST", + "IRCOMM_CONTROL_REQUEST", + "IRCOMM_CONTROL_INDICATION", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static int (*state[])(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) = +{ + ircomm_state_idle, + ircomm_state_waiti, + ircomm_state_waitr, + ircomm_state_conn, +}; + +/* + * Function ircomm_state_idle (self, event, skb) + * + * IrCOMM is currently idle + * + */ +static int ircomm_state_idle(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_CONNECT_REQUEST: + ircomm_next_state(self, IRCOMM_WAITI); + ret = self->issue.connect_request(self, skb, info); + break; + case IRCOMM_TTP_CONNECT_INDICATION: + case IRCOMM_LMP_CONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_WAITR); + ircomm_connect_indication(self, skb, info); + break; + default: + IRDA_DEBUG(4, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_state_waiti (self, event, skb) + * + * The IrCOMM user has requested 
an IrCOMM connection to the remote + * device and is awaiting confirmation + */ +static int ircomm_state_waiti(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_TTP_CONNECT_CONFIRM: + case IRCOMM_LMP_CONNECT_CONFIRM: + ircomm_next_state(self, IRCOMM_CONN); + ircomm_connect_confirm(self, skb, info); + break; + case IRCOMM_TTP_DISCONNECT_INDICATION: + case IRCOMM_LMP_DISCONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_IDLE); + ircomm_disconnect_indication(self, skb, info); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_state_waitr (self, event, skb) + * + * IrCOMM has received an incoming connection request and is awaiting + * response from the user + */ +static int ircomm_state_waitr(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_CONNECT_RESPONSE: + ircomm_next_state(self, IRCOMM_CONN); + ret = self->issue.connect_response(self, skb); + break; + case IRCOMM_DISCONNECT_REQUEST: + ircomm_next_state(self, IRCOMM_IDLE); + ret = self->issue.disconnect_request(self, skb, info); + break; + case IRCOMM_TTP_DISCONNECT_INDICATION: + case IRCOMM_LMP_DISCONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_IDLE); + ircomm_disconnect_indication(self, skb, info); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event = %s\n", __FUNCTION__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_state_conn (self, event, skb) + * + * IrCOMM is connected to the peer IrCOMM device + * + */ +static int ircomm_state_conn(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_DATA_REQUEST: + ret = self->issue.data_request(self, skb, 0); + break; + case IRCOMM_TTP_DATA_INDICATION: + ircomm_process_data(self, skb); + break; + case IRCOMM_LMP_DATA_INDICATION: + ircomm_data_indication(self, skb); + break; + case IRCOMM_CONTROL_REQUEST: + /* Just send a separate frame for now */ + ret = self->issue.data_request(self, skb, skb->len); + break; + case IRCOMM_TTP_DISCONNECT_INDICATION: + case IRCOMM_LMP_DISCONNECT_INDICATION: + ircomm_next_state(self, IRCOMM_IDLE); + ircomm_disconnect_indication(self, skb, info); + break; + case IRCOMM_DISCONNECT_REQUEST: + ircomm_next_state(self, IRCOMM_IDLE); + ret = self->issue.disconnect_request(self, skb, info); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event = %s\n", __FUNCTION__ , + ircomm_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_do_event (self, event, skb) + * + * Process event + * + */ +int ircomm_do_event(struct ircomm_cb *self, IRCOMM_EVENT event, + struct sk_buff *skb, struct ircomm_info *info) +{ + IRDA_DEBUG(4, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_state[self->state], ircomm_event[event]); + + return (*state[self->state])(self, event, skb, info); +} + +/* + * Function ircomm_next_state (self, state) + * + * Switch state + * + */ +void ircomm_next_state(struct ircomm_cb *self, IRCOMM_STATE state) +{ + self->state = state; + + IRDA_DEBUG(4, "%s: next state=%s, service type=%d\n", __FUNCTION__ , + ircomm_state[self->state], self->service_type); +} diff --git a/net/irda/ircomm/ircomm_lmp.c b/net/irda/ircomm/ircomm_lmp.c new file mode 100644 index 000000000000..d9097207aed3 --- 
/dev/null +++ b/net/irda/ircomm/ircomm_lmp.c @@ -0,0 +1,372 @@ +/********************************************************************* + * + * Filename: ircomm_lmp.c + * Version: 1.0 + * Description: Interface between IrCOMM and IrLMP + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Jun 6 20:48:27 1999 + * Modified at: Sun Dec 12 13:44:17 1999 + * Modified by: Dag Brattli + * Sources: Previous IrLPT work by Thomas Davis + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include /* struct irda_skb_cb */ + +#include +#include + + +/* + * Function ircomm_lmp_connect_request (self, userdata) + * + * + * + */ +static int ircomm_lmp_connect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + int ret = 0; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irlmp_connect_request(self->lsap, info->dlsap_sel, + info->saddr, info->daddr, NULL, userdata); + return ret; +} + +/* + * Function ircomm_lmp_connect_response (self, skb) + * + * + * + */ +static int ircomm_lmp_connect_response(struct ircomm_cb *self, + struct sk_buff *userdata) +{ + struct sk_buff *tx_skb; + int ret; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + /* Any userdata supplied? */ + if (userdata == NULL) { + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + } else { + /* + * Check that the client has reserved enough space for + * headers + */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_MAX_HEADER, + return -1;); + + /* Don't forget to refcount it - should be NULL anyway */ + skb_get(userdata); + tx_skb = userdata; + } + + ret = irlmp_connect_response(self->lsap, tx_skb); + + return 0; +} + +static int ircomm_lmp_disconnect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + struct sk_buff *tx_skb; + int ret; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + if (!userdata) { + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + userdata = tx_skb; + } else { + /* Don't forget to refcount it - should be NULL anyway */ + skb_get(userdata); + } + + ret = irlmp_disconnect_request(self->lsap, userdata); + + return ret; +} + +/* + * Function ircomm_lmp_flow_control (skb) + * + * This function is called when a data frame we have sent to IrLAP has + * been deallocated. 
We do this to make sure we don't flood IrLAP with + * frames, since we are not using the IrTTP flow control mechanism + */ +static void ircomm_lmp_flow_control(struct sk_buff *skb) +{ + struct irda_skb_cb *cb; + struct ircomm_cb *self; + int line; + + IRDA_ASSERT(skb != NULL, return;); + + cb = (struct irda_skb_cb *) skb->cb; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + line = cb->line; + + self = (struct ircomm_cb *) hashbin_lock_find(ircomm, line, NULL); + if (!self) { + IRDA_DEBUG(2, "%s(), didn't find myself\n", __FUNCTION__ ); + return; + } + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + self->pkt_count--; + + if ((self->pkt_count < 2) && (self->flow_status == FLOW_STOP)) { + IRDA_DEBUG(2, "%s(), asking TTY to start again!\n", __FUNCTION__ ); + self->flow_status = FLOW_START; + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, + self, FLOW_START); + } +} + +/* + * Function ircomm_lmp_data_request (self, userdata) + * + * Send data frame to peer device + * + */ +static int ircomm_lmp_data_request(struct ircomm_cb *self, + struct sk_buff *skb, + int not_used) +{ + struct irda_skb_cb *cb; + int ret; + + IRDA_ASSERT(skb != NULL, return -1;); + + cb = (struct irda_skb_cb *) skb->cb; + + cb->line = self->line; + + IRDA_DEBUG(4, "%s(), sending frame\n", __FUNCTION__ ); + + /* Don't forget to refcount it - see ircomm_tty_do_softint() */ + skb_get(skb); + + skb->destructor = ircomm_lmp_flow_control; + + if ((self->pkt_count++ > 7) && (self->flow_status == FLOW_START)) { + IRDA_DEBUG(2, "%s(), asking TTY to slow down!\n", __FUNCTION__ ); + self->flow_status = FLOW_STOP; + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, + self, FLOW_STOP); + } + ret = irlmp_data_request(self->lsap, skb); + if (ret) { + IRDA_ERROR("%s(), failed\n", __FUNCTION__); + /* irlmp_data_request already free the packet */ + } + + return ret; +} + +/* + * Function ircomm_lmp_data_indication (instance, sap, skb) + * + * Incoming data which we must deliver to the state machine, to check + * we are still connected. + */ +static int ircomm_lmp_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + ircomm_do_event(self, IRCOMM_LMP_DATA_INDICATION, skb, NULL); + + /* Drop reference count - see ircomm_tty_data_indication(). */ + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function ircomm_lmp_connect_confirm (instance, sap, qos, max_sdu_size, + * max_header_size, skb) + * + * Connection has been confirmed by peer device + * + */ +static void ircomm_lmp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, return;); + + info.max_data_size = max_seg_size; + info.max_header_size = max_header_size; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_LMP_CONNECT_CONFIRM, skb, &info); + + /* Drop reference count - see ircomm_tty_connect_confirm(). 
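Because 3-wire raw mode bypasses IrTTP, the LMP path above builds its own back-pressure out of the skb destructor: pkt_count goes up for every frame handed to IrLAP and down when the destructor fires, the TTY is stopped once more than 7 frames are outstanding, and restarted once the count drops below 2. A minimal user-space model of that high/low watermark scheme, with the thresholds taken from the code above and everything else illustrative:

#include <stdio.h>

#define HIGH_WATERMARK 7   /* from ircomm_lmp_data_request()  */
#define LOW_WATERMARK  2   /* from ircomm_lmp_flow_control()  */

static int pkt_count;
static int stopped;

static void tty_flow(int start)
{
    printf(start ? "FLOW_START\n" : "FLOW_STOP\n");
}

/* Called when a frame is queued towards the link layer. */
static void frame_sent(void)
{
    if (pkt_count++ > HIGH_WATERMARK && !stopped) {
        stopped = 1;
        tty_flow(0);
    }
}

/* Called from the frame's "destructor" once the link layer is done. */
static void frame_freed(void)
{
    pkt_count--;
    if (pkt_count < LOW_WATERMARK && stopped) {
        stopped = 0;
        tty_flow(1);
    }
}

int main(void)
{
    int i;

    for (i = 0; i < 10; i++)
        frame_sent();
    for (i = 0; i < 10; i++)
        frame_freed();
    return 0;
}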
*/ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_lmp_connect_indication (instance, sap, qos, max_sdu_size, + * max_header_size, skb) + * + * Peer device wants to make a connection with us + * + */ +static void ircomm_lmp_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *)instance; + struct ircomm_info info; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, return;); + + info.max_data_size = max_seg_size; + info.max_header_size = max_header_size; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_LMP_CONNECT_INDICATION, skb, &info); + + /* Drop reference count - see ircomm_tty_connect_indication(). */ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_lmp_disconnect_indication (instance, sap, reason, skb) + * + * Peer device has closed the connection, or the link went down for some + * other reason + */ +static void ircomm_lmp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + info.reason = reason; + + ircomm_do_event(self, IRCOMM_LMP_DISCONNECT_INDICATION, skb, &info); + + /* Drop reference count - see ircomm_tty_disconnect_indication(). */ + if(skb) + dev_kfree_skb(skb); +} +/* + * Function ircomm_open_lsap (self) + * + * Open LSAP. This function will only be used when using "raw" services + * + */ +int ircomm_open_lsap(struct ircomm_cb *self) +{ + notify_t notify; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + /* Register callbacks */ + irda_notify_init(¬ify); + notify.data_indication = ircomm_lmp_data_indication; + notify.connect_confirm = ircomm_lmp_connect_confirm; + notify.connect_indication = ircomm_lmp_connect_indication; + notify.disconnect_indication = ircomm_lmp_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrCOMM", sizeof(notify.name)); + + self->lsap = irlmp_open_lsap(LSAP_ANY, ¬ify, 0); + if (!self->lsap) { + IRDA_DEBUG(0,"%sfailed to allocate tsap\n", __FUNCTION__ ); + return -1; + } + self->slsap_sel = self->lsap->slsap_sel; + + /* + * Initialize the call-table for issuing commands + */ + self->issue.data_request = ircomm_lmp_data_request; + self->issue.connect_request = ircomm_lmp_connect_request; + self->issue.connect_response = ircomm_lmp_connect_response; + self->issue.disconnect_request = ircomm_lmp_disconnect_request; + + return 0; +} diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c new file mode 100644 index 000000000000..6009bab05091 --- /dev/null +++ b/net/irda/ircomm/ircomm_param.c @@ -0,0 +1,511 @@ +/********************************************************************* + * + * Filename: ircomm_param.c + * Version: 1.0 + * Description: Parameter handling for the IrCOMM protocol + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Jun 7 10:25:11 1999 + * Modified at: Sun Jan 30 14:32:03 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. 
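ircomm_open_lsap() above, like ircomm_open_tsap() later in this patch, fills in a small call-table so the IrCOMM state machine can issue requests without caring whether IrLMP or IrTTP sits underneath. A stripped-down, self-contained illustration of that dispatch pattern follows; the struct, both back-ends and the raw_mode flag are invented for the example:

#include <stdio.h>

struct conn;                           /* opaque connection handle */

/* One set of function pointers per transport back-end. */
struct issue_ops {
    int (*data_request)(struct conn *c, const char *buf, int len);
    int (*disconnect_request)(struct conn *c);
};

static int lmp_data(struct conn *c, const char *buf, int len)
{ (void)c; (void)buf; printf("LMP tx %d bytes\n", len); return 0; }
static int lmp_disc(struct conn *c)
{ (void)c; printf("LMP disconnect\n"); return 0; }

static int ttp_data(struct conn *c, const char *buf, int len)
{ (void)c; (void)buf; printf("TTP tx %d bytes\n", len); return 0; }
static int ttp_disc(struct conn *c)
{ (void)c; printf("TTP disconnect\n"); return 0; }

static const struct issue_ops lmp_ops = { lmp_data, lmp_disc };
static const struct issue_ops ttp_ops = { ttp_data, ttp_disc };

int main(void)
{
    int raw_mode = 1;                  /* 3-wire-raw-style service */
    const struct issue_ops *issue = raw_mode ? &lmp_ops : &ttp_ops;

    issue->data_request(NULL, "hello", 5);
    issue->disconnect_request(NULL);
    return 0;
}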
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include + +static int ircomm_param_service_type(void *instance, irda_param_t *param, + int get); +static int ircomm_param_port_type(void *instance, irda_param_t *param, + int get); +static int ircomm_param_port_name(void *instance, irda_param_t *param, + int get); +static int ircomm_param_service_type(void *instance, irda_param_t *param, + int get); +static int ircomm_param_data_rate(void *instance, irda_param_t *param, + int get); +static int ircomm_param_data_format(void *instance, irda_param_t *param, + int get); +static int ircomm_param_flow_control(void *instance, irda_param_t *param, + int get); +static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get); +static int ircomm_param_enq_ack(void *instance, irda_param_t *param, int get); +static int ircomm_param_line_status(void *instance, irda_param_t *param, + int get); +static int ircomm_param_dte(void *instance, irda_param_t *param, int get); +static int ircomm_param_dce(void *instance, irda_param_t *param, int get); +static int ircomm_param_poll(void *instance, irda_param_t *param, int get); + +static pi_minor_info_t pi_minor_call_table_common[] = { + { ircomm_param_service_type, PV_INT_8_BITS }, + { ircomm_param_port_type, PV_INT_8_BITS }, + { ircomm_param_port_name, PV_STRING } +}; +static pi_minor_info_t pi_minor_call_table_non_raw[] = { + { ircomm_param_data_rate, PV_INT_32_BITS | PV_BIG_ENDIAN }, + { ircomm_param_data_format, PV_INT_8_BITS }, + { ircomm_param_flow_control, PV_INT_8_BITS }, + { ircomm_param_xon_xoff, PV_INT_16_BITS }, + { ircomm_param_enq_ack, PV_INT_16_BITS }, + { ircomm_param_line_status, PV_INT_8_BITS } +}; +static pi_minor_info_t pi_minor_call_table_9_wire[] = { + { ircomm_param_dte, PV_INT_8_BITS }, + { ircomm_param_dce, PV_INT_8_BITS }, + { ircomm_param_poll, PV_NO_VALUE }, +}; + +static pi_major_info_t pi_major_call_table[] = { + { pi_minor_call_table_common, 3 }, + { pi_minor_call_table_non_raw, 6 }, + { pi_minor_call_table_9_wire, 3 } +/* { pi_minor_call_table_centronics } */ +}; + +pi_param_info_t ircomm_param_info = { pi_major_call_table, 3, 0x0f, 4 }; + +/* + * Function ircomm_param_request (self, pi, flush) + * + * Queue a parameter for the control channel + * + */ +int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush) +{ + struct tty_struct *tty; + unsigned long flags; + struct sk_buff *skb; + int count; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + tty = self->tty; + if (!tty) + return 0; + + /* Make sure we don't send parameters for raw mode */ + if (self->service_type == 
IRCOMM_3_WIRE_RAW) + return 0; + + spin_lock_irqsave(&self->spinlock, flags); + + skb = self->ctrl_skb; + if (!skb) { + skb = dev_alloc_skb(256); + if (!skb) { + spin_unlock_irqrestore(&self->spinlock, flags); + return -ENOMEM; + } + + skb_reserve(skb, self->max_header_size); + self->ctrl_skb = skb; + } + /* + * Inserting is a little bit tricky since we don't know how much + * room we will need. But this should hopefully work OK + */ + count = irda_param_insert(self, pi, skb->tail, skb_tailroom(skb), + &ircomm_param_info); + if (count < 0) { + IRDA_WARNING("%s(), no room for parameter!\n", __FUNCTION__); + spin_unlock_irqrestore(&self->spinlock, flags); + return -1; + } + skb_put(skb, count); + + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(2, "%s(), skb->len=%d\n", __FUNCTION__ , skb->len); + + if (flush) { + /* ircomm_tty_do_softint will take care of the rest */ + schedule_work(&self->tqueue); + } + + return count; +} + +/* + * Function ircomm_param_service_type (self, buf, len) + * + * Handle service type, this function will both be called after the LM-IAS + * query and then the remote device sends its initial parameters + * + */ +static int ircomm_param_service_type(void *instance, irda_param_t *param, + int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + __u8 service_type = (__u8) param->pv.i; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + param->pv.i = self->settings.service_type; + return 0; + } + + /* Find all common service types */ + service_type &= self->service_type; + if (!service_type) { + IRDA_DEBUG(2, + "%s(), No common service type to use!\n", __FUNCTION__ ); + return -1; + } + IRDA_DEBUG(0, "%s(), services in common=%02x\n", __FUNCTION__ , + service_type); + + /* + * Now choose a preferred service type of those available + */ + if (service_type & IRCOMM_CENTRONICS) + self->settings.service_type = IRCOMM_CENTRONICS; + else if (service_type & IRCOMM_9_WIRE) + self->settings.service_type = IRCOMM_9_WIRE; + else if (service_type & IRCOMM_3_WIRE) + self->settings.service_type = IRCOMM_3_WIRE; + else if (service_type & IRCOMM_3_WIRE_RAW) + self->settings.service_type = IRCOMM_3_WIRE_RAW; + + IRDA_DEBUG(0, "%s(), resulting service type=0x%02x\n", __FUNCTION__ , + self->settings.service_type); + + /* + * Now the line is ready for some communication. Check if we are a + * server, and send over some initial parameters. + * Client do it in ircomm_tty_state_setup(). + * Note : we may get called from ircomm_tty_getvalue_confirm(), + * therefore before we even have open any socket. And self->client + * is initialised to TRUE only later. So, we check if the link is + * really initialised. - Jean II + */ + if ((self->max_header_size != IRCOMM_TTY_HDR_UNINITIALISED) && + (!self->client) && + (self->settings.service_type != IRCOMM_3_WIRE_RAW)) + { + /* Init connection */ + ircomm_tty_send_initial_parameters(self); + ircomm_tty_link_established(self); + } + + return 0; +} + +/* + * Function ircomm_param_port_type (self, param) + * + * The port type parameter tells if the devices are serial or parallel. + * Since we only advertise serial service, this parameter should only + * be equal to IRCOMM_SERIAL. 
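The negotiation in ircomm_param_service_type() above boils down to intersecting the peer's offered bitmask with our own and then keeping the richest service that survives. A self-contained sketch of that selection; the bit values are made up for the example, and only the ordering Centronics > 9-wire > 3-wire > 3-wire-raw comes from the code above:

#include <stdio.h>

/* Illustrative bit assignments, not the kernel's actual values. */
#define SVC_3_WIRE_RAW  0x01
#define SVC_3_WIRE      0x02
#define SVC_9_WIRE      0x04
#define SVC_CENTRONICS  0x08

static unsigned char pick_service(unsigned char ours, unsigned char theirs)
{
    unsigned char common = ours & theirs;   /* services both sides support */

    if (!common)
        return 0;                           /* negotiation fails */
    if (common & SVC_CENTRONICS)
        return SVC_CENTRONICS;
    if (common & SVC_9_WIRE)
        return SVC_9_WIRE;
    if (common & SVC_3_WIRE)
        return SVC_3_WIRE;
    return SVC_3_WIRE_RAW;
}

int main(void)
{
    /* We offer 9-wire and 3-wire; the peer offers 3-wire and raw. */
    printf("chosen: 0x%02x\n",
           pick_service(SVC_9_WIRE | SVC_3_WIRE,
                        SVC_3_WIRE | SVC_3_WIRE_RAW));
    return 0;
}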
+ */ +static int ircomm_param_port_type(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = IRCOMM_SERIAL; + else { + self->settings.port_type = (__u8) param->pv.i; + + IRDA_DEBUG(0, "%s(), port type=%d\n", __FUNCTION__ , + self->settings.port_type); + } + return 0; +} + +/* + * Function ircomm_param_port_name (self, param) + * + * Exchange port name + * + */ +static int ircomm_param_port_name(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + IRDA_DEBUG(0, "%s(), not imp!\n", __FUNCTION__ ); + } else { + IRDA_DEBUG(0, "%s(), port-name=%s\n", __FUNCTION__ , param->pv.c); + strncpy(self->settings.port_name, param->pv.c, 32); + } + + return 0; +} + +/* + * Function ircomm_param_data_rate (self, param) + * + * Exchange data rate to be used in this settings + * + */ +static int ircomm_param_data_rate(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.data_rate; + else + self->settings.data_rate = param->pv.i; + + IRDA_DEBUG(2, "%s(), data rate = %d\n", __FUNCTION__ , param->pv.i); + + return 0; +} + +/* + * Function ircomm_param_data_format (self, param) + * + * Exchange data format to be used in this settings + * + */ +static int ircomm_param_data_format(void *instance, irda_param_t *param, + int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.data_format; + else + self->settings.data_format = (__u8) param->pv.i; + + return 0; +} + +/* + * Function ircomm_param_flow_control (self, param) + * + * Exchange flow control settings to be used in this settings + * + */ +static int ircomm_param_flow_control(void *instance, irda_param_t *param, + int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.flow_control; + else + self->settings.flow_control = (__u8) param->pv.i; + + IRDA_DEBUG(1, "%s(), flow control = 0x%02x\n", __FUNCTION__ , (__u8) param->pv.i); + + return 0; +} + +/* + * Function ircomm_param_xon_xoff (self, param) + * + * Exchange XON/XOFF characters + * + */ +static int ircomm_param_xon_xoff(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + param->pv.i = self->settings.xonxoff[0]; + param->pv.i |= self->settings.xonxoff[1] << 8; + } else { + self->settings.xonxoff[0] = (__u16) param->pv.i & 0xff; + self->settings.xonxoff[1] = (__u16) param->pv.i >> 8; + } + + IRDA_DEBUG(0, "%s(), XON/XOFF = 0x%02x,0x%02x\n", __FUNCTION__ , + param->pv.i & 0xff, param->pv.i >> 8); + + return 0; +} + +/* + * Function ircomm_param_enq_ack (self, param) + * + * Exchange ENQ/ACK characters + * + */ +static int 
ircomm_param_enq_ack(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) { + param->pv.i = self->settings.enqack[0]; + param->pv.i |= self->settings.enqack[1] << 8; + } else { + self->settings.enqack[0] = (__u16) param->pv.i & 0xff; + self->settings.enqack[1] = (__u16) param->pv.i >> 8; + } + + IRDA_DEBUG(0, "%s(), ENQ/ACK = 0x%02x,0x%02x\n", __FUNCTION__ , + param->pv.i & 0xff, param->pv.i >> 8); + + return 0; +} + +/* + * Function ircomm_param_line_status (self, param) + * + * + * + */ +static int ircomm_param_line_status(void *instance, irda_param_t *param, + int get) +{ + IRDA_DEBUG(2, "%s(), not impl.\n", __FUNCTION__ ); + + return 0; +} + +/* + * Function ircomm_param_dte (instance, param) + * + * If we get here, there must be some sort of null-modem connection, and + * we are probably working in server mode as well. + */ +static int ircomm_param_dte(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + __u8 dte; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (get) + param->pv.i = self->settings.dte; + else { + dte = (__u8) param->pv.i; + + self->settings.dce = 0; + + if (dte & IRCOMM_DELTA_DTR) + self->settings.dce |= (IRCOMM_DELTA_DSR| + IRCOMM_DELTA_RI | + IRCOMM_DELTA_CD); + if (dte & IRCOMM_DTR) + self->settings.dce |= (IRCOMM_DSR| + IRCOMM_RI | + IRCOMM_CD); + + if (dte & IRCOMM_DELTA_RTS) + self->settings.dce |= IRCOMM_DELTA_CTS; + if (dte & IRCOMM_RTS) + self->settings.dce |= IRCOMM_CTS; + + /* Take appropriate actions */ + ircomm_tty_check_modem_status(self); + + /* Null modem cable emulator */ + self->settings.null_modem = TRUE; + } + + return 0; +} + +/* + * Function ircomm_param_dce (instance, param) + * + * + * + */ +static int ircomm_param_dce(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + __u8 dce; + + IRDA_DEBUG(1, "%s(), dce = 0x%02x\n", __FUNCTION__ , (__u8) param->pv.i); + + dce = (__u8) param->pv.i; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + self->settings.dce = dce; + + /* Check if any of the settings have changed */ + if (dce & 0x0f) { + if (dce & IRCOMM_DELTA_CTS) { + IRDA_DEBUG(2, "%s(), CTS \n", __FUNCTION__ ); + } + } + + ircomm_tty_check_modem_status(self); + + return 0; +} + +/* + * Function ircomm_param_poll (instance, param) + * + * Called when the peer device is polling for the line settings + * + */ +static int ircomm_param_poll(void *instance, irda_param_t *param, int get) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* Poll parameters are always of lenght 0 (just a signal) */ + if (!get) { + /* Respond with DTE line settings */ + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + return 0; +} + + + + + diff --git a/net/irda/ircomm/ircomm_ttp.c b/net/irda/ircomm/ircomm_ttp.c new file mode 100644 index 000000000000..d98bf3570d29 --- /dev/null +++ b/net/irda/ircomm/ircomm_ttp.c @@ -0,0 +1,369 @@ +/********************************************************************* + * + * Filename: ircomm_ttp.c + * Version: 1.0 + * Description: Interface between IrCOMM and IrTTP + * Status: Stable + * 
Author: Dag Brattli + * Created at: Sun Jun 6 20:48:27 1999 + * Modified at: Mon Dec 13 11:35:13 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include + +#include +#include + +static int ircomm_ttp_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static void ircomm_ttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +static void ircomm_ttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +static void ircomm_ttp_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd); +static void ircomm_ttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb); +static int ircomm_ttp_data_request(struct ircomm_cb *self, + struct sk_buff *skb, + int clen); +static int ircomm_ttp_connect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info); +static int ircomm_ttp_connect_response(struct ircomm_cb *self, + struct sk_buff *userdata); +static int ircomm_ttp_disconnect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info); + +/* + * Function ircomm_open_tsap (self) + * + * + * + */ +int ircomm_open_tsap(struct ircomm_cb *self) +{ + notify_t notify; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + /* Register callbacks */ + irda_notify_init(¬ify); + notify.data_indication = ircomm_ttp_data_indication; + notify.connect_confirm = ircomm_ttp_connect_confirm; + notify.connect_indication = ircomm_ttp_connect_indication; + notify.flow_indication = ircomm_ttp_flow_indication; + notify.disconnect_indication = ircomm_ttp_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrCOMM", sizeof(notify.name)); + + self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, + ¬ify); + if (!self->tsap) { + IRDA_DEBUG(0, "%sfailed to allocate tsap\n", __FUNCTION__ ); + return -1; + } + self->slsap_sel = self->tsap->stsap_sel; + + /* + * Initialize the call-table for issuing commands + */ + self->issue.data_request = ircomm_ttp_data_request; + self->issue.connect_request = ircomm_ttp_connect_request; + self->issue.connect_response = ircomm_ttp_connect_response; + self->issue.disconnect_request = ircomm_ttp_disconnect_request; + + return 0; +} + +/* + * Function ircomm_ttp_connect_request (self, userdata) + * + * + * + */ +static int ircomm_ttp_connect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + int ret = 0; + + 
IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irttp_connect_request(self->tsap, info->dlsap_sel, + info->saddr, info->daddr, NULL, + TTP_SAR_DISABLE, userdata); + + return ret; +} + +/* + * Function ircomm_ttp_connect_response (self, skb) + * + * + * + */ +static int ircomm_ttp_connect_response(struct ircomm_cb *self, + struct sk_buff *userdata) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irttp_connect_response(self->tsap, TTP_SAR_DISABLE, userdata); + + return ret; +} + +/* + * Function ircomm_ttp_data_request (self, userdata) + * + * Send IrCOMM data to IrTTP layer. Currently we do not try to combine + * control data with pure data, so they will be sent as separate frames. + * Should not be a big problem though, since control frames are rare. But + * some of them are sent after connection establishment, so this can + * increase the latency a bit. + */ +static int ircomm_ttp_data_request(struct ircomm_cb *self, + struct sk_buff *skb, + int clen) +{ + int ret; + + IRDA_ASSERT(skb != NULL, return -1;); + + IRDA_DEBUG(2, "%s(), clen=%d\n", __FUNCTION__ , clen); + + /* + * Insert clen field, currently we either send data only, or control + * only frames, to make things easier and avoid queueing + */ + IRDA_ASSERT(skb_headroom(skb) >= IRCOMM_HEADER_SIZE, return -1;); + + /* Don't forget to refcount it - see ircomm_tty_do_softint() */ + skb_get(skb); + + skb_push(skb, IRCOMM_HEADER_SIZE); + + skb->data[0] = clen; + + ret = irttp_data_request(self->tsap, skb); + if (ret) { + IRDA_ERROR("%s(), failed\n", __FUNCTION__); + /* irttp_data_request already free the packet */ + } + + return ret; +} + +/* + * Function ircomm_ttp_data_indication (instance, sap, skb) + * + * Incoming data + * + */ +static int ircomm_ttp_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + ircomm_do_event(self, IRCOMM_TTP_DATA_INDICATION, skb, NULL); + + /* Drop reference count - see ircomm_tty_data_indication(). */ + dev_kfree_skb(skb); + + return 0; +} + +static void ircomm_ttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, goto out;); + + if (max_sdu_size != TTP_SAR_DISABLE) { + IRDA_ERROR("%s(), SAR not allowed for IrCOMM!\n", + __FUNCTION__); + goto out; + } + + info.max_data_size = irttp_get_max_seg_size(self->tsap) + - IRCOMM_HEADER_SIZE; + info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_TTP_CONNECT_CONFIRM, skb, &info); + +out: + /* Drop reference count - see ircomm_tty_connect_confirm(). 
*/ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_ttp_connect_indication (instance, sap, qos, max_sdu_size, + * max_header_size, skb) + * + * + * + */ +static void ircomm_ttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *)instance; + struct ircomm_info info; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(qos != NULL, goto out;); + + if (max_sdu_size != TTP_SAR_DISABLE) { + IRDA_ERROR("%s(), SAR not allowed for IrCOMM!\n", + __FUNCTION__); + goto out; + } + + info.max_data_size = irttp_get_max_seg_size(self->tsap) + - IRCOMM_HEADER_SIZE; + info.max_header_size = max_header_size + IRCOMM_HEADER_SIZE; + info.qos = qos; + + ircomm_do_event(self, IRCOMM_TTP_CONNECT_INDICATION, skb, &info); + +out: + /* Drop reference count - see ircomm_tty_connect_indication(). */ + dev_kfree_skb(skb); +} + +/* + * Function ircomm_ttp_disconnect_request (self, userdata, info) + * + * + * + */ +static int ircomm_ttp_disconnect_request(struct ircomm_cb *self, + struct sk_buff *userdata, + struct ircomm_info *info) +{ + int ret; + + /* Don't forget to refcount it - should be NULL anyway */ + if(userdata) + skb_get(userdata); + + ret = irttp_disconnect_request(self->tsap, userdata, P_NORMAL); + + return ret; +} + +/* + * Function ircomm_ttp_disconnect_indication (instance, sap, reason, skb) + * + * + * + */ +static void ircomm_ttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + struct ircomm_info info; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + info.reason = reason; + + ircomm_do_event(self, IRCOMM_TTP_DISCONNECT_INDICATION, skb, &info); + + /* Drop reference count - see ircomm_tty_disconnect_indication(). */ + if(skb) + dev_kfree_skb(skb); +} + +/* + * Function ircomm_ttp_flow_indication (instance, sap, cmd) + * + * Layer below is telling us to start or stop the flow of data + * + */ +static void ircomm_ttp_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd) +{ + struct ircomm_cb *self = (struct ircomm_cb *) instance; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_MAGIC, return;); + + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, self, cmd); +} + + diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c new file mode 100644 index 000000000000..5d1e61168eb7 --- /dev/null +++ b/net/irda/ircomm/ircomm_tty.c @@ -0,0 +1,1405 @@ +/********************************************************************* + * + * Filename: ircomm_tty.c + * Version: 1.0 + * Description: IrCOMM serial TTY driver + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Jun 6 21:00:56 1999 + * Modified at: Wed Feb 23 00:09:02 2000 + * Modified by: Dag Brattli + * Sources: serial.c and previous IrCOMM work by Takahide Higuchi + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. 
+ * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for MODULE_ALIAS_CHARDEV_MAJOR */ + +#include + +#include +#include + +#include +#include +#include +#include + +static int ircomm_tty_open(struct tty_struct *tty, struct file *filp); +static void ircomm_tty_close(struct tty_struct * tty, struct file *filp); +static int ircomm_tty_write(struct tty_struct * tty, + const unsigned char *buf, int count); +static int ircomm_tty_write_room(struct tty_struct *tty); +static void ircomm_tty_throttle(struct tty_struct *tty); +static void ircomm_tty_unthrottle(struct tty_struct *tty); +static int ircomm_tty_chars_in_buffer(struct tty_struct *tty); +static void ircomm_tty_flush_buffer(struct tty_struct *tty); +static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch); +static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout); +static void ircomm_tty_hangup(struct tty_struct *tty); +static void ircomm_tty_do_softint(void *private_); +static void ircomm_tty_shutdown(struct ircomm_tty_cb *self); +static void ircomm_tty_stop(struct tty_struct *tty); + +static int ircomm_tty_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static int ircomm_tty_control_indication(void *instance, void *sap, + struct sk_buff *skb); +static void ircomm_tty_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd); +#ifdef CONFIG_PROC_FS +static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len, + int *eof, void *unused); +#endif /* CONFIG_PROC_FS */ +static struct tty_driver *driver; + +hashbin_t *ircomm_tty = NULL; + +static struct tty_operations ops = { + .open = ircomm_tty_open, + .close = ircomm_tty_close, + .write = ircomm_tty_write, + .write_room = ircomm_tty_write_room, + .chars_in_buffer = ircomm_tty_chars_in_buffer, + .flush_buffer = ircomm_tty_flush_buffer, + .ioctl = ircomm_tty_ioctl, /* ircomm_tty_ioctl.c */ + .tiocmget = ircomm_tty_tiocmget, /* ircomm_tty_ioctl.c */ + .tiocmset = ircomm_tty_tiocmset, /* ircomm_tty_ioctl.c */ + .throttle = ircomm_tty_throttle, + .unthrottle = ircomm_tty_unthrottle, + .send_xchar = ircomm_tty_send_xchar, + .set_termios = ircomm_tty_set_termios, + .stop = ircomm_tty_stop, + .start = ircomm_tty_start, + .hangup = ircomm_tty_hangup, + .wait_until_sent = ircomm_tty_wait_until_sent, +#ifdef CONFIG_PROC_FS + .read_proc = ircomm_tty_read_proc, +#endif /* CONFIG_PROC_FS */ +}; + +/* + * Function ircomm_tty_init() + * + * Init IrCOMM TTY layer/driver + * + */ +static int __init ircomm_tty_init(void) +{ + driver = alloc_tty_driver(IRCOMM_TTY_PORTS); + if (!driver) + return -ENOMEM; + ircomm_tty = hashbin_new(HB_LOCK); + if (ircomm_tty == NULL) 
{ + IRDA_ERROR("%s(), can't allocate hashbin!\n", __FUNCTION__); + put_tty_driver(driver); + return -ENOMEM; + } + + driver->owner = THIS_MODULE; + driver->driver_name = "ircomm"; + driver->name = "ircomm"; + driver->devfs_name = "ircomm"; + driver->major = IRCOMM_TTY_MAJOR; + driver->minor_start = IRCOMM_TTY_MINOR; + driver->type = TTY_DRIVER_TYPE_SERIAL; + driver->subtype = SERIAL_TYPE_NORMAL; + driver->init_termios = tty_std_termios; + driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL; + driver->flags = TTY_DRIVER_REAL_RAW; + tty_set_operations(driver, &ops); + if (tty_register_driver(driver)) { + IRDA_ERROR("%s(): Couldn't register serial driver\n", + __FUNCTION__); + put_tty_driver(driver); + return -1; + } + return 0; +} + +static void __exit __ircomm_tty_cleanup(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_tty_shutdown(self); + + self->magic = 0; + kfree(self); +} + +/* + * Function ircomm_tty_cleanup () + * + * Remove IrCOMM TTY layer/driver + * + */ +static void __exit ircomm_tty_cleanup(void) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + ret = tty_unregister_driver(driver); + if (ret) { + IRDA_ERROR("%s(), failed to unregister driver\n", + __FUNCTION__); + return; + } + + hashbin_delete(ircomm_tty, (FREE_FUNC) __ircomm_tty_cleanup); + put_tty_driver(driver); +} + +/* + * Function ircomm_startup (self) + * + * + * + */ +static int ircomm_tty_startup(struct ircomm_tty_cb *self) +{ + notify_t notify; + int ret = -ENODEV; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* Check if already open */ + if (test_and_set_bit(ASYNC_B_INITIALIZED, &self->flags)) { + IRDA_DEBUG(2, "%s(), already open so break out!\n", __FUNCTION__ ); + return 0; + } + + /* Register with IrCOMM */ + irda_notify_init(¬ify); + /* These callbacks we must handle ourselves */ + notify.data_indication = ircomm_tty_data_indication; + notify.udata_indication = ircomm_tty_control_indication; + notify.flow_indication = ircomm_tty_flow_indication; + + /* Use the ircomm_tty interface for these ones */ + notify.disconnect_indication = ircomm_tty_disconnect_indication; + notify.connect_confirm = ircomm_tty_connect_confirm; + notify.connect_indication = ircomm_tty_connect_indication; + strlcpy(notify.name, "ircomm_tty", sizeof(notify.name)); + notify.instance = self; + + if (!self->ircomm) { + self->ircomm = ircomm_open(¬ify, self->service_type, + self->line); + } + if (!self->ircomm) + goto err; + + self->slsap_sel = self->ircomm->slsap_sel; + + /* Connect IrCOMM link with remote device */ + ret = ircomm_tty_attach_cable(self); + if (ret < 0) { + IRDA_ERROR("%s(), error attaching cable!\n", __FUNCTION__); + goto err; + } + + return 0; +err: + clear_bit(ASYNC_B_INITIALIZED, &self->flags); + return ret; +} + +/* + * Function ircomm_block_til_ready (self, filp) + * + * + * + */ +static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self, + struct file *filp) +{ + DECLARE_WAITQUEUE(wait, current); + int retval; + int do_clocal = 0, extra_count = 0; + unsigned long flags; + struct tty_struct *tty; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + tty = self->tty; + + /* + * If non-blocking mode is set, or the port is not enabled, + * then make the check up front and then exit. 
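+ * Otherwise we sleep on open_wait until the IrCOMM link reaches
+ * IRCOMM_TTY_READY and carrier (or CLOCAL) is seen, and we return
+ * -ERESTARTSYS or -EAGAIN if a signal arrives or the port gets hung
+ * up while we are waiting.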
+ */ + if (filp->f_flags & O_NONBLOCK || tty->flags & (1 << TTY_IO_ERROR)){ + /* nonblock mode is set or port is not enabled */ + self->flags |= ASYNC_NORMAL_ACTIVE; + IRDA_DEBUG(1, "%s(), O_NONBLOCK requested!\n", __FUNCTION__ ); + return 0; + } + + if (tty->termios->c_cflag & CLOCAL) { + IRDA_DEBUG(1, "%s(), doing CLOCAL!\n", __FUNCTION__ ); + do_clocal = 1; + } + + /* Wait for carrier detect and the line to become + * free (i.e., not in use by the callout). While we are in + * this loop, self->open_count is dropped by one, so that + * mgsl_close() knows when to free things. We restore it upon + * exit, either normal or abnormal. + */ + + retval = 0; + add_wait_queue(&self->open_wait, &wait); + + IRDA_DEBUG(2, "%s(%d):block_til_ready before block on %s open_count=%d\n", + __FILE__,__LINE__, tty->driver->name, self->open_count ); + + /* As far as I can see, we protect open_count - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + if (!tty_hung_up_p(filp)) { + extra_count = 1; + self->open_count--; + } + spin_unlock_irqrestore(&self->spinlock, flags); + self->blocked_open++; + + while (1) { + if (tty->termios->c_cflag & CBAUD) { + /* Here, we use to lock those two guys, but + * as ircomm_param_request() does it itself, + * I don't see the point (and I see the deadlock). + * Jean II */ + self->settings.dte |= IRCOMM_RTS + IRCOMM_DTR; + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + current->state = TASK_INTERRUPTIBLE; + + if (tty_hung_up_p(filp) || + !test_bit(ASYNC_B_INITIALIZED, &self->flags)) { + retval = (self->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS; + break; + } + + /* + * Check if link is ready now. Even if CLOCAL is + * specified, we cannot return before the IrCOMM link is + * ready + */ + if (!test_bit(ASYNC_B_CLOSING, &self->flags) && + (do_clocal || (self->settings.dce & IRCOMM_CD)) && + self->state == IRCOMM_TTY_READY) + { + break; + } + + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + + IRDA_DEBUG(1, "%s(%d):block_til_ready blocking on %s open_count=%d\n", + __FILE__,__LINE__, tty->driver->name, self->open_count ); + + schedule(); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&self->open_wait, &wait); + + if (extra_count) { + /* ++ is not atomic, so this should be protected - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + self->open_count++; + spin_unlock_irqrestore(&self->spinlock, flags); + } + self->blocked_open--; + + IRDA_DEBUG(1, "%s(%d):block_til_ready after blocking on %s open_count=%d\n", + __FILE__,__LINE__, tty->driver->name, self->open_count); + + if (!retval) + self->flags |= ASYNC_NORMAL_ACTIVE; + + return retval; +} + +/* + * Function ircomm_tty_open (tty, filp) + * + * This routine is called when a particular tty device is opened. This + * routine is mandatory; if this routine is not filled in, the attempted + * open will fail with ENODEV. 
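+ *
+ * On the first open of a given line we allocate a new ircomm_tty_cb,
+ * insert it into the ircomm_tty hashbin and force the termios into raw
+ * mode; every open then bumps open_count and may block in
+ * ircomm_tty_block_til_ready() until the IrCOMM link is up (unless
+ * O_NONBLOCK was requested).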
+ */ +static int ircomm_tty_open(struct tty_struct *tty, struct file *filp) +{ + struct ircomm_tty_cb *self; + unsigned int line; + unsigned long flags; + int ret; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + line = tty->index; + if ((line < 0) || (line >= IRCOMM_TTY_PORTS)) { + return -ENODEV; + } + + /* Check if instance already exists */ + self = hashbin_lock_find(ircomm_tty, line, NULL); + if (!self) { + /* No, so make new instance */ + self = kmalloc(sizeof(struct ircomm_tty_cb), GFP_KERNEL); + if (self == NULL) { + IRDA_ERROR("%s(), kmalloc failed!\n", __FUNCTION__); + return -ENOMEM; + } + memset(self, 0, sizeof(struct ircomm_tty_cb)); + + self->magic = IRCOMM_TTY_MAGIC; + self->flow = FLOW_STOP; + + self->line = line; + INIT_WORK(&self->tqueue, ircomm_tty_do_softint, self); + self->max_header_size = IRCOMM_TTY_HDR_UNINITIALISED; + self->max_data_size = IRCOMM_TTY_DATA_UNINITIALISED; + self->close_delay = 5*HZ/10; + self->closing_wait = 30*HZ; + + /* Init some important stuff */ + init_timer(&self->watchdog_timer); + init_waitqueue_head(&self->open_wait); + init_waitqueue_head(&self->close_wait); + spin_lock_init(&self->spinlock); + + /* + * Force TTY into raw mode by default which is usually what + * we want for IrCOMM and IrLPT. This way applications will + * not have to twiddle with printcap etc. + */ + tty->termios->c_iflag = 0; + tty->termios->c_oflag = 0; + + /* Insert into hash */ + hashbin_insert(ircomm_tty, (irda_queue_t *) self, line, NULL); + } + /* ++ is not atomic, so this should be protected - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + self->open_count++; + + tty->driver_data = self; + self->tty = tty; + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(1, "%s(), %s%d, count = %d\n", __FUNCTION__ , tty->driver->name, + self->line, self->open_count); + + /* Not really used by us, but lets do it anyway */ + self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 1 : 0; + + /* + * If the port is the middle of closing, bail out now + */ + if (tty_hung_up_p(filp) || + test_bit(ASYNC_B_CLOSING, &self->flags)) { + + /* Hm, why are we blocking on ASYNC_CLOSING if we + * do return -EAGAIN/-ERESTARTSYS below anyway? + * IMHO it's either not needed in the first place + * or for some reason we need to make sure the async + * closing has been finished - if so, wouldn't we + * probably better sleep uninterruptible? + */ + + if (wait_event_interruptible(self->close_wait, !test_bit(ASYNC_B_CLOSING, &self->flags))) { + IRDA_WARNING("%s - got signal while blocking on ASYNC_CLOSING!\n", + __FUNCTION__); + return -ERESTARTSYS; + } + +#ifdef SERIAL_DO_RESTART + return ((self->flags & ASYNC_HUP_NOTIFY) ? 
+ -EAGAIN : -ERESTARTSYS); +#else + return -EAGAIN; +#endif + } + + /* Check if this is a "normal" ircomm device, or an irlpt device */ + if (line < 0x10) { + self->service_type = IRCOMM_3_WIRE | IRCOMM_9_WIRE; + self->settings.service_type = IRCOMM_9_WIRE; /* 9 wire as default */ + /* Jan Kiszka -> add DSR/RI -> Conform to IrCOMM spec */ + self->settings.dce = IRCOMM_CTS | IRCOMM_CD | IRCOMM_DSR | IRCOMM_RI; /* Default line settings */ + IRDA_DEBUG(2, "%s(), IrCOMM device\n", __FUNCTION__ ); + } else { + IRDA_DEBUG(2, "%s(), IrLPT device\n", __FUNCTION__ ); + self->service_type = IRCOMM_3_WIRE_RAW; + self->settings.service_type = IRCOMM_3_WIRE_RAW; /* Default */ + } + + ret = ircomm_tty_startup(self); + if (ret) + return ret; + + ret = ircomm_tty_block_til_ready(self, filp); + if (ret) { + IRDA_DEBUG(2, + "%s(), returning after block_til_ready with %d\n", __FUNCTION__ , + ret); + + return ret; + } + return 0; +} + +/* + * Function ircomm_tty_close (tty, filp) + * + * This routine is called when a particular tty device is closed. + * + */ +static void ircomm_tty_close(struct tty_struct *tty, struct file *filp) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + if (!tty) + return; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + spin_lock_irqsave(&self->spinlock, flags); + + if (tty_hung_up_p(filp)) { + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(0, "%s(), returning 1\n", __FUNCTION__ ); + return; + } + + if ((tty->count == 1) && (self->open_count != 1)) { + /* + * Uh, oh. tty->count is 1, which means that the tty + * structure will be freed. state->count should always + * be one in these conditions. If it's greater than + * one, we've got real problems, since it means the + * serial port won't be shutdown. + */ + IRDA_DEBUG(0, "%s(), bad serial port count; " + "tty->count is 1, state->count is %d\n", __FUNCTION__ , + self->open_count); + self->open_count = 1; + } + + if (--self->open_count < 0) { + IRDA_ERROR("%s(), bad serial port count for ttys%d: %d\n", + __FUNCTION__, self->line, self->open_count); + self->open_count = 0; + } + if (self->open_count) { + spin_unlock_irqrestore(&self->spinlock, flags); + + IRDA_DEBUG(0, "%s(), open count > 0\n", __FUNCTION__ ); + return; + } + + /* Hum... Should be test_and_set_bit ??? - Jean II */ + set_bit(ASYNC_B_CLOSING, &self->flags); + + /* We need to unlock here (we were unlocking at the end of this + * function), because tty_wait_until_sent() may schedule. + * I don't know if the rest should be protected somehow, + * so someone should check. - Jean II */ + spin_unlock_irqrestore(&self->spinlock, flags); + + /* + * Now we wait for the transmit buffer to clear; and we notify + * the line discipline to only process XON/XOFF characters. 
+ */ + tty->closing = 1; + if (self->closing_wait != ASYNC_CLOSING_WAIT_NONE) + tty_wait_until_sent(tty, self->closing_wait); + + ircomm_tty_shutdown(self); + + if (tty->driver->flush_buffer) + tty->driver->flush_buffer(tty); + if (tty->ldisc.flush_buffer) + tty->ldisc.flush_buffer(tty); + + tty->closing = 0; + self->tty = NULL; + + if (self->blocked_open) { + if (self->close_delay) { + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(self->close_delay); + } + wake_up_interruptible(&self->open_wait); + } + + self->flags &= ~(ASYNC_NORMAL_ACTIVE|ASYNC_CLOSING); + wake_up_interruptible(&self->close_wait); +} + +/* + * Function ircomm_tty_flush_buffer (tty) + * + * + * + */ +static void ircomm_tty_flush_buffer(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* + * Let do_softint() do this to avoid race condition with + * do_softint() ;-) + */ + schedule_work(&self->tqueue); +} + +/* + * Function ircomm_tty_do_softint (private_) + * + * We use this routine to give the write wakeup to the user at at a + * safe time (as fast as possible after write have completed). This + * can be compared to the Tx interrupt. + */ +static void ircomm_tty_do_softint(void *private_) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) private_; + struct tty_struct *tty; + unsigned long flags; + struct sk_buff *skb, *ctrl_skb; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (!self || self->magic != IRCOMM_TTY_MAGIC) + return; + + tty = self->tty; + if (!tty) + return; + + /* Unlink control buffer */ + spin_lock_irqsave(&self->spinlock, flags); + + ctrl_skb = self->ctrl_skb; + self->ctrl_skb = NULL; + + spin_unlock_irqrestore(&self->spinlock, flags); + + /* Flush control buffer if any */ + if(ctrl_skb) { + if(self->flow == FLOW_START) + ircomm_control_request(self->ircomm, ctrl_skb); + /* Drop reference count - see ircomm_ttp_data_request(). */ + dev_kfree_skb(ctrl_skb); + } + + if (tty->hw_stopped) + return; + + /* Unlink transmit buffer */ + spin_lock_irqsave(&self->spinlock, flags); + + skb = self->tx_skb; + self->tx_skb = NULL; + + spin_unlock_irqrestore(&self->spinlock, flags); + + /* Flush transmit buffer if any */ + if (skb) { + ircomm_tty_do_event(self, IRCOMM_TTY_DATA_REQUEST, skb, NULL); + /* Drop reference count - see ircomm_ttp_data_request(). */ + dev_kfree_skb(skb); + } + + /* Check if user (still) wants to be waken up */ + if ((tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && + tty->ldisc.write_wakeup) + { + (tty->ldisc.write_wakeup)(tty); + } + wake_up_interruptible(&tty->write_wait); +} + +/* + * Function ircomm_tty_write (tty, buf, count) + * + * This routine is called by the kernel to write a series of characters + * to the tty device. The characters may come from user space or kernel + * space. This routine will return the number of characters actually + * accepted for writing. This routine is mandatory. 
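+ *
+ * A short return just means the current transmit frame filled up, so a
+ * caller is expected to retry with the remainder. A minimal caller
+ * sketch (hypothetical, not part of this driver):
+ *
+ *	while (count > 0) {
+ *		int n = ircomm_tty_write(tty, buf, count);
+ *		if (n < 0)
+ *			break;		(n may be -ENOBUFS)
+ *		buf += n;
+ *		count -= n;
+ *	}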
+ */ +static int ircomm_tty_write(struct tty_struct *tty, + const unsigned char *buf, int count) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + struct sk_buff *skb; + int tailroom = 0; + int len = 0; + int size; + + IRDA_DEBUG(2, "%s(), count=%d, hw_stopped=%d\n", __FUNCTION__ , count, + tty->hw_stopped); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* We may receive packets from the TTY even before we have finished + * our setup. Not cool. + * The problem is that we don't know the final header and data size + * to create the proper skb, so any skb we would create would have + * bogus header and data size, so need care. + * We use a bogus header size to safely detect this condition. + * Another problem is that hw_stopped was set to 0 way before it + * should be, so we would drop this skb. It should now be fixed. + * One option is to not accept data until we are properly setup. + * But, I suspect that when it happens, the ppp line discipline + * just "drops" the data, which might screw up connect scripts. + * The second option is to create a "safe skb", with large header + * and small size (see ircomm_tty_open() for values). + * We just need to make sure that when the real values get filled, + * we don't mess up the original "safe skb" (see tx_data_size). + * Jean II */ + if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED) { + IRDA_DEBUG(1, "%s() : not initialised\n", __FUNCTION__); +#ifdef IRCOMM_NO_TX_BEFORE_INIT + /* We didn't consume anything, TTY will retry */ + return 0; +#endif + } + + if (count < 1) + return 0; + + /* Protect our manipulation of self->tx_skb and related */ + spin_lock_irqsave(&self->spinlock, flags); + + /* Fetch current transmit buffer */ + skb = self->tx_skb; + + /* + * Send out all the data we get, possibly as multiple fragmented + * frames, but this will only happen if the data is larger than the + * max data size. The normal case however is just the opposite, and + * this function may be called multiple times, and will then actually + * defragment the data and send it out as one packet as soon as + * possible, but at a safer point in time + */ + while (count) { + size = count; + + /* Adjust data size to the max data size */ + if (size > self->max_data_size) + size = self->max_data_size; + + /* + * Do we already have a buffer ready for transmit, or do + * we need to allocate a new frame + */ + if (skb) { + /* + * Any room for more data at the end of the current + * transmit buffer? Cannot use skb_tailroom, since + * dev_alloc_skb gives us a larger skb than we + * requested + * Note : use tx_data_size, because max_data_size + * may have changed and we don't want to overwrite + * the skb. 
- Jean II + */ + if ((tailroom = (self->tx_data_size - skb->len)) > 0) { + /* Adjust data to tailroom */ + if (size > tailroom) + size = tailroom; + } else { + /* + * Current transmit frame is full, so break + * out, so we can send it as soon as possible + */ + break; + } + } else { + /* Prepare a full sized frame */ + skb = dev_alloc_skb(self->max_data_size+ + self->max_header_size); + if (!skb) { + spin_unlock_irqrestore(&self->spinlock, flags); + return -ENOBUFS; + } + skb_reserve(skb, self->max_header_size); + self->tx_skb = skb; + /* Remember skb size because max_data_size may + * change later on - Jean II */ + self->tx_data_size = self->max_data_size; + } + + /* Copy data */ + memcpy(skb_put(skb,size), buf + len, size); + + count -= size; + len += size; + } + + spin_unlock_irqrestore(&self->spinlock, flags); + + /* + * Schedule a new thread which will transmit the frame as soon + * as possible, but at a safe point in time. We do this so the + * "user" can give us data multiple times, as PPP does (because of + * its 256 byte tx buffer). We will then defragment and send out + * all this data as one single packet. + */ + schedule_work(&self->tqueue); + + return len; +} + +/* + * Function ircomm_tty_write_room (tty) + * + * This routine returns the numbers of characters the tty driver will + * accept for queuing to be written. This number is subject to change as + * output buffers get emptied, or if the output flow control is acted. + */ +static int ircomm_tty_write_room(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + +#ifdef IRCOMM_NO_TX_BEFORE_INIT + /* max_header_size tells us if the channel is initialised or not. */ + if (self->max_header_size == IRCOMM_TTY_HDR_UNINITIALISED) + /* Don't bother us yet */ + return 0; +#endif + + /* Check if we are allowed to transmit any data. + * hw_stopped is the regular flow control. + * Jean II */ + if (tty->hw_stopped) + ret = 0; + else { + spin_lock_irqsave(&self->spinlock, flags); + if (self->tx_skb) + ret = self->tx_data_size - self->tx_skb->len; + else + ret = self->max_data_size; + spin_unlock_irqrestore(&self->spinlock, flags); + } + IRDA_DEBUG(2, "%s(), ret=%d\n", __FUNCTION__ , ret); + + return ret; +} + +/* + * Function ircomm_tty_wait_until_sent (tty, timeout) + * + * This routine waits until the device has written out all of the + * characters in its transmitter FIFO. 
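+ * Here that just means polling self->tx_skb about every 200 ms until it
+ * is empty, the given timeout expires, or a signal is pending.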
+ */ +static void ircomm_tty_wait_until_sent(struct tty_struct *tty, int timeout) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long orig_jiffies, poll_time; + unsigned long flags; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + orig_jiffies = jiffies; + + /* Set poll time to 200 ms */ + poll_time = IRDA_MIN(timeout, msecs_to_jiffies(200)); + + spin_lock_irqsave(&self->spinlock, flags); + while (self->tx_skb && self->tx_skb->len) { + spin_unlock_irqrestore(&self->spinlock, flags); + current->state = TASK_INTERRUPTIBLE; + schedule_timeout(poll_time); + spin_lock_irqsave(&self->spinlock, flags); + if (signal_pending(current)) + break; + if (timeout && time_after(jiffies, orig_jiffies + timeout)) + break; + } + spin_unlock_irqrestore(&self->spinlock, flags); + current->state = TASK_RUNNING; +} + +/* + * Function ircomm_tty_throttle (tty) + * + * This routine notifies the tty driver that input buffers for the line + * discipline are close to full, and it should somehow signal that no + * more characters should be sent to the tty. + */ +static void ircomm_tty_throttle(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* Software flow control? */ + if (I_IXOFF(tty)) + ircomm_tty_send_xchar(tty, STOP_CHAR(tty)); + + /* Hardware flow control? */ + if (tty->termios->c_cflag & CRTSCTS) { + self->settings.dte &= ~IRCOMM_RTS; + self->settings.dte |= IRCOMM_DELTA_RTS; + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + ircomm_flow_request(self->ircomm, FLOW_STOP); +} + +/* + * Function ircomm_tty_unthrottle (tty) + * + * This routine notifies the tty drivers that it should signals that + * characters can now be sent to the tty without fear of overrunning the + * input buffers of the line disciplines. + */ +static void ircomm_tty_unthrottle(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* Using software flow control? */ + if (I_IXOFF(tty)) { + ircomm_tty_send_xchar(tty, START_CHAR(tty)); + } + + /* Using hardware flow control? 
*/ + if (tty->termios->c_cflag & CRTSCTS) { + self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS); + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + IRDA_DEBUG(1, "%s(), FLOW_START\n", __FUNCTION__ ); + } + ircomm_flow_request(self->ircomm, FLOW_START); +} + +/* + * Function ircomm_tty_chars_in_buffer (tty) + * + * Indicates if there are any data in the buffer + * + */ +static int ircomm_tty_chars_in_buffer(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + int len = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + spin_lock_irqsave(&self->spinlock, flags); + + if (self->tx_skb) + len = self->tx_skb->len; + + spin_unlock_irqrestore(&self->spinlock, flags); + + return len; +} + +static void ircomm_tty_shutdown(struct ircomm_tty_cb *self) +{ + unsigned long flags; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + if (!test_and_clear_bit(ASYNC_B_INITIALIZED, &self->flags)) + return; + + ircomm_tty_detach_cable(self); + + spin_lock_irqsave(&self->spinlock, flags); + + del_timer(&self->watchdog_timer); + + /* Free parameter buffer */ + if (self->ctrl_skb) { + dev_kfree_skb(self->ctrl_skb); + self->ctrl_skb = NULL; + } + + /* Free transmit buffer */ + if (self->tx_skb) { + dev_kfree_skb(self->tx_skb); + self->tx_skb = NULL; + } + + if (self->ircomm) { + ircomm_close(self->ircomm); + self->ircomm = NULL; + } + + spin_unlock_irqrestore(&self->spinlock, flags); +} + +/* + * Function ircomm_tty_hangup (tty) + * + * This routine notifies the tty driver that it should hangup the tty + * device. + * + */ +static void ircomm_tty_hangup(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned long flags; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + if (!tty) + return; + + /* ircomm_tty_flush_buffer(tty); */ + ircomm_tty_shutdown(self); + + /* I guess we need to lock here - Jean II */ + spin_lock_irqsave(&self->spinlock, flags); + self->flags &= ~ASYNC_NORMAL_ACTIVE; + self->tty = NULL; + self->open_count = 0; + spin_unlock_irqrestore(&self->spinlock, flags); + + wake_up_interruptible(&self->open_wait); +} + +/* + * Function ircomm_tty_send_xchar (tty, ch) + * + * This routine is used to send a high-priority XON/XOFF character to + * the device. + */ +static void ircomm_tty_send_xchar(struct tty_struct *tty, char ch) +{ + IRDA_DEBUG(0, "%s(), not impl\n", __FUNCTION__ ); +} + +/* + * Function ircomm_tty_start (tty) + * + * This routine notifies the tty driver that it resume sending + * characters to the tty device. + */ +void ircomm_tty_start(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + ircomm_flow_request(self->ircomm, FLOW_START); +} + +/* + * Function ircomm_tty_stop (tty) + * + * This routine notifies the tty driver that it should stop outputting + * characters to the tty device. 
+ */ +static void ircomm_tty_stop(struct tty_struct *tty) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_flow_request(self->ircomm, FLOW_STOP); +} + +/* + * Function ircomm_check_modem_status (self) + * + * Check for any changes in the DCE's line settings. This function should + * be called whenever the dce parameter settings changes, to update the + * flow control settings and other things + */ +void ircomm_tty_check_modem_status(struct ircomm_tty_cb *self) +{ + struct tty_struct *tty; + int status; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + tty = self->tty; + + status = self->settings.dce; + + if (status & IRCOMM_DCE_DELTA_ANY) { + /*wake_up_interruptible(&self->delta_msr_wait);*/ + } + if ((self->flags & ASYNC_CHECK_CD) && (status & IRCOMM_DELTA_CD)) { + IRDA_DEBUG(2, + "%s(), ircomm%d CD now %s...\n", __FUNCTION__ , self->line, + (status & IRCOMM_CD) ? "on" : "off"); + + if (status & IRCOMM_CD) { + wake_up_interruptible(&self->open_wait); + } else { + IRDA_DEBUG(2, + "%s(), Doing serial hangup..\n", __FUNCTION__ ); + if (tty) + tty_hangup(tty); + + /* Hangup will remote the tty, so better break out */ + return; + } + } + if (self->flags & ASYNC_CTS_FLOW) { + if (tty->hw_stopped) { + if (status & IRCOMM_CTS) { + IRDA_DEBUG(2, + "%s(), CTS tx start...\n", __FUNCTION__ ); + tty->hw_stopped = 0; + + /* Wake up processes blocked on open */ + wake_up_interruptible(&self->open_wait); + + schedule_work(&self->tqueue); + return; + } + } else { + if (!(status & IRCOMM_CTS)) { + IRDA_DEBUG(2, + "%s(), CTS tx stop...\n", __FUNCTION__ ); + tty->hw_stopped = 1; + } + } + } +} + +/* + * Function ircomm_tty_data_indication (instance, sap, skb) + * + * Handle incoming data, and deliver it to the line discipline + * + */ +static int ircomm_tty_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + if (!self->tty) { + IRDA_DEBUG(0, "%s(), no tty!\n", __FUNCTION__ ); + return 0; + } + + /* + * If we receive data when hardware is stopped then something is wrong. + * We try to poll the peers line settings to check if we are up todate. + * Devices like WinCE can do this, and since they don't send any + * params, we can just as well declare the hardware for running. + */ + if (self->tty->hw_stopped && (self->flow == FLOW_START)) { + IRDA_DEBUG(0, "%s(), polling for line settings!\n", __FUNCTION__ ); + ircomm_param_request(self, IRCOMM_POLL, TRUE); + + /* We can just as well declare the hardware for running */ + ircomm_tty_send_initial_parameters(self); + ircomm_tty_link_established(self); + } + + /* + * Just give it over to the line discipline. There is no need to + * involve the flip buffers, since we are not running in an interrupt + * handler + */ + self->tty->ldisc.receive_buf(self->tty, skb->data, NULL, skb->len); + + /* No need to kfree_skb - see ircomm_ttp_data_indication() */ + + return 0; +} + +/* + * Function ircomm_tty_control_indication (instance, sap, skb) + * + * Parse all incoming parameters (easy!) 
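+ * A control frame carries a one-byte parameter length followed by the
+ * parameters in the usual IrDA PI/PL/PV encoding, which
+ * irda_param_extract_all() walks for us. Expected layout (a sketch,
+ * values illustrative only):
+ *
+ *	data[0]    clen - number of parameter octets that follow
+ *	data[1]    PI   - parameter identifier (e.g. IRCOMM_DTE)
+ *	data[2]    PL   - length of the value field
+ *	data[3..]  PV   - PL octets of parameter value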
+ * + */ +static int ircomm_tty_control_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + int clen; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + clen = skb->data[0]; + + irda_param_extract_all(self, skb->data+1, IRDA_MIN(skb->len-1, clen), + &ircomm_param_info); + + /* No need to kfree_skb - see ircomm_control_indication() */ + + return 0; +} + +/* + * Function ircomm_tty_flow_indication (instance, sap, cmd) + * + * This function is called by IrTTP when it wants us to slow down the + * transmission of data. We just mark the hardware as stopped, and wait + * for IrTTP to notify us that things are OK again. + */ +static void ircomm_tty_flow_indication(void *instance, void *sap, + LOCAL_FLOW cmd) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + struct tty_struct *tty; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + tty = self->tty; + + switch (cmd) { + case FLOW_START: + IRDA_DEBUG(2, "%s(), hw start!\n", __FUNCTION__ ); + tty->hw_stopped = 0; + + /* ircomm_tty_do_softint will take care of the rest */ + schedule_work(&self->tqueue); + break; + default: /* If we get here, something is very wrong, better stop */ + case FLOW_STOP: + IRDA_DEBUG(2, "%s(), hw stopped!\n", __FUNCTION__ ); + tty->hw_stopped = 1; + break; + } + self->flow = cmd; +} + +static int ircomm_tty_line_info(struct ircomm_tty_cb *self, char *buf) +{ + int ret=0; + + ret += sprintf(buf+ret, "State: %s\n", ircomm_tty_state[self->state]); + + ret += sprintf(buf+ret, "Service type: "); + if (self->service_type & IRCOMM_9_WIRE) + ret += sprintf(buf+ret, "9_WIRE"); + else if (self->service_type & IRCOMM_3_WIRE) + ret += sprintf(buf+ret, "3_WIRE"); + else if (self->service_type & IRCOMM_3_WIRE_RAW) + ret += sprintf(buf+ret, "3_WIRE_RAW"); + else + ret += sprintf(buf+ret, "No common service type!\n"); + ret += sprintf(buf+ret, "\n"); + + ret += sprintf(buf+ret, "Port name: %s\n", self->settings.port_name); + + ret += sprintf(buf+ret, "DTE status: "); + if (self->settings.dte & IRCOMM_RTS) + ret += sprintf(buf+ret, "RTS|"); + if (self->settings.dte & IRCOMM_DTR) + ret += sprintf(buf+ret, "DTR|"); + if (self->settings.dte) + ret--; /* remove the last | */ + ret += sprintf(buf+ret, "\n"); + + ret += sprintf(buf+ret, "DCE status: "); + if (self->settings.dce & IRCOMM_CTS) + ret += sprintf(buf+ret, "CTS|"); + if (self->settings.dce & IRCOMM_DSR) + ret += sprintf(buf+ret, "DSR|"); + if (self->settings.dce & IRCOMM_CD) + ret += sprintf(buf+ret, "CD|"); + if (self->settings.dce & IRCOMM_RI) + ret += sprintf(buf+ret, "RI|"); + if (self->settings.dce) + ret--; /* remove the last | */ + ret += sprintf(buf+ret, "\n"); + + ret += sprintf(buf+ret, "Configuration: "); + if (!self->settings.null_modem) + ret += sprintf(buf+ret, "DTE <-> DCE\n"); + else + ret += sprintf(buf+ret, + "DTE <-> DTE (null modem emulation)\n"); + + ret += sprintf(buf+ret, "Data rate: %d\n", self->settings.data_rate); + + ret += sprintf(buf+ret, "Flow control: "); + if (self->settings.flow_control & IRCOMM_XON_XOFF_IN) + ret += sprintf(buf+ret, "XON_XOFF_IN|"); + if (self->settings.flow_control & IRCOMM_XON_XOFF_OUT) + ret += sprintf(buf+ret, "XON_XOFF_OUT|"); + if (self->settings.flow_control & IRCOMM_RTS_CTS_IN) + ret += sprintf(buf+ret, "RTS_CTS_IN|"); + if 
(self->settings.flow_control & IRCOMM_RTS_CTS_OUT) + ret += sprintf(buf+ret, "RTS_CTS_OUT|"); + if (self->settings.flow_control & IRCOMM_DSR_DTR_IN) + ret += sprintf(buf+ret, "DSR_DTR_IN|"); + if (self->settings.flow_control & IRCOMM_DSR_DTR_OUT) + ret += sprintf(buf+ret, "DSR_DTR_OUT|"); + if (self->settings.flow_control & IRCOMM_ENQ_ACK_IN) + ret += sprintf(buf+ret, "ENQ_ACK_IN|"); + if (self->settings.flow_control & IRCOMM_ENQ_ACK_OUT) + ret += sprintf(buf+ret, "ENQ_ACK_OUT|"); + if (self->settings.flow_control) + ret--; /* remove the last | */ + ret += sprintf(buf+ret, "\n"); + + ret += sprintf(buf+ret, "Flags: "); + if (self->flags & ASYNC_CTS_FLOW) + ret += sprintf(buf+ret, "ASYNC_CTS_FLOW|"); + if (self->flags & ASYNC_CHECK_CD) + ret += sprintf(buf+ret, "ASYNC_CHECK_CD|"); + if (self->flags & ASYNC_INITIALIZED) + ret += sprintf(buf+ret, "ASYNC_INITIALIZED|"); + if (self->flags & ASYNC_LOW_LATENCY) + ret += sprintf(buf+ret, "ASYNC_LOW_LATENCY|"); + if (self->flags & ASYNC_CLOSING) + ret += sprintf(buf+ret, "ASYNC_CLOSING|"); + if (self->flags & ASYNC_NORMAL_ACTIVE) + ret += sprintf(buf+ret, "ASYNC_NORMAL_ACTIVE|"); + if (self->flags) + ret--; /* remove the last | */ + ret += sprintf(buf+ret, "\n"); + + ret += sprintf(buf+ret, "Role: %s\n", self->client ? + "client" : "server"); + ret += sprintf(buf+ret, "Open count: %d\n", self->open_count); + ret += sprintf(buf+ret, "Max data size: %d\n", self->max_data_size); + ret += sprintf(buf+ret, "Max header size: %d\n", self->max_header_size); + + if (self->tty) + ret += sprintf(buf+ret, "Hardware: %s\n", + self->tty->hw_stopped ? "Stopped" : "Running"); + + ret += sprintf(buf+ret, "\n"); + return ret; +} + + +/* + * Function ircomm_tty_read_proc (buf, start, offset, len, eof, unused) + * + * + * + */ +#ifdef CONFIG_PROC_FS +static int ircomm_tty_read_proc(char *buf, char **start, off_t offset, int len, + int *eof, void *unused) +{ + struct ircomm_tty_cb *self; + int count = 0, l; + off_t begin = 0; + unsigned long flags; + + spin_lock_irqsave(&ircomm_tty->hb_spinlock, flags); + + self = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty); + while ((self != NULL) && (count < 4000)) { + if (self->magic != IRCOMM_TTY_MAGIC) + break; + + l = ircomm_tty_line_info(self, buf + count); + count += l; + if (count+begin > offset+len) + goto done; + if (count+begin < offset) { + begin += count; + count = 0; + } + + self = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty); + } + *eof = 1; +done: + spin_unlock_irqrestore(&ircomm_tty->hb_spinlock, flags); + + if (offset >= count+begin) + return 0; + *start = buf + (offset-begin); + return ((len < begin+count-offset) ? len : begin+count-offset); +} +#endif /* CONFIG_PROC_FS */ + +MODULE_AUTHOR("Dag Brattli "); +MODULE_DESCRIPTION("IrCOMM serial TTY driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CHARDEV_MAJOR(IRCOMM_TTY_MAJOR); + +module_init(ircomm_tty_init); +module_exit(ircomm_tty_cleanup); diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c new file mode 100644 index 000000000000..99f5eddbb4b7 --- /dev/null +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -0,0 +1,1006 @@ +/********************************************************************* + * + * Filename: ircomm_tty_attach.c + * Version: + * Description: Code for attaching the serial driver to IrCOMM + * Status: Experimental. 
+ * Author: Dag Brattli + * Created at: Sat Jun 5 17:42:00 1999 + * Modified at: Tue Jan 4 14:20:49 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +static void ircomm_tty_ias_register(struct ircomm_tty_cb *self); +static void ircomm_tty_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv); +static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv); +static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self, + int timeout); +static void ircomm_tty_watchdog_timer_expired(void *data); + +static int ircomm_tty_state_idle(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_search(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_setup(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); +static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info); + +char *ircomm_tty_state[] = { + "IRCOMM_TTY_IDLE", + "IRCOMM_TTY_SEARCH", + "IRCOMM_TTY_QUERY_PARAMETERS", + "IRCOMM_TTY_QUERY_LSAP_SEL", + "IRCOMM_TTY_SETUP", + "IRCOMM_TTY_READY", + "*** ERROR *** ", +}; + +#ifdef CONFIG_IRDA_DEBUG +static char *ircomm_tty_event[] = { + "IRCOMM_TTY_ATTACH_CABLE", + "IRCOMM_TTY_DETACH_CABLE", + "IRCOMM_TTY_DATA_REQUEST", + "IRCOMM_TTY_DATA_INDICATION", + "IRCOMM_TTY_DISCOVERY_REQUEST", + "IRCOMM_TTY_DISCOVERY_INDICATION", + "IRCOMM_TTY_CONNECT_CONFIRM", + "IRCOMM_TTY_CONNECT_INDICATION", + "IRCOMM_TTY_DISCONNECT_REQUEST", + "IRCOMM_TTY_DISCONNECT_INDICATION", + "IRCOMM_TTY_WD_TIMER_EXPIRED", + "IRCOMM_TTY_GOT_PARAMETERS", + "IRCOMM_TTY_GOT_LSAPSEL", + "*** ERROR ****", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static int (*state[])(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event, + struct sk_buff *skb, struct ircomm_tty_info *info) = +{ + ircomm_tty_state_idle, + ircomm_tty_state_search, + ircomm_tty_state_query_parameters, + ircomm_tty_state_query_lsap_sel, + ircomm_tty_state_setup, + 
ircomm_tty_state_ready, +}; + +/* + * Function ircomm_tty_attach_cable (driver) + * + * Try to attach cable (IrCOMM link). This function will only return + * when the link has been connected, or if an error condition occurs. + * If success, the return value is the resulting service type. + */ +int ircomm_tty_attach_cable(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + /* Check if somebody has already connected to us */ + if (ircomm_is_connected(self->ircomm)) { + IRDA_DEBUG(0, "%s(), already connected!\n", __FUNCTION__ ); + return 0; + } + + /* Make sure nobody tries to write before the link is up */ + self->tty->hw_stopped = 1; + + ircomm_tty_ias_register(self); + + ircomm_tty_do_event(self, IRCOMM_TTY_ATTACH_CABLE, NULL, NULL); + + return 0; +} + +/* + * Function ircomm_detach_cable (driver) + * + * Detach cable, or cable has been detached by peer + * + */ +void ircomm_tty_detach_cable(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + del_timer(&self->watchdog_timer); + + /* Remove discovery handler */ + if (self->ckey) { + irlmp_unregister_client(self->ckey); + self->ckey = NULL; + } + /* Remove IrCOMM hint bits */ + if (self->skey) { + irlmp_unregister_service(self->skey); + self->skey = NULL; + } + + if (self->iriap) { + iriap_close(self->iriap); + self->iriap = NULL; + } + + /* Remove LM-IAS object */ + if (self->obj) { + irias_delete_object(self->obj); + self->obj = NULL; + } + + ircomm_tty_do_event(self, IRCOMM_TTY_DETACH_CABLE, NULL, NULL); + + /* Reset some values */ + self->daddr = self->saddr = 0; + self->dlsap_sel = self->slsap_sel = 0; + + memset(&self->settings, 0, sizeof(struct ircomm_params)); +} + +/* + * Function ircomm_tty_ias_register (self) + * + * Register with LM-IAS depending on which service type we are + * + */ +static void ircomm_tty_ias_register(struct ircomm_tty_cb *self) +{ + __u8 oct_seq[6]; + __u16 hints; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* Compute hint bits based on service */ + hints = irlmp_service_to_hint(S_COMM); + if (self->service_type & IRCOMM_3_WIRE_RAW) + hints |= irlmp_service_to_hint(S_PRINTER); + + /* Advertise IrCOMM hint bit in discovery */ + if (!self->skey) + self->skey = irlmp_register_service(hints); + /* Set up a discovery handler */ + if (!self->ckey) + self->ckey = irlmp_register_client(hints, + ircomm_tty_discovery_indication, + NULL, (void *) self); + + /* If already done, no need to do it again */ + if (self->obj) + return; + + if (self->service_type & IRCOMM_3_WIRE_RAW) { + /* Register IrLPT with LM-IAS */ + self->obj = irias_new_object("IrLPT", IAS_IRLPT_ID); + irias_add_integer_attrib(self->obj, "IrDA:IrLMP:LsapSel", + self->slsap_sel, IAS_KERNEL_ATTR); + } else { + /* Register IrCOMM with LM-IAS */ + self->obj = irias_new_object("IrDA:IrCOMM", IAS_IRCOMM_ID); + irias_add_integer_attrib(self->obj, "IrDA:TinyTP:LsapSel", + self->slsap_sel, IAS_KERNEL_ATTR); + + /* Code the parameters into the buffer */ + irda_param_pack(oct_seq, "bbbbbb", + IRCOMM_SERVICE_TYPE, 1, self->service_type, + IRCOMM_PORT_TYPE, 1, IRCOMM_SERIAL); + + /* Register parameters with LM-IAS */ + irias_add_octseq_attrib(self->obj, "Parameters", oct_seq, 6, + IAS_KERNEL_ATTR); + } + 
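+	/* In the IrCOMM branch above, the "Parameters" octet sequence is
+	 * two PI/PL/PV triplets (ServiceType and PortType) of three octets
+	 * each, hence the length 6 passed to irias_add_octseq_attrib(). */
+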
irias_insert_object(self->obj); +} + +/* + * Function ircomm_tty_ias_unregister (self) + * + * Remove our IAS object and client hook while connected. + * + */ +static void ircomm_tty_ias_unregister(struct ircomm_tty_cb *self) +{ + /* Remove LM-IAS object now so it is not reused. + * IrCOMM deals very poorly with multiple incoming connections. + * It should looks a lot more like IrNET, and "dup" a server TSAP + * to the application TSAP (based on various rules). + * This is a cheap workaround allowing multiple clients to + * connect to us. It will not always work. + * Each IrCOMM socket has an IAS entry. Incoming connection will + * pick the first one found. So, when we are fully connected, + * we remove our IAS entries so that the next IAS entry is used. + * We do that for *both* client and server, because a server + * can also create client instances. + * Jean II */ + if (self->obj) { + irias_delete_object(self->obj); + self->obj = NULL; + } + +#if 0 + /* Remove discovery handler. + * While we are connected, we no longer need to receive + * discovery events. This would be the case if there is + * multiple IrLAP interfaces. Jean II */ + if (self->ckey) { + irlmp_unregister_client(self->ckey); + self->ckey = NULL; + } +#endif +} + +/* + * Function ircomm_send_initial_parameters (self) + * + * Send initial parameters to the remote IrCOMM device. These parameters + * must be sent before any data. + */ +int ircomm_tty_send_initial_parameters(struct ircomm_tty_cb *self) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (self->service_type & IRCOMM_3_WIRE_RAW) + return 0; + + /* + * Set default values, but only if the application for some reason + * haven't set them already + */ + IRDA_DEBUG(2, "%s(), data-rate = %d\n", __FUNCTION__ , + self->settings.data_rate); + if (!self->settings.data_rate) + self->settings.data_rate = 9600; + IRDA_DEBUG(2, "%s(), data-format = %d\n", __FUNCTION__ , + self->settings.data_format); + if (!self->settings.data_format) + self->settings.data_format = IRCOMM_WSIZE_8; /* 8N1 */ + + IRDA_DEBUG(2, "%s(), flow-control = %d\n", __FUNCTION__ , + self->settings.flow_control); + /*self->settings.flow_control = IRCOMM_RTS_CTS_IN|IRCOMM_RTS_CTS_OUT;*/ + + /* Do not set delta values for the initial parameters */ + self->settings.dte = IRCOMM_DTR | IRCOMM_RTS; + + /* Only send service type parameter when we are the client */ + if (self->client) + ircomm_param_request(self, IRCOMM_SERVICE_TYPE, FALSE); + ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE); + ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE); + + /* For a 3 wire service, we just flush the last parameter and return */ + if (self->settings.service_type == IRCOMM_3_WIRE) { + ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE); + return 0; + } + + /* Only 9-wire service types continue here */ + ircomm_param_request(self, IRCOMM_FLOW_CONTROL, FALSE); +#if 0 + ircomm_param_request(self, IRCOMM_XON_XOFF, FALSE); + ircomm_param_request(self, IRCOMM_ENQ_ACK, FALSE); +#endif + /* Notify peer that we are ready to receive data */ + ircomm_param_request(self, IRCOMM_DTE, TRUE); + + return 0; +} + +/* + * Function ircomm_tty_discovery_indication (discovery) + * + * Remote device is discovered, try query the remote IAS to see which + * device it is, and which services it has. 
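+ * Passive discoveries are dropped (see the note in the body below), and
+ * the event is then handed to every instance in the ircomm_tty hashbin,
+ * since at this point we do not know which line it is meant for.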
+ * + */ +static void ircomm_tty_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv) +{ + struct ircomm_tty_cb *self; + struct ircomm_tty_info info; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Important note : + * We need to drop all passive discoveries. + * The LSAP management of IrComm is deficient and doesn't deal + * with the case of two instance connecting to each other + * simultaneously (it will deadlock in LMP). + * The proper fix would be to use the same technique as in IrNET, + * to have one server socket and separate instances for the + * connecting/connected socket. + * The workaround is to drop passive discovery, which drastically + * reduce the probability of this happening. + * Jean II */ + if(mode == DISCOVERY_PASSIVE) + return; + + info.daddr = discovery->daddr; + info.saddr = discovery->saddr; + + /* FIXME. We have a locking problem on the hashbin here. + * We probably need to use hashbin_find_next(), but we first + * need to ensure that "line" is unique. - Jean II */ + self = (struct ircomm_tty_cb *) hashbin_get_first(ircomm_tty); + while (self != NULL) { + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_tty_do_event(self, IRCOMM_TTY_DISCOVERY_INDICATION, + NULL, &info); + + self = (struct ircomm_tty_cb *) hashbin_get_next(ircomm_tty); + } +} + +/* + * Function ircomm_tty_disconnect_indication (instance, sap, reason, skb) + * + * Link disconnected + * + */ +void ircomm_tty_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + if (!self->tty) + return; + + /* This will stop control data transfers */ + self->flow = FLOW_STOP; + + /* Stop data transfers */ + self->tty->hw_stopped = 1; + + ircomm_tty_do_event(self, IRCOMM_TTY_DISCONNECT_INDICATION, NULL, + NULL); +} + +/* + * Function ircomm_tty_getvalue_confirm (result, obj_id, value, priv) + * + * Got result from the IAS query we make + * + */ +static void ircomm_tty_getvalue_confirm(int result, __u16 obj_id, + struct ias_value *value, + void *priv) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) priv; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* Check if request succeeded */ + if (result != IAS_SUCCESS) { + IRDA_DEBUG(4, "%s(), got NULL value!\n", __FUNCTION__ ); + return; + } + + switch (value->type) { + case IAS_OCT_SEQ: + IRDA_DEBUG(2, "%s(), got octet sequence\n", __FUNCTION__ ); + + irda_param_extract_all(self, value->t.oct_seq, value->len, + &ircomm_param_info); + + ircomm_tty_do_event(self, IRCOMM_TTY_GOT_PARAMETERS, NULL, + NULL); + break; + case IAS_INTEGER: + /* Got LSAP selector */ + IRDA_DEBUG(2, "%s(), got lsapsel = %d\n", __FUNCTION__ , + value->t.integer); + + if (value->t.integer == -1) { + IRDA_DEBUG(0, "%s(), invalid value!\n", __FUNCTION__ ); + } else + self->dlsap_sel = value->t.integer; + + ircomm_tty_do_event(self, IRCOMM_TTY_GOT_LSAPSEL, NULL, NULL); + break; + case IAS_MISSING: + IRDA_DEBUG(0, "%s(), got IAS_MISSING\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(0, "%s(), got unknown type!\n", __FUNCTION__ ); + break; + } + irias_delete_value(value); +} + +/* + * Function 
ircomm_tty_connect_confirm (instance, sap, qos, max_sdu_size, skb) + * + * Connection confirmed + * + */ +void ircomm_tty_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_data_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + self->client = TRUE; + self->max_data_size = max_data_size; + self->max_header_size = max_header_size; + self->flow = FLOW_START; + + ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_CONFIRM, NULL, NULL); + + /* No need to kfree_skb - see ircomm_ttp_connect_confirm() */ +} + +/* + * Function ircomm_tty_connect_indication (instance, sap, qos, max_sdu_size, + * skb) + * + * we are discovered and being requested to connect by remote device ! + * + */ +void ircomm_tty_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_data_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) instance; + int clen; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + self->client = FALSE; + self->max_data_size = max_data_size; + self->max_header_size = max_header_size; + self->flow = FLOW_START; + + clen = skb->data[0]; + if (clen) + irda_param_extract_all(self, skb->data+1, + IRDA_MIN(skb->len, clen), + &ircomm_param_info); + + ircomm_tty_do_event(self, IRCOMM_TTY_CONNECT_INDICATION, NULL, NULL); + + /* No need to kfree_skb - see ircomm_ttp_connect_indication() */ +} + +/* + * Function ircomm_tty_link_established (self) + * + * Called when the IrCOMM link is established + * + */ +void ircomm_tty_link_established(struct ircomm_tty_cb *self) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + if (!self->tty) + return; + + del_timer(&self->watchdog_timer); + + /* + * IrCOMM link is now up, and if we are not using hardware + * flow-control, then declare the hardware as running. Otherwise we + * will have to wait for the peer device (DCE) to raise the CTS + * line. + */ + if ((self->flags & ASYNC_CTS_FLOW) && ((self->settings.dce & IRCOMM_CTS) == 0)) { + IRDA_DEBUG(0, "%s(), waiting for CTS ...\n", __FUNCTION__ ); + return; + } else { + IRDA_DEBUG(1, "%s(), starting hardware!\n", __FUNCTION__ ); + + self->tty->hw_stopped = 0; + + /* Wake up processes blocked on open */ + wake_up_interruptible(&self->open_wait); + } + + schedule_work(&self->tqueue); +} + +/* + * Function ircomm_tty_start_watchdog_timer (self, timeout) + * + * Start the watchdog timer. This timer is used to make sure that any + * connection attempt is successful, and if not, we will retry after + * the timeout + */ +static void ircomm_tty_start_watchdog_timer(struct ircomm_tty_cb *self, + int timeout) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + irda_start_timer(&self->watchdog_timer, timeout, (void *) self, + ircomm_tty_watchdog_timer_expired); +} + +/* + * Function ircomm_tty_watchdog_timer_expired (data) + * + * Called when the connect procedure have taken to much time. 
+ * + */ +static void ircomm_tty_watchdog_timer_expired(void *data) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + ircomm_tty_do_event(self, IRCOMM_TTY_WD_TIMER_EXPIRED, NULL, NULL); +} + + +/* + * Function ircomm_tty_do_event (self, event, skb) + * + * Process event + * + */ +int ircomm_tty_do_event(struct ircomm_tty_cb *self, IRCOMM_TTY_EVENT event, + struct sk_buff *skb, struct ircomm_tty_info *info) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + return (*state[self->state])(self, event, skb, info); +} + +/* + * Function ircomm_tty_next_state (self, state) + * + * Switch state + * + */ +static inline void ircomm_tty_next_state(struct ircomm_tty_cb *self, IRCOMM_TTY_STATE state) +{ + /* + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;); + + IRDA_DEBUG(2, "%s: next state=%s, service type=%d\n", __FUNCTION__ , + ircomm_tty_state[self->state], self->service_type); + */ + self->state = state; +} + +/* + * Function ircomm_tty_state_idle (self, event, skb, info) + * + * Just hanging around + * + */ +static int ircomm_tty_state_idle(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + switch (event) { + case IRCOMM_TTY_ATTACH_CABLE: + /* Try to discover any remote devices */ + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + + irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS); + break; + case IRCOMM_TTY_DISCOVERY_INDICATION: + self->daddr = info->daddr; + self->saddr = info->saddr; + + if (self->iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __FUNCTION__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + ircomm_tty_getvalue_confirm); + + iriap_getvaluebyclass_request(self->iriap, + self->saddr, self->daddr, + "IrDA:IrCOMM", "Parameters"); + + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Just stay idle */ + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_search (self, event, skb, info) + * + * Trying to discover an IrCOMM device + * + */ +static int ircomm_tty_state_search(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_DISCOVERY_INDICATION: + self->daddr = info->daddr; + self->saddr = info->saddr; + + if (self->iriap) { + IRDA_WARNING("%s(), busy with a previous 
query\n", + __FUNCTION__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + ircomm_tty_getvalue_confirm); + + if (self->service_type == IRCOMM_3_WIRE_RAW) { + iriap_getvaluebyclass_request(self->iriap, self->saddr, + self->daddr, "IrLPT", + "IrDA:IrLMP:LsapSel"); + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL); + } else { + iriap_getvaluebyclass_request(self->iriap, self->saddr, + self->daddr, + "IrDA:IrCOMM", + "Parameters"); + + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_PARAMETERS); + } + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: +#if 1 + /* Give up */ +#else + /* Try to discover any remote devices */ + ircomm_tty_start_watchdog_timer(self, 3*HZ); + irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS); +#endif + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_query (self, event, skb, info) + * + * Querying the remote LM-IAS for IrCOMM parameters + * + */ +static int ircomm_tty_state_query_parameters(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_GOT_PARAMETERS: + if (self->iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __FUNCTION__); + return -EBUSY; + } + + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + ircomm_tty_getvalue_confirm); + + iriap_getvaluebyclass_request(self->iriap, self->saddr, + self->daddr, "IrDA:IrCOMM", + "IrDA:TinyTP:LsapSel"); + + ircomm_tty_start_watchdog_timer(self, 3*HZ); + ircomm_tty_next_state(self, IRCOMM_TTY_QUERY_LSAP_SEL); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Go back to search mode */ + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_query_lsap_sel (self, event, skb, info) + * + * Query remote LM-IAS for the LSAP selector which we can connect to + * + */ +static int ircomm_tty_state_query_lsap_sel(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_GOT_LSAPSEL: + /* Connect to remote device */ + ret = ircomm_connect_request(self->ircomm, self->dlsap_sel, + self->saddr, self->daddr, + NULL, self->service_type); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + 
ircomm_tty_next_state(self, IRCOMM_TTY_SETUP); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Go back to search mode */ + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_setup (self, event, skb, info) + * + * Trying to connect + * + */ +static int ircomm_tty_state_setup(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + IRDA_DEBUG(2, "%s: state=%s, event=%s\n", __FUNCTION__ , + ircomm_tty_state[self->state], ircomm_tty_event[event]); + + switch (event) { + case IRCOMM_TTY_CONNECT_CONFIRM: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* + * Send initial parameters. This will also send out queued + * parameters waiting for the connection to come up + */ + ircomm_tty_send_initial_parameters(self); + ircomm_tty_link_established(self); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_CONNECT_INDICATION: + del_timer(&self->watchdog_timer); + ircomm_tty_ias_unregister(self); + + /* Accept connection */ + ircomm_connect_response(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_READY); + break; + case IRCOMM_TTY_WD_TIMER_EXPIRED: + /* Go back to search mode */ + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + break; + case IRCOMM_TTY_DETACH_CABLE: + /* ircomm_disconnect_request(self->ircomm, NULL); */ + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + +/* + * Function ircomm_tty_state_ready (self, event, skb, info) + * + * IrCOMM is now connected + * + */ +static int ircomm_tty_state_ready(struct ircomm_tty_cb *self, + IRCOMM_TTY_EVENT event, + struct sk_buff *skb, + struct ircomm_tty_info *info) +{ + int ret = 0; + + switch (event) { + case IRCOMM_TTY_DATA_REQUEST: + ret = ircomm_data_request(self->ircomm, skb); + break; + case IRCOMM_TTY_DETACH_CABLE: + ircomm_disconnect_request(self->ircomm, NULL); + ircomm_tty_next_state(self, IRCOMM_TTY_IDLE); + break; + case IRCOMM_TTY_DISCONNECT_INDICATION: + ircomm_tty_ias_register(self); + ircomm_tty_next_state(self, IRCOMM_TTY_SEARCH); + ircomm_tty_start_watchdog_timer(self, 3*HZ); + + if (self->flags & ASYNC_CHECK_CD) { + /* Drop carrier */ + self->settings.dce = IRCOMM_DELTA_CD; + ircomm_tty_check_modem_status(self); + } else { + IRDA_DEBUG(0, "%s(), hanging up!\n", __FUNCTION__ ); + if (self->tty) + tty_hangup(self->tty); + } + break; + default: + IRDA_DEBUG(2, "%s(), unknown event: %s\n", __FUNCTION__ , + ircomm_tty_event[event]); + ret = -EINVAL; + } + return ret; +} + diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c new file mode 100644 index 000000000000..197e3e7ed7e2 --- /dev/null +++ b/net/irda/ircomm/ircomm_tty_ioctl.c @@ -0,0 +1,428 @@ +/********************************************************************* + * + * 
Filename: ircomm_tty_ioctl.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Jun 10 14:39:09 1999 + * Modified at: Wed Jan 5 14:45:43 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) + +/* + * Function ircomm_tty_change_speed (driver) + * + * Change speed of the driver. If the remote device is a DCE, then this + * should make it change the speed of its serial port + */ +static void ircomm_tty_change_speed(struct ircomm_tty_cb *self) +{ + unsigned cflag, cval; + int baud; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (!self->tty || !self->tty->termios || !self->ircomm) + return; + + cflag = self->tty->termios->c_cflag; + + /* byte size and parity */ + switch (cflag & CSIZE) { + case CS5: cval = IRCOMM_WSIZE_5; break; + case CS6: cval = IRCOMM_WSIZE_6; break; + case CS7: cval = IRCOMM_WSIZE_7; break; + case CS8: cval = IRCOMM_WSIZE_8; break; + default: cval = IRCOMM_WSIZE_5; break; + } + if (cflag & CSTOPB) + cval |= IRCOMM_2_STOP_BIT; + + if (cflag & PARENB) + cval |= IRCOMM_PARITY_ENABLE; + if (!(cflag & PARODD)) + cval |= IRCOMM_PARITY_EVEN; + + /* Determine divisor based on baud rate */ + baud = tty_get_baud_rate(self->tty); + if (!baud) + baud = 9600; /* B0 transition handled in rs_set_termios */ + + self->settings.data_rate = baud; + ircomm_param_request(self, IRCOMM_DATA_RATE, FALSE); + + /* CTS flow control flag and modem status interrupts */ + if (cflag & CRTSCTS) { + self->flags |= ASYNC_CTS_FLOW; + self->settings.flow_control |= IRCOMM_RTS_CTS_IN; + /* This got me. Bummer. Jean II */ + if (self->service_type == IRCOMM_3_WIRE_RAW) + IRDA_WARNING("%s(), enabling RTS/CTS on link that doesn't support it (3-wire-raw)\n", __FUNCTION__); + } else { + self->flags &= ~ASYNC_CTS_FLOW; + self->settings.flow_control &= ~IRCOMM_RTS_CTS_IN; + } + if (cflag & CLOCAL) + self->flags &= ~ASYNC_CHECK_CD; + else + self->flags |= ASYNC_CHECK_CD; +#if 0 + /* + * Set up parity check flag + */ + + if (I_INPCK(self->tty)) + driver->read_status_mask |= LSR_FE | LSR_PE; + if (I_BRKINT(driver->tty) || I_PARMRK(driver->tty)) + driver->read_status_mask |= LSR_BI; + + /* + * Characters to ignore + */ + driver->ignore_status_mask = 0; + if (I_IGNPAR(driver->tty)) + driver->ignore_status_mask |= LSR_PE | LSR_FE; + + if (I_IGNBRK(self->tty)) { + self->ignore_status_mask |= LSR_BI; + /* + * If we're ignore parity and break indicators, ignore + * overruns too. (For real raw support). 
+ */ + if (I_IGNPAR(self->tty)) + self->ignore_status_mask |= LSR_OE; + } +#endif + self->settings.data_format = cval; + + ircomm_param_request(self, IRCOMM_DATA_FORMAT, FALSE); + ircomm_param_request(self, IRCOMM_FLOW_CONTROL, TRUE); +} + +/* + * Function ircomm_tty_set_termios (tty, old_termios) + * + * This routine allows the tty driver to be notified when device's + * termios settings have changed. Note that a well-designed tty driver + * should be prepared to accept the case where old == NULL, and try to + * do something rational. + */ +void ircomm_tty_set_termios(struct tty_struct *tty, + struct termios *old_termios) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned int cflag = tty->termios->c_cflag; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if ((cflag == old_termios->c_cflag) && + (RELEVANT_IFLAG(tty->termios->c_iflag) == + RELEVANT_IFLAG(old_termios->c_iflag))) + { + return; + } + + ircomm_tty_change_speed(self); + + /* Handle transition to B0 status */ + if ((old_termios->c_cflag & CBAUD) && + !(cflag & CBAUD)) { + self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS); + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + /* Handle transition away from B0 status */ + if (!(old_termios->c_cflag & CBAUD) && + (cflag & CBAUD)) { + self->settings.dte |= IRCOMM_DTR; + if (!(tty->termios->c_cflag & CRTSCTS) || + !test_bit(TTY_THROTTLED, &tty->flags)) { + self->settings.dte |= IRCOMM_RTS; + } + ircomm_param_request(self, IRCOMM_DTE, TRUE); + } + + /* Handle turning off CRTSCTS */ + if ((old_termios->c_cflag & CRTSCTS) && + !(tty->termios->c_cflag & CRTSCTS)) + { + tty->hw_stopped = 0; + ircomm_tty_start(tty); + } +} + +/* + * Function ircomm_tty_tiocmget (tty, file) + * + * + * + */ +int ircomm_tty_tiocmget(struct tty_struct *tty, struct file *file) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + unsigned int result; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + + result = ((self->settings.dte & IRCOMM_RTS) ? TIOCM_RTS : 0) + | ((self->settings.dte & IRCOMM_DTR) ? TIOCM_DTR : 0) + | ((self->settings.dce & IRCOMM_CD) ? TIOCM_CAR : 0) + | ((self->settings.dce & IRCOMM_RI) ? TIOCM_RNG : 0) + | ((self->settings.dce & IRCOMM_DSR) ? TIOCM_DSR : 0) + | ((self->settings.dce & IRCOMM_CTS) ? 
TIOCM_CTS : 0); + return result; +} + +/* + * Function ircomm_tty_tiocmset (tty, file, set, clear) + * + * + * + */ +int ircomm_tty_tiocmset(struct tty_struct *tty, struct file *file, + unsigned int set, unsigned int clear) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return -1;); + + if (set & TIOCM_RTS) + self->settings.dte |= IRCOMM_RTS; + if (set & TIOCM_DTR) + self->settings.dte |= IRCOMM_DTR; + + if (clear & TIOCM_RTS) + self->settings.dte &= ~IRCOMM_RTS; + if (clear & TIOCM_DTR) + self->settings.dte &= ~IRCOMM_DTR; + + if ((set|clear) & TIOCM_RTS) + self->settings.dte |= IRCOMM_DELTA_RTS; + if ((set|clear) & TIOCM_DTR) + self->settings.dte |= IRCOMM_DELTA_DTR; + + ircomm_param_request(self, IRCOMM_DTE, TRUE); + + return 0; +} + +/* + * Function get_serial_info (driver, retinfo) + * + * + * + */ +static int ircomm_tty_get_serial_info(struct ircomm_tty_cb *self, + struct serial_struct __user *retinfo) +{ + struct serial_struct info; + + if (!retinfo) + return -EFAULT; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + memset(&info, 0, sizeof(info)); + info.line = self->line; + info.flags = self->flags; + info.baud_base = self->settings.data_rate; + info.close_delay = self->close_delay; + info.closing_wait = self->closing_wait; + + /* For compatibility */ + info.type = PORT_16550A; + info.port = 0; + info.irq = 0; + info.xmit_fifo_size = 0; + info.hub6 = 0; + info.custom_divisor = 0; + + if (copy_to_user(retinfo, &info, sizeof(*retinfo))) + return -EFAULT; + + return 0; +} + +/* + * Function set_serial_info (driver, new_info) + * + * + * + */ +static int ircomm_tty_set_serial_info(struct ircomm_tty_cb *self, + struct serial_struct __user *new_info) +{ +#if 0 + struct serial_struct new_serial; + struct ircomm_tty_cb old_state, *state; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + if (copy_from_user(&new_serial,new_info,sizeof(new_serial))) + return -EFAULT; + + + state = self + old_state = *self; + + if (!capable(CAP_SYS_ADMIN)) { + if ((new_serial.baud_base != state->settings.data_rate) || + (new_serial.close_delay != state->close_delay) || + ((new_serial.flags & ~ASYNC_USR_MASK) != + (self->flags & ~ASYNC_USR_MASK))) + return -EPERM; + state->flags = ((state->flags & ~ASYNC_USR_MASK) | + (new_serial.flags & ASYNC_USR_MASK)); + self->flags = ((self->flags & ~ASYNC_USR_MASK) | + (new_serial.flags & ASYNC_USR_MASK)); + /* self->custom_divisor = new_serial.custom_divisor; */ + goto check_and_exit; + } + + /* + * OK, past this point, all the error checking has been done. + * At this point, we start making changes..... + */ + + if (self->settings.data_rate != new_serial.baud_base) { + self->settings.data_rate = new_serial.baud_base; + ircomm_param_request(self, IRCOMM_DATA_RATE, TRUE); + } + + self->close_delay = new_serial.close_delay * HZ/100; + self->closing_wait = new_serial.closing_wait * HZ/100; + /* self->custom_divisor = new_serial.custom_divisor; */ + + self->flags = ((self->flags & ~ASYNC_FLAGS) | + (new_serial.flags & ASYNC_FLAGS)); + self->tty->low_latency = (self->flags & ASYNC_LOW_LATENCY) ? 
1 : 0; + + check_and_exit: + + if (self->flags & ASYNC_INITIALIZED) { + if (((old_state.flags & ASYNC_SPD_MASK) != + (self->flags & ASYNC_SPD_MASK)) || + (old_driver.custom_divisor != driver->custom_divisor)) { + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_HI) + driver->tty->alt_speed = 57600; + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_VHI) + driver->tty->alt_speed = 115200; + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_SHI) + driver->tty->alt_speed = 230400; + if ((driver->flags & ASYNC_SPD_MASK) == ASYNC_SPD_WARP) + driver->tty->alt_speed = 460800; + ircomm_tty_change_speed(driver); + } + } +#endif + return 0; +} + +/* + * Function ircomm_tty_ioctl (tty, file, cmd, arg) + * + * + * + */ +int ircomm_tty_ioctl(struct tty_struct *tty, struct file *file, + unsigned int cmd, unsigned long arg) +{ + struct ircomm_tty_cb *self = (struct ircomm_tty_cb *) tty->driver_data; + int ret = 0; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if ((cmd != TIOCGSERIAL) && (cmd != TIOCSSERIAL) && + (cmd != TIOCSERCONFIG) && (cmd != TIOCSERGSTRUCT) && + (cmd != TIOCMIWAIT) && (cmd != TIOCGICOUNT)) { + if (tty->flags & (1 << TTY_IO_ERROR)) + return -EIO; + } + + switch (cmd) { + case TIOCGSERIAL: + ret = ircomm_tty_get_serial_info(self, (struct serial_struct __user *) arg); + break; + case TIOCSSERIAL: + ret = ircomm_tty_set_serial_info(self, (struct serial_struct __user *) arg); + break; + case TIOCMIWAIT: + IRDA_DEBUG(0, "(), TIOCMIWAIT, not impl!\n"); + break; + + case TIOCGICOUNT: + IRDA_DEBUG(0, "%s(), TIOCGICOUNT not impl!\n", __FUNCTION__ ); +#if 0 + save_flags(flags); cli(); + cnow = driver->icount; + restore_flags(flags); + p_cuser = (struct serial_icounter_struct __user *) arg; + if (put_user(cnow.cts, &p_cuser->cts) || + put_user(cnow.dsr, &p_cuser->dsr) || + put_user(cnow.rng, &p_cuser->rng) || + put_user(cnow.dcd, &p_cuser->dcd) || + put_user(cnow.rx, &p_cuser->rx) || + put_user(cnow.tx, &p_cuser->tx) || + put_user(cnow.frame, &p_cuser->frame) || + put_user(cnow.overrun, &p_cuser->overrun) || + put_user(cnow.parity, &p_cuser->parity) || + put_user(cnow.brk, &p_cuser->brk) || + put_user(cnow.buf_overrun, &p_cuser->buf_overrun)) + return -EFAULT; +#endif + return 0; + default: + ret = -ENOIOCTLCMD; /* ioctls which we must ignore */ + } + return ret; +} + + + diff --git a/net/irda/irda_device.c b/net/irda/irda_device.c new file mode 100644 index 000000000000..fda299e300c0 --- /dev/null +++ b/net/irda/irda_device.c @@ -0,0 +1,489 @@ +/********************************************************************* + * + * Filename: irda_device.c + * Version: 0.9 + * Description: Utility functions used by the device drivers + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Oct 9 09:22:27 1999 + * Modified at: Sun Jan 23 17:41:24 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2001 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +static void __irda_task_delete(struct irda_task *task); + +static hashbin_t *dongles = NULL; +static hashbin_t *tasks = NULL; + +#ifdef CONFIG_IRDA_DEBUG +static const char *task_state[] = { + "IRDA_TASK_INIT", + "IRDA_TASK_DONE", + "IRDA_TASK_WAIT", + "IRDA_TASK_WAIT1", + "IRDA_TASK_WAIT2", + "IRDA_TASK_WAIT3", + "IRDA_TASK_CHILD_INIT", + "IRDA_TASK_CHILD_WAIT", + "IRDA_TASK_CHILD_DONE", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static void irda_task_timer_expired(void *data); + +int __init irda_device_init( void) +{ + dongles = hashbin_new(HB_NOLOCK); + if (dongles == NULL) { + IRDA_WARNING("IrDA: Can't allocate dongles hashbin!\n"); + return -ENOMEM; + } + spin_lock_init(&dongles->hb_spinlock); + + tasks = hashbin_new(HB_LOCK); + if (tasks == NULL) { + IRDA_WARNING("IrDA: Can't allocate tasks hashbin!\n"); + hashbin_delete(dongles, NULL); + return -ENOMEM; + } + + /* We no longer initialise the driver ourselves here, we let + * the system do it for us... - Jean II */ + + return 0; +} + +static void __exit leftover_dongle(void *arg) +{ + struct dongle_reg *reg = arg; + IRDA_WARNING("IrDA: Dongle type %x not unregistered\n", + reg->type); +} + +void __exit irda_device_cleanup(void) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + hashbin_delete(tasks, (FREE_FUNC) __irda_task_delete); + + hashbin_delete(dongles, leftover_dongle); +} + +/* + * Function irda_device_set_media_busy (self, status) + * + * Called when we have detected that another station is transmitting + * in contention mode. + */ +void irda_device_set_media_busy(struct net_device *dev, int status) +{ + struct irlap_cb *self; + + IRDA_DEBUG(4, "%s(%s)\n", __FUNCTION__, status ? "TRUE" : "FALSE"); + + self = (struct irlap_cb *) dev->atalk_ptr; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + if (status) { + self->media_busy = TRUE; + if (status == SMALL) + irlap_start_mbusy_timer(self, SMALLBUSY_TIMEOUT); + else + irlap_start_mbusy_timer(self, MEDIABUSY_TIMEOUT); + IRDA_DEBUG( 4, "Media busy!\n"); + } else { + self->media_busy = FALSE; + irlap_stop_mbusy_timer(self); + } +} +EXPORT_SYMBOL(irda_device_set_media_busy); + + +/* + * Function irda_device_is_receiving (dev) + * + * Check if the device driver is currently receiving data + * + */ +int irda_device_is_receiving(struct net_device *dev) +{ + struct if_irda_req req; + int ret; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + if (!dev->do_ioctl) { + IRDA_ERROR("%s: do_ioctl not impl. 
by device driver\n", + __FUNCTION__); + return -1; + } + + ret = dev->do_ioctl(dev, (struct ifreq *) &req, SIOCGRECEIVING); + if (ret < 0) + return ret; + + return req.ifr_receiving; +} + +void irda_task_next_state(struct irda_task *task, IRDA_TASK_STATE state) +{ + IRDA_DEBUG(2, "%s(), state = %s\n", __FUNCTION__, task_state[state]); + + task->state = state; +} +EXPORT_SYMBOL(irda_task_next_state); + +static void __irda_task_delete(struct irda_task *task) +{ + del_timer(&task->timer); + + kfree(task); +} + +void irda_task_delete(struct irda_task *task) +{ + /* Unregister task */ + hashbin_remove(tasks, (long) task, NULL); + + __irda_task_delete(task); +} +EXPORT_SYMBOL(irda_task_delete); + +/* + * Function irda_task_kick (task) + * + * Tries to execute a task possible multiple times until the task is either + * finished, or askes for a timeout. When a task is finished, we do post + * processing, and notify the parent task, that is waiting for this task + * to complete. + */ +static int irda_task_kick(struct irda_task *task) +{ + int finished = TRUE; + int count = 0; + int timeout; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(task != NULL, return -1;); + IRDA_ASSERT(task->magic == IRDA_TASK_MAGIC, return -1;); + + /* Execute task until it's finished, or askes for a timeout */ + do { + timeout = task->function(task); + if (count++ > 100) { + IRDA_ERROR("%s: error in task handler!\n", + __FUNCTION__); + irda_task_delete(task); + return TRUE; + } + } while ((timeout == 0) && (task->state != IRDA_TASK_DONE)); + + if (timeout < 0) { + IRDA_ERROR("%s: Error executing task!\n", __FUNCTION__); + irda_task_delete(task); + return TRUE; + } + + /* Check if we are finished */ + if (task->state == IRDA_TASK_DONE) { + del_timer(&task->timer); + + /* Do post processing */ + if (task->finished) + task->finished(task); + + /* Notify parent */ + if (task->parent) { + /* Check if parent is waiting for us to complete */ + if (task->parent->state == IRDA_TASK_CHILD_WAIT) { + task->parent->state = IRDA_TASK_CHILD_DONE; + + /* Stop timer now that we are here */ + del_timer(&task->parent->timer); + + /* Kick parent task */ + irda_task_kick(task->parent); + } + } + irda_task_delete(task); + } else if (timeout > 0) { + irda_start_timer(&task->timer, timeout, (void *) task, + irda_task_timer_expired); + finished = FALSE; + } else { + IRDA_DEBUG(0, "%s(), not finished, and no timeout!\n", + __FUNCTION__); + finished = FALSE; + } + + return finished; +} + +/* + * Function irda_task_execute (instance, function, finished) + * + * This function registers and tries to execute tasks that may take some + * time to complete. We do it this hairy way since we may have been + * called from interrupt context, so it's not possible to use + * schedule_timeout() + * Two important notes : + * o Make sure you irda_task_delete(task); in case you delete the + * calling instance. + * o No real need to lock when calling this function, but you may + * want to lock within the task handler. 
+ * Jean II + */ +struct irda_task *irda_task_execute(void *instance, + IRDA_TASK_CALLBACK function, + IRDA_TASK_CALLBACK finished, + struct irda_task *parent, void *param) +{ + struct irda_task *task; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + task = kmalloc(sizeof(struct irda_task), GFP_ATOMIC); + if (!task) + return NULL; + + task->state = IRDA_TASK_INIT; + task->instance = instance; + task->function = function; + task->finished = finished; + task->parent = parent; + task->param = param; + task->magic = IRDA_TASK_MAGIC; + + init_timer(&task->timer); + + /* Register task */ + hashbin_insert(tasks, (irda_queue_t *) task, (long) task, NULL); + + /* No time to waste, so lets get going! */ + return irda_task_kick(task) ? NULL : task; +} +EXPORT_SYMBOL(irda_task_execute); + +/* + * Function irda_task_timer_expired (data) + * + * Task time has expired. We now try to execute task (again), and restart + * the timer if the task has not finished yet + */ +static void irda_task_timer_expired(void *data) +{ + struct irda_task *task; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + task = (struct irda_task *) data; + + irda_task_kick(task); +} + +/* + * Function irda_device_setup (dev) + * + * This function should be used by low level device drivers in a similar way + * as ether_setup() is used by normal network device drivers + */ +static void irda_device_setup(struct net_device *dev) +{ + dev->hard_header_len = 0; + dev->addr_len = 0; + + dev->type = ARPHRD_IRDA; + dev->tx_queue_len = 8; /* Window size + 1 s-frame */ + + memset(dev->broadcast, 0xff, 4); + + dev->mtu = 2048; + dev->flags = IFF_NOARP; +} + +/* + * Funciton alloc_irdadev + * Allocates and sets up an IRDA device in a manner similar to + * alloc_etherdev. + */ +struct net_device *alloc_irdadev(int sizeof_priv) +{ + return alloc_netdev(sizeof_priv, "irda%d", irda_device_setup); +} +EXPORT_SYMBOL(alloc_irdadev); + +/* + * Function irda_device_init_dongle (self, type, qos) + * + * Initialize attached dongle. + * + * Important : request_module require us to call this function with + * a process context and irq enabled. 
- Jean II + */ +dongle_t *irda_device_dongle_init(struct net_device *dev, int type) +{ + struct dongle_reg *reg; + dongle_t *dongle = NULL; + + might_sleep(); + + spin_lock(&dongles->hb_spinlock); + reg = hashbin_find(dongles, type, NULL); + +#ifdef CONFIG_KMOD + /* Try to load the module needed */ + if (!reg && capable(CAP_SYS_MODULE)) { + spin_unlock(&dongles->hb_spinlock); + + request_module("irda-dongle-%d", type); + + spin_lock(&dongles->hb_spinlock); + reg = hashbin_find(dongles, type, NULL); + } +#endif + + if (!reg || !try_module_get(reg->owner) ) { + IRDA_ERROR("IrDA: Unable to find requested dongle type %x\n", + type); + goto out; + } + + /* Allocate dongle info for this instance */ + dongle = kmalloc(sizeof(dongle_t), GFP_KERNEL); + if (!dongle) + goto out; + + memset(dongle, 0, sizeof(dongle_t)); + + /* Bind the registration info to this particular instance */ + dongle->issue = reg; + dongle->dev = dev; + + out: + spin_unlock(&dongles->hb_spinlock); + return dongle; +} +EXPORT_SYMBOL(irda_device_dongle_init); + +/* + * Function irda_device_dongle_cleanup (dongle) + */ +int irda_device_dongle_cleanup(dongle_t *dongle) +{ + IRDA_ASSERT(dongle != NULL, return -1;); + + dongle->issue->close(dongle); + module_put(dongle->issue->owner); + kfree(dongle); + + return 0; +} +EXPORT_SYMBOL(irda_device_dongle_cleanup); + +/* + * Function irda_device_register_dongle (dongle) + */ +int irda_device_register_dongle(struct dongle_reg *new) +{ + spin_lock(&dongles->hb_spinlock); + /* Check if this dongle has been registered before */ + if (hashbin_find(dongles, new->type, NULL)) { + IRDA_MESSAGE("%s: Dongle type %x already registered\n", + __FUNCTION__, new->type); + } else { + /* Insert IrDA dongle into hashbin */ + hashbin_insert(dongles, (irda_queue_t *) new, new->type, NULL); + } + spin_unlock(&dongles->hb_spinlock); + + return 0; +} +EXPORT_SYMBOL(irda_device_register_dongle); + +/* + * Function irda_device_unregister_dongle (dongle) + * + * Unregister dongle, and remove dongle from list of registered dongles + * + */ +void irda_device_unregister_dongle(struct dongle_reg *dongle) +{ + struct dongle *node; + + spin_lock(&dongles->hb_spinlock); + node = hashbin_remove(dongles, dongle->type, NULL); + if (!node) + IRDA_ERROR("%s: dongle not found!\n", __FUNCTION__); + spin_unlock(&dongles->hb_spinlock); +} +EXPORT_SYMBOL(irda_device_unregister_dongle); + +#ifdef CONFIG_ISA +/* + * Function setup_dma (idev, buffer, count, mode) + * + * Setup the DMA channel. Commonly used by ISA FIR drivers + * + */ +void irda_setup_dma(int channel, dma_addr_t buffer, int count, int mode) +{ + unsigned long flags; + + flags = claim_dma_lock(); + + disable_dma(channel); + clear_dma_ff(channel); + set_dma_mode(channel, mode); + set_dma_addr(channel, buffer); + set_dma_count(channel, count); + enable_dma(channel); + + release_dma_lock(flags); +} +EXPORT_SYMBOL(irda_setup_dma); +#endif diff --git a/net/irda/iriap.c b/net/irda/iriap.c new file mode 100644 index 000000000000..b8bb78af8b8a --- /dev/null +++ b/net/irda/iriap.c @@ -0,0 +1,1089 @@ +/********************************************************************* + * + * Filename: iriap.c + * Version: 0.8 + * Description: Information Access Protocol (IAP) + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Aug 21 00:02:07 1997 + * Modified at: Sat Dec 25 16:42:42 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. 
+ * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_IRDA_DEBUG +/* FIXME: This one should go in irlmp.c */ +static const char *ias_charset_types[] = { + "CS_ASCII", + "CS_ISO_8859_1", + "CS_ISO_8859_2", + "CS_ISO_8859_3", + "CS_ISO_8859_4", + "CS_ISO_8859_5", + "CS_ISO_8859_6", + "CS_ISO_8859_7", + "CS_ISO_8859_8", + "CS_ISO_8859_9", + "CS_UNICODE" +}; +#endif /* CONFIG_IRDA_DEBUG */ + +static hashbin_t *iriap = NULL; +static void *service_handle; + +static void __iriap_close(struct iriap_cb *self); +static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode); +static void iriap_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *skb); +static void iriap_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); +static void iriap_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, __u8 max_header_size, + struct sk_buff *skb); +static int iriap_data_indication(void *instance, void *sap, + struct sk_buff *skb); + +static void iriap_watchdog_timer_expired(void *data); + +static inline void iriap_start_watchdog_timer(struct iriap_cb *self, + int timeout) +{ + irda_start_timer(&self->watchdog_timer, timeout, self, + iriap_watchdog_timer_expired); +} + +/* + * Function iriap_init (void) + * + * Initializes the IrIAP layer, called by the module initialization code + * in irmod.c + */ +int __init iriap_init(void) +{ + struct ias_object *obj; + struct iriap_cb *server; + __u8 oct_seq[6]; + __u16 hints; + + /* Allocate master array */ + iriap = hashbin_new(HB_LOCK); + if (!iriap) + return -ENOMEM; + + /* Object repository - defined in irias_object.c */ + irias_objects = hashbin_new(HB_LOCK); + if (!irias_objects) { + IRDA_WARNING("%s: Can't allocate irias_objects hashbin!\n", + __FUNCTION__); + hashbin_delete(iriap, NULL); + return -ENOMEM; + } + + /* + * Register some default services for IrLMP + */ + hints = irlmp_service_to_hint(S_COMPUTER); + service_handle = irlmp_register_service(hints); + + /* Register the Device object with LM-IAS */ + obj = irias_new_object("Device", IAS_DEVICE_ID); + irias_add_string_attrib(obj, "DeviceName", "Linux", IAS_KERNEL_ATTR); + + oct_seq[0] = 0x01; /* Version 1 */ + oct_seq[1] = 0x00; /* IAS support bits */ + oct_seq[2] = 0x00; /* LM-MUX support bits */ +#ifdef CONFIG_IRDA_ULTRA + oct_seq[2] |= 0x04; /* Connectionless Data support */ +#endif + irias_add_octseq_attrib(obj, "IrLMPSupport", oct_seq, 3, + IAS_KERNEL_ATTR); + irias_insert_object(obj); + + /* + * Register server support with IrLMP so we can accept incoming + * connections + */ + server = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL); + if (!server) { + IRDA_DEBUG(0, "%s(), unable to open server\n", __FUNCTION__); + return -1; + } + iriap_register_lsap(server, LSAP_IAS, 
IAS_SERVER); + + return 0; +} + +/* + * Function iriap_cleanup (void) + * + * Initializes the IrIAP layer, called by the module cleanup code in + * irmod.c + */ +void __exit iriap_cleanup(void) +{ + irlmp_unregister_service(service_handle); + + hashbin_delete(iriap, (FREE_FUNC) __iriap_close); + hashbin_delete(irias_objects, (FREE_FUNC) __irias_delete_object); +} + +/* + * Function iriap_open (void) + * + * Opens an instance of the IrIAP layer, and registers with IrLMP + */ +struct iriap_cb *iriap_open(__u8 slsap_sel, int mode, void *priv, + CONFIRM_CALLBACK callback) +{ + struct iriap_cb *self; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + self = kmalloc(sizeof(struct iriap_cb), GFP_ATOMIC); + if (!self) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + + /* + * Initialize instance + */ + memset(self, 0, sizeof(struct iriap_cb)); + + self->magic = IAS_MAGIC; + self->mode = mode; + if (mode == IAS_CLIENT) + iriap_register_lsap(self, slsap_sel, mode); + + self->confirm = callback; + self->priv = priv; + + /* iriap_getvaluebyclass_request() will construct packets before + * we connect, so this must have a sane value... Jean II */ + self->max_header_size = LMP_MAX_HEADER; + + init_timer(&self->watchdog_timer); + + hashbin_insert(iriap, (irda_queue_t *) self, (long) self, NULL); + + /* Initialize state machines */ + iriap_next_client_state(self, S_DISCONNECT); + iriap_next_call_state(self, S_MAKE_CALL); + iriap_next_server_state(self, R_DISCONNECT); + iriap_next_r_connect_state(self, R_WAITING); + + return self; +} +EXPORT_SYMBOL(iriap_open); + +/* + * Function __iriap_close (self) + * + * Removes (deallocates) the IrIAP instance + * + */ +static void __iriap_close(struct iriap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + del_timer(&self->watchdog_timer); + + if (self->request_skb) + dev_kfree_skb(self->request_skb); + + self->magic = 0; + + kfree(self); +} + +/* + * Function iriap_close (void) + * + * Closes IrIAP and deregisters with IrLMP + */ +void iriap_close(struct iriap_cb *self) +{ + struct iriap_cb *entry; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } + + entry = (struct iriap_cb *) hashbin_remove(iriap, (long) self, NULL); + IRDA_ASSERT(entry == self, return;); + + __iriap_close(self); +} +EXPORT_SYMBOL(iriap_close); + +static int iriap_register_lsap(struct iriap_cb *self, __u8 slsap_sel, int mode) +{ + notify_t notify; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + irda_notify_init(¬ify); + notify.connect_confirm = iriap_connect_confirm; + notify.connect_indication = iriap_connect_indication; + notify.disconnect_indication = iriap_disconnect_indication; + notify.data_indication = iriap_data_indication; + notify.instance = self; + if (mode == IAS_CLIENT) + strcpy(notify.name, "IrIAS cli"); + else + strcpy(notify.name, "IrIAS srv"); + + self->lsap = irlmp_open_lsap(slsap_sel, ¬ify, 0); + if (self->lsap == NULL) { + IRDA_ERROR("%s: Unable to allocated LSAP!\n", __FUNCTION__); + return -1; + } + self->slsap_sel = self->lsap->slsap_sel; + + return 0; +} + +/* + * Function iriap_disconnect_indication (handle, reason) + * + * Got disconnect, so clean up everything associated with this connection + * + */ +static void iriap_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + 
struct sk_buff *skb) +{ + struct iriap_cb *self; + + IRDA_DEBUG(4, "%s(), reason=%s\n", __FUNCTION__, irlmp_reasons[reason]); + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + IRDA_ASSERT(iriap != NULL, return;); + + del_timer(&self->watchdog_timer); + + /* Not needed */ + if (skb) + dev_kfree_skb(skb); + + if (self->mode == IAS_CLIENT) { + IRDA_DEBUG(4, "%s(), disconnect as client\n", __FUNCTION__); + + + iriap_do_client_event(self, IAP_LM_DISCONNECT_INDICATION, + NULL); + /* + * Inform service user that the request failed by sending + * it a NULL value. Warning, the client might close us, so + * remember no to use self anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_DISCONNECT, 0, NULL, self->priv); + } else { + IRDA_DEBUG(4, "%s(), disconnect as server\n", __FUNCTION__); + iriap_do_server_event(self, IAP_LM_DISCONNECT_INDICATION, + NULL); + iriap_close(self); + } +} + +/* + * Function iriap_disconnect_request (handle) + */ +static void iriap_disconnect_request(struct iriap_cb *self) +{ + struct sk_buff *tx_skb; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + tx_skb = dev_alloc_skb(64); + if (tx_skb == NULL) { + IRDA_DEBUG(0, "%s(), Could not allocate an sk_buff of length %d\n", + __FUNCTION__, 64); + return; + } + + /* + * Reserve space for MUX control and LAP header + */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + + irlmp_disconnect_request(self->lsap, tx_skb); +} + +/* + * Function iriap_getvaluebyclass (addr, name, attr) + * + * Retreive all values from attribute in all objects with given class + * name + */ +int iriap_getvaluebyclass_request(struct iriap_cb *self, + __u32 saddr, __u32 daddr, + char *name, char *attr) +{ + struct sk_buff *tx_skb; + int name_len, attr_len, skb_len; + __u8 *frame; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return -1;); + + /* Client must supply the destination device address */ + if (!daddr) + return -1; + + self->daddr = daddr; + self->saddr = saddr; + + /* + * Save operation, so we know what the later indication is about + */ + self->operation = GET_VALUE_BY_CLASS; + + /* Give ourselves 10 secs to finish this operation */ + iriap_start_watchdog_timer(self, 10*HZ); + + name_len = strlen(name); /* Up to IAS_MAX_CLASSNAME = 60 */ + attr_len = strlen(attr); /* Up to IAS_MAX_ATTRIBNAME = 60 */ + + skb_len = self->max_header_size+2+name_len+1+attr_len+4; + tx_skb = dev_alloc_skb(skb_len); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + skb_put(tx_skb, 3+name_len+attr_len); + frame = tx_skb->data; + + /* Build frame */ + frame[0] = IAP_LST | GET_VALUE_BY_CLASS; + frame[1] = name_len; /* Insert length of name */ + memcpy(frame+2, name, name_len); /* Insert name */ + frame[2+name_len] = attr_len; /* Insert length of attr */ + memcpy(frame+3+name_len, attr, attr_len); /* Insert attr */ + + iriap_do_client_event(self, IAP_CALL_REQUEST_GVBC, tx_skb); + + /* Drop reference count - see state_s_disconnect(). */ + dev_kfree_skb(tx_skb); + + return 0; +} +EXPORT_SYMBOL(iriap_getvaluebyclass_request); + +/* + * Function iriap_getvaluebyclass_confirm (self, skb) + * + * Got result from GetValueByClass command. Parse it and return result + * to service user. 
+ * + */ +static void iriap_getvaluebyclass_confirm(struct iriap_cb *self, + struct sk_buff *skb) +{ + struct ias_value *value; + int charset; + __u32 value_len; + __u32 tmp_cpu32; + __u16 obj_id; + __u16 len; + __u8 type; + __u8 *fp; + int n; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Initialize variables */ + fp = skb->data; + n = 2; + + /* Get length, MSB first */ + len = be16_to_cpu(get_unaligned((__u16 *)(fp+n))); n += 2; + + IRDA_DEBUG(4, "%s(), len=%d\n", __FUNCTION__, len); + + /* Get object ID, MSB first */ + obj_id = be16_to_cpu(get_unaligned((__u16 *)(fp+n))); n += 2; + + type = fp[n++]; + IRDA_DEBUG(4, "%s(), Value type = %d\n", __FUNCTION__, type); + + switch (type) { + case IAS_INTEGER: + memcpy(&tmp_cpu32, fp+n, 4); n += 4; + be32_to_cpus(&tmp_cpu32); + value = irias_new_integer_value(tmp_cpu32); + + /* Legal values restricted to 0x01-0x6f, page 15 irttp */ + IRDA_DEBUG(4, "%s(), lsap=%d\n", __FUNCTION__, value->t.integer); + break; + case IAS_STRING: + charset = fp[n++]; + + switch (charset) { + case CS_ASCII: + break; +/* case CS_ISO_8859_1: */ +/* case CS_ISO_8859_2: */ +/* case CS_ISO_8859_3: */ +/* case CS_ISO_8859_4: */ +/* case CS_ISO_8859_5: */ +/* case CS_ISO_8859_6: */ +/* case CS_ISO_8859_7: */ +/* case CS_ISO_8859_8: */ +/* case CS_ISO_8859_9: */ +/* case CS_UNICODE: */ + default: + IRDA_DEBUG(0, "%s(), charset %s, not supported\n", + __FUNCTION__, ias_charset_types[charset]); + + /* Aborting, close connection! */ + iriap_disconnect_request(self); + return; + /* break; */ + } + value_len = fp[n++]; + IRDA_DEBUG(4, "%s(), strlen=%d\n", __FUNCTION__, value_len); + + /* Make sure the string is null-terminated */ + fp[n+value_len] = 0x00; + IRDA_DEBUG(4, "Got string %s\n", fp+n); + + /* Will truncate to IAS_MAX_STRING bytes */ + value = irias_new_string_value(fp+n); + break; + case IAS_OCT_SEQ: + value_len = be16_to_cpu(get_unaligned((__u16 *)(fp+n))); + n += 2; + + /* Will truncate to IAS_MAX_OCTET_STRING bytes */ + value = irias_new_octseq_value(fp+n, value_len); + break; + default: + value = irias_new_missing_value(); + break; + } + + /* Finished, close connection! */ + iriap_disconnect_request(self); + + /* Warning, the client might close us, so remember no to use self + * anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_SUCCESS, obj_id, value, self->priv); + else { + IRDA_DEBUG(0, "%s(), missing handler!\n", __FUNCTION__); + irias_delete_value(value); + } +} + +/* + * Function iriap_getvaluebyclass_response () + * + * Send answer back to remote LM-IAS + * + */ +static void iriap_getvaluebyclass_response(struct iriap_cb *self, + __u16 obj_id, + __u8 ret_code, + struct ias_value *value) +{ + struct sk_buff *tx_skb; + int n; + __u32 tmp_be32, tmp_be16; + __u8 *fp; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(value != NULL, return;); + IRDA_ASSERT(value->len <= 1024, return;); + + /* Initialize variables */ + n = 0; + + /* + * We must adjust the size of the response after the length of the + * value. We add 32 bytes because of the 6 bytes for the frame and + * max 5 bytes for the value coding. 
+ */ + tx_skb = dev_alloc_skb(value->len + self->max_header_size + 32); + if (!tx_skb) + return; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + skb_put(tx_skb, 6); + + fp = tx_skb->data; + + /* Build frame */ + fp[n++] = GET_VALUE_BY_CLASS | IAP_LST; + fp[n++] = ret_code; + + /* Insert list length (MSB first) */ + tmp_be16 = __constant_htons(0x0001); + memcpy(fp+n, &tmp_be16, 2); n += 2; + + /* Insert object identifier ( MSB first) */ + tmp_be16 = cpu_to_be16(obj_id); + memcpy(fp+n, &tmp_be16, 2); n += 2; + + switch (value->type) { + case IAS_STRING: + skb_put(tx_skb, 3 + value->len); + fp[n++] = value->type; + fp[n++] = 0; /* ASCII */ + fp[n++] = (__u8) value->len; + memcpy(fp+n, value->t.string, value->len); n+=value->len; + break; + case IAS_INTEGER: + skb_put(tx_skb, 5); + fp[n++] = value->type; + + tmp_be32 = cpu_to_be32(value->t.integer); + memcpy(fp+n, &tmp_be32, 4); n += 4; + break; + case IAS_OCT_SEQ: + skb_put(tx_skb, 3 + value->len); + fp[n++] = value->type; + + tmp_be16 = cpu_to_be16(value->len); + memcpy(fp+n, &tmp_be16, 2); n += 2; + memcpy(fp+n, value->t.oct_seq, value->len); n+=value->len; + break; + case IAS_MISSING: + IRDA_DEBUG( 3, "%s: sending IAS_MISSING\n", __FUNCTION__); + skb_put(tx_skb, 1); + fp[n++] = value->type; + break; + default: + IRDA_DEBUG(0, "%s(), type not implemented!\n", __FUNCTION__); + break; + } + iriap_do_r_connect_event(self, IAP_CALL_RESPONSE, tx_skb); + + /* Drop reference count - see state_r_execute(). */ + dev_kfree_skb(tx_skb); +} + +/* + * Function iriap_getvaluebyclass_indication (self, skb) + * + * getvaluebyclass is requested from peer LM-IAS + * + */ +static void iriap_getvaluebyclass_indication(struct iriap_cb *self, + struct sk_buff *skb) +{ + struct ias_object *obj; + struct ias_attrib *attrib; + int name_len; + int attr_len; + char name[IAS_MAX_CLASSNAME + 1]; /* 60 bytes */ + char attr[IAS_MAX_ATTRIBNAME + 1]; /* 60 bytes */ + __u8 *fp; + int n; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + fp = skb->data; + n = 1; + + name_len = fp[n++]; + memcpy(name, fp+n, name_len); n+=name_len; + name[name_len] = '\0'; + + attr_len = fp[n++]; + memcpy(attr, fp+n, attr_len); n+=attr_len; + attr[attr_len] = '\0'; + + IRDA_DEBUG(4, "LM-IAS: Looking up %s: %s\n", name, attr); + obj = irias_find_object(name); + + if (obj == NULL) { + IRDA_DEBUG(2, "LM-IAS: Object %s not found\n", name); + iriap_getvaluebyclass_response(self, 0x1235, IAS_CLASS_UNKNOWN, + &irias_missing); + return; + } + IRDA_DEBUG(4, "LM-IAS: found %s, id=%d\n", obj->name, obj->id); + + attrib = irias_find_attrib(obj, attr); + if (attrib == NULL) { + IRDA_DEBUG(2, "LM-IAS: Attribute %s not found\n", attr); + iriap_getvaluebyclass_response(self, obj->id, + IAS_ATTRIB_UNKNOWN, + &irias_missing); + return; + } + + /* We have a match; send the value. 
*/ + iriap_getvaluebyclass_response(self, obj->id, IAS_SUCCESS, + attrib->value); + + return; +} + +/* + * Function iriap_send_ack (void) + * + * Currently not used + * + */ +void iriap_send_ack(struct iriap_cb *self) +{ + struct sk_buff *tx_skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return; + + /* Reserve space for MUX and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + skb_put(tx_skb, 1); + frame = tx_skb->data; + + /* Build frame */ + frame[0] = IAP_LST | IAP_ACK | self->operation; + + irlmp_data_request(self->lsap, tx_skb); +} + +void iriap_connect_request(struct iriap_cb *self) +{ + int ret; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + ret = irlmp_connect_request(self->lsap, LSAP_IAS, + self->saddr, self->daddr, + NULL, NULL); + if (ret < 0) { + IRDA_DEBUG(0, "%s(), connect failed!\n", __FUNCTION__); + self->confirm(IAS_DISCONNECT, 0, NULL, self->priv); + } +} + +/* + * Function iriap_connect_confirm (handle, skb) + * + * LSAP connection confirmed! + * + */ +static void iriap_connect_confirm(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct iriap_cb *self; + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + self->max_data_size = max_seg_size; + self->max_header_size = max_header_size; + + del_timer(&self->watchdog_timer); + + iriap_do_client_event(self, IAP_LM_CONNECT_CONFIRM, skb); + + /* Drop reference count - see state_s_make_call(). */ + dev_kfree_skb(skb); +} + +/* + * Function iriap_connect_indication ( handle, skb) + * + * Remote LM-IAS is requesting connection + * + */ +static void iriap_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct iriap_cb *self, *new; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self != NULL, goto out;); + IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;); + + /* Start new server */ + new = iriap_open(LSAP_IAS, IAS_SERVER, NULL, NULL); + if (!new) { + IRDA_DEBUG(0, "%s(), open failed\n", __FUNCTION__); + goto out; + } + + /* Now attach up the new "socket" */ + new->lsap = irlmp_dup(self->lsap, new); + if (!new->lsap) { + IRDA_DEBUG(0, "%s(), dup failed!\n", __FUNCTION__); + goto out; + } + + new->max_data_size = max_seg_size; + new->max_header_size = max_header_size; + + /* Clean up the original one to keep it in listen state */ + irlmp_listen(self->lsap); + + iriap_do_server_event(new, IAP_LM_CONNECT_INDICATION, skb); + +out: + /* Drop reference count - see state_r_disconnect(). 
*/ + dev_kfree_skb(skb); +} + +/* + * Function iriap_data_indication (handle, skb) + * + * Receives data from connection identified by handle from IrLMP + * + */ +static int iriap_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct iriap_cb *self; + __u8 *frame; + __u8 opcode; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + self = (struct iriap_cb *) instance; + + IRDA_ASSERT(skb != NULL, return 0;); + IRDA_ASSERT(self != NULL, goto out;); + IRDA_ASSERT(self->magic == IAS_MAGIC, goto out;); + + frame = skb->data; + + if (self->mode == IAS_SERVER) { + /* Call server */ + IRDA_DEBUG(4, "%s(), Calling server!\n", __FUNCTION__); + iriap_do_r_connect_event(self, IAP_RECV_F_LST, skb); + goto out; + } + opcode = frame[0]; + if (~opcode & IAP_LST) { + IRDA_WARNING("%s:, IrIAS multiframe commands or " + "results is not implemented yet!\n", + __FUNCTION__); + goto out; + } + + /* Check for ack frames since they don't contain any data */ + if (opcode & IAP_ACK) { + IRDA_DEBUG(0, "%s() Got ack frame!\n", __FUNCTION__); + goto out; + } + + opcode &= ~IAP_LST; /* Mask away LST bit */ + + switch (opcode) { + case GET_INFO_BASE: + IRDA_DEBUG(0, "IrLMP GetInfoBaseDetails not implemented!\n"); + break; + case GET_VALUE_BY_CLASS: + iriap_do_call_event(self, IAP_RECV_F_LST, NULL); + + switch (frame[1]) { + case IAS_SUCCESS: + iriap_getvaluebyclass_confirm(self, skb); + break; + case IAS_CLASS_UNKNOWN: + IRDA_DEBUG(1, "%s(), No such class!\n", __FUNCTION__); + /* Finished, close connection! */ + iriap_disconnect_request(self); + + /* + * Warning, the client might close us, so remember + * no to use self anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_CLASS_UNKNOWN, 0, NULL, + self->priv); + break; + case IAS_ATTRIB_UNKNOWN: + IRDA_DEBUG(1, "%s(), No such attribute!\n", __FUNCTION__); + /* Finished, close connection! */ + iriap_disconnect_request(self); + + /* + * Warning, the client might close us, so remember + * no to use self anymore after calling confirm + */ + if (self->confirm) + self->confirm(IAS_ATTRIB_UNKNOWN, 0, NULL, + self->priv); + break; + } + break; + default: + IRDA_DEBUG(0, "%s(), Unknown op-code: %02x\n", __FUNCTION__, + opcode); + break; + } + +out: + /* Cleanup - sub-calls will have done skb_get() as needed. 
*/ + dev_kfree_skb(skb); + return 0; +} + +/* + * Function iriap_call_indication (self, skb) + * + * Received call to server from peer LM-IAS + * + */ +void iriap_call_indication(struct iriap_cb *self, struct sk_buff *skb) +{ + __u8 *fp; + __u8 opcode; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + fp = skb->data; + + opcode = fp[0]; + if (~opcode & 0x80) { + IRDA_WARNING("%s: IrIAS multiframe commands or results" + "is not implemented yet!\n", __FUNCTION__); + return; + } + opcode &= 0x7f; /* Mask away LST bit */ + + switch (opcode) { + case GET_INFO_BASE: + IRDA_WARNING("%s: GetInfoBaseDetails not implemented yet!\n", + __FUNCTION__); + break; + case GET_VALUE_BY_CLASS: + iriap_getvaluebyclass_indication(self, skb); + break; + } + /* skb will be cleaned up in iriap_data_indication */ +} + +/* + * Function iriap_watchdog_timer_expired (data) + * + * Query has taken too long time, so abort + * + */ +static void iriap_watchdog_timer_expired(void *data) +{ + struct iriap_cb *self = (struct iriap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + /* iriap_close(self); */ +} + +#ifdef CONFIG_PROC_FS + +static const char *ias_value_types[] = { + "IAS_MISSING", + "IAS_INTEGER", + "IAS_OCT_SEQ", + "IAS_STRING" +}; + +static inline struct ias_object *irias_seq_idx(loff_t pos) +{ + struct ias_object *obj; + + for (obj = (struct ias_object *) hashbin_get_first(irias_objects); + obj; obj = (struct ias_object *) hashbin_get_next(irias_objects)) { + if (pos-- == 0) + break; + } + + return obj; +} + +static void *irias_seq_start(struct seq_file *seq, loff_t *pos) +{ + spin_lock_irq(&irias_objects->hb_spinlock); + + return *pos ? irias_seq_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *irias_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) + ? (void *) hashbin_get_first(irias_objects) + : (void *) hashbin_get_next(irias_objects); +} + +static void irias_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irias_objects->hb_spinlock); +} + +static int irias_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "LM-IAS Objects:\n"); + else { + struct ias_object *obj = v; + struct ias_attrib *attrib; + + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -EINVAL;); + + seq_printf(seq, "name: %s, id=%d\n", + obj->name, obj->id); + + /* Careful for priority inversions here ! + * All other uses of attrib spinlock are independent of + * the object spinlock, so we are safe. 
Jean II */ + spin_lock(&obj->attribs->hb_spinlock); + + /* List all attributes for this object */ + for (attrib = (struct ias_attrib *) hashbin_get_first(obj->attribs); + attrib != NULL; + attrib = (struct ias_attrib *) hashbin_get_next(obj->attribs)) { + + IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, + goto outloop; ); + + seq_printf(seq, " - Attribute name: \"%s\", ", + attrib->name); + seq_printf(seq, "value[%s]: ", + ias_value_types[attrib->value->type]); + + switch (attrib->value->type) { + case IAS_INTEGER: + seq_printf(seq, "%d\n", + attrib->value->t.integer); + break; + case IAS_STRING: + seq_printf(seq, "\"%s\"\n", + attrib->value->t.string); + break; + case IAS_OCT_SEQ: + seq_printf(seq, "octet sequence (%d bytes)\n", + attrib->value->len); + break; + case IAS_MISSING: + seq_puts(seq, "missing\n"); + break; + default: + seq_printf(seq, "type %d?\n", + attrib->value->type); + } + seq_putc(seq, '\n'); + + } + IRDA_ASSERT_LABEL(outloop:) + spin_unlock(&obj->attribs->hb_spinlock); + } + + return 0; +} + +static struct seq_operations irias_seq_ops = { + .start = irias_seq_start, + .next = irias_seq_next, + .stop = irias_seq_stop, + .show = irias_seq_show, +}; + +static int irias_seq_open(struct inode *inode, struct file *file) +{ + IRDA_ASSERT( irias_objects != NULL, return -EINVAL;); + + return seq_open(file, &irias_seq_ops); +} + +struct file_operations irias_seq_fops = { + .owner = THIS_MODULE, + .open = irias_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* PROC_FS */ diff --git a/net/irda/iriap_event.c b/net/irda/iriap_event.c new file mode 100644 index 000000000000..a73607450de1 --- /dev/null +++ b/net/irda/iriap_event.c @@ -0,0 +1,502 @@ +/********************************************************************* + * + * Filename: iriap_event.c + * Version: 0.1 + * Description: IAP Finite State Machine + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Aug 21 00:02:07 1997 + * Modified at: Wed Mar 1 11:28:34 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include + +static void state_s_disconnect (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_connecting (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_call (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); + +static void state_s_make_call (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_calling (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_outstanding (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_replying (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_s_wait_active (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); + +static void state_r_disconnect (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_call (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_waiting (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_wait_active (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_receiving (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_execute (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); +static void state_r_returning (struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb); + +static void (*iriap_state[])(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) = { + /* Client FSM */ + state_s_disconnect, + state_s_connecting, + state_s_call, + + /* S-Call FSM */ + state_s_make_call, + state_s_calling, + state_s_outstanding, + state_s_replying, + state_s_wait_for_call, + state_s_wait_active, + + /* Server FSM */ + state_r_disconnect, + state_r_call, + + /* R-Connect FSM */ + state_r_waiting, + state_r_wait_active, + state_r_receiving, + state_r_execute, + state_r_returning, +}; + +void iriap_next_client_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->client_state = state; +} + +void iriap_next_call_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->call_state = state; +} + +void iriap_next_server_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->server_state = state; +} + +void iriap_next_r_connect_state(struct iriap_cb *self, IRIAP_STATE state) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + self->r_connect_state = state; +} + +void iriap_do_client_event(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->client_state]) (self, event, skb); +} + +void iriap_do_call_event(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->call_state]) (self, event, skb); +} + +void iriap_do_server_event(struct iriap_cb 
*self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->server_state]) (self, event, skb); +} + +void iriap_do_r_connect_event(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + (*iriap_state[ self->r_connect_state]) (self, event, skb); +} + + +/* + * Function state_s_disconnect (event, skb) + * + * S-Disconnect, The device has no LSAP connection to a particular + * remote device. + */ +static void state_s_disconnect(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + switch (event) { + case IAP_CALL_REQUEST_GVBC: + iriap_next_client_state(self, S_CONNECTING); + IRDA_ASSERT(self->request_skb == NULL, return;); + /* Don't forget to refcount it - + * see iriap_getvaluebyclass_request(). */ + skb_get(skb); + self->request_skb = skb; + iriap_connect_request(self); + break; + case IAP_LM_DISCONNECT_INDICATION: + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __FUNCTION__, event); + break; + } +} + +/* + * Function state_s_connecting (self, event, skb) + * + * S-Connecting + * + */ +static void state_s_connecting(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + switch (event) { + case IAP_LM_CONNECT_CONFIRM: + /* + * Jump to S-Call FSM + */ + iriap_do_call_event(self, IAP_CALL_REQUEST, skb); + /* iriap_call_request(self, 0,0,0); */ + iriap_next_client_state(self, S_CALL); + break; + case IAP_LM_DISCONNECT_INDICATION: + /* Abort calls */ + iriap_next_call_state(self, S_MAKE_CALL); + iriap_next_client_state(self, S_DISCONNECT); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __FUNCTION__, event); + break; + } +} + +/* + * Function state_s_call (self, event, skb) + * + * S-Call, The device can process calls to a specific remote + * device. 
Whenever the LSAP connection is disconnected, this state + * catches that event and clears up + */ +static void state_s_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + + switch (event) { + case IAP_LM_DISCONNECT_INDICATION: + /* Abort calls */ + iriap_next_call_state(self, S_MAKE_CALL); + iriap_next_client_state(self, S_DISCONNECT); + break; + default: + IRDA_DEBUG(0, "state_s_call: Unknown event %d\n", event); + break; + } +} + +/* + * Function state_s_make_call (event, skb) + * + * S-Make-Call + * + */ +static void state_s_make_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + + IRDA_ASSERT(self != NULL, return;); + + switch (event) { + case IAP_CALL_REQUEST: + /* Already refcounted - see state_s_disconnect() */ + tx_skb = self->request_skb; + self->request_skb = NULL; + + irlmp_data_request(self->lsap, tx_skb); + iriap_next_call_state(self, S_OUTSTANDING); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __FUNCTION__, event); + break; + } +} + +/* + * Function state_s_calling (event, skb) + * + * S-Calling + * + */ +static void state_s_calling(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __FUNCTION__); +} + +/* + * Function state_s_outstanding (event, skb) + * + * S-Outstanding, The device is waiting for a response to a command + * + */ +static void state_s_outstanding(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + + switch (event) { + case IAP_RECV_F_LST: + /*iriap_send_ack(self);*/ + /*LM_Idle_request(idle); */ + + iriap_next_call_state(self, S_WAIT_FOR_CALL); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %d\n", __FUNCTION__, event); + break; + } +} + +/* + * Function state_s_replying (event, skb) + * + * S-Replying, The device is collecting a multiple part response + */ +static void state_s_replying(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __FUNCTION__); +} + +/* + * Function state_s_wait_for_call (event, skb) + * + * S-Wait-for-Call + * + */ +static void state_s_wait_for_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __FUNCTION__); +} + + +/* + * Function state_s_wait_active (event, skb) + * + * S-Wait-Active + * + */ +static void state_s_wait_active(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __FUNCTION__); +} + +/************************************************************************** + * + * Server FSM + * + **************************************************************************/ + +/* + * Function state_r_disconnect (self, event, skb) + * + * LM-IAS server is disconnected (not processing any requests!) 
+ * + */ +static void state_r_disconnect(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + + switch (event) { + case IAP_LM_CONNECT_INDICATION: + tx_skb = dev_alloc_skb(64); + if (tx_skb == NULL) { + IRDA_WARNING("%s: unable to malloc!\n", __FUNCTION__); + return; + } + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(tx_skb, LMP_MAX_HEADER); + + irlmp_connect_response(self->lsap, tx_skb); + /*LM_Idle_request(idle); */ + + iriap_next_server_state(self, R_CALL); + + /* + * Jump to R-Connect FSM, we skip R-Waiting since we do not + * care about LM_Idle_request()! + */ + iriap_next_r_connect_state(self, R_RECEIVING); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event %d\n", __FUNCTION__, event); + break; + } +} + +/* + * Function state_r_call (self, event, skb) + */ +static void state_r_call(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + switch (event) { + case IAP_LM_DISCONNECT_INDICATION: + /* Abort call */ + iriap_next_server_state(self, R_DISCONNECT); + iriap_next_r_connect_state(self, R_WAITING); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event!\n", __FUNCTION__); + break; + } +} + +/* + * R-Connect FSM + */ + +/* + * Function state_r_waiting (self, event, skb) + */ +static void state_r_waiting(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __FUNCTION__); +} + +static void state_r_wait_active(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), Not implemented\n", __FUNCTION__); +} + +/* + * Function state_r_receiving (self, event, skb) + * + * We are receiving a command + * + */ +static void state_r_receiving(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + switch (event) { + case IAP_RECV_F_LST: + iriap_next_r_connect_state(self, R_EXECUTE); + + iriap_call_indication(self, skb); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event!\n", __FUNCTION__); + break; + } +} + +/* + * Function state_r_execute (self, event, skb) + * + * The server is processing the request + * + */ +static void state_r_execute(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IAS_MAGIC, return;); + + switch (event) { + case IAP_CALL_RESPONSE: + /* + * Since we don't implement the Waiting state, we return + * to state Receiving instead, DB. + */ + iriap_next_r_connect_state(self, R_RECEIVING); + + /* Don't forget to refcount it - see + * iriap_getvaluebyclass_response(). */ + skb_get(skb); + + irlmp_data_request(self->lsap, skb); + break; + default: + IRDA_DEBUG(0, "%s(), unknown event!\n", __FUNCTION__); + break; + } +} + +static void state_r_returning(struct iriap_cb *self, IRIAP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(0, "%s(), event=%d\n", __FUNCTION__, event); + + switch (event) { + case IAP_RECV_F_LST: + break; + default: + break; + } +} diff --git a/net/irda/irias_object.c b/net/irda/irias_object.c new file mode 100644 index 000000000000..6fec428b4512 --- /dev/null +++ b/net/irda/irias_object.c @@ -0,0 +1,580 @@ +/********************************************************************* + * + * Filename: irias_object.c + * Version: 0.3 + * Description: IAS object database and functions + * Status: Experimental. 
+ * Author: Dag Brattli + * Created at: Thu Oct 1 22:50:04 1998 + * Modified at: Wed Dec 15 11:23:16 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include + +#include +#include + +hashbin_t *irias_objects; + +/* + * Used when a missing value needs to be returned + */ +struct ias_value irias_missing = { IAS_MISSING, 0, 0, 0, {0}}; + +/* + * Function strndup (str, max) + * + * My own kernel version of strndup! + * + * Faster, check boundary... Jean II + */ +static char *strndup(char *str, int max) +{ + char *new_str; + int len; + + /* Check string */ + if (str == NULL) + return NULL; + /* Check length, truncate */ + len = strlen(str); + if(len > max) + len = max; + + /* Allocate new string */ + new_str = kmalloc(len + 1, GFP_ATOMIC); + if (new_str == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + + /* Copy and truncate */ + memcpy(new_str, str, len); + new_str[len] = '\0'; + + return new_str; +} + +/* + * Function ias_new_object (name, id) + * + * Create a new IAS object + * + */ +struct ias_object *irias_new_object( char *name, int id) +{ + struct ias_object *obj; + + IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); + + obj = (struct ias_object *) kmalloc(sizeof(struct ias_object), + GFP_ATOMIC); + if (obj == NULL) { + IRDA_WARNING("%s(), Unable to allocate object!\n", + __FUNCTION__); + return NULL; + } + memset(obj, 0, sizeof( struct ias_object)); + + obj->magic = IAS_OBJECT_MAGIC; + obj->name = strndup(name, IAS_MAX_CLASSNAME); + obj->id = id; + + /* Locking notes : the attrib spinlock has lower precendence + * than the objects spinlock. Never grap the objects spinlock + * while holding any attrib spinlock (risk of deadlock). 
Jean II */ + obj->attribs = hashbin_new(HB_LOCK); + + if (obj->attribs == NULL) { + IRDA_WARNING("%s(), Unable to allocate attribs!\n", + __FUNCTION__); + kfree(obj); + return NULL; + } + + return obj; +} +EXPORT_SYMBOL(irias_new_object); + +/* + * Function irias_delete_attrib (attrib) + * + * Delete given attribute and deallocate all its memory + * + */ +static void __irias_delete_attrib(struct ias_attrib *attrib) +{ + IRDA_ASSERT(attrib != NULL, return;); + IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); + + if (attrib->name) + kfree(attrib->name); + + irias_delete_value(attrib->value); + attrib->magic = ~IAS_ATTRIB_MAGIC; + + kfree(attrib); +} + +void __irias_delete_object(struct ias_object *obj) +{ + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + if (obj->name) + kfree(obj->name); + + hashbin_delete(obj->attribs, (FREE_FUNC) __irias_delete_attrib); + + obj->magic = ~IAS_OBJECT_MAGIC; + + kfree(obj); +} + +/* + * Function irias_delete_object (obj) + * + * Remove object from hashbin and deallocate all attributes associated with + * with this object and the object itself + * + */ +int irias_delete_object(struct ias_object *obj) +{ + struct ias_object *node; + + IRDA_ASSERT(obj != NULL, return -1;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;); + + /* Remove from list */ + node = hashbin_remove_this(irias_objects, (irda_queue_t *) obj); + if (!node) + IRDA_DEBUG( 0, "%s(), object already removed!\n", + __FUNCTION__); + + /* Destroy */ + __irias_delete_object(obj); + + return 0; +} +EXPORT_SYMBOL(irias_delete_object); + +/* + * Function irias_delete_attrib (obj) + * + * Remove attribute from hashbin and, if it was the last attribute of + * the object, remove the object as well. + * + */ +int irias_delete_attrib(struct ias_object *obj, struct ias_attrib *attrib, + int cleanobject) +{ + struct ias_attrib *node; + + IRDA_ASSERT(obj != NULL, return -1;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return -1;); + IRDA_ASSERT(attrib != NULL, return -1;); + + /* Remove attribute from object */ + node = hashbin_remove_this(obj->attribs, (irda_queue_t *) attrib); + if (!node) + return 0; /* Already removed or non-existent */ + + /* Deallocate attribute */ + __irias_delete_attrib(node); + + /* Check if object has still some attributes, destroy it if none. + * At first glance, this look dangerous, as the kernel reference + * various IAS objects. However, we only use this function on + * user attributes, not kernel attributes, so there is no risk + * of deleting a kernel object this way. 
Jean II */ + node = (struct ias_attrib *) hashbin_get_first(obj->attribs); + if (cleanobject && !node) + irias_delete_object(obj); + + return 0; +} + +/* + * Function irias_insert_object (obj) + * + * Insert an object into the LM-IAS database + * + */ +void irias_insert_object(struct ias_object *obj) +{ + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + hashbin_insert(irias_objects, (irda_queue_t *) obj, 0, obj->name); +} +EXPORT_SYMBOL(irias_insert_object); + +/* + * Function irias_find_object (name) + * + * Find object with given name + * + */ +struct ias_object *irias_find_object(char *name) +{ + IRDA_ASSERT(name != NULL, return NULL;); + + /* Unsafe (locking), object might change */ + return hashbin_lock_find(irias_objects, 0, name); +} +EXPORT_SYMBOL(irias_find_object); + +/* + * Function irias_find_attrib (obj, name) + * + * Find named attribute in object + * + */ +struct ias_attrib *irias_find_attrib(struct ias_object *obj, char *name) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return NULL;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return NULL;); + IRDA_ASSERT(name != NULL, return NULL;); + + attrib = hashbin_lock_find(obj->attribs, 0, name); + if (attrib == NULL) + return NULL; + + /* Unsafe (locking), attrib might change */ + return attrib; +} +EXPORT_SYMBOL(irias_find_attrib); + +/* + * Function irias_add_attribute (obj, attrib) + * + * Add attribute to object + * + */ +static void irias_add_attrib(struct ias_object *obj, struct ias_attrib *attrib, + int owner) +{ + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + IRDA_ASSERT(attrib != NULL, return;); + IRDA_ASSERT(attrib->magic == IAS_ATTRIB_MAGIC, return;); + + /* Set if attrib is owned by kernel or user space */ + attrib->value->owner = owner; + + hashbin_insert(obj->attribs, (irda_queue_t *) attrib, 0, attrib->name); +} + +/* + * Function irias_object_change_attribute (obj_name, attrib_name, new_value) + * + * Change the value of an objects attribute. 
+ * + */ +int irias_object_change_attribute(char *obj_name, char *attrib_name, + struct ias_value *new_value) +{ + struct ias_object *obj; + struct ias_attrib *attrib; + unsigned long flags; + + /* Find object */ + obj = hashbin_lock_find(irias_objects, 0, obj_name); + if (obj == NULL) { + IRDA_WARNING("%s: Unable to find object: %s\n", __FUNCTION__, + obj_name); + return -1; + } + + /* Slightly unsafe (obj might get removed under us) */ + spin_lock_irqsave(&obj->attribs->hb_spinlock, flags); + + /* Find attribute */ + attrib = hashbin_find(obj->attribs, 0, attrib_name); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to find attribute: %s\n", + __FUNCTION__, attrib_name); + spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags); + return -1; + } + + if ( attrib->value->type != new_value->type) { + IRDA_DEBUG( 0, "%s(), changing value type not allowed!\n", + __FUNCTION__); + spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags); + return -1; + } + + /* Delete old value */ + irias_delete_value(attrib->value); + + /* Insert new value */ + attrib->value = new_value; + + /* Success */ + spin_unlock_irqrestore(&obj->attribs->hb_spinlock, flags); + return 0; +} +EXPORT_SYMBOL(irias_object_change_attribute); + +/* + * Function irias_object_add_integer_attrib (obj, name, value) + * + * Add an integer attribute to an LM-IAS object + * + */ +void irias_add_integer_attrib(struct ias_object *obj, char *name, int value, + int owner) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + IRDA_ASSERT(name != NULL, return;); + + attrib = (struct ias_attrib *) kmalloc(sizeof(struct ias_attrib), + GFP_ATOMIC); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __FUNCTION__); + return; + } + memset(attrib, 0, sizeof( struct ias_attrib)); + + attrib->magic = IAS_ATTRIB_MAGIC; + attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); + + /* Insert value */ + attrib->value = irias_new_integer_value(value); + + irias_add_attrib(obj, attrib, owner); +} +EXPORT_SYMBOL(irias_add_integer_attrib); + + /* + * Function irias_add_octseq_attrib (obj, name, octet_seq, len) + * + * Add a octet sequence attribute to an LM-IAS object + * + */ + +void irias_add_octseq_attrib(struct ias_object *obj, char *name, __u8 *octets, + int len, int owner) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + IRDA_ASSERT(name != NULL, return;); + IRDA_ASSERT(octets != NULL, return;); + + attrib = (struct ias_attrib *) kmalloc(sizeof(struct ias_attrib), + GFP_ATOMIC); + if (attrib == NULL) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __FUNCTION__); + return; + } + memset(attrib, 0, sizeof( struct ias_attrib)); + + attrib->magic = IAS_ATTRIB_MAGIC; + attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); + + attrib->value = irias_new_octseq_value( octets, len); + + irias_add_attrib(obj, attrib, owner); +} +EXPORT_SYMBOL(irias_add_octseq_attrib); + +/* + * Function irias_object_add_string_attrib (obj, string) + * + * Add a string attribute to an LM-IAS object + * + */ +void irias_add_string_attrib(struct ias_object *obj, char *name, char *value, + int owner) +{ + struct ias_attrib *attrib; + + IRDA_ASSERT(obj != NULL, return;); + IRDA_ASSERT(obj->magic == IAS_OBJECT_MAGIC, return;); + + IRDA_ASSERT(name != NULL, return;); + IRDA_ASSERT(value != NULL, return;); + + attrib = (struct ias_attrib *) kmalloc(sizeof( struct ias_attrib), + GFP_ATOMIC); + if 
(attrib == NULL) { + IRDA_WARNING("%s: Unable to allocate attribute!\n", + __FUNCTION__); + return; + } + memset(attrib, 0, sizeof( struct ias_attrib)); + + attrib->magic = IAS_ATTRIB_MAGIC; + attrib->name = strndup(name, IAS_MAX_ATTRIBNAME); + + attrib->value = irias_new_string_value(value); + + irias_add_attrib(obj, attrib, owner); +} +EXPORT_SYMBOL(irias_add_string_attrib); + +/* + * Function irias_new_integer_value (integer) + * + * Create new IAS integer value + * + */ +struct ias_value *irias_new_integer_value(int integer) +{ + struct ias_value *value; + + value = kmalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + memset(value, 0, sizeof(struct ias_value)); + + value->type = IAS_INTEGER; + value->len = 4; + value->t.integer = integer; + + return value; +} +EXPORT_SYMBOL(irias_new_integer_value); + +/* + * Function irias_new_string_value (string) + * + * Create new IAS string value + * + * Per IrLMP 1.1, 4.3.3.2, strings are up to 256 chars - Jean II + */ +struct ias_value *irias_new_string_value(char *string) +{ + struct ias_value *value; + + value = kmalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + memset( value, 0, sizeof( struct ias_value)); + + value->type = IAS_STRING; + value->charset = CS_ASCII; + value->t.string = strndup(string, IAS_MAX_STRING); + value->len = strlen(value->t.string); + + return value; +} +EXPORT_SYMBOL(irias_new_string_value); + +/* + * Function irias_new_octseq_value (octets, len) + * + * Create new IAS octet-sequence value + * + * Per IrLMP 1.1, 4.3.3.2, octet-sequence are up to 1024 bytes - Jean II + */ +struct ias_value *irias_new_octseq_value(__u8 *octseq , int len) +{ + struct ias_value *value; + + value = kmalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + memset(value, 0, sizeof(struct ias_value)); + + value->type = IAS_OCT_SEQ; + /* Check length */ + if(len > IAS_MAX_OCTET_STRING) + len = IAS_MAX_OCTET_STRING; + value->len = len; + + value->t.oct_seq = kmalloc(len, GFP_ATOMIC); + if (value->t.oct_seq == NULL){ + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + kfree(value); + return NULL; + } + memcpy(value->t.oct_seq, octseq , len); + return value; +} +EXPORT_SYMBOL(irias_new_octseq_value); + +struct ias_value *irias_new_missing_value(void) +{ + struct ias_value *value; + + value = kmalloc(sizeof(struct ias_value), GFP_ATOMIC); + if (value == NULL) { + IRDA_WARNING("%s: Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + memset(value, 0, sizeof(struct ias_value)); + + value->type = IAS_MISSING; + value->len = 0; + + return value; +} + +/* + * Function irias_delete_value (value) + * + * Delete IAS value + * + */ +void irias_delete_value(struct ias_value *value) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(value != NULL, return;); + + switch (value->type) { + case IAS_INTEGER: /* Fallthrough */ + case IAS_MISSING: + /* No need to deallocate */ + break; + case IAS_STRING: + /* If string, deallocate string */ + if (value->t.string != NULL) + kfree(value->t.string); + break; + case IAS_OCT_SEQ: + /* If byte stream, deallocate byte stream */ + if (value->t.oct_seq != NULL) + kfree(value->t.oct_seq); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown value type!\n", __FUNCTION__); + break; + } + kfree(value); +} 
+EXPORT_SYMBOL(irias_delete_value); diff --git a/net/irda/irlan/Kconfig b/net/irda/irlan/Kconfig new file mode 100644 index 000000000000..951abc2e3a7f --- /dev/null +++ b/net/irda/irlan/Kconfig @@ -0,0 +1,14 @@ +config IRLAN + tristate "IrLAN protocol" + depends on IRDA + help + Say Y here if you want to build support for the IrLAN protocol. + To compile it as a module, choose M here: the module will be called + irlan. IrLAN emulates an Ethernet and makes it possible to put up + a wireless LAN using infrared beams. + + The IrLAN protocol can be used to talk with infrared access points + like the HP NetbeamIR, or the ESI JetEye NET. You can also connect + to another Linux machine running the IrLAN protocol for ad-hoc + networking! + diff --git a/net/irda/irlan/Makefile b/net/irda/irlan/Makefile new file mode 100644 index 000000000000..77549bc8641b --- /dev/null +++ b/net/irda/irlan/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux IrDA IrLAN protocol layer. +# + +obj-$(CONFIG_IRLAN) += irlan.o + +irlan-objs := irlan_common.o irlan_eth.o irlan_event.o irlan_client.o irlan_provider.o irlan_filter.o irlan_provider_event.o irlan_client_event.o diff --git a/net/irda/irlan/irlan_client.c b/net/irda/irlan/irlan_client.c new file mode 100644 index 000000000000..f8e6cb0db04b --- /dev/null +++ b/net/irda/irlan/irlan_client.c @@ -0,0 +1,576 @@ +/********************************************************************* + * + * Filename: irlan_client.c + * Version: 0.9 + * Description: IrDA LAN Access Protocol (IrLAN) Client + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Tue Dec 14 15:47:02 1999 + * Modified by: Dag Brattli + * Sources: skeleton.c by Donald Becker + * slip.c by Laurence Culhane, + * Fred N. van Kempen, + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#undef CONFIG_IRLAN_GRATUITOUS_ARP + +static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *); +static int irlan_client_ctrl_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static void irlan_client_ctrl_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *); +static void irlan_check_response_param(struct irlan_cb *self, char *param, + char *value, int val_len); +static void irlan_client_open_ctrl_tsap(struct irlan_cb *self); + +static void irlan_client_kick_timer_expired(void *data) +{ + struct irlan_cb *self = (struct irlan_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* + * If we are in peer mode, the client may not have got the discovery + * indication it needs to make progress. If the client is still in + * IDLE state, we must kick it to, but only if the provider is not IDLE + */ + if ((self->provider.access_type == ACCESS_PEER) && + (self->client.state == IRLAN_IDLE) && + (self->provider.state != IRLAN_IDLE)) { + irlan_client_wakeup(self, self->saddr, self->daddr); + } +} + +static void irlan_client_start_kick_timer(struct irlan_cb *self, int timeout) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + irda_start_timer(&self->client.kick_timer, timeout, (void *) self, + irlan_client_kick_timer_expired); +} + +/* + * Function irlan_client_wakeup (self, saddr, daddr) + * + * Wake up client + * + */ +void irlan_client_wakeup(struct irlan_cb *self, __u32 saddr, __u32 daddr) +{ + IRDA_DEBUG(1, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* + * Check if we are already awake, or if we are a provider in direct + * mode (in that case we must leave the client idle + */ + if ((self->client.state != IRLAN_IDLE) || + (self->provider.access_type == ACCESS_DIRECT)) + { + IRDA_DEBUG(0, "%s(), already awake!\n", __FUNCTION__ ); + return; + } + + /* Addresses may have changed! */ + self->saddr = saddr; + self->daddr = daddr; + + if (self->disconnect_reason == LM_USER_REQUEST) { + IRDA_DEBUG(0, "%s(), still stopped by user\n", __FUNCTION__ ); + return; + } + + /* Open TSAPs */ + irlan_client_open_ctrl_tsap(self); + irlan_open_data_tsap(self); + + irlan_do_client_event(self, IRLAN_DISCOVERY_INDICATION, NULL); + + /* Start kick timer */ + irlan_client_start_kick_timer(self, 2*HZ); +} + +/* + * Function irlan_discovery_indication (daddr) + * + * Remote device with IrLAN server support discovered + * + */ +void irlan_client_discovery_indication(discinfo_t *discovery, + DISCOVERY_MODE mode, + void *priv) +{ + struct irlan_cb *self; + __u32 saddr, daddr; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(discovery != NULL, return;); + + /* + * I didn't check it, but I bet that IrLAN suffer from the same + * deficiency as IrComm and doesn't handle two instances + * simultaneously connecting to each other. + * Same workaround, drop passive discoveries. 
+ * Jean II */ + if(mode == DISCOVERY_PASSIVE) + return; + + saddr = discovery->saddr; + daddr = discovery->daddr; + + /* Find instance */ + rcu_read_lock(); + self = irlan_get_any(); + if (self) { + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + IRDA_DEBUG(1, "%s(), Found instance (%08x)!\n", __FUNCTION__ , + daddr); + + irlan_client_wakeup(self, saddr, daddr); + } + rcu_read_unlock(); +} + +/* + * Function irlan_client_data_indication (handle, skb) + * + * This function gets the data that is received on the control channel + * + */ +static int irlan_client_ctrl_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct irlan_cb *self; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + irlan_do_client_event(self, IRLAN_DATA_INDICATION, skb); + + /* Ready for a new command */ + IRDA_DEBUG(2, "%s(), clearing tx_busy\n", __FUNCTION__ ); + self->client.tx_busy = FALSE; + + /* Check if we have some queued commands waiting to be sent */ + irlan_run_ctrl_tx_queue(self); + + return 0; +} + +static void irlan_client_ctrl_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *userdata) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + struct sk_buff *skb; + + IRDA_DEBUG(4, "%s(), reason=%d\n", __FUNCTION__ , reason); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap != NULL, return;); + IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;); + + IRDA_ASSERT(tsap == self->client.tsap_ctrl, return;); + + /* Remove frames queued on the control channel */ + while ((skb = skb_dequeue(&self->client.txq)) != NULL) { + dev_kfree_skb(skb); + } + self->client.tx_busy = FALSE; + + irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL); +} + +/* + * Function irlan_client_open_tsaps (self) + * + * Initialize callbacks and open IrTTP TSAPs + * + */ +static void irlan_client_open_ctrl_tsap(struct irlan_cb *self) +{ + struct tsap_cb *tsap; + notify_t notify; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Check if already open */ + if (self->client.tsap_ctrl) + return; + + irda_notify_init(¬ify); + + /* Set up callbacks */ + notify.data_indication = irlan_client_ctrl_data_indication; + notify.connect_confirm = irlan_client_ctrl_connect_confirm; + notify.disconnect_indication = irlan_client_ctrl_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrLAN ctrl (c)", sizeof(notify.name)); + + tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, ¬ify); + if (!tsap) { + IRDA_DEBUG(2, "%s(), Got no tsap!\n", __FUNCTION__ ); + return; + } + self->client.tsap_ctrl = tsap; +} + +/* + * Function irlan_client_connect_confirm (handle, skb) + * + * Connection to peer IrLAN laye confirmed + * + */ +static void irlan_client_ctrl_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->client.max_sdu_size = max_sdu_size; + self->client.max_header_size = 
max_header_size; + + /* TODO: we could set the MTU depending on the max_sdu_size */ + + irlan_do_client_event(self, IRLAN_CONNECT_COMPLETE, NULL); +} + +/* + * Function print_ret_code (code) + * + * Print return code of request to peer IrLAN layer. + * + */ +static void print_ret_code(__u8 code) +{ + switch(code) { + case 0: + printk(KERN_INFO "Success\n"); + break; + case 1: + IRDA_WARNING("IrLAN: Insufficient resources\n"); + break; + case 2: + IRDA_WARNING("IrLAN: Invalid command format\n"); + break; + case 3: + IRDA_WARNING("IrLAN: Command not supported\n"); + break; + case 4: + IRDA_WARNING("IrLAN: Parameter not supported\n"); + break; + case 5: + IRDA_WARNING("IrLAN: Value not supported\n"); + break; + case 6: + IRDA_WARNING("IrLAN: Not open\n"); + break; + case 7: + IRDA_WARNING("IrLAN: Authentication required\n"); + break; + case 8: + IRDA_WARNING("IrLAN: Invalid password\n"); + break; + case 9: + IRDA_WARNING("IrLAN: Protocol error\n"); + break; + case 255: + IRDA_WARNING("IrLAN: Asynchronous status\n"); + break; + } +} + +/* + * Function irlan_client_parse_response (self, skb) + * + * Extract all parameters from received buffer, then feed them to + * check_params for parsing + */ +void irlan_client_parse_response(struct irlan_cb *self, struct sk_buff *skb) +{ + __u8 *frame; + __u8 *ptr; + int count; + int ret; + __u16 val_len; + int i; + char *name; + char *value; + + IRDA_ASSERT(skb != NULL, return;); + + IRDA_DEBUG(4, "%s() skb->len=%d\n", __FUNCTION__ , (int) skb->len); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + if (!skb) { + IRDA_ERROR("%s(), Got NULL skb!\n", __FUNCTION__); + return; + } + frame = skb->data; + + /* + * Check return code and print it if not success + */ + if (frame[0]) { + print_ret_code(frame[0]); + return; + } + + name = kmalloc(255, GFP_ATOMIC); + if (!name) + return; + value = kmalloc(1016, GFP_ATOMIC); + if (!value) { + kfree(name); + return; + } + + /* How many parameters? 
*/ + count = frame[1]; + + IRDA_DEBUG(4, "%s(), got %d parameters\n", __FUNCTION__ , count); + + ptr = frame+2; + + /* For all parameters */ + for (i=0; imagic == IRLAN_MAGIC, return;); + + /* Media type */ + if (strcmp(param, "MEDIA") == 0) { + if (strcmp(value, "802.3") == 0) + self->media = MEDIA_802_3; + else + self->media = MEDIA_802_5; + return; + } + if (strcmp(param, "FILTER_TYPE") == 0) { + if (strcmp(value, "DIRECTED") == 0) + self->client.filter_type |= IRLAN_DIRECTED; + else if (strcmp(value, "FUNCTIONAL") == 0) + self->client.filter_type |= IRLAN_FUNCTIONAL; + else if (strcmp(value, "GROUP") == 0) + self->client.filter_type |= IRLAN_GROUP; + else if (strcmp(value, "MAC_FRAME") == 0) + self->client.filter_type |= IRLAN_MAC_FRAME; + else if (strcmp(value, "MULTICAST") == 0) + self->client.filter_type |= IRLAN_MULTICAST; + else if (strcmp(value, "BROADCAST") == 0) + self->client.filter_type |= IRLAN_BROADCAST; + else if (strcmp(value, "IPX_SOCKET") == 0) + self->client.filter_type |= IRLAN_IPX_SOCKET; + + } + if (strcmp(param, "ACCESS_TYPE") == 0) { + if (strcmp(value, "DIRECT") == 0) + self->client.access_type = ACCESS_DIRECT; + else if (strcmp(value, "PEER") == 0) + self->client.access_type = ACCESS_PEER; + else if (strcmp(value, "HOSTED") == 0) + self->client.access_type = ACCESS_HOSTED; + else { + IRDA_DEBUG(2, "%s(), unknown access type!\n", __FUNCTION__ ); + } + } + /* IRLAN version */ + if (strcmp(param, "IRLAN_VER") == 0) { + IRDA_DEBUG(4, "IrLAN version %d.%d\n", (__u8) value[0], + (__u8) value[1]); + + self->version[0] = value[0]; + self->version[1] = value[1]; + return; + } + /* Which remote TSAP to use for data channel */ + if (strcmp(param, "DATA_CHAN") == 0) { + self->dtsap_sel_data = value[0]; + IRDA_DEBUG(4, "Data TSAP = %02x\n", self->dtsap_sel_data); + return; + } + if (strcmp(param, "CON_ARB") == 0) { + memcpy(&tmp_cpu, value, 2); /* Align value */ + le16_to_cpus(&tmp_cpu); /* Convert to host order */ + self->client.recv_arb_val = tmp_cpu; + IRDA_DEBUG(2, "%s(), receive arb val=%d\n", __FUNCTION__ , + self->client.recv_arb_val); + } + if (strcmp(param, "MAX_FRAME") == 0) { + memcpy(&tmp_cpu, value, 2); /* Align value */ + le16_to_cpus(&tmp_cpu); /* Convert to host order */ + self->client.max_frame = tmp_cpu; + IRDA_DEBUG(4, "%s(), max frame=%d\n", __FUNCTION__ , + self->client.max_frame); + } + + /* RECONNECT_KEY, in case the link goes down! */ + if (strcmp(param, "RECONNECT_KEY") == 0) { + IRDA_DEBUG(4, "Got reconnect key: "); + /* for (i = 0; i < val_len; i++) */ +/* printk("%02x", value[i]); */ + memcpy(self->client.reconnect_key, value, val_len); + self->client.key_len = val_len; + IRDA_DEBUG(4, "\n"); + } + /* FILTER_ENTRY, have we got an ethernet address? 
*/ + if (strcmp(param, "FILTER_ENTRY") == 0) { + bytes = value; + IRDA_DEBUG(4, "Ethernet address = %02x:%02x:%02x:%02x:%02x:%02x\n", + bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], + bytes[5]); + for (i = 0; i < 6; i++) + self->dev->dev_addr[i] = bytes[i]; + } +} + +/* + * Function irlan_client_get_value_confirm (obj_id, value) + * + * Got results from remote LM-IAS + * + */ +void irlan_client_get_value_confirm(int result, __u16 obj_id, + struct ias_value *value, void *priv) +{ + struct irlan_cb *self; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(priv != NULL, return;); + + self = (struct irlan_cb *) priv; + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* We probably don't need to make any more queries */ + iriap_close(self->client.iriap); + self->client.iriap = NULL; + + /* Check if request succeeded */ + if (result != IAS_SUCCESS) { + IRDA_DEBUG(2, "%s(), got NULL value!\n", __FUNCTION__ ); + irlan_do_client_event(self, IRLAN_IAS_PROVIDER_NOT_AVAIL, + NULL); + return; + } + + switch (value->type) { + case IAS_INTEGER: + self->dtsap_sel_ctrl = value->t.integer; + + if (value->t.integer != -1) { + irlan_do_client_event(self, IRLAN_IAS_PROVIDER_AVAIL, + NULL); + return; + } + irias_delete_value(value); + break; + default: + IRDA_DEBUG(2, "%s(), unknown type!\n", __FUNCTION__ ); + break; + } + irlan_do_client_event(self, IRLAN_IAS_PROVIDER_NOT_AVAIL, NULL); +} diff --git a/net/irda/irlan/irlan_client_event.c b/net/irda/irlan/irlan_client_event.c new file mode 100644 index 000000000000..ce943b69e996 --- /dev/null +++ b/net/irda/irlan/irlan_client_event.c @@ -0,0 +1,533 @@ +/********************************************************************* + * + * Filename: irlan_client_event.c + * Version: 0.9 + * Description: IrLAN client state machine + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sun Dec 26 21:52:24 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static int irlan_client_state_idle (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_conn (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_info (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_open (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_wait (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_arb (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_data (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_client_state_sync (struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); + +static int (*state[])(struct irlan_cb *, IRLAN_EVENT event, struct sk_buff *) = +{ + irlan_client_state_idle, + irlan_client_state_query, + irlan_client_state_conn, + irlan_client_state_info, + irlan_client_state_media, + irlan_client_state_open, + irlan_client_state_wait, + irlan_client_state_arb, + irlan_client_state_data, + irlan_client_state_close, + irlan_client_state_sync +}; + +void irlan_do_client_event(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + (*state[ self->client.state]) (self, event, skb); +} + +/* + * Function irlan_client_state_idle (event, skb, info) + * + * IDLE, We are waiting for an indication that there is a provider + * available. + */ +static int irlan_client_state_idle(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch (event) { + case IRLAN_DISCOVERY_INDICATION: + if (self->client.iriap) { + IRDA_WARNING("%s(), busy with a previous query\n", + __FUNCTION__); + return -EBUSY; + } + + self->client.iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irlan_client_get_value_confirm); + /* Get some values from peer IAS */ + irlan_next_client_state(self, IRLAN_QUERY); + iriap_getvaluebyclass_request(self->client.iriap, + self->saddr, self->daddr, + "IrLAN", "IrDA:TinyTP:LsapSel"); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(4, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_query (event, skb, info) + * + * QUERY, We have queryed the remote IAS and is ready to connect + * to provider, just waiting for the confirm. 
+ * + */ +static int irlan_client_state_query(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch(event) { + case IRLAN_IAS_PROVIDER_AVAIL: + IRDA_ASSERT(self->dtsap_sel_ctrl != 0, return -1;); + + self->client.open_retries = 0; + + irttp_connect_request(self->client.tsap_ctrl, + self->dtsap_sel_ctrl, + self->saddr, self->daddr, NULL, + IRLAN_MTU, NULL); + irlan_next_client_state(self, IRLAN_CONN); + break; + case IRLAN_IAS_PROVIDER_NOT_AVAIL: + IRDA_DEBUG(2, "%s(), IAS_PROVIDER_NOT_AVAIL\n", __FUNCTION__ ); + irlan_next_client_state(self, IRLAN_IDLE); + + /* Give the client a kick! */ + if ((self->provider.access_type == ACCESS_PEER) && + (self->provider.state != IRLAN_IDLE)) + irlan_client_wakeup(self, self->saddr, self->daddr); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_conn (event, skb, info) + * + * CONN, We have connected to a provider but has not issued any + * commands yet. + * + */ +static int irlan_client_state_conn(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch (event) { + case IRLAN_CONNECT_COMPLETE: + /* Send getinfo cmd */ + irlan_get_provider_info(self); + irlan_next_client_state(self, IRLAN_INFO); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_info (self, event, skb, info) + * + * INFO, We have issued a GetInfo command and is awaiting a reply. + */ +static int irlan_client_state_info(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch (event) { + case IRLAN_DATA_INDICATION: + IRDA_ASSERT(skb != NULL, return -1;); + + irlan_client_parse_response(self, skb); + + irlan_next_client_state(self, IRLAN_MEDIA); + + irlan_get_media_char(self); + break; + + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_media (self, event, skb, info) + * + * MEDIA, The irlan_client has issued a GetMedia command and is awaiting a + * reply. 
+ * + */ +static int irlan_client_state_media(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_DATA_INDICATION: + irlan_client_parse_response(self, skb); + irlan_open_data_channel(self); + irlan_next_client_state(self, IRLAN_OPEN); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_open (self, event, skb, info) + * + * OPEN, The irlan_client has issued a OpenData command and is awaiting a + * reply + * + */ +static int irlan_client_state_open(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + struct qos_info qos; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_DATA_INDICATION: + irlan_client_parse_response(self, skb); + + /* + * Check if we have got the remote TSAP for data + * communications + */ + IRDA_ASSERT(self->dtsap_sel_data != 0, return -1;); + + /* Check which access type we are dealing with */ + switch (self->client.access_type) { + case ACCESS_PEER: + if (self->provider.state == IRLAN_OPEN) { + + irlan_next_client_state(self, IRLAN_ARB); + irlan_do_client_event(self, IRLAN_CHECK_CON_ARB, + NULL); + } else { + + irlan_next_client_state(self, IRLAN_WAIT); + } + break; + case ACCESS_DIRECT: + case ACCESS_HOSTED: + qos.link_disc_time.bits = 0x01; /* 3 secs */ + + irttp_connect_request(self->tsap_data, + self->dtsap_sel_data, + self->saddr, self->daddr, &qos, + IRLAN_MTU, NULL); + + irlan_next_client_state(self, IRLAN_DATA); + break; + default: + IRDA_DEBUG(2, "%s(), unknown access type!\n", __FUNCTION__ ); + break; + } + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_wait (self, event, skb, info) + * + * WAIT, The irlan_client is waiting for the local provider to enter the + * provider OPEN state. 
+ * + */ +static int irlan_client_state_wait(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_PROVIDER_SIGNAL: + irlan_next_client_state(self, IRLAN_ARB); + irlan_do_client_event(self, IRLAN_CHECK_CON_ARB, NULL); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +static int irlan_client_state_arb(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + struct qos_info qos; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_CHECK_CON_ARB: + if (self->client.recv_arb_val == self->provider.send_arb_val) { + irlan_next_client_state(self, IRLAN_CLOSE); + irlan_close_data_channel(self); + } else if (self->client.recv_arb_val < + self->provider.send_arb_val) + { + qos.link_disc_time.bits = 0x01; /* 3 secs */ + + irlan_next_client_state(self, IRLAN_DATA); + irttp_connect_request(self->tsap_data, + self->dtsap_sel_data, + self->saddr, self->daddr, &qos, + IRLAN_MTU, NULL); + } else if (self->client.recv_arb_val > + self->provider.send_arb_val) + { + IRDA_DEBUG(2, "%s(), lost the battle :-(\n", __FUNCTION__ ); + } + break; + case IRLAN_DATA_CONNECT_INDICATION: + irlan_next_client_state(self, IRLAN_DATA); + break; + case IRLAN_LMP_DISCONNECT: + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + case IRLAN_WATCHDOG_TIMEOUT: + IRDA_DEBUG(2, "%s(), IRLAN_WATCHDOG_TIMEOUT\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_data (self, event, skb, info) + * + * DATA, The data channel is connected, allowing data transfers between + * the local and remote machines. 
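/*
 * Illustration (not part of the original patch): in the ARB state above,
 * peer mode is resolved by comparing the arbitration value we received with
 * the one we sent -- equal values close the data channel, a smaller received
 * value means we issue the data connect, a larger one means the peer does
 * ("lost the battle").  A tiny userspace sketch of that three-way decision,
 * with hypothetical names:
 */
#include <stdio.h>

enum arb_outcome { ARB_CLOSE_CHANNEL, ARB_WE_CONNECT, ARB_PEER_CONNECTS };

static enum arb_outcome arb_decide(unsigned short recv_arb_val,
				   unsigned short send_arb_val)
{
	if (recv_arb_val == send_arb_val)
		return ARB_CLOSE_CHANNEL;	/* collision: give up this channel */
	if (recv_arb_val < send_arb_val)
		return ARB_WE_CONNECT;		/* we won: open the data connection */
	return ARB_PEER_CONNECTS;		/* peer won: wait for its connect */
}

int main(void)
{
	printf("%d %d %d\n",
	       arb_decide(5, 5),	/* ARB_CLOSE_CHANNEL */
	       arb_decide(3, 7),	/* ARB_WE_CONNECT */
	       arb_decide(9, 2));	/* ARB_PEER_CONNECTS */
	return 0;
}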
+ * + */ +static int irlan_client_state_data(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch(event) { + case IRLAN_DATA_INDICATION: + irlan_client_parse_response(self, skb); + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_client_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_close (self, event, skb, info) + * + * + * + */ +static int irlan_client_state_close(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_client_state_sync (self, event, skb, info) + * + * + * + */ +static int irlan_client_state_sync(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (skb) + dev_kfree_skb(skb); + + return 0; +} + + + + + + + + + + + + + diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c new file mode 100644 index 000000000000..657d12210578 --- /dev/null +++ b/net/irda/irlan/irlan_common.c @@ -0,0 +1,1200 @@ +/********************************************************************* + * + * Filename: irlan_common.c + * Version: 0.9 + * Description: IrDA LAN Access Protocol Implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sun Dec 26 21:53:10 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +/* + * Send gratuitous ARP when connected to a new AP or not. May be a clever + * thing to do, but for some reason the machine crashes if you use DHCP. So + * lets not use it by default. 
+ */ +#undef CONFIG_IRLAN_SEND_GRATUITOUS_ARP + +/* extern char sysctl_devname[]; */ + +/* + * Master structure + */ +static LIST_HEAD(irlans); + +static void *ckey; +static void *skey; + +/* Module parameters */ +static int eth; /* Use "eth" or "irlan" name for devices */ +static int access = ACCESS_PEER; /* PEER, DIRECT or HOSTED */ + +#ifdef CONFIG_PROC_FS +static const char *irlan_access[] = { + "UNKNOWN", + "DIRECT", + "PEER", + "HOSTED" +}; + +static const char *irlan_media[] = { + "UNKNOWN", + "802.3", + "802.5" +}; + +extern struct proc_dir_entry *proc_irda; + +static int irlan_seq_open(struct inode *inode, struct file *file); + +static struct file_operations irlan_fops = { + .owner = THIS_MODULE, + .open = irlan_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +extern struct proc_dir_entry *proc_irda; +#endif /* CONFIG_PROC_FS */ + +static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr); +static void __irlan_close(struct irlan_cb *self); +static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, + __u8 value_byte, __u16 value_short, + __u8 *value_array, __u16 value_len); +static void irlan_open_unicast_addr(struct irlan_cb *self); +static void irlan_get_unicast_addr(struct irlan_cb *self); +void irlan_close_tsaps(struct irlan_cb *self); + +/* + * Function irlan_init (void) + * + * Initialize IrLAN layer + * + */ +static int __init irlan_init(void) +{ + struct irlan_cb *new; + __u16 hints; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + +#ifdef CONFIG_PROC_FS + { struct proc_dir_entry *proc; + proc = create_proc_entry("irlan", 0, proc_irda); + if (!proc) { + printk(KERN_ERR "irlan_init: can't create /proc entry!\n"); + return -ENODEV; + } + + proc->proc_fops = &irlan_fops; + } +#endif /* CONFIG_PROC_FS */ + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + hints = irlmp_service_to_hint(S_LAN); + + /* Register with IrLMP as a client */ + ckey = irlmp_register_client(hints, &irlan_client_discovery_indication, + NULL, NULL); + + /* Register with IrLMP as a service */ + skey = irlmp_register_service(hints); + + /* Start the master IrLAN instance (the only one for now) */ + new = irlan_open(DEV_ADDR_ANY, DEV_ADDR_ANY); + + /* The master will only open its (listen) control TSAP */ + irlan_provider_open_ctrl_tsap(new); + + /* Do some fast discovery! */ + irlmp_discovery_request(DISCOVERY_DEFAULT_SLOTS); + + return 0; +} + +static void __exit irlan_cleanup(void) +{ + struct irlan_cb *self, *next; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + irlmp_unregister_client(ckey); + irlmp_unregister_service(skey); + +#ifdef CONFIG_PROC_FS + remove_proc_entry("irlan", proc_irda); +#endif /* CONFIG_PROC_FS */ + + /* Cleanup any leftover network devices */ + rtnl_lock(); + list_for_each_entry_safe(self, next, &irlans, dev_list) { + __irlan_close(self); + } + rtnl_unlock(); +} + +/* + * Function irlan_open (void) + * + * Open new instance of a client/provider, we should only register the + * network device if this instance is ment for a particular client/provider + */ +static struct irlan_cb *irlan_open(__u32 saddr, __u32 daddr) +{ + struct net_device *dev; + struct irlan_cb *self; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Create network device with irlan */ + dev = alloc_irlandev(eth ? 
"eth%d" : "irlan%d"); + if (!dev) + return NULL; + + self = dev->priv; + self->dev = dev; + + /* + * Initialize local device structure + */ + self->magic = IRLAN_MAGIC; + self->saddr = saddr; + self->daddr = daddr; + + /* Provider access can only be PEER, DIRECT, or HOSTED */ + self->provider.access_type = access; + if (access == ACCESS_DIRECT) { + /* + * Since we are emulating an IrLAN sever we will have to + * give ourself an ethernet address! + */ + dev->dev_addr[0] = 0x40; + dev->dev_addr[1] = 0x00; + dev->dev_addr[2] = 0x00; + dev->dev_addr[3] = 0x00; + get_random_bytes(dev->dev_addr+4, 1); + get_random_bytes(dev->dev_addr+5, 1); + } + + self->media = MEDIA_802_3; + self->disconnect_reason = LM_USER_REQUEST; + init_timer(&self->watchdog_timer); + init_timer(&self->client.kick_timer); + init_waitqueue_head(&self->open_wait); + + skb_queue_head_init(&self->client.txq); + + irlan_next_client_state(self, IRLAN_IDLE); + irlan_next_provider_state(self, IRLAN_IDLE); + + if (register_netdev(dev)) { + IRDA_DEBUG(2, "%s(), register_netdev() failed!\n", + __FUNCTION__ ); + self = NULL; + free_netdev(dev); + } else { + rtnl_lock(); + list_add_rcu(&self->dev_list, &irlans); + rtnl_unlock(); + } + + return self; +} +/* + * Function __irlan_close (self) + * + * This function closes and deallocates the IrLAN client instances. Be + * aware that other functions which calls client_close() must + * remove self from irlans list first. + */ +static void __irlan_close(struct irlan_cb *self) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + ASSERT_RTNL(); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + del_timer_sync(&self->watchdog_timer); + del_timer_sync(&self->client.kick_timer); + + /* Close all open connections and remove TSAPs */ + irlan_close_tsaps(self); + + if (self->client.iriap) + iriap_close(self->client.iriap); + + /* Remove frames queued on the control channel */ + skb_queue_purge(&self->client.txq); + + /* Unregister and free self via destructor */ + unregister_netdevice(self->dev); +} + +/* Find any instance of irlan, used for client discovery wakeup */ +struct irlan_cb *irlan_get_any(void) +{ + struct irlan_cb *self; + + list_for_each_entry_rcu(self, &irlans, dev_list) { + return self; + } + return NULL; +} + +/* + * Function irlan_connect_indication (instance, sap, qos, max_sdu_size, skb) + * + * Here we receive the connect indication for the data channel + * + */ +static void irlan_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap == self->tsap_data,return;); + + self->max_sdu_size = max_sdu_size; + self->max_header_size = max_header_size; + + IRDA_DEBUG(0, "%s: We are now connected!\n", __FUNCTION__); + + del_timer(&self->watchdog_timer); + + /* If you want to pass the skb to *both* state machines, you will + * need to skb_clone() it, so that you don't free it twice. + * As the state machines don't need it, git rid of it here... 
+ * Jean II */ + if (skb) + dev_kfree_skb(skb); + + irlan_do_provider_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL); + irlan_do_client_event(self, IRLAN_DATA_CONNECT_INDICATION, NULL); + + if (self->provider.access_type == ACCESS_PEER) { + /* + * Data channel is open, so we are now allowed to + * configure the remote filter + */ + irlan_get_unicast_addr(self); + irlan_open_unicast_addr(self); + } + /* Ready to transfer Ethernet frames (at last) */ + netif_start_queue(self->dev); /* Clear reason */ +} + +static void irlan_connect_confirm(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->max_sdu_size = max_sdu_size; + self->max_header_size = max_header_size; + + /* TODO: we could set the MTU depending on the max_sdu_size */ + + IRDA_DEBUG(0, "%s: We are now connected!\n", __FUNCTION__); + del_timer(&self->watchdog_timer); + + /* + * Data channel is open, so we are now allowed to configure the remote + * filter + */ + irlan_get_unicast_addr(self); + irlan_open_unicast_addr(self); + + /* Open broadcast and multicast filter by default */ + irlan_set_broadcast_filter(self, TRUE); + irlan_set_multicast_filter(self, TRUE); + + /* Ready to transfer Ethernet frames */ + netif_start_queue(self->dev); + self->disconnect_reason = 0; /* Clear reason */ +#ifdef CONFIG_IRLAN_SEND_GRATUITOUS_ARP + irlan_eth_send_gratuitous_arp(&self->dev); +#endif + wake_up_interruptible(&self->open_wait); +} + +/* + * Function irlan_client_disconnect_indication (handle) + * + * Callback function for the IrTTP layer. Indicates a disconnection of + * the specified connection (handle) + */ +static void irlan_disconnect_indication(void *instance, + void *sap, LM_REASON reason, + struct sk_buff *userdata) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(0, "%s(), reason=%d\n", __FUNCTION__ , reason); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap != NULL, return;); + IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;); + + IRDA_ASSERT(tsap == self->tsap_data, return;); + + IRDA_DEBUG(2, "IrLAN, data channel disconnected by peer!\n"); + + /* Save reason so we know if we should try to reconnect or not */ + self->disconnect_reason = reason; + + switch (reason) { + case LM_USER_REQUEST: /* User request */ + IRDA_DEBUG(2, "%s(), User requested\n", __FUNCTION__ ); + break; + case LM_LAP_DISCONNECT: /* Unexpected IrLAP disconnect */ + IRDA_DEBUG(2, "%s(), Unexpected IrLAP disconnect\n", __FUNCTION__ ); + break; + case LM_CONNECT_FAILURE: /* Failed to establish IrLAP connection */ + IRDA_DEBUG(2, "%s(), IrLAP connect failed\n", __FUNCTION__ ); + break; + case LM_LAP_RESET: /* IrLAP reset */ + IRDA_DEBUG(2, "%s(), IrLAP reset\n", __FUNCTION__ ); + break; + case LM_INIT_DISCONNECT: + IRDA_DEBUG(2, "%s(), IrLMP connect failed\n", __FUNCTION__ ); + break; + default: + IRDA_ERROR("%s(), Unknown disconnect reason\n", __FUNCTION__); + break; + } + + /* If you want to pass the skb to *both* state machines, you will + * need to skb_clone() it, so that you don't free it twice. + * As the state machines don't need it, git rid of it here... 
+ * Jean II */ + if (userdata) + dev_kfree_skb(userdata); + + irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL); + irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL); + + wake_up_interruptible(&self->open_wait); +} + +void irlan_open_data_tsap(struct irlan_cb *self) +{ + struct tsap_cb *tsap; + notify_t notify; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Check if already open */ + if (self->tsap_data) + return; + + irda_notify_init(¬ify); + + notify.data_indication = irlan_eth_receive; + notify.udata_indication = irlan_eth_receive; + notify.connect_indication = irlan_connect_indication; + notify.connect_confirm = irlan_connect_confirm; + notify.flow_indication = irlan_eth_flow_indication; + notify.disconnect_indication = irlan_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrLAN data", sizeof(notify.name)); + + tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, ¬ify); + if (!tsap) { + IRDA_DEBUG(2, "%s(), Got no tsap!\n", __FUNCTION__ ); + return; + } + self->tsap_data = tsap; + + /* + * This is the data TSAP selector which we will pass to the client + * when the client ask for it. + */ + self->stsap_sel_data = self->tsap_data->stsap_sel; +} + +void irlan_close_tsaps(struct irlan_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Disconnect and close all open TSAP connections */ + if (self->tsap_data) { + irttp_disconnect_request(self->tsap_data, NULL, P_NORMAL); + irttp_close_tsap(self->tsap_data); + self->tsap_data = NULL; + } + if (self->client.tsap_ctrl) { + irttp_disconnect_request(self->client.tsap_ctrl, NULL, + P_NORMAL); + irttp_close_tsap(self->client.tsap_ctrl); + self->client.tsap_ctrl = NULL; + } + if (self->provider.tsap_ctrl) { + irttp_disconnect_request(self->provider.tsap_ctrl, NULL, + P_NORMAL); + irttp_close_tsap(self->provider.tsap_ctrl); + self->provider.tsap_ctrl = NULL; + } + self->disconnect_reason = LM_USER_REQUEST; +} + +/* + * Function irlan_ias_register (self, tsap_sel) + * + * Register with LM-IAS + * + */ +void irlan_ias_register(struct irlan_cb *self, __u8 tsap_sel) +{ + struct ias_object *obj; + struct ias_value *new_value; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* + * Check if object has already been registered by a previous provider. 
+ * If that is the case, we just change the value of the attribute + */ + if (!irias_find_object("IrLAN")) { + obj = irias_new_object("IrLAN", IAS_IRLAN_ID); + irias_add_integer_attrib(obj, "IrDA:TinyTP:LsapSel", tsap_sel, + IAS_KERNEL_ATTR); + irias_insert_object(obj); + } else { + new_value = irias_new_integer_value(tsap_sel); + irias_object_change_attribute("IrLAN", "IrDA:TinyTP:LsapSel", + new_value); + } + + /* Register PnP object only if not registered before */ + if (!irias_find_object("PnP")) { + obj = irias_new_object("PnP", IAS_PNP_ID); +#if 0 + irias_add_string_attrib(obj, "Name", sysctl_devname, + IAS_KERNEL_ATTR); +#else + irias_add_string_attrib(obj, "Name", "Linux", IAS_KERNEL_ATTR); +#endif + irias_add_string_attrib(obj, "DeviceID", "HWP19F0", + IAS_KERNEL_ATTR); + irias_add_integer_attrib(obj, "CompCnt", 1, IAS_KERNEL_ATTR); + if (self->provider.access_type == ACCESS_PEER) + irias_add_string_attrib(obj, "Comp#01", "PNP8389", + IAS_KERNEL_ATTR); + else + irias_add_string_attrib(obj, "Comp#01", "PNP8294", + IAS_KERNEL_ATTR); + + irias_add_string_attrib(obj, "Manufacturer", + "Linux-IrDA Project", IAS_KERNEL_ATTR); + irias_insert_object(obj); + } +} + +/* + * Function irlan_run_ctrl_tx_queue (self) + * + * Try to send the next command in the control transmit queue + * + */ +int irlan_run_ctrl_tx_queue(struct irlan_cb *self) +{ + struct sk_buff *skb; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + if (irda_lock(&self->client.tx_busy) == FALSE) + return -EBUSY; + + skb = skb_dequeue(&self->client.txq); + if (!skb) { + self->client.tx_busy = FALSE; + return 0; + } + + /* Check that it's really possible to send commands */ + if ((self->client.tsap_ctrl == NULL) || + (self->client.state == IRLAN_IDLE)) + { + self->client.tx_busy = FALSE; + dev_kfree_skb(skb); + return -1; + } + IRDA_DEBUG(2, "%s(), sending ...\n", __FUNCTION__ ); + + return irttp_data_request(self->client.tsap_ctrl, skb); +} + +/* + * Function irlan_ctrl_data_request (self, skb) + * + * This function makes sure that commands on the control channel is being + * sent in a command/response fashion + */ +static void irlan_ctrl_data_request(struct irlan_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Queue command */ + skb_queue_tail(&self->client.txq, skb); + + /* Try to send command */ + irlan_run_ctrl_tx_queue(self); +} + +/* + * Function irlan_get_provider_info (self) + * + * Send Get Provider Information command to peer IrLAN layer + * + */ +void irlan_get_provider_info(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(64); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_GET_PROVIDER_INFO; + frame[1] = 0x00; /* Zero parameters */ + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_open_data_channel (self) + * + * Send an Open Data Command to provider + * + */ +void irlan_open_data_channel(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(64); + if (!skb) + return; + + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + /* Build frame */ + 
frame[0] = CMD_OPEN_DATA_CHANNEL; + frame[1] = 0x02; /* Two parameters */ + + irlan_insert_string_param(skb, "MEDIA", "802.3"); + irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT"); + /* irlan_insert_string_param(skb, "MODE", "UNRELIABLE"); */ + +/* self->use_udata = TRUE; */ + + irlan_ctrl_data_request(self, skb); +} + +void irlan_close_data_channel(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Check if the TSAP is still there */ + if (self->client.tsap_ctrl == NULL) + return; + + skb = dev_alloc_skb(64); + if (!skb) + return; + + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + /* Build frame */ + frame[0] = CMD_CLOSE_DATA_CHAN; + frame[1] = 0x01; /* Two parameters */ + + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_open_unicast_addr (self) + * + * Make IrLAN provider accept ethernet frames addressed to the unicast + * address. + * + */ +static void irlan_open_unicast_addr(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(128); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN" , self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED"); + irlan_insert_string_param(skb, "FILTER_MODE", "FILTER"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_set_broadcast_filter (self, status) + * + * Make IrLAN provider accept ethernet frames addressed to the broadcast + * address. Be careful with the use of this one, since there may be a lot + * of broadcast traffic out there. We can still function without this + * one but then _we_ have to initiate all communication with other + * hosts, since ARP request for this host will not be answered. + */ +void irlan_set_broadcast_filter(struct irlan_cb *self, int status) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(128); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST"); + if (status) + irlan_insert_string_param(skb, "FILTER_MODE", "FILTER"); + else + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_set_multicast_filter (self, status) + * + * Make IrLAN provider accept ethernet frames addressed to the multicast + * address. 
+ * + */ +void irlan_set_multicast_filter(struct irlan_cb *self, int status) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(128); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST"); + if (status) + irlan_insert_string_param(skb, "FILTER_MODE", "ALL"); + else + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_get_unicast_addr (self) + * + * Retrieves the unicast address from the IrLAN provider. This address + * will be inserted into the devices structure, so the ethernet layer + * can construct its packets. + * + */ +static void irlan_get_unicast_addr(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(128); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + frame[0] = CMD_FILTER_OPERATION; + frame[1] = 0x03; /* Three parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->dtsap_sel_data); + irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED"); + irlan_insert_string_param(skb, "FILTER_OPERATION", "DYNAMIC"); + + irlan_ctrl_data_request(self, skb); +} + +/* + * Function irlan_get_media_char (self) + * + * + * + */ +void irlan_get_media_char(struct irlan_cb *self) +{ + struct sk_buff *skb; + __u8 *frame; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(64); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->client.max_header_size); + skb_put(skb, 2); + + frame = skb->data; + + /* Build frame */ + frame[0] = CMD_GET_MEDIA_CHAR; + frame[1] = 0x01; /* One parameter */ + + irlan_insert_string_param(skb, "MEDIA", "802.3"); + irlan_ctrl_data_request(self, skb); +} + +/* + * Function insert_byte_param (skb, param, value) + * + * Insert byte parameter into frame + * + */ +int irlan_insert_byte_param(struct sk_buff *skb, char *param, __u8 value) +{ + return __irlan_insert_param(skb, param, IRLAN_BYTE, value, 0, NULL, 0); +} + +int irlan_insert_short_param(struct sk_buff *skb, char *param, __u16 value) +{ + return __irlan_insert_param(skb, param, IRLAN_SHORT, 0, value, NULL, 0); +} + +/* + * Function insert_string (skb, param, value) + * + * Insert string parameter into frame + * + */ +int irlan_insert_string_param(struct sk_buff *skb, char *param, char *string) +{ + int string_len = strlen(string); + + return __irlan_insert_param(skb, param, IRLAN_ARRAY, 0, 0, string, + string_len); +} + +/* + * Function insert_array_param(skb, param, value, len_value) + * + * Insert array parameter into frame + * + */ +int irlan_insert_array_param(struct sk_buff *skb, char *name, __u8 *array, + __u16 array_len) +{ + return __irlan_insert_param(skb, name, IRLAN_ARRAY, 0, 0, array, + array_len); +} + +/* + * Function 
insert_param (skb, param, value, byte) + * + * Insert parameter at end of buffer, structure of a parameter is: + * + * ----------------------------------------------------------------------- + * | Name Length[1] | Param Name[1..255] | Val Length[2] | Value[0..1016]| + * ----------------------------------------------------------------------- + */ +static int __irlan_insert_param(struct sk_buff *skb, char *param, int type, + __u8 value_byte, __u16 value_short, + __u8 *value_array, __u16 value_len) +{ + __u8 *frame; + __u8 param_len; + __u16 tmp_le; /* Temporary value in little endian format */ + int n=0; + + if (skb == NULL) { + IRDA_DEBUG(2, "%s(), Got NULL skb\n", __FUNCTION__ ); + return 0; + } + + param_len = strlen(param); + switch (type) { + case IRLAN_BYTE: + value_len = 1; + break; + case IRLAN_SHORT: + value_len = 2; + break; + case IRLAN_ARRAY: + IRDA_ASSERT(value_array != NULL, return 0;); + IRDA_ASSERT(value_len > 0, return 0;); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown parameter type!\n", __FUNCTION__ ); + return 0; + break; + } + + /* Insert at end of sk-buffer */ + frame = skb->tail; + + /* Make space for data */ + if (skb_tailroom(skb) < (param_len+value_len+3)) { + IRDA_DEBUG(2, "%s(), No more space at end of skb\n", __FUNCTION__ ); + return 0; + } + skb_put(skb, param_len+value_len+3); + + /* Insert parameter length */ + frame[n++] = param_len; + + /* Insert parameter */ + memcpy(frame+n, param, param_len); n += param_len; + + /* Insert value length (2 byte little endian format, LSB first) */ + tmp_le = cpu_to_le16(value_len); + memcpy(frame+n, &tmp_le, 2); n += 2; /* To avoid alignment problems */ + + /* Insert value */ + switch (type) { + case IRLAN_BYTE: + frame[n++] = value_byte; + break; + case IRLAN_SHORT: + tmp_le = cpu_to_le16(value_short); + memcpy(frame+n, &tmp_le, 2); n += 2; + break; + case IRLAN_ARRAY: + memcpy(frame+n, value_array, value_len); n+=value_len; + break; + default: + break; + } + IRDA_ASSERT(n == (param_len+value_len+3), return 0;); + + return param_len+value_len+3; +} + +/* + * Function irlan_extract_param (buf, name, value, len) + * + * Extracts a single parameter name/value pair from buffer and updates + * the buffer pointer to point to the next name/value pair. + */ +int irlan_extract_param(__u8 *buf, char *name, char *value, __u16 *len) +{ + __u8 name_len; + __u16 val_len; + int n=0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + /* get length of parameter name (1 byte) */ + name_len = buf[n++]; + + if (name_len > 254) { + IRDA_DEBUG(2, "%s(), name_len > 254\n", __FUNCTION__ ); + return -RSP_INVALID_COMMAND_FORMAT; + } + + /* get parameter name */ + memcpy(name, buf+n, name_len); + name[name_len] = '\0'; + n+=name_len; + + /* + * Get length of parameter value (2 bytes in little endian + * format) + */ + memcpy(&val_len, buf+n, 2); /* To avoid alignment problems */ + le16_to_cpus(&val_len); n+=2; + + if (val_len > 1016) { + IRDA_DEBUG(2, "%s(), parameter length to long\n", __FUNCTION__ ); + return -RSP_INVALID_COMMAND_FORMAT; + } + *len = val_len; + + /* get parameter value */ + memcpy(value, buf+n, val_len); + value[val_len] = '\0'; + n+=val_len; + + IRDA_DEBUG(4, "Parameter: %s ", name); + IRDA_DEBUG(4, "Value: %s\n", value); + + return n; +} + +#ifdef CONFIG_PROC_FS + +/* + * Start of reading /proc entries. 
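/*
 * Illustration (not part of the original patch): every IrLAN control
 * parameter is serialised with the layout documented above -- one byte of
 * name length, the name itself, a two byte little-endian value length, then
 * the value -- and a command frame is just a command byte, a parameter count
 * byte, and a run of such parameters.  The self-contained userspace sketch
 * below encodes one string parameter that way; tlv_put_string() and the
 * command code used in main() are hypothetical stand-ins, not kernel API.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Append a "name" / "value" string parameter; return bytes written or 0. */
static size_t tlv_put_string(uint8_t *buf, size_t room,
			     const char *name, const char *value)
{
	size_t name_len = strlen(name);
	size_t val_len = strlen(value);

	if (name_len > 255 || val_len > 1016 || room < name_len + val_len + 3)
		return 0;

	buf[0] = (uint8_t)name_len;
	memcpy(buf + 1, name, name_len);
	/* Value length is little endian, LSB first */
	buf[1 + name_len] = (uint8_t)(val_len & 0xff);
	buf[2 + name_len] = (uint8_t)(val_len >> 8);
	memcpy(buf + 3 + name_len, value, val_len);

	return name_len + val_len + 3;
}

int main(void)
{
	uint8_t frame[128];
	size_t n = 2;

	frame[0] = 0x00;	/* command code (illustrative only) */
	frame[1] = 0x02;	/* two parameters follow */
	n += tlv_put_string(frame + n, sizeof(frame) - n, "FILTER_TYPE", "BROADCAST");
	n += tlv_put_string(frame + n, sizeof(frame) - n, "FILTER_MODE", "FILTER");

	printf("frame is %zu bytes\n", n);
	return 0;
}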
+ * Return entry at pos, + * or start_token to indicate print header line + * or NULL if end of file + */ +static void *irlan_seq_start(struct seq_file *seq, loff_t *pos) +{ + int i = 1; + struct irlan_cb *self; + + rcu_read_lock(); + if (*pos == 0) + return SEQ_START_TOKEN; + + list_for_each_entry(self, &irlans, dev_list) { + if (*pos == i) + return self; + ++i; + } + return NULL; +} + +/* Return entry after v, and increment pos */ +static void *irlan_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct list_head *nxt; + + ++*pos; + if (v == SEQ_START_TOKEN) + nxt = irlans.next; + else + nxt = ((struct irlan_cb *)v)->dev_list.next; + + return (nxt == &irlans) ? NULL + : list_entry(nxt, struct irlan_cb, dev_list); +} + +/* End of reading /proc file */ +static void irlan_seq_stop(struct seq_file *seq, void *v) +{ + rcu_read_unlock(); +} + + +/* + * Show one entry in /proc file. + */ +static int irlan_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "IrLAN instances:\n"); + else { + struct irlan_cb *self = v; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + seq_printf(seq,"ifname: %s,\n", + self->dev->name); + seq_printf(seq,"client state: %s, ", + irlan_state[ self->client.state]); + seq_printf(seq,"provider state: %s,\n", + irlan_state[ self->provider.state]); + seq_printf(seq,"saddr: %#08x, ", + self->saddr); + seq_printf(seq,"daddr: %#08x\n", + self->daddr); + seq_printf(seq,"version: %d.%d,\n", + self->version[1], self->version[0]); + seq_printf(seq,"access type: %s\n", + irlan_access[self->client.access_type]); + seq_printf(seq,"media: %s\n", + irlan_media[self->media]); + + seq_printf(seq,"local filter:\n"); + seq_printf(seq,"remote filter: "); + irlan_print_filter(seq, self->client.filter_type); + seq_printf(seq,"tx busy: %s\n", + netif_queue_stopped(self->dev) ? "TRUE" : "FALSE"); + + seq_putc(seq,'\n'); + } + return 0; +} + +static struct seq_operations irlan_seq_ops = { + .start = irlan_seq_start, + .next = irlan_seq_next, + .stop = irlan_seq_stop, + .show = irlan_seq_show, +}; + +static int irlan_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &irlan_seq_ops); +} +#endif + +MODULE_AUTHOR("Dag Brattli "); +MODULE_DESCRIPTION("The Linux IrDA LAN protocol"); +MODULE_LICENSE("GPL"); + +module_param(eth, bool, 0); +MODULE_PARM_DESC(eth, "Name devices ethX (0) or irlanX (1)"); +module_param(access, int, 0); +MODULE_PARM_DESC(access, "Access type DIRECT=1, PEER=2, HOSTED=3"); + +module_init(irlan_init); +module_exit(irlan_cleanup); + diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c new file mode 100644 index 000000000000..071cd2cefd8a --- /dev/null +++ b/net/irda/irlan/irlan_eth.c @@ -0,0 +1,387 @@ +/********************************************************************* + * + * Filename: irlan_eth.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Thu Oct 15 08:37:58 1998 + * Modified at: Tue Mar 21 09:06:41 2000 + * Modified by: Dag Brattli + * Sources: skeleton.c by Donald Becker + * slip.c by Laurence Culhane, + * Fred N. van Kempen, + * + * Copyright (c) 1998-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. 
+ * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static int irlan_eth_open(struct net_device *dev); +static int irlan_eth_close(struct net_device *dev); +static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); +static void irlan_eth_set_multicast_list( struct net_device *dev); +static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); + +/* + * Function irlan_eth_setup (dev) + * + * The network device initialization function. + * + */ +static void irlan_eth_setup(struct net_device *dev) +{ + dev->open = irlan_eth_open; + dev->stop = irlan_eth_close; + dev->hard_start_xmit = irlan_eth_xmit; + dev->get_stats = irlan_eth_get_stats; + dev->set_multicast_list = irlan_eth_set_multicast_list; + dev->destructor = free_netdev; + + SET_MODULE_OWNER(dev); + + ether_setup(dev); + + /* + * Lets do all queueing in IrTTP instead of this device driver. + * Queueing here as well can introduce some strange latency + * problems, which we will avoid by setting the queue size to 0. + */ + /* + * The bugs in IrTTP and IrLAN that created this latency issue + * have now been fixed, and we can propagate flow control properly + * to the network layer. However, this requires a minimal queue of + * packets for the device. + * Without flow control, the Tx Queue is 14 (ttp) + 0 (dev) = 14 + * With flow control, the Tx Queue is 7 (ttp) + 4 (dev) = 11 + * See irlan_eth_flow_indication()... + * Note : this number was randomly selected and would need to + * be adjusted. + * Jean II */ + dev->tx_queue_len = 4; +} + +/* + * Function alloc_irlandev + * + * Allocate network device and control block + * + */ +struct net_device *alloc_irlandev(const char *name) +{ + return alloc_netdev(sizeof(struct irlan_cb), name, + irlan_eth_setup); +} + +/* + * Function irlan_eth_open (dev) + * + * Network device has been opened by user + * + */ +static int irlan_eth_open(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Ready to play! */ + netif_stop_queue(dev); /* Wait until data link is ready */ + + /* We are now open, so time to do some work */ + self->disconnect_reason = 0; + irlan_client_wakeup(self, self->saddr, self->daddr); + + /* Make sure we have a hardware address before we return, + so DHCP clients gets happy */ + return wait_event_interruptible(self->open_wait, + !self->tsap_data->connected); +} + +/* + * Function irlan_eth_close (dev) + * + * Stop the ether network device, his function will usually be called by + * ifconfig down. We should now disconnect the link, We start the + * close timer, so that the instance will be removed if we are unable + * to discover the remote device after the disconnect. 
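/*
 * Illustration (not part of the original patch): alloc_irlandev() above
 * wraps alloc_netdev(), which allocates the generic device structure with
 * the driver's private control block appended and then runs a setup callback
 * on it; netdev_priv() later returns that trailing area.  The self-contained
 * userspace sketch below mimics that allocation pattern; all names
 * (toy_device, toy_alloc_dev, toy_priv, ...) are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_device {
	char name[16];
	/* driver-private area follows the generic part */
};

static void *toy_priv(struct toy_device *dev)
{
	return (char *)dev + sizeof(struct toy_device);
}

static struct toy_device *toy_alloc_dev(size_t priv_size, const char *name,
					void (*setup)(struct toy_device *))
{
	struct toy_device *dev = calloc(1, sizeof(*dev) + priv_size);

	if (!dev)
		return NULL;
	snprintf(dev->name, sizeof(dev->name), "%s", name);
	setup(dev);			/* let the caller initialise its private part */
	return dev;
}

struct toy_irlan_cb {
	int magic;
};

static void toy_irlan_setup(struct toy_device *dev)
{
	struct toy_irlan_cb *self = toy_priv(dev);

	self->magic = 0x4952;		/* pretend magic value */
}

int main(void)
{
	struct toy_device *dev = toy_alloc_dev(sizeof(struct toy_irlan_cb),
					       "irlan0", toy_irlan_setup);
	struct toy_irlan_cb *self = dev ? toy_priv(dev) : NULL;

	if (self)
		printf("%s magic=%#x\n", dev->name, self->magic);
	free(dev);
	return 0;
}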
+ */ +static int irlan_eth_close(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Stop device */ + netif_stop_queue(dev); + + irlan_close_data_channel(self); + irlan_close_tsaps(self); + + irlan_do_client_event(self, IRLAN_LMP_DISCONNECT, NULL); + irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL); + + /* Remove frames queued on the control channel */ + skb_queue_purge(&self->client.txq); + + self->client.tx_busy = 0; + + return 0; +} + +/* + * Function irlan_eth_tx (skb) + * + * Transmits ethernet frames over IrDA link. + * + */ +static int irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + int ret; + + /* skb headroom large enough to contain all IrDA-headers? */ + if ((skb_headroom(skb) < self->max_header_size) || (skb_shared(skb))) { + struct sk_buff *new_skb = + skb_realloc_headroom(skb, self->max_header_size); + + /* We have to free the original skb anyway */ + dev_kfree_skb(skb); + + /* Did the realloc succeed? */ + if (new_skb == NULL) + return 0; + + /* Use the new skb instead */ + skb = new_skb; + } + + dev->trans_start = jiffies; + + /* Now queue the packet in the transport layer */ + if (self->use_udata) + ret = irttp_udata_request(self->tsap_data, skb); + else + ret = irttp_data_request(self->tsap_data, skb); + + if (ret < 0) { + /* + * IrTTPs tx queue is full, so we just have to + * drop the frame! You might think that we should + * just return -1 and don't deallocate the frame, + * but that is dangerous since it's possible that + * we have replaced the original skb with a new + * one with larger headroom, and that would really + * confuse do_dev_queue_xmit() in dev.c! I have + * tried :-) DB + */ + /* irttp_data_request already free the packet */ + self->stats.tx_dropped++; + } else { + self->stats.tx_packets++; + self->stats.tx_bytes += skb->len; + } + + return 0; +} + +/* + * Function irlan_eth_receive (handle, skb) + * + * This function gets the data that is received on the data channel + * + */ +int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) +{ + struct irlan_cb *self = instance; + + if (skb == NULL) { + ++self->stats.rx_dropped; + return 0; + } + if (skb->len < ETH_HLEN) { + IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n", + __FUNCTION__, skb->len); + ++self->stats.rx_dropped; + dev_kfree_skb(skb); + return 0; + } + + /* + * Adopt this frame! Important to set all these fields since they + * might have been previously set by the low level IrDA network + * device driver + */ + skb->dev = self->dev; + skb->protocol=eth_type_trans(skb, skb->dev); /* Remove eth header */ + + self->stats.rx_packets++; + self->stats.rx_bytes += skb->len; + + netif_rx(skb); /* Eat it! */ + + return 0; +} + +/* + * Function irlan_eth_flow (status) + * + * Do flow control between IP/Ethernet and IrLAN/IrTTP. This is done by + * controlling the queue stop/start. + * + * The IrDA link layer has the advantage to have flow control, and + * IrTTP now properly handles that. Flow controlling the higher layers + * prevent us to drop Tx packets in here (up to 15% for a TCP socket, + * more for UDP socket). + * Also, this allow us to reduce the overall transmit queue, which means + * less latency in case of mixed traffic. 
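/*
 * Illustration (not part of the original patch): the comment above describes
 * flow control between the network stack and IrTTP -- the driver stops the
 * device queue on FLOW_STOP and wakes it on FLOW_START instead of dropping
 * frames.  The self-contained userspace sketch below models that with a tiny
 * bounded queue and a stopped/running flag; every name in it is hypothetical
 * and it only illustrates the stop/wake idea, not the kernel API.
 */
#include <stdbool.h>
#include <stdio.h>

#define LOWER_QUEUE_MAX 4	/* mirrors the small device tx queue */

struct toy_link {
	int queued;		/* frames sitting in the lower layer */
	bool stopped;		/* upper layer told to hold off (FLOW_STOP) */
};

/* Upper layer tries to hand a frame down; refused while flow-stopped. */
static bool toy_xmit(struct toy_link *link)
{
	if (link->stopped)
		return false;
	link->queued++;
	if (link->queued >= LOWER_QUEUE_MAX) {
		link->stopped = true;		/* FLOW_STOP: pause the sender */
		printf("flow stop\n");
	}
	return true;
}

/* Lower layer drained one frame; wake the sender again if it was paused. */
static void toy_tx_done(struct toy_link *link)
{
	if (link->queued > 0)
		link->queued--;
	if (link->stopped && link->queued < LOWER_QUEUE_MAX) {
		link->stopped = false;		/* FLOW_START: resume the sender */
		printf("flow start\n");
	}
}

int main(void)
{
	struct toy_link link = { 0, false };
	int sent = 0, refused = 0;

	for (int i = 0; i < 8; i++) {
		if (toy_xmit(&link))
			sent++;
		else
			refused++;
		if (i % 2)			/* drain every other iteration */
			toy_tx_done(&link);
	}
	printf("sent=%d refused=%d\n", sent, refused);
	return 0;
}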
+ * Jean II + */ +void irlan_eth_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) +{ + struct irlan_cb *self; + struct net_device *dev; + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + dev = self->dev; + + IRDA_ASSERT(dev != NULL, return;); + + IRDA_DEBUG(0, "%s() : flow %s ; running %d\n", __FUNCTION__, + flow == FLOW_STOP ? "FLOW_STOP" : "FLOW_START", + netif_running(dev)); + + switch (flow) { + case FLOW_STOP: + /* IrTTP is full, stop higher layers */ + netif_stop_queue(dev); + break; + case FLOW_START: + default: + /* Tell upper layers that its time to transmit frames again */ + /* Schedule network layer */ + netif_wake_queue(dev); + break; + } +} + +/* + * Function irlan_etc_send_gratuitous_arp (dev) + * + * Send gratuitous ARP to announce that we have changed + * hardware address, so that all peers updates their ARP tables + */ +void irlan_eth_send_gratuitous_arp(struct net_device *dev) +{ + struct in_device *in_dev; + + /* + * When we get a new MAC address do a gratuitous ARP. This + * is useful if we have changed access points on the same + * subnet. + */ +#ifdef CONFIG_INET + IRDA_DEBUG(4, "IrLAN: Sending gratuitous ARP\n"); + rcu_read_lock(); + in_dev = __in_dev_get(dev); + if (in_dev == NULL) + goto out; + if (in_dev->ifa_list) + + arp_send(ARPOP_REQUEST, ETH_P_ARP, + in_dev->ifa_list->ifa_address, + dev, + in_dev->ifa_list->ifa_address, + NULL, dev->dev_addr, NULL); +out: + rcu_read_unlock(); +#endif /* CONFIG_INET */ +} + +/* + * Function set_multicast_list (dev) + * + * Configure the filtering of the device + * + */ +#define HW_MAX_ADDRS 4 /* Must query to get it! */ +static void irlan_eth_set_multicast_list(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__ ); + + /* Check if data channel has been connected yet */ + if (self->client.state != IRLAN_DATA) { + IRDA_DEBUG(1, "%s(), delaying!\n", __FUNCTION__ ); + return; + } + + if (dev->flags & IFF_PROMISC) { + /* Enable promiscuous mode */ + IRDA_WARNING("Promiscous mode not implemented by IrLAN!\n"); + } + else if ((dev->flags & IFF_ALLMULTI) || dev->mc_count > HW_MAX_ADDRS) { + /* Disable promiscuous mode, use normal mode. */ + IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __FUNCTION__ ); + /* hardware_set_filter(NULL); */ + + irlan_set_multicast_filter(self, TRUE); + } + else if (dev->mc_count) { + IRDA_DEBUG(4, "%s(), Setting multicast filter\n", __FUNCTION__ ); + /* Walk the address list, and load the filter */ + /* hardware_set_filter(dev->mc_list); */ + + irlan_set_multicast_filter(self, TRUE); + } + else { + IRDA_DEBUG(4, "%s(), Clearing multicast filter\n", __FUNCTION__ ); + irlan_set_multicast_filter(self, FALSE); + } + + if (dev->flags & IFF_BROADCAST) + irlan_set_broadcast_filter(self, TRUE); + else + irlan_set_broadcast_filter(self, FALSE); +} + +/* + * Function irlan_get_stats (dev) + * + * Get the current statistics for this device + * + */ +static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev) +{ + struct irlan_cb *self = netdev_priv(dev); + + return &self->stats; +} diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c new file mode 100644 index 000000000000..2778d8c6aa31 --- /dev/null +++ b/net/irda/irlan/irlan_event.c @@ -0,0 +1,60 @@ +/********************************************************************* + * + * Filename: irlan_event.c + * Version: + * Description: + * Status: Experimental. 
+ * Author: Dag Brattli + * Created at: Tue Oct 20 09:10:16 1998 + * Modified at: Sat Oct 30 12:59:01 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include + +char *irlan_state[] = { + "IRLAN_IDLE", + "IRLAN_QUERY", + "IRLAN_CONN", + "IRLAN_INFO", + "IRLAN_MEDIA", + "IRLAN_OPEN", + "IRLAN_WAIT", + "IRLAN_ARB", + "IRLAN_DATA", + "IRLAN_CLOSE", + "IRLAN_SYNC", +}; + +void irlan_next_client_state(struct irlan_cb *self, IRLAN_STATE state) +{ + IRDA_DEBUG(2, "%s(), %s\n", __FUNCTION__ , irlan_state[state]); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->client.state = state; +} + +void irlan_next_provider_state(struct irlan_cb *self, IRLAN_STATE state) +{ + IRDA_DEBUG(2, "%s(), %s\n", __FUNCTION__ , irlan_state[state]); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + self->provider.state = state; +} + diff --git a/net/irda/irlan/irlan_filter.c b/net/irda/irlan/irlan_filter.c new file mode 100644 index 000000000000..343c5d4a1a1d --- /dev/null +++ b/net/irda/irlan/irlan_filter.c @@ -0,0 +1,246 @@ +/********************************************************************* + * + * Filename: irlan_filter.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Fri Jan 29 11:16:38 1999 + * Modified at: Sat Oct 30 12:58:45 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include + +#include + +/* + * Function irlan_filter_request (self, skb) + * + * Handle filter request from client peer device + * + */ +void irlan_filter_request(struct irlan_cb *self, struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + if ((self->provider.filter_type == IRLAN_DIRECTED) && + (self->provider.filter_operation == DYNAMIC)) + { + IRDA_DEBUG(0, "Giving peer a dynamic Ethernet address\n"); + self->provider.mac_address[0] = 0x40; + self->provider.mac_address[1] = 0x00; + self->provider.mac_address[2] = 0x00; + self->provider.mac_address[3] = 0x00; + + /* Use arbitration value to generate MAC address */ + if (self->provider.access_type == ACCESS_PEER) { + self->provider.mac_address[4] = + self->provider.send_arb_val & 0xff; + self->provider.mac_address[5] = + (self->provider.send_arb_val >> 8) & 0xff; + } else { + /* Just generate something for now */ + get_random_bytes(self->provider.mac_address+4, 1); + get_random_bytes(self->provider.mac_address+5, 1); + } + + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x03; + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + irlan_insert_short_param(skb, "MAX_ENTRY", 0x0001); + irlan_insert_array_param(skb, "FILTER_ENTRY", + self->provider.mac_address, 6); + return; + } + + if ((self->provider.filter_type == IRLAN_DIRECTED) && + (self->provider.filter_mode == FILTER)) + { + IRDA_DEBUG(0, "Directed filter on\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_DIRECTED) && + (self->provider.filter_mode == NONE)) + { + IRDA_DEBUG(0, "Directed filter off\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + + if ((self->provider.filter_type == IRLAN_BROADCAST) && + (self->provider.filter_mode == FILTER)) + { + IRDA_DEBUG(0, "Broadcast filter on\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_BROADCAST) && + (self->provider.filter_mode == NONE)) + { + IRDA_DEBUG(0, "Broadcast filter off\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_MULTICAST) && + (self->provider.filter_mode == FILTER)) + { + IRDA_DEBUG(0, "Multicast filter on\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_MULTICAST) && + (self->provider.filter_mode == NONE)) + { + IRDA_DEBUG(0, "Multicast filter off\n"); + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x00; + return; + } + if ((self->provider.filter_type == IRLAN_MULTICAST) && + (self->provider.filter_operation == GET)) + { + IRDA_DEBUG(0, "Multicast filter get\n"); + skb->data[0] = 0x00; /* Success? 
*/ + skb->data[1] = 0x02; + irlan_insert_string_param(skb, "FILTER_MODE", "NONE"); + irlan_insert_short_param(skb, "MAX_ENTRY", 16); + return; + } + skb->data[0] = 0x00; /* Command not supported */ + skb->data[1] = 0x00; + + IRDA_DEBUG(0, "Not implemented!\n"); +} + +/* + * Function check_request_param (self, param, value) + * + * Check parameters in request from peer device + * + */ +void irlan_check_command_param(struct irlan_cb *self, char *param, char *value) +{ + __u8 *bytes; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + bytes = value; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + IRDA_DEBUG(4, "%s, %s\n", param, value); + + /* + * This is experimental!! DB. + */ + if (strcmp(param, "MODE") == 0) { + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + self->use_udata = TRUE; + return; + } + + /* + * FILTER_TYPE + */ + if (strcmp(param, "FILTER_TYPE") == 0) { + if (strcmp(value, "DIRECTED") == 0) { + self->provider.filter_type = IRLAN_DIRECTED; + return; + } + if (strcmp(value, "MULTICAST") == 0) { + self->provider.filter_type = IRLAN_MULTICAST; + return; + } + if (strcmp(value, "BROADCAST") == 0) { + self->provider.filter_type = IRLAN_BROADCAST; + return; + } + } + /* + * FILTER_MODE + */ + if (strcmp(param, "FILTER_MODE") == 0) { + if (strcmp(value, "ALL") == 0) { + self->provider.filter_mode = ALL; + return; + } + if (strcmp(value, "FILTER") == 0) { + self->provider.filter_mode = FILTER; + return; + } + if (strcmp(value, "NONE") == 0) { + self->provider.filter_mode = FILTER; + return; + } + } + /* + * FILTER_OPERATION + */ + if (strcmp(param, "FILTER_OPERATION") == 0) { + if (strcmp(value, "DYNAMIC") == 0) { + self->provider.filter_operation = DYNAMIC; + return; + } + if (strcmp(value, "GET") == 0) { + self->provider.filter_operation = GET; + return; + } + } +} + +/* + * Function irlan_print_filter (filter_type, buf) + * + * Print status of filter. Used by /proc file system + * + */ +#ifdef CONFIG_PROC_FS +#define MASK2STR(m,s) { .mask = m, .str = s } + +void irlan_print_filter(struct seq_file *seq, int filter_type) +{ + static struct { + int mask; + const char *str; + } filter_mask2str[] = { + MASK2STR(IRLAN_DIRECTED, "DIRECTED"), + MASK2STR(IRLAN_FUNCTIONAL, "FUNCTIONAL"), + MASK2STR(IRLAN_GROUP, "GROUP"), + MASK2STR(IRLAN_MAC_FRAME, "MAC_FRAME"), + MASK2STR(IRLAN_MULTICAST, "MULTICAST"), + MASK2STR(IRLAN_BROADCAST, "BROADCAST"), + MASK2STR(IRLAN_IPX_SOCKET, "IPX_SOCKET"), + MASK2STR(0, NULL) + }, *p; + + for (p = filter_mask2str; p->str; p++) { + if (filter_type & p->mask) + seq_printf(seq, "%s ", p->str); + } + seq_putc(seq, '\n'); +} +#undef MASK2STR +#endif diff --git a/net/irda/irlan/irlan_provider.c b/net/irda/irlan/irlan_provider.c new file mode 100644 index 000000000000..39c202d1c374 --- /dev/null +++ b/net/irda/irlan/irlan_provider.c @@ -0,0 +1,413 @@ +/********************************************************************* + * + * Filename: irlan_provider.c + * Version: 0.9 + * Description: IrDA LAN Access Protocol Implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sat Oct 30 12:52:10 1999 + * Modified by: Dag Brattli + * Sources: skeleton.c by Donald Becker + * slip.c by Laurence Culhane, + * Fred N. van Kempen, + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. 
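/*
 * Illustration (not part of the original patch): irlan_check_command_param()
 * above maps parameter strings such as the FILTER_TYPE values onto enum
 * fields with strcmp() chains, and irlan_print_filter() uses a small
 * mask-to-string table for the reverse direction.  The self-contained sketch
 * below shows the same keyword-to-enum mapping done with one lookup table;
 * the names and enum values here are hypothetical illustrations only.
 */
#include <stdio.h>
#include <string.h>

enum toy_filter_type { TOY_DIRECTED, TOY_MULTICAST, TOY_BROADCAST, TOY_UNKNOWN };

static const struct {
	const char *keyword;
	enum toy_filter_type type;
} filter_type_map[] = {
	{ "DIRECTED",  TOY_DIRECTED  },
	{ "MULTICAST", TOY_MULTICAST },
	{ "BROADCAST", TOY_BROADCAST },
};

static enum toy_filter_type parse_filter_type(const char *value)
{
	for (size_t i = 0; i < sizeof(filter_type_map) / sizeof(filter_type_map[0]); i++)
		if (strcmp(value, filter_type_map[i].keyword) == 0)
			return filter_type_map[i].type;
	return TOY_UNKNOWN;
}

int main(void)
{
	printf("%d %d\n", parse_filter_type("BROADCAST"),	/* TOY_BROADCAST */
	       parse_filter_type("bogus"));			/* TOY_UNKNOWN */
	return 0;
}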
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static void irlan_provider_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb); + +/* + * Function irlan_provider_control_data_indication (handle, skb) + * + * This function gets the data that is received on the control channel + * + */ +static int irlan_provider_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct irlan_cb *self; + __u8 code; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + self = (struct irlan_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + IRDA_ASSERT(skb != NULL, return -1;); + + code = skb->data[0]; + switch(code) { + case CMD_GET_PROVIDER_INFO: + IRDA_DEBUG(4, "Got GET_PROVIDER_INFO command!\n"); + irlan_do_provider_event(self, IRLAN_GET_INFO_CMD, skb); + break; + + case CMD_GET_MEDIA_CHAR: + IRDA_DEBUG(4, "Got GET_MEDIA_CHAR command!\n"); + irlan_do_provider_event(self, IRLAN_GET_MEDIA_CMD, skb); + break; + case CMD_OPEN_DATA_CHANNEL: + IRDA_DEBUG(4, "Got OPEN_DATA_CHANNEL command!\n"); + irlan_do_provider_event(self, IRLAN_OPEN_DATA_CMD, skb); + break; + case CMD_FILTER_OPERATION: + IRDA_DEBUG(4, "Got FILTER_OPERATION command!\n"); + irlan_do_provider_event(self, IRLAN_FILTER_CONFIG_CMD, skb); + break; + case CMD_RECONNECT_DATA_CHAN: + IRDA_DEBUG(2, "%s(), Got RECONNECT_DATA_CHAN command\n", __FUNCTION__ ); + IRDA_DEBUG(2, "%s(), NOT IMPLEMENTED\n", __FUNCTION__ ); + break; + case CMD_CLOSE_DATA_CHAN: + IRDA_DEBUG(2, "Got CLOSE_DATA_CHAN command!\n"); + IRDA_DEBUG(2, "%s(), NOT IMPLEMENTED\n", __FUNCTION__ ); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown command!\n", __FUNCTION__ ); + break; + } + return 0; +} + +/* + * Function irlan_provider_connect_indication (handle, skb, priv) + * + * Got connection from peer IrLAN client + * + */ +static void irlan_provider_connect_indication(void *instance, void *sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + __u32 saddr, daddr; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__ ); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + IRDA_ASSERT(tsap == self->provider.tsap_ctrl,return;); + IRDA_ASSERT(self->provider.state == IRLAN_IDLE, return;); + + daddr = irttp_get_daddr(tsap); + saddr = irttp_get_saddr(tsap); + self->provider.max_sdu_size = max_sdu_size; + self->provider.max_header_size = max_header_size; + + irlan_do_provider_event(self, IRLAN_CONNECT_INDICATION, NULL); + + /* + * If we are in peer mode, the client may not have got the discovery + * indication it needs to make progress. 
If the client is still in + * IDLE state, we must kick it. + */ + if ((self->provider.access_type == ACCESS_PEER) && + (self->client.state == IRLAN_IDLE)) + { + irlan_client_wakeup(self, self->saddr, self->daddr); + } +} + +/* + * Function irlan_provider_connect_response (handle) + * + * Accept incoming connection + * + */ +void irlan_provider_connect_response(struct irlan_cb *self, + struct tsap_cb *tsap) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + + /* Just accept */ + irttp_connect_response(tsap, IRLAN_MTU, NULL); +} + +static void irlan_provider_disconnect_indication(void *instance, void *sap, + LM_REASON reason, + struct sk_buff *userdata) +{ + struct irlan_cb *self; + struct tsap_cb *tsap; + + IRDA_DEBUG(4, "%s(), reason=%d\n", __FUNCTION__ , reason); + + self = (struct irlan_cb *) instance; + tsap = (struct tsap_cb *) sap; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return;); + IRDA_ASSERT(tsap != NULL, return;); + IRDA_ASSERT(tsap->magic == TTP_TSAP_MAGIC, return;); + + IRDA_ASSERT(tsap == self->provider.tsap_ctrl, return;); + + irlan_do_provider_event(self, IRLAN_LMP_DISCONNECT, NULL); +} + +/* + * Function irlan_parse_open_data_cmd (self, skb) + * + * + * + */ +int irlan_parse_open_data_cmd(struct irlan_cb *self, struct sk_buff *skb) +{ + int ret; + + ret = irlan_provider_parse_command(self, CMD_OPEN_DATA_CHANNEL, skb); + + /* Open data channel */ + irlan_open_data_tsap(self); + + return ret; +} + +/* + * Function parse_command (skb) + * + * Extract all parameters from received buffer, then feed them to + * check_params for parsing + * + */ +int irlan_provider_parse_command(struct irlan_cb *self, int cmd, + struct sk_buff *skb) +{ + __u8 *frame; + __u8 *ptr; + int count; + __u16 val_len; + int i; + char *name; + char *value; + int ret = RSP_SUCCESS; + + IRDA_ASSERT(skb != NULL, return -RSP_PROTOCOL_ERROR;); + + IRDA_DEBUG(4, "%s(), skb->len=%d\n", __FUNCTION__ , (int)skb->len); + + IRDA_ASSERT(self != NULL, return -RSP_PROTOCOL_ERROR;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -RSP_PROTOCOL_ERROR;); + + if (!skb) + return -RSP_PROTOCOL_ERROR; + + frame = skb->data; + + name = kmalloc(255, GFP_ATOMIC); + if (!name) + return -RSP_INSUFFICIENT_RESOURCES; + value = kmalloc(1016, GFP_ATOMIC); + if (!value) { + kfree(name); + return -RSP_INSUFFICIENT_RESOURCES; + } + + /* How many parameters? 
*/ + count = frame[1]; + + IRDA_DEBUG(4, "Got %d parameters\n", count); + + ptr = frame+2; + + /* For all parameters */ + for (i=0; imagic == IRLAN_MAGIC, return;); + + skb = dev_alloc_skb(128); + if (!skb) + return; + + /* Reserve space for TTP, LMP, and LAP header */ + skb_reserve(skb, self->provider.max_header_size); + skb_put(skb, 2); + + switch (command) { + case CMD_GET_PROVIDER_INFO: + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x02; /* 2 parameters */ + switch (self->media) { + case MEDIA_802_3: + irlan_insert_string_param(skb, "MEDIA", "802.3"); + break; + case MEDIA_802_5: + irlan_insert_string_param(skb, "MEDIA", "802.5"); + break; + default: + IRDA_DEBUG(2, "%s(), unknown media type!\n", __FUNCTION__ ); + break; + } + irlan_insert_short_param(skb, "IRLAN_VER", 0x0101); + break; + + case CMD_GET_MEDIA_CHAR: + skb->data[0] = 0x00; /* Success */ + skb->data[1] = 0x05; /* 5 parameters */ + irlan_insert_string_param(skb, "FILTER_TYPE", "DIRECTED"); + irlan_insert_string_param(skb, "FILTER_TYPE", "BROADCAST"); + irlan_insert_string_param(skb, "FILTER_TYPE", "MULTICAST"); + + switch (self->provider.access_type) { + case ACCESS_DIRECT: + irlan_insert_string_param(skb, "ACCESS_TYPE", "DIRECT"); + break; + case ACCESS_PEER: + irlan_insert_string_param(skb, "ACCESS_TYPE", "PEER"); + break; + case ACCESS_HOSTED: + irlan_insert_string_param(skb, "ACCESS_TYPE", "HOSTED"); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown access type\n", __FUNCTION__ ); + break; + } + irlan_insert_short_param(skb, "MAX_FRAME", 0x05ee); + break; + case CMD_OPEN_DATA_CHANNEL: + skb->data[0] = 0x00; /* Success */ + if (self->provider.send_arb_val) { + skb->data[1] = 0x03; /* 3 parameters */ + irlan_insert_short_param(skb, "CON_ARB", + self->provider.send_arb_val); + } else + skb->data[1] = 0x02; /* 2 parameters */ + irlan_insert_byte_param(skb, "DATA_CHAN", self->stsap_sel_data); + irlan_insert_array_param(skb, "RECONNECT_KEY", "LINUX RULES!", + 12); + break; + case CMD_FILTER_OPERATION: + irlan_filter_request(self, skb); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown command!\n", __FUNCTION__ ); + break; + } + + irttp_data_request(self->provider.tsap_ctrl, skb); +} + +/* + * Function irlan_provider_register(void) + * + * Register provider support so we can accept incoming connections. 
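irlan_provider_send_reply() above lays each reply out as one result byte, one parameter-count byte, and then the parameters appended by irlan_insert_string_param()/irlan_insert_short_param()/irlan_insert_byte_param(); the parse loop reads the count back from frame[1]. Those insert/extract helpers live in irlan_common.c and are not part of this hunk, so the sketch below assumes a plausible parameter layout (one byte of name length, the name, a 16-bit little-endian value length, the value) purely for illustration:

#include <stdio.h>
#include <string.h>

/* Append one "name = value" parameter in the assumed layout: name length
 * (1 byte), name, value length (2 bytes little endian), value.
 * Returns the number of bytes written, or 0 if the buffer is too small. */
static size_t put_param(unsigned char *buf, size_t room,
                        const char *name, const void *val, size_t val_len)
{
    size_t name_len = strlen(name);
    size_t need = 1 + name_len + 2 + val_len;

    if (need > room || name_len > 255 || val_len > 0xffff)
        return 0;
    buf[0] = (unsigned char) name_len;
    memcpy(buf + 1, name, name_len);
    buf[1 + name_len] = val_len & 0xff;          /* little-endian length */
    buf[2 + name_len] = (val_len >> 8) & 0xff;
    memcpy(buf + 3 + name_len, val, val_len);
    return need;
}

int main(void)
{
    unsigned char frame[128];
    size_t off = 2;

    frame[0] = 0x00;                 /* result byte: success */
    frame[1] = 0x02;                 /* two parameters follow */
    off += put_param(frame + off, sizeof(frame) - off, "MEDIA", "802.3", 5);
    off += put_param(frame + off, sizeof(frame) - off, "IRLAN_VER", "\x01\x01", 2);
    printf("built %zu byte control frame\n", off);
    return 0;
}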
+ * + */ +int irlan_provider_open_ctrl_tsap(struct irlan_cb *self) +{ + struct tsap_cb *tsap; + notify_t notify; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + /* Check if already open */ + if (self->provider.tsap_ctrl) + return -1; + + /* + * First register well known control TSAP + */ + irda_notify_init(¬ify); + notify.data_indication = irlan_provider_data_indication; + notify.connect_indication = irlan_provider_connect_indication; + notify.disconnect_indication = irlan_provider_disconnect_indication; + notify.instance = self; + strlcpy(notify.name, "IrLAN ctrl (p)", sizeof(notify.name)); + + tsap = irttp_open_tsap(LSAP_ANY, 1, ¬ify); + if (!tsap) { + IRDA_DEBUG(2, "%s(), Got no tsap!\n", __FUNCTION__ ); + return -1; + } + self->provider.tsap_ctrl = tsap; + + /* Register with LM-IAS */ + irlan_ias_register(self, tsap->stsap_sel); + + return 0; +} + diff --git a/net/irda/irlan/irlan_provider_event.c b/net/irda/irlan/irlan_provider_event.c new file mode 100644 index 000000000000..5a086f9827ed --- /dev/null +++ b/net/irda/irlan/irlan_provider_event.c @@ -0,0 +1,241 @@ +/********************************************************************* + * + * Filename: irlan_provider_event.c + * Version: 0.9 + * Description: IrLAN provider state machine) + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:37 1997 + * Modified at: Sat Oct 30 12:52:41 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include + +#include +#include + +static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); +static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb); + +static int (*state[])(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) = +{ + irlan_provider_state_idle, + NULL, /* Query */ + NULL, /* Info */ + irlan_provider_state_info, + NULL, /* Media */ + irlan_provider_state_open, + NULL, /* Wait */ + NULL, /* Arb */ + irlan_provider_state_data, + NULL, /* Close */ + NULL, /* Sync */ +}; + +void irlan_do_provider_event(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(*state[ self->provider.state] != NULL, return;); + + (*state[self->provider.state]) (self, event, skb); +} + +/* + * Function irlan_provider_state_idle (event, skb, info) + * + * IDLE, We are waiting for an indication that there is a provider + * available. 
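The state[] table above maps each provider state to a handler and leaves the states the provider never enters as NULL, which is why irlan_do_provider_event() asserts the slot is non-NULL before calling through it. A standalone sketch of the same dispatch-table pattern, with invented states and events:

#include <stdio.h>

typedef enum { ST_IDLE, ST_INFO, ST_OPEN, ST_DATA, ST_MAX } state_t;
typedef enum { EV_CONNECT, EV_DISCONNECT } event_t;

struct machine { state_t state; };

static int st_idle(struct machine *m, event_t ev)
{
    if (ev == EV_CONNECT) {
        m->state = ST_INFO;          /* accept and advance, like IRLAN_INFO */
        return 0;
    }
    return -1;
}

static int st_info(struct machine *m, event_t ev)
{
    if (ev == EV_DISCONNECT) {
        m->state = ST_IDLE;
        return 0;
    }
    return -1;
}

/* One handler per state; states without a handler stay NULL, so the
 * dispatcher must check the slot before calling through it. */
static int (*handler[ST_MAX])(struct machine *, event_t) = {
    [ST_IDLE] = st_idle,
    [ST_INFO] = st_info,
};

static void do_event(struct machine *m, event_t ev)
{
    if (handler[m->state])
        handler[m->state](m, ev);
}

int main(void)
{
    struct machine m = { ST_IDLE };

    do_event(&m, EV_CONNECT);
    printf("state after connect: %d\n", m.state);
    return 0;
}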
+ */ +static int irlan_provider_state_idle(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_CONNECT_INDICATION: + irlan_provider_connect_response( self, self->provider.tsap_ctrl); + irlan_next_provider_state( self, IRLAN_INFO); + break; + default: + IRDA_DEBUG(4, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_provider_state_info (self, event, skb, info) + * + * INFO, We have issued a GetInfo command and is awaiting a reply. + */ +static int irlan_provider_state_info(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_GET_INFO_CMD: + /* Be sure to use 802.3 in case of peer mode */ + if (self->provider.access_type == ACCESS_PEER) { + self->media = MEDIA_802_3; + + /* Check if client has started yet */ + if (self->client.state == IRLAN_IDLE) { + /* This should get the client going */ + irlmp_discovery_request(8); + } + } + + irlan_provider_send_reply(self, CMD_GET_PROVIDER_INFO, + RSP_SUCCESS); + /* Keep state */ + break; + case IRLAN_GET_MEDIA_CMD: + irlan_provider_send_reply(self, CMD_GET_MEDIA_CHAR, + RSP_SUCCESS); + /* Keep state */ + break; + case IRLAN_OPEN_DATA_CMD: + ret = irlan_parse_open_data_cmd(self, skb); + if (self->provider.access_type == ACCESS_PEER) { + /* FIXME: make use of random functions! */ + self->provider.send_arb_val = (jiffies & 0xffff); + } + irlan_provider_send_reply(self, CMD_OPEN_DATA_CHANNEL, ret); + + if (ret == RSP_SUCCESS) { + irlan_next_provider_state(self, IRLAN_OPEN); + + /* Signal client that we are now open */ + irlan_do_client_event(self, IRLAN_PROVIDER_SIGNAL, NULL); + } + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_provider_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG( 0, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_provider_state_open (self, event, skb, info) + * + * OPEN, The client has issued a OpenData command and is awaiting a + * reply + * + */ +static int irlan_provider_state_open(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + + switch(event) { + case IRLAN_FILTER_CONFIG_CMD: + irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb); + irlan_provider_send_reply(self, CMD_FILTER_OPERATION, + RSP_SUCCESS); + /* Keep state */ + break; + case IRLAN_DATA_CONNECT_INDICATION: + irlan_next_provider_state(self, IRLAN_DATA); + irlan_provider_connect_response(self, self->tsap_data); + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_provider_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irlan_provider_state_data (self, event, skb, info) + * + * DATA, The data channel is connected, allowing data transfers between + * the local and remote machines. 
+ * + */ +static int irlan_provider_state_data(struct irlan_cb *self, IRLAN_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__ ); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == IRLAN_MAGIC, return -1;); + + switch(event) { + case IRLAN_FILTER_CONFIG_CMD: + irlan_provider_parse_command(self, CMD_FILTER_OPERATION, skb); + irlan_provider_send_reply(self, CMD_FILTER_OPERATION, + RSP_SUCCESS); + break; + case IRLAN_LMP_DISCONNECT: /* FALLTHROUGH */ + case IRLAN_LAP_DISCONNECT: + irlan_next_provider_state(self, IRLAN_IDLE); + break; + default: + IRDA_DEBUG( 0, "%s(), Unknown event %d\n", __FUNCTION__ , event); + break; + } + if (skb) + dev_kfree_skb(skb); + + return 0; +} + + + + + + + + + + diff --git a/net/irda/irlap.c b/net/irda/irlap.c new file mode 100644 index 000000000000..046ad0750e48 --- /dev/null +++ b/net/irda/irlap.c @@ -0,0 +1,1258 @@ +/********************************************************************* + * + * Filename: irlap.c + * Version: 1.0 + * Description: IrLAP implementation for Linux + * Status: Stable + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Tue Dec 14 09:26:44 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static hashbin_t *irlap = NULL; +int sysctl_slot_timeout = SLOT_TIMEOUT * 1000 / HZ; + +/* This is the delay of missed pf period before generating an event + * to the application. The spec mandate 3 seconds, but in some cases + * it's way too long. - Jean II */ +int sysctl_warn_noreply_time = 3; + +extern void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb); +static void __irlap_close(struct irlap_cb *self); +static void irlap_init_qos_capabilities(struct irlap_cb *self, + struct qos_info *qos_user); + +#ifdef CONFIG_IRDA_DEBUG +static char *lap_reasons[] = { + "ERROR, NOT USED", + "LAP_DISC_INDICATION", + "LAP_NO_RESPONSE", + "LAP_RESET_INDICATION", + "LAP_FOUND_NONE", + "LAP_MEDIA_BUSY", + "LAP_PRIMARY_CONFLICT", + "ERROR, NOT USED", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +int __init irlap_init(void) +{ + /* Check if the compiler did its job properly. + * May happen on some ARM configuration, check with Russell King. 
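The assertions that follow check at module-init time that the packed frame headers really compile to their on-the-wire sizes (14 bytes for struct xid_frame, and so on). On a C11 toolchain the same property can be checked at build time instead; the sketch below shows the idea with _Static_assert on a made-up packed header, not the real xid_frame layout:

#include <stdint.h>
#include <stdio.h>

/* Illustrative packed wire header, NOT the real struct xid_frame. */
struct wire_hdr {
    uint8_t  caddr;
    uint8_t  control;
    uint32_t saddr;
    uint32_t daddr;
    uint8_t  flags;
    uint8_t  slotnr;
    uint8_t  version;
} __attribute__((packed));           /* GCC/Clang packing attribute */

/* Fails the build, rather than module init, if padding sneaks in. */
_Static_assert(sizeof(struct wire_hdr) == 13, "wire_hdr must be 13 bytes");

int main(void)
{
    printf("wire_hdr is %zu bytes\n", sizeof(struct wire_hdr));
    return 0;
}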
*/ + IRDA_ASSERT(sizeof(struct xid_frame) == 14, ;); + IRDA_ASSERT(sizeof(struct test_frame) == 10, ;); + IRDA_ASSERT(sizeof(struct ua_frame) == 10, ;); + IRDA_ASSERT(sizeof(struct snrm_frame) == 11, ;); + + /* Allocate master array */ + irlap = hashbin_new(HB_LOCK); + if (irlap == NULL) { + IRDA_ERROR("%s: can't allocate irlap hashbin!\n", + __FUNCTION__); + return -ENOMEM; + } + + return 0; +} + +void __exit irlap_cleanup(void) +{ + IRDA_ASSERT(irlap != NULL, return;); + + hashbin_delete(irlap, (FREE_FUNC) __irlap_close); +} + +/* + * Function irlap_open (driver) + * + * Initialize IrLAP layer + * + */ +struct irlap_cb *irlap_open(struct net_device *dev, struct qos_info *qos, + const char *hw_name) +{ + struct irlap_cb *self; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + /* Initialize the irlap structure. */ + self = kmalloc(sizeof(struct irlap_cb), GFP_KERNEL); + if (self == NULL) + return NULL; + + memset(self, 0, sizeof(struct irlap_cb)); + self->magic = LAP_MAGIC; + + /* Make a binding between the layers */ + self->netdev = dev; + self->qos_dev = qos; + /* Copy hardware name */ + if(hw_name != NULL) { + strlcpy(self->hw_name, hw_name, sizeof(self->hw_name)); + } else { + self->hw_name[0] = '\0'; + } + + /* FIXME: should we get our own field? */ + dev->atalk_ptr = self; + + self->state = LAP_OFFLINE; + + /* Initialize transmit queue */ + skb_queue_head_init(&self->txq); + skb_queue_head_init(&self->txq_ultra); + skb_queue_head_init(&self->wx_list); + + /* My unique IrLAP device address! */ + /* We don't want the broadcast address, neither the NULL address + * (most often used to signify "invalid"), and we don't want an + * address already in use (otherwise connect won't be able + * to select the proper link). - Jean II */ + do { + get_random_bytes(&self->saddr, sizeof(self->saddr)); + } while ((self->saddr == 0x0) || (self->saddr == BROADCAST) || + (hashbin_lock_find(irlap, self->saddr, NULL)) ); + /* Copy to the driver */ + memcpy(dev->dev_addr, &self->saddr, 4); + + init_timer(&self->slot_timer); + init_timer(&self->query_timer); + init_timer(&self->discovery_timer); + init_timer(&self->final_timer); + init_timer(&self->poll_timer); + init_timer(&self->wd_timer); + init_timer(&self->backoff_timer); + init_timer(&self->media_busy_timer); + + irlap_apply_default_connection_parameters(self); + + self->N3 = 3; /* # connections attemts to try before giving up */ + + self->state = LAP_NDM; + + hashbin_insert(irlap, (irda_queue_t *) self, self->saddr, NULL); + + irlmp_register_link(self, self->saddr, &self->notify); + + return self; +} +EXPORT_SYMBOL(irlap_open); + +/* + * Function __irlap_close (self) + * + * Remove IrLAP and all allocated memory. Stop any pending timers. 
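The address loop in irlap_open() above keeps drawing random 32-bit values until it finds one that is neither zero, nor the broadcast address, nor already present in the irlap hashbin. A userspace sketch of the same retry loop, with rand() standing in for get_random_bytes(), a flat array standing in for the hashbin, and 0xffffffff assumed as the broadcast value:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define ADDR_BROADCAST 0xffffffffu   /* assumed broadcast value */

static int addr_in_use(uint32_t addr, const uint32_t *used, size_t n)
{
    for (size_t i = 0; i < n; i++)
        if (used[i] == addr)
            return 1;
    return 0;
}

/* Keep drawing random addresses until one is neither 0, nor broadcast,
 * nor already taken by another instance. Crude entropy; the kernel code
 * uses get_random_bytes() instead of rand(). */
static uint32_t pick_saddr(const uint32_t *used, size_t n)
{
    uint32_t addr;

    do {
        addr = ((uint32_t) rand() << 16) ^ (uint32_t) rand();
    } while (addr == 0 || addr == ADDR_BROADCAST || addr_in_use(addr, used, n));
    return addr;
}

int main(void)
{
    uint32_t used[] = { 0x12345678 };

    srand((unsigned) time(NULL));
    printf("chose saddr 0x%08x\n", pick_saddr(used, 1));
    return 0;
}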
+ * + */ +static void __irlap_close(struct irlap_cb *self) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Stop timers */ + del_timer(&self->slot_timer); + del_timer(&self->query_timer); + del_timer(&self->discovery_timer); + del_timer(&self->final_timer); + del_timer(&self->poll_timer); + del_timer(&self->wd_timer); + del_timer(&self->backoff_timer); + del_timer(&self->media_busy_timer); + + irlap_flush_all_queues(self); + + self->magic = 0; + + kfree(self); +} + +/* + * Function irlap_close (self) + * + * Remove IrLAP instance + * + */ +void irlap_close(struct irlap_cb *self) +{ + struct irlap_cb *lap; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* We used to send a LAP_DISC_INDICATION here, but this was + * racy. This has been move within irlmp_unregister_link() + * itself. Jean II */ + + /* Kill the LAP and all LSAPs on top of it */ + irlmp_unregister_link(self->saddr); + self->notify.instance = NULL; + + /* Be sure that we manage to remove ourself from the hash */ + lap = hashbin_remove(irlap, self->saddr, NULL); + if (!lap) { + IRDA_DEBUG(1, "%s(), Didn't find myself!\n", __FUNCTION__); + return; + } + __irlap_close(lap); +} +EXPORT_SYMBOL(irlap_close); + +/* + * Function irlap_connect_indication (self, skb) + * + * Another device is attempting to make a connection + * + */ +void irlap_connect_indication(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_init_qos_capabilities(self, NULL); /* No user QoS! */ + + irlmp_link_connect_indication(self->notify.instance, self->saddr, + self->daddr, &self->qos_tx, skb); +} + +/* + * Function irlap_connect_response (self, skb) + * + * Service user has accepted incoming connection + * + */ +void irlap_connect_response(struct irlap_cb *self, struct sk_buff *userdata) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + irlap_do_event(self, CONNECT_RESPONSE, userdata, NULL); +} + +/* + * Function irlap_connect_request (self, daddr, qos_user, sniff) + * + * Request connection with another device, sniffing is not implemented + * yet. 
+ * + */ +void irlap_connect_request(struct irlap_cb *self, __u32 daddr, + struct qos_info *qos_user, int sniff) +{ + IRDA_DEBUG(3, "%s(), daddr=0x%08x\n", __FUNCTION__, daddr); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + self->daddr = daddr; + + /* + * If the service user specifies QoS values for this connection, + * then use them + */ + irlap_init_qos_capabilities(self, qos_user); + + if ((self->state == LAP_NDM) && !self->media_busy) + irlap_do_event(self, CONNECT_REQUEST, NULL, NULL); + else + self->connect_pending = TRUE; +} + +/* + * Function irlap_connect_confirm (self, skb) + * + * Connection request has been accepted + * + */ +void irlap_connect_confirm(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlmp_link_connect_confirm(self->notify.instance, &self->qos_tx, skb); +} + +/* + * Function irlap_data_indication (self, skb) + * + * Received data frames from IR-port, so we just pass them up to + * IrLMP for further processing + * + */ +void irlap_data_indication(struct irlap_cb *self, struct sk_buff *skb, + int unreliable) +{ + /* Hide LAP header from IrLMP layer */ + skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + irlmp_link_data_indication(self->notify.instance, skb, unreliable); +} + + +/* + * Function irlap_data_request (self, skb) + * + * Queue data for transmission, must wait until XMIT state + * + */ +void irlap_data_request(struct irlap_cb *self, struct sk_buff *skb, + int unreliable) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER), + return;); + skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + /* + * Must set frame format now so that the rest of the code knows + * if its dealing with an I or an UI frame + */ + if (unreliable) + skb->data[1] = UI_FRAME; + else + skb->data[1] = I_FRAME; + + /* Don't forget to refcount it - see irlmp_connect_request(). */ + skb_get(skb); + + /* Add at the end of the queue (keep ordering) - Jean II */ + skb_queue_tail(&self->txq, skb); + + /* + * Send event if this frame only if we are in the right state + * FIXME: udata should be sent first! (skb_queue_head?) + */ + if ((self->state == LAP_XMIT_P) || (self->state == LAP_XMIT_S)) { + /* If we are not already processing the Tx queue, trigger + * transmission immediately - Jean II */ + if((skb_queue_len(&self->txq) <= 1) && (!self->local_busy)) + irlap_do_event(self, DATA_REQUEST, skb, NULL); + /* Otherwise, the packets will be sent normally at the + * next pf-poll - Jean II */ + } +} + +/* + * Function irlap_unitdata_request (self, skb) + * + * Send Ultra data. 
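irlap_data_request() above insists that the caller reserved LAP_ADDR_HEADER+LAP_CTRL_HEADER bytes of headroom, then skb_push()es those header bytes in front of the payload and marks the control byte as I_FRAME or UI_FRAME. The sketch below models the reserve-then-push idea on a flat buffer, assuming each of the two LAP header fields is a single byte, which is what the data[0]/data[1] assignments in the surrounding code suggest:

#include <stdio.h>
#include <string.h>

#define HEADROOM 2   /* assumed: 1 address byte + 1 control byte */

struct buf {
    unsigned char data[64];
    size_t head;     /* offset of the first valid byte */
    size_t len;      /* number of valid bytes */
};

/* Place the payload after a reserved gap so headers can be prepended
 * later without moving the payload (the skb_reserve/skb_push trick). */
static void put_payload(struct buf *b, const void *p, size_t n)
{
    b->head = HEADROOM;
    memcpy(b->data + b->head, p, n);
    b->len = n;
}

static int push_lap_header(struct buf *b, unsigned char caddr,
                           unsigned char control)
{
    if (b->head < 2)
        return -1;          /* not enough headroom */
    b->head -= 2;
    b->len += 2;
    b->data[b->head]     = caddr;
    b->data[b->head + 1] = control;
    return 0;
}

int main(void)
{
    struct buf b;

    put_payload(&b, "payload", 7);
    push_lap_header(&b, 0xfe, 0x13);
    printf("frame is %zu bytes, first byte 0x%02x\n", b.len, b.data[b.head]);
    return 0;
}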
This is data that must be sent outside any connection + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlap_unitdata_request(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(skb_headroom(skb) >= (LAP_ADDR_HEADER+LAP_CTRL_HEADER), + return;); + skb_push(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + skb->data[0] = CBROADCAST; + skb->data[1] = UI_FRAME; + + /* Don't need to refcount, see irlmp_connless_data_request() */ + + skb_queue_tail(&self->txq_ultra, skb); + + irlap_do_event(self, SEND_UI_FRAME, NULL, NULL); +} +#endif /*CONFIG_IRDA_ULTRA */ + +/* + * Function irlap_udata_indication (self, skb) + * + * Receive Ultra data. This is data that is received outside any connection + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlap_unitdata_indication(struct irlap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Hide LAP header from IrLMP layer */ + skb_pull(skb, LAP_ADDR_HEADER+LAP_CTRL_HEADER); + + irlmp_link_unitdata_indication(self->notify.instance, skb); +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irlap_disconnect_request (void) + * + * Request to disconnect connection by service user + */ +void irlap_disconnect_request(struct irlap_cb *self) +{ + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Don't disconnect until all data frames are successfully sent */ + if (skb_queue_len(&self->txq) > 0) { + self->disconnect_pending = TRUE; + + return; + } + + /* Check if we are in the right state for disconnecting */ + switch (self->state) { + case LAP_XMIT_P: /* FALLTROUGH */ + case LAP_XMIT_S: /* FALLTROUGH */ + case LAP_CONN: /* FALLTROUGH */ + case LAP_RESET_WAIT: /* FALLTROUGH */ + case LAP_RESET_CHECK: + irlap_do_event(self, DISCONNECT_REQUEST, NULL, NULL); + break; + default: + IRDA_DEBUG(2, "%s(), disconnect pending!\n", __FUNCTION__); + self->disconnect_pending = TRUE; + break; + } +} + +/* + * Function irlap_disconnect_indication (void) + * + * Disconnect request from other device + * + */ +void irlap_disconnect_indication(struct irlap_cb *self, LAP_REASON reason) +{ + IRDA_DEBUG(1, "%s(), reason=%s\n", __FUNCTION__, lap_reasons[reason]); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Flush queues */ + irlap_flush_all_queues(self); + + switch (reason) { + case LAP_RESET_INDICATION: + IRDA_DEBUG(1, "%s(), Sending reset request!\n", __FUNCTION__); + irlap_do_event(self, RESET_REQUEST, NULL, NULL); + break; + case LAP_NO_RESPONSE: /* FALLTROUGH */ + case LAP_DISC_INDICATION: /* FALLTROUGH */ + case LAP_FOUND_NONE: /* FALLTROUGH */ + case LAP_MEDIA_BUSY: + irlmp_link_disconnect_indication(self->notify.instance, self, + reason, NULL); + break; + default: + IRDA_ERROR("%s: Unknown reason %d\n", __FUNCTION__, reason); + } +} + +/* + * Function irlap_discovery_request (gen_addr_bit) + * + * Start one single discovery operation. 
+ * + */ +void irlap_discovery_request(struct irlap_cb *self, discovery_t *discovery) +{ + struct irlap_info info; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(discovery != NULL, return;); + + IRDA_DEBUG(4, "%s(), nslots = %d\n", __FUNCTION__, discovery->nslots); + + IRDA_ASSERT((discovery->nslots == 1) || (discovery->nslots == 6) || + (discovery->nslots == 8) || (discovery->nslots == 16), + return;); + + /* Discovery is only possible in NDM mode */ + if (self->state != LAP_NDM) { + IRDA_DEBUG(4, "%s(), discovery only possible in NDM mode\n", + __FUNCTION__); + irlap_discovery_confirm(self, NULL); + /* Note : in theory, if we are not in NDM, we could postpone + * the discovery like we do for connection request. + * In practice, it's not worth it. If the media was busy, + * it's likely next time around it won't be busy. If we are + * in REPLY state, we will get passive discovery info & event. + * Jean II */ + return; + } + + /* Check if last discovery request finished in time, or if + * it was aborted due to the media busy flag. */ + if (self->discovery_log != NULL) { + hashbin_delete(self->discovery_log, (FREE_FUNC) kfree); + self->discovery_log = NULL; + } + + /* All operations will occur at predictable time, no need to lock */ + self->discovery_log = hashbin_new(HB_NOLOCK); + + if (self->discovery_log == NULL) { + IRDA_WARNING("%s(), Unable to allocate discovery log!\n", + __FUNCTION__); + return; + } + + info.S = discovery->nslots; /* Number of slots */ + info.s = 0; /* Current slot */ + + self->discovery_cmd = discovery; + info.discovery = discovery; + + /* sysctl_slot_timeout bounds are checked in irsysctl.c - Jean II */ + self->slot_timeout = sysctl_slot_timeout * HZ / 1000; + + irlap_do_event(self, DISCOVERY_REQUEST, NULL, &info); +} + +/* + * Function irlap_discovery_confirm (log) + * + * A device has been discovered in front of this station, we + * report directly to LMP. + */ +void irlap_discovery_confirm(struct irlap_cb *self, hashbin_t *discovery_log) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + IRDA_ASSERT(self->notify.instance != NULL, return;); + + /* + * Check for successful discovery, since we are then allowed to clear + * the media busy condition (IrLAP 6.13.4 - p.94). This should allow + * us to make connection attempts much faster and easier (i.e. no + * collisions). + * Setting media busy to false will also generate an event allowing + * to process pending events in NDM state machine. + * Note : the spec doesn't define what's a successful discovery is. + * If we want Ultra to work, it's successful even if there is + * nobody discovered - Jean II + */ + if (discovery_log) + irda_device_set_media_busy(self->netdev, FALSE); + + /* Inform IrLMP */ + irlmp_link_discovery_confirm(self->notify.instance, discovery_log); +} + +/* + * Function irlap_discovery_indication (log) + * + * Somebody is trying to discover us! + * + */ +void irlap_discovery_indication(struct irlap_cb *self, discovery_t *discovery) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(discovery != NULL, return;); + + IRDA_ASSERT(self->notify.instance != NULL, return;); + + /* A device is very likely to connect immediately after it performs + * a successful discovery. This means that in our case, we are much + * more likely to receive a connection request over the medium. 
+ * So, we backoff to avoid collisions. + * IrLAP spec 6.13.4 suggest 100ms... + * Note : this little trick actually make a *BIG* difference. If I set + * my Linux box with discovery enabled and one Ultra frame sent every + * second, my Palm has no trouble connecting to it every time ! + * Jean II */ + irda_device_set_media_busy(self->netdev, SMALL); + + irlmp_link_discovery_indication(self->notify.instance, discovery); +} + +/* + * Function irlap_status_indication (quality_of_link) + */ +void irlap_status_indication(struct irlap_cb *self, int quality_of_link) +{ + switch (quality_of_link) { + case STATUS_NO_ACTIVITY: + IRDA_MESSAGE("IrLAP, no activity on link!\n"); + break; + case STATUS_NOISY: + IRDA_MESSAGE("IrLAP, noisy link!\n"); + break; + default: + break; + } + irlmp_status_indication(self->notify.instance, + quality_of_link, LOCK_NO_CHANGE); +} + +/* + * Function irlap_reset_indication (void) + */ +void irlap_reset_indication(struct irlap_cb *self) +{ + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + if (self->state == LAP_RESET_WAIT) + irlap_do_event(self, RESET_REQUEST, NULL, NULL); + else + irlap_do_event(self, RESET_RESPONSE, NULL, NULL); +} + +/* + * Function irlap_reset_confirm (void) + */ +void irlap_reset_confirm(void) +{ + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); +} + +/* + * Function irlap_generate_rand_time_slot (S, s) + * + * Generate a random time slot between s and S-1 where + * S = Number of slots (0 -> S-1) + * s = Current slot + */ +int irlap_generate_rand_time_slot(int S, int s) +{ + static int rand; + int slot; + + IRDA_ASSERT((S - s) > 0, return 0;); + + rand += jiffies; + rand ^= (rand << 12); + rand ^= (rand >> 20); + + slot = s + rand % (S-s); + + IRDA_ASSERT((slot >= s) || (slot < S), return 0;); + + return slot; +} + +/* + * Function irlap_update_nr_received (nr) + * + * Remove all acknowledged frames in current window queue. This code is + * not intuitive and you should not try to change it. If you think it + * contains bugs, please mail a patch to the author instead. + */ +void irlap_update_nr_received(struct irlap_cb *self, int nr) +{ + struct sk_buff *skb = NULL; + int count = 0; + + /* + * Remove all the ack-ed frames from the window queue. + */ + + /* + * Optimize for the common case. It is most likely that the receiver + * will acknowledge all the frames we have sent! So in that case we + * delete all frames stored in window. + */ + if (nr == self->vs) { + while ((skb = skb_dequeue(&self->wx_list)) != NULL) { + dev_kfree_skb(skb); + } + /* The last acked frame is the next to send minus one */ + self->va = nr - 1; + } else { + /* Remove all acknowledged frames in current window */ + while ((skb_peek(&self->wx_list) != NULL) && + (((self->va+1) % 8) != nr)) + { + skb = skb_dequeue(&self->wx_list); + dev_kfree_skb(skb); + + self->va = (self->va + 1) % 8; + count++; + } + } + + /* Advance window */ + self->window = self->window_size - skb_queue_len(&self->wx_list); +} + +/* + * Function irlap_validate_ns_received (ns) + * + * Validate the next to send (ns) field from received frame. + */ +int irlap_validate_ns_received(struct irlap_cb *self, int ns) +{ + /* ns as expected? */ + if (ns == self->vr) + return NS_EXPECTED; + /* + * Stations are allowed to treat invalid NS as unexpected NS + * IrLAP, Recv ... with-invalid-Ns. p. 
84 + */ + return NS_UNEXPECTED; + + /* return NR_INVALID; */ +} +/* + * Function irlap_validate_nr_received (nr) + * + * Validate the next to receive (nr) field from received frame. + * + */ +int irlap_validate_nr_received(struct irlap_cb *self, int nr) +{ + /* nr as expected? */ + if (nr == self->vs) { + IRDA_DEBUG(4, "%s(), expected!\n", __FUNCTION__); + return NR_EXPECTED; + } + + /* + * unexpected nr? (but within current window), first we check if the + * ns numbers of the frames in the current window wrap. + */ + if (self->va < self->vs) { + if ((nr >= self->va) && (nr <= self->vs)) + return NR_UNEXPECTED; + } else { + if ((nr >= self->va) || (nr <= self->vs)) + return NR_UNEXPECTED; + } + + /* Invalid nr! */ + return NR_INVALID; +} + +/* + * Function irlap_initiate_connection_state () + * + * Initialize the connection state parameters + * + */ +void irlap_initiate_connection_state(struct irlap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Next to send and next to receive */ + self->vs = self->vr = 0; + + /* Last frame which got acked (0 - 1) % 8 */ + self->va = 7; + + self->window = 1; + + self->remote_busy = FALSE; + self->retry_count = 0; +} + +/* + * Function irlap_wait_min_turn_around (self, qos) + * + * Wait negotiated minimum turn around time, this function actually sets + * the number of BOS's that must be sent before the next transmitted + * frame in order to delay for the specified amount of time. This is + * done to avoid using timers, and the forbidden udelay! + */ +void irlap_wait_min_turn_around(struct irlap_cb *self, struct qos_info *qos) +{ + __u32 min_turn_time; + __u32 speed; + + /* Get QoS values. */ + speed = qos->baud_rate.value; + min_turn_time = qos->min_turn_time.value; + + /* No need to calculate XBOFs for speeds over 115200 bps */ + if (speed > 115200) { + self->mtt_required = min_turn_time; + return; + } + + /* + * Send additional BOF's for the next frame for the requested + * min turn time, so now we must calculate how many chars (XBOF's) we + * must send for the requested time period (min turn time) + */ + self->xbofs_delay = irlap_min_turn_time_in_bytes(speed, min_turn_time); +} + +/* + * Function irlap_flush_all_queues (void) + * + * Flush all queues + * + */ +void irlap_flush_all_queues(struct irlap_cb *self) +{ + struct sk_buff* skb; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Free transmission queue */ + while ((skb = skb_dequeue(&self->txq)) != NULL) + dev_kfree_skb(skb); + + while ((skb = skb_dequeue(&self->txq_ultra)) != NULL) + dev_kfree_skb(skb); + + /* Free sliding window buffered packets */ + while ((skb = skb_dequeue(&self->wx_list)) != NULL) + dev_kfree_skb(skb); +} + +/* + * Function irlap_setspeed (self, speed) + * + * Change the speed of the IrDA port + * + */ +static void irlap_change_speed(struct irlap_cb *self, __u32 speed, int now) +{ + struct sk_buff *skb; + + IRDA_DEBUG(0, "%s(), setting speed to %d\n", __FUNCTION__, speed); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + self->speed = speed; + + /* Change speed now, or just piggyback speed on frames */ + if (now) { + /* Send down empty frame to trigger speed change */ + skb = dev_alloc_skb(0); + irlap_queue_xmit(self, skb); + } +} + +/* + * Function irlap_init_qos_capabilities (self, qos) + * + * Initialize QoS for this IrLAP session, What we do is to compute the + * intersection of 
the QoS capabilities for the user, driver and for + * IrLAP itself. Normally, IrLAP will not specify any values, but it can + * be used to restrict certain values. + */ +static void irlap_init_qos_capabilities(struct irlap_cb *self, + struct qos_info *qos_user) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(self->netdev != NULL, return;); + + /* Start out with the maximum QoS support possible */ + irda_init_max_qos_capabilies(&self->qos_rx); + + /* Apply drivers QoS capabilities */ + irda_qos_compute_intersection(&self->qos_rx, self->qos_dev); + + /* + * Check for user supplied QoS parameters. The service user is only + * allowed to supply these values. We check each parameter since the + * user may not have set all of them. + */ + if (qos_user) { + IRDA_DEBUG(1, "%s(), Found user specified QoS!\n", __FUNCTION__); + + if (qos_user->baud_rate.bits) + self->qos_rx.baud_rate.bits &= qos_user->baud_rate.bits; + + if (qos_user->max_turn_time.bits) + self->qos_rx.max_turn_time.bits &= qos_user->max_turn_time.bits; + if (qos_user->data_size.bits) + self->qos_rx.data_size.bits &= qos_user->data_size.bits; + + if (qos_user->link_disc_time.bits) + self->qos_rx.link_disc_time.bits &= qos_user->link_disc_time.bits; + } + + /* Use 500ms in IrLAP for now */ + self->qos_rx.max_turn_time.bits &= 0x01; + + /* Set data size */ + /*self->qos_rx.data_size.bits &= 0x03;*/ + + irda_qos_bits_to_value(&self->qos_rx); +} + +/* + * Function irlap_apply_default_connection_parameters (void, now) + * + * Use the default connection and transmission parameters + */ +void irlap_apply_default_connection_parameters(struct irlap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* xbofs : Default value in NDM */ + self->next_bofs = 12; + self->bofs_count = 12; + + /* NDM Speed is 9600 */ + irlap_change_speed(self, 9600, TRUE); + + /* Set mbusy when going to NDM state */ + irda_device_set_media_busy(self->netdev, TRUE); + + /* + * Generate random connection address for this session, which must + * be 7 bits wide and different from 0x00 and 0xfe + */ + while ((self->caddr == 0x00) || (self->caddr == 0xfe)) { + get_random_bytes(&self->caddr, sizeof(self->caddr)); + self->caddr &= 0xfe; + } + + /* Use default values until connection has been negitiated */ + self->slot_timeout = sysctl_slot_timeout; + self->final_timeout = FINAL_TIMEOUT; + self->poll_timeout = POLL_TIMEOUT; + self->wd_timeout = WD_TIMEOUT; + + /* Set some default values */ + self->qos_tx.baud_rate.value = 9600; + self->qos_rx.baud_rate.value = 9600; + self->qos_tx.max_turn_time.value = 0; + self->qos_rx.max_turn_time.value = 0; + self->qos_tx.min_turn_time.value = 0; + self->qos_rx.min_turn_time.value = 0; + self->qos_tx.data_size.value = 64; + self->qos_rx.data_size.value = 64; + self->qos_tx.window_size.value = 1; + self->qos_rx.window_size.value = 1; + self->qos_tx.additional_bofs.value = 12; + self->qos_rx.additional_bofs.value = 12; + self->qos_tx.link_disc_time.value = 0; + self->qos_rx.link_disc_time.value = 0; + + irlap_flush_all_queues(self); + + self->disconnect_pending = FALSE; + self->connect_pending = FALSE; +} + +/* + * Function irlap_apply_connection_parameters (qos, now) + * + * Initialize IrLAP with the negotiated QoS values + * + * If 'now' is false, the speed and xbofs will be changed after the next + * frame is sent. 
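irlap_init_qos_capabilities() above treats every negotiable QoS parameter as a bit field in which each bit stands for one allowed value, so restricting the session is just ANDing the driver's, the user's and IrLAP's own masks together, after which irda_qos_bits_to_value() maps the surviving bits back to concrete numbers. A simplified model of that negotiation, with a made-up baud-rate table (the real value tables live in the IrDA qos code, not in this file):

#include <stdio.h>

/* Made-up baud-rate table: bit i in a capability mask allows rates[i]. */
static const int rates[] = { 2400, 9600, 19200, 38400, 57600, 115200 };
#define NRATES ((int)(sizeof(rates) / sizeof(rates[0])))

/* Intersect capability masks and pick the highest rate both sides allow. */
static int negotiate_rate(unsigned local, unsigned peer, unsigned user)
{
    unsigned common = local & peer;

    if (user)                  /* user restriction is optional */
        common &= user;
    for (int i = NRATES - 1; i >= 0; i--)
        if (common & (1u << i))
            return rates[i];
    return -1;                 /* no value in common */
}

int main(void)
{
    /* local supports everything, peer tops out at 57600, user caps at 38400 */
    printf("negotiated %d bps\n", negotiate_rate(0x3f, 0x1f, 0x0f));
    return 0;
}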
+ * If 'now' is true, the speed and xbofs is changed immediately + */ +void irlap_apply_connection_parameters(struct irlap_cb *self, int now) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Set the negotiated xbofs value */ + self->next_bofs = self->qos_tx.additional_bofs.value; + if (now) + self->bofs_count = self->next_bofs; + + /* Set the negotiated link speed (may need the new xbofs value) */ + irlap_change_speed(self, self->qos_tx.baud_rate.value, now); + + self->window_size = self->qos_tx.window_size.value; + self->window = self->qos_tx.window_size.value; + +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* + * Calculate how many bytes it is possible to transmit before the + * link must be turned around + */ + self->line_capacity = + irlap_max_line_capacity(self->qos_tx.baud_rate.value, + self->qos_tx.max_turn_time.value); + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + + + /* + * Initialize timeout values, some of the rules are listed on + * page 92 in IrLAP. + */ + IRDA_ASSERT(self->qos_tx.max_turn_time.value != 0, return;); + IRDA_ASSERT(self->qos_rx.max_turn_time.value != 0, return;); + /* The poll timeout applies only to the primary station. + * It defines the maximum time the primary stay in XMIT mode + * before timeout and turning the link around (sending a RR). + * Or, this is how much we can keep the pf bit in primary mode. + * Therefore, it must be lower or equal than our *OWN* max turn around. + * Jean II */ + self->poll_timeout = self->qos_tx.max_turn_time.value * HZ / 1000; + /* The Final timeout applies only to the primary station. + * It defines the maximum time the primary wait (mostly in RECV mode) + * for an answer from the secondary station before polling it again. + * Therefore, it must be greater or equal than our *PARTNER* + * max turn around time - Jean II */ + self->final_timeout = self->qos_rx.max_turn_time.value * HZ / 1000; + /* The Watchdog Bit timeout applies only to the secondary station. + * It defines the maximum time the secondary wait (mostly in RECV mode) + * for poll from the primary station before getting annoyed. + * Therefore, it must be greater or equal than our *PARTNER* + * max turn around time - Jean II */ + self->wd_timeout = self->final_timeout * 2; + + /* + * N1 and N2 are maximum retry count for *both* the final timer + * and the wd timer (with a factor 2) as defined above. + * After N1 retry of a timer, we give a warning to the user. + * After N2 retry, we consider the link dead and disconnect it. + * Jean II + */ + + /* + * Set N1 to 0 if Link Disconnect/Threshold Time = 3 and set it to + * 3 seconds otherwise. See page 71 in IrLAP for more details. + * Actually, it's not always 3 seconds, as we allow to set + * it via sysctl... Max maxtt is 500ms, and N1 need to be multiple + * of 2, so 1 second is minimum we can allow. - Jean II + */ + if (self->qos_tx.link_disc_time.value == sysctl_warn_noreply_time) + /* + * If we set N1 to 0, it will trigger immediately, which is + * not what we want. 
What we really want is to disable it, + * Jean II + */ + self->N1 = -2; /* Disable - Need to be multiple of 2*/ + else + self->N1 = sysctl_warn_noreply_time * 1000 / + self->qos_rx.max_turn_time.value; + + IRDA_DEBUG(4, "Setting N1 = %d\n", self->N1); + + /* Set N2 to match our own disconnect time */ + self->N2 = self->qos_tx.link_disc_time.value * 1000 / + self->qos_rx.max_turn_time.value; + IRDA_DEBUG(4, "Setting N2 = %d\n", self->N2); +} + +#ifdef CONFIG_PROC_FS +struct irlap_iter_state { + int id; +}; + +static void *irlap_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct irlap_iter_state *iter = seq->private; + struct irlap_cb *self; + + /* Protect our access to the tsap list */ + spin_lock_irq(&irlap->hb_spinlock); + iter->id = 0; + + for (self = (struct irlap_cb *) hashbin_get_first(irlap); + self; self = (struct irlap_cb *) hashbin_get_next(irlap)) { + if (iter->id == *pos) + break; + ++iter->id; + } + + return self; +} + +static void *irlap_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct irlap_iter_state *iter = seq->private; + + ++*pos; + ++iter->id; + return (void *) hashbin_get_next(irlap); +} + +static void irlap_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irlap->hb_spinlock); +} + +static int irlap_seq_show(struct seq_file *seq, void *v) +{ + const struct irlap_iter_state *iter = seq->private; + const struct irlap_cb *self = v; + + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EINVAL;); + + seq_printf(seq, "irlap%d ", iter->id); + seq_printf(seq, "state: %s\n", + irlap_state[self->state]); + + seq_printf(seq, " device name: %s, ", + (self->netdev) ? self->netdev->name : "bug"); + seq_printf(seq, "hardware name: %s\n", self->hw_name); + + seq_printf(seq, " caddr: %#02x, ", self->caddr); + seq_printf(seq, "saddr: %#08x, ", self->saddr); + seq_printf(seq, "daddr: %#08x\n", self->daddr); + + seq_printf(seq, " win size: %d, ", + self->window_size); + seq_printf(seq, "win: %d, ", self->window); +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + seq_printf(seq, "line capacity: %d, ", + self->line_capacity); + seq_printf(seq, "bytes left: %d\n", self->bytes_left); +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + seq_printf(seq, " tx queue len: %d ", + skb_queue_len(&self->txq)); + seq_printf(seq, "win queue len: %d ", + skb_queue_len(&self->wx_list)); + seq_printf(seq, "rbusy: %s", self->remote_busy ? + "TRUE" : "FALSE"); + seq_printf(seq, " mbusy: %s\n", self->media_busy ? 
+ "TRUE" : "FALSE"); + + seq_printf(seq, " retrans: %d ", self->retry_count); + seq_printf(seq, "vs: %d ", self->vs); + seq_printf(seq, "vr: %d ", self->vr); + seq_printf(seq, "va: %d\n", self->va); + + seq_printf(seq, " qos\tbps\tmaxtt\tdsize\twinsize\taddbofs\tmintt\tldisc\tcomp\n"); + + seq_printf(seq, " tx\t%d\t", + self->qos_tx.baud_rate.value); + seq_printf(seq, "%d\t", + self->qos_tx.max_turn_time.value); + seq_printf(seq, "%d\t", + self->qos_tx.data_size.value); + seq_printf(seq, "%d\t", + self->qos_tx.window_size.value); + seq_printf(seq, "%d\t", + self->qos_tx.additional_bofs.value); + seq_printf(seq, "%d\t", + self->qos_tx.min_turn_time.value); + seq_printf(seq, "%d\t", + self->qos_tx.link_disc_time.value); + seq_printf(seq, "\n"); + + seq_printf(seq, " rx\t%d\t", + self->qos_rx.baud_rate.value); + seq_printf(seq, "%d\t", + self->qos_rx.max_turn_time.value); + seq_printf(seq, "%d\t", + self->qos_rx.data_size.value); + seq_printf(seq, "%d\t", + self->qos_rx.window_size.value); + seq_printf(seq, "%d\t", + self->qos_rx.additional_bofs.value); + seq_printf(seq, "%d\t", + self->qos_rx.min_turn_time.value); + seq_printf(seq, "%d\n", + self->qos_rx.link_disc_time.value); + + return 0; +} + +static struct seq_operations irlap_seq_ops = { + .start = irlap_seq_start, + .next = irlap_seq_next, + .stop = irlap_seq_stop, + .show = irlap_seq_show, +}; + +static int irlap_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct irlap_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); + + if (!s) + goto out; + + if (irlap == NULL) { + rc = -EINVAL; + goto out_kfree; + } + + rc = seq_open(file, &irlap_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +struct file_operations irlap_seq_fops = { + .owner = THIS_MODULE, + .open = irlap_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* CONFIG_PROC_FS */ diff --git a/net/irda/irlap_event.c b/net/irda/irlap_event.c new file mode 100644 index 000000000000..1cd89f5f3b75 --- /dev/null +++ b/net/irda/irlap_event.c @@ -0,0 +1,2334 @@ +/********************************************************************* + * + * Filename: irlap_event.c + * Version: 0.9 + * Description: IrLAP state machine implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Aug 16 00:59:29 1997 + * Modified at: Sat Dec 25 21:07:57 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * Copyright (c) 1998 Thomas Davis + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include /* irlmp_flow_indication(), ... 
*/ + +#include + +#ifdef CONFIG_IRDA_FAST_RR +int sysctl_fast_poll_increase = 50; +#endif + +static int irlap_state_ndm (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_query (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reply (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_conn (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_setup (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_xmit_p (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_pclose (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_nrm_p (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reset (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_nrm_s (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_xmit_s (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_sclose (struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info); +static int irlap_state_reset_check(struct irlap_cb *, IRLAP_EVENT event, + struct sk_buff *, struct irlap_info *); + +#ifdef CONFIG_IRDA_DEBUG +static const char *irlap_event[] = { + "DISCOVERY_REQUEST", + "CONNECT_REQUEST", + "CONNECT_RESPONSE", + "DISCONNECT_REQUEST", + "DATA_REQUEST", + "RESET_REQUEST", + "RESET_RESPONSE", + "SEND_I_CMD", + "SEND_UI_FRAME", + "RECV_DISCOVERY_XID_CMD", + "RECV_DISCOVERY_XID_RSP", + "RECV_SNRM_CMD", + "RECV_TEST_CMD", + "RECV_TEST_RSP", + "RECV_UA_RSP", + "RECV_DM_RSP", + "RECV_RD_RSP", + "RECV_I_CMD", + "RECV_I_RSP", + "RECV_UI_FRAME", + "RECV_FRMR_RSP", + "RECV_RR_CMD", + "RECV_RR_RSP", + "RECV_RNR_CMD", + "RECV_RNR_RSP", + "RECV_REJ_CMD", + "RECV_REJ_RSP", + "RECV_SREJ_CMD", + "RECV_SREJ_RSP", + "RECV_DISC_CMD", + "SLOT_TIMER_EXPIRED", + "QUERY_TIMER_EXPIRED", + "FINAL_TIMER_EXPIRED", + "POLL_TIMER_EXPIRED", + "DISCOVERY_TIMER_EXPIRED", + "WD_TIMER_EXPIRED", + "BACKOFF_TIMER_EXPIRED", + "MEDIA_BUSY_TIMER_EXPIRED", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +const char *irlap_state[] = { + "LAP_NDM", + "LAP_QUERY", + "LAP_REPLY", + "LAP_CONN", + "LAP_SETUP", + "LAP_OFFLINE", + "LAP_XMIT_P", + "LAP_PCLOSE", + "LAP_NRM_P", + "LAP_RESET_WAIT", + "LAP_RESET", + "LAP_NRM_S", + "LAP_XMIT_S", + "LAP_SCLOSE", + "LAP_RESET_CHECK", +}; + +static int (*state[])(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) = +{ + irlap_state_ndm, + irlap_state_query, + irlap_state_reply, + irlap_state_conn, + irlap_state_setup, + irlap_state_offline, + irlap_state_xmit_p, + irlap_state_pclose, + irlap_state_nrm_p, + irlap_state_reset_wait, + irlap_state_reset, + irlap_state_nrm_s, + irlap_state_xmit_s, + irlap_state_sclose, + irlap_state_reset_check, +}; + +/* + * 
Function irda_poll_timer_expired (data) + * + * Poll timer has expired. Normally we must now send a RR frame to the + * remote device + */ +static void irlap_poll_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Calculate and set time before we will have to send back the pf bit + * to the peer. Use in primary. + * Make sure that state is XMIT_P/XMIT_S when calling this function + * (and that nobody messed up with the state). - Jean II + */ +static void irlap_start_poll_timer(struct irlap_cb *self, int timeout) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + +#ifdef CONFIG_IRDA_FAST_RR + /* + * Send out the RR frames faster if our own transmit queue is empty, or + * if the peer is busy. The effect is a much faster conversation + */ + if ((skb_queue_len(&self->txq) == 0) || (self->remote_busy)) { + if (self->fast_RR == TRUE) { + /* + * Assert that the fast poll timer has not reached the + * normal poll timer yet + */ + if (self->fast_RR_timeout < timeout) { + /* + * FIXME: this should be a more configurable + * function + */ + self->fast_RR_timeout += + (sysctl_fast_poll_increase * HZ/1000); + + /* Use this fast(er) timeout instead */ + timeout = self->fast_RR_timeout; + } + } else { + self->fast_RR = TRUE; + + /* Start with just 0 ms */ + self->fast_RR_timeout = 0; + timeout = 0; + } + } else + self->fast_RR = FALSE; + + IRDA_DEBUG(3, "%s(), timeout=%d (%ld)\n", __FUNCTION__, timeout, jiffies); +#endif /* CONFIG_IRDA_FAST_RR */ + + if (timeout == 0) + irlap_do_event(self, POLL_TIMER_EXPIRED, NULL, NULL); + else + irda_start_timer(&self->poll_timer, timeout, self, + irlap_poll_timer_expired); +} + +/* + * Function irlap_do_event (event, skb, info) + * + * Rushes through the state machine without any delay. If state == XMIT + * then send queued data frames. + */ +void irlap_do_event(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret; + + if (!self || self->magic != LAP_MAGIC) + return; + + IRDA_DEBUG(3, "%s(), event = %s, state = %s\n", __FUNCTION__, + irlap_event[event], irlap_state[self->state]); + + ret = (*state[self->state])(self, event, skb, info); + + /* + * Check if there are any pending events that needs to be executed + */ + switch (self->state) { + case LAP_XMIT_P: /* FALLTHROUGH */ + case LAP_XMIT_S: + /* + * We just received the pf bit and are at the beginning + * of a new LAP transmit window. + * Check if there are any queued data frames, and do not + * try to disconnect link if we send any data frames, since + * that will change the state away form XMIT + */ + IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, + skb_queue_len(&self->txq)); + + if (skb_queue_len(&self->txq)) { + /* Prevent race conditions with irlap_data_request() */ + self->local_busy = TRUE; + + /* Theory of operation. + * We send frames up to when we fill the window or + * reach line capacity. Those frames will queue up + * in the device queue, and the driver will slowly + * send them. + * After each frame that we send, we poll the higher + * layer for more data. It's the right time to do + * that because the link layer need to perform the mtt + * and then send the first frame, so we can afford + * to send a bit of time in kernel space. 
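In irlap_start_poll_timer() above, when the transmit queue is empty or the peer is busy, the poll timeout starts at zero and grows by sysctl_fast_poll_increase on every poll until it catches up with the normal timeout, trading a few extra RR frames for much lower latency on an idle link. A standalone sketch of that back-off, with times in milliseconds and the constants made up:

#include <stdio.h>

#define FAST_POLL_INCREASE_MS 50    /* mirrors sysctl_fast_poll_increase */

struct poll_state {
    int fast_rr;          /* currently in fast-poll mode? */
    int fast_timeout_ms;  /* current fast-poll timeout */
};

/* Return the timeout to use for the next poll; queue_empty/remote_busy
 * decide whether we enter (or stay in) fast-poll mode. */
static int next_poll_timeout(struct poll_state *p, int queue_empty,
                             int remote_busy, int normal_timeout_ms)
{
    if (!queue_empty && !remote_busy) {
        p->fast_rr = 0;                  /* real work queued: normal pace */
        return normal_timeout_ms;
    }
    if (!p->fast_rr) {
        p->fast_rr = 1;                  /* enter fast mode: poll at once */
        p->fast_timeout_ms = 0;
        return 0;
    }
    if (p->fast_timeout_ms < normal_timeout_ms)
        p->fast_timeout_ms += FAST_POLL_INCREASE_MS;
    return p->fast_timeout_ms < normal_timeout_ms ? p->fast_timeout_ms
                                                  : normal_timeout_ms;
}

int main(void)
{
    struct poll_state p = { 0, 0 };

    for (int i = 0; i < 5; i++)
        printf("poll #%d after %d ms\n", i, next_poll_timeout(&p, 1, 0, 120));
    return 0;
}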
+ * The explicit flow indication allow to minimise + * buffers (== lower latency), to avoid higher layer + * polling via timers (== less context switches) and + * to implement a crude scheduler - Jean II */ + + /* Try to send away all queued data frames */ + while ((skb = skb_dequeue(&self->txq)) != NULL) { + /* Send one frame */ + ret = (*state[self->state])(self, SEND_I_CMD, + skb, NULL); + /* Drop reference count. + * It will be increase as needed in + * irlap_send_data_xxx() */ + kfree_skb(skb); + + /* Poll the higher layers for one more frame */ + irlmp_flow_indication(self->notify.instance, + FLOW_START); + + if (ret == -EPROTO) + break; /* Try again later! */ + } + /* Finished transmitting */ + self->local_busy = FALSE; + } else if (self->disconnect_pending) { + self->disconnect_pending = FALSE; + + ret = (*state[self->state])(self, DISCONNECT_REQUEST, + NULL, NULL); + } + break; +/* case LAP_NDM: */ +/* case LAP_CONN: */ +/* case LAP_RESET_WAIT: */ +/* case LAP_RESET_CHECK: */ + default: + break; + } +} + +/* + * Function irlap_next_state (self, state) + * + * Switches state and provides debug information + * + */ +static inline void irlap_next_state(struct irlap_cb *self, IRLAP_STATE state) +{ + /* + if (!self || self->magic != LAP_MAGIC) + return; + + IRDA_DEBUG(4, "next LAP state = %s\n", irlap_state[state]); + */ + self->state = state; +} + +/* + * Function irlap_state_ndm (event, skb, frame) + * + * NDM (Normal Disconnected Mode) state + * + */ +static int irlap_state_ndm(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + discovery_t *discovery_rsp; + int ret = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case CONNECT_REQUEST: + IRDA_ASSERT(self->netdev != NULL, return -1;); + + if (self->media_busy) { + /* Note : this will never happen, because we test + * media busy in irlap_connect_request() and + * postpone the event... 
- Jean II */ + IRDA_DEBUG(0, "%s(), CONNECT_REQUEST: media busy!\n", + __FUNCTION__); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_MEDIA_BUSY); + } else { + irlap_send_snrm_frame(self, &self->qos_rx); + + /* Start Final-bit timer */ + irlap_start_final_timer(self, self->final_timeout); + + self->retry_count = 0; + irlap_next_state(self, LAP_SETUP); + } + break; + case RECV_SNRM_CMD: + /* Check if the frame contains and I field */ + if (info) { + self->daddr = info->daddr; + self->caddr = info->caddr; + + irlap_next_state(self, LAP_CONN); + + irlap_connect_indication(self, skb); + } else { + IRDA_DEBUG(0, "%s(), SNRM frame does not " + "contain an I field!\n", __FUNCTION__); + } + break; + case DISCOVERY_REQUEST: + IRDA_ASSERT(info != NULL, return -1;); + + if (self->media_busy) { + IRDA_DEBUG(1, "%s(), DISCOVERY_REQUEST: media busy!\n", + __FUNCTION__); + /* irlap->log.condition = MEDIA_BUSY; */ + + /* This will make IrLMP try again */ + irlap_discovery_confirm(self, NULL); + /* Note : the discovery log is not cleaned up here, + * it will be done in irlap_discovery_request() + * Jean II */ + return 0; + } + + self->S = info->S; + self->s = info->s; + irlap_send_discovery_xid_frame(self, info->S, info->s, TRUE, + info->discovery); + self->frame_sent = FALSE; + self->s++; + + irlap_start_slot_timer(self, self->slot_timeout); + irlap_next_state(self, LAP_QUERY); + break; + case RECV_DISCOVERY_XID_CMD: + IRDA_ASSERT(info != NULL, return -1;); + + /* Assert that this is not the final slot */ + if (info->s <= info->S) { + self->slot = irlap_generate_rand_time_slot(info->S, + info->s); + if (self->slot == info->s) { + discovery_rsp = irlmp_get_discovery_response(); + discovery_rsp->data.daddr = info->daddr; + + irlap_send_discovery_xid_frame(self, info->S, + self->slot, + FALSE, + discovery_rsp); + self->frame_sent = TRUE; + } else + self->frame_sent = FALSE; + + /* + * Go to reply state until end of discovery to + * inhibit our own transmissions. Set the timer + * to not stay forever there... Jean II + */ + irlap_start_query_timer(self, info->S, info->s); + irlap_next_state(self, LAP_REPLY); + } else { + /* This is the final slot. How is it possible ? + * This would happen is both discoveries are just slightly + * offset (if they are in sync, all packets are lost). + * Most often, all the discovery requests will be received + * in QUERY state (see my comment there), except for the + * last frame that will come here. + * The big trouble when it happen is that active discovery + * doesn't happen, because nobody answer the discoveries + * frame of the other guy, so the log shows up empty. + * What should we do ? + * Not much. It's too late to answer those discovery frames, + * so we just pass the info to IrLMP who will put it in the + * log (and post an event). + * Another cause would be devices that do discovery much + * slower than us, however the latest fixes should minimise + * those cases... + * Jean II + */ + IRDA_DEBUG(1, "%s(), Receiving final discovery request, missed the discovery slots :-(\n", __FUNCTION__); + + /* Last discovery request -> in the log */ + irlap_discovery_indication(self, info->discovery); + } + break; + case MEDIA_BUSY_TIMER_EXPIRED: + /* A bunch of events may be postponed because the media is + * busy (usually immediately after we close a connection), + * or while we are doing discovery (state query/reply). 
+ * In all those cases, the media busy flag will be cleared + * when it's OK for us to process those postponed events. + * This event is not mentioned in the state machines in the + * IrLAP spec. It's because they didn't consider Ultra and + * postponing connection request is optional. + * Jean II */ +#ifdef CONFIG_IRDA_ULTRA + /* Send any pending Ultra frames if any */ + if (!skb_queue_empty(&self->txq_ultra)) { + /* We don't send the frame, just post an event. + * Also, previously this code was in timer.c... + * Jean II */ + ret = (*state[self->state])(self, SEND_UI_FRAME, + NULL, NULL); + } +#endif /* CONFIG_IRDA_ULTRA */ + /* Check if we should try to connect. + * This code was previously in irlap_do_event() */ + if (self->connect_pending) { + self->connect_pending = FALSE; + + /* This one *should* not pend in this state, except + * if a socket try to connect and immediately + * disconnect. - clear - Jean II */ + if (self->disconnect_pending) + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + else + ret = (*state[self->state])(self, + CONNECT_REQUEST, + NULL, NULL); + self->disconnect_pending = FALSE; + } + /* Note : one way to test if this code works well (including + * media busy and small busy) is to create a user space + * application generating an Ultra packet every 3.05 sec (or + * 2.95 sec) and to see how it interact with discovery. + * It's fairly easy to check that no packet is lost, that the + * packets are postponed during discovery and that after + * discovery indication you have a 100ms "gap". + * As connection request and Ultra are now processed the same + * way, this avoid the tedious job of trying IrLAP connection + * in all those cases... + * Jean II */ + break; +#ifdef CONFIG_IRDA_ULTRA + case SEND_UI_FRAME: + { + int i; + /* Only allowed to repeat an operation twice */ + for (i=0; ((i<2) && (self->media_busy == FALSE)); i++) { + skb = skb_dequeue(&self->txq_ultra); + if (skb) + irlap_send_ui_frame(self, skb, CBROADCAST, + CMD_FRAME); + else + break; + /* irlap_send_ui_frame() won't increase skb reference + * count, so no dev_kfree_skb() - Jean II */ + } + if (i == 2) { + /* Force us to listen 500 ms again */ + irda_device_set_media_busy(self->netdev, TRUE); + } + break; + } + case RECV_UI_FRAME: + /* Only accept broadcast frames in NDM mode */ + if (info->caddr != CBROADCAST) { + IRDA_DEBUG(0, "%s(), not a broadcast frame!\n", + __FUNCTION__); + } else + irlap_unitdata_indication(self, skb); + break; +#endif /* CONFIG_IRDA_ULTRA */ + case RECV_TEST_CMD: + /* Remove test frame header */ + skb_pull(skb, sizeof(struct test_frame)); + + /* + * Send response. 
This skb will not be sent out again, and + * will only be used to send out the same info as the cmd + */ + irlap_send_test_frame(self, CBROADCAST, info->daddr, skb); + break; + case RECV_TEST_RSP: + IRDA_DEBUG(0, "%s() not implemented!\n", __FUNCTION__); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __FUNCTION__, + irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_query (event, skb, info) + * + * QUERY state + * + */ +static int irlap_state_query(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_DISCOVERY_XID_RSP: + IRDA_ASSERT(info != NULL, return -1;); + IRDA_ASSERT(info->discovery != NULL, return -1;); + + IRDA_DEBUG(4, "%s(), daddr=%08x\n", __FUNCTION__, + info->discovery->data.daddr); + + if (!self->discovery_log) { + IRDA_WARNING("%s: discovery log is gone! " + "maybe the discovery timeout has been set" + " to short?\n", __FUNCTION__); + break; + } + hashbin_insert(self->discovery_log, + (irda_queue_t *) info->discovery, + info->discovery->data.daddr, NULL); + + /* Keep state */ + /* irlap_next_state(self, LAP_QUERY); */ + + break; + case RECV_DISCOVERY_XID_CMD: + /* Yes, it is possible to receive those frames in this mode. + * Note that most often the last discovery request won't + * occur here but in NDM state (see my comment there). + * What should we do ? + * Not much. We are currently performing our own discovery, + * therefore we can't answer those frames. We don't want + * to change state either. We just pass the info to + * IrLMP who will put it in the log (and post an event). + * Jean II + */ + + IRDA_ASSERT(info != NULL, return -1;); + + IRDA_DEBUG(1, "%s(), Receiving discovery request (s = %d) while performing discovery :-(\n", __FUNCTION__, info->s); + + /* Last discovery request ? */ + if (info->s == 0xff) + irlap_discovery_indication(self, info->discovery); + break; + case SLOT_TIMER_EXPIRED: + /* + * Wait a little longer if we detect an incoming frame. This + * is not mentioned in the spec, but is a good thing to do, + * since we want to work even with devices that violate the + * timing requirements. + */ + if (irda_device_is_receiving(self->netdev) && !self->add_wait) { + IRDA_DEBUG(2, "%s(), device is slow to answer, " + "waiting some more!\n", __FUNCTION__); + irlap_start_slot_timer(self, msecs_to_jiffies(10)); + self->add_wait = TRUE; + return ret; + } + self->add_wait = FALSE; + + if (self->s < self->S) { + irlap_send_discovery_xid_frame(self, self->S, + self->s, TRUE, + self->discovery_cmd); + self->s++; + irlap_start_slot_timer(self, self->slot_timeout); + + /* Keep state */ + irlap_next_state(self, LAP_QUERY); + } else { + /* This is the final slot! 
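+ * (The value 0xff is what goes out on the wire as the slot number
+ * of this last XID frame, which is why the QUERY and REPLY receive
+ * paths test info->s == 0xff to spot the end of a peer's discovery,
+ * and why the info->s <= info->S check in NDM lets it fall through
+ * to the final-slot branch.)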
*/ + irlap_send_discovery_xid_frame(self, self->S, 0xff, + TRUE, + self->discovery_cmd); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + /* + * We are now finished with the discovery procedure, + * so now we must return the results + */ + irlap_discovery_confirm(self, self->discovery_log); + + /* IrLMP should now have taken care of the log */ + self->discovery_log = NULL; + } + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __FUNCTION__, + irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_reply (self, event, skb, info) + * + * REPLY, we have received a XID discovery frame from a device and we + * are waiting for the right time slot to send a response XID frame + * + */ +static int irlap_state_reply(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + discovery_t *discovery_rsp; + int ret=0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case QUERY_TIMER_EXPIRED: + IRDA_DEBUG(0, "%s(), QUERY_TIMER_EXPIRED <%ld>\n", + __FUNCTION__, jiffies); + irlap_next_state(self, LAP_NDM); + break; + case RECV_DISCOVERY_XID_CMD: + IRDA_ASSERT(info != NULL, return -1;); + /* Last frame? */ + if (info->s == 0xff) { + del_timer(&self->query_timer); + + /* info->log.condition = REMOTE; */ + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_discovery_indication(self, info->discovery); + } else { + /* If it's our slot, send our reply */ + if ((info->s >= self->slot) && (!self->frame_sent)) { + discovery_rsp = irlmp_get_discovery_response(); + discovery_rsp->data.daddr = info->daddr; + + irlap_send_discovery_xid_frame(self, info->S, + self->slot, + FALSE, + discovery_rsp); + + self->frame_sent = TRUE; + } + /* Readjust our timer to accomodate devices + * doing faster or slower discovery than us... + * Jean II */ + irlap_start_query_timer(self, info->S, info->s); + + /* Keep state */ + //irlap_next_state(self, LAP_REPLY); + } + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __FUNCTION__, + event, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_conn (event, skb, info) + * + * CONN, we have received a SNRM command and is waiting for the upper + * layer to accept or refuse connection + * + */ +static int irlap_state_conn(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s(), event=%s\n", __FUNCTION__, irlap_event[ event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case CONNECT_RESPONSE: + skb_pull(skb, sizeof(struct snrm_frame)); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + + irlap_qos_negotiate(self, skb); + + irlap_initiate_connection_state(self); + + /* + * Applying the parameters now will make sure we change speed + * *after* we have sent the next frame + */ + irlap_apply_connection_parameters(self, FALSE); + + /* + * Sending this frame will force a speed change after it has + * been sent (i.e. the frame will be sent at 9600). + */ + irlap_send_ua_response_frame(self, &self->qos_rx); + +#if 0 + /* + * We are allowed to send two frames, but this may increase + * the connect latency, so lets not do it for now. 
+ */ + /* This is full of good intentions, but doesn't work in + * practice. + * After sending the first UA response, we switch the + * dongle to the negotiated speed, which is usually + * different than 9600 kb/s. + * From there, there is two solutions : + * 1) The other end has received the first UA response : + * it will set up the connection, move to state LAP_NRM_P, + * and will ignore and drop the second UA response. + * Actually, it's even worse : the other side will almost + * immediately send a RR that will likely collide with the + * UA response (depending on negotiated turnaround). + * 2) The other end has not received the first UA response, + * will stay at 9600 and will never see the second UA response. + * Jean II */ + irlap_send_ua_response_frame(self, &self->qos_rx); +#endif + + /* + * The WD-timer could be set to the duration of the P-timer + * for this case, but it is recommended to use twice the + * value (note 3 IrLAP p. 60). + */ + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_NRM_S); + + break; + case RECV_DISCOVERY_XID_CMD: + IRDA_DEBUG(3, "%s(), event RECV_DISCOVER_XID_CMD!\n", + __FUNCTION__); + irlap_next_state(self, LAP_NDM); + + break; + case DISCONNECT_REQUEST: + IRDA_DEBUG(0, "%s(), Disconnect request!\n", __FUNCTION__); + irlap_send_dm_frame(self); + irlap_next_state( self, LAP_NDM); + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __FUNCTION__, + event, irlap_event[event]); + + ret = -1; + break; + } + + return ret; +} + +/* + * Function irlap_state_setup (event, skb, frame) + * + * SETUP state, The local layer has transmitted a SNRM command frame to + * a remote peer layer and is awaiting a reply . + * + */ +static int irlap_state_setup(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case FINAL_TIMER_EXPIRED: + if (self->retry_count < self->N3) { +/* + * Perform random backoff, Wait a random number of time units, minimum + * duration half the time taken to transmitt a SNRM frame, maximum duration + * 1.5 times the time taken to transmit a SNRM frame. So this time should + * between 15 msecs and 45 msecs. + */ + irlap_start_backoff_timer(self, msecs_to_jiffies(20 + + (jiffies % 30))); + } else { + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_FOUND_NONE); + } + break; + case BACKOFF_TIMER_EXPIRED: + irlap_send_snrm_frame(self, &self->qos_rx); + irlap_start_final_timer(self, self->final_timeout); + self->retry_count++; + break; + case RECV_SNRM_CMD: + IRDA_DEBUG(4, "%s(), SNRM battle!\n", __FUNCTION__); + + IRDA_ASSERT(skb != NULL, return 0;); + IRDA_ASSERT(info != NULL, return 0;); + + /* + * The device with the largest device address wins the battle + * (both have sent a SNRM command!) 
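+ * (info->daddr is the peer's device address taken from its SNRM;
+ * if it is larger than our own saddr we concede, answer with a UA
+ * and become the secondary (LAP_NRM_S), otherwise we simply ignore
+ * the competing SNRM and keep waiting for the peer to answer ours.)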
+ */ + if (info &&(info->daddr > self->saddr)) { + del_timer(&self->final_timer); + irlap_initiate_connection_state(self); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + + skb_pull(skb, sizeof(struct snrm_frame)); + + irlap_qos_negotiate(self, skb); + + /* Send UA frame and then change link settings */ + irlap_apply_connection_parameters(self, FALSE); + irlap_send_ua_response_frame(self, &self->qos_rx); + + irlap_next_state(self, LAP_NRM_S); + irlap_connect_confirm(self, skb); + + /* + * The WD-timer could be set to the duration of the + * P-timer for this case, but it is recommended + * to use twice the value (note 3 IrLAP p. 60). + */ + irlap_start_wd_timer(self, self->wd_timeout); + } else { + /* We just ignore the other device! */ + irlap_next_state(self, LAP_SETUP); + } + break; + case RECV_UA_RSP: + /* Stop F-timer */ + del_timer(&self->final_timer); + + /* Initiate connection state */ + irlap_initiate_connection_state(self); + + /* Negotiate connection parameters */ + IRDA_ASSERT(skb->len > 10, return -1;); + + skb_pull(skb, sizeof(struct ua_frame)); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + + irlap_qos_negotiate(self, skb); + + /* Set the new link setting *now* (before the rr frame) */ + irlap_apply_connection_parameters(self, TRUE); + self->retry_count = 0; + + /* Wait for turnaround time to give a chance to the other + * device to be ready to receive us. + * Note : the time to switch speed is typically larger + * than the turnaround time, but as we don't have the other + * side speed switch time, that's our best guess... + * Jean II */ + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* This frame will actually be sent at the new speed */ + irlap_send_rr_frame(self, CMD_FRAME); + + /* The timer is set to half the normal timer to quickly + * detect a failure to negociate the new connection + * parameters. IrLAP 6.11.3.2, note 3. + * Note that currently we don't process this failure + * properly, as we should do a quick disconnect. + * Jean II */ + irlap_start_final_timer(self, self->final_timeout/2); + irlap_next_state(self, LAP_NRM_P); + + irlap_connect_confirm(self, skb); + break; + case RECV_DM_RSP: /* FALLTHROUGH */ + case RECV_DISC_CMD: + del_timer(&self->final_timer); + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, %s\n", __FUNCTION__, + event, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_offline (self, event, skb, info) + * + * OFFLINE state, not used for now! + * + */ +static int irlap_state_offline(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + IRDA_DEBUG( 0, "%s(), Unknown event\n", __FUNCTION__); + + return -1; +} + +/* + * Function irlap_state_xmit_p (self, event, skb, info) + * + * XMIT, Only the primary station has right to transmit, and we + * therefore do not expect to receive any transmissions from other + * stations. + * + */ +static int irlap_state_xmit_p(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + switch (event) { + case SEND_I_CMD: + /* + * Only send frame if send-window > 0. + */ + if ((self->window > 0) && (!self->remote_busy)) { + int nextfit; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + struct sk_buff *skb_next; + + /* With DYNAMIC_WINDOW, we keep the window size + * maximum, and adapt on the packets we are sending. 
+ * At 115k, we can send only 2 packets of 2048 bytes + * in a 500 ms turnaround. Without this option, we + * would always limit the window to 2. With this + * option, if we send smaller packets, we can send + * up to 7 of them (always depending on QoS). + * Jean II */ + + /* Look at the next skb. This is safe, as we are + * the only consumer of the Tx queue (if we are not, + * we have other problems) - Jean II */ + skb_next = skb_peek(&self->txq); + + /* Check if a subsequent skb exist and would fit in + * the current window (with respect to turnaround + * time). + * This allow us to properly mark the current packet + * with the pf bit, to avoid falling back on the + * second test below, and avoid waiting the + * end of the window and sending a extra RR. + * Note : (skb_next != NULL) <=> (skb_queue_len() > 0) + * Jean II */ + nextfit = ((skb_next != NULL) && + ((skb_next->len + skb->len) <= + self->bytes_left)); + + /* + * The current packet may not fit ! Because of test + * above, this should not happen any more !!! + * Test if we have transmitted more bytes over the + * link than its possible to do with the current + * speed and turn-around-time. + */ + if((!nextfit) && (skb->len > self->bytes_left)) { + IRDA_DEBUG(0, "%s(), Not allowed to transmit" + " more bytes!\n", __FUNCTION__); + /* Requeue the skb */ + skb_queue_head(&self->txq, skb_get(skb)); + /* + * We should switch state to LAP_NRM_P, but + * that is not possible since we must be sure + * that we poll the other side. Since we have + * used up our time, the poll timer should + * trigger anyway now, so we just wait for it + * DB + */ + /* + * Sorry, but that's not totally true. If + * we send 2000B packets, we may wait another + * 1000B until our turnaround expire. That's + * why we need to be proactive in avoiding + * coming here. - Jean II + */ + return -EPROTO; + } + + /* Substract space used by this skb */ + self->bytes_left -= skb->len; +#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* Window has been adjusted for the max packet + * size, so much simpler... - Jean II */ + nextfit = (skb_queue_len(&self->txq) > 0); +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* + * Send data with poll bit cleared only if window > 1 + * and there is more frames after this one to be sent + */ + if ((self->window > 1) && (nextfit)) { + /* More packet to send in current window */ + irlap_send_data_primary(self, skb); + irlap_next_state(self, LAP_XMIT_P); + } else { + /* Final packet of window */ + irlap_send_data_primary_poll(self, skb); + irlap_next_state(self, LAP_NRM_P); + + /* + * Make sure state machine does not try to send + * any more frames + */ + ret = -EPROTO; + } +#ifdef CONFIG_IRDA_FAST_RR + /* Peer may want to reply immediately */ + self->fast_RR = FALSE; +#endif /* CONFIG_IRDA_FAST_RR */ + } else { + IRDA_DEBUG(4, "%s(), Unable to send! remote busy?\n", + __FUNCTION__); + skb_queue_head(&self->txq, skb_get(skb)); + + /* + * The next ret is important, because it tells + * irlap_next_state _not_ to deliver more frames + */ + ret = -EPROTO; + } + break; + case POLL_TIMER_EXPIRED: + IRDA_DEBUG(3, "%s(), POLL_TIMER_EXPIRED <%ld>\n", + __FUNCTION__, jiffies); + irlap_send_rr_frame(self, CMD_FRAME); + /* Return to NRM properly - Jean II */ + self->window = self->window_size; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* Allowed to transmit a maximum number of bytes again. 
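+ * (line_capacity is roughly the number of bytes that fit in one
+ * negotiated max turnaround; e.g. at 115.2 kb/s and a 500 ms
+ * turnaround that is about 115200/10 * 0.5 = 5760 bytes, i.e.
+ * roughly two 2048 byte frames once framing overhead is counted,
+ * matching the SEND_I_CMD comment above.)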
*/ + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_NRM_P); + break; + case DISCONNECT_REQUEST: + del_timer(&self->poll_timer); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_disc_frame(self); + irlap_flush_all_queues(self); + irlap_start_final_timer(self, self->final_timeout); + self->retry_count = 0; + irlap_next_state(self, LAP_PCLOSE); + break; + case DATA_REQUEST: + /* Nothing to do, irlap_do_event() will send the packet + * when we return... - Jean II */ + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __FUNCTION__, irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} + +/* + * Function irlap_state_pclose (event, skb, info) + * + * PCLOSE state + */ +static int irlap_state_pclose(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_UA_RSP: /* FALLTHROUGH */ + case RECV_DM_RSP: + del_timer(&self->final_timer); + + /* Set new link parameters */ + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case FINAL_TIMER_EXPIRED: + if (self->retry_count < self->N3) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_disc_frame(self); + irlap_start_final_timer(self, self->final_timeout); + self->retry_count++; + /* Keep state */ + } else { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d\n", __FUNCTION__, event); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_nrm_p (self, event, skb, info) + * + * NRM_P (Normal Response Mode as Primary), The primary station has given + * permissions to a secondary station to transmit IrLAP resonse frames + * (by sending a frame with the P bit set). The primary station will not + * transmit any frames and is expecting to receive frames only from the + * secondary to which transmission permissions has been given. + */ +static int irlap_state_nrm_p(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + int ns_status; + int nr_status; + + switch (event) { + case RECV_I_RSP: /* Optimize for the common case */ + /* FIXME: must check for remote_busy below */ +#ifdef CONFIG_IRDA_FAST_RR + /* + * Reset the fast_RR so we can use the fast RR code with + * full speed the next time since peer may have more frames + * to transmitt + */ + self->fast_RR = FALSE; +#endif /* CONFIG_IRDA_FAST_RR */ + IRDA_ASSERT( info != NULL, return -1;); + + ns_status = irlap_validate_ns_received(self, info->ns); + nr_status = irlap_validate_nr_received(self, info->nr); + + /* + * Check for expected I(nformation) frame + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) { + + /* Update Vr (next frame for us to receive) */ + self->vr = (self->vr + 1) % 8; + + /* Update Nr received, cleanup our retry queue */ + irlap_update_nr_received(self, info->nr); + + /* + * Got expected NR, so reset the + * retry_count. 
This is not done by IrLAP spec, + * which is strange! + */ + self->retry_count = 0; + self->ack_required = TRUE; + + /* poll bit cleared? */ + if (!info->pf) { + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_P); + + irlap_data_indication(self, skb, FALSE); + } else { + /* No longer waiting for pf */ + del_timer(&self->final_timer); + + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* Call higher layer *before* changing state + * to give them a chance to send data in the + * next LAP frame. + * Jean II */ + irlap_data_indication(self, skb, FALSE); + + /* XMIT states are the most dangerous state + * to be in, because user requests are + * processed directly and may change state. + * On the other hand, in NDM_P, those + * requests are queued and we will process + * them when we return to irlap_do_event(). + * Jean II + */ + irlap_next_state(self, LAP_XMIT_P); + + /* This is the last frame. + * Make sure it's always called in XMIT state. + * - Jean II */ + irlap_start_poll_timer(self, self->poll_timeout); + } + break; + + } + /* Unexpected next to send (Ns) */ + if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED)) + { + if (!info->pf) { + irlap_update_nr_received(self, info->nr); + + /* + * Wait until the last frame before doing + * anything + */ + + /* Keep state */ + irlap_next_state(self, LAP_NRM_P); + } else { + IRDA_DEBUG(4, + "%s(), missing or duplicate frame!\n", + __FUNCTION__); + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + + self->ack_required = FALSE; + + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_NRM_P); + } + break; + } + /* + * Unexpected next to receive (Nr) + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED)) + { + if (info->pf) { + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, CMD_FRAME); + + self->ack_required = FALSE; + + /* Make sure we account for the time + * to transmit our frames. See comemnts + * in irlap_send_data_primary_poll(). + * Jean II */ + irlap_start_final_timer(self, 2 * self->final_timeout); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_P); + + irlap_data_indication(self, skb, FALSE); + } else { + /* + * Do not resend frames until the last + * frame has arrived from the other + * device. This is not documented in + * IrLAP!! + */ + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + self->ack_required = FALSE; + + /* Keep state, do not move this line!*/ + irlap_next_state(self, LAP_NRM_P); + + irlap_data_indication(self, skb, FALSE); + } + break; + } + /* + * Unexpected next to send (Ns) and next to receive (Nr) + * Not documented by IrLAP! + */ + if ((ns_status == NS_UNEXPECTED) && + (nr_status == NR_UNEXPECTED)) + { + IRDA_DEBUG(4, "%s(), unexpected nr and ns!\n", + __FUNCTION__); + if (info->pf) { + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, CMD_FRAME); + + /* Give peer some time to retransmit! + * But account for our own Tx. 
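+ * (Hence the doubled final timer below: the extra final_timeout
+ * covers the time our own retransmitted frames need to go out
+ * before the peer can even start answering, the same reasoning as
+ * the NR_UNEXPECTED case above and irlap_send_data_primary_poll().)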
*/ + irlap_start_final_timer(self, 2 * self->final_timeout); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_P); + } else { + /* Update Nr received */ + /* irlap_update_nr_received( info->nr); */ + + self->ack_required = FALSE; + } + break; + } + + /* + * Invalid NR or NS + */ + if ((nr_status == NR_INVALID) || (ns_status == NS_INVALID)) { + if (info->pf) { + del_timer(&self->final_timer); + + irlap_next_state(self, LAP_RESET_WAIT); + + irlap_disconnect_indication(self, LAP_RESET_INDICATION); + self->xmitflag = TRUE; + } else { + del_timer(&self->final_timer); + + irlap_disconnect_indication(self, LAP_RESET_INDICATION); + + self->xmitflag = FALSE; + } + break; + } + IRDA_DEBUG(1, "%s(), Not implemented!\n", __FUNCTION__); + IRDA_DEBUG(1, "%s(), event=%s, ns_status=%d, nr_status=%d\n", + __FUNCTION__, irlap_event[event], ns_status, nr_status); + break; + case RECV_UI_FRAME: + /* Poll bit cleared? */ + if (!info->pf) { + irlap_data_indication(self, skb, TRUE); + irlap_next_state(self, LAP_NRM_P); + } else { + del_timer(&self->final_timer); + irlap_data_indication(self, skb, TRUE); + irlap_next_state(self, LAP_XMIT_P); + IRDA_DEBUG(1, "%s: RECV_UI_FRAME: next state %s\n", __FUNCTION__, irlap_state[self->state]); + irlap_start_poll_timer(self, self->poll_timeout); + } + break; + case RECV_RR_RSP: + /* + * If you get a RR, the remote isn't busy anymore, + * no matter what the NR + */ + self->remote_busy = FALSE; + + /* + * Nr as expected? + */ + ret = irlap_validate_nr_received(self, info->nr); + if (ret == NR_EXPECTED) { + /* Stop final timer */ + del_timer(&self->final_timer); + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* + * Got expected NR, so reset the retry_count. This + * is not done by the IrLAP standard , which is + * strange! DB. + */ + self->retry_count = 0; + irlap_wait_min_turn_around(self, &self->qos_tx); + + irlap_next_state(self, LAP_XMIT_P); + + /* Start poll timer */ + irlap_start_poll_timer(self, self->poll_timeout); + } else if (ret == NR_UNEXPECTED) { + IRDA_ASSERT(info != NULL, return -1;); + /* + * Unexpected nr! + */ + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + IRDA_DEBUG(4, "RECV_RR_FRAME: Retrans:%d, nr=%d, va=%d, " + "vs=%d, vr=%d\n", + self->retry_count, info->nr, self->va, + self->vs, self->vr); + + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, CMD_FRAME); + + /* Final timer ??? Jean II */ + + irlap_next_state(self, LAP_NRM_P); + } else if (ret == NR_INVALID) { + IRDA_DEBUG(1, "%s(), Received RR with " + "invalid nr !\n", __FUNCTION__); + del_timer(&self->final_timer); + + irlap_next_state(self, LAP_RESET_WAIT); + + irlap_disconnect_indication(self, LAP_RESET_INDICATION); + self->xmitflag = TRUE; + } + break; + case RECV_RNR_RSP: + IRDA_ASSERT(info != NULL, return -1;); + + /* Stop final timer */ + del_timer(&self->final_timer); + self->remote_busy = TRUE; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + irlap_next_state(self, LAP_XMIT_P); + + /* Start poll timer */ + irlap_start_poll_timer(self, self->poll_timeout); + break; + case RECV_FRMR_RSP: + del_timer(&self->final_timer); + self->xmitflag = TRUE; + irlap_next_state(self, LAP_RESET_WAIT); + irlap_reset_indication(self); + break; + case FINAL_TIMER_EXPIRED: + /* + * We are allowed to wait for additional 300 ms if + * final timer expires when we are in the middle + * of receiving a frame (page 45, IrLAP). Check that + * we only do this once for each frame. 
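+ * (The add_wait flag below is what enforces the once-per-frame
+ * rule: it is set when we grant the extra 300 ms and cleared again
+ * as soon as we fall through to the normal retry path.)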
+ */ + if (irda_device_is_receiving(self->netdev) && !self->add_wait) { + IRDA_DEBUG(1, "FINAL_TIMER_EXPIRED when receiving a " + "frame! Waiting a little bit more!\n"); + irlap_start_final_timer(self, msecs_to_jiffies(300)); + + /* + * Don't allow this to happen one more time in a row, + * or else we can get a pretty tight loop here if + * if we only receive half a frame. DB. + */ + self->add_wait = TRUE; + break; + } + self->add_wait = FALSE; + + /* N2 is the disconnect timer. Until we reach it, we retry */ + if (self->retry_count < self->N2) { + /* Retry sending the pf bit to the secondary */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + + irlap_start_final_timer(self, self->final_timeout); + self->retry_count++; + IRDA_DEBUG(4, "irlap_state_nrm_p: FINAL_TIMER_EXPIRED:" + " retry_count=%d\n", self->retry_count); + + /* Early warning event. I'm using a pretty liberal + * interpretation of the spec and generate an event + * every time the timer is multiple of N1 (and not + * only the first time). This allow application + * to know precisely if connectivity restart... + * Jean II */ + if((self->retry_count % self->N1) == 0) + irlap_status_indication(self, + STATUS_NO_ACTIVITY); + + /* Keep state */ + } else { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + case RECV_REJ_RSP: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + } else + irlap_resend_rejected_frames(self, CMD_FRAME); + irlap_start_final_timer(self, 2 * self->final_timeout); + break; + case RECV_SREJ_RSP: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, CMD_FRAME); + } else + irlap_resend_rejected_frame(self, CMD_FRAME); + irlap_start_final_timer(self, 2 * self->final_timeout); + break; + case RECV_RD_RSP: + IRDA_DEBUG(1, "%s(), RECV_RD_RSP\n", __FUNCTION__); + + irlap_flush_all_queues(self); + irlap_next_state(self, LAP_XMIT_P); + /* Call back the LAP state machine to do a proper disconnect */ + irlap_disconnect_request(self); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %s\n", + __FUNCTION__, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_reset_wait (event, skb, info) + * + * We have informed the service user of a reset condition, and is + * awaiting reset of disconnect request. 
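+ * (From here a RESET_REQUEST moves us to the RESET state, also
+ * re-sending a bare SNRM when xmitflag is set, while a
+ * DISCONNECT_REQUEST sends a DISC and moves to PCLOSE.)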
+ * + */ +static int irlap_state_reset_wait(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(3, "%s(), event = %s\n", __FUNCTION__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RESET_REQUEST: + if (self->xmitflag) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_snrm_frame(self, NULL); + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_RESET); + } else { + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_RESET); + } + break; + case DISCONNECT_REQUEST: + irlap_wait_min_turn_around( self, &self->qos_tx); + irlap_send_disc_frame( self); + irlap_flush_all_queues( self); + irlap_start_final_timer( self, self->final_timeout); + self->retry_count = 0; + irlap_next_state( self, LAP_PCLOSE); + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __FUNCTION__, + irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_reset (self, event, skb, info) + * + * We have sent a SNRM reset command to the peer layer, and is awaiting + * reply. + * + */ +static int irlap_state_reset(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(3, "%s(), event = %s\n", __FUNCTION__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_DISC_CMD: + del_timer(&self->final_timer); + + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + + break; + case RECV_UA_RSP: + del_timer(&self->final_timer); + + /* Initiate connection state */ + irlap_initiate_connection_state(self); + + irlap_reset_confirm(); + + self->remote_busy = FALSE; + + irlap_next_state(self, LAP_XMIT_P); + + irlap_start_poll_timer(self, self->poll_timeout); + + break; + case FINAL_TIMER_EXPIRED: + if (self->retry_count < 3) { + irlap_wait_min_turn_around(self, &self->qos_tx); + + IRDA_ASSERT(self->netdev != NULL, return -1;); + irlap_send_snrm_frame(self, self->qos_dev); + + self->retry_count++; /* Experimental!! 
*/ + + irlap_start_final_timer(self, self->final_timeout); + irlap_next_state(self, LAP_RESET); + } else if (self->retry_count >= self->N3) { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + case RECV_SNRM_CMD: + /* + * SNRM frame is not allowed to contain an I-field in this + * state + */ + if (!info) { + IRDA_DEBUG(3, "%s(), RECV_SNRM_CMD\n", __FUNCTION__); + irlap_initiate_connection_state(self); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_ua_response_frame(self, &self->qos_rx); + irlap_reset_confirm(); + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_NDM); + } else { + IRDA_DEBUG(0, + "%s(), SNRM frame contained an I field!\n", + __FUNCTION__); + } + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %s\n", + __FUNCTION__, irlap_event[event]); + + ret = -1; + break; + } + return ret; +} + +/* + * Function irlap_state_xmit_s (event, skb, info) + * + * XMIT_S, The secondary station has been given the right to transmit, + * and we therefor do not expect to receive any transmissions from other + * stations. + */ +static int irlap_state_xmit_s(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s(), event=%s\n", __FUNCTION__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -ENODEV;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;); + + switch (event) { + case SEND_I_CMD: + /* + * Send frame only if send window > 0 + */ + if ((self->window > 0) && (!self->remote_busy)) { + int nextfit; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + struct sk_buff *skb_next; + + /* + * Same deal as in irlap_state_xmit_p(), so see + * the comments at that point. + * We are the secondary, so there are only subtle + * differences. - Jean II + */ + + /* Check if a subsequent skb exist and would fit in + * the current window (with respect to turnaround + * time). - Jean II */ + skb_next = skb_peek(&self->txq); + nextfit = ((skb_next != NULL) && + ((skb_next->len + skb->len) <= + self->bytes_left)); + + /* + * Test if we have transmitted more bytes over the + * link than its possible to do with the current + * speed and turn-around-time. + */ + if((!nextfit) && (skb->len > self->bytes_left)) { + IRDA_DEBUG(0, "%s(), Not allowed to transmit" + " more bytes!\n", __FUNCTION__); + /* Requeue the skb */ + skb_queue_head(&self->txq, skb_get(skb)); + + /* + * Switch to NRM_S, this is only possible + * when we are in secondary mode, since we + * must be sure that we don't miss any RR + * frames + */ + self->window = self->window_size; + self->bytes_left = self->line_capacity; + irlap_start_wd_timer(self, self->wd_timeout); + + irlap_next_state(self, LAP_NRM_S); + /* Slight difference with primary : + * here we would wait for the other side to + * expire the turnaround. - Jean II */ + + return -EPROTO; /* Try again later */ + } + /* Substract space used by this skb */ + self->bytes_left -= skb->len; +#else /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* Window has been adjusted for the max packet + * size, so much simpler... 
- Jean II */ + nextfit = (skb_queue_len(&self->txq) > 0); +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* + * Send data with final bit cleared only if window > 1 + * and there is more frames to be sent + */ + if ((self->window > 1) && (nextfit)) { + irlap_send_data_secondary(self, skb); + irlap_next_state(self, LAP_XMIT_S); + } else { + irlap_send_data_secondary_final(self, skb); + irlap_next_state(self, LAP_NRM_S); + + /* + * Make sure state machine does not try to send + * any more frames + */ + ret = -EPROTO; + } + } else { + IRDA_DEBUG(2, "%s(), Unable to send!\n", __FUNCTION__); + skb_queue_head(&self->txq, skb_get(skb)); + ret = -EPROTO; + } + break; + case DISCONNECT_REQUEST: + irlap_send_rd_frame(self); + irlap_flush_all_queues(self); + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_SCLOSE); + break; + case DATA_REQUEST: + /* Nothing to do, irlap_do_event() will send the packet + * when we return... - Jean II */ + break; + default: + IRDA_DEBUG(2, "%s(), Unknown event %s\n", __FUNCTION__, + irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} + +/* + * Function irlap_state_nrm_s (event, skb, info) + * + * NRM_S (Normal Response Mode as Secondary) state, in this state we are + * expecting to receive frames from the primary station + * + */ +static int irlap_state_nrm_s(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ns_status; + int nr_status; + int ret = 0; + + IRDA_DEBUG(4, "%s(), event=%s\n", __FUNCTION__, irlap_event[ event]); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + switch (event) { + case RECV_I_CMD: /* Optimize for the common case */ + /* FIXME: must check for remote_busy below */ + IRDA_DEBUG(4, "%s(), event=%s nr=%d, vs=%d, ns=%d, " + "vr=%d, pf=%d\n", __FUNCTION__, + irlap_event[event], info->nr, + self->vs, info->ns, self->vr, info->pf); + + self->retry_count = 0; + + ns_status = irlap_validate_ns_received(self, info->ns); + nr_status = irlap_validate_nr_received(self, info->nr); + /* + * Check for expected I(nformation) frame + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_EXPECTED)) { + + /* Update Vr (next frame for us to receive) */ + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* + * poll bit cleared? + */ + if (!info->pf) { + + self->ack_required = TRUE; + + /* + * Starting WD-timer here is optional, but + * not recommended. Note 6 IrLAP p. 83 + */ +#if 0 + irda_start_timer(WD_TIMER, self->wd_timeout); +#endif + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_S); + + irlap_data_indication(self, skb, FALSE); + break; + } else { + /* + * We should wait before sending RR, and + * also before changing to XMIT_S + * state. (note 1, IrLAP p. 82) + */ + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* + * Give higher layers a chance to + * immediately reply with some data before + * we decide if we should send a RR frame + * or not + */ + irlap_data_indication(self, skb, FALSE); + + /* Any pending data requests? 
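+ * (i.e. do we have frames queued and window credit left? If so we
+ * keep the turn, stop the wd-timer and move to XMIT_S; otherwise
+ * we hand the pf bit straight back with an RR response and re-arm
+ * the wd-timer.)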
*/ + if ((skb_queue_len(&self->txq) > 0) && + (self->window > 0)) + { + self->ack_required = TRUE; + + del_timer(&self->wd_timer); + + irlap_next_state(self, LAP_XMIT_S); + } else { + irlap_send_rr_frame(self, RSP_FRAME); + irlap_start_wd_timer(self, + self->wd_timeout); + + /* Keep the state */ + irlap_next_state(self, LAP_NRM_S); + } + break; + } + } + /* + * Check for Unexpected next to send (Ns) + */ + if ((ns_status == NS_UNEXPECTED) && (nr_status == NR_EXPECTED)) + { + /* Unexpected next to send, with final bit cleared */ + if (!info->pf) { + irlap_update_nr_received(self, info->nr); + + irlap_start_wd_timer(self, self->wd_timeout); + } else { + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + + irlap_start_wd_timer(self, self->wd_timeout); + } + break; + } + + /* + * Unexpected Next to Receive(NR) ? + */ + if ((ns_status == NS_EXPECTED) && (nr_status == NR_UNEXPECTED)) + { + if (info->pf) { + IRDA_DEBUG(4, "RECV_I_RSP: frame(s) lost\n"); + + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* Resend rejected frames */ + irlap_resend_rejected_frames(self, RSP_FRAME); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_S); + + irlap_data_indication(self, skb, FALSE); + irlap_start_wd_timer(self, self->wd_timeout); + break; + } + /* + * This is not documented in IrLAP!! Unexpected NR + * with poll bit cleared + */ + if (!info->pf) { + self->vr = (self->vr + 1) % 8; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + + /* Keep state, do not move this line */ + irlap_next_state(self, LAP_NRM_S); + + irlap_data_indication(self, skb, FALSE); + irlap_start_wd_timer(self, self->wd_timeout); + } + break; + } + + if (ret == NR_INVALID) { + IRDA_DEBUG(0, "NRM_S, NR_INVALID not implemented!\n"); + } + if (ret == NS_INVALID) { + IRDA_DEBUG(0, "NRM_S, NS_INVALID not implemented!\n"); + } + break; + case RECV_UI_FRAME: + /* + * poll bit cleared? + */ + if (!info->pf) { + irlap_data_indication(self, skb, TRUE); + irlap_next_state(self, LAP_NRM_S); /* Keep state */ + } else { + /* + * Any pending data requests? + */ + if ((skb_queue_len(&self->txq) > 0) && + (self->window > 0) && !self->remote_busy) + { + irlap_data_indication(self, skb, TRUE); + + del_timer(&self->wd_timer); + + irlap_next_state(self, LAP_XMIT_S); + } else { + irlap_data_indication(self, skb, TRUE); + + irlap_wait_min_turn_around(self, &self->qos_tx); + + irlap_send_rr_frame(self, RSP_FRAME); + self->ack_required = FALSE; + + irlap_start_wd_timer(self, self->wd_timeout); + + /* Keep the state */ + irlap_next_state(self, LAP_NRM_S); + } + } + break; + case RECV_RR_CMD: + self->retry_count = 0; + + /* + * Nr as expected? 
+ */ + nr_status = irlap_validate_nr_received(self, info->nr); + if (nr_status == NR_EXPECTED) { + if ((skb_queue_len( &self->txq) > 0) && + (self->window > 0)) { + self->remote_busy = FALSE; + + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + del_timer(&self->wd_timer); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_next_state(self, LAP_XMIT_S); + } else { + self->remote_busy = FALSE; + /* Update Nr received */ + irlap_update_nr_received(self, info->nr); + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_start_wd_timer(self, self->wd_timeout); + + /* Note : if the link is idle (this case), + * we never go in XMIT_S, so we never get a + * chance to process any DISCONNECT_REQUEST. + * Do it now ! - Jean II */ + if (self->disconnect_pending) { + /* Disconnect */ + irlap_send_rd_frame(self); + irlap_flush_all_queues(self); + + irlap_next_state(self, LAP_SCLOSE); + } else { + /* Just send back pf bit */ + irlap_send_rr_frame(self, RSP_FRAME); + + irlap_next_state(self, LAP_NRM_S); + } + } + } else if (nr_status == NR_UNEXPECTED) { + self->remote_busy = FALSE; + irlap_update_nr_received(self, info->nr); + irlap_resend_rejected_frames(self, RSP_FRAME); + + irlap_start_wd_timer(self, self->wd_timeout); + + /* Keep state */ + irlap_next_state(self, LAP_NRM_S); + } else { + IRDA_DEBUG(1, "%s(), invalid nr not implemented!\n", + __FUNCTION__); + } + break; + case RECV_SNRM_CMD: + /* SNRM frame is not allowed to contain an I-field */ + if (!info) { + del_timer(&self->wd_timer); + IRDA_DEBUG(1, "%s(), received SNRM cmd\n", __FUNCTION__); + irlap_next_state(self, LAP_RESET_CHECK); + + irlap_reset_indication(self); + } else { + IRDA_DEBUG(0, + "%s(), SNRM frame contained an I-field!\n", + __FUNCTION__); + + } + break; + case RECV_REJ_CMD: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + } else + irlap_resend_rejected_frames(self, RSP_FRAME); + irlap_start_wd_timer(self, self->wd_timeout); + break; + case RECV_SREJ_CMD: + irlap_update_nr_received(self, info->nr); + if (self->remote_busy) { + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + } else + irlap_resend_rejected_frame(self, RSP_FRAME); + irlap_start_wd_timer(self, self->wd_timeout); + break; + case WD_TIMER_EXPIRED: + /* + * Wait until retry_count * n matches negotiated threshold/ + * disconnect time (note 2 in IrLAP p. 82) + * + * Similar to irlap_state_nrm_p() -> FINAL_TIMER_EXPIRED + * Note : self->wd_timeout = (self->final_timeout * 2), + * which explain why we use (self->N2 / 2) here !!! 
+ * Jean II + */ + IRDA_DEBUG(1, "%s(), retry_count = %d\n", __FUNCTION__, + self->retry_count); + + if (self->retry_count < (self->N2 / 2)) { + /* No retry, just wait for primary */ + irlap_start_wd_timer(self, self->wd_timeout); + self->retry_count++; + + if((self->retry_count % (self->N1 / 2)) == 0) + irlap_status_indication(self, + STATUS_NO_ACTIVITY); + } else { + irlap_apply_default_connection_parameters(self); + + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + irlap_disconnect_indication(self, LAP_NO_RESPONSE); + } + break; + case RECV_DISC_CMD: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + /* Send disconnect response */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_ua_response_frame(self, NULL); + + del_timer(&self->wd_timer); + irlap_flush_all_queues(self); + /* Set default link parameters */ + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case RECV_DISCOVERY_XID_CMD: + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rr_frame(self, RSP_FRAME); + self->ack_required = TRUE; + irlap_start_wd_timer(self, self->wd_timeout); + irlap_next_state(self, LAP_NRM_S); + + break; + case RECV_TEST_CMD: + /* Remove test frame header (only LAP header in NRM) */ + skb_pull(skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER); + + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_start_wd_timer(self, self->wd_timeout); + + /* Send response (info will be copied) */ + irlap_send_test_frame(self, self->caddr, info->daddr, skb); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __FUNCTION__, + event, irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} + +/* + * Function irlap_state_sclose (self, event, skb, info) + */ +static int irlap_state_sclose(struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -ENODEV;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;); + + switch (event) { + case RECV_DISC_CMD: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + /* Send disconnect response */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_ua_response_frame(self, NULL); + + del_timer(&self->wd_timer); + /* Set default link parameters */ + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case RECV_DM_RSP: + /* IrLAP-1.1 p.82: in SCLOSE, S and I type RSP frames + * shall take us down into default NDM state, like DM_RSP + */ + case RECV_RR_RSP: + case RECV_RNR_RSP: + case RECV_REJ_RSP: + case RECV_SREJ_RSP: + case RECV_I_RSP: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + del_timer(&self->wd_timer); + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + case WD_TIMER_EXPIRED: + /* Always switch state before calling upper layers */ + irlap_next_state(self, LAP_NDM); + + irlap_apply_default_connection_parameters(self); + + irlap_disconnect_indication(self, LAP_DISC_INDICATION); + break; + default: + /* IrLAP-1.1 p.82: in SCLOSE, basically any received frame + * with pf=1 shall restart the wd-timer and resend the rd:rsp + */ + if (info != NULL && info->pf) { + del_timer(&self->wd_timer); + 
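+ /* The primary is still polling us while we wait to be
+ * disconnected: respect the min turnaround, resend our RD
+ * response and re-arm the wd-timer until we either get the
+ * DISC or the wd-timer finally expires. */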
irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rd_frame(self); + irlap_start_wd_timer(self, self->wd_timeout); + break; /* stay in SCLOSE */ + } + + IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __FUNCTION__, + event, irlap_event[event]); + + ret = -EINVAL; + break; + } + + return -1; +} + +static int irlap_state_reset_check( struct irlap_cb *self, IRLAP_EVENT event, + struct sk_buff *skb, + struct irlap_info *info) +{ + int ret = 0; + + IRDA_DEBUG(1, "%s(), event=%s\n", __FUNCTION__, irlap_event[event]); + + IRDA_ASSERT(self != NULL, return -ENODEV;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -EBADR;); + + switch (event) { + case RESET_RESPONSE: + irlap_send_ua_response_frame(self, &self->qos_rx); + irlap_initiate_connection_state(self); + irlap_start_wd_timer(self, WD_TIMEOUT); + irlap_flush_all_queues(self); + + irlap_next_state(self, LAP_NRM_S); + break; + case DISCONNECT_REQUEST: + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_send_rd_frame(self); + irlap_start_wd_timer(self, WD_TIMEOUT); + irlap_next_state(self, LAP_SCLOSE); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %d, (%s)\n", __FUNCTION__, + event, irlap_event[event]); + + ret = -EINVAL; + break; + } + return ret; +} diff --git a/net/irda/irlap_frame.c b/net/irda/irlap_frame.c new file mode 100644 index 000000000000..040abe714aa3 --- /dev/null +++ b/net/irda/irlap_frame.c @@ -0,0 +1,1437 @@ +/********************************************************************* + * + * Filename: irlap_frame.c + * Version: 1.0 + * Description: Build and transmit IrLAP frames + * Status: Stable + * Author: Dag Brattli + * Created at: Tue Aug 19 10:27:26 1997 + * Modified at: Wed Jan 5 08:59:04 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb, + int command); + +/* + * Function irlap_insert_info (self, skb) + * + * Insert minimum turnaround time and speed information into the skb. We + * need to do this since it's per packet relevant information. Safe to + * have this function inlined since it's only called from one place + */ +static inline void irlap_insert_info(struct irlap_cb *self, + struct sk_buff *skb) +{ + struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb; + + /* + * Insert MTT (min. 
turn time) and speed into skb, so that the + * device driver knows which settings to use + */ + cb->magic = LAP_MAGIC; + cb->mtt = self->mtt_required; + cb->next_speed = self->speed; + + /* Reset */ + self->mtt_required = 0; + + /* + * Delay equals negotiated BOFs count, plus the number of BOFs to + * force the negotiated minimum turnaround time + */ + cb->xbofs = self->bofs_count; + cb->next_xbofs = self->next_bofs; + cb->xbofs_delay = self->xbofs_delay; + + /* Reset XBOF's delay (used only for getting min turn time) */ + self->xbofs_delay = 0; + /* Put the correct xbofs value for the next packet */ + self->bofs_count = self->next_bofs; +} + +/* + * Function irlap_queue_xmit (self, skb) + * + * A little wrapper for dev_queue_xmit, so we can insert some common + * code into it. + */ +void irlap_queue_xmit(struct irlap_cb *self, struct sk_buff *skb) +{ + /* Some common init stuff */ + skb->dev = self->netdev; + skb->h.raw = skb->nh.raw = skb->mac.raw = skb->data; + skb->protocol = htons(ETH_P_IRDA); + skb->priority = TC_PRIO_BESTEFFORT; + + irlap_insert_info(self, skb); + + dev_queue_xmit(skb); +} + +/* + * Function irlap_send_snrm_cmd (void) + * + * Transmits a connect SNRM command frame + */ +void irlap_send_snrm_frame(struct irlap_cb *self, struct qos_info *qos) +{ + struct sk_buff *tx_skb; + struct snrm_frame *frame; + int ret; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Allocate frame */ + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return; + + frame = (struct snrm_frame *) skb_put(tx_skb, 2); + + /* Insert connection address field */ + if (qos) + frame->caddr = CMD_FRAME | CBROADCAST; + else + frame->caddr = CMD_FRAME | self->caddr; + + /* Insert control field */ + frame->control = SNRM_CMD | PF_BIT; + + /* + * If we are establishing a connection then insert QoS paramerters + */ + if (qos) { + skb_put(tx_skb, 9); /* 21 left */ + frame->saddr = cpu_to_le32(self->saddr); + frame->daddr = cpu_to_le32(self->daddr); + + frame->ncaddr = self->caddr; + + ret = irlap_insert_qos_negotiation_params(self, tx_skb); + if (ret < 0) { + dev_kfree_skb(tx_skb); + return; + } + } + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_snrm_cmd (skb, info) + * + * Received SNRM (Set Normal Response Mode) command frame + * + */ +static void irlap_recv_snrm_cmd(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info) +{ + struct snrm_frame *frame; + + if (pskb_may_pull(skb,sizeof(struct snrm_frame))) { + frame = (struct snrm_frame *) skb->data; + + /* Copy the new connection address ignoring the C/R bit */ + info->caddr = frame->ncaddr & 0xFE; + + /* Check if the new connection address is valid */ + if ((info->caddr == 0x00) || (info->caddr == 0xfe)) { + IRDA_DEBUG(3, "%s(), invalid connection address!\n", + __FUNCTION__); + return; + } + + /* Copy peer device address */ + info->daddr = le32_to_cpu(frame->saddr); + info->saddr = le32_to_cpu(frame->daddr); + + /* Only accept if addressed directly to us */ + if (info->saddr != self->saddr) { + IRDA_DEBUG(2, "%s(), not addressed to us!\n", + __FUNCTION__); + return; + } + irlap_do_event(self, RECV_SNRM_CMD, skb, info); + } else { + /* Signal that this SNRM frame does not contain and I-field */ + irlap_do_event(self, RECV_SNRM_CMD, skb, NULL); + } +} + +/* + * Function irlap_send_ua_response_frame (qos) + * + * Send UA (Unnumbered Acknowledgement) frame + * + */ +void irlap_send_ua_response_frame(struct irlap_cb *self, struct qos_info *qos) +{ + struct sk_buff *tx_skb; + struct 
ua_frame *frame; + int ret; + + IRDA_DEBUG(2, "%s() <%ld>\n", __FUNCTION__, jiffies); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Allocate frame */ + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return; + + frame = (struct ua_frame *) skb_put(tx_skb, 10); + + /* Build UA response */ + frame->caddr = self->caddr; + frame->control = UA_RSP | PF_BIT; + + frame->saddr = cpu_to_le32(self->saddr); + frame->daddr = cpu_to_le32(self->daddr); + + /* Should we send QoS negotiation parameters? */ + if (qos) { + ret = irlap_insert_qos_negotiation_params(self, tx_skb); + if (ret < 0) { + dev_kfree_skb(tx_skb); + return; + } + } + + irlap_queue_xmit(self, tx_skb); +} + + +/* + * Function irlap_send_dm_frame (void) + * + * Send disconnected mode (DM) frame + * + */ +void irlap_send_dm_frame( struct irlap_cb *self) +{ + struct sk_buff *tx_skb = NULL; + __u8 *frame; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + tx_skb = dev_alloc_skb(32); + if (!tx_skb) + return; + + frame = skb_put(tx_skb, 2); + + if (self->state == LAP_NDM) + frame[0] = CBROADCAST; + else + frame[0] = self->caddr; + + frame[1] = DM_RSP | PF_BIT; + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_send_disc_frame (void) + * + * Send disconnect (DISC) frame + * + */ +void irlap_send_disc_frame(struct irlap_cb *self) +{ + struct sk_buff *tx_skb = NULL; + __u8 *frame; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + tx_skb = dev_alloc_skb(16); + if (!tx_skb) + return; + + frame = skb_put(tx_skb, 2); + + frame[0] = self->caddr | CMD_FRAME; + frame[1] = DISC_CMD | PF_BIT; + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_send_discovery_xid_frame (S, s, command) + * + * Build and transmit a XID (eXchange station IDentifier) discovery + * frame. + */ +void irlap_send_discovery_xid_frame(struct irlap_cb *self, int S, __u8 s, + __u8 command, discovery_t *discovery) +{ + struct sk_buff *tx_skb = NULL; + struct xid_frame *frame; + __u32 bcast = BROADCAST; + __u8 *info; + + IRDA_DEBUG(4, "%s(), s=%d, S=%d, command=%d\n", __FUNCTION__, + s, S, command); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(discovery != NULL, return;); + + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return; + + skb_put(tx_skb, 14); + frame = (struct xid_frame *) tx_skb->data; + + if (command) { + frame->caddr = CBROADCAST | CMD_FRAME; + frame->control = XID_CMD | PF_BIT; + } else { + frame->caddr = CBROADCAST; + frame->control = XID_RSP | PF_BIT; + } + frame->ident = XID_FORMAT; + + frame->saddr = cpu_to_le32(self->saddr); + + if (command) + frame->daddr = cpu_to_le32(bcast); + else + frame->daddr = cpu_to_le32(discovery->data.daddr); + + switch (S) { + case 1: + frame->flags = 0x00; + break; + case 6: + frame->flags = 0x01; + break; + case 8: + frame->flags = 0x02; + break; + case 16: + frame->flags = 0x03; + break; + default: + frame->flags = 0x02; + break; + } + + frame->slotnr = s; + frame->version = 0x00; + + /* + * Provide info for final slot only in commands, and for all + * responses. Send the second byte of the hint only if the + * EXTENSION bit is set in the first byte. 
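+ *
+ * For clarity, the discovery info appended below ends up laid out as
+ * (a rough sketch of what this block builds, not a separate format
+ * definition):
+ *
+ *    hints[0]  [hints[1] if EXTENSION]  charset  nickname
+ *    1 byte    0 or 1 byte              1 byte   truncated to the
+ *                                                skb tailroom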
+ */ + if (!command || (frame->slotnr == 0xff)) { + int len; + + if (discovery->data.hints[0] & HINT_EXTENSION) { + info = skb_put(tx_skb, 2); + info[0] = discovery->data.hints[0]; + info[1] = discovery->data.hints[1]; + } else { + info = skb_put(tx_skb, 1); + info[0] = discovery->data.hints[0]; + } + info = skb_put(tx_skb, 1); + info[0] = discovery->data.charset; + + len = IRDA_MIN(discovery->name_len, skb_tailroom(tx_skb)); + info = skb_put(tx_skb, len); + memcpy(info, discovery->data.info, len); + } + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_discovery_xid_rsp (skb, info) + * + * Received a XID discovery response + * + */ +static void irlap_recv_discovery_xid_rsp(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info) +{ + struct xid_frame *xid; + discovery_t *discovery = NULL; + __u8 *discovery_info; + char *text; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + if (!pskb_may_pull(skb, sizeof(struct xid_frame))) { + IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + return; + } + + xid = (struct xid_frame *) skb->data; + + info->daddr = le32_to_cpu(xid->saddr); + info->saddr = le32_to_cpu(xid->daddr); + + /* Make sure frame is addressed to us */ + if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) { + IRDA_DEBUG(0, "%s(), frame is not addressed to us!\n", + __FUNCTION__); + return; + } + + if ((discovery = kmalloc(sizeof(discovery_t), GFP_ATOMIC)) == NULL) { + IRDA_WARNING("%s: kmalloc failed!\n", __FUNCTION__); + return; + } + memset(discovery, 0, sizeof(discovery_t)); + + discovery->data.daddr = info->daddr; + discovery->data.saddr = self->saddr; + discovery->timestamp = jiffies; + + IRDA_DEBUG(4, "%s(), daddr=%08x\n", __FUNCTION__, + discovery->data.daddr); + + discovery_info = skb_pull(skb, sizeof(struct xid_frame)); + + /* Get info returned from peer */ + discovery->data.hints[0] = discovery_info[0]; + if (discovery_info[0] & HINT_EXTENSION) { + IRDA_DEBUG(4, "EXTENSION\n"); + discovery->data.hints[1] = discovery_info[1]; + discovery->data.charset = discovery_info[2]; + text = (char *) &discovery_info[3]; + } else { + discovery->data.hints[1] = 0; + discovery->data.charset = discovery_info[1]; + text = (char *) &discovery_info[2]; + } + /* + * Terminate info string, should be safe since this is where the + * FCS bytes resides. 
+ */ + skb->data[skb->len] = '\0'; + strncpy(discovery->data.info, text, NICKNAME_MAX_LEN); + discovery->name_len = strlen(discovery->data.info); + + info->discovery = discovery; + + irlap_do_event(self, RECV_DISCOVERY_XID_RSP, skb, info); +} + +/* + * Function irlap_recv_discovery_xid_cmd (skb, info) + * + * Received a XID discovery command + * + */ +static void irlap_recv_discovery_xid_cmd(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info) +{ + struct xid_frame *xid; + discovery_t *discovery = NULL; + __u8 *discovery_info; + char *text; + + if (!pskb_may_pull(skb, sizeof(struct xid_frame))) { + IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + return; + } + + xid = (struct xid_frame *) skb->data; + + info->daddr = le32_to_cpu(xid->saddr); + info->saddr = le32_to_cpu(xid->daddr); + + /* Make sure frame is addressed to us */ + if ((info->saddr != self->saddr) && (info->saddr != BROADCAST)) { + IRDA_DEBUG(0, "%s(), frame is not addressed to us!\n", + __FUNCTION__); + return; + } + + switch (xid->flags & 0x03) { + case 0x00: + info->S = 1; + break; + case 0x01: + info->S = 6; + break; + case 0x02: + info->S = 8; + break; + case 0x03: + info->S = 16; + break; + default: + /* Error!! */ + return; + } + info->s = xid->slotnr; + + discovery_info = skb_pull(skb, sizeof(struct xid_frame)); + + /* + * Check if last frame + */ + if (info->s == 0xff) { + /* Check if things are sane at this point... */ + if((discovery_info == NULL) || + !pskb_may_pull(skb, 3)) { + IRDA_ERROR("%s: discovery frame to short!\n", + __FUNCTION__); + return; + } + + /* + * We now have some discovery info to deliver! + */ + discovery = kmalloc(sizeof(discovery_t), GFP_ATOMIC); + if (!discovery) { + IRDA_WARNING("%s: unable to malloc!\n", __FUNCTION__); + return; + } + + discovery->data.daddr = info->daddr; + discovery->data.saddr = self->saddr; + discovery->timestamp = jiffies; + + discovery->data.hints[0] = discovery_info[0]; + if (discovery_info[0] & HINT_EXTENSION) { + discovery->data.hints[1] = discovery_info[1]; + discovery->data.charset = discovery_info[2]; + text = (char *) &discovery_info[3]; + } else { + discovery->data.hints[1] = 0; + discovery->data.charset = discovery_info[1]; + text = (char *) &discovery_info[2]; + } + /* + * Terminate string, should be safe since this is where the + * FCS bytes resides. + */ + skb->data[skb->len] = '\0'; + strncpy(discovery->data.info, text, NICKNAME_MAX_LEN); + discovery->name_len = strlen(discovery->data.info); + + info->discovery = discovery; + } else + info->discovery = NULL; + + irlap_do_event(self, RECV_DISCOVERY_XID_CMD, skb, info); +} + +/* + * Function irlap_send_rr_frame (self, command) + * + * Build and transmit RR (Receive Ready) frame. Notice that it is currently + * only possible to send RR frames with the poll bit set. + */ +void irlap_send_rr_frame(struct irlap_cb *self, int command) +{ + struct sk_buff *tx_skb; + __u8 *frame; + + tx_skb = dev_alloc_skb(16); + if (!tx_skb) + return; + + frame = skb_put(tx_skb, 2); + + frame[0] = self->caddr; + frame[0] |= (command) ? CMD_FRAME : 0; + + frame[1] = RR | PF_BIT | (self->vr << 5); + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_send_rd_frame (self) + * + * Request disconnect. Used by a secondary station to request the + * disconnection of the link. 
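+ *
+ * For reference, the secondary-side state machine (see for instance
+ * the DISCONNECT_REQUEST case of irlap_state_reset_check() in
+ * irlap_event.c) drives this with roughly the following sequence:
+ *
+ *    irlap_wait_min_turn_around(self, &self->qos_tx);
+ *    irlap_send_rd_frame(self);
+ *    irlap_start_wd_timer(self, WD_TIMEOUT);
+ *    irlap_next_state(self, LAP_SCLOSE);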
+ */ +void irlap_send_rd_frame(struct irlap_cb *self) +{ + struct sk_buff *tx_skb; + __u8 *frame; + + tx_skb = dev_alloc_skb(16); + if (!tx_skb) + return; + + frame = skb_put(tx_skb, 2); + + frame[0] = self->caddr; + frame[1] = RD_RSP | PF_BIT; + + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_rr_frame (skb, info) + * + * Received RR (Receive Ready) frame from peer station, no harm in + * making it inline since its called only from one single place + * (irlap_driver_rcv). + */ +static inline void irlap_recv_rr_frame(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info, int command) +{ + info->nr = skb->data[1] >> 5; + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_RR_CMD, skb, info); + else + irlap_do_event(self, RECV_RR_RSP, skb, info); +} + +/* + * Function irlap_recv_rnr_frame (self, skb, info) + * + * Received RNR (Receive Not Ready) frame from peer station + * + */ +static void irlap_recv_rnr_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + info->nr = skb->data[1] >> 5; + + IRDA_DEBUG(4, "%s(), nr=%d, %ld\n", __FUNCTION__, info->nr, jiffies); + + if (command) + irlap_do_event(self, RECV_RNR_CMD, skb, info); + else + irlap_do_event(self, RECV_RNR_RSP, skb, info); +} + +static void irlap_recv_rej_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + IRDA_DEBUG(0, "%s()\n", __FUNCTION__); + + info->nr = skb->data[1] >> 5; + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_REJ_CMD, skb, info); + else + irlap_do_event(self, RECV_REJ_RSP, skb, info); +} + +static void irlap_recv_srej_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + IRDA_DEBUG(0, "%s()\n", __FUNCTION__); + + info->nr = skb->data[1] >> 5; + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_SREJ_CMD, skb, info); + else + irlap_do_event(self, RECV_SREJ_RSP, skb, info); +} + +static void irlap_recv_disc_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_DISC_CMD, skb, info); + else + irlap_do_event(self, RECV_RD_RSP, skb, info); +} + +/* + * Function irlap_recv_ua_frame (skb, frame) + * + * Received UA (Unnumbered Acknowledgement) frame + * + */ +static inline void irlap_recv_ua_frame(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info) +{ + irlap_do_event(self, RECV_UA_RSP, skb, info); +} + +/* + * Function irlap_send_data_primary(self, skb) + * + * Send I-frames as the primary station but without the poll bit set + * + */ +void irlap_send_data_primary(struct irlap_cb *self, struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. 
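+ *
+ * As this file builds and parses it, the I-frame control octet in
+ * skb->data[1] is laid out as follows (a reading of the shifts here
+ * and in irlap_send_i_frame()/irlap_recv_i_frame(), not a quote from
+ * the IrLAP spec):
+ *
+ *    bit 0     : 0, marks an I-frame
+ *    bits 1-3  : Ns, our send sequence number (self->vs)
+ *    bit 4     : P/F bit (PF_BIT)
+ *    bits 5-7  : Nr, next sequence number we expect (self->vr)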
+ */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + /* Copy buffer */ + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + self->window -= 1; + + irlap_send_i_frame( self, tx_skb, CMD_FRAME); + } else { + IRDA_DEBUG(4, "%s(), sending unreliable frame\n", __FUNCTION__); + irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME); + self->window -= 1; + } +} +/* + * Function irlap_send_data_primary_poll (self, skb) + * + * Send I(nformation) frame as primary with poll bit set + */ +void irlap_send_data_primary_poll(struct irlap_cb *self, struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + int transmission_time; + + /* Stop P timer */ + del_timer(&self->poll_timer); + + /* Is this reliable or unreliable data? */ + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. + */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + /* Copy buffer */ + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + /* + * Set poll bit if necessary. We do this to the copied + * skb, since retransmitted need to set or clear the poll + * bit depending on when they are sent. + */ + tx_skb->data[1] |= PF_BIT; + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + + irlap_send_i_frame(self, tx_skb, CMD_FRAME); + } else { + IRDA_DEBUG(4, "%s(), sending unreliable frame\n", __FUNCTION__); + + if (self->ack_required) { + irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME); + irlap_send_rr_frame(self, CMD_FRAME); + self->ack_required = FALSE; + } else { + skb->data[1] |= PF_BIT; + irlap_send_ui_frame(self, skb_get(skb), self->caddr, CMD_FRAME); + } + } + + /* How much time we took for transmission of all frames. + * We don't know, so let assume we used the full window. Jean II */ + transmission_time = self->final_timeout; + + /* Reset parameter so that we can fill next window */ + self->window = self->window_size; + +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* Remove what we have not used. Just do a prorata of the + * bytes left in window to window capacity. + * See max_line_capacities[][] in qos.c for details. Jean II */ + transmission_time -= (self->final_timeout * self->bytes_left + / self->line_capacity); + IRDA_DEBUG(4, "%s() adjusting transmission_time : ft=%d, bl=%d, lc=%d -> tt=%d\n", __FUNCTION__, self->final_timeout, self->bytes_left, self->line_capacity, transmission_time); + + /* We are allowed to transmit a maximum number of bytes again. */ + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + + /* + * The network layer has a intermediate buffer between IrLAP + * and the IrDA driver which can contain 8 frames. So, even + * though IrLAP is currently sending the *last* frame of the + * tx-window, the driver most likely has only just started + * sending the *first* frame of the same tx-window. + * I.e. we are always at the very begining of or Tx window. + * Now, we are supposed to set the final timer from the end + * of our tx-window to let the other peer reply. 
So, we need + * to add extra time to compensate for the fact that we + * are really at the start of tx-window, otherwise the final timer + * might expire before he can answer... + * Jean II + */ + irlap_start_final_timer(self, self->final_timeout + transmission_time); + + /* + * The clever amongst you might ask why we do this adjustement + * only here, and not in all the other cases in irlap_event.c. + * In all those other case, we only send a very short management + * frame (few bytes), so the adjustement would be lost in the + * noise... + * The exception of course is irlap_resend_rejected_frame(). + * Jean II */ +} + +/* + * Function irlap_send_data_secondary_final (self, skb) + * + * Send I(nformation) frame as secondary with final bit set + * + */ +void irlap_send_data_secondary_final(struct irlap_cb *self, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb = NULL; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Is this reliable or unreliable data? */ + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. + */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + tx_skb->data[1] |= PF_BIT; + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + + irlap_send_i_frame(self, tx_skb, RSP_FRAME); + } else { + if (self->ack_required) { + irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME); + irlap_send_rr_frame(self, RSP_FRAME); + self->ack_required = FALSE; + } else { + skb->data[1] |= PF_BIT; + irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME); + } + } + + self->window = self->window_size; +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + /* We are allowed to transmit a maximum number of bytes again. */ + self->bytes_left = self->line_capacity; +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + + irlap_start_wd_timer(self, self->wd_timeout); +} + +/* + * Function irlap_send_data_secondary (self, skb) + * + * Send I(nformation) frame as secondary without final bit set + * + */ +void irlap_send_data_secondary(struct irlap_cb *self, struct sk_buff *skb) +{ + struct sk_buff *tx_skb = NULL; + + /* Is this reliable or unreliable data? */ + if (skb->data[1] == I_FRAME) { + + /* + * Insert frame sequence number (Vs) in control field before + * inserting into transmit window queue. + */ + skb->data[1] = I_FRAME | (self->vs << 1); + + /* + * Insert frame in store, in case of retransmissions + * Increase skb reference count, see irlap_do_event() + */ + skb_get(skb); + skb_queue_tail(&self->wx_list, skb); + + tx_skb = skb_clone(skb, GFP_ATOMIC); + if (tx_skb == NULL) { + return; + } + + self->vs = (self->vs + 1) % 8; + self->ack_required = FALSE; + self->window -= 1; + + irlap_send_i_frame(self, tx_skb, RSP_FRAME); + } else { + irlap_send_ui_frame(self, skb_get(skb), self->caddr, RSP_FRAME); + self->window -= 1; + } +} + +/* + * Function irlap_resend_rejected_frames (nr) + * + * Resend frames which has not been acknowledged. 
Should be safe to + * traverse the list without locking it since this function will only be + * called from interrupt context (BH) + */ +void irlap_resend_rejected_frames(struct irlap_cb *self, int command) +{ + struct sk_buff *tx_skb; + struct sk_buff *skb; + int count; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Initialize variables */ + count = skb_queue_len(&self->wx_list); + + /* Resend unacknowledged frame(s) */ + skb = skb_peek(&self->wx_list); + while (skb != NULL) { + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* We copy the skb to be retransmitted since we will have to + * modify it. Cloning will confuse packet sniffers + */ + /* tx_skb = skb_clone( skb, GFP_ATOMIC); */ + tx_skb = skb_copy(skb, GFP_ATOMIC); + if (!tx_skb) { + IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); + return; + } + /* Unlink tx_skb from list */ + tx_skb->next = tx_skb->prev = NULL; + tx_skb->list = NULL; + + /* Clear old Nr field + poll bit */ + tx_skb->data[1] &= 0x0f; + + /* + * Set poll bit on the last frame retransmitted + */ + if (count-- == 1) + tx_skb->data[1] |= PF_BIT; /* Set p/f bit */ + else + tx_skb->data[1] &= ~PF_BIT; /* Clear p/f bit */ + + irlap_send_i_frame(self, tx_skb, command); + + /* + * If our skb is the last buffer in the list, then + * we are finished, if not, move to the next sk-buffer + */ + if (skb == skb_peek_tail(&self->wx_list)) + skb = NULL; + else + skb = skb->next; + } +#if 0 /* Not yet */ + /* + * We can now fill the window with additional data frames + */ + while (skb_queue_len( &self->txq) > 0) { + + IRDA_DEBUG(0, "%s(), sending additional frames!\n", __FUNCTION__); + if ((skb_queue_len( &self->txq) > 0) && + (self->window > 0)) { + skb = skb_dequeue( &self->txq); + IRDA_ASSERT(skb != NULL, return;); + + /* + * If send window > 1 then send frame with pf + * bit cleared + */ + if ((self->window > 1) && + skb_queue_len(&self->txq) > 0) + { + irlap_send_data_primary(self, skb); + } else { + irlap_send_data_primary_poll(self, skb); + } + kfree_skb(skb); + } + } +#endif +} + +void irlap_resend_rejected_frame(struct irlap_cb *self, int command) +{ + struct sk_buff *tx_skb; + struct sk_buff *skb; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + /* Resend unacknowledged frame(s) */ + skb = skb_peek(&self->wx_list); + if (skb != NULL) { + irlap_wait_min_turn_around(self, &self->qos_tx); + + /* We copy the skb to be retransmitted since we will have to + * modify it. Cloning will confuse packet sniffers + */ + /* tx_skb = skb_clone( skb, GFP_ATOMIC); */ + tx_skb = skb_copy(skb, GFP_ATOMIC); + if (!tx_skb) { + IRDA_DEBUG(0, "%s(), unable to copy\n", __FUNCTION__); + return; + } + /* Unlink tx_skb from list */ + tx_skb->next = tx_skb->prev = NULL; + tx_skb->list = NULL; + + /* Clear old Nr field + poll bit */ + tx_skb->data[1] &= 0x0f; + + /* Set poll/final bit */ + tx_skb->data[1] |= PF_BIT; /* Set p/f bit */ + + irlap_send_i_frame(self, tx_skb, command); + } +} + +/* + * Function irlap_send_ui_frame (self, skb, command) + * + * Contruct and transmit an Unnumbered Information (UI) frame + * + */ +void irlap_send_ui_frame(struct irlap_cb *self, struct sk_buff *skb, + __u8 caddr, int command) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Insert connection address */ + skb->data[0] = caddr | ((command) ? 
CMD_FRAME : 0); + + irlap_queue_xmit(self, skb); +} + +/* + * Function irlap_send_i_frame (skb) + * + * Contruct and transmit Information (I) frame + */ +static void irlap_send_i_frame(struct irlap_cb *self, struct sk_buff *skb, + int command) +{ + /* Insert connection address */ + skb->data[0] = self->caddr; + skb->data[0] |= (command) ? CMD_FRAME : 0; + + /* Insert next to receive (Vr) */ + skb->data[1] |= (self->vr << 5); /* insert nr */ + + irlap_queue_xmit(self, skb); +} + +/* + * Function irlap_recv_i_frame (skb, frame) + * + * Receive and parse an I (Information) frame, no harm in making it inline + * since it's called only from one single place (irlap_driver_rcv). + */ +static inline void irlap_recv_i_frame(struct irlap_cb *self, + struct sk_buff *skb, + struct irlap_info *info, int command) +{ + info->nr = skb->data[1] >> 5; /* Next to receive */ + info->pf = skb->data[1] & PF_BIT; /* Final bit */ + info->ns = (skb->data[1] >> 1) & 0x07; /* Next to send */ + + /* Check if this is a command or a response frame */ + if (command) + irlap_do_event(self, RECV_I_CMD, skb, info); + else + irlap_do_event(self, RECV_I_RSP, skb, info); +} + +/* + * Function irlap_recv_ui_frame (self, skb, info) + * + * Receive and parse an Unnumbered Information (UI) frame + * + */ +static void irlap_recv_ui_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info) +{ + IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); + + info->pf = skb->data[1] & PF_BIT; /* Final bit */ + + irlap_do_event(self, RECV_UI_FRAME, skb, info); +} + +/* + * Function irlap_recv_frmr_frame (skb, frame) + * + * Received Frame Reject response. + * + */ +static void irlap_recv_frmr_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info) +{ + __u8 *frame; + int w, x, y, z; + + IRDA_DEBUG(0, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(info != NULL, return;); + + if (!pskb_may_pull(skb, 4)) { + IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + return; + } + + frame = skb->data; + + info->nr = frame[2] >> 5; /* Next to receive */ + info->pf = frame[2] & PF_BIT; /* Final bit */ + info->ns = (frame[2] >> 1) & 0x07; /* Next to send */ + + w = frame[3] & 0x01; + x = frame[3] & 0x02; + y = frame[3] & 0x04; + z = frame[3] & 0x08; + + if (w) { + IRDA_DEBUG(0, "Rejected control field is undefined or not " + "implemented.\n"); + } + if (x) { + IRDA_DEBUG(0, "Rejected control field was invalid because it " + "contained a non permitted I field.\n"); + } + if (y) { + IRDA_DEBUG(0, "Received I field exceeded the maximum negotiated " + "for the existing connection or exceeded the maximum " + "this station supports if no connection exists.\n"); + } + if (z) { + IRDA_DEBUG(0, "Rejected control field control field contained an " + "invalid Nr count.\n"); + } + irlap_do_event(self, RECV_FRMR_RSP, skb, info); +} + +/* + * Function irlap_send_test_frame (self, daddr) + * + * Send a test frame response + * + */ +void irlap_send_test_frame(struct irlap_cb *self, __u8 caddr, __u32 daddr, + struct sk_buff *cmd) +{ + struct sk_buff *tx_skb; + struct test_frame *frame; + __u8 *info; + + tx_skb = dev_alloc_skb(cmd->len+sizeof(struct test_frame)); + if (!tx_skb) + return; + + /* Broadcast frames must include saddr and daddr fields */ + if (caddr == CBROADCAST) { + frame = (struct test_frame *) + skb_put(tx_skb, sizeof(struct test_frame)); + + /* Insert the swapped addresses */ + frame->saddr = 
cpu_to_le32(self->saddr); + frame->daddr = cpu_to_le32(daddr); + } else + frame = (struct test_frame *) skb_put(tx_skb, LAP_ADDR_HEADER + LAP_CTRL_HEADER); + + frame->caddr = caddr; + frame->control = TEST_RSP | PF_BIT; + + /* Copy info */ + info = skb_put(tx_skb, cmd->len); + memcpy(info, cmd->data, cmd->len); + + /* Return to sender */ + irlap_wait_min_turn_around(self, &self->qos_tx); + irlap_queue_xmit(self, tx_skb); +} + +/* + * Function irlap_recv_test_frame (self, skb) + * + * Receive a test frame + * + */ +static void irlap_recv_test_frame(struct irlap_cb *self, struct sk_buff *skb, + struct irlap_info *info, int command) +{ + struct test_frame *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + if (!pskb_may_pull(skb, sizeof(*frame))) { + IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + return; + } + frame = (struct test_frame *) skb->data; + + /* Broadcast frames must carry saddr and daddr fields */ + if (info->caddr == CBROADCAST) { + if (skb->len < sizeof(struct test_frame)) { + IRDA_DEBUG(0, "%s() test frame to short!\n", + __FUNCTION__); + return; + } + + /* Read and swap addresses */ + info->daddr = le32_to_cpu(frame->saddr); + info->saddr = le32_to_cpu(frame->daddr); + + /* Make sure frame is addressed to us */ + if ((info->saddr != self->saddr) && + (info->saddr != BROADCAST)) { + return; + } + } + + if (command) + irlap_do_event(self, RECV_TEST_CMD, skb, info); + else + irlap_do_event(self, RECV_TEST_RSP, skb, info); +} + +/* + * Function irlap_driver_rcv (skb, netdev, ptype) + * + * Called when a frame is received. Dispatches the right receive function + * for processing of the frame. + * + * Note on skb management : + * After calling the higher layers of the IrDA stack, we always + * kfree() the skb, which drop the reference count (and potentially + * destroy it). + * If a higher layer of the stack want to keep the skb around (to put + * in a queue or pass it to the higher layer), it will need to use + * skb_get() to keep a reference on it. This is usually done at the + * LMP level in irlmp.c. + * Jean II + */ +int irlap_driver_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype) +{ + struct irlap_info info; + struct irlap_cb *self; + int command; + __u8 control; + + /* FIXME: should we get our own field? */ + self = (struct irlap_cb *) dev->atalk_ptr; + + /* If the net device is down, then IrLAP is gone! */ + if (!self || self->magic != LAP_MAGIC) { + dev_kfree_skb(skb); + return -1; + } + + /* We are no longer an "old" protocol, so we need to handle + * share and non linear skbs. This should never happen, so + * we don't need to be clever about it. Jean II */ + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) { + IRDA_ERROR("%s: can't clone shared skb!\n", __FUNCTION__); + dev_kfree_skb(skb); + return -1; + } + + /* Check if frame is large enough for parsing */ + if (!pskb_may_pull(skb, 2)) { + IRDA_ERROR("%s: frame to short!\n", __FUNCTION__); + dev_kfree_skb(skb); + return -1; + } + + command = skb->data[0] & CMD_FRAME; + info.caddr = skb->data[0] & CBROADCAST; + + info.pf = skb->data[1] & PF_BIT; + info.control = skb->data[1] & ~PF_BIT; /* Mask away poll/final bit */ + + control = info.control; + + /* First we check if this frame has a valid connection address */ + if ((info.caddr != self->caddr) && (info.caddr != CBROADCAST)) { + IRDA_DEBUG(0, "%s(), wrong connection address!\n", + __FUNCTION__); + goto out; + } + /* + * Optimize for the common case and check if the frame is an + * I(nformation) frame. 
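+ * As the checks below show, the low bits of the control octet select
+ * the frame class:
+ *
+ *    bit 0 clear              -> I-frame
+ *    bit 0 set, bit 1 clear   -> S-frame (RR/RNR/REJ/SREJ)
+ *    bits 0 and 1 both set    -> U/control frame, dispatched on the
+ *                                full control value further down.
+ *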
Only I-frames have bit 0 set to 0 + */ + if (~control & 0x01) { + irlap_recv_i_frame(self, skb, &info, command); + goto out; + } + /* + * We now check is the frame is an S(upervisory) frame. Only + * S-frames have bit 0 set to 1 and bit 1 set to 0 + */ + if (~control & 0x02) { + /* + * Received S(upervisory) frame, check which frame type it is + * only the first nibble is of interest + */ + switch (control & 0x0f) { + case RR: + irlap_recv_rr_frame(self, skb, &info, command); + break; + case RNR: + irlap_recv_rnr_frame(self, skb, &info, command); + break; + case REJ: + irlap_recv_rej_frame(self, skb, &info, command); + break; + case SREJ: + irlap_recv_srej_frame(self, skb, &info, command); + break; + default: + IRDA_WARNING("%s: Unknown S-frame %02x received!\n", + __FUNCTION__, info.control); + break; + } + goto out; + } + /* + * This must be a C(ontrol) frame + */ + switch (control) { + case XID_RSP: + irlap_recv_discovery_xid_rsp(self, skb, &info); + break; + case XID_CMD: + irlap_recv_discovery_xid_cmd(self, skb, &info); + break; + case SNRM_CMD: + irlap_recv_snrm_cmd(self, skb, &info); + break; + case DM_RSP: + irlap_do_event(self, RECV_DM_RSP, skb, &info); + break; + case DISC_CMD: /* And RD_RSP since they have the same value */ + irlap_recv_disc_frame(self, skb, &info, command); + break; + case TEST_CMD: + irlap_recv_test_frame(self, skb, &info, command); + break; + case UA_RSP: + irlap_recv_ua_frame(self, skb, &info); + break; + case FRMR_RSP: + irlap_recv_frmr_frame(self, skb, &info); + break; + case UI_FRAME: + irlap_recv_ui_frame(self, skb, &info); + break; + default: + IRDA_WARNING("%s: Unknown frame %02x received!\n", + __FUNCTION__, info.control); + break; + } +out: + /* Always drop our reference on the skb */ + dev_kfree_skb(skb); + return 0; +} diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c new file mode 100644 index 000000000000..7a4a4d7fbe66 --- /dev/null +++ b/net/irda/irlmp.c @@ -0,0 +1,2041 @@ +/********************************************************************* + * + * Filename: irlmp.c + * Version: 1.0 + * Description: IrDA Link Management Protocol (LMP) layer + * Status: Stable. + * Author: Dag Brattli + * Created at: Sun Aug 17 20:54:32 1997 + * Modified at: Wed Jan 5 11:26:03 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +static __u8 irlmp_find_free_slsap(void); +static int irlmp_slsap_inuse(__u8 slsap_sel); + +/* Master structure */ +struct irlmp_cb *irlmp = NULL; + +/* These can be altered by the sysctl interface */ +int sysctl_discovery = 0; +int sysctl_discovery_timeout = 3; /* 3 seconds by default */ +EXPORT_SYMBOL(sysctl_discovery_timeout); +int sysctl_discovery_slots = 6; /* 6 slots by default */ +int sysctl_lap_keepalive_time = LM_IDLE_TIMEOUT * 1000 / HZ; +char sysctl_devname[65]; + +const char *irlmp_reasons[] = { + "ERROR, NOT USED", + "LM_USER_REQUEST", + "LM_LAP_DISCONNECT", + "LM_CONNECT_FAILURE", + "LM_LAP_RESET", + "LM_INIT_DISCONNECT", + "ERROR, NOT USED", +}; +EXPORT_SYMBOL(irlmp_reasons); + +/* + * Function irlmp_init (void) + * + * Create (allocate) the main IrLMP structure + * + */ +int __init irlmp_init(void) +{ + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + /* Initialize the irlmp structure. */ + irlmp = kmalloc( sizeof(struct irlmp_cb), GFP_KERNEL); + if (irlmp == NULL) + return -ENOMEM; + memset(irlmp, 0, sizeof(struct irlmp_cb)); + + irlmp->magic = LMP_MAGIC; + + irlmp->clients = hashbin_new(HB_LOCK); + irlmp->services = hashbin_new(HB_LOCK); + irlmp->links = hashbin_new(HB_LOCK); + irlmp->unconnected_lsaps = hashbin_new(HB_LOCK); + irlmp->cachelog = hashbin_new(HB_NOLOCK); + + if ((irlmp->clients == NULL) || + (irlmp->services == NULL) || + (irlmp->links == NULL) || + (irlmp->unconnected_lsaps == NULL) || + (irlmp->cachelog == NULL)) { + return -ENOMEM; + } + + spin_lock_init(&irlmp->cachelog->hb_spinlock); + + irlmp->last_lsap_sel = 0x0f; /* Reserved 0x00-0x0f */ + strcpy(sysctl_devname, "Linux"); + + /* Do discovery every 3 seconds */ + init_timer(&irlmp->discovery_timer); + irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout*HZ); + + return 0; +} + +/* + * Function irlmp_cleanup (void) + * + * Remove IrLMP layer + * + */ +void __exit irlmp_cleanup(void) +{ + /* Check for main structure */ + IRDA_ASSERT(irlmp != NULL, return;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;); + + del_timer(&irlmp->discovery_timer); + + hashbin_delete(irlmp->links, (FREE_FUNC) kfree); + hashbin_delete(irlmp->unconnected_lsaps, (FREE_FUNC) kfree); + hashbin_delete(irlmp->clients, (FREE_FUNC) kfree); + hashbin_delete(irlmp->services, (FREE_FUNC) kfree); + hashbin_delete(irlmp->cachelog, (FREE_FUNC) kfree); + + /* De-allocate main structure */ + kfree(irlmp); + irlmp = NULL; +} + +/* + * Function irlmp_open_lsap (slsap, notify) + * + * Register with IrLMP and create a local LSAP, + * returns handle to LSAP. + */ +struct lsap_cb *irlmp_open_lsap(__u8 slsap_sel, notify_t *notify, __u8 pid) +{ + struct lsap_cb *self; + + IRDA_ASSERT(notify != NULL, return NULL;); + IRDA_ASSERT(irlmp != NULL, return NULL;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return NULL;); + IRDA_ASSERT(notify->instance != NULL, return NULL;); + + /* Does the client care which Source LSAP selector it gets? 
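+ * If not, it passes LSAP_ANY and a free selector is picked just
+ * below. A typical caller of irlmp_open_lsap() would do roughly the
+ * following (hypothetical sketch; the my_* names are placeholders,
+ * and only callbacks this file actually invokes are shown):
+ *
+ *    notify_t notify;
+ *
+ *    irda_notify_init(&notify);
+ *    notify.connect_indication    = my_connect_indication;
+ *    notify.disconnect_indication = my_disconnect_indication;
+ *    notify.data_indication       = my_data_indication;
+ *    notify.instance              = my_instance;
+ *    lsap = irlmp_open_lsap(LSAP_ANY, &notify, 0);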
*/ + if (slsap_sel == LSAP_ANY) { + slsap_sel = irlmp_find_free_slsap(); + if (!slsap_sel) + return NULL; + } else if (irlmp_slsap_inuse(slsap_sel)) + return NULL; + + /* Allocate new instance of a LSAP connection */ + self = kmalloc(sizeof(struct lsap_cb), GFP_ATOMIC); + if (self == NULL) { + IRDA_ERROR("%s: can't allocate memory\n", __FUNCTION__); + return NULL; + } + memset(self, 0, sizeof(struct lsap_cb)); + + self->magic = LMP_LSAP_MAGIC; + self->slsap_sel = slsap_sel; + + /* Fix connectionless LSAP's */ + if (slsap_sel == LSAP_CONNLESS) { +#ifdef CONFIG_IRDA_ULTRA + self->dlsap_sel = LSAP_CONNLESS; + self->pid = pid; +#endif /* CONFIG_IRDA_ULTRA */ + } else + self->dlsap_sel = LSAP_ANY; + /* self->connected = FALSE; -> already NULL via memset() */ + + init_timer(&self->watchdog_timer); + + self->notify = *notify; + + self->lsap_state = LSAP_DISCONNECTED; + + /* Insert into queue of unconnected LSAPs */ + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self, + (long) self, NULL); + + return self; +} +EXPORT_SYMBOL(irlmp_open_lsap); + +/* + * Function __irlmp_close_lsap (self) + * + * Remove an instance of LSAP + */ +static void __irlmp_close_lsap(struct lsap_cb *self) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + /* + * Set some of the variables to preset values + */ + self->magic = 0; + del_timer(&self->watchdog_timer); /* Important! */ + + if (self->conn_skb) + dev_kfree_skb(self->conn_skb); + + kfree(self); +} + +/* + * Function irlmp_close_lsap (self) + * + * Close and remove LSAP + * + */ +void irlmp_close_lsap(struct lsap_cb *self) +{ + struct lap_cb *lap; + struct lsap_cb *lsap = NULL; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + /* + * Find out if we should remove this LSAP from a link or from the + * list of unconnected lsaps (not associated with a link) + */ + lap = self->lap; + if (lap) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + /* We might close a LSAP before it has completed the + * connection setup. In those case, higher layers won't + * send a proper disconnect request. Harmless, except + * that we will forget to close LAP... - Jean II */ + if(self->lsap_state != LSAP_DISCONNECTED) { + self->lsap_state = LSAP_DISCONNECTED; + irlmp_do_lap_event(self->lap, + LM_LAP_DISCONNECT_REQUEST, NULL); + } + /* Now, remove from the link */ + lsap = hashbin_remove(lap->lsaps, (long) self, NULL); +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + lap->cache.valid = FALSE; +#endif + } + self->lap = NULL; + /* Check if we found the LSAP! If not then try the unconnected lsaps */ + if (!lsap) { + lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, + NULL); + } + if (!lsap) { + IRDA_DEBUG(0, + "%s(), Looks like somebody has removed me already!\n", + __FUNCTION__); + return; + } + __irlmp_close_lsap(self); +} +EXPORT_SYMBOL(irlmp_close_lsap); + +/* + * Function irlmp_register_irlap (saddr, notify) + * + * Register IrLAP layer with IrLMP. 
There is possible to have multiple + * instances of the IrLAP layer, each connected to different IrDA ports + * + */ +void irlmp_register_link(struct irlap_cb *irlap, __u32 saddr, notify_t *notify) +{ + struct lap_cb *lap; + + IRDA_ASSERT(irlmp != NULL, return;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return;); + IRDA_ASSERT(notify != NULL, return;); + + /* + * Allocate new instance of a LSAP connection + */ + lap = kmalloc(sizeof(struct lap_cb), GFP_KERNEL); + if (lap == NULL) { + IRDA_ERROR("%s: unable to kmalloc\n", __FUNCTION__); + return; + } + memset(lap, 0, sizeof(struct lap_cb)); + + lap->irlap = irlap; + lap->magic = LMP_LAP_MAGIC; + lap->saddr = saddr; + lap->daddr = DEV_ADDR_ANY; +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + lap->cache.valid = FALSE; +#endif + lap->lsaps = hashbin_new(HB_LOCK); + if (lap->lsaps == NULL) { + IRDA_WARNING("%s(), unable to kmalloc lsaps\n", __FUNCTION__); + kfree(lap); + return; + } + + lap->lap_state = LAP_STANDBY; + + init_timer(&lap->idle_timer); + + /* + * Insert into queue of LMP links + */ + hashbin_insert(irlmp->links, (irda_queue_t *) lap, lap->saddr, NULL); + + /* + * We set only this variable so IrLAP can tell us on which link the + * different events happened on + */ + irda_notify_init(notify); + notify->instance = lap; +} + +/* + * Function irlmp_unregister_irlap (saddr) + * + * IrLAP layer has been removed! + * + */ +void irlmp_unregister_link(__u32 saddr) +{ + struct lap_cb *link; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + /* We must remove ourselves from the hashbin *first*. This ensure + * that no more LSAPs will be open on this link and no discovery + * will be triggered anymore. Jean II */ + link = hashbin_remove(irlmp->links, saddr, NULL); + if (link) { + IRDA_ASSERT(link->magic == LMP_LAP_MAGIC, return;); + + /* Kill all the LSAPs on this link. Jean II */ + link->reason = LAP_DISC_INDICATION; + link->daddr = DEV_ADDR_ANY; + irlmp_do_lap_event(link, LM_LAP_DISCONNECT_INDICATION, NULL); + + /* Remove all discoveries discovered at this link */ + irlmp_expire_discoveries(irlmp->cachelog, link->saddr, TRUE); + + /* Final cleanup */ + del_timer(&link->idle_timer); + link->magic = 0; + kfree(link); + } +} + +/* + * Function irlmp_connect_request (handle, dlsap, userdata) + * + * Connect with a peer LSAP + * + */ +int irlmp_connect_request(struct lsap_cb *self, __u8 dlsap_sel, + __u32 saddr, __u32 daddr, + struct qos_info *qos, struct sk_buff *userdata) +{ + struct sk_buff *tx_skb = userdata; + struct lap_cb *lap; + struct lsap_cb *lsap; + int ret; + + IRDA_ASSERT(self != NULL, return -EBADR;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EBADR;); + + IRDA_DEBUG(2, + "%s(), slsap_sel=%02x, dlsap_sel=%02x, saddr=%08x, daddr=%08x\n", + __FUNCTION__, self->slsap_sel, dlsap_sel, saddr, daddr); + + if (test_bit(0, &self->connected)) { + ret = -EISCONN; + goto err; + } + + /* Client must supply destination device address */ + if (!daddr) { + ret = -EINVAL; + goto err; + } + + /* Any userdata? */ + if (tx_skb == NULL) { + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return -ENOMEM; + + skb_reserve(tx_skb, LMP_MAX_HEADER); + } + + /* Make room for MUX control header (3 bytes) */ + IRDA_ASSERT(skb_headroom(tx_skb) >= LMP_CONTROL_HEADER, return -1;); + skb_push(tx_skb, LMP_CONTROL_HEADER); + + self->dlsap_sel = dlsap_sel; + + /* + * Find the link to where we should try to connect since there may + * be more than one IrDA port on this machine. 
If the client has + * passed us the saddr (and already knows which link to use), then + * we use that to find the link, if not then we have to look in the + * discovery log and check if any of the links has discovered a + * device with the given daddr + */ + if ((!saddr) || (saddr == DEV_ADDR_ANY)) { + discovery_t *discovery; + unsigned long flags; + + spin_lock_irqsave(&irlmp->cachelog->hb_spinlock, flags); + if (daddr != DEV_ADDR_ANY) + discovery = hashbin_find(irlmp->cachelog, daddr, NULL); + else { + IRDA_DEBUG(2, "%s(), no daddr\n", __FUNCTION__); + discovery = (discovery_t *) + hashbin_get_first(irlmp->cachelog); + } + + if (discovery) { + saddr = discovery->data.saddr; + daddr = discovery->data.daddr; + } + spin_unlock_irqrestore(&irlmp->cachelog->hb_spinlock, flags); + } + lap = hashbin_lock_find(irlmp->links, saddr, NULL); + if (lap == NULL) { + IRDA_DEBUG(1, "%s(), Unable to find a usable link!\n", __FUNCTION__); + ret = -EHOSTUNREACH; + goto err; + } + + /* Check if LAP is disconnected or already connected */ + if (lap->daddr == DEV_ADDR_ANY) + lap->daddr = daddr; + else if (lap->daddr != daddr) { + /* Check if some LSAPs are active on this LAP */ + if (HASHBIN_GET_SIZE(lap->lsaps) == 0) { + /* No active connection, but LAP hasn't been + * disconnected yet (waiting for timeout in LAP). + * Maybe we could give LAP a bit of help in this case. + */ + IRDA_DEBUG(0, "%s(), sorry, but I'm waiting for LAP to timeout!\n", __FUNCTION__); + ret = -EAGAIN; + goto err; + } + + /* LAP is already connected to a different node, and LAP + * can only talk to one node at a time */ + IRDA_DEBUG(0, "%s(), sorry, but link is busy!\n", __FUNCTION__); + ret = -EBUSY; + goto err; + } + + self->lap = lap; + + /* + * Remove LSAP from list of unconnected LSAPs and insert it into the + * list of connected LSAPs for the particular link + */ + lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, NULL); + + IRDA_ASSERT(lsap != NULL, return -1;); + IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(lsap->lap != NULL, return -1;); + IRDA_ASSERT(lsap->lap->magic == LMP_LAP_MAGIC, return -1;); + + hashbin_insert(self->lap->lsaps, (irda_queue_t *) self, (long) self, + NULL); + + set_bit(0, &self->connected); /* TRUE */ + + /* + * User supplied qos specifications? + */ + if (qos) + self->qos = *qos; + + irlmp_do_lsap_event(self, LM_CONNECT_REQUEST, tx_skb); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(tx_skb); + + return 0; + +err: + /* Cleanup */ + if(tx_skb) + dev_kfree_skb(tx_skb); + return ret; +} +EXPORT_SYMBOL(irlmp_connect_request); + +/* + * Function irlmp_connect_indication (self) + * + * Incoming connection + * + */ +void irlmp_connect_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + int max_seg_size; + int lap_header_size; + int max_header_size; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self->lap != NULL, return;); + + IRDA_DEBUG(2, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n", + __FUNCTION__, self->slsap_sel, self->dlsap_sel); + + /* Note : self->lap is set in irlmp_link_data_indication(), + * (case CONNECT_CMD:) because we have no way to set it here. + * Similarly, self->dlsap_sel is usually set in irlmp_find_lsap(). 
+ * Jean II */ + + self->qos = *self->lap->qos; + + max_seg_size = self->lap->qos->data_size.value-LMP_HEADER; + lap_header_size = IRLAP_GET_HEADER_SIZE(self->lap->irlap); + max_header_size = LMP_HEADER + lap_header_size; + + /* Hide LMP_CONTROL_HEADER header from layer above */ + skb_pull(skb, LMP_CONTROL_HEADER); + + if (self->notify.connect_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.connect_indication(self->notify.instance, self, + &self->qos, max_seg_size, + max_header_size, skb); + } +} + +/* + * Function irlmp_connect_response (handle, userdata) + * + * Service user is accepting connection + * + */ +int irlmp_connect_response(struct lsap_cb *self, struct sk_buff *userdata) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(userdata != NULL, return -1;); + + /* We set the connected bit and move the lsap to the connected list + * in the state machine itself. Jean II */ + + IRDA_DEBUG(2, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n", + __FUNCTION__, self->slsap_sel, self->dlsap_sel); + + /* Make room for MUX control header (3 bytes) */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_CONTROL_HEADER, return -1;); + skb_push(userdata, LMP_CONTROL_HEADER); + + irlmp_do_lsap_event(self, LM_CONNECT_RESPONSE, userdata); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(userdata); + + return 0; +} +EXPORT_SYMBOL(irlmp_connect_response); + +/* + * Function irlmp_connect_confirm (handle, skb) + * + * LSAP connection confirmed peer device! + */ +void irlmp_connect_confirm(struct lsap_cb *self, struct sk_buff *skb) +{ + int max_header_size; + int lap_header_size; + int max_seg_size; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(skb != NULL, return;); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(self->lap != NULL, return;); + + self->qos = *self->lap->qos; + + max_seg_size = self->lap->qos->data_size.value-LMP_HEADER; + lap_header_size = IRLAP_GET_HEADER_SIZE(self->lap->irlap); + max_header_size = LMP_HEADER + lap_header_size; + + IRDA_DEBUG(2, "%s(), max_header_size=%d\n", + __FUNCTION__, max_header_size); + + /* Hide LMP_CONTROL_HEADER header from layer above */ + skb_pull(skb, LMP_CONTROL_HEADER); + + if (self->notify.connect_confirm) { + /* Don't forget to refcount it - see irlap_driver_rcv() */ + skb_get(skb); + self->notify.connect_confirm(self->notify.instance, self, + &self->qos, max_seg_size, + max_header_size, skb); + } +} + +/* + * Function irlmp_dup (orig, instance) + * + * Duplicate LSAP, can be used by servers to confirm a connection on a + * new LSAP so it can keep listening on the old one. + * + */ +struct lsap_cb *irlmp_dup(struct lsap_cb *orig, void *instance) +{ + struct lsap_cb *new; + unsigned long flags; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + /* Only allowed to duplicate unconnected LSAP's, and only LSAPs + * that have received a connect indication. 
Jean II */ + if ((!hashbin_find(irlmp->unconnected_lsaps, (long) orig, NULL)) || + (orig->lap == NULL)) { + IRDA_DEBUG(0, "%s(), invalid LSAP (wrong state)\n", + __FUNCTION__); + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, + flags); + return NULL; + } + + /* Allocate a new instance */ + new = kmalloc(sizeof(struct lsap_cb), GFP_ATOMIC); + if (!new) { + IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __FUNCTION__); + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, + flags); + return NULL; + } + /* Dup */ + memcpy(new, orig, sizeof(struct lsap_cb)); + /* new->lap = orig->lap; => done in the memcpy() */ + /* new->slsap_sel = orig->slsap_sel; => done in the memcpy() */ + new->conn_skb = NULL; + + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + /* Not everything is the same */ + new->notify.instance = instance; + + init_timer(&new->watchdog_timer); + + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) new, + (long) new, NULL); + +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + /* Make sure that we invalidate the LSAP cache */ + new->lap->cache.valid = FALSE; +#endif /* CONFIG_IRDA_CACHE_LAST_LSAP */ + + return new; +} +EXPORT_SYMBOL(irlmp_dup); + +/* + * Function irlmp_disconnect_request (handle, userdata) + * + * The service user is requesting disconnection, this will not remove the + * LSAP, but only mark it as disconnected + */ +int irlmp_disconnect_request(struct lsap_cb *self, struct sk_buff *userdata) +{ + struct lsap_cb *lsap; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(userdata != NULL, return -1;); + + /* Already disconnected ? + * There is a race condition between irlmp_disconnect_indication() + * and us that might mess up the hashbins below. This fixes it. + * Jean II */ + if (! test_and_clear_bit(0, &self->connected)) { + IRDA_DEBUG(0, "%s(), already disconnected!\n", __FUNCTION__); + dev_kfree_skb(userdata); + return -1; + } + + skb_push(userdata, LMP_CONTROL_HEADER); + + /* + * Do the event before the other stuff since we must know + * which lap layer that the frame should be transmitted on + */ + irlmp_do_lsap_event(self, LM_DISCONNECT_REQUEST, userdata); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(userdata); + + /* + * Remove LSAP from list of connected LSAPs for the particular link + * and insert it into the list of unconnected LSAPs + */ + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + IRDA_ASSERT(self->lap->lsaps != NULL, return -1;); + + lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL); +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + self->lap->cache.valid = FALSE; +#endif + + IRDA_ASSERT(lsap != NULL, return -1;); + IRDA_ASSERT(lsap->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(lsap == self, return -1;); + + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) self, + (long) self, NULL); + + /* Reset some values */ + self->dlsap_sel = LSAP_ANY; + self->lap = NULL; + + return 0; +} +EXPORT_SYMBOL(irlmp_disconnect_request); + +/* + * Function irlmp_disconnect_indication (reason, userdata) + * + * LSAP is being closed! 
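+ *
+ * The reason code indexes irlmp_reasons[] above, and the service
+ * user is told about it through its notify.disconnect_indication()
+ * callback, invoked at the end of this function as roughly:
+ *
+ *    notify.disconnect_indication(notify.instance, self, reason, skb);
+ *
+ * so the handler gets its own instance pointer, the LSAP, the
+ * LM_REASON, and an skb which may be NULL.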
+ */ +void irlmp_disconnect_indication(struct lsap_cb *self, LM_REASON reason, + struct sk_buff *skb) +{ + struct lsap_cb *lsap; + + IRDA_DEBUG(1, "%s(), reason=%s\n", __FUNCTION__, irlmp_reasons[reason]); + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + IRDA_DEBUG(3, "%s(), slsap_sel=%02x, dlsap_sel=%02x\n", + __FUNCTION__, self->slsap_sel, self->dlsap_sel); + + /* Already disconnected ? + * There is a race condition between irlmp_disconnect_request() + * and us that might mess up the hashbins below. This fixes it. + * Jean II */ + if (! test_and_clear_bit(0, &self->connected)) { + IRDA_DEBUG(0, "%s(), already disconnected!\n", __FUNCTION__); + return; + } + + /* + * Remove association between this LSAP and the link it used + */ + IRDA_ASSERT(self->lap != NULL, return;); + IRDA_ASSERT(self->lap->lsaps != NULL, return;); + + lsap = hashbin_remove(self->lap->lsaps, (long) self, NULL); +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + self->lap->cache.valid = FALSE; +#endif + + IRDA_ASSERT(lsap != NULL, return;); + IRDA_ASSERT(lsap == self, return;); + hashbin_insert(irlmp->unconnected_lsaps, (irda_queue_t *) lsap, + (long) lsap, NULL); + + self->dlsap_sel = LSAP_ANY; + self->lap = NULL; + + /* + * Inform service user + */ + if (self->notify.disconnect_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + if(skb) + skb_get(skb); + self->notify.disconnect_indication(self->notify.instance, + self, reason, skb); + } else { + IRDA_DEBUG(0, "%s(), no handler\n", __FUNCTION__); + } +} + +/* + * Function irlmp_do_expiry (void) + * + * Do a cleanup of the discovery log (remove old entries) + * + * Note : separate from irlmp_do_discovery() so that we can handle + * passive discovery properly. + */ +void irlmp_do_expiry(void) +{ + struct lap_cb *lap; + + /* + * Expire discovery on all links which are *not* connected. + * On links which are connected, we can't do discovery + * anymore and can't refresh the log, so we freeze the + * discovery log to keep info about the device we are + * connected to. + * This info is mandatory if we want irlmp_connect_request() + * to work properly. - Jean II + */ + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + + if (lap->lap_state == LAP_STANDBY) { + /* Expire discoveries discovered on this link */ + irlmp_expire_discoveries(irlmp->cachelog, lap->saddr, + FALSE); + } + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } +} + +/* + * Function irlmp_do_discovery (nslots) + * + * Do some discovery on all links + * + * Note : log expiry is done above. + */ +void irlmp_do_discovery(int nslots) +{ + struct lap_cb *lap; + + /* Make sure the value is sane */ + if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ + IRDA_WARNING("%s: invalid value for number of slots!\n", + __FUNCTION__); + nslots = sysctl_discovery_slots = 8; + } + + /* Construct new discovery info to be used by IrLAP, */ + u16ho(irlmp->discovery_cmd.data.hints) = irlmp->hints.word; + + /* + * Set character set for device name (we use ASCII), and + * copy device name. 
Remember to make room for a \0 at the + * end + */ + irlmp->discovery_cmd.data.charset = CS_ASCII; + strncpy(irlmp->discovery_cmd.data.info, sysctl_devname, + NICKNAME_MAX_LEN); + irlmp->discovery_cmd.name_len = strlen(irlmp->discovery_cmd.data.info); + irlmp->discovery_cmd.nslots = nslots; + + /* + * Try to send discovery packets on all links + */ + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + + if (lap->lap_state == LAP_STANDBY) { + /* Try to discover */ + irlmp_do_lap_event(lap, LM_LAP_DISCOVERY_REQUEST, + NULL); + } + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } +} + +/* + * Function irlmp_discovery_request (nslots) + * + * Do a discovery of devices in front of the computer + * + * If the caller has registered a client discovery callback, this + * allow him to receive the full content of the discovery log through + * this callback (as normally he will receive only new discoveries). + */ +void irlmp_discovery_request(int nslots) +{ + /* Return current cached discovery log (in full) */ + irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_LOG); + + /* + * Start a single discovery operation if discovery is not already + * running + */ + if (!sysctl_discovery) { + /* Check if user wants to override the default */ + if (nslots == DISCOVERY_DEFAULT_SLOTS) + nslots = sysctl_discovery_slots; + + irlmp_do_discovery(nslots); + /* Note : we never do expiry here. Expiry will run on the + * discovery timer regardless of the state of sysctl_discovery + * Jean II */ + } +} +EXPORT_SYMBOL(irlmp_discovery_request); + +/* + * Function irlmp_get_discoveries (pn, mask, slots) + * + * Return the current discovery log + * + * If discovery is not enabled, you should call this function again + * after 1 or 2 seconds (i.e. after discovery has been done). + */ +struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots) +{ + /* If discovery is not enabled, it's likely that the discovery log + * will be empty. So, we trigger a single discovery, so that next + * time the user call us there might be some results in the log. + * Jean II + */ + if (!sysctl_discovery) { + /* Check if user wants to override the default */ + if (nslots == DISCOVERY_DEFAULT_SLOTS) + nslots = sysctl_discovery_slots; + + /* Start discovery - will complete sometime later */ + irlmp_do_discovery(nslots); + /* Note : we never do expiry here. Expiry will run on the + * discovery timer regardless of the state of sysctl_discovery + * Jean II */ + } + + /* Return current cached discovery log */ + return(irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE)); +} +EXPORT_SYMBOL(irlmp_get_discoveries); + +/* + * Function irlmp_notify_client (log) + * + * Notify all about discovered devices + * + * Clients registered with IrLMP are : + * o IrComm + * o IrLAN + * o Any socket (in any state - ouch, that may be a lot !) + * The client may have defined a callback to be notified in case of + * partial/selective discovery based on the hints that it passed to IrLMP. 
+ */ +static inline void +irlmp_notify_client(irlmp_client_t *client, + hashbin_t *log, DISCOVERY_MODE mode) +{ + discinfo_t *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + /* Check if client wants or not partial/selective log (optimisation) */ + if (!client->disco_callback) + return; + + /* + * Locking notes : + * the old code was manipulating the log directly, which was + * very racy. Now, we use copy_discoveries, that protects + * itself while dumping the log for us. + * The overhead of the copy is compensated by the fact that + * we only pass new discoveries in normal mode and don't + * pass the same old entry every 3s to the caller as we used + * to do (virtual function calling is expensive). + * Jean II + */ + + /* + * Now, check all discovered devices (if any), and notify client + * only about the services that the client is interested in + * We also notify only about the new devices unless the caller + * explicitly request a dump of the log. Jean II + */ + discoveries = irlmp_copy_discoveries(log, &number, + client->hint_mask.word, + (mode == DISCOVERY_LOG)); + /* Check if the we got some results */ + if (discoveries == NULL) + return; /* No nodes discovered */ + + /* Pass all entries to the listener */ + for(i = 0; i < number; i++) + client->disco_callback(&(discoveries[i]), mode, client->priv); + + /* Free up our buffer */ + kfree(discoveries); +} + +/* + * Function irlmp_discovery_confirm ( self, log) + * + * Some device(s) answered to our discovery request! Check to see which + * device it is, and give indication to the client(s) + * + */ +void irlmp_discovery_confirm(hashbin_t *log, DISCOVERY_MODE mode) +{ + irlmp_client_t *client; + irlmp_client_t *client_next; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(log != NULL, return;); + + if (!(HASHBIN_GET_SIZE(log))) + return; + + /* For each client - notify callback may touch client list */ + client = (irlmp_client_t *) hashbin_get_first(irlmp->clients); + while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL, + (void *) &client_next) ) { + /* Check if we should notify client */ + irlmp_notify_client(client, log, mode); + + client = client_next; + } +} + +/* + * Function irlmp_discovery_expiry (expiry) + * + * This device is no longer been discovered, and therefore it is being + * purged from the discovery log. Inform all clients who have + * registered for this event... + * + * Note : called exclusively from discovery.c + * Note : this is no longer called under discovery spinlock, so the + * client can do whatever he wants in the callback. 
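Both notification loops above lean on the same defensive idiom: fetch the successor before invoking a callback, because the callback is allowed to unregister the very client being notified. Here is a standalone sketch of the idiom on an ordinary singly linked list (plain C, nothing IrDA-specific, all names illustrative).

    #include <stdio.h>
    #include <stdlib.h>

    struct client {
            struct client *next;
            void (*notify)(struct client *self);
    };

    static struct client *clients;

    /* A callback that unregisters itself - the classic way to break a naive loop. */
    static void selfish_notify(struct client *self)
    {
            struct client **pp;

            for (pp = &clients; *pp; pp = &(*pp)->next) {
                    if (*pp == self) {
                            *pp = self->next;
                            free(self);
                            return;
                    }
            }
    }

    static void notify_all(void)
    {
            struct client *curr = clients;
            struct client *next;

            while (curr) {
                    next = curr->next;      /* grab the successor first...      */
                    curr->notify(curr);     /* ...so the callback may free curr */
                    curr = next;
            }
    }

    int main(void)
    {
            int i;

            for (i = 0; i < 3; i++) {
                    struct client *c = malloc(sizeof(*c));

                    c->notify = selfish_notify;
                    c->next = clients;
                    clients = c;
            }
            notify_all();
            printf("list empty afterwards: %s\n", clients ? "no" : "yes");
            return 0;
    }

The hashbin variant used above is stronger still: hashbin_find_next() re-validates the current entry under the lock, and as the comment in irlmp_do_all_lsap_event() later notes, the only unhandled case is the prefetched successor itself being removed.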
+ */ +void irlmp_discovery_expiry(discinfo_t *expiries, int number) +{ + irlmp_client_t *client; + irlmp_client_t *client_next; + int i; + + IRDA_DEBUG(3, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(expiries != NULL, return;); + + /* For each client - notify callback may touch client list */ + client = (irlmp_client_t *) hashbin_get_first(irlmp->clients); + while (NULL != hashbin_find_next(irlmp->clients, (long) client, NULL, + (void *) &client_next) ) { + + /* Pass all entries to the listener */ + for(i = 0; i < number; i++) { + /* Check if we should notify client */ + if ((client->expir_callback) && + (client->hint_mask.word & u16ho(expiries[i].hints) + & 0x7f7f) ) + client->expir_callback(&(expiries[i]), + EXPIRY_TIMEOUT, + client->priv); + } + + /* Next client */ + client = client_next; + } +} + +/* + * Function irlmp_get_discovery_response () + * + * Used by IrLAP to get the discovery info it needs when answering + * discovery requests by other devices. + */ +discovery_t *irlmp_get_discovery_response(void) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(irlmp != NULL, return NULL;); + + u16ho(irlmp->discovery_rsp.data.hints) = irlmp->hints.word; + + /* + * Set character set for device name (we use ASCII), and + * copy device name. Remember to make room for a \0 at the + * end + */ + irlmp->discovery_rsp.data.charset = CS_ASCII; + + strncpy(irlmp->discovery_rsp.data.info, sysctl_devname, + NICKNAME_MAX_LEN); + irlmp->discovery_rsp.name_len = strlen(irlmp->discovery_rsp.data.info); + + return &irlmp->discovery_rsp; +} + +/* + * Function irlmp_data_request (self, skb) + * + * Send some data to peer device + * + * Note on skb management : + * After calling the lower layers of the IrDA stack, we always + * kfree() the skb, which drop the reference count (and potentially + * destroy it). + * IrLMP and IrLAP may queue the packet, and in those cases will need + * to use skb_get() to keep it around. + * Jean II + */ +int irlmp_data_request(struct lsap_cb *self, struct sk_buff *userdata) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + /* Make room for MUX header */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;); + skb_push(userdata, LMP_HEADER); + + ret = irlmp_do_lsap_event(self, LM_DATA_REQUEST, userdata); + + /* Drop reference count - see irlap_data_request(). */ + dev_kfree_skb(userdata); + + return ret; +} +EXPORT_SYMBOL(irlmp_data_request); + +/* + * Function irlmp_data_indication (handle, skb) + * + * Got data from LAP layer so pass it up to upper layer + * + */ +void irlmp_data_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + /* Hide LMP header from layer above */ + skb_pull(skb, LMP_HEADER); + + if (self->notify.data_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.data_indication(self->notify.instance, self, skb); + } +} + +/* + * Function irlmp_udata_request (self, skb) + */ +int irlmp_udata_request(struct lsap_cb *self, struct sk_buff *userdata) +{ + int ret; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(userdata != NULL, return -1;); + + /* Make room for MUX header */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER, return -1;); + skb_push(userdata, LMP_HEADER); + + ret = irlmp_do_lsap_event(self, LM_UDATA_REQUEST, userdata); + + /* Drop reference count - see irlap_data_request(). 
*/ + dev_kfree_skb(userdata); + + return ret; +} + +/* + * Function irlmp_udata_indication (self, skb) + * + * Send unreliable data (but still within the connection) + * + */ +void irlmp_udata_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Hide LMP header from layer above */ + skb_pull(skb, LMP_HEADER); + + if (self->notify.udata_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.udata_indication(self->notify.instance, self, + skb); + } +} + +/* + * Function irlmp_connless_data_request (self, skb) + */ +#ifdef CONFIG_IRDA_ULTRA +int irlmp_connless_data_request(struct lsap_cb *self, struct sk_buff *userdata, + __u8 pid) +{ + struct sk_buff *clone_skb; + struct lap_cb *lap; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(userdata != NULL, return -1;); + + /* Make room for MUX and PID header */ + IRDA_ASSERT(skb_headroom(userdata) >= LMP_HEADER+LMP_PID_HEADER, + return -1;); + + /* Insert protocol identifier */ + skb_push(userdata, LMP_PID_HEADER); + if(self != NULL) + userdata->data[0] = self->pid; + else + userdata->data[0] = pid; + + /* Connectionless sockets must use 0x70 */ + skb_push(userdata, LMP_HEADER); + userdata->data[0] = userdata->data[1] = LSAP_CONNLESS; + + /* Try to send Connectionless packets out on all links */ + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return -1;); + + clone_skb = skb_clone(userdata, GFP_ATOMIC); + if (!clone_skb) { + dev_kfree_skb(userdata); + return -ENOMEM; + } + + irlap_unitdata_request(lap->irlap, clone_skb); + /* irlap_unitdata_request() don't increase refcount, + * so no dev_kfree_skb() - Jean II */ + + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } + dev_kfree_skb(userdata); + + return 0; +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irlmp_connless_data_indication (self, skb) + * + * Receive unreliable data outside any connection. Mostly used by Ultra + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlmp_connless_data_indication(struct lsap_cb *self, struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* Hide LMP and PID header from layer above */ + skb_pull(skb, LMP_HEADER+LMP_PID_HEADER); + + if (self->notify.udata_indication) { + /* Don't forget to refcount it - see irlap_driver_rcv(). */ + skb_get(skb); + self->notify.udata_indication(self->notify.instance, self, + skb); + } +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Propagate status indication from LAP to LSAPs (via LMP) + * This don't trigger any change of state in lap_cb, lmp_cb or lsap_cb, + * and the event is stateless, therefore we can bypass both state machines + * and send the event direct to the LSAP user. 
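The connectionless (Ultra) transmit path above ends up with a three octet header in front of the payload: two LMP address octets, both forced to the connectionless selector 0x70, plus one protocol identifier octet. The standalone sketch below builds that header on a plain byte buffer; the 0x70 value and the 2+1 octet sizes come from the code above, while the pid value 0x01 and the helper names are made up.

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define LSAP_CONNLESS   0x70    /* "connectionless sockets must use 0x70" */
    #define LMP_HEADER      2       /* dlsap + slsap octets */
    #define LMP_PID_HEADER  1       /* protocol identifier octet */

    /* Reserve headroom, then push the headers in front of the payload the
     * same way irlmp_connless_data_request() does with skb_push(). */
    static uint8_t *build_ultra_frame(uint8_t *buf, size_t headroom,
                                      const uint8_t *payload, size_t len,
                                      uint8_t pid, size_t *framelen)
    {
            uint8_t *p = buf + headroom;

            memcpy(p, payload, len);        /* payload first, headers pushed in front */

            p -= LMP_PID_HEADER;            /* like skb_push(skb, LMP_PID_HEADER) */
            p[0] = pid;

            p -= LMP_HEADER;                /* like skb_push(skb, LMP_HEADER) */
            p[0] = LSAP_CONNLESS;           /* dlsap */
            p[1] = LSAP_CONNLESS;           /* slsap */

            *framelen = LMP_HEADER + LMP_PID_HEADER + len;
            return p;
    }

    int main(void)
    {
            uint8_t buf[64];
            const uint8_t payload[] = "ultra";
            size_t n, i;
            uint8_t *frame = build_ultra_frame(buf, LMP_HEADER + LMP_PID_HEADER,
                                               payload, sizeof(payload), 0x01, &n);

            for (i = 0; i < n; i++)
                    printf("%02x ", frame[i]);
            printf("\n");
            return 0;
    }
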
+ * Jean II + */ +void irlmp_status_indication(struct lap_cb *self, + LINK_STATUS link, LOCK_STATUS lock) +{ + struct lsap_cb *next; + struct lsap_cb *curr; + + /* Send status_indication to all LSAPs using this link */ + curr = (struct lsap_cb *) hashbin_get_first( self->lsaps); + while (NULL != hashbin_find_next(self->lsaps, (long) curr, NULL, + (void *) &next) ) { + IRDA_ASSERT(curr->magic == LMP_LSAP_MAGIC, return;); + /* + * Inform service user if he has requested it + */ + if (curr->notify.status_indication != NULL) + curr->notify.status_indication(curr->notify.instance, + link, lock); + else + IRDA_DEBUG(2, "%s(), no handler\n", __FUNCTION__); + + curr = next; + } +} + +/* + * Receive flow control indication from LAP. + * LAP want us to send it one more frame. We implement a simple round + * robin scheduler between the active sockets so that we get a bit of + * fairness. Note that the round robin is far from perfect, but it's + * better than nothing. + * We then poll the selected socket so that we can do synchronous + * refilling of IrLAP (which allow to minimise the number of buffers). + * Jean II + */ +void irlmp_flow_indication(struct lap_cb *self, LOCAL_FLOW flow) +{ + struct lsap_cb *next; + struct lsap_cb *curr; + int lsap_todo; + + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(flow == FLOW_START, return;); + + /* Get the number of lsap. That's the only safe way to know + * that we have looped around... - Jean II */ + lsap_todo = HASHBIN_GET_SIZE(self->lsaps); + IRDA_DEBUG(4, "%s() : %d lsaps to scan\n", __FUNCTION__, lsap_todo); + + /* Poll lsap in order until the queue is full or until we + * tried them all. + * Most often, the current LSAP will have something to send, + * so we will go through this loop only once. - Jean II */ + while((lsap_todo--) && + (IRLAP_GET_TX_QUEUE_LEN(self->irlap) < LAP_HIGH_THRESHOLD)) { + /* Try to find the next lsap we should poll. */ + next = self->flow_next; + /* If we have no lsap, restart from first one */ + if(next == NULL) + next = (struct lsap_cb *) hashbin_get_first(self->lsaps); + /* Verify current one and find the next one */ + curr = hashbin_find_next(self->lsaps, (long) next, NULL, + (void *) &self->flow_next); + /* Uh-oh... Paranoia */ + if(curr == NULL) + break; + IRDA_DEBUG(4, "%s() : curr is %p, next was %p and is now %p, still %d to go - queue len = %d\n", __FUNCTION__, curr, next, self->flow_next, lsap_todo, IRLAP_GET_TX_QUEUE_LEN(self->irlap)); + + /* Inform lsap user that it can send one more packet. */ + if (curr->notify.flow_indication != NULL) + curr->notify.flow_indication(curr->notify.instance, + curr, flow); + else + IRDA_DEBUG(1, "%s(), no handler\n", __FUNCTION__); + } +} + +#if 0 +/* + * Function irlmp_hint_to_service (hint) + * + * Returns a list of all servics contained in the given hint bits. This + * function assumes that the hint bits have the size of two bytes only + */ +__u8 *irlmp_hint_to_service(__u8 *hint) +{ + __u8 *service; + int i = 0; + + /* + * Allocate array to store services in. 
16 entries should be safe + * since we currently only support 2 hint bytes + */ + service = kmalloc(16, GFP_ATOMIC); + if (!service) { + IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + + if (!hint[0]) { + IRDA_DEBUG(1, "\n"); + kfree(service); + return NULL; + } + if (hint[0] & HINT_PNP) + IRDA_DEBUG(1, "PnP Compatible "); + if (hint[0] & HINT_PDA) + IRDA_DEBUG(1, "PDA/Palmtop "); + if (hint[0] & HINT_COMPUTER) + IRDA_DEBUG(1, "Computer "); + if (hint[0] & HINT_PRINTER) { + IRDA_DEBUG(1, "Printer "); + service[i++] = S_PRINTER; + } + if (hint[0] & HINT_MODEM) + IRDA_DEBUG(1, "Modem "); + if (hint[0] & HINT_FAX) + IRDA_DEBUG(1, "Fax "); + if (hint[0] & HINT_LAN) { + IRDA_DEBUG(1, "LAN Access "); + service[i++] = S_LAN; + } + /* + * Test if extension byte exists. This byte will usually be + * there, but this is not really required by the standard. + * (IrLMP p. 29) + */ + if (hint[0] & HINT_EXTENSION) { + if (hint[1] & HINT_TELEPHONY) { + IRDA_DEBUG(1, "Telephony "); + service[i++] = S_TELEPHONY; + } if (hint[1] & HINT_FILE_SERVER) + IRDA_DEBUG(1, "File Server "); + + if (hint[1] & HINT_COMM) { + IRDA_DEBUG(1, "IrCOMM "); + service[i++] = S_COMM; + } + if (hint[1] & HINT_OBEX) { + IRDA_DEBUG(1, "IrOBEX "); + service[i++] = S_OBEX; + } + } + IRDA_DEBUG(1, "\n"); + + /* So that client can be notified about any discovery */ + service[i++] = S_ANY; + + service[i] = S_END; + + return service; +} +#endif + +static const __u16 service_hint_mapping[S_END][2] = { + { HINT_PNP, 0 }, /* S_PNP */ + { HINT_PDA, 0 }, /* S_PDA */ + { HINT_COMPUTER, 0 }, /* S_COMPUTER */ + { HINT_PRINTER, 0 }, /* S_PRINTER */ + { HINT_MODEM, 0 }, /* S_MODEM */ + { HINT_FAX, 0 }, /* S_FAX */ + { HINT_LAN, 0 }, /* S_LAN */ + { HINT_EXTENSION, HINT_TELEPHONY }, /* S_TELEPHONY */ + { HINT_EXTENSION, HINT_COMM }, /* S_COMM */ + { HINT_EXTENSION, HINT_OBEX }, /* S_OBEX */ + { 0xFF, 0xFF }, /* S_ANY */ +}; + +/* + * Function irlmp_service_to_hint (service) + * + * Converts a service type, to a hint bit + * + * Returns: a 16 bit hint value, with the service bit set + */ +__u16 irlmp_service_to_hint(int service) +{ + __u16_host_order hint; + + hint.byte[0] = service_hint_mapping[service][0]; + hint.byte[1] = service_hint_mapping[service][1]; + + return hint.word; +} +EXPORT_SYMBOL(irlmp_service_to_hint); + +/* + * Function irlmp_register_service (service) + * + * Register local service with IrLMP + * + */ +void *irlmp_register_service(__u16 hints) +{ + irlmp_service_t *service; + + IRDA_DEBUG(4, "%s(), hints = %04x\n", __FUNCTION__, hints); + + /* Make a new registration */ + service = kmalloc(sizeof(irlmp_service_t), GFP_ATOMIC); + if (!service) { + IRDA_DEBUG(1, "%s(), Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + service->hints.word = hints; + hashbin_insert(irlmp->services, (irda_queue_t *) service, + (long) service, NULL); + + irlmp->hints.word |= hints; + + return (void *)service; +} +EXPORT_SYMBOL(irlmp_register_service); + +/* + * Function irlmp_unregister_service (handle) + * + * Unregister service with IrLMP. 
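Local services become visible to peers through the same hint bits: irlmp_register_service() ORs them into irlmp->hints, and irlmp_do_discovery() copies that word into every discovery frame. A short sketch of how a hypothetical IrCOMM-like service (demo_* names invented here) could advertise and later withdraw itself with the exports above:

    /* Hypothetical service registration, for illustration only. */
    static void *demo_service;

    static int demo_advertise(void)
    {
            /* IrCOMM-style service: extension byte plus the IrCOMM bit. */
            __u16 hints = irlmp_service_to_hint(S_COMM);

            demo_service = irlmp_register_service(hints);
            return demo_service ? 0 : -ENOMEM;
    }

    static void demo_withdraw(void)
    {
            /* Drops our bits; the hint word is rebuilt from the survivors. */
            irlmp_unregister_service(demo_service);
    }

irlmp_unregister_service(), shown just below, recomputes the advertised hint word from whatever services remain registered.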
+ * + * Returns: 0 on success, -1 on error + */ +int irlmp_unregister_service(void *handle) +{ + irlmp_service_t *service; + unsigned long flags; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + if (!handle) + return -1; + + /* Caller may call with invalid handle (it's legal) - Jean II */ + service = hashbin_lock_find(irlmp->services, (long) handle, NULL); + if (!service) { + IRDA_DEBUG(1, "%s(), Unknown service!\n", __FUNCTION__); + return -1; + } + + hashbin_remove_this(irlmp->services, (irda_queue_t *) service); + kfree(service); + + /* Remove old hint bits */ + irlmp->hints.word = 0; + + /* Refresh current hint bits */ + spin_lock_irqsave(&irlmp->services->hb_spinlock, flags); + service = (irlmp_service_t *) hashbin_get_first(irlmp->services); + while (service) { + irlmp->hints.word |= service->hints.word; + + service = (irlmp_service_t *)hashbin_get_next(irlmp->services); + } + spin_unlock_irqrestore(&irlmp->services->hb_spinlock, flags); + return 0; +} +EXPORT_SYMBOL(irlmp_unregister_service); + +/* + * Function irlmp_register_client (hint_mask, callback1, callback2) + * + * Register a local client with IrLMP + * First callback is selective discovery (based on hints) + * Second callback is for selective discovery expiries + * + * Returns: handle > 0 on success, 0 on error + */ +void *irlmp_register_client(__u16 hint_mask, DISCOVERY_CALLBACK1 disco_clb, + DISCOVERY_CALLBACK2 expir_clb, void *priv) +{ + irlmp_client_t *client; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + IRDA_ASSERT(irlmp != NULL, return NULL;); + + /* Make a new registration */ + client = kmalloc(sizeof(irlmp_client_t), GFP_ATOMIC); + if (!client) { + IRDA_DEBUG( 1, "%s(), Unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + + /* Register the details */ + client->hint_mask.word = hint_mask; + client->disco_callback = disco_clb; + client->expir_callback = expir_clb; + client->priv = priv; + + hashbin_insert(irlmp->clients, (irda_queue_t *) client, + (long) client, NULL); + + return (void *) client; +} +EXPORT_SYMBOL(irlmp_register_client); + +/* + * Function irlmp_update_client (handle, hint_mask, callback1, callback2) + * + * Updates specified client (handle) with possibly new hint_mask and + * callback + * + * Returns: 0 on success, -1 on error + */ +int irlmp_update_client(void *handle, __u16 hint_mask, + DISCOVERY_CALLBACK1 disco_clb, + DISCOVERY_CALLBACK2 expir_clb, void *priv) +{ + irlmp_client_t *client; + + if (!handle) + return -1; + + client = hashbin_lock_find(irlmp->clients, (long) handle, NULL); + if (!client) { + IRDA_DEBUG(1, "%s(), Unknown client!\n", __FUNCTION__); + return -1; + } + + client->hint_mask.word = hint_mask; + client->disco_callback = disco_clb; + client->expir_callback = expir_clb; + client->priv = priv; + + return 0; +} +EXPORT_SYMBOL(irlmp_update_client); + +/* + * Function irlmp_unregister_client (handle) + * + * Returns: 0 on success, -1 on error + * + */ +int irlmp_unregister_client(void *handle) +{ + struct irlmp_client *client; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + if (!handle) + return -1; + + /* Caller may call with invalid handle (it's legal) - Jean II */ + client = hashbin_lock_find(irlmp->clients, (long) handle, NULL); + if (!client) { + IRDA_DEBUG(1, "%s(), Unknown client!\n", __FUNCTION__); + return -1; + } + + IRDA_DEBUG(4, "%s(), removing client!\n", __FUNCTION__); + hashbin_remove_this(irlmp->clients, (irda_queue_t *) client); + kfree(client); + + return 0; +} +EXPORT_SYMBOL(irlmp_unregister_client); + +/* + * Function irlmp_slsap_inuse (slsap) + * + * Check 
if the given source LSAP selector is in use + * + * This function is clearly not very efficient. On the mitigating side, the + * stack make sure that in 99% of the cases, we are called only once + * for each socket allocation. We could probably keep a bitmap + * of the allocated LSAP, but I'm not sure the complexity is worth it. + * Jean II + */ +static int irlmp_slsap_inuse(__u8 slsap_sel) +{ + struct lsap_cb *self; + struct lap_cb *lap; + unsigned long flags; + + IRDA_ASSERT(irlmp != NULL, return TRUE;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return TRUE;); + IRDA_ASSERT(slsap_sel != LSAP_ANY, return TRUE;); + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + +#ifdef CONFIG_IRDA_ULTRA + /* Accept all bindings to the connectionless LSAP */ + if (slsap_sel == LSAP_CONNLESS) + return FALSE; +#endif /* CONFIG_IRDA_ULTRA */ + + /* Valid values are between 0 and 127 (0x0-0x6F) */ + if (slsap_sel > LSAP_MAX) + return TRUE; + + /* + * Check if slsap is already in use. To do this we have to loop over + * every IrLAP connection and check every LSAP associated with each + * the connection. + */ + spin_lock_irqsave(&irlmp->links->hb_spinlock, flags); + lap = (struct lap_cb *) hashbin_get_first(irlmp->links); + while (lap != NULL) { + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, goto errlap;); + + /* Careful for priority inversions here ! + * irlmp->links is never taken while another IrDA + * spinlock is held, so we are safe. Jean II */ + spin_lock(&lap->lsaps->hb_spinlock); + + /* For this IrLAP, check all the LSAPs */ + self = (struct lsap_cb *) hashbin_get_first(lap->lsaps); + while (self != NULL) { + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, + goto errlsap;); + + if ((self->slsap_sel == slsap_sel)) { + IRDA_DEBUG(4, "Source LSAP selector=%02x in use\n", + self->slsap_sel); + goto errlsap; + } + self = (struct lsap_cb*) hashbin_get_next(lap->lsaps); + } + spin_unlock(&lap->lsaps->hb_spinlock); + + /* Next LAP */ + lap = (struct lap_cb *) hashbin_get_next(irlmp->links); + } + spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags); + + /* + * Server sockets are typically waiting for connections and + * therefore reside in the unconnected list. We don't want + * to give out their LSAPs for obvious reasons... + * Jean II + */ + spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + self = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps); + while (self != NULL) { + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, goto erruncon;); + if ((self->slsap_sel == slsap_sel)) { + IRDA_DEBUG(4, "Source LSAP selector=%02x in use (unconnected)\n", + self->slsap_sel); + goto erruncon; + } + self = (struct lsap_cb*) hashbin_get_next(irlmp->unconnected_lsaps); + } + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + return FALSE; + + /* Error exit from within one of the two nested loops. + * Make sure we release the right spinlock in the righ order. + * Jean II */ +errlsap: + spin_unlock(&lap->lsaps->hb_spinlock); +IRDA_ASSERT_LABEL(errlap:) + spin_unlock_irqrestore(&irlmp->links->hb_spinlock, flags); + return TRUE; + + /* Error exit from within the unconnected loop. + * Just one spinlock to release... Jean II */ +erruncon: + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + return TRUE; +} + +/* + * Function irlmp_find_free_slsap () + * + * Find a free source LSAP to use. 
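The errlsap:/errlap: exits above exist purely to drop the two spinlocks in the reverse of the order they were taken, from whichever depth the duplicate was found at. The standalone sketch below shows the same discipline with POSIX mutexes; the two-level array is a stand-in for the links and per-LAP lsaps hashbins, and all names and selector values are illustrative.

    #include <pthread.h>
    #include <stdio.h>

    #define NLINKS 2
    #define NLSAPS 4

    static pthread_mutex_t links_lock = PTHREAD_MUTEX_INITIALIZER;

    struct link {
            pthread_mutex_t lsaps_lock;
            unsigned char lsap_sel[NLSAPS];
    };

    static struct link links[NLINKS] = {
            { PTHREAD_MUTEX_INITIALIZER, { 0x11, 0x12, 0x13, 0x14 } },
            { PTHREAD_MUTEX_INITIALIZER, { 0x21, 0x22, 0x23, 0x24 } },
    };

    /* Returns 1 if 'sel' is already used on any link, releasing the locks
     * innermost-first on every exit path, as the error labels above do. */
    static int slsap_inuse(unsigned char sel)
    {
            int i, j;

            pthread_mutex_lock(&links_lock);                /* outer lock */
            for (i = 0; i < NLINKS; i++) {
                    pthread_mutex_lock(&links[i].lsaps_lock); /* inner lock */
                    for (j = 0; j < NLSAPS; j++) {
                            if (links[i].lsap_sel[j] == sel) {
                                    /* error exit: inner first, then outer */
                                    pthread_mutex_unlock(&links[i].lsaps_lock);
                                    pthread_mutex_unlock(&links_lock);
                                    return 1;
                            }
                    }
                    pthread_mutex_unlock(&links[i].lsaps_lock);
            }
            pthread_mutex_unlock(&links_lock);
            return 0;
    }

    int main(void)
    {
            printf("0x13 in use: %d\n", slsap_inuse(0x13));
            printf("0x55 in use: %d\n", slsap_inuse(0x55));
            return 0;
    }
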
This function is called if the service + * user has requested a source LSAP equal to LM_ANY + */ +static __u8 irlmp_find_free_slsap(void) +{ + __u8 lsap_sel; + int wrapped = 0; + + IRDA_ASSERT(irlmp != NULL, return -1;); + IRDA_ASSERT(irlmp->magic == LMP_MAGIC, return -1;); + + /* Most users don't really care which LSAPs they are given, + * and therefore we automatically give them a free LSAP. + * This function try to find a suitable LSAP, i.e. which is + * not in use and is within the acceptable range. Jean II */ + + do { + /* Always increment to LSAP number before using it. + * In theory, we could reuse the last LSAP number, as long + * as it is no longer in use. Some IrDA stack do that. + * However, the previous socket may be half closed, i.e. + * we closed it, we think it's no longer in use, but the + * other side did not receive our close and think it's + * active and still send data on it. + * This is similar to what is done with PIDs and TCP ports. + * Also, this reduce the number of calls to irlmp_slsap_inuse() + * which is an expensive function to call. + * Jean II */ + irlmp->last_lsap_sel++; + + /* Check if we need to wraparound (0x70-0x7f are reserved) */ + if (irlmp->last_lsap_sel > LSAP_MAX) { + /* 0x00-0x10 are also reserved for well know ports */ + irlmp->last_lsap_sel = 0x10; + + /* Make sure we terminate the loop */ + if (wrapped++) { + IRDA_ERROR("%s: no more free LSAPs !\n", + __FUNCTION__); + return 0; + } + } + + /* If the LSAP is in use, try the next one. + * Despite the autoincrement, we need to check if the lsap + * is really in use or not, first because LSAP may be + * directly allocated in irlmp_open_lsap(), and also because + * we may wraparound on old sockets. Jean II */ + } while (irlmp_slsap_inuse(irlmp->last_lsap_sel)); + + /* Got it ! 
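Stripped of the control-block plumbing, the allocation policy above is: always advance, restart at 0x10 once LSAP_MAX is passed, and give up after one full wrap. A standalone sketch, with a plain in-use table standing in for irlmp_slsap_inuse() (table and names are illustrative):

    #include <stdio.h>

    #define LSAP_MAX 0x6f

    static int in_use[LSAP_MAX + 1];        /* stand-in for irlmp_slsap_inuse() */
    static unsigned char last_sel = 0x10;

    static int find_free_slsap(void)
    {
            int wrapped = 0;

            do {
                    if (++last_sel > LSAP_MAX) {
                            last_sel = 0x10;        /* skip well-known ports */
                            if (wrapped++)
                                    return -1;      /* no free selector left */
                    }
            } while (in_use[last_sel]);

            in_use[last_sel] = 1;
            return last_sel;
    }

    int main(void)
    {
            printf("first free selector: 0x%02x\n", find_free_slsap());
            return 0;
    }
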
*/ + lsap_sel = irlmp->last_lsap_sel; + IRDA_DEBUG(4, "%s(), found free lsap_sel=%02x\n", + __FUNCTION__, lsap_sel); + + return lsap_sel; +} + +/* + * Function irlmp_convert_lap_reason (lap_reason) + * + * Converts IrLAP disconnect reason codes to IrLMP disconnect reason + * codes + * + */ +LM_REASON irlmp_convert_lap_reason( LAP_REASON lap_reason) +{ + int reason = LM_LAP_DISCONNECT; + + switch (lap_reason) { + case LAP_DISC_INDICATION: /* Received a disconnect request from peer */ + IRDA_DEBUG( 1, "%s(), LAP_DISC_INDICATION\n", __FUNCTION__); + reason = LM_USER_REQUEST; + break; + case LAP_NO_RESPONSE: /* To many retransmits without response */ + IRDA_DEBUG( 1, "%s(), LAP_NO_RESPONSE\n", __FUNCTION__); + reason = LM_LAP_DISCONNECT; + break; + case LAP_RESET_INDICATION: + IRDA_DEBUG( 1, "%s(), LAP_RESET_INDICATION\n", __FUNCTION__); + reason = LM_LAP_RESET; + break; + case LAP_FOUND_NONE: + case LAP_MEDIA_BUSY: + case LAP_PRIMARY_CONFLICT: + IRDA_DEBUG(1, "%s(), LAP_FOUND_NONE, LAP_MEDIA_BUSY or LAP_PRIMARY_CONFLICT\n", __FUNCTION__); + reason = LM_CONNECT_FAILURE; + break; + default: + IRDA_DEBUG(1, "%s(), Unknow IrLAP disconnect reason %d!\n", + __FUNCTION__, lap_reason); + reason = LM_LAP_DISCONNECT; + break; + } + + return reason; +} + +#ifdef CONFIG_PROC_FS + +struct irlmp_iter_state { + hashbin_t *hashbin; +}; + +#define LSAP_START_TOKEN ((void *)1) +#define LINK_START_TOKEN ((void *)2) + +static void *irlmp_seq_hb_idx(struct irlmp_iter_state *iter, loff_t *off) +{ + void *element; + + spin_lock_irq(&iter->hashbin->hb_spinlock); + for (element = hashbin_get_first(iter->hashbin); + element != NULL; + element = hashbin_get_next(iter->hashbin)) { + if (!off || *off-- == 0) { + /* NB: hashbin left locked */ + return element; + } + } + spin_unlock_irq(&iter->hashbin->hb_spinlock); + iter->hashbin = NULL; + return NULL; +} + + +static void *irlmp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct irlmp_iter_state *iter = seq->private; + void *v; + loff_t off = *pos; + + iter->hashbin = NULL; + if (off-- == 0) + return LSAP_START_TOKEN; + + iter->hashbin = irlmp->unconnected_lsaps; + v = irlmp_seq_hb_idx(iter, &off); + if (v) + return v; + + if (off-- == 0) + return LINK_START_TOKEN; + + iter->hashbin = irlmp->links; + return irlmp_seq_hb_idx(iter, &off); +} + +static void *irlmp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct irlmp_iter_state *iter = seq->private; + + ++*pos; + + if (v == LSAP_START_TOKEN) { /* start of list of lsaps */ + iter->hashbin = irlmp->unconnected_lsaps; + v = irlmp_seq_hb_idx(iter, NULL); + return v ? 
v : LINK_START_TOKEN; + } + + if (v == LINK_START_TOKEN) { /* start of list of links */ + iter->hashbin = irlmp->links; + return irlmp_seq_hb_idx(iter, NULL); + } + + v = hashbin_get_next(iter->hashbin); + + if (v == NULL) { /* no more in this hash bin */ + spin_unlock_irq(&iter->hashbin->hb_spinlock); + + if (iter->hashbin == irlmp->unconnected_lsaps) + v = LINK_START_TOKEN; + + iter->hashbin = NULL; + } + return v; +} + +static void irlmp_seq_stop(struct seq_file *seq, void *v) +{ + struct irlmp_iter_state *iter = seq->private; + + if (iter->hashbin) + spin_unlock_irq(&iter->hashbin->hb_spinlock); +} + +static int irlmp_seq_show(struct seq_file *seq, void *v) +{ + const struct irlmp_iter_state *iter = seq->private; + struct lsap_cb *self = v; + + if (v == LSAP_START_TOKEN) + seq_puts(seq, "Unconnected LSAPs:\n"); + else if (v == LINK_START_TOKEN) + seq_puts(seq, "\nRegistered Link Layers:\n"); + else if (iter->hashbin == irlmp->unconnected_lsaps) { + self = v; + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -EINVAL; ); + seq_printf(seq, "lsap state: %s, ", + irlsap_state[ self->lsap_state]); + seq_printf(seq, + "slsap_sel: %#02x, dlsap_sel: %#02x, ", + self->slsap_sel, self->dlsap_sel); + seq_printf(seq, "(%s)", self->notify.name); + seq_printf(seq, "\n"); + } else if (iter->hashbin == irlmp->links) { + struct lap_cb *lap = v; + + seq_printf(seq, "lap state: %s, ", + irlmp_state[lap->lap_state]); + + seq_printf(seq, "saddr: %#08x, daddr: %#08x, ", + lap->saddr, lap->daddr); + seq_printf(seq, "num lsaps: %d", + HASHBIN_GET_SIZE(lap->lsaps)); + seq_printf(seq, "\n"); + + /* Careful for priority inversions here ! + * All other uses of attrib spinlock are independent of + * the object spinlock, so we are safe. Jean II */ + spin_lock(&lap->lsaps->hb_spinlock); + + seq_printf(seq, "\n Connected LSAPs:\n"); + for (self = (struct lsap_cb *) hashbin_get_first(lap->lsaps); + self != NULL; + self = (struct lsap_cb *)hashbin_get_next(lap->lsaps)) { + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, + goto outloop;); + seq_printf(seq, " lsap state: %s, ", + irlsap_state[ self->lsap_state]); + seq_printf(seq, + "slsap_sel: %#02x, dlsap_sel: %#02x, ", + self->slsap_sel, self->dlsap_sel); + seq_printf(seq, "(%s)", self->notify.name); + seq_putc(seq, '\n'); + + } + IRDA_ASSERT_LABEL(outloop:) + spin_unlock(&lap->lsaps->hb_spinlock); + seq_putc(seq, '\n'); + } else + return -EINVAL; + + return 0; +} + +static struct seq_operations irlmp_seq_ops = { + .start = irlmp_seq_start, + .next = irlmp_seq_next, + .stop = irlmp_seq_stop, + .show = irlmp_seq_show, +}; + +static int irlmp_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct irlmp_iter_state *s; + + IRDA_ASSERT(irlmp != NULL, return -EINVAL;); + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + goto out; + + rc = seq_open(file, &irlmp_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +struct file_operations irlmp_seq_fops = { + .owner = THIS_MODULE, + .open = irlmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ diff --git a/net/irda/irlmp_event.c b/net/irda/irlmp_event.c new file mode 100644 index 000000000000..26649f6528e6 --- /dev/null +++ b/net/irda/irlmp_event.c @@ -0,0 +1,912 @@ +/********************************************************************* + * + * Filename: irlmp_event.c + * Version: 0.8 + * Description: An IrDA LMP event 
driver for Linux + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Tue Dec 14 23:04:16 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include +#include +#include + +const char *irlmp_state[] = { + "LAP_STANDBY", + "LAP_U_CONNECT", + "LAP_ACTIVE", +}; + +const char *irlsap_state[] = { + "LSAP_DISCONNECTED", + "LSAP_CONNECT", + "LSAP_CONNECT_PEND", + "LSAP_DATA_TRANSFER_READY", + "LSAP_SETUP", + "LSAP_SETUP_PEND", +}; + +#ifdef CONFIG_IRDA_DEBUG +static const char *irlmp_event[] = { + "LM_CONNECT_REQUEST", + "LM_CONNECT_CONFIRM", + "LM_CONNECT_RESPONSE", + "LM_CONNECT_INDICATION", + + "LM_DISCONNECT_INDICATION", + "LM_DISCONNECT_REQUEST", + + "LM_DATA_REQUEST", + "LM_UDATA_REQUEST", + "LM_DATA_INDICATION", + "LM_UDATA_INDICATION", + + "LM_WATCHDOG_TIMEOUT", + + /* IrLAP events */ + "LM_LAP_CONNECT_REQUEST", + "LM_LAP_CONNECT_INDICATION", + "LM_LAP_CONNECT_CONFIRM", + "LM_LAP_DISCONNECT_INDICATION", + "LM_LAP_DISCONNECT_REQUEST", + "LM_LAP_DISCOVERY_REQUEST", + "LM_LAP_DISCOVERY_CONFIRM", + "LM_LAP_IDLE_TIMEOUT", +}; +#endif /* CONFIG_IRDA_DEBUG */ + +/* LAP Connection control proto declarations */ +static void irlmp_state_standby (struct lap_cb *, IRLMP_EVENT, + struct sk_buff *); +static void irlmp_state_u_connect(struct lap_cb *, IRLMP_EVENT, + struct sk_buff *); +static void irlmp_state_active (struct lap_cb *, IRLMP_EVENT, + struct sk_buff *); + +/* LSAP Connection control proto declarations */ +static int irlmp_state_disconnected(struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_connect (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_connect_pend(struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_dtr (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_setup (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); +static int irlmp_state_setup_pend (struct lsap_cb *, IRLMP_EVENT, + struct sk_buff *); + +static void (*lap_state[]) (struct lap_cb *, IRLMP_EVENT, struct sk_buff *) = +{ + irlmp_state_standby, + irlmp_state_u_connect, + irlmp_state_active, +}; + +static int (*lsap_state[])( struct lsap_cb *, IRLMP_EVENT, struct sk_buff *) = +{ + irlmp_state_disconnected, + irlmp_state_connect, + irlmp_state_connect_pend, + irlmp_state_dtr, + irlmp_state_setup, + irlmp_state_setup_pend +}; + +static inline void irlmp_next_lap_state(struct lap_cb *self, + IRLMP_STATE state) +{ + /* + IRDA_DEBUG(4, "%s(), LMP LAP = %s\n", __FUNCTION__, irlmp_state[state]); + */ + self->lap_state = state; +} + +static inline void irlmp_next_lsap_state(struct lsap_cb *self, + LSAP_STATE state) +{ + /* + IRDA_ASSERT(self != NULL, return;); + IRDA_DEBUG(4, "%s(), LMP LSAP = %s\n", __FUNCTION__, irlsap_state[state]); + */ + self->lsap_state = state; +} + +/* Do connection control events */ +int 
irlmp_do_lsap_event(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s(), EVENT = %s, STATE = %s\n", + __FUNCTION__, irlmp_event[event], irlsap_state[ self->lsap_state]); + + return (*lsap_state[self->lsap_state]) (self, event, skb); +} + +/* + * Function do_lap_event (event, skb, info) + * + * Do IrLAP control events + * + */ +void irlmp_do_lap_event(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + IRDA_DEBUG(4, "%s(), EVENT = %s, STATE = %s\n", __FUNCTION__, + irlmp_event[event], + irlmp_state[self->lap_state]); + + (*lap_state[self->lap_state]) (self, event, skb); +} + +void irlmp_discovery_timer_expired(void *data) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + /* We always cleanup the log (active & passive discovery) */ + irlmp_do_expiry(); + + /* Active discovery is conditional */ + if (sysctl_discovery) + irlmp_do_discovery(sysctl_discovery_slots); + + /* Restart timer */ + irlmp_start_discovery_timer(irlmp, sysctl_discovery_timeout * HZ); +} + +void irlmp_watchdog_timer_expired(void *data) +{ + struct lsap_cb *self = (struct lsap_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return;); + + irlmp_do_lsap_event(self, LM_WATCHDOG_TIMEOUT, NULL); +} + +void irlmp_idle_timer_expired(void *data) +{ + struct lap_cb *self = (struct lap_cb *) data; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + irlmp_do_lap_event(self, LM_LAP_IDLE_TIMEOUT, NULL); +} + +/* + * Send an event on all LSAPs attached to this LAP. + */ +static inline void +irlmp_do_all_lsap_event(hashbin_t * lsap_hashbin, + IRLMP_EVENT event) +{ + struct lsap_cb *lsap; + struct lsap_cb *lsap_next; + + /* Note : this function use the new hashbin_find_next() + * function, instead of the old hashbin_get_next(). + * This make sure that we are always pointing one lsap + * ahead, so that if the current lsap is removed as the + * result of sending the event, we don't care. + * Also, as we store the context ourselves, if an enumeration + * of the same lsap hashbin happens as the result of sending the + * event, we don't care. + * The only problem is if the next lsap is removed. In that case, + * hashbin_find_next() will return NULL and we will abort the + * enumeration. - Jean II */ + + /* Also : we don't accept any skb in input. We can *NOT* pass + * the same skb to multiple clients safely, we would need to + * skb_clone() it. - Jean II */ + + lsap = (struct lsap_cb *) hashbin_get_first(lsap_hashbin); + + while (NULL != hashbin_find_next(lsap_hashbin, + (long) lsap, + NULL, + (void *) &lsap_next) ) { + irlmp_do_lsap_event(lsap, event, NULL); + lsap = lsap_next; + } +} + +/********************************************************************* + * + * LAP connection control states + * + ********************************************************************/ + +/* + * Function irlmp_state_standby (event, skb, info) + * + * STANDBY, The IrLAP connection does not exist. 
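Both FSMs are table driven: the current state indexes an array of handler functions, and every event funnels through irlmp_do_lap_event() or irlmp_do_lsap_event(). The following is a standalone miniature of the same shape, two states and two events, nothing IrDA-specific and all names invented.

    #include <stdio.h>

    typedef enum { ST_DISCONNECTED, ST_CONNECTED, ST_MAX } state_t;
    typedef enum { EV_CONNECT, EV_DISCONNECT } event_t;

    struct conn { state_t state; };

    static void st_disconnected(struct conn *c, event_t ev)
    {
            if (ev == EV_CONNECT) {
                    printf("connecting\n");
                    c->state = ST_CONNECTED;        /* next state */
            }
    }

    static void st_connected(struct conn *c, event_t ev)
    {
            if (ev == EV_DISCONNECT) {
                    printf("disconnecting\n");
                    c->state = ST_DISCONNECTED;
            }
    }

    /* One handler per state, indexed by the current state - the same shape
     * as the lap_state[] and lsap_state[] tables above. */
    static void (*const handlers[ST_MAX])(struct conn *, event_t) = {
            [ST_DISCONNECTED] = st_disconnected,
            [ST_CONNECTED]    = st_connected,
    };

    static void do_event(struct conn *c, event_t ev)
    {
            handlers[c->state](c, ev);
    }

    int main(void)
    {
            struct conn c = { ST_DISCONNECTED };

            do_event(&c, EV_CONNECT);
            do_event(&c, EV_DISCONNECT);
            return 0;
    }
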
+ * + */ +static void irlmp_state_standby(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + IRDA_ASSERT(self->irlap != NULL, return;); + + switch (event) { + case LM_LAP_DISCOVERY_REQUEST: + /* irlmp_next_station_state( LMP_DISCOVER); */ + + irlap_discovery_request(self->irlap, &irlmp->discovery_cmd); + break; + case LM_LAP_CONNECT_INDICATION: + /* It's important to switch state first, to avoid IrLMP to + * think that the link is free since IrLMP may then start + * discovery before the connection is properly set up. DB. + */ + irlmp_next_lap_state(self, LAP_ACTIVE); + + /* Just accept connection TODO, this should be fixed */ + irlap_connect_response(self->irlap, skb); + break; + case LM_LAP_CONNECT_REQUEST: + IRDA_DEBUG(4, "%s() LS_CONNECT_REQUEST\n", __FUNCTION__); + + irlmp_next_lap_state(self, LAP_U_CONNECT); + + /* FIXME: need to set users requested QoS */ + irlap_connect_request(self->irlap, self->daddr, NULL, 0); + break; + case LM_LAP_DISCONNECT_INDICATION: + IRDA_DEBUG(4, "%s(), Error LM_LAP_DISCONNECT_INDICATION\n", + __FUNCTION__); + + irlmp_next_lap_state(self, LAP_STANDBY); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __FUNCTION__, irlmp_event[event]); + break; + } +} + +/* + * Function irlmp_state_u_connect (event, skb, info) + * + * U_CONNECT, The layer above has tried to open an LSAP connection but + * since the IrLAP connection does not exist, we must first start an + * IrLAP connection. We are now waiting response from IrLAP. + * */ +static void irlmp_state_u_connect(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s(), event=%s\n", __FUNCTION__, irlmp_event[event]); + + switch (event) { + case LM_LAP_CONNECT_INDICATION: + /* It's important to switch state first, to avoid IrLMP to + * think that the link is free since IrLMP may then start + * discovery before the connection is properly set up. DB. + */ + irlmp_next_lap_state(self, LAP_ACTIVE); + + /* Just accept connection TODO, this should be fixed */ + irlap_connect_response(self->irlap, skb); + + /* Tell LSAPs that they can start sending data */ + irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM); + + /* Note : by the time we get there (LAP retries and co), + * the lsaps may already have gone. This avoid getting stuck + * forever in LAP_ACTIVE state - Jean II */ + if (HASHBIN_GET_SIZE(self->lsaps) == 0) { + IRDA_DEBUG(0, "%s() NO LSAPs !\n", __FUNCTION__); + irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT); + } + break; + case LM_LAP_CONNECT_REQUEST: + /* Already trying to connect */ + break; + case LM_LAP_CONNECT_CONFIRM: + /* For all lsap_ce E Associated do LS_Connect_confirm */ + irlmp_next_lap_state(self, LAP_ACTIVE); + + /* Tell LSAPs that they can start sending data */ + irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM); + + /* Note : by the time we get there (LAP retries and co), + * the lsaps may already have gone. 
This avoid getting stuck + * forever in LAP_ACTIVE state - Jean II */ + if (HASHBIN_GET_SIZE(self->lsaps) == 0) { + IRDA_DEBUG(0, "%s() NO LSAPs !\n", __FUNCTION__); + irlmp_start_idle_timer(self, LM_IDLE_TIMEOUT); + } + break; + case LM_LAP_DISCONNECT_INDICATION: + IRDA_DEBUG(4, "%s(), LM_LAP_DISCONNECT_INDICATION\n", __FUNCTION__); + irlmp_next_lap_state(self, LAP_STANDBY); + + /* Send disconnect event to all LSAPs using this link */ + irlmp_do_all_lsap_event(self->lsaps, + LM_LAP_DISCONNECT_INDICATION); + break; + case LM_LAP_DISCONNECT_REQUEST: + IRDA_DEBUG(4, "%s(), LM_LAP_DISCONNECT_REQUEST\n", __FUNCTION__); + + /* One of the LSAP did timeout or was closed, if it was + * the last one, try to get out of here - Jean II */ + if (HASHBIN_GET_SIZE(self->lsaps) <= 1) { + irlap_disconnect_request(self->irlap); + } + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __FUNCTION__, irlmp_event[event]); + break; + } +} + +/* + * Function irlmp_state_active (event, skb, info) + * + * ACTIVE, IrLAP connection is active + * + */ +static void irlmp_state_active(struct lap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + switch (event) { + case LM_LAP_CONNECT_REQUEST: + IRDA_DEBUG(4, "%s(), LS_CONNECT_REQUEST\n", __FUNCTION__); + + /* + * IrLAP may have a pending disconnect. We tried to close + * IrLAP, but it was postponed because the link was + * busy or we were still sending packets. As we now + * need it, make sure it stays on. Jean II + */ + irlap_clear_disconnect(self->irlap); + + /* + * LAP connection already active, just bounce back! Since we + * don't know which LSAP that tried to do this, we have to + * notify all LSAPs using this LAP, but that should be safe to + * do anyway. + */ + irlmp_do_all_lsap_event(self->lsaps, LM_LAP_CONNECT_CONFIRM); + + /* Needed by connect indication */ + irlmp_do_all_lsap_event(irlmp->unconnected_lsaps, + LM_LAP_CONNECT_CONFIRM); + /* Keep state */ + break; + case LM_LAP_DISCONNECT_REQUEST: + /* + * Need to find out if we should close IrLAP or not. If there + * is only one LSAP connection left on this link, that LSAP + * must be the one that tries to close IrLAP. It will be + * removed later and moved to the list of unconnected LSAPs + */ + if (HASHBIN_GET_SIZE(self->lsaps) > 0) { + /* Timer value is checked in irsysctl - Jean II */ + irlmp_start_idle_timer(self, sysctl_lap_keepalive_time * HZ / 1000); + } else { + /* No more connections, so close IrLAP */ + + /* We don't want to change state just yet, because + * we want to reflect accurately the real state of + * the LAP, not the state we wish it was in, + * so that we don't lose LM_LAP_CONNECT_REQUEST. + * In some cases, IrLAP won't close the LAP + * immediately. For example, it might still be + * retrying packets or waiting for the pf bit. + * As the LAP always send a DISCONNECT_INDICATION + * in PCLOSE or SCLOSE, just change state on that. + * Jean II */ + irlap_disconnect_request(self->irlap); + } + break; + case LM_LAP_IDLE_TIMEOUT: + if (HASHBIN_GET_SIZE(self->lsaps) == 0) { + /* Same reasoning as above - keep state */ + irlap_disconnect_request(self->irlap); + } + break; + case LM_LAP_DISCONNECT_INDICATION: + irlmp_next_lap_state(self, LAP_STANDBY); + + /* In some case, at this point our side has already closed + * all lsaps, and we are waiting for the idle_timer to + * expire. If another device reconnect immediately, the + * idle timer will expire in the midle of the connection + * initialisation, screwing up things a lot... 
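Note how lazily the LAP is torn down: losing the last LSAP only arms an idle timer (sysctl_lap_keepalive_time), and the link is closed only if LM_LAP_IDLE_TIMEOUT fires while still no LSAP is bound. A standalone sketch of that policy, using a timestamp in place of the kernel timer; the three second keepalive and all names are illustrative.

    #include <stdio.h>
    #include <time.h>

    #define KEEPALIVE_SECS 3        /* stand-in for sysctl_lap_keepalive_time */

    struct lap {
            int nr_lsaps;
            time_t idle_since;      /* 0 = idle timer not running */
    };

    static void lsap_closed(struct lap *lap)
    {
            if (--lap->nr_lsaps == 0)
                    lap->idle_since = time(NULL);   /* arm the idle timer */
    }

    static void lsap_opened(struct lap *lap)
    {
            lap->nr_lsaps++;
            lap->idle_since = 0;                    /* cancel the idle timer */
    }

    /* Called periodically; closes the link only if it stayed unused. */
    static void idle_tick(struct lap *lap)
    {
            if (lap->nr_lsaps == 0 && lap->idle_since &&
                time(NULL) - lap->idle_since >= KEEPALIVE_SECS) {
                    printf("closing idle LAP\n");
                    lap->idle_since = 0;
            }
    }

    int main(void)
    {
            struct lap lap = { .nr_lsaps = 1 };

            lsap_closed(&lap);      /* last user gone: keepalive window starts */
            idle_tick(&lap);        /* too early, nothing happens */
            lsap_opened(&lap);      /* a new LSAP cancels the pending teardown */
            return 0;
    }
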
+ * Therefore, we must stop the timer... */ + irlmp_stop_idle_timer(self); + + /* + * Inform all connected LSAP's using this link + */ + irlmp_do_all_lsap_event(self->lsaps, + LM_LAP_DISCONNECT_INDICATION); + + /* Force an expiry of the discovery log. + * Now that the LAP is free, the system may attempt to + * connect to another device. Unfortunately, our entries + * are stale. There is a small window (<3s) before the + * normal discovery will run and where irlmp_connect_request() + * can get the wrong info, so make sure things get + * cleaned *NOW* ;-) - Jean II */ + irlmp_do_expiry(); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s\n", + __FUNCTION__, irlmp_event[event]); + break; + } +} + +/********************************************************************* + * + * LSAP connection control states + * + ********************************************************************/ + +/* + * Function irlmp_state_disconnected (event, skb, info) + * + * DISCONNECTED + * + */ +static int irlmp_state_disconnected(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + switch (event) { +#ifdef CONFIG_IRDA_ULTRA + case LM_UDATA_INDICATION: + /* This is most bizzare. Those packets are aka unreliable + * connected, aka IrLPT or SOCK_DGRAM/IRDAPROTO_UNITDATA. + * Why do we pass them as Ultra ??? Jean II */ + irlmp_connless_data_indication(self, skb); + break; +#endif /* CONFIG_IRDA_ULTRA */ + case LM_CONNECT_REQUEST: + IRDA_DEBUG(4, "%s(), LM_CONNECT_REQUEST\n", __FUNCTION__); + + if (self->conn_skb) { + IRDA_WARNING("%s: busy with another request!\n", + __FUNCTION__); + return -EBUSY; + } + /* Don't forget to refcount it (see irlmp_connect_request()) */ + skb_get(skb); + self->conn_skb = skb; + + irlmp_next_lsap_state(self, LSAP_SETUP_PEND); + + /* Start watchdog timer (5 secs for now) */ + irlmp_start_watchdog_timer(self, 5*HZ); + + irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL); + break; + case LM_CONNECT_INDICATION: + if (self->conn_skb) { + IRDA_WARNING("%s: busy with another request!\n", + __FUNCTION__); + return -EBUSY; + } + /* Don't forget to refcount it (see irlap_driver_rcv()) */ + skb_get(skb); + self->conn_skb = skb; + + irlmp_next_lsap_state(self, LSAP_CONNECT_PEND); + + /* Start watchdog timer + * This is not mentionned in the spec, but there is a rare + * race condition that can get the socket stuck. + * If we receive this event while our LAP is closing down, + * the LM_LAP_CONNECT_REQUEST get lost and we get stuck in + * CONNECT_PEND state forever. + * The other cause of getting stuck down there is if the + * higher layer never reply to the CONNECT_INDICATION. + * Anyway, it make sense to make sure that we always have + * a backup plan. 1 second is plenty (should be immediate). 
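A pending connect always owns its own reference on the skb: skb_get() before stashing it in conn_skb, and exactly one dev_kfree_skb() on every way out, whether the connection completes or the watchdog fires. A standalone sketch of that ownership rule, with a trivial refcounted buffer standing in for the skb (all names invented):

    #include <stdio.h>
    #include <stdlib.h>

    struct buf {
            int refs;
            char data[32];
    };

    static struct buf *buf_get(struct buf *b)      /* like skb_get() */
    {
            b->refs++;
            return b;
    }

    static void buf_put(struct buf *b)             /* like dev_kfree_skb() */
    {
            if (--b->refs == 0)
                    free(b);
    }

    static struct buf *pending;                    /* plays the role of conn_skb */

    static void connect_indication(struct buf *b)
    {
            pending = buf_get(b);   /* hold it while the response is outstanding */
    }

    static void watchdog_timeout(void)
    {
            if (pending) {
                    buf_put(pending);               /* give up: drop our reference */
                    pending = NULL;
            }
    }

    int main(void)
    {
            struct buf *b = calloc(1, sizeof(*b));

            b->refs = 1;            /* the caller's own reference */
            connect_indication(b);
            watchdog_timeout();     /* pending reference dropped */
            buf_put(b);             /* caller's reference dropped, buffer freed */
            printf("no references leaked\n");
            return 0;
    }
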
+ * Jean II */ + irlmp_start_watchdog_timer(self, 1*HZ); + + irlmp_do_lap_event(self->lap, LM_LAP_CONNECT_REQUEST, NULL); + break; + default: + IRDA_DEBUG(1, "%s(), Unknown event %s on LSAP %#02x\n", + __FUNCTION__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_connect (self, event, skb) + * + * CONNECT + * + */ +static int irlmp_state_connect(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + struct lsap_cb *lsap; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + switch (event) { + case LM_CONNECT_RESPONSE: + /* + * Bind this LSAP to the IrLAP link where the connect was + * received + */ + lsap = hashbin_remove(irlmp->unconnected_lsaps, (long) self, + NULL); + + IRDA_ASSERT(lsap == self, return -1;); + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->lsaps != NULL, return -1;); + + hashbin_insert(self->lap->lsaps, (irda_queue_t *) self, + (long) self, NULL); + + set_bit(0, &self->connected); /* TRUE */ + + irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, CONNECT_CNF, skb); + + del_timer(&self->watchdog_timer); + + irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY); + break; + case LM_WATCHDOG_TIMEOUT: + /* May happen, who knows... + * Jean II */ + IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __FUNCTION__); + + /* Disconnect, get out... - Jean II */ + self->lap = NULL; + self->dlsap_sel = LSAP_ANY; + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + break; + default: + /* LM_LAP_DISCONNECT_INDICATION : Should never happen, we + * are *not* yet bound to the IrLAP link. Jean II */ + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __FUNCTION__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_connect_pend (event, skb, info) + * + * CONNECT_PEND + * + */ +static int irlmp_state_connect_pend(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + switch (event) { + case LM_CONNECT_REQUEST: + /* Keep state */ + break; + case LM_CONNECT_RESPONSE: + IRDA_DEBUG(0, "%s(), LM_CONNECT_RESPONSE, " + "no indication issued yet\n", __FUNCTION__); + /* Keep state */ + break; + case LM_DISCONNECT_REQUEST: + IRDA_DEBUG(0, "%s(), LM_DISCONNECT_REQUEST, " + "not yet bound to IrLAP connection\n", __FUNCTION__); + /* Keep state */ + break; + case LM_LAP_CONNECT_CONFIRM: + IRDA_DEBUG(4, "%s(), LS_CONNECT_CONFIRM\n", __FUNCTION__); + irlmp_next_lsap_state(self, LSAP_CONNECT); + + tx_skb = self->conn_skb; + self->conn_skb = NULL; + + irlmp_connect_indication(self, tx_skb); + /* Drop reference count - see irlmp_connect_indication(). */ + dev_kfree_skb(tx_skb); + break; + case LM_WATCHDOG_TIMEOUT: + /* Will happen in some rare cases because of a race condition. + * Just make sure we don't stay there forever... + * Jean II */ + IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __FUNCTION__); + + /* Go back to disconnected mode, keep the socket waiting */ + self->lap = NULL; + self->dlsap_sel = LSAP_ANY; + if(self->conn_skb) + dev_kfree_skb(self->conn_skb); + self->conn_skb = NULL; + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + break; + default: + /* LM_LAP_DISCONNECT_INDICATION : Should never happen, we + * are *not* yet bound to the IrLAP link. 
Jean II */ + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __FUNCTION__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_dtr (self, event, skb) + * + * DATA_TRANSFER_READY + * + */ +static int irlmp_state_dtr(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + LM_REASON reason; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + IRDA_ASSERT(self->lap != NULL, return -1;); + + switch (event) { + case LM_DATA_REQUEST: /* Optimize for the common case */ + irlmp_send_data_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, FALSE, skb); + break; + case LM_DATA_INDICATION: /* Optimize for the common case */ + irlmp_data_indication(self, skb); + break; + case LM_UDATA_REQUEST: + IRDA_ASSERT(skb != NULL, return -1;); + irlmp_send_data_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, TRUE, skb); + break; + case LM_UDATA_INDICATION: + irlmp_udata_indication(self, skb); + break; + case LM_CONNECT_REQUEST: + IRDA_DEBUG(0, "%s(), LM_CONNECT_REQUEST, " + "error, LSAP already connected\n", __FUNCTION__); + /* Keep state */ + break; + case LM_CONNECT_RESPONSE: + IRDA_DEBUG(0, "%s(), LM_CONNECT_RESPONSE, " + "error, LSAP already connected\n", __FUNCTION__); + /* Keep state */ + break; + case LM_DISCONNECT_REQUEST: + irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, self->slsap_sel, + DISCONNECT, skb); + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + /* Called only from irlmp_disconnect_request(), will + * unbind from LAP over there. Jean II */ + + /* Try to close the LAP connection if its still there */ + if (self->lap) { + IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", + __FUNCTION__); + irlmp_do_lap_event(self->lap, + LM_LAP_DISCONNECT_REQUEST, + NULL); + } + break; + case LM_LAP_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + reason = irlmp_convert_lap_reason(self->lap->reason); + + irlmp_disconnect_indication(self, reason, NULL); + break; + case LM_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + + IRDA_ASSERT(skb != NULL, return -1;); + IRDA_ASSERT(skb->len > 3, return -1;); + reason = skb->data[3]; + + /* Try to close the LAP connection */ + IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", __FUNCTION__); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + + irlmp_disconnect_indication(self, reason, skb); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __FUNCTION__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_setup (event, skb, info) + * + * SETUP, Station Control has set up the underlying IrLAP connection. + * An LSAP connection request has been transmitted to the peer + * LSAP-Connection Control FSM and we are awaiting reply. 
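On the data path the only framing IrLMP adds is the two address octets: irlmp_data_request() reserves them with skb_push(), irlmp_send_data_pdu() in irlmp_frame.c below fills in dlsap and slsap, and the receive side strips them again with skb_pull(). The sketch below replays that round trip on a byte buffer, assuming LMP_HEADER is exactly those two octets as the connectionless path earlier implies; the addresses and helper names are illustrative.

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    #define LMP_HEADER 2                    /* dlsap + slsap */

    /* Sender side: prepend the two LSAP selectors, like skb_push() followed
     * by irlmp_send_data_pdu() filling data[0]/data[1]. */
    static size_t lmp_wrap(uint8_t *frame, uint8_t dlsap, uint8_t slsap,
                           const uint8_t *payload, size_t len)
    {
            frame[0] = dlsap;
            frame[1] = slsap;
            memcpy(frame + LMP_HEADER, payload, len);
            return LMP_HEADER + len;
    }

    /* Receiver side: peel the header off again, like skb_pull(LMP_HEADER)
     * in irlmp_data_indication(). */
    static const uint8_t *lmp_unwrap(const uint8_t *frame, size_t len,
                                     uint8_t *dlsap, uint8_t *slsap,
                                     size_t *payload_len)
    {
            *dlsap = frame[0];
            *slsap = frame[1];
            *payload_len = len - LMP_HEADER;
            return frame + LMP_HEADER;
    }

    int main(void)
    {
            uint8_t frame[64];
            uint8_t d, s;
            size_t plen;
            size_t n = lmp_wrap(frame, 0x23, 0x11, (const uint8_t *)"hi", 3);
            const uint8_t *p = lmp_unwrap(frame, n, &d, &s, &plen);

            printf("dlsap=%02x slsap=%02x payload=%s\n", d, s, (const char *)p);
            return 0;
    }
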
+ */ +static int irlmp_state_setup(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + LM_REASON reason; + int ret = 0; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LMP_LSAP_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + switch (event) { + case LM_CONNECT_CONFIRM: + irlmp_next_lsap_state(self, LSAP_DATA_TRANSFER_READY); + + del_timer(&self->watchdog_timer); + + irlmp_connect_confirm(self, skb); + break; + case LM_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + + IRDA_ASSERT(skb != NULL, return -1;); + IRDA_ASSERT(skb->len > 3, return -1;); + reason = skb->data[3]; + + /* Try to close the LAP connection */ + IRDA_DEBUG(4, "%s(), trying to close IrLAP\n", __FUNCTION__); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + + irlmp_disconnect_indication(self, reason, skb); + break; + case LM_LAP_DISCONNECT_INDICATION: + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + del_timer(&self->watchdog_timer); + + IRDA_ASSERT(self->lap != NULL, return -1;); + IRDA_ASSERT(self->lap->magic == LMP_LAP_MAGIC, return -1;); + + reason = irlmp_convert_lap_reason(self->lap->reason); + + irlmp_disconnect_indication(self, reason, skb); + break; + case LM_WATCHDOG_TIMEOUT: + IRDA_DEBUG(0, "%s() WATCHDOG_TIMEOUT!\n", __FUNCTION__); + + IRDA_ASSERT(self->lap != NULL, return -1;); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __FUNCTION__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} + +/* + * Function irlmp_state_setup_pend (event, skb, info) + * + * SETUP_PEND, An LM_CONNECT_REQUEST has been received from the service + * user to set up an LSAP connection. A request has been sent to the + * LAP FSM to set up the underlying IrLAP connection, and we + * are awaiting confirm. + */ +static int irlmp_state_setup_pend(struct lsap_cb *self, IRLMP_EVENT event, + struct sk_buff *skb) +{ + struct sk_buff *tx_skb; + LM_REASON reason; + int ret = 0; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(irlmp != NULL, return -1;); + + switch (event) { + case LM_LAP_CONNECT_CONFIRM: + IRDA_ASSERT(self->conn_skb != NULL, return -1;); + + tx_skb = self->conn_skb; + self->conn_skb = NULL; + + irlmp_send_lcf_pdu(self->lap, self->dlsap_sel, + self->slsap_sel, CONNECT_CMD, tx_skb); + /* Drop reference count - see irlap_data_request(). 
*/ + dev_kfree_skb(tx_skb); + + irlmp_next_lsap_state(self, LSAP_SETUP); + break; + case LM_WATCHDOG_TIMEOUT: + IRDA_DEBUG(0, "%s() : WATCHDOG_TIMEOUT !\n", __FUNCTION__); + + IRDA_ASSERT(self->lap != NULL, return -1;); + irlmp_do_lap_event(self->lap, LM_LAP_DISCONNECT_REQUEST, NULL); + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + irlmp_disconnect_indication(self, LM_CONNECT_FAILURE, NULL); + break; + case LM_LAP_DISCONNECT_INDICATION: /* LS_Disconnect.indication */ + del_timer( &self->watchdog_timer); + + irlmp_next_lsap_state(self, LSAP_DISCONNECTED); + + reason = irlmp_convert_lap_reason(self->lap->reason); + + irlmp_disconnect_indication(self, reason, NULL); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown event %s on LSAP %#02x\n", + __FUNCTION__, irlmp_event[event], self->slsap_sel); + break; + } + return ret; +} diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c new file mode 100644 index 000000000000..91cd268172fa --- /dev/null +++ b/net/irda/irlmp_frame.c @@ -0,0 +1,491 @@ +/********************************************************************* + * + * Filename: irlmp_frame.c + * Version: 0.9 + * Description: IrLMP frame implementation + * Status: Experimental. + * Author: Dag Brattli + * Created at: Tue Aug 19 02:09:59 1997 + * Modified at: Mon Dec 13 13:41:12 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999 Dag Brattli + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap, + __u8 slsap, int status, hashbin_t *); + +inline void irlmp_send_data_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, + int expedited, struct sk_buff *skb) +{ + skb->data[0] = dlsap; + skb->data[1] = slsap; + + if (expedited) { + IRDA_DEBUG(4, "%s(), sending expedited data\n", __FUNCTION__); + irlap_data_request(self->irlap, skb, TRUE); + } else + irlap_data_request(self->irlap, skb, FALSE); +} + +/* + * Function irlmp_send_lcf_pdu (dlsap, slsap, opcode,skb) + * + * Send Link Control Frame to IrLAP + */ +void irlmp_send_lcf_pdu(struct lap_cb *self, __u8 dlsap, __u8 slsap, + __u8 opcode, struct sk_buff *skb) +{ + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + frame = skb->data; + + frame[0] = dlsap | CONTROL_BIT; + frame[1] = slsap; + + frame[2] = opcode; + + if (opcode == DISCONNECT) + frame[3] = 0x01; /* Service user request */ + else + frame[3] = 0x00; /* rsvd */ + + irlap_data_request(self->irlap, skb, FALSE); +} + +/* + * Function irlmp_input (skb) + * + * Used by IrLAP to pass received data frames to IrLMP layer + * + */ +void irlmp_link_data_indication(struct lap_cb *self, struct sk_buff *skb, + int unreliable) +{ + struct lsap_cb *lsap; + __u8 slsap_sel; /* Source (this) LSAP address */ + __u8 dlsap_sel; /* Destination LSAP address */ + __u8 *fp; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(skb->len > 2, return;); + + fp = skb->data; + + /* + * The next statements may be confusing, but we do this so that + * destination LSAP of received frame is source LSAP in our view + */ + slsap_sel = fp[0] & LSAP_MASK; + dlsap_sel = fp[1]; + + /* + * Check if this is an incoming connection, since we must deal with + * it in a different way than other established connections. + */ + if ((fp[0] & CONTROL_BIT) && (fp[2] == CONNECT_CMD)) { + IRDA_DEBUG(3, "%s(), incoming connection, " + "source LSAP=%d, dest LSAP=%d\n", + __FUNCTION__, slsap_sel, dlsap_sel); + + /* Try to find LSAP among the unconnected LSAPs */ + lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, CONNECT_CMD, + irlmp->unconnected_lsaps); + + /* Maybe LSAP was already connected, so try one more time */ + if (!lsap) { + IRDA_DEBUG(1, "%s(), incoming connection for LSAP already connected\n", __FUNCTION__); + lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0, + self->lsaps); + } + } else + lsap = irlmp_find_lsap(self, dlsap_sel, slsap_sel, 0, + self->lsaps); + + if (lsap == NULL) { + IRDA_DEBUG(2, "IrLMP, Sorry, no LSAP for received frame!\n"); + IRDA_DEBUG(2, "%s(), slsap_sel = %02x, dlsap_sel = %02x\n", + __FUNCTION__, slsap_sel, dlsap_sel); + if (fp[0] & CONTROL_BIT) { + IRDA_DEBUG(2, "%s(), received control frame %02x\n", + __FUNCTION__, fp[2]); + } else { + IRDA_DEBUG(2, "%s(), received data frame\n", __FUNCTION__); + } + return; + } + + /* + * Check if we received a control frame? 
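+ * For reference, the LM-PDU header here has the layout built by
+ * irlmp_send_lcf_pdu() above : fp[0] = DLSAP | CONTROL_BIT,
+ * fp[1] = SLSAP, fp[2] = opcode and fp[3] = parameter, so the
+ * CONTROL_BIT in the first byte is what separates control frames
+ * from plain data frames.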
+ */ + if (fp[0] & CONTROL_BIT) { + switch (fp[2]) { + case CONNECT_CMD: + lsap->lap = self; + irlmp_do_lsap_event(lsap, LM_CONNECT_INDICATION, skb); + break; + case CONNECT_CNF: + irlmp_do_lsap_event(lsap, LM_CONNECT_CONFIRM, skb); + break; + case DISCONNECT: + IRDA_DEBUG(4, "%s(), Disconnect indication!\n", + __FUNCTION__); + irlmp_do_lsap_event(lsap, LM_DISCONNECT_INDICATION, + skb); + break; + case ACCESSMODE_CMD: + IRDA_DEBUG(0, "Access mode cmd not implemented!\n"); + break; + case ACCESSMODE_CNF: + IRDA_DEBUG(0, "Access mode cnf not implemented!\n"); + break; + default: + IRDA_DEBUG(0, "%s(), Unknown control frame %02x\n", + __FUNCTION__, fp[2]); + break; + } + } else if (unreliable) { + /* Optimize and bypass the state machine if possible */ + if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY) + irlmp_udata_indication(lsap, skb); + else + irlmp_do_lsap_event(lsap, LM_UDATA_INDICATION, skb); + } else { + /* Optimize and bypass the state machine if possible */ + if (lsap->lsap_state == LSAP_DATA_TRANSFER_READY) + irlmp_data_indication(lsap, skb); + else + irlmp_do_lsap_event(lsap, LM_DATA_INDICATION, skb); + } +} + +/* + * Function irlmp_link_unitdata_indication (self, skb) + * + * + * + */ +#ifdef CONFIG_IRDA_ULTRA +void irlmp_link_unitdata_indication(struct lap_cb *self, struct sk_buff *skb) +{ + struct lsap_cb *lsap; + __u8 slsap_sel; /* Source (this) LSAP address */ + __u8 dlsap_sel; /* Destination LSAP address */ + __u8 pid; /* Protocol identifier */ + __u8 *fp; + unsigned long flags; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(skb->len > 2, return;); + + fp = skb->data; + + /* + * The next statements may be confusing, but we do this so that + * destination LSAP of received frame is source LSAP in our view + */ + slsap_sel = fp[0] & LSAP_MASK; + dlsap_sel = fp[1]; + pid = fp[2]; + + if (pid & 0x80) { + IRDA_DEBUG(0, "%s(), extension in PID not supp!\n", + __FUNCTION__); + return; + } + + /* Check if frame is addressed to the connectionless LSAP */ + if ((slsap_sel != LSAP_CONNLESS) || (dlsap_sel != LSAP_CONNLESS)) { + IRDA_DEBUG(0, "%s(), dropping frame!\n", __FUNCTION__); + return; + } + + /* Search the connectionless LSAP */ + spin_lock_irqsave(&irlmp->unconnected_lsaps->hb_spinlock, flags); + lsap = (struct lsap_cb *) hashbin_get_first(irlmp->unconnected_lsaps); + while (lsap != NULL) { + /* + * Check if source LSAP and dest LSAP selectors and PID match. 
+ */ + if ((lsap->slsap_sel == slsap_sel) && + (lsap->dlsap_sel == dlsap_sel) && + (lsap->pid == pid)) + { + break; + } + lsap = (struct lsap_cb *) hashbin_get_next(irlmp->unconnected_lsaps); + } + spin_unlock_irqrestore(&irlmp->unconnected_lsaps->hb_spinlock, flags); + + if (lsap) + irlmp_connless_data_indication(lsap, skb); + else { + IRDA_DEBUG(0, "%s(), found no matching LSAP!\n", __FUNCTION__); + } +} +#endif /* CONFIG_IRDA_ULTRA */ + +/* + * Function irlmp_link_disconnect_indication (reason, userdata) + * + * IrLAP has disconnected + * + */ +void irlmp_link_disconnect_indication(struct lap_cb *lap, + struct irlap_cb *irlap, + LAP_REASON reason, + struct sk_buff *skb) +{ + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(lap != NULL, return;); + IRDA_ASSERT(lap->magic == LMP_LAP_MAGIC, return;); + + lap->reason = reason; + lap->daddr = DEV_ADDR_ANY; + + /* FIXME: must do something with the skb if any */ + + /* + * Inform station state machine + */ + irlmp_do_lap_event(lap, LM_LAP_DISCONNECT_INDICATION, NULL); +} + +/* + * Function irlmp_link_connect_indication (qos) + * + * Incoming LAP connection! + * + */ +void irlmp_link_connect_indication(struct lap_cb *self, __u32 saddr, + __u32 daddr, struct qos_info *qos, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + /* Copy QoS settings for this session */ + self->qos = qos; + + /* Update destination device address */ + self->daddr = daddr; + IRDA_ASSERT(self->saddr == saddr, return;); + + irlmp_do_lap_event(self, LM_LAP_CONNECT_INDICATION, skb); +} + +/* + * Function irlmp_link_connect_confirm (qos) + * + * LAP connection confirmed! + * + */ +void irlmp_link_connect_confirm(struct lap_cb *self, struct qos_info *qos, + struct sk_buff *skb) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + IRDA_ASSERT(qos != NULL, return;); + + /* Don't need use the skb for now */ + + /* Copy QoS settings for this session */ + self->qos = qos; + + irlmp_do_lap_event(self, LM_LAP_CONNECT_CONFIRM, NULL); +} + +/* + * Function irlmp_link_discovery_indication (self, log) + * + * Device is discovering us + * + * It's not an answer to our own discoveries, just another device trying + * to perform discovery, but we don't want to miss the opportunity + * to exploit this information, because : + * o We may not actively perform discovery (just passive discovery) + * o This type of discovery is much more reliable. In some cases, it + * seem that less than 50% of our discoveries get an answer, while + * we always get ~100% of these. + * o Make faster discovery, statistically divide time of discovery + * events by 2 (important for the latency aspect and user feel) + * o Even is we do active discovery, the other node might not + * answer our discoveries (ex: Palm). The Palm will just perform + * one active discovery and connect directly to us. + * + * However, when both devices discover each other, they might attempt to + * connect to each other following the discovery event, and it would create + * collisions on the medium (SNRM battle). + * The "fix" for that is to disable all connection requests in IrLAP + * for 100ms after a discovery indication by setting the media_busy flag. + * Previously, we used to postpone the event which was quite ugly. Now + * that IrLAP takes care of this problem, just pass the event up... 
+ * + * Jean II + */ +void irlmp_link_discovery_indication(struct lap_cb *self, + discovery_t *discovery) +{ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + /* Add to main log, cleanup */ + irlmp_add_discovery(irlmp->cachelog, discovery); + + /* Just handle it the same way as a discovery confirm, + * bypass the LM_LAP state machine (see below) */ + irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_PASSIVE); +} + +/* + * Function irlmp_link_discovery_confirm (self, log) + * + * Called by IrLAP with a list of discoveries after the discovery + * request has been carried out. A NULL log is received if IrLAP + * was unable to carry out the discovery request + * + */ +void irlmp_link_discovery_confirm(struct lap_cb *self, hashbin_t *log) +{ + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LMP_LAP_MAGIC, return;); + + /* Add to main log, cleanup */ + irlmp_add_discovery_log(irlmp->cachelog, log); + + /* Propagate event to various LSAPs registered for it. + * We bypass the LM_LAP state machine because + * 1) We do it regardless of the LM_LAP state + * 2) It doesn't affect the LM_LAP state + * 3) Faster, slimer, simpler, ... + * Jean II */ + irlmp_discovery_confirm(irlmp->cachelog, DISCOVERY_ACTIVE); +} + +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP +static inline void irlmp_update_cache(struct lap_cb *lap, + struct lsap_cb *lsap) +{ + /* Prevent concurrent read to get garbage */ + lap->cache.valid = FALSE; + /* Update cache entry */ + lap->cache.dlsap_sel = lsap->dlsap_sel; + lap->cache.slsap_sel = lsap->slsap_sel; + lap->cache.lsap = lsap; + lap->cache.valid = TRUE; +} +#endif + +/* + * Function irlmp_find_handle (self, dlsap_sel, slsap_sel, status, queue) + * + * Find handle associated with destination and source LSAP + * + * Any IrDA connection (LSAP/TSAP) is uniquely identified by + * 3 parameters, the local lsap, the remote lsap and the remote address. + * We may initiate multiple connections to the same remote service + * (they will have different local lsap), a remote device may initiate + * multiple connections to the same local service (they will have + * different remote lsap), or multiple devices may connect to the same + * service and may use the same remote lsap (and they will have + * different remote address). + * So, where is the remote address ? Each LAP connection is made with + * a single remote device, so imply a specific remote address. + * Jean II + */ +static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel, + __u8 slsap_sel, int status, + hashbin_t *queue) +{ + struct lsap_cb *lsap; + unsigned long flags; + + /* + * Optimize for the common case. We assume that the last frame + * received is in the same connection as the last one, so check in + * cache first to avoid the linear search + */ +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + if ((self->cache.valid) && + (self->cache.slsap_sel == slsap_sel) && + (self->cache.dlsap_sel == dlsap_sel)) + { + return (self->cache.lsap); + } +#endif + + spin_lock_irqsave(&queue->hb_spinlock, flags); + + lsap = (struct lsap_cb *) hashbin_get_first(queue); + while (lsap != NULL) { + /* + * If this is an incoming connection, then the destination + * LSAP selector may have been specified as LM_ANY so that + * any client can connect. In that case we only need to check + * if the source LSAP (in our view!) match! 
+ */ + if ((status == CONNECT_CMD) && + (lsap->slsap_sel == slsap_sel) && + (lsap->dlsap_sel == LSAP_ANY)) { + /* This is where the dest lsap sel is set on incoming + * lsaps */ + lsap->dlsap_sel = dlsap_sel; + break; + } + /* + * Check if source LSAP and dest LSAP selectors match. + */ + if ((lsap->slsap_sel == slsap_sel) && + (lsap->dlsap_sel == dlsap_sel)) + break; + + lsap = (struct lsap_cb *) hashbin_get_next(queue); + } +#ifdef CONFIG_IRDA_CACHE_LAST_LSAP + if(lsap) + irlmp_update_cache(self, lsap); +#endif + spin_unlock_irqrestore(&queue->hb_spinlock, flags); + + /* Return what we've found or NULL */ + return lsap; +} diff --git a/net/irda/irmod.c b/net/irda/irmod.c new file mode 100644 index 000000000000..6ffaed4544e9 --- /dev/null +++ b/net/irda/irmod.c @@ -0,0 +1,185 @@ +/********************************************************************* + * + * Filename: irmod.c + * Version: 0.9 + * Description: IrDA stack main entry points + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Dec 15 13:55:39 1997 + * Modified at: Wed Jan 5 15:12:41 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999-2000 Dag Brattli, All Rights Reserved. + * Copyright (c) 2000-2004 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +/* + * This file contains the main entry points of the IrDA stack. + * They are in this file and not af_irda.c because some developpers + * are using the IrDA stack without the socket API (compiling out + * af_irda.c). + * Jean II + */ + +#include +#include +#include + +#include +#include /* notify_t */ +#include /* irlap_init */ +#include /* irlmp_init */ +#include /* iriap_init */ +#include /* irttp_init */ +#include /* irda_device_init */ + +/* irproc.c */ +extern void irda_proc_register(void); +extern void irda_proc_unregister(void); +/* irsysctl.c */ +extern int irda_sysctl_register(void); +extern void irda_sysctl_unregister(void); +/* af_irda.c */ +extern int irsock_init(void); +extern void irsock_cleanup(void); +/* irlap_frame.c */ +extern int irlap_driver_rcv(struct sk_buff *, struct net_device *, + struct packet_type *); + +/* + * Module parameters + */ +#ifdef CONFIG_IRDA_DEBUG +unsigned int irda_debug = IRDA_DEBUG_LEVEL; +module_param_named(debug, irda_debug, uint, 0); +MODULE_PARM_DESC(debug, "IRDA debugging level"); +EXPORT_SYMBOL(irda_debug); +#endif + +/* Packet type handler. + * Tell the kernel how IrDA packets should be handled. 
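+ * The handler is registered in irda_init() below via dev_add_pack(),
+ * so every frame a driver hands up with protocol ETH_P_IRDA is
+ * delivered to irlap_driver_rcv() (see irlap_frame.c).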
+ */ +static struct packet_type irda_packet_type = { + .type = __constant_htons(ETH_P_IRDA), + .func = irlap_driver_rcv, /* Packet type handler irlap_frame.c */ +}; + +/* + * Function irda_notify_init (notify) + * + * Used for initializing the notify structure + * + */ +void irda_notify_init(notify_t *notify) +{ + notify->data_indication = NULL; + notify->udata_indication = NULL; + notify->connect_confirm = NULL; + notify->connect_indication = NULL; + notify->disconnect_indication = NULL; + notify->flow_indication = NULL; + notify->status_indication = NULL; + notify->instance = NULL; + strlcpy(notify->name, "Unknown", sizeof(notify->name)); +} +EXPORT_SYMBOL(irda_notify_init); + +/* + * Function irda_init (void) + * + * Protocol stack initialisation entry point. + * Initialise the various components of the IrDA stack + */ +static int __init irda_init(void) +{ + IRDA_DEBUG(0, "%s()\n", __FUNCTION__); + + /* Lower layer of the stack */ + irlmp_init(); + irlap_init(); + + /* Higher layers of the stack */ + iriap_init(); + irttp_init(); + irsock_init(); + + /* Add IrDA packet type (Start receiving packets) */ + dev_add_pack(&irda_packet_type); + + /* External APIs */ +#ifdef CONFIG_PROC_FS + irda_proc_register(); +#endif +#ifdef CONFIG_SYSCTL + irda_sysctl_register(); +#endif + + /* Driver/dongle support */ + irda_device_init(); + + return 0; +} + +/* + * Function irda_cleanup (void) + * + * Protocol stack cleanup/removal entry point. + * Cleanup the various components of the IrDA stack + */ +static void __exit irda_cleanup(void) +{ + /* Remove External APIs */ +#ifdef CONFIG_SYSCTL + irda_sysctl_unregister(); +#endif +#ifdef CONFIG_PROC_FS + irda_proc_unregister(); +#endif + + /* Remove IrDA packet type (stop receiving packets) */ + dev_remove_pack(&irda_packet_type); + + /* Remove higher layers */ + irsock_cleanup(); + irttp_cleanup(); + iriap_cleanup(); + + /* Remove lower layers */ + irda_device_cleanup(); + irlap_cleanup(); /* Must be done before irlmp_cleanup()! DB */ + + /* Remove middle layer */ + irlmp_cleanup(); +} + +/* + * The IrDA stack must be initialised *before* drivers get initialised, + * and *before* higher protocols (IrLAN/IrCOMM/IrNET) get initialised, + * otherwise bad things will happen (hashbins will be NULL for example). + * Those modules are at module_init()/device_initcall() level. + * + * On the other hand, it needs to be initialised *after* the basic + * networking, the /proc/net filesystem and sysctl module. Those are + * currently initialised in .../init/main.c (before initcalls). + * Also, IrDA drivers needs to be initialised *after* the random number + * generator (main stack and higher layer init don't need it anymore). + * + * Jean II + */ +subsys_initcall(irda_init); +module_exit(irda_cleanup); + +MODULE_AUTHOR("Dag Brattli & Jean Tourrilhes "); +MODULE_DESCRIPTION("The Linux IrDA Protocol Stack"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_IRDA); diff --git a/net/irda/irnet/Kconfig b/net/irda/irnet/Kconfig new file mode 100644 index 000000000000..28c557f0fdd2 --- /dev/null +++ b/net/irda/irnet/Kconfig @@ -0,0 +1,13 @@ +config IRNET + tristate "IrNET protocol" + depends on IRDA && PPP + help + Say Y here if you want to build support for the IrNET protocol. + To compile it as a module, choose M here: the module will be + called irnet. IrNET is a PPP driver, so you will also need a + working PPP subsystem (driver, daemon and config)... + + IrNET is an alternate way to transfer TCP/IP traffic over IrDA. 
It + uses synchronous PPP over a set of point to point IrDA sockets. You + can use it between Linux machine or with W2k. + diff --git a/net/irda/irnet/Makefile b/net/irda/irnet/Makefile new file mode 100644 index 000000000000..b3ee01e0def3 --- /dev/null +++ b/net/irda/irnet/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux IrDA IrNET protocol layer. +# + +obj-$(CONFIG_IRNET) += irnet.o + +irnet-objs := irnet_ppp.o irnet_irda.o diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h new file mode 100644 index 000000000000..9004f7349a76 --- /dev/null +++ b/net/irda/irnet/irnet.h @@ -0,0 +1,529 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file contains definitions and declarations global to the IrNET module, + * all grouped in one place... + * This file is a *private* header, so other modules don't want to know + * what's in there... + * + * Note : as most part of the Linux kernel, this module is available + * under the GNU General Public License (GPL). + */ + +#ifndef IRNET_H +#define IRNET_H + +/************************** DOCUMENTATION ***************************/ +/* + * What is IrNET + * ------------- + * IrNET is a protocol allowing to carry TCP/IP traffic between two + * IrDA peers in an efficient fashion. It is a thin layer, passing PPP + * packets to IrTTP and vice versa. It uses PPP in synchronous mode, + * because IrTTP offer a reliable sequenced packet service (as opposed + * to a byte stream). In fact, you could see IrNET as carrying TCP/IP + * in a IrDA socket, using PPP to provide the glue. + * + * The main difference with traditional PPP over IrCOMM is that we + * avoid the framing and serial emulation which are a performance + * bottleneck. It also allows multipoint communications in a sensible + * fashion. + * + * The main difference with IrLAN is that we use PPP for the link + * management, which is more standard, interoperable and flexible than + * the IrLAN protocol. For example, PPP adds authentication, + * encryption, compression, header compression and automated routing + * setup. And, as IrNET let PPP do the hard work, the implementation + * is much simpler than IrLAN. + * + * The Linux implementation + * ------------------------ + * IrNET is written on top of the Linux-IrDA stack, and interface with + * the generic Linux PPP driver. Because IrNET depend on recent + * changes of the PPP driver interface, IrNET will work only with very + * recent kernel (2.3.99-pre6 and up). + * + * The present implementation offer the following features : + * o simple user interface using pppd + * o efficient implementation (interface directly to PPP and IrTTP) + * o addressing (you can specify the name of the IrNET recipient) + * o multipoint operation (limited by IrLAP specification) + * o information in /proc/net/irda/irnet + * o IrNET events on /dev/irnet (for user space daemon) + * o IrNET daemon (irnetd) to automatically handle incoming requests + * o Windows 2000 compatibility (tested, but need more work) + * Currently missing : + * o Lot's of testing (that's your job) + * o Connection retries (may be too hard to do) + * o Check pppd persist mode + * o User space daemon (to automatically handle incoming requests) + * + * The setup is not currently the most easy, but this should get much + * better when everything will get integrated... 
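+ *
+ * As a rough sketch of a typical session (illustration only, the
+ * exact pppd options depend on your setup) :
+ *	modprobe irnet
+ *	pppd /dev/irnet 9600 <your usual pppd options>
+ * possibly after writing the peer name or address to /dev/irnet via
+ * the control channel (see irnet_ppp.c).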
+ * + * Acknowledgements + * ---------------- + * This module is based on : + * o The PPP driver (ppp_synctty/ppp_generic) by Paul Mackerras + * o The IrLAN protocol (irlan_common/XXX) by Dag Brattli + * o The IrSock interface (af_irda) by Dag Brattli + * o Some other bits from the kernel and my drivers... + * Infinite thanks to those brave souls for providing the infrastructure + * upon which IrNET is built. + * + * Thanks to all my collegues in HP for helping me. In particular, + * thanks to Salil Pradhan and Bill Serra for W2k testing... + * Thanks to Luiz Magalhaes for irnetd and much testing... + * + * Thanks to Alan Cox for answering lot's of my stupid questions, and + * to Paul Mackerras answering my questions on how to best integrate + * IrNET and pppd. + * + * Jean II + * + * Note on some implementations choices... + * ------------------------------------ + * 1) Direct interface vs tty/socket + * I could have used a tty interface to hook to ppp and use the full + * socket API to connect to IrDA. The code would have been easier to + * maintain, and maybe the code would have been smaller... + * Instead, we hook directly to ppp_generic and to IrTTP, which make + * things more complicated... + * + * The first reason is flexibility : this allow us to create IrNET + * instances on demand (no /dev/ircommX crap) and to allow linkname + * specification on pppd command line... + * + * Second reason is speed optimisation. If you look closely at the + * transmit and receive paths, you will notice that they are "super lean" + * (that's why they look ugly), with no function calls and as little data + * copy and modification as I could... + * + * 2) irnetd in user space + * irnetd is implemented in user space, which is necessary to call pppd. + * This also give maximum benefits in term of flexibility and customability, + * and allow to offer the event channel, useful for other stuff like debug. + * + * On the other hand, this require a loose coordination between the + * present module and irnetd. One critical area is how incoming request + * are handled. + * When irnet receive an incoming request, it send an event to irnetd and + * drop the incoming IrNET socket. + * irnetd start a pppd instance, which create a new IrNET socket. This new + * socket is then connected in the originating node to the pppd instance. + * At this point, in the originating node, the first socket is closed. + * + * I admit, this is a bit messy and waste some resources. The alternative + * is caching incoming socket, and that's also quite messy and waste + * resources. + * We also make connection time slower. For example, on a 115 kb/s link it + * adds 60ms to the connection time (770 ms). However, this is slower than + * the time it takes to fire up pppd on my P133... + * + * + * History : + * ------- + * + * v1 - 15.5.00 - Jean II + * o Basic IrNET (hook to ppp_generic & IrTTP - incl. multipoint) + * o control channel on /dev/irnet (set name/address) + * o event channel on /dev/irnet (for user space daemon) + * + * v2 - 5.6.00 - Jean II + * o Enable DROP_NOT_READY to avoid PPP timeouts & other weirdness... + * o Add DISCONNECT_TO event and rename DISCONNECT_FROM. 
+ * o Set official device number alloaction on /dev/irnet + * + * v3 - 30.8.00 - Jean II + * o Update to latest Linux-IrDA changes : + * - queue_t => irda_queue_t + * o Update to ppp-2.4.0 : + * - move irda_irnet_connect from PPPIOCATTACH to TIOCSETD + * o Add EXPIRE event (depend on new IrDA-Linux patch) + * o Switch from `hashbin_remove' to `hashbin_remove_this' to fix + * a multilink bug... (depend on new IrDA-Linux patch) + * o fix a self->daddr to self->raddr in irda_irnet_connect to fix + * another multilink bug (darn !) + * o Remove LINKNAME_IOCTL cruft + * + * v3b - 31.8.00 - Jean II + * o Dump discovery log at event channel startup + * + * v4 - 28.9.00 - Jean II + * o Fix interaction between poll/select and dump discovery log + * o Add IRNET_BLOCKED_LINK event (depend on new IrDA-Linux patch) + * o Add IRNET_NOANSWER_FROM event (mostly to help support) + * o Release flow control in disconnect_indication + * o Block packets while connecting (speed up connections) + * + * v5 - 11.01.01 - Jean II + * o Init self->max_header_size, just in case... + * o Set up ap->chan.hdrlen, to get zero copy on tx side working. + * o avoid tx->ttp->flow->ppp->tx->... loop, by checking flow state + * Thanks to Christian Gennerat for finding this bug ! + * --- + * o Declare the proper MTU/MRU that we can support + * (but PPP doesn't read the MTU value :-() + * o Declare hashbin HB_NOLOCK instead of HB_LOCAL to avoid + * disabling and enabling irq twice + * + * v6 - 31.05.01 - Jean II + * o Print source address in Found, Discovery, Expiry & Request events + * o Print requested source address in /proc/net/irnet + * o Change control channel input. Allow multiple commands in one line. + * o Add saddr command to change ap->rsaddr (and use that in IrDA) + * --- + * o Make the IrDA connection procedure totally asynchronous. + * Heavy rewrite of the IAS query code and the whole connection + * procedure. Now, irnet_connect() no longer need to be called from + * a process context... + * o Enable IrDA connect retries in ppp_irnet_send(). The good thing + * is that IrDA connect retries are directly driven by PPP LCP + * retries (we retry for each LCP packet), so that everything + * is transparently controlled from pppd lcp-max-configure. + * o Add ttp_connect flag to prevent rentry on the connect procedure + * o Test and fixups to eliminate side effects of retries + * + * v7 - 22.08.01 - Jean II + * o Cleanup : Change "saddr = 0x0" to "saddr = DEV_ADDR_ANY" + * o Fix bug in BLOCK_WHEN_CONNECT introduced in v6 : due to the + * asynchronous IAS query, self->tsap is NULL when PPP send the + * first packet. This was preventing "connect-delay 0" to work. + * Change the test in ppp_irnet_send() to self->ttp_connect. + * + * v8 - 1.11.01 - Jean II + * o Tighten the use of self->ttp_connect and self->ttp_open to + * prevent various race conditions. + * o Avoid leaking discovery log and skb + * o Replace "self" with "server" in irnet_connect_indication() to + * better detect cut'n'paste error ;-) + * + * v9 - 29.11.01 - Jean II + * o Fix event generation in disconnect indication that I broke in v8 + * It was always generation "No-Answer" because I was testing ttp_open + * just after clearing it. *blush*. + * o Use newly created irttp_listen() to fix potential crash when LAP + * destroyed before irnet module removed. + * + * v10 - 4.3.2 - Jean II + * o When receiving a disconnect indication, don't reenable the + * PPP Tx queue, this will trigger a reconnect. Instead, close + * the channel, which will kill pppd... 
+ * + * v11 - 20.3.02 - Jean II + * o Oops ! v10 fix disabled IrNET retries and passive behaviour. + * Better fix in irnet_disconnect_indication() : + * - if connected, kill pppd via hangup. + * - if not connected, reenable ppp Tx, which trigger IrNET retry. + * + * v12 - 10.4.02 - Jean II + * o Fix race condition in irnet_connect_indication(). + * If the socket was already trying to connect, drop old connection + * and use new one only if acting as primary. See comments. + * + * v13 - 30.5.02 - Jean II + * o Update module init code + * + * v14 - 20.2.03 - Jean II + * o Add discovery hint bits in the control channel. + * o Remove obsolete MOD_INC/DEC_USE_COUNT in favor of .owner + * + * v15 - 7.4.03 - Jean II + * o Replace spin_lock_irqsave() with spin_lock_bh() so that we can + * use ppp_unit_number(). It's probably also better overall... + * o Disable call to ppp_unregister_channel(), because we can't do it. + */ + +/***************************** INCLUDES *****************************/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* isspace() */ +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/***************************** OPTIONS *****************************/ +/* + * Define or undefine to compile or not some optional part of the + * IrNET driver... + * Note : the present defaults make sense, play with that at your + * own risk... + */ +/* IrDA side of the business... */ +#define DISCOVERY_NOMASK /* To enable W2k compatibility... */ +#define ADVERTISE_HINT /* Advertise IrLAN hint bit */ +#define ALLOW_SIMULT_CONNECT /* This seem to work, cross fingers... */ +#define DISCOVERY_EVENTS /* Query the discovery log to post events */ +#define INITIAL_DISCOVERY /* Dump current discovery log as events */ +#undef STREAM_COMPAT /* Not needed - potentially messy */ +#undef CONNECT_INDIC_KICK /* Might mess IrDA, not needed */ +#undef FAIL_SEND_DISCONNECT /* Might mess IrDA, not needed */ +#undef PASS_CONNECT_PACKETS /* Not needed ? Safe */ +#undef MISSING_PPP_API /* Stuff I wish I could do */ + +/* PPP side of the business */ +#define BLOCK_WHEN_CONNECT /* Block packets when connecting */ +#define CONNECT_IN_SEND /* Retry IrDA connection procedure */ +#undef FLUSH_TO_PPP /* Not sure about this one, let's play safe */ +#undef SECURE_DEVIRNET /* Bah... */ + +/****************************** DEBUG ******************************/ + +/* + * This set of flags enable and disable all the various warning, + * error and debug message of this driver. 
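+ * Typical use, as seen all over this module :
+ *	DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self);
+ *	DEBUG(IRDA_SR_INFO, "result=%d\n", value->t.integer);
+ *	DERROR(IRDA_SR_ERROR, "connect aborted!\n");
+ * and the matching DEBUG_IRDA_SR_* flags below decide which of those
+ * actually reach the logs.
+ *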
+ * Each section can be enabled and disabled independently + */ +/* In the PPP part */ +#define DEBUG_CTRL_TRACE 0 /* Control channel */ +#define DEBUG_CTRL_INFO 0 /* various info */ +#define DEBUG_CTRL_ERROR 1 /* problems */ +#define DEBUG_FS_TRACE 0 /* filesystem callbacks */ +#define DEBUG_FS_INFO 0 /* various info */ +#define DEBUG_FS_ERROR 1 /* problems */ +#define DEBUG_PPP_TRACE 0 /* PPP related functions */ +#define DEBUG_PPP_INFO 0 /* various info */ +#define DEBUG_PPP_ERROR 1 /* problems */ +#define DEBUG_MODULE_TRACE 0 /* module insertion/removal */ +#define DEBUG_MODULE_ERROR 1 /* problems */ + +/* In the IrDA part */ +#define DEBUG_IRDA_SR_TRACE 0 /* IRDA subroutines */ +#define DEBUG_IRDA_SR_INFO 0 /* various info */ +#define DEBUG_IRDA_SR_ERROR 1 /* problems */ +#define DEBUG_IRDA_SOCK_TRACE 0 /* IRDA main socket functions */ +#define DEBUG_IRDA_SOCK_INFO 0 /* various info */ +#define DEBUG_IRDA_SOCK_ERROR 1 /* problems */ +#define DEBUG_IRDA_SERV_TRACE 0 /* The IrNET server */ +#define DEBUG_IRDA_SERV_INFO 0 /* various info */ +#define DEBUG_IRDA_SERV_ERROR 1 /* problems */ +#define DEBUG_IRDA_TCB_TRACE 0 /* IRDA IrTTP callbacks */ +#define DEBUG_IRDA_CB_INFO 0 /* various info */ +#define DEBUG_IRDA_CB_ERROR 1 /* problems */ +#define DEBUG_IRDA_OCB_TRACE 0 /* IRDA other callbacks */ +#define DEBUG_IRDA_OCB_INFO 0 /* various info */ +#define DEBUG_IRDA_OCB_ERROR 1 /* problems */ + +#define DEBUG_ASSERT 0 /* Verify all assertions */ + +/* + * These are the macros we are using to actually print the debug + * statements. Don't look at it, it's ugly... + * + * One of the trick is that, as the DEBUG_XXX are constant, the + * compiler will optimise away the if() in all cases. + */ +/* All error messages (will show up in the normal logs) */ +#define DERROR(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_INFO "irnet: %s(): " format, __FUNCTION__ , ##args);} + +/* Normal debug message (will show up in /var/log/debug) */ +#define DEBUG(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: %s(): " format, __FUNCTION__ , ##args);} + +/* Entering a function (trace) */ +#define DENTER(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: -> %s" format, __FUNCTION__ , ##args);} + +/* Entering and exiting a function in one go (trace) */ +#define DPASS(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: <>%s" format, __FUNCTION__ , ##args);} + +/* Exiting a function (trace) */ +#define DEXIT(dbg, format, args...) \ + {if(DEBUG_##dbg) \ + printk(KERN_DEBUG "irnet: <-%s()" format, __FUNCTION__ , ##args);} + +/* Exit a function with debug */ +#define DRETURN(ret, dbg, args...) \ + {DEXIT(dbg, ": " args);\ + return ret; } + +/* Exit a function on failed condition */ +#define DABORT(cond, ret, dbg, args...) \ + {if(cond) {\ + DERROR(dbg, args);\ + return ret; }} + +/* Invalid assertion, print out an error and exit... */ +#define DASSERT(cond, ret, dbg, args...) \ + {if((DEBUG_ASSERT) && !(cond)) {\ + DERROR(dbg, "Invalid assertion: " args);\ + return ret; }} + +/************************ CONSTANTS & MACROS ************************/ + +/* Paranoia */ +#define IRNET_MAGIC 0xB00754 + +/* Number of control events in the control channel buffer... */ +#define IRNET_MAX_EVENTS 8 /* Should be more than enough... */ + +/****************************** TYPES ******************************/ + +/* + * This is the main structure where we store all the data pertaining to + * one instance of irnet. 
+ * Note : in irnet functions, a pointer this structure is usually called + * "ap" or "self". If the code is borrowed from the IrDA stack, it tend + * to be called "self", and if it is borrowed from the PPP driver it is + * "ap". Apart from that, it's exactly the same structure ;-) + */ +typedef struct irnet_socket +{ + /* ------------------- Instance management ------------------- */ + /* We manage a linked list of IrNET socket instances */ + irda_queue_t q; /* Must be first - for hasbin */ + int magic; /* Paranoia */ + + /* --------------------- FileSystem part --------------------- */ + /* "pppd" interact directly with us on a /dev/ file */ + struct file * file; /* File descriptor of this instance */ + /* TTY stuff - to keep "pppd" happy */ + struct termios termios; /* Various tty flags */ + /* Stuff for the control channel */ + int event_index; /* Last read in the event log */ + + /* ------------------------- PPP part ------------------------- */ + /* We interface directly to the ppp_generic driver in the kernel */ + int ppp_open; /* registered with ppp_generic */ + struct ppp_channel chan; /* Interface to generic ppp layer */ + + int mru; /* Max size of PPP payload */ + u32 xaccm[8]; /* Asynchronous character map (just */ + u32 raccm; /* to please pppd - dummy) */ + unsigned int flags; /* PPP flags (compression, ...) */ + unsigned int rbits; /* Unused receive flags ??? */ + + /* ------------------------ IrTTP part ------------------------ */ + /* We create a pseudo "socket" over the IrDA tranport */ + unsigned long ttp_open; /* Set when IrTTP is ready */ + unsigned long ttp_connect; /* Set when IrTTP is connecting */ + struct tsap_cb * tsap; /* IrTTP instance (the connection) */ + + char rname[NICKNAME_MAX_LEN + 1]; + /* IrDA nickname of destination */ + __u32 rdaddr; /* Requested peer IrDA address */ + __u32 rsaddr; /* Requested local IrDA address */ + __u32 daddr; /* actual peer IrDA address */ + __u32 saddr; /* my local IrDA address */ + __u8 dtsap_sel; /* Remote TSAP selector */ + __u8 stsap_sel; /* Local TSAP selector */ + + __u32 max_sdu_size_rx;/* Socket parameters used for IrTTP */ + __u32 max_sdu_size_tx; + __u32 max_data_size; + __u8 max_header_size; + LOCAL_FLOW tx_flow; /* State of the Tx path in IrTTP */ + + /* ------------------- IrLMP and IrIAS part ------------------- */ + /* Used for IrDA Discovery and socket name resolution */ + void * ckey; /* IrLMP client handle */ + __u16 mask; /* Hint bits mask (filter discov.)*/ + int nslots; /* Number of slots for discovery */ + + struct iriap_cb * iriap; /* Used to query remote IAS */ + int errno; /* status of the IAS query */ + + /* -------------------- Discovery log part -------------------- */ + /* Used by initial discovery on the control channel + * and by irnet_discover_daddr_and_lsap_sel() */ + struct irda_device_info *discoveries; /* Copy of the discovery log */ + int disco_index; /* Last read in the discovery log */ + int disco_number; /* Size of the discovery log */ + +} irnet_socket; + +/* + * This is the various event that we will generate on the control channel + */ +typedef enum irnet_event +{ + IRNET_DISCOVER, /* New IrNET node discovered */ + IRNET_EXPIRE, /* IrNET node expired */ + IRNET_CONNECT_TO, /* IrNET socket has connected to other node */ + IRNET_CONNECT_FROM, /* Other node has connected to IrNET socket */ + IRNET_REQUEST_FROM, /* Non satisfied connection request */ + IRNET_NOANSWER_FROM, /* Failed connection request */ + IRNET_BLOCKED_LINK, /* Link (IrLAP) is blocked for > 3s */ + IRNET_DISCONNECT_FROM, 
/* IrNET socket has disconnected */ + IRNET_DISCONNECT_TO /* Closing IrNET socket */ +} irnet_event; + +/* + * This is the storage for an event and its arguments + */ +typedef struct irnet_log +{ + irnet_event event; + int unit; + __u32 saddr; + __u32 daddr; + char name[NICKNAME_MAX_LEN + 1]; /* 21 + 1 */ + __u16_host_order hints; /* Discovery hint bits */ +} irnet_log; + +/* + * This is the storage for all events and related stuff... + */ +typedef struct irnet_ctrl_channel +{ + irnet_log log[IRNET_MAX_EVENTS]; /* Event log */ + int index; /* Current index in log */ + spinlock_t spinlock; /* Serialize access to the event log */ + wait_queue_head_t rwait; /* processes blocked on read (or poll) */ +} irnet_ctrl_channel; + +/**************************** PROTOTYPES ****************************/ +/* + * Global functions of the IrNET module + * Note : we list here also functions called from one file to the other. + */ + +/* -------------------------- IRDA PART -------------------------- */ +extern int + irda_irnet_create(irnet_socket *); /* Initialise a IrNET socket */ +extern int + irda_irnet_connect(irnet_socket *); /* Try to connect over IrDA */ +extern void + irda_irnet_destroy(irnet_socket *); /* Teardown a IrNET socket */ +extern int + irda_irnet_init(void); /* Initialise IrDA part of IrNET */ +extern void + irda_irnet_cleanup(void); /* Teardown IrDA part of IrNET */ +/* ---------------------------- MODULE ---------------------------- */ +extern int + irnet_init(void); /* Initialise IrNET module */ + +/**************************** VARIABLES ****************************/ + +/* Control channel stuff - allocated in irnet_irda.h */ +extern struct irnet_ctrl_channel irnet_events; + +#endif /* IRNET_H */ diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c new file mode 100644 index 000000000000..07ec326c71f5 --- /dev/null +++ b/net/irda/irnet/irnet_irda.c @@ -0,0 +1,1866 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file implement the IRDA interface of IrNET. + * Basically, we sit on top of IrTTP. We set up IrTTP, IrIAS properly, + * and exchange frames with IrTTP. + */ + +#include "irnet_irda.h" /* Private header */ + +/************************* CONTROL CHANNEL *************************/ +/* + * When ppp is not active, /dev/irnet act as a control channel. + * Writing allow to set up the IrDA destination of the IrNET channel, + * and any application may be read events happening on IrNET... + */ + +/*------------------------------------------------------------------*/ +/* + * Post an event to the control channel... + * Put the event in the log, and then wait all process blocked on read + * so they can read the log... + */ +static void +irnet_post_event(irnet_socket * ap, + irnet_event event, + __u32 saddr, + __u32 daddr, + char * name, + __u16 hints) +{ + int index; /* In the log */ + + DENTER(CTRL_TRACE, "(ap=0x%p, event=%d, daddr=%08x, name=``%s'')\n", + ap, event, daddr, name); + + /* Protect this section via spinlock. + * Note : as we are the only event producer, we only need to exclude + * ourself when touching the log, which is nice and easy. 
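+   * The log itself is a small ring buffer of IRNET_MAX_EVENTS
+   * entries, overwritten in order ; each reader keeps its own
+   * ap->event_index and catches up at its own pace, so a reader
+   * falling more than IRNET_MAX_EVENTS behind just loses the oldest
+   * events.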
+ */ + spin_lock_bh(&irnet_events.spinlock); + + /* Copy the event in the log */ + index = irnet_events.index; + irnet_events.log[index].event = event; + irnet_events.log[index].daddr = daddr; + irnet_events.log[index].saddr = saddr; + /* Try to copy IrDA nickname */ + if(name) + strcpy(irnet_events.log[index].name, name); + else + irnet_events.log[index].name[0] = '\0'; + /* Copy hints */ + irnet_events.log[index].hints.word = hints; + /* Try to get ppp unit number */ + if((ap != (irnet_socket *) NULL) && (ap->ppp_open)) + irnet_events.log[index].unit = ppp_unit_number(&ap->chan); + else + irnet_events.log[index].unit = -1; + + /* Increment the index + * Note that we increment the index only after the event is written, + * to make sure that the readers don't get garbage... */ + irnet_events.index = (index + 1) % IRNET_MAX_EVENTS; + + DEBUG(CTRL_INFO, "New event index is %d\n", irnet_events.index); + + /* Spin lock end */ + spin_unlock_bh(&irnet_events.spinlock); + + /* Now : wake up everybody waiting for events... */ + wake_up_interruptible_all(&irnet_events.rwait); + + DEXIT(CTRL_TRACE, "\n"); +} + +/************************* IRDA SUBROUTINES *************************/ +/* + * These are a bunch of subroutines called from other functions + * down there, mostly common code or to improve readability... + * + * Note : we duplicate quite heavily some routines of af_irda.c, + * because our input structure (self) is quite different + * (struct irnet instead of struct irda_sock), which make sharing + * the same code impossible (at least, without templates). + */ + +/*------------------------------------------------------------------*/ +/* + * Function irda_open_tsap (self) + * + * Open local Transport Service Access Point (TSAP) + * + * Create a IrTTP instance for us and set all the IrTTP callbacks. + */ +static inline int +irnet_open_tsap(irnet_socket * self) +{ + notify_t notify; /* Callback structure */ + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + DABORT(self->tsap != NULL, -EBUSY, IRDA_SR_ERROR, "Already busy !\n"); + + /* Initialize IrTTP callbacks to be used by the IrDA stack */ + irda_notify_init(¬ify); + notify.connect_confirm = irnet_connect_confirm; + notify.connect_indication = irnet_connect_indication; + notify.disconnect_indication = irnet_disconnect_indication; + notify.data_indication = irnet_data_indication; + /*notify.udata_indication = NULL;*/ + notify.flow_indication = irnet_flow_indication; + notify.status_indication = irnet_status_indication; + notify.instance = self; + strlcpy(notify.name, IRNET_NOTIFY_NAME, sizeof(notify.name)); + + /* Open an IrTTP instance */ + self->tsap = irttp_open_tsap(LSAP_ANY, DEFAULT_INITIAL_CREDIT, + ¬ify); + DABORT(self->tsap == NULL, -ENOMEM, + IRDA_SR_ERROR, "Unable to allocate TSAP !\n"); + + /* Remember which TSAP selector we actually got */ + self->stsap_sel = self->tsap->stsap_sel; + + DEXIT(IRDA_SR_TRACE, " - tsap=0x%p, sel=0x%X\n", + self->tsap, self->stsap_sel); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_ias_to_tsap (self, result, value) + * + * Examine an IAS object and extract TSAP + * + * We do an IAP query to find the TSAP associated with the IrNET service. + * When IrIAP pass us the result of the query, this function look at + * the return values to check for failures and extract the TSAP if + * possible. 
+ * Also deallocate value + * The failure is in self->errno + * Return TSAP or -1 + */ +static inline __u8 +irnet_ias_to_tsap(irnet_socket * self, + int result, + struct ias_value * value) +{ + __u8 dtsap_sel = 0; /* TSAP we are looking for */ + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* By default, no error */ + self->errno = 0; + + /* Check if request succeeded */ + switch(result) + { + /* Standard errors : service not available */ + case IAS_CLASS_UNKNOWN: + case IAS_ATTRIB_UNKNOWN: + DEBUG(IRDA_SR_INFO, "IAS object doesn't exist ! (%d)\n", result); + self->errno = -EADDRNOTAVAIL; + break; + + /* Other errors, most likely IrDA stack failure */ + default : + DEBUG(IRDA_SR_INFO, "IAS query failed ! (%d)\n", result); + self->errno = -EHOSTUNREACH; + break; + + /* Success : we got what we wanted */ + case IAS_SUCCESS: + break; + } + + /* Check what was returned to us */ + if(value != NULL) + { + /* What type of argument have we got ? */ + switch(value->type) + { + case IAS_INTEGER: + DEBUG(IRDA_SR_INFO, "result=%d\n", value->t.integer); + if(value->t.integer != -1) + /* Get the remote TSAP selector */ + dtsap_sel = value->t.integer; + else + self->errno = -EADDRNOTAVAIL; + break; + default: + self->errno = -EADDRNOTAVAIL; + DERROR(IRDA_SR_ERROR, "bad type ! (0x%X)\n", value->type); + break; + } + + /* Cleanup */ + irias_delete_value(value); + } + else /* value == NULL */ + { + /* Nothing returned to us - usually result != SUCCESS */ + if(!(self->errno)) + { + DERROR(IRDA_SR_ERROR, + "IrDA bug : result == SUCCESS && value == NULL\n"); + self->errno = -EHOSTUNREACH; + } + } + DEXIT(IRDA_SR_TRACE, "\n"); + + /* Return the TSAP */ + return(dtsap_sel); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_find_lsap_sel (self) + * + * Try to lookup LSAP selector in remote LM-IAS + * + * Basically, we start a IAP query, and then go to sleep. When the query + * return, irnet_getvalue_confirm will wake us up, and we can examine the + * result of the query... + * Note that in some case, the query fail even before we go to sleep, + * creating some races... + */ +static inline int +irnet_find_lsap_sel(irnet_socket * self) +{ + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* This should not happen */ + DABORT(self->iriap, -EBUSY, IRDA_SR_ERROR, "busy with a previous query.\n"); + + /* Create an IAP instance, will be closed in irnet_getvalue_confirm() */ + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irnet_getvalue_confirm); + + /* Treat unexpected signals as disconnect */ + self->errno = -EHOSTUNREACH; + + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, self->rsaddr, self->daddr, + IRNET_SERVICE_NAME, IRNET_IAS_VALUE); + + /* The above request is non-blocking. 
+ * After a while, IrDA will call us back in irnet_getvalue_confirm() + * We will then call irnet_ias_to_tsap() and finish the + * connection procedure */ + + DEXIT(IRDA_SR_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_connect_tsap (self) + * + * Initialise the TTP socket and initiate TTP connection + * + */ +static inline int +irnet_connect_tsap(irnet_socket * self) +{ + int err; + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* Open a local TSAP (an IrTTP instance) */ + err = irnet_open_tsap(self); + if(err != 0) + { + clear_bit(0, &self->ttp_connect); + DERROR(IRDA_SR_ERROR, "connect aborted!\n"); + return(err); + } + + /* Connect to remote device */ + err = irttp_connect_request(self->tsap, self->dtsap_sel, + self->rsaddr, self->daddr, NULL, + self->max_sdu_size_rx, NULL); + if(err != 0) + { + clear_bit(0, &self->ttp_connect); + DERROR(IRDA_SR_ERROR, "connect aborted!\n"); + return(err); + } + + /* The above call is non-blocking. + * After a while, the IrDA stack will either call us back in + * irnet_connect_confirm() or irnet_disconnect_indication() + * See you there ;-) */ + + DEXIT(IRDA_SR_TRACE, "\n"); + return(err); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_discover_next_daddr (self) + * + * Query the IrNET TSAP of the next device in the log. + * + * Used in the TSAP discovery procedure. + */ +static inline int +irnet_discover_next_daddr(irnet_socket * self) +{ + /* Close the last instance of IrIAP, and open a new one. + * We can't reuse the IrIAP instance in the IrIAP callback */ + if(self->iriap) + { + iriap_close(self->iriap); + self->iriap = NULL; + } + /* Create a new IAP instance */ + self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, + irnet_discovervalue_confirm); + if(self->iriap == NULL) + return -ENOMEM; + + /* Next discovery - before the call to avoid races */ + self->disco_index++; + + /* Check if we have one more address to try */ + if(self->disco_index < self->disco_number) + { + /* Query remote LM-IAS */ + iriap_getvaluebyclass_request(self->iriap, + self->discoveries[self->disco_index].saddr, + self->discoveries[self->disco_index].daddr, + IRNET_SERVICE_NAME, IRNET_IAS_VALUE); + /* The above request is non-blocking. + * After a while, IrDA will call us back in irnet_discovervalue_confirm() + * We will then call irnet_ias_to_tsap() and come back here again... */ + return(0); + } + else + return(1); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_discover_daddr_and_lsap_sel (self) + * + * This try to find a device with the requested service. + * + * Initiate a TSAP discovery procedure. + * It basically look into the discovery log. For each address in the list, + * it queries the LM-IAS of the device to find if this device offer + * the requested service. + * If there is more than one node supporting the service, we complain + * to the user (it should move devices around). + * If we find one node which have the requested TSAP, we connect to it. + * + * This function just start the whole procedure. It request the discovery + * log and submit the first IAS query. + * The bulk of the job is handled in irnet_discovervalue_confirm() + * + * Note : this procedure fails if there is more than one device in range + * on the same dongle, because IrLMP doesn't disconnect the LAP when the + * last LSAP is closed. Moreover, we would need to wait the LAP + * disconnection... 
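+ *
+ * Roughly, the call chain for this case is :
+ *	irda_irnet_connect()
+ *	  -> irnet_discover_daddr_and_lsap_sel()
+ *	    -> irnet_discover_next_daddr()	(one IAS query per node)
+ *	      -> irnet_discovervalue_confirm()	(IrDA calls us back)
+ *	        -> irnet_connect_tsap()		(when a TSAP is found)
+ * with everything past the first call happening asynchronously.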
+ */ +static inline int +irnet_discover_daddr_and_lsap_sel(irnet_socket * self) +{ + int ret; + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* Ask lmp for the current discovery log */ + self->discoveries = irlmp_get_discoveries(&self->disco_number, self->mask, + DISCOVERY_DEFAULT_SLOTS); + + /* Check if the we got some results */ + if(self->discoveries == NULL) + { + self->disco_number = -1; + clear_bit(0, &self->ttp_connect); + DRETURN(-ENETUNREACH, IRDA_SR_INFO, "No Cachelog...\n"); + } + DEBUG(IRDA_SR_INFO, "Got the log (0x%p), size is %d\n", + self->discoveries, self->disco_number); + + /* Start with the first discovery */ + self->disco_index = -1; + self->daddr = DEV_ADDR_ANY; + + /* This will fail if the log is empty - this is non-blocking */ + ret = irnet_discover_next_daddr(self); + if(ret) + { + /* Close IAP */ + if(self->iriap) + iriap_close(self->iriap); + self->iriap = NULL; + + /* Cleanup our copy of the discovery log */ + kfree(self->discoveries); + self->discoveries = NULL; + + clear_bit(0, &self->ttp_connect); + DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n"); + } + + /* Follow me in irnet_discovervalue_confirm() */ + + DEXIT(IRDA_SR_TRACE, "\n"); + return(0); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_dname_to_daddr (self) + * + * Convert an IrDA nickname to a valid IrDA address + * + * It basically look into the discovery log until there is a match. + */ +static inline int +irnet_dname_to_daddr(irnet_socket * self) +{ + struct irda_device_info *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + + DENTER(IRDA_SR_TRACE, "(self=0x%p)\n", self); + + /* Ask lmp for the current discovery log */ + discoveries = irlmp_get_discoveries(&number, 0xffff, + DISCOVERY_DEFAULT_SLOTS); + /* Check if the we got some results */ + if(discoveries == NULL) + DRETURN(-ENETUNREACH, IRDA_SR_INFO, "Cachelog empty...\n"); + + /* + * Now, check all discovered devices (if any), and connect + * client only about the services that the client is + * interested in... + */ + for(i = 0; i < number; i++) + { + /* Does the name match ? */ + if(!strncmp(discoveries[i].info, self->rname, NICKNAME_MAX_LEN)) + { + /* Yes !!! Get it.. */ + self->daddr = discoveries[i].daddr; + DEBUG(IRDA_SR_INFO, "discovered device ``%s'' at address 0x%08x.\n", + self->rname, self->daddr); + kfree(discoveries); + DEXIT(IRDA_SR_TRACE, "\n"); + return 0; + } + } + /* No luck ! */ + DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname); + kfree(discoveries); + return(-EADDRNOTAVAIL); +} + + +/************************* SOCKET ROUTINES *************************/ +/* + * This are the main operations on IrNET sockets, basically to create + * and destroy IrNET sockets. These are called from the PPP part... + */ + +/*------------------------------------------------------------------*/ +/* + * Create a IrNET instance : just initialise some parameters... 
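+ *
+ * Seen from the PPP side, the life cycle is roughly :
+ *	irda_irnet_create(ap);		typically when /dev/irnet is opened
+ *	irda_irnet_connect(ap);		when the link is actually needed
+ *	irda_irnet_destroy(ap);		when the instance goes away
+ * with connect being fully asynchronous (see the note below).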
+ */ +int +irda_irnet_create(irnet_socket * self) +{ + DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self); + + self->magic = IRNET_MAGIC; /* Paranoia */ + + self->ttp_open = 0; /* Prevent higher layer from accessing IrTTP */ + self->ttp_connect = 0; /* Not connecting yet */ + self->rname[0] = '\0'; /* May be set via control channel */ + self->rdaddr = DEV_ADDR_ANY; /* May be set via control channel */ + self->rsaddr = DEV_ADDR_ANY; /* May be set via control channel */ + self->daddr = DEV_ADDR_ANY; /* Until we get connected */ + self->saddr = DEV_ADDR_ANY; /* Until we get connected */ + self->max_sdu_size_rx = TTP_SAR_UNBOUND; + + /* Register as a client with IrLMP */ + self->ckey = irlmp_register_client(0, NULL, NULL, NULL); +#ifdef DISCOVERY_NOMASK + self->mask = 0xffff; /* For W2k compatibility */ +#else /* DISCOVERY_NOMASK */ + self->mask = irlmp_service_to_hint(S_LAN); +#endif /* DISCOVERY_NOMASK */ + self->tx_flow = FLOW_START; /* Flow control from IrTTP */ + + DEXIT(IRDA_SOCK_TRACE, "\n"); + return(0); +} + +/*------------------------------------------------------------------*/ +/* + * Connect to the other side : + * o convert device name to an address + * o find the socket number (dlsap) + * o Establish the connection + * + * Note : We no longer mimic af_irda. The IAS query for finding the TSAP + * is done asynchronously, like the TTP connection. This allow us to + * call this function from any context (not only process). + * The downside is that following what's happening in there is tricky + * because it involve various functions all over the place... + */ +int +irda_irnet_connect(irnet_socket * self) +{ + int err; + + DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self); + + /* Check if we are already trying to connect. + * Because irda_irnet_connect() can be called directly by pppd plus + * packet retries in ppp_generic and connect may take time, plus we may + * race with irnet_connect_indication(), we need to be careful there... */ + if(test_and_set_bit(0, &self->ttp_connect)) + DRETURN(-EBUSY, IRDA_SOCK_INFO, "Already connecting...\n"); + if((self->iriap != NULL) || (self->tsap != NULL)) + DERROR(IRDA_SOCK_ERROR, "Socket not cleaned up...\n"); + + /* Insert ourselves in the hashbin so that the IrNET server can find us. + * Notes : 4th arg is string of 32 char max and must be null terminated + * When 4th arg is used (string), 3rd arg isn't (int) + * Can't re-insert (MUST remove first) so check for that... 
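+ * Note also that self->q.q_next == NULL is what we use as the
+ * "not yet in the hashbin" test, which is why irda_irnet_destroy()
+ * resets it to NULL right after hashbin_remove_this().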
*/ + if((irnet_server.running) && (self->q.q_next == NULL)) + { + spin_lock_bh(&irnet_server.spinlock); + hashbin_insert(irnet_server.list, (irda_queue_t *) self, 0, self->rname); + spin_unlock_bh(&irnet_server.spinlock); + DEBUG(IRDA_SOCK_INFO, "Inserted ``%s'' in hashbin...\n", self->rname); + } + + /* If we don't have anything (no address, no name) */ + if((self->rdaddr == DEV_ADDR_ANY) && (self->rname[0] == '\0')) + { + /* Try to find a suitable address */ + if((err = irnet_discover_daddr_and_lsap_sel(self)) != 0) + DRETURN(err, IRDA_SOCK_INFO, "auto-connect failed!\n"); + /* In most cases, the call above is non-blocking */ + } + else + { + /* If we have only the name (no address), try to get an address */ + if(self->rdaddr == DEV_ADDR_ANY) + { + if((err = irnet_dname_to_daddr(self)) != 0) + DRETURN(err, IRDA_SOCK_INFO, "name connect failed!\n"); + } + else + /* Use the requested destination address */ + self->daddr = self->rdaddr; + + /* Query remote LM-IAS to find LSAP selector */ + irnet_find_lsap_sel(self); + /* The above call is non blocking */ + } + + /* At this point, we are waiting for the IrDA stack to call us back, + * or we have already failed. + * We will finish the connection procedure in irnet_connect_tsap(). + */ + DEXIT(IRDA_SOCK_TRACE, "\n"); + return(0); +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_irnet_destroy(self) + * + * Destroy irnet instance + * + * Note : this need to be called from a process context. + */ +void +irda_irnet_destroy(irnet_socket * self) +{ + DENTER(IRDA_SOCK_TRACE, "(self=0x%p)\n", self); + if(self == NULL) + return; + + /* Remove ourselves from hashbin (if we are queued in hashbin) + * Note : `irnet_server.running' protect us from calls in hashbin_delete() */ + if((irnet_server.running) && (self->q.q_next != NULL)) + { + struct irnet_socket * entry; + DEBUG(IRDA_SOCK_INFO, "Removing from hash..\n"); + spin_lock_bh(&irnet_server.spinlock); + entry = hashbin_remove_this(irnet_server.list, (irda_queue_t *) self); + self->q.q_next = NULL; + spin_unlock_bh(&irnet_server.spinlock); + DASSERT(entry == self, , IRDA_SOCK_ERROR, "Can't remove from hash.\n"); + } + + /* If we were connected, post a message */ + if(test_bit(0, &self->ttp_open)) + { + /* Note : as the disconnect comes from ppp_generic, the unit number + * doesn't exist anymore when we post the event, so we need to pass + * NULL as the first arg... 
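+ * On the control channel this shows up as a "Disconnected to ..."
+ * event which, unlike "Disconnection from ...", carries no ppp unit
+ * number (see irnet_ctrl_read() in irnet_ppp.c).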
*/ + irnet_post_event(NULL, IRNET_DISCONNECT_TO, + self->saddr, self->daddr, self->rname, 0); + } + + /* Prevent various IrDA callbacks from messing up things + * Need to be first */ + clear_bit(0, &self->ttp_connect); + + /* Prevent higher layer from accessing IrTTP */ + clear_bit(0, &self->ttp_open); + + /* Unregister with IrLMP */ + irlmp_unregister_client(self->ckey); + + /* Unregister with LM-IAS */ + if(self->iriap) + { + iriap_close(self->iriap); + self->iriap = NULL; + } + + /* Cleanup eventual discoveries from connection attempt or control channel */ + if(self->discoveries != NULL) + { + /* Cleanup our copy of the discovery log */ + kfree(self->discoveries); + self->discoveries = NULL; + } + + /* Close our IrTTP connection */ + if(self->tsap) + { + DEBUG(IRDA_SOCK_INFO, "Closing our TTP connection.\n"); + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + self->stsap_sel = 0; + + DEXIT(IRDA_SOCK_TRACE, "\n"); + return; +} + + +/************************** SERVER SOCKET **************************/ +/* + * The IrNET service is composed of one server socket and a variable + * number of regular IrNET sockets. The server socket is supposed to + * handle incoming connections and redirect them to one IrNET sockets. + * It's a superset of the regular IrNET socket, but has a very distinct + * behaviour... + */ + +/*------------------------------------------------------------------*/ +/* + * Function irnet_daddr_to_dname (self) + * + * Convert an IrDA address to a IrDA nickname + * + * It basically look into the discovery log until there is a match. + */ +static inline int +irnet_daddr_to_dname(irnet_socket * self) +{ + struct irda_device_info *discoveries; /* Copy of the discovery log */ + int number; /* Number of nodes in the log */ + int i; + + DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self); + + /* Ask lmp for the current discovery log */ + discoveries = irlmp_get_discoveries(&number, 0xffff, + DISCOVERY_DEFAULT_SLOTS); + /* Check if the we got some results */ + if (discoveries == NULL) + DRETURN(-ENETUNREACH, IRDA_SERV_INFO, "Cachelog empty...\n"); + + /* Now, check all discovered devices (if any) */ + for(i = 0; i < number; i++) + { + /* Does the name match ? */ + if(discoveries[i].daddr == self->daddr) + { + /* Yes !!! Get it.. */ + strlcpy(self->rname, discoveries[i].info, sizeof(self->rname)); + self->rname[NICKNAME_MAX_LEN + 1] = '\0'; + DEBUG(IRDA_SERV_INFO, "Device 0x%08x is in fact ``%s''.\n", + self->daddr, self->rname); + kfree(discoveries); + DEXIT(IRDA_SERV_TRACE, "\n"); + return 0; + } + } + /* No luck ! */ + DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr); + kfree(discoveries); + return(-EADDRNOTAVAIL); +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_find_socket (self) + * + * Find the correct IrNET socket + * + * Look into the list of IrNET sockets and finds one with the right + * properties... 
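+ *
+ * The search is done in three passes, in decreasing order of priority :
+ *	o a socket that requested this peer's IrDA nickname,
+ *	o a socket bound to this peer's IrDA address (requested or current),
+ *	o any free socket : not connected, no name/address requested, and
+ *	  with an active pppd attached (ppp_open set).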
+ */ +static inline irnet_socket * +irnet_find_socket(irnet_socket * self) +{ + irnet_socket * new = (irnet_socket *) NULL; + int err; + + DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self); + + /* Get the addresses of the requester */ + self->daddr = irttp_get_daddr(self->tsap); + self->saddr = irttp_get_saddr(self->tsap); + + /* Try to get the IrDA nickname of the requester */ + err = irnet_daddr_to_dname(self); + + /* Protect access to the instance list */ + spin_lock_bh(&irnet_server.spinlock); + + /* So now, try to get an socket having specifically + * requested that nickname */ + if(err == 0) + { + new = (irnet_socket *) hashbin_find(irnet_server.list, + 0, self->rname); + if(new) + DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches rname ``%s''.\n", + new, new->rname); + } + + /* If no name matches, try to find an socket by the destination address */ + /* It can be either the requested destination address (set via the + * control channel), or the current destination address if the + * socket is in the middle of a connection request */ + if(new == (irnet_socket *) NULL) + { + new = (irnet_socket *) hashbin_get_first(irnet_server.list); + while(new !=(irnet_socket *) NULL) + { + /* Does it have the same address ? */ + if((new->rdaddr == self->daddr) || (new->daddr == self->daddr)) + { + /* Yes !!! Get it.. */ + DEBUG(IRDA_SERV_INFO, "Socket 0x%p matches daddr %#08x.\n", + new, self->daddr); + break; + } + new = (irnet_socket *) hashbin_get_next(irnet_server.list); + } + } + + /* If we don't have any socket, get the first unconnected socket */ + if(new == (irnet_socket *) NULL) + { + new = (irnet_socket *) hashbin_get_first(irnet_server.list); + while(new !=(irnet_socket *) NULL) + { + /* Is it available ? */ + if(!(test_bit(0, &new->ttp_open)) && (new->rdaddr == DEV_ADDR_ANY) && + (new->rname[0] == '\0') && (new->ppp_open)) + { + /* Yes !!! Get it.. */ + DEBUG(IRDA_SERV_INFO, "Socket 0x%p is free.\n", + new); + break; + } + new = (irnet_socket *) hashbin_get_next(irnet_server.list); + } + } + + /* Spin lock end */ + spin_unlock_bh(&irnet_server.spinlock); + + DEXIT(IRDA_SERV_TRACE, " - new = 0x%p\n", new); + return new; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_connect_socket (self) + * + * Connect an incoming connection to the socket + * + */ +static inline int +irnet_connect_socket(irnet_socket * server, + irnet_socket * new, + struct qos_info * qos, + __u32 max_sdu_size, + __u8 max_header_size) +{ + DENTER(IRDA_SERV_TRACE, "(server=0x%p, new=0x%p)\n", + server, new); + + /* Now attach up the new socket */ + new->tsap = irttp_dup(server->tsap, new); + DABORT(new->tsap == NULL, -1, IRDA_SERV_ERROR, "dup failed!\n"); + + /* Set up all the relevant parameters on the new socket */ + new->stsap_sel = new->tsap->stsap_sel; + new->dtsap_sel = new->tsap->dtsap_sel; + new->saddr = irttp_get_saddr(new->tsap); + new->daddr = irttp_get_daddr(new->tsap); + + new->max_header_size = max_header_size; + new->max_sdu_size_tx = max_sdu_size; + new->max_data_size = max_sdu_size; +#ifdef STREAM_COMPAT + /* If we want to receive "stream sockets" */ + if(max_sdu_size == 0) + new->max_data_size = irttp_get_max_seg_size(new->tsap); +#endif /* STREAM_COMPAT */ + + /* Clean up the original one to keep it in listen state */ + irttp_listen(server->tsap); + + /* Send a connection response on the new socket */ + irttp_connect_response(new->tsap, new->max_sdu_size_rx, NULL); + + /* Allow PPP to send its junk over the new socket... 
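+ * Setting ttp_open is what marks this socket as connected for the
+ * rest of the module (irnet_find_socket(), the disconnect handling,
+ * the /proc output).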
*/ + set_bit(0, &new->ttp_open); + + /* Not connecting anymore, and clean up last possible remains + * of connection attempts on the socket */ + clear_bit(0, &new->ttp_connect); + if(new->iriap) + { + iriap_close(new->iriap); + new->iriap = NULL; + } + if(new->discoveries != NULL) + { + kfree(new->discoveries); + new->discoveries = NULL; + } + +#ifdef CONNECT_INDIC_KICK + /* As currently we don't block packets in ppp_irnet_send() while passive, + * this is not really needed... + * Also, not doing it give IrDA a chance to finish the setup properly + * before being swamped with packets... */ + ppp_output_wakeup(&new->chan); +#endif /* CONNECT_INDIC_KICK */ + + /* Notify the control channel */ + irnet_post_event(new, IRNET_CONNECT_FROM, + new->saddr, new->daddr, server->rname, 0); + + DEXIT(IRDA_SERV_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_disconnect_server (self) + * + * Cleanup the server socket when the incoming connection abort + * + */ +static inline void +irnet_disconnect_server(irnet_socket * self, + struct sk_buff *skb) +{ + DENTER(IRDA_SERV_TRACE, "(self=0x%p)\n", self); + + /* Put the received packet in the black hole */ + kfree_skb(skb); + +#ifdef FAIL_SEND_DISCONNECT + /* Tell the other party we don't want to be connected */ + /* Hum... Is it the right thing to do ? And do we need to send + * a connect response before ? It looks ok without this... */ + irttp_disconnect_request(self->tsap, NULL, P_NORMAL); +#endif /* FAIL_SEND_DISCONNECT */ + + /* Notify the control channel (see irnet_find_socket()) */ + irnet_post_event(NULL, IRNET_REQUEST_FROM, + self->saddr, self->daddr, self->rname, 0); + + /* Clean up the server to keep it in listen state */ + irttp_listen(self->tsap); + + DEXIT(IRDA_SERV_TRACE, "\n"); + return; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_setup_server (self) + * + * Create a IrTTP server and set it up... + * + * Register the IrLAN hint bit, create a IrTTP instance for us, + * set all the IrTTP callbacks and create an IrIAS entry... + */ +static inline int +irnet_setup_server(void) +{ + __u16 hints; + + DENTER(IRDA_SERV_TRACE, "()\n"); + + /* Initialise the regular socket part of the server */ + irda_irnet_create(&irnet_server.s); + + /* Open a local TSAP (an IrTTP instance) for the server */ + irnet_open_tsap(&irnet_server.s); + + /* PPP part setup */ + irnet_server.s.ppp_open = 0; + irnet_server.s.chan.private = NULL; + irnet_server.s.file = NULL; + + /* Get the hint bit corresponding to IrLAN */ + /* Note : we overload the IrLAN hint bit. As it is only a "hint", and as + * we provide roughly the same functionality as IrLAN, this is ok. 
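+ * (This is also why, at discovery time, we cannot tell IrLAN nodes
+ * from IrNET nodes - see irnet_discovery_indication() below.)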
+ * In fact, the situation is similar as JetSend overloading the Obex hint + */ + hints = irlmp_service_to_hint(S_LAN); + +#ifdef ADVERTISE_HINT + /* Register with IrLMP as a service (advertise our hint bit) */ + irnet_server.skey = irlmp_register_service(hints); +#endif /* ADVERTISE_HINT */ + + /* Register with LM-IAS (so that people can connect to us) */ + irnet_server.ias_obj = irias_new_object(IRNET_SERVICE_NAME, jiffies); + irias_add_integer_attrib(irnet_server.ias_obj, IRNET_IAS_VALUE, + irnet_server.s.stsap_sel, IAS_KERNEL_ATTR); + irias_insert_object(irnet_server.ias_obj); + +#ifdef DISCOVERY_EVENTS + /* Tell IrLMP we want to be notified of newly discovered nodes */ + irlmp_update_client(irnet_server.s.ckey, hints, + irnet_discovery_indication, irnet_expiry_indication, + (void *) &irnet_server.s); +#endif + + DEXIT(IRDA_SERV_TRACE, " - self=0x%p\n", &irnet_server.s); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Function irda_destroy_server (self) + * + * Destroy the IrTTP server... + * + * Reverse of the previous function... + */ +static inline void +irnet_destroy_server(void) +{ + DENTER(IRDA_SERV_TRACE, "()\n"); + +#ifdef ADVERTISE_HINT + /* Unregister with IrLMP */ + irlmp_unregister_service(irnet_server.skey); +#endif /* ADVERTISE_HINT */ + + /* Unregister with LM-IAS */ + if(irnet_server.ias_obj) + irias_delete_object(irnet_server.ias_obj); + + /* Cleanup the socket part */ + irda_irnet_destroy(&irnet_server.s); + + DEXIT(IRDA_SERV_TRACE, "\n"); + return; +} + + +/************************ IRDA-TTP CALLBACKS ************************/ +/* + * When we create a IrTTP instance, we pass to it a set of callbacks + * that IrTTP will call in case of various events. + * We take care of those events here. + */ + +/*------------------------------------------------------------------*/ +/* + * Function irnet_data_indication (instance, sap, skb) + * + * Received some data from TinyTP. Just queue it on the receive queue + * + */ +static int +irnet_data_indication(void * instance, + void * sap, + struct sk_buff *skb) +{ + irnet_socket * ap = (irnet_socket *) instance; + unsigned char * p; + int code = 0; + + DENTER(IRDA_TCB_TRACE, "(self/ap=0x%p, skb=0x%p)\n", + ap, skb); + DASSERT(skb != NULL, 0, IRDA_CB_ERROR, "skb is NULL !!!\n"); + + /* Check is ppp is ready to receive our packet */ + if(!ap->ppp_open) + { + DERROR(IRDA_CB_ERROR, "PPP not ready, dropping packet...\n"); + /* When we return error, TTP will need to requeue the skb and + * will stop the sender. IrTTP will stall until we send it a + * flow control request... */ + return -ENOMEM; + } + + /* strip address/control field if present */ + p = skb->data; + if((p[0] == PPP_ALLSTATIONS) && (p[1] == PPP_UI)) + { + /* chop off address/control */ + if(skb->len < 3) + goto err_exit; + p = skb_pull(skb, 2); + } + + /* decompress protocol field if compressed */ + if(p[0] & 1) + { + /* protocol is compressed */ + skb_push(skb, 1)[0] = 0; + } + else + if(skb->len < 2) + goto err_exit; + + /* pass to generic ppp layer */ + /* Note : how do I know if ppp can accept or not the packet ? This is + * essential if I want to manage flow control smoothly... */ + ppp_input(&ap->chan, skb); + + DEXIT(IRDA_TCB_TRACE, "\n"); + return 0; + + err_exit: + DERROR(IRDA_CB_ERROR, "Packet too small, dropping...\n"); + kfree_skb(skb); + ppp_input_error(&ap->chan, code); + return 0; /* Don't return an error code, only for flow control... 
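+ * (returning non-zero would make IrTTP requeue the skb and stall the
+ * sender, which is what we want only in the "PPP not ready" case
+ * above)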
*/ +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_disconnect_indication (instance, sap, reason, skb) + * + * Connection has been closed. Chech reason to find out why + * + * Note : there are many cases where we come here : + * o attempted to connect, timeout + * o connected, link is broken, LAP has timeout + * o connected, other side close the link + * o connection request on the server not handled + */ +static void +irnet_disconnect_indication(void * instance, + void * sap, + LM_REASON reason, + struct sk_buff *skb) +{ + irnet_socket * self = (irnet_socket *) instance; + int test_open; + int test_connect; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n"); + + /* Don't care about it, but let's not leak it */ + if(skb) + dev_kfree_skb(skb); + + /* Prevent higher layer from accessing IrTTP */ + test_open = test_and_clear_bit(0, &self->ttp_open); + /* Not connecting anymore... + * (note : TSAP is open, so IAP callbacks are no longer pending...) */ + test_connect = test_and_clear_bit(0, &self->ttp_connect); + + /* If both self->ttp_open and self->ttp_connect are NULL, it mean that we + * have a race condition with irda_irnet_destroy() or + * irnet_connect_indication(), so don't mess up tsap... + */ + if(!(test_open || test_connect)) + { + DERROR(IRDA_CB_ERROR, "Race condition detected...\n"); + return; + } + + /* If we were active, notify the control channel */ + if(test_open) + irnet_post_event(self, IRNET_DISCONNECT_FROM, + self->saddr, self->daddr, self->rname, 0); + else + /* If we were trying to connect, notify the control channel */ + if((self->tsap) && (self != &irnet_server.s)) + irnet_post_event(self, IRNET_NOANSWER_FROM, + self->saddr, self->daddr, self->rname, 0); + + /* Close our IrTTP connection, cleanup tsap */ + if((self->tsap) && (self != &irnet_server.s)) + { + DEBUG(IRDA_CB_INFO, "Closing our TTP connection.\n"); + irttp_close_tsap(self->tsap); + self->tsap = NULL; + } + /* Cleanup the socket in case we want to reconnect in ppp_output_wakeup() */ + self->stsap_sel = 0; + self->daddr = DEV_ADDR_ANY; + self->tx_flow = FLOW_START; + + /* Deal with the ppp instance if it's still alive */ + if(self->ppp_open) + { + if(test_open) + { +#ifdef MISSING_PPP_API + /* ppp_unregister_channel() wants a user context, which we + * are guaranteed to NOT have here. What are we supposed + * to do here ? Jean II */ + /* If we were connected, cleanup & close the PPP channel, + * which will kill pppd (hangup) and the rest */ + ppp_unregister_channel(&self->chan); + self->ppp_open = 0; +#endif + } + else + { + /* If we were trying to connect, flush (drain) ppp_generic + * Tx queue (most often we have blocked it), which will + * trigger an other attempt to connect. If we are passive, + * this will empty the Tx queue after last try. */ + ppp_output_wakeup(&self->chan); + } + } + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_connect_confirm (instance, sap, qos, max_sdu_size, skb) + * + * Connections has been confirmed by the remote device + * + */ +static void +irnet_connect_confirm(void * instance, + void * sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + irnet_socket * self = (irnet_socket *) instance; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self); + + /* Check if socket is closing down (via irda_irnet_destroy()) */ + if(! 
test_bit(0, &self->ttp_connect)) + { + DERROR(IRDA_CB_ERROR, "Socket no longer connecting. Ouch !\n"); + return; + } + + /* How much header space do we need to reserve */ + self->max_header_size = max_header_size; + + /* IrTTP max SDU size in transmit direction */ + self->max_sdu_size_tx = max_sdu_size; + self->max_data_size = max_sdu_size; +#ifdef STREAM_COMPAT + if(max_sdu_size == 0) + self->max_data_size = irttp_get_max_seg_size(self->tsap); +#endif /* STREAM_COMPAT */ + + /* At this point, IrLMP has assigned our source address */ + self->saddr = irttp_get_saddr(self->tsap); + + /* Allow higher layer to access IrTTP */ + set_bit(0, &self->ttp_open); + clear_bit(0, &self->ttp_connect); /* Not racy, IrDA traffic is serial */ + /* Give a kick in the ass of ppp_generic so that he sends us some data */ + ppp_output_wakeup(&self->chan); + + /* Check size of received packet */ + if(skb->len > 0) + { +#ifdef PASS_CONNECT_PACKETS + DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n"); + /* Try to pass it to PPP */ + irnet_data_indication(instance, sap, skb); +#else /* PASS_CONNECT_PACKETS */ + DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n"); + kfree_skb(skb); /* Note : will be optimised with other kfree... */ +#endif /* PASS_CONNECT_PACKETS */ + } + else + kfree_skb(skb); + + /* Notify the control channel */ + irnet_post_event(self, IRNET_CONNECT_TO, + self->saddr, self->daddr, self->rname, 0); + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_flow_indication (instance, sap, flow) + * + * Used by TinyTP to tell us if it can accept more data or not + * + */ +static void +irnet_flow_indication(void * instance, + void * sap, + LOCAL_FLOW flow) +{ + irnet_socket * self = (irnet_socket *) instance; + LOCAL_FLOW oldflow = self->tx_flow; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p, flow=%d)\n", self, flow); + + /* Update our state */ + self->tx_flow = flow; + + /* Check what IrTTP want us to do... */ + switch(flow) + { + case FLOW_START: + DEBUG(IRDA_CB_INFO, "IrTTP wants us to start again\n"); + /* Check if we really need to wake up PPP */ + if(oldflow == FLOW_STOP) + ppp_output_wakeup(&self->chan); + else + DEBUG(IRDA_CB_INFO, "But we were already transmitting !!!\n"); + break; + case FLOW_STOP: + DEBUG(IRDA_CB_INFO, "IrTTP wants us to slow down\n"); + break; + default: + DEBUG(IRDA_CB_INFO, "Unknown flow command!\n"); + break; + } + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_status_indication (instance, sap, reason, skb) + * + * Link (IrLAP) status report. + * + */ +static void +irnet_status_indication(void * instance, + LINK_STATUS link, + LOCK_STATUS lock) +{ + irnet_socket * self = (irnet_socket *) instance; + + DENTER(IRDA_TCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_CB_ERROR, "Self is NULL !!!\n"); + + /* We can only get this event if we are connected */ + switch(link) + { + case STATUS_NO_ACTIVITY: + irnet_post_event(self, IRNET_BLOCKED_LINK, + self->saddr, self->daddr, self->rname, 0); + break; + default: + DEBUG(IRDA_CB_INFO, "Unknown status...\n"); + } + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_connect_indication(instance, sap, qos, max_sdu_size, userdata) + * + * Incoming connection + * + * In theory, this function is called only on the server socket. 
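+ * (the DASSERT at the top of the function double checks that
+ * `instance' really is the server)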
+ * Some other node is attempting to connect to the IrNET service, and has + * sent a connection request on our server socket. + * We just redirect the connection to the relevant IrNET socket. + * + * Note : we also make sure that between 2 irnet nodes, there can + * exist only one irnet connection. + */ +static void +irnet_connect_indication(void * instance, + void * sap, + struct qos_info *qos, + __u32 max_sdu_size, + __u8 max_header_size, + struct sk_buff *skb) +{ + irnet_socket * server = &irnet_server.s; + irnet_socket * new = (irnet_socket *) NULL; + + DENTER(IRDA_TCB_TRACE, "(server=0x%p)\n", server); + DASSERT(instance == &irnet_server, , IRDA_CB_ERROR, + "Invalid instance (0x%p) !!!\n", instance); + DASSERT(sap == irnet_server.s.tsap, , IRDA_CB_ERROR, "Invalid sap !!!\n"); + + /* Try to find the most appropriate IrNET socket */ + new = irnet_find_socket(server); + + /* After all this hard work, do we have an socket ? */ + if(new == (irnet_socket *) NULL) + { + DEXIT(IRDA_CB_INFO, ": No socket waiting for this connection.\n"); + irnet_disconnect_server(server, skb); + return; + } + + /* Is the socket already busy ? */ + if(test_bit(0, &new->ttp_open)) + { + DEXIT(IRDA_CB_INFO, ": Socket already connected.\n"); + irnet_disconnect_server(server, skb); + return; + } + + /* The following code is a bit tricky, so need comments ;-) + */ + /* If ttp_connect is set, the socket is trying to connect to the other + * end and may have sent a IrTTP connection request and is waiting for + * a connection response (that may never come). + * Now, the pain is that the socket may have opened a tsap and is + * waiting on it, while the other end is trying to connect to it on + * another tsap. + * Because IrNET can be peer to peer, we need to workaround this. + * Furthermore, the way the irnetd script is implemented, the + * target will create a second IrNET connection back to the + * originator and expect the originator to bind this new connection + * to the original PPPD instance. + * And of course, if we don't use irnetd, we can have a race when + * both side try to connect simultaneously, which could leave both + * connections half closed (yuck). + * Conclusions : + * 1) The "originator" must accept the new connection and get rid + * of the old one so that irnetd works + * 2) One side must deny the new connection to avoid races, + * but both side must agree on which side it is... + * Most often, the originator is primary at the LAP layer. + * Jean II + */ + /* Now, let's look at the way I wrote the test... + * We need to clear up the ttp_connect flag atomically to prevent + * irnet_disconnect_indication() to mess up the tsap we are going to close. + * We want to clear the ttp_connect flag only if we close the tsap, + * otherwise we will never close it, so we need to check for primary + * *before* doing the test on the flag. + * And of course, ALLOW_SIMULT_CONNECT can disable this entirely... + * Jean II + */ + + /* Socket already connecting ? On primary ? */ + if(0 +#ifdef ALLOW_SIMULT_CONNECT + || ((irttp_is_primary(server->tsap) == 1) /* primary */ + && (test_and_clear_bit(0, &new->ttp_connect))) +#endif /* ALLOW_SIMULT_CONNECT */ + ) + { + DERROR(IRDA_CB_ERROR, "Socket already connecting, but going to reuse it !\n"); + + /* Cleanup the old TSAP if necessary - IrIAP will be cleaned up later */ + if(new->tsap != NULL) + { + /* Close the old connection the new socket was attempting, + * so that we can hook it up to the new connection. + * It's now safe to do it... 
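+ * (safe because we have just cleared ttp_connect atomically, so
+ * irnet_disconnect_indication() will no longer touch this tsap)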
*/ + irttp_close_tsap(new->tsap); + new->tsap = NULL; + } + } + else + { + /* Three options : + * 1) socket was not connecting or connected : ttp_connect should be 0. + * 2) we don't want to connect the socket because we are secondary or + * ALLOW_SIMULT_CONNECT is undefined. ttp_connect should be 1. + * 3) we are half way in irnet_disconnect_indication(), and it's a + * nice race condition... Fortunately, we can detect that by checking + * if tsap is still alive. On the other hand, we can't be in + * irda_irnet_destroy() otherwise we would not have found this + * socket in the hashbin. + * Jean II */ + if((test_bit(0, &new->ttp_connect)) || (new->tsap != NULL)) + { + /* Don't mess this socket, somebody else in in charge... */ + DERROR(IRDA_CB_ERROR, "Race condition detected, socket in use, abort connect...\n"); + irnet_disconnect_server(server, skb); + return; + } + } + + /* So : at this point, we have a socket, and it is idle. Good ! */ + irnet_connect_socket(server, new, qos, max_sdu_size, max_header_size); + + /* Check size of received packet */ + if(skb->len > 0) + { +#ifdef PASS_CONNECT_PACKETS + DEBUG(IRDA_CB_INFO, "Passing connect packet to PPP.\n"); + /* Try to pass it to PPP */ + irnet_data_indication(new, new->tsap, skb); +#else /* PASS_CONNECT_PACKETS */ + DERROR(IRDA_CB_ERROR, "Dropping non empty packet.\n"); + kfree_skb(skb); /* Note : will be optimised with other kfree... */ +#endif /* PASS_CONNECT_PACKETS */ + } + else + kfree_skb(skb); + + DEXIT(IRDA_TCB_TRACE, "\n"); +} + + +/********************** IRDA-IAS/LMP CALLBACKS **********************/ +/* + * These are the callbacks called by other layers of the IrDA stack, + * mainly LMP for discovery and IAS for name queries. + */ + +/*------------------------------------------------------------------*/ +/* + * Function irnet_getvalue_confirm (result, obj_id, value, priv) + * + * Got answer from remote LM-IAS, just connect + * + * This is the reply to a IAS query we were doing to find the TSAP of + * the device we want to connect to. + * If we have found a valid TSAP, just initiate the TTP connection + * on this TSAP. + */ +static void +irnet_getvalue_confirm(int result, + __u16 obj_id, + struct ias_value *value, + void * priv) +{ + irnet_socket * self = (irnet_socket *) priv; + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n"); + + /* Check if already connected (via irnet_connect_socket()) + * or socket is closing down (via irda_irnet_destroy()) */ + if(! test_bit(0, &self->ttp_connect)) + { + DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n"); + return; + } + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* Post process the IAS reply */ + self->dtsap_sel = irnet_ias_to_tsap(self, result, value); + + /* If error, just go out */ + if(self->errno) + { + clear_bit(0, &self->ttp_connect); + DERROR(IRDA_OCB_ERROR, "IAS connect failed ! (0x%X)\n", self->errno); + return; + } + + DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n", + self->daddr, self->dtsap_sel); + + /* Start up TTP - non blocking */ + irnet_connect_tsap(self); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_discovervalue_confirm (result, obj_id, value, priv) + * + * Handle the TSAP discovery procedure state machine. 
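+ * (the walk through the discovery log is tracked by self->disco_index
+ * and self->disco_number, set up in irnet_discover_daddr_and_lsap_sel())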
+ * Got answer from remote LM-IAS, try next device + * + * We are doing a TSAP discovery procedure, and we got an answer to + * a IAS query we were doing to find the TSAP on one of the address + * in the discovery log. + * + * If we have found a valid TSAP for the first time, save it. If it's + * not the first time we found one, complain. + * + * If we have more addresses in the log, just initiate a new query. + * Note that those query may fail (see irnet_discover_daddr_and_lsap_sel()) + * + * Otherwise, wrap up the procedure (cleanup), check if we have found + * any device and connect to it. + */ +static void +irnet_discovervalue_confirm(int result, + __u16 obj_id, + struct ias_value *value, + void * priv) +{ + irnet_socket * self = (irnet_socket *) priv; + __u8 dtsap_sel; /* TSAP we are looking for */ + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(self != NULL, , IRDA_OCB_ERROR, "Self is NULL !!!\n"); + + /* Check if already connected (via irnet_connect_socket()) + * or socket is closing down (via irda_irnet_destroy()) */ + if(! test_bit(0, &self->ttp_connect)) + { + DERROR(IRDA_OCB_ERROR, "Socket no longer connecting. Ouch !\n"); + return; + } + + /* Post process the IAS reply */ + dtsap_sel = irnet_ias_to_tsap(self, result, value); + + /* Have we got something ? */ + if(self->errno == 0) + { + /* We found the requested service */ + if(self->daddr != DEV_ADDR_ANY) + { + DERROR(IRDA_OCB_ERROR, "More than one device in range supports IrNET...\n"); + } + else + { + /* First time we found that one, save it ! */ + self->daddr = self->discoveries[self->disco_index].daddr; + self->dtsap_sel = dtsap_sel; + } + } + + /* If no failure */ + if((self->errno == -EADDRNOTAVAIL) || (self->errno == 0)) + { + int ret; + + /* Search the next node */ + ret = irnet_discover_next_daddr(self); + if(!ret) + { + /* In this case, the above request was non-blocking. + * We will return here after a while... */ + return; + } + /* In this case, we have processed the last discovery item */ + } + + /* No more queries to be done (failure or last one) */ + + /* We probably don't need to make any more queries */ + iriap_close(self->iriap); + self->iriap = NULL; + + /* No more items : remove the log and signal termination */ + DEBUG(IRDA_OCB_INFO, "Cleaning up log (0x%p)\n", + self->discoveries); + if(self->discoveries != NULL) + { + /* Cleanup our copy of the discovery log */ + kfree(self->discoveries); + self->discoveries = NULL; + } + self->disco_number = -1; + + /* Check out what we found */ + if(self->daddr == DEV_ADDR_ANY) + { + self->daddr = DEV_ADDR_ANY; + clear_bit(0, &self->ttp_connect); + DEXIT(IRDA_OCB_TRACE, ": cannot discover IrNET in any device !!!\n"); + return; + } + + /* We have a valid address - just connect */ + + DEBUG(IRDA_OCB_INFO, "daddr = %08x, lsap = %d, starting IrTTP connection\n", + self->daddr, self->dtsap_sel); + + /* Start up TTP - non blocking */ + irnet_connect_tsap(self); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} + +#ifdef DISCOVERY_EVENTS +/*------------------------------------------------------------------*/ +/* + * Function irnet_discovery_indication (discovery) + * + * Got a discovery indication from IrLMP, post an event + * + * Note : IrLMP take care of matching the hint mask for us, and also + * check if it is a "new" node for us... + * + * As IrLMP filter on the IrLAN hint bit, we get both IrLAN and IrNET + * nodes, so it's only at connection time that we will know if the + * node support IrNET, IrLAN or both. 
The other solution is to check + * in IAS the PNP ids and service name. + * Note : even if a node support IrNET (or IrLAN), it's no guarantee + * that we will be able to connect to it, the node might already be + * busy... + * + * One last thing : in some case, this function will trigger duplicate + * discovery events. On the other hand, we should catch all + * discoveries properly (i.e. not miss one). Filtering duplicate here + * is to messy, so we leave that to user space... + */ +static void +irnet_discovery_indication(discinfo_t * discovery, + DISCOVERY_MODE mode, + void * priv) +{ + irnet_socket * self = &irnet_server.s; + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR, + "Invalid instance (0x%p) !!!\n", priv); + + DEBUG(IRDA_OCB_INFO, "Discovered new IrNET/IrLAN node %s...\n", + discovery->info); + + /* Notify the control channel */ + irnet_post_event(NULL, IRNET_DISCOVER, + discovery->saddr, discovery->daddr, discovery->info, + u16ho(discovery->hints)); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_expiry_indication (expiry) + * + * Got a expiry indication from IrLMP, post an event + * + * Note : IrLMP take care of matching the hint mask for us, we only + * check if it is a "new" node... + */ +static void +irnet_expiry_indication(discinfo_t * expiry, + DISCOVERY_MODE mode, + void * priv) +{ + irnet_socket * self = &irnet_server.s; + + DENTER(IRDA_OCB_TRACE, "(self=0x%p)\n", self); + DASSERT(priv == &irnet_server, , IRDA_OCB_ERROR, + "Invalid instance (0x%p) !!!\n", priv); + + DEBUG(IRDA_OCB_INFO, "IrNET/IrLAN node %s expired...\n", + expiry->info); + + /* Notify the control channel */ + irnet_post_event(NULL, IRNET_EXPIRE, + expiry->saddr, expiry->daddr, expiry->info, + u16ho(expiry->hints)); + + DEXIT(IRDA_OCB_TRACE, "\n"); +} +#endif /* DISCOVERY_EVENTS */ + + +/*********************** PROC ENTRY CALLBACKS ***********************/ +/* + * We create a instance in the /proc filesystem, and here we take care + * of that... + */ + +#ifdef CONFIG_PROC_FS +/*------------------------------------------------------------------*/ +/* + * Function irnet_proc_read (buf, start, offset, len, unused) + * + * Give some info to the /proc file system + */ +static int +irnet_proc_read(char * buf, + char ** start, + off_t offset, + int len) +{ + irnet_socket * self; + char * state; + int i = 0; + + len = 0; + + /* Get the IrNET server information... */ + len += sprintf(buf+len, "IrNET server - "); + len += sprintf(buf+len, "IrDA state: %s, ", + (irnet_server.running ? "running" : "dead")); + len += sprintf(buf+len, "stsap_sel: %02x, ", irnet_server.s.stsap_sel); + len += sprintf(buf+len, "dtsap_sel: %02x\n", irnet_server.s.dtsap_sel); + + /* Do we need to continue ? */ + if(!irnet_server.running) + return len; + + /* Protect access to the instance list */ + spin_lock_bh(&irnet_server.spinlock); + + /* Get the sockets one by one... */ + self = (irnet_socket *) hashbin_get_first(irnet_server.list); + while(self != NULL) + { + /* Start printing info about the socket. */ + len += sprintf(buf+len, "\nIrNET socket %d - ", i++); + + /* First, get the requested configuration */ + len += sprintf(buf+len, "Requested IrDA name: \"%s\", ", self->rname); + len += sprintf(buf+len, "daddr: %08x, ", self->rdaddr); + len += sprintf(buf+len, "saddr: %08x\n", self->rsaddr); + + /* Second, get all the PPP info */ + len += sprintf(buf+len, " PPP state: %s", + (self->ppp_open ? 
"registered" : "unregistered")); + if(self->ppp_open) + { + len += sprintf(buf+len, ", unit: ppp%d", + ppp_unit_number(&self->chan)); + len += sprintf(buf+len, ", channel: %d", + ppp_channel_index(&self->chan)); + len += sprintf(buf+len, ", mru: %d", + self->mru); + /* Maybe add self->flags ? Later... */ + } + + /* Then, get all the IrDA specific info... */ + if(self->ttp_open) + state = "connected"; + else + if(self->tsap != NULL) + state = "connecting"; + else + if(self->iriap != NULL) + state = "searching"; + else + if(self->ttp_connect) + state = "weird"; + else + state = "idle"; + len += sprintf(buf+len, "\n IrDA state: %s, ", state); + len += sprintf(buf+len, "daddr: %08x, ", self->daddr); + len += sprintf(buf+len, "stsap_sel: %02x, ", self->stsap_sel); + len += sprintf(buf+len, "dtsap_sel: %02x\n", self->dtsap_sel); + + /* Next socket, please... */ + self = (irnet_socket *) hashbin_get_next(irnet_server.list); + } + + /* Spin lock end */ + spin_unlock_bh(&irnet_server.spinlock); + + return len; +} +#endif /* PROC_FS */ + + +/********************** CONFIGURATION/CLEANUP **********************/ +/* + * Initialisation and teardown of the IrDA part, called at module + * insertion and removal... + */ + +/*------------------------------------------------------------------*/ +/* + * Prepare the IrNET layer for operation... + */ +int __init +irda_irnet_init(void) +{ + int err = 0; + + DENTER(MODULE_TRACE, "()\n"); + + /* Pure paranoia - should be redundant */ + memset(&irnet_server, 0, sizeof(struct irnet_root)); + + /* Setup start of irnet instance list */ + irnet_server.list = hashbin_new(HB_NOLOCK); + DABORT(irnet_server.list == NULL, -ENOMEM, + MODULE_ERROR, "Can't allocate hashbin!\n"); + /* Init spinlock for instance list */ + spin_lock_init(&irnet_server.spinlock); + + /* Initialise control channel */ + init_waitqueue_head(&irnet_events.rwait); + irnet_events.index = 0; + /* Init spinlock for event logging */ + spin_lock_init(&irnet_events.spinlock); + +#ifdef CONFIG_PROC_FS + /* Add a /proc file for irnet infos */ + create_proc_info_entry("irnet", 0, proc_irda, irnet_proc_read); +#endif /* CONFIG_PROC_FS */ + + /* Setup the IrNET server */ + err = irnet_setup_server(); + + if(!err) + /* We are no longer functional... */ + irnet_server.running = 1; + + DEXIT(MODULE_TRACE, "\n"); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Cleanup at exit... + */ +void __exit +irda_irnet_cleanup(void) +{ + DENTER(MODULE_TRACE, "()\n"); + + /* We are no longer there... */ + irnet_server.running = 0; + +#ifdef CONFIG_PROC_FS + /* Remove our /proc file */ + remove_proc_entry("irnet", proc_irda); +#endif /* CONFIG_PROC_FS */ + + /* Remove our IrNET server from existence */ + irnet_destroy_server(); + + /* Remove all instances of IrNET socket still present */ + hashbin_delete(irnet_server.list, (FREE_FUNC) irda_irnet_destroy); + + DEXIT(MODULE_TRACE, "\n"); +} diff --git a/net/irda/irnet/irnet_irda.h b/net/irda/irnet/irnet_irda.h new file mode 100644 index 000000000000..f2fecd32d8f6 --- /dev/null +++ b/net/irda/irnet/irnet_irda.h @@ -0,0 +1,186 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file contains all definitions and declarations necessary for the + * IRDA part of the IrNET module (dealing with IrTTP, IrIAS and co). + * This file is a private header, so other modules don't want to know + * what's in there... 
+ */ + +#ifndef IRNET_IRDA_H +#define IRNET_IRDA_H + +/***************************** INCLUDES *****************************/ +/* Please add other headers in irnet.h */ + +#include "irnet.h" /* Module global include */ + +/************************ CONSTANTS & MACROS ************************/ + +/* + * Name of the service (socket name) used by IrNET + */ +/* IAS object name (or part of it) */ +#define IRNET_SERVICE_NAME "IrNetv1" +/* IAS attribute */ +#define IRNET_IAS_VALUE "IrDA:TinyTP:LsapSel" +/* LMP notify name for client (only for /proc/net/irda/irlmp) */ +#define IRNET_NOTIFY_NAME "IrNET socket" +/* LMP notify name for server (only for /proc/net/irda/irlmp) */ +#define IRNET_NOTIFY_NAME_SERV "IrNET server" + +/****************************** TYPES ******************************/ + +/* + * This is the main structure where we store all the data pertaining to + * the IrNET server (listen for connection requests) and the root + * of the IrNET socket list + */ +typedef struct irnet_root +{ + irnet_socket s; /* To pretend we are a client... */ + + /* Generic stuff */ + int magic; /* Paranoia */ + int running; /* Are we operational ? */ + + /* Link list of all IrNET instances opened */ + hashbin_t * list; + spinlock_t spinlock; /* Serialize access to the list */ + /* Note : the way hashbin has been designed is absolutely not + * reentrant, beware... So, we blindly protect all with spinlock */ + + /* Handle for the hint bit advertised in IrLMP */ + void * skey; + + /* Server socket part */ + struct ias_object * ias_obj; /* Our service name + lsap in IAS */ + +} irnet_root; + + +/**************************** PROTOTYPES ****************************/ + +/* ----------------------- CONTROL CHANNEL ----------------------- */ +static void + irnet_post_event(irnet_socket *, + irnet_event, + __u32, + __u32, + char *, + __u16); +/* ----------------------- IRDA SUBROUTINES ----------------------- */ +static inline int + irnet_open_tsap(irnet_socket *); +static inline __u8 + irnet_ias_to_tsap(irnet_socket *, + int, + struct ias_value *); +static inline int + irnet_find_lsap_sel(irnet_socket *); +static inline int + irnet_connect_tsap(irnet_socket *); +static inline int + irnet_discover_next_daddr(irnet_socket *); +static inline int + irnet_discover_daddr_and_lsap_sel(irnet_socket *); +static inline int + irnet_dname_to_daddr(irnet_socket *); +/* ------------------------ SERVER SOCKET ------------------------ */ +static inline int + irnet_daddr_to_dname(irnet_socket *); +static inline irnet_socket * + irnet_find_socket(irnet_socket *); +static inline int + irnet_connect_socket(irnet_socket *, + irnet_socket *, + struct qos_info *, + __u32, + __u8); +static inline void + irnet_disconnect_server(irnet_socket *, + struct sk_buff *); +static inline int + irnet_setup_server(void); +static inline void + irnet_destroy_server(void); +/* ---------------------- IRDA-TTP CALLBACKS ---------------------- */ +static int + irnet_data_indication(void *, /* instance */ + void *, /* sap */ + struct sk_buff *); +static void + irnet_disconnect_indication(void *, + void *, + LM_REASON, + struct sk_buff *); +static void + irnet_connect_confirm(void *, + void *, + struct qos_info *, + __u32, + __u8, + struct sk_buff *); +static void + irnet_flow_indication(void *, + void *, + LOCAL_FLOW); +static void + irnet_status_indication(void *, + LINK_STATUS, + LOCK_STATUS); +static void + irnet_connect_indication(void *, + void *, + struct qos_info *, + __u32, + __u8, + struct sk_buff *); +/* -------------------- IRDA-IAS/LMP CALLBACKS 
-------------------- */ +static void + irnet_getvalue_confirm(int, + __u16, + struct ias_value *, + void *); +static void + irnet_discovervalue_confirm(int, + __u16, + struct ias_value *, + void *); +#ifdef DISCOVERY_EVENTS +static void + irnet_discovery_indication(discinfo_t *, + DISCOVERY_MODE, + void *); +static void + irnet_expiry_indication(discinfo_t *, + DISCOVERY_MODE, + void *); +#endif +/* -------------------------- PROC ENTRY -------------------------- */ +#ifdef CONFIG_PROC_FS +static int + irnet_proc_read(char *, + char **, + off_t, + int); +#endif /* CONFIG_PROC_FS */ + +/**************************** VARIABLES ****************************/ + +/* + * The IrNET server. Listen to connection requests and co... + */ +static struct irnet_root irnet_server; + +/* Control channel stuff (note : extern) */ +struct irnet_ctrl_channel irnet_events; + +/* The /proc/net/irda directory, defined elsewhere... */ +#ifdef CONFIG_PROC_FS +extern struct proc_dir_entry *proc_irda; +#endif /* CONFIG_PROC_FS */ + +#endif /* IRNET_IRDA_H */ diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c new file mode 100644 index 000000000000..f8f984bb9922 --- /dev/null +++ b/net/irda/irnet/irnet_ppp.c @@ -0,0 +1,1142 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file implement the PPP interface and /dev/irnet character device. + * The PPP interface hook to the ppp_generic module, handle all our + * relationship to the PPP code in the kernel (and by extension to pppd), + * and exchange PPP frames with this module (send/receive). + * The /dev/irnet device is used primarily for 2 functions : + * 1) as a stub for pppd (the ppp daemon), so that we can appropriately + * generate PPP sessions (we pretend we are a tty). + * 2) as a control channel (write commands, read events) + */ + +#include "irnet_ppp.h" /* Private header */ +/* Please put other headers in irnet.h - Thanks */ + +/* Generic PPP callbacks (to call us) */ +static struct ppp_channel_ops irnet_ppp_ops = { + .start_xmit = ppp_irnet_send, + .ioctl = ppp_irnet_ioctl +}; + +/************************* CONTROL CHANNEL *************************/ +/* + * When a pppd instance is not active on /dev/irnet, it acts as a control + * channel. + * Writing allow to set up the IrDA destination of the IrNET channel, + * and any application may be read events happening in IrNET... + */ + +/*------------------------------------------------------------------*/ +/* + * Write is used to send a command to configure a IrNET channel + * before it is open by pppd. The syntax is : "command argument" + * Currently there is only two defined commands : + * o name : set the requested IrDA nickname of the IrNET peer. + * o addr : set the requested IrDA address of the IrNET peer. + * Note : the code is crude, but effective... + */ +static inline ssize_t +irnet_ctrl_write(irnet_socket * ap, + const char __user *buf, + size_t count) +{ + char command[IRNET_MAX_COMMAND]; + char * start; /* Current command being processed */ + char * next; /* Next command to process */ + int length; /* Length of current command */ + + DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + + /* Check for overflow... 
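+ * (the command buffer is IRNET_MAX_COMMAND bytes and we need one byte
+ * for the trailing '\0', hence the `>=' test below)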
*/ + DABORT(count >= IRNET_MAX_COMMAND, -ENOMEM, + CTRL_ERROR, "Too much data !!!\n"); + + /* Get the data in the driver */ + if(copy_from_user(command, buf, count)) + { + DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); + return -EFAULT; + } + + /* Safe terminate the string */ + command[count] = '\0'; + DEBUG(CTRL_INFO, "Command line received is ``%s'' (%Zd).\n", + command, count); + + /* Check every commands in the command line */ + next = command; + while(next != NULL) + { + /* Look at the next command */ + start = next; + + /* Scrap whitespaces before the command */ + while(isspace(*start)) + start++; + + /* ',' is our command separator */ + next = strchr(start, ','); + if(next) + { + *next = '\0'; /* Terminate command */ + length = next - start; /* Length */ + next++; /* Skip the '\0' */ + } + else + length = strlen(start); + + DEBUG(CTRL_INFO, "Found command ``%s'' (%d).\n", start, length); + + /* Check if we recognised one of the known command + * We can't use "switch" with strings, so hack with "continue" */ + + /* First command : name -> Requested IrDA nickname */ + if(!strncmp(start, "name", 4)) + { + /* Copy the name only if is included and not "any" */ + if((length > 5) && (strcmp(start + 5, "any"))) + { + /* Strip out trailing whitespaces */ + while(isspace(start[length - 1])) + length--; + + /* Copy the name for later reuse */ + memcpy(ap->rname, start + 5, length - 5); + ap->rname[length - 5] = '\0'; + } + else + ap->rname[0] = '\0'; + DEBUG(CTRL_INFO, "Got rname = ``%s''\n", ap->rname); + + /* Restart the loop */ + continue; + } + + /* Second command : addr, daddr -> Requested IrDA destination address + * Also process : saddr -> Requested IrDA source address */ + if((!strncmp(start, "addr", 4)) || + (!strncmp(start, "daddr", 5)) || + (!strncmp(start, "saddr", 5))) + { + __u32 addr = DEV_ADDR_ANY; + + /* Copy the address only if is included and not "any" */ + if((length > 5) && (strcmp(start + 5, "any"))) + { + char * begp = start + 5; + char * endp; + + /* Scrap whitespaces before the command */ + while(isspace(*begp)) + begp++; + + /* Convert argument to a number (last arg is the base) */ + addr = simple_strtoul(begp, &endp, 16); + /* Has it worked ? (endp should be start + length) */ + DABORT(endp <= (start + 5), -EINVAL, + CTRL_ERROR, "Invalid address.\n"); + } + /* Which type of address ? */ + if(start[0] == 's') + { + /* Save it */ + ap->rsaddr = addr; + DEBUG(CTRL_INFO, "Got rsaddr = %08x\n", ap->rsaddr); + } + else + { + /* Save it */ + ap->rdaddr = addr; + DEBUG(CTRL_INFO, "Got rdaddr = %08x\n", ap->rdaddr); + } + + /* Restart the loop */ + continue; + } + + /* Other possible command : connect N (number of retries) */ + + /* No command matched -> Failed... */ + DABORT(1, -EINVAL, CTRL_ERROR, "Not a recognised IrNET command.\n"); + } + + /* Success : we have parsed all commands successfully */ + return(count); +} + +#ifdef INITIAL_DISCOVERY +/*------------------------------------------------------------------*/ +/* + * Function irnet_get_discovery_log (self) + * + * Query the content on the discovery log if not done + * + * This function query the current content of the discovery log + * at the startup of the event channel and save it in the internal struct. 
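+ * The log is then walked one entry at a time by
+ * irnet_read_discovery_log() below, and freed once fully consumed.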
+ */ +static void +irnet_get_discovery_log(irnet_socket * ap) +{ + __u16 mask = irlmp_service_to_hint(S_LAN); + + /* Ask IrLMP for the current discovery log */ + ap->discoveries = irlmp_get_discoveries(&ap->disco_number, mask, + DISCOVERY_DEFAULT_SLOTS); + + /* Check if the we got some results */ + if(ap->discoveries == NULL) + ap->disco_number = -1; + + DEBUG(CTRL_INFO, "Got the log (0x%p), size is %d\n", + ap->discoveries, ap->disco_number); +} + +/*------------------------------------------------------------------*/ +/* + * Function irnet_read_discovery_log (self, event) + * + * Read the content on the discovery log + * + * This function dump the current content of the discovery log + * at the startup of the event channel. + * Return 1 if wrote an event on the control channel... + * + * State of the ap->disco_XXX variables : + * Socket creation : discoveries = NULL ; disco_index = 0 ; disco_number = 0 + * While reading : discoveries = ptr ; disco_index = X ; disco_number = Y + * After reading : discoveries = NULL ; disco_index = Y ; disco_number = -1 + */ +static inline int +irnet_read_discovery_log(irnet_socket * ap, + char * event) +{ + int done_event = 0; + + DENTER(CTRL_TRACE, "(ap=0x%p, event=0x%p)\n", + ap, event); + + /* Test if we have some work to do or we have already finished */ + if(ap->disco_number == -1) + { + DEBUG(CTRL_INFO, "Already done\n"); + return 0; + } + + /* Test if it's the first time and therefore we need to get the log */ + if(ap->discoveries == NULL) + irnet_get_discovery_log(ap); + + /* Check if we have more item to dump */ + if(ap->disco_index < ap->disco_number) + { + /* Write an event */ + sprintf(event, "Found %08x (%s) behind %08x {hints %02X-%02X}\n", + ap->discoveries[ap->disco_index].daddr, + ap->discoveries[ap->disco_index].info, + ap->discoveries[ap->disco_index].saddr, + ap->discoveries[ap->disco_index].hints[0], + ap->discoveries[ap->disco_index].hints[1]); + DEBUG(CTRL_INFO, "Writing discovery %d : %s\n", + ap->disco_index, ap->discoveries[ap->disco_index].info); + + /* We have an event */ + done_event = 1; + /* Next discovery */ + ap->disco_index++; + } + + /* Check if we have done the last item */ + if(ap->disco_index >= ap->disco_number) + { + /* No more items : remove the log and signal termination */ + DEBUG(CTRL_INFO, "Cleaning up log (0x%p)\n", + ap->discoveries); + if(ap->discoveries != NULL) + { + /* Cleanup our copy of the discovery log */ + kfree(ap->discoveries); + ap->discoveries = NULL; + } + ap->disco_number = -1; + } + + return done_event; +} +#endif /* INITIAL_DISCOVERY */ + +/*------------------------------------------------------------------*/ +/* + * Read is used to get IrNET events + */ +static inline ssize_t +irnet_ctrl_read(irnet_socket * ap, + struct file * file, + char __user * buf, + size_t count) +{ + DECLARE_WAITQUEUE(wait, current); + char event[64]; /* Max event is 61 char */ + ssize_t ret = 0; + + DENTER(CTRL_TRACE, "(ap=0x%p, count=%Zd)\n", ap, count); + + /* Check if we can write an event out in one go */ + DABORT(count < sizeof(event), -EOVERFLOW, CTRL_ERROR, "Buffer to small.\n"); + +#ifdef INITIAL_DISCOVERY + /* Check if we have read the log */ + if(irnet_read_discovery_log(ap, event)) + { + /* We have an event !!! 
Copy it to the user */ + if(copy_to_user(buf, event, strlen(event))) + { + DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); + return -EFAULT; + } + + DEXIT(CTRL_TRACE, "\n"); + return(strlen(event)); + } +#endif /* INITIAL_DISCOVERY */ + + /* Put ourselves on the wait queue to be woken up */ + add_wait_queue(&irnet_events.rwait, &wait); + current->state = TASK_INTERRUPTIBLE; + for(;;) + { + /* If there is unread events */ + ret = 0; + if(ap->event_index != irnet_events.index) + break; + ret = -EAGAIN; + if(file->f_flags & O_NONBLOCK) + break; + ret = -ERESTARTSYS; + if(signal_pending(current)) + break; + /* Yield and wait to be woken up */ + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(&irnet_events.rwait, &wait); + + /* Did we got it ? */ + if(ret != 0) + { + /* No, return the error code */ + DEXIT(CTRL_TRACE, " - ret %Zd\n", ret); + return ret; + } + + /* Which event is it ? */ + switch(irnet_events.log[ap->event_index].event) + { + case IRNET_DISCOVER: + sprintf(event, "Discovered %08x (%s) behind %08x {hints %02X-%02X}\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr, + irnet_events.log[ap->event_index].hints.byte[0], + irnet_events.log[ap->event_index].hints.byte[1]); + break; + case IRNET_EXPIRE: + sprintf(event, "Expired %08x (%s) behind %08x {hints %02X-%02X}\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr, + irnet_events.log[ap->event_index].hints.byte[0], + irnet_events.log[ap->event_index].hints.byte[1]); + break; + case IRNET_CONNECT_TO: + sprintf(event, "Connected to %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_CONNECT_FROM: + sprintf(event, "Connection from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_REQUEST_FROM: + sprintf(event, "Request from %08x (%s) behind %08x\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].saddr); + break; + case IRNET_NOANSWER_FROM: + sprintf(event, "No-answer from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_BLOCKED_LINK: + sprintf(event, "Blocked link with %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_DISCONNECT_FROM: + sprintf(event, "Disconnection from %08x (%s) on ppp%d\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name, + irnet_events.log[ap->event_index].unit); + break; + case IRNET_DISCONNECT_TO: + sprintf(event, "Disconnected to %08x (%s)\n", + irnet_events.log[ap->event_index].daddr, + irnet_events.log[ap->event_index].name); + break; + default: + sprintf(event, "Bug\n"); + } + /* Increment our event index */ + ap->event_index = (ap->event_index + 1) % IRNET_MAX_EVENTS; + + DEBUG(CTRL_INFO, "Event is :%s", event); + + /* Copy it to the user */ + if(copy_to_user(buf, event, strlen(event))) + { + DERROR(CTRL_ERROR, "Invalid user space pointer.\n"); + return -EFAULT; + } + + DEXIT(CTRL_TRACE, "\n"); + return(strlen(event)); +} + 
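+/*------------------------------------------------------------------*/
+/*
+ * Informal usage note (illustration only - addresses and hints below
+ * are made up) : a process that opens /dev/irnet and does *not* switch
+ * it to PPP discipline can monitor IrNET by simply reading it :
+ *	# cat /dev/irnet
+ *	Discovered 5e8945fe (my-peer) behind 6a7c2d31 {hints 84-04}
+ *	Connected to 5e8945fe (my-peer) on ppp0
+ * The "name"/"addr" commands parsed by irnet_ctrl_write() above are
+ * normally written through the same file descriptor that pppd will
+ * later switch to PPP discipline, so that they configure that channel.
+ */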
+/*------------------------------------------------------------------*/ +/* + * Poll : called when someone do a select on /dev/irnet. + * Just check if there are new events... + */ +static inline unsigned int +irnet_ctrl_poll(irnet_socket * ap, + struct file * file, + poll_table * wait) +{ + unsigned int mask; + + DENTER(CTRL_TRACE, "(ap=0x%p)\n", ap); + + poll_wait(file, &irnet_events.rwait, wait); + mask = POLLOUT | POLLWRNORM; + /* If there is unread events */ + if(ap->event_index != irnet_events.index) + mask |= POLLIN | POLLRDNORM; +#ifdef INITIAL_DISCOVERY + if(ap->disco_number != -1) + { + /* Test if it's the first time and therefore we need to get the log */ + if(ap->discoveries == NULL) + irnet_get_discovery_log(ap); + /* Recheck */ + if(ap->disco_number != -1) + mask |= POLLIN | POLLRDNORM; + } +#endif /* INITIAL_DISCOVERY */ + + DEXIT(CTRL_TRACE, " - mask=0x%X\n", mask); + return mask; +} + + +/*********************** FILESYSTEM CALLBACKS ***********************/ +/* + * Implement the usual open, read, write functions that will be called + * by the file system when some action is performed on /dev/irnet. + * Most of those actions will in fact be performed by "pppd" or + * the control channel, we just act as a redirector... + */ + +/*------------------------------------------------------------------*/ +/* + * Open : when somebody open /dev/irnet + * We basically create a new instance of irnet and initialise it. + */ +static int +dev_irnet_open(struct inode * inode, + struct file * file) +{ + struct irnet_socket * ap; + int err; + + DENTER(FS_TRACE, "(file=0x%p)\n", file); + +#ifdef SECURE_DEVIRNET + /* This could (should?) be enforced by the permissions on /dev/irnet. */ + if(!capable(CAP_NET_ADMIN)) + return -EPERM; +#endif /* SECURE_DEVIRNET */ + + /* Allocate a private structure for this IrNET instance */ + ap = kmalloc(sizeof(*ap), GFP_KERNEL); + DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n"); + + /* initialize the irnet structure */ + memset(ap, 0, sizeof(*ap)); + ap->file = file; + + /* PPP channel setup */ + ap->ppp_open = 0; + ap->chan.private = ap; + ap->chan.ops = &irnet_ppp_ops; + ap->chan.mtu = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN); + ap->chan.hdrlen = 2 + TTP_MAX_HEADER; /* for A/C + Max IrDA hdr */ + /* PPP parameters */ + ap->mru = (2048 - TTP_MAX_HEADER - 2 - PPP_HDRLEN); + ap->xaccm[0] = ~0U; + ap->xaccm[3] = 0x60000000U; + ap->raccm = ~0U; + + /* Setup the IrDA part... 
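+ * (irda_irnet_create() only initialises the socket and registers with
+ * IrLMP ; the actual connection is attempted later, when pppd switches
+ * us to PPP discipline - see TIOCSETD in dev_irnet_ioctl())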
*/ + err = irda_irnet_create(ap); + if(err) + { + DERROR(FS_ERROR, "Can't setup IrDA link...\n"); + kfree(ap); + return err; + } + + /* For the control channel */ + ap->event_index = irnet_events.index; /* Cancel all past events */ + + /* Put our stuff where we will be able to find it later */ + file->private_data = ap; + + DEXIT(FS_TRACE, " - ap=0x%p\n", ap); + return 0; +} + + +/*------------------------------------------------------------------*/ +/* + * Close : when somebody close /dev/irnet + * Destroy the instance of /dev/irnet + */ +static int +dev_irnet_close(struct inode * inode, + struct file * file) +{ + irnet_socket * ap = (struct irnet_socket *) file->private_data; + + DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n", + file, ap); + DABORT(ap == NULL, 0, FS_ERROR, "ap is NULL !!!\n"); + + /* Detach ourselves */ + file->private_data = NULL; + + /* Close IrDA stuff */ + irda_irnet_destroy(ap); + + /* Disconnect from the generic PPP layer if not already done */ + if(ap->ppp_open) + { + DERROR(FS_ERROR, "Channel still registered - deregistering !\n"); + ap->ppp_open = 0; + ppp_unregister_channel(&ap->chan); + } + + kfree(ap); + + DEXIT(FS_TRACE, "\n"); + return 0; +} + +/*------------------------------------------------------------------*/ +/* + * Write does nothing. + * (we receive packet from ppp_generic through ppp_irnet_send()) + */ +static ssize_t +dev_irnet_write(struct file * file, + const char __user *buf, + size_t count, + loff_t * ppos) +{ + irnet_socket * ap = (struct irnet_socket *) file->private_data; + + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + file, ap, count); + DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); + + /* If we are connected to ppp_generic, let it handle the job */ + if(ap->ppp_open) + return -EAGAIN; + else + return irnet_ctrl_write(ap, buf, count); +} + +/*------------------------------------------------------------------*/ +/* + * Read doesn't do much either. + * (pppd poll us, but ultimately reads through /dev/ppp) + */ +static ssize_t +dev_irnet_read(struct file * file, + char __user * buf, + size_t count, + loff_t * ppos) +{ + irnet_socket * ap = (struct irnet_socket *) file->private_data; + + DPASS(FS_TRACE, "(file=0x%p, ap=0x%p, count=%Zd)\n", + file, ap, count); + DABORT(ap == NULL, -ENXIO, FS_ERROR, "ap is NULL !!!\n"); + + /* If we are connected to ppp_generic, let it handle the job */ + if(ap->ppp_open) + return -EAGAIN; + else + return irnet_ctrl_read(ap, file, buf, count); +} + +/*------------------------------------------------------------------*/ +/* + * Poll : called when someone do a select on /dev/irnet + */ +static unsigned int +dev_irnet_poll(struct file * file, + poll_table * wait) +{ + irnet_socket * ap = (struct irnet_socket *) file->private_data; + unsigned int mask; + + DENTER(FS_TRACE, "(file=0x%p, ap=0x%p)\n", + file, ap); + + mask = POLLOUT | POLLWRNORM; + DABORT(ap == NULL, mask, FS_ERROR, "ap is NULL !!!\n"); + + /* If we are connected to ppp_generic, let it handle the job */ + if(!ap->ppp_open) + mask |= irnet_ctrl_poll(ap, file, wait); + + DEXIT(FS_TRACE, " - mask=0x%X\n", mask); + return(mask); +} + +/*------------------------------------------------------------------*/ +/* + * IOCtl : Called when someone does some ioctls on /dev/irnet + * This is the way pppd configure us and control us while the PPP + * instance is active. 
+ */ +static int +dev_irnet_ioctl(struct inode * inode, + struct file * file, + unsigned int cmd, + unsigned long arg) +{ + irnet_socket * ap = (struct irnet_socket *) file->private_data; + int err; + int val; + void __user *argp = (void __user *)arg; + + DENTER(FS_TRACE, "(file=0x%p, ap=0x%p, cmd=0x%X)\n", + file, ap, cmd); + + /* Basic checks... */ + DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n"); +#ifdef SECURE_DEVIRNET + if(!capable(CAP_NET_ADMIN)) + return -EPERM; +#endif /* SECURE_DEVIRNET */ + + err = -EFAULT; + switch(cmd) + { + /* Set discipline (should be N_SYNC_PPP or N_TTY) */ + case TIOCSETD: + if(get_user(val, (int __user *)argp)) + break; + if((val == N_SYNC_PPP) || (val == N_PPP)) + { + DEBUG(FS_INFO, "Entering PPP discipline.\n"); + /* PPP channel setup (ap->chan in configued in dev_irnet_open())*/ + err = ppp_register_channel(&ap->chan); + if(err == 0) + { + /* Our ppp side is active */ + ap->ppp_open = 1; + + DEBUG(FS_INFO, "Trying to establish a connection.\n"); + /* Setup the IrDA link now - may fail... */ + irda_irnet_connect(ap); + } + else + DERROR(FS_ERROR, "Can't setup PPP channel...\n"); + } + else + { + /* In theory, should be N_TTY */ + DEBUG(FS_INFO, "Exiting PPP discipline.\n"); + /* Disconnect from the generic PPP layer */ + if(ap->ppp_open) + { + ap->ppp_open = 0; + ppp_unregister_channel(&ap->chan); + } + else + DERROR(FS_ERROR, "Channel not registered !\n"); + err = 0; + } + break; + + /* Query PPP channel and unit number */ + case PPPIOCGCHAN: + if(!ap->ppp_open) + break; + if(put_user(ppp_channel_index(&ap->chan), (int __user *)argp)) + break; + DEBUG(FS_INFO, "Query channel.\n"); + err = 0; + break; + case PPPIOCGUNIT: + if(!ap->ppp_open) + break; + if(put_user(ppp_unit_number(&ap->chan), (int __user *)argp)) + break; + DEBUG(FS_INFO, "Query unit number.\n"); + err = 0; + break; + + /* All these ioctls can be passed both directly and from ppp_generic, + * so we just deal with them in one place... 
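For reference, this is the sequence a controlling daemon such as pppd performs against the ioctl handler above: push the synchronous PPP line discipline with TIOCSETD, which makes the driver register its PPP channel and start the IrDA connection, then query the assigned unit number. The sketch below is illustrative user-space code; the header providing PPPIOCGUNIT and the availability of N_SYNC_PPP vary with the kernel-header vintage, hence the fallback define.

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/if_ppp.h>	/* PPPIOCGUNIT (header path varies) */

#ifndef N_SYNC_PPP
#define N_SYNC_PPP 14		/* synchronous PPP line discipline */
#endif

int main(void)
{
	int fd, disc = N_SYNC_PPP, unit = -1;

	fd = open("/dev/irnet", O_RDWR);
	if (fd < 0) {
		perror("open /dev/irnet");
		return 1;
	}

	/* TIOCSETD with N_SYNC_PPP (or N_PPP) makes the driver call
	 * ppp_register_channel() and then irda_irnet_connect(). */
	if (ioctl(fd, TIOCSETD, &disc) < 0) {
		perror("TIOCSETD");
		return 1;
	}

	/* Only valid once the channel is registered (ppp_open != 0). */
	if (ioctl(fd, PPPIOCGUNIT, &unit) == 0)
		printf("attached as ppp%d\n", unit);

	pause();		/* keep the fd, and thus the link, alive */
	return 0;
}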
+ */ + case PPPIOCGFLAGS: + case PPPIOCSFLAGS: + case PPPIOCGASYNCMAP: + case PPPIOCSASYNCMAP: + case PPPIOCGRASYNCMAP: + case PPPIOCSRASYNCMAP: + case PPPIOCGXASYNCMAP: + case PPPIOCSXASYNCMAP: + case PPPIOCGMRU: + case PPPIOCSMRU: + DEBUG(FS_INFO, "Standard PPP ioctl.\n"); + if(!capable(CAP_NET_ADMIN)) + err = -EPERM; + else + err = ppp_irnet_ioctl(&ap->chan, cmd, arg); + break; + + /* TTY IOCTLs : Pretend that we are a tty, to keep pppd happy */ + /* Get termios */ + case TCGETS: + DEBUG(FS_INFO, "Get termios.\n"); + if(kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios)) + break; + err = 0; + break; + /* Set termios */ + case TCSETSF: + DEBUG(FS_INFO, "Set termios.\n"); + if(user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp)) + break; + err = 0; + break; + + /* Set DTR/RTS */ + case TIOCMBIS: + case TIOCMBIC: + /* Set exclusive/non-exclusive mode */ + case TIOCEXCL: + case TIOCNXCL: + DEBUG(FS_INFO, "TTY compatibility.\n"); + err = 0; + break; + + case TCGETA: + DEBUG(FS_INFO, "TCGETA\n"); + break; + + case TCFLSH: + DEBUG(FS_INFO, "TCFLSH\n"); + /* Note : this will flush buffers in PPP, so it *must* be done + * We should also worry that we don't accept junk here and that + * we get rid of our own buffers */ +#ifdef FLUSH_TO_PPP + ppp_output_wakeup(&ap->chan); +#endif /* FLUSH_TO_PPP */ + err = 0; + break; + + case FIONREAD: + DEBUG(FS_INFO, "FIONREAD\n"); + val = 0; + if(put_user(val, (int __user *)argp)) + break; + err = 0; + break; + + default: + DERROR(FS_ERROR, "Unsupported ioctl (0x%X)\n", cmd); + err = -ENOIOCTLCMD; + } + + DEXIT(FS_TRACE, " - err = 0x%X\n", err); + return err; +} + +/************************** PPP CALLBACKS **************************/ +/* + * This are the functions that the generic PPP driver in the kernel + * will call to communicate to us. + */ + +/*------------------------------------------------------------------*/ +/* + * Prepare the ppp frame for transmission over the IrDA socket. + * We make sure that the header space is enough, and we change ppp header + * according to flags passed by pppd. + * This is not a callback, but just a helper function used in ppp_irnet_send() + */ +static inline struct sk_buff * +irnet_prepare_skb(irnet_socket * ap, + struct sk_buff * skb) +{ + unsigned char * data; + int proto; /* PPP protocol */ + int islcp; /* Protocol == LCP */ + int needaddr; /* Need PPP address */ + + DENTER(PPP_TRACE, "(ap=0x%p, skb=0x%p)\n", + ap, skb); + + /* Extract PPP protocol from the frame */ + data = skb->data; + proto = (data[0] << 8) + data[1]; + + /* LCP packets with codes between 1 (configure-request) + * and 7 (code-reject) must be sent as though no options + * have been negotiated. */ + islcp = (proto == PPP_LCP) && (1 <= data[2]) && (data[2] <= 7); + + /* compress protocol field if option enabled */ + if((data[0] == 0) && (ap->flags & SC_COMP_PROT) && (!islcp)) + skb_pull(skb,1); + + /* Check if we need address/control fields */ + needaddr = 2*((ap->flags & SC_COMP_AC) == 0 || islcp); + + /* Is the skb headroom large enough to contain all IrDA-headers? */ + if((skb_headroom(skb) < (ap->max_header_size + needaddr)) || + (skb_shared(skb))) + { + struct sk_buff * new_skb; + + DEBUG(PPP_INFO, "Reallocating skb\n"); + + /* Create a new skb */ + new_skb = skb_realloc_headroom(skb, ap->max_header_size + needaddr); + + /* We have to free the original skb anyway */ + dev_kfree_skb(skb); + + /* Did the realloc succeed ? 
*/ + DABORT(new_skb == NULL, NULL, PPP_ERROR, "Could not realloc skb\n"); + + /* Use the new skb instead */ + skb = new_skb; + } + + /* prepend address/control fields if necessary */ + if(needaddr) + { + skb_push(skb, 2); + skb->data[0] = PPP_ALLSTATIONS; + skb->data[1] = PPP_UI; + } + + DEXIT(PPP_TRACE, "\n"); + + return skb; +} + +/*------------------------------------------------------------------*/ +/* + * Send a packet to the peer over the IrTTP connection. + * Returns 1 iff the packet was accepted. + * Returns 0 iff packet was not consumed. + * If the packet was not accepted, we will call ppp_output_wakeup + * at some later time to reactivate flow control in ppp_generic. + */ +static int +ppp_irnet_send(struct ppp_channel * chan, + struct sk_buff * skb) +{ + irnet_socket * self = (struct irnet_socket *) chan->private; + int ret; + + DENTER(PPP_TRACE, "(channel=0x%p, ap/self=0x%p)\n", + chan, self); + + /* Check if things are somewhat valid... */ + DASSERT(self != NULL, 0, PPP_ERROR, "Self is NULL !!!\n"); + + /* Check if we are connected */ + if(!(test_bit(0, &self->ttp_open))) + { +#ifdef CONNECT_IN_SEND + /* Let's try to connect one more time... */ + /* Note : we won't be connected after this call, but we should be + * ready for next packet... */ + /* If we are already connecting, this will fail */ + irda_irnet_connect(self); +#endif /* CONNECT_IN_SEND */ + + DEBUG(PPP_INFO, "IrTTP not ready ! (%ld-%ld)\n", + self->ttp_open, self->ttp_connect); + + /* Note : we can either drop the packet or block the packet. + * + * Blocking the packet allow us a better connection time, + * because by calling ppp_output_wakeup() we can have + * ppp_generic resending the LCP request immediately to us, + * rather than waiting for one of pppd periodic transmission of + * LCP request. + * + * On the other hand, if we block all packet, all those periodic + * transmissions of pppd accumulate in ppp_generic, creating a + * backlog of LCP request. When we eventually connect later on, + * we have to transmit all this backlog before we can connect + * proper (if we don't timeout before). + * + * The current strategy is as follow : + * While we are attempting to connect, we block packets to get + * a better connection time. + * If we fail to connect, we drain the queue and start dropping packets + */ +#ifdef BLOCK_WHEN_CONNECT + /* If we are attempting to connect */ + if(test_bit(0, &self->ttp_connect)) + { + /* Blocking packet, ppp_generic will retry later */ + return 0; + } +#endif /* BLOCK_WHEN_CONNECT */ + + /* Dropping packet, pppd will retry later */ + dev_kfree_skb(skb); + return 1; + } + + /* Check if the queue can accept any packet, otherwise block */ + if(self->tx_flow != FLOW_START) + DRETURN(0, PPP_INFO, "IrTTP queue full (%d skbs)...\n", + skb_queue_len(&self->tsap->tx_queue)); + + /* Prepare ppp frame for transmission */ + skb = irnet_prepare_skb(self, skb); + DABORT(skb == NULL, 1, PPP_ERROR, "Prepare skb for Tx failed.\n"); + + /* Send the packet to IrTTP */ + ret = irttp_data_request(self->tsap, skb); + if(ret < 0) + { + /* + * > IrTTPs tx queue is full, so we just have to + * > drop the frame! You might think that we should + * > just return -1 and don't deallocate the frame, + * > but that is dangerous since it's possible that + * > we have replaced the original skb with a new + * > one with larger headroom, and that would really + * > confuse do_dev_queue_xmit() in dev.c! 
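irnet_prepare_skb() above applies two standard PPP header tweaks before handing the frame to IrTTP: protocol-field compression (drop the leading 0x00 of the two-byte protocol when SC_COMP_PROT is negotiated and the frame is not LCP) and address/control handling (prepend 0xFF 0x03 when SC_COMP_AC is off or the frame is LCP). The stand-alone sketch below models those decisions on a plain byte buffer; prepare_frame() and the comp_prot/comp_ac parameters are illustrative stand-ins for the driver's flags, not part of the patch.

#include <stdio.h>
#include <string.h>

#define PPP_ALLSTATIONS	0xff
#define PPP_UI		0x03
#define PPP_LCP		0xc021

/* Returns the new frame length after applying the two compressions. */
static int prepare_frame(unsigned char *buf, int len, int comp_prot, int comp_ac)
{
	int proto = (buf[0] << 8) + buf[1];
	/* LCP Configure-Request..Code-Reject must go out uncompressed */
	int islcp = (proto == PPP_LCP) && (buf[2] >= 1) && (buf[2] <= 7);

	if (buf[0] == 0 && comp_prot && !islcp) {	/* protocol compression */
		memmove(buf, buf + 1, --len);
	}
	if (!comp_ac || islcp) {			/* need address/control */
		memmove(buf + 2, buf, len);
		buf[0] = PPP_ALLSTATIONS;
		buf[1] = PPP_UI;
		len += 2;
	}
	return len;
}

int main(void)
{
	/* 0x0021 = IP; with both options on it shrinks to 0x21 <payload> */
	unsigned char frame[64] = { 0x00, 0x21, 0x45, 0x00 };
	int i, len = prepare_frame(frame, 4, 1, 1);

	for (i = 0; i < len; i++)
		printf("%02x ", frame[i]);
	printf("\n");
	return 0;
}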
I have + * > tried :-) DB + * Correction : we verify the flow control above (self->tx_flow), + * so we come here only if IrTTP doesn't like the packet (empty, + * too large, IrTTP not connected). In those rare cases, it's ok + * to drop it, we don't want to see it here again... + * Jean II + */ + DERROR(PPP_ERROR, "IrTTP doesn't like this packet !!! (0x%X)\n", ret); + /* irttp_data_request already free the packet */ + } + + DEXIT(PPP_TRACE, "\n"); + return 1; /* Packet has been consumed */ +} + +/*------------------------------------------------------------------*/ +/* + * Take care of the ioctls that ppp_generic doesn't want to deal with... + * Note : we are also called from dev_irnet_ioctl(). + */ +static int +ppp_irnet_ioctl(struct ppp_channel * chan, + unsigned int cmd, + unsigned long arg) +{ + irnet_socket * ap = (struct irnet_socket *) chan->private; + int err; + int val; + u32 accm[8]; + void __user *argp = (void __user *)arg; + + DENTER(PPP_TRACE, "(channel=0x%p, ap=0x%p, cmd=0x%X)\n", + chan, ap, cmd); + + /* Basic checks... */ + DASSERT(ap != NULL, -ENXIO, PPP_ERROR, "ap is NULL...\n"); + + err = -EFAULT; + switch(cmd) + { + /* PPP flags */ + case PPPIOCGFLAGS: + val = ap->flags | ap->rbits; + if(put_user(val, (int __user *) argp)) + break; + err = 0; + break; + case PPPIOCSFLAGS: + if(get_user(val, (int __user *) argp)) + break; + ap->flags = val & ~SC_RCV_BITS; + ap->rbits = val & SC_RCV_BITS; + err = 0; + break; + + /* Async map stuff - all dummy to please pppd */ + case PPPIOCGASYNCMAP: + if(put_user(ap->xaccm[0], (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCSASYNCMAP: + if(get_user(ap->xaccm[0], (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCGRASYNCMAP: + if(put_user(ap->raccm, (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCSRASYNCMAP: + if(get_user(ap->raccm, (u32 __user *) argp)) + break; + err = 0; + break; + case PPPIOCGXASYNCMAP: + if(copy_to_user(argp, ap->xaccm, sizeof(ap->xaccm))) + break; + err = 0; + break; + case PPPIOCSXASYNCMAP: + if(copy_from_user(accm, argp, sizeof(accm))) + break; + accm[2] &= ~0x40000000U; /* can't escape 0x5e */ + accm[3] |= 0x60000000U; /* must escape 0x7d, 0x7e */ + memcpy(ap->xaccm, accm, sizeof(ap->xaccm)); + err = 0; + break; + + /* Max PPP frame size */ + case PPPIOCGMRU: + if(put_user(ap->mru, (int __user *) argp)) + break; + err = 0; + break; + case PPPIOCSMRU: + if(get_user(val, (int __user *) argp)) + break; + if(val < PPP_MRU) + val = PPP_MRU; + ap->mru = val; + err = 0; + break; + + default: + DEBUG(PPP_INFO, "Unsupported ioctl (0x%X)\n", cmd); + err = -ENOIOCTLCMD; + } + + DEXIT(PPP_TRACE, " - err = 0x%X\n", err); + return err; +} + +/************************** INITIALISATION **************************/ +/* + * Module initialisation and all that jazz... + */ + +/*------------------------------------------------------------------*/ +/* + * Hook our device callbacks in the filesystem, to connect our code + * to /dev/irnet + */ +static inline int __init +ppp_irnet_init(void) +{ + int err = 0; + + DENTER(MODULE_TRACE, "()\n"); + + /* Allocate ourselves as a minor in the misc range */ + err = misc_register(&irnet_misc_device); + + DEXIT(MODULE_TRACE, "\n"); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Cleanup at exit... 
+ */ +static inline void __exit +ppp_irnet_cleanup(void) +{ + DENTER(MODULE_TRACE, "()\n"); + + /* De-allocate /dev/irnet minor in misc range */ + misc_deregister(&irnet_misc_device); + + DEXIT(MODULE_TRACE, "\n"); +} + +/*------------------------------------------------------------------*/ +/* + * Module main entry point + */ +int __init +irnet_init(void) +{ + int err; + + /* Initialise both parts... */ + err = irda_irnet_init(); + if(!err) + err = ppp_irnet_init(); + return err; +} + +/*------------------------------------------------------------------*/ +/* + * Module exit + */ +static void __exit +irnet_cleanup(void) +{ + irda_irnet_cleanup(); + ppp_irnet_cleanup(); +} + +/*------------------------------------------------------------------*/ +/* + * Module magic + */ +module_init(irnet_init); +module_exit(irnet_cleanup); +MODULE_AUTHOR("Jean Tourrilhes "); +MODULE_DESCRIPTION("IrNET : Synchronous PPP over IrDA"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_CHARDEV(10, 187); diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h new file mode 100644 index 000000000000..d2beb7df8f7f --- /dev/null +++ b/net/irda/irnet/irnet_ppp.h @@ -0,0 +1,119 @@ +/* + * IrNET protocol module : Synchronous PPP over an IrDA socket. + * + * Jean II - HPL `00 - + * + * This file contains all definitions and declarations necessary for the + * PPP part of the IrNET module. + * This file is a private header, so other modules don't want to know + * what's in there... + */ + +#ifndef IRNET_PPP_H +#define IRNET_PPP_H + +/***************************** INCLUDES *****************************/ + +#include "irnet.h" /* Module global include */ + +/************************ CONSTANTS & MACROS ************************/ + +/* /dev/irnet file constants */ +#define IRNET_MAJOR 10 /* Misc range */ +#define IRNET_MINOR 187 /* Official allocation */ + +/* IrNET control channel stuff */ +#define IRNET_MAX_COMMAND 256 /* Max length of a command line */ + +/* PPP hardcore stuff */ + +/* Bits in rbits (PPP flags in irnet struct) */ +#define SC_RCV_BITS (SC_RCV_B7_1|SC_RCV_B7_0|SC_RCV_ODDP|SC_RCV_EVNP) + +/* Bit numbers in busy */ +#define XMIT_BUSY 0 +#define RECV_BUSY 1 +#define XMIT_WAKEUP 2 +#define XMIT_FULL 3 + +/* Queue management */ +#define PPPSYNC_MAX_RQLEN 32 /* arbitrary */ + +/****************************** TYPES ******************************/ + + +/**************************** PROTOTYPES ****************************/ + +/* ----------------------- CONTROL CHANNEL ----------------------- */ +static inline ssize_t + irnet_ctrl_write(irnet_socket *, + const char *, + size_t); +static inline ssize_t + irnet_ctrl_read(irnet_socket *, + struct file *, + char *, + size_t); +static inline unsigned int + irnet_ctrl_poll(irnet_socket *, + struct file *, + poll_table *); +/* ----------------------- CHARACTER DEVICE ----------------------- */ +static int + dev_irnet_open(struct inode *, /* fs callback : open */ + struct file *), + dev_irnet_close(struct inode *, + struct file *); +static ssize_t + dev_irnet_write(struct file *, + const char __user *, + size_t, + loff_t *), + dev_irnet_read(struct file *, + char __user *, + size_t, + loff_t *); +static unsigned int + dev_irnet_poll(struct file *, + poll_table *); +static int + dev_irnet_ioctl(struct inode *, + struct file *, + unsigned int, + unsigned long); +/* ------------------------ PPP INTERFACE ------------------------ */ +static inline struct sk_buff * + irnet_prepare_skb(irnet_socket *, + struct sk_buff *); +static int + ppp_irnet_send(struct ppp_channel *, + 
struct sk_buff *); +static int + ppp_irnet_ioctl(struct ppp_channel *, + unsigned int, + unsigned long); + +/**************************** VARIABLES ****************************/ + +/* Filesystem callbacks (to call us) */ +static struct file_operations irnet_device_fops = +{ + .owner = THIS_MODULE, + .read = dev_irnet_read, + .write = dev_irnet_write, + .poll = dev_irnet_poll, + .ioctl = dev_irnet_ioctl, + .open = dev_irnet_open, + .release = dev_irnet_close + /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */ +}; + +/* Structure so that the misc major (drivers/char/misc.c) take care of us... */ +static struct miscdevice irnet_misc_device = +{ + IRNET_MINOR, + "irnet", + &irnet_device_fops +}; + +#endif /* IRNET_PPP_H */ diff --git a/net/irda/irproc.c b/net/irda/irproc.c new file mode 100644 index 000000000000..88b9c43f6370 --- /dev/null +++ b/net/irda/irproc.c @@ -0,0 +1,100 @@ +/********************************************************************* + * + * Filename: irproc.c + * Version: 1.0 + * Description: Various entries in the /proc file system + * Status: Experimental. + * Author: Thomas Davis, + * Created at: Sat Feb 21 21:33:24 1998 + * Modified at: Sun Nov 14 08:54:54 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-1999, Dag Brattli + * Copyright (c) 1998, Thomas Davis, , + * All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * I, Thomas Davis, provide no warranty for any of this software. + * This material is provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include +#include +#include + +extern struct file_operations discovery_seq_fops; +extern struct file_operations irlap_seq_fops; +extern struct file_operations irlmp_seq_fops; +extern struct file_operations irttp_seq_fops; +extern struct file_operations irias_seq_fops; + +struct irda_entry { + const char *name; + struct file_operations *fops; +}; + +struct proc_dir_entry *proc_irda; +EXPORT_SYMBOL(proc_irda); + +static struct irda_entry irda_dirs[] = { + {"discovery", &discovery_seq_fops}, + {"irttp", &irttp_seq_fops}, + {"irlmp", &irlmp_seq_fops}, + {"irlap", &irlap_seq_fops}, + {"irias", &irias_seq_fops}, +}; + +/* + * Function irda_proc_register (void) + * + * Register irda entry in /proc file system + * + */ +void __init irda_proc_register(void) +{ + int i; + struct proc_dir_entry *d; + + proc_irda = proc_mkdir("irda", proc_net); + if (proc_irda == NULL) + return; + proc_irda->owner = THIS_MODULE; + + for (i=0; iproc_fops = irda_dirs[i].fops; + } +} + +/* + * Function irda_proc_unregister (void) + * + * Unregister irda entry in /proc file system + * + */ +void __exit irda_proc_unregister(void) +{ + int i; + + if (proc_irda) { + for (i=0; i + * Created at: Tue Jun 9 13:29:31 1998 + * Modified at: Sun Dec 12 13:48:22 1999 + * Modified by: Dag Brattli + * Modified at: Thu Jan 4 14:29:10 CET 2001 + * Modified by: Marc Zyngier + * + * Copyright (C) 1998-1999, Aage Kvalnes + * Copyright (C) 1998, Dag Brattli, + * All Rights Reserved. + * + * This code is taken from the Vortex Operating System written by Aage + * Kvalnes. Aage has agreed that this code can use the GPL licence, + * although he does not use that licence in his own code. 
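The irnet_ppp.h header above hard-codes the official misc allocation for IrNET (character major 10, minor 187), matching the miscdevice registration and MODULE_ALIAS_CHARDEV(10, 187) in irnet_ppp.c. On a system without devfs or udev the node has to be created by hand; a minimal sketch, equivalent to "mknod /dev/irnet c 10 187":

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>	/* makedev(); historically via <sys/types.h> */

int main(void)
{
	return mknod("/dev/irnet", S_IFCHR | 0600, makedev(10, 187)) ? 1 : 0;
}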
+ * + * This copyright does however _not_ include the ELF hash() function + * which I currently don't know which licence or copyright it + * has. Please inform me if you know. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +/* + * NOTE : + * There are various problems with this package : + * o the hash function for ints is pathetic (but could be changed) + * o locking is sometime suspicious (especially during enumeration) + * o most users have only a few elements (== overhead) + * o most users never use seach, so don't benefit from hashing + * Problem already fixed : + * o not 64 bit compliant (most users do hashv = (int) self) + * o hashbin_remove() is broken => use hashbin_remove_this() + * I think most users would be better served by a simple linked list + * (like include/linux/list.h) with a global spinlock per list. + * Jean II + */ + +/* + * Notes on the concurrent access to hashbin and other SMP issues + * ------------------------------------------------------------- + * Hashbins are very often in the IrDA stack a global repository of + * information, and therefore used in a very asynchronous manner following + * various events (driver calls, timers, user calls...). + * Therefore, very often it is highly important to consider the + * management of concurrent access to the hashbin and how to guarantee the + * consistency of the operations on it. + * + * First, we need to define the objective of locking : + * 1) Protect user data (content pointed by the hashbin) + * 2) Protect hashbin structure itself (linked list in each bin) + * + * OLD LOCKING + * ----------- + * + * The previous locking strategy, either HB_LOCAL or HB_GLOBAL were + * both inadequate in *both* aspect. + * o HB_GLOBAL was using a spinlock for each bin (local locking). + * o HB_LOCAL was disabling irq on *all* CPUs, so use a single + * global semaphore. + * The problems were : + * A) Global irq disabling is no longer supported by the kernel + * B) No protection for the hashbin struct global data + * o hashbin_delete() + * o hb_current + * C) No protection for user data in some cases + * + * A) HB_LOCAL use global irq disabling, so doesn't work on kernel + * 2.5.X. Even when it is supported (kernel 2.4.X and earlier), its + * performance is not satisfactory on SMP setups. Most hashbins were + * HB_LOCAL, so (A) definitely need fixing. + * B) HB_LOCAL could be modified to fix (B). However, because HB_GLOBAL + * lock only the individual bins, it will never be able to lock the + * global data, so can't do (B). + * C) Some functions return pointer to data that is still in the + * hashbin : + * o hashbin_find() + * o hashbin_get_first() + * o hashbin_get_next() + * As the data is still in the hashbin, it may be changed or free'd + * while the caller is examinimg the data. In those case, locking can't + * be done within the hashbin, but must include use of the data within + * the caller. + * The caller can easily do this with HB_LOCAL (just disable irqs). 
+ * However, this is impossible with HB_GLOBAL because the caller has no + * way to know the proper bin, so don't know which spinlock to use. + * + * Quick summary : can no longer use HB_LOCAL, and HB_GLOBAL is + * fundamentally broken and will never work. + * + * NEW LOCKING + * ----------- + * + * To fix those problems, I've introduce a few changes in the + * hashbin locking : + * 1) New HB_LOCK scheme + * 2) hashbin->hb_spinlock + * 3) New hashbin usage policy + * + * HB_LOCK : + * ------- + * HB_LOCK is a locking scheme intermediate between the old HB_LOCAL + * and HB_GLOBAL. It uses a single spinlock to protect the whole content + * of the hashbin. As it is a single spinlock, it can protect the global + * data of the hashbin and not only the bins themselves. + * HB_LOCK can only protect some of the hashbin calls, so it only lock + * call that can be made 100% safe and leave other call unprotected. + * HB_LOCK in theory is slower than HB_GLOBAL, but as the hashbin + * content is always small contention is not high, so it doesn't matter + * much. HB_LOCK is probably faster than HB_LOCAL. + * + * hashbin->hb_spinlock : + * -------------------- + * The spinlock that HB_LOCK uses is available for caller, so that + * the caller can protect unprotected calls (see below). + * If the caller want to do entirely its own locking (HB_NOLOCK), he + * can do so and may use safely this spinlock. + * Locking is done like this : + * spin_lock_irqsave(&hashbin->hb_spinlock, flags); + * Releasing the lock : + * spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + * + * Safe & Protected calls : + * ---------------------- + * The following calls are safe or protected via HB_LOCK : + * o hashbin_new() -> safe + * o hashbin_delete() + * o hashbin_insert() + * o hashbin_remove_first() + * o hashbin_remove() + * o hashbin_remove_this() + * o HASHBIN_GET_SIZE() -> atomic + * + * The following calls only protect the hashbin itself : + * o hashbin_lock_find() + * o hashbin_find_next() + * + * Unprotected calls : + * ----------------- + * The following calls need to be protected by the caller : + * o hashbin_find() + * o hashbin_get_first() + * o hashbin_get_next() + * + * Locking Policy : + * -------------- + * If the hashbin is used only in a single thread of execution + * (explicitly or implicitely), you can use HB_NOLOCK + * If the calling module already provide concurrent access protection, + * you may use HB_NOLOCK. + * + * In all other cases, you need to use HB_LOCK and lock the hashbin + * every time before calling one of the unprotected calls. You also must + * use the pointer returned by the unprotected call within the locked + * region. + * + * Extra care for enumeration : + * -------------------------- + * hashbin_get_first() and hashbin_get_next() use the hashbin to + * store the current position, in hb_current. + * As long as the hashbin remains locked, this is safe. If you unlock + * the hashbin, the current position may change if anybody else modify + * or enumerate the hashbin. + * Summary : do the full enumeration while locked. + * + * Alternatively, you may use hashbin_find_next(). But, this will + * be slower, is more complex to use and doesn't protect the hashbin + * content. So, care is needed here as well. + * + * Other issues : + * ------------ + * I believe that we are overdoing it by using spin_lock_irqsave() + * and we should use only spin_lock_bh() or similar. But, I don't have + * the balls to try it out. 
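To make the enumeration policy above concrete, here is a minimal in-kernel sketch of a caller doing a full, locked enumeration of an HB_LOCK hashbin using the exported hb_spinlock; example_enumerate() is an illustrative name, not part of the patch.

#include <linux/spinlock.h>
#include <net/irda/irqueue.h>

/* Walk every entry of an HB_LOCK hashbin while holding hb_spinlock,
 * as required by hashbin_get_first()/hashbin_get_next(). */
static void example_enumerate(hashbin_t *hashbin)
{
	irda_queue_t *entry;
	unsigned long flags;

	spin_lock_irqsave(&hashbin->hb_spinlock, flags);

	for (entry = hashbin_get_first(hashbin);
	     entry != NULL;
	     entry = hashbin_get_next(hashbin)) {
		/* 'entry' may only be dereferenced while the lock is held */
	}

	spin_unlock_irqrestore(&hashbin->hb_spinlock, flags);
}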
+ * Don't believe that because hashbin are now (somewhat) SMP safe + * that the rest of the code is. Higher layers tend to be safest, + * but LAP and LMP would need some serious dedicated love. + * + * Jean II + */ +#include + +#include +#include + +/************************ QUEUE SUBROUTINES ************************/ + +/* + * Hashbin + */ +#define GET_HASHBIN(x) ( x & HASHBIN_MASK ) + +/* + * Function hash (name) + * + * This function hash the input string 'name' using the ELF hash + * function for strings. + */ +static __u32 hash( const char* name) +{ + __u32 h = 0; + __u32 g; + + while(*name) { + h = (h<<4) + *name++; + if ((g = (h & 0xf0000000))) + h ^=g>>24; + h &=~g; + } + return h; +} + +/* + * Function enqueue_first (queue, proc) + * + * Insert item first in queue. + * + */ +static void enqueue_first(irda_queue_t **queue, irda_queue_t* element) +{ + + IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); + + /* + * Check if queue is empty. + */ + if ( *queue == NULL ) { + /* + * Queue is empty. Insert one element into the queue. + */ + element->q_next = element->q_prev = *queue = element; + + } else { + /* + * Queue is not empty. Insert element into front of queue. + */ + element->q_next = (*queue); + (*queue)->q_prev->q_next = element; + element->q_prev = (*queue)->q_prev; + (*queue)->q_prev = element; + (*queue) = element; + } +} + + +/* + * Function dequeue (queue) + * + * Remove first entry in queue + * + */ +static irda_queue_t *dequeue_first(irda_queue_t **queue) +{ + irda_queue_t *ret; + + IRDA_DEBUG( 4, "dequeue_first()\n"); + + /* + * Set return value + */ + ret = *queue; + + if ( *queue == NULL ) { + /* + * Queue was empty. + */ + } else if ( (*queue)->q_next == *queue ) { + /* + * Queue only contained a single element. It will now be + * empty. + */ + *queue = NULL; + } else { + /* + * Queue contained several element. Remove the first one. + */ + (*queue)->q_prev->q_next = (*queue)->q_next; + (*queue)->q_next->q_prev = (*queue)->q_prev; + *queue = (*queue)->q_next; + } + + /* + * Return the removed entry (or NULL of queue was empty). + */ + return ret; +} + +/* + * Function dequeue_general (queue, element) + * + * + */ +static irda_queue_t *dequeue_general(irda_queue_t **queue, irda_queue_t* element) +{ + irda_queue_t *ret; + + IRDA_DEBUG( 4, "dequeue_general()\n"); + + /* + * Set return value + */ + ret = *queue; + + if ( *queue == NULL ) { + /* + * Queue was empty. + */ + } else if ( (*queue)->q_next == *queue ) { + /* + * Queue only contained a single element. It will now be + * empty. + */ + *queue = NULL; + + } else { + /* + * Remove specific element. + */ + element->q_prev->q_next = element->q_next; + element->q_next->q_prev = element->q_prev; + if ( (*queue) == element) + (*queue) = element->q_next; + } + + /* + * Return the removed entry (or NULL of queue was empty). + */ + return ret; +} + +/************************ HASHBIN MANAGEMENT ************************/ + +/* + * Function hashbin_create ( type, name ) + * + * Create hashbin! 
+ * + */ +hashbin_t *hashbin_new(int type) +{ + hashbin_t* hashbin; + + /* + * Allocate new hashbin + */ + hashbin = kmalloc( sizeof(hashbin_t), GFP_ATOMIC); + if (!hashbin) + return NULL; + + /* + * Initialize structure + */ + memset(hashbin, 0, sizeof(hashbin_t)); + hashbin->hb_type = type; + hashbin->magic = HB_MAGIC; + //hashbin->hb_current = NULL; + + /* Make sure all spinlock's are unlocked */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_init(&hashbin->hb_spinlock); + } + + return hashbin; +} +EXPORT_SYMBOL(hashbin_new); + + +/* + * Function hashbin_delete (hashbin, free_func) + * + * Destroy hashbin, the free_func can be a user supplied special routine + * for deallocating this structure if it's complex. If not the user can + * just supply kfree, which should take care of the job. + */ +int hashbin_delete( hashbin_t* hashbin, FREE_FUNC free_func) +{ + irda_queue_t* queue; + unsigned long flags = 0; + int i; + + IRDA_ASSERT(hashbin != NULL, return -1;); + IRDA_ASSERT(hashbin->magic == HB_MAGIC, return -1;); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } + + /* + * Free the entries in the hashbin, TODO: use hashbin_clear when + * it has been shown to work + */ + for (i = 0; i < HASHBIN_SIZE; i ++ ) { + queue = dequeue_first((irda_queue_t**) &hashbin->hb_queue[i]); + while (queue ) { + if (free_func) + (*free_func)(queue); + queue = dequeue_first( + (irda_queue_t**) &hashbin->hb_queue[i]); + } + } + + /* Cleanup local data */ + hashbin->hb_current = NULL; + hashbin->magic = ~HB_MAGIC; + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } + + /* + * Free the hashbin structure + */ + kfree(hashbin); + + return 0; +} +EXPORT_SYMBOL(hashbin_delete); + +/********************* HASHBIN LIST OPERATIONS *********************/ + +/* + * Function hashbin_insert (hashbin, entry, name) + * + * Insert an entry into the hashbin + * + */ +void hashbin_insert(hashbin_t* hashbin, irda_queue_t* entry, long hashv, + const char* name) +{ + unsigned long flags = 0; + int bin; + + IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT( hashbin != NULL, return;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return;); + + /* + * Locate hashbin + */ + if ( name ) + hashv = hash( name ); + bin = GET_HASHBIN( hashv ); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + /* + * Store name and key + */ + entry->q_hash = hashv; + if ( name ) + strlcpy( entry->q_name, name, sizeof(entry->q_name)); + + /* + * Insert new entry first + */ + enqueue_first( (irda_queue_t**) &hashbin->hb_queue[ bin ], + entry); + hashbin->hb_size++; + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ +} +EXPORT_SYMBOL(hashbin_insert); + +/* + * Function hashbin_remove_first (hashbin) + * + * Remove first entry of the hashbin + * + * Note : this function no longer use hashbin_remove(), but does things + * similar to hashbin_remove_this(), so can be considered safe. 
+ * Jean II + */ +void *hashbin_remove_first( hashbin_t *hashbin) +{ + unsigned long flags = 0; + irda_queue_t *entry = NULL; + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + entry = hashbin_get_first( hashbin); + if ( entry != NULL) { + int bin; + long hashv; + /* + * Locate hashbin + */ + hashv = entry->q_hash; + bin = GET_HASHBIN( hashv ); + + /* + * Dequeue the entry... + */ + dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ], + (irda_queue_t*) entry ); + hashbin->hb_size--; + entry->q_next = NULL; + entry->q_prev = NULL; + + /* + * Check if this item is the currently selected item, and in + * that case we must reset hb_current + */ + if ( entry == hashbin->hb_current) + hashbin->hb_current = NULL; + } + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + return entry; +} + + +/* + * Function hashbin_remove (hashbin, hashv, name) + * + * Remove entry with the given name + * + * The use of this function is highly discouraged, because the whole + * concept behind hashbin_remove() is broken. In many cases, it's not + * possible to guarantee the unicity of the index (either hashv or name), + * leading to removing the WRONG entry. + * The only simple safe use is : + * hashbin_remove(hasbin, (int) self, NULL); + * In other case, you must think hard to guarantee unicity of the index. + * Jean II + */ +void* hashbin_remove( hashbin_t* hashbin, long hashv, const char* name) +{ + int bin, found = FALSE; + unsigned long flags = 0; + irda_queue_t* entry; + + IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + /* + * Locate hashbin + */ + if ( name ) + hashv = hash( name ); + bin = GET_HASHBIN( hashv ); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + /* + * Search for entry + */ + entry = hashbin->hb_queue[ bin ]; + if ( entry ) { + do { + /* + * Check for key + */ + if ( entry->q_hash == hashv ) { + /* + * Name compare too? + */ + if ( name ) { + if ( strcmp( entry->q_name, name) == 0) + { + found = TRUE; + break; + } + } else { + found = TRUE; + break; + } + } + entry = entry->q_next; + } while ( entry != hashbin->hb_queue[ bin ] ); + } + + /* + * If entry was found, dequeue it + */ + if ( found ) { + dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ], + (irda_queue_t*) entry ); + hashbin->hb_size--; + + /* + * Check if this item is the currently selected item, and in + * that case we must reset hb_current + */ + if ( entry == hashbin->hb_current) + hashbin->hb_current = NULL; + } + + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + + /* Return */ + if ( found ) + return entry; + else + return NULL; + +} +EXPORT_SYMBOL(hashbin_remove); + +/* + * Function hashbin_remove_this (hashbin, entry) + * + * Remove entry with the given name + * + * In some cases, the user of hashbin can't guarantee the unicity + * of either the hashv or name. + * In those cases, using the above function is guaranteed to cause troubles, + * so we use this one instead... 
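A minimal sketch of the pattern recommended by the two comments above: index entries by their own pointer value on insert, and take them out with hashbin_remove_this() so no ambiguous search is ever needed. The struct example_entry type is purely illustrative; as elsewhere in the IrDA stack, the irda_queue_t must be the first member.

#include <net/irda/irqueue.h>

struct example_entry {
	irda_queue_t	q;	/* must be first: the queue overlays the entry */
	int		value;
};

static void example_insert_remove(hashbin_t *hashbin, struct example_entry *self)
{
	/* Index by the pointer itself, no name: unique by construction. */
	hashbin_insert(hashbin, (irda_queue_t *) self, (long) self, NULL);

	/* ... later: remove exactly this entry, skipping the search phase. */
	hashbin_remove_this(hashbin, (irda_queue_t *) self);
}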
+ * And by the way, it's also faster, because we skip the search phase ;-) + */ +void* hashbin_remove_this( hashbin_t* hashbin, irda_queue_t* entry) +{ + unsigned long flags = 0; + int bin; + long hashv; + + IRDA_DEBUG( 4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + IRDA_ASSERT( entry != NULL, return NULL;); + + /* Synchronize */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + /* Check if valid and not already removed... */ + if((entry->q_next == NULL) || (entry->q_prev == NULL)) { + entry = NULL; + goto out; + } + + /* + * Locate hashbin + */ + hashv = entry->q_hash; + bin = GET_HASHBIN( hashv ); + + /* + * Dequeue the entry... + */ + dequeue_general( (irda_queue_t**) &hashbin->hb_queue[ bin ], + (irda_queue_t*) entry ); + hashbin->hb_size--; + entry->q_next = NULL; + entry->q_prev = NULL; + + /* + * Check if this item is the currently selected item, and in + * that case we must reset hb_current + */ + if ( entry == hashbin->hb_current) + hashbin->hb_current = NULL; +out: + /* Release lock */ + if ( hashbin->hb_type & HB_LOCK ) { + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + } /* Default is no-lock */ + + return entry; +} +EXPORT_SYMBOL(hashbin_remove_this); + +/*********************** HASHBIN ENUMERATION ***********************/ + +/* + * Function hashbin_common_find (hashbin, hashv, name) + * + * Find item with the given hashv or name + * + */ +void* hashbin_find( hashbin_t* hashbin, long hashv, const char* name ) +{ + int bin; + irda_queue_t* entry; + + IRDA_DEBUG( 4, "hashbin_find()\n"); + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + /* + * Locate hashbin + */ + if ( name ) + hashv = hash( name ); + bin = GET_HASHBIN( hashv ); + + /* + * Search for entry + */ + entry = hashbin->hb_queue[ bin]; + if ( entry ) { + do { + /* + * Check for key + */ + if ( entry->q_hash == hashv ) { + /* + * Name compare too? + */ + if ( name ) { + if ( strcmp( entry->q_name, name ) == 0 ) { + return entry; + } + } else { + return entry; + } + } + entry = entry->q_next; + } while ( entry != hashbin->hb_queue[ bin ] ); + } + + return NULL; +} +EXPORT_SYMBOL(hashbin_find); + +/* + * Function hashbin_lock_find (hashbin, hashv, name) + * + * Find item with the given hashv or name + * + * Same, but with spinlock protection... + * I call it safe, but it's only safe with respect to the hashbin, not its + * content. - Jean II + */ +void* hashbin_lock_find( hashbin_t* hashbin, long hashv, const char* name ) +{ + unsigned long flags = 0; + irda_queue_t* entry; + + /* Synchronize */ + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + + /* + * Search for entry + */ + entry = (irda_queue_t* ) hashbin_find( hashbin, hashv, name ); + + /* Release lock */ + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + + return entry; +} +EXPORT_SYMBOL(hashbin_lock_find); + +/* + * Function hashbin_find (hashbin, hashv, name, pnext) + * + * Find an item with the given hashv or name, and its successor + * + * This function allow to do concurrent enumerations without the + * need to lock over the whole session, because the caller keep the + * context of the search. On the other hand, it might fail and return + * NULL if the entry is removed. 
- Jean II + */ +void* hashbin_find_next( hashbin_t* hashbin, long hashv, const char* name, + void ** pnext) +{ + unsigned long flags = 0; + irda_queue_t* entry; + + /* Synchronize */ + spin_lock_irqsave(&hashbin->hb_spinlock, flags); + + /* + * Search for current entry + * This allow to check if the current item is still in the + * hashbin or has been removed. + */ + entry = (irda_queue_t* ) hashbin_find( hashbin, hashv, name ); + + /* + * Trick hashbin_get_next() to return what we want + */ + if(entry) { + hashbin->hb_current = entry; + *pnext = hashbin_get_next( hashbin ); + } else + *pnext = NULL; + + /* Release lock */ + spin_unlock_irqrestore(&hashbin->hb_spinlock, flags); + + return entry; +} +EXPORT_SYMBOL(hashbin_find_next); + +/* + * Function hashbin_get_first (hashbin) + * + * Get a pointer to first element in hashbin, this function must be + * called before any calls to hashbin_get_next()! + * + */ +irda_queue_t *hashbin_get_first( hashbin_t* hashbin) +{ + irda_queue_t *entry; + int i; + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + if ( hashbin == NULL) + return NULL; + + for ( i = 0; i < HASHBIN_SIZE; i ++ ) { + entry = hashbin->hb_queue[ i]; + if ( entry) { + hashbin->hb_current = entry; + return entry; + } + } + /* + * Did not find any item in hashbin + */ + return NULL; +} +EXPORT_SYMBOL(hashbin_get_first); + +/* + * Function hashbin_get_next (hashbin) + * + * Get next item in hashbin. A series of hashbin_get_next() calls must + * be started by a call to hashbin_get_first(). The function returns + * NULL when all items have been traversed + * + * The context of the search is stored within the hashbin, so you must + * protect yourself from concurrent enumerations. - Jean II + */ +irda_queue_t *hashbin_get_next( hashbin_t *hashbin) +{ + irda_queue_t* entry; + int bin; + int i; + + IRDA_ASSERT( hashbin != NULL, return NULL;); + IRDA_ASSERT( hashbin->magic == HB_MAGIC, return NULL;); + + if ( hashbin->hb_current == NULL) { + IRDA_ASSERT( hashbin->hb_current != NULL, return NULL;); + return NULL; + } + entry = hashbin->hb_current->q_next; + bin = GET_HASHBIN( entry->q_hash); + + /* + * Make sure that we are not back at the beginning of the queue + * again + */ + if ( entry != hashbin->hb_queue[ bin ]) { + hashbin->hb_current = entry; + + return entry; + } + + /* + * Check that this is not the last queue in hashbin + */ + if ( bin >= HASHBIN_SIZE) + return NULL; + + /* + * Move to next queue in hashbin + */ + bin++; + for ( i = bin; i < HASHBIN_SIZE; i++ ) { + entry = hashbin->hb_queue[ i]; + if ( entry) { + hashbin->hb_current = entry; + + return entry; + } + } + return NULL; +} +EXPORT_SYMBOL(hashbin_get_next); diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c new file mode 100644 index 000000000000..1b1c4193359a --- /dev/null +++ b/net/irda/irsysctl.c @@ -0,0 +1,297 @@ +/********************************************************************* + * + * Filename: irsysctl.c + * Version: 1.0 + * Description: Sysctl interface for IrDA + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sun May 24 22:12:06 1998 + * Modified at: Fri Jun 4 02:50:15 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli, All Rights Reserved. 
+ * Copyright (c) 2000-2001 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. + * + ********************************************************************/ + +#include +#include +#include +#include +#include + +#include /* irda_debug */ +#include + +#define NET_IRDA 412 /* Random number */ +enum { DISCOVERY=1, DEVNAME, DEBUG, FAST_POLL, DISCOVERY_SLOTS, + DISCOVERY_TIMEOUT, SLOT_TIMEOUT, MAX_BAUD_RATE, MIN_TX_TURN_TIME, + MAX_TX_DATA_SIZE, MAX_TX_WINDOW, MAX_NOREPLY_TIME, WARN_NOREPLY_TIME, + LAP_KEEPALIVE_TIME }; + +extern int sysctl_discovery; +extern int sysctl_discovery_slots; +extern int sysctl_discovery_timeout; +extern int sysctl_slot_timeout; +extern int sysctl_fast_poll_increase; +extern char sysctl_devname[]; +extern int sysctl_max_baud_rate; +extern int sysctl_min_tx_turn_time; +extern int sysctl_max_tx_data_size; +extern int sysctl_max_tx_window; +extern int sysctl_max_noreply_time; +extern int sysctl_warn_noreply_time; +extern int sysctl_lap_keepalive_time; + +/* this is needed for the proc_dointvec_minmax - Jean II */ +static int max_discovery_slots = 16; /* ??? */ +static int min_discovery_slots = 1; +/* IrLAP 6.13.2 says 25ms to 10+70ms - allow higher since some devices + * seems to require it. (from Dag's comment) */ +static int max_slot_timeout = 160; +static int min_slot_timeout = 20; +static int max_max_baud_rate = 16000000; /* See qos.c - IrLAP spec */ +static int min_max_baud_rate = 2400; +static int max_min_tx_turn_time = 10000; /* See qos.c - IrLAP spec */ +static int min_min_tx_turn_time; +static int max_max_tx_data_size = 2048; /* See qos.c - IrLAP spec */ +static int min_max_tx_data_size = 64; +static int max_max_tx_window = 7; /* See qos.c - IrLAP spec */ +static int min_max_tx_window = 1; +static int max_max_noreply_time = 40; /* See qos.c - IrLAP spec */ +static int min_max_noreply_time = 3; +static int max_warn_noreply_time = 3; /* 3s == standard */ +static int min_warn_noreply_time = 1; /* 1s == min WD_TIMER */ +static int max_lap_keepalive_time = 10000; /* 10s */ +static int min_lap_keepalive_time = 100; /* 100us */ +/* For other sysctl, I've no idea of the range. 
Maybe Dag could help + * us on that - Jean II */ + +static int do_devname(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dostring(table, write, filp, buffer, lenp, ppos); + if (ret == 0 && write) { + struct ias_value *val; + + val = irias_new_string_value(sysctl_devname); + if (val) + irias_object_change_attribute("Device", "DeviceName", val); + } + return ret; +} + +/* One file */ +static ctl_table irda_table[] = { + { + .ctl_name = DISCOVERY, + .procname = "discovery", + .data = &sysctl_discovery, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = DEVNAME, + .procname = "devname", + .data = sysctl_devname, + .maxlen = 65, + .mode = 0644, + .proc_handler = &do_devname, + .strategy = &sysctl_string + }, +#ifdef CONFIG_IRDA_DEBUG + { + .ctl_name = DEBUG, + .procname = "debug", + .data = &irda_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif +#ifdef CONFIG_IRDA_FAST_RR + { + .ctl_name = FAST_POLL, + .procname = "fast_poll_increase", + .data = &sysctl_fast_poll_increase, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif + { + .ctl_name = DISCOVERY_SLOTS, + .procname = "discovery_slots", + .data = &sysctl_discovery_slots, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_discovery_slots, + .extra2 = &max_discovery_slots + }, + { + .ctl_name = DISCOVERY_TIMEOUT, + .procname = "discovery_timeout", + .data = &sysctl_discovery_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = SLOT_TIMEOUT, + .procname = "slot_timeout", + .data = &sysctl_slot_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_timeout, + .extra2 = &max_slot_timeout + }, + { + .ctl_name = MAX_BAUD_RATE, + .procname = "max_baud_rate", + .data = &sysctl_max_baud_rate, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_baud_rate, + .extra2 = &max_max_baud_rate + }, + { + .ctl_name = MIN_TX_TURN_TIME, + .procname = "min_tx_turn_time", + .data = &sysctl_min_tx_turn_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_min_tx_turn_time, + .extra2 = &max_min_tx_turn_time + }, + { + .ctl_name = MAX_TX_DATA_SIZE, + .procname = "max_tx_data_size", + .data = &sysctl_max_tx_data_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_tx_data_size, + .extra2 = &max_max_tx_data_size + }, + { + .ctl_name = MAX_TX_WINDOW, + .procname = "max_tx_window", + .data = &sysctl_max_tx_window, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_tx_window, + .extra2 = &max_max_tx_window + }, + { + .ctl_name = MAX_NOREPLY_TIME, + .procname = "max_noreply_time", + .data = &sysctl_max_noreply_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_max_noreply_time, + .extra2 = &max_max_noreply_time + }, + { + .ctl_name = WARN_NOREPLY_TIME, + .procname = "warn_noreply_time", + .data = &sysctl_warn_noreply_time, + .maxlen = sizeof(int), + 
.mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_warn_noreply_time, + .extra2 = &max_warn_noreply_time + }, + { + .ctl_name = LAP_KEEPALIVE_TIME, + .procname = "lap_keepalive_time", + .data = &sysctl_lap_keepalive_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_lap_keepalive_time, + .extra2 = &max_lap_keepalive_time + }, + { .ctl_name = 0 } +}; + +/* One directory */ +static ctl_table irda_net_table[] = { + { + .ctl_name = NET_IRDA, + .procname = "irda", + .maxlen = 0, + .mode = 0555, + .child = irda_table + }, + { .ctl_name = 0 } +}; + +/* The parent directory */ +static ctl_table irda_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .maxlen = 0, + .mode = 0555, + .child = irda_net_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header *irda_table_header; + +/* + * Function irda_sysctl_register (void) + * + * Register our sysctl interface + * + */ +int __init irda_sysctl_register(void) +{ + irda_table_header = register_sysctl_table(irda_root_table, 0); + if (!irda_table_header) + return -ENOMEM; + + return 0; +} + +/* + * Function irda_sysctl_unregister (void) + * + * Unregister our sysctl interface + * + */ +void __exit irda_sysctl_unregister(void) +{ + unregister_sysctl_table(irda_table_header); +} + + + diff --git a/net/irda/irttp.c b/net/irda/irttp.c new file mode 100644 index 000000000000..d091ccf773b3 --- /dev/null +++ b/net/irda/irttp.c @@ -0,0 +1,1912 @@ +/********************************************************************* + * + * Filename: irttp.c + * Version: 1.2 + * Description: Tiny Transport Protocol (TTP) implementation + * Status: Stable + * Author: Dag Brattli + * Created at: Sun Aug 31 20:14:31 1997 + * Modified at: Wed Jan 5 11:31:27 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2003 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
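The table above surfaces the IrDA tunables as files under /proc/sys/net/irda, with proc_dointvec_minmax clamping each value to the ranges listed earlier. A minimal user-space sketch of driving one of them (the path assumes procfs is mounted at /proc):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/irda/discovery", "w");

	if (!f) {
		perror("discovery sysctl");
		return 1;
	}
	fputs("1\n", f);	/* enable periodic discovery */
	fclose(f);
	return 0;
}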
+ * + ********************************************************************/ + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +static struct irttp_cb *irttp = NULL; + +static void __irttp_close_tsap(struct tsap_cb *self); + +static int irttp_data_indication(void *instance, void *sap, + struct sk_buff *skb); +static int irttp_udata_indication(void *instance, void *sap, + struct sk_buff *skb); +static void irttp_disconnect_indication(void *instance, void *sap, + LM_REASON reason, struct sk_buff *); +static void irttp_connect_indication(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 header_size, struct sk_buff *skb); +static void irttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, __u32 max_sdu_size, + __u8 header_size, struct sk_buff *skb); +static void irttp_run_tx_queue(struct tsap_cb *self); +static void irttp_run_rx_queue(struct tsap_cb *self); + +static void irttp_flush_queues(struct tsap_cb *self); +static void irttp_fragment_skb(struct tsap_cb *self, struct sk_buff *skb); +static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self); +static void irttp_todo_expired(unsigned long data); +static int irttp_param_max_sdu_size(void *instance, irda_param_t *param, + int get); + +static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow); +static void irttp_status_indication(void *instance, + LINK_STATUS link, LOCK_STATUS lock); + +/* Information for parsing parameters in IrTTP */ +static pi_minor_info_t pi_minor_call_table[] = { + { NULL, 0 }, /* 0x00 */ + { irttp_param_max_sdu_size, PV_INTEGER | PV_BIG_ENDIAN } /* 0x01 */ +}; +static pi_major_info_t pi_major_call_table[] = {{ pi_minor_call_table, 2 }}; +static pi_param_info_t param_info = { pi_major_call_table, 1, 0x0f, 4 }; + +/************************ GLOBAL PROCEDURES ************************/ + +/* + * Function irttp_init (void) + * + * Initialize the IrTTP layer. Called by module initialization code + * + */ +int __init irttp_init(void) +{ + /* Initialize the irttp structure. */ + if (irttp == NULL) { + irttp = kmalloc(sizeof(struct irttp_cb), GFP_KERNEL); + if (irttp == NULL) + return -ENOMEM; + } + memset(irttp, 0, sizeof(struct irttp_cb)); + + irttp->magic = TTP_MAGIC; + + irttp->tsaps = hashbin_new(HB_LOCK); + if (!irttp->tsaps) { + IRDA_ERROR("%s: can't allocate IrTTP hashbin!\n", + __FUNCTION__); + return -ENOMEM; + } + + return 0; +} + +/* + * Function irttp_cleanup (void) + * + * Called by module destruction/cleanup code + * + */ +void __exit irttp_cleanup(void) +{ + /* Check for main structure */ + IRDA_ASSERT(irttp != NULL, return;); + IRDA_ASSERT(irttp->magic == TTP_MAGIC, return;); + + /* + * Delete hashbin and close all TSAP instances in it + */ + hashbin_delete(irttp->tsaps, (FREE_FUNC) __irttp_close_tsap); + + irttp->magic = 0; + + /* De-allocate main structure */ + kfree(irttp); + + irttp = NULL; +} + +/*************************** SUBROUTINES ***************************/ + +/* + * Function irttp_start_todo_timer (self, timeout) + * + * Start todo timer. + * + * Made it more effient and unsensitive to race conditions - Jean II + */ +static inline void irttp_start_todo_timer(struct tsap_cb *self, int timeout) +{ + /* Set new value for timer */ + mod_timer(&self->todo_timer, jiffies + timeout); +} + +/* + * Function irttp_todo_expired (data) + * + * Todo timer has expired! 
+ * + * One of the restriction of the timer is that it is run only on the timer + * interrupt which run every 10ms. This mean that even if you set the timer + * with a delay of 0, it may take up to 10ms before it's run. + * So, to minimise latency and keep cache fresh, we try to avoid using + * it as much as possible. + * Note : we can't use tasklets, because they can't be asynchronously + * killed (need user context), and we can't guarantee that here... + * Jean II + */ +static void irttp_todo_expired(unsigned long data) +{ + struct tsap_cb *self = (struct tsap_cb *) data; + + /* Check that we still exist */ + if (!self || self->magic != TTP_TSAP_MAGIC) + return; + + IRDA_DEBUG(4, "%s(instance=%p)\n", __FUNCTION__, self); + + /* Try to make some progress, especially on Tx side - Jean II */ + irttp_run_rx_queue(self); + irttp_run_tx_queue(self); + + /* Check if time for disconnect */ + if (test_bit(0, &self->disconnect_pend)) { + /* Check if it's possible to disconnect yet */ + if (skb_queue_empty(&self->tx_queue)) { + /* Make sure disconnect is not pending anymore */ + clear_bit(0, &self->disconnect_pend); /* FALSE */ + + /* Note : self->disconnect_skb may be NULL */ + irttp_disconnect_request(self, self->disconnect_skb, + P_NORMAL); + self->disconnect_skb = NULL; + } else { + /* Try again later */ + irttp_start_todo_timer(self, HZ/10); + + /* No reason to try and close now */ + return; + } + } + + /* Check if it's closing time */ + if (self->close_pend) + /* Finish cleanup */ + irttp_close_tsap(self); +} + +/* + * Function irttp_flush_queues (self) + * + * Flushes (removes all frames) in transitt-buffer (tx_list) + */ +void irttp_flush_queues(struct tsap_cb *self) +{ + struct sk_buff* skb; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + /* Deallocate frames waiting to be sent */ + while ((skb = skb_dequeue(&self->tx_queue)) != NULL) + dev_kfree_skb(skb); + + /* Deallocate received frames */ + while ((skb = skb_dequeue(&self->rx_queue)) != NULL) + dev_kfree_skb(skb); + + /* Deallocate received fragments */ + while ((skb = skb_dequeue(&self->rx_fragments)) != NULL) + dev_kfree_skb(skb); +} + +/* + * Function irttp_reassemble (self) + * + * Makes a new (continuous) skb of all the fragments in the fragment + * queue + * + */ +static struct sk_buff *irttp_reassemble_skb(struct tsap_cb *self) +{ + struct sk_buff *skb, *frag; + int n = 0; /* Fragment index */ + + IRDA_ASSERT(self != NULL, return NULL;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return NULL;); + + IRDA_DEBUG(2, "%s(), self->rx_sdu_size=%d\n", __FUNCTION__, + self->rx_sdu_size); + + skb = dev_alloc_skb(TTP_HEADER + self->rx_sdu_size); + if (!skb) + return NULL; + + /* + * Need to reserve space for TTP header in case this skb needs to + * be requeued in case delivery failes + */ + skb_reserve(skb, TTP_HEADER); + skb_put(skb, self->rx_sdu_size); + + /* + * Copy all fragments to a new buffer + */ + while ((frag = skb_dequeue(&self->rx_fragments)) != NULL) { + memcpy(skb->data+n, frag->data, frag->len); + n += frag->len; + + dev_kfree_skb(frag); + } + + IRDA_DEBUG(2, + "%s(), frame len=%d, rx_sdu_size=%d, rx_max_sdu_size=%d\n", + __FUNCTION__, n, self->rx_sdu_size, self->rx_max_sdu_size); + /* Note : irttp_run_rx_queue() calculate self->rx_sdu_size + * by summing the size of all fragments, so we should always + * have n == self->rx_sdu_size, except in cases where we + * droped the last fragment (when self->rx_sdu_size exceed + * 
self->rx_max_sdu_size), where n < self->rx_sdu_size. + * Jean II */ + IRDA_ASSERT(n <= self->rx_sdu_size, n = self->rx_sdu_size;); + + /* Set the new length */ + skb_trim(skb, n); + + self->rx_sdu_size = 0; + + return skb; +} + +/* + * Function irttp_fragment_skb (skb) + * + * Fragments a frame and queues all the fragments for transmission + * + */ +static inline void irttp_fragment_skb(struct tsap_cb *self, + struct sk_buff *skb) +{ + struct sk_buff *frag; + __u8 *frame; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + /* + * Split frame into a number of segments + */ + while (skb->len > self->max_seg_size) { + IRDA_DEBUG(2, "%s(), fragmenting ...\n", __FUNCTION__); + + /* Make new segment */ + frag = dev_alloc_skb(self->max_seg_size+self->max_header_size); + if (!frag) + return; + + skb_reserve(frag, self->max_header_size); + + /* Copy data from the original skb into this fragment. */ + memcpy(skb_put(frag, self->max_seg_size), skb->data, + self->max_seg_size); + + /* Insert TTP header, with the more bit set */ + frame = skb_push(frag, TTP_HEADER); + frame[0] = TTP_MORE; + + /* Hide the copied data from the original skb */ + skb_pull(skb, self->max_seg_size); + + /* Queue fragment */ + skb_queue_tail(&self->tx_queue, frag); + } + /* Queue what is left of the original skb */ + IRDA_DEBUG(2, "%s(), queuing last segment\n", __FUNCTION__); + + frame = skb_push(skb, TTP_HEADER); + frame[0] = 0x00; /* Clear more bit */ + + /* Queue fragment */ + skb_queue_tail(&self->tx_queue, skb); +} + +/* + * Function irttp_param_max_sdu_size (self, param) + * + * Handle the MaxSduSize parameter in the connect frames, this function + * will be called both when this parameter needs to be inserted into, and + * extracted from the connect frames + */ +static int irttp_param_max_sdu_size(void *instance, irda_param_t *param, + int get) +{ + struct tsap_cb *self; + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->tx_max_sdu_size; + else + self->tx_max_sdu_size = param->pv.i; + + IRDA_DEBUG(1, "%s(), MaxSduSize=%d\n", __FUNCTION__, param->pv.i); + + return 0; +} + +/*************************** CLIENT CALLS ***************************/ +/************************** LMP CALLBACKS **************************/ +/* Everything is happily mixed up. Waiting for next clean up - Jean II */ + +/* + * Function irttp_open_tsap (stsap, notify) + * + * Create TSAP connection endpoint, + */ +struct tsap_cb *irttp_open_tsap(__u8 stsap_sel, int credit, notify_t *notify) +{ + struct tsap_cb *self; + struct lsap_cb *lsap; + notify_t ttp_notify; + + IRDA_ASSERT(irttp != NULL, return NULL;); + IRDA_ASSERT(irttp->magic == TTP_MAGIC, return NULL;); + + /* The IrLMP spec (IrLMP 1.1 p10) says that we have the right to + * use only 0x01-0x6F. Of course, we can use LSAP_ANY as well. 
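As a quick illustration of the selector rule just described (the wildcard, or a fixed selector in 0x01-0x6F), here is a self-contained check that mirrors the test irttp_open_tsap() performs below. LSAP_ANY's numeric value comes from the IrLMP headers; 0xff is used here only as a stand-in:

#include <stdio.h>

#define LSAP_ANY 0xff   /* stand-in; the real value lives in the IrLMP headers */

/* Mirror of the validity test in irttp_open_tsap(): either the wildcard
 * selector, or a fixed selector in the range 0x01-0x6F. */
static int stsap_sel_is_valid(unsigned int sel)
{
        return sel == LSAP_ANY || (sel >= 0x01 && sel < 0x70);
}

int main(void)
{
        unsigned int samples[] = { 0x00, 0x01, 0x6f, 0x70, LSAP_ANY };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("0x%02x -> %s\n", samples[i],
                       stsap_sel_is_valid(samples[i]) ? "valid" : "invalid");
        return 0;
}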
+ * JeanII */ + if((stsap_sel != LSAP_ANY) && + ((stsap_sel < 0x01) || (stsap_sel >= 0x70))) { + IRDA_DEBUG(0, "%s(), invalid tsap!\n", __FUNCTION__); + return NULL; + } + + self = kmalloc(sizeof(struct tsap_cb), GFP_ATOMIC); + if (self == NULL) { + IRDA_DEBUG(0, "%s(), unable to kmalloc!\n", __FUNCTION__); + return NULL; + } + memset(self, 0, sizeof(struct tsap_cb)); + spin_lock_init(&self->lock); + + /* Initialise todo timer */ + init_timer(&self->todo_timer); + self->todo_timer.data = (unsigned long) self; + self->todo_timer.function = &irttp_todo_expired; + + /* Initialize callbacks for IrLMP to use */ + irda_notify_init(&ttp_notify); + ttp_notify.connect_confirm = irttp_connect_confirm; + ttp_notify.connect_indication = irttp_connect_indication; + ttp_notify.disconnect_indication = irttp_disconnect_indication; + ttp_notify.data_indication = irttp_data_indication; + ttp_notify.udata_indication = irttp_udata_indication; + ttp_notify.flow_indication = irttp_flow_indication; + if(notify->status_indication != NULL) + ttp_notify.status_indication = irttp_status_indication; + ttp_notify.instance = self; + strncpy(ttp_notify.name, notify->name, NOTIFY_MAX_NAME); + + self->magic = TTP_TSAP_MAGIC; + self->connected = FALSE; + + skb_queue_head_init(&self->rx_queue); + skb_queue_head_init(&self->tx_queue); + skb_queue_head_init(&self->rx_fragments); + /* + * Create LSAP at IrLMP layer + */ + lsap = irlmp_open_lsap(stsap_sel, &ttp_notify, 0); + if (lsap == NULL) { + IRDA_WARNING("%s: unable to allocate LSAP!!\n", __FUNCTION__); + return NULL; + } + + /* + * If user specified LSAP_ANY as source TSAP selector, then IrLMP + * will replace it with whatever source selector which is free, so + * the stsap_sel we have might not be valid anymore + */ + self->stsap_sel = lsap->slsap_sel; + IRDA_DEBUG(4, "%s(), stsap_sel=%02x\n", __FUNCTION__, self->stsap_sel); + + self->notify = *notify; + self->lsap = lsap; + + hashbin_insert(irttp->tsaps, (irda_queue_t *) self, (long) self, NULL); + + if (credit > TTP_RX_MAX_CREDIT) + self->initial_credit = TTP_RX_MAX_CREDIT; + else + self->initial_credit = credit; + + return self; +} +EXPORT_SYMBOL(irttp_open_tsap); + +/* + * Function irttp_close (handle) + * + * Remove an instance of a TSAP. This function should only deal with the + * deallocation of the TSAP, and resetting of the TSAPs values; + * + */ +static void __irttp_close_tsap(struct tsap_cb *self) +{ + /* First make sure we're connected. */ + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + irttp_flush_queues(self); + + del_timer(&self->todo_timer); + + /* This one won't be cleaned up if we are disconnect_pend + close_pend + * and we receive a disconnect_indication */ + if (self->disconnect_skb) + dev_kfree_skb(self->disconnect_skb); + + self->connected = FALSE; + self->magic = ~TTP_TSAP_MAGIC; + + kfree(self); +} + +/* + * Function irttp_close (self) + * + * Remove TSAP from list of all TSAPs and then deallocate all resources + * associated with this TSAP + * + * Note : because we *free* the tsap structure, it is the responsibility + * of the caller to make sure we are called only once and to deal with + * possible race conditions. 
- Jean II + */ +int irttp_close_tsap(struct tsap_cb *self) +{ + struct tsap_cb *tsap; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + /* Make sure tsap has been disconnected */ + if (self->connected) { + /* Check if disconnect is not pending */ + if (!test_bit(0, &self->disconnect_pend)) { + IRDA_WARNING("%s: TSAP still connected!\n", + __FUNCTION__); + irttp_disconnect_request(self, NULL, P_NORMAL); + } + self->close_pend = TRUE; + irttp_start_todo_timer(self, HZ/10); + + return 0; /* Will be back! */ + } + + tsap = hashbin_remove(irttp->tsaps, (long) self, NULL); + + IRDA_ASSERT(tsap == self, return -1;); + + /* Close corresponding LSAP */ + if (self->lsap) { + irlmp_close_lsap(self->lsap); + self->lsap = NULL; + } + + __irttp_close_tsap(self); + + return 0; +} +EXPORT_SYMBOL(irttp_close_tsap); + +/* + * Function irttp_udata_request (self, skb) + * + * Send unreliable data on this TSAP + * + */ +int irttp_udata_request(struct tsap_cb *self, struct sk_buff *skb) +{ + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + /* Check that nothing bad happens */ + if ((skb->len == 0) || (!self->connected)) { + IRDA_DEBUG(1, "%s(), No data, or not connected\n", + __FUNCTION__); + goto err; + } + + if (skb->len > self->max_seg_size) { + IRDA_DEBUG(1, "%s(), UData is to large for IrLAP!\n", + __FUNCTION__); + goto err; + } + + irlmp_udata_request(self->lsap, skb); + self->stats.tx_packets++; + + return 0; + +err: + dev_kfree_skb(skb); + return -1; +} +EXPORT_SYMBOL(irttp_udata_request); + + +/* + * Function irttp_data_request (handle, skb) + * + * Queue frame for transmission. If SAR is enabled, fragement the frame + * and queue the fragments for transmission + */ +int irttp_data_request(struct tsap_cb *self, struct sk_buff *skb) +{ + __u8 *frame; + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + IRDA_DEBUG(2, "%s() : queue len = %d\n", __FUNCTION__, + skb_queue_len(&self->tx_queue)); + + /* Check that nothing bad happens */ + if ((skb->len == 0) || (!self->connected)) { + IRDA_WARNING("%s: No data, or not connected\n", __FUNCTION__); + ret = -ENOTCONN; + goto err; + } + + /* + * Check if SAR is disabled, and the frame is larger than what fits + * inside an IrLAP frame + */ + if ((self->tx_max_sdu_size == 0) && (skb->len > self->max_seg_size)) { + IRDA_ERROR("%s: SAR disabled, and data is to large for IrLAP!\n", + __FUNCTION__); + ret = -EMSGSIZE; + goto err; + } + + /* + * Check if SAR is enabled, and the frame is larger than the + * TxMaxSduSize + */ + if ((self->tx_max_sdu_size != 0) && + (self->tx_max_sdu_size != TTP_SAR_UNBOUND) && + (skb->len > self->tx_max_sdu_size)) + { + IRDA_ERROR("%s: SAR enabled, but data is larger than TxMaxSduSize!\n", + __FUNCTION__); + ret = -EMSGSIZE; + goto err; + } + /* + * Check if transmit queue is full + */ + if (skb_queue_len(&self->tx_queue) >= TTP_TX_MAX_QUEUE) { + /* + * Give it a chance to empty itself + */ + irttp_run_tx_queue(self); + + /* Drop packet. 
This error code should trigger the caller + * to resend the data in the client code - Jean II */ + ret = -ENOBUFS; + goto err; + } + + /* Queue frame, or queue frame segments */ + if ((self->tx_max_sdu_size == 0) || (skb->len < self->max_seg_size)) { + /* Queue frame */ + IRDA_ASSERT(skb_headroom(skb) >= TTP_HEADER, return -1;); + frame = skb_push(skb, TTP_HEADER); + frame[0] = 0x00; /* Clear more bit */ + + skb_queue_tail(&self->tx_queue, skb); + } else { + /* + * Fragment the frame, this function will also queue the + * fragments, we don't care about the fact the transmit + * queue may be overfilled by all the segments for a little + * while + */ + irttp_fragment_skb(self, skb); + } + + /* Check if we can accept more data from client */ + if ((!self->tx_sdu_busy) && + (skb_queue_len(&self->tx_queue) > TTP_TX_HIGH_THRESHOLD)) { + /* Tx queue filling up, so stop client. */ + if (self->notify.flow_indication) { + self->notify.flow_indication(self->notify.instance, + self, FLOW_STOP); + } + /* self->tx_sdu_busy is the state of the client. + * Update state after notifying client to avoid + * race condition with irttp_flow_indication(). + * If the queue empty itself after our test but before + * we set the flag, we will fix ourselves below in + * irttp_run_tx_queue(). + * Jean II */ + self->tx_sdu_busy = TRUE; + } + + /* Try to make some progress */ + irttp_run_tx_queue(self); + + return 0; + +err: + dev_kfree_skb(skb); + return ret; +} +EXPORT_SYMBOL(irttp_data_request); + +/* + * Function irttp_run_tx_queue (self) + * + * Transmit packets queued for transmission (if possible) + * + */ +static void irttp_run_tx_queue(struct tsap_cb *self) +{ + struct sk_buff *skb; + unsigned long flags; + int n; + + IRDA_DEBUG(2, "%s() : send_credit = %d, queue_len = %d\n", + __FUNCTION__, + self->send_credit, skb_queue_len(&self->tx_queue)); + + /* Get exclusive access to the tx queue, otherwise don't touch it */ + if (irda_lock(&self->tx_queue_lock) == FALSE) + return; + + /* Try to send out frames as long as we have credits + * and as long as LAP is not full. If LAP is full, it will + * poll us through irttp_flow_indication() - Jean II */ + while ((self->send_credit > 0) && + (!irlmp_lap_tx_queue_full(self->lsap)) && + (skb = skb_dequeue(&self->tx_queue))) + { + /* + * Since we can transmit and receive frames concurrently, + * the code below is a critical region and we must assure that + * nobody messes with the credits while we update them. + */ + spin_lock_irqsave(&self->lock, flags); + + n = self->avail_credit; + self->avail_credit = 0; + + /* Only room for 127 credits in frame */ + if (n > 127) { + self->avail_credit = n-127; + n = 127; + } + self->remote_credit += n; + self->send_credit--; + + spin_unlock_irqrestore(&self->lock, flags); + + /* + * More bit must be set by the data_request() or fragment() + * functions + */ + skb->data[0] |= (n & 0x7f); + + /* Detach from socket. + * The current skb has a reference to the socket that sent + * it (skb->sk). When we pass it to IrLMP, the skb will be + * stored in in IrLAP (self->wx_list). When we are within + * IrLAP, we lose the notion of socket, so we should not + * have a reference to a socket. So, we drop it here. + * + * Why does it matter ? + * When the skb is freed (kfree_skb), if it is associated + * with a socket, it release buffer space on the socket + * (through sock_wfree() and sock_def_write_space()). + * If the socket no longer exist, we may crash. Hard. + * When we close a socket, we make sure that associated packets + * in IrTTP are freed. 
However, we have no way to cancel + * the packet that we have passed to IrLAP. So, if a packet + * remains in IrLAP (retry on the link or else) after we + * close the socket, we are dead ! + * Jean II */ + if (skb->sk != NULL) { + /* IrSOCK application, IrOBEX, ... */ + skb_orphan(skb); + } + /* IrCOMM over IrTTP, IrLAN, ... */ + + /* Pass the skb to IrLMP - done */ + irlmp_data_request(self->lsap, skb); + self->stats.tx_packets++; + } + + /* Check if we can accept more frames from client. + * We don't want to wait until the todo timer to do that, and we + * can't use tasklets (grr...), so we are obliged to give control + * to client. That's ok, this test will be true not too often + * (max once per LAP window) and we are called from places + * where we can spend a bit of time doing stuff. - Jean II */ + if ((self->tx_sdu_busy) && + (skb_queue_len(&self->tx_queue) < TTP_TX_LOW_THRESHOLD) && + (!self->close_pend)) + { + if (self->notify.flow_indication) + self->notify.flow_indication(self->notify.instance, + self, FLOW_START); + + /* self->tx_sdu_busy is the state of the client. + * We don't really have a race here, but it's always safer + * to update our state after the client - Jean II */ + self->tx_sdu_busy = FALSE; + } + + /* Reset lock */ + self->tx_queue_lock = 0; +} + +/* + * Function irttp_give_credit (self) + * + * Send a dataless flowdata TTP-PDU and give available credit to peer + * TSAP + */ +static inline void irttp_give_credit(struct tsap_cb *self) +{ + struct sk_buff *tx_skb = NULL; + unsigned long flags; + int n; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + IRDA_DEBUG(4, "%s() send=%d,avail=%d,remote=%d\n", + __FUNCTION__, + self->send_credit, self->avail_credit, self->remote_credit); + + /* Give credit to peer */ + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return; + + /* Reserve space for LMP, and LAP header */ + skb_reserve(tx_skb, self->max_header_size); + + /* + * Since we can transmit and receive frames concurrently, + * the code below is a critical region and we must assure that + * nobody messes with the credits while we update them. + */ + spin_lock_irqsave(&self->lock, flags); + + n = self->avail_credit; + self->avail_credit = 0; + + /* Only space for 127 credits in frame */ + if (n > 127) { + self->avail_credit = n - 127; + n = 127; + } + self->remote_credit += n; + + spin_unlock_irqrestore(&self->lock, flags); + + skb_put(tx_skb, 1); + tx_skb->data[0] = (__u8) (n & 0x7f); + + irlmp_data_request(self->lsap, tx_skb); + self->stats.tx_packets++; +} + +/* + * Function irttp_udata_indication (instance, sap, skb) + * + * Received some unit-data (unreliable) + * + */ +static int irttp_udata_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct tsap_cb *self; + int err; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + IRDA_ASSERT(skb != NULL, return -1;); + + self->stats.rx_packets++; + + /* Just pass data to layer above */ + if (self->notify.udata_indication) { + err = self->notify.udata_indication(self->notify.instance, + self,skb); + /* Same comment as in irttp_do_data_indication() */ + if (!err) + return 0; + } + /* Either no handler, or handler returns an error */ + dev_kfree_skb(skb); + + return 0; +} + +/* + * Function irttp_data_indication (instance, sap, skb) + * + * Receive segment from IrLMP. 
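The single TTP data header byte that irttp_data_indication() is about to parse carries two things: bit 7 is the "more" (fragmentation) flag and bits 6..0 are the delta-credit granted to the peer, exactly as the transmit side packed it above with frame[0] = TTP_MORE or 0x00 followed by data[0] |= (n & 0x7f). A self-contained pack/unpack sketch, taking TTP_MORE as 0x80 since that is what the receive side's & 0x80 test implies:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define TTP_MORE 0x80   /* bit 7: more fragments follow (implied by the rx & 0x80 test) */

/* Build the one-byte TTP data header: 'more' flag plus a 0..127 credit delta. */
static uint8_t ttp_pack_header(int more, unsigned int credits)
{
        assert(credits <= 127); /* only 7 bits of credit fit in one frame */
        return (more ? TTP_MORE : 0x00) | (credits & 0x7f);
}

int main(void)
{
        uint8_t hdr = ttp_pack_header(1, 5);

        /* Receiver side, mirroring irttp_data_indication()/irttp_run_rx_queue(). */
        unsigned int credits = hdr & 0x7f;
        int more = hdr & 0x80;

        printf("header=0x%02x credits=%u more=%s\n",
               hdr, credits, more ? "yes" : "no");
        return 0;
}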
+ * + */ +static int irttp_data_indication(void *instance, void *sap, + struct sk_buff *skb) +{ + struct tsap_cb *self; + unsigned long flags; + int n; + + self = (struct tsap_cb *) instance; + + n = skb->data[0] & 0x7f; /* Extract the credits */ + + self->stats.rx_packets++; + + /* Deal with inbound credit + * Since we can transmit and receive frames concurrently, + * the code below is a critical region and we must assure that + * nobody messes with the credits while we update them. + */ + spin_lock_irqsave(&self->lock, flags); + self->send_credit += n; + if (skb->len > 1) + self->remote_credit--; + spin_unlock_irqrestore(&self->lock, flags); + + /* + * Data or dataless packet? Dataless frames contains only the + * TTP_HEADER. + */ + if (skb->len > 1) { + /* + * We don't remove the TTP header, since we must preserve the + * more bit, so the defragment routing knows what to do + */ + skb_queue_tail(&self->rx_queue, skb); + } else { + /* Dataless flowdata TTP-PDU */ + dev_kfree_skb(skb); + } + + + /* Push data to the higher layer. + * We do it synchronously because running the todo timer for each + * receive packet would be too much overhead and latency. + * By passing control to the higher layer, we run the risk that + * it may take time or grab a lock. Most often, the higher layer + * will only put packet in a queue. + * Anyway, packets are only dripping through the IrDA, so we can + * have time before the next packet. + * Further, we are run from NET_BH, so the worse that can happen is + * us missing the optimal time to send back the PF bit in LAP. + * Jean II */ + irttp_run_rx_queue(self); + + /* We now give credits to peer in irttp_run_rx_queue(). + * We need to send credit *NOW*, otherwise we are going + * to miss the next Tx window. The todo timer may take + * a while before it's run... - Jean II */ + + /* + * If the peer device has given us some credits and we didn't have + * anyone from before, then we need to shedule the tx queue. + * We need to do that because our Tx have stopped (so we may not + * get any LAP flow indication) and the user may be stopped as + * well. - Jean II + */ + if (self->send_credit == n) { + /* Restart pushing stuff to LAP */ + irttp_run_tx_queue(self); + /* Note : we don't want to schedule the todo timer + * because it has horrible latency. No tasklets + * because the tasklet API is broken. - Jean II */ + } + + return 0; +} + +/* + * Function irttp_status_indication (self, reason) + * + * Status_indication, just pass to the higher layer... + * + */ +static void irttp_status_indication(void *instance, + LINK_STATUS link, LOCK_STATUS lock) +{ + struct tsap_cb *self; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + /* Check if client has already closed the TSAP and gone away */ + if (self->close_pend) + return; + + /* + * Inform service user if he has requested it + */ + if (self->notify.status_indication != NULL) + self->notify.status_indication(self->notify.instance, + link, lock); + else + IRDA_DEBUG(2, "%s(), no handler\n", __FUNCTION__); +} + +/* + * Function irttp_flow_indication (self, reason) + * + * Flow_indication : IrLAP tells us to send more data. 
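The client throttling seen above in irttp_data_request() and irttp_run_tx_queue() is a classic high/low watermark scheme: stop the producer once the tx queue grows past a high threshold, and restart it only after it drains below a lower one, so the two notifications do not flap. A small stand-alone model of that hysteresis; the threshold values below are placeholders, the real TTP_TX_HIGH_THRESHOLD/TTP_TX_LOW_THRESHOLD constants come from the IrTTP headers:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder watermarks standing in for TTP_TX_HIGH_THRESHOLD and
 * TTP_TX_LOW_THRESHOLD. */
#define HIGH_THRESHOLD 12
#define LOW_THRESHOLD   4

struct tx_model {
        int queue_len;
        bool client_stopped;    /* mirrors tx_sdu_busy */
};

static void enqueue(struct tx_model *m)
{
        m->queue_len++;
        if (!m->client_stopped && m->queue_len > HIGH_THRESHOLD) {
                m->client_stopped = true;       /* FLOW_STOP to the client */
                printf("queue=%d -> FLOW_STOP\n", m->queue_len);
        }
}

static void dequeue(struct tx_model *m)
{
        if (m->queue_len > 0)
                m->queue_len--;
        if (m->client_stopped && m->queue_len < LOW_THRESHOLD) {
                m->client_stopped = false;      /* FLOW_START to the client */
                printf("queue=%d -> FLOW_START\n", m->queue_len);
        }
}

int main(void)
{
        struct tx_model m = { 0, false };
        int i;

        for (i = 0; i < 15; i++)        /* fill past the high watermark */
                enqueue(&m);
        for (i = 0; i < 15; i++)        /* drain below the low watermark */
                dequeue(&m);
        return 0;
}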
+ * + */ +static void irttp_flow_indication(void *instance, void *sap, LOCAL_FLOW flow) +{ + struct tsap_cb *self; + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + IRDA_DEBUG(4, "%s(instance=%p)\n", __FUNCTION__, self); + + /* We are "polled" directly from LAP, and the LAP want to fill + * its Tx window. We want to do our best to send it data, so that + * we maximise the window. On the other hand, we want to limit the + * amount of work here so that LAP doesn't hang forever waiting + * for packets. - Jean II */ + + /* Try to send some packets. Currently, LAP calls us every time + * there is one free slot, so we will send only one packet. + * This allow the scheduler to do its round robin - Jean II */ + irttp_run_tx_queue(self); + + /* Note regarding the interraction with higher layer. + * irttp_run_tx_queue() may call the client when its queue + * start to empty, via notify.flow_indication(). Initially. + * I wanted this to happen in a tasklet, to avoid client + * grabbing the CPU, but we can't use tasklets safely. And timer + * is definitely too slow. + * This will happen only once per LAP window, and usually at + * the third packet (unless window is smaller). LAP is still + * doing mtt and sending first packet so it's sort of OK + * to do that. Jean II */ + + /* If we need to send disconnect. try to do it now */ + if(self->disconnect_pend) + irttp_start_todo_timer(self, 0); +} + +/* + * Function irttp_flow_request (self, command) + * + * This function could be used by the upper layers to tell IrTTP to stop + * delivering frames if the receive queues are starting to get full, or + * to tell IrTTP to start delivering frames again. + */ +void irttp_flow_request(struct tsap_cb *self, LOCAL_FLOW flow) +{ + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + switch (flow) { + case FLOW_STOP: + IRDA_DEBUG(1, "%s(), flow stop\n", __FUNCTION__); + self->rx_sdu_busy = TRUE; + break; + case FLOW_START: + IRDA_DEBUG(1, "%s(), flow start\n", __FUNCTION__); + self->rx_sdu_busy = FALSE; + + /* Client say he can accept more data, try to free our + * queues ASAP - Jean II */ + irttp_run_rx_queue(self); + + break; + default: + IRDA_DEBUG(1, "%s(), Unknown flow command!\n", __FUNCTION__); + } +} +EXPORT_SYMBOL(irttp_flow_request); + +/* + * Function irttp_connect_request (self, dtsap_sel, daddr, qos) + * + * Try to connect to remote destination TSAP selector + * + */ +int irttp_connect_request(struct tsap_cb *self, __u8 dtsap_sel, + __u32 saddr, __u32 daddr, + struct qos_info *qos, __u32 max_sdu_size, + struct sk_buff *userdata) +{ + struct sk_buff *tx_skb; + __u8 *frame; + __u8 n; + + IRDA_DEBUG(4, "%s(), max_sdu_size=%d\n", __FUNCTION__, max_sdu_size); + + IRDA_ASSERT(self != NULL, return -EBADR;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -EBADR;); + + if (self->connected) { + if(userdata) + dev_kfree_skb(userdata); + return -EISCONN; + } + + /* Any userdata supplied? 
*/ + if (userdata == NULL) { + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(tx_skb, TTP_MAX_HEADER); + } else { + tx_skb = userdata; + /* + * Check that the client has reserved enough space for + * headers + */ + IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER, + { dev_kfree_skb(userdata); return -1; } ); + } + + /* Initialize connection parameters */ + self->connected = FALSE; + self->avail_credit = 0; + self->rx_max_sdu_size = max_sdu_size; + self->rx_sdu_size = 0; + self->rx_sdu_busy = FALSE; + self->dtsap_sel = dtsap_sel; + + n = self->initial_credit; + + self->remote_credit = 0; + self->send_credit = 0; + + /* + * Give away max 127 credits for now + */ + if (n > 127) { + self->avail_credit=n-127; + n = 127; + } + + self->remote_credit = n; + + /* SAR enabled? */ + if (max_sdu_size > 0) { + IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER), + { dev_kfree_skb(tx_skb); return -1; } ); + + /* Insert SAR parameters */ + frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER); + + frame[0] = TTP_PARAMETERS | n; + frame[1] = 0x04; /* Length */ + frame[2] = 0x01; /* MaxSduSize */ + frame[3] = 0x02; /* Value length */ + + put_unaligned(cpu_to_be16((__u16) max_sdu_size), + (__u16 *)(frame+4)); + } else { + /* Insert plain TTP header */ + frame = skb_push(tx_skb, TTP_HEADER); + + /* Insert initial credit in frame */ + frame[0] = n & 0x7f; + } + + /* Connect with IrLMP. No QoS parameters for now */ + return irlmp_connect_request(self->lsap, dtsap_sel, saddr, daddr, qos, + tx_skb); +} +EXPORT_SYMBOL(irttp_connect_request); + +/* + * Function irttp_connect_confirm (handle, qos, skb) + * + * Sevice user confirms TSAP connection with peer. + * + */ +static void irttp_connect_confirm(void *instance, void *sap, + struct qos_info *qos, __u32 max_seg_size, + __u8 max_header_size, struct sk_buff *skb) +{ + struct tsap_cb *self; + int parameters; + int ret; + __u8 plen; + __u8 n; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + self->max_seg_size = max_seg_size - TTP_HEADER; + self->max_header_size = max_header_size + TTP_HEADER; + + /* + * Check if we have got some QoS parameters back! This should be the + * negotiated QoS for the link. + */ + if (qos) { + IRDA_DEBUG(4, "IrTTP, Negotiated BAUD_RATE: %02x\n", + qos->baud_rate.bits); + IRDA_DEBUG(4, "IrTTP, Negotiated BAUD_RATE: %d bps.\n", + qos->baud_rate.value); + } + + n = skb->data[0] & 0x7f; + + IRDA_DEBUG(4, "%s(), Initial send_credit=%d\n", __FUNCTION__, n); + + self->send_credit = n; + self->tx_max_sdu_size = 0; + self->connected = TRUE; + + parameters = skb->data[0] & 0x80; + + IRDA_ASSERT(skb->len >= TTP_HEADER, return;); + skb_pull(skb, TTP_HEADER); + + if (parameters) { + plen = skb->data[0]; + + ret = irda_param_extract_all(self, skb->data+1, + IRDA_MIN(skb->len-1, plen), + ¶m_info); + + /* Any errors in the parameter list? 
*/ + if (ret < 0) { + IRDA_WARNING("%s: error extracting parameters\n", + __FUNCTION__); + dev_kfree_skb(skb); + + /* Do not accept this connection attempt */ + return; + } + /* Remove parameters */ + skb_pull(skb, IRDA_MIN(skb->len, plen+1)); + } + + IRDA_DEBUG(4, "%s() send=%d,avail=%d,remote=%d\n", __FUNCTION__, + self->send_credit, self->avail_credit, self->remote_credit); + + IRDA_DEBUG(2, "%s(), MaxSduSize=%d\n", __FUNCTION__, + self->tx_max_sdu_size); + + if (self->notify.connect_confirm) { + self->notify.connect_confirm(self->notify.instance, self, qos, + self->tx_max_sdu_size, + self->max_header_size, skb); + } else + dev_kfree_skb(skb); +} + +/* + * Function irttp_connect_indication (handle, skb) + * + * Some other device is connecting to this TSAP + * + */ +void irttp_connect_indication(void *instance, void *sap, struct qos_info *qos, + __u32 max_seg_size, __u8 max_header_size, + struct sk_buff *skb) +{ + struct tsap_cb *self; + struct lsap_cb *lsap; + int parameters; + int ret; + __u8 plen; + __u8 n; + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + IRDA_ASSERT(skb != NULL, return;); + + lsap = (struct lsap_cb *) sap; + + self->max_seg_size = max_seg_size - TTP_HEADER; + self->max_header_size = max_header_size+TTP_HEADER; + + IRDA_DEBUG(4, "%s(), TSAP sel=%02x\n", __FUNCTION__, self->stsap_sel); + + /* Need to update dtsap_sel if its equal to LSAP_ANY */ + self->dtsap_sel = lsap->dlsap_sel; + + n = skb->data[0] & 0x7f; + + self->send_credit = n; + self->tx_max_sdu_size = 0; + + parameters = skb->data[0] & 0x80; + + IRDA_ASSERT(skb->len >= TTP_HEADER, return;); + skb_pull(skb, TTP_HEADER); + + if (parameters) { + plen = skb->data[0]; + + ret = irda_param_extract_all(self, skb->data+1, + IRDA_MIN(skb->len-1, plen), + ¶m_info); + + /* Any errors in the parameter list? */ + if (ret < 0) { + IRDA_WARNING("%s: error extracting parameters\n", + __FUNCTION__); + dev_kfree_skb(skb); + + /* Do not accept this connection attempt */ + return; + } + + /* Remove parameters */ + skb_pull(skb, IRDA_MIN(skb->len, plen+1)); + } + + if (self->notify.connect_indication) { + self->notify.connect_indication(self->notify.instance, self, + qos, self->tx_max_sdu_size, + self->max_header_size, skb); + } else + dev_kfree_skb(skb); +} + +/* + * Function irttp_connect_response (handle, userdata) + * + * Service user is accepting the connection, just pass it down to + * IrLMP! + * + */ +int irttp_connect_response(struct tsap_cb *self, __u32 max_sdu_size, + struct sk_buff *userdata) +{ + struct sk_buff *tx_skb; + __u8 *frame; + int ret; + __u8 n; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + IRDA_DEBUG(4, "%s(), Source TSAP selector=%02x\n", __FUNCTION__, + self->stsap_sel); + + /* Any userdata supplied? 
*/ + if (userdata == NULL) { + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return -ENOMEM; + + /* Reserve space for MUX_CONTROL and LAP header */ + skb_reserve(tx_skb, TTP_MAX_HEADER); + } else { + tx_skb = userdata; + /* + * Check that the client has reserved enough space for + * headers + */ + IRDA_ASSERT(skb_headroom(userdata) >= TTP_MAX_HEADER, + { dev_kfree_skb(userdata); return -1; } ); + } + + self->avail_credit = 0; + self->remote_credit = 0; + self->rx_max_sdu_size = max_sdu_size; + self->rx_sdu_size = 0; + self->rx_sdu_busy = FALSE; + + n = self->initial_credit; + + /* Frame has only space for max 127 credits (7 bits) */ + if (n > 127) { + self->avail_credit = n - 127; + n = 127; + } + + self->remote_credit = n; + self->connected = TRUE; + + /* SAR enabled? */ + if (max_sdu_size > 0) { + IRDA_ASSERT(skb_headroom(tx_skb) >= (TTP_MAX_HEADER + TTP_SAR_HEADER), + { dev_kfree_skb(tx_skb); return -1; } ); + + /* Insert TTP header with SAR parameters */ + frame = skb_push(tx_skb, TTP_HEADER+TTP_SAR_HEADER); + + frame[0] = TTP_PARAMETERS | n; + frame[1] = 0x04; /* Length */ + + /* irda_param_insert(self, IRTTP_MAX_SDU_SIZE, frame+1, */ +/* TTP_SAR_HEADER, ¶m_info) */ + + frame[2] = 0x01; /* MaxSduSize */ + frame[3] = 0x02; /* Value length */ + + put_unaligned(cpu_to_be16((__u16) max_sdu_size), + (__u16 *)(frame+4)); + } else { + /* Insert TTP header */ + frame = skb_push(tx_skb, TTP_HEADER); + + frame[0] = n & 0x7f; + } + + ret = irlmp_connect_response(self->lsap, tx_skb); + + return ret; +} +EXPORT_SYMBOL(irttp_connect_response); + +/* + * Function irttp_dup (self, instance) + * + * Duplicate TSAP, can be used by servers to confirm a connection on a + * new TSAP so it can keep listening on the old one. + */ +struct tsap_cb *irttp_dup(struct tsap_cb *orig, void *instance) +{ + struct tsap_cb *new; + unsigned long flags; + + IRDA_DEBUG(1, "%s()\n", __FUNCTION__); + + /* Protect our access to the old tsap instance */ + spin_lock_irqsave(&irttp->tsaps->hb_spinlock, flags); + + /* Find the old instance */ + if (!hashbin_find(irttp->tsaps, (long) orig, NULL)) { + IRDA_DEBUG(0, "%s(), unable to find TSAP\n", __FUNCTION__); + spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); + return NULL; + } + + /* Allocate a new instance */ + new = kmalloc(sizeof(struct tsap_cb), GFP_ATOMIC); + if (!new) { + IRDA_DEBUG(0, "%s(), unable to kmalloc\n", __FUNCTION__); + spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); + return NULL; + } + /* Dup */ + memcpy(new, orig, sizeof(struct tsap_cb)); + + /* We don't need the old instance any more */ + spin_unlock_irqrestore(&irttp->tsaps->hb_spinlock, flags); + + /* Try to dup the LSAP (may fail if we were too slow) */ + new->lsap = irlmp_dup(orig->lsap, new); + if (!new->lsap) { + IRDA_DEBUG(0, "%s(), dup failed!\n", __FUNCTION__); + kfree(new); + return NULL; + } + + /* Not everything should be copied */ + new->notify.instance = instance; + init_timer(&new->todo_timer); + + skb_queue_head_init(&new->rx_queue); + skb_queue_head_init(&new->tx_queue); + skb_queue_head_init(&new->rx_fragments); + + /* This is locked */ + hashbin_insert(irttp->tsaps, (irda_queue_t *) new, (long) new, NULL); + + return new; +} +EXPORT_SYMBOL(irttp_dup); + +/* + * Function irttp_disconnect_request (self) + * + * Close this connection please! 
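Before the disconnect path that follows, it helps to picture the connect PDU that irttp_connect_request() and irttp_connect_response() above build when SAR is enabled: one flags/credit byte (bit 7 flags a parameter list, the low 7 bits carry the initial credit), then the parameter-list length, then the MaxSduSize parameter as pi=0x01, pl=0x02 and a big-endian 16-bit value. A self-contained sketch of that 6-byte header; TTP_PARAMETERS is taken as 0x80, matching the & 0x80 test in irttp_connect_confirm():

#include <stdint.h>
#include <stdio.h>

#define TTP_PARAMETERS 0x80     /* bit 7 of the first byte: a parameter list follows */

/* Build the TTP connect header used when SAR is enabled, mirroring the
 * frame[] assignments in irttp_connect_request()/irttp_connect_response(). */
static void build_connect_header(uint8_t *frame, unsigned int credits,
                                 uint16_t max_sdu_size)
{
        frame[0] = TTP_PARAMETERS | (credits & 0x7f);   /* flags + initial credit */
        frame[1] = 0x04;                                /* parameter list length */
        frame[2] = 0x01;                                /* pi: MaxSduSize */
        frame[3] = 0x02;                                /* pl: two value bytes */
        frame[4] = (uint8_t)(max_sdu_size >> 8);        /* pv, big endian */
        frame[5] = (uint8_t)(max_sdu_size & 0xff);
}

int main(void)
{
        uint8_t frame[6];
        int i;

        build_connect_header(frame, 14, 2048);
        for (i = 0; i < 6; i++)
                printf("%02x ", frame[i]);
        printf("\n");   /* expected: 8e 04 01 02 08 00 */
        return 0;
}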
If priority is high, the queued data + * segments, if any, will be deallocated first + * + */ +int irttp_disconnect_request(struct tsap_cb *self, struct sk_buff *userdata, + int priority) +{ + int ret; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return -1;); + + /* Already disconnected? */ + if (!self->connected) { + IRDA_DEBUG(4, "%s(), already disconnected!\n", __FUNCTION__); + if (userdata) + dev_kfree_skb(userdata); + return -1; + } + + /* Disconnect already pending ? + * We need to use an atomic operation to prevent reentry. This + * function may be called from various context, like user, timer + * for following a disconnect_indication() (i.e. net_bh). + * Jean II */ + if(test_and_set_bit(0, &self->disconnect_pend)) { + IRDA_DEBUG(0, "%s(), disconnect already pending\n", + __FUNCTION__); + if (userdata) + dev_kfree_skb(userdata); + + /* Try to make some progress */ + irttp_run_tx_queue(self); + return -1; + } + + /* + * Check if there is still data segments in the transmit queue + */ + if (skb_queue_len(&self->tx_queue) > 0) { + if (priority == P_HIGH) { + /* + * No need to send the queued data, if we are + * disconnecting right now since the data will + * not have any usable connection to be sent on + */ + IRDA_DEBUG(1, "%s(): High priority!!()\n", __FUNCTION__); + irttp_flush_queues(self); + } else if (priority == P_NORMAL) { + /* + * Must delay disconnect until after all data segments + * have been sent and the tx_queue is empty + */ + /* We'll reuse this one later for the disconnect */ + self->disconnect_skb = userdata; /* May be NULL */ + + irttp_run_tx_queue(self); + + irttp_start_todo_timer(self, HZ/10); + return -1; + } + } + /* Note : we don't need to check if self->rx_queue is full and the + * state of self->rx_sdu_busy because the disconnect response will + * be sent at the LMP level (so even if the peer has its Tx queue + * full of data). - Jean II */ + + IRDA_DEBUG(1, "%s(), Disconnecting ...\n", __FUNCTION__); + self->connected = FALSE; + + if (!userdata) { + struct sk_buff *tx_skb; + tx_skb = dev_alloc_skb(64); + if (!tx_skb) + return -ENOMEM; + + /* + * Reserve space for MUX and LAP header + */ + skb_reserve(tx_skb, TTP_MAX_HEADER); + + userdata = tx_skb; + } + ret = irlmp_disconnect_request(self->lsap, userdata); + + /* The disconnect is no longer pending */ + clear_bit(0, &self->disconnect_pend); /* FALSE */ + + return ret; +} +EXPORT_SYMBOL(irttp_disconnect_request); + +/* + * Function irttp_disconnect_indication (self, reason) + * + * Disconnect indication, TSAP disconnected by peer? + * + */ +void irttp_disconnect_indication(void *instance, void *sap, LM_REASON reason, + struct sk_buff *skb) +{ + struct tsap_cb *self; + + IRDA_DEBUG(4, "%s()\n", __FUNCTION__); + + self = (struct tsap_cb *) instance; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == TTP_TSAP_MAGIC, return;); + + /* Prevent higher layer to send more data */ + self->connected = FALSE; + + /* Check if client has already tried to close the TSAP */ + if (self->close_pend) { + /* In this case, the higher layer is probably gone. Don't + * bother it and clean up the remains - Jean II */ + if (skb) + dev_kfree_skb(skb); + irttp_close_tsap(self); + return; + } + + /* If we are here, we assume that is the higher layer is still + * waiting for the disconnect notification and able to process it, + * even if he tried to disconnect. Otherwise, it would have already + * attempted to close the tsap and self->close_pend would be TRUE. 
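The test_and_set_bit() guard in irttp_disconnect_request() above is the standard way of making "do this once, from whichever context gets here first" safe without taking a lock. The same idea expressed with portable C11 atomics rather than the kernel bitops, purely as an illustration:

#include <stdatomic.h>
#include <stdio.h>

static atomic_flag disconnect_pend = ATOMIC_FLAG_INIT;

/* Returns 0 the first time it wins the race, -1 on any re-entry,
 * mirroring the early-return path of irttp_disconnect_request(). */
static int request_disconnect(void)
{
        if (atomic_flag_test_and_set(&disconnect_pend)) {
                printf("disconnect already pending\n");
                return -1;
        }
        printf("disconnect now pending, doing the work\n");
        /* ... send the disconnect, then atomic_flag_clear() once done ... */
        return 0;
}

int main(void)
{
        request_disconnect();   /* wins the race */
        request_disconnect();   /* sees the pending flag and backs off */
        return 0;
}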
+ * Jean II */ + + /* No need to notify the client if has already tried to disconnect */ + if(self->notify.disconnect_indication) + self->notify.disconnect_indication(self->notify.instance, self, + reason, skb); + else + if (skb) + dev_kfree_skb(skb); +} + +/* + * Function irttp_do_data_indication (self, skb) + * + * Try to deliver reassembled skb to layer above, and requeue it if that + * for some reason should fail. We mark rx sdu as busy to apply back + * pressure is necessary. + */ +static void irttp_do_data_indication(struct tsap_cb *self, struct sk_buff *skb) +{ + int err; + + /* Check if client has already closed the TSAP and gone away */ + if (self->close_pend) { + dev_kfree_skb(skb); + return; + } + + err = self->notify.data_indication(self->notify.instance, self, skb); + + /* Usually the layer above will notify that it's input queue is + * starting to get filled by using the flow request, but this may + * be difficult, so it can instead just refuse to eat it and just + * give an error back + */ + if (err) { + IRDA_DEBUG(0, "%s() requeueing skb!\n", __FUNCTION__); + + /* Make sure we take a break */ + self->rx_sdu_busy = TRUE; + + /* Need to push the header in again */ + skb_push(skb, TTP_HEADER); + skb->data[0] = 0x00; /* Make sure MORE bit is cleared */ + + /* Put skb back on queue */ + skb_queue_head(&self->rx_queue, skb); + } +} + +/* + * Function irttp_run_rx_queue (self) + * + * Check if we have any frames to be transmitted, or if we have any + * available credit to give away. + */ +void irttp_run_rx_queue(struct tsap_cb *self) +{ + struct sk_buff *skb; + int more = 0; + + IRDA_DEBUG(2, "%s() send=%d,avail=%d,remote=%d\n", __FUNCTION__, + self->send_credit, self->avail_credit, self->remote_credit); + + /* Get exclusive access to the rx queue, otherwise don't touch it */ + if (irda_lock(&self->rx_queue_lock) == FALSE) + return; + + /* + * Reassemble all frames in receive queue and deliver them + */ + while (!self->rx_sdu_busy && (skb = skb_dequeue(&self->rx_queue))) { + /* This bit will tell us if it's the last fragment or not */ + more = skb->data[0] & 0x80; + + /* Remove TTP header */ + skb_pull(skb, TTP_HEADER); + + /* Add the length of the remaining data */ + self->rx_sdu_size += skb->len; + + /* + * If SAR is disabled, or user has requested no reassembly + * of received fragments then we just deliver them + * immediately. This can be requested by clients that + * implements byte streams without any message boundaries + */ + if (self->rx_max_sdu_size == TTP_SAR_DISABLE) { + irttp_do_data_indication(self, skb); + self->rx_sdu_size = 0; + + continue; + } + + /* Check if this is a fragment, and not the last fragment */ + if (more) { + /* + * Queue the fragment if we still are within the + * limits of the maximum size of the rx_sdu + */ + if (self->rx_sdu_size <= self->rx_max_sdu_size) { + IRDA_DEBUG(4, "%s(), queueing frag\n", + __FUNCTION__); + skb_queue_tail(&self->rx_fragments, skb); + } else { + /* Free the part of the SDU that is too big */ + dev_kfree_skb(skb); + } + continue; + } + /* + * This is the last fragment, so time to reassemble! + */ + if ((self->rx_sdu_size <= self->rx_max_sdu_size) || + (self->rx_max_sdu_size == TTP_SAR_UNBOUND)) + { + /* + * A little optimizing. Only queue the fragment if + * there are other fragments. 
Since if this is the + * last and only fragment, there is no need to + * reassemble :-) + */ + if (!skb_queue_empty(&self->rx_fragments)) { + skb_queue_tail(&self->rx_fragments, + skb); + + skb = irttp_reassemble_skb(self); + } + + /* Now we can deliver the reassembled skb */ + irttp_do_data_indication(self, skb); + } else { + IRDA_DEBUG(1, "%s(), Truncated frame\n", __FUNCTION__); + + /* Free the part of the SDU that is too big */ + dev_kfree_skb(skb); + + /* Deliver only the valid but truncated part of SDU */ + skb = irttp_reassemble_skb(self); + + irttp_do_data_indication(self, skb); + } + self->rx_sdu_size = 0; + } + + /* + * It's not trivial to keep track of how many credits are available + * by incrementing at each packet, because delivery may fail + * (irttp_do_data_indication() may requeue the frame) and because + * we need to take care of fragmentation. + * We want the other side to send up to initial_credit packets. + * We have some frames in our queues, and we have already allowed it + * to send remote_credit. + * No need to spinlock, write is atomic and self correcting... + * Jean II + */ + self->avail_credit = (self->initial_credit - + (self->remote_credit + + skb_queue_len(&self->rx_queue) + + skb_queue_len(&self->rx_fragments))); + + /* Do we have too much credits to send to peer ? */ + if ((self->remote_credit <= TTP_RX_MIN_CREDIT) && + (self->avail_credit > 0)) { + /* Send explicit credit frame */ + irttp_give_credit(self); + /* Note : do *NOT* check if tx_queue is non-empty, that + * will produce deadlocks. I repeat : send a credit frame + * even if we have something to send in our Tx queue. + * If we have credits, it means that our Tx queue is blocked. + * + * Let's suppose the peer can't keep up with our Tx. He will + * flow control us by not sending us any credits, and we + * will stop Tx and start accumulating credits here. + * Up to the point where the peer will stop its Tx queue, + * for lack of credits. + * Let's assume the peer application is single threaded. + * It will block on Tx and never consume any Rx buffer. + * Deadlock. Guaranteed. - Jean II + */ + } + + /* Reset lock */ + self->rx_queue_lock = 0; +} + +#ifdef CONFIG_PROC_FS +struct irttp_iter_state { + int id; +}; + +static void *irttp_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct irttp_iter_state *iter = seq->private; + struct tsap_cb *self; + + /* Protect our access to the tsap list */ + spin_lock_irq(&irttp->tsaps->hb_spinlock); + iter->id = 0; + + for (self = (struct tsap_cb *) hashbin_get_first(irttp->tsaps); + self != NULL; + self = (struct tsap_cb *) hashbin_get_next(irttp->tsaps)) { + if (iter->id == *pos) + break; + ++iter->id; + } + + return self; +} + +static void *irttp_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct irttp_iter_state *iter = seq->private; + + ++*pos; + ++iter->id; + return (void *) hashbin_get_next(irttp->tsaps); +} + +static void irttp_seq_stop(struct seq_file *seq, void *v) +{ + spin_unlock_irq(&irttp->tsaps->hb_spinlock); +} + +static int irttp_seq_show(struct seq_file *seq, void *v) +{ + const struct irttp_iter_state *iter = seq->private; + const struct tsap_cb *self = v; + + seq_printf(seq, "TSAP %d, ", iter->id); + seq_printf(seq, "stsap_sel: %02x, ", + self->stsap_sel); + seq_printf(seq, "dtsap_sel: %02x\n", + self->dtsap_sel); + seq_printf(seq, " connected: %s, ", + self->connected? 
"TRUE":"FALSE"); + seq_printf(seq, "avail credit: %d, ", + self->avail_credit); + seq_printf(seq, "remote credit: %d, ", + self->remote_credit); + seq_printf(seq, "send credit: %d\n", + self->send_credit); + seq_printf(seq, " tx packets: %ld, ", + self->stats.tx_packets); + seq_printf(seq, "rx packets: %ld, ", + self->stats.rx_packets); + seq_printf(seq, "tx_queue len: %d ", + skb_queue_len(&self->tx_queue)); + seq_printf(seq, "rx_queue len: %d\n", + skb_queue_len(&self->rx_queue)); + seq_printf(seq, " tx_sdu_busy: %s, ", + self->tx_sdu_busy? "TRUE":"FALSE"); + seq_printf(seq, "rx_sdu_busy: %s\n", + self->rx_sdu_busy? "TRUE":"FALSE"); + seq_printf(seq, " max_seg_size: %d, ", + self->max_seg_size); + seq_printf(seq, "tx_max_sdu_size: %d, ", + self->tx_max_sdu_size); + seq_printf(seq, "rx_max_sdu_size: %d\n", + self->rx_max_sdu_size); + + seq_printf(seq, " Used by (%s)\n\n", + self->notify.name); + return 0; +} + +static struct seq_operations irttp_seq_ops = { + .start = irttp_seq_start, + .next = irttp_seq_next, + .stop = irttp_seq_stop, + .show = irttp_seq_show, +}; + +static int irttp_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + struct irttp_iter_state *s; + + IRDA_ASSERT(irttp != NULL, return -EINVAL;); + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + goto out; + + rc = seq_open(file, &irttp_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = s; + memset(s, 0, sizeof(*s)); +out: + return rc; +out_kfree: + kfree(s); + goto out; +} + +struct file_operations irttp_seq_fops = { + .owner = THIS_MODULE, + .open = irttp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif /* PROC_FS */ diff --git a/net/irda/parameters.c b/net/irda/parameters.c new file mode 100644 index 000000000000..1324942f976c --- /dev/null +++ b/net/irda/parameters.c @@ -0,0 +1,589 @@ +/********************************************************************* + * + * Filename: parameters.c + * Version: 1.0 + * Description: A more general way to handle (pi,pl,pv) parameters + * Status: Experimental. + * Author: Dag Brattli + * Created at: Mon Jun 7 10:25:11 1999 + * Modified at: Sun Jan 30 14:08:39 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1999-2000 Dag Brattli, All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include + +#include +#include + +static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); + +static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); +static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func); + +static int irda_param_unpack(__u8 *buf, char *fmt, ...); + +/* Parameter value call table. Must match PV_TYPE */ +static PV_HANDLER pv_extract_table[] = { + irda_extract_integer, /* Handler for any length integers */ + irda_extract_integer, /* Handler for 8 bits integers */ + irda_extract_integer, /* Handler for 16 bits integers */ + irda_extract_string, /* Handler for strings */ + irda_extract_integer, /* Handler for 32 bits integers */ + irda_extract_octseq, /* Handler for octet sequences */ + irda_extract_no_value /* Handler for no value parameters */ +}; + +static PV_HANDLER pv_insert_table[] = { + irda_insert_integer, /* Handler for any length integers */ + irda_insert_integer, /* Handler for 8 bits integers */ + irda_insert_integer, /* Handler for 16 bits integers */ + NULL, /* Handler for strings */ + irda_insert_integer, /* Handler for 32 bits integers */ + NULL, /* Handler for octet sequences */ + irda_insert_no_value /* Handler for no value parameters */ +}; + +/* + * Function irda_insert_no_value (self, buf, len, pi, type, func) + */ +static int irda_insert_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int ret; + + p.pi = pi; + p.pl = 0; + + /* Call handler for this parameter */ + ret = (*func)(self, &p, PV_GET); + + /* Extract values anyway, since handler may need them */ + irda_param_pack(buf, "bb", p.pi, p.pl); + + if (ret < 0) + return ret; + + return 2; /* Inserted pl+2 bytes */ +} + +/* + * Function irda_extract_no_value (self, buf, len, type, func) + * + * Extracts a parameter without a pv field (pl=0) + * + */ +static int irda_extract_no_value(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int ret; + + /* Extract values anyway, since handler may need them */ + irda_param_unpack(buf, "bb", &p.pi, &p.pl); + + /* Call handler for this parameter */ + ret = (*func)(self, &p, PV_PUT); + + if (ret < 0) + return ret; + + return 2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_insert_integer (self, buf, len, pi, type, func) + */ +static int irda_insert_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int n = 0; + int err; + + p.pi = pi; /* In case handler needs to know */ + p.pl = type & PV_MASK; /* The integer type codes the lenght as well */ + p.pv.i = 0; /* Clear value */ + + /* Call handler for this parameter */ + err = (*func)(self, &p, PV_GET); + if (err < 0) + return err; + + /* + * If parameter lenght is 
still 0, then (1) this is an any length + * integer, and (2) the handler function does not care which length + * we choose to use, so we pick the one the gives the fewest bytes. + */ + if (p.pl == 0) { + if (p.pv.i < 0xff) { + IRDA_DEBUG(2, "%s(), using 1 byte\n", __FUNCTION__); + p.pl = 1; + } else if (p.pv.i < 0xffff) { + IRDA_DEBUG(2, "%s(), using 2 bytes\n", __FUNCTION__); + p.pl = 2; + } else { + IRDA_DEBUG(2, "%s(), using 4 bytes\n", __FUNCTION__); + p.pl = 4; /* Default length */ + } + } + /* Check if buffer is long enough for insertion */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer to short for insertion!\n", + __FUNCTION__); + return -1; + } + IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d, pi=%d\n", __FUNCTION__, + p.pi, p.pl, p.pv.i); + switch (p.pl) { + case 1: + n += irda_param_pack(buf, "bbb", p.pi, p.pl, (__u8) p.pv.i); + break; + case 2: + if (type & PV_BIG_ENDIAN) + p.pv.i = cpu_to_be16((__u16) p.pv.i); + else + p.pv.i = cpu_to_le16((__u16) p.pv.i); + n += irda_param_pack(buf, "bbs", p.pi, p.pl, (__u16) p.pv.i); + break; + case 4: + if (type & PV_BIG_ENDIAN) + cpu_to_be32s(&p.pv.i); + else + cpu_to_le32s(&p.pv.i); + n += irda_param_pack(buf, "bbi", p.pi, p.pl, p.pv.i); + + break; + default: + IRDA_WARNING("%s: length %d not supported\n", + __FUNCTION__, p.pl); + /* Skip parameter */ + return -1; + } + + return p.pl+2; /* Inserted pl+2 bytes */ +} + +/* + * Function irda_extract integer (self, buf, len, pi, type, func) + * + * Extract a possibly variable length integer from buffer, and call + * handler for processing of the parameter + */ +static int irda_extract_integer(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + int n = 0; + int extract_len; /* Real lenght we extract */ + int err; + + p.pi = pi; /* In case handler needs to know */ + p.pl = buf[1]; /* Extract lenght of value */ + p.pv.i = 0; /* Clear value */ + extract_len = p.pl; /* Default : extract all */ + + /* Check if buffer is long enough for parsing */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer to short for parsing! " + "Need %d bytes, but len is only %d\n", + __FUNCTION__, p.pl, len); + return -1; + } + + /* + * Check that the integer length is what we expect it to be. If the + * handler want a 16 bits integer then a 32 bits is not good enough + * PV_INTEGER means that the handler is flexible. + */ + if (((type & PV_MASK) != PV_INTEGER) && ((type & PV_MASK) != p.pl)) { + IRDA_ERROR("%s: invalid parameter length! " + "Expected %d bytes, but value had %d bytes!\n", + __FUNCTION__, type & PV_MASK, p.pl); + + /* Most parameters are bit/byte fields or little endian, + * so it's ok to only extract a subset of it (the subset + * that the handler expect). This is necessary, as some + * broken implementations seems to add extra undefined bits. + * If the parameter is shorter than we expect or is big + * endian, we can't play those tricks. 
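The "any length" integer rule in irda_insert_integer() above picks the smallest encoding the value fits, using strict < comparisons (so a value of exactly 0xff already goes out as two bytes), and then honours the PV_BIG_ENDIAN flag. A stand-alone mirror of that length choice plus a big-endian 16-bit emit:

#include <stdint.h>
#include <stdio.h>

/* Mirror the length choice in irda_insert_integer(): strict '<', so a
 * value of exactly 0xff is already promoted to two bytes. */
static int choose_pl(uint32_t v)
{
        if (v < 0xff)
                return 1;
        if (v < 0xffff)
                return 2;
        return 4;
}

/* Emit a 16-bit value big endian, as PV_BIG_ENDIAN parameters require. */
static void emit_be16(uint8_t *out, uint16_t v)
{
        out[0] = (uint8_t)(v >> 8);
        out[1] = (uint8_t)(v & 0xff);
}

int main(void)
{
        uint8_t buf[2];
        uint32_t samples[] = { 0x7f, 0xff, 0x1234, 0x12345678 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("value 0x%x -> pl=%d\n",
                       (unsigned int)samples[i], choose_pl(samples[i]));

        emit_be16(buf, 2048);
        printf("2048 big endian: %02x %02x\n", buf[0], buf[1]);
        return 0;
}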
Jean II */ + if((p.pl < (type & PV_MASK)) || (type & PV_BIG_ENDIAN)) { + /* Skip parameter */ + return p.pl+2; + } else { + /* Extract subset of it, fallthrough */ + extract_len = type & PV_MASK; + } + } + + + switch (extract_len) { + case 1: + n += irda_param_unpack(buf+2, "b", &p.pv.i); + break; + case 2: + n += irda_param_unpack(buf+2, "s", &p.pv.i); + if (type & PV_BIG_ENDIAN) + p.pv.i = be16_to_cpu((__u16) p.pv.i); + else + p.pv.i = le16_to_cpu((__u16) p.pv.i); + break; + case 4: + n += irda_param_unpack(buf+2, "i", &p.pv.i); + if (type & PV_BIG_ENDIAN) + be32_to_cpus(&p.pv.i); + else + le32_to_cpus(&p.pv.i); + break; + default: + IRDA_WARNING("%s: length %d not supported\n", + __FUNCTION__, p.pl); + + /* Skip parameter */ + return p.pl+2; + } + + IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d, pi=%d\n", __FUNCTION__, + p.pi, p.pl, p.pv.i); + /* Call handler for this parameter */ + err = (*func)(self, &p, PV_PUT); + if (err < 0) + return err; + + return p.pl+2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_extract_string (self, buf, len, type, func) + */ +static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + char str[33]; + irda_param_t p; + int err; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + p.pi = pi; /* In case handler needs to know */ + p.pl = buf[1]; /* Extract lenght of value */ + + IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __FUNCTION__, + p.pi, p.pl); + + /* Check if buffer is long enough for parsing */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer to short for parsing! " + "Need %d bytes, but len is only %d\n", + __FUNCTION__, p.pl, len); + return -1; + } + + /* Should be safe to copy string like this since we have already + * checked that the buffer is long enough */ + strncpy(str, buf+2, p.pl); + + IRDA_DEBUG(2, "%s(), str=0x%02x 0x%02x\n", __FUNCTION__, + (__u8) str[0], (__u8) str[1]); + + /* Null terminate string */ + str[p.pl+1] = '\0'; + + p.pv.c = str; /* Handler will need to take a copy */ + + /* Call handler for this parameter */ + err = (*func)(self, &p, PV_PUT); + if (err < 0) + return err; + + return p.pl+2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_extract_octseq (self, buf, len, type, func) + */ +static int irda_extract_octseq(void *self, __u8 *buf, int len, __u8 pi, + PV_TYPE type, PI_HANDLER func) +{ + irda_param_t p; + + p.pi = pi; /* In case handler needs to know */ + p.pl = buf[1]; /* Extract lenght of value */ + + /* Check if buffer is long enough for parsing */ + if (len < (2+p.pl)) { + IRDA_WARNING("%s: buffer to short for parsing! " + "Need %d bytes, but len is only %d\n", + __FUNCTION__, p.pl, len); + return -1; + } + + IRDA_DEBUG(0, "%s(), not impl\n", __FUNCTION__); + + return p.pl+2; /* Extracted pl+2 bytes */ +} + +/* + * Function irda_param_pack (skb, fmt, ...) + * + * Format: + * 'i' = 32 bits integer + * 's' = string + * + */ +int irda_param_pack(__u8 *buf, char *fmt, ...) 
+{ + irda_pv_t arg; + va_list args; + char *p; + int n = 0; + + va_start(args, fmt); + + for (p = fmt; *p != '\0'; p++) { + switch (*p) { + case 'b': /* 8 bits unsigned byte */ + buf[n++] = (__u8)va_arg(args, int); + break; + case 's': /* 16 bits unsigned short */ + arg.i = (__u16)va_arg(args, int); + put_unaligned((__u16)arg.i, (__u16 *)(buf+n)); n+=2; + break; + case 'i': /* 32 bits unsigned integer */ + arg.i = va_arg(args, __u32); + put_unaligned(arg.i, (__u32 *)(buf+n)); n+=4; + break; +#if 0 + case 'c': /* \0 terminated string */ + arg.c = va_arg(args, char *); + strcpy(buf+n, arg.c); + n += strlen(arg.c) + 1; + break; +#endif + default: + va_end(args); + return -1; + } + } + va_end(args); + + return 0; +} +EXPORT_SYMBOL(irda_param_pack); + +/* + * Function irda_param_unpack (skb, fmt, ...) + */ +static int irda_param_unpack(__u8 *buf, char *fmt, ...) +{ + irda_pv_t arg; + va_list args; + char *p; + int n = 0; + + va_start(args, fmt); + + for (p = fmt; *p != '\0'; p++) { + switch (*p) { + case 'b': /* 8 bits byte */ + arg.ip = va_arg(args, __u32 *); + *arg.ip = buf[n++]; + break; + case 's': /* 16 bits short */ + arg.ip = va_arg(args, __u32 *); + *arg.ip = get_unaligned((__u16 *)(buf+n)); n+=2; + break; + case 'i': /* 32 bits unsigned integer */ + arg.ip = va_arg(args, __u32 *); + *arg.ip = get_unaligned((__u32 *)(buf+n)); n+=4; + break; +#if 0 + case 'c': /* \0 terminated string */ + arg.c = va_arg(args, char *); + strcpy(arg.c, buf+n); + n += strlen(arg.c) + 1; + break; +#endif + default: + va_end(args); + return -1; + } + + } + va_end(args); + + return 0; +} + +/* + * Function irda_param_insert (self, pi, buf, len, info) + * + * Insert the specified parameter (pi) into buffer. Returns number of + * bytes inserted + */ +int irda_param_insert(void *self, __u8 pi, __u8 *buf, int len, + pi_param_info_t *info) +{ + pi_minor_info_t *pi_minor_info; + __u8 pi_minor; + __u8 pi_major; + int type; + int ret = -1; + int n = 0; + + IRDA_ASSERT(buf != NULL, return ret;); + IRDA_ASSERT(info != 0, return ret;); + + pi_minor = pi & info->pi_mask; + pi_major = pi >> info->pi_major_offset; + + /* Check if the identifier value (pi) is valid */ + if ((pi_major > info->len-1) || + (pi_minor > info->tables[pi_major].len-1)) + { + IRDA_DEBUG(0, "%s(), no handler for parameter=0x%02x\n", + __FUNCTION__, pi); + + /* Skip this parameter */ + return -1; + } + + /* Lookup the info on how to parse this parameter */ + pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor]; + + /* Find expected data type for this parameter identifier (pi)*/ + type = pi_minor_info->type; + + /* Check if handler has been implemented */ + if (!pi_minor_info->func) { + IRDA_MESSAGE("%s: no handler for pi=%#x\n", __FUNCTION__, pi); + /* Skip this parameter */ + return -1; + } + + /* Insert parameter value */ + ret = (*pv_insert_table[type & PV_MASK])(self, buf+n, len, pi, type, + pi_minor_info->func); + return ret; +} +EXPORT_SYMBOL(irda_param_insert); + +/* + * Function irda_param_extract (self, buf, len, info) + * + * Parse all parameters. If len is correct, then everything should be + * safe. 
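To make the 'b'/'s'/'i' format strings driving the pack/unpack helpers above more concrete, here is a minimal standalone analogue in userspace C. It assumes host byte order and uses memcpy() in place of put_unaligned(); the endian conversions and most error handling of the real code are omitted.

#include <stdarg.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Simplified analogue of the format characters used above:
 * 'b' = 8-bit byte, 's' = 16-bit short, 'i' = 32-bit integer. */
static int pack(uint8_t *buf, const char *fmt, ...)
{
	va_list args;
	int n = 0;

	va_start(args, fmt);
	for (const char *p = fmt; *p; p++) {
		switch (*p) {
		case 'b': {
			uint8_t b = (uint8_t)va_arg(args, int);
			buf[n++] = b;
			break;
		}
		case 's': {
			uint16_t s = (uint16_t)va_arg(args, int);
			memcpy(buf + n, &s, 2); n += 2;
			break;
		}
		case 'i': {
			uint32_t v = va_arg(args, uint32_t);
			memcpy(buf + n, &v, 4); n += 4;
			break;
		}
		default:
			va_end(args);
			return -1;      /* unknown format character */
		}
	}
	va_end(args);
	return n;
}

int main(void)
{
	uint8_t buf[16];
	/* pi=0x83, pl=2, 16-bit value: same shape as the "bbs" case above */
	int n = pack(buf, "bbs", 0x83, 2, 0x2000);

	printf("%d bytes packed\n", n);
	return 0;
}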
Returns the number of bytes that was parsed + * + */ +static int irda_param_extract(void *self, __u8 *buf, int len, + pi_param_info_t *info) +{ + pi_minor_info_t *pi_minor_info; + __u8 pi_minor; + __u8 pi_major; + int type; + int ret = -1; + int n = 0; + + IRDA_ASSERT(buf != NULL, return ret;); + IRDA_ASSERT(info != 0, return ret;); + + pi_minor = buf[n] & info->pi_mask; + pi_major = buf[n] >> info->pi_major_offset; + + /* Check if the identifier value (pi) is valid */ + if ((pi_major > info->len-1) || + (pi_minor > info->tables[pi_major].len-1)) + { + IRDA_DEBUG(0, "%s(), no handler for parameter=0x%02x\n", + __FUNCTION__, buf[0]); + + /* Skip this parameter */ + return 2 + buf[n + 1]; /* Continue */ + } + + /* Lookup the info on how to parse this parameter */ + pi_minor_info = &info->tables[pi_major].pi_minor_call_table[pi_minor]; + + /* Find expected data type for this parameter identifier (pi)*/ + type = pi_minor_info->type; + + IRDA_DEBUG(3, "%s(), pi=[%d,%d], type=%d\n", __FUNCTION__, + pi_major, pi_minor, type); + + /* Check if handler has been implemented */ + if (!pi_minor_info->func) { + IRDA_MESSAGE("%s: no handler for pi=%#x\n", + __FUNCTION__, buf[n]); + /* Skip this parameter */ + return 2 + buf[n + 1]; /* Continue */ + } + + /* Parse parameter value */ + ret = (*pv_extract_table[type & PV_MASK])(self, buf+n, len, buf[n], + type, pi_minor_info->func); + return ret; +} + +/* + * Function irda_param_extract_all (self, buf, len, info) + * + * Parse all parameters. If len is correct, then everything should be + * safe. Returns the number of bytes that was parsed + * + */ +int irda_param_extract_all(void *self, __u8 *buf, int len, + pi_param_info_t *info) +{ + int ret = -1; + int n = 0; + + IRDA_ASSERT(buf != NULL, return ret;); + IRDA_ASSERT(info != 0, return ret;); + + /* + * Parse all parameters. Each parameter must be at least two bytes + * long or else there is no point in trying to parse it + */ + while (len > 2) { + ret = irda_param_extract(self, buf+n, len, info); + if (ret < 0) + return ret; + + n += ret; + len -= ret; + } + return n; +} +EXPORT_SYMBOL(irda_param_extract_all); diff --git a/net/irda/qos.c b/net/irda/qos.c new file mode 100644 index 000000000000..df732d56cc57 --- /dev/null +++ b/net/irda/qos.c @@ -0,0 +1,774 @@ +/********************************************************************* + * + * Filename: qos.c + * Version: 1.0 + * Description: IrLAP QoS parameter negotiation + * Status: Stable + * Author: Dag Brattli + * Created at: Tue Sep 9 00:00:26 1997 + * Modified at: Sun Jan 30 14:29:16 2000 + * Modified by: Dag Brattli + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2001 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, + * MA 02111-1307 USA + * + ********************************************************************/ + +#include +#include + +#include +#include +#include +#include + +/* + * Maximum values of the baud rate we negociate with the other end. + * Most often, you don't have to change that, because Linux-IrDA will + * use the maximum offered by the link layer, which usually works fine. + * In some very rare cases, you may want to limit it to lower speeds... + */ +int sysctl_max_baud_rate = 16000000; +/* + * Maximum value of the lap disconnect timer we negociate with the other end. + * Most often, the value below represent the best compromise, but some user + * may want to keep the LAP alive longuer or shorter in case of link failure. + * Remember that the threshold time (early warning) is fixed to 3s... + */ +int sysctl_max_noreply_time = 12; +/* + * Minimum turn time to be applied before transmitting to the peer. + * Nonzero values (usec) are used as lower limit to the per-connection + * mtt value which was announced by the other end during negotiation. + * Might be helpful if the peer device provides too short mtt. + * Default is 10us which means using the unmodified value given by the + * peer except if it's 0 (0 is likely a bug in the other stack). + */ +unsigned sysctl_min_tx_turn_time = 10; +/* + * Maximum data size to be used in transmission in payload of LAP frame. + * There is a bit of confusion in the IrDA spec : + * The LAP spec defines the payload of a LAP frame (I field) to be + * 2048 bytes max (IrLAP 1.1, chapt 6.6.5, p40). + * On the other hand, the PHY mention frames of 2048 bytes max (IrPHY + * 1.2, chapt 5.3.2.1, p41). But, this number includes the LAP header + * (2 bytes), and CRC (32 bits at 4 Mb/s). So, for the I field (LAP + * payload), that's only 2042 bytes. Oups ! + * My nsc-ircc hardware has troubles receiving 2048 bytes frames at 4 Mb/s, + * so adjust to 2042... I don't know if this bug applies only for 2048 + * bytes frames or all negotiated frame sizes, but you can use the sysctl + * to play with this value anyway. + * Jean II */ +unsigned sysctl_max_tx_data_size = 2042; +/* + * Maximum transmit window, i.e. number of LAP frames between turn-around. + * This allow to override what the peer told us. Some peers are buggy and + * don't always support what they tell us. 
+ * Jean II */ +unsigned sysctl_max_tx_window = 7; + +static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get); +static int irlap_param_link_disconnect(void *instance, irda_param_t *parm, + int get); +static int irlap_param_max_turn_time(void *instance, irda_param_t *param, + int get); +static int irlap_param_data_size(void *instance, irda_param_t *param, int get); +static int irlap_param_window_size(void *instance, irda_param_t *param, + int get); +static int irlap_param_additional_bofs(void *instance, irda_param_t *parm, + int get); +static int irlap_param_min_turn_time(void *instance, irda_param_t *param, + int get); + +#ifndef CONFIG_IRDA_DYNAMIC_WINDOW +static __u32 irlap_requested_line_capacity(struct qos_info *qos); +#endif + +static __u32 min_turn_times[] = { 10000, 5000, 1000, 500, 100, 50, 10, 0 }; /* us */ +static __u32 baud_rates[] = { 2400, 9600, 19200, 38400, 57600, 115200, 576000, + 1152000, 4000000, 16000000 }; /* bps */ +static __u32 data_sizes[] = { 64, 128, 256, 512, 1024, 2048 }; /* bytes */ +static __u32 add_bofs[] = { 48, 24, 12, 5, 3, 2, 1, 0 }; /* bytes */ +static __u32 max_turn_times[] = { 500, 250, 100, 50 }; /* ms */ +static __u32 link_disc_times[] = { 3, 8, 12, 16, 20, 25, 30, 40 }; /* secs */ + +static __u32 max_line_capacities[10][4] = { + /* 500 ms 250 ms 100 ms 50 ms (max turn time) */ + { 100, 0, 0, 0 }, /* 2400 bps */ + { 400, 0, 0, 0 }, /* 9600 bps */ + { 800, 0, 0, 0 }, /* 19200 bps */ + { 1600, 0, 0, 0 }, /* 38400 bps */ + { 2360, 0, 0, 0 }, /* 57600 bps */ + { 4800, 2400, 960, 480 }, /* 115200 bps */ + { 28800, 11520, 5760, 2880 }, /* 576000 bps */ + { 57600, 28800, 11520, 5760 }, /* 1152000 bps */ + { 200000, 100000, 40000, 20000 }, /* 4000000 bps */ + { 800000, 400000, 160000, 80000 }, /* 16000000 bps */ +}; + +static pi_minor_info_t pi_minor_call_table_type_0[] = { + { NULL, 0 }, +/* 01 */{ irlap_param_baud_rate, PV_INTEGER | PV_LITTLE_ENDIAN }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, + { NULL, 0 }, +/* 08 */{ irlap_param_link_disconnect, PV_INT_8_BITS } +}; + +static pi_minor_info_t pi_minor_call_table_type_1[] = { + { NULL, 0 }, + { NULL, 0 }, +/* 82 */{ irlap_param_max_turn_time, PV_INT_8_BITS }, +/* 83 */{ irlap_param_data_size, PV_INT_8_BITS }, +/* 84 */{ irlap_param_window_size, PV_INT_8_BITS }, +/* 85 */{ irlap_param_additional_bofs, PV_INT_8_BITS }, +/* 86 */{ irlap_param_min_turn_time, PV_INT_8_BITS }, +}; + +static pi_major_info_t pi_major_call_table[] = { + { pi_minor_call_table_type_0, 9 }, + { pi_minor_call_table_type_1, 7 }, +}; + +static pi_param_info_t irlap_param_info = { pi_major_call_table, 2, 0x7f, 7 }; + +/* ---------------------- LOCAL SUBROUTINES ---------------------- */ +/* Note : we start with a bunch of local subroutines. + * As the compiler is "one pass", this is the only way to get them to + * inline properly... + * Jean II + */ +/* + * Function value_index (value, array, size) + * + * Returns the index to the value in the specified array + */ +static inline int value_index(__u32 value, __u32 *array, int size) +{ + int i; + + for (i=0; i < size; i++) + if (array[i] == value) + break; + return i; +} + +/* + * Function index_value (index, array) + * + * Returns value to index in array, easy! 
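The tables above only make sense together with the bit-field convention: each station advertises a mask in which bit i means "entry i of the array is supported", the two masks are ANDed during negotiation, and the most significant common bit selects the value. A small illustrative sketch (reimplementing the MSB scan in userspace C, not the kernel helpers):

#include <stdint.h>
#include <stdio.h>

/* Illustration only: QoS capabilities are exchanged as bit fields and
 * negotiation is a bitwise AND; the chosen value is the highest set bit. */
static const uint32_t rates[] = { 2400, 9600, 19200, 38400, 57600,
				  115200, 576000, 1152000, 4000000, 16000000 };

static int msb_index(uint16_t word)
{
	int index = 15;

	for (uint16_t msb = 0x8000; msb; msb >>= 1, index--)
		if (word & msb)
			break;          /* found the most significant bit */
	return index;
}

int main(void)
{
	uint16_t ours   = 0x01ff;        /* we accept up to 4 Mb/s    */
	uint16_t theirs = 0x003f;        /* peer accepts up to 115200 */
	uint16_t common = ours & theirs; /* intersection of capabilities */

	printf("negotiated rate: %u bps\n", rates[msb_index(common)]);
	return 0;
}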
+ * + */ +static inline __u32 index_value(int index, __u32 *array) +{ + return array[index]; +} + +/* + * Function msb_index (word) + * + * Returns index to most significant bit (MSB) in word + * + */ +static int msb_index (__u16 word) +{ + __u16 msb = 0x8000; + int index = 15; /* Current MSB */ + + /* Check for buggy peers. + * Note : there is a small probability that it could be us, but I + * would expect driver authors to catch that pretty early and be + * able to check precisely what's going on. If a end user sees this, + * it's very likely the peer. - Jean II */ + if (word == 0) { + IRDA_WARNING("%s(), Detected buggy peer, adjust null PV to 0x1!\n", + __FUNCTION__); + /* The only safe choice (we don't know the array size) */ + word = 0x1; + } + + while (msb) { + if (word & msb) + break; /* Found it! */ + msb >>=1; + index--; + } + return index; +} + +/* + * Function value_lower_bits (value, array) + * + * Returns a bit field marking all possibility lower than value. + */ +static inline int value_lower_bits(__u32 value, __u32 *array, int size, __u16 *field) +{ + int i; + __u16 mask = 0x1; + __u16 result = 0x0; + + for (i=0; i < size; i++) { + /* Add the current value to the bit field, shift mask */ + result |= mask; + mask <<= 1; + /* Finished ? */ + if (array[i] >= value) + break; + } + /* Send back a valid index */ + if(i >= size) + i = size - 1; /* Last item */ + *field = result; + return i; +} + +/* + * Function value_highest_bit (value, array) + * + * Returns a bit field marking the highest possibility lower than value. + */ +static inline int value_highest_bit(__u32 value, __u32 *array, int size, __u16 *field) +{ + int i; + __u16 mask = 0x1; + __u16 result = 0x0; + + for (i=0; i < size; i++) { + /* Finished ? */ + if (array[i] <= value) + break; + /* Shift mask */ + mask <<= 1; + } + /* Set the current value to the bit field */ + result |= mask; + /* Send back a valid index */ + if(i >= size) + i = size - 1; /* Last item */ + *field = result; + return i; +} + +/* -------------------------- MAIN CALLS -------------------------- */ + +/* + * Function irda_qos_compute_intersection (qos, new) + * + * Compute the intersection of the old QoS capabilities with new ones + * + */ +void irda_qos_compute_intersection(struct qos_info *qos, struct qos_info *new) +{ + IRDA_ASSERT(qos != NULL, return;); + IRDA_ASSERT(new != NULL, return;); + + /* Apply */ + qos->baud_rate.bits &= new->baud_rate.bits; + qos->window_size.bits &= new->window_size.bits; + qos->min_turn_time.bits &= new->min_turn_time.bits; + qos->max_turn_time.bits &= new->max_turn_time.bits; + qos->data_size.bits &= new->data_size.bits; + qos->link_disc_time.bits &= new->link_disc_time.bits; + qos->additional_bofs.bits &= new->additional_bofs.bits; + + irda_qos_bits_to_value(qos); +} + +/* + * Function irda_init_max_qos_capabilies (qos) + * + * The purpose of this function is for layers and drivers to be able to + * set the maximum QoS possible and then "and in" their own limitations + * + */ +void irda_init_max_qos_capabilies(struct qos_info *qos) +{ + int i; + /* + * These are the maximum supported values as specified on pages + * 39-43 in IrLAP + */ + + /* Use sysctl to set some configurable values... 
*/ + /* Set configured max speed */ + i = value_lower_bits(sysctl_max_baud_rate, baud_rates, 10, + &qos->baud_rate.bits); + sysctl_max_baud_rate = index_value(i, baud_rates); + + /* Set configured max disc time */ + i = value_lower_bits(sysctl_max_noreply_time, link_disc_times, 8, + &qos->link_disc_time.bits); + sysctl_max_noreply_time = index_value(i, link_disc_times); + + /* LSB is first byte, MSB is second byte */ + qos->baud_rate.bits &= 0x03ff; + + qos->window_size.bits = 0x7f; + qos->min_turn_time.bits = 0xff; + qos->max_turn_time.bits = 0x0f; + qos->data_size.bits = 0x3f; + qos->link_disc_time.bits &= 0xff; + qos->additional_bofs.bits = 0xff; +} +EXPORT_SYMBOL(irda_init_max_qos_capabilies); + +/* + * Function irlap_adjust_qos_settings (qos) + * + * Adjust QoS settings in case some values are not possible to use because + * of other settings + */ +static void irlap_adjust_qos_settings(struct qos_info *qos) +{ + __u32 line_capacity; + int index; + + IRDA_DEBUG(2, "%s()\n", __FUNCTION__); + + /* + * Make sure the mintt is sensible. + * Main culprit : Ericsson T39. - Jean II + */ + if (sysctl_min_tx_turn_time > qos->min_turn_time.value) { + int i; + + IRDA_WARNING("%s(), Detected buggy peer, adjust mtt to %dus!\n", + __FUNCTION__, sysctl_min_tx_turn_time); + + /* We don't really need bits, but easier this way */ + i = value_highest_bit(sysctl_min_tx_turn_time, min_turn_times, + 8, &qos->min_turn_time.bits); + sysctl_min_tx_turn_time = index_value(i, min_turn_times); + qos->min_turn_time.value = sysctl_min_tx_turn_time; + } + + /* + * Not allowed to use a max turn time less than 500 ms if the baudrate + * is less than 115200 + */ + if ((qos->baud_rate.value < 115200) && + (qos->max_turn_time.value < 500)) + { + IRDA_DEBUG(0, + "%s(), adjusting max turn time from %d to 500 ms\n", + __FUNCTION__, qos->max_turn_time.value); + qos->max_turn_time.value = 500; + } + + /* + * The data size must be adjusted according to the baud rate and max + * turn time + */ + index = value_index(qos->data_size.value, data_sizes, 6); + line_capacity = irlap_max_line_capacity(qos->baud_rate.value, + qos->max_turn_time.value); + +#ifdef CONFIG_IRDA_DYNAMIC_WINDOW + while ((qos->data_size.value > line_capacity) && (index > 0)) { + qos->data_size.value = data_sizes[index--]; + IRDA_DEBUG(2, "%s(), reducing data size to %d\n", + __FUNCTION__, qos->data_size.value); + } +#else /* Use method described in section 6.6.11 of IrLAP */ + while (irlap_requested_line_capacity(qos) > line_capacity) { + IRDA_ASSERT(index != 0, return;); + + /* Must be able to send at least one frame */ + if (qos->window_size.value > 1) { + qos->window_size.value--; + IRDA_DEBUG(2, "%s(), reducing window size to %d\n", + __FUNCTION__, qos->window_size.value); + } else if (index > 1) { + qos->data_size.value = data_sizes[index--]; + IRDA_DEBUG(2, "%s(), reducing data size to %d\n", + __FUNCTION__, qos->data_size.value); + } else { + IRDA_WARNING("%s(), nothing more we can do!\n", + __FUNCTION__); + } + } +#endif /* CONFIG_IRDA_DYNAMIC_WINDOW */ + /* + * Fix tx data size according to user limits - Jean II + */ + if (qos->data_size.value > sysctl_max_tx_data_size) + /* Allow non discrete adjustement to avoid loosing capacity */ + qos->data_size.value = sysctl_max_tx_data_size; + /* + * Override Tx window if user request it. 
- Jean II + */ + if (qos->window_size.value > sysctl_max_tx_window) + qos->window_size.value = sysctl_max_tx_window; +} + +/* + * Function irlap_negotiate (qos_device, qos_session, skb) + * + * Negotiate QoS values, not really that much negotiation :-) + * We just set the QoS capabilities for the peer station + * + */ +int irlap_qos_negotiate(struct irlap_cb *self, struct sk_buff *skb) +{ + int ret; + + ret = irda_param_extract_all(self, skb->data, skb->len, + &irlap_param_info); + + /* Convert the negotiated bits to values */ + irda_qos_bits_to_value(&self->qos_tx); + irda_qos_bits_to_value(&self->qos_rx); + + irlap_adjust_qos_settings(&self->qos_tx); + + IRDA_DEBUG(2, "Setting BAUD_RATE to %d bps.\n", + self->qos_tx.baud_rate.value); + IRDA_DEBUG(2, "Setting DATA_SIZE to %d bytes\n", + self->qos_tx.data_size.value); + IRDA_DEBUG(2, "Setting WINDOW_SIZE to %d\n", + self->qos_tx.window_size.value); + IRDA_DEBUG(2, "Setting XBOFS to %d\n", + self->qos_tx.additional_bofs.value); + IRDA_DEBUG(2, "Setting MAX_TURN_TIME to %d ms.\n", + self->qos_tx.max_turn_time.value); + IRDA_DEBUG(2, "Setting MIN_TURN_TIME to %d usecs.\n", + self->qos_tx.min_turn_time.value); + IRDA_DEBUG(2, "Setting LINK_DISC to %d secs.\n", + self->qos_tx.link_disc_time.value); + return ret; +} + +/* + * Function irlap_insert_negotiation_params (qos, fp) + * + * Insert QoS negotiaion pararameters into frame + * + */ +int irlap_insert_qos_negotiation_params(struct irlap_cb *self, + struct sk_buff *skb) +{ + int ret; + + /* Insert data rate */ + ret = irda_param_insert(self, PI_BAUD_RATE, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert max turnaround time */ + ret = irda_param_insert(self, PI_MAX_TURN_TIME, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert data size */ + ret = irda_param_insert(self, PI_DATA_SIZE, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert window size */ + ret = irda_param_insert(self, PI_WINDOW_SIZE, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert additional BOFs */ + ret = irda_param_insert(self, PI_ADD_BOFS, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert minimum turnaround time */ + ret = irda_param_insert(self, PI_MIN_TURN_TIME, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + /* Insert link disconnect/threshold time */ + ret = irda_param_insert(self, PI_LINK_DISC, skb->tail, + skb_tailroom(skb), &irlap_param_info); + if (ret < 0) + return ret; + skb_put(skb, ret); + + return 0; +} + +/* + * Function irlap_param_baud_rate (instance, param, get) + * + * Negotiate data-rate + * + */ +static int irlap_param_baud_rate(void *instance, irda_param_t *param, int get) +{ + __u16 final; + + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) { + param->pv.i = self->qos_rx.baud_rate.bits; + IRDA_DEBUG(2, "%s(), baud rate = 0x%02x\n", + __FUNCTION__, param->pv.i); + } else { + /* + * Stations must agree on baud rate, so calculate + * intersection + */ + IRDA_DEBUG(2, "Requested BAUD_RATE: 0x%04x\n", (__u16) param->pv.i); + final = (__u16) param->pv.i & self->qos_rx.baud_rate.bits; + + IRDA_DEBUG(2, "Final 
BAUD_RATE: 0x%04x\n", final); + self->qos_tx.baud_rate.bits = final; + self->qos_rx.baud_rate.bits = final; + } + + return 0; +} + +/* + * Function irlap_param_link_disconnect (instance, param, get) + * + * Negotiate link disconnect/threshold time. + * + */ +static int irlap_param_link_disconnect(void *instance, irda_param_t *param, + int get) +{ + __u16 final; + + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.link_disc_time.bits; + else { + /* + * Stations must agree on link disconnect/threshold + * time. + */ + IRDA_DEBUG(2, "LINK_DISC: %02x\n", (__u8) param->pv.i); + final = (__u8) param->pv.i & self->qos_rx.link_disc_time.bits; + + IRDA_DEBUG(2, "Final LINK_DISC: %02x\n", final); + self->qos_tx.link_disc_time.bits = final; + self->qos_rx.link_disc_time.bits = final; + } + return 0; +} + +/* + * Function irlap_param_max_turn_time (instance, param, get) + * + * Negotiate the maximum turnaround time. This is a type 1 parameter and + * will be negotiated independently for each station + * + */ +static int irlap_param_max_turn_time(void *instance, irda_param_t *param, + int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.max_turn_time.bits; + else + self->qos_tx.max_turn_time.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_data_size (instance, param, get) + * + * Negotiate the data size. This is a type 1 parameter and + * will be negotiated independently for each station + * + */ +static int irlap_param_data_size(void *instance, irda_param_t *param, int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.data_size.bits; + else + self->qos_tx.data_size.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_window_size (instance, param, get) + * + * Negotiate the window size. This is a type 1 parameter and + * will be negotiated independently for each station + * + */ +static int irlap_param_window_size(void *instance, irda_param_t *param, + int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.window_size.bits; + else + self->qos_tx.window_size.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_additional_bofs (instance, param, get) + * + * Negotiate additional BOF characters. This is a type 1 parameter and + * will be negotiated independently for each station. + */ +static int irlap_param_additional_bofs(void *instance, irda_param_t *param, int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.additional_bofs.bits; + else + self->qos_tx.additional_bofs.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_param_min_turn_time (instance, param, get) + * + * Negotiate the minimum turn around time. 
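The handlers above all follow the same get/put convention; the sketch below restates it in isolation (hypothetical structures, userspace C): on "get" the handler reports the local capability bits so they can be inserted into the outgoing frame, on "put" it records the peer's announcement, intersecting the masks for a type 0 parameter such as the baud rate (a type 1 parameter would simply be stored as announced).

#include <stdint.h>
#include <stdio.h>

/* Sketch of the get/put handler convention used above. */
struct qos_param { uint16_t bits; };

static int param_baud_rate(struct qos_param *rx, struct qos_param *tx,
			   uint16_t *pv, int get)
{
	if (get) {
		*pv = rx->bits;                  /* advertise what we accept */
	} else {
		uint16_t final = *pv & rx->bits; /* type 0: intersection     */
		rx->bits = final;
		tx->bits = final;
	}
	return 0;
}

int main(void)
{
	struct qos_param rx = { 0x01ff }, tx = { 0x01ff };
	uint16_t peer = 0x003f;

	param_baud_rate(&rx, &tx, &peer, 0);
	printf("final baud-rate bits: 0x%04x\n", rx.bits);
	return 0;
}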
This is a type 1 parameter and + * will be negotiated independently for each station + */ +static int irlap_param_min_turn_time(void *instance, irda_param_t *param, + int get) +{ + struct irlap_cb *self = (struct irlap_cb *) instance; + + IRDA_ASSERT(self != NULL, return -1;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return -1;); + + if (get) + param->pv.i = self->qos_rx.min_turn_time.bits; + else + self->qos_tx.min_turn_time.bits = (__u8) param->pv.i; + + return 0; +} + +/* + * Function irlap_max_line_capacity (speed, max_turn_time, min_turn_time) + * + * Calculate the maximum line capacity + * + */ +__u32 irlap_max_line_capacity(__u32 speed, __u32 max_turn_time) +{ + __u32 line_capacity; + int i,j; + + IRDA_DEBUG(2, "%s(), speed=%d, max_turn_time=%d\n", + __FUNCTION__, speed, max_turn_time); + + i = value_index(speed, baud_rates, 10); + j = value_index(max_turn_time, max_turn_times, 4); + + IRDA_ASSERT(((i >=0) && (i <10)), return 0;); + IRDA_ASSERT(((j >=0) && (j <4)), return 0;); + + line_capacity = max_line_capacities[i][j]; + + IRDA_DEBUG(2, "%s(), line capacity=%d bytes\n", + __FUNCTION__, line_capacity); + + return line_capacity; +} + +#ifndef CONFIG_IRDA_DYNAMIC_WINDOW +static __u32 irlap_requested_line_capacity(struct qos_info *qos) +{ + __u32 line_capacity; + + line_capacity = qos->window_size.value * + (qos->data_size.value + 6 + qos->additional_bofs.value) + + irlap_min_turn_time_in_bytes(qos->baud_rate.value, + qos->min_turn_time.value); + + IRDA_DEBUG(2, "%s(), requested line capacity=%d\n", + __FUNCTION__, line_capacity); + + return line_capacity; +} +#endif + +void irda_qos_bits_to_value(struct qos_info *qos) +{ + int index; + + IRDA_ASSERT(qos != NULL, return;); + + index = msb_index(qos->baud_rate.bits); + qos->baud_rate.value = baud_rates[index]; + + index = msb_index(qos->data_size.bits); + qos->data_size.value = data_sizes[index]; + + index = msb_index(qos->window_size.bits); + qos->window_size.value = index+1; + + index = msb_index(qos->min_turn_time.bits); + qos->min_turn_time.value = min_turn_times[index]; + + index = msb_index(qos->max_turn_time.bits); + qos->max_turn_time.value = max_turn_times[index]; + + index = msb_index(qos->link_disc_time.bits); + qos->link_disc_time.value = link_disc_times[index]; + + index = msb_index(qos->additional_bofs.bits); + qos->additional_bofs.value = add_bofs[index]; +} +EXPORT_SYMBOL(irda_qos_bits_to_value); diff --git a/net/irda/timer.c b/net/irda/timer.c new file mode 100644 index 000000000000..0e17f976add6 --- /dev/null +++ b/net/irda/timer.c @@ -0,0 +1,233 @@ +/********************************************************************* + * + * Filename: timer.c + * Version: + * Description: + * Status: Experimental. + * Author: Dag Brattli + * Created at: Sat Aug 16 00:59:29 1997 + * Modified at: Wed Dec 8 12:50:34 1999 + * Modified by: Dag Brattli + * + * Copyright (c) 1997, 1999 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +extern int sysctl_slot_timeout; + +static void irlap_slot_timer_expired(void* data); +static void irlap_query_timer_expired(void* data); +static void irlap_final_timer_expired(void* data); +static void irlap_wd_timer_expired(void* data); +static void irlap_backoff_timer_expired(void* data); +static void irlap_media_busy_expired(void* data); + +void irlap_start_slot_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->slot_timer, timeout, (void *) self, + irlap_slot_timer_expired); +} + +void irlap_start_query_timer(struct irlap_cb *self, int S, int s) +{ + int timeout; + + /* Calculate when the peer discovery should end. Normally, we + * get the end-of-discovery frame, so this is just in case + * we miss it. + * Basically, we multiply the number of remaining slots by our + * slot time, plus add some extra time to properly receive the last + * discovery packet (which is longer due to extra discovery info), + * to avoid messing with for incomming connections requests and + * to accomodate devices that perform discovery slower than us. + * Jean II */ + timeout = ((sysctl_slot_timeout * HZ / 1000) * (S - s) + + XIDEXTRA_TIMEOUT + SMALLBUSY_TIMEOUT); + + /* Set or re-set the timer. We reset the timer for each received + * discovery query, which allow us to automatically adjust to + * the speed of the peer discovery (faster or slower). Jean II */ + irda_start_timer( &self->query_timer, timeout, (void *) self, + irlap_query_timer_expired); +} + +void irlap_start_final_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->final_timer, timeout, (void *) self, + irlap_final_timer_expired); +} + +void irlap_start_wd_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->wd_timer, timeout, (void *) self, + irlap_wd_timer_expired); +} + +void irlap_start_backoff_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->backoff_timer, timeout, (void *) self, + irlap_backoff_timer_expired); +} + +void irlap_start_mbusy_timer(struct irlap_cb *self, int timeout) +{ + irda_start_timer(&self->media_busy_timer, timeout, + (void *) self, irlap_media_busy_expired); +} + +void irlap_stop_mbusy_timer(struct irlap_cb *self) +{ + /* If timer is activated, kill it! */ + del_timer(&self->media_busy_timer); + + /* If we are in NDM, there is a bunch of events in LAP that + * that be pending due to the media_busy condition, such as + * CONNECT_REQUEST and SEND_UI_FRAME. If we don't generate + * an event, they will wait forever... + * Jean II */ + if (self->state == LAP_NDM) + irlap_do_event(self, MEDIA_BUSY_TIMER_EXPIRED, NULL, NULL); +} + +void irlmp_start_watchdog_timer(struct lsap_cb *self, int timeout) +{ + irda_start_timer(&self->watchdog_timer, timeout, (void *) self, + irlmp_watchdog_timer_expired); +} + +void irlmp_start_discovery_timer(struct irlmp_cb *self, int timeout) +{ + irda_start_timer(&self->discovery_timer, timeout, (void *) self, + irlmp_discovery_timer_expired); +} + +void irlmp_start_idle_timer(struct lap_cb *self, int timeout) +{ + irda_start_timer(&self->idle_timer, timeout, (void *) self, + irlmp_idle_timer_expired); +} + +void irlmp_stop_idle_timer(struct lap_cb *self) +{ + /* If timer is activated, kill it! 
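For readers wondering what the query-timer formula above works out to in practice, here is a worked example in plain C, in milliseconds rather than jiffies. The slot timeout and the two extra terms are placeholder values chosen for illustration, not the kernel's defaults.

#include <stdio.h>

int main(void)
{
	int slot_timeout_ms = 90;   /* per-slot time (placeholder)        */
	int S = 8, s = 3;           /* total slots, slots already elapsed */
	int xid_extra_ms = 100;     /* last (longer) discovery frame      */
	int small_busy_ms = 100;    /* margin for slow peers              */

	int timeout_ms = slot_timeout_ms * (S - s) + xid_extra_ms + small_busy_ms;

	printf("query timer: %d ms\n", timeout_ms); /* 90*5 + 200 = 650 ms */
	return 0;
}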
*/ + del_timer(&self->idle_timer); +} + +/* + * Function irlap_slot_timer_expired (data) + * + * IrLAP slot timer has expired + * + */ +static void irlap_slot_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, SLOT_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irlap_query_timer_expired (data) + * + * IrLAP query timer has expired + * + */ +static void irlap_query_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, QUERY_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irda_final_timer_expired (data) + * + * + * + */ +static void irlap_final_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, FINAL_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irda_wd_timer_expired (data) + * + * + * + */ +static void irlap_wd_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, WD_TIMER_EXPIRED, NULL, NULL); +} + +/* + * Function irda_backoff_timer_expired (data) + * + * + * + */ +static void irlap_backoff_timer_expired(void *data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + IRDA_ASSERT(self->magic == LAP_MAGIC, return;); + + irlap_do_event(self, BACKOFF_TIMER_EXPIRED, NULL, NULL); +} + + +/* + * Function irtty_media_busy_expired (data) + * + * + */ +void irlap_media_busy_expired(void* data) +{ + struct irlap_cb *self = (struct irlap_cb *) data; + + IRDA_ASSERT(self != NULL, return;); + + irda_device_set_media_busy(self->netdev, FALSE); + /* Note : the LAP event will be send in irlap_stop_mbusy_timer(), + * to catch other cases where the flag is cleared (for example + * after a discovery) - Jean II */ +} diff --git a/net/irda/wrapper.c b/net/irda/wrapper.c new file mode 100644 index 000000000000..87130c1c8693 --- /dev/null +++ b/net/irda/wrapper.c @@ -0,0 +1,491 @@ +/********************************************************************* + * + * Filename: wrapper.c + * Version: 1.2 + * Description: IrDA SIR async wrapper layer + * Status: Stable + * Author: Dag Brattli + * Created at: Mon Aug 4 20:40:53 1997 + * Modified at: Fri Jan 28 13:21:09 2000 + * Modified by: Dag Brattli + * Modified at: Fri May 28 3:11 CST 1999 + * Modified by: Horst von Brand + * + * Copyright (c) 1998-2000 Dag Brattli , + * All Rights Reserved. + * Copyright (c) 2000-2002 Jean Tourrilhes + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * Neither Dag Brattli nor University of Tromsø admit liability nor + * provide warranty for any of this software. This material is + * provided "AS-IS" and at no charge. 
+ * + ********************************************************************/ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/************************** FRAME WRAPPING **************************/ +/* + * Unwrap and unstuff SIR frames + * + * Note : at FIR and MIR, HDLC framing is used and usually handled + * by the controller, so we come here only for SIR... Jean II + */ + +/* + * Function stuff_byte (byte, buf) + * + * Byte stuff one single byte and put the result in buffer pointed to by + * buf. The buffer must at all times be able to have two bytes inserted. + * + * This is in a tight loop, better inline it, so need to be prior to callers. + * (2000 bytes on P6 200MHz, non-inlined ~370us, inline ~170us) - Jean II + */ +static inline int stuff_byte(__u8 byte, __u8 *buf) +{ + switch (byte) { + case BOF: /* FALLTHROUGH */ + case EOF: /* FALLTHROUGH */ + case CE: + /* Insert transparently coded */ + buf[0] = CE; /* Send link escape */ + buf[1] = byte^IRDA_TRANS; /* Complement bit 5 */ + return 2; + /* break; */ + default: + /* Non-special value, no transparency required */ + buf[0] = byte; + return 1; + /* break; */ + } +} + +/* + * Function async_wrap (skb, *tx_buff, buffsize) + * + * Makes a new buffer with wrapping and stuffing, should check that + * we don't get tx buffer overflow. + */ +int async_wrap_skb(struct sk_buff *skb, __u8 *tx_buff, int buffsize) +{ + struct irda_skb_cb *cb = (struct irda_skb_cb *) skb->cb; + int xbofs; + int i; + int n; + union { + __u16 value; + __u8 bytes[2]; + } fcs; + + /* Initialize variables */ + fcs.value = INIT_FCS; + n = 0; + + /* + * Send XBOF's for required min. turn time and for the negotiated + * additional XBOFS + */ + + if (cb->magic != LAP_MAGIC) { + /* + * This will happen for all frames sent from user-space. + * Nothing to worry about, but we set the default number of + * BOF's + */ + IRDA_DEBUG(1, "%s(), wrong magic in skb!\n", __FUNCTION__); + xbofs = 10; + } else + xbofs = cb->xbofs + cb->xbofs_delay; + + IRDA_DEBUG(4, "%s(), xbofs=%d\n", __FUNCTION__, xbofs); + + /* Check that we never use more than 115 + 48 xbofs */ + if (xbofs > 163) { + IRDA_DEBUG(0, "%s(), too many xbofs (%d)\n", __FUNCTION__, + xbofs); + xbofs = 163; + } + + memset(tx_buff + n, XBOF, xbofs); + n += xbofs; + + /* Start of packet character BOF */ + tx_buff[n++] = BOF; + + /* Insert frame and calc CRC */ + for (i=0; i < skb->len; i++) { + /* + * Check for the possibility of tx buffer overflow. We use + * bufsize-5 since the maximum number of bytes that can be + * transmitted after this point is 5. + */ + if(n >= (buffsize-5)) { + IRDA_ERROR("%s(), tx buffer overflow (n=%d)\n", + __FUNCTION__, n); + return n; + } + + n += stuff_byte(skb->data[i], tx_buff+n); + fcs.value = irda_fcs(fcs.value, skb->data[i]); + } + + /* Insert CRC in little endian format (LSB first) */ + fcs.value = ~fcs.value; +#ifdef __LITTLE_ENDIAN + n += stuff_byte(fcs.bytes[0], tx_buff+n); + n += stuff_byte(fcs.bytes[1], tx_buff+n); +#else /* ifdef __BIG_ENDIAN */ + n += stuff_byte(fcs.bytes[1], tx_buff+n); + n += stuff_byte(fcs.bytes[0], tx_buff+n); +#endif + tx_buff[n++] = EOF; + + return n; +} +EXPORT_SYMBOL(async_wrap_skb); + +/************************* FRAME UNWRAPPING *************************/ +/* + * Unwrap and unstuff SIR frames + * + * Complete rewrite by Jean II : + * More inline, faster, more compact, more logical. 
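The wrapping side above reduces to a small byte-stuffing rule; the standalone sketch below applies it to a short payload and frames the result with BOF/EOF. The control-byte values and the bit-5 transparency mask follow IrLAP async framing, but this is an illustration rather than the kernel code (no FCS is appended).

#include <stdint.h>
#include <stdio.h>

#define BOF        0xc0
#define EOF_BYTE   0xc1
#define CE         0x7d
#define IRDA_TRANS 0x20         /* complement bit 5 after an escape */

/* Escape any byte that collides with a control byte. */
static int stuff_byte(uint8_t byte, uint8_t *buf)
{
	if (byte == BOF || byte == EOF_BYTE || byte == CE) {
		buf[0] = CE;                 /* link escape              */
		buf[1] = byte ^ IRDA_TRANS;  /* transparently coded byte */
		return 2;
	}
	buf[0] = byte;
	return 1;
}

int main(void)
{
	const uint8_t payload[] = { 0x01, 0xc0, 0x7f };
	uint8_t tx[16];
	int n = 0;

	tx[n++] = BOF;                       /* start of frame */
	for (unsigned i = 0; i < sizeof(payload); i++)
		n += stuff_byte(payload[i], tx + n);
	tx[n++] = EOF_BYTE;                  /* end of frame   */

	for (int i = 0; i < n; i++)
		printf("%02x ", tx[i]);      /* c0 01 7d e0 7f c1 */
	printf("\n");
	return 0;
}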
Jean II + * (16 bytes on P6 200MHz, old 5 to 7 us, new 4 to 6 us) + * (24 bytes on P6 200MHz, old 9 to 10 us, new 7 to 8 us) + * (for reference, 115200 b/s is 1 byte every 69 us) + * And reduce wrapper.o by ~900B in the process ;-) + * + * Then, we have the addition of ZeroCopy, which is optional + * (i.e. the driver must initiate it) and improve final processing. + * (2005 B frame + EOF on P6 200MHz, without 30 to 50 us, with 10 to 25 us) + * + * Note : at FIR and MIR, HDLC framing is used and usually handled + * by the controller, so we come here only for SIR... Jean II + */ + +/* + * We can also choose where we want to do the CRC calculation. We can + * do it "inline", as we receive the bytes, or "postponed", when + * receiving the End-Of-Frame. + * (16 bytes on P6 200MHz, inlined 4 to 6 us, postponed 4 to 5 us) + * (24 bytes on P6 200MHz, inlined 7 to 8 us, postponed 5 to 7 us) + * With ZeroCopy : + * (2005 B frame on P6 200MHz, inlined 10 to 25 us, postponed 140 to 180 us) + * Without ZeroCopy : + * (2005 B frame on P6 200MHz, inlined 30 to 50 us, postponed 150 to 180 us) + * (Note : numbers taken with irq disabled) + * + * From those numbers, it's not clear which is the best strategy, because + * we end up running through a lot of data one way or another (i.e. cache + * misses). I personally prefer to avoid the huge latency spike of the + * "postponed" solution, because it come just at the time when we have + * lot's of protocol processing to do and it will hurt our ability to + * reach low link turnaround times... Jean II + */ +//#define POSTPONE_RX_CRC + +/* + * Function async_bump (buf, len, stats) + * + * Got a frame, make a copy of it, and pass it up the stack! We can try + * to inline it since it's only called from state_inside_frame + */ +static inline void +async_bump(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff) +{ + struct sk_buff *newskb; + struct sk_buff *dataskb; + int docopy; + + /* Check if we need to copy the data to a new skb or not. + * If the driver doesn't use ZeroCopy Rx, we have to do it. + * With ZeroCopy Rx, the rx_buff already point to a valid + * skb. But, if the frame is small, it is more efficient to + * copy it to save memory (copy will be fast anyway - that's + * called Rx-copy-break). Jean II */ + docopy = ((rx_buff->skb == NULL) || + (rx_buff->len < IRDA_RX_COPY_THRESHOLD)); + + /* Allocate a new skb */ + newskb = dev_alloc_skb(docopy ? rx_buff->len + 1 : rx_buff->truesize); + if (!newskb) { + stats->rx_dropped++; + /* We could deliver the current skb if doing ZeroCopy Rx, + * but this would stall the Rx path. Better drop the + * packet... Jean II */ + return; + } + + /* Align IP header to 20 bytes (i.e. increase skb->data) + * Note this is only useful with IrLAN, as PPP has a variable + * header size (2 or 1 bytes) - Jean II */ + skb_reserve(newskb, 1); + + if(docopy) { + /* Copy data without CRC (lenght already checked) */ + memcpy(newskb->data, rx_buff->data, rx_buff->len - 2); + /* Deliver this skb */ + dataskb = newskb; + } else { + /* We are using ZeroCopy. 
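A compressed restatement of the Rx-copy-break decision taken in async_bump(): copy small frames into a fresh buffer, hand large ones up in place and replace the receive buffer. The threshold constant below is illustrative, not necessarily the kernel's value.

#include <stdbool.h>
#include <stdio.h>

#define RX_COPY_THRESHOLD 256   /* illustrative copy-break threshold */

/* Copy if the driver gave us no preallocated buffer, or the frame is small. */
static bool should_copy(const void *preallocated_skb, int frame_len)
{
	return preallocated_skb == NULL || frame_len < RX_COPY_THRESHOLD;
}

int main(void)
{
	int dummy;

	printf("64-byte frame, zero-copy buffer: copy=%d\n",
	       should_copy(&dummy, 64));    /* 1: below threshold   */
	printf("2000-byte frame, zero-copy buffer: copy=%d\n",
	       should_copy(&dummy, 2000));  /* 0: deliver in place  */
	return 0;
}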
Deliver old skb */ + dataskb = rx_buff->skb; + /* And hook the new skb to the rx_buff */ + rx_buff->skb = newskb; + rx_buff->head = newskb->data; /* NOT newskb->head */ + //printk(KERN_DEBUG "ZeroCopy : len = %d, dataskb = %p, newskb = %p\n", rx_buff->len, dataskb, newskb); + } + + /* Set proper length on skb (without CRC) */ + skb_put(dataskb, rx_buff->len - 2); + + /* Feed it to IrLAP layer */ + dataskb->dev = dev; + dataskb->mac.raw = dataskb->data; + dataskb->protocol = htons(ETH_P_IRDA); + + netif_rx(dataskb); + + stats->rx_packets++; + stats->rx_bytes += rx_buff->len; + + /* Clean up rx_buff (redundant with async_unwrap_bof() ???) */ + rx_buff->data = rx_buff->head; + rx_buff->len = 0; +} + +/* + * Function async_unwrap_bof(dev, byte) + * + * Handle Beginning Of Frame character received within a frame + * + */ +static inline void +async_unwrap_bof(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(rx_buff->state) { + case LINK_ESCAPE: + case INSIDE_FRAME: + /* Not supposed to happen, the previous frame is not + * finished - Jean II */ + IRDA_DEBUG(1, "%s(), Discarding incomplete frame\n", + __FUNCTION__); + stats->rx_errors++; + stats->rx_missed_errors++; + irda_device_set_media_busy(dev, TRUE); + break; + + case OUTSIDE_FRAME: + case BEGIN_FRAME: + default: + /* We may receive multiple BOF at the start of frame */ + break; + } + + /* Now receiving frame */ + rx_buff->state = BEGIN_FRAME; + rx_buff->in_frame = TRUE; + + /* Time to initialize receive buffer */ + rx_buff->data = rx_buff->head; + rx_buff->len = 0; + rx_buff->fcs = INIT_FCS; +} + +/* + * Function async_unwrap_eof(dev, byte) + * + * Handle End Of Frame character received within a frame + * + */ +static inline void +async_unwrap_eof(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ +#ifdef POSTPONE_RX_CRC + int i; +#endif + + switch(rx_buff->state) { + case OUTSIDE_FRAME: + /* Probably missed the BOF */ + stats->rx_errors++; + stats->rx_missed_errors++; + irda_device_set_media_busy(dev, TRUE); + break; + + case BEGIN_FRAME: + case LINK_ESCAPE: + case INSIDE_FRAME: + default: + /* Note : in the case of BEGIN_FRAME and LINK_ESCAPE, + * the fcs will most likely not match and generate an + * error, as expected - Jean II */ + rx_buff->state = OUTSIDE_FRAME; + rx_buff->in_frame = FALSE; + +#ifdef POSTPONE_RX_CRC + /* If we haven't done the CRC as we receive bytes, we + * must do it now... Jean II */ + for(i = 0; i < rx_buff->len; i++) + rx_buff->fcs = irda_fcs(rx_buff->fcs, + rx_buff->data[i]); +#endif + + /* Test FCS and signal success if the frame is good */ + if (rx_buff->fcs == GOOD_FCS) { + /* Deliver frame */ + async_bump(dev, stats, rx_buff); + break; + } else { + /* Wrong CRC, discard frame! 
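For completeness, here is the receive direction reduced to a minimal de-stuffing state machine, assuming the same control bytes as the wrapping sketch earlier. It skips the FCS check and the media-busy handling that the real handlers above perform.

#include <stdint.h>
#include <stdio.h>

#define BOF        0xc0
#define EOF_BYTE   0xc1
#define CE         0x7d
#define IRDA_TRANS 0x20

enum state { OUTSIDE, INSIDE, ESCAPE };

struct unwrap { enum state state; uint8_t data[64]; int len; };

static void unwrap_byte(struct unwrap *u, uint8_t byte)
{
	if (byte == BOF) {               /* (re)start a frame         */
		u->state = INSIDE;
		u->len = 0;
		return;
	}
	if (u->state == OUTSIDE)
		return;                  /* noise between frames      */
	if (byte == EOF_BYTE) {
		printf("frame of %d bytes\n", u->len);
		u->state = OUTSIDE;
		return;
	}
	if (byte == CE) {
		u->state = ESCAPE;       /* next byte is stuffed      */
		return;
	}
	if (u->state == ESCAPE) {
		byte ^= IRDA_TRANS;      /* undo transparency coding  */
		u->state = INSIDE;
	}
	if (u->len < (int)sizeof(u->data))
		u->data[u->len++] = byte;
}

int main(void)
{
	struct unwrap u = { OUTSIDE, {0}, 0 };
	const uint8_t rx[] = { BOF, 0x01, CE, 0xe0, 0x7f, EOF_BYTE };

	for (unsigned i = 0; i < sizeof(rx); i++)
		unwrap_byte(&u, rx[i]);  /* prints: frame of 3 bytes */
	return 0;
}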
*/ + irda_device_set_media_busy(dev, TRUE); + + IRDA_DEBUG(1, "%s(), crc error\n", __FUNCTION__); + stats->rx_errors++; + stats->rx_crc_errors++; + } + break; + } +} + +/* + * Function async_unwrap_ce(dev, byte) + * + * Handle Character Escape character received within a frame + * + */ +static inline void +async_unwrap_ce(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(rx_buff->state) { + case OUTSIDE_FRAME: + /* Activate carrier sense */ + irda_device_set_media_busy(dev, TRUE); + break; + + case LINK_ESCAPE: + IRDA_WARNING("%s: state not defined\n", __FUNCTION__); + break; + + case BEGIN_FRAME: + case INSIDE_FRAME: + default: + /* Stuffed byte coming */ + rx_buff->state = LINK_ESCAPE; + break; + } +} + +/* + * Function async_unwrap_other(dev, byte) + * + * Handle other characters received within a frame + * + */ +static inline void +async_unwrap_other(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(rx_buff->state) { + /* This is on the critical path, case are ordered by + * probability (most frequent first) - Jean II */ + case INSIDE_FRAME: + /* Must be the next byte of the frame */ + if (rx_buff->len < rx_buff->truesize) { + rx_buff->data[rx_buff->len++] = byte; +#ifndef POSTPONE_RX_CRC + rx_buff->fcs = irda_fcs(rx_buff->fcs, byte); +#endif + } else { + IRDA_DEBUG(1, "%s(), Rx buffer overflow, aborting\n", + __FUNCTION__); + rx_buff->state = OUTSIDE_FRAME; + } + break; + + case LINK_ESCAPE: + /* + * Stuffed char, complement bit 5 of byte + * following CE, IrLAP p.114 + */ + byte ^= IRDA_TRANS; + if (rx_buff->len < rx_buff->truesize) { + rx_buff->data[rx_buff->len++] = byte; +#ifndef POSTPONE_RX_CRC + rx_buff->fcs = irda_fcs(rx_buff->fcs, byte); +#endif + rx_buff->state = INSIDE_FRAME; + } else { + IRDA_DEBUG(1, "%s(), Rx buffer overflow, aborting\n", + __FUNCTION__); + rx_buff->state = OUTSIDE_FRAME; + } + break; + + case OUTSIDE_FRAME: + /* Activate carrier sense */ + if(byte != XBOF) + irda_device_set_media_busy(dev, TRUE); + break; + + case BEGIN_FRAME: + default: + rx_buff->data[rx_buff->len++] = byte; +#ifndef POSTPONE_RX_CRC + rx_buff->fcs = irda_fcs(rx_buff->fcs, byte); +#endif + rx_buff->state = INSIDE_FRAME; + break; + } +} + +/* + * Function async_unwrap_char (dev, rx_buff, byte) + * + * Parse and de-stuff frame received from the IrDA-port + * + * This is the main entry point for SIR drivers. + */ +void async_unwrap_char(struct net_device *dev, + struct net_device_stats *stats, + iobuff_t *rx_buff, __u8 byte) +{ + switch(byte) { + case CE: + async_unwrap_ce(dev, stats, rx_buff, byte); + break; + case BOF: + async_unwrap_bof(dev, stats, rx_buff, byte); + break; + case EOF: + async_unwrap_eof(dev, stats, rx_buff, byte); + break; + default: + async_unwrap_other(dev, stats, rx_buff, byte); + break; + } +} +EXPORT_SYMBOL(async_unwrap_char); + diff --git a/net/key/Makefile b/net/key/Makefile new file mode 100644 index 000000000000..857608042475 --- /dev/null +++ b/net/key/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the key AF. +# + +obj-$(CONFIG_NET_KEY) += af_key.o diff --git a/net/key/af_key.c b/net/key/af_key.c new file mode 100644 index 000000000000..ce980aa94ed8 --- /dev/null +++ b/net/key/af_key.c @@ -0,0 +1,2903 @@ +/* + * net/key/af_key.c An implementation of PF_KEYv2 sockets. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Maxim Giryaev + * David S. Miller + * Alexey Kuznetsov + * Kunihiro Ishiguro + * Kazunori MIYAZAWA / USAGI Project + * Derek Atkins + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#define _X2KEY(x) ((x) == XFRM_INF ? 0 : (x)) +#define _KEY2X(x) ((x) == 0 ? XFRM_INF : (x)) + + +/* List of all pfkey sockets. */ +static HLIST_HEAD(pfkey_table); +static DECLARE_WAIT_QUEUE_HEAD(pfkey_table_wait); +static DEFINE_RWLOCK(pfkey_table_lock); +static atomic_t pfkey_table_users = ATOMIC_INIT(0); + +static atomic_t pfkey_socks_nr = ATOMIC_INIT(0); + +struct pfkey_sock { + /* struct sock must be the first member of struct pfkey_sock */ + struct sock sk; + int registered; + int promisc; +}; + +static inline struct pfkey_sock *pfkey_sk(struct sock *sk) +{ + return (struct pfkey_sock *)sk; +} + +static void pfkey_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive pfkey socket: %p\n", sk); + return; + } + + BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + + atomic_dec(&pfkey_socks_nr); +} + +static void pfkey_table_grab(void) +{ + write_lock_bh(&pfkey_table_lock); + + if (atomic_read(&pfkey_table_users)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&pfkey_table_wait, &wait); + for(;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&pfkey_table_users) == 0) + break; + write_unlock_bh(&pfkey_table_lock); + schedule(); + write_lock_bh(&pfkey_table_lock); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&pfkey_table_wait, &wait); + } +} + +static __inline__ void pfkey_table_ungrab(void) +{ + write_unlock_bh(&pfkey_table_lock); + wake_up(&pfkey_table_wait); +} + +static __inline__ void pfkey_lock_table(void) +{ + /* read_lock() synchronizes us to pfkey_table_grab */ + + read_lock(&pfkey_table_lock); + atomic_inc(&pfkey_table_users); + read_unlock(&pfkey_table_lock); +} + +static __inline__ void pfkey_unlock_table(void) +{ + if (atomic_dec_and_test(&pfkey_table_users)) + wake_up(&pfkey_table_wait); +} + + +static struct proto_ops pfkey_ops; + +static void pfkey_insert(struct sock *sk) +{ + pfkey_table_grab(); + sk_add_node(sk, &pfkey_table); + pfkey_table_ungrab(); +} + +static void pfkey_remove(struct sock *sk) +{ + pfkey_table_grab(); + sk_del_node_init(sk); + pfkey_table_ungrab(); +} + +static struct proto key_proto = { + .name = "KEY", + .owner = THIS_MODULE, + .obj_size = sizeof(struct pfkey_sock), +}; + +static int pfkey_create(struct socket *sock, int protocol) +{ + struct sock *sk; + int err; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + if (sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + if (protocol != PF_KEY_V2) + return -EPROTONOSUPPORT; + + err = -ENOMEM; + sk = sk_alloc(PF_KEY, GFP_KERNEL, &key_proto, 1); + if (sk == NULL) + goto out; + + sock->ops = &pfkey_ops; + sock_init_data(sock, sk); + + sk->sk_family = PF_KEY; + sk->sk_destruct = pfkey_sock_destruct; + + atomic_inc(&pfkey_socks_nr); + + pfkey_insert(sk); + + return 0; +out: + return err; +} + +static int pfkey_release(struct socket *sock) +{ + struct sock *sk = 
sock->sk; + + if (!sk) + return 0; + + pfkey_remove(sk); + + sock_orphan(sk); + sock->sk = NULL; + skb_queue_purge(&sk->sk_write_queue); + sock_put(sk); + + return 0; +} + +static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, + int allocation, struct sock *sk) +{ + int err = -ENOBUFS; + + sock_hold(sk); + if (*skb2 == NULL) { + if (atomic_read(&skb->users) != 1) { + *skb2 = skb_clone(skb, allocation); + } else { + *skb2 = skb; + atomic_inc(&skb->users); + } + } + if (*skb2 != NULL) { + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) { + skb_orphan(*skb2); + skb_set_owner_r(*skb2, sk); + skb_queue_tail(&sk->sk_receive_queue, *skb2); + sk->sk_data_ready(sk, (*skb2)->len); + *skb2 = NULL; + err = 0; + } + } + sock_put(sk); + return err; +} + +/* Send SKB to all pfkey sockets matching selected criteria. */ +#define BROADCAST_ALL 0 +#define BROADCAST_ONE 1 +#define BROADCAST_REGISTERED 2 +#define BROADCAST_PROMISC_ONLY 4 +static int pfkey_broadcast(struct sk_buff *skb, int allocation, + int broadcast_flags, struct sock *one_sk) +{ + struct sock *sk; + struct hlist_node *node; + struct sk_buff *skb2 = NULL; + int err = -ESRCH; + + /* XXX Do we need something like netlink_overrun? I think + * XXX PF_KEY socket apps will not mind current behavior. + */ + if (!skb) + return -ENOMEM; + + pfkey_lock_table(); + sk_for_each(sk, node, &pfkey_table) { + struct pfkey_sock *pfk = pfkey_sk(sk); + int err2; + + /* Yes, it means that if you are meant to receive this + * pfkey message you receive it twice as promiscuous + * socket. + */ + if (pfk->promisc) + pfkey_broadcast_one(skb, &skb2, allocation, sk); + + /* the exact target will be processed later */ + if (sk == one_sk) + continue; + if (broadcast_flags != BROADCAST_ALL) { + if (broadcast_flags & BROADCAST_PROMISC_ONLY) + continue; + if ((broadcast_flags & BROADCAST_REGISTERED) && + !pfk->registered) + continue; + if (broadcast_flags & BROADCAST_ONE) + continue; + } + + err2 = pfkey_broadcast_one(skb, &skb2, allocation, sk); + + /* Error is cleare after succecful sending to at least one + * registered KM */ + if ((broadcast_flags & BROADCAST_REGISTERED) && err) + err = err2; + } + pfkey_unlock_table(); + + if (one_sk != NULL) + err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); + + if (skb2) + kfree_skb(skb2); + kfree_skb(skb); + return err; +} + +static inline void pfkey_hdr_dup(struct sadb_msg *new, struct sadb_msg *orig) +{ + *new = *orig; +} + +static int pfkey_error(struct sadb_msg *orig, int err, struct sock *sk) +{ + struct sk_buff *skb = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); + struct sadb_msg *hdr; + + if (!skb) + return -ENOBUFS; + + /* Woe be to the platform trying to support PFKEY yet + * having normal errnos outside the 1-255 range, inclusive. 
+ */ + err = -err; + if (err == ERESTARTSYS || + err == ERESTARTNOHAND || + err == ERESTARTNOINTR) + err = EINTR; + if (err >= 512) + err = EINVAL; + if (err <= 0 || err >= 256) + BUG(); + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + pfkey_hdr_dup(hdr, orig); + hdr->sadb_msg_errno = (uint8_t) err; + hdr->sadb_msg_len = (sizeof(struct sadb_msg) / + sizeof(uint64_t)); + + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk); + + return 0; +} + +static u8 sadb_ext_min_len[] = { + [SADB_EXT_RESERVED] = (u8) 0, + [SADB_EXT_SA] = (u8) sizeof(struct sadb_sa), + [SADB_EXT_LIFETIME_CURRENT] = (u8) sizeof(struct sadb_lifetime), + [SADB_EXT_LIFETIME_HARD] = (u8) sizeof(struct sadb_lifetime), + [SADB_EXT_LIFETIME_SOFT] = (u8) sizeof(struct sadb_lifetime), + [SADB_EXT_ADDRESS_SRC] = (u8) sizeof(struct sadb_address), + [SADB_EXT_ADDRESS_DST] = (u8) sizeof(struct sadb_address), + [SADB_EXT_ADDRESS_PROXY] = (u8) sizeof(struct sadb_address), + [SADB_EXT_KEY_AUTH] = (u8) sizeof(struct sadb_key), + [SADB_EXT_KEY_ENCRYPT] = (u8) sizeof(struct sadb_key), + [SADB_EXT_IDENTITY_SRC] = (u8) sizeof(struct sadb_ident), + [SADB_EXT_IDENTITY_DST] = (u8) sizeof(struct sadb_ident), + [SADB_EXT_SENSITIVITY] = (u8) sizeof(struct sadb_sens), + [SADB_EXT_PROPOSAL] = (u8) sizeof(struct sadb_prop), + [SADB_EXT_SUPPORTED_AUTH] = (u8) sizeof(struct sadb_supported), + [SADB_EXT_SUPPORTED_ENCRYPT] = (u8) sizeof(struct sadb_supported), + [SADB_EXT_SPIRANGE] = (u8) sizeof(struct sadb_spirange), + [SADB_X_EXT_KMPRIVATE] = (u8) sizeof(struct sadb_x_kmprivate), + [SADB_X_EXT_POLICY] = (u8) sizeof(struct sadb_x_policy), + [SADB_X_EXT_SA2] = (u8) sizeof(struct sadb_x_sa2), + [SADB_X_EXT_NAT_T_TYPE] = (u8) sizeof(struct sadb_x_nat_t_type), + [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), + [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), + [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), +}; + +/* Verify sadb_address_{len,prefixlen} against sa_family. */ +static int verify_address_len(void *p) +{ + struct sadb_address *sp = p; + struct sockaddr *addr = (struct sockaddr *)(sp + 1); + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int len; + + switch (addr->sa_family) { + case AF_INET: + len = sizeof(*sp) + sizeof(*sin) + (sizeof(uint64_t) - 1); + len /= sizeof(uint64_t); + if (sp->sadb_address_len != len || + sp->sadb_address_prefixlen > 32) + return -EINVAL; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + len = sizeof(*sp) + sizeof(*sin6) + (sizeof(uint64_t) - 1); + len /= sizeof(uint64_t); + if (sp->sadb_address_len != len || + sp->sadb_address_prefixlen > 128) + return -EINVAL; + break; +#endif + default: + /* It is user using kernel to keep track of security + * associations for another protocol, such as + * OSPF/RSVP/RIPV2/MIP. It is user's job to verify + * lengths. + * + * XXX Actually, association/policy database is not yet + * XXX able to cope with arbitrary sockaddr families. + * XXX When it can, remove this -EINVAL. 
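The extension-length convention used here, with lengths counted in 64-bit units and rounded up, is easy to get wrong, so a worked example may help. The structure sizes are hard-coded with their usual Linux values purely for illustration.

#include <stdio.h>

int main(void)
{
	int sadb_address_sz = 8;    /* sizeof(struct sadb_address), typical */
	int sockaddr_in_sz  = 16;   /* sizeof(struct sockaddr_in), typical  */
	int sockaddr_in6_sz = 28;   /* sizeof(struct sockaddr_in6), typical */

	/* Round up to a multiple of 8, then convert to 64-bit words. */
	int len4 = (sadb_address_sz + sockaddr_in_sz  + 7) / 8;
	int len6 = (sadb_address_sz + sockaddr_in6_sz + 7) / 8;

	printf("expected sadb_address_len: AF_INET=%d, AF_INET6=%d\n",
	       len4, len6);          /* 3 and 5 64-bit words */
	return 0;
}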
-DaveM + */ + return -EINVAL; + break; + }; + + return 0; +} + +static int present_and_same_family(struct sadb_address *src, + struct sadb_address *dst) +{ + struct sockaddr *s_addr, *d_addr; + + if (!src || !dst) + return 0; + + s_addr = (struct sockaddr *)(src + 1); + d_addr = (struct sockaddr *)(dst + 1); + if (s_addr->sa_family != d_addr->sa_family) + return 0; + if (s_addr->sa_family != AF_INET +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + && s_addr->sa_family != AF_INET6 +#endif + ) + return 0; + + return 1; +} + +static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + char *p = (char *) hdr; + int len = skb->len; + + len -= sizeof(*hdr); + p += sizeof(*hdr); + while (len > 0) { + struct sadb_ext *ehdr = (struct sadb_ext *) p; + uint16_t ext_type; + int ext_len; + + ext_len = ehdr->sadb_ext_len; + ext_len *= sizeof(uint64_t); + ext_type = ehdr->sadb_ext_type; + if (ext_len < sizeof(uint64_t) || + ext_len > len || + ext_type == SADB_EXT_RESERVED) + return -EINVAL; + + if (ext_type <= SADB_EXT_MAX) { + int min = (int) sadb_ext_min_len[ext_type]; + if (ext_len < min) + return -EINVAL; + if (ext_hdrs[ext_type-1] != NULL) + return -EINVAL; + if (ext_type == SADB_EXT_ADDRESS_SRC || + ext_type == SADB_EXT_ADDRESS_DST || + ext_type == SADB_EXT_ADDRESS_PROXY || + ext_type == SADB_X_EXT_NAT_T_OA) { + if (verify_address_len(p)) + return -EINVAL; + } + ext_hdrs[ext_type-1] = p; + } + p += ext_len; + len -= ext_len; + } + + return 0; +} + +static uint16_t +pfkey_satype2proto(uint8_t satype) +{ + switch (satype) { + case SADB_SATYPE_UNSPEC: + return IPSEC_PROTO_ANY; + case SADB_SATYPE_AH: + return IPPROTO_AH; + case SADB_SATYPE_ESP: + return IPPROTO_ESP; + case SADB_X_SATYPE_IPCOMP: + return IPPROTO_COMP; + break; + default: + return 0; + } + /* NOTREACHED */ +} + +static uint8_t +pfkey_proto2satype(uint16_t proto) +{ + switch (proto) { + case IPPROTO_AH: + return SADB_SATYPE_AH; + case IPPROTO_ESP: + return SADB_SATYPE_ESP; + case IPPROTO_COMP: + return SADB_X_SATYPE_IPCOMP; + break; + default: + return 0; + } + /* NOTREACHED */ +} + +/* BTW, this scheme means that there is no way with PFKEY2 sockets to + * say specifically 'just raw sockets' as we encode them as 255. + */ + +static uint8_t pfkey_proto_to_xfrm(uint8_t proto) +{ + return (proto == IPSEC_PROTO_ANY ? 0 : proto); +} + +static uint8_t pfkey_proto_from_xfrm(uint8_t proto) +{ + return (proto ? 
proto : IPSEC_PROTO_ANY); +} + +static int pfkey_sadb_addr2xfrm_addr(struct sadb_address *addr, + xfrm_address_t *xaddr) +{ + switch (((struct sockaddr*)(addr + 1))->sa_family) { + case AF_INET: + xaddr->a4 = + ((struct sockaddr_in *)(addr + 1))->sin_addr.s_addr; + return AF_INET; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + memcpy(xaddr->a6, + &((struct sockaddr_in6 *)(addr + 1))->sin6_addr, + sizeof(struct in6_addr)); + return AF_INET6; +#endif + default: + return 0; + } + /* NOTREACHED */ +} + +static struct xfrm_state *pfkey_xfrm_state_lookup(struct sadb_msg *hdr, void **ext_hdrs) +{ + struct sadb_sa *sa; + struct sadb_address *addr; + uint16_t proto; + unsigned short family; + xfrm_address_t *xaddr; + + sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1]; + if (sa == NULL) + return NULL; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return NULL; + + /* sadb_address_len should be checked by caller */ + addr = (struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1]; + if (addr == NULL) + return NULL; + + family = ((struct sockaddr *)(addr + 1))->sa_family; + switch (family) { + case AF_INET: + xaddr = (xfrm_address_t *)&((struct sockaddr_in *)(addr + 1))->sin_addr; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + xaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(addr + 1))->sin6_addr; + break; +#endif + default: + xaddr = NULL; + } + + if (!xaddr) + return NULL; + + return xfrm_state_lookup(xaddr, sa->sadb_sa_spi, proto, family); +} + +#define PFKEY_ALIGN8(a) (1 + (((a) - 1) | (8 - 1))) +static int +pfkey_sockaddr_size(sa_family_t family) +{ + switch (family) { + case AF_INET: + return PFKEY_ALIGN8(sizeof(struct sockaddr_in)); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + return PFKEY_ALIGN8(sizeof(struct sockaddr_in6)); +#endif + default: + return 0; + } + /* NOTREACHED */ +} + +static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, int hsc) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_sa *sa; + struct sadb_lifetime *lifetime; + struct sadb_address *addr; + struct sadb_key *key; + struct sadb_x_sa2 *sa2; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int size; + int auth_key_size = 0; + int encrypt_key_size = 0; + int sockaddr_size; + struct xfrm_encap_tmpl *natt = NULL; + + /* address family check */ + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return ERR_PTR(-EINVAL); + + /* base, SA, (lifetime (HSC),) address(SD), (address(P),) + key(AE), (identity(SD),) (sensitivity)> */ + size = sizeof(struct sadb_msg) +sizeof(struct sadb_sa) + + sizeof(struct sadb_lifetime) + + ((hsc & 1) ? sizeof(struct sadb_lifetime) : 0) + + ((hsc & 2) ? 
sizeof(struct sadb_lifetime) : 0) + + sizeof(struct sadb_address)*2 + + sockaddr_size*2 + + sizeof(struct sadb_x_sa2); + /* identity & sensitivity */ + + if ((x->props.family == AF_INET && + x->sel.saddr.a4 != x->props.saddr.a4) +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + || (x->props.family == AF_INET6 && + memcmp (x->sel.saddr.a6, x->props.saddr.a6, sizeof (struct in6_addr))) +#endif + ) + size += sizeof(struct sadb_address) + sockaddr_size; + + if (add_keys) { + if (x->aalg && x->aalg->alg_key_len) { + auth_key_size = + PFKEY_ALIGN8((x->aalg->alg_key_len + 7) / 8); + size += sizeof(struct sadb_key) + auth_key_size; + } + if (x->ealg && x->ealg->alg_key_len) { + encrypt_key_size = + PFKEY_ALIGN8((x->ealg->alg_key_len+7) / 8); + size += sizeof(struct sadb_key) + encrypt_key_size; + } + } + if (x->encap) + natt = x->encap; + + if (natt && natt->encap_type) { + size += sizeof(struct sadb_x_nat_t_type); + size += sizeof(struct sadb_x_nat_t_port); + size += sizeof(struct sadb_x_nat_t_port); + } + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + /* call should fill header later */ + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + memset(hdr, 0, size); /* XXX do we need this ? */ + hdr->sadb_msg_len = size / sizeof(uint64_t); + + /* sa */ + sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa)); + sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); + sa->sadb_sa_exttype = SADB_EXT_SA; + sa->sadb_sa_spi = x->id.spi; + sa->sadb_sa_replay = x->props.replay_window; + sa->sadb_sa_state = SADB_SASTATE_DYING; + if (x->km.state == XFRM_STATE_VALID && !x->km.dying) + sa->sadb_sa_state = SADB_SASTATE_MATURE; + else if (x->km.state == XFRM_STATE_ACQ) + sa->sadb_sa_state = SADB_SASTATE_LARVAL; + else if (x->km.state == XFRM_STATE_EXPIRED) + sa->sadb_sa_state = SADB_SASTATE_DEAD; + sa->sadb_sa_auth = 0; + if (x->aalg) { + struct xfrm_algo_desc *a = xfrm_aalg_get_byname(x->aalg->alg_name, 0); + sa->sadb_sa_auth = a ? a->desc.sadb_alg_id : 0; + } + sa->sadb_sa_encrypt = 0; + BUG_ON(x->ealg && x->calg); + if (x->ealg) { + struct xfrm_algo_desc *a = xfrm_ealg_get_byname(x->ealg->alg_name, 0); + sa->sadb_sa_encrypt = a ? a->desc.sadb_alg_id : 0; + } + /* KAME compatible: sadb_sa_encrypt is overloaded with calg id */ + if (x->calg) { + struct xfrm_algo_desc *a = xfrm_calg_get_byname(x->calg->alg_name, 0); + sa->sadb_sa_encrypt = a ? 
a->desc.sadb_alg_id : 0; + } + + sa->sadb_sa_flags = 0; + if (x->props.flags & XFRM_STATE_NOECN) + sa->sadb_sa_flags |= SADB_SAFLAGS_NOECN; + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + sa->sadb_sa_flags |= SADB_SAFLAGS_DECAP_DSCP; + + /* hard time */ + if (hsc & 2) { + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; + lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.hard_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.hard_byte_limit); + lifetime->sadb_lifetime_addtime = x->lft.hard_add_expires_seconds; + lifetime->sadb_lifetime_usetime = x->lft.hard_use_expires_seconds; + } + /* soft time */ + if (hsc & 1) { + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; + lifetime->sadb_lifetime_allocations = _X2KEY(x->lft.soft_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(x->lft.soft_byte_limit); + lifetime->sadb_lifetime_addtime = x->lft.soft_add_expires_seconds; + lifetime->sadb_lifetime_usetime = x->lft.soft_use_expires_seconds; + } + /* current time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; + lifetime->sadb_lifetime_allocations = x->curlft.packets; + lifetime->sadb_lifetime_bytes = x->curlft.bytes; + lifetime->sadb_lifetime_addtime = x->curlft.add_time; + lifetime->sadb_lifetime_usetime = x->curlft.use_time; + /* src address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + /* "if the ports are non-zero, then the sadb_address_proto field, + normally zero, MUST be filled in with the transport + protocol's number." 
- RFC2367 */ + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->props.saddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, x->props.saddr.a6, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_prefixlen = 32; /* XXX */ + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->id.daddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + + if (x->sel.saddr.a4 != x->props.saddr.a4) { + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; + addr->sadb_address_proto = + pfkey_proto_from_xfrm(x->sel.proto); + addr->sadb_address_prefixlen = x->sel.prefixlen_s; + addr->sadb_address_reserved = 0; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->sel.saddr.a4; + sin->sin_port = x->sel.sport; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, x->id.daddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + + if (memcmp (x->sel.saddr.a6, x->props.saddr.a6, + sizeof(struct in6_addr))) { + addr = (struct sadb_address *) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_PROXY; + addr->sadb_address_proto = + pfkey_proto_from_xfrm(x->sel.proto); + addr->sadb_address_prefixlen = x->sel.prefixlen_s; + addr->sadb_address_reserved = 0; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = x->sel.sport; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, x->sel.saddr.a6, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } + } +#endif + else + BUG(); + + /* auth key */ + if (add_keys && auth_key_size) { + key = (struct sadb_key *) skb_put(skb, + sizeof(struct sadb_key)+auth_key_size); + key->sadb_key_len = (sizeof(struct sadb_key) + auth_key_size) / + sizeof(uint64_t); + key->sadb_key_exttype = SADB_EXT_KEY_AUTH; + key->sadb_key_bits = x->aalg->alg_key_len; + key->sadb_key_reserved = 0; + memcpy(key + 1, x->aalg->alg_key, (x->aalg->alg_key_len+7)/8); + } + /* encrypt key */ + if (add_keys && 
encrypt_key_size) { + key = (struct sadb_key *) skb_put(skb, + sizeof(struct sadb_key)+encrypt_key_size); + key->sadb_key_len = (sizeof(struct sadb_key) + + encrypt_key_size) / sizeof(uint64_t); + key->sadb_key_exttype = SADB_EXT_KEY_ENCRYPT; + key->sadb_key_bits = x->ealg->alg_key_len; + key->sadb_key_reserved = 0; + memcpy(key + 1, x->ealg->alg_key, + (x->ealg->alg_key_len+7)/8); + } + + /* sa */ + sa2 = (struct sadb_x_sa2 *) skb_put(skb, sizeof(struct sadb_x_sa2)); + sa2->sadb_x_sa2_len = sizeof(struct sadb_x_sa2)/sizeof(uint64_t); + sa2->sadb_x_sa2_exttype = SADB_X_EXT_SA2; + sa2->sadb_x_sa2_mode = x->props.mode + 1; + sa2->sadb_x_sa2_reserved1 = 0; + sa2->sadb_x_sa2_reserved2 = 0; + sa2->sadb_x_sa2_sequence = 0; + sa2->sadb_x_sa2_reqid = x->props.reqid; + + if (natt && natt->encap_type) { + struct sadb_x_nat_t_type *n_type; + struct sadb_x_nat_t_port *n_port; + + /* type */ + n_type = (struct sadb_x_nat_t_type*) skb_put(skb, sizeof(*n_type)); + n_type->sadb_x_nat_t_type_len = sizeof(*n_type)/sizeof(uint64_t); + n_type->sadb_x_nat_t_type_exttype = SADB_X_EXT_NAT_T_TYPE; + n_type->sadb_x_nat_t_type_type = natt->encap_type; + n_type->sadb_x_nat_t_type_reserved[0] = 0; + n_type->sadb_x_nat_t_type_reserved[1] = 0; + n_type->sadb_x_nat_t_type_reserved[2] = 0; + + /* source port */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + /* dest port */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_dport; + n_port->sadb_x_nat_t_port_reserved = 0; + } + + return skb; +} + +static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, + void **ext_hdrs) +{ + struct xfrm_state *x; + struct sadb_lifetime *lifetime; + struct sadb_sa *sa; + struct sadb_key *key; + uint16_t proto; + int err; + + + sa = (struct sadb_sa *) ext_hdrs[SADB_EXT_SA-1]; + if (!sa || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return ERR_PTR(-EINVAL); + if (hdr->sadb_msg_satype == SADB_SATYPE_ESP && + !ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]) + return ERR_PTR(-EINVAL); + if (hdr->sadb_msg_satype == SADB_SATYPE_AH && + !ext_hdrs[SADB_EXT_KEY_AUTH-1]) + return ERR_PTR(-EINVAL); + if (!!ext_hdrs[SADB_EXT_LIFETIME_HARD-1] != + !!ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) + return ERR_PTR(-EINVAL); + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return ERR_PTR(-EINVAL); + + /* default error is no buffer space */ + err = -ENOBUFS; + + /* RFC2367: + + Only SADB_SASTATE_MATURE SAs may be submitted in an SADB_ADD message. + SADB_SASTATE_LARVAL SAs are created by SADB_GETSPI and it is not + sensible to add a new SA in the DYING or SADB_SASTATE_DEAD state. + Therefore, the sadb_sa_state field of all submitted SAs MUST be + SADB_SASTATE_MATURE and the kernel MUST return an error if this is + not true. + + However, KAME setkey always uses SADB_SASTATE_LARVAL. + Hence, we have to _ignore_ sadb_sa_state, which is also reasonable. 
+ */ + if (sa->sadb_sa_auth > SADB_AALG_MAX || + (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP && + sa->sadb_sa_encrypt > SADB_X_CALG_MAX) || + sa->sadb_sa_encrypt > SADB_EALG_MAX) + return ERR_PTR(-EINVAL); + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; + if (key != NULL && + sa->sadb_sa_auth != SADB_X_AALG_NULL && + ((key->sadb_key_bits+7) / 8 == 0 || + (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t))) + return ERR_PTR(-EINVAL); + key = ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; + if (key != NULL && + sa->sadb_sa_encrypt != SADB_EALG_NULL && + ((key->sadb_key_bits+7) / 8 == 0 || + (key->sadb_key_bits+7) / 8 > key->sadb_key_len * sizeof(uint64_t))) + return ERR_PTR(-EINVAL); + + x = xfrm_state_alloc(); + if (x == NULL) + return ERR_PTR(-ENOBUFS); + + x->id.proto = proto; + x->id.spi = sa->sadb_sa_spi; + x->props.replay_window = sa->sadb_sa_replay; + if (sa->sadb_sa_flags & SADB_SAFLAGS_NOECN) + x->props.flags |= XFRM_STATE_NOECN; + if (sa->sadb_sa_flags & SADB_SAFLAGS_DECAP_DSCP) + x->props.flags |= XFRM_STATE_DECAP_DSCP; + + lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_HARD-1]; + if (lifetime != NULL) { + x->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + x->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + x->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; + x->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + lifetime = (struct sadb_lifetime*) ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]; + if (lifetime != NULL) { + x->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + x->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; + x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; + if (sa->sadb_sa_auth) { + int keysize = 0; + struct xfrm_algo_desc *a = xfrm_aalg_get_byid(sa->sadb_sa_auth); + if (!a) { + err = -ENOSYS; + goto out; + } + if (key) + keysize = (key->sadb_key_bits + 7) / 8; + x->aalg = kmalloc(sizeof(*x->aalg) + keysize, GFP_KERNEL); + if (!x->aalg) + goto out; + strcpy(x->aalg->alg_name, a->name); + x->aalg->alg_key_len = 0; + if (key) { + x->aalg->alg_key_len = key->sadb_key_bits; + memcpy(x->aalg->alg_key, key+1, keysize); + } + x->props.aalgo = sa->sadb_sa_auth; + /* x->algo.flags = sa->sadb_sa_flags; */ + } + if (sa->sadb_sa_encrypt) { + if (hdr->sadb_msg_satype == SADB_X_SATYPE_IPCOMP) { + struct xfrm_algo_desc *a = xfrm_calg_get_byid(sa->sadb_sa_encrypt); + if (!a) { + err = -ENOSYS; + goto out; + } + x->calg = kmalloc(sizeof(*x->calg), GFP_KERNEL); + if (!x->calg) + goto out; + strcpy(x->calg->alg_name, a->name); + x->props.calgo = sa->sadb_sa_encrypt; + } else { + int keysize = 0; + struct xfrm_algo_desc *a = xfrm_ealg_get_byid(sa->sadb_sa_encrypt); + if (!a) { + err = -ENOSYS; + goto out; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_ENCRYPT-1]; + if (key) + keysize = (key->sadb_key_bits + 7) / 8; + x->ealg = kmalloc(sizeof(*x->ealg) + keysize, GFP_KERNEL); + if (!x->ealg) + goto out; + strcpy(x->ealg->alg_name, a->name); + x->ealg->alg_key_len = 0; + if (key) { + x->ealg->alg_key_len = key->sadb_key_bits; + memcpy(x->ealg->alg_key, key+1, keysize); + } + x->props.ealgo = sa->sadb_sa_encrypt; + } + } + /* x->algo.flags = sa->sadb_sa_flags; */ + + x->props.family = pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + &x->props.saddr); + if (!x->props.family) { 
+ err = -EAFNOSUPPORT; + goto out; + } + pfkey_sadb_addr2xfrm_addr((struct sadb_address *) ext_hdrs[SADB_EXT_ADDRESS_DST-1], + &x->id.daddr); + + if (ext_hdrs[SADB_X_EXT_SA2-1]) { + struct sadb_x_sa2 *sa2 = (void*)ext_hdrs[SADB_X_EXT_SA2-1]; + x->props.mode = sa2->sadb_x_sa2_mode; + if (x->props.mode) + x->props.mode--; + x->props.reqid = sa2->sadb_x_sa2_reqid; + } + + if (ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]) { + struct sadb_address *addr = ext_hdrs[SADB_EXT_ADDRESS_PROXY-1]; + + /* Nobody uses this, but we try. */ + x->sel.family = pfkey_sadb_addr2xfrm_addr(addr, &x->sel.saddr); + x->sel.prefixlen_s = addr->sadb_address_prefixlen; + } + + if (ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]) { + struct sadb_x_nat_t_type* n_type; + struct xfrm_encap_tmpl *natt; + + x->encap = kmalloc(sizeof(*x->encap), GFP_KERNEL); + if (!x->encap) + goto out; + + natt = x->encap; + n_type = ext_hdrs[SADB_X_EXT_NAT_T_TYPE-1]; + natt->encap_type = n_type->sadb_x_nat_t_type_type; + + if (ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]) { + struct sadb_x_nat_t_port* n_port = + ext_hdrs[SADB_X_EXT_NAT_T_SPORT-1]; + natt->encap_sport = n_port->sadb_x_nat_t_port_port; + } + if (ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]) { + struct sadb_x_nat_t_port* n_port = + ext_hdrs[SADB_X_EXT_NAT_T_DPORT-1]; + natt->encap_dport = n_port->sadb_x_nat_t_port_port; + } + } + + x->type = xfrm_get_type(proto, x->props.family); + if (x->type == NULL) { + err = -ENOPROTOOPT; + goto out; + } + if (x->type->init_state(x, NULL)) { + err = -EINVAL; + goto out; + } + x->km.seq = hdr->sadb_msg_seq; + x->km.state = XFRM_STATE_VALID; + return x; + +out: + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + return ERR_PTR(err); +} + +static int pfkey_reserved(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + return -EOPNOTSUPP; +} + +static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct sk_buff *resp_skb; + struct sadb_x_sa2 *sa2; + struct sadb_address *saddr, *daddr; + struct sadb_msg *out_hdr; + struct xfrm_state *x = NULL; + u8 mode; + u32 reqid; + u8 proto; + unsigned short family; + xfrm_address_t *xsaddr = NULL, *xdaddr = NULL; + + if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return -EINVAL; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return -EINVAL; + + if ((sa2 = ext_hdrs[SADB_X_EXT_SA2-1]) != NULL) { + mode = sa2->sadb_x_sa2_mode - 1; + reqid = sa2->sadb_x_sa2_reqid; + } else { + mode = 0; + reqid = 0; + } + + saddr = ext_hdrs[SADB_EXT_ADDRESS_SRC-1]; + daddr = ext_hdrs[SADB_EXT_ADDRESS_DST-1]; + + family = ((struct sockaddr *)(saddr + 1))->sa_family; + switch (family) { + case AF_INET: + xdaddr = (xfrm_address_t *)&((struct sockaddr_in *)(daddr + 1))->sin_addr.s_addr; + xsaddr = (xfrm_address_t *)&((struct sockaddr_in *)(saddr + 1))->sin_addr.s_addr; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + xdaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(daddr + 1))->sin6_addr; + xsaddr = (xfrm_address_t *)&((struct sockaddr_in6 *)(saddr + 1))->sin6_addr; + break; +#endif + } + + if (hdr->sadb_msg_seq) { + x = xfrm_find_acq_byseq(hdr->sadb_msg_seq); + if (x && xfrm_addr_cmp(&x->id.daddr, xdaddr, family)) { + xfrm_state_put(x); + x = NULL; + } + } + + if (!x) + x = xfrm_find_acq(mode, reqid, proto, xdaddr, xsaddr, 1, family); + + if (x == NULL) + return -ENOENT; + + resp_skb = ERR_PTR(-ENOENT); + + spin_lock_bh(&x->lock); + if (x->km.state != XFRM_STATE_DEAD) 
{ + struct sadb_spirange *range = ext_hdrs[SADB_EXT_SPIRANGE-1]; + u32 min_spi, max_spi; + + if (range != NULL) { + min_spi = range->sadb_spirange_min; + max_spi = range->sadb_spirange_max; + } else { + min_spi = 0x100; + max_spi = 0x0fffffff; + } + xfrm_alloc_spi(x, htonl(min_spi), htonl(max_spi)); + if (x->id.spi) + resp_skb = pfkey_xfrm_state2msg(x, 0, 3); + } + spin_unlock_bh(&x->lock); + + if (IS_ERR(resp_skb)) { + xfrm_state_put(x); + return PTR_ERR(resp_skb); + } + + out_hdr = (struct sadb_msg *) resp_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_GETSPI; + out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + + xfrm_state_put(x); + + pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk); + + return 0; +} + +static int pfkey_acquire(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct xfrm_state *x; + + if (hdr->sadb_msg_len != sizeof(struct sadb_msg)/8) + return -EOPNOTSUPP; + + if (hdr->sadb_msg_seq == 0 || hdr->sadb_msg_errno == 0) + return 0; + + x = xfrm_find_acq_byseq(hdr->sadb_msg_seq); + if (x == NULL) + return 0; + + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_ACQ) { + x->km.state = XFRM_STATE_ERROR; + wake_up(&km_waitq); + } + spin_unlock_bh(&x->lock); + xfrm_state_put(x); + return 0; +} + + +static int pfkey_add(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + struct xfrm_state *x; + int err; + + xfrm_probe_algs(); + + x = pfkey_msg2xfrm_state(hdr, ext_hdrs); + if (IS_ERR(x)) + return PTR_ERR(x); + + if (hdr->sadb_msg_type == SADB_ADD) + err = xfrm_state_add(x); + else + err = xfrm_state_update(x); + + if (err < 0) { + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + return err; + } + + out_skb = pfkey_xfrm_state2msg(x, 0, 3); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); /* XXX Should we return 0 here ? 
*/ + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = hdr->sadb_msg_type; + out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk); + + return 0; +} + +static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct xfrm_state *x; + + if (!ext_hdrs[SADB_EXT_SA-1] || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return -EINVAL; + + x = pfkey_xfrm_state_lookup(hdr, ext_hdrs); + if (x == NULL) + return -ESRCH; + + if (xfrm_state_kern(x)) { + xfrm_state_put(x); + return -EPERM; + } + + xfrm_state_delete(x); + xfrm_state_put(x); + + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + BROADCAST_ALL, sk); + + return 0; +} + +static int pfkey_get(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + __u8 proto; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + struct xfrm_state *x; + + if (!ext_hdrs[SADB_EXT_SA-1] || + !present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1])) + return -EINVAL; + + x = pfkey_xfrm_state_lookup(hdr, ext_hdrs); + if (x == NULL) + return -ESRCH; + + out_skb = pfkey_xfrm_state2msg(x, 1, 3); + proto = x->id.proto; + xfrm_state_put(x); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_DUMP; + out_hdr->sadb_msg_satype = pfkey_proto2satype(proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk); + + return 0; +} + +static struct sk_buff *compose_sadb_supported(struct sadb_msg *orig, int allocation) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + int len, auth_len, enc_len, i; + + auth_len = xfrm_count_auth_supported(); + if (auth_len) { + auth_len *= sizeof(struct sadb_alg); + auth_len += sizeof(struct sadb_supported); + } + + enc_len = xfrm_count_enc_supported(); + if (enc_len) { + enc_len *= sizeof(struct sadb_alg); + enc_len += sizeof(struct sadb_supported); + } + + len = enc_len + auth_len + sizeof(struct sadb_msg); + + skb = alloc_skb(len + 16, allocation); + if (!skb) + goto out_put_algs; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(*hdr)); + pfkey_hdr_dup(hdr, orig); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_len = len / sizeof(uint64_t); + + if (auth_len) { + struct sadb_supported *sp; + struct sadb_alg *ap; + + sp = (struct sadb_supported *) skb_put(skb, auth_len); + ap = (struct sadb_alg *) (sp + 1); + + sp->sadb_supported_len = auth_len / sizeof(uint64_t); + sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + if (aalg->available) + *ap++ = aalg->desc; + } + } + + if (enc_len) { + struct sadb_supported *sp; + struct sadb_alg *ap; + + sp = (struct sadb_supported *) skb_put(skb, enc_len); + ap = (struct sadb_alg *) (sp + 1); + + sp->sadb_supported_len = enc_len / sizeof(uint64_t); + sp->sadb_supported_exttype = SADB_EXT_SUPPORTED_ENCRYPT; + + for (i = 0; ; i++) { + struct xfrm_algo_desc 
*ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + if (ealg->available) + *ap++ = ealg->desc; + } + } + +out_put_algs: + return skb; +} + +static int pfkey_register(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct pfkey_sock *pfk = pfkey_sk(sk); + struct sk_buff *supp_skb; + + if (hdr->sadb_msg_satype > SADB_SATYPE_MAX) + return -EINVAL; + + if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) { + if (pfk->registered&(1<<hdr->sadb_msg_satype)) + return -EEXIST; + pfk->registered |= (1<<hdr->sadb_msg_satype); + } + + xfrm_probe_algs(); + + supp_skb = compose_sadb_supported(hdr, GFP_KERNEL); + if (!supp_skb) { + if (hdr->sadb_msg_satype != SADB_SATYPE_UNSPEC) + pfk->registered &= ~(1<<hdr->sadb_msg_satype); + + return -ENOBUFS; + } + + pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk); + + return 0; +} + +static int pfkey_flush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + unsigned proto; + struct sk_buff *skb_out; + struct sadb_msg *hdr_out; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return -EINVAL; + + skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); + if (!skb_out) + return -ENOBUFS; + + xfrm_state_flush(proto); + + hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); + pfkey_hdr_dup(hdr_out, hdr); + hdr_out->sadb_msg_errno = (uint8_t) 0; + hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); + + pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL); + + return 0; +} + +struct pfkey_dump_data +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sock *sk; +}; + +static int dump_sa(struct xfrm_state *x, int count, void *ptr) +{ + struct pfkey_dump_data *data = ptr; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + + out_skb = pfkey_xfrm_state2msg(x, 1, 3); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = data->hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_DUMP; + out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = count; + out_hdr->sadb_msg_pid = data->hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, data->sk); + return 0; +} + +static int pfkey_dump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + u8 proto; + struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk }; + + proto = pfkey_satype2proto(hdr->sadb_msg_satype); + if (proto == 0) + return -EINVAL; + + return xfrm_state_walk(proto, dump_sa, &data); +} + +static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct pfkey_sock *pfk = pfkey_sk(sk); + int satype = hdr->sadb_msg_satype; + + if (hdr->sadb_msg_len == (sizeof(*hdr) / sizeof(uint64_t))) { + /* XXX we mangle packet... 
*/ + hdr->sadb_msg_errno = 0; + if (satype != 0 && satype != 1) + return -EINVAL; + pfk->promisc = satype; + } + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_ALL, NULL); + return 0; +} + +static int check_reqid(struct xfrm_policy *xp, int dir, int count, void *ptr) +{ + int i; + u32 reqid = *(u32*)ptr; + + for (i=0; i<xp->xfrm_nr; i++) { + if (xp->xfrm_vec[i].reqid == reqid) + return -EEXIST; + } + return 0; +} + +static u32 gen_reqid(void) +{ + u32 start; + static u32 reqid = IPSEC_MANUAL_REQID_MAX; + + start = reqid; + do { + ++reqid; + if (reqid == 0) + reqid = IPSEC_MANUAL_REQID_MAX+1; + if (xfrm_policy_walk(check_reqid, (void*)&reqid) != -EEXIST) + return reqid; + } while (reqid != start); + return 0; +} + +static int +parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) +{ + struct xfrm_tmpl *t = xp->xfrm_vec + xp->xfrm_nr; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + + if (xp->xfrm_nr >= XFRM_MAX_DEPTH) + return -ELOOP; + + if (rq->sadb_x_ipsecrequest_mode == 0) + return -EINVAL; + + t->id.proto = rq->sadb_x_ipsecrequest_proto; /* XXX check proto */ + t->mode = rq->sadb_x_ipsecrequest_mode-1; + if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_USE) + t->optional = 1; + else if (rq->sadb_x_ipsecrequest_level == IPSEC_LEVEL_UNIQUE) { + t->reqid = rq->sadb_x_ipsecrequest_reqid; + if (t->reqid > IPSEC_MANUAL_REQID_MAX) + t->reqid = 0; + if (!t->reqid && !(t->reqid = gen_reqid())) + return -ENOBUFS; + } + + /* addresses present only in tunnel mode */ + if (t->mode) { + switch (xp->family) { + case AF_INET: + sin = (void*)(rq+1); + if (sin->sin_family != AF_INET) + return -EINVAL; + t->saddr.a4 = sin->sin_addr.s_addr; + sin++; + if (sin->sin_family != AF_INET) + return -EINVAL; + t->id.daddr.a4 = sin->sin_addr.s_addr; + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + sin6 = (void *)(rq+1); + if (sin6->sin6_family != AF_INET6) + return -EINVAL; + memcpy(t->saddr.a6, &sin6->sin6_addr, sizeof(struct in6_addr)); + sin6++; + if (sin6->sin6_family != AF_INET6) + return -EINVAL; + memcpy(t->id.daddr.a6, &sin6->sin6_addr, sizeof(struct in6_addr)); + break; +#endif + default: + return -EINVAL; + } + } + /* No way to set this via kame pfkey */ + t->aalgos = t->ealgos = t->calgos = ~0; + xp->xfrm_nr++; + return 0; +} + +static int +parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) +{ + int err; + int len = pol->sadb_x_policy_len*8 - sizeof(struct sadb_x_policy); + struct sadb_x_ipsecrequest *rq = (void*)(pol+1); + + while (len >= sizeof(struct sadb_x_ipsecrequest)) { + if ((err = parse_ipsecrequest(xp, rq)) < 0) + return err; + len -= rq->sadb_x_ipsecrequest_len; + rq = (void*)((u8*)rq + rq->sadb_x_ipsecrequest_len); + } + return 0; +} + +static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) +{ + int sockaddr_size = pfkey_sockaddr_size(xp->family); + int socklen = (xp->family == AF_INET ? 
+ sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + + return sizeof(struct sadb_msg) + + (sizeof(struct sadb_lifetime) * 3) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + sizeof(struct sadb_x_policy) + + (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + + (socklen * 2))); +} + +static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) +{ + struct sk_buff *skb; + int size; + + size = pfkey_xfrm_policy2msg_size(xp); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return ERR_PTR(-ENOBUFS); + + return skb; +} + +static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, int dir) +{ + struct sadb_msg *hdr; + struct sadb_address *addr; + struct sadb_lifetime *lifetime; + struct sadb_x_policy *pol; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int i; + int size; + int sockaddr_size = pfkey_sockaddr_size(xp->family); + int socklen = (xp->family == AF_INET ? + sizeof(struct sockaddr_in) : + sizeof(struct sockaddr_in6)); + + size = pfkey_xfrm_policy2msg_size(xp); + + /* call should fill header later */ + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + memset(hdr, 0, size); /* XXX do we need this ? */ + + /* src address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); + addr->sadb_address_prefixlen = xp->selector.prefixlen_s; + addr->sadb_address_reserved = 0; + /* src address */ + if (xp->family == AF_INET) { + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = xp->selector.saddr.a4; + sin->sin_port = xp->selector.sport; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (xp->family == AF_INET6) { + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = xp->selector.sport; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, xp->selector.saddr.a6, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = pfkey_proto_from_xfrm(xp->selector.proto); + addr->sadb_address_prefixlen = xp->selector.prefixlen_d; + addr->sadb_address_reserved = 0; + if (xp->family == AF_INET) { + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = xp->selector.daddr.a4; + sin->sin_port = xp->selector.dport; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (xp->family == AF_INET6) { + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = xp->selector.dport; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, xp->selector.daddr.a6, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* hard time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct 
sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_HARD; + lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.hard_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.hard_byte_limit); + lifetime->sadb_lifetime_addtime = xp->lft.hard_add_expires_seconds; + lifetime->sadb_lifetime_usetime = xp->lft.hard_use_expires_seconds; + /* soft time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_SOFT; + lifetime->sadb_lifetime_allocations = _X2KEY(xp->lft.soft_packet_limit); + lifetime->sadb_lifetime_bytes = _X2KEY(xp->lft.soft_byte_limit); + lifetime->sadb_lifetime_addtime = xp->lft.soft_add_expires_seconds; + lifetime->sadb_lifetime_usetime = xp->lft.soft_use_expires_seconds; + /* current time */ + lifetime = (struct sadb_lifetime *) skb_put(skb, + sizeof(struct sadb_lifetime)); + lifetime->sadb_lifetime_len = + sizeof(struct sadb_lifetime)/sizeof(uint64_t); + lifetime->sadb_lifetime_exttype = SADB_EXT_LIFETIME_CURRENT; + lifetime->sadb_lifetime_allocations = xp->curlft.packets; + lifetime->sadb_lifetime_bytes = xp->curlft.bytes; + lifetime->sadb_lifetime_addtime = xp->curlft.add_time; + lifetime->sadb_lifetime_usetime = xp->curlft.use_time; + + pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy)); + pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); + pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; + pol->sadb_x_policy_type = IPSEC_POLICY_DISCARD; + if (xp->action == XFRM_POLICY_ALLOW) { + if (xp->xfrm_nr) + pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; + else + pol->sadb_x_policy_type = IPSEC_POLICY_NONE; + } + pol->sadb_x_policy_dir = dir+1; + pol->sadb_x_policy_id = xp->index; + pol->sadb_x_policy_priority = xp->priority; + + for (i=0; i<xp->xfrm_nr; i++) { + struct sadb_x_ipsecrequest *rq; + struct xfrm_tmpl *t = xp->xfrm_vec + i; + int req_size; + + req_size = sizeof(struct sadb_x_ipsecrequest); + if (t->mode) + req_size += 2*socklen; + else + size -= 2*socklen; + rq = (void*)skb_put(skb, req_size); + pol->sadb_x_policy_len += req_size/8; + memset(rq, 0, sizeof(*rq)); + rq->sadb_x_ipsecrequest_len = req_size; + rq->sadb_x_ipsecrequest_proto = t->id.proto; + rq->sadb_x_ipsecrequest_mode = t->mode+1; + rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_REQUIRE; + if (t->reqid) + rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_UNIQUE; + if (t->optional) + rq->sadb_x_ipsecrequest_level = IPSEC_LEVEL_USE; + rq->sadb_x_ipsecrequest_reqid = t->reqid; + if (t->mode) { + switch (xp->family) { + case AF_INET: + sin = (void*)(rq+1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = t->saddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + sin++; + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = t->id.daddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + sin6 = (void*)(rq+1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, t->saddr.a6, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + + sin6++; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, t->id.daddr.a6, + sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + break; +#endif + default: + break; + } + } + } + hdr->sadb_msg_len = size 
/ sizeof(uint64_t); + hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); +} + +static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + int err; + struct sadb_lifetime *lifetime; + struct sadb_address *sa; + struct sadb_x_policy *pol; + struct xfrm_policy *xp; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + + if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || + !ext_hdrs[SADB_X_EXT_POLICY-1]) + return -EINVAL; + + pol = ext_hdrs[SADB_X_EXT_POLICY-1]; + if (pol->sadb_x_policy_type > IPSEC_POLICY_IPSEC) + return -EINVAL; + if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) + return -EINVAL; + + xp = xfrm_policy_alloc(GFP_KERNEL); + if (xp == NULL) + return -ENOBUFS; + + xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? + XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); + xp->priority = pol->sadb_x_policy_priority; + + sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + xp->family = pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.saddr); + if (!xp->family) { + err = -EINVAL; + goto out; + } + xp->selector.family = xp->family; + xp->selector.prefixlen_s = sa->sadb_address_prefixlen; + xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + xp->selector.sport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (xp->selector.sport) + xp->selector.sport_mask = ~0; + + sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1], + pfkey_sadb_addr2xfrm_addr(sa, &xp->selector.daddr); + xp->selector.prefixlen_d = sa->sadb_address_prefixlen; + + /* Amusing, we set this twice. KAME apps appear to set same value + * in both addresses. + */ + xp->selector.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + + xp->selector.dport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (xp->selector.dport) + xp->selector.dport_mask = ~0; + + xp->lft.soft_byte_limit = XFRM_INF; + xp->lft.hard_byte_limit = XFRM_INF; + xp->lft.soft_packet_limit = XFRM_INF; + xp->lft.hard_packet_limit = XFRM_INF; + if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_HARD-1]) != NULL) { + xp->lft.hard_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + xp->lft.hard_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + xp->lft.hard_add_expires_seconds = lifetime->sadb_lifetime_addtime; + xp->lft.hard_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + if ((lifetime = ext_hdrs[SADB_EXT_LIFETIME_SOFT-1]) != NULL) { + xp->lft.soft_packet_limit = _KEY2X(lifetime->sadb_lifetime_allocations); + xp->lft.soft_byte_limit = _KEY2X(lifetime->sadb_lifetime_bytes); + xp->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; + xp->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; + } + xp->xfrm_nr = 0; + if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && + (err = parse_ipsecrequests(xp, pol)) < 0) + goto out; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) { + err = PTR_ERR(out_skb); + goto out; + } + + err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, + hdr->sadb_msg_type != SADB_X_SPDUPDATE); + if (err) { + kfree_skb(out_skb); + goto out; + } + + pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); + + xfrm_pol_put(xp); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = hdr->sadb_msg_type; + out_hdr->sadb_msg_satype = 0; + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, 
sk); + return 0; + +out: + kfree(xp); + return err; +} + +static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + int err; + struct sadb_address *sa; + struct sadb_x_policy *pol; + struct xfrm_policy *xp; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + struct xfrm_selector sel; + + if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || + !ext_hdrs[SADB_X_EXT_POLICY-1]) + return -EINVAL; + + pol = ext_hdrs[SADB_X_EXT_POLICY-1]; + if (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir >= IPSEC_DIR_MAX) + return -EINVAL; + + memset(&sel, 0, sizeof(sel)); + + sa = ext_hdrs[SADB_EXT_ADDRESS_SRC-1], + sel.family = pfkey_sadb_addr2xfrm_addr(sa, &sel.saddr); + sel.prefixlen_s = sa->sadb_address_prefixlen; + sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + sel.sport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (sel.sport) + sel.sport_mask = ~0; + + sa = ext_hdrs[SADB_EXT_ADDRESS_DST-1], + pfkey_sadb_addr2xfrm_addr(sa, &sel.daddr); + sel.prefixlen_d = sa->sadb_address_prefixlen; + sel.proto = pfkey_proto_to_xfrm(sa->sadb_address_proto); + sel.dport = ((struct sockaddr_in *)(sa+1))->sin_port; + if (sel.dport) + sel.dport_mask = ~0; + + xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); + if (xp == NULL) + return -ENOENT; + + err = 0; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) { + err = PTR_ERR(out_skb); + goto out; + } + pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_X_SPDDELETE; + out_hdr->sadb_msg_satype = 0; + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk); + err = 0; + +out: + xfrm_pol_put(xp); + return err; +} + +static int pfkey_spdget(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + int err; + struct sadb_x_policy *pol; + struct xfrm_policy *xp; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + + if ((pol = ext_hdrs[SADB_X_EXT_POLICY-1]) == NULL) + return -EINVAL; + + xp = xfrm_policy_byid(0, pol->sadb_x_policy_id, + hdr->sadb_msg_type == SADB_X_SPDDELETE2); + if (xp == NULL) + return -ENOENT; + + err = 0; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) { + err = PTR_ERR(out_skb); + goto out; + } + pfkey_xfrm_policy2msg(out_skb, xp, pol->sadb_x_policy_dir-1); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = hdr->sadb_msg_version; + out_hdr->sadb_msg_type = hdr->sadb_msg_type; + out_hdr->sadb_msg_satype = 0; + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; + out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, sk); + err = 0; + +out: + xfrm_pol_put(xp); + return err; +} + +static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) +{ + struct pfkey_dump_data *data = ptr; + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + + out_skb = pfkey_xfrm_policy2msg_prep(xp); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + pfkey_xfrm_policy2msg(out_skb, xp, dir); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = data->hdr->sadb_msg_version; + out_hdr->sadb_msg_type = SADB_X_SPDDUMP; + out_hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; + out_hdr->sadb_msg_errno = 0; + 
out_hdr->sadb_msg_seq = count; + out_hdr->sadb_msg_pid = data->hdr->sadb_msg_pid; + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, data->sk); + return 0; +} + +static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct pfkey_dump_data data = { .skb = skb, .hdr = hdr, .sk = sk }; + + return xfrm_policy_walk(dump_sp, &data); +} + +static int pfkey_spdflush(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) +{ + struct sk_buff *skb_out; + struct sadb_msg *hdr_out; + + skb_out = alloc_skb(sizeof(struct sadb_msg) + 16, GFP_KERNEL); + if (!skb_out) + return -ENOBUFS; + + xfrm_policy_flush(); + + hdr_out = (struct sadb_msg *) skb_put(skb_out, sizeof(struct sadb_msg)); + pfkey_hdr_dup(hdr_out, hdr); + hdr_out->sadb_msg_errno = (uint8_t) 0; + hdr_out->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); + pfkey_broadcast(skb_out, GFP_KERNEL, BROADCAST_ALL, NULL); + + return 0; +} + +typedef int (*pfkey_handler)(struct sock *sk, struct sk_buff *skb, + struct sadb_msg *hdr, void **ext_hdrs); +static pfkey_handler pfkey_funcs[SADB_MAX + 1] = { + [SADB_RESERVED] = pfkey_reserved, + [SADB_GETSPI] = pfkey_getspi, + [SADB_UPDATE] = pfkey_add, + [SADB_ADD] = pfkey_add, + [SADB_DELETE] = pfkey_delete, + [SADB_GET] = pfkey_get, + [SADB_ACQUIRE] = pfkey_acquire, + [SADB_REGISTER] = pfkey_register, + [SADB_EXPIRE] = NULL, + [SADB_FLUSH] = pfkey_flush, + [SADB_DUMP] = pfkey_dump, + [SADB_X_PROMISC] = pfkey_promisc, + [SADB_X_PCHANGE] = NULL, + [SADB_X_SPDUPDATE] = pfkey_spdadd, + [SADB_X_SPDADD] = pfkey_spdadd, + [SADB_X_SPDDELETE] = pfkey_spddelete, + [SADB_X_SPDGET] = pfkey_spdget, + [SADB_X_SPDACQUIRE] = NULL, + [SADB_X_SPDDUMP] = pfkey_spddump, + [SADB_X_SPDFLUSH] = pfkey_spdflush, + [SADB_X_SPDSETIDX] = pfkey_spdadd, + [SADB_X_SPDDELETE2] = pfkey_spdget, +}; + +static int pfkey_process(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr) +{ + void *ext_hdrs[SADB_EXT_MAX]; + int err; + + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, + BROADCAST_PROMISC_ONLY, NULL); + + memset(ext_hdrs, 0, sizeof(ext_hdrs)); + err = parse_exthdrs(skb, hdr, ext_hdrs); + if (!err) { + err = -EOPNOTSUPP; + if (pfkey_funcs[hdr->sadb_msg_type]) + err = pfkey_funcs[hdr->sadb_msg_type](sk, skb, hdr, ext_hdrs); + } + return err; +} + +static struct sadb_msg *pfkey_get_base_msg(struct sk_buff *skb, int *errp) +{ + struct sadb_msg *hdr = NULL; + + if (skb->len < sizeof(*hdr)) { + *errp = -EMSGSIZE; + } else { + hdr = (struct sadb_msg *) skb->data; + if (hdr->sadb_msg_version != PF_KEY_V2 || + hdr->sadb_msg_reserved != 0 || + (hdr->sadb_msg_type <= SADB_RESERVED || + hdr->sadb_msg_type > SADB_MAX)) { + hdr = NULL; + *errp = -EINVAL; + } else if (hdr->sadb_msg_len != (skb->len / + sizeof(uint64_t)) || + hdr->sadb_msg_len < (sizeof(struct sadb_msg) / + sizeof(uint64_t))) { + hdr = NULL; + *errp = -EMSGSIZE; + } else { + *errp = 0; + } + } + return hdr; +} + +static inline int aalg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d) +{ + return t->aalgos & (1 << d->desc.sadb_alg_id); +} + +static inline int ealg_tmpl_set(struct xfrm_tmpl *t, struct xfrm_algo_desc *d) +{ + return t->ealgos & (1 << d->desc.sadb_alg_id); +} + +static int count_ah_combs(struct xfrm_tmpl *t) +{ + int i, sz = 0; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + if (aalg_tmpl_set(t, aalg) && aalg->available) + sz += sizeof(struct sadb_comb); + } + return sz + sizeof(struct sadb_prop); +} + 
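The handlers above all follow the RFC 2367 convention that sadb_msg_len and every extension length are counted in 64-bit words, and that replies come back over the same PF_KEY socket via pfkey_broadcast(). As a minimal user-space sketch of the other end of that exchange, not part of this patch and assuming only the standard definitions in <linux/pfkeyv2.h>, the following sends a bare SADB_FLUSH request whose header length is sizeof(struct sadb_msg)/8 == 2 and lets pfkey_flush() echo the header back:

/* Illustrative only; needs CAP_NET_ADMIN to open a PF_KEY socket. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/pfkeyv2.h>

int main(void)
{
	struct sadb_msg msg;
	int fd = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);

	if (fd < 0) {
		perror("socket(PF_KEY)");
		return 1;
	}

	memset(&msg, 0, sizeof(msg));
	msg.sadb_msg_version = PF_KEY_V2;
	msg.sadb_msg_type    = SADB_FLUSH;
	msg.sadb_msg_satype  = SADB_SATYPE_UNSPEC;	/* flush all SA types */
	/* PF_KEY lengths are expressed in 64-bit words throughout */
	msg.sadb_msg_len     = sizeof(msg) / sizeof(uint64_t);
	msg.sadb_msg_seq     = 1;
	msg.sadb_msg_pid     = getpid();

	if (write(fd, &msg, sizeof(msg)) < 0)
		perror("write");

	/* the kernel's reply (same header, errno filled in) could be read(fd, ...) here */
	close(fd);
	return 0;
}
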
+static int count_esp_combs(struct xfrm_tmpl *t) +{ + int i, k, sz = 0; + + for (i = 0; ; i++) { + struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + continue; + + for (k = 1; ; k++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); + if (!aalg) + break; + + if (aalg_tmpl_set(t, aalg) && aalg->available) + sz += sizeof(struct sadb_comb); + } + } + return sz + sizeof(struct sadb_prop); +} + +static void dump_ah_combs(struct sk_buff *skb, struct xfrm_tmpl *t) +{ + struct sadb_prop *p; + int i; + + p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop)); + p->sadb_prop_len = sizeof(struct sadb_prop)/8; + p->sadb_prop_exttype = SADB_EXT_PROPOSAL; + p->sadb_prop_replay = 32; + memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); + + for (i = 0; ; i++) { + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(i); + if (!aalg) + break; + + if (aalg_tmpl_set(t, aalg) && aalg->available) { + struct sadb_comb *c; + c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); + memset(c, 0, sizeof(*c)); + p->sadb_prop_len += sizeof(struct sadb_comb)/8; + c->sadb_comb_auth = aalg->desc.sadb_alg_id; + c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; + c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; + c->sadb_comb_hard_addtime = 24*60*60; + c->sadb_comb_soft_addtime = 20*60*60; + c->sadb_comb_hard_usetime = 8*60*60; + c->sadb_comb_soft_usetime = 7*60*60; + } + } +} + +static void dump_esp_combs(struct sk_buff *skb, struct xfrm_tmpl *t) +{ + struct sadb_prop *p; + int i, k; + + p = (struct sadb_prop*)skb_put(skb, sizeof(struct sadb_prop)); + p->sadb_prop_len = sizeof(struct sadb_prop)/8; + p->sadb_prop_exttype = SADB_EXT_PROPOSAL; + p->sadb_prop_replay = 32; + memset(p->sadb_prop_reserved, 0, sizeof(p->sadb_prop_reserved)); + + for (i=0; ; i++) { + struct xfrm_algo_desc *ealg = xfrm_ealg_get_byidx(i); + if (!ealg) + break; + + if (!(ealg_tmpl_set(t, ealg) && ealg->available)) + continue; + + for (k = 1; ; k++) { + struct sadb_comb *c; + struct xfrm_algo_desc *aalg = xfrm_aalg_get_byidx(k); + if (!aalg) + break; + if (!(aalg_tmpl_set(t, aalg) && aalg->available)) + continue; + c = (struct sadb_comb*)skb_put(skb, sizeof(struct sadb_comb)); + memset(c, 0, sizeof(*c)); + p->sadb_prop_len += sizeof(struct sadb_comb)/8; + c->sadb_comb_auth = aalg->desc.sadb_alg_id; + c->sadb_comb_auth_minbits = aalg->desc.sadb_alg_minbits; + c->sadb_comb_auth_maxbits = aalg->desc.sadb_alg_maxbits; + c->sadb_comb_encrypt = ealg->desc.sadb_alg_id; + c->sadb_comb_encrypt_minbits = ealg->desc.sadb_alg_minbits; + c->sadb_comb_encrypt_maxbits = ealg->desc.sadb_alg_maxbits; + c->sadb_comb_hard_addtime = 24*60*60; + c->sadb_comb_soft_addtime = 20*60*60; + c->sadb_comb_hard_usetime = 8*60*60; + c->sadb_comb_soft_usetime = 7*60*60; + } + } +} + +static int pfkey_send_notify(struct xfrm_state *x, int hard) +{ + struct sk_buff *out_skb; + struct sadb_msg *out_hdr; + int hsc = (hard ? 
2 : 1); + + out_skb = pfkey_xfrm_state2msg(x, 0, hsc); + if (IS_ERR(out_skb)) + return PTR_ERR(out_skb); + + out_hdr = (struct sadb_msg *) out_skb->data; + out_hdr->sadb_msg_version = PF_KEY_V2; + out_hdr->sadb_msg_type = SADB_EXPIRE; + out_hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + out_hdr->sadb_msg_errno = 0; + out_hdr->sadb_msg_reserved = 0; + out_hdr->sadb_msg_seq = 0; + out_hdr->sadb_msg_pid = 0; + + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL); + return 0; +} + +static u32 get_acqseq(void) +{ + u32 res; + static u32 acqseq; + static DEFINE_SPINLOCK(acqseq_lock); + + spin_lock_bh(&acqseq_lock); + res = (++acqseq ? : ++acqseq); + spin_unlock_bh(&acqseq_lock); + return res; +} + +static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *xp, int dir) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_address *addr; + struct sadb_x_policy *pol; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int sockaddr_size; + int size; + + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return -EINVAL; + + size = sizeof(struct sadb_msg) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + sizeof(struct sadb_x_policy); + + if (x->id.proto == IPPROTO_AH) + size += count_ah_combs(t); + else if (x->id.proto == IPPROTO_ESP) + size += count_esp_combs(t); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = SADB_ACQUIRE; + hdr->sadb_msg_satype = pfkey_proto2satype(x->id.proto); + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_pid = 0; + + /* src address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->props.saddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, + x->props.saddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* dst address */ + addr = (struct sadb_address*) skb_put(skb, + sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->id.daddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + 
else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, + x->id.daddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + pol = (struct sadb_x_policy *) skb_put(skb, sizeof(struct sadb_x_policy)); + pol->sadb_x_policy_len = sizeof(struct sadb_x_policy)/sizeof(uint64_t); + pol->sadb_x_policy_exttype = SADB_X_EXT_POLICY; + pol->sadb_x_policy_type = IPSEC_POLICY_IPSEC; + pol->sadb_x_policy_dir = dir+1; + pol->sadb_x_policy_id = xp->index; + + /* Set sadb_comb's. */ + if (x->id.proto == IPPROTO_AH) + dump_ah_combs(skb, t); + else if (x->id.proto == IPPROTO_ESP) + dump_esp_combs(skb, t); + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL); +} + +static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, + u8 *data, int len, int *dir) +{ + struct xfrm_policy *xp; + struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + + switch (family) { + case AF_INET: + if (opt != IP_IPSEC_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + case AF_INET6: + if (opt != IPV6_IPSEC_POLICY) { + *dir = -EOPNOTSUPP; + return NULL; + } + break; +#endif + default: + *dir = -EINVAL; + return NULL; + } + + *dir = -EINVAL; + + if (len < sizeof(struct sadb_x_policy) || + pol->sadb_x_policy_len*8 > len || + pol->sadb_x_policy_type > IPSEC_POLICY_BYPASS || + (!pol->sadb_x_policy_dir || pol->sadb_x_policy_dir > IPSEC_DIR_OUTBOUND)) + return NULL; + + xp = xfrm_policy_alloc(GFP_ATOMIC); + if (xp == NULL) { + *dir = -ENOBUFS; + return NULL; + } + + xp->action = (pol->sadb_x_policy_type == IPSEC_POLICY_DISCARD ? + XFRM_POLICY_BLOCK : XFRM_POLICY_ALLOW); + + xp->lft.soft_byte_limit = XFRM_INF; + xp->lft.hard_byte_limit = XFRM_INF; + xp->lft.soft_packet_limit = XFRM_INF; + xp->lft.hard_packet_limit = XFRM_INF; + xp->family = family; + + xp->xfrm_nr = 0; + if (pol->sadb_x_policy_type == IPSEC_POLICY_IPSEC && + (*dir = parse_ipsecrequests(xp, pol)) < 0) + goto out; + + *dir = pol->sadb_x_policy_dir-1; + return xp; + +out: + kfree(xp); + return NULL; +} + +static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport) +{ + struct sk_buff *skb; + struct sadb_msg *hdr; + struct sadb_sa *sa; + struct sadb_address *addr; + struct sadb_x_nat_t_port *n_port; + struct sockaddr_in *sin; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct sockaddr_in6 *sin6; +#endif + int sockaddr_size; + int size; + __u8 satype = (x->id.proto == IPPROTO_ESP ? 
SADB_SATYPE_ESP : 0); + struct xfrm_encap_tmpl *natt = NULL; + + sockaddr_size = pfkey_sockaddr_size(x->props.family); + if (!sockaddr_size) + return -EINVAL; + + if (!satype) + return -EINVAL; + + if (!x->encap) + return -EINVAL; + + natt = x->encap; + + /* Build an SADB_X_NAT_T_NEW_MAPPING message: + * + * HDR | SA | ADDRESS_SRC (old addr) | NAT_T_SPORT (old port) | + * ADDRESS_DST (new addr) | NAT_T_DPORT (new port) + */ + + size = sizeof(struct sadb_msg) + + sizeof(struct sadb_sa) + + (sizeof(struct sadb_address) * 2) + + (sockaddr_size * 2) + + (sizeof(struct sadb_x_nat_t_port) * 2); + + skb = alloc_skb(size + 16, GFP_ATOMIC); + if (skb == NULL) + return -ENOMEM; + + hdr = (struct sadb_msg *) skb_put(skb, sizeof(struct sadb_msg)); + hdr->sadb_msg_version = PF_KEY_V2; + hdr->sadb_msg_type = SADB_X_NAT_T_NEW_MAPPING; + hdr->sadb_msg_satype = satype; + hdr->sadb_msg_len = size / sizeof(uint64_t); + hdr->sadb_msg_errno = 0; + hdr->sadb_msg_reserved = 0; + hdr->sadb_msg_seq = x->km.seq = get_acqseq(); + hdr->sadb_msg_pid = 0; + + /* SA */ + sa = (struct sadb_sa *) skb_put(skb, sizeof(struct sadb_sa)); + sa->sadb_sa_len = sizeof(struct sadb_sa)/sizeof(uint64_t); + sa->sadb_sa_exttype = SADB_EXT_SA; + sa->sadb_sa_spi = x->id.spi; + sa->sadb_sa_replay = 0; + sa->sadb_sa_state = 0; + sa->sadb_sa_auth = 0; + sa->sadb_sa_encrypt = 0; + sa->sadb_sa_flags = 0; + + /* ADDRESS_SRC (old addr) */ + addr = (struct sadb_address*) + skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_SRC; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = x->props.saddr.a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, + x->props.saddr.a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* NAT_T_SPORT (old port) */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_SPORT; + n_port->sadb_x_nat_t_port_port = natt->encap_sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + /* ADDRESS_DST (new addr) */ + addr = (struct sadb_address*) + skb_put(skb, sizeof(struct sadb_address)+sockaddr_size); + addr->sadb_address_len = + (sizeof(struct sadb_address)+sockaddr_size)/ + sizeof(uint64_t); + addr->sadb_address_exttype = SADB_EXT_ADDRESS_DST; + addr->sadb_address_proto = 0; + addr->sadb_address_reserved = 0; + if (x->props.family == AF_INET) { + addr->sadb_address_prefixlen = 32; + + sin = (struct sockaddr_in *) (addr + 1); + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = ipaddr->a4; + sin->sin_port = 0; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); + } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + else if (x->props.family == AF_INET6) { + addr->sadb_address_prefixlen = 128; + + sin6 = (struct sockaddr_in6 *) (addr + 1); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = 0; + 
sin6->sin6_flowinfo = 0; + memcpy(&sin6->sin6_addr, &ipaddr->a6, sizeof(struct in6_addr)); + sin6->sin6_scope_id = 0; + } +#endif + else + BUG(); + + /* NAT_T_DPORT (new port) */ + n_port = (struct sadb_x_nat_t_port*) skb_put(skb, sizeof (*n_port)); + n_port->sadb_x_nat_t_port_len = sizeof(*n_port)/sizeof(uint64_t); + n_port->sadb_x_nat_t_port_exttype = SADB_X_EXT_NAT_T_DPORT; + n_port->sadb_x_nat_t_port_port = sport; + n_port->sadb_x_nat_t_port_reserved = 0; + + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL); +} + +static int pfkey_sendmsg(struct kiocb *kiocb, + struct socket *sock, struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb = NULL; + struct sadb_msg *hdr = NULL; + int err; + + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) + goto out; + + err = -EMSGSIZE; + if ((unsigned)len > sk->sk_sndbuf - 32) + goto out; + + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); + if (skb == NULL) + goto out; + + err = -EFAULT; + if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) + goto out; + + hdr = pfkey_get_base_msg(skb, &err); + if (!hdr) + goto out; + + down(&xfrm_cfg_sem); + err = pfkey_process(sk, skb, hdr); + up(&xfrm_cfg_sem); + +out: + if (err && hdr && pfkey_error(hdr, err, sk) == 0) + err = 0; + if (skb) + kfree_skb(skb); + + return err ? : len; +} + +static int pfkey_recvmsg(struct kiocb *kiocb, + struct socket *sock, struct msghdr *msg, size_t len, + int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + err = -EINVAL; + if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) + goto out; + + msg->msg_namelen = 0; + skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err); + if (skb == NULL) + goto out; + + copied = skb->len; + if (copied > len) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb->h.raw = skb->data; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + err = (flags & MSG_TRUNC) ? skb->len : copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +static struct proto_ops pfkey_ops = { + .family = PF_KEY, + .owner = THIS_MODULE, + /* Operations that make no sense on pfkey sockets. */ + .bind = sock_no_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, + + /* Now the operations that really occur. 
*/ + .release = pfkey_release, + .poll = datagram_poll, + .sendmsg = pfkey_sendmsg, + .recvmsg = pfkey_recvmsg, +}; + +static struct net_proto_family pfkey_family_ops = { + .family = PF_KEY, + .create = pfkey_create, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_PROC_FS +static int pfkey_read_proc(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + off_t pos = 0; + off_t begin = 0; + int len = 0; + struct sock *s; + struct hlist_node *node; + + len += sprintf(buffer,"sk RefCnt Rmem Wmem User Inode\n"); + + read_lock(&pfkey_table_lock); + + sk_for_each(s, node, &pfkey_table) { + len += sprintf(buffer+len,"%p %-6d %-6u %-6u %-6u %-6lu", + s, + atomic_read(&s->sk_refcnt), + atomic_read(&s->sk_rmem_alloc), + atomic_read(&s->sk_wmem_alloc), + sock_i_uid(s), + sock_i_ino(s) + ); + + buffer[len++] = '\n'; + + pos = begin + len; + if (pos < offset) { + len = 0; + begin = pos; + } + if(pos > offset + length) + goto done; + } + *eof = 1; + +done: + read_unlock(&pfkey_table_lock); + + *start = buffer + (offset - begin); + len -= (offset - begin); + + if (len > length) + len = length; + if (len < 0) + len = 0; + + return len; +} +#endif + +static struct xfrm_mgr pfkeyv2_mgr = +{ + .id = "pfkeyv2", + .notify = pfkey_send_notify, + .acquire = pfkey_send_acquire, + .compile_policy = pfkey_compile_policy, + .new_mapping = pfkey_send_new_mapping, +}; + +static void __exit ipsec_pfkey_exit(void) +{ + xfrm_unregister_km(&pfkeyv2_mgr); + remove_proc_entry("net/pfkey", NULL); + sock_unregister(PF_KEY); + proto_unregister(&key_proto); +} + +static int __init ipsec_pfkey_init(void) +{ + int err = proto_register(&key_proto, 0); + + if (err != 0) + goto out; + + err = sock_register(&pfkey_family_ops); + if (err != 0) + goto out_unregister_key_proto; +#ifdef CONFIG_PROC_FS + err = -ENOMEM; + if (create_proc_read_entry("net/pfkey", 0, NULL, pfkey_read_proc, NULL) == NULL) + goto out_sock_unregister; +#endif + err = xfrm_register_km(&pfkeyv2_mgr); + if (err != 0) + goto out_remove_proc_entry; +out: + return err; +out_remove_proc_entry: +#ifdef CONFIG_PROC_FS + remove_proc_entry("net/pfkey", NULL); +out_sock_unregister: +#endif + sock_unregister(PF_KEY); +out_unregister_key_proto: + proto_unregister(&key_proto); + goto out; +} + +module_init(ipsec_pfkey_init); +module_exit(ipsec_pfkey_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_KEY); diff --git a/net/lapb/Makefile b/net/lapb/Makefile new file mode 100644 index 000000000000..53f7c90db163 --- /dev/null +++ b/net/lapb/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux LAPB layer. +# + +obj-$(CONFIG_LAPB) += lapb.o + +lapb-objs := lapb_in.o lapb_out.o lapb_subr.o lapb_timer.o lapb_iface.o diff --git a/net/lapb/lapb_iface.c b/net/lapb/lapb_iface.c new file mode 100644 index 000000000000..aea6616cea3d --- /dev/null +++ b/net/lapb/lapb_iface.c @@ -0,0 +1,449 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. + * 2000-10-29 Henner Eisen lapb_data_indication() return status. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct list_head lapb_list = LIST_HEAD_INIT(lapb_list); +static DEFINE_RWLOCK(lapb_list_lock); + +/* + * Free an allocated lapb control block. + */ +static void lapb_free_cb(struct lapb_cb *lapb) +{ + kfree(lapb); +} + +static __inline__ void lapb_hold(struct lapb_cb *lapb) +{ + atomic_inc(&lapb->refcnt); +} + +static __inline__ void lapb_put(struct lapb_cb *lapb) +{ + if (atomic_dec_and_test(&lapb->refcnt)) + lapb_free_cb(lapb); +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void __lapb_remove_cb(struct lapb_cb *lapb) +{ + if (lapb->node.next) { + list_del(&lapb->node); + lapb_put(lapb); + } +} + +/* + * Add a socket to the bound sockets list. + */ +static void __lapb_insert_cb(struct lapb_cb *lapb) +{ + list_add(&lapb->node, &lapb_list); + lapb_hold(lapb); +} + +static struct lapb_cb *__lapb_devtostruct(struct net_device *dev) +{ + struct list_head *entry; + struct lapb_cb *lapb, *use = NULL; + + list_for_each(entry, &lapb_list) { + lapb = list_entry(entry, struct lapb_cb, node); + if (lapb->dev == dev) { + use = lapb; + break; + } + } + + if (use) + lapb_hold(use); + + return use; +} + +static struct lapb_cb *lapb_devtostruct(struct net_device *dev) +{ + struct lapb_cb *rc; + + read_lock_bh(&lapb_list_lock); + rc = __lapb_devtostruct(dev); + read_unlock_bh(&lapb_list_lock); + + return rc; +} +/* + * Create an empty LAPB control block. + */ +static struct lapb_cb *lapb_create_cb(void) +{ + struct lapb_cb *lapb = kmalloc(sizeof(*lapb), GFP_ATOMIC); + + + if (!lapb) + goto out; + + memset(lapb, 0x00, sizeof(*lapb)); + + skb_queue_head_init(&lapb->write_queue); + skb_queue_head_init(&lapb->ack_queue); + + init_timer(&lapb->t1timer); + init_timer(&lapb->t2timer); + + lapb->t1 = LAPB_DEFAULT_T1; + lapb->t2 = LAPB_DEFAULT_T2; + lapb->n2 = LAPB_DEFAULT_N2; + lapb->mode = LAPB_DEFAULT_MODE; + lapb->window = LAPB_DEFAULT_WINDOW; + lapb->state = LAPB_STATE_0; + atomic_set(&lapb->refcnt, 1); +out: + return lapb; +} + +int lapb_register(struct net_device *dev, struct lapb_register_struct *callbacks) +{ + struct lapb_cb *lapb; + int rc = LAPB_BADTOKEN; + + write_lock_bh(&lapb_list_lock); + + lapb = __lapb_devtostruct(dev); + if (lapb) { + lapb_put(lapb); + goto out; + } + + lapb = lapb_create_cb(); + rc = LAPB_NOMEM; + if (!lapb) + goto out; + + lapb->dev = dev; + lapb->callbacks = *callbacks; + + __lapb_insert_cb(lapb); + + lapb_start_t1timer(lapb); + + rc = LAPB_OK; +out: + write_unlock_bh(&lapb_list_lock); + return rc; +} + +int lapb_unregister(struct net_device *dev) +{ + struct lapb_cb *lapb; + int rc = LAPB_BADTOKEN; + + write_lock_bh(&lapb_list_lock); + lapb = __lapb_devtostruct(dev); + if (!lapb) + goto out; + + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + + lapb_clear_queues(lapb); + + __lapb_remove_cb(lapb); + + lapb_put(lapb); + rc = LAPB_OK; +out: + write_unlock_bh(&lapb_list_lock); + return rc; +} + +int lapb_getparms(struct net_device *dev, struct lapb_parms_struct *parms) +{ + int rc = LAPB_BADTOKEN; + struct lapb_cb *lapb = lapb_devtostruct(dev); + + if (!lapb) + goto out; + + parms->t1 = lapb->t1 / HZ; + parms->t2 = lapb->t2 / HZ; + parms->n2 = lapb->n2; + parms->n2count = lapb->n2count; + parms->state = lapb->state; + parms->window = lapb->window; + parms->mode = lapb->mode; + + if 
(!timer_pending(&lapb->t1timer)) + parms->t1timer = 0; + else + parms->t1timer = (lapb->t1timer.expires - jiffies) / HZ; + + if (!timer_pending(&lapb->t2timer)) + parms->t2timer = 0; + else + parms->t2timer = (lapb->t2timer.expires - jiffies) / HZ; + + lapb_put(lapb); + rc = LAPB_OK; +out: + return rc; +} + +int lapb_setparms(struct net_device *dev, struct lapb_parms_struct *parms) +{ + int rc = LAPB_BADTOKEN; + struct lapb_cb *lapb = lapb_devtostruct(dev); + + if (!lapb) + goto out; + + rc = LAPB_INVALUE; + if (parms->t1 < 1 || parms->t2 < 1 || parms->n2 < 1) + goto out_put; + + if (lapb->state == LAPB_STATE_0) { + if (((parms->mode & LAPB_EXTENDED) && + (parms->window < 1 || parms->window > 127)) || + (parms->window < 1 || parms->window > 7)) + goto out_put; + + lapb->mode = parms->mode; + lapb->window = parms->window; + } + + lapb->t1 = parms->t1 * HZ; + lapb->t2 = parms->t2 * HZ; + lapb->n2 = parms->n2; + + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_connect_request(struct net_device *dev) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + rc = LAPB_OK; + if (lapb->state == LAPB_STATE_1) + goto out_put; + + rc = LAPB_CONNECTED; + if (lapb->state == LAPB_STATE_3 || lapb->state == LAPB_STATE_4) + goto out_put; + + lapb_establish_data_link(lapb); + +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S0 -> S1\n", lapb->dev); +#endif + lapb->state = LAPB_STATE_1; + + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_disconnect_request(struct net_device *dev) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + switch (lapb->state) { + case LAPB_STATE_0: + rc = LAPB_NOTCONNECTED; + goto out_put; + + case LAPB_STATE_1: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX DISC(1)\n", lapb->dev); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + rc = LAPB_NOTCONNECTED; + goto out_put; + + case LAPB_STATE_2: + rc = LAPB_OK; + goto out_put; + } + + lapb_clear_queues(lapb); + lapb->n2count = 0; + lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_2; + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 DISC(1)\n", lapb->dev); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S2\n", lapb->dev); +#endif + + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_data_request(struct net_device *dev, struct sk_buff *skb) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (!lapb) + goto out; + + rc = LAPB_NOTCONNECTED; + if (lapb->state != LAPB_STATE_3 && lapb->state != LAPB_STATE_4) + goto out_put; + + skb_queue_tail(&lapb->write_queue, skb); + lapb_kick(lapb); + rc = LAPB_OK; +out_put: + lapb_put(lapb); +out: + return rc; +} + +int lapb_data_received(struct net_device *dev, struct sk_buff *skb) +{ + struct lapb_cb *lapb = lapb_devtostruct(dev); + int rc = LAPB_BADTOKEN; + + if (lapb) { + lapb_data_input(lapb, skb); + lapb_put(lapb); + rc = LAPB_OK; + } + + return rc; +} + +void lapb_connect_confirmation(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.connect_confirmation) + lapb->callbacks.connect_confirmation(lapb->dev, reason); +} + +void lapb_connect_indication(struct lapb_cb *lapb, int 
reason) +{ + if (lapb->callbacks.connect_indication) + lapb->callbacks.connect_indication(lapb->dev, reason); +} + +void lapb_disconnect_confirmation(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.disconnect_confirmation) + lapb->callbacks.disconnect_confirmation(lapb->dev, reason); +} + +void lapb_disconnect_indication(struct lapb_cb *lapb, int reason) +{ + if (lapb->callbacks.disconnect_indication) + lapb->callbacks.disconnect_indication(lapb->dev, reason); +} + +int lapb_data_indication(struct lapb_cb *lapb, struct sk_buff *skb) +{ + if (lapb->callbacks.data_indication) + return lapb->callbacks.data_indication(lapb->dev, skb); + + kfree_skb(skb); + return NET_RX_CN_HIGH; /* For now; must be != NET_RX_DROP */ +} + +int lapb_data_transmit(struct lapb_cb *lapb, struct sk_buff *skb) +{ + int used = 0; + + if (lapb->callbacks.data_transmit) { + lapb->callbacks.data_transmit(lapb->dev, skb); + used = 1; + } + + return used; +} + +EXPORT_SYMBOL(lapb_register); +EXPORT_SYMBOL(lapb_unregister); +EXPORT_SYMBOL(lapb_getparms); +EXPORT_SYMBOL(lapb_setparms); +EXPORT_SYMBOL(lapb_connect_request); +EXPORT_SYMBOL(lapb_disconnect_request); +EXPORT_SYMBOL(lapb_data_request); +EXPORT_SYMBOL(lapb_data_received); + +static int __init lapb_init(void) +{ + return 0; +} + +static void __exit lapb_exit(void) +{ + WARN_ON(!list_empty(&lapb_list)); +} + +MODULE_AUTHOR("Jonathan Naylor "); +MODULE_DESCRIPTION("The X.25 Link Access Procedure B link layer protocol"); +MODULE_LICENSE("GPL"); + +module_init(lapb_init); +module_exit(lapb_exit); diff --git a/net/lapb/lapb_in.c b/net/lapb/lapb_in.c new file mode 100644 index 000000000000..b0f8713f66ca --- /dev/null +++ b/net/lapb/lapb_in.c @@ -0,0 +1,724 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. + * 2000-10-29 Henner Eisen lapb_data_indication() return status. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 0, Disconnected State. + * The handling of the timer(s) is in file lapb_timer.c. 
+ */ +static void lapb_state0_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S0 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S0 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S0 RX DISC(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S0 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + break; + + default: + break; + } + + kfree_skb(skb); +} + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file lapb_timer.c. 
+ */ +static void lapb_state1_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX DISC(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S1 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + break; + + case LAPB_UA: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX UA(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S3\n", + lapb->dev); +#endif + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_confirmation(lapb, LAPB_OK); + } + break; + + case LAPB_DM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", + lapb->dev); +#endif + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_indication(lapb, LAPB_REFUSED); + } + break; + } + + kfree_skb(skb); +} + +/* + * State machine for state 2, Awaiting Release State. 
+ * The handling of the timer(s) is in file lapb_timer.c + */ +static void lapb_state2_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX {SABM,SABME}(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S2 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX DISC(%d)\n", + lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S2 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + break; + + case LAPB_UA: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX UA(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", + lapb->dev); +#endif + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_confirmation(lapb, LAPB_OK); + } + break; + + case LAPB_DM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) { +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", + lapb->dev); +#endif + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_confirmation(lapb, + LAPB_NOTCONNECTED); + } + break; + + case LAPB_I: + case LAPB_REJ: + case LAPB_RNR: + case LAPB_RR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 RX {I,REJ,RNR,RR}" + "(%d)\n", lapb->dev, frame->pf); + printk(KERN_DEBUG "lapb: (%p) S2 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif + if (frame->pf) + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + break; + } + + kfree_skb(skb); +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file lapb_timer.c + */ +static void lapb_state3_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + int queued = 0; + int modulus = (lapb->mode & LAPB_EXTENDED) ? 
LAPB_EMODULUS : + LAPB_SMODULUS; + + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_requeue_frames(lapb); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_requeue_frames(lapb); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + + case LAPB_DISC: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX DISC(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", + lapb->dev); +#endif + lapb_clear_queues(lapb); + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_indication(lapb, LAPB_OK); + break; + + case LAPB_DM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX DM(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", + lapb->dev); +#endif + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb_disconnect_indication(lapb, LAPB_NOTCONNECTED); + break; + + case LAPB_RNR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX RNR(%d) R%d\n", + lapb->dev, frame->pf, frame->nr); +#endif + lapb->condition |= LAPB_PEER_RX_BUSY_CONDITION; + lapb_check_need_response(lapb, frame->cr, frame->pf); + if (lapb_validate_nr(lapb, frame->nr)) { + lapb_check_iframes_acked(lapb, frame->nr); + } else { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + } + break; + + case LAPB_RR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX RR(%d) R%d\n", + lapb->dev, frame->pf, frame->nr); +#endif + lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION; + lapb_check_need_response(lapb, frame->cr, frame->pf); + if (lapb_validate_nr(lapb, frame->nr)) { + lapb_check_iframes_acked(lapb, frame->nr); + } else { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + } + break; + + case 
LAPB_REJ: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX REJ(%d) R%d\n", + lapb->dev, frame->pf, frame->nr); +#endif + lapb->condition &= ~LAPB_PEER_RX_BUSY_CONDITION; + lapb_check_need_response(lapb, frame->cr, frame->pf); + if (lapb_validate_nr(lapb, frame->nr)) { + lapb_frames_acked(lapb, frame->nr); + lapb_stop_t1timer(lapb); + lapb->n2count = 0; + lapb_requeue_frames(lapb); + } else { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + } + break; + + case LAPB_I: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX I(%d) S%d R%d\n", + lapb->dev, frame->pf, frame->ns, frame->nr); +#endif + if (!lapb_validate_nr(lapb, frame->nr)) { + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_Z; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", + lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + break; + } + if (lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) + lapb_frames_acked(lapb, frame->nr); + else + lapb_check_iframes_acked(lapb, frame->nr); + + if (frame->ns == lapb->vr) { + int cn; + cn = lapb_data_indication(lapb, skb); + queued = 1; + /* + * If upper layer has dropped the frame, we + * basically ignore any further protocol + * processing. This will cause the peer + * to re-transmit the frame later like + * a frame lost on the wire. + */ + if (cn == NET_RX_DROP) { + printk(KERN_DEBUG + "LAPB: rx congestion\n"); + break; + } + lapb->vr = (lapb->vr + 1) % modulus; + lapb->condition &= ~LAPB_REJECT_CONDITION; + if (frame->pf) + lapb_enquiry_response(lapb); + else { + if (!(lapb->condition & + LAPB_ACK_PENDING_CONDITION)) { + lapb->condition |= LAPB_ACK_PENDING_CONDITION; + lapb_start_t2timer(lapb); + } + } + } else { + if (lapb->condition & LAPB_REJECT_CONDITION) { + if (frame->pf) + lapb_enquiry_response(lapb); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG + "lapb: (%p) S3 TX REJ(%d) R%d\n", + lapb->dev, frame->pf, lapb->vr); +#endif + lapb->condition |= LAPB_REJECT_CONDITION; + lapb_send_control(lapb, LAPB_REJ, + frame->pf, + LAPB_RESPONSE); + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; + } + } + break; + + case LAPB_FRMR: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX FRMR(%d) %02X " + "%02X %02X %02X %02X\n", lapb->dev, frame->pf, + skb->data[0], skb->data[1], skb->data[2], + skb->data[3], skb->data[4]); +#endif + lapb_establish_data_link(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S1\n", + lapb->dev); +#endif + lapb_requeue_frames(lapb); + lapb->state = LAPB_STATE_1; + break; + + case LAPB_ILLEGAL: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S3 RX ILLEGAL(%d)\n", + lapb->dev, frame->pf); +#endif + lapb->frmr_data = *frame; + lapb->frmr_type = LAPB_FRMR_W; + lapb_transmit_frmr(lapb); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S4\n", lapb->dev); +#endif + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_4; + lapb->n2count = 0; + break; + } + + if (!queued) + kfree_skb(skb); +} + +/* + * State machine for state 4, Frame Reject State. + * The handling of the timer(s) is in file lapb_timer.c. 
+ */ +static void lapb_state4_machine(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + switch (frame->type) { + case LAPB_SABM: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 RX SABM(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S4 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } + break; + + case LAPB_SABME: +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 RX SABME(%d)\n", + lapb->dev, frame->pf); +#endif + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX UA(%d)\n", + lapb->dev, frame->pf); +#endif +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S4 -> S3\n", + lapb->dev); +#endif + lapb_send_control(lapb, LAPB_UA, frame->pf, + LAPB_RESPONSE); + lapb_stop_t1timer(lapb); + lapb_stop_t2timer(lapb); + lapb->state = LAPB_STATE_3; + lapb->condition = 0x00; + lapb->n2count = 0; + lapb->vs = 0; + lapb->vr = 0; + lapb->va = 0; + lapb_connect_indication(lapb, LAPB_OK); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S4 TX DM(%d)\n", + lapb->dev, frame->pf); +#endif + lapb_send_control(lapb, LAPB_DM, frame->pf, + LAPB_RESPONSE); + } + break; + } + + kfree_skb(skb); +} + +/* + * Process an incoming LAPB frame + */ +void lapb_data_input(struct lapb_cb *lapb, struct sk_buff *skb) +{ + struct lapb_frame frame; + + if (lapb_decode(lapb, skb, &frame) < 0) { + kfree_skb(skb); + return; + } + + switch (lapb->state) { + case LAPB_STATE_0: + lapb_state0_machine(lapb, skb, &frame); break; + case LAPB_STATE_1: + lapb_state1_machine(lapb, skb, &frame); break; + case LAPB_STATE_2: + lapb_state2_machine(lapb, skb, &frame); break; + case LAPB_STATE_3: + lapb_state3_machine(lapb, skb, &frame); break; + case LAPB_STATE_4: + lapb_state4_machine(lapb, skb, &frame); break; + } + + lapb_kick(lapb); +} diff --git a/net/lapb/lapb_out.c b/net/lapb/lapb_out.c new file mode 100644 index 000000000000..49a761bd9314 --- /dev/null +++ b/net/lapb/lapb_out.c @@ -0,0 +1,224 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
+ */ +static void lapb_send_iframe(struct lapb_cb *lapb, struct sk_buff *skb, int poll_bit) +{ + unsigned char *frame; + + if (!skb) + return; + + if (lapb->mode & LAPB_EXTENDED) { + frame = skb_push(skb, 2); + + frame[0] = LAPB_I; + frame[0] |= lapb->vs << 1; + frame[1] = poll_bit ? LAPB_EPF : 0; + frame[1] |= lapb->vr << 1; + } else { + frame = skb_push(skb, 1); + + *frame = LAPB_I; + *frame |= poll_bit ? LAPB_SPF : 0; + *frame |= lapb->vr << 5; + *frame |= lapb->vs << 1; + } + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX I(%d) S%d R%d\n", + lapb->dev, lapb->state, poll_bit, lapb->vs, lapb->vr); +#endif + + lapb_transmit_buffer(lapb, skb, LAPB_COMMAND); +} + +void lapb_kick(struct lapb_cb *lapb) +{ + struct sk_buff *skb, *skbn; + unsigned short modulus, start, end; + + modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; + start = !skb_peek(&lapb->ack_queue) ? lapb->va : lapb->vs; + end = (lapb->va + lapb->window) % modulus; + + if (!(lapb->condition & LAPB_PEER_RX_BUSY_CONDITION) && + start != end && skb_peek(&lapb->write_queue)) { + lapb->vs = start; + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&lapb->write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&lapb->write_queue, skb); + break; + } + + if (skb->sk) + skb_set_owner_w(skbn, skb->sk); + + /* + * Transmit the frame copy. + */ + lapb_send_iframe(lapb, skbn, LAPB_POLLOFF); + + lapb->vs = (lapb->vs + 1) % modulus; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&lapb->ack_queue, skb); + + } while (lapb->vs != end && (skb = skb_dequeue(&lapb->write_queue)) != NULL); + + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; + + if (!lapb_t1timer_running(lapb)) + lapb_start_t1timer(lapb); + } +} + +void lapb_transmit_buffer(struct lapb_cb *lapb, struct sk_buff *skb, int type) +{ + unsigned char *ptr; + + ptr = skb_push(skb, 1); + + if (lapb->mode & LAPB_MLP) { + if (lapb->mode & LAPB_DCE) { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_C; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_D; + } else { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_D; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_C; + } + } else { + if (lapb->mode & LAPB_DCE) { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_A; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_B; + } else { + if (type == LAPB_COMMAND) + *ptr = LAPB_ADDR_B; + if (type == LAPB_RESPONSE) + *ptr = LAPB_ADDR_A; + } + } + +#if LAPB_DEBUG > 2 + printk(KERN_DEBUG "lapb: (%p) S%d TX %02X %02X %02X\n", + lapb->dev, lapb->state, + skb->data[0], skb->data[1], skb->data[2]); +#endif + + if (!lapb_data_transmit(lapb, skb)) + kfree_skb(skb); +} + +void lapb_establish_data_link(struct lapb_cb *lapb) +{ + lapb->condition = 0x00; + lapb->n2count = 0; + + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX SABME(1)\n", + lapb->dev, lapb->state); +#endif + lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX SABM(1)\n", + lapb->dev, lapb->state); +#endif + lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); + } + + lapb_start_t1timer(lapb); + lapb_stop_t2timer(lapb); +} + +void lapb_enquiry_response(struct lapb_cb *lapb) +{ +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX RR(1) R%d\n", + lapb->dev, lapb->state, lapb->vr); +#endif + + lapb_send_control(lapb, LAPB_RR, LAPB_POLLON, LAPB_RESPONSE); + + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; +} + +void 
lapb_timeout_response(struct lapb_cb *lapb) +{ +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX RR(0) R%d\n", + lapb->dev, lapb->state, lapb->vr); +#endif + lapb_send_control(lapb, LAPB_RR, LAPB_POLLOFF, LAPB_RESPONSE); + + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; +} + +void lapb_check_iframes_acked(struct lapb_cb *lapb, unsigned short nr) +{ + if (lapb->vs == nr) { + lapb_frames_acked(lapb, nr); + lapb_stop_t1timer(lapb); + lapb->n2count = 0; + } else if (lapb->va != nr) { + lapb_frames_acked(lapb, nr); + lapb_start_t1timer(lapb); + } +} + +void lapb_check_need_response(struct lapb_cb *lapb, int type, int pf) +{ + if (type == LAPB_COMMAND && pf) + lapb_enquiry_response(lapb); +} diff --git a/net/lapb/lapb_subr.c b/net/lapb/lapb_subr.c new file mode 100644 index 000000000000..5de05a0bc0ff --- /dev/null +++ b/net/lapb/lapb_subr.c @@ -0,0 +1,313 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all the queues of frames. + */ +void lapb_clear_queues(struct lapb_cb *lapb) +{ + skb_queue_purge(&lapb->write_queue); + skb_queue_purge(&lapb->ack_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void lapb_frames_acked(struct lapb_cb *lapb, unsigned short nr) +{ + struct sk_buff *skb; + int modulus; + + modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (lapb->va != nr) + while (skb_peek(&lapb->ack_queue) && lapb->va != nr) { + skb = skb_dequeue(&lapb->ack_queue); + kfree_skb(skb); + lapb->va = (lapb->va + 1) % modulus; + } +} + +void lapb_requeue_frames(struct lapb_cb *lapb) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by lapb_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ + while ((skb = skb_dequeue(&lapb->ack_queue)) != NULL) { + if (!skb_prev) + skb_queue_head(&lapb->write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int lapb_validate_nr(struct lapb_cb *lapb, unsigned short nr) +{ + unsigned short vc = lapb->va; + int modulus; + + modulus = (lapb->mode & LAPB_EXTENDED) ? LAPB_EMODULUS : LAPB_SMODULUS; + + while (vc != lapb->vs) { + if (nr == vc) + return 1; + vc = (vc + 1) % modulus; + } + + return nr == lapb->vs; +} + +/* + * This routine is the centralised routine for parsing the control + * information for the different frame formats. 
+ */ +int lapb_decode(struct lapb_cb *lapb, struct sk_buff *skb, + struct lapb_frame *frame) +{ + frame->type = LAPB_ILLEGAL; + +#if LAPB_DEBUG > 2 + printk(KERN_DEBUG "lapb: (%p) S%d RX %02X %02X %02X\n", + lapb->dev, lapb->state, + skb->data[0], skb->data[1], skb->data[2]); +#endif + + /* We always need to look at 2 bytes, sometimes we need + * to look at 3 and those cases are handled below. + */ + if (!pskb_may_pull(skb, 2)) + return -1; + + if (lapb->mode & LAPB_MLP) { + if (lapb->mode & LAPB_DCE) { + if (skb->data[0] == LAPB_ADDR_D) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_C) + frame->cr = LAPB_RESPONSE; + } else { + if (skb->data[0] == LAPB_ADDR_C) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_D) + frame->cr = LAPB_RESPONSE; + } + } else { + if (lapb->mode & LAPB_DCE) { + if (skb->data[0] == LAPB_ADDR_B) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_A) + frame->cr = LAPB_RESPONSE; + } else { + if (skb->data[0] == LAPB_ADDR_A) + frame->cr = LAPB_COMMAND; + if (skb->data[0] == LAPB_ADDR_B) + frame->cr = LAPB_RESPONSE; + } + } + + skb_pull(skb, 1); + + if (lapb->mode & LAPB_EXTENDED) { + if (!(skb->data[0] & LAPB_S)) { + if (!pskb_may_pull(skb, 2)) + return -1; + /* + * I frame - carries NR/NS/PF + */ + frame->type = LAPB_I; + frame->ns = (skb->data[0] >> 1) & 0x7F; + frame->nr = (skb->data[1] >> 1) & 0x7F; + frame->pf = skb->data[1] & LAPB_EPF; + frame->control[0] = skb->data[0]; + frame->control[1] = skb->data[1]; + skb_pull(skb, 2); + } else if ((skb->data[0] & LAPB_U) == 1) { + if (!pskb_may_pull(skb, 2)) + return -1; + /* + * S frame - take out PF/NR + */ + frame->type = skb->data[0] & 0x0F; + frame->nr = (skb->data[1] >> 1) & 0x7F; + frame->pf = skb->data[1] & LAPB_EPF; + frame->control[0] = skb->data[0]; + frame->control[1] = skb->data[1]; + skb_pull(skb, 2); + } else if ((skb->data[0] & LAPB_U) == 3) { + /* + * U frame - take out PF + */ + frame->type = skb->data[0] & ~LAPB_SPF; + frame->pf = skb->data[0] & LAPB_SPF; + frame->control[0] = skb->data[0]; + frame->control[1] = 0x00; + skb_pull(skb, 1); + } + } else { + if (!(skb->data[0] & LAPB_S)) { + /* + * I frame - carries NR/NS/PF + */ + frame->type = LAPB_I; + frame->ns = (skb->data[0] >> 1) & 0x07; + frame->nr = (skb->data[0] >> 5) & 0x07; + frame->pf = skb->data[0] & LAPB_SPF; + } else if ((skb->data[0] & LAPB_U) == 1) { + /* + * S frame - take out PF/NR + */ + frame->type = skb->data[0] & 0x0F; + frame->nr = (skb->data[0] >> 5) & 0x07; + frame->pf = skb->data[0] & LAPB_SPF; + } else if ((skb->data[0] & LAPB_U) == 3) { + /* + * U frame - take out PF + */ + frame->type = skb->data[0] & ~LAPB_SPF; + frame->pf = skb->data[0] & LAPB_SPF; + } + + frame->control[0] = skb->data[0]; + + skb_pull(skb, 1); + } + + return 0; +} + +/* + * This routine is called when the HDLC layer internally generates a + * command or response for the remote machine ( eg. RR, UA etc. ). + * Only supervisory or unnumbered frames are processed, FRMRs are handled + * by lapb_transmit_frmr below. + */ +void lapb_send_control(struct lapb_cb *lapb, int frametype, + int poll_bit, int type) +{ + struct sk_buff *skb; + unsigned char *dptr; + + if ((skb = alloc_skb(LAPB_HEADER_LEN + 3, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, LAPB_HEADER_LEN + 1); + + if (lapb->mode & LAPB_EXTENDED) { + if ((frametype & LAPB_U) == LAPB_U) { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= poll_bit ? 
LAPB_SPF : 0; + } else { + dptr = skb_put(skb, 2); + dptr[0] = frametype; + dptr[1] = (lapb->vr << 1); + dptr[1] |= poll_bit ? LAPB_EPF : 0; + } + } else { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr |= poll_bit ? LAPB_SPF : 0; + if ((frametype & LAPB_U) == LAPB_S) /* S frames carry NR */ + *dptr |= (lapb->vr << 5); + } + + lapb_transmit_buffer(lapb, skb, type); +} + +/* + * This routine generates FRMRs based on information previously stored in + * the LAPB control block. + */ +void lapb_transmit_frmr(struct lapb_cb *lapb) +{ + struct sk_buff *skb; + unsigned char *dptr; + + if ((skb = alloc_skb(LAPB_HEADER_LEN + 7, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, LAPB_HEADER_LEN + 1); + + if (lapb->mode & LAPB_EXTENDED) { + dptr = skb_put(skb, 6); + *dptr++ = LAPB_FRMR; + *dptr++ = lapb->frmr_data.control[0]; + *dptr++ = lapb->frmr_data.control[1]; + *dptr++ = (lapb->vs << 1) & 0xFE; + *dptr = (lapb->vr << 1) & 0xFE; + if (lapb->frmr_data.cr == LAPB_RESPONSE) + *dptr |= 0x01; + dptr++; + *dptr++ = lapb->frmr_type; + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX FRMR %02X %02X %02X %02X %02X\n", + lapb->dev, lapb->state, + skb->data[1], skb->data[2], skb->data[3], + skb->data[4], skb->data[5]); +#endif + } else { + dptr = skb_put(skb, 4); + *dptr++ = LAPB_FRMR; + *dptr++ = lapb->frmr_data.control[0]; + *dptr = (lapb->vs << 1) & 0x0E; + *dptr |= (lapb->vr << 5) & 0xE0; + if (lapb->frmr_data.cr == LAPB_RESPONSE) + *dptr |= 0x10; + dptr++; + *dptr++ = lapb->frmr_type; + +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S%d TX FRMR %02X %02X %02X\n", + lapb->dev, lapb->state, skb->data[1], + skb->data[2], skb->data[3]); +#endif + } + + lapb_transmit_buffer(lapb, skb, LAPB_RESPONSE); +} diff --git a/net/lapb/lapb_timer.c b/net/lapb/lapb_timer.c new file mode 100644 index 000000000000..2c8f0f809220 --- /dev/null +++ b/net/lapb/lapb_timer.c @@ -0,0 +1,189 @@ +/* + * LAPB release 002 + * + * This code REQUIRES 2.1.15 or higher/ NET3.038 + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * LAPB 001 Jonathan Naylor Started Coding + * LAPB 002 Jonathan Naylor New timer architecture. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void lapb_t1timer_expiry(unsigned long); +static void lapb_t2timer_expiry(unsigned long); + +void lapb_start_t1timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t1timer); + + lapb->t1timer.data = (unsigned long)lapb; + lapb->t1timer.function = &lapb_t1timer_expiry; + lapb->t1timer.expires = jiffies + lapb->t1; + + add_timer(&lapb->t1timer); +} + +void lapb_start_t2timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t2timer); + + lapb->t2timer.data = (unsigned long)lapb; + lapb->t2timer.function = &lapb_t2timer_expiry; + lapb->t2timer.expires = jiffies + lapb->t2; + + add_timer(&lapb->t2timer); +} + +void lapb_stop_t1timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t1timer); +} + +void lapb_stop_t2timer(struct lapb_cb *lapb) +{ + del_timer(&lapb->t2timer); +} + +int lapb_t1timer_running(struct lapb_cb *lapb) +{ + return timer_pending(&lapb->t1timer); +} + +static void lapb_t2timer_expiry(unsigned long param) +{ + struct lapb_cb *lapb = (struct lapb_cb *)param; + + if (lapb->condition & LAPB_ACK_PENDING_CONDITION) { + lapb->condition &= ~LAPB_ACK_PENDING_CONDITION; + lapb_timeout_response(lapb); + } +} + +static void lapb_t1timer_expiry(unsigned long param) +{ + struct lapb_cb *lapb = (struct lapb_cb *)param; + + switch (lapb->state) { + + /* + * If we are a DCE, keep going DM .. DM .. DM + */ + case LAPB_STATE_0: + if (lapb->mode & LAPB_DCE) + lapb_send_control(lapb, LAPB_DM, LAPB_POLLOFF, LAPB_RESPONSE); + break; + + /* + * Awaiting connection state, send SABM(E), up to N2 times. + */ + case LAPB_STATE_1: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S1 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; + if (lapb->mode & LAPB_EXTENDED) { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX SABME(1)\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_SABME, LAPB_POLLON, LAPB_COMMAND); + } else { +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S1 TX SABM(1)\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_SABM, LAPB_POLLON, LAPB_COMMAND); + } + } + break; + + /* + * Awaiting disconnection state, send DISC, up to N2 times. + */ + case LAPB_STATE_2: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_confirmation(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S2 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; +#if LAPB_DEBUG > 1 + printk(KERN_DEBUG "lapb: (%p) S2 TX DISC(1)\n", lapb->dev); +#endif + lapb_send_control(lapb, LAPB_DISC, LAPB_POLLON, LAPB_COMMAND); + } + break; + + /* + * Data transfer state, retransmit I frames, up to N2 times. + */ + case LAPB_STATE_3: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_stop_t2timer(lapb); + lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S3 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; + lapb_requeue_frames(lapb); + } + break; + + /* + * Frame reject state, retransmit FRMR frames, up to N2 times. 
+ */ + case LAPB_STATE_4: + if (lapb->n2count == lapb->n2) { + lapb_clear_queues(lapb); + lapb->state = LAPB_STATE_0; + lapb_disconnect_indication(lapb, LAPB_TIMEDOUT); +#if LAPB_DEBUG > 0 + printk(KERN_DEBUG "lapb: (%p) S4 -> S0\n", lapb->dev); +#endif + return; + } else { + lapb->n2count++; + lapb_transmit_frmr(lapb); + } + break; + } + + lapb_start_t1timer(lapb); +} diff --git a/net/llc/Kconfig b/net/llc/Kconfig new file mode 100644 index 000000000000..b91c65108162 --- /dev/null +++ b/net/llc/Kconfig @@ -0,0 +1,10 @@ +config LLC + tristate + depends on NET + +config LLC2 + tristate "ANSI/IEEE 802.2 LLC type 2 Support" + select LLC + help + This is a Logical Link Layer type 2, connection oriented support. + Select this if you want to have support for PF_LLC sockets. diff --git a/net/llc/Makefile b/net/llc/Makefile new file mode 100644 index 000000000000..5ebd4ed2bd42 --- /dev/null +++ b/net/llc/Makefile @@ -0,0 +1,24 @@ +########################################################################### +# Makefile for the Linux 802.2 LLC (fully-functional) layer. +# +# Copyright (c) 1997 by Procom Technology,Inc. +# 2001-2003 by Arnaldo Carvalho de Melo +# +# This program can be redistributed or modified under the terms of the +# GNU General Public License as published by the Free Software Foundation. +# This program is distributed without any warranty or implied warranty +# of merchantability or fitness for a particular purpose. +# +# See the GNU General Public License for more details. +########################################################################### + +obj-$(CONFIG_LLC) += llc.o + +llc-y := llc_core.o llc_input.o llc_output.o + +obj-$(CONFIG_LLC2) += llc2.o + +llc2-y := llc_if.o llc_c_ev.o llc_c_ac.o llc_conn.o llc_c_st.o llc_pdu.o \ + llc_sap.o llc_s_ac.o llc_s_ev.o llc_s_st.o af_llc.o llc_station.o + +llc2-$(CONFIG_PROC_FS) += llc_proc.o diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c new file mode 100644 index 000000000000..20b4cfebd74c --- /dev/null +++ b/net/llc/af_llc.c @@ -0,0 +1,1079 @@ +/* + * af_llc.c - LLC User Interface SAPs + * Description: + * Functions in this module are implementation of socket based llc + * communications for the Linux operating system. Support of llc class + * one and class two is provided via SOCK_DGRAM and SOCK_STREAM + * respectively. + * + * An llc2 connection is (mac + sap), only one llc2 sap connection + * is allowed per mac. Though one sap may have multiple mac + sap + * connections. + * + * Copyright (c) 2001 by Jay Schulist + * 2002-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* remember: uninitialized global data is zeroed because its in .bss */ +static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; +static u16 llc_ui_sap_link_no_max[256]; +static struct sockaddr_llc llc_ui_addrnull; +static struct proto_ops llc_ui_ops; + +static int llc_ui_wait_for_conn(struct sock *sk, int timeout); +static int llc_ui_wait_for_disc(struct sock *sk, int timeout); +static int llc_ui_wait_for_data(struct sock *sk, int timeout); +static int llc_ui_wait_for_busy_core(struct sock *sk, int timeout); + +#if 0 +#define dprintk(args...) printk(KERN_DEBUG args) +#else +#define dprintk(args...) +#endif + +/** + * llc_ui_next_link_no - return the next unused link number for a sap + * @sap: Address of sap to get link number from. + * + * Return the next unused link number for a given sap. + */ +static __inline__ u16 llc_ui_next_link_no(int sap) +{ + return llc_ui_sap_link_no_max[sap]++; +} + +/** + * llc_proto_type - return eth protocol for ARP header type + * @arphrd: ARP header type. + * + * Given an ARP header type return the corresponding ethernet protocol. + */ +static __inline__ u16 llc_proto_type(u16 arphrd) +{ + return arphrd == ARPHRD_IEEE802_TR ? + htons(ETH_P_TR_802_2) : htons(ETH_P_802_2); +} + +/** + * llc_ui_addr_null - determines if a address structure is null + * @addr: Address to test if null. + */ +static __inline__ u8 llc_ui_addr_null(struct sockaddr_llc *addr) +{ + return !memcmp(addr, &llc_ui_addrnull, sizeof(*addr)); +} + +/** + * llc_ui_header_len - return length of llc header based on operation + * @sk: Socket which contains a valid llc socket type. + * @addr: Complete sockaddr_llc structure received from the user. + * + * Provide the length of the llc header depending on what kind of + * operation the user would like to perform and the type of socket. + * Returns the correct llc header length. + */ +static __inline__ u8 llc_ui_header_len(struct sock *sk, + struct sockaddr_llc *addr) +{ + u8 rc = LLC_PDU_LEN_U; + + if (addr->sllc_test || addr->sllc_xid) + rc = LLC_PDU_LEN_U; + else if (sk->sk_type == SOCK_STREAM) + rc = LLC_PDU_LEN_I; + return rc; +} + +/** + * llc_ui_send_data - send data via reliable llc2 connection + * @sk: Connection the socket is using. + * @skb: Data the user wishes to send. + * @addr: Source and destination fields provided by the user. + * @noblock: can we block waiting for data? + * + * Send data via reliable llc2 connection. + * Returns 0 upon success, non-zero if action did not succeed. + */ +static int llc_ui_send_data(struct sock* sk, struct sk_buff *skb, int noblock) +{ + struct llc_sock* llc = llc_sk(sk); + int rc = 0; + + if (llc_data_accept_state(llc->state) || llc->p_flag) { + int timeout = sock_sndtimeo(sk, noblock); + + rc = llc_ui_wait_for_busy_core(sk, timeout); + } + if (!rc) + rc = llc_build_and_send_pkt(sk, skb); + return rc; +} + +static void llc_ui_sk_init(struct socket *sock, struct sock *sk) +{ + sk->sk_type = sock->type; + sk->sk_sleep = &sock->wait; + sk->sk_socket = sock; + sock->sk = sk; + sock->ops = &llc_ui_ops; +} + +static struct proto llc_proto = { + .name = "DDP", + .owner = THIS_MODULE, + .obj_size = sizeof(struct llc_sock), +}; + +/** + * llc_ui_create - alloc and init a new llc_ui socket + * @sock: Socket to initialize and attach allocated sk to. + * @protocol: Unused. + * + * Allocate and initialize a new llc_ui socket, validate the user wants a + * socket type we have available. 
+ * Returns 0 upon success, negative upon failure. + */ +static int llc_ui_create(struct socket *sock, int protocol) +{ + struct sock *sk; + int rc = -ESOCKTNOSUPPORT; + + if (sock->type == SOCK_DGRAM || sock->type == SOCK_STREAM) { + rc = -ENOMEM; + sk = llc_sk_alloc(PF_LLC, GFP_KERNEL, &llc_proto); + if (sk) { + rc = 0; + llc_ui_sk_init(sock, sk); + } + } + return rc; +} + +/** + * llc_ui_release - shutdown socket + * @sock: Socket to release. + * + * Shutdown and deallocate an existing socket. + */ +static int llc_ui_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc; + + if (!sk) + goto out; + sock_hold(sk); + lock_sock(sk); + llc = llc_sk(sk); + dprintk("%s: closing local(%02X) remote(%02X)\n", __FUNCTION__, + llc->laddr.lsap, llc->daddr.lsap); + if (!llc_send_disc(sk)) + llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + if (!sock_flag(sk, SOCK_ZAPPED)) + llc_sap_remove_socket(llc->sap, sk); + release_sock(sk); + if (llc->sap && hlist_empty(&llc->sap->sk_list.list)) { + llc_release_sockets(llc->sap); + llc_sap_close(llc->sap); + } + if (llc->dev) + dev_put(llc->dev); + sock_put(sk); + llc_sk_free(sk); +out: + return 0; +} + +/** + * llc_ui_autoport - provide dynamically allocate SAP number + * + * Provide the caller with a dynamically allocated SAP number according + * to the rules that are set in this function. Returns: 0, upon failure, + * SAP number otherwise. + */ +static int llc_ui_autoport(void) +{ + struct llc_sap *sap; + int i, tries = 0; + + while (tries < LLC_SAP_DYN_TRIES) { + for (i = llc_ui_sap_last_autoport; + i < LLC_SAP_DYN_STOP; i += 2) { + sap = llc_sap_find(i); + if (!sap) { + llc_ui_sap_last_autoport = i + 2; + goto out; + } + } + llc_ui_sap_last_autoport = LLC_SAP_DYN_START; + tries++; + } + i = 0; +out: + return i; +} + +/** + * llc_ui_autobind - Bind a socket to a specific address. + * @sk: Socket to bind an address to. + * @addr: Address the user wants the socket bound to. + * + * Bind a socket to a specific address. For llc a user is able to bind to + * a specific sap only or mac + sap. If the user only specifies a sap and + * a null dmac (all zeros) the user is attempting to bind to an entire + * sap. This will stop anyone else on the local system from using that + * sap. If someone else has a mac + sap open the bind to null + sap will + * fail. + * If the user desires to bind to a specific mac + sap, it is possible to + * have multiple sap connections via multiple macs. + * Bind and autobind for that matter must enforce the correct sap usage + * otherwise all hell will break loose. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_autobind(struct socket *sock, struct sockaddr_llc *addr) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap; + int rc = -EINVAL; + + if (!sock_flag(sk, SOCK_ZAPPED)) + goto out; + rc = -ENODEV; + llc->dev = dev_getfirstbyhwtype(addr->sllc_arphrd); + if (!llc->dev) + goto out; + rc = -EUSERS; + llc->laddr.lsap = llc_ui_autoport(); + if (!llc->laddr.lsap) + goto out; + rc = -EBUSY; /* some other network layer is using the sap */ + sap = llc_sap_open(llc->laddr.lsap, NULL); + if (!sap) + goto out; + memcpy(llc->laddr.mac, llc->dev->dev_addr, IFHWADDRLEN); + memcpy(&llc->addr, addr, sizeof(llc->addr)); + /* assign new connection to its SAP */ + llc_sap_add_socket(sap, sk); + sock_reset_flag(sk, SOCK_ZAPPED); + rc = 0; +out: + return rc; +} + +/** + * llc_ui_bind - bind a socket to a specific address. + * @sock: Socket to bind an address to. 
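llc_ui_autoport() and llc_ui_autobind() above are what give a PF_LLC socket a local SAP when the caller does not pick one; from user space the same effect can be requested by binding with sllc_sap set to zero, which makes llc_ui_bind() (below) fall back to llc_ui_autoport(). A hedged sketch follows; it assumes PF_LLC/AF_LLC and struct sockaddr_llc are visible through <sys/socket.h> and <linux/llc.h>, which may require reasonably current libc/kernel headers.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <net/if_arp.h>		/* ARPHRD_ETHER */
#include <linux/llc.h>		/* struct sockaddr_llc */

int main(void)
{
	struct sockaddr_llc laddr;
	int fd = socket(PF_LLC, SOCK_DGRAM, 0);

	if (fd < 0) {
		perror("socket(PF_LLC)");
		return 1;
	}
	memset(&laddr, 0, sizeof(laddr));
	laddr.sllc_family = AF_LLC;
	laddr.sllc_arphrd = ARPHRD_ETHER;
	laddr.sllc_sap    = 0;		/* 0: let the kernel pick a dynamic SAP */

	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0) {
		perror("bind");
		return 1;
	}
	return 0;
}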
+ * @uaddr: Address the user wants the socket bound to. + * @addrlen: Length of the uaddr structure. + * + * Bind a socket to a specific address. For llc a user is able to bind to + * a specific sap only or mac + sap. If the user only specifies a sap and + * a null dmac (all zeros) the user is attempting to bind to an entire + * sap. This will stop anyone else on the local system from using that + * sap. If someone else has a mac + sap open the bind to null + sap will + * fail. + * If the user desires to bind to a specific mac + sap, it is possible to + * have multiple sap connections via multiple macs. + * Bind and autobind for that matter must enforce the correct sap usage + * otherwise all hell will break loose. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) +{ + struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap; + int rc = -EINVAL; + + dprintk("%s: binding %02X\n", __FUNCTION__, addr->sllc_sap); + if (!sock_flag(sk, SOCK_ZAPPED) || addrlen != sizeof(*addr)) + goto out; + rc = -EAFNOSUPPORT; + if (addr->sllc_family != AF_LLC) + goto out; + if (!addr->sllc_sap) { + rc = -EUSERS; + addr->sllc_sap = llc_ui_autoport(); + if (!addr->sllc_sap) + goto out; + } + sap = llc_sap_find(addr->sllc_sap); + if (!sap) { + sap = llc_sap_open(addr->sllc_sap, NULL); + rc = -EBUSY; /* some other network layer is using the sap */ + if (!sap) + goto out; + } else { + struct llc_addr laddr, daddr; + struct sock *ask; + + memset(&laddr, 0, sizeof(laddr)); + memset(&daddr, 0, sizeof(daddr)); + /* + * FIXME: check if the the address is multicast, + * only SOCK_DGRAM can do this. + */ + memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); + laddr.lsap = addr->sllc_sap; + rc = -EADDRINUSE; /* mac + sap clash. */ + ask = llc_lookup_established(sap, &daddr, &laddr); + if (ask) { + sock_put(ask); + goto out; + } + } + llc->laddr.lsap = addr->sllc_sap; + memcpy(llc->laddr.mac, addr->sllc_mac, IFHWADDRLEN); + memcpy(&llc->addr, addr, sizeof(llc->addr)); + /* assign new connection to its SAP */ + llc_sap_add_socket(sap, sk); + sock_reset_flag(sk, SOCK_ZAPPED); + rc = 0; +out: + return rc; +} + +/** + * llc_ui_shutdown - shutdown a connect llc2 socket. + * @sock: Socket to shutdown. + * @how: What part of the socket to shutdown. + * + * Shutdown a connected llc2 socket. Currently this function only supports + * shutting down both sends and receives (2), we could probably make this + * function such that a user can shutdown only half the connection but not + * right now. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_shutdown(struct socket *sock, int how) +{ + struct sock *sk = sock->sk; + int rc = -ENOTCONN; + + lock_sock(sk); + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + rc = -EINVAL; + if (how != 2) + goto out; + rc = llc_send_disc(sk); + if (!rc) + rc = llc_ui_wait_for_disc(sk, sk->sk_rcvtimeo); + /* Wake up anyone sleeping in poll */ + sk->sk_state_change(sk); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_connect - Connect to a remote llc2 mac + sap. + * @sock: Socket which will be connected to the remote destination. + * @uaddr: Remote and possibly the local address of the new connection. + * @addrlen: Size of uaddr structure. + * @flags: Operational flags specified by the user. + * + * Connect to a remote llc2 mac + sap. 
The caller must specify the + * destination mac and address to connect to. If the user hasn't previously + * called bind(2) with a smac the address of the first interface of the + * specified arp type will be used. + * This function will autobind if user did not previously call bind. + * Returns: 0 upon success, negative otherwise. + */ +static int llc_ui_connect(struct socket *sock, struct sockaddr *uaddr, + int addrlen, int flags) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct sockaddr_llc *addr = (struct sockaddr_llc *)uaddr; + struct net_device *dev; + int rc = -EINVAL; + + lock_sock(sk); + if (addrlen != sizeof(*addr)) + goto out; + rc = -EAFNOSUPPORT; + if (addr->sllc_family != AF_LLC) + goto out; + /* bind connection to sap if user hasn't done it. */ + if (sock_flag(sk, SOCK_ZAPPED)) { + /* bind to sap with null dev, exclusive */ + rc = llc_ui_autobind(sock, addr); + if (rc) + goto out; + llc->daddr.lsap = addr->sllc_sap; + memcpy(llc->daddr.mac, addr->sllc_mac, IFHWADDRLEN); + } + dev = llc->dev; + if (sk->sk_type != SOCK_STREAM) + goto out; + rc = -EALREADY; + if (sock->state == SS_CONNECTING) + goto out; + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + llc->link = llc_ui_next_link_no(llc->sap->laddr.lsap); + rc = llc_establish_connection(sk, dev->dev_addr, + addr->sllc_mac, addr->sllc_sap); + if (rc) { + dprintk("%s: llc_ui_send_conn failed :-(\n", __FUNCTION__); + sock->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + goto out; + } + rc = llc_ui_wait_for_conn(sk, sk->sk_rcvtimeo); + if (rc) + dprintk("%s: llc_ui_wait_for_conn failed=%d\n", __FUNCTION__, rc); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_listen - allow a normal socket to accept incoming connections + * @sock: Socket to allow incoming connections on. + * @backlog: Number of connections to queue. + * + * Allow a normal socket to accept incoming connections. + * Returns 0 upon success, negative otherwise. 
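As the comment above llc_ui_connect() explains, the active side supplies the remote MAC and SAP and is autobound if it never called bind(2). A hedged user-space sketch of that path; the MAC address and SAP 0x42 are made-up example values, and the header assumptions are the same as in the earlier bind sketch.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <net/if.h>		/* IFHWADDRLEN */
#include <net/if_arp.h>		/* ARPHRD_ETHER */
#include <linux/llc.h>

int main(void)
{
	unsigned char peer_mac[IFHWADDRLEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct sockaddr_llc daddr;
	int fd = socket(PF_LLC, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	memset(&daddr, 0, sizeof(daddr));
	daddr.sllc_family = AF_LLC;
	daddr.sllc_arphrd = ARPHRD_ETHER;	/* used to pick the device on autobind */
	daddr.sllc_sap    = 0x42;		/* remote SAP */
	memcpy(daddr.sllc_mac, peer_mac, IFHWADDRLEN);

	if (connect(fd, (struct sockaddr *)&daddr, sizeof(daddr)) < 0) {
		perror("connect");
		return 1;
	}
	/* the llc2 connection is established; data now travels as I frames */
	return 0;
}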
+ */ +static int llc_ui_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int rc = -EINVAL; + + lock_sock(sk); + if (sock->state != SS_UNCONNECTED) + goto out; + rc = -EOPNOTSUPP; + if (sk->sk_type != SOCK_STREAM) + goto out; + rc = -EAGAIN; + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + rc = 0; + if (!(unsigned)backlog) /* BSDism */ + backlog = 1; + sk->sk_max_ack_backlog = backlog; + if (sk->sk_state != TCP_LISTEN) { + sk->sk_ack_backlog = 0; + sk->sk_state = TCP_LISTEN; + } + sk->sk_socket->flags |= __SO_ACCEPTCON; +out: + release_sock(sk); + return rc; +} + +static int llc_ui_wait_for_disc(struct sock *sk, int timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int rc; + + add_wait_queue_exclusive(sk->sk_sleep, &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + rc = 0; + if (sk->sk_state != TCP_CLOSE) { + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } else + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return rc; +} + +static int llc_ui_wait_for_conn(struct sock *sk, int timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int rc; + + add_wait_queue_exclusive(sk->sk_sleep, &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + rc = -EAGAIN; + if (sk->sk_state == TCP_CLOSE) + break; + rc = 0; + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } else + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return rc; +} + +static int llc_ui_wait_for_data(struct sock *sk, int timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int rc = 0; + + add_wait_queue_exclusive(sk->sk_sleep, &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + /* + * Well, if we have backlog, try to process it now. + */ + if (sk->sk_backlog.tail) { + release_sock(sk); + lock_sock(sk); + } + rc = 0; + if (skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } else + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return rc; +} + +static int llc_ui_wait_for_busy_core(struct sock *sk, int timeout) +{ + DECLARE_WAITQUEUE(wait, current); + struct llc_sock *llc = llc_sk(sk); + int rc; + + add_wait_queue_exclusive(sk->sk_sleep, &wait); + for (;;) { + dprintk("%s: looping...\n", __FUNCTION__); + __set_current_state(TASK_INTERRUPTIBLE); + rc = -ENOTCONN; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + rc = 0; + if (llc_data_accept_state(llc->state) || llc->p_flag) { + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } else + break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return rc; +} + +/** + * llc_ui_accept - accept a new incoming connection. + * @sock: Socket which connections arrive on. + * @newsock: Socket to move incoming connection to. + * @flags: User specified operational flags. + * + * Accept a new incoming connection. 
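llc_ui_listen() above and llc_ui_accept() below cover the passive side, with the llc_ui_wait_for_*() helpers doing the blocking by dropping the socket lock around schedule_timeout(). A hedged user-space sketch of a listener, under the same header assumptions as before and with an arbitrary example SAP:

#include <string.h>
#include <sys/socket.h>
#include <net/if_arp.h>
#include <linux/llc.h>

/* returns an accepted connection, or -1 on error */
int llc_listener(void)
{
	struct sockaddr_llc laddr;
	int fd = socket(PF_LLC, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	memset(&laddr, 0, sizeof(laddr));
	laddr.sllc_family = AF_LLC;
	laddr.sllc_arphrd = ARPHRD_ETHER;
	laddr.sllc_sap    = 0x42;	/* local SAP to listen on */

	if (bind(fd, (struct sockaddr *)&laddr, sizeof(laddr)) < 0 ||
	    listen(fd, 5) < 0)
		return -1;

	/* blocks in llc_ui_wait_for_data() until a connection request is queued */
	return accept(fd, NULL, NULL);
}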
+ * Returns 0 upon success, negative otherwise. + */ +static int llc_ui_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk, *newsk; + struct llc_sock *llc, *newllc; + struct sk_buff *skb; + int rc = -EOPNOTSUPP; + + dprintk("%s: accepting on %02X\n", __FUNCTION__, + llc_sk(sk)->laddr.lsap); + lock_sock(sk); + if (sk->sk_type != SOCK_STREAM) + goto out; + rc = -EINVAL; + if (sock->state != SS_UNCONNECTED || sk->sk_state != TCP_LISTEN) + goto out; + /* wait for a connection to arrive. */ + rc = llc_ui_wait_for_data(sk, sk->sk_rcvtimeo); + if (rc) + goto out; + dprintk("%s: got a new connection on %02X\n", __FUNCTION__, + llc_sk(sk)->laddr.lsap); + skb = skb_dequeue(&sk->sk_receive_queue); + rc = -EINVAL; + if (!skb->sk) + goto frees; + rc = 0; + newsk = skb->sk; + /* attach connection to a new socket. */ + llc_ui_sk_init(newsock, newsk); + sock_reset_flag(newsk, SOCK_ZAPPED); + newsk->sk_state = TCP_ESTABLISHED; + newsock->state = SS_CONNECTED; + llc = llc_sk(sk); + newllc = llc_sk(newsk); + memcpy(&newllc->addr, &llc->addr, sizeof(newllc->addr)); + newllc->link = llc_ui_next_link_no(newllc->laddr.lsap); + + /* put original socket back into a clean listen state. */ + sk->sk_state = TCP_LISTEN; + sk->sk_ack_backlog--; + skb->sk = NULL; + dprintk("%s: ok success on %02X, client on %02X\n", __FUNCTION__, + llc_sk(sk)->addr.sllc_sap, newllc->daddr.lsap); +frees: + kfree_skb(skb); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_recvmsg - copy received data to the socket user. + * @sock: Socket to copy data from. + * @msg: Various user space related information. + * @size: Size of user buffer. + * @flags: User specified flags. + * + * Copy received data to the socket user. + * Returns non-negative upon success, negative otherwise. + */ +static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_llc *uaddr = (struct sockaddr_llc *)msg->msg_name; + struct sk_buff *skb; + size_t copied = 0; + int rc = -ENOMEM, timeout; + int noblock = flags & MSG_DONTWAIT; + + dprintk("%s: receiving in %02X from %02X\n", __FUNCTION__, + llc_sk(sk)->laddr.lsap, llc_sk(sk)->daddr.lsap); + lock_sock(sk); + timeout = sock_rcvtimeo(sk, noblock); + rc = llc_ui_wait_for_data(sk, timeout); + if (rc) { + dprintk("%s: llc_ui_wait_for_data failed recv " + "in %02X from %02X\n", __FUNCTION__, + llc_sk(sk)->laddr.lsap, llc_sk(sk)->daddr.lsap); + goto out; + } + skb = skb_dequeue(&sk->sk_receive_queue); + if (!skb) /* shutdown */ + goto out; + copied = skb->len; + if (copied > size) + copied = size; + rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (rc) + goto dgram_free; + if (skb->len > copied) { + skb_pull(skb, copied); + skb_queue_head(&sk->sk_receive_queue, skb); + } + if (uaddr) + memcpy(uaddr, llc_ui_skb_cb(skb), sizeof(*uaddr)); + msg->msg_namelen = sizeof(*uaddr); + if (!skb->list) { +dgram_free: + kfree_skb(skb); + } +out: + release_sock(sk); + return rc ? : copied; +} + +/** + * llc_ui_sendmsg - Transmit data provided by the socket user. + * @sock: Socket to transmit data from. + * @msg: Various user related information. + * @len: Length of data to transmit. + * + * Transmit data provided by the socket user. + * Returns non-negative upon success, negative otherwise. 
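llc_ui_recvmsg() above dequeues one PDU at a time, copies at most the requested length and puts any remainder back at the head of the receive queue. A hedged sketch of the corresponding user-space read; the recvfrom address is chiefly meaningful on datagram sockets, where it reports the sender's MAC and SAP.

#include <sys/types.h>
#include <sys/socket.h>
#include <linux/llc.h>

ssize_t llc_read(int fd, void *buf, size_t len)
{
	struct sockaddr_llc from;
	socklen_t fromlen = sizeof(from);

	/* on SOCK_DGRAM sockets, from.sllc_mac/from.sllc_sap identify the peer */
	return recvfrom(fd, buf, len, 0, (struct sockaddr *)&from, &fromlen);
}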
+ */ +static int llc_ui_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + struct sockaddr_llc *addr = (struct sockaddr_llc *)msg->msg_name; + int flags = msg->msg_flags; + int noblock = flags & MSG_DONTWAIT; + struct net_device *dev; + struct sk_buff *skb; + size_t size = 0; + int rc = -EINVAL, copied = 0, hdrlen; + + dprintk("%s: sending from %02X to %02X\n", __FUNCTION__, + llc->laddr.lsap, llc->daddr.lsap); + lock_sock(sk); + if (addr) { + if (msg->msg_namelen < sizeof(*addr)) + goto release; + } else { + if (llc_ui_addr_null(&llc->addr)) + goto release; + addr = &llc->addr; + } + /* must bind connection to sap if user hasn't done it. */ + if (sock_flag(sk, SOCK_ZAPPED)) { + /* bind to sap with null dev, exclusive. */ + rc = llc_ui_autobind(sock, addr); + if (rc) + goto release; + } + dev = llc->dev; + hdrlen = dev->hard_header_len + llc_ui_header_len(sk, addr); + size = hdrlen + len; + if (size > dev->mtu) + size = dev->mtu; + copied = size - hdrlen; + release_sock(sk); + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + lock_sock(sk); + if (!skb) + goto release; + skb->sk = sk; + skb->dev = dev; + skb->protocol = llc_proto_type(addr->sllc_arphrd); + skb_reserve(skb, hdrlen); + rc = memcpy_fromiovec(skb_put(skb, copied), msg->msg_iov, copied); + if (rc) + goto out; + if (sk->sk_type == SOCK_DGRAM || addr->sllc_ua) { + llc_build_and_send_ui_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); + goto out; + } + if (addr->sllc_test) { + llc_build_and_send_test_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); + goto out; + } + if (addr->sllc_xid) { + llc_build_and_send_xid_pkt(llc->sap, skb, addr->sllc_mac, + addr->sllc_sap); + goto out; + } + rc = -ENOPROTOOPT; + if (!(sk->sk_type == SOCK_STREAM && !addr->sllc_ua)) + goto out; + rc = llc_ui_send_data(sk, skb, noblock); + if (rc) + dprintk("%s: llc_ui_send_data failed: %d\n", __FUNCTION__, rc); +out: + if (rc) + kfree_skb(skb); +release: + if (rc) + dprintk("%s: failed sending from %02X to %02X: %d\n", + __FUNCTION__, llc->laddr.lsap, llc->daddr.lsap, rc); + release_sock(sk); + return rc ? : copied; +} + +/** + * llc_ui_getname - return the address info of a socket + * @sock: Socket to get address of. + * @uaddr: Address structure to return information. + * @uaddrlen: Length of address structure. + * @peer: Does user want local or remote address information. + * + * Return the address information of a socket. 
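llc_ui_sendmsg() above routes a transmit request by the address flags: a plain SOCK_DGRAM send (or sllc_ua set) becomes a UI frame, sllc_test a TEST command, sllc_xid an XID command, while connected SOCK_STREAM data goes through llc_ui_send_data(). A hedged sketch of the UI case; llc_send_ui() and its parameters are invented names, with the same header assumptions as before.

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <linux/llc.h>

ssize_t llc_send_ui(int fd, const unsigned char *dmac, unsigned char dsap,
		    const void *buf, size_t len)
{
	struct sockaddr_llc to;

	memset(&to, 0, sizeof(to));
	to.sllc_family = AF_LLC;
	to.sllc_arphrd = ARPHRD_ETHER;
	to.sllc_sap    = dsap;
	memcpy(to.sllc_mac, dmac, IFHWADDRLEN);
	/* setting to.sllc_test = 1 (or to.sllc_xid = 1) would request a
	 * TEST (or XID) PDU instead of a UI frame */

	return sendto(fd, buf, len, 0, (struct sockaddr *)&to, sizeof(to));
}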
+ */ +static int llc_ui_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddrlen, int peer) +{ + struct sockaddr_llc sllc; + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + int rc = 0; + + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + *uaddrlen = sizeof(sllc); + memset(uaddr, 0, *uaddrlen); + if (peer) { + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + if(llc->dev) + sllc.sllc_arphrd = llc->dev->type; + sllc.sllc_sap = llc->daddr.lsap; + memcpy(&sllc.sllc_mac, &llc->daddr.mac, IFHWADDRLEN); + } else { + rc = -EINVAL; + if (!llc->sap) + goto out; + sllc.sllc_sap = llc->sap->laddr.lsap; + + if (llc->dev) { + sllc.sllc_arphrd = llc->dev->type; + memcpy(&sllc.sllc_mac, &llc->dev->dev_addr, + IFHWADDRLEN); + } + } + rc = 0; + sllc.sllc_family = AF_LLC; + memcpy(uaddr, &sllc, sizeof(sllc)); +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_ioctl - io controls for PF_LLC + * @sock: Socket to get/set info + * @cmd: command + * @arg: optional argument for cmd + * + * get/set info on llc sockets + */ +static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + return dev_ioctl(cmd, (void __user *)arg); +} + +/** + * llc_ui_setsockopt - set various connection specific parameters. + * @sock: Socket to set options on. + * @level: Socket level user is requesting operations on. + * @optname: Operation name. + * @optval User provided operation data. + * @optlen: Length of optval. + * + * Set various connection specific parameters. + */ +static int llc_ui_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + int rc = -EINVAL, opt; + + lock_sock(sk); + if (level != SOL_LLC || optlen != sizeof(int)) + goto out; + rc = get_user(opt, (int __user *)optval); + if (rc) + goto out; + rc = -EINVAL; + switch (optname) { + case LLC_OPT_RETRY: + if (opt > LLC_OPT_MAX_RETRY) + goto out; + llc->n2 = opt; + break; + case LLC_OPT_SIZE: + if (opt > LLC_OPT_MAX_SIZE) + goto out; + llc->n1 = opt; + break; + case LLC_OPT_ACK_TMR_EXP: + if (opt > LLC_OPT_MAX_ACK_TMR_EXP) + goto out; + llc->ack_timer.expire = opt; + break; + case LLC_OPT_P_TMR_EXP: + if (opt > LLC_OPT_MAX_P_TMR_EXP) + goto out; + llc->pf_cycle_timer.expire = opt; + break; + case LLC_OPT_REJ_TMR_EXP: + if (opt > LLC_OPT_MAX_REJ_TMR_EXP) + goto out; + llc->rej_sent_timer.expire = opt; + break; + case LLC_OPT_BUSY_TMR_EXP: + if (opt > LLC_OPT_MAX_BUSY_TMR_EXP) + goto out; + llc->busy_state_timer.expire = opt; + break; + case LLC_OPT_TX_WIN: + if (opt > LLC_OPT_MAX_WIN) + goto out; + llc->k = opt; + break; + case LLC_OPT_RX_WIN: + if (opt > LLC_OPT_MAX_WIN) + goto out; + llc->rw = opt; + break; + default: + rc = -ENOPROTOOPT; + goto out; + } + rc = 0; +out: + release_sock(sk); + return rc; +} + +/** + * llc_ui_getsockopt - get connection specific socket info + * @sock: Socket to get information from. + * @level: Socket level user is requesting operations on. + * @optname: Operation name. + * @optval: Variable to return operation data in. + * @optlen: Length of optval. + * + * Get connection specific socket information. 
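llc_ui_setsockopt() above and llc_ui_getsockopt() below expose the llc2 protocol parameters (retry count N2, PDU size N1, the four timer expiries and the two window sizes) as integer SOL_LLC options. A hedged sketch of tuning a connection; the values are arbitrary examples within the LLC_OPT_MAX_* limits, and SOL_LLC is defined locally in case the libc headers predate it.

#include <stdio.h>
#include <sys/socket.h>
#include <linux/llc.h>

#ifndef SOL_LLC
#define SOL_LLC 268		/* value from <linux/socket.h> */
#endif

void llc_tune(int fd)
{
	int retries = 5;	/* N2: retransmission attempts     */
	int txwin   = 4;	/* k: transmit window, in I frames */
	int val;
	socklen_t len = sizeof(val);

	setsockopt(fd, SOL_LLC, LLC_OPT_RETRY,  &retries, sizeof(retries));
	setsockopt(fd, SOL_LLC, LLC_OPT_TX_WIN, &txwin,   sizeof(txwin));

	if (!getsockopt(fd, SOL_LLC, LLC_OPT_RX_WIN, &val, &len))
		printf("receive window: %d\n", val);
}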
+ */ +static int llc_ui_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); + int val = 0, len = 0, rc = -EINVAL; + + lock_sock(sk); + if (level != SOL_LLC) + goto out; + rc = get_user(len, optlen); + if (rc) + goto out; + rc = -EINVAL; + if (len != sizeof(int)) + goto out; + switch (optname) { + case LLC_OPT_RETRY: + val = llc->n2; break; + case LLC_OPT_SIZE: + val = llc->n1; break; + case LLC_OPT_ACK_TMR_EXP: + val = llc->ack_timer.expire; break; + case LLC_OPT_P_TMR_EXP: + val = llc->pf_cycle_timer.expire; break; + case LLC_OPT_REJ_TMR_EXP: + val = llc->rej_sent_timer.expire; break; + case LLC_OPT_BUSY_TMR_EXP: + val = llc->busy_state_timer.expire; break; + case LLC_OPT_TX_WIN: + val = llc->k; break; + case LLC_OPT_RX_WIN: + val = llc->rw; break; + default: + rc = -ENOPROTOOPT; + goto out; + } + rc = 0; + if (put_user(len, optlen) || copy_to_user(optval, &val, len)) + rc = -EFAULT; +out: + release_sock(sk); + return rc; +} + +static struct net_proto_family llc_ui_family_ops = { + .family = PF_LLC, + .create = llc_ui_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops llc_ui_ops = { + .family = PF_LLC, + .owner = THIS_MODULE, + .release = llc_ui_release, + .bind = llc_ui_bind, + .connect = llc_ui_connect, + .socketpair = sock_no_socketpair, + .accept = llc_ui_accept, + .getname = llc_ui_getname, + .poll = datagram_poll, + .ioctl = llc_ui_ioctl, + .listen = llc_ui_listen, + .shutdown = llc_ui_shutdown, + .setsockopt = llc_ui_setsockopt, + .getsockopt = llc_ui_getsockopt, + .sendmsg = llc_ui_sendmsg, + .recvmsg = llc_ui_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +extern void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb); +extern void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb); + +static int __init llc2_init(void) +{ + int rc = proto_register(&llc_proto, 0); + + if (rc != 0) + goto out; + + llc_build_offset_table(); + llc_station_init(); + llc_ui_sap_last_autoport = LLC_SAP_DYN_START; + rc = llc_proc_init(); + if (rc != 0) + goto out_unregister_llc_proto; + sock_register(&llc_ui_family_ops); + llc_add_pack(LLC_DEST_SAP, llc_sap_handler); + llc_add_pack(LLC_DEST_CONN, llc_conn_handler); +out: + return rc; +out_unregister_llc_proto: + proto_unregister(&llc_proto); + goto out; +} + +static void __exit llc2_exit(void) +{ + llc_station_exit(); + llc_remove_pack(LLC_DEST_SAP); + llc_remove_pack(LLC_DEST_CONN); + sock_unregister(PF_LLC); + llc_proc_exit(); + proto_unregister(&llc_proto); +} + +module_init(llc2_init); +module_exit(llc2_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. Melo 2001-2003"); +MODULE_DESCRIPTION("IEEE 802.2 PF_LLC support"); +MODULE_ALIAS_NETPROTO(PF_LLC); diff --git a/net/llc/llc_c_ac.c b/net/llc/llc_c_ac.c new file mode 100644 index 000000000000..b218be4c10ec --- /dev/null +++ b/net/llc/llc_c_ac.c @@ -0,0 +1,1514 @@ +/* + * llc_c_ac.c - actions performed during connection state transition. + * + * Description: + * Functions in this module are implementation of connection component actions + * Details of actions can be found in IEEE-802.2 standard document. + * All functions have one connection and one event as input argument. All of + * them return 0 On success and 1 otherwise. + * + * Copyright (c) 1997 by Procom Technology, Inc. 
+ * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llc_output.h" + +static int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb); +static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb); +static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *ev); + +static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb); + +static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, + struct sk_buff *skb); + +static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb); + +#define INCORRECT 0 + +int llc_conn_ac_clear_remote_busy(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->remote_busy_flag) { + u8 nr; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + llc->remote_busy_flag = 0; + del_timer(&llc->busy_state_timer.timer); + nr = LLC_I_GET_NR(pdu); + llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); + } + return 0; +} + +int llc_conn_ac_conn_ind(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOTCONN; + u8 dsap; + struct llc_sap *sap; + + llc_pdu_decode_dsap(skb, &dsap); + sap = llc_sap_find(dsap); + if (sap) { + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + struct llc_sock *llc = llc_sk(sk); + + llc_pdu_decode_sa(skb, llc->daddr.mac); + llc_pdu_decode_da(skb, llc->laddr.mac); + llc->dev = skb->dev; + ev->ind_prim = LLC_CONN_PRIM; + rc = 0; + } + return rc; +} + +int llc_conn_ac_conn_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->cfm_prim = LLC_CONN_PRIM; + return 0; +} + +static int llc_conn_ac_data_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->cfm_prim = LLC_DATA_PRIM; + return 0; +} + +int llc_conn_ac_data_ind(struct sock *sk, struct sk_buff *skb) +{ + llc_conn_rtn_pdu(sk, skb); + return 0; +} + +int llc_conn_ac_disc_ind(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + u8 reason = 0; + int rc = 0; + + if (ev->type == LLC_CONN_EV_TYPE_PDU) { + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM) + reason = LLC_DISC_REASON_RX_DM_RSP_PDU; + else if (LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC) + reason = LLC_DISC_REASON_RX_DISC_CMD_PDU; + } else if (ev->type == LLC_CONN_EV_TYPE_ACK_TMR) + reason = LLC_DISC_REASON_ACK_TMR_EXP; + else { + reason = 0; + rc = -EINVAL; + } + if (!rc) { + ev->reason = reason; + ev->ind_prim = LLC_DISC_PRIM; + } + return rc; +} + +int llc_conn_ac_disc_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->reason = ev->status; + ev->cfm_prim = LLC_DISC_PRIM; + return 0; +} + +int llc_conn_ac_rst_ind(struct sock *sk, struct sk_buff *skb) +{ + u8 reason = 0; + int rc = 1; + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + struct llc_sock *llc = llc_sk(sk); + + switch (ev->type) { + case LLC_CONN_EV_TYPE_PDU: + if 
(LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR) { + reason = LLC_RESET_REASON_LOCAL; + rc = 0; + } else if (LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME) { + reason = LLC_RESET_REASON_REMOTE; + rc = 0; + } else { + reason = 0; + rc = 1; + } + break; + case LLC_CONN_EV_TYPE_ACK_TMR: + case LLC_CONN_EV_TYPE_P_TMR: + case LLC_CONN_EV_TYPE_REJ_TMR: + case LLC_CONN_EV_TYPE_BUSY_TMR: + if (llc->retry_count > llc->n2) { + reason = LLC_RESET_REASON_LOCAL; + rc = 0; + } else + rc = 1; + break; + } + if (!rc) { + ev->reason = reason; + ev->ind_prim = LLC_RESET_PRIM; + } + return rc; +} + +int llc_conn_ac_rst_confirm(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->reason = 0; + ev->cfm_prim = LLC_RESET_PRIM; + return 0; +} + +int llc_conn_ac_clear_remote_busy_if_f_eq_1(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && llc_sk(sk)->ack_pf) + llc_conn_ac_clear_remote_busy(sk, skb); + return 0; +} + +int llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->data_flag == 2) + del_timer(&llc->rej_sent_timer.timer); + return 0; +} + +int llc_conn_ac_send_disc_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_disc_cmd(nskb, 1); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + llc_conn_ac_set_p_flag_1(sk, skb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_dm_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + u8 f_bit; + + nskb->dev = llc->dev; + llc_pdu_decode_pf_bit(skb, &f_bit); + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_dm_rsp(nskb, f_bit); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_dm_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + u8 f_bit = 1; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_dm_rsp(nskb, f_bit); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_frmr_rsp_f_set_x(struct sock *sk, struct sk_buff *skb) +{ + u8 f_bit; + int rc = -ENOBUFS; + struct sk_buff *nskb; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + struct llc_sock *llc = llc_sk(sk); + + llc->rx_pdu_hdr = *((u32 *)pdu); + if (LLC_PDU_IS_CMD(pdu)) + llc_pdu_decode_pf_bit(skb, &f_bit); + else + 
f_bit = 0; + nskb = llc_alloc_frame(); + if (nskb) { + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, + llc->vR, INCORRECT); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_resend_frmr_rsp_f_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + u8 f_bit = 0; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + struct llc_pdu_sn *pdu = (struct llc_pdu_sn *)&llc->rx_pdu_hdr; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, + llc->vR, INCORRECT); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_resend_frmr_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) +{ + u8 f_bit; + int rc = -ENOBUFS; + struct sk_buff *nskb; + + llc_pdu_decode_pf_bit(skb, &f_bit); + nskb = llc_alloc_frame(); + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_frmr_rsp(nskb, pdu, f_bit, llc->vS, + llc->vR, INCORRECT); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_i_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_i_cmd(skb, 1, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (!rc) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return rc; +} + +static int llc_conn_ac_send_i_cmd_p_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (!rc) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return rc; +} + +int llc_conn_ac_send_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_i_cmd(skb, 0, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (!rc) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return 0; +} + +int llc_conn_ac_resend_i_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 nr = LLC_I_GET_NR(pdu); + + llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); + return 0; +} + +int 
llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr(struct sock *sk, + struct sk_buff *skb) +{ + u8 nr; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (!rc) + llc_conn_send_pdu(sk, nskb); + else + kfree_skb(skb); + } + if (rc) { + nr = LLC_I_GET_NR(pdu); + rc = 0; + llc_conn_resend_i_pdu_as_cmd(sk, nr, 0); + } + return rc; +} + +int llc_conn_ac_resend_i_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 nr = LLC_I_GET_NR(pdu); + + llc_conn_resend_i_pdu_as_rsp(sk, nr, 1); + return 0; +} + +int llc_conn_ac_send_rej_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_rej_cmd(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rej_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + u8 f_bit = 1; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rej_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rej_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + u8 f_bit = 0; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rej_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rnr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_rnr_cmd(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rnr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + u8 f_bit = 1; + + 
nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rnr_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + u8 f_bit = 0; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rnr_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_set_remote_busy(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!llc->remote_busy_flag) { + llc->remote_busy_flag = 1; + mod_timer(&llc->busy_state_timer.timer, + jiffies + llc->busy_state_timer.expire * HZ); + } + return 0; +} + +int llc_conn_ac_opt_send_rnr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rnr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rr_cmd_p_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_rr_cmd(nskb, 1, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rr_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + u8 f_bit = 1; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_ack_rsp_f_set_1(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + u8 f_bit = 1; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, f_bit, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: 
+ return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_rr_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_ack_xxx_x_set_0(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, 0, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +void llc_conn_set_p_flag(struct sock *sk, u8 value) +{ + int state_changed = llc_sk(sk)->p_flag && !value; + + llc_sk(sk)->p_flag = value; + + if (state_changed) + sk->sk_state_change(sk); +} + +int llc_conn_ac_send_sabme_cmd_p_set_x(struct sock *sk, struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + struct llc_sock *llc = llc_sk(sk); + + if (nskb) { + struct llc_sap *sap = llc->sap; + u8 *dmac = llc->daddr.mac; + + if (llc->dev->flags & IFF_LOOPBACK) + dmac = llc->dev->dev_addr; + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_sabme_cmd(nskb, 1); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, dmac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + llc_conn_set_p_flag(sk, 1); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_send_ua_rsp_f_set_p(struct sock *sk, struct sk_buff *skb) +{ + u8 f_bit; + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + llc_pdu_decode_pf_bit(skb, &f_bit); + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_ua_rsp(nskb, f_bit); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +int llc_conn_ac_set_s_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->s_flag = 0; + return 0; +} + +int llc_conn_ac_set_s_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->s_flag = 1; + return 0; +} + +int llc_conn_ac_start_p_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + llc_conn_set_p_flag(sk, 1); + mod_timer(&llc->pf_cycle_timer.timer, + jiffies + llc->pf_cycle_timer.expire * HZ); + return 0; +} + +/** + * llc_conn_ac_send_ack_if_needed - check if ack is needed + * @sk: current connection structure + * @skb: current event + * + * Checks number of received PDUs which have not been acknowledged, yet, + * If number of them reaches to "npta"(Number of PDUs To Acknowledge) then + * sends an RR response as acknowledgement for them. 
Returns 0 for + * success, 1 otherwise. + */ +int llc_conn_ac_send_ack_if_needed(struct sock *sk, struct sk_buff *skb) +{ + u8 pf_bit; + struct llc_sock *llc = llc_sk(sk); + + llc_pdu_decode_pf_bit(skb, &pf_bit); + llc->ack_pf |= pf_bit & 1; + if (!llc->ack_must_be_send) { + llc->first_pdu_Ns = llc->vR; + llc->ack_must_be_send = 1; + llc->ack_pf = pf_bit & 1; + } + if (((llc->vR - llc->first_pdu_Ns + 129) % 128) >= llc->npta) { + llc_conn_ac_send_rr_rsp_f_set_ackpf(sk, skb); + llc->ack_must_be_send = 0; + llc->ack_pf = 0; + llc_conn_ac_inc_npta_value(sk, skb); + } + return 0; +} + +/** + * llc_conn_ac_rst_sendack_flag - resets ack_must_be_send flag + * @sk: current connection structure + * @skb: current event + * + * This action resets ack_must_be_send flag of given connection, this flag + * indicates if there is any PDU which has not been acknowledged yet. + * Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_rst_sendack_flag(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->ack_must_be_send = llc_sk(sk)->ack_pf = 0; + return 0; +} + +/** + * llc_conn_ac_send_i_rsp_f_set_ackpf - acknowledge received PDUs + * @sk: current connection structure + * @skb: current event + * + * Sends an I response PDU with f-bit set to ack_pf flag as acknowledge to + * all received PDUs which have not been acknowledged, yet. ack_pf flag is + * set to one if one PDU with p-bit set to one is received. Returns 0 for + * success, 1 otherwise. + */ +static int llc_conn_ac_send_i_rsp_f_set_ackpf(struct sock *sk, + struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_I, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_i_cmd(skb, llc->ack_pf, llc->vS, llc->vR); + rc = llc_mac_hdr_init(skb, llc->dev->dev_addr, llc->daddr.mac); + if (!rc) { + llc_conn_send_pdu(sk, skb); + llc_conn_ac_inc_vs_by_1(sk, skb); + } + return rc; +} + +/** + * llc_conn_ac_send_i_as_ack - sends an I-format PDU to acknowledge rx PDUs + * @sk: current connection structure. + * @skb: current event. + * + * This action sends an I-format PDU as acknowledge to received PDUs which + * have not been acknowledged, yet, if there is any. By using of this + * action number of acknowledgements decreases, this technic is called + * piggy backing. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_send_i_as_ack(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->ack_must_be_send) { + llc_conn_ac_send_i_rsp_f_set_ackpf(sk, skb); + llc->ack_must_be_send = 0 ; + llc->ack_pf = 0; + } else + llc_conn_ac_send_i_cmd_p_set_0(sk, skb); + return 0; +} + +/** + * llc_conn_ac_send_rr_rsp_f_set_ackpf - ack all rx PDUs not yet acked + * @sk: current connection structure. + * @skb: current event. + * + * This action sends an RR response with f-bit set to ack_pf flag as + * acknowledge to all received PDUs which have not been acknowledged, yet, + * if there is any. ack_pf flag indicates if a PDU has been received with + * p-bit set to one. Returns 0 for success, 1 otherwise. 
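llc_conn_ac_send_ack_if_needed() above batches acknowledgements: it remembers the first unacknowledged N(S) in first_pdu_Ns and sends an RR response once the count of PDUs from there up to V(R) reaches npta, taking the difference modulo 128 so it survives sequence-number wrap. A small stand-alone illustration of that arithmetic (not kernel code; the function name is invented):

#include <assert.h>

/* same expression as in llc_conn_ac_send_ack_if_needed(): the "+ 129"
 * (i.e. + 128 + 1) keeps the difference in range across a wrap and
 * counts the first PDU itself */
static unsigned int llc_rx_pdus_to_ack(unsigned int vR, unsigned int first_pdu_Ns)
{
	return (vR - first_pdu_Ns + 129) % 128;
}

int main(void)
{
	assert(llc_rx_pdus_to_ack(5, 5) == 1);	 /* no advance yet: count is 1       */
	assert(llc_rx_pdus_to_ack(2, 126) == 5); /* 126 -> 2 wraps: distance 4, + 1   */
	return 0;
}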
+ */ +static int llc_conn_ac_send_rr_rsp_f_set_ackpf(struct sock *sk, + struct sk_buff *skb) +{ + int rc = -ENOBUFS; + struct sk_buff *nskb = llc_alloc_frame(); + + if (nskb) { + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + nskb->dev = llc->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_S, sap->laddr.lsap, + llc->daddr.lsap, LLC_PDU_RSP); + llc_pdu_init_as_rr_rsp(nskb, llc->ack_pf, llc->vR); + rc = llc_mac_hdr_init(nskb, llc->dev->dev_addr, llc->daddr.mac); + if (rc) + goto free; + llc_conn_send_pdu(sk, nskb); + } +out: + return rc; +free: + kfree_skb(nskb); + goto out; +} + +/** + * llc_conn_ac_inc_npta_value - tries to make value of npta greater + * @sk: current connection structure. + * @skb: current event. + * + * After "inc_cntr" times calling of this action, "npta" increase by one. + * this action tries to make vale of "npta" greater as possible; number of + * acknowledgements decreases by increasing of "npta". Returns 0 for + * success, 1 otherwise. + */ +static int llc_conn_ac_inc_npta_value(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!llc->inc_cntr) { + llc->dec_step = 0; + llc->dec_cntr = llc->inc_cntr = 2; + ++llc->npta; + if (llc->npta > 127) + llc->npta = 127 ; + } else + --llc->inc_cntr; + return 0; +} + +/** + * llc_conn_ac_adjust_npta_by_rr - decreases "npta" by one + * @sk: current connection structure. + * @skb: current event. + * + * After receiving "dec_cntr" times RR command, this action decreases + * "npta" by one. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_adjust_npta_by_rr(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!llc->connect_step && !llc->remote_busy_flag) { + if (!llc->dec_step) { + if (!llc->dec_cntr) { + llc->inc_cntr = llc->dec_cntr = 2; + if (llc->npta > 0) + llc->npta = llc->npta - 1; + } else + llc->dec_cntr -=1; + } + } else + llc->connect_step = 0 ; + return 0; +} + +/** + * llc_conn_ac_adjust_npta_by_rnr - decreases "npta" by one + * @sk: current connection structure. + * @skb: current event. + * + * After receiving "dec_cntr" times RNR command, this action decreases + * "npta" by one. Returns 0 for success, 1 otherwise. + */ +int llc_conn_ac_adjust_npta_by_rnr(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (llc->remote_busy_flag) + if (!llc->dec_step) { + if (!llc->dec_cntr) { + llc->inc_cntr = llc->dec_cntr = 2; + if (llc->npta > 0) + --llc->npta; + } else + --llc->dec_cntr; + } + return 0; +} + +/** + * llc_conn_ac_dec_tx_win_size - decreases tx window size + * @sk: current connection structure. + * @skb: current event. + * + * After receiving of a REJ command or response, transmit window size is + * decreased by number of PDUs which are outstanding yet. Returns 0 for + * success, 1 otherwise. + */ +int llc_conn_ac_dec_tx_win_size(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + u8 unacked_pdu = skb_queue_len(&llc->pdu_unack_q); + + llc->k -= unacked_pdu; + if (llc->k < 2) + llc->k = 2; + return 0; +} + +/** + * llc_conn_ac_inc_tx_win_size - tx window size is inc by 1 + * @sk: current connection structure. + * @skb: current event. + * + * After receiving an RR response with f-bit set to one, transmit window + * size is increased by one. Returns 0 for success, 1 otherwise. 
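llc_conn_ac_dec_tx_win_size() above and llc_conn_ac_inc_tx_win_size() just below adapt the transmit window k: a REJ shrinks it by the number of still-outstanding I PDUs, never below 2, and an RR response with the F bit set grows it by one, never above 128. A stand-alone illustration of that clamping (function names are invented; not kernel code):

/* window after a REJ: shrink by the PDUs still awaiting acknowledgement */
static int llc_k_on_rej(int k, int outstanding)
{
	k -= outstanding;
	return k < 2 ? 2 : k;		/* floor of 2 */
}

/* window after an RR response with F = 1: grow by one */
static int llc_k_on_rr_final(int k)
{
	return k >= 128 ? 128 : k + 1;	/* ceiling of 128 */
}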
+ */ +int llc_conn_ac_inc_tx_win_size(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + llc->k += 1; + if (llc->k > 128) + llc->k = 128 ; + return 0; +} + +int llc_conn_ac_stop_all_timers(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + del_timer(&llc->pf_cycle_timer.timer); + del_timer(&llc->ack_timer.timer); + del_timer(&llc->rej_sent_timer.timer); + del_timer(&llc->busy_state_timer.timer); + llc->ack_must_be_send = 0; + llc->ack_pf = 0; + return 0; +} + +int llc_conn_ac_stop_other_timers(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + del_timer(&llc->rej_sent_timer.timer); + del_timer(&llc->pf_cycle_timer.timer); + del_timer(&llc->busy_state_timer.timer); + llc->ack_must_be_send = 0; + llc->ack_pf = 0; + return 0; +} + +int llc_conn_ac_start_ack_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + mod_timer(&llc->ack_timer.timer, jiffies + llc->ack_timer.expire * HZ); + return 0; +} + +int llc_conn_ac_start_rej_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + mod_timer(&llc->rej_sent_timer.timer, + jiffies + llc->rej_sent_timer.expire * HZ); + return 0; +} + +int llc_conn_ac_start_ack_tmr_if_not_running(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + if (!timer_pending(&llc->ack_timer.timer)) + mod_timer(&llc->ack_timer.timer, + jiffies + llc->ack_timer.expire * HZ); + return 0; +} + +int llc_conn_ac_stop_ack_timer(struct sock *sk, struct sk_buff *skb) +{ + del_timer(&llc_sk(sk)->ack_timer.timer); + return 0; +} + +int llc_conn_ac_stop_p_timer(struct sock *sk, struct sk_buff *skb) +{ + struct llc_sock *llc = llc_sk(sk); + + del_timer(&llc->pf_cycle_timer.timer); + llc_conn_set_p_flag(sk, 0); + return 0; +} + +int llc_conn_ac_stop_rej_timer(struct sock *sk, struct sk_buff *skb) +{ + del_timer(&llc_sk(sk)->rej_sent_timer.timer); + return 0; +} + +int llc_conn_ac_upd_nr_received(struct sock *sk, struct sk_buff *skb) +{ + int acked; + u16 unacked = 0; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + struct llc_sock *llc = llc_sk(sk); + + llc->last_nr = PDU_SUPV_GET_Nr(pdu); + acked = llc_conn_remove_acked_pdus(sk, llc->last_nr, &unacked); + /* On loopback we don't queue I frames in unack_pdu_q queue. */ + if (acked > 0 || (llc->dev->flags & IFF_LOOPBACK)) { + llc->retry_count = 0; + del_timer(&llc->ack_timer.timer); + if (llc->failed_data_req) { + /* already, we did not accept data from upper layer + * (tx_window full or unacceptable state). Now, we + * can send data and must inform to upper layer. 
+ */ + llc->failed_data_req = 0; + llc_conn_ac_data_confirm(sk, skb); + } + if (unacked) + mod_timer(&llc->ack_timer.timer, + jiffies + llc->ack_timer.expire * HZ); + } else if (llc->failed_data_req) { + u8 f_bit; + + llc_pdu_decode_pf_bit(skb, &f_bit); + if (f_bit == 1) { + llc->failed_data_req = 0; + llc_conn_ac_data_confirm(sk, skb); + } + } + return 0; +} + +int llc_conn_ac_upd_p_flag(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu)) { + u8 f_bit; + + llc_pdu_decode_pf_bit(skb, &f_bit); + if (f_bit) { + llc_conn_set_p_flag(sk, 0); + llc_conn_ac_stop_p_timer(sk, skb); + } + } + return 0; +} + +int llc_conn_ac_set_data_flag_2(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->data_flag = 2; + return 0; +} + +int llc_conn_ac_set_data_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->data_flag = 0; + return 0; +} + +int llc_conn_ac_set_data_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->data_flag = 1; + return 0; +} + +int llc_conn_ac_set_data_flag_1_if_data_flag_eq_0(struct sock *sk, + struct sk_buff *skb) +{ + if (!llc_sk(sk)->data_flag) + llc_sk(sk)->data_flag = 1; + return 0; +} + +int llc_conn_ac_set_p_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_conn_set_p_flag(sk, 0); + return 0; +} + +static int llc_conn_ac_set_p_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_conn_set_p_flag(sk, 1); + return 0; +} + +int llc_conn_ac_set_remote_busy_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->remote_busy_flag = 0; + return 0; +} + +int llc_conn_ac_set_cause_flag_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->cause_flag = 0; + return 0; +} + +int llc_conn_ac_set_cause_flag_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->cause_flag = 1; + return 0; +} + +int llc_conn_ac_set_retry_cnt_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->retry_count = 0; + return 0; +} + +int llc_conn_ac_inc_retry_cnt_by_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->retry_count++; + return 0; +} + +int llc_conn_ac_set_vr_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vR = 0; + return 0; +} + +int llc_conn_ac_inc_vr_by_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vR = PDU_GET_NEXT_Vr(llc_sk(sk)->vR); + return 0; +} + +int llc_conn_ac_set_vs_0(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vS = 0; + return 0; +} + +int llc_conn_ac_set_vs_nr(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vS = llc_sk(sk)->last_nr; + return 0; +} + +int llc_conn_ac_inc_vs_by_1(struct sock *sk, struct sk_buff *skb) +{ + llc_sk(sk)->vS = (llc_sk(sk)->vS + 1) % 128; + return 0; +} + +void llc_conn_pf_cycle_tmr_cb(unsigned long timeout_data) +{ + struct sock *sk = (struct sock *)timeout_data; + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + + bh_lock_sock(sk); + if (skb) { + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + skb->sk = sk; + ev->type = LLC_CONN_EV_TYPE_P_TMR; + llc_process_tmr_ev(sk, skb); + } + bh_unlock_sock(sk); +} + +void llc_conn_busy_tmr_cb(unsigned long timeout_data) +{ + struct sock *sk = (struct sock *)timeout_data; + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + + bh_lock_sock(sk); + if (skb) { + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + skb->sk = sk; + ev->type = LLC_CONN_EV_TYPE_BUSY_TMR; + llc_process_tmr_ev(sk, skb); + } + bh_unlock_sock(sk); +} + +void llc_conn_ack_tmr_cb(unsigned long timeout_data) +{ + struct sock* sk = (struct sock *)timeout_data; + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); 
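+	/* As in the other timer callbacks, the zero-length skb is used
+	 * only as a carrier for the timer-expiration event.
+	 */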
+
+	bh_lock_sock(sk);
+	if (skb) {
+		struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+		skb->sk = sk;
+		ev->type = LLC_CONN_EV_TYPE_ACK_TMR;
+		llc_process_tmr_ev(sk, skb);
+	}
+	bh_unlock_sock(sk);
+}
+
+void llc_conn_rej_tmr_cb(unsigned long timeout_data)
+{
+	struct sock *sk = (struct sock *)timeout_data;
+	struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC);
+
+	bh_lock_sock(sk);
+	if (skb) {
+		struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+		skb->sk = sk;
+		ev->type = LLC_CONN_EV_TYPE_REJ_TMR;
+		llc_process_tmr_ev(sk, skb);
+	}
+	bh_unlock_sock(sk);
+}
+
+int llc_conn_ac_rst_vs(struct sock *sk, struct sk_buff *skb)
+{
+	llc_sk(sk)->X = llc_sk(sk)->vS;
+	llc_conn_ac_set_vs_nr(sk, skb);
+	return 0;
+}
+
+int llc_conn_ac_upd_vs(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+	u8 nr = PDU_SUPV_GET_Nr(pdu);
+
+	if (llc_circular_between(llc_sk(sk)->vS, nr, llc_sk(sk)->X))
+		llc_conn_ac_set_vs_nr(sk, skb);
+	return 0;
+}
+
+/*
+ * Non-standard actions; these are not contained in the IEEE specification;
+ * for our own use
+ */
+/**
+ * llc_conn_disc - removes connection from SAP list and frees it
+ * @sk: closed connection
+ * @skb: occurred event
+ */
+int llc_conn_disc(struct sock *sk, struct sk_buff *skb)
+{
+	/* FIXME: this thing seems to want to die */
+	return 0;
+}
+
+/**
+ * llc_conn_reset - resets connection
+ * @sk: connection being reset.
+ * @skb: occurred event.
+ *
+ * Stop all timers, empty all queues and reset all flags.
+ */
+int llc_conn_reset(struct sock *sk, struct sk_buff *skb)
+{
+	llc_sk_reset(sk);
+	return 0;
+}
+
+/**
+ * llc_circular_between - tells whether b is circularly between a and c
+ * @a: lower bound
+ * @b: element to check whether it is between @a and @c
+ * @c: upper bound
+ *
+ * This function tells whether b is between a and c in the circular
+ * sequence-number space (for example, 0 is between 127 and 1): the
+ * subtractions below wrap modulo 256, so for a = 127, b = 0, c = 1 we get
+ * b - a = 129 and c - a = 130, hence b is in range. Returns 1 if b is
+ * between a and c, 0 otherwise.
+ */
+u8 llc_circular_between(u8 a, u8 b, u8 c)
+{
+	b = b - a;
+	c = c - a;
+	return b <= c;
+}
+
+/**
+ * llc_process_tmr_ev - timer backend
+ * @sk: active connection
+ * @skb: occurred event
+ *
+ * This function is called from the timer callback functions. When the
+ * connection is busy (while a data frame is being sent) the timer
+ * expiration event must be queued; otherwise the event can be passed to
+ * the connection state machine directly. Queued events are processed by
+ * the llc_backlog_rcv function after the data frame has been sent.
+ */
+static void llc_process_tmr_ev(struct sock *sk, struct sk_buff *skb)
+{
+	if (llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) {
+		printk(KERN_WARNING "%s: timer called on closed connection\n",
+		       __FUNCTION__);
+		kfree_skb(skb);
+	} else {
+		if (!sock_owned_by_user(sk))
+			llc_conn_state_process(sk, skb);
+		else {
+			llc_set_backlog_type(skb, LLC_EVENT);
+			sk_add_backlog(sk, skb);
+		}
+	}
+}
diff --git a/net/llc/llc_c_ev.c b/net/llc/llc_c_ev.c
new file mode 100644
index 000000000000..cd130c3b72bc
--- /dev/null
+++ b/net/llc/llc_c_ev.c
@@ -0,0 +1,769 @@
+/*
+ * llc_c_ev.c - Connection component state transition event qualifiers
+ *
+ * A 'state' consists of a number of possible event matching functions,
+ * the actions associated with each being executed when that event is
+ * matched; a 'state machine' accepts events in a serial fashion from an
+ * event queue. Each event is passed to each successive event matching
+ * function until a match is made (the event matching function returns
+ * success, or '0') or the list of event matching functions is exhausted.
+ * If a match is made, the actions associated with the event are executed
+ * and the state is changed to that event's transition state. Before some
+ * events are recognized, even after a match has been made, a certain
+ * number of 'event qualifier' functions must also be executed. If these
+ * all execute successfully, then the event is finally executed.
+ *
+ * These event functions must return 0 for success, to show a matched
+ * event, or 1 if the event does not match. Event qualifier functions
+ * must return a 0 for success or a non-zero value for failure. Each
+ * function is simply responsible for verifying one single thing and
+ * returning either a success or a failure.
+ *
+ * All of the following event functions are described in the 802.2 LLC
+ * protocol standard document, except for two functions that we added;
+ * those are explained in their own comments below.
+ *
+ * Copyright (c) 1997 by Procom Technology, Inc.
+ * 2001-2003 by Arnaldo Carvalho de Melo
+ *
+ * This program can be redistributed or modified under the terms of the
+ * GNU General Public License as published by the Free Software Foundation.
+ * This program is distributed without any warranty or implied warranty
+ * of merchantability or fitness for a particular purpose.
+ *
+ * See the GNU General Public License for more details.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#if 1
+#define dprintk(args...) printk(KERN_DEBUG args)
+#else
+#define dprintk(args...)
+#endif
+
+extern u8 llc_circular_between(u8 a, u8 b, u8 c);
+
+/**
+ * llc_util_ns_inside_rx_window - check if sequence number is in rx window
+ * @ns: sequence number of received pdu.
+ * @vr: sequence number which receiver expects to receive.
+ * @rw: receive window size of receiver.
+ *
+ * Checks if the sequence number of a received PDU is in the range of the
+ * receive window. Returns 0 for success, 1 otherwise.
+ */
+static u16 llc_util_ns_inside_rx_window(u8 ns, u8 vr, u8 rw)
+{
+	return !llc_circular_between(vr, ns,
+				     (vr + rw - 1) % LLC_2_SEQ_NBR_MODULO);
+}
+
+/**
+ * llc_util_nr_inside_tx_window - check if sequence number is in tx window
+ * @sk: current connection.
+ * @nr: N(R) of received PDU.
+ *
+ * This routine checks if the N(R) of a received PDU is in the range of the
+ * transmit window; in other words, whether the received PDU acknowledges
+ * some outstanding PDUs that are in the transmit window. Returns 0 for
+ * success, 1 otherwise.
+ */
+static u16 llc_util_nr_inside_tx_window(struct sock *sk, u8 nr)
+{
+	u8 nr1, nr2;
+	struct sk_buff *skb;
+	struct llc_pdu_sn *pdu;
+	struct llc_sock *llc = llc_sk(sk);
+	int rc = 0;
+
+	if (llc->dev->flags & IFF_LOOPBACK)
+		goto out;
+	rc = 1;
+	if (!skb_queue_len(&llc->pdu_unack_q))
+		goto out;
+	skb = skb_peek(&llc->pdu_unack_q);
+	pdu = llc_pdu_sn_hdr(skb);
+	nr1 = LLC_I_GET_NS(pdu);
+	skb = skb_peek_tail(&llc->pdu_unack_q);
+	pdu = llc_pdu_sn_hdr(skb);
+	nr2 = LLC_I_GET_NS(pdu);
+	rc = !llc_circular_between(nr1, nr, (nr2 + 1) % LLC_2_SEQ_NBR_MODULO);
+out:
+	return rc;
+}
+
+int llc_conn_ev_conn_req(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	return ev->prim == LLC_CONN_PRIM &&
+	       ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1;
+}
+
+int llc_conn_ev_data_req(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	return ev->prim == LLC_DATA_PRIM &&
+	       ev->prim_type == LLC_PRIM_TYPE_REQ ?
0 : 1; +} + +int llc_conn_ev_disc_req(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->prim == LLC_DISC_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_conn_ev_rst_req(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->prim == LLC_RESET_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_conn_ev_local_busy_detected(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type == LLC_CONN_EV_TYPE_SIMPLE && + ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_DETECTED ? 0 : 1; +} + +int llc_conn_ev_local_busy_cleared(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type == LLC_CONN_EV_TYPE_SIMPLE && + ev->prim_type == LLC_CONN_EV_LOCAL_BUSY_CLEARED ? 0 : 1; +} + +int llc_conn_ev_rx_bad_pdu(struct sock *sk, struct sk_buff *skb) +{ + return 1; +} + +int llc_conn_ev_rx_disc_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_DISC ? 0 : 1; +} + +int llc_conn_ev_rx_dm_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_DM ? 0 : 1; +} + +int llc_conn_ev_rx_frmr_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_FRMR ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn * pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + u16 rc = LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && + llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 
0 : 1; + if (!rc) + dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", + __FUNCTION__, llc_sk(sk)->state, ns, vr); + return rc; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_GET_NS(pdu) == llc_sk(sk)->vR ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_0(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && + LLC_I_PF_IS_1(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && + !llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; +} + +int llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vr = llc_sk(sk)->vR; + u8 ns = LLC_I_GET_NS(pdu); + u16 rc = LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_I(pdu) && ns != vr && + llc_util_ns_inside_rx_window(ns, vr, llc_sk(sk)->rw) ? 0 : 1; + if (!rc) + dprintk("%s: matched, state=%d, ns=%d, vr=%d\n", + __FUNCTION__, llc_sk(sk)->state, ns, vr); + return rc; +} + +int llc_conn_ev_rx_rej_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rej_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 
0 : 1; +} + +int llc_conn_ev_rx_rej_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_REJ ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rnr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RNR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_cmd_pbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_CMD(pdu) == LLC_2_PDU_CMD_RR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_rsp_fbit_set_0(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_0(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; +} + +int llc_conn_ev_rx_rr_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + return llc_conn_space(sk, skb) && + LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_S(pdu) && + LLC_S_PF_IS_1(pdu) && + LLC_S_PDU_RSP(pdu) == LLC_2_PDU_RSP_RR ? 0 : 1; +} + +int llc_conn_ev_rx_sabme_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_CMD(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_2_PDU_CMD_SABME ? 0 : 1; +} + +int llc_conn_ev_rx_ua_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return LLC_PDU_IS_RSP(pdu) && LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_2_PDU_RSP_UA ? 
0 : 1; +} + +int llc_conn_ev_rx_xxx_cmd_pbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_CMD(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { + if (LLC_I_PF_IS_1(pdu)) + rc = 0; + } else if (LLC_PDU_TYPE_IS_U(pdu) && LLC_U_PF_IS_1(pdu)) + rc = 0; + } + return rc; +} + +int llc_conn_ev_rx_xxx_cmd_pbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (LLC_PDU_IS_CMD(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) + rc = 0; + else if (LLC_PDU_TYPE_IS_U(pdu)) + switch (LLC_U_PDU_CMD(pdu)) { + case LLC_2_PDU_CMD_SABME: + case LLC_2_PDU_CMD_DISC: + rc = 0; + break; + } + } + return rc; +} + +int llc_conn_ev_rx_xxx_rsp_fbit_set_1(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) { + if (LLC_I_PF_IS_1(pdu)) + rc = 0; + } else if (LLC_PDU_TYPE_IS_U(pdu)) + switch (LLC_U_PDU_RSP(pdu)) { + case LLC_2_PDU_RSP_UA: + case LLC_2_PDU_RSP_DM: + case LLC_2_PDU_RSP_FRMR: + if (LLC_U_PF_IS_1(pdu)) + rc = 0; + break; + } + } + return rc; +} + +int llc_conn_ev_rx_xxx_rsp_fbit_set_x(struct sock *sk, struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (LLC_PDU_IS_RSP(pdu)) { + if (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) + rc = 0; + else if (LLC_PDU_TYPE_IS_U(pdu)) + switch (LLC_U_PDU_RSP(pdu)) { + case LLC_2_PDU_RSP_UA: + case LLC_2_PDU_RSP_DM: + case LLC_2_PDU_RSP_FRMR: + rc = 0; + break; + } + } + + return rc; +} + +int llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr(struct sock *sk, + struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vs = llc_sk(sk)->vS; + u8 nr = LLC_I_GET_NR(pdu); + + if (LLC_PDU_IS_CMD(pdu) && + (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && + nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { + dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", + __FUNCTION__, llc_sk(sk)->state, vs, nr); + rc = 0; + } + return rc; +} + +int llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr(struct sock *sk, + struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + u8 vs = llc_sk(sk)->vS; + u8 nr = LLC_I_GET_NR(pdu); + + if (LLC_PDU_IS_RSP(pdu) && + (LLC_PDU_TYPE_IS_I(pdu) || LLC_PDU_TYPE_IS_S(pdu)) && + nr != vs && llc_util_nr_inside_tx_window(sk, nr)) { + rc = 0; + dprintk("%s: matched, state=%d, vs=%d, nr=%d\n", + __FUNCTION__, llc_sk(sk)->state, vs, nr); + } + return rc; +} + +int llc_conn_ev_rx_any_frame(struct sock *sk, struct sk_buff *skb) +{ + return 0; +} + +int llc_conn_ev_p_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_P_TMR; +} + +int llc_conn_ev_ack_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_ACK_TMR; +} + +int llc_conn_ev_rej_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_REJ_TMR; +} + +int llc_conn_ev_busy_tmr_exp(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + return ev->type != LLC_CONN_EV_TYPE_BUSY_TMR; +} + +int llc_conn_ev_init_p_f_cycle(struct sock *sk, struct sk_buff *skb) +{ + return 1; +} + +int 
llc_conn_ev_tx_buffer_full(struct sock *sk, struct sk_buff *skb)
+{
+	struct llc_conn_state_ev *ev = llc_conn_ev(skb);
+
+	return ev->type == LLC_CONN_EV_TYPE_SIMPLE &&
+	       ev->prim_type == LLC_CONN_EV_TX_BUFF_FULL ? 0 : 1;
+}
+
+/* Event qualifier functions
+ *
+ * These functions simply verify the value of a state flag associated with
+ * the connection and return 0 for success or a non-zero value for
+ * failure; they check that the event is of the type we expect.
+ */
+int llc_conn_ev_qlfy_data_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->data_flag != 1;
+}
+
+int llc_conn_ev_qlfy_data_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->data_flag;
+}
+
+int llc_conn_ev_qlfy_data_flag_eq_2(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->data_flag != 2;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->p_flag != 1;
+}
+
+/**
+ * llc_conn_ev_qlfy_last_frame_eq_1 - checks if frame is last in tx window
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * This function determines whether the frame being sent is the last frame
+ * of the transmit window: if it is, the function returns zero, otherwise
+ * it returns one. It is used so that the last frame of the transmit
+ * window is sent as an I-format command with the p-bit set to one.
+ * Returns 0 if the frame is the last frame, 1 otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_1(struct sock *sk, struct sk_buff *skb)
+{
+	return !(skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k);
+}
+
+/**
+ * llc_conn_ev_qlfy_last_frame_eq_0 - checks if frame isn't last in tx window
+ * @sk: current connection structure.
+ * @skb: current event.
+ *
+ * This function determines whether the frame being sent is not the last
+ * frame of the transmit window: if it isn't, the function returns zero,
+ * otherwise it returns one. Returns 0 if the frame isn't the last frame,
+ * 1 otherwise.
+ */
+int llc_conn_ev_qlfy_last_frame_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	return skb_queue_len(&llc_sk(sk)->pdu_unack_q) + 1 == llc_sk(sk)->k;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_0(struct sock *sk, struct sk_buff *skb)
+{
+	return llc_sk(sk)->p_flag;
+}
+
+int llc_conn_ev_qlfy_p_flag_eq_f(struct sock *sk, struct sk_buff *skb)
+{
+	u8 f_bit;
+
+	llc_pdu_decode_pf_bit(skb, &f_bit);
+	return llc_sk(sk)->p_flag == f_bit ?
0 : 1; +} + +int llc_conn_ev_qlfy_remote_busy_eq_0(struct sock *sk, struct sk_buff *skb) +{ + return llc_sk(sk)->remote_busy_flag; +} + +int llc_conn_ev_qlfy_remote_busy_eq_1(struct sock *sk, struct sk_buff *skb) +{ + return !llc_sk(sk)->remote_busy_flag; +} + +int llc_conn_ev_qlfy_retry_cnt_lt_n2(struct sock *sk, struct sk_buff *skb) +{ + return !(llc_sk(sk)->retry_count < llc_sk(sk)->n2); +} + +int llc_conn_ev_qlfy_retry_cnt_gte_n2(struct sock *sk, struct sk_buff *skb) +{ + return !(llc_sk(sk)->retry_count >= llc_sk(sk)->n2); +} + +int llc_conn_ev_qlfy_s_flag_eq_1(struct sock *sk, struct sk_buff *skb) +{ + return !llc_sk(sk)->s_flag; +} + +int llc_conn_ev_qlfy_s_flag_eq_0(struct sock *sk, struct sk_buff *skb) +{ + return llc_sk(sk)->s_flag; +} + +int llc_conn_ev_qlfy_cause_flag_eq_1(struct sock *sk, struct sk_buff *skb) +{ + return !llc_sk(sk)->cause_flag; +} + +int llc_conn_ev_qlfy_cause_flag_eq_0(struct sock *sk, struct sk_buff *skb) +{ + return llc_sk(sk)->cause_flag; +} + +int llc_conn_ev_qlfy_set_status_conn(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_CONN; + return 0; +} + +int llc_conn_ev_qlfy_set_status_disc(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_DISC; + return 0; +} + +int llc_conn_ev_qlfy_set_status_failed(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_FAILED; + return 0; +} + +int llc_conn_ev_qlfy_set_status_remote_busy(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_REMOTE_BUSY; + return 0; +} + +int llc_conn_ev_qlfy_set_status_refuse(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_REFUSE; + return 0; +} + +int llc_conn_ev_qlfy_set_status_conflict(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_CONFLICT; + return 0; +} + +int llc_conn_ev_qlfy_set_status_rst_done(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->status = LLC_STATUS_RESET_DONE; + return 0; +} diff --git a/net/llc/llc_c_st.c b/net/llc/llc_c_st.c new file mode 100644 index 000000000000..818a9428823b --- /dev/null +++ b/net/llc/llc_c_st.c @@ -0,0 +1,4946 @@ +/* + * llc_c_st.c - This module contains state transition of connection component. + * + * Description of event functions and actions there is in 802.2 LLC standard, + * or in "llc_c_ac.c" and "llc_c_ev.c" modules. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. 
+ */ +#include +#include +#include +#include +#include +#include + +#define NONE NULL + +/* COMMON CONNECTION STATE transitions + * Common transitions for + * LLC_CONN_STATE_NORMAL, + * LLC_CONN_STATE_BUSY, + * LLC_CONN_STATE_REJ, + * LLC_CONN_STATE_AWAIT, + * LLC_CONN_STATE_AWAIT_BUSY and + * LLC_CONN_STATE_AWAIT_REJ states + */ +/* State transitions for LLC_CONN_EV_DISC_REQ event */ +static llc_conn_action_t llc_common_actions_1[] = { + [0] = llc_conn_ac_send_disc_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_1 = { + .ev = llc_conn_ev_disc_req, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RESET_REQ event */ +static llc_conn_action_t llc_common_actions_2[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_2 = { + .ev = llc_conn_ev_rst_req, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_common_actions_3[] = { + [0] = llc_conn_ac_stop_all_timers, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_send_ua_rsp_f_set_p, + [4] = llc_conn_ac_rst_ind, + [5] = llc_conn_ac_set_p_flag_0, + [6] = llc_conn_ac_set_remote_busy_0, + [7] = llc_conn_reset, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_3 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_common_actions_4[] = { + [0] = llc_conn_ac_stop_all_timers, + [1] = llc_conn_ac_send_ua_rsp_f_set_p, + [2] = llc_conn_ac_disc_ind, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_4 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_common_actions_5[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_rst_ind, + [5] = llc_conn_ac_set_cause_flag_0, + [6] = llc_conn_reset, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_5 = { + .ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_common_actions_6[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_ac_stop_all_timers, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_6 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_6, +}; + +/* 
State transitions for LLC_CONN_EV_RX_ZZZ_CMD_Pbit_SET_X_INVAL_Nr event */ +static llc_conn_action_t llc_common_actions_7a[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_7a = { + .ev = llc_conn_ev_rx_zzz_cmd_pbit_set_x_inval_nr, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_7a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_X_INVAL_Ns event */ +static llc_conn_action_t llc_common_actions_7b[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_7b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_x_inval_ns, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_7b, +}; + +/* State transitions for LLC_CONN_EV_RX_ZZZ_RSP_Fbit_SET_X_INVAL_Nr event */ +static llc_conn_action_t llc_common_actions_8a[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_8a = { + .ev = llc_conn_ev_rx_zzz_rsp_fbit_set_x_inval_nr, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_INVAL_Ns event */ +static llc_conn_action_t llc_common_actions_8b[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_8b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x_inval_ns, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_BAD_PDU event */ +static llc_conn_action_t llc_common_actions_8c[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_8c = { + .ev = llc_conn_ev_rx_bad_pdu, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_8c, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_common_actions_9[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_9 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_common_actions_9, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_1 event */ +#if 0 +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_10[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_10[] = { + [0] = llc_conn_ac_send_frmr_rsp_f_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = NULL, +}; + 
+static struct llc_conn_state_trans llc_common_state_trans_10 = { + .ev = llc_conn_ev_rx_xxx_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = llc_common_ev_qfyrs_10, + .ev_actions = llc_common_actions_10, +}; +#endif + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11a[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11a[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11a = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11a, + .ev_actions = llc_common_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11b[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11b[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11b = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11b, + .ev_actions = llc_common_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11c[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11c[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11c = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11c, + .ev_actions = llc_common_actions_11c, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_common_ev_qfyrs_11d[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_common_actions_11d[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_stop_other_timers, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_common_state_trans_11d = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_common_ev_qfyrs_11d, + .ev_actions = llc_common_actions_11d, +}; + +/* + * Common dummy state transition; must be last entry for all state + * transition groups - it'll be on .bss, so will be zeroed. 
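+ * Being zero-filled, its .ev, .ev_qualifiers and .ev_actions pointers are
+ * all NULL.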
+ */ +static struct llc_conn_state_trans llc_common_state_trans_end; + +/* LLC_CONN_STATE_ADM transitions */ +/* State transitions for LLC_CONN_EV_CONN_REQ event */ +static llc_conn_action_t llc_adm_actions_1[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_set_retry_cnt_0, + [3] = llc_conn_ac_set_s_flag_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_1 = { + .ev = llc_conn_ev_conn_req, + .next_state = LLC_CONN_STATE_SETUP, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_adm_actions_2[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_p_flag_0, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_ac_conn_ind, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_2 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_adm_actions_3[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_3 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_adm_actions_4[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_1, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_4 = { + .ev = llc_conn_ev_rx_xxx_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_YYY event */ +static llc_conn_action_t llc_adm_actions_5[] = { + [0] = llc_conn_disc, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_adm_state_trans_5 = { + .ev = llc_conn_ev_rx_any_frame, + .next_state = LLC_CONN_OUT_OF_SVC, + .ev_qualifiers = NONE, + .ev_actions = llc_adm_actions_5, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_adm_state_transitions[] = { + [0] = &llc_adm_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* local_busy */ + [3] = &llc_common_state_trans_end, /* init_pf_cycle */ + [4] = &llc_common_state_trans_end, /* timer */ + [5] = &llc_adm_state_trans_2, /* Receive frame */ + [6] = &llc_adm_state_trans_3, + [7] = &llc_adm_state_trans_4, + [8] = &llc_adm_state_trans_5, + [9] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_SETUP transitions */ +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_setup_actions_1[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_set_s_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_SETUP, + .ev_qualifiers = NONE, + .ev_actions = llc_setup_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t 
llc_setup_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_set_status_conn, + [2] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_2[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_set_remote_busy_0, + [5] = llc_conn_ac_conn_confirm, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_2 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_setup_ev_qfyrs_2, + .ev_actions = llc_setup_actions_2, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_s_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_conn, + [2] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_3[] = { + [0] = llc_conn_ac_set_p_flag_0, + [1] = llc_conn_ac_set_remote_busy_0, + [2] = llc_conn_ac_conn_confirm, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_3 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_setup_ev_qfyrs_3, + .ev_actions = llc_setup_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_set_status_disc, + [1] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_4[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_ac_conn_confirm, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_4 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_setup_ev_qfyrs_4, + .ev_actions = llc_setup_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_set_status_disc, + [1] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_5[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_conn_confirm, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_5 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_setup_ev_qfyrs_5, + .ev_actions = llc_setup_actions_5, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_7[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_SETUP, + .ev_qualifiers = llc_setup_ev_qfyrs_7, + .ev_actions = llc_setup_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_setup_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_failed, + [3] = NULL, +}; + +static llc_conn_action_t llc_setup_actions_8[] = { + [0] = llc_conn_ac_conn_confirm, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_setup_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + 
.next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_setup_ev_qfyrs_8, + .ev_actions = llc_setup_actions_8, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_setup_state_transitions[] = { + [0] = &llc_common_state_trans_end, /* Request */ + [1] = &llc_common_state_trans_end, /* local busy */ + [2] = &llc_common_state_trans_end, /* init_pf_cycle */ + [3] = &llc_setup_state_trans_3, /* Timer */ + [4] = &llc_setup_state_trans_7, + [5] = &llc_setup_state_trans_8, + [6] = &llc_common_state_trans_end, + [7] = &llc_setup_state_trans_1, /* Receive frame */ + [8] = &llc_setup_state_trans_2, + [9] = &llc_setup_state_trans_4, + [10] = &llc_setup_state_trans_5, + [11] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_NORMAL transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = llc_conn_ev_qlfy_last_frame_eq_0, + [3] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_1[] = { + [0] = llc_conn_ac_send_i_as_ack, + [1] = llc_conn_ac_start_ack_tmr_if_not_running, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_1, + .ev_actions = llc_normal_actions_1, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = llc_conn_ev_qlfy_last_frame_eq_1, + [3] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_2[] = { + [0] = llc_conn_ac_send_i_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_2 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_2, + .ev_actions = llc_normal_actions_2, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_1, + [1] = llc_conn_ev_qlfy_set_status_remote_busy, + [2] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_normal_actions_2_1[1]; + +static struct llc_conn_state_trans llc_normal_state_trans_2_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_2_1, + .ev_actions = llc_normal_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_3[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rnr_xxx_x_set_0, + [2] = llc_conn_ac_set_data_flag_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_3 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_normal_ev_qfyrs_3, + .ev_actions = llc_normal_actions_3, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_4[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rnr_xxx_x_set_0, + [2] = 
llc_conn_ac_set_data_flag_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_4 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_normal_ev_qfyrs_4, + .ev_actions = llc_normal_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_5a[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_start_rej_timer, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_5a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_5a, + .ev_actions = llc_normal_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_5b[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_start_rej_timer, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_5b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_5b, + .ev_actions = llc_normal_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_5c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_5c[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_start_rej_timer, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_5c = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_5c, + .ev_actions = llc_normal_actions_5c, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_6a[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_6a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_6a, + .ev_actions = llc_normal_actions_6a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_6b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_6b[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_xxx_x_set_0, + [2] = llc_conn_ac_upd_nr_received, + [3] = 
llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_6b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_normal_ev_qfyrs_6b, + .ev_actions = llc_normal_actions_6b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_normal_actions_7[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rej_rsp_f_set_1, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_7 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_7, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_8[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [5] = llc_conn_ac_send_ack_if_needed, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_8a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_8a, + .ev_actions = llc_normal_actions_8, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_8b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_8b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_8b, + .ev_actions = llc_normal_actions_8, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_9a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_send_ack_if_needed, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_9a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_9a, + .ev_actions = llc_normal_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_9b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_9b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_send_ack_if_needed, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_9b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_9b, + .ev_actions = llc_normal_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_10[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_send_ack_rsp_f_set_1, + [2] = llc_conn_ac_rst_sendack_flag, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_data_ind, + [5] = NULL, +}; + +static struct llc_conn_state_trans 
llc_normal_state_trans_10 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_10, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_11a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_11a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_11b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_11b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_11c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_11c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_inc_tx_win_size, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_11c = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_11c, + .ev_actions = llc_normal_actions_11c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_12[] = { + [0] = llc_conn_ac_send_ack_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_adjust_npta_by_rr, + [3] = llc_conn_ac_rst_sendack_flag, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_12 = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_12, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_13a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_13a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_13a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_normal_actions_13b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_13b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_13b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_13c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_13c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = 
llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_13c = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_13c, + .ev_actions = llc_normal_actions_13c, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_14[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_adjust_npta_by_rnr, + [3] = llc_conn_ac_rst_sendack_flag, + [4] = llc_conn_ac_set_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_14 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_14, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_15a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_dec_tx_win_size, + [4] = llc_conn_ac_resend_i_xxx_x_set_0, + [5] = llc_conn_ac_clear_remote_busy, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_15a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_15a, + .ev_actions = llc_normal_actions_15a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_15b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_15b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_dec_tx_win_size, + [4] = llc_conn_ac_resend_i_xxx_x_set_0, + [5] = llc_conn_ac_clear_remote_busy, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_15b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_15b, + .ev_actions = llc_normal_actions_15b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_16a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_dec_tx_win_size, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_16a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_16a, + .ev_actions = llc_normal_actions_16a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_16b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_16b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_dec_tx_win_size, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_16b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + 
.next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_16b, + .ev_actions = llc_normal_actions_16b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_normal_actions_17[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_dec_tx_win_size, + [3] = llc_conn_ac_resend_i_rsp_f_set_1, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_17 = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_normal_actions_17, +}; + +/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_18[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_18[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_18 = { + .ev = llc_conn_ev_init_p_f_cycle, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_18, + .ev_actions = llc_normal_actions_18, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_19[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_19[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rr_cmd_p_set_1, + [2] = llc_conn_ac_rst_vs, + [3] = llc_conn_ac_start_p_timer, + [4] = llc_conn_ac_inc_retry_cnt_by_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_19 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_normal_ev_qfyrs_19, + .ev_actions = llc_normal_actions_19, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_20a[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rr_cmd_p_set_1, + [2] = llc_conn_ac_rst_vs, + [3] = llc_conn_ac_start_p_timer, + [4] = llc_conn_ac_inc_retry_cnt_by_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_20a = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_normal_ev_qfyrs_20a, + .ev_actions = llc_normal_actions_20a, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_20b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_20b[] = { + [0] = llc_conn_ac_rst_sendack_flag, + [1] = llc_conn_ac_send_rr_cmd_p_set_1, + [2] = llc_conn_ac_rst_vs, + [3] = llc_conn_ac_start_p_timer, + [4] = llc_conn_ac_inc_retry_cnt_by_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_20b = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_normal_ev_qfyrs_20b, + .ev_actions = llc_normal_actions_20b, +}; + +/* State transitions for LLC_CONN_EV_TX_BUFF_FULL event */ +static llc_conn_ev_qfyr_t llc_normal_ev_qfyrs_21[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_normal_actions_21[] = { + [0] = 
llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_normal_state_trans_21 = { + .ev = llc_conn_ev_tx_buffer_full, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_normal_ev_qfyrs_21, + .ev_actions = llc_normal_actions_21, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_normal_state_transitions[] = { + [0] = &llc_normal_state_trans_1, /* Requests */ + [1] = &llc_normal_state_trans_2, + [2] = &llc_normal_state_trans_2_1, + [3] = &llc_common_state_trans_1, + [4] = &llc_common_state_trans_2, + [5] = &llc_common_state_trans_end, + [6] = &llc_normal_state_trans_21, + [7] = &llc_normal_state_trans_3, /* Local busy */ + [8] = &llc_normal_state_trans_4, + [9] = &llc_common_state_trans_end, + [10] = &llc_normal_state_trans_18, /* Init pf cycle */ + [11] = &llc_common_state_trans_end, + [12] = &llc_common_state_trans_11a, /* Timers */ + [13] = &llc_common_state_trans_11b, + [14] = &llc_common_state_trans_11c, + [15] = &llc_common_state_trans_11d, + [16] = &llc_normal_state_trans_19, + [17] = &llc_normal_state_trans_20a, + [18] = &llc_normal_state_trans_20b, + [19] = &llc_common_state_trans_end, + [20] = &llc_normal_state_trans_8b, /* Receive frames */ + [21] = &llc_normal_state_trans_9b, + [22] = &llc_normal_state_trans_10, + [23] = &llc_normal_state_trans_11b, + [24] = &llc_normal_state_trans_11c, + [25] = &llc_normal_state_trans_5a, + [26] = &llc_normal_state_trans_5b, + [27] = &llc_normal_state_trans_5c, + [28] = &llc_normal_state_trans_6a, + [29] = &llc_normal_state_trans_6b, + [30] = &llc_normal_state_trans_7, + [31] = &llc_normal_state_trans_8a, + [32] = &llc_normal_state_trans_9a, + [33] = &llc_normal_state_trans_11a, + [34] = &llc_normal_state_trans_12, + [35] = &llc_normal_state_trans_13a, + [36] = &llc_normal_state_trans_13b, + [37] = &llc_normal_state_trans_13c, + [38] = &llc_normal_state_trans_14, + [39] = &llc_normal_state_trans_15a, + [40] = &llc_normal_state_trans_15b, + [41] = &llc_normal_state_trans_16a, + [42] = &llc_normal_state_trans_16b, + [43] = &llc_normal_state_trans_17, + [44] = &llc_common_state_trans_3, + [45] = &llc_common_state_trans_4, + [46] = &llc_common_state_trans_5, + [47] = &llc_common_state_trans_6, + [48] = &llc_common_state_trans_7a, + [49] = &llc_common_state_trans_7b, + [50] = &llc_common_state_trans_8a, + [51] = &llc_common_state_trans_8b, + [52] = &llc_common_state_trans_8c, + [53] = &llc_common_state_trans_9, + /* [54] = &llc_common_state_trans_10, */ + [54] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_BUSY transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_1[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = llc_conn_ac_start_ack_tmr_if_not_running, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_1, + .ev_actions = llc_busy_actions_1, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_2[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = 
llc_conn_ac_start_ack_tmr_if_not_running, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_2 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_2, + .ev_actions = llc_busy_actions_2, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_1, + [1] = llc_conn_ev_qlfy_set_status_remote_busy, + [2] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_busy_actions_2_1[1]; + +static struct llc_conn_state_trans llc_busy_state_trans_2_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_2_1, + .ev_actions = llc_busy_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_1, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_3[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_start_rej_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_3 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_3, + .ev_actions = llc_busy_actions_3, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_1, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_4[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_start_rej_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_4 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_4, + .ev_actions = llc_busy_actions_4, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_5[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_5 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_busy_ev_qfyrs_5, + .ev_actions = llc_busy_actions_5, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_6[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_6[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_6 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_busy_ev_qfyrs_6, + .ev_actions = llc_busy_actions_6, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_2, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_7[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_7 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = 
LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_7, + .ev_actions = llc_busy_actions_7, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_2, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_8[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_8 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_busy_ev_qfyrs_8, + .ev_actions = llc_busy_actions_8, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_9a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_9a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_9a, + .ev_actions = llc_busy_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_9b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_9b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_upd_nr_received, + [3] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [4] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_9b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_9b, + .ev_actions = llc_busy_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_10a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_10a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_10a, + .ev_actions = llc_busy_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_10b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_10b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_10b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_10b, + .ev_actions = llc_busy_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_busy_actions_11[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = 
llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_data_flag_1_if_data_flag_eq_0, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_11 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_11, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_12[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rnr_rsp_f_set_1, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_12 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_12, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_13a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [6] = llc_conn_ac_set_data_flag_0, + [7] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_13a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_13a, + .ev_actions = llc_busy_actions_13a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_13b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_13b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [6] = llc_conn_ac_set_data_flag_0, + [7] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_13b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_13b, + .ev_actions = llc_busy_actions_13b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_14a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_14a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_14a, + .ev_actions = llc_busy_actions_14a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_14b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_14b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = 
llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_tmr_if_data_flag_eq_2, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_14b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_14b, + .ev_actions = llc_busy_actions_14b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_15a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_15a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_15a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_15b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_15b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_15b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_15c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_15c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_15c = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_15c, + .ev_actions = llc_busy_actions_15c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_16[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_16 = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_16, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_17a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_17a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_17a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_busy_actions_17b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_17b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_17b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_17c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_17c[] = 
{ + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_17c = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_17c, + .ev_actions = llc_busy_actions_17c, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_18[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_18 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_18, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_19a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_19a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_19a, + .ev_actions = llc_busy_actions_19a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_19b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_19b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_19b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_19b, + .ev_actions = llc_busy_actions_19b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_20a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_20a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_20a, + .ev_actions = llc_busy_actions_20a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_20b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_20b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_20b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_20b, + .ev_actions = llc_busy_actions_20b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_busy_actions_21[] = { + 
[0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_send_rnr_rsp_f_set_1, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_21 = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_busy_actions_21, +}; + +/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_22[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_22[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_22 = { + .ev = llc_conn_ev_init_p_f_cycle, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_22, + .ev_actions = llc_busy_actions_22, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_23[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_23[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_rst_vs, + [2] = llc_conn_ac_start_p_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_23 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_23, + .ev_actions = llc_busy_actions_23, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_24a[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = llc_conn_ac_rst_vs, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_24a = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_24a, + .ev_actions = llc_busy_actions_24a, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_24b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_24b[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = llc_conn_ac_rst_vs, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_24b = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_24b, + .ev_actions = llc_busy_actions_24b, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_25[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_25[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = llc_conn_ac_rst_vs, + [4] = llc_conn_ac_set_data_flag_1, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_25 = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_25, + 
.ev_actions = llc_busy_actions_25, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_busy_ev_qfyrs_26[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_busy_actions_26[] = { + [0] = llc_conn_ac_set_data_flag_1, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_busy_state_trans_26 = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_busy_ev_qfyrs_26, + .ev_actions = llc_busy_actions_26, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_busy_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_busy_state_trans_1, + [3] = &llc_busy_state_trans_2, + [4] = &llc_busy_state_trans_2_1, + [5] = &llc_common_state_trans_end, + [6] = &llc_busy_state_trans_3, /* Local busy */ + [7] = &llc_busy_state_trans_4, + [8] = &llc_busy_state_trans_5, + [9] = &llc_busy_state_trans_6, + [10] = &llc_busy_state_trans_7, + [11] = &llc_busy_state_trans_8, + [12] = &llc_common_state_trans_end, + [13] = &llc_busy_state_trans_22, /* Initiate PF cycle */ + [14] = &llc_common_state_trans_end, + [15] = &llc_common_state_trans_11a, /* Timer */ + [16] = &llc_common_state_trans_11b, + [17] = &llc_common_state_trans_11c, + [18] = &llc_common_state_trans_11d, + [19] = &llc_busy_state_trans_23, + [20] = &llc_busy_state_trans_24a, + [21] = &llc_busy_state_trans_24b, + [22] = &llc_busy_state_trans_25, + [23] = &llc_busy_state_trans_26, + [24] = &llc_common_state_trans_end, + [25] = &llc_busy_state_trans_9a, /* Receive frame */ + [26] = &llc_busy_state_trans_9b, + [27] = &llc_busy_state_trans_10a, + [28] = &llc_busy_state_trans_10b, + [29] = &llc_busy_state_trans_11, + [30] = &llc_busy_state_trans_12, + [31] = &llc_busy_state_trans_13a, + [32] = &llc_busy_state_trans_13b, + [33] = &llc_busy_state_trans_14a, + [34] = &llc_busy_state_trans_14b, + [35] = &llc_busy_state_trans_15a, + [36] = &llc_busy_state_trans_15b, + [37] = &llc_busy_state_trans_15c, + [38] = &llc_busy_state_trans_16, + [39] = &llc_busy_state_trans_17a, + [40] = &llc_busy_state_trans_17b, + [41] = &llc_busy_state_trans_17c, + [42] = &llc_busy_state_trans_18, + [43] = &llc_busy_state_trans_19a, + [44] = &llc_busy_state_trans_19b, + [45] = &llc_busy_state_trans_20a, + [46] = &llc_busy_state_trans_20b, + [47] = &llc_busy_state_trans_21, + [48] = &llc_common_state_trans_3, + [49] = &llc_common_state_trans_4, + [50] = &llc_common_state_trans_5, + [51] = &llc_common_state_trans_6, + [52] = &llc_common_state_trans_7a, + [53] = &llc_common_state_trans_7b, + [54] = &llc_common_state_trans_8a, + [55] = &llc_common_state_trans_8b, + [56] = &llc_common_state_trans_8c, + [57] = &llc_common_state_trans_9, + /* [58] = &llc_common_state_trans_10, */ + [58] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_REJ transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_1[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_1, + .ev_actions = llc_reject_actions_1, +}; + +/* State 
transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_0, + [1] = llc_conn_ev_qlfy_p_flag_eq_1, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_2[] = { + [0] = llc_conn_ac_send_i_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_2 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_2, + .ev_actions = llc_reject_actions_2, +}; + +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_remote_busy_eq_1, + [1] = llc_conn_ev_qlfy_set_status_remote_busy, + [2] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_reject_actions_2_1[1]; + +static struct llc_conn_state_trans llc_reject_state_trans_2_1 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_2_1, + .ev_actions = llc_reject_actions_2_1, +}; + + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_3[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_2, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_3 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_reject_ev_qfyrs_3, + .ev_actions = llc_reject_actions_3, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_4[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_2, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_4 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = llc_reject_ev_qfyrs_4, + .ev_actions = llc_reject_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_reject_actions_5a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_5a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_reject_actions_5b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_5b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_5c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_5c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_p_flag, + [2] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + 
[3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_5c = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_5c, + .ev_actions = llc_reject_actions_5c, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_reject_actions_6[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_6 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_6, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_7a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_send_ack_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = llc_conn_ac_stop_rej_timer, + [7] = NULL, + +}; + +static struct llc_conn_state_trans llc_reject_state_trans_7a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_7a, + .ev_actions = llc_reject_actions_7a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_7b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_7b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_send_ack_xxx_x_set_0, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_clear_remote_busy_if_f_eq_1, + [6] = llc_conn_ac_stop_rej_timer, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_7b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_7b, + .ev_actions = llc_reject_actions_7b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_8a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_ack_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_8a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_8a, + .ev_actions = llc_reject_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_8b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_8b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_ack_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_8b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_reject_ev_qfyrs_8b, + .ev_actions = llc_reject_actions_8b, +}; + +/* State 
transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_9[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_ack_rsp_f_set_1, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_stop_rej_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_9 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_9, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_10a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_10a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_10b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_10b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_10c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_10c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_10c = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_10c, + .ev_actions = llc_reject_actions_10c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_11[] = { + [0] = llc_conn_ac_send_ack_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_11 = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_11, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_12a[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_12a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_12a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_reject_actions_12b[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_12b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_12b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static 
llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_12c[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_12c[] = { + [0] = llc_conn_ac_upd_p_flag, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_12c = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_12c, + .ev_actions = llc_reject_actions_12c, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_13[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_13 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_13, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_14a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_14a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_14a, + .ev_actions = llc_reject_actions_14a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_X event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_14b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_14b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_p_flag, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_14b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_14b, + .ev_actions = llc_reject_actions_14b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_15a[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_15a = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_15a, + .ev_actions = llc_reject_actions_15a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_15b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_15b[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_xxx_x_set_0, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_15b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_REJ, + 
.ev_qualifiers = llc_reject_ev_qfyrs_15b, + .ev_actions = llc_reject_actions_15b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_reject_actions_16[] = { + [0] = llc_conn_ac_set_vs_nr, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_resend_i_rsp_f_set_1, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_16 = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_reject_actions_16, +}; + +/* State transitions for LLC_CONN_EV_INIT_P_F_CYCLE event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_17[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_17[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_17 = { + .ev = llc_conn_ev_init_p_f_cycle, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_17, + .ev_actions = llc_reject_actions_17, +}; + +/* State transitions for LLC_CONN_EV_REJ_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_18[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_18[] = { + [0] = llc_conn_ac_send_rej_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_18 = { + .ev = llc_conn_ev_rej_tmr_exp, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_18, + .ev_actions = llc_reject_actions_18, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_19[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_19[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = llc_conn_ac_rst_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_19 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_19, + .ev_actions = llc_reject_actions_19, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20a[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_20a[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = llc_conn_ac_rst_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_20a = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_20a, + .ev_actions = llc_reject_actions_20a, +}; + +/* State transitions for LLC_CONN_EV_BUSY_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_reject_ev_qfyrs_20b[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_0, + [1] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [2] = NULL, +}; + +static llc_conn_action_t llc_reject_actions_20b[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = 
llc_conn_ac_start_rej_timer, + [3] = llc_conn_ac_inc_retry_cnt_by_1, + [4] = llc_conn_ac_rst_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_reject_state_trans_20b = { + .ev = llc_conn_ev_busy_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_reject_ev_qfyrs_20b, + .ev_actions = llc_reject_actions_20b, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_reject_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_common_state_trans_end, + [3] = &llc_reject_state_trans_1, + [4] = &llc_reject_state_trans_2, + [5] = &llc_reject_state_trans_2_1, + [6] = &llc_reject_state_trans_3, /* Local busy */ + [7] = &llc_reject_state_trans_4, + [8] = &llc_common_state_trans_end, + [9] = &llc_reject_state_trans_17, /* Initiate PF cycle */ + [10] = &llc_common_state_trans_end, + [11] = &llc_common_state_trans_11a, /* Timer */ + [12] = &llc_common_state_trans_11b, + [13] = &llc_common_state_trans_11c, + [14] = &llc_common_state_trans_11d, + [15] = &llc_reject_state_trans_18, + [16] = &llc_reject_state_trans_19, + [17] = &llc_reject_state_trans_20a, + [18] = &llc_reject_state_trans_20b, + [19] = &llc_common_state_trans_end, + [20] = &llc_common_state_trans_3, /* Receive frame */ + [21] = &llc_common_state_trans_4, + [22] = &llc_common_state_trans_5, + [23] = &llc_common_state_trans_6, + [24] = &llc_common_state_trans_7a, + [25] = &llc_common_state_trans_7b, + [26] = &llc_common_state_trans_8a, + [27] = &llc_common_state_trans_8b, + [28] = &llc_common_state_trans_8c, + [29] = &llc_common_state_trans_9, + /* [30] = &llc_common_state_trans_10, */ + [30] = &llc_reject_state_trans_5a, + [31] = &llc_reject_state_trans_5b, + [32] = &llc_reject_state_trans_5c, + [33] = &llc_reject_state_trans_6, + [34] = &llc_reject_state_trans_7a, + [35] = &llc_reject_state_trans_7b, + [36] = &llc_reject_state_trans_8a, + [37] = &llc_reject_state_trans_8b, + [38] = &llc_reject_state_trans_9, + [39] = &llc_reject_state_trans_10a, + [40] = &llc_reject_state_trans_10b, + [41] = &llc_reject_state_trans_10c, + [42] = &llc_reject_state_trans_11, + [43] = &llc_reject_state_trans_12a, + [44] = &llc_reject_state_trans_12b, + [45] = &llc_reject_state_trans_12c, + [46] = &llc_reject_state_trans_13, + [47] = &llc_reject_state_trans_14a, + [48] = &llc_reject_state_trans_14b, + [49] = &llc_reject_state_trans_15a, + [50] = &llc_reject_state_trans_15b, + [51] = &llc_reject_state_trans_16, + [52] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_AWAIT transitions */ +/* State transitions for LLC_CONN_EV_DATA_REQ event */ +static llc_conn_ev_qfyr_t llc_await_ev_qfyrs_1_0[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_await_actions_1_0[1]; + +static struct llc_conn_state_trans llc_await_state_trans_1_0 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_await_ev_qfyrs_1_0, + .ev_actions = llc_await_actions_1_0, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_action_t llc_await_actions_1[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_0, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_1 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_1, +}; + +/* 
State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_2[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_stop_p_timer, + [4] = llc_conn_ac_resend_i_xxx_x_set_0, + [5] = llc_conn_ac_start_rej_timer, + [6] = llc_conn_ac_clear_remote_busy, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_2 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_3a[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_3a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_3a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_3b[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_start_rej_timer, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_3b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_3b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_actions_4[] = { + [0] = llc_conn_ac_send_rej_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_start_rej_timer, + [4] = llc_conn_ac_start_p_timer, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_4 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_5[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr, + [6] = llc_conn_ac_clear_remote_busy, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_5 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_6a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_6a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_6a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_6b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = 
llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_6b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_6b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_7[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_rsp_f_set_1, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_7 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_7, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_8a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_8a = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_8b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_8b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_9a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_9b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_9c = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9c, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_9d[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + 
+static struct llc_conn_state_trans llc_await_state_trans_9d = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_9d, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_10a[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_10a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_10b[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_10b = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_11[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_11 = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_11, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_12a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_12a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_12a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_actions_12b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_12b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_12b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_actions_13[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_state_trans_13 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_actions_13, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_await_ev_qfyrs_14[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_actions_14[] = { + [0] = llc_conn_ac_send_rr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans 
llc_await_state_trans_14 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_await_ev_qfyrs_14, + .ev_actions = llc_await_actions_14, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_await_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_await_state_trans_1_0, + [3] = &llc_common_state_trans_end, + [4] = &llc_await_state_trans_1, /* Local busy */ + [5] = &llc_common_state_trans_end, + [6] = &llc_common_state_trans_end, /* Initiate PF Cycle */ + [7] = &llc_common_state_trans_11a, /* Timer */ + [8] = &llc_common_state_trans_11b, + [9] = &llc_common_state_trans_11c, + [10] = &llc_common_state_trans_11d, + [11] = &llc_await_state_trans_14, + [12] = &llc_common_state_trans_end, + [13] = &llc_common_state_trans_3, /* Receive frame */ + [14] = &llc_common_state_trans_4, + [15] = &llc_common_state_trans_5, + [16] = &llc_common_state_trans_6, + [17] = &llc_common_state_trans_7a, + [18] = &llc_common_state_trans_7b, + [19] = &llc_common_state_trans_8a, + [20] = &llc_common_state_trans_8b, + [21] = &llc_common_state_trans_8c, + [22] = &llc_common_state_trans_9, + /* [23] = &llc_common_state_trans_10, */ + [23] = &llc_await_state_trans_2, + [24] = &llc_await_state_trans_3a, + [25] = &llc_await_state_trans_3b, + [26] = &llc_await_state_trans_4, + [27] = &llc_await_state_trans_5, + [28] = &llc_await_state_trans_6a, + [29] = &llc_await_state_trans_6b, + [30] = &llc_await_state_trans_7, + [31] = &llc_await_state_trans_8a, + [32] = &llc_await_state_trans_8b, + [33] = &llc_await_state_trans_9a, + [34] = &llc_await_state_trans_9b, + [35] = &llc_await_state_trans_9c, + [36] = &llc_await_state_trans_9d, + [37] = &llc_await_state_trans_10a, + [38] = &llc_await_state_trans_10b, + [39] = &llc_await_state_trans_11, + [40] = &llc_await_state_trans_12a, + [41] = &llc_await_state_trans_12b, + [42] = &llc_await_state_trans_13, + [43] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_AWAIT_BUSY transitions */ +/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1_0[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_await_busy_actions_1_0[1]; + +static struct llc_conn_state_trans llc_await_busy_state_trans_1_0 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_await_busy_ev_qfyrs_1_0, + .ev_actions = llc_await_busy_actions_1_0, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_1, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_1[] = { + [0] = llc_conn_ac_send_rej_xxx_x_set_0, + [1] = llc_conn_ac_start_rej_timer, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_1 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_busy_ev_qfyrs_1, + .ev_actions = llc_await_busy_actions_1, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_0, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_2[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans 
llc_await_busy_state_trans_2 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = llc_await_busy_ev_qfyrs_2, + .ev_actions = llc_await_busy_actions_2, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_CLEARED event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_data_flag_eq_2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_3[] = { + [0] = llc_conn_ac_send_rr_xxx_x_set_0, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_3 = { + .ev = llc_conn_ev_local_busy_cleared, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_busy_ev_qfyrs_3, + .ev_actions = llc_await_busy_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_4[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_stop_p_timer, + [4] = llc_conn_ac_set_data_flag_1, + [5] = llc_conn_ac_clear_remote_busy, + [6] = llc_conn_ac_resend_i_xxx_x_set_0, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_4 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_5a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_data_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_5a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_5b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_data_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_5b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_busy_actions_6[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_data_flag_1, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_6 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_6, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_7[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_stop_p_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = llc_conn_ac_set_data_flag_0, + [7] = llc_conn_ac_clear_remote_busy, + [8] = llc_conn_ac_resend_i_xxx_x_set_0, + [9] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_7 = 
{ + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_7, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_8a[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_8a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_8b[] = { + [0] = llc_conn_ac_opt_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_8b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_9[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_inc_vr_by_1, + [2] = llc_conn_ac_data_ind, + [3] = llc_conn_ac_upd_nr_received, + [4] = llc_conn_ac_upd_vs, + [5] = llc_conn_ac_set_data_flag_0, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_9 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_9, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_10a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_10a = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_10a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_10b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_10b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_10b, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_11a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 event */ 
+static llc_conn_action_t llc_await_busy_actions_11b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_11c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11c = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11c, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_11d[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_11d = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_11d, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_12a[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_12a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_12a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_12b[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_12b = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_12b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_13[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_13 = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_13, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_14a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_14a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_14a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_busy_actions_14b[] = { + [0] 
= llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_14b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_14b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_busy_actions_15[] = { + [0] = llc_conn_ac_send_rnr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_15 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_busy_actions_15, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_await_busy_ev_qfyrs_16[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_busy_actions_16[] = { + [0] = llc_conn_ac_send_rnr_cmd_p_set_1, + [1] = llc_conn_ac_start_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_busy_state_trans_16 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = llc_await_busy_ev_qfyrs_16, + .ev_actions = llc_await_busy_actions_16, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_await_busy_state_transitions[] = { + [0] = &llc_common_state_trans_1, /* Request */ + [1] = &llc_common_state_trans_2, + [2] = &llc_await_busy_state_trans_1_0, + [3] = &llc_common_state_trans_end, + [4] = &llc_await_busy_state_trans_1, /* Local busy */ + [5] = &llc_await_busy_state_trans_2, + [6] = &llc_await_busy_state_trans_3, + [7] = &llc_common_state_trans_end, + [8] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [9] = &llc_common_state_trans_11a, /* Timer */ + [10] = &llc_common_state_trans_11b, + [11] = &llc_common_state_trans_11c, + [12] = &llc_common_state_trans_11d, + [13] = &llc_await_busy_state_trans_16, + [14] = &llc_common_state_trans_end, + [15] = &llc_await_busy_state_trans_4, /* Receive frame */ + [16] = &llc_await_busy_state_trans_5a, + [17] = &llc_await_busy_state_trans_5b, + [18] = &llc_await_busy_state_trans_6, + [19] = &llc_await_busy_state_trans_7, + [20] = &llc_await_busy_state_trans_8a, + [21] = &llc_await_busy_state_trans_8b, + [22] = &llc_await_busy_state_trans_9, + [23] = &llc_await_busy_state_trans_10a, + [24] = &llc_await_busy_state_trans_10b, + [25] = &llc_await_busy_state_trans_11a, + [26] = &llc_await_busy_state_trans_11b, + [27] = &llc_await_busy_state_trans_11c, + [28] = &llc_await_busy_state_trans_11d, + [29] = &llc_await_busy_state_trans_12a, + [30] = &llc_await_busy_state_trans_12b, + [31] = &llc_await_busy_state_trans_13, + [32] = &llc_await_busy_state_trans_14a, + [33] = &llc_await_busy_state_trans_14b, + [34] = &llc_await_busy_state_trans_15, + [35] = &llc_common_state_trans_3, + [36] = &llc_common_state_trans_4, + [37] = &llc_common_state_trans_5, + [38] = &llc_common_state_trans_6, + [39] = &llc_common_state_trans_7a, + [40] = &llc_common_state_trans_7b, + [41] = &llc_common_state_trans_8a, + [42] = &llc_common_state_trans_8b, + [43] = &llc_common_state_trans_8c, + [44] = &llc_common_state_trans_9, + /* [45] = &llc_common_state_trans_10, */ + [45] = &llc_common_state_trans_end, +}; + 
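/*
 * How a connection state machine engine is expected to use one of these
 * per-state tables: each array is split into groups, one per event class
 * (request, local busy, initiate P/F cycle, timer, receive frame), and every
 * group ends with &llc_common_state_trans_end.  The engine jumps to the group
 * for the incoming event class, picks the first entry whose .ev match
 * function and all of whose .ev_qualifiers return 0 (NONE means no
 * qualifiers), runs that entry's .ev_actions until the NULL terminator, and
 * finally moves the connection to .next_state.
 *
 * Below is a minimal illustrative sketch of such a scan, assuming the usual
 * (sk, skb) signatures of the llc_conn_ev_t, llc_conn_ev_qfyr_t and
 * llc_conn_action_t typedefs from the llc headers; the function names here
 * are hypothetical and are not the entry points used by llc_conn.c.
 */
static struct llc_conn_state_trans *
llc_sketch_find_trans(struct sock *sk, struct sk_buff *skb,
		      struct llc_conn_state_trans **grp)
{
	int i;

	for (i = 0; grp[i] != &llc_common_state_trans_end; i++) {
		llc_conn_ev_qfyr_t *qfyr = grp[i]->ev_qualifiers;

		if (grp[i]->ev(sk, skb))
			continue;	/* event match function said "no" */
		while (qfyr && *qfyr && !(*qfyr)(sk, skb))
			qfyr++;		/* each qualifier returns 0 to pass */
		if (!qfyr || !*qfyr)
			return grp[i];	/* NONE, or all qualifiers passed */
	}
	return NULL;			/* no transition for this event */
}

static int llc_sketch_run_trans(struct sock *sk, struct sk_buff *skb,
				struct llc_conn_state_trans *trans)
{
	llc_conn_action_t *action;
	int rc = 0;

	/* run the action list in order, stopping on the first nonzero return */
	for (action = trans->ev_actions; action && *action && !rc; action++)
		rc = (*action)(sk, skb);
	/* a caller would now set the connection state to trans->next_state */
	return rc;
}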
+/* ----------------- LLC_CONN_STATE_AWAIT_REJ transitions --------------- */ +/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_await_reject_ev_qfyrs_1_0[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_await_reject_actions_1_0[1]; + +static struct llc_conn_state_trans llc_await_reject_state_trans_1_0 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_reject_ev_qfyrs_1_0, + .ev_actions = llc_await_reject_actions_1_0, +}; + +/* State transitions for LLC_CONN_EV_LOCAL_BUSY_DETECTED event */ +static llc_conn_action_t llc_await_rejct_actions_1[] = { + [0] = llc_conn_ac_send_rnr_xxx_x_set_0, + [1] = llc_conn_ac_set_data_flag_2, + [2] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_1 = { + .ev = llc_conn_ev_local_busy_detected, + .next_state = LLC_CONN_STATE_AWAIT_BUSY, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_2a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_2a = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_2a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_2b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_2b = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_2b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_3[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = NULL +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_3 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_4[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = llc_conn_ac_resend_i_xxx_x_set_0_or_send_rr, + [7] = llc_conn_ac_clear_remote_busy, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_4 = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_5a[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = NULL, +}; + +static struct llc_conn_state_trans 
llc_await_rejct_state_trans_5a = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_5a, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_5b[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_xxx_x_set_0, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_5b = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_5b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_6[] = { + [0] = llc_conn_ac_inc_vr_by_1, + [1] = llc_conn_ac_data_ind, + [2] = llc_conn_ac_send_rr_rsp_f_set_1, + [3] = llc_conn_ac_stop_rej_timer, + [4] = llc_conn_ac_upd_nr_received, + [5] = llc_conn_ac_upd_vs, + [6] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_6 = { + .ev = llc_conn_ev_rx_i_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_6, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_7a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_7a = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_7a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_7b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_7b = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_7b, +}; + +/* State transitions for LLC_CONN_EV_RX_I_RSP_Fbit_SET_1_UNEXPD_Ns event */ +static llc_conn_action_t llc_await_rejct_actions_7c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_resend_i_xxx_x_set_0, + [4] = llc_conn_ac_clear_remote_busy, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_7c = { + .ev = llc_conn_ev_rx_i_rsp_fbit_set_1_unexpd_ns, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_7c, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_8a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8a, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_RSP_Fbit_SET_0 
event */ +static llc_conn_action_t llc_await_rejct_actions_8b[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8b = { + .ev = llc_conn_ev_rx_rr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8b, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_8c[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8c = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8c, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_8d[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_clear_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_8d = { + .ev = llc_conn_ev_rx_rej_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_8d, +}; + +/* State transitions for LLC_CONN_EV_RX_RR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_9a[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_9a = { + .ev = llc_conn_ev_rx_rr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_9a, +}; + +/* State transitions for LLC_CONN_EV_RX_REJ_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_9b[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_clear_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_9b = { + .ev = llc_conn_ev_rx_rej_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_9b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_10[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_stop_p_timer, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_10 = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_1, + .next_state = LLC_CONN_STATE_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_10, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_11a[] = { + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_11a = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_11a, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_RSP_Fbit_SET_0 event */ +static llc_conn_action_t llc_await_rejct_actions_11b[] = 
{ + [0] = llc_conn_ac_upd_nr_received, + [1] = llc_conn_ac_upd_vs, + [2] = llc_conn_ac_set_remote_busy, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_11b = { + .ev = llc_conn_ev_rx_rnr_rsp_fbit_set_0, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_11b, +}; + +/* State transitions for LLC_CONN_EV_RX_RNR_CMD_Pbit_SET_1 event */ +static llc_conn_action_t llc_await_rejct_actions_12[] = { + [0] = llc_conn_ac_send_rr_rsp_f_set_1, + [1] = llc_conn_ac_upd_nr_received, + [2] = llc_conn_ac_upd_vs, + [3] = llc_conn_ac_set_remote_busy, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_12 = { + .ev = llc_conn_ev_rx_rnr_cmd_pbit_set_1, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = NONE, + .ev_actions = llc_await_rejct_actions_12, +}; + +/* State transitions for LLC_CONN_EV_P_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_await_rejct_ev_qfyrs_13[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_await_rejct_actions_13[] = { + [0] = llc_conn_ac_send_rej_cmd_p_set_1, + [1] = llc_conn_ac_stop_p_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_await_rejct_state_trans_13 = { + .ev = llc_conn_ev_p_tmr_exp, + .next_state = LLC_CONN_STATE_AWAIT_REJ, + .ev_qualifiers = llc_await_rejct_ev_qfyrs_13, + .ev_actions = llc_await_rejct_actions_13, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_await_rejct_state_transitions[] = { + [0] = &llc_await_reject_state_trans_1_0, + [1] = &llc_common_state_trans_1, /* requests */ + [2] = &llc_common_state_trans_2, + [3] = &llc_common_state_trans_end, + [4] = &llc_await_rejct_state_trans_1, /* local busy */ + [5] = &llc_common_state_trans_end, + [6] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [7] = &llc_await_rejct_state_trans_13, /* timers */ + [8] = &llc_common_state_trans_11a, + [9] = &llc_common_state_trans_11b, + [10] = &llc_common_state_trans_11c, + [11] = &llc_common_state_trans_11d, + [12] = &llc_common_state_trans_end, + [13] = &llc_await_rejct_state_trans_2a, /* receive frames */ + [14] = &llc_await_rejct_state_trans_2b, + [15] = &llc_await_rejct_state_trans_3, + [16] = &llc_await_rejct_state_trans_4, + [17] = &llc_await_rejct_state_trans_5a, + [18] = &llc_await_rejct_state_trans_5b, + [19] = &llc_await_rejct_state_trans_6, + [20] = &llc_await_rejct_state_trans_7a, + [21] = &llc_await_rejct_state_trans_7b, + [22] = &llc_await_rejct_state_trans_7c, + [23] = &llc_await_rejct_state_trans_8a, + [24] = &llc_await_rejct_state_trans_8b, + [25] = &llc_await_rejct_state_trans_8c, + [26] = &llc_await_rejct_state_trans_8d, + [27] = &llc_await_rejct_state_trans_9a, + [28] = &llc_await_rejct_state_trans_9b, + [29] = &llc_await_rejct_state_trans_10, + [30] = &llc_await_rejct_state_trans_11a, + [31] = &llc_await_rejct_state_trans_11b, + [32] = &llc_await_rejct_state_trans_12, + [33] = &llc_common_state_trans_3, + [34] = &llc_common_state_trans_4, + [35] = &llc_common_state_trans_5, + [36] = &llc_common_state_trans_6, + [37] = &llc_common_state_trans_7a, + [38] = &llc_common_state_trans_7b, + [39] = &llc_common_state_trans_8a, + [40] = &llc_common_state_trans_8b, + [41] = &llc_common_state_trans_8c, + [42] = &llc_common_state_trans_9, + /* [43] = &llc_common_state_trans_10, */ + [43] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_D_CONN transitions */ +/* 
State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event, + * cause_flag = 1 */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_conflict, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_1[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_ac_disc_confirm, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_1, + .ev_actions = llc_d_conn_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_1_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_conflict, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_1_1[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_1_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_1_1, + .ev_actions = llc_d_conn_actions_1_1, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_1, + [2] = llc_conn_ev_qlfy_set_status_disc, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_2[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_disc_confirm, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_2 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_2, + .ev_actions = llc_d_conn_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_disc, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_2_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_2_1 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_2_1, + .ev_actions = llc_d_conn_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_d_conn_actions_3[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_3 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = NONE, + .ev_actions = llc_d_conn_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_4[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_disc_confirm, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans 
llc_d_conn_state_trans_4 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_4, + .ev_actions = llc_d_conn_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_4_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_4_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_4_1 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_4_1, + .ev_actions = llc_d_conn_actions_4_1, +}; + +/* + * State transition for + * LLC_CONN_EV_DATA_CONN_REQ event + */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_d_conn_actions_5[1]; + +static struct llc_conn_state_trans llc_d_conn_state_trans_5 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = llc_d_conn_ev_qfyrs_5, + .ev_actions = llc_d_conn_actions_5, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_6[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_6[] = { + [0] = llc_conn_ac_send_disc_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_6 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_D_CONN, + .ev_qualifiers = llc_d_conn_ev_qfyrs_6, + .ev_actions = llc_d_conn_actions_6, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event, cause_flag = 1 */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_cause_flag_eq_1, + [2] = llc_conn_ev_qlfy_set_status_failed, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_7[] = { + [0] = llc_conn_ac_disc_confirm, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_7, + .ev_actions = llc_d_conn_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event, cause_flag = 0 */ +static llc_conn_ev_qfyr_t llc_d_conn_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_cause_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_failed, + [3] = NULL, +}; + +static llc_conn_action_t llc_d_conn_actions_8[] = { + [0] = llc_conn_disc, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_d_conn_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_d_conn_ev_qfyrs_8, + .ev_actions = llc_d_conn_actions_8, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_d_conn_state_transitions[] = { + [0] = &llc_d_conn_state_trans_5, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* Local busy */ + [3] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [4] = &llc_d_conn_state_trans_6, /* Timer */ + [5] = &llc_d_conn_state_trans_7, + 
[6] = &llc_d_conn_state_trans_8, + [7] = &llc_common_state_trans_end, + [8] = &llc_d_conn_state_trans_1, /* Receive frame */ + [9] = &llc_d_conn_state_trans_1_1, + [10] = &llc_d_conn_state_trans_2, + [11] = &llc_d_conn_state_trans_2_1, + [12] = &llc_d_conn_state_trans_3, + [13] = &llc_d_conn_state_trans_4, + [14] = &llc_d_conn_state_trans_4_1, + [15] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_RESET transitions */ +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_rst_actions_1[] = { + [0] = llc_conn_ac_set_vs_0, + [1] = llc_conn_ac_set_vr_0, + [2] = llc_conn_ac_set_s_flag_1, + [3] = llc_conn_ac_send_ua_rsp_f_set_p, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = NONE, + .ev_actions = llc_rst_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_1, + [2] = llc_conn_ev_qlfy_set_status_conn, + [3] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_2[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_rst_confirm, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_reset, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_2 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_rst_ev_qfyrs_2, + .ev_actions = llc_rst_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_UA_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_2_1[] = { + [0] = llc_conn_ev_qlfy_p_flag_eq_f, + [1] = llc_conn_ev_qlfy_cause_flag_eq_0, + [2] = llc_conn_ev_qlfy_set_status_rst_done, + [3] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_2_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_ac_set_vs_0, + [2] = llc_conn_ac_set_vr_0, + [3] = llc_conn_ac_upd_p_flag, + [4] = llc_conn_ac_rst_confirm, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_reset, + [7] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_2_1 = { + .ev = llc_conn_ev_rx_ua_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_rst_ev_qfyrs_2_1, + .ev_actions = llc_rst_actions_2_1, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_3[] = { + [0] = llc_conn_ev_qlfy_s_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_rst_done, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_3[] = { + [0] = llc_conn_ac_set_p_flag_0, + [1] = llc_conn_ac_set_remote_busy_0, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_3 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = llc_rst_ev_qfyrs_3, + .ev_actions = llc_rst_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; +static llc_conn_action_t llc_rst_actions_4[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_disc_ind, + [2] = llc_conn_ac_stop_ack_timer, + [3] = llc_conn_disc, + [4] = NULL, +}; + 
+static struct llc_conn_state_trans llc_rst_state_trans_4 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_4, + .ev_actions = llc_rst_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_4_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_refuse, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_4_1[] = { + [0] = llc_conn_ac_send_dm_rsp_f_set_p, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_4_1 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_4_1, + .ev_actions = llc_rst_actions_4_1, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 1 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_1, + [1] = llc_conn_ev_qlfy_set_status_disc, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_5[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_5 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_5, + .ev_actions = llc_rst_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event, + * cause_flag = 0 + */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_5_1[] = { + [0] = llc_conn_ev_qlfy_cause_flag_eq_0, + [1] = llc_conn_ev_qlfy_set_status_refuse, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_5_1[] = { + [0] = llc_conn_ac_stop_ack_timer, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_5_1 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_5_1, + .ev_actions = llc_rst_actions_5_1, +}; + +/* State transitions for DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_6[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_rst_actions_6[1]; + +static struct llc_conn_state_trans llc_rst_state_trans_6 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_rst_ev_qfyrs_6, + .ev_actions = llc_rst_actions_6, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = NULL, +}; + +static llc_conn_action_t llc_rst_actions_7[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_rst_ev_qfyrs_7, + .ev_actions = llc_rst_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = llc_conn_ev_qlfy_cause_flag_eq_1, + [3] = llc_conn_ev_qlfy_set_status_failed, + [4] = NULL, +}; +static llc_conn_action_t llc_rst_actions_8[] = { + [0] = 
llc_conn_ac_disc_ind, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_8, + .ev_actions = llc_rst_actions_8, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_rst_ev_qfyrs_8_1[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = llc_conn_ev_qlfy_s_flag_eq_0, + [2] = llc_conn_ev_qlfy_cause_flag_eq_0, + [3] = llc_conn_ev_qlfy_set_status_failed, + [4] = NULL, +}; +static llc_conn_action_t llc_rst_actions_8_1[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_disc, + [2] = NULL, +}; + +static struct llc_conn_state_trans llc_rst_state_trans_8_1 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = llc_rst_ev_qfyrs_8_1, + .ev_actions = llc_rst_actions_8_1, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_rst_state_transitions[] = { + [0] = &llc_rst_state_trans_6, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* Local busy */ + [3] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [4] = &llc_rst_state_trans_3, /* Timer */ + [5] = &llc_rst_state_trans_7, + [6] = &llc_rst_state_trans_8, + [7] = &llc_rst_state_trans_8_1, + [8] = &llc_common_state_trans_end, + [9] = &llc_rst_state_trans_1, /* Receive frame */ + [10] = &llc_rst_state_trans_2, + [11] = &llc_rst_state_trans_2_1, + [12] = &llc_rst_state_trans_4, + [13] = &llc_rst_state_trans_4_1, + [14] = &llc_rst_state_trans_5, + [15] = &llc_rst_state_trans_5_1, + [16] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_ERROR transitions */ +/* State transitions for LLC_CONN_EV_RX_SABME_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_error_actions_1[] = { + [0] = llc_conn_ac_set_vs_0, + [1] = llc_conn_ac_set_vr_0, + [2] = llc_conn_ac_send_ua_rsp_f_set_p, + [3] = llc_conn_ac_rst_ind, + [4] = llc_conn_ac_set_p_flag_0, + [5] = llc_conn_ac_set_remote_busy_0, + [6] = llc_conn_ac_stop_ack_timer, + [7] = llc_conn_reset, + [8] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_1 = { + .ev = llc_conn_ev_rx_sabme_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_NORMAL, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_1, +}; + +/* State transitions for LLC_CONN_EV_RX_DISC_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_error_actions_2[] = { + [0] = llc_conn_ac_send_ua_rsp_f_set_p, + [1] = llc_conn_ac_disc_ind, + [2] = llc_conn_ac_stop_ack_timer, + [3] = llc_conn_disc, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_2 = { + .ev = llc_conn_ev_rx_disc_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_2, +}; + +/* State transitions for LLC_CONN_EV_RX_DM_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_error_actions_3[] = { + [0] = llc_conn_ac_disc_ind, + [1] = llc_conn_ac_stop_ack_timer, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_3 = { + .ev = llc_conn_ev_rx_dm_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_3, +}; + +/* State transitions for LLC_CONN_EV_RX_FRMR_RSP_Fbit_SET_X event */ +static llc_conn_action_t llc_error_actions_4[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_set_retry_cnt_0, + [3] = 
llc_conn_ac_set_cause_flag_0, + [4] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_4 = { + .ev = llc_conn_ev_rx_frmr_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_4, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_CMD_Pbit_SET_X event */ +static llc_conn_action_t llc_error_actions_5[] = { + [0] = llc_conn_ac_resend_frmr_rsp_f_set_p, + [1] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_5 = { + .ev = llc_conn_ev_rx_xxx_cmd_pbit_set_x, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = llc_error_actions_5, +}; + +/* State transitions for LLC_CONN_EV_RX_XXX_RSP_Fbit_SET_X event */ +static struct llc_conn_state_trans llc_error_state_trans_6 = { + .ev = llc_conn_ev_rx_xxx_rsp_fbit_set_x, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = NONE, + .ev_actions = NONE, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_7[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_lt_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_error_actions_7[] = { + [0] = llc_conn_ac_resend_frmr_rsp_f_set_0, + [1] = llc_conn_ac_start_ack_timer, + [2] = llc_conn_ac_inc_retry_cnt_by_1, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_7 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = llc_error_ev_qfyrs_7, + .ev_actions = llc_error_actions_7, +}; + +/* State transitions for LLC_CONN_EV_ACK_TMR_EXP event */ +static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_8[] = { + [0] = llc_conn_ev_qlfy_retry_cnt_gte_n2, + [1] = NULL, +}; + +static llc_conn_action_t llc_error_actions_8[] = { + [0] = llc_conn_ac_send_sabme_cmd_p_set_x, + [1] = llc_conn_ac_set_s_flag_0, + [2] = llc_conn_ac_start_ack_timer, + [3] = llc_conn_ac_set_retry_cnt_0, + [4] = llc_conn_ac_set_cause_flag_0, + [5] = NULL, +}; + +static struct llc_conn_state_trans llc_error_state_trans_8 = { + .ev = llc_conn_ev_ack_tmr_exp, + .next_state = LLC_CONN_STATE_RESET, + .ev_qualifiers = llc_error_ev_qfyrs_8, + .ev_actions = llc_error_actions_8, +}; + +/* State transitions for LLC_CONN_EV_DATA_CONN_REQ event */ +static llc_conn_ev_qfyr_t llc_error_ev_qfyrs_9[] = { + [0] = llc_conn_ev_qlfy_set_status_refuse, + [1] = NULL, +}; + +/* just one member, NULL, .bss zeroes it */ +static llc_conn_action_t llc_error_actions_9[1]; + +static struct llc_conn_state_trans llc_error_state_trans_9 = { + .ev = llc_conn_ev_data_req, + .next_state = LLC_CONN_STATE_ERROR, + .ev_qualifiers = llc_error_ev_qfyrs_9, + .ev_actions = llc_error_actions_9, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_error_state_transitions[] = { + [0] = &llc_error_state_trans_9, /* Request */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* Local busy */ + [3] = &llc_common_state_trans_end, /* Initiate PF cycle */ + [4] = &llc_error_state_trans_7, /* Timer */ + [5] = &llc_error_state_trans_8, + [6] = &llc_common_state_trans_end, + [7] = &llc_error_state_trans_1, /* Receive frame */ + [8] = &llc_error_state_trans_2, + [9] = &llc_error_state_trans_3, + [10] = &llc_error_state_trans_4, + [11] = &llc_error_state_trans_5, + [12] = &llc_error_state_trans_6, + [13] = &llc_common_state_trans_end, +}; + +/* LLC_CONN_STATE_TEMP transitions */ +/* State transitions for LLC_CONN_EV_DISC_REQ event */ +static llc_conn_action_t llc_temp_actions_1[] = { + [0] = 
llc_conn_ac_stop_all_timers, + [1] = llc_conn_ac_send_disc_cmd_p_set_x, + [2] = llc_conn_disc, + [3] = NULL, +}; + +static struct llc_conn_state_trans llc_temp_state_trans_1 = { + .ev = llc_conn_ev_disc_req, + .next_state = LLC_CONN_STATE_ADM, + .ev_qualifiers = NONE, + .ev_actions = llc_temp_actions_1, +}; + +/* + * Array of pointers; + * one to each transition + */ +static struct llc_conn_state_trans *llc_temp_state_transitions[] = { + [0] = &llc_temp_state_trans_1, /* requests */ + [1] = &llc_common_state_trans_end, + [2] = &llc_common_state_trans_end, /* local busy */ + [3] = &llc_common_state_trans_end, /* init_pf_cycle */ + [4] = &llc_common_state_trans_end, /* timer */ + [5] = &llc_common_state_trans_end, /* receive */ +}; + +/* Connection State Transition Table */ +struct llc_conn_state llc_conn_state_table[NBR_CONN_STATES] = { + [LLC_CONN_STATE_ADM - 1] = { + .current_state = LLC_CONN_STATE_ADM, + .transitions = llc_adm_state_transitions, + }, + [LLC_CONN_STATE_SETUP - 1] = { + .current_state = LLC_CONN_STATE_SETUP, + .transitions = llc_setup_state_transitions, + }, + [LLC_CONN_STATE_NORMAL - 1] = { + .current_state = LLC_CONN_STATE_NORMAL, + .transitions = llc_normal_state_transitions, + }, + [LLC_CONN_STATE_BUSY - 1] = { + .current_state = LLC_CONN_STATE_BUSY, + .transitions = llc_busy_state_transitions, + }, + [LLC_CONN_STATE_REJ - 1] = { + .current_state = LLC_CONN_STATE_REJ, + .transitions = llc_reject_state_transitions, + }, + [LLC_CONN_STATE_AWAIT - 1] = { + .current_state = LLC_CONN_STATE_AWAIT, + .transitions = llc_await_state_transitions, + }, + [LLC_CONN_STATE_AWAIT_BUSY - 1] = { + .current_state = LLC_CONN_STATE_AWAIT_BUSY, + .transitions = llc_await_busy_state_transitions, + }, + [LLC_CONN_STATE_AWAIT_REJ - 1] = { + .current_state = LLC_CONN_STATE_AWAIT_REJ, + .transitions = llc_await_rejct_state_transitions, + }, + [LLC_CONN_STATE_D_CONN - 1] = { + .current_state = LLC_CONN_STATE_D_CONN, + .transitions = llc_d_conn_state_transitions, + }, + [LLC_CONN_STATE_RESET - 1] = { + .current_state = LLC_CONN_STATE_RESET, + .transitions = llc_rst_state_transitions, + }, + [LLC_CONN_STATE_ERROR - 1] = { + .current_state = LLC_CONN_STATE_ERROR, + .transitions = llc_error_state_transitions, + }, + [LLC_CONN_STATE_TEMP - 1] = { + .current_state = LLC_CONN_STATE_TEMP, + .transitions = llc_temp_state_transitions, + }, +}; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c new file mode 100644 index 000000000000..eba812a9c69c --- /dev/null +++ b/net/llc/llc_conn.c @@ -0,0 +1,915 @@ +/* + * llc_conn.c - Driver routines for connection component. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 +#define dprintk(args...) printk(KERN_DEBUG args) +#else +#define dprintk(args...) 
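+/*
+ * dprintk() is compiled away unless the "#if 0" above is flipped to
+ * "#if 1", in which case it becomes a plain printk(KERN_DEBUG ...);
+ * a purely illustrative use would be:
+ *
+ *    dprintk("%s: sk %p\n", __FUNCTION__, sk);
+ */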
+#endif + +static int llc_find_offset(int state, int ev_type); +static void llc_conn_send_pdus(struct sock *sk); +static int llc_conn_service(struct sock *sk, struct sk_buff *skb); +static int llc_exec_conn_trans_actions(struct sock *sk, + struct llc_conn_state_trans *trans, + struct sk_buff *ev); +static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, + struct sk_buff *skb); + +/* Offset table on connection states transition diagram */ +static int llc_offset_table[NBR_CONN_STATES][NBR_CONN_EV]; + +/** + * llc_conn_state_process - sends event to connection state machine + * @sk: connection + * @skb: occurred event + * + * Sends an event to connection state machine. After processing event + * (executing it's actions and changing state), upper layer will be + * indicated or confirmed, if needed. Returns 0 for success, 1 for + * failure. The socket lock has to be held before calling this function. + */ +int llc_conn_state_process(struct sock *sk, struct sk_buff *skb) +{ + int rc; + struct llc_sock *llc = llc_sk(sk); + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + /* + * We have to hold the skb, because llc_conn_service will kfree it in + * the sending path and we need to look at the skb->cb, where we encode + * llc_conn_state_ev. + */ + skb_get(skb); + ev->ind_prim = ev->cfm_prim = 0; + rc = llc_conn_service(sk, skb); /* sending event to state machine */ + if (rc) { + printk(KERN_ERR "%s: llc_conn_service failed\n", __FUNCTION__); + goto out_kfree_skb; + } + + if (!ev->ind_prim && !ev->cfm_prim) { + /* indicate or confirm not required */ + if (!skb->list) + goto out_kfree_skb; + goto out_skb_put; + } + + if (ev->ind_prim && ev->cfm_prim) /* Paranoia */ + skb_get(skb); + + switch (ev->ind_prim) { + case LLC_DATA_PRIM: + llc_save_primitive(skb, LLC_DATA_PRIM); + if (sock_queue_rcv_skb(sk, skb)) { + /* + * shouldn't happen + */ + printk(KERN_ERR "%s: sock_queue_rcv_skb failed!\n", + __FUNCTION__); + kfree_skb(skb); + } + break; + case LLC_CONN_PRIM: { + struct sock *parent = skb->sk; + + skb->sk = sk; + skb_queue_tail(&parent->sk_receive_queue, skb); + sk->sk_state_change(parent); + } + break; + case LLC_DISC_PRIM: + sock_hold(sk); + if (sk->sk_type == SOCK_STREAM && + sk->sk_state == TCP_ESTABLISHED) { + sk->sk_shutdown = SHUTDOWN_MASK; + sk->sk_socket->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + } + kfree_skb(skb); + sock_put(sk); + break; + case LLC_RESET_PRIM: + /* + * FIXME: + * RESET is not being notified to upper layers for now + */ + printk(KERN_INFO "%s: received a reset ind!\n", __FUNCTION__); + kfree_skb(skb); + break; + default: + if (ev->ind_prim) { + printk(KERN_INFO "%s: received unknown %d prim!\n", + __FUNCTION__, ev->ind_prim); + kfree_skb(skb); + } + /* No indication */ + break; + } + + switch (ev->cfm_prim) { + case LLC_DATA_PRIM: + if (!llc_data_accept_state(llc->state)) + sk->sk_write_space(sk); + else + rc = llc->failed_data_req = 1; + break; + case LLC_CONN_PRIM: + if (sk->sk_type == SOCK_STREAM && + sk->sk_state == TCP_SYN_SENT) { + if (ev->status) { + sk->sk_socket->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + } else { + sk->sk_socket->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + } + sk->sk_state_change(sk); + } + break; + case LLC_DISC_PRIM: + sock_hold(sk); + if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSING) { + sk->sk_socket->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + 
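+			/*
+			 * The disconnect is confirmed: the socket has just
+			 * been marked closed, and the state change callback
+			 * below wakes up whoever is waiting for the teardown
+			 * to finish.
+			 */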
sk->sk_state_change(sk); + } + sock_put(sk); + break; + case LLC_RESET_PRIM: + /* + * FIXME: + * RESET is not being notified to upper layers for now + */ + printk(KERN_INFO "%s: received a reset conf!\n", __FUNCTION__); + break; + default: + if (ev->cfm_prim) { + printk(KERN_INFO "%s: received unknown %d prim!\n", + __FUNCTION__, ev->cfm_prim); + break; + } + goto out_skb_put; /* No confirmation */ + } +out_kfree_skb: + kfree_skb(skb); +out_skb_put: + kfree_skb(skb); + return rc; +} + +void llc_conn_send_pdu(struct sock *sk, struct sk_buff *skb) +{ + /* queue PDU to send to MAC layer */ + skb_queue_tail(&sk->sk_write_queue, skb); + llc_conn_send_pdus(sk); +} + +/** + * llc_conn_rtn_pdu - sends received data pdu to upper layer + * @sk: Active connection + * @skb: Received data frame + * + * Sends received data pdu to upper layer (by using indicate function). + * Prepares service parameters (prim and prim_data). calling indication + * function will be done in llc_conn_state_process. + */ +void llc_conn_rtn_pdu(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->ind_prim = LLC_DATA_PRIM; +} + +/** + * llc_conn_resend_i_pdu_as_cmd - resend all all unacknowledged I PDUs + * @sk: active connection + * @nr: NR + * @first_p_bit: p_bit value of first pdu + * + * Resend all unacknowledged I PDUs, starting with the NR; send first as + * command PDU with P bit equal first_p_bit; if more than one send + * subsequent as command PDUs with P bit equal zero (0). + */ +void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit) +{ + struct sk_buff *skb; + struct llc_pdu_sn *pdu; + u16 nbr_unack_pdus; + struct llc_sock *llc; + u8 howmany_resend = 0; + + llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); + if (!nbr_unack_pdus) + goto out; + /* + * Process unack PDUs only if unack queue is not empty; remove + * appropriate PDUs, fix them up, and put them on mac_pdu_q. + */ + llc = llc_sk(sk); + + while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { + pdu = llc_pdu_sn_hdr(skb); + llc_pdu_set_cmd_rsp(skb, LLC_PDU_CMD); + llc_pdu_set_pf_bit(skb, first_p_bit); + skb_queue_tail(&sk->sk_write_queue, skb); + first_p_bit = 0; + llc->vS = LLC_I_GET_NS(pdu); + howmany_resend++; + } + if (howmany_resend > 0) + llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; + /* any PDUs to re-send are queued up; start sending to MAC */ + llc_conn_send_pdus(sk); +out:; +} + +/** + * llc_conn_resend_i_pdu_as_rsp - Resend all unacknowledged I PDUs + * @sk: active connection. + * @nr: NR + * @first_f_bit: f_bit value of first pdu. + * + * Resend all unacknowledged I PDUs, starting with the NR; send first as + * response PDU with F bit equal first_f_bit; if more than one send + * subsequent as response PDUs with F bit equal zero (0). 
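+ *
+ * A caller sketch (the real callers are the state-machine actions in
+ * llc_c_ac.c; LLC_I_GET_NR() is assumed here to pull N(R) out of the
+ * received header):
+ *
+ *    struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb);
+ *
+ *    llc_conn_resend_i_pdu_as_rsp(sk, LLC_I_GET_NR(pdu), 1);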
+ */ +void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit) +{ + struct sk_buff *skb; + u16 nbr_unack_pdus; + struct llc_sock *llc = llc_sk(sk); + u8 howmany_resend = 0; + + llc_conn_remove_acked_pdus(sk, nr, &nbr_unack_pdus); + if (!nbr_unack_pdus) + goto out; + /* + * Process unack PDUs only if unack queue is not empty; remove + * appropriate PDUs, fix them up, and put them on mac_pdu_q + */ + while ((skb = skb_dequeue(&llc->pdu_unack_q)) != NULL) { + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + llc_pdu_set_cmd_rsp(skb, LLC_PDU_RSP); + llc_pdu_set_pf_bit(skb, first_f_bit); + skb_queue_tail(&sk->sk_write_queue, skb); + first_f_bit = 0; + llc->vS = LLC_I_GET_NS(pdu); + howmany_resend++; + } + if (howmany_resend > 0) + llc->vS = (llc->vS + 1) % LLC_2_SEQ_NBR_MODULO; + /* any PDUs to re-send are queued up; start sending to MAC */ + llc_conn_send_pdus(sk); +out:; +} + +/** + * llc_conn_remove_acked_pdus - Removes acknowledged pdus from tx queue + * @sk: active connection + * nr: NR + * how_many_unacked: size of pdu_unack_q after removing acked pdus + * + * Removes acknowledged pdus from transmit queue (pdu_unack_q). Returns + * the number of pdus that removed from queue. + */ +int llc_conn_remove_acked_pdus(struct sock *sk, u8 nr, u16 *how_many_unacked) +{ + int pdu_pos, i; + struct sk_buff *skb; + struct llc_pdu_sn *pdu; + int nbr_acked = 0; + struct llc_sock *llc = llc_sk(sk); + int q_len = skb_queue_len(&llc->pdu_unack_q); + + if (!q_len) + goto out; + skb = skb_peek(&llc->pdu_unack_q); + pdu = llc_pdu_sn_hdr(skb); + + /* finding position of last acked pdu in queue */ + pdu_pos = ((int)LLC_2_SEQ_NBR_MODULO + (int)nr - + (int)LLC_I_GET_NS(pdu)) % LLC_2_SEQ_NBR_MODULO; + + for (i = 0; i < pdu_pos && i < q_len; i++) { + skb = skb_dequeue(&llc->pdu_unack_q); + if (skb) + kfree_skb(skb); + nbr_acked++; + } +out: + *how_many_unacked = skb_queue_len(&llc->pdu_unack_q); + return nbr_acked; +} + +/** + * llc_conn_send_pdus - Sends queued PDUs + * @sk: active connection + * + * Sends queued pdus to MAC layer for transmission. + */ +static void llc_conn_send_pdus(struct sock *sk) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL) { + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if (LLC_PDU_TYPE_IS_I(pdu) && + !(skb->dev->flags & IFF_LOOPBACK)) { + struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + + skb_queue_tail(&llc_sk(sk)->pdu_unack_q, skb); + if (!skb2) + break; + skb = skb2; + } + dev_queue_xmit(skb); + } +} + +/** + * llc_conn_service - finds transition and changes state of connection + * @sk: connection + * @skb: happened event + * + * This function finds transition that matches with happened event, then + * executes related actions and finally changes state of connection. + * Returns 0 for success, 1 for failure. + */ +static int llc_conn_service(struct sock *sk, struct sk_buff *skb) +{ + int rc = 1; + struct llc_sock *llc = llc_sk(sk); + struct llc_conn_state_trans *trans; + + if (llc->state > NBR_CONN_STATES) + goto out; + rc = 0; + trans = llc_qualify_conn_ev(sk, skb); + if (trans) { + rc = llc_exec_conn_trans_actions(sk, trans, skb); + if (!rc && trans->next_state != NO_STATE_CHANGE) { + llc->state = trans->next_state; + if (!llc_data_accept_state(llc->state)) + sk->sk_state_change(sk); + } + } +out: + return rc; +} + +/** + * llc_qualify_conn_ev - finds transition for event + * @sk: connection + * @skb: happened event + * + * This function finds transition that matches with happened event. 
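+ * A transition matches when its ->ev() check returns 0 and every function
+ * in its ->ev_qualifiers array returns 0 as well; the walk starts at the
+ * per-event-type offset computed by llc_find_offset().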
+ * Returns pointer to found transition on success, %NULL otherwise. + */ +static struct llc_conn_state_trans *llc_qualify_conn_ev(struct sock *sk, + struct sk_buff *skb) +{ + struct llc_conn_state_trans **next_trans; + llc_conn_ev_qfyr_t *next_qualifier; + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + struct llc_sock *llc = llc_sk(sk); + struct llc_conn_state *curr_state = + &llc_conn_state_table[llc->state - 1]; + + /* search thru events for this state until + * list exhausted or until no more + */ + for (next_trans = curr_state->transitions + + llc_find_offset(llc->state - 1, ev->type); + (*next_trans)->ev; next_trans++) { + if (!((*next_trans)->ev)(sk, skb)) { + /* got POSSIBLE event match; the event may require + * qualification based on the values of a number of + * state flags; if all qualifications are met (i.e., + * if all qualifying functions return success, or 0, + * then this is THE event we're looking for + */ + for (next_qualifier = (*next_trans)->ev_qualifiers; + next_qualifier && *next_qualifier && + !(*next_qualifier)(sk, skb); next_qualifier++) + /* nothing */; + if (!next_qualifier || !*next_qualifier) + /* all qualifiers executed successfully; this is + * our transition; return it so we can perform + * the associated actions & change the state + */ + return *next_trans; + } + } + return NULL; +} + +/** + * llc_exec_conn_trans_actions - executes related actions + * @sk: connection + * @trans: transition that it's actions must be performed + * @skb: event + * + * Executes actions that is related to happened event. Returns 0 for + * success, 1 to indicate failure of at least one action. + */ +static int llc_exec_conn_trans_actions(struct sock *sk, + struct llc_conn_state_trans *trans, + struct sk_buff *skb) +{ + int rc = 0; + llc_conn_action_t *next_action; + + for (next_action = trans->ev_actions; + next_action && *next_action; next_action++) { + int rc2 = (*next_action)(sk, skb); + + if (rc2 == 2) { + rc = rc2; + break; + } else if (rc2) + rc = 1; + } + return rc; +} + +/** + * llc_lookup_established - Finds connection for the remote/local sap/mac + * @sap: SAP + * @daddr: address of remote LLC (MAC + SAP) + * @laddr: address of local LLC (MAC + SAP) + * + * Search connection list of the SAP and finds connection using the remote + * mac, remote sap, local mac, and local sap. Returns pointer for + * connection found, %NULL otherwise. + */ +struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, + struct llc_addr *laddr) +{ + struct sock *rc; + struct hlist_node *node; + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(rc, node, &sap->sk_list.list) { + struct llc_sock *llc = llc_sk(rc); + + if (llc->laddr.lsap == laddr->lsap && + llc->daddr.lsap == daddr->lsap && + llc_mac_match(llc->laddr.mac, laddr->mac) && + llc_mac_match(llc->daddr.mac, daddr->mac)) { + sock_hold(rc); + goto found; + } + } + rc = NULL; +found: + read_unlock_bh(&sap->sk_list.lock); + return rc; +} + +/** + * llc_lookup_listener - Finds listener for local MAC + SAP + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search connection list of the SAP and finds connection listening on + * local mac, and local sap. Returns pointer for parent socket found, + * %NULL otherwise. 
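+ *
+ * Both this helper and llc_lookup_established() above take a reference
+ * (sock_hold()) on the socket they return, so a caller has to drop it
+ * again, e.g.:
+ *
+ *    struct sock *sk = llc_lookup_listener(sap, &laddr);
+ *
+ *    if (sk)
+ *        sock_put(sk);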
+ */ +static struct sock *llc_lookup_listener(struct llc_sap *sap, + struct llc_addr *laddr) +{ + struct sock *rc; + struct hlist_node *node; + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(rc, node, &sap->sk_list.list) { + struct llc_sock *llc = llc_sk(rc); + + if (rc->sk_type == SOCK_STREAM && rc->sk_state == TCP_LISTEN && + llc->laddr.lsap == laddr->lsap && + (llc_mac_match(llc->laddr.mac, laddr->mac) || + llc_mac_null(llc->laddr.mac))) { + sock_hold(rc); + goto found; + } + } + rc = NULL; +found: + read_unlock_bh(&sap->sk_list.lock); + return rc; +} + +/** + * llc_data_accept_state - designates if in this state data can be sent. + * @state: state of connection. + * + * Returns 0 if data can be sent, 1 otherwise. + */ +u8 llc_data_accept_state(u8 state) +{ + return state != LLC_CONN_STATE_NORMAL && state != LLC_CONN_STATE_BUSY && + state != LLC_CONN_STATE_REJ; +} + +/** + * find_next_offset - finds offset for next category of transitions + * @state: state table. + * @offset: start offset. + * + * Finds offset of next category of transitions in transition table. + * Returns the start index of next category. + */ +static u16 find_next_offset(struct llc_conn_state *state, u16 offset) +{ + u16 cnt = 0; + struct llc_conn_state_trans **next_trans; + + for (next_trans = state->transitions + offset; + (*next_trans)->ev; next_trans++) + ++cnt; + return cnt; +} + +/** + * llc_build_offset_table - builds offset table of connection + * + * Fills offset table of connection state transition table + * (llc_offset_table). + */ +void __init llc_build_offset_table(void) +{ + struct llc_conn_state *curr_state; + int state, ev_type, next_offset; + + for (state = 0; state < NBR_CONN_STATES; state++) { + curr_state = &llc_conn_state_table[state]; + next_offset = 0; + for (ev_type = 0; ev_type < NBR_CONN_EV; ev_type++) { + llc_offset_table[state][ev_type] = next_offset; + next_offset += find_next_offset(curr_state, + next_offset) + 1; + } + } +} + +/** + * llc_find_offset - finds start offset of category of transitions + * @state: state of connection + * @ev_type: type of happened event + * + * Finds start offset of desired category of transitions. Returns the + * desired start offset. + */ +static int llc_find_offset(int state, int ev_type) +{ + int rc = 0; + /* at this stage, llc_offset_table[..][2] is not important. it is for + * init_pf_cycle and I don't know what is it. + */ + switch (ev_type) { + case LLC_CONN_EV_TYPE_PRIM: + rc = llc_offset_table[state][0]; break; + case LLC_CONN_EV_TYPE_PDU: + rc = llc_offset_table[state][4]; break; + case LLC_CONN_EV_TYPE_SIMPLE: + rc = llc_offset_table[state][1]; break; + case LLC_CONN_EV_TYPE_P_TMR: + case LLC_CONN_EV_TYPE_ACK_TMR: + case LLC_CONN_EV_TYPE_REJ_TMR: + case LLC_CONN_EV_TYPE_BUSY_TMR: + rc = llc_offset_table[state][3]; break; + } + return rc; +} + +/** + * llc_sap_add_socket - adds a socket to a SAP + * @sap: SAP + * @sk: socket + * + * This function adds a socket to sk_list of a SAP. + */ +void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk) +{ + write_lock_bh(&sap->sk_list.lock); + llc_sk(sk)->sap = sap; + sk_add_node(sk, &sap->sk_list.list); + write_unlock_bh(&sap->sk_list.lock); +} + +/** + * llc_sap_remove_socket - removes a socket from SAP + * @sap: SAP + * @sk: socket + * + * This function removes a connection from sk_list.list of a SAP if + * the connection was in this list. 
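+ *
+ * It is the counterpart of llc_sap_add_socket() above: a connection is
+ * put on the list when it is set up (see llc_conn_handler) and taken off
+ * again before the socket goes away.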
+ */ +void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk) +{ + write_lock_bh(&sap->sk_list.lock); + sk_del_node_init(sk); + write_unlock_bh(&sap->sk_list.lock); +} + +/** + * llc_conn_rcv - sends received pdus to the connection state machine + * @sk: current connection structure. + * @skb: received frame. + * + * Sends received pdus to the connection state machine. + */ +static int llc_conn_rcv(struct sock* sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + struct llc_sock *llc = llc_sk(sk); + + if (!llc->dev) + llc->dev = skb->dev; + ev->type = LLC_CONN_EV_TYPE_PDU; + ev->reason = 0; + return llc_conn_state_process(sk, skb); +} + +void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_addr saddr, daddr; + struct sock *sk; + + llc_pdu_decode_sa(skb, saddr.mac); + llc_pdu_decode_ssap(skb, &saddr.lsap); + llc_pdu_decode_da(skb, daddr.mac); + llc_pdu_decode_dsap(skb, &daddr.lsap); + + sk = llc_lookup_established(sap, &saddr, &daddr); + if (!sk) { + /* + * Didn't find an active connection; verify if there + * is a listening socket for this llc addr + */ + struct llc_sock *llc; + struct sock *parent = llc_lookup_listener(sap, &daddr); + + if (!parent) { + dprintk("llc_lookup_listener failed!\n"); + goto drop; + } + + sk = llc_sk_alloc(parent->sk_family, GFP_ATOMIC, parent->sk_prot); + if (!sk) { + sock_put(parent); + goto drop; + } + llc = llc_sk(sk); + memcpy(&llc->laddr, &daddr, sizeof(llc->laddr)); + memcpy(&llc->daddr, &saddr, sizeof(llc->daddr)); + llc_sap_add_socket(sap, sk); + sock_hold(sk); + sock_put(parent); + skb->sk = parent; + } else + skb->sk = sk; + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + llc_conn_rcv(sk, skb); + else { + dprintk("%s: adding to backlog...\n", __FUNCTION__); + llc_set_backlog_type(skb, LLC_PACKET); + sk_add_backlog(sk, skb); + } + bh_unlock_sock(sk); + sock_put(sk); + return; +drop: + kfree_skb(skb); +} + +#undef LLC_REFCNT_DEBUG +#ifdef LLC_REFCNT_DEBUG +static atomic_t llc_sock_nr; +#endif + +/** + * llc_release_sockets - releases all sockets in a sap + * @sap: sap to release its sockets + * + * Releases all connections of a sap. Returns 0 if all actions complete + * successfully, nonzero otherwise + */ +int llc_release_sockets(struct llc_sap *sap) +{ + int rc = 0; + struct sock *sk; + struct hlist_node *node; + + write_lock_bh(&sap->sk_list.lock); + + sk_for_each(sk, node, &sap->sk_list.list) { + llc_sk(sk)->state = LLC_CONN_STATE_TEMP; + + if (llc_send_disc(sk)) + rc = 1; + } + + write_unlock_bh(&sap->sk_list.lock); + return rc; +} + +/** + * llc_backlog_rcv - Processes rx frames and expired timers. + * @sk: LLC sock (p8022 connection) + * @skb: queued rx frame or event + * + * This function processes frames that has received and timers that has + * expired during sending an I pdu (refer to data_req_handler). frames + * queue by llc_rcv function (llc_mac.c) and timers queue by timer + * callback functions(llc_c_ac.c). 
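+ *
+ * Queueing a frame to the backlog is what llc_conn_handler() above does
+ * when the socket is owned by the user, roughly:
+ *
+ *    llc_set_backlog_type(skb, LLC_PACKET);
+ *    sk_add_backlog(sk, skb);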
+ */ +static int llc_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int rc = 0; + struct llc_sock *llc = llc_sk(sk); + + if (llc_backlog_type(skb) == LLC_PACKET) { + if (llc->state > 1) /* not closed */ + rc = llc_conn_rcv(sk, skb); + else + goto out_kfree_skb; + } else if (llc_backlog_type(skb) == LLC_EVENT) { + /* timer expiration event */ + if (llc->state > 1) /* not closed */ + rc = llc_conn_state_process(sk, skb); + else + goto out_kfree_skb; + } else { + printk(KERN_ERR "%s: invalid skb in backlog\n", __FUNCTION__); + goto out_kfree_skb; + } +out: + return rc; +out_kfree_skb: + kfree_skb(skb); + goto out; +} + +/** + * llc_sk_init - Initializes a socket with default llc values. + * @sk: socket to initialize. + * + * Initializes a socket with default llc values. + */ +static void llc_sk_init(struct sock* sk) +{ + struct llc_sock *llc = llc_sk(sk); + + llc->state = LLC_CONN_STATE_ADM; + llc->inc_cntr = llc->dec_cntr = 2; + llc->dec_step = llc->connect_step = 1; + + init_timer(&llc->ack_timer.timer); + llc->ack_timer.expire = LLC_ACK_TIME; + llc->ack_timer.timer.data = (unsigned long)sk; + llc->ack_timer.timer.function = llc_conn_ack_tmr_cb; + + init_timer(&llc->pf_cycle_timer.timer); + llc->pf_cycle_timer.expire = LLC_P_TIME; + llc->pf_cycle_timer.timer.data = (unsigned long)sk; + llc->pf_cycle_timer.timer.function = llc_conn_pf_cycle_tmr_cb; + + init_timer(&llc->rej_sent_timer.timer); + llc->rej_sent_timer.expire = LLC_REJ_TIME; + llc->rej_sent_timer.timer.data = (unsigned long)sk; + llc->rej_sent_timer.timer.function = llc_conn_rej_tmr_cb; + + init_timer(&llc->busy_state_timer.timer); + llc->busy_state_timer.expire = LLC_BUSY_TIME; + llc->busy_state_timer.timer.data = (unsigned long)sk; + llc->busy_state_timer.timer.function = llc_conn_busy_tmr_cb; + + llc->n2 = 2; /* max retransmit */ + llc->k = 2; /* tx win size, will adjust dynam */ + llc->rw = 128; /* rx win size (opt and equal to + * tx_win of remote LLC) */ + skb_queue_head_init(&llc->pdu_unack_q); + sk->sk_backlog_rcv = llc_backlog_rcv; +} + +/** + * llc_sk_alloc - Allocates LLC sock + * @family: upper layer protocol family + * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc) + * + * Allocates a LLC sock and initializes it. 
Returns the new LLC sock + * or %NULL if there's no memory available for one + */ +struct sock *llc_sk_alloc(int family, int priority, struct proto *prot) +{ + struct sock *sk = sk_alloc(family, priority, prot, 1); + + if (!sk) + goto out; + llc_sk_init(sk); + sock_init_data(NULL, sk); +#ifdef LLC_REFCNT_DEBUG + atomic_inc(&llc_sock_nr); + printk(KERN_DEBUG "LLC socket %p created in %s, now we have %d alive\n", sk, + __FUNCTION__, atomic_read(&llc_sock_nr)); +#endif +out: + return sk; +} + +/** + * llc_sk_free - Frees a LLC socket + * @sk - socket to free + * + * Frees a LLC socket + */ +void llc_sk_free(struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + llc->state = LLC_CONN_OUT_OF_SVC; + /* Stop all (possibly) running timers */ + llc_conn_ac_stop_all_timers(sk, NULL); +#ifdef DEBUG_LLC_CONN_ALLOC + printk(KERN_INFO "%s: unackq=%d, txq=%d\n", __FUNCTION__, + skb_queue_len(&llc->pdu_unack_q), + skb_queue_len(&sk->sk_write_queue)); +#endif + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&llc->pdu_unack_q); +#ifdef LLC_REFCNT_DEBUG + if (atomic_read(&sk->sk_refcnt) != 1) { + printk(KERN_DEBUG "Destruction of LLC sock %p delayed in %s, cnt=%d\n", + sk, __FUNCTION__, atomic_read(&sk->sk_refcnt)); + printk(KERN_DEBUG "%d LLC sockets are still alive\n", + atomic_read(&llc_sock_nr)); + } else { + atomic_dec(&llc_sock_nr); + printk(KERN_DEBUG "LLC socket %p released in %s, %d are still alive\n", sk, + __FUNCTION__, atomic_read(&llc_sock_nr)); + } +#endif + sock_put(sk); +} + +/** + * llc_sk_reset - resets a connection + * @sk: LLC socket to reset + * + * Resets a connection to the out of service state. Stops its timers + * and frees any frames in the queues of the connection. + */ +void llc_sk_reset(struct sock *sk) +{ + struct llc_sock *llc = llc_sk(sk); + + llc_conn_ac_stop_all_timers(sk, NULL); + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&llc->pdu_unack_q); + llc->remote_busy_flag = 0; + llc->cause_flag = 0; + llc->retry_count = 0; + llc_conn_set_p_flag(sk, 0); + llc->f_flag = 0; + llc->s_flag = 0; + llc->ack_pf = 0; + llc->first_pdu_Ns = 0; + llc->ack_must_be_send = 0; + llc->dec_step = 1; + llc->inc_cntr = 2; + llc->dec_cntr = 2; + llc->X = 0; + llc->failed_data_req = 0 ; + llc->last_nr = 0; +} diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c new file mode 100644 index 000000000000..5ff02c080a0b --- /dev/null +++ b/net/llc/llc_core.c @@ -0,0 +1,179 @@ +/* + * llc_core.c - Minimum needed routines for sap handling and module init/exit + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +LIST_HEAD(llc_sap_list); +DEFINE_RWLOCK(llc_sap_list_lock); + +unsigned char llc_station_mac_sa[ETH_ALEN]; + +/** + * llc_sap_alloc - allocates and initializes sap. + * + * Allocates and initializes sap. 
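+ *
+ * Users go through llc_sap_open()/llc_sap_close() below rather than
+ * calling this directly; a sketch (0xAA and the NULL rcv hook are just
+ * examples):
+ *
+ *    struct llc_sap *sap = llc_sap_open(0xAA, NULL);
+ *
+ *    if (sap)
+ *        llc_sap_close(sap);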
+ */ +static struct llc_sap *llc_sap_alloc(void) +{ + struct llc_sap *sap = kmalloc(sizeof(*sap), GFP_ATOMIC); + + if (sap) { + memset(sap, 0, sizeof(*sap)); + sap->state = LLC_SAP_STATE_ACTIVE; + memcpy(sap->laddr.mac, llc_station_mac_sa, ETH_ALEN); + rwlock_init(&sap->sk_list.lock); + } + return sap; +} + +/** + * llc_add_sap - add sap to station list + * @sap: Address of the sap + * + * Adds a sap to the LLC's station sap list. + */ +static void llc_add_sap(struct llc_sap *sap) +{ + write_lock_bh(&llc_sap_list_lock); + list_add_tail(&sap->node, &llc_sap_list); + write_unlock_bh(&llc_sap_list_lock); +} + +/** + * llc_del_sap - del sap from station list + * @sap: Address of the sap + * + * Removes a sap to the LLC's station sap list. + */ +static void llc_del_sap(struct llc_sap *sap) +{ + write_lock_bh(&llc_sap_list_lock); + list_del(&sap->node); + write_unlock_bh(&llc_sap_list_lock); +} + +/** + * llc_sap_find - searchs a SAP in station + * @sap_value: sap to be found + * + * Searchs for a sap in the sap list of the LLC's station upon the sap ID. + * Returns the sap or %NULL if not found. + */ +struct llc_sap *llc_sap_find(unsigned char sap_value) +{ + struct llc_sap* sap; + + read_lock_bh(&llc_sap_list_lock); + list_for_each_entry(sap, &llc_sap_list, node) + if (sap->laddr.lsap == sap_value) + goto out; + sap = NULL; +out: + read_unlock_bh(&llc_sap_list_lock); + return sap; +} + +/** + * llc_sap_open - open interface to the upper layers. + * @lsap: SAP number. + * @func: rcv func for datalink protos + * + * Interface function to upper layer. Each one who wants to get a SAP + * (for example NetBEUI) should call this function. Returns the opened + * SAP for success, NULL for failure. + */ +struct llc_sap *llc_sap_open(unsigned char lsap, + int (*func)(struct sk_buff *skb, + struct net_device *dev, + struct packet_type *pt)) +{ + struct llc_sap *sap = llc_sap_find(lsap); + + if (sap) { /* SAP already exists */ + sap = NULL; + goto out; + } + sap = llc_sap_alloc(); + if (!sap) + goto out; + sap->laddr.lsap = lsap; + sap->rcv_func = func; + llc_add_sap(sap); +out: + return sap; +} + +/** + * llc_sap_close - close interface for upper layers. + * @sap: SAP to be closed. + * + * Close interface function to upper layer. Each one who wants to + * close an open SAP (for example NetBEUI) should call this function. + * Removes this sap from the list of saps in the station and then + * frees the memory for this sap. + */ +void llc_sap_close(struct llc_sap *sap) +{ + WARN_ON(!hlist_empty(&sap->sk_list.list)); + llc_del_sap(sap); + kfree(sap); +} + +static struct packet_type llc_packet_type = { + .type = __constant_htons(ETH_P_802_2), + .func = llc_rcv, +}; + +static struct packet_type llc_tr_packet_type = { + .type = __constant_htons(ETH_P_TR_802_2), + .func = llc_rcv, +}; + +static int __init llc_init(void) +{ + if (dev_base->next) + memcpy(llc_station_mac_sa, dev_base->next->dev_addr, ETH_ALEN); + else + memset(llc_station_mac_sa, 0, ETH_ALEN); + dev_add_pack(&llc_packet_type); + dev_add_pack(&llc_tr_packet_type); + return 0; +} + +static void __exit llc_exit(void) +{ + dev_remove_pack(&llc_packet_type); + dev_remove_pack(&llc_tr_packet_type); +} + +module_init(llc_init); +module_exit(llc_exit); + +EXPORT_SYMBOL(llc_station_mac_sa); +EXPORT_SYMBOL(llc_sap_list); +EXPORT_SYMBOL(llc_sap_list_lock); +EXPORT_SYMBOL(llc_sap_find); +EXPORT_SYMBOL(llc_sap_open); +EXPORT_SYMBOL(llc_sap_close); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Procom 1997, Jay Schullist 2001, Arnaldo C. 
Melo 2001-2003"); +MODULE_DESCRIPTION("LLC IEEE 802.2 core support"); diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c new file mode 100644 index 000000000000..0f9fc48aeaf9 --- /dev/null +++ b/net/llc/llc_if.c @@ -0,0 +1,157 @@ +/* + * llc_if.c - Defines LLC interface to upper layer + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +u8 llc_mac_null_var[IFHWADDRLEN]; + +/** + * llc_build_and_send_pkt - Connection data sending for upper layers. + * @sk: connection + * @skb: packet to send + * + * This function is called when upper layer wants to send data using + * connection oriented communication mode. During sending data, connection + * will be locked and received frames and expired timers will be queued. + * Returns 0 for success, -ECONNABORTED when the connection already + * closed and -EBUSY when sending data is not permitted in this state or + * LLC has send an I pdu with p bit set to 1 and is waiting for it's + * response. + */ +int llc_build_and_send_pkt(struct sock *sk, struct sk_buff *skb) +{ + struct llc_conn_state_ev *ev; + int rc = -ECONNABORTED; + struct llc_sock *llc = llc_sk(sk); + + if (llc->state == LLC_CONN_STATE_ADM) + goto out; + rc = -EBUSY; + if (llc_data_accept_state(llc->state)) { /* data_conn_refuse */ + llc->failed_data_req = 1; + goto out; + } + if (llc->p_flag) { + llc->failed_data_req = 1; + goto out; + } + ev = llc_conn_ev(skb); + ev->type = LLC_CONN_EV_TYPE_PRIM; + ev->prim = LLC_DATA_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + skb->dev = llc->dev; + rc = llc_conn_state_process(sk, skb); +out: + return rc; +} + +/** + * llc_establish_connection - Called by upper layer to establish a conn + * @sk: connection + * @lmac: local mac address + * @dmac: destination mac address + * @dsap: destination sap + * + * Upper layer calls this to establish an LLC connection with a remote + * machine. This function packages a proper event and sends it connection + * component state machine. Success or failure of connection + * establishment will inform to upper layer via calling it's confirm + * function and passing proper information. 
+ */ +int llc_establish_connection(struct sock *sk, u8 *lmac, u8 *dmac, u8 dsap) +{ + int rc = -EISCONN; + struct llc_addr laddr, daddr; + struct sk_buff *skb; + struct llc_sock *llc = llc_sk(sk); + struct sock *existing; + + laddr.lsap = llc->sap->laddr.lsap; + daddr.lsap = dsap; + memcpy(daddr.mac, dmac, sizeof(daddr.mac)); + memcpy(laddr.mac, lmac, sizeof(laddr.mac)); + existing = llc_lookup_established(llc->sap, &daddr, &laddr); + if (existing) { + if (existing->sk_state == TCP_ESTABLISHED) { + sk = existing; + goto out_put; + } else + sock_put(existing); + } + sock_hold(sk); + rc = -ENOMEM; + skb = alloc_skb(0, GFP_ATOMIC); + if (skb) { + struct llc_conn_state_ev *ev = llc_conn_ev(skb); + + ev->type = LLC_CONN_EV_TYPE_PRIM; + ev->prim = LLC_CONN_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + rc = llc_conn_state_process(sk, skb); + } +out_put: + sock_put(sk); + return rc; +} + +/** + * llc_send_disc - Called by upper layer to close a connection + * @sk: connection to be closed + * + * Upper layer calls this when it wants to close an established LLC + * connection with a remote machine. This function packages a proper event + * and sends it to connection component state machine. Returns 0 for + * success, 1 otherwise. + */ +int llc_send_disc(struct sock *sk) +{ + u16 rc = 1; + struct llc_conn_state_ev *ev; + struct sk_buff *skb; + + sock_hold(sk); + if (sk->sk_type != SOCK_STREAM || sk->sk_state != TCP_ESTABLISHED || + llc_sk(sk)->state == LLC_CONN_STATE_ADM || + llc_sk(sk)->state == LLC_CONN_OUT_OF_SVC) + goto out; + /* + * Postpone unassigning the connection from its SAP and returning the + * connection until all ACTIONs have been completely executed + */ + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + sk->sk_state = TCP_CLOSING; + ev = llc_conn_ev(skb); + ev->type = LLC_CONN_EV_TYPE_PRIM; + ev->prim = LLC_DISC_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + rc = llc_conn_state_process(sk, skb); +out: + sock_put(sk); + return rc; +} + diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c new file mode 100644 index 000000000000..4da6976efc9c --- /dev/null +++ b/net/llc/llc_input.c @@ -0,0 +1,189 @@ +/* + * llc_input.c - Minimal input path for LLC + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include + +#if 0 +#define dprintk(args...) printk(KERN_DEBUG args) +#else +#define dprintk(args...) +#endif + +/* + * Packet handler for the station, registerable because in the minimal + * LLC core that is taking shape only the very minimal subset of LLC that + * is needed for things like IPX, Appletalk, etc will stay, with all the + * rest in the llc1 and llc2 modules. + */ +static void (*llc_station_handler)(struct sk_buff *skb); + +/* + * Packet handlers for LLC_DEST_SAP and LLC_DEST_CONN. 
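+ *
+ * The connection and SAP components register themselves here; e.g. the
+ * connection side does, roughly (sketch):
+ *
+ *    llc_add_pack(LLC_DEST_CONN, llc_conn_handler);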
+ */ +static void (*llc_type_handlers[2])(struct llc_sap *sap, + struct sk_buff *skb); + +void llc_add_pack(int type, void (*handler)(struct llc_sap *sap, + struct sk_buff *skb)) +{ + if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) + llc_type_handlers[type - 1] = handler; +} + +void llc_remove_pack(int type) +{ + if (type == LLC_DEST_SAP || type == LLC_DEST_CONN) + llc_type_handlers[type - 1] = NULL; +} + +void llc_set_station_handler(void (*handler)(struct sk_buff *skb)) +{ + llc_station_handler = handler; +} + +/** + * llc_pdu_type - returns which LLC component must handle for PDU + * @skb: input skb + * + * This function returns which LLC component must handle this PDU. + */ +static __inline__ int llc_pdu_type(struct sk_buff *skb) +{ + int type = LLC_DEST_CONN; /* I-PDU or S-PDU type */ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) != LLC_PDU_TYPE_U) + goto out; + switch (LLC_U_PDU_CMD(pdu)) { + case LLC_1_PDU_CMD_XID: + case LLC_1_PDU_CMD_UI: + case LLC_1_PDU_CMD_TEST: + type = LLC_DEST_SAP; + break; + case LLC_2_PDU_CMD_SABME: + case LLC_2_PDU_CMD_DISC: + case LLC_2_PDU_RSP_UA: + case LLC_2_PDU_RSP_DM: + case LLC_2_PDU_RSP_FRMR: + break; + default: + type = LLC_DEST_INVALID; + break; + } +out: + return type; +} + +/** + * llc_fixup_skb - initializes skb pointers + * @skb: This argument points to incoming skb + * + * Initializes internal skb pointer to start of network layer by deriving + * length of LLC header; finds length of LLC control field in LLC header + * by looking at the two lowest-order bits of the first control field + * byte; field is either 3 or 4 bytes long. + */ +static inline int llc_fixup_skb(struct sk_buff *skb) +{ + u8 llc_len = 2; + struct llc_pdu_sn *pdu; + + if (!pskb_may_pull(skb, sizeof(*pdu))) + return 0; + + pdu = (struct llc_pdu_sn *)skb->data; + if ((pdu->ctrl_1 & LLC_PDU_TYPE_MASK) == LLC_PDU_TYPE_U) + llc_len = 1; + llc_len += 2; + skb->h.raw += llc_len; + skb_pull(skb, llc_len); + if (skb->protocol == htons(ETH_P_802_2)) { + u16 pdulen = eth_hdr(skb)->h_proto, + data_size = ntohs(pdulen) - llc_len; + + skb_trim(skb, data_size); + } + return 1; +} + +/** + * llc_rcv - 802.2 entry point from net lower layers + * @skb: received pdu + * @dev: device that receive pdu + * @pt: packet type + * + * When the system receives a 802.2 frame this function is called. It + * checks SAP and connection of received pdu and passes frame to + * llc_{station,sap,conn}_rcv for sending to proper state machine. If + * the frame is related to a busy connection (a connection is sending + * data now), it queues this frame in the connection's backlog. + */ +int llc_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt) +{ + struct llc_sap *sap; + struct llc_pdu_sn *pdu; + int dest; + + /* + * When the interface is in promisc. mode, drop all the crap that it + * receives, do not try to analyse it. 
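+ * (PACKET_OTHERHOST means the frame was addressed to some other station
+ * and was only seen because the interface is promiscuous, hence the drop
+ * below.)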
+ */ + if (unlikely(skb->pkt_type == PACKET_OTHERHOST)) { + dprintk("%s: PACKET_OTHERHOST\n", __FUNCTION__); + goto drop; + } + skb = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + goto out; + if (unlikely(!llc_fixup_skb(skb))) + goto drop; + pdu = llc_pdu_sn_hdr(skb); + if (unlikely(!pdu->dsap)) /* NULL DSAP, refer to station */ + goto handle_station; + sap = llc_sap_find(pdu->dsap); + if (unlikely(!sap)) {/* unknown SAP */ + dprintk("%s: llc_sap_find(%02X) failed!\n", __FUNCTION__, + pdu->dsap); + goto drop; + } + /* + * First the upper layer protocols that don't need the full + * LLC functionality + */ + if (sap->rcv_func) { + sap->rcv_func(skb, dev, pt); + goto out; + } + dest = llc_pdu_type(skb); + if (unlikely(!dest || !llc_type_handlers[dest - 1])) + goto drop; + llc_type_handlers[dest - 1](sap, skb); +out: + return 0; +drop: + kfree_skb(skb); + goto out; +handle_station: + if (!llc_station_handler) + goto drop; + llc_station_handler(skb); + goto out; +} + +EXPORT_SYMBOL(llc_add_pack); +EXPORT_SYMBOL(llc_remove_pack); +EXPORT_SYMBOL(llc_set_station_handler); diff --git a/net/llc/llc_output.c b/net/llc/llc_output.c new file mode 100644 index 000000000000..ab5784cf163e --- /dev/null +++ b/net/llc/llc_output.c @@ -0,0 +1,107 @@ +/* + * llc_output.c - LLC minimal output path + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License version 2 for more details. + */ + +#include +#include +#include +#include +#include +#include +#include + +/** + * llc_mac_hdr_init - fills MAC header fields + * @skb: Address of the frame to initialize its MAC header + * @sa: The MAC source address + * @da: The MAC destination address + * + * Fills MAC header fields, depending on MAC type. Returns 0, If MAC type + * is a valid type and initialization completes correctly 1, otherwise. + */ +int llc_mac_hdr_init(struct sk_buff *skb, unsigned char *sa, unsigned char *da) +{ + int rc = 0; + + switch (skb->dev->type) { +#ifdef CONFIG_TR + case ARPHRD_IEEE802_TR: { + struct net_device *dev = skb->dev; + struct trh_hdr *trh; + + skb->mac.raw = skb_push(skb, sizeof(*trh)); + trh = tr_hdr(skb); + trh->ac = AC; + trh->fc = LLC_FRAME; + if (sa) + memcpy(trh->saddr, sa, dev->addr_len); + else + memset(trh->saddr, 0, dev->addr_len); + if (da) { + memcpy(trh->daddr, da, dev->addr_len); + tr_source_route(skb, trh, dev); + skb->mac.raw = skb->data; + } + break; + } +#endif + case ARPHRD_ETHER: + case ARPHRD_LOOPBACK: { + unsigned short len = skb->len; + struct ethhdr *eth; + + skb->mac.raw = skb_push(skb, sizeof(*eth)); + eth = eth_hdr(skb); + eth->h_proto = htons(len); + memcpy(eth->h_dest, da, ETH_ALEN); + memcpy(eth->h_source, sa, ETH_ALEN); + break; + } + default: + printk(KERN_WARNING "device type not supported: %d\n", + skb->dev->type); + rc = -EINVAL; + } + return rc; +} + +/** + * llc_build_and_send_ui_pkt - unitdata request interface for upper layers + * @sap: sap to use + * @skb: packet to send + * @dmac: destination mac address + * @dsap: destination sap + * + * Upper layers calls this function when upper layer wants to send data + * using connection-less mode communication (UI pdu). 
+ * + * Accept data frame from network layer to be sent using connection- + * less mode communication; timeout/retries handled by network layer; + * package primitive as an event and send to SAP event handler + */ +int llc_build_and_send_ui_pkt(struct llc_sap *sap, struct sk_buff *skb, + unsigned char *dmac, unsigned char dsap) +{ + int rc; + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, sap->laddr.lsap, + dsap, LLC_PDU_CMD); + llc_pdu_init_as_ui_cmd(skb); + rc = llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac); + if (!rc) + rc = dev_queue_xmit(skb); + return rc; +} + +EXPORT_SYMBOL(llc_mac_hdr_init); +EXPORT_SYMBOL(llc_build_and_send_ui_pkt); diff --git a/net/llc/llc_output.h b/net/llc/llc_output.h new file mode 100644 index 000000000000..179edf753f00 --- /dev/null +++ b/net/llc/llc_output.h @@ -0,0 +1,20 @@ +#ifndef LLC_OUTPUT_H +#define LLC_OUTPUT_H +/* + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License version 2 as published by the Free Software + * Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License version 2 for more details. + */ + +struct sk_buff; + +int llc_mac_hdr_init(struct sk_buff *skb, unsigned char *sa, unsigned char *da); + +#endif /* LLC_OUTPUT_H */ diff --git a/net/llc/llc_pdu.c b/net/llc/llc_pdu.c new file mode 100644 index 000000000000..a28ce525d201 --- /dev/null +++ b/net/llc/llc_pdu.c @@ -0,0 +1,372 @@ +/* + * llc_pdu.c - access to PDU internals + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include + +static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type); +static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu); + +void llc_pdu_set_cmd_rsp(struct sk_buff *skb, u8 pdu_type) +{ + llc_pdu_un_hdr(skb)->ssap |= pdu_type; +} + +/** + * pdu_set_pf_bit - sets poll/final bit in LLC header + * @pdu_frame: input frame that p/f bit must be set into it. + * @bit_value: poll/final bit (0 or 1). + * + * This function sets poll/final bit in LLC header (based on type of PDU). + * in I or S pdus, p/f bit is right bit of fourth byte in header. in U + * pdus p/f bit is fifth bit of third byte. + */ +void llc_pdu_set_pf_bit(struct sk_buff *skb, u8 bit_value) +{ + u8 pdu_type; + struct llc_pdu_sn *pdu; + + llc_pdu_decode_pdu_type(skb, &pdu_type); + pdu = llc_pdu_sn_hdr(skb); + + switch (pdu_type) { + case LLC_PDU_TYPE_I: + case LLC_PDU_TYPE_S: + pdu->ctrl_2 = (pdu->ctrl_2 & 0xFE) | bit_value; + break; + case LLC_PDU_TYPE_U: + pdu->ctrl_1 |= (pdu->ctrl_1 & 0xEF) | (bit_value << 4); + break; + } +} + +/** + * llc_pdu_decode_pf_bit - extracs poll/final bit from LLC header + * @skb: input skb that p/f bit must be extracted from it + * @pf_bit: poll/final bit (0 or 1) + * + * This function extracts poll/final bit from LLC header (based on type of + * PDU). In I or S pdus, p/f bit is right bit of fourth byte in header. In + * U pdus p/f bit is fifth bit of third byte. 
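+ *
+ * For example, a UA response with the F bit set has ctrl_1 == 0x73, and
+ * (0x73 & LLC_U_PF_BIT_MASK) >> 4 yields 1 (assuming the usual 0x10 mask).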
+ */ +void llc_pdu_decode_pf_bit(struct sk_buff *skb, u8 *pf_bit) +{ + u8 pdu_type; + struct llc_pdu_sn *pdu; + + llc_pdu_decode_pdu_type(skb, &pdu_type); + pdu = llc_pdu_sn_hdr(skb); + + switch (pdu_type) { + case LLC_PDU_TYPE_I: + case LLC_PDU_TYPE_S: + *pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; + break; + case LLC_PDU_TYPE_U: + *pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; + break; + } +} + +/** + * llc_pdu_init_as_disc_cmd - Builds DISC PDU + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * + * Builds a pdu frame as a DISC command. + */ +void llc_pdu_init_as_disc_cmd(struct sk_buff *skb, u8 p_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_CMD_DISC; + pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_init_as_i_cmd - builds I pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @ns: The sequence number of the data PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as an I command. + */ +void llc_pdu_init_as_i_cmd(struct sk_buff *skb, u8 p_bit, u8 ns, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_I; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= (p_bit & LLC_I_PF_BIT_MASK); /* p/f bit */ + pdu->ctrl_1 |= (ns << 1) & 0xFE; /* set N(S) in bits 2..8 */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rej_cmd - builds REJ PDU + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as a REJ command. + */ +void llc_pdu_init_as_rej_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_CMD_REJ; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rnr_cmd - builds RNR pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as an RNR command. + */ +void llc_pdu_init_as_rnr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_CMD_RNR; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= p_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rr_cmd - Builds RR pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * @nr: The seq. number of the expected I PDU from the remote + * + * Builds a pdu frame as an RR command. + */ +void llc_pdu_init_as_rr_cmd(struct sk_buff *skb, u8 p_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_CMD_RR; + pdu->ctrl_2 = p_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_sabme_cmd - builds SABME pdu + * @skb: Address of the skb to build + * @p_bit: The P bit to set in the PDU + * + * Builds a pdu frame as an SABME command. 
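+ *
+ * A full SABME is normally put together with the generic helpers as well,
+ * roughly (ssap/dsap/dmac stand in for the caller's addressing):
+ *
+ *    llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ssap, dsap, LLC_PDU_CMD);
+ *    llc_pdu_init_as_sabme_cmd(skb, 1);
+ *    llc_mac_hdr_init(skb, skb->dev->dev_addr, dmac);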
+ */ +void llc_pdu_init_as_sabme_cmd(struct sk_buff *skb, u8 p_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_CMD_SABME; + pdu->ctrl_1 |= ((p_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_init_as_dm_rsp - builds DM response pdu + * @skb: Address of the skb to build + * @f_bit: The F bit to set in the PDU + * + * Builds a pdu frame as a DM response. + */ +void llc_pdu_init_as_dm_rsp(struct sk_buff *skb, u8 f_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_RSP_DM; + pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_init_as_frmr_rsp - builds FRMR response PDU + * @skb: Address of the frame to build + * @prev_pdu: The rejected PDU frame + * @f_bit: The F bit to set in the PDU + * @vs: tx state vari value for the data link conn at the rejecting LLC + * @vr: rx state var value for the data link conn at the rejecting LLC + * @vzyxw: completely described in the IEEE Std 802.2 document (Pg 55) + * + * Builds a pdu frame as a FRMR response. + */ +void llc_pdu_init_as_frmr_rsp(struct sk_buff *skb, struct llc_pdu_sn *prev_pdu, + u8 f_bit, u8 vs, u8 vr, u8 vzyxw) +{ + struct llc_frmr_info *frmr_info; + u8 prev_pf = 0; + u8 *ctrl; + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_RSP_FRMR; + pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; + + frmr_info = (struct llc_frmr_info *)&pdu->ctrl_2; + ctrl = (u8 *)&prev_pdu->ctrl_1; + FRMR_INFO_SET_REJ_CNTRL(frmr_info,ctrl); + FRMR_INFO_SET_Vs(frmr_info, vs); + FRMR_INFO_SET_Vr(frmr_info, vr); + prev_pf = llc_pdu_get_pf_bit(prev_pdu); + FRMR_INFO_SET_C_R_BIT(frmr_info, prev_pf); + FRMR_INFO_SET_INVALID_PDU_CTRL_IND(frmr_info, vzyxw); + FRMR_INFO_SET_INVALID_PDU_INFO_IND(frmr_info, vzyxw); + FRMR_INFO_SET_PDU_INFO_2LONG_IND(frmr_info, vzyxw); + FRMR_INFO_SET_PDU_INVALID_Nr_IND(frmr_info, vzyxw); + FRMR_INFO_SET_PDU_INVALID_Ns_IND(frmr_info, vzyxw); + skb_put(skb, 5); +} + +/** + * llc_pdu_init_as_rr_rsp - builds RR response pdu + * @skb: Address of the skb to build + * @f_bit: The F bit to set in the PDU + * @nr: The seq. number of the expected data PDU from the remote + * + * Builds a pdu frame as an RR response. + */ +void llc_pdu_init_as_rr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_RSP_RR; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rej_rsp - builds REJ response pdu + * @skb: Address of the skb to build + * @f_bit: The F bit to set in the PDU + * @nr: The seq. number of the expected data PDU from the remote + * + * Builds a pdu frame as a REJ response. + */ +void llc_pdu_init_as_rej_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_RSP_REJ; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_rnr_rsp - builds RNR response pdu + * @skb: Address of the frame to build + * @f_bit: The F bit to set in the PDU + * @nr: The seq. 
number of the expected data PDU from the remote + * + * Builds a pdu frame as an RNR response. + */ +void llc_pdu_init_as_rnr_rsp(struct sk_buff *skb, u8 f_bit, u8 nr) +{ + struct llc_pdu_sn *pdu = llc_pdu_sn_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_S; + pdu->ctrl_1 |= LLC_2_PDU_RSP_RNR; + pdu->ctrl_2 = 0; + pdu->ctrl_2 |= f_bit & LLC_S_PF_BIT_MASK; + pdu->ctrl_1 &= 0x0F; /* setting bits 5..8 to zero(reserved) */ + pdu->ctrl_2 |= (nr << 1) & 0xFE; /* set N(R) in bits 10..16 */ +} + +/** + * llc_pdu_init_as_ua_rsp - builds UA response pdu + * @skb: Address of the frame to build + * @f_bit: The F bit to set in the PDU + * + * Builds a pdu frame as a UA response. + */ +void llc_pdu_init_as_ua_rsp(struct sk_buff *skb, u8 f_bit) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + pdu->ctrl_1 = LLC_PDU_TYPE_U; + pdu->ctrl_1 |= LLC_2_PDU_RSP_UA; + pdu->ctrl_1 |= ((f_bit & 1) << 4) & LLC_U_PF_BIT_MASK; +} + +/** + * llc_pdu_decode_pdu_type - designates PDU type + * @skb: input skb that type of it must be designated. + * @type: type of PDU (output argument). + * + * This function designates type of PDU (I, S or U). + */ +static void llc_pdu_decode_pdu_type(struct sk_buff *skb, u8 *type) +{ + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + if (pdu->ctrl_1 & 1) { + if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) + *type = LLC_PDU_TYPE_U; + else + *type = LLC_PDU_TYPE_S; + } else + *type = LLC_PDU_TYPE_I; +} + +/** + * llc_pdu_get_pf_bit - extracts p/f bit of input PDU + * @pdu: pointer to LLC header. + * + * This function extracts p/f bit of input PDU. at first examines type of + * PDU and then extracts p/f bit. Returns the p/f bit. + */ +static u8 llc_pdu_get_pf_bit(struct llc_pdu_sn *pdu) +{ + u8 pdu_type; + u8 pf_bit = 0; + + if (pdu->ctrl_1 & 1) { + if ((pdu->ctrl_1 & LLC_PDU_TYPE_U) == LLC_PDU_TYPE_U) + pdu_type = LLC_PDU_TYPE_U; + else + pdu_type = LLC_PDU_TYPE_S; + } else + pdu_type = LLC_PDU_TYPE_I; + switch (pdu_type) { + case LLC_PDU_TYPE_I: + case LLC_PDU_TYPE_S: + pf_bit = pdu->ctrl_2 & LLC_S_PF_BIT_MASK; + break; + case LLC_PDU_TYPE_U: + pf_bit = (pdu->ctrl_1 & LLC_U_PF_BIT_MASK) >> 4; + break; + } + return pf_bit; +} diff --git a/net/llc/llc_proc.c b/net/llc/llc_proc.c new file mode 100644 index 000000000000..36e8db3fa1a2 --- /dev/null +++ b/net/llc/llc_proc.c @@ -0,0 +1,267 @@ +/* + * proc_llc.c - proc interface for LLC + * + * Copyright (c) 2001 by Jay Schulist + * 2002-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. 
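 *
 * (Editor's note, not in the original patch: this file creates
 * /proc/net/llc/socket, which lists each LLC socket's local and remote
 * MAC+SAP, queue sizes, state, uid and link number, and /proc/net/llc/core,
 * which dumps the per-connection state-machine variables and pending timers
 * printed by llc_seq_core_show() below.)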
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void llc_ui_format_mac(struct seq_file *seq, unsigned char *mac) +{ + seq_printf(seq, "%02X:%02X:%02X:%02X:%02X:%02X", + mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]); +} + +static struct sock *llc_get_sk_idx(loff_t pos) +{ + struct list_head *sap_entry; + struct llc_sap *sap; + struct hlist_node *node; + struct sock *sk = NULL; + + list_for_each(sap_entry, &llc_sap_list) { + sap = list_entry(sap_entry, struct llc_sap, node); + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(sk, node, &sap->sk_list.list) { + if (!pos) + goto found; + --pos; + } + read_unlock_bh(&sap->sk_list.lock); + } + sk = NULL; +found: + return sk; +} + +static void *llc_seq_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + read_lock_bh(&llc_sap_list_lock); + return l ? llc_get_sk_idx(--l) : SEQ_START_TOKEN; +} + +static void *llc_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock* sk, *next; + struct llc_sock *llc; + struct llc_sap *sap; + + ++*pos; + if (v == SEQ_START_TOKEN) { + sk = llc_get_sk_idx(0); + goto out; + } + sk = v; + next = sk_next(sk); + if (next) { + sk = next; + goto out; + } + llc = llc_sk(sk); + sap = llc->sap; + read_unlock_bh(&sap->sk_list.lock); + sk = NULL; + for (;;) { + if (sap->node.next == &llc_sap_list) + break; + sap = list_entry(sap->node.next, struct llc_sap, node); + read_lock_bh(&sap->sk_list.lock); + if (!hlist_empty(&sap->sk_list.list)) { + sk = sk_head(&sap->sk_list.list); + break; + } + read_unlock_bh(&sap->sk_list.lock); + } +out: + return sk; +} + +static void llc_seq_stop(struct seq_file *seq, void *v) +{ + if (v && v != SEQ_START_TOKEN) { + struct sock *sk = v; + struct llc_sock *llc = llc_sk(sk); + struct llc_sap *sap = llc->sap; + + read_unlock_bh(&sap->sk_list.lock); + } + read_unlock_bh(&llc_sap_list_lock); +} + +static int llc_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock* sk; + struct llc_sock *llc; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "SKt Mc local_mac_sap remote_mac_sap " + " tx_queue rx_queue st uid link\n"); + goto out; + } + sk = v; + llc = llc_sk(sk); + + /* FIXME: check if the address is multicast */ + seq_printf(seq, "%2X %2X ", sk->sk_type, 0); + + if (llc->dev) + llc_ui_format_mac(seq, llc->dev->dev_addr); + else + seq_printf(seq, "00:00:00:00:00:00"); + seq_printf(seq, "@%02X ", llc->sap->laddr.lsap); + llc_ui_format_mac(seq, llc->daddr.mac); + seq_printf(seq, "@%02X %8d %8d %2d %3d %4d\n", llc->daddr.lsap, + atomic_read(&sk->sk_wmem_alloc), + atomic_read(&sk->sk_rmem_alloc), + sk->sk_state, + sk->sk_socket ? 
SOCK_INODE(sk->sk_socket)->i_uid : -1, + llc->link); +out: + return 0; +} + +static char *llc_conn_state_names[] = { + [LLC_CONN_STATE_ADM] = "adm", + [LLC_CONN_STATE_SETUP] = "setup", + [LLC_CONN_STATE_NORMAL] = "normal", + [LLC_CONN_STATE_BUSY] = "busy", + [LLC_CONN_STATE_REJ] = "rej", + [LLC_CONN_STATE_AWAIT] = "await", + [LLC_CONN_STATE_AWAIT_BUSY] = "await_busy", + [LLC_CONN_STATE_AWAIT_REJ] = "await_rej", + [LLC_CONN_STATE_D_CONN] = "d_conn", + [LLC_CONN_STATE_RESET] = "reset", + [LLC_CONN_STATE_ERROR] = "error", + [LLC_CONN_STATE_TEMP] = "temp", +}; + +static int llc_seq_core_show(struct seq_file *seq, void *v) +{ + struct sock* sk; + struct llc_sock *llc; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Connection list:\n" + "dsap state retr txw rxw pf ff sf df rs cs " + "tack tpfc trs tbs blog busr\n"); + goto out; + } + sk = v; + llc = llc_sk(sk); + + seq_printf(seq, " %02X %-10s %3d %3d %3d %2d %2d %2d %2d %2d %2d " + "%4d %4d %3d %3d %4d %4d\n", + llc->daddr.lsap, llc_conn_state_names[llc->state], + llc->retry_count, llc->k, llc->rw, llc->p_flag, llc->f_flag, + llc->s_flag, llc->data_flag, llc->remote_busy_flag, + llc->cause_flag, timer_pending(&llc->ack_timer.timer), + timer_pending(&llc->pf_cycle_timer.timer), + timer_pending(&llc->rej_sent_timer.timer), + timer_pending(&llc->busy_state_timer.timer), + !!sk->sk_backlog.tail, !!sock_owned_by_user(sk)); +out: + return 0; +} + +static struct seq_operations llc_seq_socket_ops = { + .start = llc_seq_start, + .next = llc_seq_next, + .stop = llc_seq_stop, + .show = llc_seq_socket_show, +}; + +static struct seq_operations llc_seq_core_ops = { + .start = llc_seq_start, + .next = llc_seq_next, + .stop = llc_seq_stop, + .show = llc_seq_core_show, +}; + +static int llc_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &llc_seq_socket_ops); +} + +static int llc_seq_core_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &llc_seq_core_ops); +} + +static struct file_operations llc_seq_socket_fops = { + .owner = THIS_MODULE, + .open = llc_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations llc_seq_core_fops = { + .owner = THIS_MODULE, + .open = llc_seq_core_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *llc_proc_dir; + +int __init llc_proc_init(void) +{ + int rc = -ENOMEM; + struct proc_dir_entry *p; + + llc_proc_dir = proc_mkdir("llc", proc_net); + if (!llc_proc_dir) + goto out; + llc_proc_dir->owner = THIS_MODULE; + + p = create_proc_entry("socket", S_IRUGO, llc_proc_dir); + if (!p) + goto out_socket; + + p->proc_fops = &llc_seq_socket_fops; + + p = create_proc_entry("core", S_IRUGO, llc_proc_dir); + if (!p) + goto out_core; + + p->proc_fops = &llc_seq_core_fops; + + rc = 0; +out: + return rc; +out_core: + remove_proc_entry("socket", llc_proc_dir); +out_socket: + remove_proc_entry("llc", proc_net); + goto out; +} + +void llc_proc_exit(void) +{ + remove_proc_entry("socket", llc_proc_dir); + remove_proc_entry("core", llc_proc_dir); + remove_proc_entry("llc", proc_net); +} diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c new file mode 100644 index 000000000000..ed8ba7de6122 --- /dev/null +++ b/net/llc/llc_s_ac.c @@ -0,0 +1,205 @@ +/* + * llc_s_ac.c - actions performed during sap state transition. + * + * Description : + * Functions in this module are implementation of sap component actions. 
+ * Details of actions can be found in IEEE-802.2 standard document. + * All functions have one sap and one event as input argument. All of + * them return 0 On success and 1 otherwise. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include "llc_output.h" + +/** + * llc_sap_action_unit_data_ind - forward UI PDU to network layer + * @sap: SAP + * @skb: the event to forward + * + * Received a UI PDU from MAC layer; forward to network layer as a + * UNITDATA INDICATION; verify our event is the kind we expect + */ +int llc_sap_action_unitdata_ind(struct llc_sap *sap, struct sk_buff *skb) +{ + llc_sap_rtn_pdu(sap, skb); + return 0; +} + +/** + * llc_sap_action_send_ui - sends UI PDU resp to UNITDATA REQ to MAC layer + * @sap: SAP + * @skb: the event to send + * + * Sends a UI PDU to the MAC layer in response to a UNITDATA REQUEST + * primitive from the network layer. Verifies event is a primitive type of + * event. Verify the primitive is a UNITDATA REQUEST. + */ +int llc_sap_action_send_ui(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + int rc; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + ev->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_ui_cmd(skb); + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (!rc) + rc = dev_queue_xmit(skb); + return rc; +} + +/** + * llc_sap_action_send_xid_c - send XID PDU as response to XID REQ + * @sap: SAP + * @skb: the event to send + * + * Send a XID command PDU to MAC layer in response to a XID REQUEST + * primitive from the network layer. Verify event is a primitive type + * event. Verify the primitive is a XID REQUEST. + */ +int llc_sap_action_send_xid_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + int rc; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + ev->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_xid_cmd(skb, LLC_XID_NULL_CLASS_2, 0); + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (!rc) + rc = dev_queue_xmit(skb); + return rc; +} + +/** + * llc_sap_action_send_xid_r - send XID PDU resp to MAC for received XID + * @sap: SAP + * @skb: the event to send + * + * Send XID response PDU to MAC in response to an earlier received XID + * command PDU. 
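 * (Editor's note: the reply goes back to the sender, which is why the decode
 * calls in the body below deliberately load the received *source* address
 * into mac_da and the received destination into mac_sa before the response
 * header is built.)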
Verify event is a PDU type event + */ +int llc_sap_action_send_xid_r(struct llc_sap *sap, struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap; + int rc = 1; + struct sk_buff *nskb; + + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_da(skb, mac_sa); + llc_pdu_decode_ssap(skb, &dsap); + nskb = llc_alloc_frame(); + if (!nskb) + goto out; + nskb->dev = skb->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, + LLC_PDU_RSP); + llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 0); + rc = llc_mac_hdr_init(nskb, mac_sa, mac_da); + if (!rc) + rc = dev_queue_xmit(nskb); +out: + return rc; +} + +/** + * llc_sap_action_send_test_c - send TEST PDU to MAC in resp to TEST REQ + * @sap: SAP + * @skb: the event to send + * + * Send a TEST command PDU to the MAC layer in response to a TEST REQUEST + * primitive from the network layer. Verify event is a primitive type + * event; verify the primitive is a TEST REQUEST. + */ +int llc_sap_action_send_test_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + int rc; + + llc_pdu_header_init(skb, LLC_PDU_TYPE_U, ev->saddr.lsap, + ev->daddr.lsap, LLC_PDU_CMD); + llc_pdu_init_as_test_cmd(skb); + rc = llc_mac_hdr_init(skb, ev->saddr.mac, ev->daddr.mac); + if (!rc) + rc = dev_queue_xmit(skb); + return rc; +} + +int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], mac_sa[ETH_ALEN], dsap; + struct sk_buff *nskb; + int rc = 1; + + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_da(skb, mac_sa); + llc_pdu_decode_ssap(skb, &dsap); + nskb = llc_alloc_frame(); + if (!nskb) + goto out; + nskb->dev = skb->dev; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, sap->laddr.lsap, dsap, + LLC_PDU_RSP); + llc_pdu_init_as_test_rsp(nskb, skb); + rc = llc_mac_hdr_init(nskb, mac_sa, mac_da); + if (!rc) + rc = dev_queue_xmit(nskb); +out: + return rc; +} + +/** + * llc_sap_action_report_status - report data link status to layer mgmt + * @sap: SAP + * @skb: the event to send + * + * Report data link status to layer management. Verify our event is the + * kind we expect. + */ +int llc_sap_action_report_status(struct llc_sap *sap, struct sk_buff *skb) +{ + return 0; +} + +/** + * llc_sap_action_xid_ind - send XID PDU resp to net layer via XID IND + * @sap: SAP + * @skb: the event to send + * + * Send a XID response PDU to the network layer via a XID INDICATION + * primitive. + */ +int llc_sap_action_xid_ind(struct llc_sap *sap, struct sk_buff *skb) +{ + llc_sap_rtn_pdu(sap, skb); + return 0; +} + +/** + * llc_sap_action_test_ind - send TEST PDU to net layer via TEST IND + * @sap: SAP + * @skb: the event to send + * + * Send a TEST response PDU to the network layer via a TEST INDICATION + * primitive. Verify our event is a PDU type event. + */ +int llc_sap_action_test_ind(struct llc_sap *sap, struct sk_buff *skb) +{ + llc_sap_rtn_pdu(sap, skb); + return 0; +} diff --git a/net/llc/llc_s_ev.c b/net/llc/llc_s_ev.c new file mode 100644 index 000000000000..a74d2a1d6581 --- /dev/null +++ b/net/llc/llc_s_ev.c @@ -0,0 +1,115 @@ +/* + * llc_s_ev.c - Defines SAP component events + * + * The followed event functions are SAP component events which are described + * in 802.2 LLC protocol standard document. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. 
+ * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include + +int llc_sap_ev_activation_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_SIMPLE && + ev->prim_type == LLC_SAP_EV_ACTIVATION_REQ ? 0 : 1; +} + +int llc_sap_ev_rx_ui(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_UI ? 0 : 1; +} + +int llc_sap_ev_unitdata_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_PRIM && + ev->prim == LLC_DATAUNIT_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; + +} + +int llc_sap_ev_xid_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_PRIM && + ev->prim == LLC_XID_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_sap_ev_rx_xid_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1; +} + +int llc_sap_ev_rx_xid_r(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID ? 0 : 1; +} + +int llc_sap_ev_test_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_PRIM && + ev->prim == LLC_TEST_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +int llc_sap_ev_rx_test_c(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_CMD(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1; +} + +int llc_sap_ev_rx_test_r(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_SAP_EV_TYPE_PDU && LLC_PDU_IS_RSP(pdu) && + LLC_PDU_TYPE_IS_U(pdu) && + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_TEST ? 0 : 1; +} + +int llc_sap_ev_deactivation_req(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + return ev->type == LLC_SAP_EV_TYPE_SIMPLE && + ev->prim_type == LLC_SAP_EV_DEACTIVATION_REQ ? 0 : 1; +} diff --git a/net/llc/llc_s_st.c b/net/llc/llc_s_st.c new file mode 100644 index 000000000000..6a43201aa32e --- /dev/null +++ b/net/llc/llc_s_st.c @@ -0,0 +1,183 @@ +/* + * llc_s_st.c - Defines SAP component state machine transitions. + * + * The followed transitions are SAP component state machine transitions + * which are described in 802.2 LLC protocol standard document. + * + * Copyright (c) 1997 by Procom Technology, Inc. 
+ * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include + +/* dummy last-transition indicator; common to all state transition groups + * last entry for this state + * all members are zeros, .bss zeroes it + */ +static struct llc_sap_state_trans llc_sap_state_trans_end; + +/* state LLC_SAP_STATE_INACTIVE transition for + * LLC_SAP_EV_ACTIVATION_REQ event + */ +static llc_sap_action_t llc_sap_inactive_state_actions_1[] = { + [0] = llc_sap_action_report_status, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_inactive_state_trans_1 = { + .ev = llc_sap_ev_activation_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_inactive_state_actions_1, +}; + +/* array of pointers; one to each transition */ +static struct llc_sap_state_trans *llc_sap_inactive_state_transitions[] = { + [0] = &llc_sap_inactive_state_trans_1, + [1] = &llc_sap_state_trans_end, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_UI event */ +static llc_sap_action_t llc_sap_active_state_actions_1[] = { + [0] = llc_sap_action_unitdata_ind, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_1 = { + .ev = llc_sap_ev_rx_ui, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_1, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_UNITDATA_REQ event */ +static llc_sap_action_t llc_sap_active_state_actions_2[] = { + [0] = llc_sap_action_send_ui, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_2 = { + .ev = llc_sap_ev_unitdata_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_2, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_XID_REQ event */ +static llc_sap_action_t llc_sap_active_state_actions_3[] = { + [0] = llc_sap_action_send_xid_c, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_3 = { + .ev = llc_sap_ev_xid_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_3, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_C event */ +static llc_sap_action_t llc_sap_active_state_actions_4[] = { + [0] = llc_sap_action_send_xid_r, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_4 = { + .ev = llc_sap_ev_rx_xid_c, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_4, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_XID_R event */ +static llc_sap_action_t llc_sap_active_state_actions_5[] = { + [0] = llc_sap_action_xid_ind, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_5 = { + .ev = llc_sap_ev_rx_xid_r, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_5, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_TEST_REQ event */ +static llc_sap_action_t llc_sap_active_state_actions_6[] = { + [0] = llc_sap_action_send_test_c, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_6 = { + .ev = llc_sap_ev_test_req, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_6, 
+}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_C event */ +static llc_sap_action_t llc_sap_active_state_actions_7[] = { + [0] = llc_sap_action_send_test_r, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_7 = { + .ev = llc_sap_ev_rx_test_c, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_7 +}; + +/* state LLC_SAP_STATE_ACTIVE transition for LLC_SAP_EV_RX_TEST_R event */ +static llc_sap_action_t llc_sap_active_state_actions_8[] = { + [0] = llc_sap_action_test_ind, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_8 = { + .ev = llc_sap_ev_rx_test_r, + .next_state = LLC_SAP_STATE_ACTIVE, + .ev_actions = llc_sap_active_state_actions_8, +}; + +/* state LLC_SAP_STATE_ACTIVE transition for + * LLC_SAP_EV_DEACTIVATION_REQ event + */ +static llc_sap_action_t llc_sap_active_state_actions_9[] = { + [0] = llc_sap_action_report_status, + [1] = NULL, +}; + +static struct llc_sap_state_trans llc_sap_active_state_trans_9 = { + .ev = llc_sap_ev_deactivation_req, + .next_state = LLC_SAP_STATE_INACTIVE, + .ev_actions = llc_sap_active_state_actions_9 +}; + +/* array of pointers; one to each transition */ +static struct llc_sap_state_trans *llc_sap_active_state_transitions[] = { + [0] = &llc_sap_active_state_trans_2, + [1] = &llc_sap_active_state_trans_1, + [2] = &llc_sap_active_state_trans_3, + [3] = &llc_sap_active_state_trans_4, + [4] = &llc_sap_active_state_trans_5, + [5] = &llc_sap_active_state_trans_6, + [6] = &llc_sap_active_state_trans_7, + [7] = &llc_sap_active_state_trans_8, + [8] = &llc_sap_active_state_trans_9, + [9] = &llc_sap_state_trans_end, +}; + +/* SAP state transition table */ +struct llc_sap_state llc_sap_state_table[LLC_NR_SAP_STATES] = { + [LLC_SAP_STATE_INACTIVE - 1] = { + .curr_state = LLC_SAP_STATE_INACTIVE, + .transitions = llc_sap_inactive_state_transitions, + }, + [LLC_SAP_STATE_ACTIVE - 1] = { + .curr_state = LLC_SAP_STATE_ACTIVE, + .transitions = llc_sap_active_state_transitions, + }, +}; diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c new file mode 100644 index 000000000000..965c94eb4bbc --- /dev/null +++ b/net/llc/llc_sap.c @@ -0,0 +1,316 @@ +/* + * llc_sap.c - driver routines for SAP component. + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * llc_alloc_frame - allocates sk_buff for frame + * + * Allocates an sk_buff for frame and initializes sk_buff fields. + * Returns allocated skb or %NULL when out of memory. + */ +struct sk_buff *llc_alloc_frame(void) +{ + struct sk_buff *skb = alloc_skb(128, GFP_ATOMIC); + + if (skb) { + skb_reserve(skb, 50); + skb->nh.raw = skb->h.raw = skb->data; + skb->protocol = htons(ETH_P_802_2); + skb->dev = dev_base->next; + skb->mac.raw = skb->head; + } + return skb; +} + +void llc_save_primitive(struct sk_buff* skb, u8 prim) +{ + struct sockaddr_llc *addr = llc_ui_skb_cb(skb); + + /* save primitive for use by the user. 
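	 * The sockaddr_llc is assembled directly in skb->cb via
	 * llc_ui_skb_cb().  Editor's note: presumably so the PF_LLC
	 * receive path can hand it back to user space as the message's
	 * source address -- an inference from the llc_ui usage, not
	 * stated here.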
*/ + addr->sllc_family = skb->sk->sk_family; + addr->sllc_arphrd = skb->dev->type; + addr->sllc_test = prim == LLC_TEST_PRIM; + addr->sllc_xid = prim == LLC_XID_PRIM; + addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; + llc_pdu_decode_sa(skb, addr->sllc_mac); + llc_pdu_decode_ssap(skb, &addr->sllc_sap); +} + +/** + * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu. + * @sap: pointer to SAP + * @skb: received pdu + */ +void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + switch (LLC_U_PDU_RSP(pdu)) { + case LLC_1_PDU_CMD_TEST: + ev->prim = LLC_TEST_PRIM; break; + case LLC_1_PDU_CMD_XID: + ev->prim = LLC_XID_PRIM; break; + case LLC_1_PDU_CMD_UI: + ev->prim = LLC_DATAUNIT_PRIM; break; + } + ev->ind_cfm_flag = LLC_IND; +} + +/** + * llc_find_sap_trans - finds transition for event + * @sap: pointer to SAP + * @skb: happened event + * + * This function finds transition that matches with happened event. + * Returns the pointer to found transition on success or %NULL for + * failure. + */ +static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, + struct sk_buff* skb) +{ + int i = 0; + struct llc_sap_state_trans *rc = NULL; + struct llc_sap_state_trans **next_trans; + struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1]; + /* + * Search thru events for this state until list exhausted or until + * its obvious the event is not valid for the current state + */ + for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) + if (!next_trans[i]->ev(sap, skb)) { + rc = next_trans[i]; /* got event match; return it */ + break; + } + return rc; +} + +/** + * llc_exec_sap_trans_actions - execute actions related to event + * @sap: pointer to SAP + * @trans: pointer to transition that it's actions must be performed + * @skb: happened event. + * + * This function executes actions that is related to happened event. + * Returns 0 for success and 1 for failure of at least one action. + */ +static int llc_exec_sap_trans_actions(struct llc_sap *sap, + struct llc_sap_state_trans *trans, + struct sk_buff *skb) +{ + int rc = 0; + llc_sap_action_t *next_action = trans->ev_actions; + + for (; next_action && *next_action; next_action++) + if ((*next_action)(sap, skb)) + rc = 1; + return rc; +} + +/** + * llc_sap_next_state - finds transition, execs actions & change SAP state + * @sap: pointer to SAP + * @skb: happened event + * + * This function finds transition that matches with happened event, then + * executes related actions and finally changes state of SAP. It returns + * 0 on success and 1 for failure. 
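 *
 * (Editor's note, not in the original patch: llc_find_sap_trans() above scans
 * the current state's transition array until it reaches an entry whose ->ev
 * is NULL -- the all-zero llc_sap_state_trans_end sentinel -- and the actions
 * of a matching transition run *before* sap->state is updated, so a failing
 * action leaves the SAP in its old state.)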
+ */ +static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb) +{ + int rc = 1; + struct llc_sap_state_trans *trans; + + if (sap->state > LLC_NR_SAP_STATES) + goto out; + trans = llc_find_sap_trans(sap, skb); + if (!trans) + goto out; + /* + * Got the state to which we next transition; perform the actions + * associated with this transition before actually transitioning to the + * next state + */ + rc = llc_exec_sap_trans_actions(sap, trans, skb); + if (rc) + goto out; + /* + * Transition SAP to next state if all actions execute successfully + */ + sap->state = trans->next_state; +out: + return rc; +} + +/** + * llc_sap_state_process - sends event to SAP state machine + * @sap: sap to use + * @skb: pointer to occurred event + * + * After executing actions of the event, upper layer will be indicated + * if needed(on receiving an UI frame). sk can be null for the + * datalink_proto case. + */ +static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + /* + * We have to hold the skb, because llc_sap_next_state + * will kfree it in the sending path and we need to + * look at the skb->cb, where we encode llc_sap_state_ev. + */ + skb_get(skb); + ev->ind_cfm_flag = 0; + llc_sap_next_state(sap, skb); + if (ev->ind_cfm_flag == LLC_IND) { + if (skb->sk->sk_state == TCP_LISTEN) + kfree_skb(skb); + else { + llc_save_primitive(skb, ev->prim); + + /* queue skb to the user. */ + if (sock_queue_rcv_skb(skb->sk, skb)) + kfree_skb(skb); + } + } + kfree_skb(skb); +} + +/** + * llc_build_and_send_test_pkt - TEST interface for upper layers. + * @sap: sap to use + * @skb: packet to send + * @dmac: destination mac address + * @dsap: destination sap + * + * This function is called when upper layer wants to send a TEST pdu. + * Returns 0 for success, 1 otherwise. + */ +void llc_build_and_send_test_pkt(struct llc_sap *sap, + struct sk_buff *skb, u8 *dmac, u8 dsap) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + ev->saddr.lsap = sap->laddr.lsap; + ev->daddr.lsap = dsap; + memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); + memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); + + ev->type = LLC_SAP_EV_TYPE_PRIM; + ev->prim = LLC_TEST_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + llc_sap_state_process(sap, skb); +} + +/** + * llc_build_and_send_xid_pkt - XID interface for upper layers + * @sap: sap to use + * @skb: packet to send + * @dmac: destination mac address + * @dsap: destination sap + * + * This function is called when upper layer wants to send a XID pdu. + * Returns 0 for success, 1 otherwise. + */ +void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, + u8 *dmac, u8 dsap) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + ev->saddr.lsap = sap->laddr.lsap; + ev->daddr.lsap = dsap; + memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); + memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); + + ev->type = LLC_SAP_EV_TYPE_PRIM; + ev->prim = LLC_XID_PRIM; + ev->prim_type = LLC_PRIM_TYPE_REQ; + llc_sap_state_process(sap, skb); +} + +/** + * llc_sap_rcv - sends received pdus to the sap state machine + * @sap: current sap component structure. + * @skb: received frame. + * + * Sends received pdus to the sap state machine. 
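 * (Editor's note: tagging the event LLC_SAP_EV_TYPE_PDU here is what lets the
 * llc_sap_ev_rx_* guards in llc_s_ev.c match it, in contrast to the
 * primitive-type events built by llc_build_and_send_test_pkt() and
 * llc_build_and_send_xid_pkt() above.)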
+ */ +static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_sap_state_ev *ev = llc_sap_ev(skb); + + ev->type = LLC_SAP_EV_TYPE_PDU; + ev->reason = 0; + llc_sap_state_process(sap, skb); +} + +/** + * llc_lookup_dgram - Finds dgram socket for the local sap/mac + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search socket list of the SAP and finds connection using the local + * mac, and local sap. Returns pointer for socket found, %NULL otherwise. + */ +static struct sock *llc_lookup_dgram(struct llc_sap *sap, + struct llc_addr *laddr) +{ + struct sock *rc; + struct hlist_node *node; + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(rc, node, &sap->sk_list.list) { + struct llc_sock *llc = llc_sk(rc); + + if (rc->sk_type == SOCK_DGRAM && + llc->laddr.lsap == laddr->lsap && + llc_mac_match(llc->laddr.mac, laddr->mac)) { + sock_hold(rc); + goto found; + } + } + rc = NULL; +found: + read_unlock_bh(&sap->sk_list.lock); + return rc; +} + +void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) +{ + struct llc_addr laddr; + struct sock *sk; + + llc_pdu_decode_da(skb, laddr.mac); + llc_pdu_decode_dsap(skb, &laddr.lsap); + + sk = llc_lookup_dgram(sap, &laddr); + if (sk) { + skb->sk = sk; + llc_sap_rcv(sap, skb); + sock_put(sk); + } else + kfree_skb(skb); +} diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c new file mode 100644 index 000000000000..8fe48a24bad5 --- /dev/null +++ b/net/llc/llc_station.c @@ -0,0 +1,713 @@ +/* + * llc_station.c - station component of LLC + * + * Copyright (c) 1997 by Procom Technology, Inc. + * 2001-2003 by Arnaldo Carvalho de Melo + * + * This program can be redistributed or modified under the terms of the + * GNU General Public License as published by the Free Software Foundation. + * This program is distributed without any warranty or implied warranty + * of merchantability or fitness for a particular purpose. + * + * See the GNU General Public License for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * struct llc_station - LLC station component + * + * SAP and connection resource manager, one per adapter. + * + * @state - state of station + * @xid_r_count - XID response PDU counter + * @mac_sa - MAC source address + * @sap_list - list of related SAPs + * @ev_q - events entering state mach. 
+ * @mac_pdu_q - PDUs ready to send to MAC + */ +struct llc_station { + u8 state; + u8 xid_r_count; + struct timer_list ack_timer; + u8 retry_count; + u8 maximum_retry; + struct { + struct sk_buff_head list; + spinlock_t lock; + } ev_q; + struct sk_buff_head mac_pdu_q; +}; + +/* Types of events (possible values in 'ev->type') */ +#define LLC_STATION_EV_TYPE_SIMPLE 1 +#define LLC_STATION_EV_TYPE_CONDITION 2 +#define LLC_STATION_EV_TYPE_PRIM 3 +#define LLC_STATION_EV_TYPE_PDU 4 /* command/response PDU */ +#define LLC_STATION_EV_TYPE_ACK_TMR 5 +#define LLC_STATION_EV_TYPE_RPT_STATUS 6 + +/* Events */ +#define LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK 1 +#define LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK 2 +#define LLC_STATION_EV_ACK_TMR_EXP_LT_RETRY_CNT_MAX_RETRY 3 +#define LLC_STATION_EV_ACK_TMR_EXP_EQ_RETRY_CNT_MAX_RETRY 4 +#define LLC_STATION_EV_RX_NULL_DSAP_XID_C 5 +#define LLC_STATION_EV_RX_NULL_DSAP_0_XID_R_XID_R_CNT_EQ 6 +#define LLC_STATION_EV_RX_NULL_DSAP_1_XID_R_XID_R_CNT_EQ 7 +#define LLC_STATION_EV_RX_NULL_DSAP_TEST_C 8 +#define LLC_STATION_EV_DISABLE_REQ 9 + +struct llc_station_state_ev { + u8 type; + u8 prim; + u8 prim_type; + u8 reason; + struct list_head node; /* node in station->ev_q.list */ +}; + +static __inline__ struct llc_station_state_ev * + llc_station_ev(struct sk_buff *skb) +{ + return (struct llc_station_state_ev *)skb->cb; +} + +typedef int (*llc_station_ev_t)(struct sk_buff *skb); + +#define LLC_STATION_STATE_DOWN 1 /* initial state */ +#define LLC_STATION_STATE_DUP_ADDR_CHK 2 +#define LLC_STATION_STATE_UP 3 + +#define LLC_NBR_STATION_STATES 3 /* size of state table */ + +typedef int (*llc_station_action_t)(struct sk_buff *skb); + +/* Station component state table structure */ +struct llc_station_state_trans { + llc_station_ev_t ev; + u8 next_state; + llc_station_action_t *ev_actions; +}; + +struct llc_station_state { + u8 curr_state; + struct llc_station_state_trans **transitions; +}; + +static struct llc_station llc_main_station; + +static int llc_stat_ev_enable_with_dup_addr_check(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_SIMPLE && + ev->prim_type == + LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK ? 0 : 1; +} + +static int llc_stat_ev_enable_without_dup_addr_check(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_SIMPLE && + ev->prim_type == + LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK ? 0 : 1; +} + +static int llc_stat_ev_ack_tmr_exp_lt_retry_cnt_max_retry(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_ACK_TMR && + llc_main_station.retry_count < + llc_main_station.maximum_retry ? 0 : 1; +} + +static int llc_stat_ev_ack_tmr_exp_eq_retry_cnt_max_retry(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_ACK_TMR && + llc_main_station.retry_count == + llc_main_station.maximum_retry ? 0 : 1; +} + +static int llc_stat_ev_rx_null_dsap_xid_c(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_CMD(pdu) && /* command PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_XID && + !pdu->dsap ? 
0 : 1; /* NULL DSAP value */ +} + +static int llc_stat_ev_rx_null_dsap_0_xid_r_xid_r_cnt_eq(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_RSP(pdu) && /* response PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID && + !pdu->dsap && /* NULL DSAP value */ + !llc_main_station.xid_r_count ? 0 : 1; +} + +static int llc_stat_ev_rx_null_dsap_1_xid_r_xid_r_cnt_eq(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_RSP(pdu) && /* response PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_RSP(pdu) == LLC_1_PDU_CMD_XID && + !pdu->dsap && /* NULL DSAP value */ + llc_main_station.xid_r_count == 1 ? 0 : 1; +} + +static int llc_stat_ev_rx_null_dsap_test_c(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); + + return ev->type == LLC_STATION_EV_TYPE_PDU && + LLC_PDU_IS_CMD(pdu) && /* command PDU */ + LLC_PDU_TYPE_IS_U(pdu) && /* U type PDU */ + LLC_U_PDU_CMD(pdu) == LLC_1_PDU_CMD_TEST && + !pdu->dsap ? 0 : 1; /* NULL DSAP */ +} + +static int llc_stat_ev_disable_req(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + return ev->type == LLC_STATION_EV_TYPE_PRIM && + ev->prim == LLC_DISABLE_PRIM && + ev->prim_type == LLC_PRIM_TYPE_REQ ? 0 : 1; +} + +/** + * llc_station_send_pdu - queues PDU to send + * @skb: Address of the PDU + * + * Queues a PDU to send to the MAC layer. + */ +static void llc_station_send_pdu(struct sk_buff *skb) +{ + skb_queue_tail(&llc_main_station.mac_pdu_q, skb); + while ((skb = skb_dequeue(&llc_main_station.mac_pdu_q)) != NULL) + if (dev_queue_xmit(skb)) + break; +} + +static int llc_station_ac_start_ack_timer(struct sk_buff *skb) +{ + mod_timer(&llc_main_station.ack_timer, jiffies + LLC_ACK_TIME * HZ); + return 0; +} + +static int llc_station_ac_set_retry_cnt_0(struct sk_buff *skb) +{ + llc_main_station.retry_count = 0; + return 0; +} + +static int llc_station_ac_inc_retry_cnt_by_1(struct sk_buff *skb) +{ + llc_main_station.retry_count++; + return 0; +} + +static int llc_station_ac_set_xid_r_cnt_0(struct sk_buff *skb) +{ + llc_main_station.xid_r_count = 0; + return 0; +} + +static int llc_station_ac_inc_xid_r_cnt_by_1(struct sk_buff *skb) +{ + llc_main_station.xid_r_count++; + return 0; +} + +static int llc_station_ac_send_null_dsap_xid_c(struct sk_buff *skb) +{ + int rc = 1; + struct sk_buff *nskb = llc_alloc_frame(); + + if (!nskb) + goto out; + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, 0, LLC_PDU_CMD); + llc_pdu_init_as_xid_cmd(nskb, LLC_XID_NULL_CLASS_2, 127); + rc = llc_mac_hdr_init(nskb, llc_station_mac_sa, llc_station_mac_sa); + if (rc) + goto free; + llc_station_send_pdu(nskb); +out: + return rc; +free: + kfree_skb(skb); + goto out; +} + +static int llc_station_ac_send_xid_r(struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], dsap; + int rc = 1; + struct sk_buff* nskb = llc_alloc_frame(); + + if (!nskb) + goto out; + rc = 0; + nskb->dev = skb->dev; + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_ssap(skb, &dsap); + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); + llc_pdu_init_as_xid_rsp(nskb, LLC_XID_NULL_CLASS_2, 127); + rc = llc_mac_hdr_init(nskb, llc_station_mac_sa, mac_da); + if (rc) + goto free; + 
llc_station_send_pdu(nskb); +out: + return rc; +free: + kfree_skb(skb); + goto out; +} + +static int llc_station_ac_send_test_r(struct sk_buff *skb) +{ + u8 mac_da[ETH_ALEN], dsap; + int rc = 1; + struct sk_buff *nskb = llc_alloc_frame(); + + if (!nskb) + goto out; + rc = 0; + nskb->dev = skb->dev; + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_ssap(skb, &dsap); + llc_pdu_header_init(nskb, LLC_PDU_TYPE_U, 0, dsap, LLC_PDU_RSP); + llc_pdu_init_as_test_rsp(nskb, skb); + rc = llc_mac_hdr_init(nskb, llc_station_mac_sa, mac_da); + if (rc) + goto free; + llc_station_send_pdu(nskb); +out: + return rc; +free: + kfree_skb(skb); + goto out; +} + +static int llc_station_ac_report_status(struct sk_buff *skb) +{ + return 0; +} + +/* COMMON STATION STATE transitions */ + +/* dummy last-transition indicator; common to all state transition groups + * last entry for this state + * all members are zeros, .bss zeroes it + */ +static struct llc_station_state_trans llc_stat_state_trans_end; + +/* DOWN STATE transitions */ + +/* state transition for LLC_STATION_EV_ENABLE_WITH_DUP_ADDR_CHECK event */ +static llc_station_action_t llc_stat_down_state_actions_1[] = { + [0] = llc_station_ac_start_ack_timer, + [1] = llc_station_ac_set_retry_cnt_0, + [2] = llc_station_ac_set_xid_r_cnt_0, + [3] = llc_station_ac_send_null_dsap_xid_c, + [4] = NULL, +}; + +static struct llc_station_state_trans llc_stat_down_state_trans_1 = { + .ev = llc_stat_ev_enable_with_dup_addr_check, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_down_state_actions_1, +}; + +/* state transition for LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK event */ +static llc_station_action_t llc_stat_down_state_actions_2[] = { + [0] = llc_station_ac_report_status, /* STATION UP */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_down_state_trans_2 = { + .ev = llc_stat_ev_enable_without_dup_addr_check, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_down_state_actions_2, +}; + +/* array of pointers; one to each transition */ +static struct llc_station_state_trans *llc_stat_dwn_state_trans[] = { + [0] = &llc_stat_down_state_trans_1, + [1] = &llc_stat_down_state_trans_2, + [2] = &llc_stat_state_trans_end, +}; + +/* UP STATE transitions */ +/* state transition for LLC_STATION_EV_DISABLE_REQ event */ +static llc_station_action_t llc_stat_up_state_actions_1[] = { + [0] = llc_station_ac_report_status, /* STATION DOWN */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_up_state_trans_1 = { + .ev = llc_stat_ev_disable_req, + .next_state = LLC_STATION_STATE_DOWN, + .ev_actions = llc_stat_up_state_actions_1, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_XID_C event */ +static llc_station_action_t llc_stat_up_state_actions_2[] = { + [0] = llc_station_ac_send_xid_r, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_up_state_trans_2 = { + .ev = llc_stat_ev_rx_null_dsap_xid_c, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_up_state_actions_2, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_TEST_C event */ +static llc_station_action_t llc_stat_up_state_actions_3[] = { + [0] = llc_station_ac_send_test_r, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_up_state_trans_3 = { + .ev = llc_stat_ev_rx_null_dsap_test_c, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_up_state_actions_3, +}; + +/* array of pointers; one to each transition */ +static struct llc_station_state_trans *llc_stat_up_state_trans [] = { + 
[0] = &llc_stat_up_state_trans_1, + [1] = &llc_stat_up_state_trans_2, + [2] = &llc_stat_up_state_trans_3, + [3] = &llc_stat_state_trans_end, +}; + +/* DUP ADDR CHK STATE transitions */ +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_0_XID_R_XID_R_CNT_EQ + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_1[] = { + [0] = llc_station_ac_inc_xid_r_cnt_by_1, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_1 = { + .ev = llc_stat_ev_rx_null_dsap_0_xid_r_xid_r_cnt_eq, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_dupaddr_state_actions_1, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_1_XID_R_XID_R_CNT_EQ + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_2[] = { + [0] = llc_station_ac_report_status, /* DUPLICATE ADDRESS FOUND */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_2 = { + .ev = llc_stat_ev_rx_null_dsap_1_xid_r_xid_r_cnt_eq, + .next_state = LLC_STATION_STATE_DOWN, + .ev_actions = llc_stat_dupaddr_state_actions_2, +}; + +/* state transition for LLC_STATION_EV_RX_NULL_DSAP_XID_C event */ +static llc_station_action_t llc_stat_dupaddr_state_actions_3[] = { + [0] = llc_station_ac_send_xid_r, + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_3 = { + .ev = llc_stat_ev_rx_null_dsap_xid_c, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_dupaddr_state_actions_3, +}; + +/* state transition for LLC_STATION_EV_ACK_TMR_EXP_LT_RETRY_CNT_MAX_RETRY + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_4[] = { + [0] = llc_station_ac_start_ack_timer, + [1] = llc_station_ac_inc_retry_cnt_by_1, + [2] = llc_station_ac_set_xid_r_cnt_0, + [3] = llc_station_ac_send_null_dsap_xid_c, + [4] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_4 = { + .ev = llc_stat_ev_ack_tmr_exp_lt_retry_cnt_max_retry, + .next_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .ev_actions = llc_stat_dupaddr_state_actions_4, +}; + +/* state transition for LLC_STATION_EV_ACK_TMR_EXP_EQ_RETRY_CNT_MAX_RETRY + * event + */ +static llc_station_action_t llc_stat_dupaddr_state_actions_5[] = { + [0] = llc_station_ac_report_status, /* STATION UP */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_5 = { + .ev = llc_stat_ev_ack_tmr_exp_eq_retry_cnt_max_retry, + .next_state = LLC_STATION_STATE_UP, + .ev_actions = llc_stat_dupaddr_state_actions_5, +}; + +/* state transition for LLC_STATION_EV_DISABLE_REQ event */ +static llc_station_action_t llc_stat_dupaddr_state_actions_6[] = { + [0] = llc_station_ac_report_status, /* STATION DOWN */ + [1] = NULL, +}; + +static struct llc_station_state_trans llc_stat_dupaddr_state_trans_6 = { + .ev = llc_stat_ev_disable_req, + .next_state = LLC_STATION_STATE_DOWN, + .ev_actions = llc_stat_dupaddr_state_actions_6, +}; + +/* array of pointers; one to each transition */ +static struct llc_station_state_trans *llc_stat_dupaddr_state_trans[] = { + [0] = &llc_stat_dupaddr_state_trans_6, /* Request */ + [1] = &llc_stat_dupaddr_state_trans_4, /* Timer */ + [2] = &llc_stat_dupaddr_state_trans_5, + [3] = &llc_stat_dupaddr_state_trans_1, /* Receive frame */ + [4] = &llc_stat_dupaddr_state_trans_2, + [5] = &llc_stat_dupaddr_state_trans_3, + [6] = &llc_stat_state_trans_end, +}; + +static struct llc_station_state + llc_station_state_table[LLC_NBR_STATION_STATES] = { + [LLC_STATION_STATE_DOWN - 1] = { + 
.curr_state = LLC_STATION_STATE_DOWN, + .transitions = llc_stat_dwn_state_trans, + }, + [LLC_STATION_STATE_DUP_ADDR_CHK - 1] = { + .curr_state = LLC_STATION_STATE_DUP_ADDR_CHK, + .transitions = llc_stat_dupaddr_state_trans, + }, + [LLC_STATION_STATE_UP - 1] = { + .curr_state = LLC_STATION_STATE_UP, + .transitions = llc_stat_up_state_trans, + }, +}; + +/** + * llc_exec_station_trans_actions - executes actions for transition + * @trans: Address of the transition + * @skb: Address of the event that caused the transition + * + * Executes actions of a transition of the station state machine. Returns + * 0 if all actions complete successfully, nonzero otherwise. + */ +static u16 llc_exec_station_trans_actions(struct llc_station_state_trans *trans, + struct sk_buff *skb) +{ + u16 rc = 0; + llc_station_action_t *next_action = trans->ev_actions; + + for (; next_action && *next_action; next_action++) + if ((*next_action)(skb)) + rc = 1; + return rc; +} + +/** + * llc_find_station_trans - finds transition for this event + * @skb: Address of the event + * + * Search thru events of the current state of the station until list + * exhausted or it's obvious that the event is not valid for the current + * state. Returns the address of the transition if cound, %NULL otherwise. + */ +static struct llc_station_state_trans * + llc_find_station_trans(struct sk_buff *skb) +{ + int i = 0; + struct llc_station_state_trans *rc = NULL; + struct llc_station_state_trans **next_trans; + struct llc_station_state *curr_state = + &llc_station_state_table[llc_main_station.state - 1]; + + for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) + if (!next_trans[i]->ev(skb)) { + rc = next_trans[i]; + break; + } + return rc; +} + +/** + * llc_station_free_ev - frees an event + * @skb: Address of the event + * + * Frees an event. + */ +static void llc_station_free_ev(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + if (ev->type == LLC_STATION_EV_TYPE_PDU) + kfree_skb(skb); +} + +/** + * llc_station_next_state - processes event and goes to the next state + * @skb: Address of the event + * + * Processes an event, executes any transitions related to that event and + * updates the state of the station. 
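 *
 * (Editor's note, not in the original patch: the lookup mirrors the SAP code --
 * llc_find_station_trans() walks the state's transition array until the
 * all-zero llc_stat_state_trans_end sentinel -- and the event skb is released
 * on the way out through llc_station_free_ev(), which only frees PDU-type
 * events.)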
+ */ +static u16 llc_station_next_state(struct sk_buff *skb) +{ + u16 rc = 1; + struct llc_station_state_trans *trans; + + if (llc_main_station.state > LLC_NBR_STATION_STATES) + goto out; + trans = llc_find_station_trans(skb); + if (trans) { + /* got the state to which we next transition; perform the + * actions associated with this transition before actually + * transitioning to the next state + */ + rc = llc_exec_station_trans_actions(trans, skb); + if (!rc) + /* transition station to next state if all actions + * execute successfully; done; wait for next event + */ + llc_main_station.state = trans->next_state; + } else + /* event not recognized in current state; re-queue it for + * processing again at a later time; return failure + */ + rc = 0; +out: + llc_station_free_ev(skb); + return rc; +} + +/** + * llc_station_service_events - service events in the queue + * + * Get an event from the station event queue (if any); attempt to service + * the event; if event serviced, get the next event (if any) on the event + * queue; if event not service, re-queue the event on the event queue and + * attempt to service the next event; when serviced all events in queue, + * finished; if don't transition to different state, just service all + * events once; if transition to new state, service all events again. + * Caller must hold llc_main_station.ev_q.lock. + */ +static void llc_station_service_events(void) +{ + struct sk_buff *skb; + + while ((skb = skb_dequeue(&llc_main_station.ev_q.list)) != NULL) + llc_station_next_state(skb); +} + +/** + * llc_station_state_process: queue event and try to process queue. + * @skb: Address of the event + * + * Queues an event (on the station event queue) for handling by the + * station state machine and attempts to process any queued-up events. + */ +static void llc_station_state_process(struct sk_buff *skb) +{ + spin_lock_bh(&llc_main_station.ev_q.lock); + skb_queue_tail(&llc_main_station.ev_q.list, skb); + llc_station_service_events(); + spin_unlock_bh(&llc_main_station.ev_q.lock); +} + +static void llc_station_ack_tmr_cb(unsigned long timeout_data) +{ + struct sk_buff *skb = alloc_skb(0, GFP_ATOMIC); + + if (skb) { + struct llc_station_state_ev *ev = llc_station_ev(skb); + + ev->type = LLC_STATION_EV_TYPE_ACK_TMR; + llc_station_state_process(skb); + } +} + +/* + * llc_station_rcv - send received pdu to the station state machine + * @skb: received frame. + * + * Sends data unit to station state machine. 
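 * (Editor's note: station events travel in skb->cb and are read back via
 * llc_station_ev(); the ack-timer callback above even allocates a
 * zero-length skb purely as an event carrier before calling
 * llc_station_state_process().)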
+ */ +static void llc_station_rcv(struct sk_buff *skb) +{ + struct llc_station_state_ev *ev = llc_station_ev(skb); + + ev->type = LLC_STATION_EV_TYPE_PDU; + ev->reason = 0; + llc_station_state_process(skb); +} + +int __init llc_station_init(void) +{ + u16 rc = -ENOBUFS; + struct sk_buff *skb; + struct llc_station_state_ev *ev; + + skb_queue_head_init(&llc_main_station.mac_pdu_q); + skb_queue_head_init(&llc_main_station.ev_q.list); + spin_lock_init(&llc_main_station.ev_q.lock); + init_timer(&llc_main_station.ack_timer); + llc_main_station.ack_timer.data = (unsigned long)&llc_main_station; + llc_main_station.ack_timer.function = llc_station_ack_tmr_cb; + + skb = alloc_skb(0, GFP_ATOMIC); + if (!skb) + goto out; + rc = 0; + llc_set_station_handler(llc_station_rcv); + ev = llc_station_ev(skb); + memset(ev, 0, sizeof(*ev)); + llc_main_station.ack_timer.expires = jiffies + 3 * HZ; + llc_main_station.maximum_retry = 1; + llc_main_station.state = LLC_STATION_STATE_DOWN; + ev->type = LLC_STATION_EV_TYPE_SIMPLE; + ev->prim_type = LLC_STATION_EV_ENABLE_WITHOUT_DUP_ADDR_CHECK; + rc = llc_station_next_state(skb); +out: + return rc; +} + +void __exit llc_station_exit(void) +{ + llc_set_station_handler(NULL); +} diff --git a/net/netlink/Makefile b/net/netlink/Makefile new file mode 100644 index 000000000000..39d9c2dcd03c --- /dev/null +++ b/net/netlink/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the netlink driver. +# + +obj-y := af_netlink.o diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c new file mode 100644 index 000000000000..1d5905c90cd4 --- /dev/null +++ b/net/netlink/af_netlink.c @@ -0,0 +1,1454 @@ +/* + * NETLINK Kernel-user communication protocol. + * + * Authors: Alan Cox + * Alexey Kuznetsov + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Tue Jun 26 14:36:48 MEST 2001 Herbert "herp" Rosmanith + * added netlink_proto_exit + * Tue Jan 22 18:32:44 BRST 2002 Arnaldo C. de Melo + * use nlk_sk, as sk->protinfo is on a diet 8) + * + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define Nprintk(a...) 
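/*
 * Editorial aside -- not part of the original patch.  The netlink code that
 * follows casts freely between struct sock and struct netlink_sock (see
 * nlk_sk()); that is only valid because struct sock is the *first* member of
 * struct netlink_sock.  A minimal, standalone sketch of the idiom, using
 * made-up names (base/derived/to_derived), is kept under #if 0 so it cannot
 * affect a build:
 */
#if 0
struct base { int refcnt; };

struct derived {
	struct base	core;	/* must remain the first member */
	int		extra;
};

static inline struct derived *to_derived(struct base *b)
{
	/* legal only because 'core' sits at offset zero of 'derived' */
	return (struct derived *)b;
}
#endif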
+ +struct netlink_sock { + /* struct sock has to be the first member of netlink_sock */ + struct sock sk; + u32 pid; + unsigned int groups; + u32 dst_pid; + unsigned int dst_groups; + unsigned long state; + wait_queue_head_t wait; + struct netlink_callback *cb; + spinlock_t cb_lock; + void (*data_ready)(struct sock *sk, int bytes); +}; + +static inline struct netlink_sock *nlk_sk(struct sock *sk) +{ + return (struct netlink_sock *)sk; +} + +struct nl_pid_hash { + struct hlist_head *table; + unsigned long rehash_time; + + unsigned int mask; + unsigned int shift; + + unsigned int entries; + unsigned int max_shift; + + u32 rnd; +}; + +struct netlink_table { + struct nl_pid_hash hash; + struct hlist_head mc_list; + unsigned int nl_nonroot; +}; + +static struct netlink_table *nl_table; + +static DECLARE_WAIT_QUEUE_HEAD(nl_table_wait); + +static int netlink_dump(struct sock *sk); +static void netlink_destroy_callback(struct netlink_callback *cb); + +static DEFINE_RWLOCK(nl_table_lock); +static atomic_t nl_table_users = ATOMIC_INIT(0); + +static struct notifier_block *netlink_chain; + +static struct hlist_head *nl_pid_hashfn(struct nl_pid_hash *hash, u32 pid) +{ + return &hash->table[jhash_1word(pid, hash->rnd) & hash->mask]; +} + +static void netlink_sock_destruct(struct sock *sk) +{ + skb_queue_purge(&sk->sk_receive_queue); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Freeing alive netlink socket %p\n", sk); + return; + } + BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + BUG_TRAP(!nlk_sk(sk)->cb); +} + +/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on SMP. + * Look, when several writers sleep and reader wakes them up, all but one + * immediately hit write lock and grab all the cpus. Exclusive sleep solves + * this, _but_ remember, it adds useless work on UP machines. 
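 *
 * Editor's note (not in the original patch): in the functions below, readers
 * call netlink_lock_table(), which takes nl_table_lock only long enough to
 * serialize against a grabber and then increments nl_table_users, while a
 * writer does
 *
 *	netlink_table_grab();		-- sleeps until nl_table_users == 0
 *	...modify nl_table...
 *	netlink_table_ungrab();		-- wakes anyone on nl_table_wait
 *
 * with the sleep performed exclusively on nl_table_wait, as described above.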
+ */ + +static void netlink_table_grab(void) +{ + write_lock_bh(&nl_table_lock); + + if (atomic_read(&nl_table_users)) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue_exclusive(&nl_table_wait, &wait); + for(;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (atomic_read(&nl_table_users) == 0) + break; + write_unlock_bh(&nl_table_lock); + schedule(); + write_lock_bh(&nl_table_lock); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nl_table_wait, &wait); + } +} + +static __inline__ void netlink_table_ungrab(void) +{ + write_unlock_bh(&nl_table_lock); + wake_up(&nl_table_wait); +} + +static __inline__ void +netlink_lock_table(void) +{ + /* read_lock() synchronizes us to netlink_table_grab */ + + read_lock(&nl_table_lock); + atomic_inc(&nl_table_users); + read_unlock(&nl_table_lock); +} + +static __inline__ void +netlink_unlock_table(void) +{ + if (atomic_dec_and_test(&nl_table_users)) + wake_up(&nl_table_wait); +} + +static __inline__ struct sock *netlink_lookup(int protocol, u32 pid) +{ + struct nl_pid_hash *hash = &nl_table[protocol].hash; + struct hlist_head *head; + struct sock *sk; + struct hlist_node *node; + + read_lock(&nl_table_lock); + head = nl_pid_hashfn(hash, pid); + sk_for_each(sk, node, head) { + if (nlk_sk(sk)->pid == pid) { + sock_hold(sk); + goto found; + } + } + sk = NULL; +found: + read_unlock(&nl_table_lock); + return sk; +} + +static inline struct hlist_head *nl_pid_hash_alloc(size_t size) +{ + if (size <= PAGE_SIZE) + return kmalloc(size, GFP_ATOMIC); + else + return (struct hlist_head *) + __get_free_pages(GFP_ATOMIC, get_order(size)); +} + +static inline void nl_pid_hash_free(struct hlist_head *table, size_t size) +{ + if (size <= PAGE_SIZE) + kfree(table); + else + free_pages((unsigned long)table, get_order(size)); +} + +static int nl_pid_hash_rehash(struct nl_pid_hash *hash, int grow) +{ + unsigned int omask, mask, shift; + size_t osize, size; + struct hlist_head *otable, *table; + int i; + + omask = mask = hash->mask; + osize = size = (mask + 1) * sizeof(*table); + shift = hash->shift; + + if (grow) { + if (++shift > hash->max_shift) + return 0; + mask = mask * 2 + 1; + size *= 2; + } + + table = nl_pid_hash_alloc(size); + if (!table) + return 0; + + memset(table, 0, size); + otable = hash->table; + hash->table = table; + hash->mask = mask; + hash->shift = shift; + get_random_bytes(&hash->rnd, sizeof(hash->rnd)); + + for (i = 0; i <= omask; i++) { + struct sock *sk; + struct hlist_node *node, *tmp; + + sk_for_each_safe(sk, node, tmp, &otable[i]) + __sk_add_node(sk, nl_pid_hashfn(hash, nlk_sk(sk)->pid)); + } + + nl_pid_hash_free(otable, osize); + hash->rehash_time = jiffies + 10 * 60 * HZ; + return 1; +} + +static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) +{ + int avg = hash->entries >> hash->shift; + + if (unlikely(avg > 1) && nl_pid_hash_rehash(hash, 1)) + return 1; + + if (unlikely(len > avg) && time_after(jiffies, hash->rehash_time)) { + nl_pid_hash_rehash(hash, 0); + return 1; + } + + return 0; +} + +static struct proto_ops netlink_ops; + +static int netlink_insert(struct sock *sk, u32 pid) +{ + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; + int err = -EADDRINUSE; + struct sock *osk; + struct hlist_node *node; + int len; + + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + len = 0; + sk_for_each(osk, node, head) { + if (nlk_sk(osk)->pid == pid) + break; + len++; + } + if (node) + goto err; + + err = -EBUSY; + if (nlk_sk(sk)->pid) + goto err; + + err = -ENOMEM; 
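	/*
	 * On 64-bit hosts refuse to grow past UINT_MAX entries.  A long
	 * chain may also trigger nl_pid_hash_dilute(), which can rehash
	 * the table, so the head bucket must be recomputed afterwards.
	 */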
+ if (BITS_PER_LONG > 32 && unlikely(hash->entries >= UINT_MAX)) + goto err; + + if (len && nl_pid_hash_dilute(hash, len)) + head = nl_pid_hashfn(hash, pid); + hash->entries++; + nlk_sk(sk)->pid = pid; + sk_add_node(sk, head); + err = 0; + +err: + netlink_table_ungrab(); + return err; +} + +static void netlink_remove(struct sock *sk) +{ + netlink_table_grab(); + nl_table[sk->sk_protocol].hash.entries--; + sk_del_node_init(sk); + if (nlk_sk(sk)->groups) + __sk_del_bind_node(sk); + netlink_table_ungrab(); +} + +static struct proto netlink_proto = { + .name = "NETLINK", + .owner = THIS_MODULE, + .obj_size = sizeof(struct netlink_sock), +}; + +static int netlink_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct netlink_sock *nlk; + + sock->state = SS_UNCONNECTED; + + if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM) + return -ESOCKTNOSUPPORT; + + if (protocol<0 || protocol >= MAX_LINKS) + return -EPROTONOSUPPORT; + + sock->ops = &netlink_ops; + + sk = sk_alloc(PF_NETLINK, GFP_KERNEL, &netlink_proto, 1); + if (!sk) + return -ENOMEM; + + sock_init_data(sock, sk); + + nlk = nlk_sk(sk); + + spin_lock_init(&nlk->cb_lock); + init_waitqueue_head(&nlk->wait); + sk->sk_destruct = netlink_sock_destruct; + + sk->sk_protocol = protocol; + return 0; +} + +static int netlink_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk; + + if (!sk) + return 0; + + netlink_remove(sk); + nlk = nlk_sk(sk); + + spin_lock(&nlk->cb_lock); + if (nlk->cb) { + nlk->cb->done(nlk->cb); + netlink_destroy_callback(nlk->cb); + nlk->cb = NULL; + __sock_put(sk); + } + spin_unlock(&nlk->cb_lock); + + /* OK. Socket is unlinked, and, therefore, + no new packets will arrive */ + + sock_orphan(sk); + sock->sk = NULL; + wake_up_interruptible_all(&nlk->wait); + + skb_queue_purge(&sk->sk_write_queue); + + if (nlk->pid && !nlk->groups) { + struct netlink_notify n = { + .protocol = sk->sk_protocol, + .pid = nlk->pid, + }; + notifier_call_chain(&netlink_chain, NETLINK_URELEASE, &n); + } + + sock_put(sk); + return 0; +} + +static int netlink_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct nl_pid_hash *hash = &nl_table[sk->sk_protocol].hash; + struct hlist_head *head; + struct sock *osk; + struct hlist_node *node; + s32 pid = current->pid; + int err; + static s32 rover = -4097; + +retry: + cond_resched(); + netlink_table_grab(); + head = nl_pid_hashfn(hash, pid); + sk_for_each(osk, node, head) { + if (nlk_sk(osk)->pid == pid) { + /* Bind collision, search negative pid values. */ + pid = rover--; + if (rover > -4097) + rover = -4097; + netlink_table_ungrab(); + goto retry; + } + } + netlink_table_ungrab(); + + err = netlink_insert(sk, pid); + if (err == -EADDRINUSE) + goto retry; + return 0; +} + +static inline int netlink_capable(struct socket *sock, unsigned int flag) +{ + return (nl_table[sock->sk->sk_protocol].nl_nonroot & flag) || + capable(CAP_NET_ADMIN); +} + +static int netlink_bind(struct socket *sock, struct sockaddr *addr, int addr_len) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr; + int err; + + if (nladdr->nl_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to listen multicasts */ + if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_RECV)) + return -EPERM; + + if (nlk->pid) { + if (nladdr->nl_pid != nlk->pid) + return -EINVAL; + } else { + err = nladdr->nl_pid ? 
+ netlink_insert(sk, nladdr->nl_pid) : + netlink_autobind(sock); + if (err) + return err; + } + + if (!nladdr->nl_groups && !nlk->groups) + return 0; + + netlink_table_grab(); + if (nlk->groups && !nladdr->nl_groups) + __sk_del_bind_node(sk); + else if (!nlk->groups && nladdr->nl_groups) + sk_add_bind_node(sk, &nl_table[sk->sk_protocol].mc_list); + nlk->groups = nladdr->nl_groups; + netlink_table_ungrab(); + + return 0; +} + +static int netlink_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + int err = 0; + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr=(struct sockaddr_nl*)addr; + + if (addr->sa_family == AF_UNSPEC) { + sk->sk_state = NETLINK_UNCONNECTED; + nlk->dst_pid = 0; + nlk->dst_groups = 0; + return 0; + } + if (addr->sa_family != AF_NETLINK) + return -EINVAL; + + /* Only superuser is allowed to send multicasts */ + if (nladdr->nl_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; + + if (!nlk->pid) + err = netlink_autobind(sock); + + if (err == 0) { + sk->sk_state = NETLINK_CONNECTED; + nlk->dst_pid = nladdr->nl_pid; + nlk->dst_groups = nladdr->nl_groups; + } + + return err; +} + +static int netlink_getname(struct socket *sock, struct sockaddr *addr, int *addr_len, int peer) +{ + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *nladdr=(struct sockaddr_nl *)addr; + + nladdr->nl_family = AF_NETLINK; + nladdr->nl_pad = 0; + *addr_len = sizeof(*nladdr); + + if (peer) { + nladdr->nl_pid = nlk->dst_pid; + nladdr->nl_groups = nlk->dst_groups; + } else { + nladdr->nl_pid = nlk->pid; + nladdr->nl_groups = nlk->groups; + } + return 0; +} + +static void netlink_overrun(struct sock *sk) +{ + if (!test_and_set_bit(0, &nlk_sk(sk)->state)) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + } +} + +static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) +{ + int protocol = ssk->sk_protocol; + struct sock *sock; + struct netlink_sock *nlk; + + sock = netlink_lookup(protocol, pid); + if (!sock) + return ERR_PTR(-ECONNREFUSED); + + /* Don't bother queuing skb if kernel socket has no input function */ + nlk = nlk_sk(sock); + if ((nlk->pid == 0 && !nlk->data_ready) || + (sock->sk_state == NETLINK_CONNECTED && + nlk->dst_pid != nlk_sk(ssk)->pid)) { + sock_put(sock); + return ERR_PTR(-ECONNREFUSED); + } + return sock; +} + +struct sock *netlink_getsockbyfilp(struct file *filp) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct sock *sock; + + if (!S_ISSOCK(inode->i_mode)) + return ERR_PTR(-ENOTSOCK); + + sock = SOCKET_I(inode)->sk; + if (sock->sk_family != AF_NETLINK) + return ERR_PTR(-EINVAL); + + sock_hold(sock); + return sock; +} + +/* + * Attach a skb to a netlink socket. + * The caller must hold a reference to the destination socket. On error, the + * reference is dropped. The skb is not send to the destination, just all + * all error checks are performed and memory in the queue is reserved. + * Return values: + * < 0: error. skb freed, reference to sock dropped. + * 0: continue + * 1: repeat lookup - reference dropped while waiting for socket memory. 
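 *
 * The receive-buffer test together with bit 0 of nlk->state implements
 * flow control toward the receiver; the bit is cleared and sleepers are
 * woken from netlink_rcv_wake() once the receive queue has been drained.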
+ */ +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) +{ + struct netlink_sock *nlk; + + nlk = nlk_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); + if (!timeo) { + if (!nlk->pid) + netlink_overrun(sk); + sock_put(sk); + kfree_skb(skb); + return -EAGAIN; + } + + __set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&nlk->wait, &wait); + + if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || + test_bit(0, &nlk->state)) && + !sock_flag(sk, SOCK_DEAD)) + timeo = schedule_timeout(timeo); + + __set_current_state(TASK_RUNNING); + remove_wait_queue(&nlk->wait, &wait); + sock_put(sk); + + if (signal_pending(current)) { + kfree_skb(skb); + return sock_intr_errno(timeo); + } + return 1; + } + skb_set_owner_r(skb, sk); + return 0; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol) +{ + struct netlink_sock *nlk; + int len = skb->len; + + nlk = nlk_sk(sk); + + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, len); + sock_put(sk); + return len; +} + +void netlink_detachskb(struct sock *sk, struct sk_buff *skb) +{ + kfree_skb(skb); + sock_put(sk); +} + +static inline struct sk_buff *netlink_trim(struct sk_buff *skb, int allocation) +{ + int delta; + + skb_orphan(skb); + + delta = skb->end - skb->tail; + if (delta * 2 < skb->truesize) + return skb; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, allocation); + if (!nskb) + return skb; + kfree_skb(skb); + skb = nskb; + } + + if (!pskb_expand_head(skb, 0, -delta, allocation)) + skb->truesize -= delta; + + return skb; +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +{ + struct sock *sk; + int err; + long timeo; + + skb = netlink_trim(skb, gfp_any()); + + timeo = sock_sndtimeo(ssk, nonblock); +retry: + sk = netlink_getsockbypid(ssk, pid); + if (IS_ERR(sk)) { + kfree_skb(skb); + return PTR_ERR(sk); + } + err = netlink_attachskb(sk, skb, nonblock, timeo); + if (err == 1) + goto retry; + if (err) + return err; + + return netlink_sendskb(sk, skb, ssk->sk_protocol); +} + +static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && + !test_bit(0, &nlk->state)) { + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; + } + return -1; +} + +struct netlink_broadcast_data { + struct sock *exclude_sk; + u32 pid; + u32 group; + int failure; + int congested; + int delivered; + int allocation; + struct sk_buff *skb, *skb2; +}; + +static inline int do_one_broadcast(struct sock *sk, + struct netlink_broadcast_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + int val; + + if (p->exclude_sk == sk) + goto out; + + if (nlk->pid == p->pid || !(nlk->groups & p->group)) + goto out; + + if (p->failure) { + netlink_overrun(sk); + goto out; + } + + sock_hold(sk); + if (p->skb2 == NULL) { + if (atomic_read(&p->skb->users) != 1) { + p->skb2 = skb_clone(p->skb, p->allocation); + } else { + p->skb2 = p->skb; + atomic_inc(&p->skb->users); + } + } + if (p->skb2 == NULL) { + netlink_overrun(sk); + /* Clone failed. Notify ALL listeners. 
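			 * Setting p->failure makes every remaining member
			 * of the group take the netlink_overrun() path, so
			 * no listener silently misses the message.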
*/ + p->failure = 1; + } else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) { + netlink_overrun(sk); + } else { + p->congested |= val; + p->delivered = 1; + p->skb2 = NULL; + } + sock_put(sk); + +out: + return 0; +} + +int netlink_broadcast(struct sock *ssk, struct sk_buff *skb, u32 pid, + u32 group, int allocation) +{ + struct netlink_broadcast_data info; + struct hlist_node *node; + struct sock *sk; + + skb = netlink_trim(skb, allocation); + + info.exclude_sk = ssk; + info.pid = pid; + info.group = group; + info.failure = 0; + info.congested = 0; + info.delivered = 0; + info.allocation = allocation; + info.skb = skb; + info.skb2 = NULL; + + /* While we sleep in clone, do not allow to change socket list */ + + netlink_lock_table(); + + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_broadcast(sk, &info); + + netlink_unlock_table(); + + if (info.skb2) + kfree_skb(info.skb2); + kfree_skb(skb); + + if (info.delivered) { + if (info.congested && (allocation & __GFP_WAIT)) + yield(); + return 0; + } + if (info.failure) + return -ENOBUFS; + return -ESRCH; +} + +struct netlink_set_err_data { + struct sock *exclude_sk; + u32 pid; + u32 group; + int code; +}; + +static inline int do_one_set_err(struct sock *sk, + struct netlink_set_err_data *p) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (sk == p->exclude_sk) + goto out; + + if (nlk->pid == p->pid || !(nlk->groups & p->group)) + goto out; + + sk->sk_err = p->code; + sk->sk_error_report(sk); +out: + return 0; +} + +void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code) +{ + struct netlink_set_err_data info; + struct hlist_node *node; + struct sock *sk; + + info.exclude_sk = ssk; + info.pid = pid; + info.group = group; + info.code = code; + + read_lock(&nl_table_lock); + + sk_for_each_bound(sk, node, &nl_table[ssk->sk_protocol].mc_list) + do_one_set_err(sk, &info); + + read_unlock(&nl_table_lock); +} + +static inline void netlink_rcv_wake(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (!skb_queue_len(&sk->sk_receive_queue)) + clear_bit(0, &nlk->state); + if (!test_bit(0, &nlk->state)) + wake_up_interruptible(&nlk->wait); +} + +static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + struct sockaddr_nl *addr=msg->msg_name; + u32 dst_pid; + u32 dst_groups; + struct sk_buff *skb; + int err; + struct scm_cookie scm; + + if (msg->msg_flags&MSG_OOB) + return -EOPNOTSUPP; + + if (NULL == siocb->scm) + siocb->scm = &scm; + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + if (msg->msg_namelen) { + if (addr->nl_family != AF_NETLINK) + return -EINVAL; + dst_pid = addr->nl_pid; + dst_groups = addr->nl_groups; + if (dst_groups && !netlink_capable(sock, NL_NONROOT_SEND)) + return -EPERM; + } else { + dst_pid = nlk->dst_pid; + dst_groups = nlk->dst_groups; + } + + if (!nlk->pid) { + err = netlink_autobind(sock); + if (err) + goto out; + } + + err = -EMSGSIZE; + if (len > sk->sk_sndbuf - 32) + goto out; + err = -ENOBUFS; + skb = alloc_skb(len, GFP_KERNEL); + if (skb==NULL) + goto out; + + NETLINK_CB(skb).pid = nlk->pid; + NETLINK_CB(skb).groups = nlk->groups; + NETLINK_CB(skb).dst_pid = dst_pid; + NETLINK_CB(skb).dst_groups = dst_groups; + memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + + /* What can I do? 
Netlink is asynchronous, so that + we will have to save current capabilities to + check them, when this message will be delivered + to corresponding kernel module. --ANK (980802) + */ + + err = -EFAULT; + if (memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len)) { + kfree_skb(skb); + goto out; + } + + err = security_netlink_send(sk, skb); + if (err) { + kfree_skb(skb); + goto out; + } + + if (dst_groups) { + atomic_inc(&skb->users); + netlink_broadcast(sk, skb, dst_pid, dst_groups, GFP_KERNEL); + } + err = netlink_unicast(sk, skb, dst_pid, msg->msg_flags&MSG_DONTWAIT); + +out: + return err; +} + +static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct scm_cookie scm; + struct sock *sk = sock->sk; + struct netlink_sock *nlk = nlk_sk(sk); + int noblock = flags&MSG_DONTWAIT; + size_t copied; + struct sk_buff *skb; + int err; + + if (flags&MSG_OOB) + return -EOPNOTSUPP; + + copied = 0; + + skb = skb_recv_datagram(sk,flags,noblock,&err); + if (skb==NULL) + goto out; + + msg->msg_namelen = 0; + + copied = skb->len; + if (len < copied) { + msg->msg_flags |= MSG_TRUNC; + copied = len; + } + + skb->h.raw = skb->data; + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (msg->msg_name) { + struct sockaddr_nl *addr = (struct sockaddr_nl*)msg->msg_name; + addr->nl_family = AF_NETLINK; + addr->nl_pad = 0; + addr->nl_pid = NETLINK_CB(skb).pid; + addr->nl_groups = NETLINK_CB(skb).dst_groups; + msg->msg_namelen = sizeof(*addr); + } + + if (NULL == siocb->scm) { + memset(&scm, 0, sizeof(scm)); + siocb->scm = &scm; + } + siocb->scm->creds = *NETLINK_CREDS(skb); + skb_free_datagram(sk, skb); + + if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) + netlink_dump(sk); + + scm_recv(sock, msg, siocb->scm, flags); + +out: + netlink_rcv_wake(sk); + return err ? : copied; +} + +static void netlink_data_ready(struct sock *sk, int len) +{ + struct netlink_sock *nlk = nlk_sk(sk); + + if (nlk->data_ready) + nlk->data_ready(sk, len); + netlink_rcv_wake(sk); +} + +/* + * We export these functions to other modules. They provide a + * complete set of kernel non-blocking support for message + * queueing. + */ + +struct sock * +netlink_kernel_create(int unit, void (*input)(struct sock *sk, int len)) +{ + struct socket *sock; + struct sock *sk; + + if (!nl_table) + return NULL; + + if (unit<0 || unit>=MAX_LINKS) + return NULL; + + if (sock_create_lite(PF_NETLINK, SOCK_DGRAM, unit, &sock)) + return NULL; + + if (netlink_create(sock, unit) < 0) { + sock_release(sock); + return NULL; + } + sk = sock->sk; + sk->sk_data_ready = netlink_data_ready; + if (input) + nlk_sk(sk)->data_ready = input; + + if (netlink_insert(sk, 0)) { + sock_release(sock); + return NULL; + } + return sk; +} + +void netlink_set_nonroot(int protocol, unsigned int flags) +{ + if ((unsigned int)protocol < MAX_LINKS) + nl_table[protocol].nl_nonroot = flags; +} + +static void netlink_destroy_callback(struct netlink_callback *cb) +{ + if (cb->skb) + kfree_skb(cb->skb); + kfree(cb); +} + +/* + * It looks a bit ugly. + * It would be better to create kernel thread. 
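 *
 * Each call to netlink_dump() asks cb->dump() to fill one NLMSG_GOODSIZE
 * skb.  A positive return means there is more to come: the skb is queued
 * and the next chunk is generated from netlink_recvmsg() once the socket's
 * receive allocation falls to half of sk_rcvbuf or below.  A return of 0
 * (or less) terminates the dump with an NLMSG_DONE message carrying that
 * value, after which cb->done() runs and the callback is destroyed.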
+ */ + +static int netlink_dump(struct sock *sk) +{ + struct netlink_sock *nlk = nlk_sk(sk); + struct netlink_callback *cb; + struct sk_buff *skb; + struct nlmsghdr *nlh; + int len; + + skb = sock_rmalloc(sk, NLMSG_GOODSIZE, 0, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + spin_lock(&nlk->cb_lock); + + cb = nlk->cb; + if (cb == NULL) { + spin_unlock(&nlk->cb_lock); + kfree_skb(skb); + return -EINVAL; + } + + len = cb->dump(skb, cb); + + if (len > 0) { + spin_unlock(&nlk->cb_lock); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, len); + return 0; + } + + nlh = __nlmsg_put(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLMSG_DONE, sizeof(int)); + nlh->nlmsg_flags |= NLM_F_MULTI; + memcpy(NLMSG_DATA(nlh), &len, sizeof(len)); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + cb->done(cb); + nlk->cb = NULL; + spin_unlock(&nlk->cb_lock); + + netlink_destroy_callback(cb); + __sock_put(sk); + return 0; +} + +int netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + struct nlmsghdr *nlh, + int (*dump)(struct sk_buff *skb, struct netlink_callback*), + int (*done)(struct netlink_callback*)) +{ + struct netlink_callback *cb; + struct sock *sk; + struct netlink_sock *nlk; + + cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (cb == NULL) + return -ENOBUFS; + + memset(cb, 0, sizeof(*cb)); + cb->dump = dump; + cb->done = done; + cb->nlh = nlh; + atomic_inc(&skb->users); + cb->skb = skb; + + sk = netlink_lookup(ssk->sk_protocol, NETLINK_CB(skb).pid); + if (sk == NULL) { + netlink_destroy_callback(cb); + return -ECONNREFUSED; + } + nlk = nlk_sk(sk); + /* A dump is in progress... */ + spin_lock(&nlk->cb_lock); + if (nlk->cb) { + spin_unlock(&nlk->cb_lock); + netlink_destroy_callback(cb); + sock_put(sk); + return -EBUSY; + } + nlk->cb = cb; + sock_hold(sk); + spin_unlock(&nlk->cb_lock); + + netlink_dump(sk); + sock_put(sk); + return 0; +} + +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +{ + struct sk_buff *skb; + struct nlmsghdr *rep; + struct nlmsgerr *errmsg; + int size; + + if (err == 0) + size = NLMSG_SPACE(sizeof(struct nlmsgerr)); + else + size = NLMSG_SPACE(4 + NLMSG_ALIGN(nlh->nlmsg_len)); + + skb = alloc_skb(size, GFP_KERNEL); + if (!skb) { + struct sock *sk; + + sk = netlink_lookup(in_skb->sk->sk_protocol, + NETLINK_CB(in_skb).pid); + if (sk) { + sk->sk_err = ENOBUFS; + sk->sk_error_report(sk); + sock_put(sk); + } + return; + } + + rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + NLMSG_ERROR, sizeof(struct nlmsgerr)); + errmsg = NLMSG_DATA(rep); + errmsg->error = err; + memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(struct nlmsghdr)); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); +} + + +#ifdef CONFIG_PROC_FS +struct nl_seq_iter { + int link; + int hash_idx; +}; + +static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos) +{ + struct nl_seq_iter *iter = seq->private; + int i, j; + struct sock *s; + struct hlist_node *node; + loff_t off = 0; + + for (i=0; imask; j++) { + sk_for_each(s, node, &hash->table[j]) { + if (off == pos) { + iter->link = i; + iter->hash_idx = j; + return s; + } + ++off; + } + } + } + return NULL; +} + +static void *netlink_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&nl_table_lock); + return *pos ? 
netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *s; + struct nl_seq_iter *iter; + int i, j; + + ++*pos; + + if (v == SEQ_START_TOKEN) + return netlink_seq_socket_idx(seq, 0); + + s = sk_next(v); + if (s) + return s; + + iter = seq->private; + i = iter->link; + j = iter->hash_idx + 1; + + do { + struct nl_pid_hash *hash = &nl_table[i].hash; + + for (; j <= hash->mask; j++) { + s = sk_head(&hash->table[j]); + if (s) { + iter->link = i; + iter->hash_idx = j; + return s; + } + } + + j = 0; + } while (++i < MAX_LINKS); + + return NULL; +} + +static void netlink_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&nl_table_lock); +} + + +static int netlink_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "sk Eth Pid Groups " + "Rmem Wmem Dump Locks\n"); + else { + struct sock *s = v; + struct netlink_sock *nlk = nlk_sk(s); + + seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n", + s, + s->sk_protocol, + nlk->pid, + nlk->groups, + atomic_read(&s->sk_rmem_alloc), + atomic_read(&s->sk_wmem_alloc), + nlk->cb, + atomic_read(&s->sk_refcnt) + ); + + } + return 0; +} + +static struct seq_operations netlink_seq_ops = { + .start = netlink_seq_start, + .next = netlink_seq_next, + .stop = netlink_seq_stop, + .show = netlink_seq_show, +}; + + +static int netlink_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + struct nl_seq_iter *iter; + int err; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + + err = seq_open(file, &netlink_seq_ops); + if (err) { + kfree(iter); + return err; + } + + memset(iter, 0, sizeof(*iter)); + seq = file->private_data; + seq->private = iter; + return 0; +} + +static struct file_operations netlink_seq_fops = { + .owner = THIS_MODULE, + .open = netlink_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +int netlink_register_notifier(struct notifier_block *nb) +{ + return notifier_chain_register(&netlink_chain, nb); +} + +int netlink_unregister_notifier(struct notifier_block *nb) +{ + return notifier_chain_unregister(&netlink_chain, nb); +} + +static struct proto_ops netlink_ops = { + .family = PF_NETLINK, + .owner = THIS_MODULE, + .release = netlink_release, + .bind = netlink_bind, + .connect = netlink_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = netlink_getname, + .poll = datagram_poll, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = netlink_sendmsg, + .recvmsg = netlink_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct net_proto_family netlink_family_ops = { + .family = PF_NETLINK, + .create = netlink_create, + .owner = THIS_MODULE, /* for consistency 8) */ +}; + +extern void netlink_skb_parms_too_large(void); + +static int __init netlink_proto_init(void) +{ + struct sk_buff *dummy_skb; + int i; + unsigned long max; + unsigned int order; + int err = proto_register(&netlink_proto, 0); + + if (err != 0) + goto out; + + if (sizeof(struct netlink_skb_parms) > sizeof(dummy_skb->cb)) + netlink_skb_parms_too_large(); + + nl_table = kmalloc(sizeof(*nl_table) * MAX_LINKS, GFP_KERNEL); + if (!nl_table) { +enomem: + printk(KERN_CRIT "netlink_init: Cannot allocate nl_table\n"); + return -ENOMEM; + } + + memset(nl_table, 0, 
sizeof(*nl_table) * MAX_LINKS); + + if (num_physpages >= (128 * 1024)) + max = num_physpages >> (21 - PAGE_SHIFT); + else + max = num_physpages >> (23 - PAGE_SHIFT); + + order = get_bitmask_order(max) - 1 + PAGE_SHIFT; + max = (1UL << order) / sizeof(struct hlist_head); + order = get_bitmask_order(max > UINT_MAX ? UINT_MAX : max) - 1; + + for (i = 0; i < MAX_LINKS; i++) { + struct nl_pid_hash *hash = &nl_table[i].hash; + + hash->table = nl_pid_hash_alloc(1 * sizeof(*hash->table)); + if (!hash->table) { + while (i-- > 0) + nl_pid_hash_free(nl_table[i].hash.table, + 1 * sizeof(*hash->table)); + kfree(nl_table); + goto enomem; + } + memset(hash->table, 0, 1 * sizeof(*hash->table)); + hash->max_shift = order; + hash->shift = 0; + hash->mask = 0; + hash->rehash_time = jiffies; + } + + sock_register(&netlink_family_ops); +#ifdef CONFIG_PROC_FS + proc_net_fops_create("netlink", 0, &netlink_seq_fops); +#endif + /* The netlink device handler may be needed early. */ + rtnetlink_init(); +out: + return err; +} + +static void __exit netlink_proto_exit(void) +{ + sock_unregister(PF_NETLINK); + proc_net_remove("netlink"); + kfree(nl_table); + nl_table = NULL; + proto_unregister(&netlink_proto); +} + +core_initcall(netlink_proto_init); +module_exit(netlink_proto_exit); + +MODULE_LICENSE("GPL"); + +MODULE_ALIAS_NETPROTO(PF_NETLINK); + +EXPORT_SYMBOL(netlink_ack); +EXPORT_SYMBOL(netlink_broadcast); +EXPORT_SYMBOL(netlink_dump_start); +EXPORT_SYMBOL(netlink_kernel_create); +EXPORT_SYMBOL(netlink_register_notifier); +EXPORT_SYMBOL(netlink_set_err); +EXPORT_SYMBOL(netlink_set_nonroot); +EXPORT_SYMBOL(netlink_unicast); +EXPORT_SYMBOL(netlink_unregister_notifier); + diff --git a/net/netrom/Makefile b/net/netrom/Makefile new file mode 100644 index 000000000000..2660f5a16991 --- /dev/null +++ b/net/netrom/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux NET/ROM layer. +# + +obj-$(CONFIG_NETROM) += netrom.o + +netrom-y := af_netrom.o nr_dev.o nr_in.o nr_loopback.o \ + nr_out.o nr_route.o nr_subr.o nr_timer.o +netrom-$(CONFIG_SYSCTL) += sysctl_net_netrom.o diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c new file mode 100644 index 000000000000..31ed4a9a1d06 --- /dev/null +++ b/net/netrom/af_netrom.c @@ -0,0 +1,1485 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int nr_ndevs = 4; + +int sysctl_netrom_default_path_quality = NR_DEFAULT_QUAL; +int sysctl_netrom_obsolescence_count_initialiser = NR_DEFAULT_OBS; +int sysctl_netrom_network_ttl_initialiser = NR_DEFAULT_TTL; +int sysctl_netrom_transport_timeout = NR_DEFAULT_T1; +int sysctl_netrom_transport_maximum_tries = NR_DEFAULT_N2; +int sysctl_netrom_transport_acknowledge_delay = NR_DEFAULT_T2; +int sysctl_netrom_transport_busy_delay = NR_DEFAULT_T4; +int sysctl_netrom_transport_requested_window_size = NR_DEFAULT_WINDOW; +int sysctl_netrom_transport_no_activity_timeout = NR_DEFAULT_IDLE; +int sysctl_netrom_routing_control = NR_DEFAULT_ROUTING; +int sysctl_netrom_link_fails_count = NR_DEFAULT_FAILS; + +static unsigned short circuit = 0x101; + +static HLIST_HEAD(nr_list); +static DEFINE_SPINLOCK(nr_list_lock); + +static struct proto_ops nr_proto_ops; + +/* + * Socket removal during an interrupt is now safe. + */ +static void nr_remove_socket(struct sock *sk) +{ + spin_lock_bh(&nr_list_lock); + sk_del_node_init(sk); + spin_unlock_bh(&nr_list_lock); +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void nr_kill_by_device(struct net_device *dev) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) + if (nr_sk(s)->device == dev) + nr_disconnect(s, ENETUNREACH); + spin_unlock_bh(&nr_list_lock); +} + +/* + * Handle device status changes. + */ +static int nr_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + nr_kill_by_device(dev); + nr_rt_device_down(dev); + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +static void nr_insert_socket(struct sock *sk) +{ + spin_lock_bh(&nr_list_lock); + sk_add_node(sk, &nr_list); + spin_unlock_bh(&nr_list_lock); +} + +/* + * Find a socket that wants to accept the Connect Request we just + * received. + */ +static struct sock *nr_find_listener(ax25_address *addr) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) + if (!ax25cmp(&nr_sk(s)->source_addr, addr) && + s->sk_state == TCP_LISTEN) { + bh_lock_sock(s); + goto found; + } + s = NULL; +found: + spin_unlock_bh(&nr_list_lock); + return s; +} + +/* + * Find a connected NET/ROM socket given my circuit IDs. + */ +static struct sock *nr_find_socket(unsigned char index, unsigned char id) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) { + struct nr_sock *nr = nr_sk(s); + + if (nr->my_index == index && nr->my_id == id) { + bh_lock_sock(s); + goto found; + } + } + s = NULL; +found: + spin_unlock_bh(&nr_list_lock); + return s; +} + +/* + * Find a connected NET/ROM socket given their circuit IDs. 
+ */ +static struct sock *nr_find_peer(unsigned char index, unsigned char id, + ax25_address *dest) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&nr_list_lock); + sk_for_each(s, node, &nr_list) { + struct nr_sock *nr = nr_sk(s); + + if (nr->your_index == index && nr->your_id == id && + !ax25cmp(&nr->dest_addr, dest)) { + bh_lock_sock(s); + goto found; + } + } + s = NULL; +found: + spin_unlock_bh(&nr_list_lock); + return s; +} + +/* + * Find next free circuit ID. + */ +static unsigned short nr_find_next_circuit(void) +{ + unsigned short id = circuit; + unsigned char i, j; + struct sock *sk; + + for (;;) { + i = id / 256; + j = id % 256; + + if (i != 0 && j != 0) { + if ((sk=nr_find_socket(i, j)) == NULL) + break; + bh_unlock_sock(sk); + } + + id++; + } + + return id; +} + +/* + * Deferred destroy. + */ +void nr_destroy_socket(struct sock *); + +/* + * Handler for deferred kills. + */ +static void nr_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + bh_lock_sock(sk); + sock_hold(sk); + nr_destroy_socket(sk); + bh_unlock_sock(sk); + sock_put(sk); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. + */ +void nr_destroy_socket(struct sock *sk) +{ + struct sk_buff *skb; + + nr_remove_socket(sk); + + nr_stop_heartbeat(sk); + nr_stop_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + + nr_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + /* Queue the unaccepted socket for death */ + sock_set_flag(skb->sk, SOCK_DEAD); + nr_start_heartbeat(skb->sk); + nr_sk(skb->sk)->state = NR_STATE_0; + } + + kfree_skb(skb); + } + + if (atomic_read(&sk->sk_wmem_alloc) || + atomic_read(&sk->sk_rmem_alloc)) { + /* Defer: outstanding buffers */ + sk->sk_timer.function = nr_destroy_timer; + sk->sk_timer.expires = jiffies + 2 * HZ; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +/* + * Handling for system calls applied via the various interfaces to a + * NET/ROM socket object. 
+ */ + +static int nr_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + int opt; + + if (level != SOL_NETROM) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case NETROM_T1: + if (opt < 1) + return -EINVAL; + nr->t1 = opt * HZ; + return 0; + + case NETROM_T2: + if (opt < 1) + return -EINVAL; + nr->t2 = opt * HZ; + return 0; + + case NETROM_N2: + if (opt < 1 || opt > 31) + return -EINVAL; + nr->n2 = opt; + return 0; + + case NETROM_T4: + if (opt < 1) + return -EINVAL; + nr->t4 = opt * HZ; + return 0; + + case NETROM_IDLE: + if (opt < 0) + return -EINVAL; + nr->idle = opt * 60 * HZ; + return 0; + + default: + return -ENOPROTOOPT; + } +} + +static int nr_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + int val = 0; + int len; + + if (level != SOL_NETROM) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + switch (optname) { + case NETROM_T1: + val = nr->t1 / HZ; + break; + + case NETROM_T2: + val = nr->t2 / HZ; + break; + + case NETROM_N2: + val = nr->n2; + break; + + case NETROM_T4: + val = nr->t4 / HZ; + break; + + case NETROM_IDLE: + val = nr->idle / (60 * HZ); + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, len, sizeof(int)); + + if (put_user(len, optlen)) + return -EFAULT; + + return copy_to_user(optval, &val, len) ? -EFAULT : 0; +} + +static int nr_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + lock_sock(sk); + if (sk->sk_state != TCP_LISTEN) { + memset(&nr_sk(sk)->user_addr, 0, AX25_ADDR_LEN); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + release_sock(sk); + return 0; + } + release_sock(sk); + + return -EOPNOTSUPP; +} + +static struct proto nr_proto = { + .name = "NETROM", + .owner = THIS_MODULE, + .obj_size = sizeof(struct nr_sock), +}; + +static int nr_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct nr_sock *nr; + + if (sock->type != SOCK_SEQPACKET || protocol != 0) + return -ESOCKTNOSUPPORT; + + if ((sk = sk_alloc(PF_NETROM, GFP_ATOMIC, &nr_proto, 1)) == NULL) + return -ENOMEM; + + nr = nr_sk(sk); + + sock_init_data(sock, sk); + + sock->ops = &nr_proto_ops; + sk->sk_protocol = protocol; + + skb_queue_head_init(&nr->ack_queue); + skb_queue_head_init(&nr->reseq_queue); + skb_queue_head_init(&nr->frag_queue); + + nr_init_timers(sk); + + nr->t1 = sysctl_netrom_transport_timeout; + nr->t2 = sysctl_netrom_transport_acknowledge_delay; + nr->n2 = sysctl_netrom_transport_maximum_tries; + nr->t4 = sysctl_netrom_transport_busy_delay; + nr->idle = sysctl_netrom_transport_no_activity_timeout; + nr->window = sysctl_netrom_transport_requested_window_size; + + nr->bpqext = 1; + nr->state = NR_STATE_0; + + return 0; +} + +static struct sock *nr_make_new(struct sock *osk) +{ + struct sock *sk; + struct nr_sock *nr, *onr; + + if (osk->sk_type != SOCK_SEQPACKET) + return NULL; + + if ((sk = sk_alloc(PF_NETROM, GFP_ATOMIC, osk->sk_prot, 1)) == NULL) + return NULL; + + nr = nr_sk(sk); + + sock_init_data(NULL, sk); + + sk->sk_type = osk->sk_type; + sk->sk_socket = osk->sk_socket; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = 
osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_sleep = osk->sk_sleep; + + if (sock_flag(osk, SOCK_ZAPPED)) + sock_set_flag(sk, SOCK_ZAPPED); + + if (sock_flag(osk, SOCK_DBG)) + sock_set_flag(sk, SOCK_DBG); + + skb_queue_head_init(&nr->ack_queue); + skb_queue_head_init(&nr->reseq_queue); + skb_queue_head_init(&nr->frag_queue); + + nr_init_timers(sk); + + onr = nr_sk(osk); + + nr->t1 = onr->t1; + nr->t2 = onr->t2; + nr->n2 = onr->n2; + nr->t4 = onr->t4; + nr->idle = onr->idle; + nr->window = onr->window; + + nr->device = onr->device; + nr->bpqext = onr->bpqext; + + return sk; +} + +static int nr_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr; + + if (sk == NULL) return 0; + + sock_hold(sk); + lock_sock(sk); + nr = nr_sk(sk); + + switch (nr->state) { + case NR_STATE_0: + case NR_STATE_1: + case NR_STATE_2: + nr_disconnect(sk, 0); + nr_destroy_socket(sk); + break; + + case NR_STATE_3: + nr_clear_queues(sk); + nr->n2count = 0; + nr_write_internal(sk, NR_DISCREQ); + nr_start_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + nr->state = NR_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_orphan(sk); + sock_set_flag(sk, SOCK_DESTROY); + sk->sk_socket = NULL; + break; + + default: + sk->sk_socket = NULL; + break; + } + + sock->sk = NULL; + release_sock(sk); + sock_put(sk); + + return 0; +} + +static int nr_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + struct full_sockaddr_ax25 *addr = (struct full_sockaddr_ax25 *)uaddr; + struct net_device *dev; + ax25_address *user, *source; + + lock_sock(sk); + if (!sock_flag(sk, SOCK_ZAPPED)) { + release_sock(sk); + return -EINVAL; + } + if (addr_len < sizeof(struct sockaddr_ax25) || addr_len > sizeof(struct full_sockaddr_ax25)) { + release_sock(sk); + return -EINVAL; + } + if (addr_len < (addr->fsa_ax25.sax25_ndigis * sizeof(ax25_address) + sizeof(struct sockaddr_ax25))) { + release_sock(sk); + return -EINVAL; + } + if (addr->fsa_ax25.sax25_family != AF_NETROM) { + release_sock(sk); + return -EINVAL; + } + if ((dev = nr_dev_get(&addr->fsa_ax25.sax25_call)) == NULL) { + SOCK_DEBUG(sk, "NET/ROM: bind failed: invalid node callsign\n"); + release_sock(sk); + return -EADDRNOTAVAIL; + } + + /* + * Only the super user can set an arbitrary user callsign. 
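	 *
	 * With sax25_ndigis == 1 the caller supplied an explicit user
	 * callsign in fsa_digipeater[0], which needs CAP_NET_BIND_SERVICE;
	 * otherwise the user callsign comes from the caller's AX.25 uid
	 * mapping, falling back to the node callsign itself.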
+ */ + if (addr->fsa_ax25.sax25_ndigis == 1) { + if (!capable(CAP_NET_BIND_SERVICE)) { + dev_put(dev); + release_sock(sk); + return -EACCES; + } + nr->user_addr = addr->fsa_digipeater[0]; + nr->source_addr = addr->fsa_ax25.sax25_call; + } else { + source = &addr->fsa_ax25.sax25_call; + + if ((user = ax25_findbyuid(current->euid)) == NULL) { + if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) { + release_sock(sk); + dev_put(dev); + return -EPERM; + } + user = source; + } + + nr->user_addr = *user; + nr->source_addr = *source; + } + + nr->device = dev; + nr_insert_socket(sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + dev_put(dev); + release_sock(sk); + SOCK_DEBUG(sk, "NET/ROM: socket is bound\n"); + return 0; +} + +static int nr_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + struct sockaddr_ax25 *addr = (struct sockaddr_ax25 *)uaddr; + ax25_address *user, *source = NULL; + struct net_device *dev; + + lock_sock(sk); + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + release_sock(sk); + return 0; /* Connect completed during a ERESTARTSYS event */ + } + + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + release_sock(sk); + return -ECONNREFUSED; + } + + if (sk->sk_state == TCP_ESTABLISHED) { + release_sock(sk); + return -EISCONN; /* No reconnect on a seqpacket socket */ + } + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(struct sockaddr_ax25) && addr_len != sizeof(struct full_sockaddr_ax25)) { + release_sock(sk); + return -EINVAL; + } + if (addr->sax25_family != AF_NETROM) { + release_sock(sk); + return -EINVAL; + } + if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ + sock_reset_flag(sk, SOCK_ZAPPED); + + if ((dev = nr_dev_first()) == NULL) { + release_sock(sk); + return -ENETUNREACH; + } + source = (ax25_address *)dev->dev_addr; + + if ((user = ax25_findbyuid(current->euid)) == NULL) { + if (ax25_uid_policy && !capable(CAP_NET_ADMIN)) { + dev_put(dev); + release_sock(sk); + return -EPERM; + } + user = source; + } + + nr->user_addr = *user; + nr->source_addr = *source; + nr->device = dev; + + dev_put(dev); + nr_insert_socket(sk); /* Finish the bind */ + } + + nr->dest_addr = addr->sax25_call; + + release_sock(sk); + circuit = nr_find_next_circuit(); + lock_sock(sk); + + nr->my_index = circuit / 256; + nr->my_id = circuit % 256; + + circuit++; + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + nr_establish_data_link(sk); + + nr->state = NR_STATE_1; + + nr_start_heartbeat(sk); + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) { + release_sock(sk); + return -EINPROGRESS; + } + + /* + * A Connect Ack with Choke or timeout or failed routing will go to + * closed. 
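	 * The loop below therefore sleeps interruptibly until the state
	 * machine leaves TCP_SYN_SENT, returning -ERESTARTSYS if a signal
	 * arrives first.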
+ */ + if (sk->sk_state == TCP_SYN_SENT) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (sk->sk_state != TCP_SYN_SENT) + break; + release_sock(sk); + if (!signal_pending(tsk)) { + schedule(); + lock_sock(sk); + continue; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -ERESTARTSYS; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + } + + if (sk->sk_state != TCP_ESTABLISHED) { + sock->state = SS_UNCONNECTED; + release_sock(sk); + return sock_error(sk); /* Always set at this point */ + } + + sock->state = SS_CONNECTED; + release_sock(sk); + + return 0; +} + +static int nr_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + struct sk_buff *skb; + struct sock *newsk; + struct sock *sk; + int err = 0; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EOPNOTSUPP; + goto out; + } + + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto out; + } + + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + current->state = TASK_INTERRUPTIBLE; + release_sock(sk); + if (flags & O_NONBLOCK) { + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -EWOULDBLOCK; + } + if (!signal_pending(tsk)) { + schedule(); + lock_sock(sk); + continue; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -ERESTARTSYS; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + + newsk = skb->sk; + newsk->sk_socket = newsock; + newsk->sk_sleep = &newsock->wait; + + /* Now attach up the new socket */ + kfree_skb(skb); + sk->sk_ack_backlog--; + newsock->sk = newsk; + +out: + release_sock(sk); + return err; +} + +static int nr_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct full_sockaddr_ax25 *sax = (struct full_sockaddr_ax25 *)uaddr; + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + + lock_sock(sk); + if (peer != 0) { + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + return -ENOTCONN; + } + sax->fsa_ax25.sax25_family = AF_NETROM; + sax->fsa_ax25.sax25_ndigis = 1; + sax->fsa_ax25.sax25_call = nr->user_addr; + sax->fsa_digipeater[0] = nr->dest_addr; + *uaddr_len = sizeof(struct full_sockaddr_ax25); + } else { + sax->fsa_ax25.sax25_family = AF_NETROM; + sax->fsa_ax25.sax25_ndigis = 0; + sax->fsa_ax25.sax25_call = nr->source_addr; + *uaddr_len = sizeof(struct sockaddr_ax25); + } + release_sock(sk); + + return 0; +} + +int nr_rx_frame(struct sk_buff *skb, struct net_device *dev) +{ + struct sock *sk; + struct sock *make; + struct nr_sock *nr_make; + ax25_address *src, *dest, *user; + unsigned short circuit_index, circuit_id; + unsigned short peer_circuit_index, peer_circuit_id; + unsigned short frametype, flags, window, timeout; + int ret; + + skb->sk = NULL; /* Initially we don't know who it's for */ + + /* + * skb->data points to the netrom frame start + */ + + src = (ax25_address *)(skb->data + 0); + dest = (ax25_address *)(skb->data + 7); + + circuit_index = skb->data[15]; + circuit_id = skb->data[16]; + peer_circuit_index = skb->data[17]; + peer_circuit_id 
= skb->data[18]; + frametype = skb->data[19] & 0x0F; + flags = skb->data[19] & 0xF0; + +#ifdef CONFIG_INET + /* + * Check for an incoming IP over NET/ROM frame. + */ + if (frametype == NR_PROTOEXT && circuit_index == NR_PROTO_IP && circuit_id == NR_PROTO_IP) { + skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + skb->h.raw = skb->data; + + return nr_rx_ip(skb, dev); + } +#endif + + /* + * Find an existing socket connection, based on circuit ID, if it's + * a Connect Request base it on their circuit ID. + * + * Circuit ID 0/0 is not valid but it could still be a "reset" for a + * circuit that no longer exists at the other end ... + */ + + sk = NULL; + + if (circuit_index == 0 && circuit_id == 0) { + if (frametype == NR_CONNACK && flags == NR_CHOKE_FLAG) + sk = nr_find_peer(peer_circuit_index, peer_circuit_id, src); + } else { + if (frametype == NR_CONNREQ) + sk = nr_find_peer(circuit_index, circuit_id, src); + else + sk = nr_find_socket(circuit_index, circuit_id); + } + + if (sk != NULL) { + skb->h.raw = skb->data; + + if (frametype == NR_CONNACK && skb->len == 22) + nr_sk(sk)->bpqext = 1; + else + nr_sk(sk)->bpqext = 0; + + ret = nr_process_rx_frame(sk, skb); + bh_unlock_sock(sk); + return ret; + } + + /* + * Now it should be a CONNREQ. + */ + if (frametype != NR_CONNREQ) { + /* + * Here it would be nice to be able to send a reset but + * NET/ROM doesn't have one. The following hack would + * have been a way to extend the protocol but apparently + * it kills BPQ boxes... :-( + */ +#if 0 + /* + * Never reply to a CONNACK/CHOKE. + */ + if (frametype != NR_CONNACK || flags != NR_CHOKE_FLAG) + nr_transmit_refusal(skb, 1); +#endif + return 0; + } + + sk = nr_find_listener(dest); + + user = (ax25_address *)(skb->data + 21); + + if (sk == NULL || sk_acceptq_is_full(sk) || + (make = nr_make_new(sk)) == NULL) { + nr_transmit_refusal(skb, 0); + if (sk) + bh_unlock_sock(sk); + return 0; + } + + window = skb->data[20]; + + skb->sk = make; + make->sk_state = TCP_ESTABLISHED; + + /* Fill in his circuit details */ + nr_make = nr_sk(make); + nr_make->source_addr = *dest; + nr_make->dest_addr = *src; + nr_make->user_addr = *user; + + nr_make->your_index = circuit_index; + nr_make->your_id = circuit_id; + + bh_unlock_sock(sk); + circuit = nr_find_next_circuit(); + bh_lock_sock(sk); + + nr_make->my_index = circuit / 256; + nr_make->my_id = circuit % 256; + + circuit++; + + /* Window negotiation */ + if (window < nr_make->window) + nr_make->window = window; + + /* L4 timeout negotiation */ + if (skb->len == 37) { + timeout = skb->data[36] * 256 + skb->data[35]; + if (timeout * HZ < nr_make->t1) + nr_make->t1 = timeout * HZ; + nr_make->bpqext = 1; + } else { + nr_make->bpqext = 0; + } + + nr_write_internal(make, NR_CONNACK); + + nr_make->condition = 0x00; + nr_make->vs = 0; + nr_make->va = 0; + nr_make->vr = 0; + nr_make->vl = 0; + nr_make->state = NR_STATE_3; + sk->sk_ack_backlog++; + + nr_insert_socket(make); + + skb_queue_head(&sk->sk_receive_queue, skb); + + nr_start_heartbeat(make); + nr_start_idletimer(make); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + + bh_unlock_sock(sk); + return 1; +} + +static int nr_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct nr_sock *nr = nr_sk(sk); + struct sockaddr_ax25 *usax = (struct sockaddr_ax25 *)msg->msg_name; + int err; + struct sockaddr_ax25 sax; + struct sk_buff *skb; + unsigned char *asmptr; + int size; + + if (msg->msg_flags & 
~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + lock_sock(sk); + if (sock_flag(sk, SOCK_ZAPPED)) { + err = -EADDRNOTAVAIL; + goto out; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + err = -EPIPE; + goto out; + } + + if (nr->device == NULL) { + err = -ENETUNREACH; + goto out; + } + + if (usax) { + if (msg->msg_namelen < sizeof(sax)) { + err = -EINVAL; + goto out; + } + sax = *usax; + if (ax25cmp(&nr->dest_addr, &sax.sax25_call) != 0) { + err = -EISCONN; + goto out; + } + if (sax.sax25_family != AF_NETROM) { + err = -EINVAL; + goto out; + } + } else { + if (sk->sk_state != TCP_ESTABLISHED) { + err = -ENOTCONN; + goto out; + } + sax.sax25_family = AF_NETROM; + sax.sax25_call = nr->dest_addr; + } + + SOCK_DEBUG(sk, "NET/ROM: sendto: Addresses built.\n"); + + /* Build a packet */ + SOCK_DEBUG(sk, "NET/ROM: sendto: building packet.\n"); + size = len + NR_NETWORK_LEN + NR_TRANSPORT_LEN; + + if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + goto out; + + skb_reserve(skb, size - len); + + /* + * Push down the NET/ROM header + */ + + asmptr = skb_push(skb, NR_TRANSPORT_LEN); + SOCK_DEBUG(sk, "Building NET/ROM Header.\n"); + + /* Build a NET/ROM Transport header */ + + *asmptr++ = nr->your_index; + *asmptr++ = nr->your_id; + *asmptr++ = 0; /* To be filled in later */ + *asmptr++ = 0; /* Ditto */ + *asmptr++ = NR_INFO; + SOCK_DEBUG(sk, "Built header.\n"); + + /* + * Put the data on the end + */ + + skb->h.raw = skb_put(skb, len); + + asmptr = skb->h.raw; + SOCK_DEBUG(sk, "NET/ROM: Appending user data\n"); + + /* User data follows immediately after the NET/ROM transport header */ + if (memcpy_fromiovec(asmptr, msg->msg_iov, len)) { + kfree_skb(skb); + err = -EFAULT; + goto out; + } + + SOCK_DEBUG(sk, "NET/ROM: Transmitting buffer\n"); + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + err = -ENOTCONN; + goto out; + } + + nr_output(sk, skb); /* Shove it onto the queue */ + + err = len; +out: + release_sock(sk); + return err; +} + +static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_ax25 *sax = (struct sockaddr_ax25 *)msg->msg_name; + size_t copied; + struct sk_buff *skb; + int er; + + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! 
We do one quick check first though + */ + + lock_sock(sk); + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + return -ENOTCONN; + } + + /* Now we can treat all alike */ + if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL) { + release_sock(sk); + return er; + } + + skb->h.raw = skb->data; + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (sax != NULL) { + sax->sax25_family = AF_NETROM; + memcpy(sax->sax25_call.ax25_call, skb->data + 7, AX25_ADDR_LEN); + } + + msg->msg_namelen = sizeof(*sax); + + skb_free_datagram(sk, skb); + + release_sock(sk); + return copied; +} + + +static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + void __user *argp = (void __user *)arg; + int ret; + + lock_sock(sk); + switch (cmd) { + case TIOCOUTQ: { + long amount; + amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + release_sock(sk); + return put_user(amount, (int __user *)argp); + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + release_sock(sk); + return put_user(amount, (int __user *)argp); + } + + case SIOCGSTAMP: + ret = -EINVAL; + if (sk != NULL) + ret = sock_get_timestamp(sk, argp); + release_sock(sk); + return ret; + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + release_sock(sk); + return -EINVAL; + + case SIOCADDRT: + case SIOCDELRT: + case SIOCNRDECOBS: + release_sock(sk); + if (!capable(CAP_NET_ADMIN)) return -EPERM; + return nr_rt_ioctl(cmd, argp); + + default: + release_sock(sk); + return dev_ioctl(cmd, argp); + } + release_sock(sk); + + return 0; +} + +#ifdef CONFIG_PROC_FS + +static void *nr_info_start(struct seq_file *seq, loff_t *pos) +{ + struct sock *s; + struct hlist_node *node; + int i = 1; + + spin_lock_bh(&nr_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + sk_for_each(s, node, &nr_list) { + if (i == *pos) + return s; + ++i; + } + return NULL; +} + +static void *nr_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? 
sk_head(&nr_list) + : sk_next((struct sock *)v); +} + +static void nr_info_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&nr_list_lock); +} + +static int nr_info_show(struct seq_file *seq, void *v) +{ + struct sock *s = v; + struct net_device *dev; + struct nr_sock *nr; + const char *devname; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, +"user_addr dest_node src_node dev my your st vs vr va t1 t2 t4 idle n2 wnd Snd-Q Rcv-Q inode\n"); + + else { + + bh_lock_sock(s); + nr = nr_sk(s); + + if ((dev = nr->device) == NULL) + devname = "???"; + else + devname = dev->name; + + seq_printf(seq, "%-9s ", ax2asc(&nr->user_addr)); + seq_printf(seq, "%-9s ", ax2asc(&nr->dest_addr)); + seq_printf(seq, +"%-9s %-3s %02X/%02X %02X/%02X %2d %3d %3d %3d %3lu/%03lu %2lu/%02lu %3lu/%03lu %3lu/%03lu %2d/%02d %3d %5d %5d %ld\n", + ax2asc(&nr->source_addr), + devname, + nr->my_index, + nr->my_id, + nr->your_index, + nr->your_id, + nr->state, + nr->vs, + nr->vr, + nr->va, + ax25_display_timer(&nr->t1timer) / HZ, + nr->t1 / HZ, + ax25_display_timer(&nr->t2timer) / HZ, + nr->t2 / HZ, + ax25_display_timer(&nr->t4timer) / HZ, + nr->t4 / HZ, + ax25_display_timer(&nr->idletimer) / (60 * HZ), + nr->idle / (60 * HZ), + nr->n2count, + nr->n2, + nr->window, + atomic_read(&s->sk_wmem_alloc), + atomic_read(&s->sk_rmem_alloc), + s->sk_socket ? SOCK_INODE(s->sk_socket)->i_ino : 0L); + + bh_unlock_sock(s); + } + return 0; +} + +static struct seq_operations nr_info_seqops = { + .start = nr_info_start, + .next = nr_info_next, + .stop = nr_info_stop, + .show = nr_info_show, +}; + +static int nr_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nr_info_seqops); +} + +static struct file_operations nr_info_fops = { + .owner = THIS_MODULE, + .open = nr_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct net_proto_family nr_family_ops = { + .family = PF_NETROM, + .create = nr_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops nr_proto_ops = { + .family = PF_NETROM, + .owner = THIS_MODULE, + .release = nr_release, + .bind = nr_bind, + .connect = nr_connect, + .socketpair = sock_no_socketpair, + .accept = nr_accept, + .getname = nr_getname, + .poll = datagram_poll, + .ioctl = nr_ioctl, + .listen = nr_listen, + .shutdown = sock_no_shutdown, + .setsockopt = nr_setsockopt, + .getsockopt = nr_getsockopt, + .sendmsg = nr_sendmsg, + .recvmsg = nr_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct notifier_block nr_dev_notifier = { + .notifier_call = nr_device_event, +}; + +static struct net_device **dev_nr; + +static char banner[] __initdata = KERN_INFO "G4KLX NET/ROM for Linux. 
Version 0.7 for AX25.037 Linux 2.4\n"; + +static int __init nr_proto_init(void) +{ + int i; + int rc = proto_register(&nr_proto, 0); + + if (rc != 0) + goto out; + + if (nr_ndevs > 0x7fffffff/sizeof(struct net_device *)) { + printk(KERN_ERR "NET/ROM: nr_proto_init - nr_ndevs parameter to large\n"); + return -1; + } + + dev_nr = kmalloc(nr_ndevs * sizeof(struct net_device *), GFP_KERNEL); + if (dev_nr == NULL) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device array\n"); + return -1; + } + + memset(dev_nr, 0x00, nr_ndevs * sizeof(struct net_device *)); + + for (i = 0; i < nr_ndevs; i++) { + char name[IFNAMSIZ]; + struct net_device *dev; + + sprintf(name, "nr%d", i); + dev = alloc_netdev(sizeof(struct net_device_stats), name, + nr_setup); + if (!dev) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to allocate device structure\n"); + goto fail; + } + + dev->base_addr = i; + if (register_netdev(dev)) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register network device\n"); + free_netdev(dev); + goto fail; + } + dev_nr[i] = dev; + } + + if (sock_register(&nr_family_ops)) { + printk(KERN_ERR "NET/ROM: nr_proto_init - unable to register socket family\n"); + goto fail; + } + + register_netdevice_notifier(&nr_dev_notifier); + printk(banner); + + ax25_protocol_register(AX25_P_NETROM, nr_route_frame); + ax25_linkfail_register(nr_link_failed); + +#ifdef CONFIG_SYSCTL + nr_register_sysctl(); +#endif + + nr_loopback_init(); + + proc_net_fops_create("nr", S_IRUGO, &nr_info_fops); + proc_net_fops_create("nr_neigh", S_IRUGO, &nr_neigh_fops); + proc_net_fops_create("nr_nodes", S_IRUGO, &nr_nodes_fops); +out: + return rc; +fail: + while (--i >= 0) { + unregister_netdev(dev_nr[i]); + free_netdev(dev_nr[i]); + } + kfree(dev_nr); + proto_unregister(&nr_proto); + rc = -1; + goto out; +} + +module_init(nr_proto_init); + +module_param(nr_ndevs, int, 0); +MODULE_PARM_DESC(nr_ndevs, "number of NET/ROM devices"); + +MODULE_AUTHOR("Jonathan Naylor G4KLX "); +MODULE_DESCRIPTION("The amateur radio NET/ROM network and transport layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_NETROM); + +static void __exit nr_exit(void) +{ + int i; + + proc_net_remove("nr"); + proc_net_remove("nr_neigh"); + proc_net_remove("nr_nodes"); + nr_loopback_clear(); + + nr_rt_free(); + +#ifdef CONFIG_SYSCTL + nr_unregister_sysctl(); +#endif + + ax25_linkfail_release(nr_link_failed); + ax25_protocol_release(AX25_P_NETROM); + + unregister_netdevice_notifier(&nr_dev_notifier); + + sock_unregister(PF_NETROM); + + for (i = 0; i < nr_ndevs; i++) { + struct net_device *dev = dev_nr[i]; + if (dev) { + unregister_netdev(dev); + free_netdev(dev); + } + } + + kfree(dev_nr); + proto_unregister(&nr_proto); +} +module_exit(nr_exit); diff --git a/net/netrom/nr_dev.c b/net/netrom/nr_dev.c new file mode 100644 index 000000000000..220bf7494f71 --- /dev/null +++ b/net/netrom/nr_dev.c @@ -0,0 +1,220 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For the statistics structure. 
*/ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#ifdef CONFIG_INET + +/* + * Only allow IP over NET/ROM frames through if the netrom device is up. + */ + +int nr_rx_ip(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = netdev_priv(dev); + + if (!netif_running(dev)) { + stats->rx_errors++; + return 0; + } + + stats->rx_packets++; + stats->rx_bytes += skb->len; + + skb->protocol = htons(ETH_P_IP); + + /* Spoof incoming device */ + skb->dev = dev; + skb->h.raw = skb->data; + skb->nh.raw = skb->data; + skb->pkt_type = PACKET_HOST; + + ip_rcv(skb, skb->dev, NULL); + + return 1; +} + + +static int nr_rebuild_header(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct net_device_stats *stats = netdev_priv(dev); + struct sk_buff *skbn; + unsigned char *bp = skb->data; + int len; + + if (arp_find(bp + 7, skb)) { + return 1; + } + + bp[6] &= ~AX25_CBIT; + bp[6] &= ~AX25_EBIT; + bp[6] |= AX25_SSSID_SPARE; + bp += AX25_ADDR_LEN; + + bp[6] &= ~AX25_CBIT; + bp[6] |= AX25_EBIT; + bp[6] |= AX25_SSSID_SPARE; + + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + return 1; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + + len = skbn->len; + + if (!nr_route_frame(skbn, NULL)) { + kfree_skb(skbn); + stats->tx_errors++; + } + + stats->tx_packets++; + stats->tx_bytes += len; + + return 1; +} + +#else + +static int nr_rebuild_header(struct sk_buff *skb) +{ + return 1; +} + +#endif + +static int nr_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + unsigned char *buff = skb_push(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + memcpy(buff, (saddr != NULL) ? 
saddr : dev->dev_addr, dev->addr_len); + buff[6] &= ~AX25_CBIT; + buff[6] &= ~AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + if (daddr != NULL) + memcpy(buff, daddr, dev->addr_len); + buff[6] &= ~AX25_CBIT; + buff[6] |= AX25_EBIT; + buff[6] |= AX25_SSSID_SPARE; + buff += AX25_ADDR_LEN; + + *buff++ = sysctl_netrom_network_ttl_initialiser; + + *buff++ = NR_PROTO_IP; + *buff++ = NR_PROTO_IP; + *buff++ = 0; + *buff++ = 0; + *buff++ = NR_PROTOEXT; + + if (daddr != NULL) + return 37; + + return -37; +} + +static int nr_set_mac_address(struct net_device *dev, void *addr) +{ + struct sockaddr *sa = addr; + + if (dev->flags & IFF_UP) + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + if (dev->flags & IFF_UP) + ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + + return 0; +} + +static int nr_open(struct net_device *dev) +{ + netif_start_queue(dev); + ax25_listen_register((ax25_address *)dev->dev_addr, NULL); + return 0; +} + +static int nr_close(struct net_device *dev) +{ + ax25_listen_release((ax25_address *)dev->dev_addr, NULL); + netif_stop_queue(dev); + return 0; +} + +static int nr_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = netdev_priv(dev); + dev_kfree_skb(skb); + stats->tx_errors++; + return 0; +} + +static struct net_device_stats *nr_get_stats(struct net_device *dev) +{ + return netdev_priv(dev); +} + +void nr_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->mtu = NR_MAX_PACKET_SIZE; + dev->hard_start_xmit = nr_xmit; + dev->open = nr_open; + dev->stop = nr_close; + + dev->hard_header = nr_header; + dev->hard_header_len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + dev->addr_len = AX25_ADDR_LEN; + dev->type = ARPHRD_NETROM; + dev->tx_queue_len = 40; + dev->rebuild_header = nr_rebuild_header; + dev->set_mac_address = nr_set_mac_address; + + /* New-style flags. */ + dev->flags = 0; + + dev->get_stats = nr_get_stats; +} diff --git a/net/netrom/nr_in.c b/net/netrom/nr_in.c new file mode 100644 index 000000000000..9c44b3794126 --- /dev/null +++ b/net/netrom/nr_in.c @@ -0,0 +1,290 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
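For readers following nr_header() just above: an illustrative sketch of the 20 bytes it lays down for IP-over-NET/ROM. The struct and field names here are descriptive inventions for this note, not kernel definitions; the constants mentioned in the comments (AX25_CBIT, AX25_EBIT, NR_PROTO_IP, NR_PROTOEXT, sysctl_netrom_network_ttl_initialiser) are the ones used in the code above.

/* Reading aid only, not a kernel structure: the header nr_header() writes. */
struct nr_ip_encap_sketch {
	unsigned char source_call[7];	/* shifted AX.25 source callsign; C and E bits
					   cleared, spare SSID bits set              */
	unsigned char dest_call[7];	/* destination callsign; E (end-of-address) set */
	unsigned char ttl;		/* sysctl_netrom_network_ttl_initialiser        */
	unsigned char proto_hi;		/* NR_PROTO_IP where a circuit index would sit  */
	unsigned char proto_lo;		/* NR_PROTO_IP where a circuit id would sit     */
	unsigned char unused[2];	/* two zero bytes                               */
	unsigned char opcode;		/* NR_PROTOEXT, marking an encapsulated IP datagram */
};

The 37/-37 return value appears to be the combined AX.25-plus-NET/ROM header length the caller reserves, returned negative when the destination is not yet known and nr_rebuild_header() will later have to complete it via arp_find().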
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For ip_rcv */ +#include +#include +#include +#include +#include +#include + +static int nr_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) +{ + struct sk_buff *skbo, *skbn = skb; + struct nr_sock *nr = nr_sk(sk); + + skb_pull(skb, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + nr_start_idletimer(sk); + + if (more) { + nr->fraglen += skb->len; + skb_queue_tail(&nr->frag_queue, skb); + return 0; + } + + if (!more && nr->fraglen > 0) { /* End of fragment */ + nr->fraglen += skb->len; + skb_queue_tail(&nr->frag_queue, skb); + + if ((skbn = alloc_skb(nr->fraglen, GFP_ATOMIC)) == NULL) + return 1; + + skbn->h.raw = skbn->data; + + while ((skbo = skb_dequeue(&nr->frag_queue)) != NULL) { + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo); + } + + nr->fraglen = 0; + } + + return sock_queue_rcv_skb(sk, skbn); +} + +/* + * State machine for state 1, Awaiting Connection State. + * The handling of the timer(s) is in file nr_timer.c. + * Handling of state 0 and connection release is in netrom.c. + */ +static int nr_state1_machine(struct sock *sk, struct sk_buff *skb, + int frametype) +{ + switch (frametype) { + case NR_CONNACK: { + struct nr_sock *nr = nr_sk(sk); + + nr_stop_t1timer(sk); + nr_start_idletimer(sk); + nr->your_index = skb->data[17]; + nr->your_id = skb->data[18]; + nr->vs = 0; + nr->va = 0; + nr->vr = 0; + nr->vl = 0; + nr->state = NR_STATE_3; + nr->n2count = 0; + nr->window = skb->data[20]; + sk->sk_state = TCP_ESTABLISHED; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + } + + case NR_CONNACK | NR_CHOKE_FLAG: + nr_disconnect(sk, ECONNREFUSED); + break; + + default: + break; + } + return 0; +} + +/* + * State machine for state 2, Awaiting Release State. + * The handling of the timer(s) is in file nr_timer.c + * Handling of state 0 and connection release is in netrom.c. + */ +static int nr_state2_machine(struct sock *sk, struct sk_buff *skb, + int frametype) +{ + switch (frametype) { + case NR_CONNACK | NR_CHOKE_FLAG: + nr_disconnect(sk, ECONNRESET); + break; + + case NR_DISCREQ: + nr_write_internal(sk, NR_DISCACK); + + case NR_DISCACK: + nr_disconnect(sk, 0); + break; + + default: + break; + } + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file nr_timer.c + * Handling of state 0 and connection release is in netrom.c. 
+ */ +static int nr_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct nr_sock *nrom = nr_sk(sk); + struct sk_buff_head temp_queue; + struct sk_buff *skbn; + unsigned short save_vr; + unsigned short nr, ns; + int queued = 0; + + nr = skb->data[18]; + ns = skb->data[17]; + + switch (frametype) { + case NR_CONNREQ: + nr_write_internal(sk, NR_CONNACK); + break; + + case NR_DISCREQ: + nr_write_internal(sk, NR_DISCACK); + nr_disconnect(sk, 0); + break; + + case NR_CONNACK | NR_CHOKE_FLAG: + case NR_DISCACK: + nr_disconnect(sk, ECONNRESET); + break; + + case NR_INFOACK: + case NR_INFOACK | NR_CHOKE_FLAG: + case NR_INFOACK | NR_NAK_FLAG: + case NR_INFOACK | NR_NAK_FLAG | NR_CHOKE_FLAG: + if (frametype & NR_CHOKE_FLAG) { + nrom->condition |= NR_COND_PEER_RX_BUSY; + nr_start_t4timer(sk); + } else { + nrom->condition &= ~NR_COND_PEER_RX_BUSY; + nr_stop_t4timer(sk); + } + if (!nr_validate_nr(sk, nr)) { + break; + } + if (frametype & NR_NAK_FLAG) { + nr_frames_acked(sk, nr); + nr_send_nak_frame(sk); + } else { + if (nrom->condition & NR_COND_PEER_RX_BUSY) { + nr_frames_acked(sk, nr); + } else { + nr_check_iframes_acked(sk, nr); + } + } + break; + + case NR_INFO: + case NR_INFO | NR_NAK_FLAG: + case NR_INFO | NR_CHOKE_FLAG: + case NR_INFO | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG: + case NR_INFO | NR_CHOKE_FLAG | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_MORE_FLAG: + case NR_INFO | NR_NAK_FLAG | NR_CHOKE_FLAG | NR_MORE_FLAG: + if (frametype & NR_CHOKE_FLAG) { + nrom->condition |= NR_COND_PEER_RX_BUSY; + nr_start_t4timer(sk); + } else { + nrom->condition &= ~NR_COND_PEER_RX_BUSY; + nr_stop_t4timer(sk); + } + if (nr_validate_nr(sk, nr)) { + if (frametype & NR_NAK_FLAG) { + nr_frames_acked(sk, nr); + nr_send_nak_frame(sk); + } else { + if (nrom->condition & NR_COND_PEER_RX_BUSY) { + nr_frames_acked(sk, nr); + } else { + nr_check_iframes_acked(sk, nr); + } + } + } + queued = 1; + skb_queue_head(&nrom->reseq_queue, skb); + if (nrom->condition & NR_COND_OWN_RX_BUSY) + break; + skb_queue_head_init(&temp_queue); + do { + save_vr = nrom->vr; + while ((skbn = skb_dequeue(&nrom->reseq_queue)) != NULL) { + ns = skbn->data[17]; + if (ns == nrom->vr) { + if (nr_queue_rx_frame(sk, skbn, frametype & NR_MORE_FLAG) == 0) { + nrom->vr = (nrom->vr + 1) % NR_MODULUS; + } else { + nrom->condition |= NR_COND_OWN_RX_BUSY; + skb_queue_tail(&temp_queue, skbn); + } + } else if (nr_in_rx_window(sk, ns)) { + skb_queue_tail(&temp_queue, skbn); + } else { + kfree_skb(skbn); + } + } + while ((skbn = skb_dequeue(&temp_queue)) != NULL) { + skb_queue_tail(&nrom->reseq_queue, skbn); + } + } while (save_vr != nrom->vr); + /* + * Window is full, ack it immediately. 
+ */ + if (((nrom->vl + nrom->window) % NR_MODULUS) == nrom->vr) { + nr_enquiry_response(sk); + } else { + if (!(nrom->condition & NR_COND_ACK_PENDING)) { + nrom->condition |= NR_COND_ACK_PENDING; + nr_start_t2timer(sk); + } + } + break; + + default: + break; + } + return queued; +} + +/* Higher level upcall for a LAPB frame - called with sk locked */ +int nr_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + struct nr_sock *nr = nr_sk(sk); + int queued = 0, frametype; + + if (nr->state == NR_STATE_0) + return 0; + + frametype = skb->data[19]; + + switch (nr->state) { + case NR_STATE_1: + queued = nr_state1_machine(sk, skb, frametype); + break; + case NR_STATE_2: + queued = nr_state2_machine(sk, skb, frametype); + break; + case NR_STATE_3: + queued = nr_state3_machine(sk, skb, frametype); + break; + } + + nr_kick(sk); + + return queued; +} diff --git a/net/netrom/nr_loopback.c b/net/netrom/nr_loopback.c new file mode 100644 index 000000000000..165b2abce110 --- /dev/null +++ b/net/netrom/nr_loopback.c @@ -0,0 +1,76 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) + */ +#include +#include +#include +#include +#include +#include +#include + +static void nr_loopback_timer(unsigned long); + +static struct sk_buff_head loopback_queue; +static struct timer_list loopback_timer = TIMER_INITIALIZER(nr_loopback_timer, 0, 0); + +void __init nr_loopback_init(void) +{ + skb_queue_head_init(&loopback_queue); +} + +static inline int nr_loopback_running(void) +{ + return timer_pending(&loopback_timer); +} + +int nr_loopback_queue(struct sk_buff *skb) +{ + struct sk_buff *skbn; + + if ((skbn = alloc_skb(skb->len, GFP_ATOMIC)) != NULL) { + memcpy(skb_put(skbn, skb->len), skb->data, skb->len); + skbn->h.raw = skbn->data; + + skb_queue_tail(&loopback_queue, skbn); + + if (!nr_loopback_running()) + mod_timer(&loopback_timer, jiffies + 10); + } + + kfree_skb(skb); + return 1; +} + +static void nr_loopback_timer(unsigned long param) +{ + struct sk_buff *skb; + ax25_address *nr_dest; + struct net_device *dev; + + if ((skb = skb_dequeue(&loopback_queue)) != NULL) { + nr_dest = (ax25_address *)(skb->data + 7); + + dev = nr_dev_get(nr_dest); + + if (dev == NULL || nr_rx_frame(skb, dev) == 0) + kfree_skb(skb); + + if (dev != NULL) + dev_put(dev); + + if (!skb_queue_empty(&loopback_queue) && !nr_loopback_running()) + mod_timer(&loopback_timer, jiffies + 10); + } +} + +void __exit nr_loopback_clear(void) +{ + del_timer_sync(&loopback_timer); + skb_queue_purge(&loopback_queue); +} diff --git a/net/netrom/nr_out.c b/net/netrom/nr_out.c new file mode 100644 index 000000000000..7939ded9c98c --- /dev/null +++ b/net/netrom/nr_out.c @@ -0,0 +1,274 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
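The state machines in nr_in.c above, and nr_out.c and nr_route.c below, index raw bytes of the frame directly (skb->data[14], data[17], data[18], data[19], data[20], and data[15]/data[16] in nr_transmit_refusal() later in the patch). As a reading aid, the offsets as this code uses them; the enumerator names are invented for this note and are not kernel symbols.

/* Reading aid only: byte offsets into skb->data as used throughout this patch. */
enum nr_frame_offset_sketch {
	NR_OFF_SRC_CALL  = 0,	/* 7-byte shifted AX.25 source callsign                */
	NR_OFF_DEST_CALL = 7,	/* 7-byte shifted AX.25 destination callsign           */
	NR_OFF_TTL       = 14,	/* hop counter, checked and decremented in nr_route_frame() */
	NR_OFF_INDEX     = 15,	/* circuit index ("your index" on transmit)            */
	NR_OFF_ID        = 16,	/* circuit id                                          */
	NR_OFF_NS        = 17,	/* N(S) of an INFO frame; "my index" during setup      */
	NR_OFF_NR        = 18,	/* N(R); "my id" during setup                          */
	NR_OFF_OPCODE    = 19,	/* frame type plus CHOKE/NAK/MORE flag bits            */
	NR_OFF_DATA      = 20,	/* payload; the window byte of a NR_CONNACK            */
};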
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Darryl Miles G7LED (dlm@g7led.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This is where all NET/ROM frames pass, except for IP-over-NET/ROM which + * cannot be fragmented in this manner. + */ +void nr_output(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char transport[NR_TRANSPORT_LEN]; + int err, frontlen, len; + + if (skb->len - NR_TRANSPORT_LEN > NR_MAX_PACKET_SIZE) { + /* Save a copy of the Transport Header */ + memcpy(transport, skb->data, NR_TRANSPORT_LEN); + skb_pull(skb, NR_TRANSPORT_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + NR_MAX_PACKET_SIZE, 0, &err)) == NULL) + return; + + skb_reserve(skbn, frontlen); + + len = (NR_MAX_PACKET_SIZE > skb->len) ? skb->len : NR_MAX_PACKET_SIZE; + + /* Copy the user data */ + memcpy(skb_put(skbn, len), skb->data, len); + skb_pull(skb, len); + + /* Duplicate the Transport Header */ + skb_push(skbn, NR_TRANSPORT_LEN); + memcpy(skbn->data, transport, NR_TRANSPORT_LEN); + + if (skb->len > 0) + skbn->data[4] |= NR_MORE_FLAG; + + skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ + } + + kfree_skb(skb); + } else { + skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ + } + + nr_kick(sk); +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. + */ +static void nr_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + struct nr_sock *nr = nr_sk(sk); + + if (skb == NULL) + return; + + skb->data[2] = nr->vs; + skb->data[3] = nr->vr; + + if (nr->condition & NR_COND_OWN_RX_BUSY) + skb->data[4] |= NR_CHOKE_FLAG; + + nr_start_idletimer(sk); + + nr_transmit_buffer(sk, skb); +} + +void nr_send_nak_frame(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + struct nr_sock *nr = nr_sk(sk); + + if ((skb = skb_peek(&nr->ack_queue)) == NULL) + return; + + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) + return; + + skbn->data[2] = nr->va; + skbn->data[3] = nr->vr; + + if (nr->condition & NR_COND_OWN_RX_BUSY) + skbn->data[4] |= NR_CHOKE_FLAG; + + nr_transmit_buffer(sk, skbn); + + nr->condition &= ~NR_COND_ACK_PENDING; + nr->vl = nr->vr; + + nr_stop_t1timer(sk); +} + +void nr_kick(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + struct sk_buff *skb, *skbn; + unsigned short start, end; + + if (nr->state != NR_STATE_3) + return; + + if (nr->condition & NR_COND_PEER_RX_BUSY) + return; + + if (!skb_peek(&sk->sk_write_queue)) + return; + + start = (skb_peek(&nr->ack_queue) == NULL) ? nr->va : nr->vs; + end = (nr->va + nr->window) % NR_MODULUS; + + if (start == end) + return; + + nr->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + /* + * Dequeue the frame and copy it. + */ + skb = skb_dequeue(&sk->sk_write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->sk_write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + + /* + * Transmit the frame copy. + */ + nr_send_iframe(sk, skbn); + + nr->vs = (nr->vs + 1) % NR_MODULUS; + + /* + * Requeue the original data frame. 
+ */ + skb_queue_tail(&nr->ack_queue, skb); + + } while (nr->vs != end && + (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); + + nr->vl = nr->vr; + nr->condition &= ~NR_COND_ACK_PENDING; + + if (!nr_t1timer_running(sk)) + nr_start_t1timer(sk); +} + +void nr_transmit_buffer(struct sock *sk, struct sk_buff *skb) +{ + struct nr_sock *nr = nr_sk(sk); + unsigned char *dptr; + + /* + * Add the protocol byte and network header. + */ + dptr = skb_push(skb, NR_NETWORK_LEN); + + memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + memcpy(dptr, &nr->dest_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] |= AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + *dptr++ = sysctl_netrom_network_ttl_initialiser; + + if (!nr_route_frame(skb, NULL)) { + kfree_skb(skb); + nr_disconnect(sk, ENETUNREACH); + } +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void nr_establish_data_link(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + nr->condition = 0x00; + nr->n2count = 0; + + nr_write_internal(sk, NR_CONNREQ); + + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + nr_start_t1timer(sk); +} + +/* + * Never send a NAK when we are CHOKEd. + */ +void nr_enquiry_response(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + int frametype = NR_INFOACK; + + if (nr->condition & NR_COND_OWN_RX_BUSY) { + frametype |= NR_CHOKE_FLAG; + } else { + if (skb_peek(&nr->reseq_queue) != NULL) + frametype |= NR_NAK_FLAG; + } + + nr_write_internal(sk, frametype); + + nr->vl = nr->vr; + nr->condition &= ~NR_COND_ACK_PENDING; +} + +void nr_check_iframes_acked(struct sock *sk, unsigned short nr) +{ + struct nr_sock *nrom = nr_sk(sk); + + if (nrom->vs == nr) { + nr_frames_acked(sk, nr); + nr_stop_t1timer(sk); + nrom->n2count = 0; + } else { + if (nrom->va != nr) { + nr_frames_acked(sk, nr); + nr_start_t1timer(sk); + } + } +} diff --git a/net/netrom/nr_route.c b/net/netrom/nr_route.c new file mode 100644 index 000000000000..7a86b36cba50 --- /dev/null +++ b/net/netrom/nr_route.c @@ -0,0 +1,1041 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
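nr_output(), at the top of nr_out.c above, splits any payload larger than NR_MAX_PACKET_SIZE into maximum-size pieces, re-prefixes each with the saved 5-byte transport header, and sets NR_MORE_FLAG on every piece except the last; nr_queue_rx_frame() in nr_in.c reassembles them from frag_queue. A minimal stand-alone sketch of that arithmetic, assuming NR_MAX_PACKET_SIZE is 236 as in the NET/ROM headers; the 600-byte payload is hypothetical.

#include <stdio.h>

#define NR_MAX_PACKET_SIZE 236

int main(void)
{
	int payload = 600;	/* hypothetical sendmsg() payload, transport header excluded */
	int offset, len;

	for (offset = 0; offset < payload; offset += len) {
		len = payload - offset > NR_MAX_PACKET_SIZE ?
			NR_MAX_PACKET_SIZE : payload - offset;
		/* every fragment except the last carries NR_MORE_FLAG, as in nr_output() */
		printf("fragment of %d bytes%s\n", len,
		       offset + len < payload ? " (NR_MORE_FLAG set)" : "");
	}
	return 0;	/* prints 236, 236, 128; the first two carry NR_MORE_FLAG */
}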
+ * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright Tomi Manninen OH2BNS (oh2bns@sral.fi) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int nr_neigh_no = 1; + +static HLIST_HEAD(nr_node_list); +static DEFINE_SPINLOCK(nr_node_list_lock); +static HLIST_HEAD(nr_neigh_list); +static DEFINE_SPINLOCK(nr_neigh_list_lock); + +static struct nr_node *nr_node_get(ax25_address *callsign) +{ + struct nr_node *found = NULL; + struct nr_node *nr_node; + struct hlist_node *node; + + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each(nr_node, node, &nr_node_list) + if (ax25cmp(callsign, &nr_node->callsign) == 0) { + nr_node_hold(nr_node); + found = nr_node; + break; + } + spin_unlock_bh(&nr_node_list_lock); + return found; +} + +static struct nr_neigh *nr_neigh_get_dev(ax25_address *callsign, + struct net_device *dev) +{ + struct nr_neigh *found = NULL; + struct nr_neigh *nr_neigh; + struct hlist_node *node; + + spin_lock_bh(&nr_neigh_list_lock); + nr_neigh_for_each(nr_neigh, node, &nr_neigh_list) + if (ax25cmp(callsign, &nr_neigh->callsign) == 0 && + nr_neigh->dev == dev) { + nr_neigh_hold(nr_neigh); + found = nr_neigh; + break; + } + spin_unlock_bh(&nr_neigh_list_lock); + return found; +} + +static void nr_remove_neigh(struct nr_neigh *); + +/* + * Add a new route to a node, and in the process add the node and the + * neighbour if it is new. + */ +static int nr_add_node(ax25_address *nr, const char *mnemonic, ax25_address *ax25, + ax25_digi *ax25_digi, struct net_device *dev, int quality, int obs_count) +{ + struct nr_node *nr_node; + struct nr_neigh *nr_neigh; + struct nr_route nr_route; + int i, found; + struct net_device *odev; + + if ((odev=nr_dev_get(nr)) != NULL) { /* Can't add routes to ourself */ + dev_put(odev); + return -EINVAL; + } + + nr_node = nr_node_get(nr); + + nr_neigh = nr_neigh_get_dev(ax25, dev); + + /* + * The L2 link to a neighbour has failed in the past + * and now a frame comes from this neighbour. We assume + * it was a temporary trouble with the link and reset the + * routes now (and not wait for a node broadcast). 
+ */ + if (nr_neigh != NULL && nr_neigh->failed != 0 && quality == 0) { + struct nr_node *nr_nodet; + struct hlist_node *node; + + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each(nr_nodet, node, &nr_node_list) { + nr_node_lock(nr_nodet); + for (i = 0; i < nr_nodet->count; i++) + if (nr_nodet->routes[i].neighbour == nr_neigh) + if (i < nr_nodet->which) + nr_nodet->which = i; + nr_node_unlock(nr_nodet); + } + spin_unlock_bh(&nr_node_list_lock); + } + + if (nr_neigh != NULL) + nr_neigh->failed = 0; + + if (quality == 0 && nr_neigh != NULL && nr_node != NULL) { + nr_neigh_put(nr_neigh); + nr_node_put(nr_node); + return 0; + } + + if (nr_neigh == NULL) { + if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) { + if (nr_node) + nr_node_put(nr_node); + return -ENOMEM; + } + + nr_neigh->callsign = *ax25; + nr_neigh->digipeat = NULL; + nr_neigh->ax25 = NULL; + nr_neigh->dev = dev; + nr_neigh->quality = sysctl_netrom_default_path_quality; + nr_neigh->locked = 0; + nr_neigh->count = 0; + nr_neigh->number = nr_neigh_no++; + nr_neigh->failed = 0; + atomic_set(&nr_neigh->refcount, 1); + + if (ax25_digi != NULL && ax25_digi->ndigi > 0) { + if ((nr_neigh->digipeat = kmalloc(sizeof(*ax25_digi), GFP_KERNEL)) == NULL) { + kfree(nr_neigh); + if (nr_node) + nr_node_put(nr_node); + return -ENOMEM; + } + memcpy(nr_neigh->digipeat, ax25_digi, + sizeof(*ax25_digi)); + } + + spin_lock_bh(&nr_neigh_list_lock); + hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); + nr_neigh_hold(nr_neigh); + spin_unlock_bh(&nr_neigh_list_lock); + } + + if (quality != 0 && ax25cmp(nr, ax25) == 0 && !nr_neigh->locked) + nr_neigh->quality = quality; + + if (nr_node == NULL) { + if ((nr_node = kmalloc(sizeof(*nr_node), GFP_ATOMIC)) == NULL) { + if (nr_neigh) + nr_neigh_put(nr_neigh); + return -ENOMEM; + } + + nr_node->callsign = *nr; + strcpy(nr_node->mnemonic, mnemonic); + + nr_node->which = 0; + nr_node->count = 1; + atomic_set(&nr_node->refcount, 1); + spin_lock_init(&nr_node->node_lock); + + nr_node->routes[0].quality = quality; + nr_node->routes[0].obs_count = obs_count; + nr_node->routes[0].neighbour = nr_neigh; + + nr_neigh_hold(nr_neigh); + nr_neigh->count++; + + spin_lock_bh(&nr_node_list_lock); + hlist_add_head(&nr_node->node_node, &nr_node_list); + /* refcount initialized at 1 */ + spin_unlock_bh(&nr_node_list_lock); + + return 0; + } + nr_node_lock(nr_node); + + if (quality != 0) + strcpy(nr_node->mnemonic, mnemonic); + + for (found = 0, i = 0; i < nr_node->count; i++) { + if (nr_node->routes[i].neighbour == nr_neigh) { + nr_node->routes[i].quality = quality; + nr_node->routes[i].obs_count = obs_count; + found = 1; + break; + } + } + + if (!found) { + /* We have space at the bottom, slot it in */ + if (nr_node->count < 3) { + nr_node->routes[2] = nr_node->routes[1]; + nr_node->routes[1] = nr_node->routes[0]; + + nr_node->routes[0].quality = quality; + nr_node->routes[0].obs_count = obs_count; + nr_node->routes[0].neighbour = nr_neigh; + + nr_node->which++; + nr_node->count++; + nr_neigh_hold(nr_neigh); + nr_neigh->count++; + } else { + /* It must be better than the worst */ + if (quality > nr_node->routes[2].quality) { + nr_node->routes[2].neighbour->count--; + nr_neigh_put(nr_node->routes[2].neighbour); + + if (nr_node->routes[2].neighbour->count == 0 && !nr_node->routes[2].neighbour->locked) + nr_remove_neigh(nr_node->routes[2].neighbour); + + nr_node->routes[2].quality = quality; + nr_node->routes[2].obs_count = obs_count; + nr_node->routes[2].neighbour = nr_neigh; + + nr_neigh_hold(nr_neigh); + 
nr_neigh->count++; + } + } + } + + /* Now re-sort the routes in quality order */ + switch (nr_node->count) { + case 3: + if (nr_node->routes[1].quality > nr_node->routes[0].quality) { + switch (nr_node->which) { + case 0: nr_node->which = 1; break; + case 1: nr_node->which = 0; break; + default: break; + } + nr_route = nr_node->routes[0]; + nr_node->routes[0] = nr_node->routes[1]; + nr_node->routes[1] = nr_route; + } + if (nr_node->routes[2].quality > nr_node->routes[1].quality) { + switch (nr_node->which) { + case 1: nr_node->which = 2; + break; + + case 2: nr_node->which = 1; + break; + + default: + break; + } + nr_route = nr_node->routes[1]; + nr_node->routes[1] = nr_node->routes[2]; + nr_node->routes[2] = nr_route; + } + case 2: + if (nr_node->routes[1].quality > nr_node->routes[0].quality) { + switch (nr_node->which) { + case 0: nr_node->which = 1; + break; + + case 1: nr_node->which = 0; + break; + + default: break; + } + nr_route = nr_node->routes[0]; + nr_node->routes[0] = nr_node->routes[1]; + nr_node->routes[1] = nr_route; + } + case 1: + break; + } + + for (i = 0; i < nr_node->count; i++) { + if (nr_node->routes[i].neighbour == nr_neigh) { + if (i < nr_node->which) + nr_node->which = i; + break; + } + } + + nr_neigh_put(nr_neigh); + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return 0; +} + +static inline void __nr_remove_node(struct nr_node *nr_node) +{ + hlist_del_init(&nr_node->node_node); + nr_node_put(nr_node); +} + +#define nr_remove_node_locked(__node) \ + __nr_remove_node(__node) + +static void nr_remove_node(struct nr_node *nr_node) +{ + spin_lock_bh(&nr_node_list_lock); + __nr_remove_node(nr_node); + spin_unlock_bh(&nr_node_list_lock); +} + +static inline void __nr_remove_neigh(struct nr_neigh *nr_neigh) +{ + hlist_del_init(&nr_neigh->neigh_node); + nr_neigh_put(nr_neigh); +} + +#define nr_remove_neigh_locked(__neigh) \ + __nr_remove_neigh(__neigh) + +static void nr_remove_neigh(struct nr_neigh *nr_neigh) +{ + spin_lock_bh(&nr_neigh_list_lock); + __nr_remove_neigh(nr_neigh); + spin_unlock_bh(&nr_neigh_list_lock); +} + +/* + * "Delete" a node. Strictly speaking remove a route to a node. The node + * is only deleted if no routes are left to it. + */ +static int nr_del_node(ax25_address *callsign, ax25_address *neighbour, struct net_device *dev) +{ + struct nr_node *nr_node; + struct nr_neigh *nr_neigh; + int i; + + nr_node = nr_node_get(callsign); + + if (nr_node == NULL) + return -EINVAL; + + nr_neigh = nr_neigh_get_dev(neighbour, dev); + + if (nr_neigh == NULL) { + nr_node_put(nr_node); + return -EINVAL; + } + + nr_node_lock(nr_node); + for (i = 0; i < nr_node->count; i++) { + if (nr_node->routes[i].neighbour == nr_neigh) { + nr_neigh->count--; + nr_neigh_put(nr_neigh); + + if (nr_neigh->count == 0 && !nr_neigh->locked) + nr_remove_neigh(nr_neigh); + nr_neigh_put(nr_neigh); + + nr_node->count--; + + if (nr_node->count == 0) { + nr_remove_node(nr_node); + } else { + switch (i) { + case 0: + nr_node->routes[0] = nr_node->routes[1]; + case 1: + nr_node->routes[1] = nr_node->routes[2]; + case 2: + break; + } + nr_node_put(nr_node); + } + nr_node_unlock(nr_node); + + return 0; + } + } + nr_neigh_put(nr_neigh); + nr_node_unlock(nr_node); + nr_node_put(nr_node); + + return -EINVAL; +} + +/* + * Lock a neighbour with a quality. 
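A sketch of the per-node routing state that nr_add_node() above maintains, as I read it; the struct name and layout here are illustrative only, the real definitions live in the NET/ROM headers. Each node keeps at most three candidate neighbours, kept sorted by quality, and `which' selects the route that nr_route_frame() currently forwards through.

/* Reading aid only, not the kernel's definition. */
struct nr_node_routes_sketch {
	unsigned int which;		/* index of the route currently in use          */
	unsigned int count;		/* 1..3 routes are kept                         */
	struct {
		unsigned int quality;	/* 0..255, higher is preferred                  */
		unsigned int obs_count;	/* aged by SIOCNRDECOBS; 0 marks a locked entry */
		void *neighbour;	/* the nr_neigh this route goes through         */
	} routes[3];
};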
+ */ +static int nr_add_neigh(ax25_address *callsign, ax25_digi *ax25_digi, struct net_device *dev, unsigned int quality) +{ + struct nr_neigh *nr_neigh; + + nr_neigh = nr_neigh_get_dev(callsign, dev); + if (nr_neigh) { + nr_neigh->quality = quality; + nr_neigh->locked = 1; + nr_neigh_put(nr_neigh); + return 0; + } + + if ((nr_neigh = kmalloc(sizeof(*nr_neigh), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + nr_neigh->callsign = *callsign; + nr_neigh->digipeat = NULL; + nr_neigh->ax25 = NULL; + nr_neigh->dev = dev; + nr_neigh->quality = quality; + nr_neigh->locked = 1; + nr_neigh->count = 0; + nr_neigh->number = nr_neigh_no++; + nr_neigh->failed = 0; + atomic_set(&nr_neigh->refcount, 1); + + if (ax25_digi != NULL && ax25_digi->ndigi > 0) { + if ((nr_neigh->digipeat = kmalloc(sizeof(*ax25_digi), GFP_KERNEL)) == NULL) { + kfree(nr_neigh); + return -ENOMEM; + } + memcpy(nr_neigh->digipeat, ax25_digi, sizeof(*ax25_digi)); + } + + spin_lock_bh(&nr_neigh_list_lock); + hlist_add_head(&nr_neigh->neigh_node, &nr_neigh_list); + /* refcount is initialized at 1 */ + spin_unlock_bh(&nr_neigh_list_lock); + + return 0; +} + +/* + * "Delete" a neighbour. The neighbour is only removed if the number + * of nodes that may use it is zero. + */ +static int nr_del_neigh(ax25_address *callsign, struct net_device *dev, unsigned int quality) +{ + struct nr_neigh *nr_neigh; + + nr_neigh = nr_neigh_get_dev(callsign, dev); + + if (nr_neigh == NULL) return -EINVAL; + + nr_neigh->quality = quality; + nr_neigh->locked = 0; + + if (nr_neigh->count == 0) + nr_remove_neigh(nr_neigh); + nr_neigh_put(nr_neigh); + + return 0; +} + +/* + * Decrement the obsolescence count by one. If a route is reduced to a + * count of zero, remove it. Also remove any unlocked neighbours with + * zero nodes routing via it. + */ +static int nr_dec_obs(void) +{ + struct nr_neigh *nr_neigh; + struct nr_node *s; + struct hlist_node *node, *nodet; + int i; + + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each_safe(s, node, nodet, &nr_node_list) { + nr_node_lock(s); + for (i = 0; i < s->count; i++) { + switch (s->routes[i].obs_count) { + case 0: /* A locked entry */ + break; + + case 1: /* From 1 -> 0 */ + nr_neigh = s->routes[i].neighbour; + + nr_neigh->count--; + nr_neigh_put(nr_neigh); + + if (nr_neigh->count == 0 && !nr_neigh->locked) + nr_remove_neigh(nr_neigh); + + s->count--; + + switch (i) { + case 0: + s->routes[0] = s->routes[1]; + case 1: + s->routes[1] = s->routes[2]; + case 2: + break; + } + break; + + default: + s->routes[i].obs_count--; + break; + + } + } + + if (s->count <= 0) + nr_remove_node_locked(s); + nr_node_unlock(s); + } + spin_unlock_bh(&nr_node_list_lock); + + return 0; +} + +/* + * A device has been removed. Remove its routes and neighbours. 
+ */ +void nr_rt_device_down(struct net_device *dev) +{ + struct nr_neigh *s; + struct hlist_node *node, *nodet, *node2, *node2t; + struct nr_node *t; + int i; + + spin_lock_bh(&nr_neigh_list_lock); + nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) { + if (s->dev == dev) { + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each_safe(t, node2, node2t, &nr_node_list) { + nr_node_lock(t); + for (i = 0; i < t->count; i++) { + if (t->routes[i].neighbour == s) { + t->count--; + + switch (i) { + case 0: + t->routes[0] = t->routes[1]; + case 1: + t->routes[1] = t->routes[2]; + case 2: + break; + } + } + } + + if (t->count <= 0) + nr_remove_node_locked(t); + nr_node_unlock(t); + } + spin_unlock_bh(&nr_node_list_lock); + + nr_remove_neigh_locked(s); + } + } + spin_unlock_bh(&nr_neigh_list_lock); +} + +/* + * Check that the device given is a valid AX.25 interface that is "up". + * Or a valid ethernet interface with an AX.25 callsign binding. + */ +static struct net_device *nr_ax25_dev_get(char *devname) +{ + struct net_device *dev; + + if ((dev = dev_get_by_name(devname)) == NULL) + return NULL; + + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) + return dev; + + dev_put(dev); + return NULL; +} + +/* + * Find the first active NET/ROM device, usually "nr0". + */ +struct net_device *nr_dev_first(void) +{ + struct net_device *dev, *first = NULL; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM) + if (first == NULL || strncmp(dev->name, first->name, 3) < 0) + first = dev; + } + if (first) + dev_hold(first); + read_unlock(&dev_base_lock); + + return first; +} + +/* + * Find the NET/ROM device for the given callsign. + */ +struct net_device *nr_dev_get(ax25_address *addr) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_NETROM && ax25cmp(addr, (ax25_address *)dev->dev_addr) == 0) { + dev_hold(dev); + goto out; + } + } +out: + read_unlock(&dev_base_lock); + return dev; +} + +static ax25_digi *nr_call_to_digi(int ndigis, ax25_address *digipeaters) +{ + static ax25_digi ax25_digi; + int i; + + if (ndigis == 0) + return NULL; + + for (i = 0; i < ndigis; i++) { + ax25_digi.calls[i] = digipeaters[i]; + ax25_digi.repeated[i] = 0; + } + + ax25_digi.ndigi = ndigis; + ax25_digi.lastrepeat = -1; + + return &ax25_digi; +} + +/* + * Handle the ioctls that control the routing functions. 
+ */ +int nr_rt_ioctl(unsigned int cmd, void __user *arg) +{ + struct nr_route_struct nr_route; + struct net_device *dev; + int ret; + + switch (cmd) { + case SIOCADDRT: + if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) + return -EFAULT; + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) + return -EINVAL; + if (nr_route.ndigis < 0 || nr_route.ndigis > AX25_MAX_DIGIS) { + dev_put(dev); + return -EINVAL; + } + switch (nr_route.type) { + case NETROM_NODE: + ret = nr_add_node(&nr_route.callsign, + nr_route.mnemonic, + &nr_route.neighbour, + nr_call_to_digi(nr_route.ndigis, nr_route.digipeaters), + dev, nr_route.quality, + nr_route.obs_count); + break; + case NETROM_NEIGH: + ret = nr_add_neigh(&nr_route.callsign, + nr_call_to_digi(nr_route.ndigis, nr_route.digipeaters), + dev, nr_route.quality); + break; + default: + ret = -EINVAL; + } + dev_put(dev); + return ret; + + case SIOCDELRT: + if (copy_from_user(&nr_route, arg, sizeof(struct nr_route_struct))) + return -EFAULT; + if ((dev = nr_ax25_dev_get(nr_route.device)) == NULL) + return -EINVAL; + switch (nr_route.type) { + case NETROM_NODE: + ret = nr_del_node(&nr_route.callsign, + &nr_route.neighbour, dev); + break; + case NETROM_NEIGH: + ret = nr_del_neigh(&nr_route.callsign, + dev, nr_route.quality); + break; + default: + ret = -EINVAL; + } + dev_put(dev); + return ret; + + case SIOCNRDECOBS: + return nr_dec_obs(); + + default: + return -EINVAL; + } + + return 0; +} + +/* + * A level 2 link has timed out, therefore it appears to be a poor link, + * then don't use that neighbour until it is reset. + */ +void nr_link_failed(ax25_cb *ax25, int reason) +{ + struct nr_neigh *s, *nr_neigh = NULL; + struct hlist_node *node; + struct nr_node *nr_node = NULL; + + spin_lock_bh(&nr_neigh_list_lock); + nr_neigh_for_each(s, node, &nr_neigh_list) + if (s->ax25 == ax25) { + nr_neigh_hold(s); + nr_neigh = s; + break; + } + spin_unlock_bh(&nr_neigh_list_lock); + + if (nr_neigh == NULL) return; + + nr_neigh->ax25 = NULL; + ax25_cb_put(ax25); + + if (++nr_neigh->failed < sysctl_netrom_link_fails_count) { + nr_neigh_put(nr_neigh); + return; + } + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each(nr_node, node, &nr_node_list) + nr_node_lock(nr_node); + if (nr_node->which < nr_node->count && nr_node->routes[nr_node->which].neighbour == nr_neigh) + nr_node->which++; + nr_node_unlock(nr_node); + spin_unlock_bh(&nr_node_list_lock); + nr_neigh_put(nr_neigh); +} + +/* + * Route a frame to an appropriate AX.25 connection. A NULL ax25_cb + * indicates an internally generated frame. 
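nr_rt_ioctl() above is reached from nr_ioctl() for SIOCADDRT, SIOCDELRT and SIOCNRDECOBS, all gated by CAP_NET_ADMIN. A hypothetical user-space sketch of driving it, assuming the usual <linux/netrom.h> definitions of struct nr_route_struct, NETROM_NODE and SIOCNRDECOBS; the device name, quality and obsolescence values are illustrative, and the AX.25 callsign encoding is deliberately left out.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <linux/netrom.h>	/* struct nr_route_struct, NETROM_NODE, SIOCNRDECOBS */

int main(void)
{
	struct nr_route_struct rt;
	int fd = socket(AF_NETROM, SOCK_SEQPACKET, 0);

	if (fd < 0) {
		perror("socket(AF_NETROM)");
		return 1;
	}

	/* Age every learned route by one obsolescence step, as nr_dec_obs() does. */
	if (ioctl(fd, SIOCNRDECOBS, 0) < 0)
		perror("SIOCNRDECOBS");

	/* Shape of a node-route addition.  The callsign and neighbour fields must
	 * hold shifted AX.25 addresses (for instance filled in with libax25's
	 * ax25_aton_entry()); that step is omitted here, so this call only
	 * illustrates which fields nr_rt_ioctl() consumes. */
	memset(&rt, 0, sizeof(rt));
	rt.type      = NETROM_NODE;
	rt.quality   = 192;		/* illustrative, clamped to 0..255 by the sysctl limits */
	rt.obs_count = 6;		/* illustrative                                         */
	rt.ndigis    = 0;
	strncpy(rt.device, "ax0", sizeof(rt.device) - 1);
	strncpy(rt.mnemonic, "#NODE", sizeof(rt.mnemonic) - 1);
	if (ioctl(fd, SIOCADDRT, &rt) < 0)
		perror("SIOCADDRT");

	return 0;
}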
+ */ +int nr_route_frame(struct sk_buff *skb, ax25_cb *ax25) +{ + ax25_address *nr_src, *nr_dest; + struct nr_neigh *nr_neigh; + struct nr_node *nr_node; + struct net_device *dev; + unsigned char *dptr; + ax25_cb *ax25s; + int ret; + struct sk_buff *skbn; + + + nr_src = (ax25_address *)(skb->data + 0); + nr_dest = (ax25_address *)(skb->data + 7); + + if (ax25 != NULL) + nr_add_node(nr_src, "", &ax25->dest_addr, ax25->digipeat, + ax25->ax25_dev->dev, 0, sysctl_netrom_obsolescence_count_initialiser); + + if ((dev = nr_dev_get(nr_dest)) != NULL) { /* Its for me */ + if (ax25 == NULL) /* Its from me */ + ret = nr_loopback_queue(skb); + else + ret = nr_rx_frame(skb, dev); + dev_put(dev); + return ret; + } + + if (!sysctl_netrom_routing_control && ax25 != NULL) + return 0; + + /* Its Time-To-Live has expired */ + if (skb->data[14] == 1) { + return 0; + } + + nr_node = nr_node_get(nr_dest); + if (nr_node == NULL) + return 0; + nr_node_lock(nr_node); + + if (nr_node->which >= nr_node->count) { + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return 0; + } + + nr_neigh = nr_node->routes[nr_node->which].neighbour; + + if ((dev = nr_dev_first()) == NULL) { + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return 0; + } + + /* We are going to change the netrom headers so we should get our + own skb, we also did not know until now how much header space + we had to reserve... - RXQ */ + if ((skbn=skb_copy_expand(skb, dev->hard_header_len, 0, GFP_ATOMIC)) == NULL) { + nr_node_unlock(nr_node); + nr_node_put(nr_node); + dev_put(dev); + return 0; + } + kfree_skb(skb); + skb=skbn; + skb->data[14]--; + + dptr = skb_push(skb, 1); + *dptr = AX25_P_NETROM; + + ax25s = ax25_send_frame(skb, 256, (ax25_address *)dev->dev_addr, &nr_neigh->callsign, nr_neigh->digipeat, nr_neigh->dev); + if (nr_neigh->ax25 && ax25s) { + /* We were already holding this ax25_cb */ + ax25_cb_put(ax25s); + } + nr_neigh->ax25 = ax25s; + + dev_put(dev); + ret = (nr_neigh->ax25 != NULL); + nr_node_unlock(nr_node); + nr_node_put(nr_node); + return ret; +} + +#ifdef CONFIG_PROC_FS + +static void *nr_node_start(struct seq_file *seq, loff_t *pos) +{ + struct nr_node *nr_node; + struct hlist_node *node; + int i = 1; + + spin_lock_bh(&nr_node_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + nr_node_for_each(nr_node, node, &nr_node_list) { + if (i == *pos) + return nr_node; + ++i; + } + + return NULL; +} + +static void *nr_node_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct hlist_node *node; + ++*pos; + + node = (v == SEQ_START_TOKEN) + ? nr_node_list.first + : ((struct nr_node *)v)->node_node.next; + + return hlist_entry(node, struct nr_node, node_node); +} + +static void nr_node_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&nr_node_list_lock); +} + +static int nr_node_show(struct seq_file *seq, void *v) +{ + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "callsign mnemonic w n qual obs neigh qual obs neigh qual obs neigh\n"); + else { + struct nr_node *nr_node = v; + nr_node_lock(nr_node); + seq_printf(seq, "%-9s %-7s %d %d", + ax2asc(&nr_node->callsign), + (nr_node->mnemonic[0] == '\0') ? 
"*" : nr_node->mnemonic, + nr_node->which + 1, + nr_node->count); + + for (i = 0; i < nr_node->count; i++) { + seq_printf(seq, " %3d %d %05d", + nr_node->routes[i].quality, + nr_node->routes[i].obs_count, + nr_node->routes[i].neighbour->number); + } + nr_node_unlock(nr_node); + + seq_puts(seq, "\n"); + } + return 0; +} + +static struct seq_operations nr_node_seqops = { + .start = nr_node_start, + .next = nr_node_next, + .stop = nr_node_stop, + .show = nr_node_show, +}; + +static int nr_node_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nr_node_seqops); +} + +struct file_operations nr_nodes_fops = { + .owner = THIS_MODULE, + .open = nr_node_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *nr_neigh_start(struct seq_file *seq, loff_t *pos) +{ + struct nr_neigh *nr_neigh; + struct hlist_node *node; + int i = 1; + + spin_lock_bh(&nr_neigh_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + nr_neigh_for_each(nr_neigh, node, &nr_neigh_list) { + if (i == *pos) + return nr_neigh; + } + return NULL; +} + +static void *nr_neigh_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct hlist_node *node; + ++*pos; + + node = (v == SEQ_START_TOKEN) + ? nr_neigh_list.first + : ((struct nr_neigh *)v)->neigh_node.next; + + return hlist_entry(node, struct nr_neigh, neigh_node); +} + +static void nr_neigh_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&nr_neigh_list_lock); +} + +static int nr_neigh_show(struct seq_file *seq, void *v) +{ + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "addr callsign dev qual lock count failed digipeaters\n"); + else { + struct nr_neigh *nr_neigh = v; + + seq_printf(seq, "%05d %-9s %-4s %3d %d %3d %3d", + nr_neigh->number, + ax2asc(&nr_neigh->callsign), + nr_neigh->dev ? nr_neigh->dev->name : "???", + nr_neigh->quality, + nr_neigh->locked, + nr_neigh->count, + nr_neigh->failed); + + if (nr_neigh->digipeat != NULL) { + for (i = 0; i < nr_neigh->digipeat->ndigi; i++) + seq_printf(seq, " %s", + ax2asc(&nr_neigh->digipeat->calls[i])); + } + + seq_puts(seq, "\n"); + } + return 0; +} + +static struct seq_operations nr_neigh_seqops = { + .start = nr_neigh_start, + .next = nr_neigh_next, + .stop = nr_neigh_stop, + .show = nr_neigh_show, +}; + +static int nr_neigh_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &nr_neigh_seqops); +} + +struct file_operations nr_neigh_fops = { + .owner = THIS_MODULE, + .open = nr_neigh_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +/* + * Free all memory associated with the nodes and routes lists. 
+ */ +void __exit nr_rt_free(void) +{ + struct nr_neigh *s = NULL; + struct nr_node *t = NULL; + struct hlist_node *node, *nodet; + + spin_lock_bh(&nr_neigh_list_lock); + spin_lock_bh(&nr_node_list_lock); + nr_node_for_each_safe(t, node, nodet, &nr_node_list) { + nr_node_lock(t); + nr_remove_node_locked(t); + nr_node_unlock(t); + } + nr_neigh_for_each_safe(s, node, nodet, &nr_neigh_list) { + while(s->count) { + s->count--; + nr_neigh_put(s); + } + nr_remove_neigh_locked(s); + } + spin_unlock_bh(&nr_node_list_lock); + spin_unlock_bh(&nr_neigh_list_lock); +} diff --git a/net/netrom/nr_subr.c b/net/netrom/nr_subr.c new file mode 100644 index 000000000000..0627347b14b8 --- /dev/null +++ b/net/netrom/nr_subr.c @@ -0,0 +1,283 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all of the queues of frames. + */ +void nr_clear_queues(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&nr->ack_queue); + skb_queue_purge(&nr->reseq_queue); + skb_queue_purge(&nr->frag_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void nr_frames_acked(struct sock *sk, unsigned short nr) +{ + struct nr_sock *nrom = nr_sk(sk); + struct sk_buff *skb; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (nrom->va != nr) { + while (skb_peek(&nrom->ack_queue) != NULL && nrom->va != nr) { + skb = skb_dequeue(&nrom->ack_queue); + kfree_skb(skb); + nrom->va = (nrom->va + 1) % NR_MODULUS; + } + } +} + +/* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by nr_kick called from the timer. This arrangement handles the + * possibility of an empty output queue. + */ +void nr_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + while ((skb = skb_dequeue(&nr_sk(sk)->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->sk_write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int nr_validate_nr(struct sock *sk, unsigned short nr) +{ + struct nr_sock *nrom = nr_sk(sk); + unsigned short vc = nrom->va; + + while (vc != nrom->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % NR_MODULUS; + } + + return nr == nrom->vs; +} + +/* + * Check that ns is within the receive window. + */ +int nr_in_rx_window(struct sock *sk, unsigned short ns) +{ + struct nr_sock *nr = nr_sk(sk); + unsigned short vc = nr->vr; + unsigned short vt = (nr->vl + nr->window) % NR_MODULUS; + + while (vc != vt) { + if (ns == vc) return 1; + vc = (vc + 1) % NR_MODULUS; + } + + return 0; +} + +/* + * This routine is called when the HDLC layer internally generates a + * control frame. 
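nr_validate_nr() and nr_in_rx_window() above walk the sequence space one step at a time because the numbers wrap. A minimal stand-alone sketch mirroring nr_validate_nr(), assuming NR_MODULUS is 256 (8-bit sequence numbers) as in the NET/ROM headers; the helper name and the sample numbers are hypothetical.

#include <stdio.h>

#define NR_MODULUS 256

/* Mirror of nr_validate_nr(): is `nr' within va..vs inclusive, modulo 256? */
static int in_window(unsigned short va, unsigned short vs, unsigned short nr)
{
	unsigned short vc = va;

	while (vc != vs) {
		if (nr == vc)
			return 1;
		vc = (vc + 1) % NR_MODULUS;
	}
	return nr == vs;
}

int main(void)
{
	/* With va = 253 and vs = 2 the window wraps: 253,254,255,0,1,2 are valid. */
	printf("%d %d %d\n",
	       in_window(253, 2, 255), in_window(253, 2, 2), in_window(253, 2, 10));
	return 0;	/* prints: 1 1 0 */
}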
+ */ +void nr_write_internal(struct sock *sk, int frametype) +{ + struct nr_sock *nr = nr_sk(sk); + struct sk_buff *skb; + unsigned char *dptr; + int len, timeout; + + len = NR_NETWORK_LEN + NR_TRANSPORT_LEN; + + switch (frametype & 0x0F) { + case NR_CONNREQ: + len += 17; + break; + case NR_CONNACK: + len += (nr->bpqext) ? 2 : 1; + break; + case NR_DISCREQ: + case NR_DISCACK: + case NR_INFOACK: + break; + default: + printk(KERN_ERR "NET/ROM: nr_write_internal - invalid frame type %d\n", frametype); + return; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for AX.25 and NET/ROM network header + */ + skb_reserve(skb, NR_NETWORK_LEN); + + dptr = skb_put(skb, skb_tailroom(skb)); + + switch (frametype & 0x0F) { + case NR_CONNREQ: + timeout = nr->t1 / HZ; + *dptr++ = nr->my_index; + *dptr++ = nr->my_id; + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = frametype; + *dptr++ = nr->window; + memcpy(dptr, &nr->user_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + memcpy(dptr, &nr->source_addr, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + *dptr++ = timeout % 256; + *dptr++ = timeout / 256; + break; + + case NR_CONNACK: + *dptr++ = nr->your_index; + *dptr++ = nr->your_id; + *dptr++ = nr->my_index; + *dptr++ = nr->my_id; + *dptr++ = frametype; + *dptr++ = nr->window; + if (nr->bpqext) *dptr++ = sysctl_netrom_network_ttl_initialiser; + break; + + case NR_DISCREQ: + case NR_DISCACK: + *dptr++ = nr->your_index; + *dptr++ = nr->your_id; + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = frametype; + break; + + case NR_INFOACK: + *dptr++ = nr->your_index; + *dptr++ = nr->your_id; + *dptr++ = 0; + *dptr++ = nr->vr; + *dptr++ = frametype; + break; + } + + nr_transmit_buffer(sk, skb); +} + +/* + * This routine is called when a Connect Acknowledge with the Choke Flag + * set is needed to refuse a connection. 
+ */ +void nr_transmit_refusal(struct sk_buff *skb, int mine) +{ + struct sk_buff *skbn; + unsigned char *dptr; + int len; + + len = NR_NETWORK_LEN + NR_TRANSPORT_LEN + 1; + + if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skbn, 0); + + dptr = skb_put(skbn, NR_NETWORK_LEN + NR_TRANSPORT_LEN); + + memcpy(dptr, skb->data + 7, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] &= ~AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + memcpy(dptr, skb->data + 0, AX25_ADDR_LEN); + dptr[6] &= ~AX25_CBIT; + dptr[6] |= AX25_EBIT; + dptr[6] |= AX25_SSSID_SPARE; + dptr += AX25_ADDR_LEN; + + *dptr++ = sysctl_netrom_network_ttl_initialiser; + + if (mine) { + *dptr++ = 0; + *dptr++ = 0; + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; + } else { + *dptr++ = skb->data[15]; + *dptr++ = skb->data[16]; + *dptr++ = 0; + *dptr++ = 0; + } + + *dptr++ = NR_CONNACK | NR_CHOKE_FLAG; + *dptr++ = 0; + + if (!nr_route_frame(skbn, NULL)) + kfree_skb(skbn); +} + +void nr_disconnect(struct sock *sk, int reason) +{ + nr_stop_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + nr_stop_idletimer(sk); + + nr_clear_queues(sk); + + nr_sk(sk)->state = NR_STATE_0; + + sk->sk_state = TCP_CLOSE; + sk->sk_err = reason; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } +} diff --git a/net/netrom/nr_timer.c b/net/netrom/nr_timer.c new file mode 100644 index 000000000000..faabda8088be --- /dev/null +++ b/net/netrom/nr_timer.c @@ -0,0 +1,260 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
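The file starting here, nr_timer.c, drives four per-socket protocol timers plus the generic heartbeat. The summary below is my reading of the expiry handlers that follow and of the sysctl names registered in sysctl_net_netrom.c later in this patch (the retry limit n2 corresponds to transport_maximum_tries); the table itself is an illustrative sketch, not a kernel structure.

#include <stdio.h>

static const struct {
	const char *timer;
	const char *sysctl_knob;	/* under /proc/sys/net/netrom/ */
	const char *on_expiry;
} nr_timers[] = {
	{ "t1",   "transport_timeout",             "retry CONNREQ/DISCREQ or requeue unacked I-frames, up to n2 tries" },
	{ "t2",   "transport_acknowledge_delay",   "flush a pending acknowledgement via nr_enquiry_response()" },
	{ "t4",   "transport_busy_delay",          "clear NR_COND_PEER_RX_BUSY so sending can resume" },
	{ "idle", "transport_no_activity_timeout", "send DISCREQ and tear the circuit down" },
};

int main(void)
{
	unsigned int i;

	for (i = 0; i < sizeof(nr_timers) / sizeof(nr_timers[0]); i++)
		printf("%-5s %-32s %s\n", nr_timers[i].timer,
		       nr_timers[i].sysctl_knob, nr_timers[i].on_expiry);
	return 0;
}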
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void nr_heartbeat_expiry(unsigned long); +static void nr_t1timer_expiry(unsigned long); +static void nr_t2timer_expiry(unsigned long); +static void nr_t4timer_expiry(unsigned long); +static void nr_idletimer_expiry(unsigned long); + +void nr_init_timers(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + init_timer(&nr->t1timer); + nr->t1timer.data = (unsigned long)sk; + nr->t1timer.function = &nr_t1timer_expiry; + + init_timer(&nr->t2timer); + nr->t2timer.data = (unsigned long)sk; + nr->t2timer.function = &nr_t2timer_expiry; + + init_timer(&nr->t4timer); + nr->t4timer.data = (unsigned long)sk; + nr->t4timer.function = &nr_t4timer_expiry; + + init_timer(&nr->idletimer); + nr->idletimer.data = (unsigned long)sk; + nr->idletimer.function = &nr_idletimer_expiry; + + /* initialized by sock_init_data */ + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = &nr_heartbeat_expiry; +} + +void nr_start_t1timer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + mod_timer(&nr->t1timer, jiffies + nr->t1); +} + +void nr_start_t2timer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + mod_timer(&nr->t2timer, jiffies + nr->t2); +} + +void nr_start_t4timer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + mod_timer(&nr->t4timer, jiffies + nr->t4); +} + +void nr_start_idletimer(struct sock *sk) +{ + struct nr_sock *nr = nr_sk(sk); + + if (nr->idle > 0) + mod_timer(&nr->idletimer, jiffies + nr->idle); +} + +void nr_start_heartbeat(struct sock *sk) +{ + mod_timer(&sk->sk_timer, jiffies + 5 * HZ); +} + +void nr_stop_t1timer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->t1timer); +} + +void nr_stop_t2timer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->t2timer); +} + +void nr_stop_t4timer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->t4timer); +} + +void nr_stop_idletimer(struct sock *sk) +{ + del_timer(&nr_sk(sk)->idletimer); +} + +void nr_stop_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +int nr_t1timer_running(struct sock *sk) +{ + return timer_pending(&nr_sk(sk)->t1timer); +} + +static void nr_heartbeat_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + switch (nr->state) { + case NR_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + sock_hold(sk); + nr_destroy_socket(sk); + bh_unlock_sock(sk); + sock_put(sk); + return; + } + break; + + case NR_STATE_3: + /* + * Check for the state of the receive buffer. 
+ */ + if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) && + (nr->condition & NR_COND_OWN_RX_BUSY)) { + nr->condition &= ~NR_COND_OWN_RX_BUSY; + nr->condition &= ~NR_COND_ACK_PENDING; + nr->vl = nr->vr; + nr_write_internal(sk, NR_INFOACK); + break; + } + break; + } + + nr_start_heartbeat(sk); + bh_unlock_sock(sk); +} + +static void nr_t2timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + if (nr->condition & NR_COND_ACK_PENDING) { + nr->condition &= ~NR_COND_ACK_PENDING; + nr_enquiry_response(sk); + } + bh_unlock_sock(sk); +} + +static void nr_t4timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + nr_sk(sk)->condition &= ~NR_COND_PEER_RX_BUSY; + bh_unlock_sock(sk); +} + +static void nr_idletimer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + + nr_clear_queues(sk); + + nr->n2count = 0; + nr_write_internal(sk, NR_DISCREQ); + nr->state = NR_STATE_2; + + nr_start_t1timer(sk); + nr_stop_t2timer(sk); + nr_stop_t4timer(sk); + + sk->sk_state = TCP_CLOSE; + sk->sk_err = 0; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + bh_unlock_sock(sk); +} + +static void nr_t1timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct nr_sock *nr = nr_sk(sk); + + bh_lock_sock(sk); + switch (nr->state) { + case NR_STATE_1: + if (nr->n2count == nr->n2) { + nr_disconnect(sk, ETIMEDOUT); + bh_unlock_sock(sk); + return; + } else { + nr->n2count++; + nr_write_internal(sk, NR_CONNREQ); + } + break; + + case NR_STATE_2: + if (nr->n2count == nr->n2) { + nr_disconnect(sk, ETIMEDOUT); + bh_unlock_sock(sk); + return; + } else { + nr->n2count++; + nr_write_internal(sk, NR_DISCREQ); + } + break; + + case NR_STATE_3: + if (nr->n2count == nr->n2) { + nr_disconnect(sk, ETIMEDOUT); + bh_unlock_sock(sk); + return; + } else { + nr->n2count++; + nr_requeue_frames(sk); + } + break; + } + + nr_start_t1timer(sk); + bh_unlock_sock(sk); +} diff --git a/net/netrom/sysctl_net_netrom.c b/net/netrom/sysctl_net_netrom.c new file mode 100644 index 000000000000..c9ed50382ea7 --- /dev/null +++ b/net/netrom/sysctl_net_netrom.c @@ -0,0 +1,189 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) + */ +#include +#include +#include +#include +#include + +/* + * Values taken from NET/ROM documentation. 
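The knobs registered in the tables below surface as one file per procname under /proc/sys/net/netrom/, and the limit arrays being multiples of HZ suggest the timer-valued ones are stored and displayed in jiffies. A hypothetical user-space sketch reading one of them; the path is derived from the net -> netrom -> procname nesting in the tables that follow.

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/proc/sys/net/netrom/transport_timeout", "r");

	if (f == NULL) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("T1 (apparently in jiffies): %s", buf);
	fclose(f);
	return 0;
}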
+ */ +static int min_quality[] = {0}, max_quality[] = {255}; +static int min_obs[] = {0}, max_obs[] = {255}; +static int min_ttl[] = {0}, max_ttl[] = {255}; +static int min_t1[] = {5 * HZ}; +static int max_t1[] = {600 * HZ}; +static int min_n2[] = {2}, max_n2[] = {127}; +static int min_t2[] = {1 * HZ}; +static int max_t2[] = {60 * HZ}; +static int min_t4[] = {1 * HZ}; +static int max_t4[] = {1000 * HZ}; +static int min_window[] = {1}, max_window[] = {127}; +static int min_idle[] = {0 * HZ}; +static int max_idle[] = {65535 * HZ}; +static int min_route[] = {0}, max_route[] = {1}; +static int min_fails[] = {1}, max_fails[] = {10}; + +static struct ctl_table_header *nr_table_header; + +static ctl_table nr_table[] = { + { + .ctl_name = NET_NETROM_DEFAULT_PATH_QUALITY, + .procname = "default_path_quality", + .data = &sysctl_netrom_default_path_quality, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_quality, + .extra2 = &max_quality + }, + { + .ctl_name = NET_NETROM_OBSOLESCENCE_COUNT_INITIALISER, + .procname = "obsolescence_count_initialiser", + .data = &sysctl_netrom_obsolescence_count_initialiser, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_obs, + .extra2 = &max_obs + }, + { + .ctl_name = NET_NETROM_NETWORK_TTL_INITIALISER, + .procname = "network_ttl_initialiser", + .data = &sysctl_netrom_network_ttl_initialiser, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ttl, + .extra2 = &max_ttl + }, + { + .ctl_name = NET_NETROM_TRANSPORT_TIMEOUT, + .procname = "transport_timeout", + .data = &sysctl_netrom_transport_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_t1, + .extra2 = &max_t1 + }, + { + .ctl_name = NET_NETROM_TRANSPORT_MAXIMUM_TRIES, + .procname = "transport_maximum_tries", + .data = &sysctl_netrom_transport_maximum_tries, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_n2, + .extra2 = &max_n2 + }, + { + .ctl_name = NET_NETROM_TRANSPORT_ACKNOWLEDGE_DELAY, + .procname = "transport_acknowledge_delay", + .data = &sysctl_netrom_transport_acknowledge_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_t2, + .extra2 = &max_t2 + }, + { + .ctl_name = NET_NETROM_TRANSPORT_BUSY_DELAY, + .procname = "transport_busy_delay", + .data = &sysctl_netrom_transport_busy_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_t4, + .extra2 = &max_t4 + }, + { + .ctl_name = NET_NETROM_TRANSPORT_REQUESTED_WINDOW_SIZE, + .procname = "transport_requested_window_size", + .data = &sysctl_netrom_transport_requested_window_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_window, + .extra2 = &max_window + }, + { + .ctl_name = NET_NETROM_TRANSPORT_NO_ACTIVITY_TIMEOUT, + .procname = "transport_no_activity_timeout", + .data = &sysctl_netrom_transport_no_activity_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_idle, + .extra2 = &max_idle + }, + { + .ctl_name = 
NET_NETROM_ROUTING_CONTROL, + .procname = "routing_control", + .data = &sysctl_netrom_routing_control, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_route, + .extra2 = &max_route + }, + { + .ctl_name = NET_NETROM_LINK_FAILS_COUNT, + .procname = "link_fails_count", + .data = &sysctl_netrom_link_fails_count, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_fails, + .extra2 = &max_fails + }, + { .ctl_name = 0 } +}; + +static ctl_table nr_dir_table[] = { + { + .ctl_name = NET_NETROM, + .procname = "netrom", + .mode = 0555, + .child = nr_table + }, + { .ctl_name = 0 } +}; + +static ctl_table nr_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = nr_dir_table + }, + { .ctl_name = 0 } +}; + +void __init nr_register_sysctl(void) +{ + nr_table_header = register_sysctl_table(nr_root_table, 1); +} + +void nr_unregister_sysctl(void) +{ + unregister_sysctl_table(nr_table_header); +} diff --git a/net/nonet.c b/net/nonet.c new file mode 100644 index 000000000000..e5241dceaa57 --- /dev/null +++ b/net/nonet.c @@ -0,0 +1,30 @@ +/* + * net/nonet.c + * + * Dummy functions to allow us to configure network support entirely + * out of the kernel. + * + * Distributed under the terms of the GNU GPL version 2. + * Copyright (c) Matthew Wilcox 2003 + */ + +#include +#include +#include +#include +#include + +void __init sock_init(void) +{ + printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n"); +} + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare) +{ + return -ENXIO; +} + +struct file_operations bad_sock_fops = { + .owner = THIS_MODULE, + .open = sock_no_open, +}; diff --git a/net/packet/Makefile b/net/packet/Makefile new file mode 100644 index 000000000000..81183eabfdec --- /dev/null +++ b/net/packet/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the packet AF. +# + +obj-$(CONFIG_PACKET) += af_packet.o diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c new file mode 100644 index 000000000000..64acea0adaae --- /dev/null +++ b/net/packet/af_packet.c @@ -0,0 +1,1907 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * PACKET - implements raw packet sockets. + * + * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $ + * + * Authors: Ross Biro, + * Fred N. van Kempen, + * Alan Cox, + * + * Fixes: + * Alan Cox : verify_area() now used correctly + * Alan Cox : new skbuff lists, look ma no backlogs! + * Alan Cox : tidied skbuff lists. + * Alan Cox : Now uses generic datagram routines I + * added. Also fixed the peek/read crash + * from all old Linux datagram code. + * Alan Cox : Uses the improved datagram code. + * Alan Cox : Added NULL's for socket options. + * Alan Cox : Re-commented the code. + * Alan Cox : Use new kernel side addressing + * Rob Janssen : Correct MTU usage. + * Dave Platt : Counter leaks caused by incorrect + * interrupt locking and some slightly + * dubious gcc output. Can you read + * compiler: it said _VOLATILE_ + * Richard Kooijman : Timestamp fixes. + * Alan Cox : New buffers. Use sk->mac.raw. + * Alan Cox : sendmsg/recvmsg support. + * Alan Cox : Protocol setting support + * Alexey Kuznetsov : Untied from IPv4 stack. + * Cyrus Durgin : Fixed kerneld for kmod. 
+ * Michal Ostrowski : Module initialization cleanup. + * Ulises Alonso : Frame number limit removal and + * packet_set_ring memory leak. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_INET +#include +#endif + +#define CONFIG_SOCK_PACKET 1 + +/* + Proposed replacement for SIOC{ADD,DEL}MULTI and + IFF_PROMISC, IFF_ALLMULTI flags. + + It is more expensive, but I believe, + it is really correct solution: reentereble, safe and fault tolerant. + + IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping + reference count and global flag, so that real status is + (gflag|(count != 0)), so that we can use obsolete faulty interface + not harming clever users. + */ +#define CONFIG_PACKET_MULTICAST 1 + +/* + Assumptions: + - if device has no dev->hard_header routine, it adds and removes ll header + inside itself. In this case ll header is invisible outside of device, + but higher levels still should reserve dev->hard_header_len. + Some devices are enough clever to reallocate skb, when header + will not fit to reserved space (tunnel), another ones are silly + (PPP). + - packet socket receives packets with pulled ll header, + so that SOCK_RAW should push it back. + +On receive: +----------- + +Incoming, dev->hard_header!=NULL + mac.raw -> ll header + data -> data + +Outgoing, dev->hard_header!=NULL + mac.raw -> ll header + data -> ll header + +Incoming, dev->hard_header==NULL + mac.raw -> UNKNOWN position. It is very likely, that it points to ll header. + PPP makes it, that is wrong, because introduce assymetry + between rx and tx paths. + data -> data + +Outgoing, dev->hard_header==NULL + mac.raw -> data. ll header is still not built! + data -> data + +Resume + If dev->hard_header==NULL we are unlikely to restore sensible ll header. + + +On transmit: +------------ + +dev->hard_header != NULL + mac.raw -> ll header + data -> ll header + +dev->hard_header == NULL (ll header is added by device, we cannot control it) + mac.raw -> data + data -> data + + We should set nh.raw on output to correct posistion, + packet classifier depends on it. + */ + +/* List of all packet sockets. */ +static HLIST_HEAD(packet_sklist); +static DEFINE_RWLOCK(packet_sklist_lock); + +static atomic_t packet_socks_nr; + + +/* Private packet socket structures. 
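For context on what these private structures serve: from userspace this module is an AF_PACKET socket, where SOCK_RAW delivers frames with the link-level header in place and SOCK_DGRAM delivers payload only, reporting the header details through sockaddr_ll, as the comment above describes. A minimal hand-written receive sketch (not part of the patch; requires CAP_NET_RAW):

#include <stdio.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	unsigned char buf[2048];
	struct sockaddr_ll from;
	socklen_t fromlen = sizeof(from);
	ssize_t n;
	int fd;

	fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket(PF_PACKET)");
		return 1;
	}

	n = recvfrom(fd, buf, sizeof(buf), 0,
		     (struct sockaddr *)&from, &fromlen);
	if (n >= 0)
		printf("%zd bytes on ifindex %d, protocol 0x%04x\n",
		       n, from.sll_ifindex, ntohs(from.sll_protocol));
	return 0;
}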
*/ + +#ifdef CONFIG_PACKET_MULTICAST +struct packet_mclist +{ + struct packet_mclist *next; + int ifindex; + int count; + unsigned short type; + unsigned short alen; + unsigned char addr[8]; +}; +#endif +#ifdef CONFIG_PACKET_MMAP +static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing); +#endif + +static void packet_flush_mclist(struct sock *sk); + +struct packet_sock { + /* struct sock has to be the first member of packet_sock */ + struct sock sk; + struct tpacket_stats stats; +#ifdef CONFIG_PACKET_MMAP + char * *pg_vec; + unsigned int head; + unsigned int frames_per_block; + unsigned int frame_size; + unsigned int frame_max; + int copy_thresh; +#endif + struct packet_type prot_hook; + spinlock_t bind_lock; + char running; /* prot_hook is attached*/ + int ifindex; /* bound device */ + unsigned short num; +#ifdef CONFIG_PACKET_MULTICAST + struct packet_mclist *mclist; +#endif +#ifdef CONFIG_PACKET_MMAP + atomic_t mapped; + unsigned int pg_vec_order; + unsigned int pg_vec_pages; + unsigned int pg_vec_len; +#endif +}; + +#ifdef CONFIG_PACKET_MMAP + +static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position) +{ + unsigned int pg_vec_pos, frame_offset; + char *frame; + + pg_vec_pos = position / po->frames_per_block; + frame_offset = position % po->frames_per_block; + + frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size); + + return frame; +} +#endif + +static inline struct packet_sock *pkt_sk(struct sock *sk) +{ + return (struct packet_sock *)sk; +} + +static void packet_sock_destruct(struct sock *sk) +{ + BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc)); + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive packet socket: %p\n", sk); + return; + } + + atomic_dec(&packet_socks_nr); +#ifdef PACKET_REFCNT_DEBUG + printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr)); +#endif +} + + +static struct proto_ops packet_ops; + +#ifdef CONFIG_SOCK_PACKET +static struct proto_ops packet_ops_spkt; + +static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct sock *sk; + struct sockaddr_pkt *spkt; + + /* + * When we registered the protocol we saved the socket in the data + * field for just this event. + */ + + sk = pt->af_packet_priv; + + /* + * Yank back the headers [hope the device set this + * right or kerboom...] + * + * Incoming packets have ll header pulled, + * push it back. + * + * For outgoing ones skb->data == skb->mac.raw + * so that this procedure is noop. + */ + + if (skb->pkt_type == PACKET_LOOPBACK) + goto out; + + if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + goto oom; + + /* drop any routing info */ + dst_release(skb->dst); + skb->dst = NULL; + + spkt = (struct sockaddr_pkt*)skb->cb; + + skb_push(skb, skb->data-skb->mac.raw); + + /* + * The SOCK_PACKET socket receives _all_ frames. + */ + + spkt->spkt_family = dev->type; + strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device)); + spkt->spkt_protocol = skb->protocol; + + /* + * Charge the memory to the socket. This is done specifically + * to prevent sockets using all the memory up. + */ + + if (sock_queue_rcv_skb(sk,skb) == 0) + return 0; + +out: + kfree_skb(skb); +oom: + return 0; +} + + +/* + * Output a raw packet to a device layer. 
This bypasses all the other + * protocol layers and you must therefore supply it with a complete frame + */ + +static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name; + struct sk_buff *skb; + struct net_device *dev; + unsigned short proto=0; + int err; + + /* + * Get and verify the address. + */ + + if (saddr) + { + if (msg->msg_namelen < sizeof(struct sockaddr)) + return(-EINVAL); + if (msg->msg_namelen==sizeof(struct sockaddr_pkt)) + proto=saddr->spkt_protocol; + } + else + return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */ + + /* + * Find the device first to size check it + */ + + saddr->spkt_device[13] = 0; + dev = dev_get_by_name(saddr->spkt_device); + err = -ENODEV; + if (dev == NULL) + goto out_unlock; + + /* + * You may not queue a frame bigger than the mtu. This is the lowest level + * raw protocol and you must do your own fragmentation at this level. + */ + + err = -EMSGSIZE; + if(len>dev->mtu+dev->hard_header_len) + goto out_unlock; + + err = -ENOBUFS; + skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL); + + /* + * If the write buffer is full, then tough. At this level the user gets to + * deal with the problem - do your own algorithmic backoffs. That's far + * more flexible. + */ + + if (skb == NULL) + goto out_unlock; + + /* + * Fill it in + */ + + /* FIXME: Save some space for broken drivers that write a + * hard header at transmission time by themselves. PPP is the + * notable one here. This should really be fixed at the driver level. + */ + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->nh.raw = skb->data; + + /* Try to align data part correctly */ + if (dev->hard_header) { + skb->data -= dev->hard_header_len; + skb->tail -= dev->hard_header_len; + if (len < dev->hard_header_len) + skb->nh.raw = skb->data; + } + + /* Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + if (err) + goto out_free; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + /* + * Now send it + */ + + dev_queue_xmit(skb); + dev_put(dev); + return(len); + +out_free: + kfree_skb(skb); +out_unlock: + if (dev) + dev_put(dev); + return err; +} +#endif + +static inline unsigned run_filter(struct sk_buff *skb, struct sock *sk, unsigned res) +{ + struct sk_filter *filter; + + bh_lock_sock(sk); + filter = sk->sk_filter; + /* + * Our caller already checked that filter != NULL but we need to + * verify that under bh_lock_sock() to be safe + */ + if (likely(filter != NULL)) + res = sk_run_filter(skb, filter->insns, filter->len); + bh_unlock_sock(sk); + + return res; +} + +/* + This function makes lazy skb cloning in hope that most of packets + are discarded by BPF. + + Note tricky part: we DO mangle shared skb! skb->data, skb->len + and skb->cb are mangled. It works because (and until) packets + falling here are owned by current CPU. Output packets are cloned + by dev_queue_xmit_nit(), input packets are processed by net_bh + sequencially, so that if we return skb to original state on exit, + we will not harm anyone. 
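run_filter() above runs whatever classic BPF program userspace attached to the socket, and packet_rcv() then drops the frame when the result is 0 or truncates it to the returned length. A sketch of the attach side, using a deliberately trivial one-instruction filter that accepts at most 96 bytes of every frame; the filter program is invented for illustration:

#include <stdio.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>
#include <linux/filter.h>

int main(void)
{
	/* "return 96": snap every frame to at most 96 bytes.
	 * Returning 0 instead would drop the frame entirely. */
	struct sock_filter code[] = {
		{ BPF_RET | BPF_K, 0, 0, 96 },
	};
	struct sock_fprog prog = {
		.len    = sizeof(code) / sizeof(code[0]),
		.filter = code,
	};
	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));

	if (fd < 0 ||
	    setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
		       &prog, sizeof(prog)) < 0) {
		perror("SO_ATTACH_FILTER");
		return 1;
	}
	return 0;
}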
+ */ + +static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct sock *sk; + struct sockaddr_ll *sll; + struct packet_sock *po; + u8 * skb_head = skb->data; + int skb_len = skb->len; + unsigned snaplen; + + if (skb->pkt_type == PACKET_LOOPBACK) + goto drop; + + sk = pt->af_packet_priv; + po = pkt_sk(sk); + + skb->dev = dev; + + if (dev->hard_header) { + /* The device has an explicit notion of ll header, + exported to higher levels. + + Otherwise, the device hides datails of it frame + structure, so that corresponding packet head + never delivered to user. + */ + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb->mac.raw); + else if (skb->pkt_type == PACKET_OUTGOING) { + /* Special case: outgoing packets have ll header at head */ + skb_pull(skb, skb->nh.raw - skb->data); + } + } + + snaplen = skb->len; + + if (sk->sk_filter) { + unsigned res = run_filter(skb, sk, snaplen); + if (res == 0) + goto drop_n_restore; + if (snaplen > res) + snaplen = res; + } + + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) + goto drop_n_acct; + + if (skb_shared(skb)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb == NULL) + goto drop_n_acct; + + if (skb_head != skb->data) { + skb->data = skb_head; + skb->len = skb_len; + } + kfree_skb(skb); + skb = nskb; + } + + sll = (struct sockaddr_ll*)skb->cb; + sll->sll_family = AF_PACKET; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + sll->sll_ifindex = dev->ifindex; + sll->sll_halen = 0; + + if (dev->hard_header_parse) + sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); + + if (pskb_trim(skb, snaplen)) + goto drop_n_acct; + + skb_set_owner_r(skb, sk); + skb->dev = NULL; + dst_release(skb->dst); + skb->dst = NULL; + + spin_lock(&sk->sk_receive_queue.lock); + po->stats.tp_packets++; + __skb_queue_tail(&sk->sk_receive_queue, skb); + spin_unlock(&sk->sk_receive_queue.lock); + sk->sk_data_ready(sk, skb->len); + return 0; + +drop_n_acct: + spin_lock(&sk->sk_receive_queue.lock); + po->stats.tp_drops++; + spin_unlock(&sk->sk_receive_queue.lock); + +drop_n_restore: + if (skb_head != skb->data && skb_shared(skb)) { + skb->data = skb_head; + skb->len = skb_len; + } +drop: + kfree_skb(skb); + return 0; +} + +#ifdef CONFIG_PACKET_MMAP +static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt) +{ + struct sock *sk; + struct packet_sock *po; + struct sockaddr_ll *sll; + struct tpacket_hdr *h; + u8 * skb_head = skb->data; + int skb_len = skb->len; + unsigned snaplen; + unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER; + unsigned short macoff, netoff; + struct sk_buff *copy_skb = NULL; + + if (skb->pkt_type == PACKET_LOOPBACK) + goto drop; + + sk = pt->af_packet_priv; + po = pkt_sk(sk); + + if (dev->hard_header) { + if (sk->sk_type != SOCK_DGRAM) + skb_push(skb, skb->data - skb->mac.raw); + else if (skb->pkt_type == PACKET_OUTGOING) { + /* Special case: outgoing packets have ll header at head */ + skb_pull(skb, skb->nh.raw - skb->data); + if (skb->ip_summed == CHECKSUM_HW) + status |= TP_STATUS_CSUMNOTREADY; + } + } + + snaplen = skb->len; + + if (sk->sk_filter) { + unsigned res = run_filter(skb, sk, snaplen); + if (res == 0) + goto drop_n_restore; + if (snaplen > res) + snaplen = res; + } + + if (sk->sk_type == SOCK_DGRAM) { + macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16; + } else { + unsigned maclen = skb->nh.raw - skb->data; + netoff = 
TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen)); + macoff = netoff - maclen; + } + + if (macoff + snaplen > po->frame_size) { + if (po->copy_thresh && + atomic_read(&sk->sk_rmem_alloc) + skb->truesize < + (unsigned)sk->sk_rcvbuf) { + if (skb_shared(skb)) { + copy_skb = skb_clone(skb, GFP_ATOMIC); + } else { + copy_skb = skb_get(skb); + skb_head = skb->data; + } + if (copy_skb) + skb_set_owner_r(copy_skb, sk); + } + snaplen = po->frame_size - macoff; + if ((int)snaplen < 0) + snaplen = 0; + } + if (snaplen > skb->len-skb->data_len) + snaplen = skb->len-skb->data_len; + + spin_lock(&sk->sk_receive_queue.lock); + h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head); + + if (h->tp_status) + goto ring_is_full; + po->head = po->head != po->frame_max ? po->head+1 : 0; + po->stats.tp_packets++; + if (copy_skb) { + status |= TP_STATUS_COPY; + __skb_queue_tail(&sk->sk_receive_queue, copy_skb); + } + if (!po->stats.tp_drops) + status &= ~TP_STATUS_LOSING; + spin_unlock(&sk->sk_receive_queue.lock); + + memcpy((u8*)h + macoff, skb->data, snaplen); + + h->tp_len = skb->len; + h->tp_snaplen = snaplen; + h->tp_mac = macoff; + h->tp_net = netoff; + if (skb->stamp.tv_sec == 0) { + do_gettimeofday(&skb->stamp); + sock_enable_timestamp(sk); + } + h->tp_sec = skb->stamp.tv_sec; + h->tp_usec = skb->stamp.tv_usec; + + sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h))); + sll->sll_halen = 0; + if (dev->hard_header_parse) + sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); + sll->sll_family = AF_PACKET; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + sll->sll_ifindex = dev->ifindex; + + h->tp_status = status; + mb(); + + { + struct page *p_start, *p_end; + u8 *h_end = (u8 *)h + macoff + snaplen - 1; + + p_start = virt_to_page(h); + p_end = virt_to_page(h_end); + while (p_start <= p_end) { + flush_dcache_page(p_start); + p_start++; + } + } + + sk->sk_data_ready(sk, 0); + +drop_n_restore: + if (skb_head != skb->data && skb_shared(skb)) { + skb->data = skb_head; + skb->len = skb_len; + } +drop: + kfree_skb(skb); + return 0; + +ring_is_full: + po->stats.tp_drops++; + spin_unlock(&sk->sk_receive_queue.lock); + + sk->sk_data_ready(sk, 0); + if (copy_skb) + kfree_skb(copy_skb); + goto drop_n_restore; +} + +#endif + + +static int packet_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name; + struct sk_buff *skb; + struct net_device *dev; + unsigned short proto; + unsigned char *addr; + int ifindex, err, reserve = 0; + + /* + * Get and verify the address. 
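The address handling that follows either falls back to the interface and protocol the socket was bound with, or takes an explicit sockaddr_ll supplied by the caller. A sketch of that caller's side; the interface index and destination MAC below are placeholders:

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

static ssize_t send_frame(int fd, const void *frame, size_t len)
{
	struct sockaddr_ll sll;

	memset(&sll, 0, sizeof(sll));
	sll.sll_family   = AF_PACKET;
	sll.sll_ifindex  = 2;			/* placeholder ifindex */
	sll.sll_protocol = htons(ETH_P_IP);
	sll.sll_halen    = ETH_ALEN;
	memcpy(sll.sll_addr, "\x00\x11\x22\x33\x44\x55", ETH_ALEN);

	/* For SOCK_DGRAM the kernel builds the link header from sll_addr;
	 * for SOCK_RAW the frame must already begin with it. */
	return sendto(fd, frame, len, 0,
		      (struct sockaddr *)&sll, sizeof(sll));
}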
+ */ + + if (saddr == NULL) { + struct packet_sock *po = pkt_sk(sk); + + ifindex = po->ifindex; + proto = po->num; + addr = NULL; + } else { + err = -EINVAL; + if (msg->msg_namelen < sizeof(struct sockaddr_ll)) + goto out; + ifindex = saddr->sll_ifindex; + proto = saddr->sll_protocol; + addr = saddr->sll_addr; + } + + + dev = dev_get_by_index(ifindex); + err = -ENXIO; + if (dev == NULL) + goto out_unlock; + if (sock->type == SOCK_RAW) + reserve = dev->hard_header_len; + + err = -EMSGSIZE; + if (len > dev->mtu+reserve) + goto out_unlock; + + skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev), + msg->msg_flags & MSG_DONTWAIT, &err); + if (skb==NULL) + goto out_unlock; + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->nh.raw = skb->data; + + if (dev->hard_header) { + int res; + err = -EINVAL; + res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len); + if (sock->type != SOCK_DGRAM) { + skb->tail = skb->data; + skb->len = 0; + } else if (res < 0) + goto out_free; + } + + /* Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + if (err) + goto out_free; + + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + /* + * Now send it + */ + + err = dev_queue_xmit(skb); + if (err > 0 && (err = net_xmit_errno(err)) != 0) + goto out_unlock; + + dev_put(dev); + + return(len); + +out_free: + kfree_skb(skb); +out_unlock: + if (dev) + dev_put(dev); +out: + return err; +} + +/* + * Close a PACKET socket. This is fairly simple. We immediately go + * to 'closed' state and remove our protocol entry in the device list. + */ + +static int packet_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct packet_sock *po; + + if (!sk) + return 0; + + po = pkt_sk(sk); + + write_lock_bh(&packet_sklist_lock); + sk_del_node_init(sk); + write_unlock_bh(&packet_sklist_lock); + + /* + * Unhook packet receive handler. + */ + + if (po->running) { + /* + * Remove the protocol hook + */ + dev_remove_pack(&po->prot_hook); + po->running = 0; + po->num = 0; + __sock_put(sk); + } + +#ifdef CONFIG_PACKET_MULTICAST + packet_flush_mclist(sk); +#endif + +#ifdef CONFIG_PACKET_MMAP + if (po->pg_vec) { + struct tpacket_req req; + memset(&req, 0, sizeof(req)); + packet_set_ring(sk, &req, 1); + } +#endif + + /* + * Now the socket is dead. No more input will appear. + */ + + sock_orphan(sk); + sock->sk = NULL; + + /* Purge queues */ + + skb_queue_purge(&sk->sk_receive_queue); + + sock_put(sk); + return 0; +} + +/* + * Attach a packet hook. + */ + +static int packet_do_bind(struct sock *sk, struct net_device *dev, int protocol) +{ + struct packet_sock *po = pkt_sk(sk); + /* + * Detach an existing hook if present. + */ + + lock_sock(sk); + + spin_lock(&po->bind_lock); + if (po->running) { + __sock_put(sk); + po->running = 0; + po->num = 0; + spin_unlock(&po->bind_lock); + dev_remove_pack(&po->prot_hook); + spin_lock(&po->bind_lock); + } + + po->num = protocol; + po->prot_hook.type = protocol; + po->prot_hook.dev = dev; + + po->ifindex = dev ? 
dev->ifindex : 0; + + if (protocol == 0) + goto out_unlock; + + if (dev) { + if (dev->flags&IFF_UP) { + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } else { + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + } else { + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } + +out_unlock: + spin_unlock(&po->bind_lock); + release_sock(sk); + return 0; +} + +/* + * Bind a packet socket to a device + */ + +#ifdef CONFIG_SOCK_PACKET + +static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk=sock->sk; + char name[15]; + struct net_device *dev; + int err = -ENODEV; + + /* + * Check legality + */ + + if(addr_len!=sizeof(struct sockaddr)) + return -EINVAL; + strlcpy(name,uaddr->sa_data,sizeof(name)); + + dev = dev_get_by_name(name); + if (dev) { + err = packet_do_bind(sk, dev, pkt_sk(sk)->num); + dev_put(dev); + } + return err; +} +#endif + +static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; + struct sock *sk=sock->sk; + struct net_device *dev = NULL; + int err; + + + /* + * Check legality + */ + + if (addr_len < sizeof(struct sockaddr_ll)) + return -EINVAL; + if (sll->sll_family != AF_PACKET) + return -EINVAL; + + if (sll->sll_ifindex) { + err = -ENODEV; + dev = dev_get_by_index(sll->sll_ifindex); + if (dev == NULL) + goto out; + } + err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); + if (dev) + dev_put(dev); + +out: + return err; +} + +static struct proto packet_proto = { + .name = "PACKET", + .owner = THIS_MODULE, + .obj_size = sizeof(struct packet_sock), +}; + +/* + * Create a packet of type SOCK_PACKET. + */ + +static int packet_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct packet_sock *po; + int err; + + if (!capable(CAP_NET_RAW)) + return -EPERM; + if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW +#ifdef CONFIG_SOCK_PACKET + && sock->type != SOCK_PACKET +#endif + ) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + + err = -ENOBUFS; + sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1); + if (sk == NULL) + goto out; + + sock->ops = &packet_ops; +#ifdef CONFIG_SOCK_PACKET + if (sock->type == SOCK_PACKET) + sock->ops = &packet_ops_spkt; +#endif + sock_init_data(sock, sk); + + po = pkt_sk(sk); + sk->sk_family = PF_PACKET; + po->num = protocol; + + sk->sk_destruct = packet_sock_destruct; + atomic_inc(&packet_socks_nr); + + /* + * Attach a protocol block + */ + + spin_lock_init(&po->bind_lock); + po->prot_hook.func = packet_rcv; +#ifdef CONFIG_SOCK_PACKET + if (sock->type == SOCK_PACKET) + po->prot_hook.func = packet_rcv_spkt; +#endif + po->prot_hook.af_packet_priv = sk; + + if (protocol) { + po->prot_hook.type = protocol; + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } + + write_lock_bh(&packet_sklist_lock); + sk_add_node(sk, &packet_sklist); + write_unlock_bh(&packet_sklist_lock); + return(0); +out: + return err; +} + +/* + * Pull a packet from our receive queue and hand it to the user. + * If necessary we block. + */ + +static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err; + + err = -EINVAL; + if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT)) + goto out; + +#if 0 + /* What error should we return now? EUNATTACH? 
*/ + if (pkt_sk(sk)->ifindex < 0) + return -ENODEV; +#endif + + /* + * If the address length field is there to be filled in, we fill + * it in now. + */ + + if (sock->type == SOCK_PACKET) + msg->msg_namelen = sizeof(struct sockaddr_pkt); + else + msg->msg_namelen = sizeof(struct sockaddr_ll); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err); + + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if(skb==NULL) + goto out; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if (copied > len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = (flags&MSG_TRUNC) ? skb->len : copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + +#ifdef CONFIG_SOCK_PACKET +static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct net_device *dev; + struct sock *sk = sock->sk; + + if (peer) + return -EOPNOTSUPP; + + uaddr->sa_family = AF_PACKET; + dev = dev_get_by_index(pkt_sk(sk)->ifindex); + if (dev) { + strlcpy(uaddr->sa_data, dev->name, 15); + dev_put(dev); + } else + memset(uaddr->sa_data, 0, 14); + *uaddr_len = sizeof(*uaddr); + + return 0; +} +#endif + +static int packet_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct net_device *dev; + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr; + + if (peer) + return -EOPNOTSUPP; + + sll->sll_family = AF_PACKET; + sll->sll_ifindex = po->ifindex; + sll->sll_protocol = po->num; + dev = dev_get_by_index(po->ifindex); + if (dev) { + sll->sll_hatype = dev->type; + sll->sll_halen = dev->addr_len; + memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); + dev_put(dev); + } else { + sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ + sll->sll_halen = 0; + } + *uaddr_len = sizeof(*sll); + + return 0; +} + +#ifdef CONFIG_PACKET_MULTICAST +static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what) +{ + switch (i->type) { + case PACKET_MR_MULTICAST: + if (what > 0) + dev_mc_add(dev, i->addr, i->alen, 0); + else + dev_mc_delete(dev, i->addr, i->alen, 0); + break; + case PACKET_MR_PROMISC: + dev_set_promiscuity(dev, what); + break; + case PACKET_MR_ALLMULTI: + dev_set_allmulti(dev, what); + break; + default:; + } +} + +static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what) +{ + for ( ; i; i=i->next) { + if (i->ifindex == dev->ifindex) + packet_dev_mc(dev, i, what); + } +} + +static int packet_mc_add(struct sock *sk, struct packet_mreq *mreq) +{ + struct packet_sock *po = pkt_sk(sk); + struct packet_mclist *ml, *i; + struct net_device *dev; + int err; + + rtnl_lock(); + + err = -ENODEV; + dev = 
__dev_get_by_index(mreq->mr_ifindex); + if (!dev) + goto done; + + err = -EINVAL; + if (mreq->mr_alen > dev->addr_len) + goto done; + + err = -ENOBUFS; + i = (struct packet_mclist *)kmalloc(sizeof(*i), GFP_KERNEL); + if (i == NULL) + goto done; + + err = 0; + for (ml = po->mclist; ml; ml = ml->next) { + if (ml->ifindex == mreq->mr_ifindex && + ml->type == mreq->mr_type && + ml->alen == mreq->mr_alen && + memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { + ml->count++; + /* Free the new element ... */ + kfree(i); + goto done; + } + } + + i->type = mreq->mr_type; + i->ifindex = mreq->mr_ifindex; + i->alen = mreq->mr_alen; + memcpy(i->addr, mreq->mr_address, i->alen); + i->count = 1; + i->next = po->mclist; + po->mclist = i; + packet_dev_mc(dev, i, +1); + +done: + rtnl_unlock(); + return err; +} + +static int packet_mc_drop(struct sock *sk, struct packet_mreq *mreq) +{ + struct packet_mclist *ml, **mlp; + + rtnl_lock(); + + for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) { + if (ml->ifindex == mreq->mr_ifindex && + ml->type == mreq->mr_type && + ml->alen == mreq->mr_alen && + memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) { + if (--ml->count == 0) { + struct net_device *dev; + *mlp = ml->next; + dev = dev_get_by_index(ml->ifindex); + if (dev) { + packet_dev_mc(dev, ml, -1); + dev_put(dev); + } + kfree(ml); + } + rtnl_unlock(); + return 0; + } + } + rtnl_unlock(); + return -EADDRNOTAVAIL; +} + +static void packet_flush_mclist(struct sock *sk) +{ + struct packet_sock *po = pkt_sk(sk); + struct packet_mclist *ml; + + if (!po->mclist) + return; + + rtnl_lock(); + while ((ml = po->mclist) != NULL) { + struct net_device *dev; + + po->mclist = ml->next; + if ((dev = dev_get_by_index(ml->ifindex)) != NULL) { + packet_dev_mc(dev, ml, -1); + dev_put(dev); + } + kfree(ml); + } + rtnl_unlock(); +} +#endif + +static int +packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + int ret; + + if (level != SOL_PACKET) + return -ENOPROTOOPT; + + switch(optname) { +#ifdef CONFIG_PACKET_MULTICAST + case PACKET_ADD_MEMBERSHIP: + case PACKET_DROP_MEMBERSHIP: + { + struct packet_mreq mreq; + if (optlencopy_thresh = val; + return 0; + } +#endif + default: + return -ENOPROTOOPT; + } +} + +static int packet_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + int len; + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + + if (level != SOL_PACKET) + return -ENOPROTOOPT; + + if (get_user(len,optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + switch(optname) { + case PACKET_STATISTICS: + { + struct tpacket_stats st; + + if (len > sizeof(struct tpacket_stats)) + len = sizeof(struct tpacket_stats); + spin_lock_bh(&sk->sk_receive_queue.lock); + st = po->stats; + memset(&po->stats, 0, sizeof(st)); + spin_unlock_bh(&sk->sk_receive_queue.lock); + st.tp_packets += st.tp_drops; + + if (copy_to_user(optval, &st, len)) + return -EFAULT; + break; + } + default: + return -ENOPROTOOPT; + } + + if (put_user(len, optlen)) + return -EFAULT; + return 0; +} + + +static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct sock *sk; + struct hlist_node *node; + struct net_device *dev = (struct net_device*)data; + + read_lock(&packet_sklist_lock); + sk_for_each(sk, node, &packet_sklist) { + struct packet_sock *po = pkt_sk(sk); + + switch (msg) { + case NETDEV_UNREGISTER: +#ifdef CONFIG_PACKET_MULTICAST + if (po->mclist) 
+ packet_dev_mclist(dev, po->mclist, -1); + // fallthrough +#endif + case NETDEV_DOWN: + if (dev->ifindex == po->ifindex) { + spin_lock(&po->bind_lock); + if (po->running) { + __dev_remove_pack(&po->prot_hook); + __sock_put(sk); + po->running = 0; + sk->sk_err = ENETDOWN; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_error_report(sk); + } + if (msg == NETDEV_UNREGISTER) { + po->ifindex = -1; + po->prot_hook.dev = NULL; + } + spin_unlock(&po->bind_lock); + } + break; + case NETDEV_UP: + spin_lock(&po->bind_lock); + if (dev->ifindex == po->ifindex && po->num && + !po->running) { + dev_add_pack(&po->prot_hook); + sock_hold(sk); + po->running = 1; + } + spin_unlock(&po->bind_lock); + break; + } + } + read_unlock(&packet_sklist_lock); + return NOTIFY_DONE; +} + + +static int packet_ioctl(struct socket *sock, unsigned int cmd, + unsigned long arg) +{ + struct sock *sk = sock->sk; + + switch(cmd) { + case SIOCOUTQ: + { + int amount = atomic_read(&sk->sk_wmem_alloc); + return put_user(amount, (int __user *)arg); + } + case SIOCINQ: + { + struct sk_buff *skb; + int amount = 0; + + spin_lock_bh(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + amount = skb->len; + spin_unlock_bh(&sk->sk_receive_queue.lock); + return put_user(amount, (int __user *)arg); + } + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + +#ifdef CONFIG_INET + case SIOCADDRT: + case SIOCDELRT: + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFFLAGS: + return inet_dgram_ops.ioctl(sock, cmd, arg); +#endif + + default: + return dev_ioctl(cmd, (void __user *)arg); + } + return 0; +} + +#ifndef CONFIG_PACKET_MMAP +#define packet_mmap sock_no_mmap +#define packet_poll datagram_poll +#else + +static unsigned int packet_poll(struct file * file, struct socket *sock, + poll_table *wait) +{ + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + unsigned int mask = datagram_poll(file, sock, wait); + + spin_lock_bh(&sk->sk_receive_queue.lock); + if (po->pg_vec) { + unsigned last = po->head ? po->head-1 : po->frame_max; + struct tpacket_hdr *h; + + h = (struct tpacket_hdr *)packet_lookup_frame(po, last); + + if (h->tp_status) + mask |= POLLIN | POLLRDNORM; + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + return mask; +} + + +/* Dirty? Well, I still did not learn better way to account + * for user mmaps. 
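The mmap bookkeeping below backs the PACKET_RX_RING buffer that tpacket_rcv() fills. A rough userspace sketch of the other end, with arbitrary block and frame sizes chosen so that packet_set_ring()'s consistency checks pass:

#include <stdio.h>
#include <string.h>
#include <poll.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <linux/if_packet.h>
#include <linux/if_ether.h>

int main(void)
{
	struct tpacket_req req;
	struct tpacket_hdr *h;
	struct pollfd pfd;
	void *ring;
	int fd;

	fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
	if (fd < 0) {
		perror("socket");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.tp_block_size = 4096;	/* multiple of PAGE_SIZE */
	req.tp_block_nr   = 64;
	req.tp_frame_size = 2048;	/* TPACKET_ALIGNed, >= TPACKET_HDRLEN */
	req.tp_frame_nr   = req.tp_block_nr *
			    (req.tp_block_size / req.tp_frame_size);
	if (setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req)) < 0) {
		perror("PACKET_RX_RING");
		return 1;
	}

	ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (ring == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	pfd.fd = fd;
	pfd.events = POLLIN;
	poll(&pfd, 1, -1);		/* packet_poll() raises POLLIN once the
					 * current frame's tp_status is set */

	h = (struct tpacket_hdr *)ring;	/* first frame of the first block */
	if (h->tp_status & TP_STATUS_USER)
		printf("got %u bytes (mac offset %u)\n",
		       h->tp_snaplen, h->tp_mac);
	h->tp_status = TP_STATUS_KERNEL;	/* hand the slot back */
	return 0;
}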
+ */ + +static void packet_mm_open(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct inode *inode = file->f_dentry->d_inode; + struct socket * sock = SOCKET_I(inode); + struct sock *sk = sock->sk; + + if (sk) + atomic_inc(&pkt_sk(sk)->mapped); +} + +static void packet_mm_close(struct vm_area_struct *vma) +{ + struct file *file = vma->vm_file; + struct inode *inode = file->f_dentry->d_inode; + struct socket * sock = SOCKET_I(inode); + struct sock *sk = sock->sk; + + if (sk) + atomic_dec(&pkt_sk(sk)->mapped); +} + +static struct vm_operations_struct packet_mmap_ops = { + .open = packet_mm_open, + .close =packet_mm_close, +}; + +static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order) +{ + return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1); +} + +static void free_pg_vec(char **pg_vec, unsigned order, unsigned len) +{ + int i; + + for (i=0; itp_block_nr) { + int i, l; + + /* Sanity tests and some calculations */ + + if (po->pg_vec) + return -EBUSY; + + if ((int)req->tp_block_size <= 0) + return -EINVAL; + if (req->tp_block_size&(PAGE_SIZE-1)) + return -EINVAL; + if (req->tp_frame_size < TPACKET_HDRLEN) + return -EINVAL; + if (req->tp_frame_size&(TPACKET_ALIGNMENT-1)) + return -EINVAL; + + po->frames_per_block = req->tp_block_size/req->tp_frame_size; + if (po->frames_per_block <= 0) + return -EINVAL; + if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr) + return -EINVAL; + /* OK! */ + + /* Allocate page vector */ + while ((PAGE_SIZE<tp_block_size) + order++; + + err = -ENOMEM; + + pg_vec = kmalloc(req->tp_block_nr*sizeof(char *), GFP_KERNEL); + if (pg_vec == NULL) + goto out; + memset(pg_vec, 0, req->tp_block_nr*sizeof(char **)); + + for (i=0; itp_block_nr; i++) { + struct page *page, *pend; + pg_vec[i] = (char *)__get_free_pages(GFP_KERNEL, order); + if (!pg_vec[i]) + goto out_free_pgvec; + + pend = pg_vec_endpage(pg_vec[i], order); + for (page = virt_to_page(pg_vec[i]); page <= pend; page++) + SetPageReserved(page); + } + /* Page vector is allocated */ + + l = 0; + for (i=0; itp_block_nr; i++) { + char *ptr = pg_vec[i]; + struct tpacket_hdr *header; + int k; + + for (k=0; kframes_per_block; k++) { + + header = (struct tpacket_hdr*)ptr; + header->tp_status = TP_STATUS_KERNEL; + ptr += req->tp_frame_size; + } + } + /* Done */ + } else { + if (req->tp_frame_nr) + return -EINVAL; + } + + lock_sock(sk); + + /* Detach socket from network */ + spin_lock(&po->bind_lock); + was_running = po->running; + num = po->num; + if (was_running) { + __dev_remove_pack(&po->prot_hook); + po->num = 0; + po->running = 0; + __sock_put(sk); + } + spin_unlock(&po->bind_lock); + + synchronize_net(); + + err = -EBUSY; + if (closing || atomic_read(&po->mapped) == 0) { + err = 0; +#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) + + spin_lock_bh(&sk->sk_receive_queue.lock); + pg_vec = XC(po->pg_vec, pg_vec); + po->frame_max = req->tp_frame_nr-1; + po->head = 0; + po->frame_size = req->tp_frame_size; + spin_unlock_bh(&sk->sk_receive_queue.lock); + + order = XC(po->pg_vec_order, order); + req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr); + + po->pg_vec_pages = req->tp_block_size/PAGE_SIZE; + po->prot_hook.func = po->pg_vec ? 
tpacket_rcv : packet_rcv; + skb_queue_purge(&sk->sk_receive_queue); +#undef XC + if (atomic_read(&po->mapped)) + printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped)); + } + + spin_lock(&po->bind_lock); + if (was_running && !po->running) { + sock_hold(sk); + po->running = 1; + po->num = num; + dev_add_pack(&po->prot_hook); + } + spin_unlock(&po->bind_lock); + + release_sock(sk); + +out_free_pgvec: + if (pg_vec) + free_pg_vec(pg_vec, order, req->tp_block_nr); +out: + return err; +} + +static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma) +{ + struct sock *sk = sock->sk; + struct packet_sock *po = pkt_sk(sk); + unsigned long size; + unsigned long start; + int err = -EINVAL; + int i; + + if (vma->vm_pgoff) + return -EINVAL; + + size = vma->vm_end - vma->vm_start; + + lock_sock(sk); + if (po->pg_vec == NULL) + goto out; + if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE) + goto out; + + atomic_inc(&po->mapped); + start = vma->vm_start; + err = -EAGAIN; + for (i=0; ipg_vec_len; i++) { + if (remap_pfn_range(vma, start, + __pa(po->pg_vec[i]) >> PAGE_SHIFT, + po->pg_vec_pages*PAGE_SIZE, + vma->vm_page_prot)) + goto out; + start += po->pg_vec_pages*PAGE_SIZE; + } + vma->vm_ops = &packet_mmap_ops; + err = 0; + +out: + release_sock(sk); + return err; +} +#endif + + +#ifdef CONFIG_SOCK_PACKET +static struct proto_ops packet_ops_spkt = { + .family = PF_PACKET, + .owner = THIS_MODULE, + .release = packet_release, + .bind = packet_bind_spkt, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = packet_getname_spkt, + .poll = datagram_poll, + .ioctl = packet_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = packet_sendmsg_spkt, + .recvmsg = packet_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; +#endif + +static struct proto_ops packet_ops = { + .family = PF_PACKET, + .owner = THIS_MODULE, + .release = packet_release, + .bind = packet_bind, + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .accept = sock_no_accept, + .getname = packet_getname, + .poll = packet_poll, + .ioctl = packet_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .setsockopt = packet_setsockopt, + .getsockopt = packet_getsockopt, + .sendmsg = packet_sendmsg, + .recvmsg = packet_recvmsg, + .mmap = packet_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct net_proto_family packet_family_ops = { + .family = PF_PACKET, + .create = packet_create, + .owner = THIS_MODULE, +}; + +static struct notifier_block packet_netdev_notifier = { + .notifier_call =packet_notifier, +}; + +#ifdef CONFIG_PROC_FS +static inline struct sock *packet_seq_idx(loff_t off) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &packet_sklist) { + if (!off--) + return s; + } + return NULL; +} + +static void *packet_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&packet_sklist_lock); + return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN; +} + +static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return (v == SEQ_START_TOKEN) + ? 
sk_head(&packet_sklist) + : sk_next((struct sock*)v) ; +} + +static void packet_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&packet_sklist_lock); +} + +static int packet_seq_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n"); + else { + struct sock *s = v; + const struct packet_sock *po = pkt_sk(s); + + seq_printf(seq, + "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n", + s, + atomic_read(&s->sk_refcnt), + s->sk_type, + ntohs(po->num), + po->ifindex, + po->running, + atomic_read(&s->sk_rmem_alloc), + sock_i_uid(s), + sock_i_ino(s) ); + } + + return 0; +} + +static struct seq_operations packet_seq_ops = { + .start = packet_seq_start, + .next = packet_seq_next, + .stop = packet_seq_stop, + .show = packet_seq_show, +}; + +static int packet_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &packet_seq_ops); +} + +static struct file_operations packet_seq_fops = { + .owner = THIS_MODULE, + .open = packet_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif + +static void __exit packet_exit(void) +{ + proc_net_remove("packet"); + unregister_netdevice_notifier(&packet_netdev_notifier); + sock_unregister(PF_PACKET); + proto_unregister(&packet_proto); +} + +static int __init packet_init(void) +{ + int rc = proto_register(&packet_proto, 0); + + if (rc != 0) + goto out; + + sock_register(&packet_family_ops); + register_netdevice_notifier(&packet_netdev_notifier); + proc_net_fops_create("packet", 0, &packet_seq_fops); +out: + return rc; +} + +module_init(packet_init); +module_exit(packet_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_PACKET); diff --git a/net/rose/Makefile b/net/rose/Makefile new file mode 100644 index 000000000000..fa248116fd5b --- /dev/null +++ b/net/rose/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the Linux Rose (X.25 PLP) layer. +# + +obj-$(CONFIG_ROSE) += rose.o + +rose-y := af_rose.o rose_dev.o rose_in.o rose_link.o rose_loopback.o \ + rose_out.o rose_route.o rose_subr.o rose_timer.o +rose-$(CONFIG_SYSCTL) += sysctl_net_rose.o diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c new file mode 100644 index 000000000000..7eb6a5bf93ea --- /dev/null +++ b/net/rose/af_rose.c @@ -0,0 +1,1589 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Alan Cox GW4PTS (alan@lxorguk.ukuu.org.uk) + * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net) + * Copyright (C) Tomi Manninen OH2BNS (oh2bns@sral.fi) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int rose_ndevs = 10; + +int sysctl_rose_restart_request_timeout = ROSE_DEFAULT_T0; +int sysctl_rose_call_request_timeout = ROSE_DEFAULT_T1; +int sysctl_rose_reset_request_timeout = ROSE_DEFAULT_T2; +int sysctl_rose_clear_request_timeout = ROSE_DEFAULT_T3; +int sysctl_rose_no_activity_timeout = ROSE_DEFAULT_IDLE; +int sysctl_rose_ack_hold_back_timeout = ROSE_DEFAULT_HB; +int sysctl_rose_routing_control = ROSE_DEFAULT_ROUTING; +int sysctl_rose_link_fail_timeout = ROSE_DEFAULT_FAIL_TIMEOUT; +int sysctl_rose_maximum_vcs = ROSE_DEFAULT_MAXVC; +int sysctl_rose_window_size = ROSE_DEFAULT_WINDOW_SIZE; + +static HLIST_HEAD(rose_list); +static DEFINE_SPINLOCK(rose_list_lock); + +static struct proto_ops rose_proto_ops; + +ax25_address rose_callsign; + +/* + * Convert a ROSE address into text. + */ +const char *rose2asc(const rose_address *addr) +{ + static char buffer[11]; + + if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 && + addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 && + addr->rose_addr[4] == 0x00) { + strcpy(buffer, "*"); + } else { + sprintf(buffer, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF, + addr->rose_addr[1] & 0xFF, + addr->rose_addr[2] & 0xFF, + addr->rose_addr[3] & 0xFF, + addr->rose_addr[4] & 0xFF); + } + + return buffer; +} + +/* + * Compare two ROSE addresses, 0 == equal. + */ +int rosecmp(rose_address *addr1, rose_address *addr2) +{ + int i; + + for (i = 0; i < 5; i++) + if (addr1->rose_addr[i] != addr2->rose_addr[i]) + return 1; + + return 0; +} + +/* + * Compare two ROSE addresses for only mask digits, 0 == equal. + */ +int rosecmpm(rose_address *addr1, rose_address *addr2, unsigned short mask) +{ + int i, j; + + if (mask > 10) + return 1; + + for (i = 0; i < mask; i++) { + j = i / 2; + + if ((i % 2) != 0) { + if ((addr1->rose_addr[j] & 0x0F) != (addr2->rose_addr[j] & 0x0F)) + return 1; + } else { + if ((addr1->rose_addr[j] & 0xF0) != (addr2->rose_addr[j] & 0xF0)) + return 1; + } + } + + return 0; +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void rose_remove_socket(struct sock *sk) +{ + spin_lock_bh(&rose_list_lock); + sk_del_node_init(sk); + spin_unlock_bh(&rose_list_lock); +} + +/* + * Kill all bound sockets on a broken link layer connection to a + * particular neighbour. + */ +void rose_kill_by_neigh(struct rose_neigh *neigh) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (rose->neighbour == neigh) { + rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); + rose->neighbour->use--; + rose->neighbour = NULL; + } + } + spin_unlock_bh(&rose_list_lock); +} + +/* + * Kill all bound sockets on a dropped device. 
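rosecmpm() above compares only the leading mask digits of two 5-byte BCD ROSE addresses, walking the high nibble of each byte before the low one. A standalone sketch of that walk on two invented addresses, showing that a 6-digit mask treats them as equal while an 8-digit mask does not (the mask > 10 guard is omitted here):

#include <stdio.h>

struct addr { unsigned char d[5]; };	/* stand-in for rose_address */

static int prefix_differs(const struct addr *a, const struct addr *b,
			  unsigned short mask)
{
	int i;

	for (i = 0; i < mask; i++) {
		int j = i / 2;
		unsigned char m = (i % 2) ? 0x0F : 0xF0;	/* low/high nibble */

		if ((a->d[j] & m) != (b->d[j] & m))
			return 1;
	}
	return 0;
}

int main(void)
{
	struct addr x = { { 0x20, 0x80, 0x19, 0x23, 0x03 } };	/* 2080192303 */
	struct addr y = { { 0x20, 0x80, 0x19, 0x55, 0x55 } };	/* 2080195555 */

	printf("6-digit match: %s\n", prefix_differs(&x, &y, 6) ? "no" : "yes");
	printf("8-digit match: %s\n", prefix_differs(&x, &y, 8) ? "no" : "yes");
	return 0;
}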
+ */ +static void rose_kill_by_device(struct net_device *dev) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (rose->device == dev) { + rose_disconnect(s, ENETUNREACH, ROSE_OUT_OF_ORDER, 0); + rose->neighbour->use--; + rose->device = NULL; + } + } + spin_unlock_bh(&rose_list_lock); +} + +/* + * Handle device status changes. + */ +static int rose_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = (struct net_device *)ptr; + + if (event != NETDEV_DOWN) + return NOTIFY_DONE; + + switch (dev->type) { + case ARPHRD_ROSE: + rose_kill_by_device(dev); + break; + case ARPHRD_AX25: + rose_link_device_down(dev); + rose_rt_device_down(dev); + break; + } + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +static void rose_insert_socket(struct sock *sk) +{ + + spin_lock_bh(&rose_list_lock); + sk_add_node(sk, &rose_list); + spin_unlock_bh(&rose_list_lock); +} + +/* + * Find a socket that wants to accept the Call Request we just + * received. + */ +static struct sock *rose_find_listener(rose_address *addr, ax25_address *call) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (!rosecmp(&rose->source_addr, addr) && + !ax25cmp(&rose->source_call, call) && + !rose->source_ndigis && s->sk_state == TCP_LISTEN) + goto found; + } + + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (!rosecmp(&rose->source_addr, addr) && + !ax25cmp(&rose->source_call, &null_ax25_address) && + s->sk_state == TCP_LISTEN) + goto found; + } + s = NULL; +found: + spin_unlock_bh(&rose_list_lock); + return s; +} + +/* + * Find a connected ROSE socket given my LCI and device. + */ +struct sock *rose_find_socket(unsigned int lci, struct rose_neigh *neigh) +{ + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + sk_for_each(s, node, &rose_list) { + struct rose_sock *rose = rose_sk(s); + + if (rose->lci == lci && rose->neighbour == neigh) + goto found; + } + s = NULL; +found: + spin_unlock_bh(&rose_list_lock); + return s; +} + +/* + * Find a unique LCI for a given device. + */ +unsigned int rose_new_lci(struct rose_neigh *neigh) +{ + int lci; + + if (neigh->dce_mode) { + for (lci = 1; lci <= sysctl_rose_maximum_vcs; lci++) + if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) + return lci; + } else { + for (lci = sysctl_rose_maximum_vcs; lci > 0; lci--) + if (rose_find_socket(lci, neigh) == NULL && rose_route_free_lci(lci, neigh) == NULL) + return lci; + } + + return 0; +} + +/* + * Deferred destroy. + */ +void rose_destroy_socket(struct sock *); + +/* + * Handler for deferred kills. + */ +static void rose_destroy_timer(unsigned long data) +{ + rose_destroy_socket((struct sock *)data); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. 
+ */ +void rose_destroy_socket(struct sock *sk) +{ + struct sk_buff *skb; + + rose_remove_socket(sk); + rose_stop_heartbeat(sk); + rose_stop_idletimer(sk); + rose_stop_timer(sk); + + rose_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + /* Queue the unaccepted socket for death */ + sock_set_flag(skb->sk, SOCK_DEAD); + rose_start_heartbeat(skb->sk); + rose_sk(skb->sk)->state = ROSE_STATE_0; + } + + kfree_skb(skb); + } + + if (atomic_read(&sk->sk_wmem_alloc) || + atomic_read(&sk->sk_rmem_alloc)) { + /* Defer: outstanding buffers */ + init_timer(&sk->sk_timer); + sk->sk_timer.expires = jiffies + 10 * HZ; + sk->sk_timer.function = rose_destroy_timer; + sk->sk_timer.data = (unsigned long)sk; + add_timer(&sk->sk_timer); + } else + sock_put(sk); +} + +/* + * Handling for system calls applied via the various interfaces to a + * ROSE socket object. + */ + +static int rose_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + int opt; + + if (level != SOL_ROSE) + return -ENOPROTOOPT; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(opt, (int __user *)optval)) + return -EFAULT; + + switch (optname) { + case ROSE_DEFER: + rose->defer = opt ? 1 : 0; + return 0; + + case ROSE_T1: + if (opt < 1) + return -EINVAL; + rose->t1 = opt * HZ; + return 0; + + case ROSE_T2: + if (opt < 1) + return -EINVAL; + rose->t2 = opt * HZ; + return 0; + + case ROSE_T3: + if (opt < 1) + return -EINVAL; + rose->t3 = opt * HZ; + return 0; + + case ROSE_HOLDBACK: + if (opt < 1) + return -EINVAL; + rose->hb = opt * HZ; + return 0; + + case ROSE_IDLE: + if (opt < 0) + return -EINVAL; + rose->idle = opt * 60 * HZ; + return 0; + + case ROSE_QBITINCL: + rose->qbitincl = opt ? 1 : 0; + return 0; + + default: + return -ENOPROTOOPT; + } +} + +static int rose_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + int val = 0; + int len; + + if (level != SOL_ROSE) + return -ENOPROTOOPT; + + if (get_user(len, optlen)) + return -EFAULT; + + if (len < 0) + return -EINVAL; + + switch (optname) { + case ROSE_DEFER: + val = rose->defer; + break; + + case ROSE_T1: + val = rose->t1 / HZ; + break; + + case ROSE_T2: + val = rose->t2 / HZ; + break; + + case ROSE_T3: + val = rose->t3 / HZ; + break; + + case ROSE_HOLDBACK: + val = rose->hb / HZ; + break; + + case ROSE_IDLE: + val = rose->idle / (60 * HZ); + break; + + case ROSE_QBITINCL: + val = rose->qbitincl; + break; + + default: + return -ENOPROTOOPT; + } + + len = min_t(unsigned int, len, sizeof(int)); + + if (put_user(len, optlen)) + return -EFAULT; + + return copy_to_user(optval, &val, len) ? 
-EFAULT : 0; +} + +static int rose_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + if (sk->sk_state != TCP_LISTEN) { + struct rose_sock *rose = rose_sk(sk); + + rose->dest_ndigis = 0; + memset(&rose->dest_addr, 0, ROSE_ADDR_LEN); + memset(&rose->dest_call, 0, AX25_ADDR_LEN); + memset(rose->dest_digis, 0, AX25_ADDR_LEN * ROSE_MAX_DIGIS); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + return 0; + } + + return -EOPNOTSUPP; +} + +static struct proto rose_proto = { + .name = "ROSE", + .owner = THIS_MODULE, + .obj_size = sizeof(struct rose_sock), +}; + +static int rose_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct rose_sock *rose; + + if (sock->type != SOCK_SEQPACKET || protocol != 0) + return -ESOCKTNOSUPPORT; + + if ((sk = sk_alloc(PF_ROSE, GFP_ATOMIC, &rose_proto, 1)) == NULL) + return -ENOMEM; + + rose = rose_sk(sk); + + sock_init_data(sock, sk); + + skb_queue_head_init(&rose->ack_queue); +#ifdef M_BIT + skb_queue_head_init(&rose->frag_queue); + rose->fraglen = 0; +#endif + + sock->ops = &rose_proto_ops; + sk->sk_protocol = protocol; + + init_timer(&rose->timer); + init_timer(&rose->idletimer); + + rose->t1 = sysctl_rose_call_request_timeout; + rose->t2 = sysctl_rose_reset_request_timeout; + rose->t3 = sysctl_rose_clear_request_timeout; + rose->hb = sysctl_rose_ack_hold_back_timeout; + rose->idle = sysctl_rose_no_activity_timeout; + + rose->state = ROSE_STATE_0; + + return 0; +} + +static struct sock *rose_make_new(struct sock *osk) +{ + struct sock *sk; + struct rose_sock *rose, *orose; + + if (osk->sk_type != SOCK_SEQPACKET) + return NULL; + + if ((sk = sk_alloc(PF_ROSE, GFP_ATOMIC, &rose_proto, 1)) == NULL) + return NULL; + + rose = rose_sk(sk); + + sock_init_data(NULL, sk); + + skb_queue_head_init(&rose->ack_queue); +#ifdef M_BIT + skb_queue_head_init(&rose->frag_queue); + rose->fraglen = 0; +#endif + + sk->sk_type = osk->sk_type; + sk->sk_socket = osk->sk_socket; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_sleep = osk->sk_sleep; + + if (sock_flag(osk, SOCK_ZAPPED)) + sock_set_flag(sk, SOCK_ZAPPED); + + if (sock_flag(osk, SOCK_DBG)) + sock_set_flag(sk, SOCK_DBG); + + init_timer(&rose->timer); + init_timer(&rose->idletimer); + + orose = rose_sk(osk); + rose->t1 = orose->t1; + rose->t2 = orose->t2; + rose->t3 = orose->t3; + rose->hb = orose->hb; + rose->idle = orose->idle; + rose->defer = orose->defer; + rose->device = orose->device; + rose->qbitincl = orose->qbitincl; + + return sk; +} + +static int rose_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose; + + if (sk == NULL) return 0; + + rose = rose_sk(sk); + + switch (rose->state) { + case ROSE_STATE_0: + rose_disconnect(sk, 0, -1, -1); + rose_destroy_socket(sk); + break; + + case ROSE_STATE_2: + rose->neighbour->use--; + rose_disconnect(sk, 0, -1, -1); + rose_destroy_socket(sk); + break; + + case ROSE_STATE_1: + case ROSE_STATE_3: + case ROSE_STATE_4: + case ROSE_STATE_5: + rose_clear_queues(sk); + rose_stop_idletimer(sk); + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + rose_start_t3timer(sk); + rose->state = ROSE_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + sock_set_flag(sk, SOCK_DESTROY); + break; + + default: + break; + } + + sock->sk = NULL; + + return 0; +} + +static int rose_bind(struct 
socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; + struct net_device *dev; + ax25_address *user, *source; + int n; + + if (!sock_flag(sk, SOCK_ZAPPED)) + return -EINVAL; + + if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) + return -EINVAL; + + if (addr->srose_family != AF_ROSE) + return -EINVAL; + + if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) + return -EINVAL; + + if (addr->srose_ndigis > ROSE_MAX_DIGIS) + return -EINVAL; + + if ((dev = rose_dev_get(&addr->srose_addr)) == NULL) { + SOCK_DEBUG(sk, "ROSE: bind failed: invalid address\n"); + return -EADDRNOTAVAIL; + } + + source = &addr->srose_call; + + if ((user = ax25_findbyuid(current->euid)) == NULL) { + if (ax25_uid_policy && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + user = source; + } + + rose->source_addr = addr->srose_addr; + rose->source_call = *user; + rose->device = dev; + rose->source_ndigis = addr->srose_ndigis; + + if (addr_len == sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < addr->srose_ndigis ; n++) + rose->source_digis[n] = full_addr->srose_digis[n]; + } else { + if (rose->source_ndigis == 1) { + rose->source_digis[0] = addr->srose_digi; + } + } + + rose_insert_socket(sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + SOCK_DEBUG(sk, "ROSE: socket is bound\n"); + return 0; +} + +static int rose_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *addr = (struct sockaddr_rose *)uaddr; + unsigned char cause, diagnostic; + ax25_address *user; + struct net_device *dev; + int n; + + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + return 0; /* Connect completed during a ERESTARTSYS event */ + } + + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + return -ECONNREFUSED; + } + + if (sk->sk_state == TCP_ESTABLISHED) + return -EISCONN; /* No reconnect on a seqpacket socket */ + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(struct sockaddr_rose) && addr_len != sizeof(struct full_sockaddr_rose)) + return -EINVAL; + + if (addr->srose_family != AF_ROSE) + return -EINVAL; + + if (addr_len == sizeof(struct sockaddr_rose) && addr->srose_ndigis > 1) + return -EINVAL; + + if (addr->srose_ndigis > ROSE_MAX_DIGIS) + return -EINVAL; + + /* Source + Destination digis should not exceed ROSE_MAX_DIGIS */ + if ((rose->source_ndigis + addr->srose_ndigis) > ROSE_MAX_DIGIS) + return -EINVAL; + + rose->neighbour = rose_get_neigh(&addr->srose_addr, &cause, + &diagnostic); + if (!rose->neighbour) + return -ENETUNREACH; + + rose->lci = rose_new_lci(rose->neighbour); + if (!rose->lci) + return -ENETUNREACH; + + if (sock_flag(sk, SOCK_ZAPPED)) { /* Must bind first - autobinding in this may or may not work */ + sock_reset_flag(sk, SOCK_ZAPPED); + + if ((dev = rose_dev_first()) == NULL) + return -ENETUNREACH; + + if ((user = ax25_findbyuid(current->euid)) == NULL) + return -EINVAL; + + memcpy(&rose->source_addr, dev->dev_addr, ROSE_ADDR_LEN); + rose->source_call = *user; + rose->device = dev; + + rose_insert_socket(sk); /* Finish the bind */ + } + + rose->dest_addr = addr->srose_addr; + rose->dest_call 
= addr->srose_call; + rose->rand = ((long)rose & 0xFFFF) + rose->lci; + rose->dest_ndigis = addr->srose_ndigis; + + if (addr_len == sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_addr = (struct full_sockaddr_rose *)uaddr; + for (n = 0 ; n < addr->srose_ndigis ; n++) + rose->dest_digis[n] = full_addr->srose_digis[n]; + } else { + if (rose->dest_ndigis == 1) { + rose->dest_digis[0] = addr->srose_digi; + } + } + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + rose->state = ROSE_STATE_1; + + rose->neighbour->use++; + + rose_write_internal(sk, ROSE_CALL_REQUEST); + rose_start_heartbeat(sk); + rose_start_t1timer(sk); + + /* Now the loop */ + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + return -EINPROGRESS; + + /* + * A Connect Ack with Choke or timeout or failed routing will go to + * closed. + */ + if (sk->sk_state == TCP_SYN_SENT) { + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (sk->sk_state != TCP_SYN_SENT) + break; + if (!signal_pending(tsk)) { + schedule(); + continue; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -ERESTARTSYS; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + } + + if (sk->sk_state != TCP_ESTABLISHED) { + sock->state = SS_UNCONNECTED; + return sock_error(sk); /* Always set at this point */ + } + + sock->state = SS_CONNECTED; + + return 0; +} + +static int rose_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct task_struct *tsk = current; + DECLARE_WAITQUEUE(wait, tsk); + struct sk_buff *skb; + struct sock *newsk; + struct sock *sk; + int err = 0; + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + lock_sock(sk); + if (sk->sk_type != SOCK_SEQPACKET) { + err = -EOPNOTSUPP; + goto out; + } + + if (sk->sk_state != TCP_LISTEN) { + err = -EINVAL; + goto out; + } + + /* + * The write queue this time is holding sockets ready to use + * hooked into the SABM we saved + */ + add_wait_queue(sk->sk_sleep, &wait); + for (;;) { + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb) + break; + + current->state = TASK_INTERRUPTIBLE; + release_sock(sk); + if (flags & O_NONBLOCK) { + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + return -EWOULDBLOCK; + } + if (!signal_pending(tsk)) { + schedule(); + lock_sock(sk); + continue; + } + return -ERESTARTSYS; + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep, &wait); + + newsk = skb->sk; + newsk->sk_socket = newsock; + newsk->sk_sleep = &newsock->wait; + + /* Now attach up the new socket */ + skb->sk = NULL; + kfree_skb(skb); + sk->sk_ack_backlog--; + newsock->sk = newsk; + +out: + release_sock(sk); + + return err; +} + +static int rose_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct full_sockaddr_rose *srose = (struct full_sockaddr_rose *)uaddr; + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + int n; + + if (peer != 0) { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + srose->srose_family = AF_ROSE; + srose->srose_addr = rose->dest_addr; + srose->srose_call = rose->dest_call; + srose->srose_ndigis = rose->dest_ndigis; + for (n = 0; n < rose->dest_ndigis; n++) + srose->srose_digis[n] = rose->dest_digis[n]; + } else { + srose->srose_family = AF_ROSE; + srose->srose_addr = 
rose->source_addr; + srose->srose_call = rose->source_call; + srose->srose_ndigis = rose->source_ndigis; + for (n = 0; n < rose->source_ndigis; n++) + srose->srose_digis[n] = rose->source_digis[n]; + } + + *uaddr_len = sizeof(struct full_sockaddr_rose); + return 0; +} + +int rose_rx_call_request(struct sk_buff *skb, struct net_device *dev, struct rose_neigh *neigh, unsigned int lci) +{ + struct sock *sk; + struct sock *make; + struct rose_sock *make_rose; + struct rose_facilities_struct facilities; + int n, len; + + skb->sk = NULL; /* Initially we don't know who it's for */ + + /* + * skb->data points to the rose frame start + */ + memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); + + len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; + len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { + rose_transmit_clear_request(neigh, lci, ROSE_INVALID_FACILITY, 76); + return 0; + } + + sk = rose_find_listener(&facilities.source_addr, &facilities.source_call); + + /* + * We can't accept the Call Request. + */ + if (sk == NULL || sk_acceptq_is_full(sk) || + (make = rose_make_new(sk)) == NULL) { + rose_transmit_clear_request(neigh, lci, ROSE_NETWORK_CONGESTION, 120); + return 0; + } + + skb->sk = make; + make->sk_state = TCP_ESTABLISHED; + make_rose = rose_sk(make); + + make_rose->lci = lci; + make_rose->dest_addr = facilities.dest_addr; + make_rose->dest_call = facilities.dest_call; + make_rose->dest_ndigis = facilities.dest_ndigis; + for (n = 0 ; n < facilities.dest_ndigis ; n++) + make_rose->dest_digis[n] = facilities.dest_digis[n]; + make_rose->source_addr = facilities.source_addr; + make_rose->source_call = facilities.source_call; + make_rose->source_ndigis = facilities.source_ndigis; + for (n = 0 ; n < facilities.source_ndigis ; n++) + make_rose->source_digis[n]= facilities.source_digis[n]; + make_rose->neighbour = neigh; + make_rose->device = dev; + make_rose->facilities = facilities; + + make_rose->neighbour->use++; + + if (rose_sk(sk)->defer) { + make_rose->state = ROSE_STATE_5; + } else { + rose_write_internal(make, ROSE_CALL_ACCEPTED); + make_rose->state = ROSE_STATE_3; + rose_start_idletimer(make); + } + + make_rose->condition = 0x00; + make_rose->vs = 0; + make_rose->va = 0; + make_rose->vr = 0; + make_rose->vl = 0; + sk->sk_ack_backlog++; + + rose_insert_socket(make); + + skb_queue_head(&sk->sk_receive_queue, skb); + + rose_start_heartbeat(make); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + + return 1; +} + +static int rose_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *usrose = (struct sockaddr_rose *)msg->msg_name; + int err; + struct full_sockaddr_rose srose; + struct sk_buff *skb; + unsigned char *asmptr; + int n, size, qbit = 0; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_EOR|MSG_CMSG_COMPAT)) + return -EINVAL; + + if (sock_flag(sk, SOCK_ZAPPED)) + return -EADDRNOTAVAIL; + + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + + if (rose->neighbour == NULL || rose->device == NULL) + return -ENETUNREACH; + + if (usrose != NULL) { + if (msg->msg_namelen != sizeof(struct sockaddr_rose) && msg->msg_namelen != sizeof(struct full_sockaddr_rose)) + return -EINVAL; + memset(&srose, 0, sizeof(struct full_sockaddr_rose)); + memcpy(&srose, usrose, msg->msg_namelen); + if (rosecmp(&rose->dest_addr, &srose.srose_addr) != 0 || + 
ax25cmp(&rose->dest_call, &srose.srose_call) != 0) + return -EISCONN; + if (srose.srose_ndigis != rose->dest_ndigis) + return -EISCONN; + if (srose.srose_ndigis == rose->dest_ndigis) { + for (n = 0 ; n < srose.srose_ndigis ; n++) + if (ax25cmp(&rose->dest_digis[n], + &srose.srose_digis[n])) + return -EISCONN; + } + if (srose.srose_family != AF_ROSE) + return -EINVAL; + } else { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + srose.srose_family = AF_ROSE; + srose.srose_addr = rose->dest_addr; + srose.srose_call = rose->dest_call; + srose.srose_ndigis = rose->dest_ndigis; + for (n = 0 ; n < rose->dest_ndigis ; n++) + srose.srose_digis[n] = rose->dest_digis[n]; + } + + SOCK_DEBUG(sk, "ROSE: sendto: Addresses built.\n"); + + /* Build a packet */ + SOCK_DEBUG(sk, "ROSE: sendto: building packet.\n"); + size = len + AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; + + if ((skb = sock_alloc_send_skb(sk, size, msg->msg_flags & MSG_DONTWAIT, &err)) == NULL) + return err; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN); + + /* + * Put the data on the end + */ + SOCK_DEBUG(sk, "ROSE: Appending user data\n"); + + asmptr = skb->h.raw = skb_put(skb, len); + + err = memcpy_fromiovec(asmptr, msg->msg_iov, len); + if (err) { + kfree_skb(skb); + return err; + } + + /* + * If the Q BIT Include socket option is in force, the first + * byte of the user data is the logical value of the Q Bit. + */ + if (rose->qbitincl) { + qbit = skb->data[0]; + skb_pull(skb, 1); + } + + /* + * Push down the ROSE header + */ + asmptr = skb_push(skb, ROSE_MIN_LEN); + + SOCK_DEBUG(sk, "ROSE: Building Network Header.\n"); + + /* Build a ROSE Network header */ + asmptr[0] = ((rose->lci >> 8) & 0x0F) | ROSE_GFI; + asmptr[1] = (rose->lci >> 0) & 0xFF; + asmptr[2] = ROSE_DATA; + + if (qbit) + asmptr[0] |= ROSE_Q_BIT; + + SOCK_DEBUG(sk, "ROSE: Built header.\n"); + + SOCK_DEBUG(sk, "ROSE: Transmitting buffer\n"); + + if (sk->sk_state != TCP_ESTABLISHED) { + kfree_skb(skb); + return -ENOTCONN; + } + +#ifdef M_BIT +#define ROSE_PACLEN (256-ROSE_MIN_LEN) + if (skb->len - ROSE_MIN_LEN > ROSE_PACLEN) { + unsigned char header[ROSE_MIN_LEN]; + struct sk_buff *skbn; + int frontlen; + int lg; + + /* Save a copy of the Header */ + memcpy(header, skb->data, ROSE_MIN_LEN); + skb_pull(skb, ROSE_MIN_LEN); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + ROSE_PACLEN, 0, &err)) == NULL) { + kfree_skb(skb); + return err; + } + + skbn->sk = sk; + skbn->free = 1; + skbn->arp = 1; + + skb_reserve(skbn, frontlen); + + lg = (ROSE_PACLEN > skb->len) ? 
skb->len : ROSE_PACLEN; + + /* Copy the user data */ + memcpy(skb_put(skbn, lg), skb->data, lg); + skb_pull(skb, lg); + + /* Duplicate the Header */ + skb_push(skbn, ROSE_MIN_LEN); + memcpy(skbn->data, header, ROSE_MIN_LEN); + + if (skb->len > 0) + skbn->data[2] |= M_BIT; + + skb_queue_tail(&sk->sk_write_queue, skbn); /* Throw it on the queue */ + } + + skb->free = 1; + kfree_skb(skb); + } else { + skb_queue_tail(&sk->sk_write_queue, skb); /* Throw it on the queue */ + } +#else + skb_queue_tail(&sk->sk_write_queue, skb); /* Shove it onto the queue */ +#endif + + rose_kick(sk); + + return len; +} + + +static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + struct sockaddr_rose *srose = (struct sockaddr_rose *)msg->msg_name; + size_t copied; + unsigned char *asmptr; + struct sk_buff *skb; + int n, er, qbit; + + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! We do one quick check first though + */ + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + /* Now we can treat all alike */ + if ((skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &er)) == NULL) + return er; + + qbit = (skb->data[0] & ROSE_Q_BIT) == ROSE_Q_BIT; + + skb_pull(skb, ROSE_MIN_LEN); + + if (rose->qbitincl) { + asmptr = skb_push(skb, 1); + *asmptr = qbit; + } + + skb->h.raw = skb->data; + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + if (srose != NULL) { + srose->srose_family = AF_ROSE; + srose->srose_addr = rose->dest_addr; + srose->srose_call = rose->dest_call; + srose->srose_ndigis = rose->dest_ndigis; + if (msg->msg_namelen >= sizeof(struct full_sockaddr_rose)) { + struct full_sockaddr_rose *full_srose = (struct full_sockaddr_rose *)msg->msg_name; + for (n = 0 ; n < rose->dest_ndigis ; n++) + full_srose->srose_digis[n] = rose->dest_digis[n]; + msg->msg_namelen = sizeof(struct full_sockaddr_rose); + } else { + if (rose->dest_ndigis >= 1) { + srose->srose_ndigis = 1; + srose->srose_digi = rose->dest_digis[0]; + } + msg->msg_namelen = sizeof(struct sockaddr_rose); + } + } + + skb_free_datagram(sk, skb); + + return copied; +} + + +static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct rose_sock *rose = rose_sk(sk); + void __user *argp = (void __user *)arg; + + switch (cmd) { + case TIOCOUTQ: { + long amount; + amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + return put_user(amount, (unsigned int __user *)argp); + } + + case TIOCINQ: { + struct sk_buff *skb; + long amount = 0L; + /* These two are safe on a single CPU system as only user tasks fiddle here */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + return put_user(amount, (unsigned int __user *)argp); + } + + case SIOCGSTAMP: + if (sk != NULL) + return sock_get_timestamp(sk, (struct timeval __user *)argp); + return -EINVAL; + + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + return -EINVAL; + + case SIOCADDRT: + case SIOCDELRT: + case SIOCRSCLRRT: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + return rose_rt_ioctl(cmd, argp); + + case SIOCRSGCAUSE: { + 
struct rose_cause_struct rose_cause; + rose_cause.cause = rose->cause; + rose_cause.diagnostic = rose->diagnostic; + return copy_to_user(argp, &rose_cause, sizeof(struct rose_cause_struct)) ? -EFAULT : 0; + } + + case SIOCRSSCAUSE: { + struct rose_cause_struct rose_cause; + if (copy_from_user(&rose_cause, argp, sizeof(struct rose_cause_struct))) + return -EFAULT; + rose->cause = rose_cause.cause; + rose->diagnostic = rose_cause.diagnostic; + return 0; + } + + case SIOCRSSL2CALL: + if (!capable(CAP_NET_ADMIN)) return -EPERM; + if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) + ax25_listen_release(&rose_callsign, NULL); + if (copy_from_user(&rose_callsign, argp, sizeof(ax25_address))) + return -EFAULT; + if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) + ax25_listen_register(&rose_callsign, NULL); + return 0; + + case SIOCRSGL2CALL: + return copy_to_user(argp, &rose_callsign, sizeof(ax25_address)) ? -EFAULT : 0; + + case SIOCRSACCEPT: + if (rose->state == ROSE_STATE_5) { + rose_write_internal(sk, ROSE_CALL_ACCEPTED); + rose_start_idletimer(sk); + rose->condition = 0x00; + rose->vs = 0; + rose->va = 0; + rose->vr = 0; + rose->vl = 0; + rose->state = ROSE_STATE_3; + } + return 0; + + default: + return dev_ioctl(cmd, argp); + } + + return 0; +} + +#ifdef CONFIG_PROC_FS +static void *rose_info_start(struct seq_file *seq, loff_t *pos) +{ + int i; + struct sock *s; + struct hlist_node *node; + + spin_lock_bh(&rose_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + i = 1; + sk_for_each(s, node, &rose_list) { + if (i == *pos) + return s; + ++i; + } + return NULL; +} + +static void *rose_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? sk_head(&rose_list) + : sk_next((struct sock *)v); +} + +static void rose_info_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&rose_list_lock); +} + +static int rose_info_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "dest_addr dest_call src_addr src_call dev lci neigh st vs vr va t t1 t2 t3 hb idle Snd-Q Rcv-Q inode\n"); + + else { + struct sock *s = v; + struct rose_sock *rose = rose_sk(s); + const char *devname, *callsign; + const struct net_device *dev = rose->device; + + if (!dev) + devname = "???"; + else + devname = dev->name; + + seq_printf(seq, "%-10s %-9s ", + rose2asc(&rose->dest_addr), + ax2asc(&rose->dest_call)); + + if (ax25cmp(&rose->source_call, &null_ax25_address) == 0) + callsign = "??????-?"; + else + callsign = ax2asc(&rose->source_call); + + seq_printf(seq, + "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", + rose2asc(&rose->source_addr), + callsign, + devname, + rose->lci & 0x0FFF, + (rose->neighbour) ? rose->neighbour->number : 0, + rose->state, + rose->vs, + rose->vr, + rose->va, + ax25_display_timer(&rose->timer) / HZ, + rose->t1 / HZ, + rose->t2 / HZ, + rose->t3 / HZ, + rose->hb / HZ, + ax25_display_timer(&rose->idletimer) / (60 * HZ), + rose->idle / (60 * HZ), + atomic_read(&s->sk_wmem_alloc), + atomic_read(&s->sk_rmem_alloc), + s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : 0L); + } + + return 0; +} + +static struct seq_operations rose_info_seqops = { + .start = rose_info_start, + .next = rose_info_next, + .stop = rose_info_stop, + .show = rose_info_show, +}; + +static int rose_info_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_info_seqops); +} + +static struct file_operations rose_info_fops = { + .owner = THIS_MODULE, + .open = rose_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif /* CONFIG_PROC_FS */ + +static struct net_proto_family rose_family_ops = { + .family = PF_ROSE, + .create = rose_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops rose_proto_ops = { + .family = PF_ROSE, + .owner = THIS_MODULE, + .release = rose_release, + .bind = rose_bind, + .connect = rose_connect, + .socketpair = sock_no_socketpair, + .accept = rose_accept, + .getname = rose_getname, + .poll = datagram_poll, + .ioctl = rose_ioctl, + .listen = rose_listen, + .shutdown = sock_no_shutdown, + .setsockopt = rose_setsockopt, + .getsockopt = rose_getsockopt, + .sendmsg = rose_sendmsg, + .recvmsg = rose_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct notifier_block rose_dev_notifier = { + .notifier_call = rose_device_event, +}; + +static struct net_device **dev_rose; + +static const char banner[] = KERN_INFO "F6FBB/G4KLX ROSE for Linux. Version 0.62 for AX25.037 Linux 2.4\n"; + +static int __init rose_proto_init(void) +{ + int i; + int rc = proto_register(&rose_proto, 0); + + if (rc != 0) + goto out; + + rose_callsign = null_ax25_address; + + if (rose_ndevs > 0x7FFFFFFF/sizeof(struct net_device *)) { + printk(KERN_ERR "ROSE: rose_proto_init - rose_ndevs parameter too large\n"); + return -1; + } + + dev_rose = kmalloc(rose_ndevs * sizeof(struct net_device *), GFP_KERNEL); + if (dev_rose == NULL) { + printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate device structure\n"); + return -1; + } + + memset(dev_rose, 0x00, rose_ndevs * sizeof(struct net_device*)); + for (i = 0; i < rose_ndevs; i++) { + struct net_device *dev; + char name[IFNAMSIZ]; + + sprintf(name, "rose%d", i); + dev = alloc_netdev(sizeof(struct net_device_stats), + name, rose_setup); + if (!dev) { + printk(KERN_ERR "ROSE: rose_proto_init - unable to allocate memory\n"); + goto fail; + } + if (register_netdev(dev)) { + printk(KERN_ERR "ROSE: netdevice registration failed\n"); + free_netdev(dev); + goto fail; + } + dev_rose[i] = dev; + } + + sock_register(&rose_family_ops); + register_netdevice_notifier(&rose_dev_notifier); + printk(banner); + + ax25_protocol_register(AX25_P_ROSE, rose_route_frame); + ax25_linkfail_register(rose_link_failed); + +#ifdef CONFIG_SYSCTL + rose_register_sysctl(); +#endif + rose_loopback_init(); + + rose_add_loopback_neigh(); + + proc_net_fops_create("rose", S_IRUGO, &rose_info_fops); + proc_net_fops_create("rose_neigh", S_IRUGO, &rose_neigh_fops); + proc_net_fops_create("rose_nodes", S_IRUGO, &rose_nodes_fops); + proc_net_fops_create("rose_routes", S_IRUGO, &rose_routes_fops); +out: + return rc; +fail: + while (--i >= 0) { + unregister_netdev(dev_rose[i]); + free_netdev(dev_rose[i]); + } + kfree(dev_rose); + proto_unregister(&rose_proto); + return -ENOMEM; +} +module_init(rose_proto_init); + +module_param(rose_ndevs, int, 0); +MODULE_PARM_DESC(rose_ndevs, "number of ROSE devices"); + +MODULE_AUTHOR("Jonathan Naylor G4KLX "); +MODULE_DESCRIPTION("The amateur radio ROSE network layer protocol"); +MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_ROSE); + +static void __exit rose_exit(void) +{ + int i; + + proc_net_remove("rose"); + proc_net_remove("rose_neigh"); + proc_net_remove("rose_nodes"); + proc_net_remove("rose_routes"); + rose_loopback_clear(); + + rose_rt_free(); + + ax25_protocol_release(AX25_P_ROSE); + ax25_linkfail_release(rose_link_failed); + + if (ax25cmp(&rose_callsign, &null_ax25_address) != 0) + ax25_listen_release(&rose_callsign, NULL); + +#ifdef CONFIG_SYSCTL + rose_unregister_sysctl(); +#endif + unregister_netdevice_notifier(&rose_dev_notifier); + + sock_unregister(PF_ROSE); + + for (i = 0; i < rose_ndevs; i++) { + struct net_device *dev = dev_rose[i]; + + if (dev) { + unregister_netdev(dev); + free_netdev(dev); + } + } + + kfree(dev_rose); + proto_unregister(&rose_proto); +} + +module_exit(rose_exit); diff --git a/net/rose/rose_dev.c b/net/rose/rose_dev.c new file mode 100644 index 000000000000..a8ed9a1d09f9 --- /dev/null +++ b/net/rose/rose_dev.c @@ -0,0 +1,154 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +static int rose_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, + void *daddr, void *saddr, unsigned len) +{ + unsigned char *buff = skb_push(skb, ROSE_MIN_LEN + 2); + + *buff++ = ROSE_GFI | ROSE_Q_BIT; + *buff++ = 0x00; + *buff++ = ROSE_DATA; + *buff++ = 0x7F; + *buff++ = AX25_P_IP; + + if (daddr != NULL) + return 37; + + return -37; +} + +static int rose_rebuild_header(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct net_device_stats *stats = netdev_priv(dev); + unsigned char *bp = (unsigned char *)skb->data; + struct sk_buff *skbn; + +#ifdef CONFIG_INET + if (arp_find(bp + 7, skb)) { + return 1; + } + + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + kfree_skb(skb); + return 1; + } + + if (skb->sk != NULL) + skb_set_owner_w(skbn, skb->sk); + + kfree_skb(skb); + + if (!rose_route_frame(skbn, NULL)) { + kfree_skb(skbn); + stats->tx_errors++; + return 1; + } + + stats->tx_packets++; + stats->tx_bytes += skbn->len; +#endif + return 1; +} + +static int rose_set_mac_address(struct net_device *dev, void *addr) +{ + struct sockaddr *sa = addr; + + rose_del_loopback_node((rose_address *)dev->dev_addr); + + memcpy(dev->dev_addr, sa->sa_data, dev->addr_len); + + rose_add_loopback_node((rose_address *)dev->dev_addr); + + return 0; +} + +static int rose_open(struct net_device *dev) +{ + netif_start_queue(dev); + rose_add_loopback_node((rose_address *)dev->dev_addr); + return 0; +} + +static int rose_close(struct net_device *dev) +{ + netif_stop_queue(dev); + rose_del_loopback_node((rose_address *)dev->dev_addr); + return 0; +} + +static int rose_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct net_device_stats *stats = netdev_priv(dev); + + if (!netif_running(dev)) { + printk(KERN_ERR "ROSE: rose_xmit - called when iface is down\n"); + return 1; + } + dev_kfree_skb(skb); + stats->tx_errors++; + return 0; +} + +static struct net_device_stats *rose_get_stats(struct net_device 
*dev) +{ + return netdev_priv(dev); +} + +void rose_setup(struct net_device *dev) +{ + SET_MODULE_OWNER(dev); + dev->mtu = ROSE_MAX_PACKET_SIZE - 2; + dev->hard_start_xmit = rose_xmit; + dev->open = rose_open; + dev->stop = rose_close; + + dev->hard_header = rose_header; + dev->hard_header_len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN; + dev->addr_len = ROSE_ADDR_LEN; + dev->type = ARPHRD_ROSE; + dev->rebuild_header = rose_rebuild_header; + dev->set_mac_address = rose_set_mac_address; + + /* New-style flags. */ + dev->flags = 0; + dev->get_stats = rose_get_stats; +} diff --git a/net/rose/rose_in.c b/net/rose/rose_in.c new file mode 100644 index 000000000000..ef475a1bb1ba --- /dev/null +++ b/net/rose/rose_in.c @@ -0,0 +1,297 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * + * Most of this code is based on the SDL diagrams published in the 7th ARRL + * Computer Networking Conference papers. The diagrams have mistakes in them, + * but are mostly correct. Before you modify the code could you read the SDL + * diagrams as the code is not obvious and probably very easy to break. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For ip_rcv */ +#include +#include +#include +#include +#include +#include + +/* + * State machine for state 1, Awaiting Call Accepted State. + * The handling of the timer(s) is in file rose_timer.c. + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + + switch (frametype) { + case ROSE_CALL_ACCEPTED: + rose_stop_timer(sk); + rose_start_idletimer(sk); + rose->condition = 0x00; + rose->vs = 0; + rose->va = 0; + rose->vr = 0; + rose->vl = 0; + rose->state = ROSE_STATE_3; + sk->sk_state = TCP_ESTABLISHED; + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Clear Confirmation State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + + switch (frametype) { + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + case ROSE_CLEAR_CONFIRMATION: + rose_disconnect(sk, 0, -1, -1); + rose->neighbour->use--; + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. 
+ */ +static int rose_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) +{ + struct rose_sock *rose = rose_sk(sk); + int queued = 0; + + switch (frametype) { + case ROSE_RESET_REQUEST: + rose_stop_timer(sk); + rose_start_idletimer(sk); + rose_write_internal(sk, ROSE_RESET_CONFIRMATION); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose_requeue_frames(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + case ROSE_RR: + case ROSE_RNR: + if (!rose_validate_nr(sk, nr)) { + rose_write_internal(sk, ROSE_RESET_REQUEST); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + } else { + rose_frames_acked(sk, nr); + if (frametype == ROSE_RNR) { + rose->condition |= ROSE_COND_PEER_RX_BUSY; + } else { + rose->condition &= ~ROSE_COND_PEER_RX_BUSY; + } + } + break; + + case ROSE_DATA: /* XXX */ + rose->condition &= ~ROSE_COND_PEER_RX_BUSY; + if (!rose_validate_nr(sk, nr)) { + rose_write_internal(sk, ROSE_RESET_REQUEST); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + break; + } + rose_frames_acked(sk, nr); + if (ns == rose->vr) { + rose_start_idletimer(sk); + if (sock_queue_rcv_skb(sk, skb) == 0) { + rose->vr = (rose->vr + 1) % ROSE_MODULUS; + queued = 1; + } else { + /* Should never happen ! */ + rose_write_internal(sk, ROSE_RESET_REQUEST); + rose->condition = 0x00; + rose->vs = 0; + rose->vr = 0; + rose->va = 0; + rose->vl = 0; + rose->state = ROSE_STATE_4; + rose_start_t2timer(sk); + rose_stop_idletimer(sk); + break; + } + if (atomic_read(&sk->sk_rmem_alloc) > + (sk->sk_rcvbuf / 2)) + rose->condition |= ROSE_COND_OWN_RX_BUSY; + } + /* + * If the window is full, ack the frame, else start the + * acknowledge hold back timer. + */ + if (((rose->vl + sysctl_rose_window_size) % ROSE_MODULUS) == rose->vr) { + rose->condition &= ~ROSE_COND_ACK_PENDING; + rose_stop_timer(sk); + rose_enquiry_response(sk); + } else { + rose->condition |= ROSE_COND_ACK_PENDING; + rose_start_hbtimer(sk); + } + break; + + default: + printk(KERN_WARNING "ROSE: unknown %02X in state 3\n", frametype); + break; + } + + return queued; +} + +/* + * State machine for state 4, Awaiting Reset Confirmation State. + * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + + switch (frametype) { + case ROSE_RESET_REQUEST: + rose_write_internal(sk, ROSE_RESET_CONFIRMATION); + case ROSE_RESET_CONFIRMATION: + rose_stop_timer(sk); + rose_start_idletimer(sk); + rose->condition = 0x00; + rose->va = 0; + rose->vr = 0; + rose->vs = 0; + rose->vl = 0; + rose->state = ROSE_STATE_3; + rose_requeue_frames(sk); + break; + + case ROSE_CLEAR_REQUEST: + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose->neighbour->use--; + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 5, Awaiting Call Acceptance State. 
+ * The handling of the timer(s) is in file rose_timer.c + * Handling of state 0 and connection release is in af_rose.c. + */ +static int rose_state5_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + if (frametype == ROSE_CLEAR_REQUEST) { + rose_write_internal(sk, ROSE_CLEAR_CONFIRMATION); + rose_disconnect(sk, 0, skb->data[3], skb->data[4]); + rose_sk(sk)->neighbour->use--; + } + + return 0; +} + +/* Higher level upcall for a LAPB frame */ +int rose_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + struct rose_sock *rose = rose_sk(sk); + int queued = 0, frametype, ns, nr, q, d, m; + + if (rose->state == ROSE_STATE_0) + return 0; + + frametype = rose_decode(skb, &ns, &nr, &q, &d, &m); + + switch (rose->state) { + case ROSE_STATE_1: + queued = rose_state1_machine(sk, skb, frametype); + break; + case ROSE_STATE_2: + queued = rose_state2_machine(sk, skb, frametype); + break; + case ROSE_STATE_3: + queued = rose_state3_machine(sk, skb, frametype, ns, nr, q, d, m); + break; + case ROSE_STATE_4: + queued = rose_state4_machine(sk, skb, frametype); + break; + case ROSE_STATE_5: + queued = rose_state5_machine(sk, skb, frametype); + break; + } + + rose_kick(sk); + + return queued; +} diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c new file mode 100644 index 000000000000..09e9e9d04d92 --- /dev/null +++ b/net/rose/rose_link.c @@ -0,0 +1,288 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void rose_ftimer_expiry(unsigned long); +static void rose_t0timer_expiry(unsigned long); + +static void rose_transmit_restart_confirmation(struct rose_neigh *neigh); +static void rose_transmit_restart_request(struct rose_neigh *neigh); + +void rose_start_ftimer(struct rose_neigh *neigh) +{ + del_timer(&neigh->ftimer); + + neigh->ftimer.data = (unsigned long)neigh; + neigh->ftimer.function = &rose_ftimer_expiry; + neigh->ftimer.expires = jiffies + sysctl_rose_link_fail_timeout; + + add_timer(&neigh->ftimer); +} + +static void rose_start_t0timer(struct rose_neigh *neigh) +{ + del_timer(&neigh->t0timer); + + neigh->t0timer.data = (unsigned long)neigh; + neigh->t0timer.function = &rose_t0timer_expiry; + neigh->t0timer.expires = jiffies + sysctl_rose_restart_request_timeout; + + add_timer(&neigh->t0timer); +} + +void rose_stop_ftimer(struct rose_neigh *neigh) +{ + del_timer(&neigh->ftimer); +} + +void rose_stop_t0timer(struct rose_neigh *neigh) +{ + del_timer(&neigh->t0timer); +} + +int rose_ftimer_running(struct rose_neigh *neigh) +{ + return timer_pending(&neigh->ftimer); +} + +static int rose_t0timer_running(struct rose_neigh *neigh) +{ + return timer_pending(&neigh->t0timer); +} + +static void rose_ftimer_expiry(unsigned long param) +{ +} + +static void rose_t0timer_expiry(unsigned long param) +{ + struct rose_neigh *neigh = (struct rose_neigh *)param; + + rose_transmit_restart_request(neigh); + + neigh->dce_mode = 0; + + rose_start_t0timer(neigh); +} + +/* + * Interface to ax25_send_frame. 
Changes my level 2 callsign depending + * on whether we have a global ROSE callsign or use the default port + * callsign. + */ +static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) +{ + ax25_address *rose_call; + + if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) + rose_call = (ax25_address *)neigh->dev->dev_addr; + else + rose_call = &rose_callsign; + + neigh->ax25 = ax25_send_frame(skb, 260, rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + + return (neigh->ax25 != NULL); +} + +/* + * Interface to ax25_link_up. Changes my level 2 callsign depending + * on whether we have a global ROSE callsign or use the default port + * callsign. + */ +static int rose_link_up(struct rose_neigh *neigh) +{ + ax25_address *rose_call; + + if (ax25cmp(&rose_callsign, &null_ax25_address) == 0) + rose_call = (ax25_address *)neigh->dev->dev_addr; + else + rose_call = &rose_callsign; + + neigh->ax25 = ax25_find_cb(rose_call, &neigh->callsign, neigh->digipeat, neigh->dev); + + return (neigh->ax25 != NULL); +} + +/* + * This handles all restart and diagnostic frames. + */ +void rose_link_rx_restart(struct sk_buff *skb, struct rose_neigh *neigh, unsigned short frametype) +{ + struct sk_buff *skbn; + + switch (frametype) { + case ROSE_RESTART_REQUEST: + rose_stop_t0timer(neigh); + neigh->restarted = 1; + neigh->dce_mode = (skb->data[3] == ROSE_DTE_ORIGINATED); + rose_transmit_restart_confirmation(neigh); + break; + + case ROSE_RESTART_CONFIRMATION: + rose_stop_t0timer(neigh); + neigh->restarted = 1; + break; + + case ROSE_DIAGNOSTIC: + printk(KERN_WARNING "ROSE: received diagnostic #%d - %02X %02X %02X\n", skb->data[3], skb->data[4], skb->data[5], skb->data[6]); + break; + + default: + printk(KERN_WARNING "ROSE: received unknown %02X with LCI 000\n", frametype); + break; + } + + if (neigh->restarted) { + while ((skbn = skb_dequeue(&neigh->queue)) != NULL) + if (!rose_send_frame(skbn, neigh)) + kfree_skb(skbn); + } +} + +/* + * This routine is called when a Restart Request is needed + */ +static void rose_transmit_restart_request(struct rose_neigh *neigh) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 3); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ROSE_GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_RESTART_REQUEST; + *dptr++ = ROSE_DTE_ORIGINATED; + *dptr++ = 0; + + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); +} + +/* + * This routine is called when a Restart Confirmation is needed + */ +static void rose_transmit_restart_confirmation(struct rose_neigh *neigh) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 1); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ROSE_GFI; + *dptr++ = 0x00; + *dptr++ = ROSE_RESTART_CONFIRMATION; + + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); +} + +/* + * This routine is called when a Clear Request is needed outside of the context + * of a connected socket. 
+ */ +void rose_transmit_clear_request(struct rose_neigh *neigh, unsigned int lci, unsigned char cause, unsigned char diagnostic) +{ + struct sk_buff *skb; + unsigned char *dptr; + int len; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 3; + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN); + + dptr = skb_put(skb, ROSE_MIN_LEN + 3); + + *dptr++ = AX25_P_ROSE; + *dptr++ = ((lci >> 8) & 0x0F) | ROSE_GFI; + *dptr++ = ((lci >> 0) & 0xFF); + *dptr++ = ROSE_CLEAR_REQUEST; + *dptr++ = cause; + *dptr++ = diagnostic; + + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); +} + +void rose_transmit_link(struct sk_buff *skb, struct rose_neigh *neigh) +{ + unsigned char *dptr; + +#if 0 + if (call_fw_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) { + kfree_skb(skb); + return; + } +#endif + + if (neigh->loopback) { + rose_loopback_queue(skb, neigh); + return; + } + + if (!rose_link_up(neigh)) + neigh->restarted = 0; + + dptr = skb_push(skb, 1); + *dptr++ = AX25_P_ROSE; + + if (neigh->restarted) { + if (!rose_send_frame(skb, neigh)) + kfree_skb(skb); + } else { + skb_queue_tail(&neigh->queue, skb); + + if (!rose_t0timer_running(neigh)) { + rose_transmit_restart_request(neigh); + neigh->dce_mode = 0; + rose_start_t0timer(neigh); + } + } +} diff --git a/net/rose/rose_loopback.c b/net/rose/rose_loopback.c new file mode 100644 index 000000000000..103b4d38f88a --- /dev/null +++ b/net/rose/rose_loopback.c @@ -0,0 +1,111 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include + +static struct sk_buff_head loopback_queue; +static struct timer_list loopback_timer; + +static void rose_set_loopback_timer(void); + +void rose_loopback_init(void) +{ + skb_queue_head_init(&loopback_queue); + + init_timer(&loopback_timer); +} + +static int rose_loopback_running(void) +{ + return timer_pending(&loopback_timer); +} + +int rose_loopback_queue(struct sk_buff *skb, struct rose_neigh *neigh) +{ + struct sk_buff *skbn; + + skbn = skb_clone(skb, GFP_ATOMIC); + + kfree_skb(skb); + + if (skbn != NULL) { + skb_queue_tail(&loopback_queue, skbn); + + if (!rose_loopback_running()) + rose_set_loopback_timer(); + } + + return 1; +} + +static void rose_loopback_timer(unsigned long); + +static void rose_set_loopback_timer(void) +{ + del_timer(&loopback_timer); + + loopback_timer.data = 0; + loopback_timer.function = &rose_loopback_timer; + loopback_timer.expires = jiffies + 10; + + add_timer(&loopback_timer); +} + +static void rose_loopback_timer(unsigned long param) +{ + struct sk_buff *skb; + struct net_device *dev; + rose_address *dest; + struct sock *sk; + unsigned short frametype; + unsigned int lci_i, lci_o; + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + lci_i = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + frametype = skb->data[2]; + dest = (rose_address *)(skb->data + 4); + lci_o = 0xFFF - lci_i; + + skb->h.raw = skb->data; + + if ((sk = rose_find_socket(lci_o, rose_loopback_neigh)) != NULL) { + if (rose_process_rx_frame(sk, skb) == 0) + kfree_skb(skb); + continue; + } + + if (frametype == ROSE_CALL_REQUEST) { + if ((dev = rose_dev_get(dest)) != NULL) { + if (rose_rx_call_request(skb, dev, rose_loopback_neigh, lci_o) == 0) + kfree_skb(skb); + } else { + kfree_skb(skb); + } + } else { + kfree_skb(skb); + } + } +} + +void __exit rose_loopback_clear(void) +{ + struct sk_buff *skb; + + del_timer(&loopback_timer); + + while ((skb = skb_dequeue(&loopback_queue)) != NULL) { + skb->sk = NULL; + kfree_skb(skb); + } +} diff --git a/net/rose/rose_out.c b/net/rose/rose_out.c new file mode 100644 index 000000000000..2965ffc83b9b --- /dev/null +++ b/net/rose/rose_out.c @@ -0,0 +1,126 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
+ */ +static void rose_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + struct rose_sock *rose = rose_sk(sk); + + if (skb == NULL) + return; + + skb->data[2] |= (rose->vr << 5) & 0xE0; + skb->data[2] |= (rose->vs << 1) & 0x0E; + + rose_start_idletimer(sk); + + rose_transmit_link(skb, rose->neighbour); +} + +void rose_kick(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + struct sk_buff *skb, *skbn; + unsigned short start, end; + + if (rose->state != ROSE_STATE_3) + return; + + if (rose->condition & ROSE_COND_PEER_RX_BUSY) + return; + + if (!skb_peek(&sk->sk_write_queue)) + return; + + start = (skb_peek(&rose->ack_queue) == NULL) ? rose->va : rose->vs; + end = (rose->va + sysctl_rose_window_size) % ROSE_MODULUS; + + if (start == end) + return; + + rose->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + skb = skb_dequeue(&sk->sk_write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->sk_write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + + /* + * Transmit the frame copy. + */ + rose_send_iframe(sk, skbn); + + rose->vs = (rose->vs + 1) % ROSE_MODULUS; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&rose->ack_queue, skb); + + } while (rose->vs != end && + (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); + + rose->vl = rose->vr; + rose->condition &= ~ROSE_COND_ACK_PENDING; + + rose_stop_timer(sk); +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void rose_enquiry_response(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + if (rose->condition & ROSE_COND_OWN_RX_BUSY) + rose_write_internal(sk, ROSE_RNR); + else + rose_write_internal(sk, ROSE_RR); + + rose->vl = rose->vr; + rose->condition &= ~ROSE_COND_ACK_PENDING; + + rose_stop_timer(sk); +} diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c new file mode 100644 index 000000000000..ff73ebb912b8 --- /dev/null +++ b/net/rose/rose_route.c @@ -0,0 +1,1343 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) Terry Dawson VK2KTJ (terry@animats.net) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include +#include +#include +#include +#include + +static unsigned int rose_neigh_no = 1; + +static struct rose_node *rose_node_list; +static DEFINE_SPINLOCK(rose_node_list_lock); +static struct rose_neigh *rose_neigh_list; +static DEFINE_SPINLOCK(rose_neigh_list_lock); +static struct rose_route *rose_route_list; +static DEFINE_SPINLOCK(rose_route_list_lock); + +struct rose_neigh *rose_loopback_neigh; + +static void rose_remove_neigh(struct rose_neigh *); + +/* + * Add a new route to a node, and in the process add the node and the + * neighbour if it is new. 
+ */ +static int rose_add_node(struct rose_route_struct *rose_route, + struct net_device *dev) +{ + struct rose_node *rose_node, *rose_tmpn, *rose_tmpp; + struct rose_neigh *rose_neigh; + int i, res = 0; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == rose_route->mask) && + (rosecmpm(&rose_route->address, &rose_node->address, + rose_route->mask) == 0)) + break; + rose_node = rose_node->next; + } + + if (rose_node != NULL && rose_node->loopback) { + res = -EINVAL; + goto out; + } + + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 + && rose_neigh->dev == dev) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh == NULL) { + rose_neigh = kmalloc(sizeof(*rose_neigh), GFP_ATOMIC); + if (rose_neigh == NULL) { + res = -ENOMEM; + goto out; + } + + rose_neigh->callsign = rose_route->neighbour; + rose_neigh->digipeat = NULL; + rose_neigh->ax25 = NULL; + rose_neigh->dev = dev; + rose_neigh->count = 0; + rose_neigh->use = 0; + rose_neigh->dce_mode = 0; + rose_neigh->loopback = 0; + rose_neigh->number = rose_neigh_no++; + rose_neigh->restarted = 0; + + skb_queue_head_init(&rose_neigh->queue); + + init_timer(&rose_neigh->ftimer); + init_timer(&rose_neigh->t0timer); + + if (rose_route->ndigis != 0) { + if ((rose_neigh->digipeat = kmalloc(sizeof(ax25_digi), GFP_KERNEL)) == NULL) { + kfree(rose_neigh); + res = -ENOMEM; + goto out; + } + + rose_neigh->digipeat->ndigi = rose_route->ndigis; + rose_neigh->digipeat->lastrepeat = -1; + + for (i = 0; i < rose_route->ndigis; i++) { + rose_neigh->digipeat->calls[i] = + rose_route->digipeaters[i]; + rose_neigh->digipeat->repeated[i] = 0; + } + } + + rose_neigh->next = rose_neigh_list; + rose_neigh_list = rose_neigh; + } + + /* + * This is a new node to be inserted into the list. Find where it needs + * to be inserted into the list, and insert it. We want to be sure + * to order the list in descending order of mask size to ensure that + * later when we are searching this list the first match will be the + * best match. + */ + if (rose_node == NULL) { + rose_tmpn = rose_node_list; + rose_tmpp = NULL; + + while (rose_tmpn != NULL) { + if (rose_tmpn->mask > rose_route->mask) { + rose_tmpp = rose_tmpn; + rose_tmpn = rose_tmpn->next; + } else { + break; + } + } + + /* create new node */ + rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC); + if (rose_node == NULL) { + res = -ENOMEM; + goto out; + } + + rose_node->address = rose_route->address; + rose_node->mask = rose_route->mask; + rose_node->count = 1; + rose_node->loopback = 0; + rose_node->neighbour[0] = rose_neigh; + + if (rose_tmpn == NULL) { + if (rose_tmpp == NULL) { /* Empty list */ + rose_node_list = rose_node; + rose_node->next = NULL; + } else { + rose_tmpp->next = rose_node; + rose_node->next = NULL; + } + } else { + if (rose_tmpp == NULL) { /* 1st node */ + rose_node->next = rose_node_list; + rose_node_list = rose_node; + } else { + rose_tmpp->next = rose_node; + rose_node->next = rose_tmpn; + } + } + rose_neigh->count++; + + goto out; + } + + /* We have space, slot it in */ + if (rose_node->count < 3) { + rose_node->neighbour[rose_node->count] = rose_neigh; + rose_node->count++; + rose_neigh->count++; + } + +out: + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return res; +} + +/* + * Caller is holding rose_node_list_lock. 
+ */ +static void rose_remove_node(struct rose_node *rose_node) +{ + struct rose_node *s; + + if ((s = rose_node_list) == rose_node) { + rose_node_list = rose_node->next; + kfree(rose_node); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_node) { + s->next = rose_node->next; + kfree(rose_node); + return; + } + + s = s->next; + } +} + +/* + * Caller is holding rose_neigh_list_lock. + */ +static void rose_remove_neigh(struct rose_neigh *rose_neigh) +{ + struct rose_neigh *s; + + rose_stop_ftimer(rose_neigh); + rose_stop_t0timer(rose_neigh); + + skb_queue_purge(&rose_neigh->queue); + + spin_lock_bh(&rose_neigh_list_lock); + + if ((s = rose_neigh_list) == rose_neigh) { + rose_neigh_list = rose_neigh->next; + spin_unlock_bh(&rose_neigh_list_lock); + if (rose_neigh->digipeat != NULL) + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_neigh) { + s->next = rose_neigh->next; + spin_unlock_bh(&rose_neigh_list_lock); + if (rose_neigh->digipeat != NULL) + kfree(rose_neigh->digipeat); + kfree(rose_neigh); + return; + } + + s = s->next; + } + spin_unlock_bh(&rose_neigh_list_lock); +} + +/* + * Caller is holding rose_route_list_lock. + */ +static void rose_remove_route(struct rose_route *rose_route) +{ + struct rose_route *s; + + if (rose_route->neigh1 != NULL) + rose_route->neigh1->use--; + + if (rose_route->neigh2 != NULL) + rose_route->neigh2->use--; + + if ((s = rose_route_list) == rose_route) { + rose_route_list = rose_route->next; + kfree(rose_route); + return; + } + + while (s != NULL && s->next != NULL) { + if (s->next == rose_route) { + s->next = rose_route->next; + kfree(rose_route); + return; + } + + s = s->next; + } +} + +/* + * "Delete" a node. Strictly speaking remove a route to a node. The node + * is only deleted if no routes are left to it. + */ +static int rose_del_node(struct rose_route_struct *rose_route, + struct net_device *dev) +{ + struct rose_node *rose_node; + struct rose_neigh *rose_neigh; + int i, err = 0; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == rose_route->mask) && + (rosecmpm(&rose_route->address, &rose_node->address, + rose_route->mask) == 0)) + break; + rose_node = rose_node->next; + } + + if (rose_node == NULL || rose_node->loopback) { + err = -EINVAL; + goto out; + } + + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (ax25cmp(&rose_route->neighbour, &rose_neigh->callsign) == 0 + && rose_neigh->dev == dev) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh == NULL) { + err = -EINVAL; + goto out; + } + + for (i = 0; i < rose_node->count; i++) { + if (rose_node->neighbour[i] == rose_neigh) { + rose_neigh->count--; + + if (rose_neigh->count == 0 && rose_neigh->use == 0) + rose_remove_neigh(rose_neigh); + + rose_node->count--; + + if (rose_node->count == 0) { + rose_remove_node(rose_node); + } else { + switch (i) { + case 0: + rose_node->neighbour[0] = + rose_node->neighbour[1]; + case 1: + rose_node->neighbour[1] = + rose_node->neighbour[2]; + case 2: + break; + } + } + goto out; + } + } + err = -EINVAL; + +out: + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return err; +} + +/* + * Add the loopback neighbour. 
+ */ +int rose_add_loopback_neigh(void) +{ + if ((rose_loopback_neigh = kmalloc(sizeof(struct rose_neigh), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + rose_loopback_neigh->callsign = null_ax25_address; + rose_loopback_neigh->digipeat = NULL; + rose_loopback_neigh->ax25 = NULL; + rose_loopback_neigh->dev = NULL; + rose_loopback_neigh->count = 0; + rose_loopback_neigh->use = 0; + rose_loopback_neigh->dce_mode = 1; + rose_loopback_neigh->loopback = 1; + rose_loopback_neigh->number = rose_neigh_no++; + rose_loopback_neigh->restarted = 1; + + skb_queue_head_init(&rose_loopback_neigh->queue); + + init_timer(&rose_loopback_neigh->ftimer); + init_timer(&rose_loopback_neigh->t0timer); + + spin_lock_bh(&rose_neigh_list_lock); + rose_loopback_neigh->next = rose_neigh_list; + rose_neigh_list = rose_loopback_neigh; + spin_unlock_bh(&rose_neigh_list_lock); + + return 0; +} + +/* + * Add a loopback node. + */ +int rose_add_loopback_node(rose_address *address) +{ + struct rose_node *rose_node; + unsigned int err = 0; + + spin_lock_bh(&rose_node_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == 10) && + (rosecmpm(address, &rose_node->address, 10) == 0) && + rose_node->loopback) + break; + rose_node = rose_node->next; + } + + if (rose_node != NULL) + goto out; + + if ((rose_node = kmalloc(sizeof(*rose_node), GFP_ATOMIC)) == NULL) { + err = -ENOMEM; + goto out; + } + + rose_node->address = *address; + rose_node->mask = 10; + rose_node->count = 1; + rose_node->loopback = 1; + rose_node->neighbour[0] = rose_loopback_neigh; + + /* Insert at the head of list. Address is always mask=10 */ + rose_node->next = rose_node_list; + rose_node_list = rose_node; + + rose_loopback_neigh->count++; + +out: + spin_unlock_bh(&rose_node_list_lock); + + return 0; +} + +/* + * Delete a loopback node. + */ +void rose_del_loopback_node(rose_address *address) +{ + struct rose_node *rose_node; + + spin_lock_bh(&rose_node_list_lock); + + rose_node = rose_node_list; + while (rose_node != NULL) { + if ((rose_node->mask == 10) && + (rosecmpm(address, &rose_node->address, 10) == 0) && + rose_node->loopback) + break; + rose_node = rose_node->next; + } + + if (rose_node == NULL) + goto out; + + rose_remove_node(rose_node); + + rose_loopback_neigh->count--; + +out: + spin_unlock_bh(&rose_node_list_lock); +} + +/* + * A device has been removed. Remove its routes and neighbours. + */ +void rose_rt_device_down(struct net_device *dev) +{ + struct rose_neigh *s, *rose_neigh; + struct rose_node *t, *rose_node; + int i; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + if (s->dev != dev) + continue; + + rose_node = rose_node_list; + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + + for (i = 0; i < t->count; i++) { + if (t->neighbour[i] != s) + continue; + + t->count--; + + switch (i) { + case 0: + t->neighbour[0] = t->neighbour[1]; + case 1: + t->neighbour[1] = t->neighbour[2]; + case 2: + break; + } + } + + if (t->count <= 0) + rose_remove_node(t); + } + + rose_remove_neigh(s); + } + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); +} + +#if 0 /* Currently unused */ +/* + * A device has been removed. Remove its links. 
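+ * Every through route with either end on the removed device is deleted.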
+ */ +void rose_route_device_down(struct net_device *dev) +{ + struct rose_route *s, *rose_route; + + spin_lock_bh(&rose_route_list_lock); + rose_route = rose_route_list; + while (rose_route != NULL) { + s = rose_route; + rose_route = rose_route->next; + + if (s->neigh1->dev == dev || s->neigh2->dev == dev) + rose_remove_route(s); + } + spin_unlock_bh(&rose_route_list_lock); +} +#endif + +/* + * Clear all nodes and neighbours out, except for neighbours with + * active connections going through them. + * Do not clear loopback neighbour and nodes. + */ +static int rose_clear_routes(void) +{ + struct rose_neigh *s, *rose_neigh; + struct rose_node *t, *rose_node; + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + + rose_neigh = rose_neigh_list; + rose_node = rose_node_list; + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + if (!t->loopback) + rose_remove_node(t); + } + + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + if (s->use == 0 && !s->loopback) { + s->count = 0; + rose_remove_neigh(s); + } + } + + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return 0; +} + +/* + * Check that the device given is a valid AX.25 interface that is "up". + */ +static struct net_device *rose_ax25_dev_get(char *devname) +{ + struct net_device *dev; + + if ((dev = dev_get_by_name(devname)) == NULL) + return NULL; + + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_AX25) + return dev; + + dev_put(dev); + return NULL; +} + +/* + * Find the first active ROSE device, usually "rose0". + */ +struct net_device *rose_dev_first(void) +{ + struct net_device *dev, *first = NULL; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE) + if (first == NULL || strncmp(dev->name, first->name, 3) < 0) + first = dev; + } + read_unlock(&dev_base_lock); + + return first; +} + +/* + * Find the ROSE device for the given address. + */ +struct net_device *rose_dev_get(rose_address *addr) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) { + dev_hold(dev); + goto out; + } + } +out: + read_unlock(&dev_base_lock); + return dev; +} + +static int rose_dev_exists(rose_address *addr) +{ + struct net_device *dev; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev != NULL; dev = dev->next) { + if ((dev->flags & IFF_UP) && dev->type == ARPHRD_ROSE && rosecmp(addr, (rose_address *)dev->dev_addr) == 0) + goto out; + } +out: + read_unlock(&dev_base_lock); + return dev != NULL; +} + + + + +struct rose_route *rose_route_free_lci(unsigned int lci, struct rose_neigh *neigh) +{ + struct rose_route *rose_route; + + for (rose_route = rose_route_list; rose_route != NULL; rose_route = rose_route->next) + if ((rose_route->neigh1 == neigh && rose_route->lci1 == lci) || + (rose_route->neigh2 == neigh && rose_route->lci2 == lci)) + return rose_route; + + return NULL; +} + +/* + * Find a neighbour given a ROSE address. 
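+ * The first neighbour of the matching node whose ftimer is not running
+ * is returned; otherwise *cause and *diagnostic are filled in for the
+ * clear request to be sent.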
+ */ +struct rose_neigh *rose_get_neigh(rose_address *addr, unsigned char *cause, + unsigned char *diagnostic) +{ + struct rose_neigh *res = NULL; + struct rose_node *node; + int failed = 0; + int i; + + spin_lock_bh(&rose_node_list_lock); + for (node = rose_node_list; node != NULL; node = node->next) { + if (rosecmpm(addr, &node->address, node->mask) == 0) { + for (i = 0; i < node->count; i++) { + if (!rose_ftimer_running(node->neighbour[i])) { + res = node->neighbour[i]; + goto out; + } else + failed = 1; + } + break; + } + } + + if (failed) { + *cause = ROSE_OUT_OF_ORDER; + *diagnostic = 0; + } else { + *cause = ROSE_NOT_OBTAINABLE; + *diagnostic = 0; + } + +out: + spin_unlock_bh(&rose_node_list_lock); + + return res; +} + +/* + * Handle the ioctls that control the routing functions. + */ +int rose_rt_ioctl(unsigned int cmd, void __user *arg) +{ + struct rose_route_struct rose_route; + struct net_device *dev; + int err; + + switch (cmd) { + case SIOCADDRT: + if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) + return -EFAULT; + if ((dev = rose_ax25_dev_get(rose_route.device)) == NULL) + return -EINVAL; + if (rose_dev_exists(&rose_route.address)) { /* Can't add routes to ourself */ + dev_put(dev); + return -EINVAL; + } + if (rose_route.mask > 10) /* Mask can't be more than 10 digits */ + return -EINVAL; + if (rose_route.ndigis > 8) /* No more than 8 digipeats */ + return -EINVAL; + err = rose_add_node(&rose_route, dev); + dev_put(dev); + return err; + + case SIOCDELRT: + if (copy_from_user(&rose_route, arg, sizeof(struct rose_route_struct))) + return -EFAULT; + if ((dev = rose_ax25_dev_get(rose_route.device)) == NULL) + return -EINVAL; + err = rose_del_node(&rose_route, dev); + dev_put(dev); + return err; + + case SIOCRSCLRRT: + return rose_clear_routes(); + + default: + return -EINVAL; + } + + return 0; +} + +static void rose_del_route_by_neigh(struct rose_neigh *rose_neigh) +{ + struct rose_route *rose_route, *s; + + rose_neigh->restarted = 0; + + rose_stop_t0timer(rose_neigh); + rose_start_ftimer(rose_neigh); + + skb_queue_purge(&rose_neigh->queue); + + spin_lock_bh(&rose_route_list_lock); + + rose_route = rose_route_list; + + while (rose_route != NULL) { + if ((rose_route->neigh1 == rose_neigh && rose_route->neigh2 == rose_neigh) || + (rose_route->neigh1 == rose_neigh && rose_route->neigh2 == NULL) || + (rose_route->neigh2 == rose_neigh && rose_route->neigh1 == NULL)) { + s = rose_route->next; + rose_remove_route(rose_route); + rose_route = s; + continue; + } + + if (rose_route->neigh1 == rose_neigh) { + rose_route->neigh1->use--; + rose_route->neigh1 = NULL; + rose_transmit_clear_request(rose_route->neigh2, rose_route->lci2, ROSE_OUT_OF_ORDER, 0); + } + + if (rose_route->neigh2 == rose_neigh) { + rose_route->neigh2->use--; + rose_route->neigh2 = NULL; + rose_transmit_clear_request(rose_route->neigh1, rose_route->lci1, ROSE_OUT_OF_ORDER, 0); + } + + rose_route = rose_route->next; + } + spin_unlock_bh(&rose_route_list_lock); +} + +/* + * A level 2 link has timed out, therefore it appears to be a poor link, + * then don't use that neighbour until it is reset. Blow away all through + * routes and connections using this route. 
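+ * The neighbour loses its AX.25 connection, its through routes are torn
+ * down and any local connections using it are killed.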
+ */ +void rose_link_failed(ax25_cb *ax25, int reason) +{ + struct rose_neigh *rose_neigh; + + spin_lock_bh(&rose_neigh_list_lock); + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (rose_neigh->ax25 == ax25) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh != NULL) { + rose_neigh->ax25 = NULL; + + rose_del_route_by_neigh(rose_neigh); + rose_kill_by_neigh(rose_neigh); + } + spin_unlock_bh(&rose_neigh_list_lock); +} + +/* + * A device has been "downed" remove its link status. Blow away all + * through routes and connections that use this device. + */ +void rose_link_device_down(struct net_device *dev) +{ + struct rose_neigh *rose_neigh; + + for (rose_neigh = rose_neigh_list; rose_neigh != NULL; rose_neigh = rose_neigh->next) { + if (rose_neigh->dev == dev) { + rose_del_route_by_neigh(rose_neigh); + rose_kill_by_neigh(rose_neigh); + } + } +} + +/* + * Route a frame to an appropriate AX.25 connection. + */ +int rose_route_frame(struct sk_buff *skb, ax25_cb *ax25) +{ + struct rose_neigh *rose_neigh, *new_neigh; + struct rose_route *rose_route; + struct rose_facilities_struct facilities; + rose_address *src_addr, *dest_addr; + struct sock *sk; + unsigned short frametype; + unsigned int lci, new_lci; + unsigned char cause, diagnostic; + struct net_device *dev; + int len, res = 0; + +#if 0 + if (call_in_firewall(PF_ROSE, skb->dev, skb->data, NULL, &skb) != FW_ACCEPT) + return res; +#endif + + frametype = skb->data[2]; + lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + src_addr = (rose_address *)(skb->data + 9); + dest_addr = (rose_address *)(skb->data + 4); + + spin_lock_bh(&rose_node_list_lock); + spin_lock_bh(&rose_neigh_list_lock); + spin_lock_bh(&rose_route_list_lock); + + rose_neigh = rose_neigh_list; + while (rose_neigh != NULL) { + if (ax25cmp(&ax25->dest_addr, &rose_neigh->callsign) == 0 && + ax25->ax25_dev->dev == rose_neigh->dev) + break; + rose_neigh = rose_neigh->next; + } + + if (rose_neigh == NULL) { + printk("rose_route : unknown neighbour or device %s\n", + ax2asc(&ax25->dest_addr)); + goto out; + } + + /* + * Obviously the link is working, halt the ftimer. + */ + rose_stop_ftimer(rose_neigh); + + /* + * LCI of zero is always for us, and its always a restart + * frame. + */ + if (lci == 0) { + rose_link_rx_restart(skb, rose_neigh, frametype); + goto out; + } + + /* + * Find an existing socket. + */ + if ((sk = rose_find_socket(lci, rose_neigh)) != NULL) { + if (frametype == ROSE_CALL_REQUEST) { + struct rose_sock *rose = rose_sk(sk); + + /* Remove an existing unused socket */ + rose_clear_queues(sk); + rose->cause = ROSE_NETWORK_CONGESTION; + rose->diagnostic = 0; + rose->neighbour->use--; + rose->neighbour = NULL; + rose->lci = 0; + rose->state = ROSE_STATE_0; + sk->sk_state = TCP_CLOSE; + sk->sk_err = 0; + sk->sk_shutdown |= SEND_SHUTDOWN; + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + } + else { + skb->h.raw = skb->data; + res = rose_process_rx_frame(sk, skb); + goto out; + } + } + + /* + * Is is a Call Request and is it for us ? + */ + if (frametype == ROSE_CALL_REQUEST) + if ((dev = rose_dev_get(dest_addr)) != NULL) { + res = rose_rx_call_request(skb, dev, rose_neigh, lci); + dev_put(dev); + goto out; + } + + if (!sysctl_rose_routing_control) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 0); + goto out; + } + + /* + * Route it to the next in line if we have an entry for it. 
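+ * A through route matches on either (lci1, neigh1) or (lci2, neigh2);
+ * the LCI bytes at the front of the frame are rewritten to the far end's
+ * LCI before the frame is passed on.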
+ */ + rose_route = rose_route_list; + while (rose_route != NULL) { + if (rose_route->lci1 == lci && + rose_route->neigh1 == rose_neigh) { + if (frametype == ROSE_CALL_REQUEST) { + /* F6FBB - Remove an existing unused route */ + rose_remove_route(rose_route); + break; + } else if (rose_route->neigh2 != NULL) { + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; + rose_transmit_link(skb, rose_route->neigh2); + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + res = 1; + goto out; + } else { + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + goto out; + } + } + if (rose_route->lci2 == lci && + rose_route->neigh2 == rose_neigh) { + if (frametype == ROSE_CALL_REQUEST) { + /* F6FBB - Remove an existing unused route */ + rose_remove_route(rose_route); + break; + } else if (rose_route->neigh1 != NULL) { + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci1 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci1 >> 0) & 0xFF; + rose_transmit_link(skb, rose_route->neigh1); + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + res = 1; + goto out; + } else { + if (frametype == ROSE_CLEAR_CONFIRMATION) + rose_remove_route(rose_route); + goto out; + } + } + rose_route = rose_route->next; + } + + /* + * We know that: + * 1. The frame isn't for us, + * 2. It isn't "owned" by any existing route. + */ + if (frametype != ROSE_CALL_REQUEST) /* XXX */ + return 0; + + len = (((skb->data[3] >> 4) & 0x0F) + 1) / 2; + len += (((skb->data[3] >> 0) & 0x0F) + 1) / 2; + + memset(&facilities, 0x00, sizeof(struct rose_facilities_struct)); + + if (!rose_parse_facilities(skb->data + len + 4, &facilities)) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_INVALID_FACILITY, 76); + goto out; + } + + /* + * Check for routing loops. 
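+ * A loop shows up as an existing route entry carrying the same random
+ * number, addresses and callsigns as this call request.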
+ */ + rose_route = rose_route_list; + while (rose_route != NULL) { + if (rose_route->rand == facilities.rand && + rosecmp(src_addr, &rose_route->src_addr) == 0 && + ax25cmp(&facilities.dest_call, &rose_route->src_call) == 0 && + ax25cmp(&facilities.source_call, &rose_route->dest_call) == 0) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NOT_OBTAINABLE, 120); + goto out; + } + rose_route = rose_route->next; + } + + if ((new_neigh = rose_get_neigh(dest_addr, &cause, &diagnostic)) == NULL) { + rose_transmit_clear_request(rose_neigh, lci, cause, diagnostic); + goto out; + } + + if ((new_lci = rose_new_lci(new_neigh)) == 0) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 71); + goto out; + } + + if ((rose_route = kmalloc(sizeof(*rose_route), GFP_ATOMIC)) == NULL) { + rose_transmit_clear_request(rose_neigh, lci, ROSE_NETWORK_CONGESTION, 120); + goto out; + } + + rose_route->lci1 = lci; + rose_route->src_addr = *src_addr; + rose_route->dest_addr = *dest_addr; + rose_route->src_call = facilities.dest_call; + rose_route->dest_call = facilities.source_call; + rose_route->rand = facilities.rand; + rose_route->neigh1 = rose_neigh; + rose_route->lci2 = new_lci; + rose_route->neigh2 = new_neigh; + + rose_route->neigh1->use++; + rose_route->neigh2->use++; + + rose_route->next = rose_route_list; + rose_route_list = rose_route; + + skb->data[0] &= 0xF0; + skb->data[0] |= (rose_route->lci2 >> 8) & 0x0F; + skb->data[1] = (rose_route->lci2 >> 0) & 0xFF; + + rose_transmit_link(skb, rose_route->neigh2); + res = 1; + +out: + spin_unlock_bh(&rose_route_list_lock); + spin_unlock_bh(&rose_neigh_list_lock); + spin_unlock_bh(&rose_node_list_lock); + + return res; +} + +#ifdef CONFIG_PROC_FS + +static void *rose_node_start(struct seq_file *seq, loff_t *pos) +{ + struct rose_node *rose_node; + int i = 1; + + spin_lock_bh(&rose_neigh_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (rose_node = rose_node_list; rose_node && i < *pos; + rose_node = rose_node->next, ++i); + + return (i == *pos) ? rose_node : NULL; +} + +static void *rose_node_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? 
rose_node_list + : ((struct rose_node *)v)->next; +} + +static void rose_node_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&rose_neigh_list_lock); +} + +static int rose_node_show(struct seq_file *seq, void *v) +{ + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "address mask n neigh neigh neigh\n"); + else { + const struct rose_node *rose_node = v; + /* if (rose_node->loopback) { + seq_printf(seq, "%-10s %04d 1 loopback\n", + rose2asc(&rose_node->address), + rose_node->mask); + } else { */ + seq_printf(seq, "%-10s %04d %d", + rose2asc(&rose_node->address), + rose_node->mask, + rose_node->count); + + for (i = 0; i < rose_node->count; i++) + seq_printf(seq, " %05d", + rose_node->neighbour[i]->number); + + seq_puts(seq, "\n"); + /* } */ + } + return 0; +} + +static struct seq_operations rose_node_seqops = { + .start = rose_node_start, + .next = rose_node_next, + .stop = rose_node_stop, + .show = rose_node_show, +}; + +static int rose_nodes_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_node_seqops); +} + +struct file_operations rose_nodes_fops = { + .owner = THIS_MODULE, + .open = rose_nodes_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void *rose_neigh_start(struct seq_file *seq, loff_t *pos) +{ + struct rose_neigh *rose_neigh; + int i = 1; + + spin_lock_bh(&rose_neigh_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (rose_neigh = rose_neigh_list; rose_neigh && i < *pos; + rose_neigh = rose_neigh->next, ++i); + + return (i == *pos) ? rose_neigh : NULL; +} + +static void *rose_neigh_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? rose_neigh_list + : ((struct rose_neigh *)v)->next; +} + +static void rose_neigh_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&rose_neigh_list_lock); +} + +static int rose_neigh_show(struct seq_file *seq, void *v) +{ + int i; + + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "addr callsign dev count use mode restart t0 tf digipeaters\n"); + else { + struct rose_neigh *rose_neigh = v; + + /* if (!rose_neigh->loopback) { */ + seq_printf(seq, "%05d %-9s %-4s %3d %3d %3s %3s %3lu %3lu", + rose_neigh->number, + (rose_neigh->loopback) ? "RSLOOP-0" : ax2asc(&rose_neigh->callsign), + rose_neigh->dev ? rose_neigh->dev->name : "???", + rose_neigh->count, + rose_neigh->use, + (rose_neigh->dce_mode) ? "DCE" : "DTE", + (rose_neigh->restarted) ? 
"yes" : "no", + ax25_display_timer(&rose_neigh->t0timer) / HZ, + ax25_display_timer(&rose_neigh->ftimer) / HZ); + + if (rose_neigh->digipeat != NULL) { + for (i = 0; i < rose_neigh->digipeat->ndigi; i++) + seq_printf(seq, " %s", ax2asc(&rose_neigh->digipeat->calls[i])); + } + + seq_puts(seq, "\n"); + } + return 0; +} + + +static struct seq_operations rose_neigh_seqops = { + .start = rose_neigh_start, + .next = rose_neigh_next, + .stop = rose_neigh_stop, + .show = rose_neigh_show, +}; + +static int rose_neigh_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_neigh_seqops); +} + +struct file_operations rose_neigh_fops = { + .owner = THIS_MODULE, + .open = rose_neigh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +static void *rose_route_start(struct seq_file *seq, loff_t *pos) +{ + struct rose_route *rose_route; + int i = 1; + + spin_lock_bh(&rose_route_list_lock); + if (*pos == 0) + return SEQ_START_TOKEN; + + for (rose_route = rose_route_list; rose_route && i < *pos; + rose_route = rose_route->next, ++i); + + return (i == *pos) ? rose_route : NULL; +} + +static void *rose_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + return (v == SEQ_START_TOKEN) ? rose_route_list + : ((struct rose_route *)v)->next; +} + +static void rose_route_stop(struct seq_file *seq, void *v) +{ + spin_unlock_bh(&rose_route_list_lock); +} + +static int rose_route_show(struct seq_file *seq, void *v) +{ + if (v == SEQ_START_TOKEN) + seq_puts(seq, + "lci address callsign neigh <-> lci address callsign neigh\n"); + else { + struct rose_route *rose_route = v; + + if (rose_route->neigh1) + seq_printf(seq, + "%3.3X %-10s %-9s %05d ", + rose_route->lci1, + rose2asc(&rose_route->src_addr), + ax2asc(&rose_route->src_call), + rose_route->neigh1->number); + else + seq_puts(seq, + "000 * * 00000 "); + + if (rose_route->neigh2) + seq_printf(seq, + "%3.3X %-10s %-9s %05d\n", + rose_route->lci2, + rose2asc(&rose_route->dest_addr), + ax2asc(&rose_route->dest_call), + rose_route->neigh2->number); + else + seq_puts(seq, + "000 * * 00000\n"); + } + return 0; +} + +static struct seq_operations rose_route_seqops = { + .start = rose_route_start, + .next = rose_route_next, + .stop = rose_route_stop, + .show = rose_route_show, +}; + +static int rose_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &rose_route_seqops); +} + +struct file_operations rose_routes_fops = { + .owner = THIS_MODULE, + .open = rose_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#endif /* CONFIG_PROC_FS */ + +/* + * Release all memory associated with ROSE routing structures. 
+ */ +void __exit rose_rt_free(void) +{ + struct rose_neigh *s, *rose_neigh = rose_neigh_list; + struct rose_node *t, *rose_node = rose_node_list; + struct rose_route *u, *rose_route = rose_route_list; + + while (rose_neigh != NULL) { + s = rose_neigh; + rose_neigh = rose_neigh->next; + + rose_remove_neigh(s); + } + + while (rose_node != NULL) { + t = rose_node; + rose_node = rose_node->next; + + rose_remove_node(t); + } + + while (rose_route != NULL) { + u = rose_route; + rose_route = rose_route->next; + + rose_remove_route(u); + } +} diff --git a/net/rose/rose_subr.c b/net/rose/rose_subr.c new file mode 100644 index 000000000000..7db7e1cedc3a --- /dev/null +++ b/net/rose/rose_subr.c @@ -0,0 +1,519 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose); + +/* + * This routine purges all of the queues of frames. + */ +void rose_clear_queues(struct sock *sk) +{ + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&rose_sk(sk)->ack_queue); +} + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. + */ +void rose_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + struct rose_sock *rose = rose_sk(sk); + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (rose->va != nr) { + while (skb_peek(&rose->ack_queue) != NULL && rose->va != nr) { + skb = skb_dequeue(&rose->ack_queue); + kfree_skb(skb); + rose->va = (rose->va + 1) % ROSE_MODULUS; + } + } +} + +void rose_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by rose_kick. This arrangement handles the possibility of an + * empty output queue. + */ + while ((skb = skb_dequeue(&rose_sk(sk)->ack_queue)) != NULL) { + if (skb_prev == NULL) + skb_queue_head(&sk->sk_write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. + */ +int rose_validate_nr(struct sock *sk, unsigned short nr) +{ + struct rose_sock *rose = rose_sk(sk); + unsigned short vc = rose->va; + + while (vc != rose->vs) { + if (nr == vc) return 1; + vc = (vc + 1) % ROSE_MODULUS; + } + + return nr == rose->vs; +} + +/* + * This routine is called when the packet layer internally generates a + * control frame. 
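+ * Every frame starts with the GFI plus the high nibble of the LCI, then
+ * the low byte of the LCI, then the frame type: LCI 0x123, for example,
+ * is sent as (ROSE_GFI | 0x01), 0x23, frametype.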
+ */ +void rose_write_internal(struct sock *sk, int frametype) +{ + struct rose_sock *rose = rose_sk(sk); + struct sk_buff *skb; + unsigned char *dptr; + unsigned char lci1, lci2; + char buffer[100]; + int len, faclen = 0; + + len = AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + ROSE_MIN_LEN + 1; + + switch (frametype) { + case ROSE_CALL_REQUEST: + len += 1 + ROSE_ADDR_LEN + ROSE_ADDR_LEN; + faclen = rose_create_facilities(buffer, rose); + len += faclen; + break; + case ROSE_CALL_ACCEPTED: + case ROSE_CLEAR_REQUEST: + case ROSE_RESET_REQUEST: + len += 2; + break; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for AX.25 header and PID. + */ + skb_reserve(skb, AX25_BPQ_HEADER_LEN + AX25_MAX_HEADER_LEN + 1); + + dptr = skb_put(skb, skb_tailroom(skb)); + + lci1 = (rose->lci >> 8) & 0x0F; + lci2 = (rose->lci >> 0) & 0xFF; + + switch (frametype) { + case ROSE_CALL_REQUEST: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0xAA; + memcpy(dptr, &rose->dest_addr, ROSE_ADDR_LEN); + dptr += ROSE_ADDR_LEN; + memcpy(dptr, &rose->source_addr, ROSE_ADDR_LEN); + dptr += ROSE_ADDR_LEN; + memcpy(dptr, buffer, faclen); + dptr += faclen; + break; + + case ROSE_CALL_ACCEPTED: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = 0x00; /* Address length */ + *dptr++ = 0; /* Facilities length */ + break; + + case ROSE_CLEAR_REQUEST: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = rose->cause; + *dptr++ = rose->diagnostic; + break; + + case ROSE_RESET_REQUEST: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + *dptr++ = ROSE_DTE_ORIGINATED; + *dptr++ = 0; + break; + + case ROSE_RR: + case ROSE_RNR: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr = frametype; + *dptr++ |= (rose->vr << 5) & 0xE0; + break; + + case ROSE_CLEAR_CONFIRMATION: + case ROSE_RESET_CONFIRMATION: + *dptr++ = ROSE_GFI | lci1; + *dptr++ = lci2; + *dptr++ = frametype; + break; + + default: + printk(KERN_ERR "ROSE: rose_write_internal - invalid frametype %02X\n", frametype); + kfree_skb(skb); + return; + } + + rose_transmit_link(skb, rose->neighbour); +} + +int rose_decode(struct sk_buff *skb, int *ns, int *nr, int *q, int *d, int *m) +{ + unsigned char *frame; + + frame = skb->data; + + *ns = *nr = *q = *d = *m = 0; + + switch (frame[2]) { + case ROSE_CALL_REQUEST: + case ROSE_CALL_ACCEPTED: + case ROSE_CLEAR_REQUEST: + case ROSE_CLEAR_CONFIRMATION: + case ROSE_RESET_REQUEST: + case ROSE_RESET_CONFIRMATION: + return frame[2]; + default: + break; + } + + if ((frame[2] & 0x1F) == ROSE_RR || + (frame[2] & 0x1F) == ROSE_RNR) { + *nr = (frame[2] >> 5) & 0x07; + return frame[2] & 0x1F; + } + + if ((frame[2] & 0x01) == ROSE_DATA) { + *q = (frame[0] & ROSE_Q_BIT) == ROSE_Q_BIT; + *d = (frame[0] & ROSE_D_BIT) == ROSE_D_BIT; + *m = (frame[2] & ROSE_M_BIT) == ROSE_M_BIT; + *nr = (frame[2] >> 5) & 0x07; + *ns = (frame[2] >> 1) & 0x07; + return ROSE_DATA; + } + + return ROSE_ILLEGAL; +} + +static int rose_parse_national(unsigned char *p, struct rose_facilities_struct *facilities, int len) +{ + unsigned char *pt; + unsigned char l, lg, n = 0; + int fac_national_digis_received = 0; + + do { + switch (*p & 0xC0) { + case 0x00: + p += 2; + n += 2; + len -= 2; + break; + + case 0x40: + if (*p == FAC_NATIONAL_RAND) + facilities->rand = ((p[1] << 8) & 0xFF00) + ((p[2] << 0) & 0x00FF); + p += 3; + n += 3; + len -= 3; + break; + + case 0x80: + p += 4; + n += 4; + len -= 4; + break; + + case 0xC0: + l = p[1]; + if 
(*p == FAC_NATIONAL_DEST_DIGI) { + if (!fac_national_digis_received) { + memcpy(&facilities->source_digis[0], p + 2, AX25_ADDR_LEN); + facilities->source_ndigis = 1; + } + } + else if (*p == FAC_NATIONAL_SRC_DIGI) { + if (!fac_national_digis_received) { + memcpy(&facilities->dest_digis[0], p + 2, AX25_ADDR_LEN); + facilities->dest_ndigis = 1; + } + } + else if (*p == FAC_NATIONAL_FAIL_CALL) { + memcpy(&facilities->fail_call, p + 2, AX25_ADDR_LEN); + } + else if (*p == FAC_NATIONAL_FAIL_ADD) { + memcpy(&facilities->fail_addr, p + 3, ROSE_ADDR_LEN); + } + else if (*p == FAC_NATIONAL_DIGIS) { + fac_national_digis_received = 1; + facilities->source_ndigis = 0; + facilities->dest_ndigis = 0; + for (pt = p + 2, lg = 0 ; lg < l ; pt += AX25_ADDR_LEN, lg += AX25_ADDR_LEN) { + if (pt[6] & AX25_HBIT) + memcpy(&facilities->dest_digis[facilities->dest_ndigis++], pt, AX25_ADDR_LEN); + else + memcpy(&facilities->source_digis[facilities->source_ndigis++], pt, AX25_ADDR_LEN); + } + } + p += l + 2; + n += l + 2; + len -= l + 2; + break; + } + } while (*p != 0x00 && len > 0); + + return n; +} + +static int rose_parse_ccitt(unsigned char *p, struct rose_facilities_struct *facilities, int len) +{ + unsigned char l, n = 0; + char callsign[11]; + + do { + switch (*p & 0xC0) { + case 0x00: + p += 2; + n += 2; + len -= 2; + break; + + case 0x40: + p += 3; + n += 3; + len -= 3; + break; + + case 0x80: + p += 4; + n += 4; + len -= 4; + break; + + case 0xC0: + l = p[1]; + if (*p == FAC_CCITT_DEST_NSAP) { + memcpy(&facilities->source_addr, p + 7, ROSE_ADDR_LEN); + memcpy(callsign, p + 12, l - 10); + callsign[l - 10] = '\0'; + facilities->source_call = *asc2ax(callsign); + } + if (*p == FAC_CCITT_SRC_NSAP) { + memcpy(&facilities->dest_addr, p + 7, ROSE_ADDR_LEN); + memcpy(callsign, p + 12, l - 10); + callsign[l - 10] = '\0'; + facilities->dest_call = *asc2ax(callsign); + } + p += l + 2; + n += l + 2; + len -= l + 2; + break; + } + } while (*p != 0x00 && len > 0); + + return n; +} + +int rose_parse_facilities(unsigned char *p, + struct rose_facilities_struct *facilities) +{ + int facilities_len, len; + + facilities_len = *p++; + + if (facilities_len == 0) + return 0; + + while (facilities_len > 0) { + if (*p == 0x00) { + facilities_len--; + p++; + + switch (*p) { + case FAC_NATIONAL: /* National */ + len = rose_parse_national(p + 1, facilities, facilities_len - 1); + facilities_len -= len + 1; + p += len + 1; + break; + + case FAC_CCITT: /* CCITT */ + len = rose_parse_ccitt(p + 1, facilities, facilities_len - 1); + facilities_len -= len + 1; + p += len + 1; + break; + + default: + printk(KERN_DEBUG "ROSE: rose_parse_facilities - unknown facilities family %02X\n", *p); + facilities_len--; + p++; + break; + } + } else + break; /* Error in facilities format */ + } + + return 1; +} + +static int rose_create_facilities(unsigned char *buffer, struct rose_sock *rose) +{ + unsigned char *p = buffer + 1; + char *callsign; + int len, nb; + + /* National Facilities */ + if (rose->rand != 0 || rose->source_ndigis == 1 || rose->dest_ndigis == 1) { + *p++ = 0x00; + *p++ = FAC_NATIONAL; + + if (rose->rand != 0) { + *p++ = FAC_NATIONAL_RAND; + *p++ = (rose->rand >> 8) & 0xFF; + *p++ = (rose->rand >> 0) & 0xFF; + } + + /* Sent before older facilities */ + if ((rose->source_ndigis > 0) || (rose->dest_ndigis > 0)) { + int maxdigi = 0; + *p++ = FAC_NATIONAL_DIGIS; + *p++ = AX25_ADDR_LEN * (rose->source_ndigis + rose->dest_ndigis); + for (nb = 0 ; nb < rose->source_ndigis ; nb++) { + if (++maxdigi >= ROSE_MAX_DIGIS) + break; + memcpy(p, 
&rose->source_digis[nb], AX25_ADDR_LEN); + p[6] |= AX25_HBIT; + p += AX25_ADDR_LEN; + } + for (nb = 0 ; nb < rose->dest_ndigis ; nb++) { + if (++maxdigi >= ROSE_MAX_DIGIS) + break; + memcpy(p, &rose->dest_digis[nb], AX25_ADDR_LEN); + p[6] &= ~AX25_HBIT; + p += AX25_ADDR_LEN; + } + } + + /* For compatibility */ + if (rose->source_ndigis > 0) { + *p++ = FAC_NATIONAL_SRC_DIGI; + *p++ = AX25_ADDR_LEN; + memcpy(p, &rose->source_digis[0], AX25_ADDR_LEN); + p += AX25_ADDR_LEN; + } + + /* For compatibility */ + if (rose->dest_ndigis > 0) { + *p++ = FAC_NATIONAL_DEST_DIGI; + *p++ = AX25_ADDR_LEN; + memcpy(p, &rose->dest_digis[0], AX25_ADDR_LEN); + p += AX25_ADDR_LEN; + } + } + + *p++ = 0x00; + *p++ = FAC_CCITT; + + *p++ = FAC_CCITT_DEST_NSAP; + + callsign = ax2asc(&rose->dest_call); + + *p++ = strlen(callsign) + 10; + *p++ = (strlen(callsign) + 9) * 2; /* ??? */ + + *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; + *p++ = ROSE_ADDR_LEN * 2; + memcpy(p, &rose->dest_addr, ROSE_ADDR_LEN); + p += ROSE_ADDR_LEN; + + memcpy(p, callsign, strlen(callsign)); + p += strlen(callsign); + + *p++ = FAC_CCITT_SRC_NSAP; + + callsign = ax2asc(&rose->source_call); + + *p++ = strlen(callsign) + 10; + *p++ = (strlen(callsign) + 9) * 2; /* ??? */ + + *p++ = 0x47; *p++ = 0x00; *p++ = 0x11; + *p++ = ROSE_ADDR_LEN * 2; + memcpy(p, &rose->source_addr, ROSE_ADDR_LEN); + p += ROSE_ADDR_LEN; + + memcpy(p, callsign, strlen(callsign)); + p += strlen(callsign); + + len = p - buffer; + buffer[0] = len - 1; + + return len; +} + +void rose_disconnect(struct sock *sk, int reason, int cause, int diagnostic) +{ + struct rose_sock *rose = rose_sk(sk); + + rose_stop_timer(sk); + rose_stop_idletimer(sk); + + rose_clear_queues(sk); + + rose->lci = 0; + rose->state = ROSE_STATE_0; + + if (cause != -1) + rose->cause = cause; + + if (diagnostic != -1) + rose->diagnostic = diagnostic; + + sk->sk_state = TCP_CLOSE; + sk->sk_err = reason; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } +} diff --git a/net/rose/rose_timer.c b/net/rose/rose_timer.c new file mode 100644 index 000000000000..84dd4403f792 --- /dev/null +++ b/net/rose/rose_timer.c @@ -0,0 +1,216 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) Jonathan Naylor G4KLX (g4klx@g4klx.demon.co.uk) + * Copyright (C) 2002 Ralf Baechle DO1GRB (ralf@gnu.org) + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void rose_heartbeat_expiry(unsigned long); +static void rose_timer_expiry(unsigned long); +static void rose_idletimer_expiry(unsigned long); + +void rose_start_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); + + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = &rose_heartbeat_expiry; + sk->sk_timer.expires = jiffies + 5 * HZ; + + add_timer(&sk->sk_timer); +} + +void rose_start_t1timer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->t1; + + add_timer(&rose->timer); +} + +void rose_start_t2timer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->t2; + + add_timer(&rose->timer); +} + +void rose_start_t3timer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->t3; + + add_timer(&rose->timer); +} + +void rose_start_hbtimer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->timer); + + rose->timer.data = (unsigned long)sk; + rose->timer.function = &rose_timer_expiry; + rose->timer.expires = jiffies + rose->hb; + + add_timer(&rose->timer); +} + +void rose_start_idletimer(struct sock *sk) +{ + struct rose_sock *rose = rose_sk(sk); + + del_timer(&rose->idletimer); + + if (rose->idle > 0) { + rose->idletimer.data = (unsigned long)sk; + rose->idletimer.function = &rose_idletimer_expiry; + rose->idletimer.expires = jiffies + rose->idle; + + add_timer(&rose->idletimer); + } +} + +void rose_stop_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +void rose_stop_timer(struct sock *sk) +{ + del_timer(&rose_sk(sk)->timer); +} + +void rose_stop_idletimer(struct sock *sk) +{ + del_timer(&rose_sk(sk)->idletimer); +} + +static void rose_heartbeat_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct rose_sock *rose = rose_sk(sk); + + bh_lock_sock(sk); + switch (rose->state) { + case ROSE_STATE_0: + /* Magic here: If we listen() and a new link dies before it + is accepted() it isn't 'dead' so doesn't get removed. */ + if (sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && sock_flag(sk, SOCK_DEAD))) { + rose_destroy_socket(sk); + return; + } + break; + + case ROSE_STATE_3: + /* + * Check for the state of the receive buffer. 
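+ * If the receive allocation has fallen below half of sk_rcvbuf while we
+ * were flow controlled, drop ROSE_COND_OWN_RX_BUSY and send an RR to
+ * reopen the window.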
+ */ + if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) && + (rose->condition & ROSE_COND_OWN_RX_BUSY)) { + rose->condition &= ~ROSE_COND_OWN_RX_BUSY; + rose->condition &= ~ROSE_COND_ACK_PENDING; + rose->vl = rose->vr; + rose_write_internal(sk, ROSE_RR); + rose_stop_timer(sk); /* HB */ + break; + } + break; + } + + rose_start_heartbeat(sk); + bh_unlock_sock(sk); +} + +static void rose_timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + struct rose_sock *rose = rose_sk(sk); + + bh_lock_sock(sk); + switch (rose->state) { + case ROSE_STATE_1: /* T1 */ + case ROSE_STATE_4: /* T2 */ + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + rose->state = ROSE_STATE_2; + rose_start_t3timer(sk); + break; + + case ROSE_STATE_2: /* T3 */ + rose->neighbour->use--; + rose_disconnect(sk, ETIMEDOUT, -1, -1); + break; + + case ROSE_STATE_3: /* HB */ + if (rose->condition & ROSE_COND_ACK_PENDING) { + rose->condition &= ~ROSE_COND_ACK_PENDING; + rose_enquiry_response(sk); + } + break; + } + bh_unlock_sock(sk); +} + +static void rose_idletimer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + rose_clear_queues(sk); + + rose_write_internal(sk, ROSE_CLEAR_REQUEST); + rose_sk(sk)->state = ROSE_STATE_2; + + rose_start_t3timer(sk); + + sk->sk_state = TCP_CLOSE; + sk->sk_err = 0; + sk->sk_shutdown |= SEND_SHUTDOWN; + + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + } + bh_unlock_sock(sk); +} diff --git a/net/rose/sysctl_net_rose.c b/net/rose/sysctl_net_rose.c new file mode 100644 index 000000000000..8548c7cf5643 --- /dev/null +++ b/net/rose/sysctl_net_rose.c @@ -0,0 +1,169 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * Copyright (C) 1996 Mike Shaver (shaver@zeroknowledge.com) + */ +#include +#include +#include +#include +#include + +static int min_timer[] = {1 * HZ}; +static int max_timer[] = {300 * HZ}; +static int min_idle[] = {0 * HZ}; +static int max_idle[] = {65535 * HZ}; +static int min_route[1], max_route[] = {1}; +static int min_ftimer[] = {60 * HZ}; +static int max_ftimer[] = {600 * HZ}; +static int min_maxvcs[] = {1}, max_maxvcs[] = {254}; +static int min_window[] = {1}, max_window[] = {7}; + +static struct ctl_table_header *rose_table_header; + +static ctl_table rose_table[] = { + { + .ctl_name = NET_ROSE_RESTART_REQUEST_TIMEOUT, + .procname = "restart_request_timeout", + .data = &sysctl_rose_restart_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .ctl_name = NET_ROSE_CALL_REQUEST_TIMEOUT, + .procname = "call_request_timeout", + .data = &sysctl_rose_call_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .ctl_name = NET_ROSE_RESET_REQUEST_TIMEOUT, + .procname = "reset_request_timeout", + .data = &sysctl_rose_reset_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .ctl_name = NET_ROSE_CLEAR_REQUEST_TIMEOUT, + .procname = "clear_request_timeout", + .data = &sysctl_rose_clear_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .ctl_name = NET_ROSE_NO_ACTIVITY_TIMEOUT, + .procname = "no_activity_timeout", + .data = &sysctl_rose_no_activity_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_idle, + .extra2 = &max_idle + }, + { + .ctl_name = NET_ROSE_ACK_HOLD_BACK_TIMEOUT, + .procname = "acknowledge_hold_back_timeout", + .data = &sysctl_rose_ack_hold_back_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer + }, + { + .ctl_name = NET_ROSE_ROUTING_CONTROL, + .procname = "routing_control", + .data = &sysctl_rose_routing_control, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_route, + .extra2 = &max_route + }, + { + .ctl_name = NET_ROSE_LINK_FAIL_TIMEOUT, + .procname = "link_fail_timeout", + .data = &sysctl_rose_link_fail_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_ftimer, + .extra2 = &max_ftimer + }, + { + .ctl_name = NET_ROSE_MAX_VCS, + .procname = "maximum_virtual_circuits", + .data = &sysctl_rose_maximum_vcs, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_maxvcs, + .extra2 = &max_maxvcs + }, + { + .ctl_name = NET_ROSE_WINDOW_SIZE, + .procname = "window_size", + .data = &sysctl_rose_window_size, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_window, + .extra2 = &max_window + }, + { .ctl_name = 0 } +}; + +static ctl_table 
rose_dir_table[] = { + { + .ctl_name = NET_ROSE, + .procname = "rose", + .mode = 0555, + .child = rose_table + }, + { .ctl_name = 0 } +}; + +static ctl_table rose_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = rose_dir_table + }, + { .ctl_name = 0 } +}; + +void __init rose_register_sysctl(void) +{ + rose_table_header = register_sysctl_table(rose_root_table, 1); +} + +void rose_unregister_sysctl(void) +{ + unregister_sysctl_table(rose_table_header); +} diff --git a/net/rxrpc/Makefile b/net/rxrpc/Makefile new file mode 100644 index 000000000000..6efcb6f162a0 --- /dev/null +++ b/net/rxrpc/Makefile @@ -0,0 +1,25 @@ +# +# Makefile for Linux kernel Rx RPC +# + +#CFLAGS += -finstrument-functions + +rxrpc-objs := \ + call.o \ + connection.o \ + krxiod.o \ + krxsecd.o \ + krxtimod.o \ + main.o \ + peer.o \ + rxrpc_syms.o \ + transport.o + +ifeq ($(CONFIG_PROC_FS),y) +rxrpc-objs += proc.o +endif +ifeq ($(CONFIG_SYSCTL),y) +rxrpc-objs += sysctl.o +endif + +obj-$(CONFIG_RXRPC) := rxrpc.o diff --git a/net/rxrpc/call.c b/net/rxrpc/call.c new file mode 100644 index 000000000000..5cfd4cadee42 --- /dev/null +++ b/net/rxrpc/call.c @@ -0,0 +1,2278 @@ +/* call.c: Rx call routines + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +__RXACCT_DECL(atomic_t rxrpc_call_count); +__RXACCT_DECL(atomic_t rxrpc_message_count); + +LIST_HEAD(rxrpc_calls); +DECLARE_RWSEM(rxrpc_calls_sem); + +unsigned rxrpc_call_rcv_timeout = HZ/3; +static unsigned rxrpc_call_acks_timeout = HZ/3; +static unsigned rxrpc_call_dfr_ack_timeout = HZ/20; +static unsigned short rxrpc_call_max_resend = HZ/10; + +const char *rxrpc_call_states[] = { + "COMPLETE", + "ERROR", + "SRVR_RCV_OPID", + "SRVR_RCV_ARGS", + "SRVR_GOT_ARGS", + "SRVR_SND_REPLY", + "SRVR_RCV_FINAL_ACK", + "CLNT_SND_ARGS", + "CLNT_RCV_REPLY", + "CLNT_GOT_REPLY" +}; + +const char *rxrpc_call_error_states[] = { + "NO_ERROR", + "LOCAL_ABORT", + "PEER_ABORT", + "LOCAL_ERROR", + "REMOTE_ERROR" +}; + +const char *rxrpc_pkts[] = { + "?00", + "data", "ack", "busy", "abort", "ackall", "chall", "resp", "debug", + "?09", "?10", "?11", "?12", "?13", "?14", "?15" +}; + +static const char *rxrpc_acks[] = { + "---", "REQ", "DUP", "SEQ", "WIN", "MEM", "PNG", "PNR", "DLY", "IDL", + "-?-" +}; + +static const char _acktype[] = "NA-"; + +static void rxrpc_call_receive_packet(struct rxrpc_call *call); +static void rxrpc_call_receive_data_packet(struct rxrpc_call *call, + struct rxrpc_message *msg); +static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, + struct rxrpc_message *msg); +static void rxrpc_call_definitively_ACK(struct rxrpc_call *call, + rxrpc_seq_t higest); +static void rxrpc_call_resend(struct rxrpc_call *call, rxrpc_seq_t highest); +static int __rxrpc_call_read_data(struct rxrpc_call *call); + +static int rxrpc_call_record_ACK(struct rxrpc_call *call, + struct rxrpc_message *msg, + rxrpc_seq_t seq, + size_t count); + +static int rxrpc_call_flush(struct rxrpc_call *call); + +#define _state(call) \ + _debug("[[[ state %s ]]]", rxrpc_call_states[call->app_call_state]); + +static void 
rxrpc_call_default_attn_func(struct rxrpc_call *call) +{ + wake_up(&call->waitq); +} + +static void rxrpc_call_default_error_func(struct rxrpc_call *call) +{ + wake_up(&call->waitq); +} + +static void rxrpc_call_default_aemap_func(struct rxrpc_call *call) +{ + switch (call->app_err_state) { + case RXRPC_ESTATE_LOCAL_ABORT: + call->app_abort_code = -call->app_errno; + case RXRPC_ESTATE_PEER_ABORT: + call->app_errno = -ECONNABORTED; + default: + break; + } +} + +static void __rxrpc_call_acks_timeout(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _debug("ACKS TIMEOUT %05lu", jiffies - call->cjif); + + call->flags |= RXRPC_CALL_ACKS_TIMO; + rxrpc_krxiod_queue_call(call); +} + +static void __rxrpc_call_rcv_timeout(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _debug("RCV TIMEOUT %05lu", jiffies - call->cjif); + + call->flags |= RXRPC_CALL_RCV_TIMO; + rxrpc_krxiod_queue_call(call); +} + +static void __rxrpc_call_ackr_timeout(unsigned long _call) +{ + struct rxrpc_call *call = (struct rxrpc_call *) _call; + + _debug("ACKR TIMEOUT %05lu",jiffies - call->cjif); + + call->flags |= RXRPC_CALL_ACKR_TIMO; + rxrpc_krxiod_queue_call(call); +} + +/*****************************************************************************/ +/* + * calculate a timeout based on an RTT value + */ +static inline unsigned long __rxrpc_rtt_based_timeout(struct rxrpc_call *call, + unsigned long val) +{ + unsigned long expiry = call->conn->peer->rtt / (1000000 / HZ); + + expiry += 10; + if (expiry < HZ / 25) + expiry = HZ / 25; + if (expiry > HZ) + expiry = HZ; + + _leave(" = %lu jiffies", expiry); + return jiffies + expiry; +} /* end __rxrpc_rtt_based_timeout() */ + +/*****************************************************************************/ +/* + * create a new call record + */ +static inline int __rxrpc_create_call(struct rxrpc_connection *conn, + struct rxrpc_call **_call) +{ + struct rxrpc_call *call; + + _enter("%p", conn); + + /* allocate and initialise a call record */ + call = (struct rxrpc_call *) get_zeroed_page(GFP_KERNEL); + if (!call) { + _leave(" ENOMEM"); + return -ENOMEM; + } + + atomic_set(&call->usage, 1); + + init_waitqueue_head(&call->waitq); + spin_lock_init(&call->lock); + INIT_LIST_HEAD(&call->link); + INIT_LIST_HEAD(&call->acks_pendq); + INIT_LIST_HEAD(&call->rcv_receiveq); + INIT_LIST_HEAD(&call->rcv_krxiodq_lk); + INIT_LIST_HEAD(&call->app_readyq); + INIT_LIST_HEAD(&call->app_unreadyq); + INIT_LIST_HEAD(&call->app_link); + INIT_LIST_HEAD(&call->app_attn_link); + + init_timer(&call->acks_timeout); + call->acks_timeout.data = (unsigned long) call; + call->acks_timeout.function = __rxrpc_call_acks_timeout; + + init_timer(&call->rcv_timeout); + call->rcv_timeout.data = (unsigned long) call; + call->rcv_timeout.function = __rxrpc_call_rcv_timeout; + + init_timer(&call->ackr_dfr_timo); + call->ackr_dfr_timo.data = (unsigned long) call; + call->ackr_dfr_timo.function = __rxrpc_call_ackr_timeout; + + call->conn = conn; + call->ackr_win_bot = 1; + call->ackr_win_top = call->ackr_win_bot + RXRPC_CALL_ACK_WINDOW_SIZE - 1; + call->ackr_prev_seq = 0; + call->app_mark = RXRPC_APP_MARK_EOF; + call->app_attn_func = rxrpc_call_default_attn_func; + call->app_error_func = rxrpc_call_default_error_func; + call->app_aemap_func = rxrpc_call_default_aemap_func; + call->app_scr_alloc = call->app_scratch; + + call->cjif = jiffies; + + _leave(" = 0 (%p)", call); + + *_call = call; + + return 0; +} /* end __rxrpc_create_call() */ + 
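The timeout above is derived from the peer's measured RTT and clamped between HZ/25 and HZ jiffies. As a rough stand-alone illustration of that arithmetic only (HZ is assumed to be 1000 here, the example_* names are invented for the sketch, and none of this is part of the rxrpc code):

#include <stdio.h>

#define EXAMPLE_HZ 1000UL	/* assumed tick rate for the illustration */

/* Mirrors the clamping in __rxrpc_rtt_based_timeout(), minus the jiffies base. */
static unsigned long example_rtt_timeout(unsigned long rtt_usec)
{
	unsigned long expiry = rtt_usec / (1000000UL / EXAMPLE_HZ) + 10;

	if (expiry < EXAMPLE_HZ / 25)	/* never shorter than 1/25th of a second */
		expiry = EXAMPLE_HZ / 25;
	if (expiry > EXAMPLE_HZ)	/* never longer than one second */
		expiry = EXAMPLE_HZ;
	return expiry;
}

int main(void)
{
	/* An RTT of 2500us gives 2500/1000 + 10 = 12, raised to the 40 tick floor. */
	printf("%lu\n", example_rtt_timeout(2500));
	return 0;
}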
+/*****************************************************************************/ +/* + * create a new call record for outgoing calls + */ +int rxrpc_create_call(struct rxrpc_connection *conn, + rxrpc_call_attn_func_t attn, + rxrpc_call_error_func_t error, + rxrpc_call_aemap_func_t aemap, + struct rxrpc_call **_call) +{ + DECLARE_WAITQUEUE(myself, current); + + struct rxrpc_call *call; + int ret, cix, loop; + + _enter("%p", conn); + + /* allocate and initialise a call record */ + ret = __rxrpc_create_call(conn, &call); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + call->app_call_state = RXRPC_CSTATE_CLNT_SND_ARGS; + if (attn) + call->app_attn_func = attn; + if (error) + call->app_error_func = error; + if (aemap) + call->app_aemap_func = aemap; + + _state(call); + + spin_lock(&conn->lock); + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&conn->chanwait, &myself); + + try_again: + /* try to find an unused channel */ + for (cix = 0; cix < 4; cix++) + if (!conn->channels[cix]) + goto obtained_chan; + + /* no free channels - wait for one to become available */ + ret = -EINTR; + if (signal_pending(current)) + goto error_unwait; + + spin_unlock(&conn->lock); + + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock(&conn->lock); + goto try_again; + + /* got a channel - now attach to the connection */ + obtained_chan: + remove_wait_queue(&conn->chanwait, &myself); + set_current_state(TASK_RUNNING); + + /* concoct a unique call number */ + next_callid: + call->call_id = htonl(++conn->call_counter); + for (loop = 0; loop < 4; loop++) + if (conn->channels[loop] && + conn->channels[loop]->call_id == call->call_id) + goto next_callid; + + rxrpc_get_connection(conn); + conn->channels[cix] = call; /* assign _after_ done callid check loop */ + do_gettimeofday(&conn->atime); + call->chan_ix = htonl(cix); + + spin_unlock(&conn->lock); + + down_write(&rxrpc_calls_sem); + list_add_tail(&call->call_link, &rxrpc_calls); + up_write(&rxrpc_calls_sem); + + __RXACCT(atomic_inc(&rxrpc_call_count)); + *_call = call; + + _leave(" = 0 (call=%p cix=%u)", call, cix); + return 0; + + error_unwait: + remove_wait_queue(&conn->chanwait, &myself); + set_current_state(TASK_RUNNING); + spin_unlock(&conn->lock); + + free_page((unsigned long) call); + _leave(" = %d", ret); + return ret; +} /* end rxrpc_create_call() */ + +/*****************************************************************************/ +/* + * create a new call record for incoming calls + */ +int rxrpc_incoming_call(struct rxrpc_connection *conn, + struct rxrpc_message *msg, + struct rxrpc_call **_call) +{ + struct rxrpc_call *call; + unsigned cix; + int ret; + + cix = ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK; + + _enter("%p,%u,%u", conn, ntohl(msg->hdr.callNumber), cix); + + /* allocate and initialise a call record */ + ret = __rxrpc_create_call(conn, &call); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + call->pkt_rcv_count = 1; + call->app_call_state = RXRPC_CSTATE_SRVR_RCV_OPID; + call->app_mark = sizeof(uint32_t); + + _state(call); + + /* attach to the connection */ + ret = -EBUSY; + call->chan_ix = htonl(cix); + call->call_id = msg->hdr.callNumber; + + spin_lock(&conn->lock); + + if (!conn->channels[cix] || + conn->channels[cix]->app_call_state == RXRPC_CSTATE_COMPLETE || + conn->channels[cix]->app_call_state == RXRPC_CSTATE_ERROR + ) { + conn->channels[cix] = call; + rxrpc_get_connection(conn); + ret = 0; + } + + spin_unlock(&conn->lock); + + if (ret < 0) { + free_page((unsigned long) call); + call = 
NULL; + } + + if (ret == 0) { + down_write(&rxrpc_calls_sem); + list_add_tail(&call->call_link, &rxrpc_calls); + up_write(&rxrpc_calls_sem); + __RXACCT(atomic_inc(&rxrpc_call_count)); + *_call = call; + } + + _leave(" = %d [%p]", ret, call); + return ret; +} /* end rxrpc_incoming_call() */ + +/*****************************************************************************/ +/* + * free a call record + */ +void rxrpc_put_call(struct rxrpc_call *call) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_message *msg; + + _enter("%p{u=%d}",call,atomic_read(&call->usage)); + + /* sanity check */ + if (atomic_read(&call->usage) <= 0) + BUG(); + + /* to prevent a race, the decrement and the de-list must be effectively + * atomic */ + spin_lock(&conn->lock); + if (likely(!atomic_dec_and_test(&call->usage))) { + spin_unlock(&conn->lock); + _leave(""); + return; + } + + if (conn->channels[ntohl(call->chan_ix)] == call) + conn->channels[ntohl(call->chan_ix)] = NULL; + + spin_unlock(&conn->lock); + + wake_up(&conn->chanwait); + + rxrpc_put_connection(conn); + + /* clear the timers and dequeue from krxiod */ + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->rcv_timeout); + del_timer_sync(&call->ackr_dfr_timo); + + rxrpc_krxiod_dequeue_call(call); + + /* clean up the contents of the struct */ + if (call->snd_nextmsg) + rxrpc_put_message(call->snd_nextmsg); + + if (call->snd_ping) + rxrpc_put_message(call->snd_ping); + + while (!list_empty(&call->acks_pendq)) { + msg = list_entry(call->acks_pendq.next, + struct rxrpc_message, link); + list_del(&msg->link); + rxrpc_put_message(msg); + } + + while (!list_empty(&call->rcv_receiveq)) { + msg = list_entry(call->rcv_receiveq.next, + struct rxrpc_message, link); + list_del(&msg->link); + rxrpc_put_message(msg); + } + + while (!list_empty(&call->app_readyq)) { + msg = list_entry(call->app_readyq.next, + struct rxrpc_message, link); + list_del(&msg->link); + rxrpc_put_message(msg); + } + + while (!list_empty(&call->app_unreadyq)) { + msg = list_entry(call->app_unreadyq.next, + struct rxrpc_message, link); + list_del(&msg->link); + rxrpc_put_message(msg); + } + + module_put(call->owner); + + down_write(&rxrpc_calls_sem); + list_del(&call->call_link); + up_write(&rxrpc_calls_sem); + + __RXACCT(atomic_dec(&rxrpc_call_count)); + free_page((unsigned long) call); + + _leave(" [destroyed]"); +} /* end rxrpc_put_call() */ + +/*****************************************************************************/ +/* + * actually generate a normal ACK + */ +static inline int __rxrpc_call_gen_normal_ACK(struct rxrpc_call *call, + rxrpc_seq_t seq) +{ + struct rxrpc_message *msg; + struct kvec diov[3]; + __be32 aux[4]; + int delta, ret; + + /* ACKs default to DELAY */ + if (!call->ackr.reason) + call->ackr.reason = RXRPC_ACK_DELAY; + + _proto("Rx %05lu Sending ACK { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + jiffies - call->cjif, + ntohs(call->ackr.maxSkew), + ntohl(call->ackr.firstPacket), + ntohl(call->ackr.previousPacket), + ntohl(call->ackr.serial), + rxrpc_acks[call->ackr.reason], + call->ackr.nAcks); + + aux[0] = htonl(call->conn->peer->if_mtu); /* interface MTU */ + aux[1] = htonl(1444); /* max MTU */ + aux[2] = htonl(16); /* rwind */ + aux[3] = htonl(4); /* max packets */ + + diov[0].iov_len = sizeof(struct rxrpc_ackpacket); + diov[0].iov_base = &call->ackr; + diov[1].iov_len = call->ackr_pend_cnt + 3; + diov[1].iov_base = call->ackr_array; + diov[2].iov_len = sizeof(aux); + diov[2].iov_base = &aux; + + /* build and send the message */ + ret = 
rxrpc_conn_newmsg(call->conn,call, RXRPC_PACKET_TYPE_ACK, + 3, diov, GFP_KERNEL, &msg); + if (ret < 0) + goto out; + + msg->seq = seq; + msg->hdr.seq = htonl(seq); + msg->hdr.flags |= RXRPC_SLOW_START_OK; + + ret = rxrpc_conn_sendmsg(call->conn, msg); + rxrpc_put_message(msg); + if (ret < 0) + goto out; + call->pkt_snd_count++; + + /* count how many actual ACKs there were at the front */ + for (delta = 0; delta < call->ackr_pend_cnt; delta++) + if (call->ackr_array[delta] != RXRPC_ACK_TYPE_ACK) + break; + + call->ackr_pend_cnt -= delta; /* all ACK'd to this point */ + + /* crank the ACK window around */ + if (delta == 0) { + /* un-ACK'd window */ + } + else if (delta < RXRPC_CALL_ACK_WINDOW_SIZE) { + /* partially ACK'd window + * - shuffle down to avoid losing out-of-sequence packets + */ + call->ackr_win_bot += delta; + call->ackr_win_top += delta; + + memmove(&call->ackr_array[0], + &call->ackr_array[delta], + call->ackr_pend_cnt); + + memset(&call->ackr_array[call->ackr_pend_cnt], + RXRPC_ACK_TYPE_NACK, + sizeof(call->ackr_array) - call->ackr_pend_cnt); + } + else { + /* fully ACK'd window + * - just clear the whole thing + */ + memset(&call->ackr_array, + RXRPC_ACK_TYPE_NACK, + sizeof(call->ackr_array)); + } + + /* clear this ACK */ + memset(&call->ackr, 0, sizeof(call->ackr)); + + out: + if (!call->app_call_state) + printk("___ STATE 0 ___\n"); + return ret; +} /* end __rxrpc_call_gen_normal_ACK() */ + +/*****************************************************************************/ +/* + * note the reception of a packet in the call's ACK records and generate an + * appropriate ACK packet if necessary + * - returns 0 if packet should be processed, 1 if packet should be ignored + * and -ve on an error + */ +static int rxrpc_call_generate_ACK(struct rxrpc_call *call, + struct rxrpc_header *hdr, + struct rxrpc_ackpacket *ack) +{ + struct rxrpc_message *msg; + rxrpc_seq_t seq; + unsigned offset; + int ret = 0, err; + u8 special_ACK, do_ACK, force; + + _enter("%p,%p { seq=%d tp=%d fl=%02x }", + call, hdr, ntohl(hdr->seq), hdr->type, hdr->flags); + + seq = ntohl(hdr->seq); + offset = seq - call->ackr_win_bot; + do_ACK = RXRPC_ACK_DELAY; + special_ACK = 0; + force = (seq == 1); + + if (call->ackr_high_seq < seq) + call->ackr_high_seq = seq; + + /* deal with generation of obvious special ACKs first */ + if (ack && ack->reason == RXRPC_ACK_PING) { + special_ACK = RXRPC_ACK_PING_RESPONSE; + ret = 1; + goto gen_ACK; + } + + if (seq < call->ackr_win_bot) { + special_ACK = RXRPC_ACK_DUPLICATE; + ret = 1; + goto gen_ACK; + } + + if (seq >= call->ackr_win_top) { + special_ACK = RXRPC_ACK_EXCEEDS_WINDOW; + ret = 1; + goto gen_ACK; + } + + if (call->ackr_array[offset] != RXRPC_ACK_TYPE_NACK) { + special_ACK = RXRPC_ACK_DUPLICATE; + ret = 1; + goto gen_ACK; + } + + /* okay... 
it's a normal data packet inside the ACK window */ + call->ackr_array[offset] = RXRPC_ACK_TYPE_ACK; + + if (offset < call->ackr_pend_cnt) { + } + else if (offset > call->ackr_pend_cnt) { + do_ACK = RXRPC_ACK_OUT_OF_SEQUENCE; + call->ackr_pend_cnt = offset; + goto gen_ACK; + } + + if (hdr->flags & RXRPC_REQUEST_ACK) { + do_ACK = RXRPC_ACK_REQUESTED; + } + + /* generate an ACK on the final packet of a reply just received */ + if (hdr->flags & RXRPC_LAST_PACKET) { + if (call->conn->out_clientflag) + force = 1; + } + else if (!(hdr->flags & RXRPC_MORE_PACKETS)) { + do_ACK = RXRPC_ACK_REQUESTED; + } + + /* re-ACK packets previously received out-of-order */ + for (offset++; offset < RXRPC_CALL_ACK_WINDOW_SIZE; offset++) + if (call->ackr_array[offset] != RXRPC_ACK_TYPE_ACK) + break; + + call->ackr_pend_cnt = offset; + + /* generate an ACK if we fill up the window */ + if (call->ackr_pend_cnt >= RXRPC_CALL_ACK_WINDOW_SIZE) + force = 1; + + gen_ACK: + _debug("%05lu ACKs pend=%u norm=%s special=%s%s", + jiffies - call->cjif, + call->ackr_pend_cnt, + rxrpc_acks[do_ACK], + rxrpc_acks[special_ACK], + force ? " immediate" : + do_ACK == RXRPC_ACK_REQUESTED ? " merge-req" : + hdr->flags & RXRPC_LAST_PACKET ? " finalise" : + " defer" + ); + + /* send any pending normal ACKs if need be */ + if (call->ackr_pend_cnt > 0) { + /* fill out the appropriate form */ + call->ackr.bufferSpace = htons(RXRPC_CALL_ACK_WINDOW_SIZE); + call->ackr.maxSkew = htons(min(call->ackr_high_seq - seq, + 65535U)); + call->ackr.firstPacket = htonl(call->ackr_win_bot); + call->ackr.previousPacket = call->ackr_prev_seq; + call->ackr.serial = hdr->serial; + call->ackr.nAcks = call->ackr_pend_cnt; + + if (do_ACK == RXRPC_ACK_REQUESTED) + call->ackr.reason = do_ACK; + + /* generate the ACK immediately if necessary */ + if (special_ACK || force) { + err = __rxrpc_call_gen_normal_ACK( + call, do_ACK == RXRPC_ACK_DELAY ? 
0 : seq); + if (err < 0) { + ret = err; + goto out; + } + } + } + + if (call->ackr.reason == RXRPC_ACK_REQUESTED) + call->ackr_dfr_seq = seq; + + /* start the ACK timer if not running if there are any pending deferred + * ACKs */ + if (call->ackr_pend_cnt > 0 && + call->ackr.reason != RXRPC_ACK_REQUESTED && + !timer_pending(&call->ackr_dfr_timo) + ) { + unsigned long timo; + + timo = rxrpc_call_dfr_ack_timeout + jiffies; + + _debug("START ACKR TIMER for cj=%lu", timo - call->cjif); + + spin_lock(&call->lock); + mod_timer(&call->ackr_dfr_timo, timo); + spin_unlock(&call->lock); + } + else if ((call->ackr_pend_cnt == 0 || + call->ackr.reason == RXRPC_ACK_REQUESTED) && + timer_pending(&call->ackr_dfr_timo) + ) { + /* stop timer if no pending ACKs */ + _debug("CLEAR ACKR TIMER"); + del_timer_sync(&call->ackr_dfr_timo); + } + + /* send a special ACK if one is required */ + if (special_ACK) { + struct rxrpc_ackpacket ack; + struct kvec diov[2]; + uint8_t acks[1] = { RXRPC_ACK_TYPE_ACK }; + + /* fill out the appropriate form */ + ack.bufferSpace = htons(RXRPC_CALL_ACK_WINDOW_SIZE); + ack.maxSkew = htons(min(call->ackr_high_seq - seq, + 65535U)); + ack.firstPacket = htonl(call->ackr_win_bot); + ack.previousPacket = call->ackr_prev_seq; + ack.serial = hdr->serial; + ack.reason = special_ACK; + ack.nAcks = 0; + + _proto("Rx Sending s-ACK" + " { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }", + ntohs(ack.maxSkew), + ntohl(ack.firstPacket), + ntohl(ack.previousPacket), + ntohl(ack.serial), + rxrpc_acks[ack.reason], + ack.nAcks); + + diov[0].iov_len = sizeof(struct rxrpc_ackpacket); + diov[0].iov_base = &ack; + diov[1].iov_len = sizeof(acks); + diov[1].iov_base = acks; + + /* build and send the message */ + err = rxrpc_conn_newmsg(call->conn,call, RXRPC_PACKET_TYPE_ACK, + hdr->seq ? 
2 : 1, diov, + GFP_KERNEL, + &msg); + if (err < 0) { + ret = err; + goto out; + } + + msg->seq = seq; + msg->hdr.seq = htonl(seq); + msg->hdr.flags |= RXRPC_SLOW_START_OK; + + err = rxrpc_conn_sendmsg(call->conn, msg); + rxrpc_put_message(msg); + if (err < 0) { + ret = err; + goto out; + } + call->pkt_snd_count++; + } + + out: + if (hdr->seq) + call->ackr_prev_seq = hdr->seq; + + _leave(" = %d", ret); + return ret; +} /* end rxrpc_call_generate_ACK() */ + +/*****************************************************************************/ +/* + * handle work to be done on a call + * - includes packet reception and timeout processing + */ +void rxrpc_call_do_stuff(struct rxrpc_call *call) +{ + _enter("%p{flags=%lx}", call, call->flags); + + /* handle packet reception */ + if (call->flags & RXRPC_CALL_RCV_PKT) { + _debug("- receive packet"); + call->flags &= ~RXRPC_CALL_RCV_PKT; + rxrpc_call_receive_packet(call); + } + + /* handle overdue ACKs */ + if (call->flags & RXRPC_CALL_ACKS_TIMO) { + _debug("- overdue ACK timeout"); + call->flags &= ~RXRPC_CALL_ACKS_TIMO; + rxrpc_call_resend(call, call->snd_seq_count); + } + + /* handle lack of reception */ + if (call->flags & RXRPC_CALL_RCV_TIMO) { + _debug("- reception timeout"); + call->flags &= ~RXRPC_CALL_RCV_TIMO; + rxrpc_call_abort(call, -EIO); + } + + /* handle deferred ACKs */ + if (call->flags & RXRPC_CALL_ACKR_TIMO || + (call->ackr.nAcks > 0 && call->ackr.reason == RXRPC_ACK_REQUESTED) + ) { + _debug("- deferred ACK timeout: cj=%05lu r=%s n=%u", + jiffies - call->cjif, + rxrpc_acks[call->ackr.reason], + call->ackr.nAcks); + + call->flags &= ~RXRPC_CALL_ACKR_TIMO; + + if (call->ackr.nAcks > 0 && + call->app_call_state != RXRPC_CSTATE_ERROR) { + /* generate ACK */ + __rxrpc_call_gen_normal_ACK(call, call->ackr_dfr_seq); + call->ackr_dfr_seq = 0; + } + } + + _leave(""); + +} /* end rxrpc_call_do_stuff() */ + +/*****************************************************************************/ +/* + * send an abort message at call or connection level + * - must be called with call->lock held + * - the supplied error code is sent as the packet data + */ +static int __rxrpc_call_abort(struct rxrpc_call *call, int errno) +{ + struct rxrpc_connection *conn = call->conn; + struct rxrpc_message *msg; + struct kvec diov[1]; + int ret; + __be32 _error; + + _enter("%p{%08x},%p{%d},%d", + conn, ntohl(conn->conn_id), call, ntohl(call->call_id), errno); + + /* if this call is already aborted, then just wake up any waiters */ + if (call->app_call_state == RXRPC_CSTATE_ERROR) { + spin_unlock(&call->lock); + call->app_error_func(call); + _leave(" = 0"); + return 0; + } + + rxrpc_get_call(call); + + /* change the state _with_ the lock still held */ + call->app_call_state = RXRPC_CSTATE_ERROR; + call->app_err_state = RXRPC_ESTATE_LOCAL_ABORT; + call->app_errno = errno; + call->app_mark = RXRPC_APP_MARK_EOF; + call->app_read_buf = NULL; + call->app_async_read = 0; + + _state(call); + + /* ask the app to translate the error code */ + call->app_aemap_func(call); + + spin_unlock(&call->lock); + + /* flush any outstanding ACKs */ + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->rcv_timeout); + del_timer_sync(&call->ackr_dfr_timo); + + if (rxrpc_call_is_ack_pending(call)) + __rxrpc_call_gen_normal_ACK(call, 0); + + /* send the abort packet only if we actually traded some other + * packets */ + ret = 0; + if (call->pkt_snd_count || call->pkt_rcv_count) { + /* actually send the abort */ + _proto("Rx Sending Call ABORT { data=%d }", + call->app_abort_code); + + 
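+		/* the abort code is carried as a single big-endian 32-bit word
+		 * in the packet body; the peer's ABORT handler reads it back
+		 * with ntohl() (see rxrpc_call_receive_packet) */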
_error = htonl(call->app_abort_code); + + diov[0].iov_len = sizeof(_error); + diov[0].iov_base = &_error; + + ret = rxrpc_conn_newmsg(conn, call, RXRPC_PACKET_TYPE_ABORT, + 1, diov, GFP_KERNEL, &msg); + if (ret == 0) { + ret = rxrpc_conn_sendmsg(conn, msg); + rxrpc_put_message(msg); + } + } + + /* tell the app layer to let go */ + call->app_error_func(call); + + rxrpc_put_call(call); + + _leave(" = %d", ret); + return ret; +} /* end __rxrpc_call_abort() */ + +/*****************************************************************************/ +/* + * send an abort message at call or connection level + * - the supplied error code is sent as the packet data + */ +int rxrpc_call_abort(struct rxrpc_call *call, int error) +{ + spin_lock(&call->lock); + + return __rxrpc_call_abort(call, error); + +} /* end rxrpc_call_abort() */ + +/*****************************************************************************/ +/* + * process packets waiting for this call + */ +static void rxrpc_call_receive_packet(struct rxrpc_call *call) +{ + struct rxrpc_message *msg; + struct list_head *_p; + + _enter("%p", call); + + rxrpc_get_call(call); /* must not go away too soon if aborted by + * app-layer */ + + while (!list_empty(&call->rcv_receiveq)) { + /* try to get next packet */ + _p = NULL; + spin_lock(&call->lock); + if (!list_empty(&call->rcv_receiveq)) { + _p = call->rcv_receiveq.next; + list_del_init(_p); + } + spin_unlock(&call->lock); + + if (!_p) + break; + + msg = list_entry(_p, struct rxrpc_message, link); + + _proto("Rx %05lu Received %s packet (%%%u,#%u,%c%c%c%c%c)", + jiffies - call->cjif, + rxrpc_pkts[msg->hdr.type], + ntohl(msg->hdr.serial), + msg->seq, + msg->hdr.flags & RXRPC_JUMBO_PACKET ? 'j' : '-', + msg->hdr.flags & RXRPC_MORE_PACKETS ? 'm' : '-', + msg->hdr.flags & RXRPC_LAST_PACKET ? 'l' : '-', + msg->hdr.flags & RXRPC_REQUEST_ACK ? 'r' : '-', + msg->hdr.flags & RXRPC_CLIENT_INITIATED ? 'C' : 'S' + ); + + switch (msg->hdr.type) { + /* deal with data packets */ + case RXRPC_PACKET_TYPE_DATA: + /* ACK the packet if necessary */ + switch (rxrpc_call_generate_ACK(call, &msg->hdr, + NULL)) { + case 0: /* useful packet */ + rxrpc_call_receive_data_packet(call, msg); + break; + case 1: /* duplicate or out-of-window packet */ + break; + default: + rxrpc_put_message(msg); + goto out; + } + break; + + /* deal with ACK packets */ + case RXRPC_PACKET_TYPE_ACK: + rxrpc_call_receive_ack_packet(call, msg); + break; + + /* deal with abort packets */ + case RXRPC_PACKET_TYPE_ABORT: { + __be32 _dbuf, *dp; + + dp = skb_header_pointer(msg->pkt, msg->offset, + sizeof(_dbuf), &_dbuf); + if (dp == NULL) + printk("Rx Received short ABORT packet\n"); + + _proto("Rx Received Call ABORT { data=%d }", + (dp ? ntohl(*dp) : 0)); + + spin_lock(&call->lock); + call->app_call_state = RXRPC_CSTATE_ERROR; + call->app_err_state = RXRPC_ESTATE_PEER_ABORT; + call->app_abort_code = (dp ? 
ntohl(*dp) : 0); + call->app_errno = -ECONNABORTED; + call->app_mark = RXRPC_APP_MARK_EOF; + call->app_read_buf = NULL; + call->app_async_read = 0; + + /* ask the app to translate the error code */ + call->app_aemap_func(call); + _state(call); + spin_unlock(&call->lock); + call->app_error_func(call); + break; + } + default: + /* deal with other packet types */ + _proto("Rx Unsupported packet type %u (#%u)", + msg->hdr.type, msg->seq); + break; + } + + rxrpc_put_message(msg); + } + + out: + rxrpc_put_call(call); + _leave(""); +} /* end rxrpc_call_receive_packet() */ + +/*****************************************************************************/ +/* + * process next data packet + * - as the next data packet arrives: + * - it is queued on app_readyq _if_ it is the next one expected + * (app_ready_seq+1) + * - it is queued on app_unreadyq _if_ it is not the next one expected + * - if a packet placed on app_readyq completely fills a hole leading up to + * the first packet on app_unreadyq, then packets now in sequence are + * tranferred to app_readyq + * - the application layer can only see packets on app_readyq + * (app_ready_qty bytes) + * - the application layer is prodded every time a new packet arrives + */ +static void rxrpc_call_receive_data_packet(struct rxrpc_call *call, + struct rxrpc_message *msg) +{ + const struct rxrpc_operation *optbl, *op; + struct rxrpc_message *pmsg; + struct list_head *_p; + int ret, lo, hi, rmtimo; + __be32 opid; + + _enter("%p{%u},%p{%u}", call, ntohl(call->call_id), msg, msg->seq); + + rxrpc_get_message(msg); + + /* add to the unready queue if we'd have to create a hole in the ready + * queue otherwise */ + if (msg->seq != call->app_ready_seq + 1) { + _debug("Call add packet %d to unreadyq", msg->seq); + + /* insert in seq order */ + list_for_each(_p, &call->app_unreadyq) { + pmsg = list_entry(_p, struct rxrpc_message, link); + if (pmsg->seq > msg->seq) + break; + } + + list_add_tail(&msg->link, _p); + + _leave(" [unreadyq]"); + return; + } + + /* next in sequence - simply append into the call's ready queue */ + _debug("Call add packet %d to readyq (+%Zd => %Zd bytes)", + msg->seq, msg->dsize, call->app_ready_qty); + + spin_lock(&call->lock); + call->app_ready_seq = msg->seq; + call->app_ready_qty += msg->dsize; + list_add_tail(&msg->link, &call->app_readyq); + + /* move unready packets to the readyq if we got rid of a hole */ + while (!list_empty(&call->app_unreadyq)) { + pmsg = list_entry(call->app_unreadyq.next, + struct rxrpc_message, link); + + if (pmsg->seq != call->app_ready_seq + 1) + break; + + /* next in sequence - just move list-to-list */ + _debug("Call transfer packet %d to readyq (+%Zd => %Zd bytes)", + pmsg->seq, pmsg->dsize, call->app_ready_qty); + + call->app_ready_seq = pmsg->seq; + call->app_ready_qty += pmsg->dsize; + list_del_init(&pmsg->link); + list_add_tail(&pmsg->link, &call->app_readyq); + } + + /* see if we've got the last packet yet */ + if (!list_empty(&call->app_readyq)) { + pmsg = list_entry(call->app_readyq.prev, + struct rxrpc_message, link); + if (pmsg->hdr.flags & RXRPC_LAST_PACKET) { + call->app_last_rcv = 1; + _debug("Last packet on readyq"); + } + } + + switch (call->app_call_state) { + /* do nothing if call already aborted */ + case RXRPC_CSTATE_ERROR: + spin_unlock(&call->lock); + _leave(" [error]"); + return; + + /* extract the operation ID from an incoming call if that's not + * yet been done */ + case RXRPC_CSTATE_SRVR_RCV_OPID: + spin_unlock(&call->lock); + + /* handle as yet insufficient data for the operation 
ID */ + if (call->app_ready_qty < 4) { + if (call->app_last_rcv) + /* trouble - last packet seen */ + rxrpc_call_abort(call, -EINVAL); + + _leave(""); + return; + } + + /* pull the operation ID out of the buffer */ + ret = rxrpc_call_read_data(call, &opid, sizeof(opid), 0); + if (ret < 0) { + printk("Unexpected error from read-data: %d\n", ret); + if (call->app_call_state != RXRPC_CSTATE_ERROR) + rxrpc_call_abort(call, ret); + _leave(""); + return; + } + call->app_opcode = ntohl(opid); + + /* locate the operation in the available ops table */ + optbl = call->conn->service->ops_begin; + lo = 0; + hi = call->conn->service->ops_end - optbl; + + while (lo < hi) { + int mid = (hi + lo) / 2; + op = &optbl[mid]; + if (call->app_opcode == op->id) + goto found_op; + if (call->app_opcode > op->id) + lo = mid + 1; + else + hi = mid; + } + + /* search failed */ + kproto("Rx Client requested operation %d from %s service", + call->app_opcode, call->conn->service->name); + rxrpc_call_abort(call, -EINVAL); + _leave(" [inval]"); + return; + + found_op: + _proto("Rx Client requested operation %s from %s service", + op->name, call->conn->service->name); + + /* we're now waiting for the argument block (unless the call + * was aborted) */ + spin_lock(&call->lock); + if (call->app_call_state == RXRPC_CSTATE_SRVR_RCV_OPID || + call->app_call_state == RXRPC_CSTATE_SRVR_SND_REPLY) { + if (!call->app_last_rcv) + call->app_call_state = + RXRPC_CSTATE_SRVR_RCV_ARGS; + else if (call->app_ready_qty > 0) + call->app_call_state = + RXRPC_CSTATE_SRVR_GOT_ARGS; + else + call->app_call_state = + RXRPC_CSTATE_SRVR_SND_REPLY; + call->app_mark = op->asize; + call->app_user = op->user; + } + spin_unlock(&call->lock); + + _state(call); + break; + + case RXRPC_CSTATE_SRVR_RCV_ARGS: + /* change state if just received last packet of arg block */ + if (call->app_last_rcv) + call->app_call_state = RXRPC_CSTATE_SRVR_GOT_ARGS; + spin_unlock(&call->lock); + + _state(call); + break; + + case RXRPC_CSTATE_CLNT_RCV_REPLY: + /* change state if just received last packet of reply block */ + rmtimo = 0; + if (call->app_last_rcv) { + call->app_call_state = RXRPC_CSTATE_CLNT_GOT_REPLY; + rmtimo = 1; + } + spin_unlock(&call->lock); + + if (rmtimo) { + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->rcv_timeout); + del_timer_sync(&call->ackr_dfr_timo); + } + + _state(call); + break; + + default: + /* deal with data reception in an unexpected state */ + printk("Unexpected state [[[ %u ]]]\n", call->app_call_state); + __rxrpc_call_abort(call, -EBADMSG); + _leave(""); + return; + } + + if (call->app_call_state == RXRPC_CSTATE_CLNT_RCV_REPLY && + call->app_last_rcv) + BUG(); + + /* otherwise just invoke the data function whenever we can satisfy its desire for more + * data + */ + _proto("Rx Received Op Data: st=%u qty=%Zu mk=%Zu%s", + call->app_call_state, call->app_ready_qty, call->app_mark, + call->app_last_rcv ? 
" last-rcvd" : ""); + + spin_lock(&call->lock); + + ret = __rxrpc_call_read_data(call); + switch (ret) { + case 0: + spin_unlock(&call->lock); + call->app_attn_func(call); + break; + case -EAGAIN: + spin_unlock(&call->lock); + break; + case -ECONNABORTED: + spin_unlock(&call->lock); + break; + default: + __rxrpc_call_abort(call, ret); + break; + } + + _state(call); + + _leave(""); + +} /* end rxrpc_call_receive_data_packet() */ + +/*****************************************************************************/ +/* + * received an ACK packet + */ +static void rxrpc_call_receive_ack_packet(struct rxrpc_call *call, + struct rxrpc_message *msg) +{ + struct rxrpc_ackpacket _ack, *ap; + rxrpc_serial_net_t serial; + rxrpc_seq_t seq; + int ret; + + _enter("%p{%u},%p{%u}", call, ntohl(call->call_id), msg, msg->seq); + + /* extract the basic ACK record */ + ap = skb_header_pointer(msg->pkt, msg->offset, sizeof(_ack), &_ack); + if (ap == NULL) { + printk("Rx Received short ACK packet\n"); + return; + } + msg->offset += sizeof(_ack); + + serial = ap->serial; + seq = ntohl(ap->firstPacket); + + _proto("Rx Received ACK %%%d { b=%hu m=%hu f=%u p=%u s=%u r=%s n=%u }", + ntohl(msg->hdr.serial), + ntohs(ap->bufferSpace), + ntohs(ap->maxSkew), + seq, + ntohl(ap->previousPacket), + ntohl(serial), + rxrpc_acks[ap->reason], + call->ackr.nAcks + ); + + /* check the other side isn't ACK'ing a sequence number I haven't sent + * yet */ + if (ap->nAcks > 0 && + (seq > call->snd_seq_count || + seq + ap->nAcks - 1 > call->snd_seq_count)) { + printk("Received ACK (#%u-#%u) for unsent packet\n", + seq, seq + ap->nAcks - 1); + rxrpc_call_abort(call, -EINVAL); + _leave(""); + return; + } + + /* deal with RTT calculation */ + if (serial) { + struct rxrpc_message *rttmsg; + + /* find the prompting packet */ + spin_lock(&call->lock); + if (call->snd_ping && call->snd_ping->hdr.serial == serial) { + /* it was a ping packet */ + rttmsg = call->snd_ping; + call->snd_ping = NULL; + spin_unlock(&call->lock); + + if (rttmsg) { + rttmsg->rttdone = 1; + rxrpc_peer_calculate_rtt(call->conn->peer, + rttmsg, msg); + rxrpc_put_message(rttmsg); + } + } + else { + struct list_head *_p; + + /* it ought to be a data packet - look in the pending + * ACK list */ + list_for_each(_p, &call->acks_pendq) { + rttmsg = list_entry(_p, struct rxrpc_message, + link); + if (rttmsg->hdr.serial == serial) { + if (rttmsg->rttdone) + /* never do RTT twice without + * resending */ + break; + + rttmsg->rttdone = 1; + rxrpc_peer_calculate_rtt( + call->conn->peer, rttmsg, msg); + break; + } + } + spin_unlock(&call->lock); + } + } + + switch (ap->reason) { + /* deal with negative/positive acknowledgement of data + * packets */ + case RXRPC_ACK_REQUESTED: + case RXRPC_ACK_DELAY: + case RXRPC_ACK_IDLE: + rxrpc_call_definitively_ACK(call, seq - 1); + + case RXRPC_ACK_DUPLICATE: + case RXRPC_ACK_OUT_OF_SEQUENCE: + case RXRPC_ACK_EXCEEDS_WINDOW: + call->snd_resend_cnt = 0; + ret = rxrpc_call_record_ACK(call, msg, seq, ap->nAcks); + if (ret < 0) + rxrpc_call_abort(call, ret); + break; + + /* respond to ping packets immediately */ + case RXRPC_ACK_PING: + rxrpc_call_generate_ACK(call, &msg->hdr, ap); + break; + + /* only record RTT on ping response packets */ + case RXRPC_ACK_PING_RESPONSE: + if (call->snd_ping) { + struct rxrpc_message *rttmsg; + + /* only do RTT stuff if the response matches the + * retained ping */ + rttmsg = NULL; + spin_lock(&call->lock); + if (call->snd_ping && + call->snd_ping->hdr.serial == ap->serial) { + rttmsg = call->snd_ping; + 
call->snd_ping = NULL; + } + spin_unlock(&call->lock); + + if (rttmsg) { + rttmsg->rttdone = 1; + rxrpc_peer_calculate_rtt(call->conn->peer, + rttmsg, msg); + rxrpc_put_message(rttmsg); + } + } + break; + + default: + printk("Unsupported ACK reason %u\n", ap->reason); + break; + } + + _leave(""); +} /* end rxrpc_call_receive_ack_packet() */ + +/*****************************************************************************/ +/* + * record definitive ACKs for all messages up to and including the one with the + * 'highest' seq + */ +static void rxrpc_call_definitively_ACK(struct rxrpc_call *call, + rxrpc_seq_t highest) +{ + struct rxrpc_message *msg; + int now_complete; + + _enter("%p{ads=%u},%u", call, call->acks_dftv_seq, highest); + + while (call->acks_dftv_seq < highest) { + call->acks_dftv_seq++; + + _proto("Definitive ACK on packet #%u", call->acks_dftv_seq); + + /* discard those at front of queue until message with highest + * ACK is found */ + spin_lock(&call->lock); + msg = NULL; + if (!list_empty(&call->acks_pendq)) { + msg = list_entry(call->acks_pendq.next, + struct rxrpc_message, link); + list_del_init(&msg->link); /* dequeue */ + if (msg->state == RXRPC_MSG_SENT) + call->acks_pend_cnt--; + } + spin_unlock(&call->lock); + + /* insanity check */ + if (!msg) + panic("%s(): acks_pendq unexpectedly empty\n", + __FUNCTION__); + + if (msg->seq != call->acks_dftv_seq) + panic("%s(): Packet #%u expected at front of acks_pendq" + " (#%u found)\n", + __FUNCTION__, call->acks_dftv_seq, msg->seq); + + /* discard the message */ + msg->state = RXRPC_MSG_DONE; + rxrpc_put_message(msg); + } + + /* if all sent packets are definitively ACK'd then prod any sleepers just in case */ + now_complete = 0; + spin_lock(&call->lock); + if (call->acks_dftv_seq == call->snd_seq_count) { + if (call->app_call_state != RXRPC_CSTATE_COMPLETE) { + call->app_call_state = RXRPC_CSTATE_COMPLETE; + _state(call); + now_complete = 1; + } + } + spin_unlock(&call->lock); + + if (now_complete) { + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->rcv_timeout); + del_timer_sync(&call->ackr_dfr_timo); + call->app_attn_func(call); + } + + _leave(""); +} /* end rxrpc_call_definitively_ACK() */ + +/*****************************************************************************/ +/* + * record the specified amount of ACKs/NAKs + */ +static int rxrpc_call_record_ACK(struct rxrpc_call *call, + struct rxrpc_message *msg, + rxrpc_seq_t seq, + size_t count) +{ + struct rxrpc_message *dmsg; + struct list_head *_p; + rxrpc_seq_t highest; + unsigned ix; + size_t chunk; + char resend, now_complete; + u8 acks[16]; + + _enter("%p{apc=%u ads=%u},%p,%u,%Zu", + call, call->acks_pend_cnt, call->acks_dftv_seq, + msg, seq, count); + + /* handle re-ACK'ing of definitively ACK'd packets (may be out-of-order + * ACKs) */ + if (seq <= call->acks_dftv_seq) { + unsigned delta = call->acks_dftv_seq - seq; + + if (count <= delta) { + _leave(" = 0 [all definitively ACK'd]"); + return 0; + } + + seq += delta; + count -= delta; + msg->offset += delta; + } + + highest = seq + count - 1; + resend = 0; + while (count > 0) { + /* extract up to 16 ACK slots at a time */ + chunk = min(count, sizeof(acks)); + count -= chunk; + + memset(acks, 2, sizeof(acks)); + + if (skb_copy_bits(msg->pkt, msg->offset, &acks, chunk) < 0) { + printk("Rx Received short ACK packet\n"); + _leave(" = -EINVAL"); + return -EINVAL; + } + msg->offset += chunk; + + /* check that the ACK set is valid */ + for (ix = 0; ix < chunk; ix++) { + switch (acks[ix]) { + case 
RXRPC_ACK_TYPE_ACK: + break; + case RXRPC_ACK_TYPE_NACK: + resend = 1; + break; + default: + printk("Rx Received unsupported ACK state" + " %u\n", acks[ix]); + _leave(" = -EINVAL"); + return -EINVAL; + } + } + + _proto("Rx ACK of packets #%u-#%u " + "[%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c%c] (pend=%u)", + seq, (unsigned) (seq + chunk - 1), + _acktype[acks[0x0]], + _acktype[acks[0x1]], + _acktype[acks[0x2]], + _acktype[acks[0x3]], + _acktype[acks[0x4]], + _acktype[acks[0x5]], + _acktype[acks[0x6]], + _acktype[acks[0x7]], + _acktype[acks[0x8]], + _acktype[acks[0x9]], + _acktype[acks[0xA]], + _acktype[acks[0xB]], + _acktype[acks[0xC]], + _acktype[acks[0xD]], + _acktype[acks[0xE]], + _acktype[acks[0xF]], + call->acks_pend_cnt + ); + + /* mark the packets in the ACK queue as being provisionally + * ACK'd */ + ix = 0; + spin_lock(&call->lock); + + /* find the first packet ACK'd/NAK'd here */ + list_for_each(_p, &call->acks_pendq) { + dmsg = list_entry(_p, struct rxrpc_message, link); + if (dmsg->seq == seq) + goto found_first; + _debug("- %u: skipping #%u", ix, dmsg->seq); + } + goto bad_queue; + + found_first: + do { + _debug("- %u: processing #%u (%c) apc=%u", + ix, dmsg->seq, _acktype[acks[ix]], + call->acks_pend_cnt); + + if (acks[ix] == RXRPC_ACK_TYPE_ACK) { + if (dmsg->state == RXRPC_MSG_SENT) + call->acks_pend_cnt--; + dmsg->state = RXRPC_MSG_ACKED; + } + else { + if (dmsg->state == RXRPC_MSG_ACKED) + call->acks_pend_cnt++; + dmsg->state = RXRPC_MSG_SENT; + } + ix++; + seq++; + + _p = dmsg->link.next; + dmsg = list_entry(_p, struct rxrpc_message, link); + } while(ix < chunk && + _p != &call->acks_pendq && + dmsg->seq == seq); + + if (ix < chunk) + goto bad_queue; + + spin_unlock(&call->lock); + } + + if (resend) + rxrpc_call_resend(call, highest); + + /* if all packets are provisionally ACK'd, then wake up anyone who's + * waiting for that */ + now_complete = 0; + spin_lock(&call->lock); + if (call->acks_pend_cnt == 0) { + if (call->app_call_state == RXRPC_CSTATE_SRVR_RCV_FINAL_ACK) { + call->app_call_state = RXRPC_CSTATE_COMPLETE; + _state(call); + } + now_complete = 1; + } + spin_unlock(&call->lock); + + if (now_complete) { + _debug("- wake up waiters"); + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->rcv_timeout); + del_timer_sync(&call->ackr_dfr_timo); + call->app_attn_func(call); + } + + _leave(" = 0 (apc=%u)", call->acks_pend_cnt); + return 0; + + bad_queue: + panic("%s(): acks_pendq in bad state (packet #%u absent)\n", + __FUNCTION__, seq); + +} /* end rxrpc_call_record_ACK() */ + +/*****************************************************************************/ +/* + * transfer data from the ready packet queue to the asynchronous read buffer + * - since this func is the only one going to look at packets queued on + * app_readyq, we don't need a lock to modify or access them, only to modify + * the queue pointers + * - called with call->lock held + * - the buffer must be in kernel space + * - returns: + * 0 if buffer filled + * -EAGAIN if buffer not filled and more data to come + * -EBADMSG if last packet received and insufficient data left + * -ECONNABORTED if the call has in an error state + */ +static int __rxrpc_call_read_data(struct rxrpc_call *call) +{ + struct rxrpc_message *msg; + size_t qty; + int ret; + + _enter("%p{as=%d buf=%p qty=%Zu/%Zu}", + call, + call->app_async_read, call->app_read_buf, + call->app_ready_qty, call->app_mark); + + /* check the state */ + switch (call->app_call_state) { + case RXRPC_CSTATE_SRVR_RCV_ARGS: + case RXRPC_CSTATE_CLNT_RCV_REPLY: + 
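+		/* in these two states further packets are still expected, so
+		 * the final packet must not already have been received */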
if (call->app_last_rcv) { + printk("%s(%p,%p,%Zd):" + " Inconsistent call state (%s, last pkt)", + __FUNCTION__, + call, call->app_read_buf, call->app_mark, + rxrpc_call_states[call->app_call_state]); + BUG(); + } + break; + + case RXRPC_CSTATE_SRVR_RCV_OPID: + case RXRPC_CSTATE_SRVR_GOT_ARGS: + case RXRPC_CSTATE_CLNT_GOT_REPLY: + break; + + case RXRPC_CSTATE_SRVR_SND_REPLY: + if (!call->app_last_rcv) { + printk("%s(%p,%p,%Zd):" + " Inconsistent call state (%s, not last pkt)", + __FUNCTION__, + call, call->app_read_buf, call->app_mark, + rxrpc_call_states[call->app_call_state]); + BUG(); + } + _debug("Trying to read data from call in SND_REPLY state"); + break; + + case RXRPC_CSTATE_ERROR: + _leave(" = -ECONNABORTED"); + return -ECONNABORTED; + + default: + printk("reading in unexpected state [[[ %u ]]]\n", + call->app_call_state); + BUG(); + } + + /* handle the case of not having an async buffer */ + if (!call->app_async_read) { + if (call->app_mark == RXRPC_APP_MARK_EOF) { + ret = call->app_last_rcv ? 0 : -EAGAIN; + } + else { + if (call->app_mark >= call->app_ready_qty) { + call->app_mark = RXRPC_APP_MARK_EOF; + ret = 0; + } + else { + ret = call->app_last_rcv ? -EBADMSG : -EAGAIN; + } + } + + _leave(" = %d [no buf]", ret); + return 0; + } + + while (!list_empty(&call->app_readyq) && call->app_mark > 0) { + msg = list_entry(call->app_readyq.next, + struct rxrpc_message, link); + + /* drag as much data as we need out of this packet */ + qty = min(call->app_mark, msg->dsize); + + _debug("reading %Zu from skb=%p off=%lu", + qty, msg->pkt, msg->offset); + + if (call->app_read_buf) + if (skb_copy_bits(msg->pkt, msg->offset, + call->app_read_buf, qty) < 0) + panic("%s: Failed to copy data from packet:" + " (%p,%p,%Zd)", + __FUNCTION__, + call, call->app_read_buf, qty); + + /* if that packet is now empty, discard it */ + call->app_ready_qty -= qty; + msg->dsize -= qty; + + if (msg->dsize == 0) { + list_del_init(&msg->link); + rxrpc_put_message(msg); + } + else { + msg->offset += qty; + } + + call->app_mark -= qty; + if (call->app_read_buf) + call->app_read_buf += qty; + } + + if (call->app_mark == 0) { + call->app_async_read = 0; + call->app_mark = RXRPC_APP_MARK_EOF; + call->app_read_buf = NULL; + + /* adjust the state if used up all packets */ + if (list_empty(&call->app_readyq) && call->app_last_rcv) { + switch (call->app_call_state) { + case RXRPC_CSTATE_SRVR_RCV_OPID: + call->app_call_state = RXRPC_CSTATE_SRVR_SND_REPLY; + call->app_mark = RXRPC_APP_MARK_EOF; + _state(call); + del_timer_sync(&call->rcv_timeout); + break; + case RXRPC_CSTATE_SRVR_GOT_ARGS: + call->app_call_state = RXRPC_CSTATE_SRVR_SND_REPLY; + _state(call); + del_timer_sync(&call->rcv_timeout); + break; + default: + call->app_call_state = RXRPC_CSTATE_COMPLETE; + _state(call); + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->ackr_dfr_timo); + del_timer_sync(&call->rcv_timeout); + break; + } + } + + _leave(" = 0"); + return 0; + } + + if (call->app_last_rcv) { + _debug("Insufficient data (%Zu/%Zu)", + call->app_ready_qty, call->app_mark); + call->app_async_read = 0; + call->app_mark = RXRPC_APP_MARK_EOF; + call->app_read_buf = NULL; + + _leave(" = -EBADMSG"); + return -EBADMSG; + } + + _leave(" = -EAGAIN"); + return -EAGAIN; +} /* end __rxrpc_call_read_data() */ + +/*****************************************************************************/ +/* + * attempt to read the specified amount of data from the call's ready queue + * into the buffer provided + * - since this func is the only one going to look at 
packets queued on + * app_readyq, we don't need a lock to modify or access them, only to modify + * the queue pointers + * - if the buffer pointer is NULL, then data is merely drained, not copied + * - if flags&RXRPC_CALL_READ_BLOCK, then the function will wait until there is + * enough data or an error will be generated + * - note that the caller must have added the calling task to the call's wait + * queue beforehand + * - if flags&RXRPC_CALL_READ_ALL, then an error will be generated if this + * function doesn't read all available data + */ +int rxrpc_call_read_data(struct rxrpc_call *call, + void *buffer, size_t size, int flags) +{ + int ret; + + _enter("%p{arq=%Zu},%p,%Zd,%x", + call, call->app_ready_qty, buffer, size, flags); + + spin_lock(&call->lock); + + if (unlikely(!!call->app_read_buf)) { + spin_unlock(&call->lock); + _leave(" = -EBUSY"); + return -EBUSY; + } + + call->app_mark = size; + call->app_read_buf = buffer; + call->app_async_read = 1; + call->app_read_count++; + + /* read as much data as possible */ + ret = __rxrpc_call_read_data(call); + switch (ret) { + case 0: + if (flags & RXRPC_CALL_READ_ALL && + (!call->app_last_rcv || call->app_ready_qty > 0)) { + _leave(" = -EBADMSG"); + __rxrpc_call_abort(call, -EBADMSG); + return -EBADMSG; + } + + spin_unlock(&call->lock); + call->app_attn_func(call); + _leave(" = 0"); + return ret; + + case -ECONNABORTED: + spin_unlock(&call->lock); + _leave(" = %d [aborted]", ret); + return ret; + + default: + __rxrpc_call_abort(call, ret); + _leave(" = %d", ret); + return ret; + + case -EAGAIN: + spin_unlock(&call->lock); + + if (!(flags & RXRPC_CALL_READ_BLOCK)) { + _leave(" = -EAGAIN"); + return -EAGAIN; + } + + /* wait for the data to arrive */ + _debug("blocking for data arrival"); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (!call->app_async_read || signal_pending(current)) + break; + schedule(); + } + set_current_state(TASK_RUNNING); + + if (signal_pending(current)) { + _leave(" = -EINTR"); + return -EINTR; + } + + if (call->app_call_state == RXRPC_CSTATE_ERROR) { + _leave(" = -ECONNABORTED"); + return -ECONNABORTED; + } + + _leave(" = 0"); + return 0; + } + +} /* end rxrpc_call_read_data() */ + +/*****************************************************************************/ +/* + * write data to a call + * - the data may not be sent immediately if it doesn't fill a buffer + * - if we can't queue all the data for buffering now, siov[] will have been + * adjusted to take account of what has been sent + */ +int rxrpc_call_write_data(struct rxrpc_call *call, + size_t sioc, + struct kvec *siov, + u8 rxhdr_flags, + int alloc_flags, + int dup_data, + size_t *size_sent) +{ + struct rxrpc_message *msg; + struct kvec *sptr; + size_t space, size, chunk, tmp; + char *buf; + int ret; + + _enter("%p,%Zu,%p,%02x,%x,%d,%p", + call, sioc, siov, rxhdr_flags, alloc_flags, dup_data, + size_sent); + + *size_sent = 0; + size = 0; + ret = -EINVAL; + + /* can't send more if we've sent last packet from this end */ + switch (call->app_call_state) { + case RXRPC_CSTATE_SRVR_SND_REPLY: + case RXRPC_CSTATE_CLNT_SND_ARGS: + break; + case RXRPC_CSTATE_ERROR: + ret = call->app_errno; + default: + goto out; + } + + /* calculate how much data we've been given */ + sptr = siov; + for (; sioc > 0; sptr++, sioc--) { + if (!sptr->iov_len) + continue; + + if (!sptr->iov_base) + goto out; + + size += sptr->iov_len; + } + + _debug("- size=%Zu mtu=%Zu", size, call->conn->mtu_size); + + do { + /* make sure there's a message under construction */ + if 
(!call->snd_nextmsg) { + /* no - allocate a message with no data yet attached */ + ret = rxrpc_conn_newmsg(call->conn, call, + RXRPC_PACKET_TYPE_DATA, + 0, NULL, alloc_flags, + &call->snd_nextmsg); + if (ret < 0) + goto out; + _debug("- allocated new message [ds=%Zu]", + call->snd_nextmsg->dsize); + } + + msg = call->snd_nextmsg; + msg->hdr.flags |= rxhdr_flags; + + /* deal with zero-length terminal packet */ + if (size == 0) { + if (rxhdr_flags & RXRPC_LAST_PACKET) { + ret = rxrpc_call_flush(call); + if (ret < 0) + goto out; + } + break; + } + + /* work out how much space current packet has available */ + space = call->conn->mtu_size - msg->dsize; + chunk = min(space, size); + + _debug("- [before] space=%Zu chunk=%Zu", space, chunk); + + while (!siov->iov_len) + siov++; + + /* if we are going to have to duplicate the data then coalesce + * it too */ + if (dup_data) { + /* don't allocate more that 1 page at a time */ + if (chunk > PAGE_SIZE) + chunk = PAGE_SIZE; + + /* allocate a data buffer and attach to the message */ + buf = kmalloc(chunk, alloc_flags); + if (unlikely(!buf)) { + if (msg->dsize == + sizeof(struct rxrpc_header)) { + /* discard an empty msg and wind back + * the seq counter */ + rxrpc_put_message(msg); + call->snd_nextmsg = NULL; + call->snd_seq_count--; + } + + ret = -ENOMEM; + goto out; + } + + tmp = msg->dcount++; + set_bit(tmp, &msg->dfree); + msg->data[tmp].iov_base = buf; + msg->data[tmp].iov_len = chunk; + msg->dsize += chunk; + *size_sent += chunk; + size -= chunk; + + /* load the buffer with data */ + while (chunk > 0) { + tmp = min(chunk, siov->iov_len); + memcpy(buf, siov->iov_base, tmp); + buf += tmp; + siov->iov_base += tmp; + siov->iov_len -= tmp; + if (!siov->iov_len) + siov++; + chunk -= tmp; + } + } + else { + /* we want to attach the supplied buffers directly */ + while (chunk > 0 && + msg->dcount < RXRPC_MSG_MAX_IOCS) { + tmp = msg->dcount++; + msg->data[tmp].iov_base = siov->iov_base; + msg->data[tmp].iov_len = siov->iov_len; + msg->dsize += siov->iov_len; + *size_sent += siov->iov_len; + size -= siov->iov_len; + chunk -= siov->iov_len; + siov++; + } + } + + _debug("- [loaded] chunk=%Zu size=%Zu", chunk, size); + + /* dispatch the message when full, final or requesting ACK */ + if (msg->dsize >= call->conn->mtu_size || rxhdr_flags) { + ret = rxrpc_call_flush(call); + if (ret < 0) + goto out; + } + + } while(size > 0); + + ret = 0; + out: + _leave(" = %d (%Zd queued, %Zd rem)", ret, *size_sent, size); + return ret; + +} /* end rxrpc_call_write_data() */ + +/*****************************************************************************/ +/* + * flush outstanding packets to the network + */ +static int rxrpc_call_flush(struct rxrpc_call *call) +{ + struct rxrpc_message *msg; + int ret = 0; + + _enter("%p", call); + + rxrpc_get_call(call); + + /* if there's a packet under construction, then dispatch it now */ + if (call->snd_nextmsg) { + msg = call->snd_nextmsg; + call->snd_nextmsg = NULL; + + if (msg->hdr.flags & RXRPC_LAST_PACKET) { + msg->hdr.flags &= ~RXRPC_MORE_PACKETS; + if (call->app_call_state != RXRPC_CSTATE_CLNT_SND_ARGS) + msg->hdr.flags |= RXRPC_REQUEST_ACK; + } + else { + msg->hdr.flags |= RXRPC_MORE_PACKETS; + } + + _proto("Sending DATA message { ds=%Zu dc=%u df=%02lu }", + msg->dsize, msg->dcount, msg->dfree); + + /* queue and adjust call state */ + spin_lock(&call->lock); + list_add_tail(&msg->link, &call->acks_pendq); + + /* decide what to do depending on current state and if this is + * the last packet */ + ret = -EINVAL; + switch 
(call->app_call_state) { + case RXRPC_CSTATE_SRVR_SND_REPLY: + if (msg->hdr.flags & RXRPC_LAST_PACKET) { + call->app_call_state = + RXRPC_CSTATE_SRVR_RCV_FINAL_ACK; + _state(call); + } + break; + + case RXRPC_CSTATE_CLNT_SND_ARGS: + if (msg->hdr.flags & RXRPC_LAST_PACKET) { + call->app_call_state = + RXRPC_CSTATE_CLNT_RCV_REPLY; + _state(call); + } + break; + + case RXRPC_CSTATE_ERROR: + ret = call->app_errno; + default: + spin_unlock(&call->lock); + goto out; + } + + call->acks_pend_cnt++; + + mod_timer(&call->acks_timeout, + __rxrpc_rtt_based_timeout(call, + rxrpc_call_acks_timeout)); + + spin_unlock(&call->lock); + + ret = rxrpc_conn_sendmsg(call->conn, msg); + if (ret == 0) + call->pkt_snd_count++; + } + + out: + rxrpc_put_call(call); + + _leave(" = %d", ret); + return ret; + +} /* end rxrpc_call_flush() */ + +/*****************************************************************************/ +/* + * resend NAK'd or unacknowledged packets up to the highest one specified + */ +static void rxrpc_call_resend(struct rxrpc_call *call, rxrpc_seq_t highest) +{ + struct rxrpc_message *msg; + struct list_head *_p; + rxrpc_seq_t seq = 0; + + _enter("%p,%u", call, highest); + + _proto("Rx Resend required"); + + /* handle too many resends */ + if (call->snd_resend_cnt >= rxrpc_call_max_resend) { + _debug("Aborting due to too many resends (rcv=%d)", + call->pkt_rcv_count); + rxrpc_call_abort(call, + call->pkt_rcv_count > 0 ? -EIO : -ETIMEDOUT); + _leave(""); + return; + } + + spin_lock(&call->lock); + call->snd_resend_cnt++; + for (;;) { + /* determine which the next packet we might need to ACK is */ + if (seq <= call->acks_dftv_seq) + seq = call->acks_dftv_seq; + seq++; + + if (seq > highest) + break; + + /* look for the packet in the pending-ACK queue */ + list_for_each(_p, &call->acks_pendq) { + msg = list_entry(_p, struct rxrpc_message, link); + if (msg->seq == seq) + goto found_msg; + } + + panic("%s(%p,%d):" + " Inconsistent pending-ACK queue (ds=%u sc=%u sq=%u)\n", + __FUNCTION__, call, highest, + call->acks_dftv_seq, call->snd_seq_count, seq); + + found_msg: + if (msg->state != RXRPC_MSG_SENT) + continue; /* only un-ACK'd packets */ + + rxrpc_get_message(msg); + spin_unlock(&call->lock); + + /* send each message again (and ignore any errors we might + * incur) */ + _proto("Resending DATA message { ds=%Zu dc=%u df=%02lu }", + msg->dsize, msg->dcount, msg->dfree); + + if (rxrpc_conn_sendmsg(call->conn, msg) == 0) + call->pkt_snd_count++; + + rxrpc_put_message(msg); + + spin_lock(&call->lock); + } + + /* reset the timeout */ + mod_timer(&call->acks_timeout, + __rxrpc_rtt_based_timeout(call, rxrpc_call_acks_timeout)); + + spin_unlock(&call->lock); + + _leave(""); +} /* end rxrpc_call_resend() */ + +/*****************************************************************************/ +/* + * handle an ICMP error being applied to a call + */ +void rxrpc_call_handle_error(struct rxrpc_call *call, int local, int errno) +{ + _enter("%p{%u},%d", call, ntohl(call->call_id), errno); + + /* if this call is already aborted, then just wake up any waiters */ + if (call->app_call_state == RXRPC_CSTATE_ERROR) { + call->app_error_func(call); + } + else { + /* tell the app layer what happened */ + spin_lock(&call->lock); + call->app_call_state = RXRPC_CSTATE_ERROR; + _state(call); + if (local) + call->app_err_state = RXRPC_ESTATE_LOCAL_ERROR; + else + call->app_err_state = RXRPC_ESTATE_REMOTE_ERROR; + call->app_errno = errno; + call->app_mark = RXRPC_APP_MARK_EOF; + call->app_read_buf = NULL; + call->app_async_read = 0; 
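+		/* abandon any read in progress; the app layer is prodded to
+		 * let go via app_error_func() once the lock is dropped */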
+ + /* map the error */ + call->app_aemap_func(call); + + del_timer_sync(&call->acks_timeout); + del_timer_sync(&call->rcv_timeout); + del_timer_sync(&call->ackr_dfr_timo); + + spin_unlock(&call->lock); + + call->app_error_func(call); + } + + _leave(""); +} /* end rxrpc_call_handle_error() */ diff --git a/net/rxrpc/connection.c b/net/rxrpc/connection.c new file mode 100644 index 000000000000..61463c74f8cc --- /dev/null +++ b/net/rxrpc/connection.c @@ -0,0 +1,778 @@ +/* connection.c: Rx connection routines + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +__RXACCT_DECL(atomic_t rxrpc_connection_count); + +LIST_HEAD(rxrpc_conns); +DECLARE_RWSEM(rxrpc_conns_sem); +unsigned long rxrpc_conn_timeout = 60 * 60; + +static void rxrpc_conn_do_timeout(struct rxrpc_connection *conn); + +static void __rxrpc_conn_timeout(rxrpc_timer_t *timer) +{ + struct rxrpc_connection *conn = + list_entry(timer, struct rxrpc_connection, timeout); + + _debug("Rx CONN TIMEOUT [%p{u=%d}]", conn, atomic_read(&conn->usage)); + + rxrpc_conn_do_timeout(conn); +} + +static const struct rxrpc_timer_ops rxrpc_conn_timer_ops = { + .timed_out = __rxrpc_conn_timeout, +}; + +/*****************************************************************************/ +/* + * create a new connection record + */ +static inline int __rxrpc_create_connection(struct rxrpc_peer *peer, + struct rxrpc_connection **_conn) +{ + struct rxrpc_connection *conn; + + _enter("%p",peer); + + /* allocate and initialise a connection record */ + conn = kmalloc(sizeof(struct rxrpc_connection), GFP_KERNEL); + if (!conn) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + memset(conn, 0, sizeof(struct rxrpc_connection)); + atomic_set(&conn->usage, 1); + + INIT_LIST_HEAD(&conn->link); + INIT_LIST_HEAD(&conn->id_link); + init_waitqueue_head(&conn->chanwait); + spin_lock_init(&conn->lock); + rxrpc_timer_init(&conn->timeout, &rxrpc_conn_timer_ops); + + do_gettimeofday(&conn->atime); + conn->mtu_size = 1024; + conn->peer = peer; + conn->trans = peer->trans; + + __RXACCT(atomic_inc(&rxrpc_connection_count)); + *_conn = conn; + _leave(" = 0 (%p)", conn); + + return 0; +} /* end __rxrpc_create_connection() */ + +/*****************************************************************************/ +/* + * create a new connection record for outgoing connections + */ +int rxrpc_create_connection(struct rxrpc_transport *trans, + __be16 port, + __be32 addr, + uint16_t service_id, + void *security, + struct rxrpc_connection **_conn) +{ + struct rxrpc_connection *candidate, *conn; + struct rxrpc_peer *peer; + struct list_head *_p; + __be32 connid; + int ret; + + _enter("%p{%hu},%u,%hu", trans, trans->port, ntohs(port), service_id); + + /* get a peer record */ + ret = rxrpc_peer_lookup(trans, addr, &peer); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + /* allocate and initialise a connection record */ + ret = __rxrpc_create_connection(peer, &candidate); + if (ret < 0) { + rxrpc_put_peer(peer); + _leave(" = %d", ret); + return ret; + } + + /* fill in the specific bits */ + 
candidate->addr.sin_family = AF_INET; + candidate->addr.sin_port = port; + candidate->addr.sin_addr.s_addr = addr; + + candidate->in_epoch = rxrpc_epoch; + candidate->out_epoch = rxrpc_epoch; + candidate->in_clientflag = 0; + candidate->out_clientflag = RXRPC_CLIENT_INITIATED; + candidate->service_id = htons(service_id); + + /* invent a unique connection ID */ + write_lock(&peer->conn_idlock); + + try_next_id: + connid = htonl(peer->conn_idcounter & RXRPC_CIDMASK); + peer->conn_idcounter += RXRPC_MAXCALLS; + + list_for_each(_p, &peer->conn_idlist) { + conn = list_entry(_p, struct rxrpc_connection, id_link); + if (connid == conn->conn_id) + goto try_next_id; + if (connid > conn->conn_id) + break; + } + + _debug("selected candidate conn ID %x.%u", + ntohl(peer->addr.s_addr), ntohl(connid)); + + candidate->conn_id = connid; + list_add_tail(&candidate->id_link, _p); + + write_unlock(&peer->conn_idlock); + + /* attach to peer */ + candidate->peer = peer; + + write_lock(&peer->conn_lock); + + /* search the peer's transport graveyard list */ + spin_lock(&peer->conn_gylock); + list_for_each(_p, &peer->conn_graveyard) { + conn = list_entry(_p, struct rxrpc_connection, link); + if (conn->addr.sin_port == candidate->addr.sin_port && + conn->security_ix == candidate->security_ix && + conn->service_id == candidate->service_id && + conn->in_clientflag == 0) + goto found_in_graveyard; + } + spin_unlock(&peer->conn_gylock); + + /* pick the new candidate */ + _debug("created connection: {%08x} [out]", ntohl(candidate->conn_id)); + atomic_inc(&peer->conn_count); + conn = candidate; + candidate = NULL; + + make_active: + list_add_tail(&conn->link, &peer->conn_active); + write_unlock(&peer->conn_lock); + + if (candidate) { + write_lock(&peer->conn_idlock); + list_del(&candidate->id_link); + write_unlock(&peer->conn_idlock); + + __RXACCT(atomic_dec(&rxrpc_connection_count)); + kfree(candidate); + } + else { + down_write(&rxrpc_conns_sem); + list_add_tail(&conn->proc_link, &rxrpc_conns); + up_write(&rxrpc_conns_sem); + } + + *_conn = conn; + _leave(" = 0 (%p)", conn); + + return 0; + + /* handle resurrecting a connection from the graveyard */ + found_in_graveyard: + _debug("resurrecting connection: {%08x} [out]", ntohl(conn->conn_id)); + rxrpc_get_connection(conn); + rxrpc_krxtimod_del_timer(&conn->timeout); + list_del_init(&conn->link); + spin_unlock(&peer->conn_gylock); + goto make_active; +} /* end rxrpc_create_connection() */ + +/*****************************************************************************/ +/* + * lookup the connection for an incoming packet + * - create a new connection record for unrecorded incoming connections + */ +int rxrpc_connection_lookup(struct rxrpc_peer *peer, + struct rxrpc_message *msg, + struct rxrpc_connection **_conn) +{ + struct rxrpc_connection *conn, *candidate = NULL; + struct list_head *_p; + int ret, fresh = 0; + __be32 x_epoch, x_connid; + __be16 x_port, x_servid; + __u32 x_secix; + u8 x_clflag; + + _enter("%p{{%hu}},%u,%hu", + peer, + peer->trans->port, + ntohs(msg->pkt->h.uh->source), + ntohs(msg->hdr.serviceId)); + + x_port = msg->pkt->h.uh->source; + x_epoch = msg->hdr.epoch; + x_clflag = msg->hdr.flags & RXRPC_CLIENT_INITIATED; + x_connid = htonl(ntohl(msg->hdr.cid) & RXRPC_CIDMASK); + x_servid = msg->hdr.serviceId; + x_secix = msg->hdr.securityIndex; + + /* [common case] search the transport's active list first */ + read_lock(&peer->conn_lock); + list_for_each(_p, &peer->conn_active) { + conn = list_entry(_p, struct rxrpc_connection, link); + if 
(conn->addr.sin_port == x_port && + conn->in_epoch == x_epoch && + conn->conn_id == x_connid && + conn->security_ix == x_secix && + conn->service_id == x_servid && + conn->in_clientflag == x_clflag) + goto found_active; + } + read_unlock(&peer->conn_lock); + + /* [uncommon case] not active + * - create a candidate for a new record if an inbound connection + * - only examine the graveyard for an outbound connection + */ + if (x_clflag) { + ret = __rxrpc_create_connection(peer, &candidate); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + /* fill in the specifics */ + candidate->addr.sin_family = AF_INET; + candidate->addr.sin_port = x_port; + candidate->addr.sin_addr.s_addr = msg->pkt->nh.iph->saddr; + candidate->in_epoch = x_epoch; + candidate->out_epoch = x_epoch; + candidate->in_clientflag = RXRPC_CLIENT_INITIATED; + candidate->out_clientflag = 0; + candidate->conn_id = x_connid; + candidate->service_id = x_servid; + candidate->security_ix = x_secix; + } + + /* search the active list again, just in case it appeared whilst we + * were busy */ + write_lock(&peer->conn_lock); + list_for_each(_p, &peer->conn_active) { + conn = list_entry(_p, struct rxrpc_connection, link); + if (conn->addr.sin_port == x_port && + conn->in_epoch == x_epoch && + conn->conn_id == x_connid && + conn->security_ix == x_secix && + conn->service_id == x_servid && + conn->in_clientflag == x_clflag) + goto found_active_second_chance; + } + + /* search the transport's graveyard list */ + spin_lock(&peer->conn_gylock); + list_for_each(_p, &peer->conn_graveyard) { + conn = list_entry(_p, struct rxrpc_connection, link); + if (conn->addr.sin_port == x_port && + conn->in_epoch == x_epoch && + conn->conn_id == x_connid && + conn->security_ix == x_secix && + conn->service_id == x_servid && + conn->in_clientflag == x_clflag) + goto found_in_graveyard; + } + spin_unlock(&peer->conn_gylock); + + /* outbound connections aren't created here */ + if (!x_clflag) { + write_unlock(&peer->conn_lock); + _leave(" = -ENOENT"); + return -ENOENT; + } + + /* we can now add the new candidate to the list */ + _debug("created connection: {%08x} [in]", ntohl(candidate->conn_id)); + rxrpc_get_peer(peer); + conn = candidate; + candidate = NULL; + atomic_inc(&peer->conn_count); + fresh = 1; + + make_active: + list_add_tail(&conn->link, &peer->conn_active); + + success_uwfree: + write_unlock(&peer->conn_lock); + + if (candidate) { + write_lock(&peer->conn_idlock); + list_del(&candidate->id_link); + write_unlock(&peer->conn_idlock); + + __RXACCT(atomic_dec(&rxrpc_connection_count)); + kfree(candidate); + } + + if (fresh) { + down_write(&rxrpc_conns_sem); + list_add_tail(&conn->proc_link, &rxrpc_conns); + up_write(&rxrpc_conns_sem); + } + + success: + *_conn = conn; + _leave(" = 0 (%p)", conn); + return 0; + + /* handle the connection being found in the active list straight off */ + found_active: + rxrpc_get_connection(conn); + read_unlock(&peer->conn_lock); + goto success; + + /* handle resurrecting a connection from the graveyard */ + found_in_graveyard: + _debug("resurrecting connection: {%08x} [in]", ntohl(conn->conn_id)); + rxrpc_get_peer(peer); + rxrpc_get_connection(conn); + rxrpc_krxtimod_del_timer(&conn->timeout); + list_del_init(&conn->link); + spin_unlock(&peer->conn_gylock); + goto make_active; + + /* handle finding the connection on the second time through the active + * list */ + found_active_second_chance: + rxrpc_get_connection(conn); + goto success_uwfree; + +} /* end rxrpc_connection_lookup() */ + 
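+/*****************************************************************************/
+/*
+ * Editorial usage sketch (illustrative only, not part of the original file):
+ * how a kernel-side client might obtain and release an outgoing connection
+ * with the routines above.  The transport pointer "trans", the port, the
+ * address and the service ID 1234 are assumed example values.
+ *
+ *	struct rxrpc_connection *conn;
+ *	int ret;
+ *
+ *	ret = rxrpc_create_connection(trans, htons(7001), htonl(0x0a000001),
+ *				      1234, NULL, &conn);
+ *	if (ret == 0) {
+ *		... issue calls over the connection ...
+ *		rxrpc_put_connection(conn);
+ *	}
+ */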
+/*****************************************************************************/ +/* + * finish using a connection record + * - it will be transferred to the peer's connection graveyard when refcount + * reaches 0 + */ +void rxrpc_put_connection(struct rxrpc_connection *conn) +{ + struct rxrpc_peer *peer; + + if (!conn) + return; + + _enter("%p{u=%d p=%hu}", + conn, atomic_read(&conn->usage), ntohs(conn->addr.sin_port)); + + peer = conn->peer; + spin_lock(&peer->conn_gylock); + + /* sanity check */ + if (atomic_read(&conn->usage) <= 0) + BUG(); + + if (likely(!atomic_dec_and_test(&conn->usage))) { + spin_unlock(&peer->conn_gylock); + _leave(""); + return; + } + + /* move to graveyard queue */ + _debug("burying connection: {%08x}", ntohl(conn->conn_id)); + list_del(&conn->link); + list_add_tail(&conn->link, &peer->conn_graveyard); + + rxrpc_krxtimod_add_timer(&conn->timeout, rxrpc_conn_timeout * HZ); + + spin_unlock(&peer->conn_gylock); + + rxrpc_put_peer(conn->peer); + + _leave(" [killed]"); +} /* end rxrpc_put_connection() */ + +/*****************************************************************************/ +/* + * free a connection record + */ +static void rxrpc_conn_do_timeout(struct rxrpc_connection *conn) +{ + struct rxrpc_peer *peer; + + _enter("%p{u=%d p=%hu}", + conn, atomic_read(&conn->usage), ntohs(conn->addr.sin_port)); + + peer = conn->peer; + + if (atomic_read(&conn->usage) < 0) + BUG(); + + /* remove from graveyard if still dead */ + spin_lock(&peer->conn_gylock); + if (atomic_read(&conn->usage) == 0) { + list_del_init(&conn->link); + } + else { + conn = NULL; + } + spin_unlock(&peer->conn_gylock); + + if (!conn) { + _leave(""); + return; /* resurrected */ + } + + _debug("--- Destroying Connection %p{%08x} ---", + conn, ntohl(conn->conn_id)); + + down_write(&rxrpc_conns_sem); + list_del(&conn->proc_link); + up_write(&rxrpc_conns_sem); + + write_lock(&peer->conn_idlock); + list_del(&conn->id_link); + write_unlock(&peer->conn_idlock); + + __RXACCT(atomic_dec(&rxrpc_connection_count)); + kfree(conn); + + /* if the graveyard is now empty, wake up anyone waiting for that */ + if (atomic_dec_and_test(&peer->conn_count)) + wake_up(&peer->conn_gy_waitq); + + _leave(" [destroyed]"); +} /* end rxrpc_conn_do_timeout() */ + +/*****************************************************************************/ +/* + * clear all connection records from a peer endpoint + */ +void rxrpc_conn_clearall(struct rxrpc_peer *peer) +{ + DECLARE_WAITQUEUE(myself, current); + + struct rxrpc_connection *conn; + int err; + + _enter("%p", peer); + + /* there shouldn't be any active conns remaining */ + if (!list_empty(&peer->conn_active)) + BUG(); + + /* manually timeout all conns in the graveyard */ + spin_lock(&peer->conn_gylock); + while (!list_empty(&peer->conn_graveyard)) { + conn = list_entry(peer->conn_graveyard.next, + struct rxrpc_connection, link); + err = rxrpc_krxtimod_del_timer(&conn->timeout); + spin_unlock(&peer->conn_gylock); + + if (err == 0) + rxrpc_conn_do_timeout(conn); + + spin_lock(&peer->conn_gylock); + } + spin_unlock(&peer->conn_gylock); + + /* wait for the the conn graveyard to be completely cleared */ + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&peer->conn_gy_waitq, &myself); + + while (atomic_read(&peer->conn_count) != 0) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&peer->conn_gy_waitq, &myself); + set_current_state(TASK_RUNNING); + + _leave(""); +} /* end rxrpc_conn_clearall() */ + 
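+/*****************************************************************************/
+/*
+ * Editorial usage sketch (illustrative only, not part of the original file):
+ * the message routines below are used in the pattern already seen in call.c -
+ * build a message around a kvec array, send it, then drop the reference.
+ * "data" is an assumed caller-supplied buffer.
+ *
+ *	struct rxrpc_message *msg;
+ *	struct kvec diov[1];
+ *	int ret;
+ *
+ *	diov[0].iov_base = &data;
+ *	diov[0].iov_len = sizeof(data);
+ *
+ *	ret = rxrpc_conn_newmsg(conn, call, RXRPC_PACKET_TYPE_DATA,
+ *				1, diov, GFP_KERNEL, &msg);
+ *	if (ret == 0) {
+ *		ret = rxrpc_conn_sendmsg(conn, msg);
+ *		rxrpc_put_message(msg);
+ *	}
+ */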
+/*****************************************************************************/ +/* + * allocate and prepare a message for sending out through the transport + * endpoint + */ +int rxrpc_conn_newmsg(struct rxrpc_connection *conn, + struct rxrpc_call *call, + uint8_t type, + int dcount, + struct kvec diov[], + int alloc_flags, + struct rxrpc_message **_msg) +{ + struct rxrpc_message *msg; + int loop; + + _enter("%p{%d},%p,%u", conn, ntohs(conn->addr.sin_port), call, type); + + if (dcount > 3) { + _leave(" = -EINVAL"); + return -EINVAL; + } + + msg = kmalloc(sizeof(struct rxrpc_message), alloc_flags); + if (!msg) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + memset(msg, 0, sizeof(*msg)); + atomic_set(&msg->usage, 1); + + INIT_LIST_HEAD(&msg->link); + + msg->state = RXRPC_MSG_PREPARED; + + msg->hdr.epoch = conn->out_epoch; + msg->hdr.cid = conn->conn_id | (call ? call->chan_ix : 0); + msg->hdr.callNumber = call ? call->call_id : 0; + msg->hdr.type = type; + msg->hdr.flags = conn->out_clientflag; + msg->hdr.securityIndex = conn->security_ix; + msg->hdr.serviceId = conn->service_id; + + /* generate sequence numbers for data packets */ + if (call) { + switch (type) { + case RXRPC_PACKET_TYPE_DATA: + msg->seq = ++call->snd_seq_count; + msg->hdr.seq = htonl(msg->seq); + break; + case RXRPC_PACKET_TYPE_ACK: + /* ACK sequence numbers are complicated. The following + * may be wrong: + * - jumbo packet ACKs should have a seq number + * - normal ACKs should not + */ + default: + break; + } + } + + msg->dcount = dcount + 1; + msg->dsize = sizeof(msg->hdr); + msg->data[0].iov_len = sizeof(msg->hdr); + msg->data[0].iov_base = &msg->hdr; + + for (loop=0; loop < dcount; loop++) { + msg->dsize += diov[loop].iov_len; + msg->data[loop+1].iov_len = diov[loop].iov_len; + msg->data[loop+1].iov_base = diov[loop].iov_base; + } + + __RXACCT(atomic_inc(&rxrpc_message_count)); + *_msg = msg; + _leave(" = 0 (%p) #%d", msg, atomic_read(&rxrpc_message_count)); + return 0; +} /* end rxrpc_conn_newmsg() */ + +/*****************************************************************************/ +/* + * free a message + */ +void __rxrpc_put_message(struct rxrpc_message *msg) +{ + int loop; + + _enter("%p #%d", msg, atomic_read(&rxrpc_message_count)); + + if (msg->pkt) + kfree_skb(msg->pkt); + rxrpc_put_connection(msg->conn); + + for (loop = 0; loop < 8; loop++) + if (test_bit(loop, &msg->dfree)) + kfree(msg->data[loop].iov_base); + + __RXACCT(atomic_dec(&rxrpc_message_count)); + kfree(msg); + + _leave(""); +} /* end __rxrpc_put_message() */ + +/*****************************************************************************/ +/* + * send a message out through the transport endpoint + */ +int rxrpc_conn_sendmsg(struct rxrpc_connection *conn, + struct rxrpc_message *msg) +{ + struct msghdr msghdr; + int ret; + + _enter("%p{%d}", conn, ntohs(conn->addr.sin_port)); + + /* fill in some fields in the header */ + spin_lock(&conn->lock); + msg->hdr.serial = htonl(++conn->serial_counter); + msg->rttdone = 0; + spin_unlock(&conn->lock); + + /* set up the message to be transmitted */ + msghdr.msg_name = &conn->addr; + msghdr.msg_namelen = sizeof(conn->addr); + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + msghdr.msg_flags = MSG_CONFIRM | MSG_DONTWAIT; + + _net("Sending message type %d of %Zd bytes to %08x:%d", + msg->hdr.type, + msg->dsize, + ntohl(conn->addr.sin_addr.s_addr), + ntohs(conn->addr.sin_port)); + + /* send the message */ + ret = kernel_sendmsg(conn->trans->socket, &msghdr, + msg->data, msg->dcount, msg->dsize); + 
if (ret < 0) { + msg->state = RXRPC_MSG_ERROR; + } else { + msg->state = RXRPC_MSG_SENT; + ret = 0; + + spin_lock(&conn->lock); + do_gettimeofday(&conn->atime); + msg->stamp = conn->atime; + spin_unlock(&conn->lock); + } + + _leave(" = %d", ret); + + return ret; +} /* end rxrpc_conn_sendmsg() */ + +/*****************************************************************************/ +/* + * deal with a subsequent call packet + */ +int rxrpc_conn_receive_call_packet(struct rxrpc_connection *conn, + struct rxrpc_call *call, + struct rxrpc_message *msg) +{ + struct rxrpc_message *pmsg; + struct list_head *_p; + unsigned cix, seq; + int ret = 0; + + _enter("%p,%p,%p", conn, call, msg); + + if (!call) { + cix = ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK; + + spin_lock(&conn->lock); + call = conn->channels[cix]; + + if (!call || call->call_id != msg->hdr.callNumber) { + spin_unlock(&conn->lock); + rxrpc_trans_immediate_abort(conn->trans, msg, -ENOENT); + goto out; + } + else { + rxrpc_get_call(call); + spin_unlock(&conn->lock); + } + } + else { + rxrpc_get_call(call); + } + + _proto("Received packet %%%u [%u] on call %hu:%u:%u", + ntohl(msg->hdr.serial), + ntohl(msg->hdr.seq), + ntohs(msg->hdr.serviceId), + ntohl(conn->conn_id), + ntohl(call->call_id)); + + call->pkt_rcv_count++; + + if (msg->pkt->dst && msg->pkt->dst->dev) + conn->peer->if_mtu = + msg->pkt->dst->dev->mtu - + msg->pkt->dst->dev->hard_header_len; + + /* queue on the call in seq order */ + rxrpc_get_message(msg); + seq = msg->seq; + + spin_lock(&call->lock); + list_for_each(_p, &call->rcv_receiveq) { + pmsg = list_entry(_p, struct rxrpc_message, link); + if (pmsg->seq > seq) + break; + } + list_add_tail(&msg->link, _p); + + /* reset the activity timeout */ + call->flags |= RXRPC_CALL_RCV_PKT; + mod_timer(&call->rcv_timeout,jiffies + rxrpc_call_rcv_timeout * HZ); + + spin_unlock(&call->lock); + + rxrpc_krxiod_queue_call(call); + + rxrpc_put_call(call); + out: + _leave(" = %d", ret); + return ret; +} /* end rxrpc_conn_receive_call_packet() */ + +/*****************************************************************************/ +/* + * handle an ICMP error being applied to a connection + */ +void rxrpc_conn_handle_error(struct rxrpc_connection *conn, + int local, int errno) +{ + struct rxrpc_call *calls[4]; + int loop; + + _enter("%p{%d},%d", conn, ntohs(conn->addr.sin_port), errno); + + /* get a ref to all my calls in one go */ + memset(calls, 0, sizeof(calls)); + spin_lock(&conn->lock); + + for (loop = 3; loop >= 0; loop--) { + if (conn->channels[loop]) { + calls[loop] = conn->channels[loop]; + rxrpc_get_call(calls[loop]); + } + } + + spin_unlock(&conn->lock); + + /* now kick them all */ + for (loop = 3; loop >= 0; loop--) { + if (calls[loop]) { + rxrpc_call_handle_error(calls[loop], local, errno); + rxrpc_put_call(calls[loop]); + } + } + + _leave(""); +} /* end rxrpc_conn_handle_error() */ diff --git a/net/rxrpc/internal.h b/net/rxrpc/internal.h new file mode 100644 index 000000000000..70e52f6b0b64 --- /dev/null +++ b/net/rxrpc/internal.h @@ -0,0 +1,106 @@ +/* internal.h: internal Rx RPC stuff + * + * Copyright (c) 2002 David Howells (dhowells@redhat.com). 
+ */ + +#ifndef RXRPC_INTERNAL_H +#define RXRPC_INTERNAL_H + +#include +#include + +/* + * debug accounting + */ +#if 1 +#define __RXACCT_DECL(X) X +#define __RXACCT(X) do { X; } while(0) +#else +#define __RXACCT_DECL(X) +#define __RXACCT(X) do { } while(0) +#endif + +__RXACCT_DECL(extern atomic_t rxrpc_transport_count); +__RXACCT_DECL(extern atomic_t rxrpc_peer_count); +__RXACCT_DECL(extern atomic_t rxrpc_connection_count); +__RXACCT_DECL(extern atomic_t rxrpc_call_count); +__RXACCT_DECL(extern atomic_t rxrpc_message_count); + +/* + * debug tracing + */ +#define kenter(FMT, a...) printk("==> %s("FMT")\n",__FUNCTION__ , ##a) +#define kleave(FMT, a...) printk("<== %s()"FMT"\n",__FUNCTION__ , ##a) +#define kdebug(FMT, a...) printk(" "FMT"\n" , ##a) +#define kproto(FMT, a...) printk("### "FMT"\n" , ##a) +#define knet(FMT, a...) printk(" "FMT"\n" , ##a) + +#if 0 +#define _enter(FMT, a...) kenter(FMT , ##a) +#define _leave(FMT, a...) kleave(FMT , ##a) +#define _debug(FMT, a...) kdebug(FMT , ##a) +#define _proto(FMT, a...) kproto(FMT , ##a) +#define _net(FMT, a...) knet(FMT , ##a) +#else +#define _enter(FMT, a...) do { if (rxrpc_ktrace) kenter(FMT , ##a); } while(0) +#define _leave(FMT, a...) do { if (rxrpc_ktrace) kleave(FMT , ##a); } while(0) +#define _debug(FMT, a...) do { if (rxrpc_kdebug) kdebug(FMT , ##a); } while(0) +#define _proto(FMT, a...) do { if (rxrpc_kproto) kproto(FMT , ##a); } while(0) +#define _net(FMT, a...) do { if (rxrpc_knet) knet (FMT , ##a); } while(0) +#endif + +static inline void rxrpc_discard_my_signals(void) +{ + while (signal_pending(current)) { + siginfo_t sinfo; + + spin_lock_irq(¤t->sighand->siglock); + dequeue_signal(current, ¤t->blocked, &sinfo); + spin_unlock_irq(¤t->sighand->siglock); + } +} + +/* + * call.c + */ +extern struct list_head rxrpc_calls; +extern struct rw_semaphore rxrpc_calls_sem; + +/* + * connection.c + */ +extern struct list_head rxrpc_conns; +extern struct rw_semaphore rxrpc_conns_sem; +extern unsigned long rxrpc_conn_timeout; + +extern void rxrpc_conn_clearall(struct rxrpc_peer *peer); + +/* + * peer.c + */ +extern struct list_head rxrpc_peers; +extern struct rw_semaphore rxrpc_peers_sem; +extern unsigned long rxrpc_peer_timeout; + +extern void rxrpc_peer_calculate_rtt(struct rxrpc_peer *peer, + struct rxrpc_message *msg, + struct rxrpc_message *resp); + +extern void rxrpc_peer_clearall(struct rxrpc_transport *trans); + + +/* + * proc.c + */ +#ifdef CONFIG_PROC_FS +extern int rxrpc_proc_init(void); +extern void rxrpc_proc_cleanup(void); +#endif + +/* + * transport.c + */ +extern struct list_head rxrpc_proc_transports; +extern struct rw_semaphore rxrpc_proc_transports_sem; + +#endif /* RXRPC_INTERNAL_H */ diff --git a/net/rxrpc/krxiod.c b/net/rxrpc/krxiod.c new file mode 100644 index 000000000000..2b537f425a17 --- /dev/null +++ b/net/rxrpc/krxiod.c @@ -0,0 +1,261 @@ +/* krxiod.c: Rx I/O daemon + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_krxiod_sleepq); +static DECLARE_COMPLETION(rxrpc_krxiod_dead); + +static atomic_t rxrpc_krxiod_qcount = ATOMIC_INIT(0); + +static LIST_HEAD(rxrpc_krxiod_transportq); +static DEFINE_SPINLOCK(rxrpc_krxiod_transportq_lock); + +static LIST_HEAD(rxrpc_krxiod_callq); +static DEFINE_SPINLOCK(rxrpc_krxiod_callq_lock); + +static volatile int rxrpc_krxiod_die; + +/*****************************************************************************/ +/* + * Rx I/O daemon + */ +static int rxrpc_krxiod(void *arg) +{ + DECLARE_WAITQUEUE(krxiod,current); + + printk("Started krxiod %d\n",current->pid); + + daemonize("krxiod"); + + /* loop around waiting for work to do */ + do { + /* wait for work or to be told to exit */ + _debug("### Begin Wait"); + if (!atomic_read(&rxrpc_krxiod_qcount)) { + set_current_state(TASK_INTERRUPTIBLE); + + add_wait_queue(&rxrpc_krxiod_sleepq, &krxiod); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_read(&rxrpc_krxiod_qcount) || + rxrpc_krxiod_die || + signal_pending(current)) + break; + + schedule(); + } + + remove_wait_queue(&rxrpc_krxiod_sleepq, &krxiod); + set_current_state(TASK_RUNNING); + } + _debug("### End Wait"); + + /* do work if been given some to do */ + _debug("### Begin Work"); + + /* see if there's a transport in need of attention */ + if (!list_empty(&rxrpc_krxiod_transportq)) { + struct rxrpc_transport *trans = NULL; + + spin_lock_irq(&rxrpc_krxiod_transportq_lock); + + if (!list_empty(&rxrpc_krxiod_transportq)) { + trans = list_entry( + rxrpc_krxiod_transportq.next, + struct rxrpc_transport, + krxiodq_link); + + list_del_init(&trans->krxiodq_link); + atomic_dec(&rxrpc_krxiod_qcount); + + /* make sure it hasn't gone away and doesn't go + * away */ + if (atomic_read(&trans->usage)>0) + rxrpc_get_transport(trans); + else + trans = NULL; + } + + spin_unlock_irq(&rxrpc_krxiod_transportq_lock); + + if (trans) { + rxrpc_trans_receive_packet(trans); + rxrpc_put_transport(trans); + } + } + + /* see if there's a call in need of attention */ + if (!list_empty(&rxrpc_krxiod_callq)) { + struct rxrpc_call *call = NULL; + + spin_lock_irq(&rxrpc_krxiod_callq_lock); + + if (!list_empty(&rxrpc_krxiod_callq)) { + call = list_entry(rxrpc_krxiod_callq.next, + struct rxrpc_call, + rcv_krxiodq_lk); + list_del_init(&call->rcv_krxiodq_lk); + atomic_dec(&rxrpc_krxiod_qcount); + + /* make sure it hasn't gone away and doesn't go + * away */ + if (atomic_read(&call->usage) > 0) { + _debug("@@@ KRXIOD" + " Begin Attend Call %p", call); + rxrpc_get_call(call); + } + else { + call = NULL; + } + } + + spin_unlock_irq(&rxrpc_krxiod_callq_lock); + + if (call) { + rxrpc_call_do_stuff(call); + rxrpc_put_call(call); + _debug("@@@ KRXIOD End Attend Call %p", call); + } + } + + _debug("### End Work"); + + try_to_freeze(PF_FREEZE); + + /* discard pending signals */ + rxrpc_discard_my_signals(); + + } while (!rxrpc_krxiod_die); + + /* and that's all */ + complete_and_exit(&rxrpc_krxiod_dead, 0); + +} /* end rxrpc_krxiod() */ + +/*****************************************************************************/ +/* + * start up a krxiod daemon + */ +int __init rxrpc_krxiod_init(void) +{ + return kernel_thread(rxrpc_krxiod, NULL, 0); + +} /* end rxrpc_krxiod_init() */ + +/*****************************************************************************/ +/* + * kill the krxiod daemon and wait for it to complete + */ +void rxrpc_krxiod_kill(void) +{ 
+ rxrpc_krxiod_die = 1; + wake_up_all(&rxrpc_krxiod_sleepq); + wait_for_completion(&rxrpc_krxiod_dead); + +} /* end rxrpc_krxiod_kill() */ + +/*****************************************************************************/ +/* + * queue a transport for attention by krxiod + */ +void rxrpc_krxiod_queue_transport(struct rxrpc_transport *trans) +{ + unsigned long flags; + + _enter(""); + + if (list_empty(&trans->krxiodq_link)) { + spin_lock_irqsave(&rxrpc_krxiod_transportq_lock, flags); + + if (list_empty(&trans->krxiodq_link)) { + if (atomic_read(&trans->usage) > 0) { + list_add_tail(&trans->krxiodq_link, + &rxrpc_krxiod_transportq); + atomic_inc(&rxrpc_krxiod_qcount); + } + } + + spin_unlock_irqrestore(&rxrpc_krxiod_transportq_lock, flags); + wake_up_all(&rxrpc_krxiod_sleepq); + } + + _leave(""); + +} /* end rxrpc_krxiod_queue_transport() */ + +/*****************************************************************************/ +/* + * dequeue a transport from krxiod's attention queue + */ +void rxrpc_krxiod_dequeue_transport(struct rxrpc_transport *trans) +{ + unsigned long flags; + + _enter(""); + + spin_lock_irqsave(&rxrpc_krxiod_transportq_lock, flags); + if (!list_empty(&trans->krxiodq_link)) { + list_del_init(&trans->krxiodq_link); + atomic_dec(&rxrpc_krxiod_qcount); + } + spin_unlock_irqrestore(&rxrpc_krxiod_transportq_lock, flags); + + _leave(""); + +} /* end rxrpc_krxiod_dequeue_transport() */ + +/*****************************************************************************/ +/* + * queue a call for attention by krxiod + */ +void rxrpc_krxiod_queue_call(struct rxrpc_call *call) +{ + unsigned long flags; + + if (list_empty(&call->rcv_krxiodq_lk)) { + spin_lock_irqsave(&rxrpc_krxiod_callq_lock, flags); + if (atomic_read(&call->usage) > 0) { + list_add_tail(&call->rcv_krxiodq_lk, + &rxrpc_krxiod_callq); + atomic_inc(&rxrpc_krxiod_qcount); + } + spin_unlock_irqrestore(&rxrpc_krxiod_callq_lock, flags); + } + wake_up_all(&rxrpc_krxiod_sleepq); + +} /* end rxrpc_krxiod_queue_call() */ + +/*****************************************************************************/ +/* + * dequeue a call from krxiod's attention queue + */ +void rxrpc_krxiod_dequeue_call(struct rxrpc_call *call) +{ + unsigned long flags; + + spin_lock_irqsave(&rxrpc_krxiod_callq_lock, flags); + if (!list_empty(&call->rcv_krxiodq_lk)) { + list_del_init(&call->rcv_krxiodq_lk); + atomic_dec(&rxrpc_krxiod_qcount); + } + spin_unlock_irqrestore(&rxrpc_krxiod_callq_lock, flags); + +} /* end rxrpc_krxiod_dequeue_call() */ diff --git a/net/rxrpc/krxsecd.c b/net/rxrpc/krxsecd.c new file mode 100644 index 000000000000..6020c89d9228 --- /dev/null +++ b/net/rxrpc/krxsecd.c @@ -0,0 +1,270 @@ +/* krxsecd.c: Rx security daemon + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * This daemon deals with: + * - consulting the application as to whether inbound peers and calls should be authorised + * - generating security challenges for inbound connections + * - responding to security challenges on outbound connections + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static DECLARE_WAIT_QUEUE_HEAD(rxrpc_krxsecd_sleepq); +static DECLARE_COMPLETION(rxrpc_krxsecd_dead); +static volatile int rxrpc_krxsecd_die; + +static atomic_t rxrpc_krxsecd_qcount; + +/* queue of unprocessed inbound messages with seqno #1 and + * RXRPC_CLIENT_INITIATED flag set */ +static LIST_HEAD(rxrpc_krxsecd_initmsgq); +static DEFINE_SPINLOCK(rxrpc_krxsecd_initmsgq_lock); + +static void rxrpc_krxsecd_process_incoming_call(struct rxrpc_message *msg); + +/*****************************************************************************/ +/* + * Rx security daemon + */ +static int rxrpc_krxsecd(void *arg) +{ + DECLARE_WAITQUEUE(krxsecd, current); + + int die; + + printk("Started krxsecd %d\n", current->pid); + + daemonize("krxsecd"); + + /* loop around waiting for work to do */ + do { + /* wait for work or to be told to exit */ + _debug("### Begin Wait"); + if (!atomic_read(&rxrpc_krxsecd_qcount)) { + set_current_state(TASK_INTERRUPTIBLE); + + add_wait_queue(&rxrpc_krxsecd_sleepq, &krxsecd); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (atomic_read(&rxrpc_krxsecd_qcount) || + rxrpc_krxsecd_die || + signal_pending(current)) + break; + + schedule(); + } + + remove_wait_queue(&rxrpc_krxsecd_sleepq, &krxsecd); + set_current_state(TASK_RUNNING); + } + die = rxrpc_krxsecd_die; + _debug("### End Wait"); + + /* see if there're incoming calls in need of authenticating */ + _debug("### Begin Inbound Calls"); + + if (!list_empty(&rxrpc_krxsecd_initmsgq)) { + struct rxrpc_message *msg = NULL; + + spin_lock(&rxrpc_krxsecd_initmsgq_lock); + + if (!list_empty(&rxrpc_krxsecd_initmsgq)) { + msg = list_entry(rxrpc_krxsecd_initmsgq.next, + struct rxrpc_message, link); + list_del_init(&msg->link); + atomic_dec(&rxrpc_krxsecd_qcount); + } + + spin_unlock(&rxrpc_krxsecd_initmsgq_lock); + + if (msg) { + rxrpc_krxsecd_process_incoming_call(msg); + rxrpc_put_message(msg); + } + } + + _debug("### End Inbound Calls"); + + try_to_freeze(PF_FREEZE); + + /* discard pending signals */ + rxrpc_discard_my_signals(); + + } while (!die); + + /* and that's all */ + complete_and_exit(&rxrpc_krxsecd_dead, 0); + +} /* end rxrpc_krxsecd() */ + +/*****************************************************************************/ +/* + * start up a krxsecd daemon + */ +int __init rxrpc_krxsecd_init(void) +{ + return kernel_thread(rxrpc_krxsecd, NULL, 0); + +} /* end rxrpc_krxsecd_init() */ + +/*****************************************************************************/ +/* + * kill the krxsecd daemon and wait for it to complete + */ +void rxrpc_krxsecd_kill(void) +{ + rxrpc_krxsecd_die = 1; + wake_up_all(&rxrpc_krxsecd_sleepq); + wait_for_completion(&rxrpc_krxsecd_dead); + +} /* end rxrpc_krxsecd_kill() */ + +/*****************************************************************************/ +/* + * clear all pending incoming calls for the specified transport + */ +void rxrpc_krxsecd_clear_transport(struct rxrpc_transport *trans) +{ + LIST_HEAD(tmp); + + struct rxrpc_message *msg; + struct list_head *_p, *_n; + + _enter("%p",trans); + + /* move all the messages for this transport onto a temp list */ + 
spin_lock(&rxrpc_krxsecd_initmsgq_lock); + + list_for_each_safe(_p, _n, &rxrpc_krxsecd_initmsgq) { + msg = list_entry(_p, struct rxrpc_message, link); + if (msg->trans == trans) { + list_del(&msg->link); + list_add_tail(&msg->link, &tmp); + atomic_dec(&rxrpc_krxsecd_qcount); + } + } + + spin_unlock(&rxrpc_krxsecd_initmsgq_lock); + + /* zap all messages on the temp list */ + while (!list_empty(&tmp)) { + msg = list_entry(tmp.next, struct rxrpc_message, link); + list_del_init(&msg->link); + rxrpc_put_message(msg); + } + + _leave(""); +} /* end rxrpc_krxsecd_clear_transport() */ + +/*****************************************************************************/ +/* + * queue a message on the incoming calls list + */ +void rxrpc_krxsecd_queue_incoming_call(struct rxrpc_message *msg) +{ + _enter("%p", msg); + + /* queue for processing by krxsecd */ + spin_lock(&rxrpc_krxsecd_initmsgq_lock); + + if (!rxrpc_krxsecd_die) { + rxrpc_get_message(msg); + list_add_tail(&msg->link, &rxrpc_krxsecd_initmsgq); + atomic_inc(&rxrpc_krxsecd_qcount); + } + + spin_unlock(&rxrpc_krxsecd_initmsgq_lock); + + wake_up(&rxrpc_krxsecd_sleepq); + + _leave(""); +} /* end rxrpc_krxsecd_queue_incoming_call() */ + +/*****************************************************************************/ +/* + * process the initial message of an incoming call + */ +void rxrpc_krxsecd_process_incoming_call(struct rxrpc_message *msg) +{ + struct rxrpc_transport *trans = msg->trans; + struct rxrpc_service *srv; + struct rxrpc_call *call; + struct list_head *_p; + unsigned short sid; + int ret; + + _enter("%p{tr=%p}", msg, trans); + + ret = rxrpc_incoming_call(msg->conn, msg, &call); + if (ret < 0) + goto out; + + /* find the matching service on the transport */ + sid = ntohs(msg->hdr.serviceId); + srv = NULL; + + spin_lock(&trans->lock); + list_for_each(_p, &trans->services) { + srv = list_entry(_p, struct rxrpc_service, link); + if (srv->service_id == sid && try_module_get(srv->owner)) { + /* found a match (made sure it won't vanish) */ + _debug("found service '%s'", srv->name); + call->owner = srv->owner; + break; + } + } + spin_unlock(&trans->lock); + + /* report the new connection + * - the func must inc the call's usage count to keep it + */ + ret = -ENOENT; + if (_p != &trans->services) { + /* attempt to accept the call */ + call->conn->service = srv; + call->app_attn_func = srv->attn_func; + call->app_error_func = srv->error_func; + call->app_aemap_func = srv->aemap_func; + + ret = srv->new_call(call); + + /* send an abort if an error occurred */ + if (ret < 0) { + rxrpc_call_abort(call, ret); + } + else { + /* formally receive and ACK the new packet */ + ret = rxrpc_conn_receive_call_packet(call->conn, + call, msg); + } + } + + rxrpc_put_call(call); + out: + if (ret < 0) + rxrpc_trans_immediate_abort(trans, msg, ret); + + _leave(" (%d)", ret); +} /* end rxrpc_krxsecd_process_incoming_call() */ diff --git a/net/rxrpc/krxtimod.c b/net/rxrpc/krxtimod.c new file mode 100644 index 000000000000..249c2b0290bb --- /dev/null +++ b/net/rxrpc/krxtimod.c @@ -0,0 +1,203 @@ +/* krxtimod.c: RXRPC timeout daemon + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static DECLARE_COMPLETION(krxtimod_alive); +static DECLARE_COMPLETION(krxtimod_dead); +static DECLARE_WAIT_QUEUE_HEAD(krxtimod_sleepq); +static int krxtimod_die; + +static LIST_HEAD(krxtimod_list); +static DEFINE_SPINLOCK(krxtimod_lock); + +static int krxtimod(void *arg); + +/*****************************************************************************/ +/* + * start the timeout daemon + */ +int rxrpc_krxtimod_start(void) +{ + int ret; + + ret = kernel_thread(krxtimod, NULL, 0); + if (ret < 0) + return ret; + + wait_for_completion(&krxtimod_alive); + + return ret; +} /* end rxrpc_krxtimod_start() */ + +/*****************************************************************************/ +/* + * stop the timeout daemon + */ +void rxrpc_krxtimod_kill(void) +{ + /* get rid of my daemon */ + krxtimod_die = 1; + wake_up(&krxtimod_sleepq); + wait_for_completion(&krxtimod_dead); + +} /* end rxrpc_krxtimod_kill() */ + +/*****************************************************************************/ +/* + * timeout processing daemon + */ +static int krxtimod(void *arg) +{ + DECLARE_WAITQUEUE(myself, current); + + rxrpc_timer_t *timer; + + printk("Started krxtimod %d\n", current->pid); + + daemonize("krxtimod"); + + complete(&krxtimod_alive); + + /* loop around looking for things to attend to */ + loop: + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&krxtimod_sleepq, &myself); + + for (;;) { + unsigned long jif; + signed long timeout; + + /* deal with the server being asked to die */ + if (krxtimod_die) { + remove_wait_queue(&krxtimod_sleepq, &myself); + _leave(""); + complete_and_exit(&krxtimod_dead, 0); + } + + try_to_freeze(PF_FREEZE); + + /* discard pending signals */ + rxrpc_discard_my_signals(); + + /* work out the time to elapse before the next event */ + spin_lock(&krxtimod_lock); + if (list_empty(&krxtimod_list)) { + timeout = MAX_SCHEDULE_TIMEOUT; + } + else { + timer = list_entry(krxtimod_list.next, + rxrpc_timer_t, link); + timeout = timer->timo_jif; + jif = jiffies; + + if (time_before_eq((unsigned long) timeout, jif)) + goto immediate; + + else { + timeout = (long) timeout - (long) jiffies; + } + } + spin_unlock(&krxtimod_lock); + + schedule_timeout(timeout); + + set_current_state(TASK_INTERRUPTIBLE); + } + + /* the thing on the front of the queue needs processing + * - we come here with the lock held and timer pointing to the expired + * entry + */ + immediate: + remove_wait_queue(&krxtimod_sleepq, &myself); + set_current_state(TASK_RUNNING); + + _debug("@@@ Begin Timeout of %p", timer); + + /* dequeue the timer */ + list_del_init(&timer->link); + spin_unlock(&krxtimod_lock); + + /* call the timeout function */ + timer->ops->timed_out(timer); + + _debug("@@@ End Timeout"); + goto loop; + +} /* end krxtimod() */ + +/*****************************************************************************/ +/* + * (re-)queue a timer + */ +void rxrpc_krxtimod_add_timer(rxrpc_timer_t *timer, unsigned long timeout) +{ + struct list_head *_p; + rxrpc_timer_t *ptimer; + + _enter("%p,%lu", timer, timeout); + + spin_lock(&krxtimod_lock); + + list_del(&timer->link); + + /* the timer was deferred or reset - put it back in the queue at the + * right place */ + timer->timo_jif = jiffies + timeout; + + list_for_each(_p, &krxtimod_list) { + ptimer = list_entry(_p, rxrpc_timer_t, link); + if (time_before(timer->timo_jif, ptimer->timo_jif)) + break; + } + + list_add_tail(&timer->link, _p); /* insert before 
stopping point */ + + spin_unlock(&krxtimod_lock); + + wake_up(&krxtimod_sleepq); + + _leave(""); +} /* end rxrpc_krxtimod_add_timer() */ + +/*****************************************************************************/ +/* + * dequeue a timer + * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued + */ +int rxrpc_krxtimod_del_timer(rxrpc_timer_t *timer) +{ + int ret = 0; + + _enter("%p", timer); + + spin_lock(&krxtimod_lock); + + if (list_empty(&timer->link)) + ret = -ENOENT; + else + list_del_init(&timer->link); + + spin_unlock(&krxtimod_lock); + + wake_up(&krxtimod_sleepq); + + _leave(" = %d", ret); + return ret; +} /* end rxrpc_krxtimod_del_timer() */ diff --git a/net/rxrpc/main.c b/net/rxrpc/main.c new file mode 100644 index 000000000000..36fdcbcd80d1 --- /dev/null +++ b/net/rxrpc/main.c @@ -0,0 +1,180 @@ +/* main.c: Rx RPC interface + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +MODULE_DESCRIPTION("Rx RPC implementation"); +MODULE_AUTHOR("Red Hat, Inc."); +MODULE_LICENSE("GPL"); + +__be32 rxrpc_epoch; + +/*****************************************************************************/ +/* + * initialise the Rx module + */ +static int __init rxrpc_initialise(void) +{ + int ret; + + /* my epoch value */ + rxrpc_epoch = htonl(xtime.tv_sec); + + /* register the /proc interface */ +#ifdef CONFIG_PROC_FS + ret = rxrpc_proc_init(); + if (ret<0) + return ret; +#endif + + /* register the sysctl files */ +#ifdef CONFIG_SYSCTL + ret = rxrpc_sysctl_init(); + if (ret<0) + goto error_proc; +#endif + + /* start the krxtimod daemon */ + ret = rxrpc_krxtimod_start(); + if (ret<0) + goto error_sysctl; + + /* start the krxiod daemon */ + ret = rxrpc_krxiod_init(); + if (ret<0) + goto error_krxtimod; + + /* start the krxsecd daemon */ + ret = rxrpc_krxsecd_init(); + if (ret<0) + goto error_krxiod; + + kdebug("\n\n"); + + return 0; + + error_krxiod: + rxrpc_krxiod_kill(); + error_krxtimod: + rxrpc_krxtimod_kill(); + error_sysctl: +#ifdef CONFIG_SYSCTL + rxrpc_sysctl_cleanup(); +#endif + error_proc: +#ifdef CONFIG_PROC_FS + rxrpc_proc_cleanup(); +#endif + return ret; +} /* end rxrpc_initialise() */ + +module_init(rxrpc_initialise); + +/*****************************************************************************/ +/* + * clean up the Rx module + */ +static void __exit rxrpc_cleanup(void) +{ + kenter(""); + + __RXACCT(printk("Outstanding Messages : %d\n", + atomic_read(&rxrpc_message_count))); + __RXACCT(printk("Outstanding Calls : %d\n", + atomic_read(&rxrpc_call_count))); + __RXACCT(printk("Outstanding Connections: %d\n", + atomic_read(&rxrpc_connection_count))); + __RXACCT(printk("Outstanding Peers : %d\n", + atomic_read(&rxrpc_peer_count))); + __RXACCT(printk("Outstanding Transports : %d\n", + atomic_read(&rxrpc_transport_count))); + + rxrpc_krxsecd_kill(); + rxrpc_krxiod_kill(); + rxrpc_krxtimod_kill(); +#ifdef CONFIG_SYSCTL + rxrpc_sysctl_cleanup(); +#endif +#ifdef CONFIG_PROC_FS + rxrpc_proc_cleanup(); +#endif + + __RXACCT(printk("Outstanding Messages : %d\n", + atomic_read(&rxrpc_message_count))); + 
__RXACCT(printk("Outstanding Calls : %d\n", + atomic_read(&rxrpc_call_count))); + __RXACCT(printk("Outstanding Connections: %d\n", + atomic_read(&rxrpc_connection_count))); + __RXACCT(printk("Outstanding Peers : %d\n", + atomic_read(&rxrpc_peer_count))); + __RXACCT(printk("Outstanding Transports : %d\n", + atomic_read(&rxrpc_transport_count))); + + kleave(""); +} /* end rxrpc_cleanup() */ + +module_exit(rxrpc_cleanup); + +/*****************************************************************************/ +/* + * clear the dead space between task_struct and kernel stack + * - called by supplying -finstrument-functions to gcc + */ +#if 0 +void __cyg_profile_func_enter (void *this_fn, void *call_site) +__attribute__((no_instrument_function)); + +void __cyg_profile_func_enter (void *this_fn, void *call_site) +{ + asm volatile(" movl %%esp,%%edi \n" + " andl %0,%%edi \n" + " addl %1,%%edi \n" + " movl %%esp,%%ecx \n" + " subl %%edi,%%ecx \n" + " shrl $2,%%ecx \n" + " movl $0xedededed,%%eax \n" + " rep stosl \n" + : + : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info)) + : "eax", "ecx", "edi", "memory", "cc" + ); +} + +void __cyg_profile_func_exit(void *this_fn, void *call_site) +__attribute__((no_instrument_function)); + +void __cyg_profile_func_exit(void *this_fn, void *call_site) +{ + asm volatile(" movl %%esp,%%edi \n" + " andl %0,%%edi \n" + " addl %1,%%edi \n" + " movl %%esp,%%ecx \n" + " subl %%edi,%%ecx \n" + " shrl $2,%%ecx \n" + " movl $0xdadadada,%%eax \n" + " rep stosl \n" + : + : "i"(~(THREAD_SIZE-1)), "i"(sizeof(struct thread_info)) + : "eax", "ecx", "edi", "memory", "cc" + ); +} +#endif diff --git a/net/rxrpc/peer.c b/net/rxrpc/peer.c new file mode 100644 index 000000000000..ed38f5b17c1b --- /dev/null +++ b/net/rxrpc/peer.c @@ -0,0 +1,399 @@ +/* peer.c: Rx RPC peer management + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +__RXACCT_DECL(atomic_t rxrpc_peer_count); +LIST_HEAD(rxrpc_peers); +DECLARE_RWSEM(rxrpc_peers_sem); +unsigned long rxrpc_peer_timeout = 12 * 60 * 60; + +static void rxrpc_peer_do_timeout(struct rxrpc_peer *peer); + +static void __rxrpc_peer_timeout(rxrpc_timer_t *timer) +{ + struct rxrpc_peer *peer = + list_entry(timer, struct rxrpc_peer, timeout); + + _debug("Rx PEER TIMEOUT [%p{u=%d}]", peer, atomic_read(&peer->usage)); + + rxrpc_peer_do_timeout(peer); +} + +static const struct rxrpc_timer_ops rxrpc_peer_timer_ops = { + .timed_out = __rxrpc_peer_timeout, +}; + +/*****************************************************************************/ +/* + * create a peer record + */ +static int __rxrpc_create_peer(struct rxrpc_transport *trans, __be32 addr, + struct rxrpc_peer **_peer) +{ + struct rxrpc_peer *peer; + + _enter("%p,%08x", trans, ntohl(addr)); + + /* allocate and initialise a peer record */ + peer = kmalloc(sizeof(struct rxrpc_peer), GFP_KERNEL); + if (!peer) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + memset(peer, 0, sizeof(struct rxrpc_peer)); + atomic_set(&peer->usage, 1); + + INIT_LIST_HEAD(&peer->link); + INIT_LIST_HEAD(&peer->proc_link); + INIT_LIST_HEAD(&peer->conn_idlist); + INIT_LIST_HEAD(&peer->conn_active); + INIT_LIST_HEAD(&peer->conn_graveyard); + spin_lock_init(&peer->conn_gylock); + init_waitqueue_head(&peer->conn_gy_waitq); + rwlock_init(&peer->conn_idlock); + rwlock_init(&peer->conn_lock); + atomic_set(&peer->conn_count, 0); + spin_lock_init(&peer->lock); + rxrpc_timer_init(&peer->timeout, &rxrpc_peer_timer_ops); + + peer->addr.s_addr = addr; + + peer->trans = trans; + peer->ops = trans->peer_ops; + + __RXACCT(atomic_inc(&rxrpc_peer_count)); + *_peer = peer; + _leave(" = 0 (%p)", peer); + + return 0; +} /* end __rxrpc_create_peer() */ + +/*****************************************************************************/ +/* + * find a peer record on the specified transport + * - returns (if successful) with peer record usage incremented + * - resurrects it from the graveyard if found there + */ +int rxrpc_peer_lookup(struct rxrpc_transport *trans, __be32 addr, + struct rxrpc_peer **_peer) +{ + struct rxrpc_peer *peer, *candidate = NULL; + struct list_head *_p; + int ret; + + _enter("%p{%hu},%08x", trans, trans->port, ntohl(addr)); + + /* [common case] search the transport's active list first */ + read_lock(&trans->peer_lock); + list_for_each(_p, &trans->peer_active) { + peer = list_entry(_p, struct rxrpc_peer, link); + if (peer->addr.s_addr == addr) + goto found_active; + } + read_unlock(&trans->peer_lock); + + /* [uncommon case] not active - create a candidate for a new record */ + ret = __rxrpc_create_peer(trans, addr, &candidate); + if (ret < 0) { + _leave(" = %d", ret); + return ret; + } + + /* search the active list again, just in case it appeared whilst we + * were busy */ + write_lock(&trans->peer_lock); + list_for_each(_p, &trans->peer_active) { + peer = list_entry(_p, struct rxrpc_peer, link); + if (peer->addr.s_addr == addr) + goto found_active_second_chance; + } + + /* search the transport's graveyard list */ + spin_lock(&trans->peer_gylock); + list_for_each(_p, &trans->peer_graveyard) { + peer = list_entry(_p, struct rxrpc_peer, link); + if (peer->addr.s_addr == addr) + goto found_in_graveyard; + } + spin_unlock(&trans->peer_gylock); + + /* we can now add the new candidate to the 
list + * - tell the application layer that this peer has been added + */ + rxrpc_get_transport(trans); + peer = candidate; + candidate = NULL; + + if (peer->ops && peer->ops->adding) { + ret = peer->ops->adding(peer); + if (ret < 0) { + write_unlock(&trans->peer_lock); + __RXACCT(atomic_dec(&rxrpc_peer_count)); + kfree(peer); + rxrpc_put_transport(trans); + _leave(" = %d", ret); + return ret; + } + } + + atomic_inc(&trans->peer_count); + + make_active: + list_add_tail(&peer->link, &trans->peer_active); + + success_uwfree: + write_unlock(&trans->peer_lock); + + if (candidate) { + __RXACCT(atomic_dec(&rxrpc_peer_count)); + kfree(candidate); + } + + if (list_empty(&peer->proc_link)) { + down_write(&rxrpc_peers_sem); + list_add_tail(&peer->proc_link, &rxrpc_peers); + up_write(&rxrpc_peers_sem); + } + + success: + *_peer = peer; + + _leave(" = 0 (%p{u=%d cc=%d})", + peer, + atomic_read(&peer->usage), + atomic_read(&peer->conn_count)); + return 0; + + /* handle the peer being found in the active list straight off */ + found_active: + rxrpc_get_peer(peer); + read_unlock(&trans->peer_lock); + goto success; + + /* handle resurrecting a peer from the graveyard */ + found_in_graveyard: + rxrpc_get_peer(peer); + rxrpc_get_transport(peer->trans); + rxrpc_krxtimod_del_timer(&peer->timeout); + list_del_init(&peer->link); + spin_unlock(&trans->peer_gylock); + goto make_active; + + /* handle finding the peer on the second time through the active + * list */ + found_active_second_chance: + rxrpc_get_peer(peer); + goto success_uwfree; + +} /* end rxrpc_peer_lookup() */ + +/*****************************************************************************/ +/* + * finish with a peer record + * - it gets sent to the graveyard from where it can be resurrected or timed + * out + */ +void rxrpc_put_peer(struct rxrpc_peer *peer) +{ + struct rxrpc_transport *trans = peer->trans; + + _enter("%p{cc=%d a=%08x}", + peer, + atomic_read(&peer->conn_count), + ntohl(peer->addr.s_addr)); + + /* sanity check */ + if (atomic_read(&peer->usage) <= 0) + BUG(); + + write_lock(&trans->peer_lock); + spin_lock(&trans->peer_gylock); + if (likely(!atomic_dec_and_test(&peer->usage))) { + spin_unlock(&trans->peer_gylock); + write_unlock(&trans->peer_lock); + _leave(""); + return; + } + + /* move to graveyard queue */ + list_del(&peer->link); + write_unlock(&trans->peer_lock); + + list_add_tail(&peer->link, &trans->peer_graveyard); + + BUG_ON(!list_empty(&peer->conn_active)); + + rxrpc_krxtimod_add_timer(&peer->timeout, rxrpc_peer_timeout * HZ); + + spin_unlock(&trans->peer_gylock); + + rxrpc_put_transport(trans); + + _leave(" [killed]"); +} /* end rxrpc_put_peer() */ + +/*****************************************************************************/ +/* + * handle a peer timing out in the graveyard + * - called from krxtimod + */ +static void rxrpc_peer_do_timeout(struct rxrpc_peer *peer) +{ + struct rxrpc_transport *trans = peer->trans; + + _enter("%p{u=%d cc=%d a=%08x}", + peer, + atomic_read(&peer->usage), + atomic_read(&peer->conn_count), + ntohl(peer->addr.s_addr)); + + BUG_ON(atomic_read(&peer->usage) < 0); + + /* remove from graveyard if still dead */ + spin_lock(&trans->peer_gylock); + if (atomic_read(&peer->usage) == 0) + list_del_init(&peer->link); + else + peer = NULL; + spin_unlock(&trans->peer_gylock); + + if (!peer) { + _leave(""); + return; /* resurrected */ + } + + /* clear all connections on this peer */ + rxrpc_conn_clearall(peer); + + BUG_ON(!list_empty(&peer->conn_active)); + BUG_ON(!list_empty(&peer->conn_graveyard)); + 
+ /* inform the application layer */ + if (peer->ops && peer->ops->discarding) + peer->ops->discarding(peer); + + if (!list_empty(&peer->proc_link)) { + down_write(&rxrpc_peers_sem); + list_del(&peer->proc_link); + up_write(&rxrpc_peers_sem); + } + + __RXACCT(atomic_dec(&rxrpc_peer_count)); + kfree(peer); + + /* if the graveyard is now empty, wake up anyone waiting for that */ + if (atomic_dec_and_test(&trans->peer_count)) + wake_up(&trans->peer_gy_waitq); + + _leave(" [destroyed]"); +} /* end rxrpc_peer_do_timeout() */ + +/*****************************************************************************/ +/* + * clear all peer records from a transport endpoint + */ +void rxrpc_peer_clearall(struct rxrpc_transport *trans) +{ + DECLARE_WAITQUEUE(myself,current); + + struct rxrpc_peer *peer; + int err; + + _enter("%p",trans); + + /* there shouldn't be any active peers remaining */ + BUG_ON(!list_empty(&trans->peer_active)); + + /* manually timeout all peers in the graveyard */ + spin_lock(&trans->peer_gylock); + while (!list_empty(&trans->peer_graveyard)) { + peer = list_entry(trans->peer_graveyard.next, + struct rxrpc_peer, link); + _debug("Clearing peer %p\n", peer); + err = rxrpc_krxtimod_del_timer(&peer->timeout); + spin_unlock(&trans->peer_gylock); + + if (err == 0) + rxrpc_peer_do_timeout(peer); + + spin_lock(&trans->peer_gylock); + } + spin_unlock(&trans->peer_gylock); + + /* wait for the peer graveyard to be completely cleared */ + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&trans->peer_gy_waitq, &myself); + + while (atomic_read(&trans->peer_count) != 0) { + schedule(); + set_current_state(TASK_UNINTERRUPTIBLE); + } + + remove_wait_queue(&trans->peer_gy_waitq, &myself); + set_current_state(TASK_RUNNING); + + _leave(""); +} /* end rxrpc_peer_clearall() */ + +/*****************************************************************************/ +/* + * calculate and cache the Round-Trip-Time for a message and its response + */ +void rxrpc_peer_calculate_rtt(struct rxrpc_peer *peer, + struct rxrpc_message *msg, + struct rxrpc_message *resp) +{ + unsigned long long rtt; + int loop; + + _enter("%p,%p,%p", peer, msg, resp); + + /* calculate the latest RTT */ + rtt = resp->stamp.tv_sec - msg->stamp.tv_sec; + rtt *= 1000000UL; + rtt += resp->stamp.tv_usec - msg->stamp.tv_usec; + + /* add to cache */ + peer->rtt_cache[peer->rtt_point] = rtt; + peer->rtt_point++; + peer->rtt_point %= RXRPC_RTT_CACHE_SIZE; + + if (peer->rtt_usage < RXRPC_RTT_CACHE_SIZE) + peer->rtt_usage++; + + /* recalculate RTT */ + rtt = 0; + for (loop = peer->rtt_usage - 1; loop >= 0; loop--) + rtt += peer->rtt_cache[loop]; + + do_div(rtt, peer->rtt_usage); + peer->rtt = rtt; + + _leave(" RTT=%lu.%lums", + (long) (peer->rtt / 1000), (long) (peer->rtt % 1000)); + +} /* end rxrpc_peer_calculate_rtt() */ diff --git a/net/rxrpc/proc.c b/net/rxrpc/proc.c new file mode 100644 index 000000000000..3b5ecd8e2401 --- /dev/null +++ b/net/rxrpc/proc.c @@ -0,0 +1,617 @@ +/* proc.c: /proc interface for RxRPC + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +static struct proc_dir_entry *proc_rxrpc; + +static int rxrpc_proc_transports_open(struct inode *inode, struct file *file); +static void *rxrpc_proc_transports_start(struct seq_file *p, loff_t *pos); +static void *rxrpc_proc_transports_next(struct seq_file *p, void *v, loff_t *pos); +static void rxrpc_proc_transports_stop(struct seq_file *p, void *v); +static int rxrpc_proc_transports_show(struct seq_file *m, void *v); + +static struct seq_operations rxrpc_proc_transports_ops = { + .start = rxrpc_proc_transports_start, + .next = rxrpc_proc_transports_next, + .stop = rxrpc_proc_transports_stop, + .show = rxrpc_proc_transports_show, +}; + +static struct file_operations rxrpc_proc_transports_fops = { + .open = rxrpc_proc_transports_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int rxrpc_proc_peers_open(struct inode *inode, struct file *file); +static void *rxrpc_proc_peers_start(struct seq_file *p, loff_t *pos); +static void *rxrpc_proc_peers_next(struct seq_file *p, void *v, loff_t *pos); +static void rxrpc_proc_peers_stop(struct seq_file *p, void *v); +static int rxrpc_proc_peers_show(struct seq_file *m, void *v); + +static struct seq_operations rxrpc_proc_peers_ops = { + .start = rxrpc_proc_peers_start, + .next = rxrpc_proc_peers_next, + .stop = rxrpc_proc_peers_stop, + .show = rxrpc_proc_peers_show, +}; + +static struct file_operations rxrpc_proc_peers_fops = { + .open = rxrpc_proc_peers_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int rxrpc_proc_conns_open(struct inode *inode, struct file *file); +static void *rxrpc_proc_conns_start(struct seq_file *p, loff_t *pos); +static void *rxrpc_proc_conns_next(struct seq_file *p, void *v, loff_t *pos); +static void rxrpc_proc_conns_stop(struct seq_file *p, void *v); +static int rxrpc_proc_conns_show(struct seq_file *m, void *v); + +static struct seq_operations rxrpc_proc_conns_ops = { + .start = rxrpc_proc_conns_start, + .next = rxrpc_proc_conns_next, + .stop = rxrpc_proc_conns_stop, + .show = rxrpc_proc_conns_show, +}; + +static struct file_operations rxrpc_proc_conns_fops = { + .open = rxrpc_proc_conns_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int rxrpc_proc_calls_open(struct inode *inode, struct file *file); +static void *rxrpc_proc_calls_start(struct seq_file *p, loff_t *pos); +static void *rxrpc_proc_calls_next(struct seq_file *p, void *v, loff_t *pos); +static void rxrpc_proc_calls_stop(struct seq_file *p, void *v); +static int rxrpc_proc_calls_show(struct seq_file *m, void *v); + +static struct seq_operations rxrpc_proc_calls_ops = { + .start = rxrpc_proc_calls_start, + .next = rxrpc_proc_calls_next, + .stop = rxrpc_proc_calls_stop, + .show = rxrpc_proc_calls_show, +}; + +static struct file_operations rxrpc_proc_calls_fops = { + .open = rxrpc_proc_calls_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static const char *rxrpc_call_states7[] = { + "complet", + "error ", + "rcv_op ", + "rcv_arg", + "got_arg", + "snd_rpl", + "fin_ack", + "snd_arg", + "rcv_rpl", + "got_rpl" +}; + +static const char *rxrpc_call_error_states7[] = { + "no_err ", + "loc_abt", + "rmt_abt", + "loc_err", + "rmt_err" +}; + +/*****************************************************************************/ +/* + * initialise the /proc/net/rxrpc/ directory + */ +int 
rxrpc_proc_init(void) +{ + struct proc_dir_entry *p; + + proc_rxrpc = proc_mkdir("rxrpc", proc_net); + if (!proc_rxrpc) + goto error; + proc_rxrpc->owner = THIS_MODULE; + + p = create_proc_entry("calls", 0, proc_rxrpc); + if (!p) + goto error_proc; + p->proc_fops = &rxrpc_proc_calls_fops; + p->owner = THIS_MODULE; + + p = create_proc_entry("connections", 0, proc_rxrpc); + if (!p) + goto error_calls; + p->proc_fops = &rxrpc_proc_conns_fops; + p->owner = THIS_MODULE; + + p = create_proc_entry("peers", 0, proc_rxrpc); + if (!p) + goto error_calls; + p->proc_fops = &rxrpc_proc_peers_fops; + p->owner = THIS_MODULE; + + p = create_proc_entry("transports", 0, proc_rxrpc); + if (!p) + goto error_conns; + p->proc_fops = &rxrpc_proc_transports_fops; + p->owner = THIS_MODULE; + + return 0; + + error_conns: + remove_proc_entry("connections", proc_rxrpc); + error_calls: + remove_proc_entry("calls", proc_rxrpc); + error_proc: + remove_proc_entry("rxrpc", proc_net); + error: + return -ENOMEM; +} /* end rxrpc_proc_init() */ + +/*****************************************************************************/ +/* + * clean up the /proc/net/rxrpc/ directory + */ +void rxrpc_proc_cleanup(void) +{ + remove_proc_entry("transports", proc_rxrpc); + remove_proc_entry("peers", proc_rxrpc); + remove_proc_entry("connections", proc_rxrpc); + remove_proc_entry("calls", proc_rxrpc); + + remove_proc_entry("rxrpc", proc_net); + +} /* end rxrpc_proc_cleanup() */ + +/*****************************************************************************/ +/* + * open "/proc/net/rxrpc/transports" which provides a summary of extant transports + */ +static int rxrpc_proc_transports_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &rxrpc_proc_transports_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} /* end rxrpc_proc_transports_open() */ + +/*****************************************************************************/ +/* + * set up the iterator to start reading from the transports list and return the first item + */ +static void *rxrpc_proc_transports_start(struct seq_file *m, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + /* lock the list against modification */ + down_read(&rxrpc_proc_transports_sem); + + /* allow for the header line */ + if (!pos) + return SEQ_START_TOKEN; + pos--; + + /* find the n'th element in the list */ + list_for_each(_p, &rxrpc_proc_transports) + if (!pos--) + break; + + return _p != &rxrpc_proc_transports ? _p : NULL; +} /* end rxrpc_proc_transports_start() */ + +/*****************************************************************************/ +/* + * move to next call in transports list + */ +static void *rxrpc_proc_transports_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? rxrpc_proc_transports.next : _p->next; + + return _p != &rxrpc_proc_transports ? 
_p : NULL; +} /* end rxrpc_proc_transports_next() */ + +/*****************************************************************************/ +/* + * clean up after reading from the transports list + */ +static void rxrpc_proc_transports_stop(struct seq_file *p, void *v) +{ + up_read(&rxrpc_proc_transports_sem); + +} /* end rxrpc_proc_transports_stop() */ + +/*****************************************************************************/ +/* + * display a header line followed by a load of call lines + */ +static int rxrpc_proc_transports_show(struct seq_file *m, void *v) +{ + struct rxrpc_transport *trans = + list_entry(v, struct rxrpc_transport, proc_link); + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "LOCAL USE\n"); + return 0; + } + + /* display one transport per line on subsequent lines */ + seq_printf(m, "%5hu %3d\n", + trans->port, + atomic_read(&trans->usage) + ); + + return 0; +} /* end rxrpc_proc_transports_show() */ + +/*****************************************************************************/ +/* + * open "/proc/net/rxrpc/peers" which provides a summary of extant peers + */ +static int rxrpc_proc_peers_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &rxrpc_proc_peers_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} /* end rxrpc_proc_peers_open() */ + +/*****************************************************************************/ +/* + * set up the iterator to start reading from the peers list and return the + * first item + */ +static void *rxrpc_proc_peers_start(struct seq_file *m, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + /* lock the list against modification */ + down_read(&rxrpc_peers_sem); + + /* allow for the header line */ + if (!pos) + return SEQ_START_TOKEN; + pos--; + + /* find the n'th element in the list */ + list_for_each(_p, &rxrpc_peers) + if (!pos--) + break; + + return _p != &rxrpc_peers ? _p : NULL; +} /* end rxrpc_proc_peers_start() */ + +/*****************************************************************************/ +/* + * move to next conn in peers list + */ +static void *rxrpc_proc_peers_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? rxrpc_peers.next : _p->next; + + return _p != &rxrpc_peers ? 
_p : NULL; +} /* end rxrpc_proc_peers_next() */ + +/*****************************************************************************/ +/* + * clean up after reading from the peers list + */ +static void rxrpc_proc_peers_stop(struct seq_file *p, void *v) +{ + up_read(&rxrpc_peers_sem); + +} /* end rxrpc_proc_peers_stop() */ + +/*****************************************************************************/ +/* + * display a header line followed by a load of conn lines + */ +static int rxrpc_proc_peers_show(struct seq_file *m, void *v) +{ + struct rxrpc_peer *peer = list_entry(v, struct rxrpc_peer, proc_link); + signed long timeout; + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, "LOCAL REMOTE USAGE CONNS TIMEOUT" + " MTU RTT(uS)\n"); + return 0; + } + + /* display one peer per line on subsequent lines */ + timeout = 0; + if (!list_empty(&peer->timeout.link)) + timeout = (signed long) peer->timeout.timo_jif - + (signed long) jiffies; + + seq_printf(m, "%5hu %08x %5d %5d %8ld %5Zu %7lu\n", + peer->trans->port, + ntohl(peer->addr.s_addr), + atomic_read(&peer->usage), + atomic_read(&peer->conn_count), + timeout, + peer->if_mtu, + (long) peer->rtt + ); + + return 0; +} /* end rxrpc_proc_peers_show() */ + +/*****************************************************************************/ +/* + * open "/proc/net/rxrpc/connections" which provides a summary of extant + * connections + */ +static int rxrpc_proc_conns_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &rxrpc_proc_conns_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} /* end rxrpc_proc_conns_open() */ + +/*****************************************************************************/ +/* + * set up the iterator to start reading from the conns list and return the + * first item + */ +static void *rxrpc_proc_conns_start(struct seq_file *m, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + /* lock the list against modification */ + down_read(&rxrpc_conns_sem); + + /* allow for the header line */ + if (!pos) + return SEQ_START_TOKEN; + pos--; + + /* find the n'th element in the list */ + list_for_each(_p, &rxrpc_conns) + if (!pos--) + break; + + return _p != &rxrpc_conns ? _p : NULL; +} /* end rxrpc_proc_conns_start() */ + +/*****************************************************************************/ +/* + * move to next conn in conns list + */ +static void *rxrpc_proc_conns_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? rxrpc_conns.next : _p->next; + + return _p != &rxrpc_conns ? 
_p : NULL; +} /* end rxrpc_proc_conns_next() */ + +/*****************************************************************************/ +/* + * clean up after reading from the conns list + */ +static void rxrpc_proc_conns_stop(struct seq_file *p, void *v) +{ + up_read(&rxrpc_conns_sem); + +} /* end rxrpc_proc_conns_stop() */ + +/*****************************************************************************/ +/* + * display a header line followed by a load of conn lines + */ +static int rxrpc_proc_conns_show(struct seq_file *m, void *v) +{ + struct rxrpc_connection *conn; + signed long timeout; + + conn = list_entry(v, struct rxrpc_connection, proc_link); + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, + "LOCAL REMOTE RPORT SRVC CONN END SERIALNO " + "CALLNO MTU TIMEOUT" + "\n"); + return 0; + } + + /* display one conn per line on subsequent lines */ + timeout = 0; + if (!list_empty(&conn->timeout.link)) + timeout = (signed long) conn->timeout.timo_jif - + (signed long) jiffies; + + seq_printf(m, + "%5hu %08x %5hu %04hx %08x %-3.3s %08x %08x %5Zu %8ld\n", + conn->trans->port, + ntohl(conn->addr.sin_addr.s_addr), + ntohs(conn->addr.sin_port), + ntohs(conn->service_id), + ntohl(conn->conn_id), + conn->out_clientflag ? "CLT" : "SRV", + conn->serial_counter, + conn->call_counter, + conn->mtu_size, + timeout + ); + + return 0; +} /* end rxrpc_proc_conns_show() */ + +/*****************************************************************************/ +/* + * open "/proc/net/rxrpc/calls" which provides a summary of extant calls + */ +static int rxrpc_proc_calls_open(struct inode *inode, struct file *file) +{ + struct seq_file *m; + int ret; + + ret = seq_open(file, &rxrpc_proc_calls_ops); + if (ret < 0) + return ret; + + m = file->private_data; + m->private = PDE(inode)->data; + + return 0; +} /* end rxrpc_proc_calls_open() */ + +/*****************************************************************************/ +/* + * set up the iterator to start reading from the calls list and return the + * first item + */ +static void *rxrpc_proc_calls_start(struct seq_file *m, loff_t *_pos) +{ + struct list_head *_p; + loff_t pos = *_pos; + + /* lock the list against modification */ + down_read(&rxrpc_calls_sem); + + /* allow for the header line */ + if (!pos) + return SEQ_START_TOKEN; + pos--; + + /* find the n'th element in the list */ + list_for_each(_p, &rxrpc_calls) + if (!pos--) + break; + + return _p != &rxrpc_calls ? _p : NULL; +} /* end rxrpc_proc_calls_start() */ + +/*****************************************************************************/ +/* + * move to next call in calls list + */ +static void *rxrpc_proc_calls_next(struct seq_file *p, void *v, loff_t *pos) +{ + struct list_head *_p; + + (*pos)++; + + _p = v; + _p = (v == SEQ_START_TOKEN) ? rxrpc_calls.next : _p->next; + + return _p != &rxrpc_calls ? 
_p : NULL; +} /* end rxrpc_proc_calls_next() */ + +/*****************************************************************************/ +/* + * clean up after reading from the calls list + */ +static void rxrpc_proc_calls_stop(struct seq_file *p, void *v) +{ + up_read(&rxrpc_calls_sem); + +} /* end rxrpc_proc_calls_stop() */ + +/*****************************************************************************/ +/* + * display a header line followed by a load of call lines + */ +static int rxrpc_proc_calls_show(struct seq_file *m, void *v) +{ + struct rxrpc_call *call = list_entry(v, struct rxrpc_call, call_link); + + /* display header on line 1 */ + if (v == SEQ_START_TOKEN) { + seq_puts(m, + "LOCAL REMOT SRVC CONN CALL DIR USE " + " L STATE OPCODE ABORT ERRNO\n" + ); + return 0; + } + + /* display one call per line on subsequent lines */ + seq_printf(m, + "%5hu %5hu %04hx %08x %08x %s %3u%c" + " %c %-7.7s %6d %08x %5d\n", + call->conn->trans->port, + ntohs(call->conn->addr.sin_port), + ntohs(call->conn->service_id), + ntohl(call->conn->conn_id), + ntohl(call->call_id), + call->conn->service ? "SVC" : "CLT", + atomic_read(&call->usage), + waitqueue_active(&call->waitq) ? 'w' : ' ', + call->app_last_rcv ? 'Y' : '-', + (call->app_call_state!=RXRPC_CSTATE_ERROR ? + rxrpc_call_states7[call->app_call_state] : + rxrpc_call_error_states7[call->app_err_state]), + call->app_opcode, + call->app_abort_code, + call->app_errno + ); + + return 0; +} /* end rxrpc_proc_calls_show() */ diff --git a/net/rxrpc/rxrpc_syms.c b/net/rxrpc/rxrpc_syms.c new file mode 100644 index 000000000000..56adf16fed0c --- /dev/null +++ b/net/rxrpc/rxrpc_syms.c @@ -0,0 +1,35 @@ +/* rxrpc_syms.c: exported Rx RPC layer interface symbols + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +#include +#include +#include +#include + +/* call.c */ +EXPORT_SYMBOL(rxrpc_create_call); +EXPORT_SYMBOL(rxrpc_put_call); +EXPORT_SYMBOL(rxrpc_call_abort); +EXPORT_SYMBOL(rxrpc_call_read_data); +EXPORT_SYMBOL(rxrpc_call_write_data); + +/* connection.c */ +EXPORT_SYMBOL(rxrpc_create_connection); +EXPORT_SYMBOL(rxrpc_put_connection); + +/* transport.c */ +EXPORT_SYMBOL(rxrpc_create_transport); +EXPORT_SYMBOL(rxrpc_put_transport); +EXPORT_SYMBOL(rxrpc_add_service); +EXPORT_SYMBOL(rxrpc_del_service); diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c new file mode 100644 index 000000000000..fbf98729c748 --- /dev/null +++ b/net/rxrpc/sysctl.c @@ -0,0 +1,122 @@ +/* sysctl.c: Rx RPC control + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +int rxrpc_ktrace; +int rxrpc_kdebug; +int rxrpc_kproto; +int rxrpc_knet; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *rxrpc_sysctl = NULL; + +static ctl_table rxrpc_sysctl_table[] = { + { + .ctl_name = 1, + .procname = "kdebug", + .data = &rxrpc_kdebug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = 2, + .procname = "ktrace", + .data = &rxrpc_ktrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = 3, + .procname = "kproto", + .data = &rxrpc_kproto, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = 4, + .procname = "knet", + .data = &rxrpc_knet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = 5, + .procname = "peertimo", + .data = &rxrpc_peer_timeout, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax + }, + { + .ctl_name = 6, + .procname = "conntimo", + .data = &rxrpc_conn_timeout, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = &proc_doulongvec_minmax + }, + { .ctl_name = 0 } +}; + +static ctl_table rxrpc_dir_sysctl_table[] = { + { + .ctl_name = 1, + .procname = "rxrpc", + .maxlen = 0, + .mode = 0555, + .child = rxrpc_sysctl_table + }, + { .ctl_name = 0 } +}; +#endif /* CONFIG_SYSCTL */ + +/*****************************************************************************/ +/* + * initialise the sysctl stuff for Rx RPC + */ +int rxrpc_sysctl_init(void) +{ +#ifdef CONFIG_SYSCTL + rxrpc_sysctl = register_sysctl_table(rxrpc_dir_sysctl_table, 0); + if (!rxrpc_sysctl) + return -ENOMEM; +#endif /* CONFIG_SYSCTL */ + + return 0; +} /* end rxrpc_sysctl_init() */ + +/*****************************************************************************/ +/* + * clean up the sysctl stuff for Rx RPC + */ +void rxrpc_sysctl_cleanup(void) +{ +#ifdef CONFIG_SYSCTL + if (rxrpc_sysctl) { + unregister_sysctl_table(rxrpc_sysctl); + rxrpc_sysctl = NULL; + } +#endif /* CONFIG_SYSCTL */ + +} /* end rxrpc_sysctl_cleanup() */ diff --git a/net/rxrpc/transport.c b/net/rxrpc/transport.c new file mode 100644 index 000000000000..9bce7794130a --- /dev/null +++ b/net/rxrpc/transport.c @@ -0,0 +1,854 @@ +/* transport.c: Rx Transport routines + * + * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) +#include /* this should _really_ be in errqueue.h.. 
*/ +#endif +#include +#include +#include +#include "internal.h" + +struct errormsg { + struct cmsghdr cmsg; /* control message header */ + struct sock_extended_err ee; /* extended error information */ + struct sockaddr_in icmp_src; /* ICMP packet source address */ +}; + +static DEFINE_SPINLOCK(rxrpc_transports_lock); +static struct list_head rxrpc_transports = LIST_HEAD_INIT(rxrpc_transports); + +__RXACCT_DECL(atomic_t rxrpc_transport_count); +LIST_HEAD(rxrpc_proc_transports); +DECLARE_RWSEM(rxrpc_proc_transports_sem); + +static void rxrpc_data_ready(struct sock *sk, int count); +static void rxrpc_error_report(struct sock *sk); +static int rxrpc_trans_receive_new_call(struct rxrpc_transport *trans, + struct list_head *msgq); +static void rxrpc_trans_receive_error_report(struct rxrpc_transport *trans); + +/*****************************************************************************/ +/* + * create a new transport endpoint using the specified UDP port + */ +int rxrpc_create_transport(unsigned short port, + struct rxrpc_transport **_trans) +{ + struct rxrpc_transport *trans; + struct sockaddr_in sin; + mm_segment_t oldfs; + struct sock *sock; + int ret, opt; + + _enter("%hu", port); + + trans = kmalloc(sizeof(struct rxrpc_transport), GFP_KERNEL); + if (!trans) + return -ENOMEM; + + memset(trans, 0, sizeof(struct rxrpc_transport)); + atomic_set(&trans->usage, 1); + INIT_LIST_HEAD(&trans->services); + INIT_LIST_HEAD(&trans->link); + INIT_LIST_HEAD(&trans->krxiodq_link); + spin_lock_init(&trans->lock); + INIT_LIST_HEAD(&trans->peer_active); + INIT_LIST_HEAD(&trans->peer_graveyard); + spin_lock_init(&trans->peer_gylock); + init_waitqueue_head(&trans->peer_gy_waitq); + rwlock_init(&trans->peer_lock); + atomic_set(&trans->peer_count, 0); + trans->port = port; + + /* create a UDP socket to be my actual transport endpoint */ + ret = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &trans->socket); + if (ret < 0) + goto error; + + /* use the specified port */ + if (port) { + memset(&sin, 0, sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = htons(port); + ret = trans->socket->ops->bind(trans->socket, + (struct sockaddr *) &sin, + sizeof(sin)); + if (ret < 0) + goto error; + } + + opt = 1; + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = trans->socket->ops->setsockopt(trans->socket, SOL_IP, IP_RECVERR, + (char *) &opt, sizeof(opt)); + set_fs(oldfs); + + spin_lock(&rxrpc_transports_lock); + list_add(&trans->link, &rxrpc_transports); + spin_unlock(&rxrpc_transports_lock); + + /* set the socket up */ + sock = trans->socket->sk; + sock->sk_user_data = trans; + sock->sk_data_ready = rxrpc_data_ready; + sock->sk_error_report = rxrpc_error_report; + + down_write(&rxrpc_proc_transports_sem); + list_add_tail(&trans->proc_link, &rxrpc_proc_transports); + up_write(&rxrpc_proc_transports_sem); + + __RXACCT(atomic_inc(&rxrpc_transport_count)); + + *_trans = trans; + _leave(" = 0 (%p)", trans); + return 0; + + error: + /* finish cleaning up the transport (not really needed here, but...) 
*/ + if (trans->socket) + trans->socket->ops->shutdown(trans->socket, 2); + + /* close the socket */ + if (trans->socket) { + trans->socket->sk->sk_user_data = NULL; + sock_release(trans->socket); + trans->socket = NULL; + } + + kfree(trans); + + + _leave(" = %d", ret); + return ret; +} /* end rxrpc_create_transport() */ + +/*****************************************************************************/ +/* + * destroy a transport endpoint + */ +void rxrpc_put_transport(struct rxrpc_transport *trans) +{ + _enter("%p{u=%d p=%hu}", + trans, atomic_read(&trans->usage), trans->port); + + BUG_ON(atomic_read(&trans->usage) <= 0); + + /* to prevent a race, the decrement and the dequeue must be + * effectively atomic */ + spin_lock(&rxrpc_transports_lock); + if (likely(!atomic_dec_and_test(&trans->usage))) { + spin_unlock(&rxrpc_transports_lock); + _leave(""); + return; + } + + list_del(&trans->link); + spin_unlock(&rxrpc_transports_lock); + + /* finish cleaning up the transport */ + if (trans->socket) + trans->socket->ops->shutdown(trans->socket, 2); + + rxrpc_krxsecd_clear_transport(trans); + rxrpc_krxiod_dequeue_transport(trans); + + /* discard all peer information */ + rxrpc_peer_clearall(trans); + + down_write(&rxrpc_proc_transports_sem); + list_del(&trans->proc_link); + up_write(&rxrpc_proc_transports_sem); + __RXACCT(atomic_dec(&rxrpc_transport_count)); + + /* close the socket */ + if (trans->socket) { + trans->socket->sk->sk_user_data = NULL; + sock_release(trans->socket); + trans->socket = NULL; + } + + kfree(trans); + + _leave(""); +} /* end rxrpc_put_transport() */ + +/*****************************************************************************/ +/* + * add a service to a transport to be listened upon + */ +int rxrpc_add_service(struct rxrpc_transport *trans, + struct rxrpc_service *newsrv) +{ + struct rxrpc_service *srv; + struct list_head *_p; + int ret = -EEXIST; + + _enter("%p{%hu},%p{%hu}", + trans, trans->port, newsrv, newsrv->service_id); + + /* verify that the service ID is not already present */ + spin_lock(&trans->lock); + + list_for_each(_p, &trans->services) { + srv = list_entry(_p, struct rxrpc_service, link); + if (srv->service_id == newsrv->service_id) + goto out; + } + + /* okay - add the transport to the list */ + list_add_tail(&newsrv->link, &trans->services); + rxrpc_get_transport(trans); + ret = 0; + + out: + spin_unlock(&trans->lock); + + _leave("= %d", ret); + return ret; +} /* end rxrpc_add_service() */ + +/*****************************************************************************/ +/* + * remove a service from a transport + */ +void rxrpc_del_service(struct rxrpc_transport *trans, struct rxrpc_service *srv) +{ + _enter("%p{%hu},%p{%hu}", trans, trans->port, srv, srv->service_id); + + spin_lock(&trans->lock); + list_del(&srv->link); + spin_unlock(&trans->lock); + + rxrpc_put_transport(trans); + + _leave(""); +} /* end rxrpc_del_service() */ + +/*****************************************************************************/ +/* + * INET callback when data has been received on the socket. 
+ */ +static void rxrpc_data_ready(struct sock *sk, int count) +{ + struct rxrpc_transport *trans; + + _enter("%p{t=%p},%d", sk, sk->sk_user_data, count); + + /* queue the transport for attention by krxiod */ + trans = (struct rxrpc_transport *) sk->sk_user_data; + if (trans) + rxrpc_krxiod_queue_transport(trans); + + /* wake up anyone waiting on the socket */ + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + _leave(""); +} /* end rxrpc_data_ready() */ + +/*****************************************************************************/ +/* + * INET callback when an ICMP error packet is received + * - sk->err is error (EHOSTUNREACH, EPROTO or EMSGSIZE) + */ +static void rxrpc_error_report(struct sock *sk) +{ + struct rxrpc_transport *trans; + + _enter("%p{t=%p}", sk, sk->sk_user_data); + + /* queue the transport for attention by krxiod */ + trans = (struct rxrpc_transport *) sk->sk_user_data; + if (trans) { + trans->error_rcvd = 1; + rxrpc_krxiod_queue_transport(trans); + } + + /* wake up anyone waiting on the socket */ + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + _leave(""); +} /* end rxrpc_error_report() */ + +/*****************************************************************************/ +/* + * split a message up, allocating message records and filling them in + * from the contents of a socket buffer + */ +static int rxrpc_incoming_msg(struct rxrpc_transport *trans, + struct sk_buff *pkt, + struct list_head *msgq) +{ + struct rxrpc_message *msg; + int ret; + + _enter(""); + + msg = kmalloc(sizeof(struct rxrpc_message), GFP_KERNEL); + if (!msg) { + _leave(" = -ENOMEM"); + return -ENOMEM; + } + + memset(msg, 0, sizeof(*msg)); + atomic_set(&msg->usage, 1); + list_add_tail(&msg->link,msgq); + + /* dig out the Rx routing parameters */ + if (skb_copy_bits(pkt, sizeof(struct udphdr), + &msg->hdr, sizeof(msg->hdr)) < 0) { + ret = -EBADMSG; + goto error; + } + + msg->trans = trans; + msg->state = RXRPC_MSG_RECEIVED; + msg->stamp = pkt->stamp; + if (msg->stamp.tv_sec == 0) { + do_gettimeofday(&msg->stamp); + if (pkt->sk) + sock_enable_timestamp(pkt->sk); + } + msg->seq = ntohl(msg->hdr.seq); + + /* attach the packet */ + skb_get(pkt); + msg->pkt = pkt; + + msg->offset = sizeof(struct udphdr) + sizeof(struct rxrpc_header); + msg->dsize = msg->pkt->len - msg->offset; + + _net("Rx Received packet from %s (%08x;%08x,%1x,%d,%s,%02x,%d,%d)", + msg->hdr.flags & RXRPC_CLIENT_INITIATED ? 
"client" : "server", + ntohl(msg->hdr.epoch), + (ntohl(msg->hdr.cid) & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT, + ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK, + ntohl(msg->hdr.callNumber), + rxrpc_pkts[msg->hdr.type], + msg->hdr.flags, + ntohs(msg->hdr.serviceId), + msg->hdr.securityIndex); + + __RXACCT(atomic_inc(&rxrpc_message_count)); + + /* split off jumbo packets */ + while (msg->hdr.type == RXRPC_PACKET_TYPE_DATA && + msg->hdr.flags & RXRPC_JUMBO_PACKET + ) { + struct rxrpc_jumbo_header jumbo; + struct rxrpc_message *jumbomsg = msg; + + _debug("split jumbo packet"); + + /* quick sanity check */ + ret = -EBADMSG; + if (msg->dsize < + RXRPC_JUMBO_DATALEN + sizeof(struct rxrpc_jumbo_header)) + goto error; + if (msg->hdr.flags & RXRPC_LAST_PACKET) + goto error; + + /* dig out the secondary header */ + if (skb_copy_bits(pkt, msg->offset + RXRPC_JUMBO_DATALEN, + &jumbo, sizeof(jumbo)) < 0) + goto error; + + /* allocate a new message record */ + ret = -ENOMEM; + msg = kmalloc(sizeof(struct rxrpc_message), GFP_KERNEL); + if (!msg) + goto error; + + memcpy(msg, jumbomsg, sizeof(*msg)); + list_add_tail(&msg->link, msgq); + + /* adjust the jumbo packet */ + jumbomsg->dsize = RXRPC_JUMBO_DATALEN; + + /* attach the packet here too */ + skb_get(pkt); + + /* adjust the parameters */ + msg->seq++; + msg->hdr.seq = htonl(msg->seq); + msg->hdr.serial = htonl(ntohl(msg->hdr.serial) + 1); + msg->offset += RXRPC_JUMBO_DATALEN + + sizeof(struct rxrpc_jumbo_header); + msg->dsize -= RXRPC_JUMBO_DATALEN + + sizeof(struct rxrpc_jumbo_header); + msg->hdr.flags = jumbo.flags; + msg->hdr._rsvd = jumbo._rsvd; + + _net("Rx Split jumbo packet from %s" + " (%08x;%08x,%1x,%d,%s,%02x,%d,%d)", + msg->hdr.flags & RXRPC_CLIENT_INITIATED ? "client" : "server", + ntohl(msg->hdr.epoch), + (ntohl(msg->hdr.cid) & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT, + ntohl(msg->hdr.cid) & RXRPC_CHANNELMASK, + ntohl(msg->hdr.callNumber), + rxrpc_pkts[msg->hdr.type], + msg->hdr.flags, + ntohs(msg->hdr.serviceId), + msg->hdr.securityIndex); + + __RXACCT(atomic_inc(&rxrpc_message_count)); + } + + _leave(" = 0 #%d", atomic_read(&rxrpc_message_count)); + return 0; + + error: + while (!list_empty(msgq)) { + msg = list_entry(msgq->next, struct rxrpc_message, link); + list_del_init(&msg->link); + + rxrpc_put_message(msg); + } + + _leave(" = %d", ret); + return ret; +} /* end rxrpc_incoming_msg() */ + +/*****************************************************************************/ +/* + * accept a new call + * - called from krxiod in process context + */ +void rxrpc_trans_receive_packet(struct rxrpc_transport *trans) +{ + struct rxrpc_message *msg; + struct rxrpc_peer *peer; + struct sk_buff *pkt; + int ret; + __be32 addr; + __be16 port; + + LIST_HEAD(msgq); + + _enter("%p{%d}", trans, trans->port); + + for (;;) { + /* deal with outstanting errors first */ + if (trans->error_rcvd) + rxrpc_trans_receive_error_report(trans); + + /* attempt to receive a packet */ + pkt = skb_recv_datagram(trans->socket->sk, 0, 1, &ret); + if (!pkt) { + if (ret == -EAGAIN) { + _leave(" EAGAIN"); + return; + } + + /* an icmp error may have occurred */ + rxrpc_krxiod_queue_transport(trans); + _leave(" error %d\n", ret); + return; + } + + /* we'll probably need to checksum it (didn't call + * sock_recvmsg) */ + if (pkt->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short) + csum_fold(skb_checksum(pkt, 0, pkt->len, + pkt->csum))) { + kfree_skb(pkt); + rxrpc_krxiod_queue_transport(trans); + _leave(" CSUM failed"); + return; + } + } + + addr = pkt->nh.iph->saddr; + port = 
pkt->h.uh->source; + + _net("Rx Received UDP packet from %08x:%04hu", + ntohl(addr), ntohs(port)); + + /* unmarshall the Rx parameters and split jumbo packets */ + ret = rxrpc_incoming_msg(trans, pkt, &msgq); + if (ret < 0) { + kfree_skb(pkt); + rxrpc_krxiod_queue_transport(trans); + _leave(" bad packet"); + return; + } + + BUG_ON(list_empty(&msgq)); + + msg = list_entry(msgq.next, struct rxrpc_message, link); + + /* locate the record for the peer from which it + * originated */ + ret = rxrpc_peer_lookup(trans, addr, &peer); + if (ret < 0) { + kdebug("Rx No connections from that peer"); + rxrpc_trans_immediate_abort(trans, msg, -EINVAL); + goto finished_msg; + } + + /* try and find a matching connection */ + ret = rxrpc_connection_lookup(peer, msg, &msg->conn); + if (ret < 0) { + kdebug("Rx Unknown Connection"); + rxrpc_trans_immediate_abort(trans, msg, -EINVAL); + rxrpc_put_peer(peer); + goto finished_msg; + } + rxrpc_put_peer(peer); + + /* deal with the first packet of a new call */ + if (msg->hdr.flags & RXRPC_CLIENT_INITIATED && + msg->hdr.type == RXRPC_PACKET_TYPE_DATA && + ntohl(msg->hdr.seq) == 1 + ) { + _debug("Rx New server call"); + rxrpc_trans_receive_new_call(trans, &msgq); + goto finished_msg; + } + + /* deal with subsequent packet(s) of call */ + _debug("Rx Call packet"); + while (!list_empty(&msgq)) { + msg = list_entry(msgq.next, struct rxrpc_message, link); + list_del_init(&msg->link); + + ret = rxrpc_conn_receive_call_packet(msg->conn, NULL, msg); + if (ret < 0) { + rxrpc_trans_immediate_abort(trans, msg, ret); + rxrpc_put_message(msg); + goto finished_msg; + } + + rxrpc_put_message(msg); + } + + goto finished_msg; + + /* dispose of the packets */ + finished_msg: + while (!list_empty(&msgq)) { + msg = list_entry(msgq.next, struct rxrpc_message, link); + list_del_init(&msg->link); + + rxrpc_put_message(msg); + } + kfree_skb(pkt); + } + + _leave(""); + +} /* end rxrpc_trans_receive_packet() */ + +/*****************************************************************************/ +/* + * accept a new call from a client trying to connect to one of my services + * - called in process context + */ +static int rxrpc_trans_receive_new_call(struct rxrpc_transport *trans, + struct list_head *msgq) +{ + struct rxrpc_message *msg; + + _enter(""); + + /* only bother with the first packet */ + msg = list_entry(msgq->next, struct rxrpc_message, link); + list_del_init(&msg->link); + rxrpc_krxsecd_queue_incoming_call(msg); + rxrpc_put_message(msg); + + _leave(" = 0"); + + return 0; +} /* end rxrpc_trans_receive_new_call() */ + +/*****************************************************************************/ +/* + * perform an immediate abort without connection or call structures + */ +int rxrpc_trans_immediate_abort(struct rxrpc_transport *trans, + struct rxrpc_message *msg, + int error) +{ + struct rxrpc_header ahdr; + struct sockaddr_in sin; + struct msghdr msghdr; + struct kvec iov[2]; + __be32 _error; + int len, ret; + + _enter("%p,%p,%d", trans, msg, error); + + /* don't abort an abort packet */ + if (msg->hdr.type == RXRPC_PACKET_TYPE_ABORT) { + _leave(" = 0"); + return 0; + } + + _error = htonl(-error); + + /* set up the message to be transmitted */ + memcpy(&ahdr, &msg->hdr, sizeof(ahdr)); + ahdr.epoch = msg->hdr.epoch; + ahdr.serial = htonl(1); + ahdr.seq = 0; + ahdr.type = RXRPC_PACKET_TYPE_ABORT; + ahdr.flags = RXRPC_LAST_PACKET; + ahdr.flags |= ~msg->hdr.flags & RXRPC_CLIENT_INITIATED; + + iov[0].iov_len = sizeof(ahdr); + iov[0].iov_base = &ahdr; + iov[1].iov_len = sizeof(_error); 
+ iov[1].iov_base = &_error; + + len = sizeof(ahdr) + sizeof(_error); + + memset(&sin,0,sizeof(sin)); + sin.sin_family = AF_INET; + sin.sin_port = msg->pkt->h.uh->source; + sin.sin_addr.s_addr = msg->pkt->nh.iph->saddr; + + msghdr.msg_name = &sin; + msghdr.msg_namelen = sizeof(sin); + msghdr.msg_control = NULL; + msghdr.msg_controllen = 0; + msghdr.msg_flags = MSG_DONTWAIT; + + _net("Sending message type %d of %d bytes to %08x:%d", + ahdr.type, + len, + ntohl(sin.sin_addr.s_addr), + ntohs(sin.sin_port)); + + /* send the message */ + ret = kernel_sendmsg(trans->socket, &msghdr, iov, 2, len); + + _leave(" = %d", ret); + return ret; +} /* end rxrpc_trans_immediate_abort() */ + +/*****************************************************************************/ +/* + * receive an ICMP error report and percolate it to all connections + * heading to the affected host or port + */ +static void rxrpc_trans_receive_error_report(struct rxrpc_transport *trans) +{ + struct rxrpc_connection *conn; + struct sockaddr_in sin; + struct rxrpc_peer *peer; + struct list_head connq, *_p; + struct errormsg emsg; + struct msghdr msg; + __be16 port; + int local, err; + + _enter("%p", trans); + + for (;;) { + trans->error_rcvd = 0; + + /* try and receive an error message */ + msg.msg_name = &sin; + msg.msg_namelen = sizeof(sin); + msg.msg_control = &emsg; + msg.msg_controllen = sizeof(emsg); + msg.msg_flags = 0; + + err = kernel_recvmsg(trans->socket, &msg, NULL, 0, 0, + MSG_ERRQUEUE | MSG_DONTWAIT | MSG_TRUNC); + + if (err == -EAGAIN) { + _leave(""); + return; + } + + if (err < 0) { + printk("%s: unable to recv an error report: %d\n", + __FUNCTION__, err); + _leave(""); + return; + } + + msg.msg_controllen = (char *) msg.msg_control - (char *) &emsg; + + if (msg.msg_controllen < sizeof(emsg.cmsg) || + msg.msg_namelen < sizeof(sin)) { + printk("%s: short control message" + " (nlen=%u clen=%Zu fl=%x)\n", + __FUNCTION__, + msg.msg_namelen, + msg.msg_controllen, + msg.msg_flags); + continue; + } + + _net("Rx Received control message" + " { len=%Zu level=%u type=%u }", + emsg.cmsg.cmsg_len, + emsg.cmsg.cmsg_level, + emsg.cmsg.cmsg_type); + + if (sin.sin_family != AF_INET) { + printk("Rx Ignoring error report with non-INET address" + " (fam=%u)", + sin.sin_family); + continue; + } + + _net("Rx Received message pertaining to host addr=%x port=%hu", + ntohl(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + + if (emsg.cmsg.cmsg_level != SOL_IP || + emsg.cmsg.cmsg_type != IP_RECVERR) { + printk("Rx Ignoring unknown error report" + " { level=%u type=%u }", + emsg.cmsg.cmsg_level, + emsg.cmsg.cmsg_type); + continue; + } + + if (msg.msg_controllen < sizeof(emsg.cmsg) + sizeof(emsg.ee)) { + printk("%s: short error message (%Zu)\n", + __FUNCTION__, msg.msg_controllen); + _leave(""); + return; + } + + port = sin.sin_port; + + switch (emsg.ee.ee_origin) { + case SO_EE_ORIGIN_ICMP: + local = 0; + switch (emsg.ee.ee_type) { + case ICMP_DEST_UNREACH: + switch (emsg.ee.ee_code) { + case ICMP_NET_UNREACH: + _net("Rx Received ICMP Network Unreachable"); + port = 0; + err = -ENETUNREACH; + break; + case ICMP_HOST_UNREACH: + _net("Rx Received ICMP Host Unreachable"); + port = 0; + err = -EHOSTUNREACH; + break; + case ICMP_PORT_UNREACH: + _net("Rx Received ICMP Port Unreachable"); + err = -ECONNREFUSED; + break; + case ICMP_NET_UNKNOWN: + _net("Rx Received ICMP Unknown Network"); + port = 0; + err = -ENETUNREACH; + break; + case ICMP_HOST_UNKNOWN: + _net("Rx Received ICMP Unknown Host"); + port = 0; + err = -EHOSTUNREACH; + break; + default: + 
_net("Rx Received ICMP DestUnreach { code=%u }", + emsg.ee.ee_code); + err = emsg.ee.ee_errno; + break; + } + break; + + case ICMP_TIME_EXCEEDED: + _net("Rx Received ICMP TTL Exceeded"); + err = emsg.ee.ee_errno; + break; + + default: + _proto("Rx Received ICMP error { type=%u code=%u }", + emsg.ee.ee_type, emsg.ee.ee_code); + err = emsg.ee.ee_errno; + break; + } + break; + + case SO_EE_ORIGIN_LOCAL: + _proto("Rx Received local error { error=%d }", + emsg.ee.ee_errno); + local = 1; + err = emsg.ee.ee_errno; + break; + + case SO_EE_ORIGIN_NONE: + case SO_EE_ORIGIN_ICMP6: + default: + _proto("Rx Received error report { orig=%u }", + emsg.ee.ee_origin); + local = 0; + err = emsg.ee.ee_errno; + break; + } + + /* find all the connections between this transport and the + * affected destination */ + INIT_LIST_HEAD(&connq); + + if (rxrpc_peer_lookup(trans, sin.sin_addr.s_addr, + &peer) == 0) { + read_lock(&peer->conn_lock); + list_for_each(_p, &peer->conn_active) { + conn = list_entry(_p, struct rxrpc_connection, + link); + if (port && conn->addr.sin_port != port) + continue; + if (!list_empty(&conn->err_link)) + continue; + + rxrpc_get_connection(conn); + list_add_tail(&conn->err_link, &connq); + } + read_unlock(&peer->conn_lock); + + /* service all those connections */ + while (!list_empty(&connq)) { + conn = list_entry(connq.next, + struct rxrpc_connection, + err_link); + list_del(&conn->err_link); + + rxrpc_conn_handle_error(conn, local, err); + + rxrpc_put_connection(conn); + } + + rxrpc_put_peer(peer); + } + } + + _leave(""); + return; +} /* end rxrpc_trans_receive_error_report() */ diff --git a/net/sched/Kconfig b/net/sched/Kconfig new file mode 100644 index 000000000000..3d1d902dd1a1 --- /dev/null +++ b/net/sched/Kconfig @@ -0,0 +1,508 @@ +# +# Traffic control configuration. +# +choice + prompt "Packet scheduler clock source" + depends on NET_SCHED + default NET_SCH_CLK_JIFFIES + help + Packet schedulers need a monotonic clock that increments at a static + rate. The kernel provides several suitable interfaces, each with + different properties: + + - high resolution (us or better) + - fast to read (minimal locking, no i/o access) + - synchronized on all processors + - handles cpu clock frequency changes + + but nothing provides all of the above. + +config NET_SCH_CLK_JIFFIES + bool "Timer interrupt" + help + Say Y here if you want to use the timer interrupt (jiffies) as clock + source. This clock source is fast, synchronized on all processors and + handles cpu clock frequency changes, but its resolution is too low + for accurate shaping except at very low speed. + +config NET_SCH_CLK_GETTIMEOFDAY + bool "gettimeofday" + help + Say Y here if you want to use gettimeofday as clock source. This clock + source has high resolution, is synchronized on all processors and + handles cpu clock frequency changes, but it is slow. + + Choose this if you need a high resolution clock source but can't use + the CPU's cycle counter. + +config NET_SCH_CLK_CPU + bool "CPU cycle counter" + depends on X86_TSC || X86_64 || ALPHA || SPARC64 || PPC64 || IA64 + help + Say Y here if you want to use the CPU's cycle counter as clock source. + This is a cheap and high resolution clock source, but on some + architectures it is not synchronized on all processors and doesn't + handle cpu clock frequency changes. 
+ + The usable cycle counters are: + + x86/x86_64 - Timestamp Counter + alpha - Cycle Counter + sparc64 - %ticks register + ppc64 - Time base + ia64 - Interval Time Counter + + Choose this if your CPU's cycle counter is working properly. + +endchoice + +config NET_SCH_CBQ + tristate "CBQ packet scheduler" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Class-Based Queueing (CBQ) packet + scheduling algorithm for some of your network devices. This + algorithm classifies the waiting packets into a tree-like hierarchy + of classes; the leaves of this tree are in turn scheduled by + separate algorithms (called "disciplines" in this context). + + See the top of for references about the + CBQ algorithm. + + CBQ is a commonly used scheduler, so if you're unsure, you should + say Y here. Then say Y to all the queueing algorithms below that you + want to use as CBQ disciplines. Then say Y to "Packet classifier + API" and say Y to all the classifiers you want to use; a classifier + is a routine that allows you to sort your outgoing traffic into + classes based on a certain criterion. + + To compile this code as a module, choose M here: the + module will be called sch_cbq. + +config NET_SCH_HTB + tristate "HTB packet scheduler" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Hierarchical Token Buckets (HTB) + packet scheduling algorithm for some of your network devices. See + for complete manual and + in-depth articles. + + HTB is very similar to CBQ regarding its goals, however it has + different properties and a different algorithm. + + To compile this code as a module, choose M here: the + module will be called sch_htb. + +config NET_SCH_HFSC + tristate "HFSC packet scheduler" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Hierarchical Fair Service Curve + (HFSC) packet scheduling algorithm for some of your network devices. + + To compile this code as a module, choose M here: the + module will be called sch_hfsc. + +#tristate ' H-PFQ packet scheduler' CONFIG_NET_SCH_HPFQ +config NET_SCH_ATM + tristate "ATM pseudo-scheduler" + depends on NET_SCHED && ATM + ---help--- + Say Y here if you want to use the ATM pseudo-scheduler. This + provides a framework for invoking classifiers (aka "filters"), which + in turn select classes of this queuing discipline. Each class maps + the flow(s) it is handling to a given virtual circuit (see the top of + ). + + To compile this code as a module, choose M here: the + module will be called sch_atm. + +config NET_SCH_PRIO + tristate "The simplest PRIO pseudoscheduler" + depends on NET_SCHED + help + Say Y here if you want to use an n-band priority queue packet + "scheduler" for some of your network devices or as a leaf discipline + for the CBQ scheduling algorithm. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called sch_prio. + +config NET_SCH_RED + tristate "RED queue" + depends on NET_SCHED + help + Say Y here if you want to use the Random Early Detection (RED) + packet scheduling algorithm for some of your network devices (see + the top of for details and references + about the algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_red.
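The RED entry above only names the algorithm; as a rough illustration of what it does, the following is a minimal user-space sketch of the classic RED drop decision: an exponentially weighted moving average of the queue length, and a drop probability that ramps linearly between two thresholds. All names here are invented for the sketch, and it omits the packet-count correction that the real sch_red implementation uses.

#include <stdlib.h>

/* Illustrative RED state: thresholds are in packets, max_p is in [0,1]. */
struct red_sketch {
	double avg;    /* EWMA of the instantaneous queue length */
	double w;      /* EWMA weight, e.g. 0.002 */
	double min_th; /* start dropping once the average exceeds this */
	double max_th; /* drop everything once the average exceeds this */
	double max_p;  /* drop probability at max_th */
};

/* Return 1 if the arriving packet should be dropped, 0 otherwise. */
static int red_should_drop(struct red_sketch *r, unsigned int qlen)
{
	double p;

	/* Update the average queue length. */
	r->avg = (1.0 - r->w) * r->avg + r->w * qlen;

	if (r->avg < r->min_th)
		return 0;
	if (r->avg >= r->max_th)
		return 1;

	/* Linear ramp between the two thresholds. */
	p = r->max_p * (r->avg - r->min_th) / (r->max_th - r->min_th);
	return ((double)rand() / RAND_MAX) < p;
}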
+ +config NET_SCH_SFQ + tristate "SFQ queue" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the Stochastic Fairness Queueing (SFQ) + packet scheduling algorithm for some of your network devices or as a + leaf discipline for the CBQ scheduling algorithm (see the top of + for details and references about the SFQ + algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_sfq. + +config NET_SCH_TEQL + tristate "TEQL queue" + depends on NET_SCHED + ---help--- + Say Y here if you want to use the True Link Equalizer (TLE) packet + scheduling algorithm for some of your network devices or as a leaf + discipline for the CBQ scheduling algorithm. This queueing + discipline allows the combination of several physical devices into + one virtual device. (see the top of for + details). + + To compile this code as a module, choose M here: the + module will be called sch_teql. + +config NET_SCH_TBF + tristate "TBF queue" + depends on NET_SCHED + help + Say Y here if you want to use the Simple Token Bucket Filter (TBF) + packet scheduling algorithm for some of your network devices or as a + leaf discipline for the CBQ scheduling algorithm (see the top of + for a description of the TBF algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_tbf. + +config NET_SCH_GRED + tristate "GRED queue" + depends on NET_SCHED + help + Say Y here if you want to use the Generic Random Early Detection + (RED) packet scheduling algorithm for some of your network devices + (see the top of for details and + references about the algorithm). + + To compile this code as a module, choose M here: the + module will be called sch_gred. + +config NET_SCH_DSMARK + tristate "Diffserv field marker" + depends on NET_SCHED + help + Say Y if you want to schedule packets according to the + Differentiated Services architecture proposed in RFC 2475. + Technical information on this method, with pointers to associated + RFCs, is available at . + + To compile this code as a module, choose M here: the + module will be called sch_dsmark. + +config NET_SCH_NETEM + tristate "Network emulator" + depends on NET_SCHED + help + Say Y if you want to emulate network delay, loss, and packet + re-ordering. This is often useful to simulate networks when + testing applications or protocols. + + To compile this driver as a module, choose M here: the module + will be called sch_netem. + + If unsure, say N. + +config NET_SCH_INGRESS + tristate "Ingress Qdisc" + depends on NET_SCHED + help + If you say Y here, you will be able to police incoming bandwidth + and drop packets when this bandwidth exceeds your desired rate. + If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called sch_ingress. + +config NET_QOS + bool "QoS support" + depends on NET_SCHED + ---help--- + Say Y here if you want to include Quality Of Service scheduling + features, which means that you will be able to request certain + rate-of-flow limits for your network devices. + + This Quality of Service (QoS) support will enable you to use + Differentiated Services (diffserv) and Resource Reservation Protocol + (RSVP) on your Linux router if you also say Y to "Packet classifier + API" and to some classifiers below. Documentation and software is at + . + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about QoS support. 
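As a companion to the Simple Token Bucket Filter (TBF) entry above, here is a minimal sketch of the token bucket idea it refers to: tokens accrue at a fixed byte rate up to a burst limit, and a packet may be sent only while enough tokens are available. The structure, field names and nanosecond timestamps are purely illustrative and are not taken from sch_tbf.

#include <stdint.h>

/* Illustrative token bucket; all quantities are in bytes. */
struct tbf_sketch {
	uint64_t rate;    /* token fill rate, bytes per second */
	uint64_t burst;   /* bucket depth, bytes */
	uint64_t tokens;  /* tokens currently available, bytes */
	uint64_t last_ns; /* time of the last refill, nanoseconds */
};

/* Return 1 if a packet of 'len' bytes may be sent at time 'now_ns',
 * consuming the tokens; return 0 if it must be delayed or dropped. */
static int tbf_may_send(struct tbf_sketch *t, uint64_t now_ns, uint32_t len)
{
	uint64_t delta_ns = now_ns - t->last_ns;

	/* Refill for the elapsed time, capped at the burst size. */
	t->tokens += delta_ns * t->rate / 1000000000ULL;
	if (t->tokens > t->burst)
		t->tokens = t->burst;
	t->last_ns = now_ns;

	if (t->tokens < len)
		return 0;

	t->tokens -= len;
	return 1;
}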
+ +config NET_ESTIMATOR + bool "Rate estimator" + depends on NET_QOS + help + In order for Quality of Service scheduling to work, the current + rate-of-flow for a network device has to be estimated; if you say Y + here, the kernel will do just that. + +config NET_CLS + bool "Packet classifier API" + depends on NET_SCHED + ---help--- + The CBQ scheduling algorithm requires that network packets which are + scheduled to be sent out over a network device be classified + according to some criterion. If you say Y here, you will get a + choice of several different packet classifiers with the following + questions. + + This will enable you to use Differentiated Services (diffserv) and + Resource Reservation Protocol (RSVP) on your Linux router. + Documentation and software is at + . + +config NET_CLS_BASIC + tristate "Basic classifier" + depends on NET_CLS + ---help--- + Say Y here if you want to be able to classify packets using + only extended matches and actions. + + To compile this code as a module, choose M here: the + module will be called cls_basic. + +config NET_CLS_TCINDEX + tristate "TC index classifier" + depends on NET_CLS + help + If you say Y here, you will be able to classify outgoing packets + according to the tc_index field of the skb. You will want this + feature if you want to implement Differentiated Services using + sch_dsmark. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called cls_tcindex. + +config NET_CLS_ROUTE4 + tristate "Routing table based classifier" + depends on NET_CLS + select NET_CLS_ROUTE + help + If you say Y here, you will be able to classify outgoing packets + according to the route table entry they matched. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called cls_route. + +config NET_CLS_ROUTE + bool + default n + +config NET_CLS_FW + tristate "Firewall based classifier" + depends on NET_CLS + help + If you say Y here, you will be able to classify outgoing packets + according to firewall criteria you specified. + + To compile this code as a module, choose M here: the + module will be called cls_fw. + +config NET_CLS_U32 + tristate "U32 classifier" + depends on NET_CLS + help + If you say Y here, you will be able to classify outgoing packets + according to their destination address. If unsure, say Y. + + To compile this code as a module, choose M here: the + module will be called cls_u32. + +config CLS_U32_PERF + bool "U32 classifier performance counters" + depends on NET_CLS_U32 + help + Gathers stats that can be used to tune u32 classifier performance. + Requires a new iproute2. + You MUST NOT turn this on if you don't have an updated iproute2. + +config NET_CLS_IND + bool "classify input device (slows things u32/fw) " + depends on NET_CLS_U32 || NET_CLS_FW + help + This option will eventually be removed when a metadata action + appears, because it slows things down a little. + Available only for u32 and fw classifiers. + Requires a new iproute2. + You MUST NOT turn this on if you don't have an updated iproute2. + +config CLS_U32_MARK + bool "Use nfmark as a key in U32 classifier" + depends on NET_CLS_U32 && NETFILTER + help + This allows you to match mark in a u32 filter. + Example: + tc filter add dev eth0 protocol ip parent 1:0 prio 5 u32 \ + match mark 0x0090 0xffff \ + match ip dst 4.4.4.4 \ + flowid 1:90 + You must use a new iproute2 to use this feature.
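The u32 classifier described above matches packets by comparing 32-bit words taken at fixed offsets against a value under a mask. The sketch below shows only that basic primitive, with invented names; the real cls_u32 arranges such keys in hash tables and supports offsets computed from packet headers.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* One u32-style key: (32-bit word at 'off') & mask must equal val.
 * mask and val are kept in the same byte order as the packet data. */
struct u32_key_sketch {
	uint32_t off;  /* byte offset into the packet */
	uint32_t mask; /* mask applied to the word */
	uint32_t val;  /* expected value after masking */
};

/* Return 1 if every key matches the packet, 0 otherwise. */
static int u32_match(const unsigned char *pkt, size_t len,
		     const struct u32_key_sketch *keys, int nkeys)
{
	int i;

	for (i = 0; i < nkeys; i++) {
		uint32_t word;

		if (keys[i].off + 4 > len)
			return 0;
		memcpy(&word, pkt + keys[i].off, sizeof(word));
		if ((word & keys[i].mask) != keys[i].val)
			return 0;
	}
	return 1;
}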
+ +config NET_CLS_RSVP + tristate "Special RSVP classifier" + depends on NET_CLS && NET_QOS + ---help--- + The Resource Reservation Protocol (RSVP) permits end systems to + request a minimum and maximum data flow rate for a connection; this + is important for real time data such as streaming sound or video. + + Say Y here if you want to be able to classify outgoing packets based + on their RSVP requests. + + To compile this code as a module, choose M here: the + module will be called cls_rsvp. + +config NET_CLS_RSVP6 + tristate "Special RSVP classifier for IPv6" + depends on NET_CLS && NET_QOS + ---help--- + The Resource Reservation Protocol (RSVP) permits end systems to + request a minimum and maximum data flow rate for a connection; this + is important for real time data such as streaming sound or video. + + Say Y here if you want to be able to classify outgoing packets based + on their RSVP requests and you are using the new Internet Protocol + IPv6 as opposed to the older and more common IPv4. + + To compile this code as a module, choose M here: the + module will be called cls_rsvp6. + +config NET_EMATCH + bool "Extended Matches" + depends on NET_CLS + ---help--- + Say Y here if you want to use extended matches on top of classifiers + and select the extended matches below. + + Extended matches are small classification helpers not worth writing + a separate classifier for. + + You must have a recent version of the iproute2 tools in order to use + extended matches. + +config NET_EMATCH_STACK + int "Stack size" + depends on NET_EMATCH + default "32" + ---help--- + Size of the local stack variable used while evaluating the tree of + ematches. Limits the depth of the tree, i.e. the number of + encapsulated precedences. Every level requires 4 bytes of additional + stack space. + +config NET_EMATCH_CMP + tristate "Simple packet data comparison" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + simple packet data comparisons for 8, 16, and 32-bit values. + + To compile this code as a module, choose M here: the + module will be called em_cmp. + +config NET_EMATCH_NBYTE + tristate "Multi byte comparison" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + multiple byte comparisons, mainly useful for IPv6 address comparisons. + + To compile this code as a module, choose M here: the + module will be called em_nbyte. + +config NET_EMATCH_U32 + tristate "U32 hashing key" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets using + the famous u32 key in combination with logic relations. + + To compile this code as a module, choose M here: the + module will be called em_u32. + +config NET_EMATCH_META + tristate "Metadata" + depends on NET_EMATCH + ---help--- + Say Y here if you want to be able to classify packets based on + metadata such as load average, netfilter attributes, socket + attributes and routing decisions. + + To compile this code as a module, choose M here: the + module will be called em_meta. + +config NET_CLS_ACT + bool "Packet ACTION" + depends on EXPERIMENTAL && NET_CLS && NET_QOS + ---help--- + This option requires you to have a new iproute2. It enables + tc extensions which can be used with tc classifiers. + You MUST NOT turn this on if you don't have an updated iproute2.
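NET_CLS_ACT attaches a chain of actions to a classifier match: each action handles the packet and returns a verdict, and a "pipe" verdict hands the packet on to the next action in the chain. The sketch below shows only that control flow, with invented type and verdict names; the real verdict codes and the chain walk live in act_api.c further down in this patch.

/* Illustrative action chain, shaped like tcf_action_exec() below:
 * the first verdict other than "pipe" ends the walk. */
enum sketch_verdict { SKETCH_OK, SKETCH_DROP, SKETCH_PIPE };

struct sketch_action {
	enum sketch_verdict (*act)(void *pkt, struct sketch_action *self);
	struct sketch_action *next;
};

static enum sketch_verdict run_action_chain(void *pkt, struct sketch_action *a)
{
	enum sketch_verdict ret = SKETCH_OK;

	for (; a != NULL; a = a->next) {
		ret = a->act(pkt, a);
		if (ret != SKETCH_PIPE)
			break;
	}
	return ret;
}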
+ +config NET_ACT_POLICE + tristate "Policing Actions" + depends on NET_CLS_ACT + ---help--- + If you are using a newer iproute2, select this one; otherwise use the + one below to select a policer. + You MUST NOT turn this on if you don't have an updated iproute2. + +config NET_ACT_GACT + tristate "generic Actions" + depends on NET_CLS_ACT + ---help--- + You must have a new iproute2 to use this feature. + This adds simple filtering actions like drop, accept, etc. + +config GACT_PROB + bool "generic Actions probability" + depends on NET_ACT_GACT + ---help--- + Allows generic actions to be randomly or deterministically used. + +config NET_ACT_MIRRED + tristate "Packet In/Egress redirection/mirror Actions" + depends on NET_CLS_ACT + ---help--- + Requires a new iproute2. + This allows packets to be mirrored or redirected to other netdevices. + +config NET_ACT_IPT + tristate "iptables Actions" + depends on NET_CLS_ACT && NETFILTER && IP_NF_IPTABLES + ---help--- + Requires a new iproute2. + This allows iptables targets to be used by tc filters. + +config NET_ACT_PEDIT + tristate "Generic Packet Editor Actions" + depends on NET_CLS_ACT + ---help--- + Requires a new iproute2. + This allows packets to be generically edited. + +config NET_CLS_POLICE + bool "Traffic policing (needed for in/egress)" + depends on NET_CLS && NET_QOS && NET_CLS_ACT!=y + help + Say Y to support traffic policing (bandwidth limits). Needed for + ingress and egress rate limiting. + diff --git a/net/sched/Makefile b/net/sched/Makefile new file mode 100644 index 000000000000..431e55786efd --- /dev/null +++ b/net/sched/Makefile @@ -0,0 +1,41 @@ +# +# Makefile for the Linux Traffic Control Unit. +# + +obj-y := sch_generic.o + +obj-$(CONFIG_NET_SCHED) += sch_api.o sch_fifo.o +obj-$(CONFIG_NET_CLS) += cls_api.o +obj-$(CONFIG_NET_CLS_ACT) += act_api.o +obj-$(CONFIG_NET_ACT_POLICE) += police.o +obj-$(CONFIG_NET_CLS_POLICE) += police.o +obj-$(CONFIG_NET_ACT_GACT) += gact.o +obj-$(CONFIG_NET_ACT_MIRRED) += mirred.o +obj-$(CONFIG_NET_ACT_IPT) += ipt.o +obj-$(CONFIG_NET_ACT_PEDIT) += pedit.o +obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o +obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o +obj-$(CONFIG_NET_SCH_HPFQ) += sch_hpfq.o +obj-$(CONFIG_NET_SCH_HFSC) += sch_hfsc.o +obj-$(CONFIG_NET_SCH_RED) += sch_red.o +obj-$(CONFIG_NET_SCH_GRED) += sch_gred.o +obj-$(CONFIG_NET_SCH_INGRESS) += sch_ingress.o +obj-$(CONFIG_NET_SCH_DSMARK) += sch_dsmark.o +obj-$(CONFIG_NET_SCH_SFQ) += sch_sfq.o +obj-$(CONFIG_NET_SCH_TBF) += sch_tbf.o +obj-$(CONFIG_NET_SCH_TEQL) += sch_teql.o +obj-$(CONFIG_NET_SCH_PRIO) += sch_prio.o +obj-$(CONFIG_NET_SCH_ATM) += sch_atm.o +obj-$(CONFIG_NET_SCH_NETEM) += sch_netem.o +obj-$(CONFIG_NET_CLS_U32) += cls_u32.o +obj-$(CONFIG_NET_CLS_ROUTE4) += cls_route.o +obj-$(CONFIG_NET_CLS_FW) += cls_fw.o +obj-$(CONFIG_NET_CLS_RSVP) += cls_rsvp.o +obj-$(CONFIG_NET_CLS_TCINDEX) += cls_tcindex.o +obj-$(CONFIG_NET_CLS_RSVP6) += cls_rsvp6.o +obj-$(CONFIG_NET_CLS_BASIC) += cls_basic.o +obj-$(CONFIG_NET_EMATCH) += ematch.o +obj-$(CONFIG_NET_EMATCH_CMP) += em_cmp.o +obj-$(CONFIG_NET_EMATCH_NBYTE) += em_nbyte.o +obj-$(CONFIG_NET_EMATCH_U32) += em_u32.o +obj-$(CONFIG_NET_EMATCH_META) += em_meta.o diff --git a/net/sched/act_api.c b/net/sched/act_api.c new file mode 100644 index 000000000000..5e6cc371b39e --- /dev/null +++ b/net/sched/act_api.c @@ -0,0 +1,894 @@ +/* + * net/sched/act_api.c Packet action API.
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Author: Jamal Hadi Salim + * + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 1 /* control */ +#define DPRINTK(format, args...) printk(KERN_DEBUG format, ##args) +#else +#define DPRINTK(format, args...) +#endif +#if 0 /* data */ +#define D2PRINTK(format, args...) printk(KERN_DEBUG format, ##args) +#else +#define D2PRINTK(format, args...) +#endif + +static struct tc_action_ops *act_base = NULL; +static DEFINE_RWLOCK(act_mod_lock); + +int tcf_register_action(struct tc_action_ops *act) +{ + struct tc_action_ops *a, **ap; + + write_lock(&act_mod_lock); + for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) { + if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) { + write_unlock(&act_mod_lock); + return -EEXIST; + } + } + act->next = NULL; + *ap = act; + write_unlock(&act_mod_lock); + return 0; +} + +int tcf_unregister_action(struct tc_action_ops *act) +{ + struct tc_action_ops *a, **ap; + int err = -ENOENT; + + write_lock(&act_mod_lock); + for (ap = &act_base; (a = *ap) != NULL; ap = &a->next) + if (a == act) + break; + if (a) { + *ap = a->next; + a->next = NULL; + err = 0; + } + write_unlock(&act_mod_lock); + return err; +} + +/* lookup by name */ +static struct tc_action_ops *tc_lookup_action_n(char *kind) +{ + struct tc_action_ops *a = NULL; + + if (kind) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (strcmp(kind, a->kind) == 0) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} + +/* lookup by rtattr */ +static struct tc_action_ops *tc_lookup_action(struct rtattr *kind) +{ + struct tc_action_ops *a = NULL; + + if (kind) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (rtattr_strcmp(kind, a->kind) == 0) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} + +#if 0 +/* lookup by id */ +static struct tc_action_ops *tc_lookup_action_id(u32 type) +{ + struct tc_action_ops *a = NULL; + + if (type) { + read_lock(&act_mod_lock); + for (a = act_base; a; a = a->next) { + if (a->type == type) { + if (!try_module_get(a->owner)) { + read_unlock(&act_mod_lock); + return NULL; + } + break; + } + } + read_unlock(&act_mod_lock); + } + return a; +} +#endif + +int tcf_action_exec(struct sk_buff *skb, struct tc_action *act, + struct tcf_result *res) +{ + struct tc_action *a; + int ret = -1; + + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); + D2PRINTK("(%p)tcf_action_exec: cleared TC_NCLS in %s out %s\n", + skb, skb->input_dev ? 
skb->input_dev->name : "xxx", + skb->dev->name); + ret = TC_ACT_OK; + goto exec_done; + } + while ((a = act) != NULL) { +repeat: + if (a->ops && a->ops->act) { + ret = a->ops->act(&skb, a); + if (TC_MUNGED & skb->tc_verd) { + /* copied already, allow trampling */ + skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd); + skb->tc_verd = CLR_TC_MUNGED(skb->tc_verd); + } + if (ret != TC_ACT_PIPE) + goto exec_done; + if (ret == TC_ACT_REPEAT) + goto repeat; /* we need a ttl - JHS */ + } + act = a->next; + } +exec_done: + if (skb->tc_classid > 0) { + res->classid = skb->tc_classid; + res->class = 0; + skb->tc_classid = 0; + } + return ret; +} + +void tcf_action_destroy(struct tc_action *act, int bind) +{ + struct tc_action *a; + + for (a = act; a; a = act) { + if (a->ops && a->ops->cleanup) { + DPRINTK("tcf_action_destroy destroying %p next %p\n", + a, a->next); + if (a->ops->cleanup(a, bind) == ACT_P_DELETED) + module_put(a->ops->owner); + act = act->next; + kfree(a); + } else { /*FIXME: Remove later - catch insertion bugs*/ + printk("tcf_action_destroy: BUG? destroying NULL ops\n"); + act = act->next; + kfree(a); + } + } +} + +int +tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + + if (a->ops == NULL || a->ops->dump == NULL) + return err; + return a->ops->dump(skb, a, bind, ref); +} + +int +tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + int err = -EINVAL; + unsigned char *b = skb->tail; + struct rtattr *r; + + if (a->ops == NULL || a->ops->dump == NULL) + return err; + + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, a->ops->kind); + if (tcf_action_copy_stats(skb, a, 0)) + goto rtattr_failure; + r = (struct rtattr*) skb->tail; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if ((err = tcf_action_dump_old(skb, a, bind, ref)) > 0) { + r->rta_len = skb->tail - (u8*)r; + return err; + } + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int +tcf_action_dump(struct sk_buff *skb, struct tc_action *act, int bind, int ref) +{ + struct tc_action *a; + int err = -EINVAL; + unsigned char *b = skb->tail; + struct rtattr *r ; + + while ((a = act) != NULL) { + r = (struct rtattr*) skb->tail; + act = a->next; + RTA_PUT(skb, a->order, 0, NULL); + err = tcf_action_dump_1(skb, a, bind, ref); + if (err < 0) + goto rtattr_failure; + r->rta_len = skb->tail - (u8*)r; + } + + return 0; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -err; +} + +struct tc_action *tcf_action_init_1(struct rtattr *rta, struct rtattr *est, + char *name, int ovr, int bind, int *err) +{ + struct tc_action *a; + struct tc_action_ops *a_o; + char act_name[IFNAMSIZ]; + struct rtattr *tb[TCA_ACT_MAX+1]; + struct rtattr *kind; + + *err = -EINVAL; + + if (name == NULL) { + if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + goto err_out; + kind = tb[TCA_ACT_KIND-1]; + if (kind == NULL) + goto err_out; + if (rtattr_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) + goto err_out; + } else { + if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) + goto err_out; + } + + a_o = tc_lookup_action_n(act_name); + if (a_o == NULL) { +#ifdef CONFIG_KMOD + rtnl_unlock(); + request_module(act_name); + rtnl_lock(); + + a_o = tc_lookup_action_n(act_name); + + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * tell the caller to replay the request. We + * indicate this using -EAGAIN. 
+ */ + if (a_o != NULL) { + *err = -EAGAIN; + goto err_mod; + } +#endif + goto err_out; + } + + *err = -ENOMEM; + a = kmalloc(sizeof(*a), GFP_KERNEL); + if (a == NULL) + goto err_mod; + memset(a, 0, sizeof(*a)); + + /* backward compatibility for policer */ + if (name == NULL) + *err = a_o->init(tb[TCA_ACT_OPTIONS-1], est, a, ovr, bind); + else + *err = a_o->init(rta, est, a, ovr, bind); + if (*err < 0) + goto err_free; + + /* module count goes up only when brand new policy is created + if it exists and is only bound to in a_o->init() then + ACT_P_CREATED is not returned (a zero is). + */ + if (*err != ACT_P_CREATED) + module_put(a_o->owner); + a->ops = a_o; + DPRINTK("tcf_action_init_1: successfull %s\n", act_name); + + *err = 0; + return a; + +err_free: + kfree(a); +err_mod: + module_put(a_o->owner); +err_out: + return NULL; +} + +struct tc_action *tcf_action_init(struct rtattr *rta, struct rtattr *est, + char *name, int ovr, int bind, int *err) +{ + struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; + struct tc_action *head = NULL, *act, *act_prev = NULL; + int i; + + if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) { + *err = -EINVAL; + return head; + } + + for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_init_1(tb[i], est, name, ovr, bind, err); + if (act == NULL) + goto err; + act->order = i+1; + + if (head == NULL) + head = act; + else + act_prev->next = act; + act_prev = act; + } + return head; + +err: + if (head != NULL) + tcf_action_destroy(head, bind); + return NULL; +} + +int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *a, + int compat_mode) +{ + int err = 0; + struct gnet_dump d; + struct tcf_act_hdr *h = a->priv; + + if (h == NULL) + goto errout; + + /* compat_mode being true specifies a call that is supposed + * to add additional backward compatiblity statistic TLVs. 
+ */ + if (compat_mode) { + if (a->type == TCA_OLD_COMPAT) + err = gnet_stats_start_copy_compat(skb, 0, + TCA_STATS, TCA_XSTATS, h->stats_lock, &d); + else + return 0; + } else + err = gnet_stats_start_copy(skb, TCA_ACT_STATS, + h->stats_lock, &d); + + if (err < 0) + goto errout; + + if (a->ops != NULL && a->ops->get_stats != NULL) + if (a->ops->get_stats(skb, a) < 0) + goto errout; + + if (gnet_stats_copy_basic(&d, &h->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(&d, &h->rate_est) < 0 || +#endif + gnet_stats_copy_queue(&d, &h->qstats) < 0) + goto errout; + + if (gnet_stats_finish_copy(&d) < 0) + goto errout; + + return 0; + +errout: + return -1; +} + +static int +tca_get_fill(struct sk_buff *skb, struct tc_action *a, u32 pid, u32 seq, + unsigned flags, int event, int bind, int ref) +{ + struct tcamsg *t; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rtattr *x; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); + nlh->nlmsg_flags = flags; + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr*) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + if (tcf_action_dump(skb, a, bind, ref) < 0) + goto rtattr_failure; + + x->rta_len = skb->tail - (u8*)x; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +act_get_notify(u32 pid, struct nlmsghdr *n, struct tc_action *a, int event) +{ + struct sk_buff *skb; + int err = 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + if (tca_get_fill(skb, a, pid, n->nlmsg_seq, 0, event, 0, 0) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + err = netlink_unicast(rtnl, skb, pid, MSG_DONTWAIT); + if (err > 0) + err = 0; + return err; +} + +static struct tc_action * +tcf_action_get_1(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int *err) +{ + struct rtattr *tb[TCA_ACT_MAX+1]; + struct tc_action *a; + int index; + + *err = -EINVAL; + if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + return NULL; + + if (tb[TCA_ACT_INDEX - 1] == NULL || + RTA_PAYLOAD(tb[TCA_ACT_INDEX - 1]) < sizeof(index)) + return NULL; + index = *(int *)RTA_DATA(tb[TCA_ACT_INDEX - 1]); + + *err = -ENOMEM; + a = kmalloc(sizeof(struct tc_action), GFP_KERNEL); + if (a == NULL) + return NULL; + memset(a, 0, sizeof(struct tc_action)); + + *err = -EINVAL; + a->ops = tc_lookup_action(tb[TCA_ACT_KIND - 1]); + if (a->ops == NULL) + goto err_free; + if (a->ops->lookup == NULL) + goto err_mod; + *err = -ENOENT; + if (a->ops->lookup(a, index) == 0) + goto err_mod; + + module_put(a->ops->owner); + *err = 0; + return a; +err_mod: + module_put(a->ops->owner); +err_free: + kfree(a); + return NULL; +} + +static void cleanup_a(struct tc_action *act) +{ + struct tc_action *a; + + for (a = act; a; a = act) { + act = a->next; + kfree(a); + } +} + +static struct tc_action *create_a(int i) +{ + struct tc_action *act; + + act = kmalloc(sizeof(*act), GFP_KERNEL); + if (act == NULL) { + printk("create_a: failed to alloc!\n"); + return NULL; + } + memset(act, 0, sizeof(*act)); + act->order = i; + return act; +} + +static int tca_action_flush(struct rtattr *rta, struct nlmsghdr *n, u32 pid) +{ + struct sk_buff *skb; + unsigned char *b; + struct nlmsghdr *nlh; + struct tcamsg *t; + struct netlink_callback dcb; + struct rtattr *x; + struct rtattr *tb[TCA_ACT_MAX+1]; + struct rtattr *kind; + struct tc_action *a = create_a(0); + int err = -EINVAL; + + if (a == NULL) { + printk("tca_action_flush: couldnt create 
tc_action\n"); + return err; + } + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) { + printk("tca_action_flush: failed skb alloc\n"); + kfree(a); + return -ENOBUFS; + } + + b = (unsigned char *)skb->tail; + + if (rtattr_parse_nested(tb, TCA_ACT_MAX, rta) < 0) + goto err_out; + + kind = tb[TCA_ACT_KIND-1]; + a->ops = tc_lookup_action(kind); + if (a->ops == NULL) + goto err_out; + + nlh = NLMSG_PUT(skb, pid, n->nlmsg_seq, RTM_DELACTION, sizeof(*t)); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr *) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + err = a->ops->walk(skb, &dcb, RTM_DELACTION, a); + if (err < 0) + goto rtattr_failure; + + x->rta_len = skb->tail - (u8 *) x; + + nlh->nlmsg_len = skb->tail - b; + nlh->nlmsg_flags |= NLM_F_ROOT; + module_put(a->ops->owner); + kfree(a); + err = rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + if (err > 0) + return 0; + + return err; + +rtattr_failure: + module_put(a->ops->owner); +nlmsg_failure: +err_out: + kfree_skb(skb); + kfree(a); + return err; +} + +static int +tca_action_gd(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int event) +{ + int i, ret = 0; + struct rtattr *tb[TCA_ACT_MAX_PRIO+1]; + struct tc_action *head = NULL, *act, *act_prev = NULL; + + if (rtattr_parse_nested(tb, TCA_ACT_MAX_PRIO, rta) < 0) + return -EINVAL; + + if (event == RTM_DELACTION && n->nlmsg_flags&NLM_F_ROOT) { + if (tb[0] != NULL && tb[1] == NULL) + return tca_action_flush(tb[0], n, pid); + } + + for (i=0; i < TCA_ACT_MAX_PRIO && tb[i]; i++) { + act = tcf_action_get_1(tb[i], n, pid, &ret); + if (act == NULL) + goto err; + act->order = i+1; + + if (head == NULL) + head = act; + else + act_prev->next = act; + act_prev = act; + } + + if (event == RTM_GETACTION) + ret = act_get_notify(pid, n, head, event); + else { /* delete */ + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) { + ret = -ENOBUFS; + goto err; + } + + if (tca_get_fill(skb, head, pid, n->nlmsg_seq, 0, event, + 0, 1) <= 0) { + kfree_skb(skb); + ret = -EINVAL; + goto err; + } + + /* now do the delete */ + tcf_action_destroy(head, 0); + ret = rtnetlink_send(skb, pid, RTMGRP_TC, + n->nlmsg_flags&NLM_F_ECHO); + if (ret > 0) + return 0; + return ret; + } +err: + cleanup_a(head); + return ret; +} + +static int tcf_add_notify(struct tc_action *a, u32 pid, u32 seq, int event, + unsigned flags) +{ + struct tcamsg *t; + struct nlmsghdr *nlh; + struct sk_buff *skb; + struct rtattr *x; + unsigned char *b; + int err = 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + b = (unsigned char *)skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*t)); + nlh->nlmsg_flags = flags; + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr*) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + if (tcf_action_dump(skb, a, 0, 0) < 0) + goto rtattr_failure; + + x->rta_len = skb->tail - (u8*)x; + + nlh->nlmsg_len = skb->tail - b; + NETLINK_CB(skb).dst_groups = RTMGRP_TC; + + err = rtnetlink_send(skb, pid, RTMGRP_TC, flags&NLM_F_ECHO); + if (err > 0) + err = 0; + return err; + +rtattr_failure: +nlmsg_failure: + skb_trim(skb, b - skb->data); + return -1; +} + + +static int +tcf_action_add(struct rtattr *rta, struct nlmsghdr *n, u32 pid, int ovr) +{ + int ret = 0; + struct tc_action *act; + struct tc_action *a; + u32 seq = n->nlmsg_seq; + + act = tcf_action_init(rta, NULL, NULL, ovr, 0, &ret); + if (act == NULL) + goto done; + + /* dump then free all the actions after update; 
inserted policy + * stays intact + * */ + ret = tcf_add_notify(act, pid, seq, RTM_NEWACTION, n->nlmsg_flags); + for (a = act; a; a = act) { + act = a->next; + kfree(a); + } +done: + return ret; +} + +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca = arg; + u32 pid = skb ? NETLINK_CB(skb).pid : 0; + int ret = 0, ovr = 0; + + if (tca[TCA_ACT_TAB-1] == NULL) { + printk("tc_ctl_action: received NO action attribs\n"); + return -EINVAL; + } + + /* n->nlmsg_flags&NLM_F_CREATE + * */ + switch (n->nlmsg_type) { + case RTM_NEWACTION: + /* we are going to assume all other flags + * imply create only if it doesnt exist + * Note that CREATE | EXCL implies that + * but since we want avoid ambiguity (eg when flags + * is zero) then just set this + */ + if (n->nlmsg_flags&NLM_F_REPLACE) + ovr = 1; +replay: + ret = tcf_action_add(tca[TCA_ACT_TAB-1], n, pid, ovr); + if (ret == -EAGAIN) + goto replay; + break; + case RTM_DELACTION: + ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_DELACTION); + break; + case RTM_GETACTION: + ret = tca_action_gd(tca[TCA_ACT_TAB-1], n, pid, RTM_GETACTION); + break; + default: + BUG(); + } + + return ret; +} + +static char * +find_dump_kind(struct nlmsghdr *n) +{ + struct rtattr *tb1, *tb2[TCA_ACT_MAX+1]; + struct rtattr *tb[TCA_ACT_MAX_PRIO + 1]; + struct rtattr *rta[TCAA_MAX + 1]; + struct rtattr *kind; + int min_len = NLMSG_LENGTH(sizeof(struct tcamsg)); + int attrlen = n->nlmsg_len - NLMSG_ALIGN(min_len); + struct rtattr *attr = (void *) n + NLMSG_ALIGN(min_len); + + if (rtattr_parse(rta, TCAA_MAX, attr, attrlen) < 0) + return NULL; + tb1 = rta[TCA_ACT_TAB - 1]; + if (tb1 == NULL) + return NULL; + + if (rtattr_parse(tb, TCA_ACT_MAX_PRIO, RTA_DATA(tb1), + NLMSG_ALIGN(RTA_PAYLOAD(tb1))) < 0) + return NULL; + if (tb[0] == NULL) + return NULL; + + if (rtattr_parse(tb2, TCA_ACT_MAX, RTA_DATA(tb[0]), + RTA_PAYLOAD(tb[0])) < 0) + return NULL; + kind = tb2[TCA_ACT_KIND-1]; + + return (char *) RTA_DATA(kind); +} + +static int +tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct rtattr *x; + struct tc_action_ops *a_o; + struct tc_action a; + int ret = 0; + struct tcamsg *t = (struct tcamsg *) NLMSG_DATA(cb->nlh); + char *kind = find_dump_kind(cb->nlh); + + if (kind == NULL) { + printk("tc_dump_action: action bad kind\n"); + return 0; + } + + a_o = tc_lookup_action_n(kind); + if (a_o == NULL) { + printk("failed to find %s\n", kind); + return 0; + } + + memset(&a, 0, sizeof(struct tc_action)); + a.ops = a_o; + + if (a_o->walk == NULL) { + printk("tc_dump_action: %s !capable of dumping table\n", kind); + goto rtattr_failure; + } + + nlh = NLMSG_PUT(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, + cb->nlh->nlmsg_type, sizeof(*t)); + t = NLMSG_DATA(nlh); + t->tca_family = AF_UNSPEC; + + x = (struct rtattr *) skb->tail; + RTA_PUT(skb, TCA_ACT_TAB, 0, NULL); + + ret = a_o->walk(skb, cb, RTM_GETACTION, &a); + if (ret < 0) + goto rtattr_failure; + + if (ret > 0) { + x->rta_len = skb->tail - (u8 *) x; + ret = skb->len; + } else + skb_trim(skb, (u8*)x - skb->data); + + nlh->nlmsg_len = skb->tail - b; + if (NETLINK_CB(cb->skb).pid && ret) + nlh->nlmsg_flags |= NLM_F_MULTI; + module_put(a_o->owner); + return skb->len; + +rtattr_failure: +nlmsg_failure: + module_put(a_o->owner); + skb_trim(skb, b - skb->data); + return skb->len; +} + +static int __init tc_action_init(void) +{ + struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + + if (link_p) { + 
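/*
 * Editor's note -- an illustrative sketch, not part of the original patch.
 * The assignments below hook the action commands into rtnetlink's per-family
 * dispatch table: the table is indexed by "message type minus RTM_BASE", a
 * doit handler serves single requests and a dumpit handler serves table
 * dumps (only RTM_GETACTION needs one).  A self-contained userspace model of
 * that idea, with made-up names (toy_link, toy_ctl, toy_dump) and message
 * type values given purely for illustration:
 *
 *     #include <stdio.h>
 *
 *     #define RTM_BASE      16
 *     #define RTM_NEWACTION 48
 *     #define RTM_DELACTION 49
 *     #define RTM_GETACTION 50
 *
 *     struct toy_link {
 *             int (*doit)(int msgtype);    // handles one request
 *             int (*dumpit)(int msgtype);  // handles a dump (GET only)
 *     };
 *
 *     static struct toy_link links[64];    // indexed by msgtype - RTM_BASE
 *
 *     static int toy_ctl(int t)  { printf("doit %d\n", t);   return 0; }
 *     static int toy_dump(int t) { printf("dumpit %d\n", t); return 0; }
 *
 *     int main(void)
 *     {
 *             links[RTM_NEWACTION - RTM_BASE].doit   = toy_ctl;
 *             links[RTM_DELACTION - RTM_BASE].doit   = toy_ctl;
 *             links[RTM_GETACTION - RTM_BASE].doit   = toy_ctl;
 *             links[RTM_GETACTION - RTM_BASE].dumpit = toy_dump;
 *
 *             // dispatching then reduces to a table lookup:
 *             return links[RTM_GETACTION - RTM_BASE].doit(RTM_GETACTION);
 *     }
 */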
link_p[RTM_NEWACTION-RTM_BASE].doit = tc_ctl_action; + link_p[RTM_DELACTION-RTM_BASE].doit = tc_ctl_action; + link_p[RTM_GETACTION-RTM_BASE].doit = tc_ctl_action; + link_p[RTM_GETACTION-RTM_BASE].dumpit = tc_dump_action; + } + + printk("TC classifier action (bugs to netdev@oss.sgi.com cc " + "hadi@cyberus.ca)\n"); + return 0; +} + +subsys_initcall(tc_action_init); + +EXPORT_SYMBOL(tcf_register_action); +EXPORT_SYMBOL(tcf_unregister_action); +EXPORT_SYMBOL(tcf_action_exec); +EXPORT_SYMBOL(tcf_action_dump_1); diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c new file mode 100644 index 000000000000..56e66c3fe0fa --- /dev/null +++ b/net/sched/cls_api.c @@ -0,0 +1,642 @@ +/* + * net/sched/cls_api.c Packet classifier API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * + * Eduardo J. Blanco :990222: kmod support + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 0 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +/* The list of all installed classifier types */ + +static struct tcf_proto_ops *tcf_proto_base; + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static DEFINE_RWLOCK(cls_mod_lock); + +/* Find classifier type by string name */ + +static struct tcf_proto_ops * tcf_proto_lookup_ops(struct rtattr *kind) +{ + struct tcf_proto_ops *t = NULL; + + if (kind) { + read_lock(&cls_mod_lock); + for (t = tcf_proto_base; t; t = t->next) { + if (rtattr_strcmp(kind, t->kind) == 0) { + if (!try_module_get(t->owner)) + t = NULL; + break; + } + } + read_unlock(&cls_mod_lock); + } + return t; +} + +/* Register(unregister) new classifier type */ + +int register_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + int rc = -EEXIST; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t = *tp) != NULL; tp = &t->next) + if (!strcmp(ops->kind, t->kind)) + goto out; + + ops->next = NULL; + *tp = ops; + rc = 0; +out: + write_unlock(&cls_mod_lock); + return rc; +} + +int unregister_tcf_proto_ops(struct tcf_proto_ops *ops) +{ + struct tcf_proto_ops *t, **tp; + int rc = -ENOENT; + + write_lock(&cls_mod_lock); + for (tp = &tcf_proto_base; (t=*tp) != NULL; tp = &t->next) + if (t == ops) + break; + + if (!t) + goto out; + *tp = t->next; + rc = 0; +out: + write_unlock(&cls_mod_lock); + return rc; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event); + + +/* Select new prio value from the range, managed by kernel. 
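
   (Editor's note -- an illustrative addition, not part of the original
   patch.)  When the user gives no priority, the kernel allocates one: the
   first filter on an empty chain gets 0xC0000000 (the priority lives in the
   upper 16 bits), and each later auto-prioritised filter gets one less than
   the priority of the existing filter it is inserted in front of, so it
   sorts ahead of it.  A minimal sketch of that rule (auto_prio is a made-up
   name, not the kernel helper):

       unsigned int auto_prio(unsigned int head_prio)
       {
               // empty chain: start at 0xC0000000, else step just below the head
               return head_prio ? head_prio - 1 : 0xC0000000u;
       }

       auto_prio(0)          == 0xC0000000
       auto_prio(0xC0000000) == 0xBFFFFFFF
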
*/ + +static __inline__ u32 tcf_auto_prio(struct tcf_proto *tp) +{ + u32 first = TC_H_MAKE(0xC0000000U,0U); + + if (tp) + first = tp->prio-1; + + return first; +} + +/* Add/change/delete/get a filter node */ + +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct rtattr **tca; + struct tcmsg *t; + u32 protocol; + u32 prio; + u32 nprio; + u32 parent; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto **back, **chain; + struct tcf_proto *tp; + struct tcf_proto_ops *tp_ops; + struct Qdisc_class_ops *cops; + unsigned long cl; + unsigned long fh; + int err; + +replay: + tca = arg; + t = NLMSG_DATA(n); + protocol = TC_H_MIN(t->tcm_info); + prio = TC_H_MAJ(t->tcm_info); + nprio = prio; + parent = t->tcm_parent; + cl = 0; + + if (prio == 0) { + /* If no priority is given, user wants we allocated it. */ + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + prio = TC_H_MAKE(0x80000000U,0U); + } + + /* Find head of filter chain. */ + + /* Find link */ + if ((dev = __dev_get_by_index(t->tcm_ifindex)) == NULL) + return -ENODEV; + + /* Find qdisc */ + if (!parent) { + q = dev->qdisc_sleeping; + parent = q->handle; + } else if ((q = qdisc_lookup(dev, TC_H_MAJ(t->tcm_parent))) == NULL) + return -EINVAL; + + /* Is it classful? */ + if ((cops = q->ops->cl_ops) == NULL) + return -EINVAL; + + /* Do we search for filter, attached to class? */ + if (TC_H_MIN(parent)) { + cl = cops->get(q, parent); + if (cl == 0) + return -ENOENT; + } + + /* And the last stroke */ + chain = cops->tcf_chain(q, cl); + err = -EINVAL; + if (chain == NULL) + goto errout; + + /* Check the chain for existence of proto-tcf with this priority */ + for (back = chain; (tp=*back) != NULL; back = &tp->next) { + if (tp->prio >= prio) { + if (tp->prio == prio) { + if (!nprio || (tp->protocol != protocol && protocol)) + goto errout; + } else + tp = NULL; + break; + } + } + + if (tp == NULL) { + /* Proto-tcf does not exist, create new one */ + + if (tca[TCA_KIND-1] == NULL || !protocol) + goto errout; + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + + + /* Create new proto tcf */ + + err = -ENOBUFS; + if ((tp = kmalloc(sizeof(*tp), GFP_KERNEL)) == NULL) + goto errout; + err = -EINVAL; + tp_ops = tcf_proto_lookup_ops(tca[TCA_KIND-1]); + if (tp_ops == NULL) { +#ifdef CONFIG_KMOD + struct rtattr *kind = tca[TCA_KIND-1]; + char name[IFNAMSIZ]; + + if (kind != NULL && + rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + rtnl_unlock(); + request_module("cls_%s", name); + rtnl_lock(); + tp_ops = tcf_proto_lookup_ops(kind); + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * replay the request. We indicate this using + * -EAGAIN. + */ + if (tp_ops != NULL) { + module_put(tp_ops->owner); + err = -EAGAIN; + } + } +#endif + kfree(tp); + goto errout; + } + memset(tp, 0, sizeof(*tp)); + tp->ops = tp_ops; + tp->protocol = protocol; + tp->prio = nprio ? 
: tcf_auto_prio(*back); + tp->q = q; + tp->classify = tp_ops->classify; + tp->classid = parent; + if ((err = tp_ops->init(tp)) != 0) { + module_put(tp_ops->owner); + kfree(tp); + goto errout; + } + + qdisc_lock_tree(dev); + tp->next = *back; + *back = tp; + qdisc_unlock_tree(dev); + + } else if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], tp->ops->kind)) + goto errout; + + fh = tp->ops->get(tp, t->tcm_handle); + + if (fh == 0) { + if (n->nlmsg_type == RTM_DELTFILTER && t->tcm_handle == 0) { + qdisc_lock_tree(dev); + *back = tp->next; + qdisc_unlock_tree(dev); + + tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + tcf_destroy(tp); + err = 0; + goto errout; + } + + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTFILTER || !(n->nlmsg_flags&NLM_F_CREATE)) + goto errout; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTFILTER: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto errout; + break; + case RTM_DELTFILTER: + err = tp->ops->delete(tp, fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_DELTFILTER); + goto errout; + case RTM_GETTFILTER: + err = tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + goto errout; + default: + err = -EINVAL; + goto errout; + } + } + + err = tp->ops->change(tp, cl, t->tcm_handle, tca, &fh); + if (err == 0) + tfilter_notify(skb, n, tp, fh, RTM_NEWTFILTER); + +errout: + if (cl) + cops->put(q, cl); + if (err == -EAGAIN) + /* Replay the request. */ + goto replay; + return err; +} + +static int +tcf_fill_node(struct sk_buff *skb, struct tcf_proto *tp, unsigned long fh, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = tp->q->dev->ifindex; + tcm->tcm_parent = tp->classid; + tcm->tcm_info = TC_H_MAKE(tp->prio, tp->protocol); + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, tp->ops->kind); + tcm->tcm_handle = fh; + if (RTM_DELTFILTER != event) { + tcm->tcm_handle = 0; + if (tp->ops->dump && tp->ops->dump(tp, fh, skb, tcm) < 0) + goto rtattr_failure; + } + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tfilter_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct tcf_proto *tp, unsigned long fh, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? 
NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tcf_fill_node(skb, tp, fh, pid, n->nlmsg_seq, 0, event) <= 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct tcf_dump_args +{ + struct tcf_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int tcf_node_dump(struct tcf_proto *tp, unsigned long n, struct tcf_walker *arg) +{ + struct tcf_dump_args *a = (void*)arg; + + return tcf_fill_node(a->skb, tp, n, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER); +} + +static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcf_proto *tp, **chain; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + unsigned long cl = 0; + struct Qdisc_class_ops *cops; + struct tcf_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return skb->len; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return skb->len; + + read_lock_bh(&qdisc_tree_lock); + if (!tcm->tcm_parent) + q = dev->qdisc_sleeping; + else + q = qdisc_lookup(dev, TC_H_MAJ(tcm->tcm_parent)); + if (!q) + goto out; + if ((cops = q->ops->cl_ops) == NULL) + goto errout; + if (TC_H_MIN(tcm->tcm_parent)) { + cl = cops->get(q, tcm->tcm_parent); + if (cl == 0) + goto errout; + } + chain = cops->tcf_chain(q, cl); + if (chain == NULL) + goto errout; + + s_t = cb->args[0]; + + for (tp=*chain, t=0; tp; tp = tp->next, t++) { + if (t < s_t) continue; + if (TC_H_MAJ(tcm->tcm_info) && + TC_H_MAJ(tcm->tcm_info) != tp->prio) + continue; + if (TC_H_MIN(tcm->tcm_info) && + TC_H_MIN(tcm->tcm_info) != tp->protocol) + continue; + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + if (cb->args[1] == 0) { + if (tcf_fill_node(skb, tp, 0, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTFILTER) <= 0) { + break; + } + cb->args[1] = 1; + } + if (tp->ops->walk == NULL) + continue; + arg.w.fn = tcf_node_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]-1; + arg.w.count = 0; + tp->ops->walk(tp, &arg.w); + cb->args[1] = arg.w.count+1; + if (arg.w.stop) + break; + } + + cb->args[0] = t; + +errout: + if (cl) + cops->put(q, cl); +out: + read_unlock_bh(&qdisc_tree_lock); + dev_put(dev); + return skb->len; +} + +void +tcf_exts_destroy(struct tcf_proto *tp, struct tcf_exts *exts) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->action) { + tcf_action_destroy(exts->action, TCA_ACT_UNBIND); + exts->action = NULL; + } +#elif defined CONFIG_NET_CLS_POLICE + if (exts->police) { + tcf_police_release(exts->police, TCA_ACT_UNBIND); + exts->police = NULL; + } +#endif +} + + +int +tcf_exts_validate(struct tcf_proto *tp, struct rtattr **tb, + struct rtattr *rate_tlv, struct tcf_exts *exts, + struct tcf_ext_map *map) +{ + memset(exts, 0, sizeof(*exts)); + +#ifdef CONFIG_NET_CLS_ACT + { + int err; + struct tc_action *act; + + if (map->police && tb[map->police-1]) { + act = tcf_action_init_1(tb[map->police-1], rate_tlv, "police", + TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); + if (act == NULL) + return err; + + act->type = TCA_OLD_COMPAT; + exts->action = act; + } else if (map->action && tb[map->action-1]) { + act = tcf_action_init(tb[map->action-1], rate_tlv, NULL, + TCA_ACT_NOREPLACE, TCA_ACT_BIND, &err); + if (act == NULL) + return err; + + exts->action = act; + } + } +#elif defined 
CONFIG_NET_CLS_POLICE + if (map->police && tb[map->police-1]) { + struct tcf_police *p; + + p = tcf_police_locate(tb[map->police-1], rate_tlv); + if (p == NULL) + return -EINVAL; + + exts->police = p; + } else if (map->action && tb[map->action-1]) + return -EOPNOTSUPP; +#else + if ((map->action && tb[map->action-1]) || + (map->police && tb[map->police-1])) + return -EOPNOTSUPP; +#endif + + return 0; +} + +void +tcf_exts_change(struct tcf_proto *tp, struct tcf_exts *dst, + struct tcf_exts *src) +{ +#ifdef CONFIG_NET_CLS_ACT + if (src->action) { + struct tc_action *act; + tcf_tree_lock(tp); + act = xchg(&dst->action, src->action); + tcf_tree_unlock(tp); + if (act) + tcf_action_destroy(act, TCA_ACT_UNBIND); + } +#elif defined CONFIG_NET_CLS_POLICE + if (src->police) { + struct tcf_police *p; + tcf_tree_lock(tp); + p = xchg(&dst->police, src->police); + tcf_tree_unlock(tp); + if (p) + tcf_police_release(p, TCA_ACT_UNBIND); + } +#endif +} + +int +tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts, + struct tcf_ext_map *map) +{ +#ifdef CONFIG_NET_CLS_ACT + if (map->action && exts->action) { + /* + * again for backward compatible mode - we want + * to work with both old and new modes of entering + * tc data even if iproute2 was newer - jhs + */ + struct rtattr * p_rta = (struct rtattr*) skb->tail; + + if (exts->action->type != TCA_OLD_COMPAT) { + RTA_PUT(skb, map->action, 0, NULL); + if (tcf_action_dump(skb, exts->action, 0, 0) < 0) + goto rtattr_failure; + p_rta->rta_len = skb->tail - (u8*)p_rta; + } else if (map->police) { + RTA_PUT(skb, map->police, 0, NULL); + if (tcf_action_dump_old(skb, exts->action, 0, 0) < 0) + goto rtattr_failure; + p_rta->rta_len = skb->tail - (u8*)p_rta; + } + } +#elif defined CONFIG_NET_CLS_POLICE + if (map->police && exts->police) { + struct rtattr * p_rta = (struct rtattr*) skb->tail; + + RTA_PUT(skb, map->police, 0, NULL); + + if (tcf_police_dump(skb, exts->police) < 0) + goto rtattr_failure; + + p_rta->rta_len = skb->tail - (u8*)p_rta; + } +#endif + return 0; +rtattr_failure: __attribute__ ((unused)) + return -1; +} + +int +tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts, + struct tcf_ext_map *map) +{ +#ifdef CONFIG_NET_CLS_ACT + if (exts->action) + if (tcf_action_copy_stats(skb, exts->action, 1) < 0) + goto rtattr_failure; +#elif defined CONFIG_NET_CLS_POLICE + if (exts->police) + if (tcf_police_dump_stats(skb, exts->police) < 0) + goto rtattr_failure; +#endif + return 0; +rtattr_failure: __attribute__ ((unused)) + return -1; +} + +static int __init tc_filter_init(void) +{ + struct rtnetlink_link *link_p = rtnetlink_links[PF_UNSPEC]; + + /* Setup rtnetlink links. It is made here to avoid + exporting large number of public symbols. + */ + + if (link_p) { + link_p[RTM_NEWTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_DELTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].doit = tc_ctl_tfilter; + link_p[RTM_GETTFILTER-RTM_BASE].dumpit = tc_dump_tfilter; + } + return 0; +} + +subsys_initcall(tc_filter_init); + +EXPORT_SYMBOL(register_tcf_proto_ops); +EXPORT_SYMBOL(unregister_tcf_proto_ops); +EXPORT_SYMBOL(tcf_exts_validate); +EXPORT_SYMBOL(tcf_exts_destroy); +EXPORT_SYMBOL(tcf_exts_change); +EXPORT_SYMBOL(tcf_exts_dump); +EXPORT_SYMBOL(tcf_exts_dump_stats); diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c new file mode 100644 index 000000000000..0d2d4415f334 --- /dev/null +++ b/net/sched/cls_basic.c @@ -0,0 +1,303 @@ +/* + * net/sched/cls_basic.c Basic Packet Classifier. 
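 *
 * (Editor's note -- an illustrative addition, not part of the original
 * patch.)  This classifier keeps its filters in a plain linked list; each
 * filter carries an ematch tree (the match condition), a set of extensions
 * (actions or a policer) and the class to map matching packets to.
 * Classification is a linear first-match-wins scan: the first filter whose
 * ematch tree matches and whose extensions do not reject the packet decides
 * the result.  A minimal userspace model of that scan, with made-up names
 * (toy_filter, matches, run_exts):
 *
 *     struct toy_filter {
 *             int (*matches)(const void *pkt);   // ematch tree stand-in
 *             int (*run_exts)(const void *pkt);  // < 0 means "skip this one"
 *             unsigned int classid;
 *     };
 *
 *     static int toy_classify(const void *pkt, const struct toy_filter *f,
 *                             int n, unsigned int *classid)
 *     {
 *             int i;
 *
 *             for (i = 0; i < n; i++) {
 *                     if (!f[i].matches(pkt))
 *                             continue;
 *                     if (f[i].run_exts(pkt) < 0)
 *                             continue;
 *                     *classid = f[i].classid;
 *                     return 0;                  // matched
 *             }
 *             return -1;                         // no filter matched
 *     }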
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct basic_head +{ + u32 hgenerator; + struct list_head flist; +}; + +struct basic_filter +{ + u32 handle; + struct tcf_exts exts; + struct tcf_ematch_tree ematches; + struct tcf_result res; + struct list_head link; +}; + +static struct tcf_ext_map basic_ext_map = { + .action = TCA_BASIC_ACT, + .police = TCA_BASIC_POLICE +}; + +static int basic_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int r; + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + list_for_each_entry(f, &head->flist, link) { + if (!tcf_em_tree_match(skb, &f->ematches, NULL)) + continue; + *res = f->res; + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + return r; + } + return -1; +} + +static unsigned long basic_get(struct tcf_proto *tp, u32 handle) +{ + unsigned long l = 0UL; + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + if (head == NULL) + return 0UL; + + list_for_each_entry(f, &head->flist, link) + if (f->handle == handle) + l = (unsigned long) f; + + return l; +} + +static void basic_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int basic_init(struct tcf_proto *tp) +{ + return 0; +} + +static inline void basic_delete_filter(struct tcf_proto *tp, + struct basic_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + tcf_em_tree_destroy(tp, &f->ematches); + kfree(f); +} + +static void basic_destroy(struct tcf_proto *tp) +{ + struct basic_head *head = (struct basic_head *) xchg(&tp->root, NULL); + struct basic_filter *f, *n; + + list_for_each_entry_safe(f, n, &head->flist, link) { + list_del(&f->link); + basic_delete_filter(tp, f); + } +} + +static int basic_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *t, *f = (struct basic_filter *) arg; + + list_for_each_entry(t, &head->flist, link) + if (t == f) { + tcf_tree_lock(tp); + list_del(&t->link); + tcf_tree_unlock(tp); + basic_delete_filter(tp, t); + return 0; + } + + return -ENOENT; +} + +static inline int basic_set_parms(struct tcf_proto *tp, struct basic_filter *f, + unsigned long base, struct rtattr **tb, + struct rtattr *est) +{ + int err = -EINVAL; + struct tcf_exts e; + struct tcf_ematch_tree t; + + if (tb[TCA_BASIC_CLASSID-1]) + if (RTA_PAYLOAD(tb[TCA_BASIC_CLASSID-1]) < sizeof(u32)) + return err; + + err = tcf_exts_validate(tp, tb, est, &e, &basic_ext_map); + if (err < 0) + return err; + + err = tcf_em_tree_validate(tp, tb[TCA_BASIC_EMATCHES-1], &t); + if (err < 0) + goto errout; + + if (tb[TCA_BASIC_CLASSID-1]) { + f->res.classid = *(u32*)RTA_DATA(tb[TCA_BASIC_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + tcf_em_tree_change(tp, &f->ematches, &t); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int basic_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, unsigned long *arg) +{ + int err = -EINVAL; + struct basic_head *head = (struct basic_head *) tp->root; + struct rtattr 
*tb[TCA_BASIC_MAX]; + struct basic_filter *f = (struct basic_filter *) *arg; + + if (tca[TCA_OPTIONS-1] == NULL) + return -EINVAL; + + if (rtattr_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS-1]) < 0) + return -EINVAL; + + if (f != NULL) { + if (handle && f->handle != handle) + return -EINVAL; + return basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + } + + err = -ENOBUFS; + if (head == NULL) { + head = kmalloc(sizeof(*head), GFP_KERNEL); + if (head == NULL) + goto errout; + + memset(head, 0, sizeof(*head)); + INIT_LIST_HEAD(&head->flist); + tp->root = head; + } + + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (f == NULL) + goto errout; + memset(f, 0, sizeof(*f)); + + err = -EINVAL; + if (handle) + f->handle = handle; + else { + int i = 0x80000000; + do { + if (++head->hgenerator == 0x7FFFFFFF) + head->hgenerator = 1; + } while (--i > 0 && basic_get(tp, head->hgenerator)); + + if (i <= 0) { + printk(KERN_ERR "Insufficient number of handles\n"); + goto errout; + } + + f->handle = head->hgenerator; + } + + err = basic_set_parms(tp, f, base, tb, tca[TCA_RATE-1]); + if (err < 0) + goto errout; + + tcf_tree_lock(tp); + list_add(&f->link, &head->flist); + tcf_tree_unlock(tp); + *arg = (unsigned long) f; + + return 0; +errout: + if (*arg == 0UL && f) + kfree(f); + + return err; +} + +static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct basic_head *head = (struct basic_head *) tp->root; + struct basic_filter *f; + + list_for_each_entry(f, &head->flist, link) { + if (arg->count < arg->skip) + goto skip; + + if (arg->fn(tp, (unsigned long) f, arg) < 0) { + arg->stop = 1; + break; + } +skip: + arg->count++; + } +} + +static int basic_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct basic_filter *f = (struct basic_filter *) fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + rta = (struct rtattr *) b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (tcf_exts_dump(skb, &f->exts, &basic_ext_map) < 0 || + tcf_em_tree_dump(skb, &f->ematches, TCA_BASIC_EMATCHES) < 0) + goto rtattr_failure; + + rta->rta_len = (skb->tail - b); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_basic_ops = { + .kind = "basic", + .classify = basic_classify, + .init = basic_init, + .destroy = basic_destroy, + .get = basic_get, + .put = basic_put, + .change = basic_change, + .delete = basic_delete, + .walk = basic_walk, + .dump = basic_dump, + .owner = THIS_MODULE, +}; + +static int __init init_basic(void) +{ + return register_tcf_proto_ops(&cls_basic_ops); +} + +static void __exit exit_basic(void) +{ + unregister_tcf_proto_ops(&cls_basic_ops); +} + +module_init(init_basic) +module_exit(exit_basic) +MODULE_LICENSE("GPL"); + diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c new file mode 100644 index 000000000000..fdfc83af3d1f --- /dev/null +++ b/net/sched/cls_fw.c @@ -0,0 +1,378 @@ +/* + * net/sched/cls_fw.c Classifier mapping ipchains' fwmark to traffic class. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * Karlis Peisenieks : 990415 : fw_walk off by one + * Karlis Peisenieks : 990415 : fw_delete killed all the filter (and kernel). 
+ * Alex : 2004xxyy: Added Action extension + * + * JHS: We should remove the CONFIG_NET_CLS_IND from here + * eventually when the meta match extension is made available + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct fw_head +{ + struct fw_filter *ht[256]; +}; + +struct fw_filter +{ + struct fw_filter *next; + u32 id; + struct tcf_result res; +#ifdef CONFIG_NET_CLS_IND + char indev[IFNAMSIZ]; +#endif /* CONFIG_NET_CLS_IND */ + struct tcf_exts exts; +}; + +static struct tcf_ext_map fw_ext_map = { + .action = TCA_FW_ACT, + .police = TCA_FW_POLICE +}; + +static __inline__ int fw_hash(u32 handle) +{ + return handle&0xFF; +} + +static int fw_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; + int r; +#ifdef CONFIG_NETFILTER + u32 id = skb->nfmark; +#else + u32 id = 0; +#endif + + if (head != NULL) { + for (f=head->ht[fw_hash(id)]; f; f=f->next) { + if (f->id == id) { + *res = f->res; +#ifdef CONFIG_NET_CLS_IND + if (!tcf_match_indev(skb, f->indev)) + continue; +#endif /* CONFIG_NET_CLS_IND */ + r = tcf_exts_exec(skb, &f->exts, res); + if (r < 0) + continue; + + return r; + } + } + } else { + /* old method */ + if (id && (TC_H_MAJ(id) == 0 || !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + } + + return -1; +} + +static unsigned long fw_get(struct tcf_proto *tp, u32 handle) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f; + + if (head == NULL) + return 0; + + for (f=head->ht[fw_hash(handle)]; f; f=f->next) { + if (f->id == handle) + return (unsigned long)f; + } + return 0; +} + +static void fw_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int fw_init(struct tcf_proto *tp) +{ + return 0; +} + +static inline void +fw_delete_filter(struct tcf_proto *tp, struct fw_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void fw_destroy(struct tcf_proto *tp) +{ + struct fw_head *head = (struct fw_head*)xchg(&tp->root, NULL); + struct fw_filter *f; + int h; + + if (head == NULL) + return; + + for (h=0; h<256; h++) { + while ((f=head->ht[h]) != NULL) { + head->ht[h] = f->next; + fw_delete_filter(tp, f); + } + } + kfree(head); +} + +static int fw_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f = (struct fw_filter*)arg; + struct fw_filter **fp; + + if (head == NULL || f == NULL) + goto out; + + for (fp=&head->ht[fw_hash(f->id)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + fw_delete_filter(tp, f); + return 0; + } + } +out: + return -EINVAL; +} + +static int +fw_change_attrs(struct tcf_proto *tp, struct fw_filter *f, + struct rtattr **tb, struct rtattr **tca, unsigned long base) +{ + struct tcf_exts e; + int err; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &fw_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_FW_CLASSID-1]) { + if (RTA_PAYLOAD(tb[TCA_FW_CLASSID-1]) != sizeof(u32)) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_FW_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_FW_INDEV-1]) { + err = 
tcf_change_indev(tp, f->indev, tb[TCA_FW_INDEV-1]); + if (err < 0) + goto errout; + } +#endif /* CONFIG_NET_CLS_IND */ + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int fw_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + struct fw_filter *f = (struct fw_filter *) *arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_FW_MAX]; + int err; + + if (!opt) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_FW_MAX, opt) < 0) + return -EINVAL; + + if (f != NULL) { + if (f->id != handle && handle) + return -EINVAL; + return fw_change_attrs(tp, f, tb, tca, base); + } + + if (!handle) + return -EINVAL; + + if (head == NULL) { + head = kmalloc(sizeof(struct fw_head), GFP_KERNEL); + if (head == NULL) + return -ENOBUFS; + memset(head, 0, sizeof(*head)); + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kmalloc(sizeof(struct fw_filter), GFP_KERNEL); + if (f == NULL) + return -ENOBUFS; + memset(f, 0, sizeof(*f)); + + f->id = handle; + + err = fw_change_attrs(tp, f, tb, tca, base); + if (err < 0) + goto errout; + + f->next = head->ht[fw_hash(handle)]; + tcf_tree_lock(tp); + head->ht[fw_hash(handle)] = f; + tcf_tree_unlock(tp); + + *arg = (unsigned long)f; + return 0; + +errout: + if (f) + kfree(f); + return err; +} + +static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct fw_head *head = (struct fw_head*)tp->root; + int h; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct fw_filter *f; + + for (f = head->ht[h]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static int fw_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct fw_filter *f = (struct fw_filter*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->id; + + if (!f->res.classid && !tcf_exts_is_available(&f->exts)) + return skb->len; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (f->res.classid) + RTA_PUT(skb, TCA_FW_CLASSID, 4, &f->res.classid); +#ifdef CONFIG_NET_CLS_IND + if (strlen(f->indev)) + RTA_PUT(skb, TCA_FW_INDEV, IFNAMSIZ, f->indev); +#endif /* CONFIG_NET_CLS_IND */ + + if (tcf_exts_dump(skb, &f->exts, &fw_ext_map) < 0) + goto rtattr_failure; + + rta->rta_len = skb->tail - b; + + if (tcf_exts_dump_stats(skb, &f->exts, &fw_ext_map) < 0) + goto rtattr_failure; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_fw_ops = { + .next = NULL, + .kind = "fw", + .classify = fw_classify, + .init = fw_init, + .destroy = fw_destroy, + .get = fw_get, + .put = fw_put, + .change = fw_change, + .delete = fw_delete, + .walk = fw_walk, + .dump = fw_dump, + .owner = THIS_MODULE, +}; + +static int __init init_fw(void) +{ + return register_tcf_proto_ops(&cls_fw_ops); +} + +static void __exit exit_fw(void) +{ + unregister_tcf_proto_ops(&cls_fw_ops); +} + +module_init(init_fw) +module_exit(exit_fw) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c new file mode 100644 index 000000000000..02996ac05c75 --- /dev/null +++ b/net/sched/cls_route.c @@ -0,0 +1,639 
@@ +/* + * net/sched/cls_route.c ROUTE4 classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + 1. For now we assume that route tags < 256. + It allows to use direct table lookups, instead of hash tables. + 2. For now we assume that "from TAG" and "fromdev DEV" statements + are mutually exclusive. + 3. "to TAG from ANY" has higher priority, than "to ANY from XXX" + */ + +struct route4_fastmap +{ + struct route4_filter *filter; + u32 id; + int iif; +}; + +struct route4_head +{ + struct route4_fastmap fastmap[16]; + struct route4_bucket *table[256+1]; +}; + +struct route4_bucket +{ + /* 16 FROM buckets + 16 IIF buckets + 1 wildcard bucket */ + struct route4_filter *ht[16+16+1]; +}; + +struct route4_filter +{ + struct route4_filter *next; + u32 id; + int iif; + + struct tcf_result res; + struct tcf_exts exts; + u32 handle; + struct route4_bucket *bkt; +}; + +#define ROUTE4_FAILURE ((struct route4_filter*)(-1L)) + +static struct tcf_ext_map route_ext_map = { + .police = TCA_ROUTE4_POLICE, + .action = TCA_ROUTE4_ACT +}; + +static __inline__ int route4_fastmap_hash(u32 id, int iif) +{ + return id&0xF; +} + +static inline +void route4_reset_fastmap(struct net_device *dev, struct route4_head *head, u32 id) +{ + spin_lock_bh(&dev->queue_lock); + memset(head->fastmap, 0, sizeof(head->fastmap)); + spin_unlock_bh(&dev->queue_lock); +} + +static void __inline__ +route4_set_fastmap(struct route4_head *head, u32 id, int iif, + struct route4_filter *f) +{ + int h = route4_fastmap_hash(id, iif); + head->fastmap[h].id = id; + head->fastmap[h].iif = iif; + head->fastmap[h].filter = f; +} + +static __inline__ int route4_hash_to(u32 id) +{ + return id&0xFF; +} + +static __inline__ int route4_hash_from(u32 id) +{ + return (id>>16)&0xF; +} + +static __inline__ int route4_hash_iif(int iif) +{ + return 16 + ((iif>>16)&0xF); +} + +static __inline__ int route4_hash_wild(void) +{ + return 32; +} + +#define ROUTE4_APPLY_RESULT() \ +{ \ + *res = f->res; \ + if (tcf_exts_is_available(&f->exts)) { \ + int r = tcf_exts_exec(skb, &f->exts, res); \ + if (r < 0) { \ + dont_cache = 1; \ + continue; \ + } \ + return r; \ + } else if (!dont_cache) \ + route4_set_fastmap(head, id, iif, f); \ + return 0; \ +} + +static int route4_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct dst_entry *dst; + struct route4_bucket *b; + struct route4_filter *f; + u32 id, h; + int iif, dont_cache = 0; + + if ((dst = skb->dst) == NULL) + goto failure; + + id = dst->tclassid; + if (head == NULL) + goto old_method; + + iif = ((struct rtable*)dst)->fl.iif; + + h = route4_fastmap_hash(id, iif); + if (id == head->fastmap[h].id && + iif == head->fastmap[h].iif && + (f = head->fastmap[h].filter) != NULL) { + if (f == ROUTE4_FAILURE) + goto failure; + + *res = f->res; + return 0; + } + + h = route4_hash_to(id); + +restart: + if ((b = head->table[h]) != NULL) { + for (f = b->ht[route4_hash_from(id)]; f; f = f->next) + if (f->id == id) + 
ROUTE4_APPLY_RESULT(); + + for (f = b->ht[route4_hash_iif(iif)]; f; f = f->next) + if (f->iif == iif) + ROUTE4_APPLY_RESULT(); + + for (f = b->ht[route4_hash_wild()]; f; f = f->next) + ROUTE4_APPLY_RESULT(); + + } + if (h < 256) { + h = 256; + id &= ~0xFFFF; + goto restart; + } + + if (!dont_cache) + route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); +failure: + return -1; + +old_method: + if (id && (TC_H_MAJ(id) == 0 || + !(TC_H_MAJ(id^tp->q->handle)))) { + res->classid = id; + res->class = 0; + return 0; + } + return -1; +} + +static inline u32 to_hash(u32 id) +{ + u32 h = id&0xFF; + if (id&0x8000) + h += 256; + return h; +} + +static inline u32 from_hash(u32 id) +{ + id &= 0xFFFF; + if (id == 0xFFFF) + return 32; + if (!(id & 0x8000)) { + if (id > 255) + return 256; + return id&0xF; + } + return 16 + (id&0xF); +} + +static unsigned long route4_get(struct tcf_proto *tp, u32 handle) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct route4_bucket *b; + struct route4_filter *f; + unsigned h1, h2; + + if (!head) + return 0; + + h1 = to_hash(handle); + if (h1 > 256) + return 0; + + h2 = from_hash(handle>>16); + if (h2 > 32) + return 0; + + if ((b = head->table[h1]) != NULL) { + for (f = b->ht[h2]; f; f = f->next) + if (f->handle == handle) + return (unsigned long)f; + } + return 0; +} + +static void route4_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int route4_init(struct tcf_proto *tp) +{ + return 0; +} + +static inline void +route4_delete_filter(struct tcf_proto *tp, struct route4_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void route4_destroy(struct tcf_proto *tp) +{ + struct route4_head *head = xchg(&tp->root, NULL); + int h1, h2; + + if (head == NULL) + return; + + for (h1=0; h1<=256; h1++) { + struct route4_bucket *b; + + if ((b = head->table[h1]) != NULL) { + for (h2=0; h2<=32; h2++) { + struct route4_filter *f; + + while ((f = b->ht[h2]) != NULL) { + b->ht[h2] = f->next; + route4_delete_filter(tp, f); + } + } + kfree(b); + } + } + kfree(head); +} + +static int route4_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct route4_head *head = (struct route4_head*)tp->root; + struct route4_filter **fp, *f = (struct route4_filter*)arg; + unsigned h = 0; + struct route4_bucket *b; + int i; + + if (!head || !f) + return -EINVAL; + + h = f->handle; + b = f->bkt; + + for (fp = &b->ht[from_hash(h>>16)]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q->dev, head, f->id); + route4_delete_filter(tp, f); + + /* Strip tree */ + + for (i=0; i<=32; i++) + if (b->ht[i]) + return 0; + + /* OK, session has no flows */ + tcf_tree_lock(tp); + head->table[to_hash(h)] = NULL; + tcf_tree_unlock(tp); + + kfree(b); + return 0; + } + } + return 0; +} + +static int route4_set_parms(struct tcf_proto *tp, unsigned long base, + struct route4_filter *f, u32 handle, struct route4_head *head, + struct rtattr **tb, struct rtattr *est, int new) +{ + int err; + u32 id = 0, to = 0, nhandle = 0x8000; + struct route4_filter *fp; + unsigned int h1; + struct route4_bucket *b; + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &route_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_ROUTE4_CLASSID-1]) + if (RTA_PAYLOAD(tb[TCA_ROUTE4_CLASSID-1]) < sizeof(u32)) + goto errout; + + if (tb[TCA_ROUTE4_TO-1]) { + if (new && handle & 0x8000) + goto errout; + if (RTA_PAYLOAD(tb[TCA_ROUTE4_TO-1]) < sizeof(u32)) 
+ goto errout; + to = *(u32*)RTA_DATA(tb[TCA_ROUTE4_TO-1]); + if (to > 0xFF) + goto errout; + nhandle = to; + } + + if (tb[TCA_ROUTE4_FROM-1]) { + if (tb[TCA_ROUTE4_IIF-1]) + goto errout; + if (RTA_PAYLOAD(tb[TCA_ROUTE4_FROM-1]) < sizeof(u32)) + goto errout; + id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_FROM-1]); + if (id > 0xFF) + goto errout; + nhandle |= id << 16; + } else if (tb[TCA_ROUTE4_IIF-1]) { + if (RTA_PAYLOAD(tb[TCA_ROUTE4_IIF-1]) < sizeof(u32)) + goto errout; + id = *(u32*)RTA_DATA(tb[TCA_ROUTE4_IIF-1]); + if (id > 0x7FFF) + goto errout; + nhandle |= (id | 0x8000) << 16; + } else + nhandle |= 0xFFFF << 16; + + if (handle && new) { + nhandle |= handle & 0x7F00; + if (nhandle != handle) + goto errout; + } + + h1 = to_hash(nhandle); + if ((b = head->table[h1]) == NULL) { + err = -ENOBUFS; + b = kmalloc(sizeof(struct route4_bucket), GFP_KERNEL); + if (b == NULL) + goto errout; + memset(b, 0, sizeof(*b)); + + tcf_tree_lock(tp); + head->table[h1] = b; + tcf_tree_unlock(tp); + } else { + unsigned int h2 = from_hash(nhandle >> 16); + err = -EEXIST; + for (fp = b->ht[h2]; fp; fp = fp->next) + if (fp->handle == f->handle) + goto errout; + } + + tcf_tree_lock(tp); + if (tb[TCA_ROUTE4_TO-1]) + f->id = to; + + if (tb[TCA_ROUTE4_FROM-1]) + f->id = to | id<<16; + else if (tb[TCA_ROUTE4_IIF-1]) + f->iif = id; + + f->handle = nhandle; + f->bkt = b; + tcf_tree_unlock(tp); + + if (tb[TCA_ROUTE4_CLASSID-1]) { + f->res.classid = *(u32*)RTA_DATA(tb[TCA_ROUTE4_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int route4_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct route4_head *head = tp->root; + struct route4_filter *f, *f1, **fp; + struct route4_bucket *b; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_ROUTE4_MAX]; + unsigned int h, th; + u32 old_handle = 0; + int err; + + if (opt == NULL) + return handle ? 
-EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_ROUTE4_MAX, opt) < 0) + return -EINVAL; + + if ((f = (struct route4_filter*)*arg) != NULL) { + if (f->handle != handle && handle) + return -EINVAL; + + if (f->bkt) + old_handle = f->handle; + + err = route4_set_parms(tp, base, f, handle, head, tb, + tca[TCA_RATE-1], 0); + if (err < 0) + return err; + + goto reinsert; + } + + err = -ENOBUFS; + if (head == NULL) { + head = kmalloc(sizeof(struct route4_head), GFP_KERNEL); + if (head == NULL) + goto errout; + memset(head, 0, sizeof(struct route4_head)); + + tcf_tree_lock(tp); + tp->root = head; + tcf_tree_unlock(tp); + } + + f = kmalloc(sizeof(struct route4_filter), GFP_KERNEL); + if (f == NULL) + goto errout; + memset(f, 0, sizeof(*f)); + + err = route4_set_parms(tp, base, f, handle, head, tb, + tca[TCA_RATE-1], 1); + if (err < 0) + goto errout; + +reinsert: + h = from_hash(f->handle >> 16); + for (fp = &f->bkt->ht[h]; (f1=*fp) != NULL; fp = &f1->next) + if (f->handle < f1->handle) + break; + + f->next = f1; + tcf_tree_lock(tp); + *fp = f; + + if (old_handle && f->handle != old_handle) { + th = to_hash(old_handle); + h = from_hash(old_handle >> 16); + if ((b = head->table[th]) != NULL) { + for (fp = &b->ht[h]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + *fp = f->next; + break; + } + } + } + } + tcf_tree_unlock(tp); + + route4_reset_fastmap(tp->q->dev, head, f->id); + *arg = (unsigned long)f; + return 0; + +errout: + if (f) + kfree(f); + return err; +} + +static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct route4_head *head = tp->root; + unsigned h, h1; + + if (head == NULL) + arg->stop = 1; + + if (arg->stop) + return; + + for (h = 0; h <= 256; h++) { + struct route4_bucket *b = head->table[h]; + + if (b) { + for (h1 = 0; h1 <= 32; h1++) { + struct route4_filter *f; + + for (f = b->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } + } +} + +static int route4_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct route4_filter *f = (struct route4_filter*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + u32 id; + + if (f == NULL) + return skb->len; + + t->tcm_handle = f->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (!(f->handle&0x8000)) { + id = f->id&0xFF; + RTA_PUT(skb, TCA_ROUTE4_TO, sizeof(id), &id); + } + if (f->handle&0x80000000) { + if ((f->handle>>16) != 0xFFFF) + RTA_PUT(skb, TCA_ROUTE4_IIF, sizeof(f->iif), &f->iif); + } else { + id = f->id>>16; + RTA_PUT(skb, TCA_ROUTE4_FROM, sizeof(id), &id); + } + if (f->res.classid) + RTA_PUT(skb, TCA_ROUTE4_CLASSID, 4, &f->res.classid); + + if (tcf_exts_dump(skb, &f->exts, &route_ext_map) < 0) + goto rtattr_failure; + + rta->rta_len = skb->tail - b; + + if (tcf_exts_dump_stats(skb, &f->exts, &route_ext_map) < 0) + goto rtattr_failure; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_route4_ops = { + .next = NULL, + .kind = "route", + .classify = route4_classify, + .init = route4_init, + .destroy = route4_destroy, + .get = route4_get, + .put = route4_put, + .change = route4_change, + .delete = route4_delete, + .walk = route4_walk, + .dump = route4_dump, + .owner = THIS_MODULE, +}; + +static int __init init_route4(void) +{ + return register_tcf_proto_ops(&cls_route4_ops); +} + +static void __exit exit_route4(void) 
+{ + unregister_tcf_proto_ops(&cls_route4_ops); +} + +module_init(init_route4) +module_exit(exit_route4) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.c b/net/sched/cls_rsvp.c new file mode 100644 index 000000000000..ad2613790d85 --- /dev/null +++ b/net/sched/cls_rsvp.c @@ -0,0 +1,43 @@ +/* + * net/sched/cls_rsvp.c Special RSVP packet classifier for IPv4. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RSVP_DST_LEN 1 +#define RSVP_ID "rsvp" +#define RSVP_OPS cls_rsvp_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h new file mode 100644 index 000000000000..232fb9196810 --- /dev/null +++ b/net/sched/cls_rsvp.h @@ -0,0 +1,667 @@ +/* + * net/sched/cls_rsvp.h Template file for RSVPv[46] classifiers. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +/* + Comparing to general packet classification problem, + RSVP needs only sevaral relatively simple rules: + + * (dst, protocol) are always specified, + so that we are able to hash them. + * src may be exact, or may be wildcard, so that + we can keep a hash table plus one wildcard entry. + * source port (or flow label) is important only if src is given. + + IMPLEMENTATION. + + We use a two level hash table: The top level is keyed by + destination address and protocol ID, every bucket contains a list + of "rsvp sessions", identified by destination address, protocol and + DPI(="Destination Port ID"): triple (key, mask, offset). + + Every bucket has a smaller hash table keyed by source address + (cf. RSVP flowspec) and one wildcard entry for wildcard reservations. + Every bucket is again a list of "RSVP flows", selected by + source address and SPI(="Source Port ID" here rather than + "security parameter index"): triple (key, mask, offset). + + + NOTE 1. All the packets with IPv6 extension headers (but AH and ESP) + and all fragmented packets go to the best-effort traffic class. + + + NOTE 2. Two "port id"'s seems to be redundant, rfc2207 requires + only one "Generalized Port Identifier". So that for classic + ah, esp (and udp,tcp) both *pi should coincide or one of them + should be wildcard. + + At first sight, this redundancy is just a waste of CPU + resources. But DPI and SPI add the possibility to assign different + priorities to GPIs. Look also at note 4 about tunnels below. + + + NOTE 3. One complication is the case of tunneled packets. + We implement it as following: if the first lookup + matches a special session with "tunnelhdr" value not zero, + flowid doesn't contain the true flow ID, but the tunnel ID (1...255). + In this case, we pull tunnelhdr bytes and restart lookup + with tunnel ID added to the list of keys. Simple and stupid 8)8) + It's enough for PIMREG and IPIP. + + + NOTE 4. 
Two GPIs make it possible to parse even GRE packets. + F.e. DPI can select ETH_P_IP (and necessary flags to make + tunnelhdr correct) in GRE protocol field and SPI matches + GRE key. Is it not nice? 8)8) + + + Well, as result, despite its simplicity, we get a pretty + powerful classification engine. */ + +#include + +struct rsvp_head +{ + u32 tmap[256/32]; + u32 hgenerator; + u8 tgenerator; + struct rsvp_session *ht[256]; +}; + +struct rsvp_session +{ + struct rsvp_session *next; + u32 dst[RSVP_DST_LEN]; + struct tc_rsvp_gpi dpi; + u8 protocol; + u8 tunnelid; + /* 16 (src,sport) hash slots, and one wildcard source slot */ + struct rsvp_filter *ht[16+1]; +}; + + +struct rsvp_filter +{ + struct rsvp_filter *next; + u32 src[RSVP_DST_LEN]; + struct tc_rsvp_gpi spi; + u8 tunnelhdr; + + struct tcf_result res; + struct tcf_exts exts; + + u32 handle; + struct rsvp_session *sess; +}; + +static __inline__ unsigned hash_dst(u32 *dst, u8 protocol, u8 tunnelid) +{ + unsigned h = dst[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + return (h ^ protocol ^ tunnelid) & 0xFF; +} + +static __inline__ unsigned hash_src(u32 *src) +{ + unsigned h = src[RSVP_DST_LEN-1]; + h ^= h>>16; + h ^= h>>8; + h ^= h>>4; + return h & 0xF; +} + +static struct tcf_ext_map rsvp_ext_map = { + .police = TCA_RSVP_POLICE, + .action = TCA_RSVP_ACT +}; + +#define RSVP_APPLY_RESULT() \ +{ \ + int r = tcf_exts_exec(skb, &f->exts, res); \ + if (r < 0) \ + continue; \ + else if (r > 0) \ + return r; \ +} + +static int rsvp_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1, h2; + u32 *dst, *src; + u8 protocol; + u8 tunnelid = 0; + u8 *xprt; +#if RSVP_DST_LEN == 4 + struct ipv6hdr *nhptr = skb->nh.ipv6h; +#else + struct iphdr *nhptr = skb->nh.iph; +#endif + +restart: + +#if RSVP_DST_LEN == 4 + src = &nhptr->saddr.s6_addr32[0]; + dst = &nhptr->daddr.s6_addr32[0]; + protocol = nhptr->nexthdr; + xprt = ((u8*)nhptr) + sizeof(struct ipv6hdr); +#else + src = &nhptr->saddr; + dst = &nhptr->daddr; + protocol = nhptr->protocol; + xprt = ((u8*)nhptr) + (nhptr->ihl<<2); + if (nhptr->frag_off&__constant_htons(IP_MF|IP_OFFSET)) + return -1; +#endif + + h1 = hash_dst(dst, protocol, tunnelid); + h2 = hash_src(src); + + for (s = sht[h1]; s; s = s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + protocol == s->protocol && + !(s->dpi.mask & (*(u32*)(xprt+s->dpi.offset)^s->dpi.key)) +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && tunnelid == s->tunnelid) { + + for (f = s->ht[h2]; f; f = f->next) { + if (src[RSVP_DST_LEN-1] == f->src[RSVP_DST_LEN-1] && + !(f->spi.mask & (*(u32*)(xprt+f->spi.offset)^f->spi.key)) +#if RSVP_DST_LEN == 4 + && src[0] == f->src[0] + && src[1] == f->src[1] + && src[2] == f->src[2] +#endif + ) { + *res = f->res; + RSVP_APPLY_RESULT(); + +matched: + if (f->tunnelhdr == 0) + return 0; + + tunnelid = f->res.classid; + nhptr = (void*)(xprt + f->tunnelhdr - sizeof(*nhptr)); + goto restart; + } + } + + /* And wildcard bucket... 
*/ + for (f = s->ht[16]; f; f = f->next) { + *res = f->res; + RSVP_APPLY_RESULT(); + goto matched; + } + return -1; + } + } + return -1; +} + +static unsigned long rsvp_get(struct tcf_proto *tp, u32 handle) +{ + struct rsvp_session **sht = ((struct rsvp_head*)tp->root)->ht; + struct rsvp_session *s; + struct rsvp_filter *f; + unsigned h1 = handle&0xFF; + unsigned h2 = (handle>>8)&0xFF; + + if (h2 > 16) + return 0; + + for (s = sht[h1]; s; s = s->next) { + for (f = s->ht[h2]; f; f = f->next) { + if (f->handle == handle) + return (unsigned long)f; + } + } + return 0; +} + +static void rsvp_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static int rsvp_init(struct tcf_proto *tp) +{ + struct rsvp_head *data; + + data = kmalloc(sizeof(struct rsvp_head), GFP_KERNEL); + if (data) { + memset(data, 0, sizeof(struct rsvp_head)); + tp->root = data; + return 0; + } + return -ENOBUFS; +} + +static inline void +rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) +{ + tcf_unbind_filter(tp, &f->res); + tcf_exts_destroy(tp, &f->exts); + kfree(f); +} + +static void rsvp_destroy(struct tcf_proto *tp) +{ + struct rsvp_head *data = xchg(&tp->root, NULL); + struct rsvp_session **sht; + int h1, h2; + + if (data == NULL) + return; + + sht = data->ht; + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + + while ((s = sht[h1]) != NULL) { + sht[h1] = s->next; + + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + while ((f = s->ht[h2]) != NULL) { + s->ht[h2] = f->next; + rsvp_delete_filter(tp, f); + } + } + kfree(s); + } + } + kfree(data); +} + +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct rsvp_filter **fp, *f = (struct rsvp_filter*)arg; + unsigned h = f->handle; + struct rsvp_session **sp; + struct rsvp_session *s = f->sess; + int i; + + for (fp = &s->ht[(h>>8)&0xFF]; *fp; fp = &(*fp)->next) { + if (*fp == f) { + tcf_tree_lock(tp); + *fp = f->next; + tcf_tree_unlock(tp); + rsvp_delete_filter(tp, f); + + /* Strip tree */ + + for (i=0; i<=16; i++) + if (s->ht[i]) + return 0; + + /* OK, session has no flows */ + for (sp = &((struct rsvp_head*)tp->root)->ht[h&0xFF]; + *sp; sp = &(*sp)->next) { + if (*sp == s) { + tcf_tree_lock(tp); + *sp = s->next; + tcf_tree_unlock(tp); + + kfree(s); + return 0; + } + } + + return 0; + } + } + return 0; +} + +static unsigned gen_handle(struct tcf_proto *tp, unsigned salt) +{ + struct rsvp_head *data = tp->root; + int i = 0xFFFF; + + while (i-- > 0) { + u32 h; + if ((data->hgenerator += 0x10000) == 0) + data->hgenerator = 0x10000; + h = data->hgenerator|salt; + if (rsvp_get(tp, h) == 0) + return h; + } + return 0; +} + +static int tunnel_bts(struct rsvp_head *data) +{ + int n = data->tgenerator>>5; + u32 b = 1<<(data->tgenerator&0x1F); + + if (data->tmap[n]&b) + return 0; + data->tmap[n] |= b; + return 1; +} + +static void tunnel_recycle(struct rsvp_head *data) +{ + struct rsvp_session **sht = data->ht; + u32 tmap[256/32]; + int h1, h2; + + memset(tmap, 0, sizeof(tmap)); + + for (h1=0; h1<256; h1++) { + struct rsvp_session *s; + for (s = sht[h1]; s; s = s->next) { + for (h2=0; h2<=16; h2++) { + struct rsvp_filter *f; + + for (f = s->ht[h2]; f; f = f->next) { + if (f->tunnelhdr == 0) + continue; + data->tgenerator = f->res.classid; + tunnel_bts(data); + } + } + } + } + + memcpy(data->tmap, tmap, sizeof(tmap)); +} + +static u32 gen_tunnel(struct rsvp_head *data) +{ + int i, k; + + for (k=0; k<2; k++) { + for (i=255; i>0; i--) { + if (++data->tgenerator == 0) + data->tgenerator = 1; + if (tunnel_bts(data)) + return 
data->tgenerator; + } + tunnel_recycle(data); + } + return 0; +} + +static int rsvp_change(struct tcf_proto *tp, unsigned long base, + u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct rsvp_head *data = tp->root; + struct rsvp_filter *f, **fp; + struct rsvp_session *s, **sp; + struct tc_rsvp_pinfo *pinfo = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_RSVP_MAX]; + struct tcf_exts e; + unsigned h1, h2; + u32 *dst; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_RSVP_MAX, opt) < 0) + return -EINVAL; + + err = tcf_exts_validate(tp, tb, tca[TCA_RATE-1], &e, &rsvp_ext_map); + if (err < 0) + return err; + + if ((f = (struct rsvp_filter*)*arg) != NULL) { + /* Node exists: adjust only classid */ + + if (f->handle != handle && handle) + goto errout2; + if (tb[TCA_RSVP_CLASSID-1]) { + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + tcf_bind_filter(tp, &f->res, base); + } + + tcf_exts_change(tp, &f->exts, &e); + return 0; + } + + /* Now more serious part... */ + err = -EINVAL; + if (handle) + goto errout2; + if (tb[TCA_RSVP_DST-1] == NULL) + goto errout2; + + err = -ENOBUFS; + f = kmalloc(sizeof(struct rsvp_filter), GFP_KERNEL); + if (f == NULL) + goto errout2; + + memset(f, 0, sizeof(*f)); + h2 = 16; + if (tb[TCA_RSVP_SRC-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_SRC-1]) != sizeof(f->src)) + goto errout; + memcpy(f->src, RTA_DATA(tb[TCA_RSVP_SRC-1]), sizeof(f->src)); + h2 = hash_src(f->src); + } + if (tb[TCA_RSVP_PINFO-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_PINFO-1]) < sizeof(struct tc_rsvp_pinfo)) + goto errout; + pinfo = RTA_DATA(tb[TCA_RSVP_PINFO-1]); + f->spi = pinfo->spi; + f->tunnelhdr = pinfo->tunnelhdr; + } + if (tb[TCA_RSVP_CLASSID-1]) { + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_CLASSID-1]) != 4) + goto errout; + f->res.classid = *(u32*)RTA_DATA(tb[TCA_RSVP_CLASSID-1]); + } + + err = -EINVAL; + if (RTA_PAYLOAD(tb[TCA_RSVP_DST-1]) != sizeof(f->src)) + goto errout; + dst = RTA_DATA(tb[TCA_RSVP_DST-1]); + h1 = hash_dst(dst, pinfo ? pinfo->protocol : 0, pinfo ? pinfo->tunnelid : 0); + + err = -ENOMEM; + if ((f->handle = gen_handle(tp, h1 | (h2<<8))) == 0) + goto errout; + + if (f->tunnelhdr) { + err = -EINVAL; + if (f->res.classid > 255) + goto errout; + + err = -ENOMEM; + if (f->res.classid == 0 && + (f->res.classid = gen_tunnel(data)) == 0) + goto errout; + } + + for (sp = &data->ht[h1]; (s=*sp) != NULL; sp = &s->next) { + if (dst[RSVP_DST_LEN-1] == s->dst[RSVP_DST_LEN-1] && + pinfo && pinfo->protocol == s->protocol && + memcmp(&pinfo->dpi, &s->dpi, sizeof(s->dpi)) == 0 +#if RSVP_DST_LEN == 4 + && dst[0] == s->dst[0] + && dst[1] == s->dst[1] + && dst[2] == s->dst[2] +#endif + && pinfo->tunnelid == s->tunnelid) { + +insert: + /* OK, we found appropriate session */ + + fp = &s->ht[h2]; + + f->sess = s; + if (f->tunnelhdr == 0) + tcf_bind_filter(tp, &f->res, base); + + tcf_exts_change(tp, &f->exts, &e); + + for (fp = &s->ht[h2]; *fp; fp = &(*fp)->next) + if (((*fp)->spi.mask&f->spi.mask) != f->spi.mask) + break; + f->next = *fp; + wmb(); + *fp = f; + + *arg = (unsigned long)f; + return 0; + } + } + + /* No session found. Create new one. 
*/ + + err = -ENOBUFS; + s = kmalloc(sizeof(struct rsvp_session), GFP_KERNEL); + if (s == NULL) + goto errout; + memset(s, 0, sizeof(*s)); + memcpy(s->dst, dst, sizeof(s->dst)); + + if (pinfo) { + s->dpi = pinfo->dpi; + s->protocol = pinfo->protocol; + s->tunnelid = pinfo->tunnelid; + } + for (sp = &data->ht[h1]; *sp; sp = &(*sp)->next) { + if (((*sp)->dpi.mask&s->dpi.mask) != s->dpi.mask) + break; + } + s->next = *sp; + wmb(); + *sp = s; + + goto insert; + +errout: + if (f) + kfree(f); +errout2: + tcf_exts_destroy(tp, &e); + return err; +} + +static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct rsvp_head *head = tp->root; + unsigned h, h1; + + if (arg->stop) + return; + + for (h = 0; h < 256; h++) { + struct rsvp_session *s; + + for (s = head->ht[h]; s; s = s->next) { + for (h1 = 0; h1 <= 16; h1++) { + struct rsvp_filter *f; + + for (f = s->ht[h1]; f; f = f->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)f, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } + } +} + +static int rsvp_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct rsvp_filter *f = (struct rsvp_filter*)fh; + struct rsvp_session *s; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_rsvp_pinfo pinfo; + + if (f == NULL) + return skb->len; + s = f->sess; + + t->tcm_handle = f->handle; + + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + RTA_PUT(skb, TCA_RSVP_DST, sizeof(s->dst), &s->dst); + pinfo.dpi = s->dpi; + pinfo.spi = f->spi; + pinfo.protocol = s->protocol; + pinfo.tunnelid = s->tunnelid; + pinfo.tunnelhdr = f->tunnelhdr; + RTA_PUT(skb, TCA_RSVP_PINFO, sizeof(pinfo), &pinfo); + if (f->res.classid) + RTA_PUT(skb, TCA_RSVP_CLASSID, 4, &f->res.classid); + if (((f->handle>>8)&0xFF) != 16) + RTA_PUT(skb, TCA_RSVP_SRC, sizeof(f->src), f->src); + + if (tcf_exts_dump(skb, &f->exts, &rsvp_ext_map) < 0) + goto rtattr_failure; + + rta->rta_len = skb->tail - b; + + if (tcf_exts_dump_stats(skb, &f->exts, &rsvp_ext_map) < 0) + goto rtattr_failure; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops RSVP_OPS = { + .next = NULL, + .kind = RSVP_ID, + .classify = rsvp_classify, + .init = rsvp_init, + .destroy = rsvp_destroy, + .get = rsvp_get, + .put = rsvp_put, + .change = rsvp_change, + .delete = rsvp_delete, + .walk = rsvp_walk, + .dump = rsvp_dump, + .owner = THIS_MODULE, +}; + +static int __init init_rsvp(void) +{ + return register_tcf_proto_ops(&RSVP_OPS); +} + +static void __exit exit_rsvp(void) +{ + unregister_tcf_proto_ops(&RSVP_OPS); +} + +module_init(init_rsvp) +module_exit(exit_rsvp) diff --git a/net/sched/cls_rsvp6.c b/net/sched/cls_rsvp6.c new file mode 100644 index 000000000000..fde51f7848eb --- /dev/null +++ b/net/sched/cls_rsvp6.c @@ -0,0 +1,44 @@ +/* + * net/sched/cls_rsvp6.c Special RSVP packet classifier for IPv6. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RSVP_DST_LEN 4 +#define RSVP_ID "rsvp6" +#define RSVP_OPS cls_rsvp6_ops + +#include "cls_rsvp.h" +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c new file mode 100644 index 000000000000..404d9d83a7fa --- /dev/null +++ b/net/sched/cls_tcindex.c @@ -0,0 +1,537 @@ +/* + * net/sched/cls_tcindex.c Packet classifier for skb->tc_index + * + * Written 1998,1999 by Werner Almesberger, EPFL ICA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* + * Not quite sure if we need all the xchgs Alexey uses when accessing things. + * Can always add them later ... :) + */ + +/* + * Passing parameters to the root seems to be done more awkwardly than really + * necessary. At least, u32 doesn't seem to use such dirty hacks. To be + * verified. FIXME. + */ + +#define PERFECT_HASH_THRESHOLD 64 /* use perfect hash if not bigger */ +#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */ + + +#if 1 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +#define PRIV(tp) ((struct tcindex_data *) (tp)->root) + + +struct tcindex_filter_result { + struct tcf_exts exts; + struct tcf_result res; +}; + +struct tcindex_filter { + u16 key; + struct tcindex_filter_result result; + struct tcindex_filter *next; +}; + + +struct tcindex_data { + struct tcindex_filter_result *perfect; /* perfect hash; NULL if none */ + struct tcindex_filter **h; /* imperfect hash; only used if !perfect; + NULL if unused */ + u16 mask; /* AND key with mask */ + int shift; /* shift ANDed key to the right */ + int hash; /* hash table size; 0 if undefined */ + int alloc_hash; /* allocated size */ + int fall_through; /* 0: only classify if explicit match */ +}; + +static struct tcf_ext_map tcindex_ext_map = { + .police = TCA_TCINDEX_POLICE, + .action = TCA_TCINDEX_ACT +}; + +static inline int +tcindex_filter_is_set(struct tcindex_filter_result *r) +{ + return tcf_exts_is_predicative(&r->exts) || r->res.classid; +} + +static struct tcindex_filter_result * +tcindex_lookup(struct tcindex_data *p, u16 key) +{ + struct tcindex_filter *f; + + if (p->perfect) + return tcindex_filter_is_set(p->perfect + key) ? 
+ p->perfect + key : NULL; + else if (p->h) { + for (f = p->h[key % p->hash]; f; f = f->next) + if (f->key == key) + return &f->result; + } + + return NULL; +} + + +static int tcindex_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *f; + int key = (skb->tc_index & p->mask) >> p->shift; + + D2PRINTK("tcindex_classify(skb %p,tp %p,res %p),p %p\n",skb,tp,res,p); + + f = tcindex_lookup(p, key); + if (!f) { + if (!p->fall_through) + return -1; + res->classid = TC_H_MAKE(TC_H_MAJ(tp->q->handle), key); + res->class = 0; + D2PRINTK("alg 0x%x\n",res->classid); + return 0; + } + *res = f->res; + D2PRINTK("map 0x%x\n",res->classid); + + return tcf_exts_exec(skb, &f->exts, res); +} + + +static unsigned long tcindex_get(struct tcf_proto *tp, u32 handle) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r; + + DPRINTK("tcindex_get(tp %p,handle 0x%08x)\n",tp,handle); + if (p->perfect && handle >= p->alloc_hash) + return 0; + r = tcindex_lookup(p, handle); + return r && tcindex_filter_is_set(r) ? (unsigned long) r : 0UL; +} + + +static void tcindex_put(struct tcf_proto *tp, unsigned long f) +{ + DPRINTK("tcindex_put(tp %p,f 0x%lx)\n",tp,f); +} + + +static int tcindex_init(struct tcf_proto *tp) +{ + struct tcindex_data *p; + + DPRINTK("tcindex_init(tp %p)\n",tp); + p = kmalloc(sizeof(struct tcindex_data),GFP_KERNEL); + if (!p) + return -ENOMEM; + + memset(p, 0, sizeof(*p)); + p->mask = 0xffff; + p->hash = DEFAULT_HASH_SIZE; + p->fall_through = 1; + + tp->root = p; + return 0; +} + + +static int +__tcindex_delete(struct tcf_proto *tp, unsigned long arg, int lock) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; + struct tcindex_filter *f = NULL; + + DPRINTK("tcindex_delete(tp %p,arg 0x%lx),p %p,f %p\n",tp,arg,p,f); + if (p->perfect) { + if (!r->res.class) + return -ENOENT; + } else { + int i; + struct tcindex_filter **walk = NULL; + + for (i = 0; i < p->hash; i++) + for (walk = p->h+i; *walk; walk = &(*walk)->next) + if (&(*walk)->result == r) + goto found; + return -ENOENT; + +found: + f = *walk; + if (lock) + tcf_tree_lock(tp); + *walk = f->next; + if (lock) + tcf_tree_unlock(tp); + } + tcf_unbind_filter(tp, &r->res); + tcf_exts_destroy(tp, &r->exts); + if (f) + kfree(f); + return 0; +} + +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +{ + return __tcindex_delete(tp, arg, 1); +} + +static inline int +valid_perfect_hash(struct tcindex_data *p) +{ + return p->hash > (p->mask >> p->shift); +} + +static int +tcindex_set_parms(struct tcf_proto *tp, unsigned long base, u32 handle, + struct tcindex_data *p, struct tcindex_filter_result *r, + struct rtattr **tb, struct rtattr *est) +{ + int err, balloc = 0; + struct tcindex_filter_result new_filter_result, *old_r = r; + struct tcindex_filter_result cr; + struct tcindex_data cp; + struct tcindex_filter *f = NULL; /* make gcc behave */ + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &tcindex_ext_map); + if (err < 0) + return err; + + memcpy(&cp, p, sizeof(cp)); + memset(&new_filter_result, 0, sizeof(new_filter_result)); + + if (old_r) + memcpy(&cr, r, sizeof(cr)); + else + memset(&cr, 0, sizeof(cr)); + + err = -EINVAL; + if (tb[TCA_TCINDEX_HASH-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_HASH-1]) < sizeof(u32)) + goto errout; + cp.hash = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_HASH-1]); + } + + if (tb[TCA_TCINDEX_MASK-1]) { + if 
(RTA_PAYLOAD(tb[TCA_TCINDEX_MASK-1]) < sizeof(u16)) + goto errout; + cp.mask = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_MASK-1]); + } + + if (tb[TCA_TCINDEX_SHIFT-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_SHIFT-1]) < sizeof(u16)) + goto errout; + cp.shift = *(u16 *) RTA_DATA(tb[TCA_TCINDEX_SHIFT-1]); + } + + err = -EBUSY; + /* Hash already allocated, make sure that we still meet the + * requirements for the allocated hash. + */ + if (cp.perfect) { + if (!valid_perfect_hash(&cp) || + cp.hash > cp.alloc_hash) + goto errout; + } else if (cp.h && cp.hash != cp.alloc_hash) + goto errout; + + err = -EINVAL; + if (tb[TCA_TCINDEX_FALL_THROUGH-1]) { + if (RTA_PAYLOAD(tb[TCA_TCINDEX_FALL_THROUGH-1]) < sizeof(u32)) + goto errout; + cp.fall_through = + *(u32 *) RTA_DATA(tb[TCA_TCINDEX_FALL_THROUGH-1]); + } + + if (!cp.hash) { + /* Hash not specified, use perfect hash if the upper limit + * of the hashing index is below the threshold. + */ + if ((cp.mask >> cp.shift) < PERFECT_HASH_THRESHOLD) + cp.hash = (cp.mask >> cp.shift)+1; + else + cp.hash = DEFAULT_HASH_SIZE; + } + + if (!cp.perfect && !cp.h) + cp.alloc_hash = cp.hash; + + /* Note: this could be as restrictive as if (handle & ~(mask >> shift)) + * but then, we'd fail handles that may become valid after some future + * mask change. While this is extremely unlikely to ever matter, + * the check below is safer (and also more backwards-compatible). + */ + if (cp.perfect || valid_perfect_hash(&cp)) + if (handle >= cp.alloc_hash) + goto errout; + + + err = -ENOMEM; + if (!cp.perfect && !cp.h) { + if (valid_perfect_hash(&cp)) { + cp.perfect = kmalloc(cp.hash * sizeof(*r), GFP_KERNEL); + if (!cp.perfect) + goto errout; + memset(cp.perfect, 0, cp.hash * sizeof(*r)); + balloc = 1; + } else { + cp.h = kmalloc(cp.hash * sizeof(f), GFP_KERNEL); + if (!cp.h) + goto errout; + memset(cp.h, 0, cp.hash * sizeof(f)); + balloc = 2; + } + } + + if (cp.perfect) + r = cp.perfect + handle; + else + r = tcindex_lookup(&cp, handle) ? : &new_filter_result; + + if (r == &new_filter_result) { + f = kmalloc(sizeof(*f), GFP_KERNEL); + if (!f) + goto errout_alloc; + memset(f, 0, sizeof(*f)); + } + + if (tb[TCA_TCINDEX_CLASSID-1]) { + cr.res.classid = *(u32 *) RTA_DATA(tb[TCA_TCINDEX_CLASSID-1]); + tcf_bind_filter(tp, &cr.res, base); + } + + tcf_exts_change(tp, &cr.exts, &e); + + tcf_tree_lock(tp); + if (old_r && old_r != r) + memset(old_r, 0, sizeof(*old_r)); + + memcpy(p, &cp, sizeof(cp)); + memcpy(r, &cr, sizeof(cr)); + + if (r == &new_filter_result) { + struct tcindex_filter **fp; + + f->key = handle; + f->result = new_filter_result; + f->next = NULL; + for (fp = p->h+(handle % p->hash); *fp; fp = &(*fp)->next) + /* nothing */; + *fp = f; + } + tcf_tree_unlock(tp); + + return 0; + +errout_alloc: + if (balloc == 1) + kfree(cp.perfect); + else if (balloc == 2) + kfree(cp.h); +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int +tcindex_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, unsigned long *arg) +{ + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_TCINDEX_MAX]; + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) *arg; + + DPRINTK("tcindex_change(tp %p,handle 0x%08x,tca %p,arg %p),opt %p," + "p %p,r %p,*arg 0x%lx\n", + tp, handle, tca, arg, opt, p, r, arg ? 
*arg : 0L); + + if (!opt) + return 0; + + if (rtattr_parse_nested(tb, TCA_TCINDEX_MAX, opt) < 0) + return -EINVAL; + + return tcindex_set_parms(tp, base, handle, p, r, tb, tca[TCA_RATE-1]); +} + + +static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter *f,*next; + int i; + + DPRINTK("tcindex_walk(tp %p,walker %p),p %p\n",tp,walker,p); + if (p->perfect) { + for (i = 0; i < p->hash; i++) { + if (!p->perfect[i].res.class) + continue; + if (walker->count >= walker->skip) { + if (walker->fn(tp, + (unsigned long) (p->perfect+i), walker) + < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } + if (!p->h) + return; + for (i = 0; i < p->hash; i++) { + for (f = p->h[i]; f; f = next) { + next = f->next; + if (walker->count >= walker->skip) { + if (walker->fn(tp,(unsigned long) &f->result, + walker) < 0) { + walker->stop = 1; + return; + } + } + walker->count++; + } + } +} + + +static int tcindex_destroy_element(struct tcf_proto *tp, + unsigned long arg, struct tcf_walker *walker) +{ + return __tcindex_delete(tp, arg, 0); +} + + +static void tcindex_destroy(struct tcf_proto *tp) +{ + struct tcindex_data *p = PRIV(tp); + struct tcf_walker walker; + + DPRINTK("tcindex_destroy(tp %p),p %p\n",tp,p); + walker.count = 0; + walker.skip = 0; + walker.fn = &tcindex_destroy_element; + tcindex_walk(tp,&walker); + if (p->perfect) + kfree(p->perfect); + if (p->h) + kfree(p->h); + kfree(p); + tp->root = NULL; +} + + +static int tcindex_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tcindex_data *p = PRIV(tp); + struct tcindex_filter_result *r = (struct tcindex_filter_result *) fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("tcindex_dump(tp %p,fh 0x%lx,skb %p,t %p),p %p,r %p,b %p\n", + tp,fh,skb,t,p,r,b); + DPRINTK("p->perfect %p p->h %p\n",p->perfect,p->h); + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + if (!fh) { + t->tcm_handle = ~0; /* whatever ... 
*/ + RTA_PUT(skb,TCA_TCINDEX_HASH,sizeof(p->hash),&p->hash); + RTA_PUT(skb,TCA_TCINDEX_MASK,sizeof(p->mask),&p->mask); + RTA_PUT(skb,TCA_TCINDEX_SHIFT,sizeof(p->shift),&p->shift); + RTA_PUT(skb,TCA_TCINDEX_FALL_THROUGH,sizeof(p->fall_through), + &p->fall_through); + rta->rta_len = skb->tail-b; + } else { + if (p->perfect) { + t->tcm_handle = r-p->perfect; + } else { + struct tcindex_filter *f; + int i; + + t->tcm_handle = 0; + for (i = 0; !t->tcm_handle && i < p->hash; i++) { + for (f = p->h[i]; !t->tcm_handle && f; + f = f->next) { + if (&f->result == r) + t->tcm_handle = f->key; + } + } + } + DPRINTK("handle = %d\n",t->tcm_handle); + if (r->res.class) + RTA_PUT(skb, TCA_TCINDEX_CLASSID, 4, &r->res.classid); + + if (tcf_exts_dump(skb, &r->exts, &tcindex_ext_map) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail-b; + + if (tcf_exts_dump_stats(skb, &r->exts, &tcindex_ext_map) < 0) + goto rtattr_failure; + } + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_tcindex_ops = { + .next = NULL, + .kind = "tcindex", + .classify = tcindex_classify, + .init = tcindex_init, + .destroy = tcindex_destroy, + .get = tcindex_get, + .put = tcindex_put, + .change = tcindex_change, + .delete = tcindex_delete, + .walk = tcindex_walk, + .dump = tcindex_dump, + .owner = THIS_MODULE, +}; + +static int __init init_tcindex(void) +{ + return register_tcf_proto_ops(&cls_tcindex_ops); +} + +static void __exit exit_tcindex(void) +{ + unregister_tcf_proto_ops(&cls_tcindex_ops); +} + +module_init(init_tcindex) +module_exit(exit_tcindex) +MODULE_LICENSE("GPL"); diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c new file mode 100644 index 000000000000..364b87d86455 --- /dev/null +++ b/net/sched/cls_u32.c @@ -0,0 +1,828 @@ +/* + * net/sched/cls_u32.c Ugly (or Universal) 32bit key Packet Classifier. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * The filters are packed to hash tables of key nodes + * with a set of 32bit key/mask pairs at every node. + * Nodes reference next level hash tables etc. + * + * This scheme is the best universal classifier I managed to + * invent; it is not super-fast, but it is not slow (provided you + * program it correctly), and general enough. And its relative + * speed grows as the number of rules becomes larger. + * + * It seems that it represents the best middle point between + * speed and manageability both by human and by machine. + * + * It is especially useful for link sharing combined with QoS; + * pure RSVP doesn't need such a general approach and can use + * much simpler (and faster) schemes, sort of cls_rsvp.c. 
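+ *
+ *	(Editor's sketch, not part of the original comment: ignoring the
+ *	variable offset handling and the hashing, each 32bit key of a node
+ *	is checked roughly as
+ *
+ *		u32 word = *(u32 *)(ptr + key->off);
+ *		if ((word ^ key->val) & key->mask)
+ *			goto next_node_in_bucket;
+ *
+ *	and a node that passes all of its keys either terminates with a
+ *	classid (TC_U32_TERMINAL) or links down into the next level hash
+ *	table, whose bucket is picked by u32_hash_fold().)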
+ * + * JHS: We should remove the CONFIG_NET_CLS_IND from here + * eventually when the meta match extension is made available + * + * nfmark match added by Catalin(ux aka Dino) BOIE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct tc_u_knode +{ + struct tc_u_knode *next; + u32 handle; + struct tc_u_hnode *ht_up; + struct tcf_exts exts; +#ifdef CONFIG_NET_CLS_IND + char indev[IFNAMSIZ]; +#endif + u8 fshift; + struct tcf_result res; + struct tc_u_hnode *ht_down; +#ifdef CONFIG_CLS_U32_PERF + struct tc_u32_pcnt *pf; +#endif +#ifdef CONFIG_CLS_U32_MARK + struct tc_u32_mark mark; +#endif + struct tc_u32_sel sel; +}; + +struct tc_u_hnode +{ + struct tc_u_hnode *next; + u32 handle; + u32 prio; + struct tc_u_common *tp_c; + int refcnt; + unsigned divisor; + struct tc_u_knode *ht[1]; +}; + +struct tc_u_common +{ + struct tc_u_common *next; + struct tc_u_hnode *hlist; + struct Qdisc *q; + int refcnt; + u32 hgenerator; +}; + +static struct tcf_ext_map u32_ext_map = { + .action = TCA_U32_ACT, + .police = TCA_U32_POLICE +}; + +static struct tc_u_common *u32_list; + +static __inline__ unsigned u32_hash_fold(u32 key, struct tc_u32_sel *sel, u8 fshift) +{ + unsigned h = (key & sel->hmask)>>fshift; + + return h; +} + +static int u32_classify(struct sk_buff *skb, struct tcf_proto *tp, struct tcf_result *res) +{ + struct { + struct tc_u_knode *knode; + u8 *ptr; + } stack[TC_U32_MAXDEPTH]; + + struct tc_u_hnode *ht = (struct tc_u_hnode*)tp->root; + u8 *ptr = skb->nh.raw; + struct tc_u_knode *n; + int sdepth = 0; + int off2 = 0; + int sel = 0; +#ifdef CONFIG_CLS_U32_PERF + int j; +#endif + int i, r; + +next_ht: + n = ht->ht[sel]; + +next_knode: + if (n) { + struct tc_u32_key *key = n->sel.keys; + +#ifdef CONFIG_CLS_U32_PERF + n->pf->rcnt +=1; + j = 0; +#endif + +#ifdef CONFIG_CLS_U32_MARK + if ((skb->nfmark & n->mark.mask) != n->mark.val) { + n = n->next; + goto next_knode; + } else { + n->mark.success++; + } +#endif + + for (i = n->sel.nkeys; i>0; i--, key++) { + + if ((*(u32*)(ptr+key->off+(off2&key->offmask))^key->val)&key->mask) { + n = n->next; + goto next_knode; + } +#ifdef CONFIG_CLS_U32_PERF + n->pf->kcnts[j] +=1; + j++; +#endif + } + if (n->ht_down == NULL) { +check_terminal: + if (n->sel.flags&TC_U32_TERMINAL) { + + *res = n->res; +#ifdef CONFIG_NET_CLS_IND + if (!tcf_match_indev(skb, n->indev)) { + n = n->next; + goto next_knode; + } +#endif +#ifdef CONFIG_CLS_U32_PERF + n->pf->rhit +=1; +#endif + r = tcf_exts_exec(skb, &n->exts, res); + if (r < 0) { + n = n->next; + goto next_knode; + } + + return r; + } + n = n->next; + goto next_knode; + } + + /* PUSH */ + if (sdepth >= TC_U32_MAXDEPTH) + goto deadloop; + stack[sdepth].knode = n; + stack[sdepth].ptr = ptr; + sdepth++; + + ht = n->ht_down; + sel = 0; + if (ht->divisor) + sel = ht->divisor&u32_hash_fold(*(u32*)(ptr+n->sel.hoff), &n->sel,n->fshift); + + if (!(n->sel.flags&(TC_U32_VAROFFSET|TC_U32_OFFSET|TC_U32_EAT))) + goto next_ht; + + if (n->sel.flags&(TC_U32_OFFSET|TC_U32_VAROFFSET)) { + off2 = n->sel.off + 3; + if (n->sel.flags&TC_U32_VAROFFSET) + off2 += ntohs(n->sel.offmask & *(u16*)(ptr+n->sel.offoff)) >>n->sel.offshift; + off2 &= ~3; + } + if (n->sel.flags&TC_U32_EAT) { + ptr += off2; + off2 = 0; + } + + if (ptr < skb->tail) + goto next_ht; + } + + /* POP */ + if (sdepth--) { + n = stack[sdepth].knode; + ht 
= n->ht_up; + ptr = stack[sdepth].ptr; + goto check_terminal; + } + return -1; + +deadloop: + if (net_ratelimit()) + printk("cls_u32: dead loop\n"); + return -1; +} + +static __inline__ struct tc_u_hnode * +u32_lookup_ht(struct tc_u_common *tp_c, u32 handle) +{ + struct tc_u_hnode *ht; + + for (ht = tp_c->hlist; ht; ht = ht->next) + if (ht->handle == handle) + break; + + return ht; +} + +static __inline__ struct tc_u_knode * +u32_lookup_key(struct tc_u_hnode *ht, u32 handle) +{ + unsigned sel; + struct tc_u_knode *n = NULL; + + sel = TC_U32_HASH(handle); + if (sel > ht->divisor) + goto out; + + for (n = ht->ht[sel]; n; n = n->next) + if (n->handle == handle) + break; +out: + return n; +} + + +static unsigned long u32_get(struct tcf_proto *tp, u32 handle) +{ + struct tc_u_hnode *ht; + struct tc_u_common *tp_c = tp->data; + + if (TC_U32_HTID(handle) == TC_U32_ROOT) + ht = tp->root; + else + ht = u32_lookup_ht(tp_c, TC_U32_HTID(handle)); + + if (!ht) + return 0; + + if (TC_U32_KEY(handle) == 0) + return (unsigned long)ht; + + return (unsigned long)u32_lookup_key(ht, handle); +} + +static void u32_put(struct tcf_proto *tp, unsigned long f) +{ +} + +static u32 gen_new_htid(struct tc_u_common *tp_c) +{ + int i = 0x800; + + do { + if (++tp_c->hgenerator == 0x7FF) + tp_c->hgenerator = 1; + } while (--i>0 && u32_lookup_ht(tp_c, (tp_c->hgenerator|0x800)<<20)); + + return i > 0 ? (tp_c->hgenerator|0x800)<<20 : 0; +} + +static int u32_init(struct tcf_proto *tp) +{ + struct tc_u_hnode *root_ht; + struct tc_u_common *tp_c; + + for (tp_c = u32_list; tp_c; tp_c = tp_c->next) + if (tp_c->q == tp->q) + break; + + root_ht = kmalloc(sizeof(*root_ht), GFP_KERNEL); + if (root_ht == NULL) + return -ENOBUFS; + + memset(root_ht, 0, sizeof(*root_ht)); + root_ht->divisor = 0; + root_ht->refcnt++; + root_ht->handle = tp_c ? 
gen_new_htid(tp_c) : 0x80000000; + root_ht->prio = tp->prio; + + if (tp_c == NULL) { + tp_c = kmalloc(sizeof(*tp_c), GFP_KERNEL); + if (tp_c == NULL) { + kfree(root_ht); + return -ENOBUFS; + } + memset(tp_c, 0, sizeof(*tp_c)); + tp_c->q = tp->q; + tp_c->next = u32_list; + u32_list = tp_c; + } + + tp_c->refcnt++; + root_ht->next = tp_c->hlist; + tp_c->hlist = root_ht; + root_ht->tp_c = tp_c; + + tp->root = root_ht; + tp->data = tp_c; + return 0; +} + +static int u32_destroy_key(struct tcf_proto *tp, struct tc_u_knode *n) +{ + tcf_unbind_filter(tp, &n->res); + tcf_exts_destroy(tp, &n->exts); + if (n->ht_down) + n->ht_down->refcnt--; +#ifdef CONFIG_CLS_U32_PERF + if (n && (NULL != n->pf)) + kfree(n->pf); +#endif + kfree(n); + return 0; +} + +static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode* key) +{ + struct tc_u_knode **kp; + struct tc_u_hnode *ht = key->ht_up; + + if (ht) { + for (kp = &ht->ht[TC_U32_HASH(key->handle)]; *kp; kp = &(*kp)->next) { + if (*kp == key) { + tcf_tree_lock(tp); + *kp = key->next; + tcf_tree_unlock(tp); + + u32_destroy_key(tp, key); + return 0; + } + } + } + BUG_TRAP(0); + return 0; +} + +static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_knode *n; + unsigned h; + + for (h=0; h<=ht->divisor; h++) { + while ((n = ht->ht[h]) != NULL) { + ht->ht[h] = n->next; + + u32_destroy_key(tp, n); + } + } +} + +static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode **hn; + + BUG_TRAP(!ht->refcnt); + + u32_clear_hnode(tp, ht); + + for (hn = &tp_c->hlist; *hn; hn = &(*hn)->next) { + if (*hn == ht) { + *hn = ht->next; + kfree(ht); + return 0; + } + } + + BUG_TRAP(0); + return -ENOENT; +} + +static void u32_destroy(struct tcf_proto *tp) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *root_ht = xchg(&tp->root, NULL); + + BUG_TRAP(root_ht != NULL); + + if (root_ht && --root_ht->refcnt == 0) + u32_destroy_hnode(tp, root_ht); + + if (--tp_c->refcnt == 0) { + struct tc_u_hnode *ht; + struct tc_u_common **tp_cp; + + for (tp_cp = &u32_list; *tp_cp; tp_cp = &(*tp_cp)->next) { + if (*tp_cp == tp_c) { + *tp_cp = tp_c->next; + break; + } + } + + for (ht=tp_c->hlist; ht; ht = ht->next) + u32_clear_hnode(tp, ht); + + while ((ht = tp_c->hlist) != NULL) { + tp_c->hlist = ht->next; + + BUG_TRAP(ht->refcnt == 0); + + kfree(ht); + }; + + kfree(tp_c); + } + + tp->data = NULL; +} + +static int u32_delete(struct tcf_proto *tp, unsigned long arg) +{ + struct tc_u_hnode *ht = (struct tc_u_hnode*)arg; + + if (ht == NULL) + return 0; + + if (TC_U32_KEY(ht->handle)) + return u32_delete_key(tp, (struct tc_u_knode*)ht); + + if (tp->root == ht) + return -EINVAL; + + if (--ht->refcnt == 0) + u32_destroy_hnode(tp, ht); + + return 0; +} + +static u32 gen_new_kid(struct tc_u_hnode *ht, u32 handle) +{ + struct tc_u_knode *n; + unsigned i = 0x7FF; + + for (n=ht->ht[TC_U32_HASH(handle)]; n; n = n->next) + if (i < TC_U32_NODE(n->handle)) + i = TC_U32_NODE(n->handle); + i++; + + return handle|(i>0xFFF ? 
0xFFF : i); +} + +static int u32_set_parms(struct tcf_proto *tp, unsigned long base, + struct tc_u_hnode *ht, + struct tc_u_knode *n, struct rtattr **tb, + struct rtattr *est) +{ + int err; + struct tcf_exts e; + + err = tcf_exts_validate(tp, tb, est, &e, &u32_ext_map); + if (err < 0) + return err; + + err = -EINVAL; + if (tb[TCA_U32_LINK-1]) { + u32 handle = *(u32*)RTA_DATA(tb[TCA_U32_LINK-1]); + struct tc_u_hnode *ht_down = NULL; + + if (TC_U32_KEY(handle)) + goto errout; + + if (handle) { + ht_down = u32_lookup_ht(ht->tp_c, handle); + + if (ht_down == NULL) + goto errout; + ht_down->refcnt++; + } + + tcf_tree_lock(tp); + ht_down = xchg(&n->ht_down, ht_down); + tcf_tree_unlock(tp); + + if (ht_down) + ht_down->refcnt--; + } + if (tb[TCA_U32_CLASSID-1]) { + n->res.classid = *(u32*)RTA_DATA(tb[TCA_U32_CLASSID-1]); + tcf_bind_filter(tp, &n->res, base); + } + +#ifdef CONFIG_NET_CLS_IND + if (tb[TCA_U32_INDEV-1]) { + int err = tcf_change_indev(tp, n->indev, tb[TCA_U32_INDEV-1]); + if (err < 0) + goto errout; + } +#endif + tcf_exts_change(tp, &n->exts, &e); + + return 0; +errout: + tcf_exts_destroy(tp, &e); + return err; +} + +static int u32_change(struct tcf_proto *tp, unsigned long base, u32 handle, + struct rtattr **tca, + unsigned long *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + struct tc_u32_sel *s; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_U32_MAX]; + u32 htid; + int err; + + if (opt == NULL) + return handle ? -EINVAL : 0; + + if (rtattr_parse_nested(tb, TCA_U32_MAX, opt) < 0) + return -EINVAL; + + if ((n = (struct tc_u_knode*)*arg) != NULL) { + if (TC_U32_KEY(n->handle) == 0) + return -EINVAL; + + return u32_set_parms(tp, base, n->ht_up, n, tb, tca[TCA_RATE-1]); + } + + if (tb[TCA_U32_DIVISOR-1]) { + unsigned divisor = *(unsigned*)RTA_DATA(tb[TCA_U32_DIVISOR-1]); + + if (--divisor > 0x100) + return -EINVAL; + if (TC_U32_KEY(handle)) + return -EINVAL; + if (handle == 0) { + handle = gen_new_htid(tp->data); + if (handle == 0) + return -ENOMEM; + } + ht = kmalloc(sizeof(*ht) + divisor*sizeof(void*), GFP_KERNEL); + if (ht == NULL) + return -ENOBUFS; + memset(ht, 0, sizeof(*ht) + divisor*sizeof(void*)); + ht->tp_c = tp_c; + ht->refcnt = 0; + ht->divisor = divisor; + ht->handle = handle; + ht->prio = tp->prio; + ht->next = tp_c->hlist; + tp_c->hlist = ht; + *arg = (unsigned long)ht; + return 0; + } + + if (tb[TCA_U32_HASH-1]) { + htid = *(unsigned*)RTA_DATA(tb[TCA_U32_HASH-1]); + if (TC_U32_HTID(htid) == TC_U32_ROOT) { + ht = tp->root; + htid = ht->handle; + } else { + ht = u32_lookup_ht(tp->data, TC_U32_HTID(htid)); + if (ht == NULL) + return -EINVAL; + } + } else { + ht = tp->root; + htid = ht->handle; + } + + if (ht->divisor < TC_U32_HASH(htid)) + return -EINVAL; + + if (handle) { + if (TC_U32_HTID(handle) && TC_U32_HTID(handle^htid)) + return -EINVAL; + handle = htid | TC_U32_NODE(handle); + } else + handle = gen_new_kid(ht, htid); + + if (tb[TCA_U32_SEL-1] == 0 || + RTA_PAYLOAD(tb[TCA_U32_SEL-1]) < sizeof(struct tc_u32_sel)) + return -EINVAL; + + s = RTA_DATA(tb[TCA_U32_SEL-1]); + + n = kmalloc(sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key), GFP_KERNEL); + if (n == NULL) + return -ENOBUFS; + + memset(n, 0, sizeof(*n) + s->nkeys*sizeof(struct tc_u32_key)); +#ifdef CONFIG_CLS_U32_PERF + n->pf = kmalloc(sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64), GFP_KERNEL); + if (n->pf == NULL) { + kfree(n); + return -ENOBUFS; + } + memset(n->pf, 0, sizeof(struct tc_u32_pcnt) + s->nkeys*sizeof(u64)); +#endif + + 
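+	/* Editor's note, added for clarity: the code below copies the
+	 * selector together with its trailing key array into the new node
+	 * and derives fshift as the number of trailing zero bits of hmask,
+	 * so that u32_hash_fold() can turn (key & hmask) >> fshift into a
+	 * small bucket index.
+	 */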
memcpy(&n->sel, s, sizeof(*s) + s->nkeys*sizeof(struct tc_u32_key)); + n->ht_up = ht; + n->handle = handle; +{ + u8 i = 0; + u32 mask = s->hmask; + if (mask) { + while (!(mask & 1)) { + i++; + mask>>=1; + } + } + n->fshift = i; +} + +#ifdef CONFIG_CLS_U32_MARK + if (tb[TCA_U32_MARK-1]) { + struct tc_u32_mark *mark; + + if (RTA_PAYLOAD(tb[TCA_U32_MARK-1]) < sizeof(struct tc_u32_mark)) { +#ifdef CONFIG_CLS_U32_PERF + kfree(n->pf); +#endif + kfree(n); + return -EINVAL; + } + mark = RTA_DATA(tb[TCA_U32_MARK-1]); + memcpy(&n->mark, mark, sizeof(struct tc_u32_mark)); + n->mark.success = 0; + } +#endif + + err = u32_set_parms(tp, base, ht, n, tb, tca[TCA_RATE-1]); + if (err == 0) { + struct tc_u_knode **ins; + for (ins = &ht->ht[TC_U32_HASH(handle)]; *ins; ins = &(*ins)->next) + if (TC_U32_NODE(handle) < TC_U32_NODE((*ins)->handle)) + break; + + n->next = *ins; + wmb(); + *ins = n; + + *arg = (unsigned long)n; + return 0; + } +#ifdef CONFIG_CLS_U32_PERF + if (n && (NULL != n->pf)) + kfree(n->pf); +#endif + kfree(n); + return err; +} + +static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg) +{ + struct tc_u_common *tp_c = tp->data; + struct tc_u_hnode *ht; + struct tc_u_knode *n; + unsigned h; + + if (arg->stop) + return; + + for (ht = tp_c->hlist; ht; ht = ht->next) { + if (ht->prio != tp->prio) + continue; + if (arg->count >= arg->skip) { + if (arg->fn(tp, (unsigned long)ht, arg) < 0) { + arg->stop = 1; + return; + } + } + arg->count++; + for (h = 0; h <= ht->divisor; h++) { + for (n = ht->ht[h]; n; n = n->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(tp, (unsigned long)n, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } + } +} + +static int u32_dump(struct tcf_proto *tp, unsigned long fh, + struct sk_buff *skb, struct tcmsg *t) +{ + struct tc_u_knode *n = (struct tc_u_knode*)fh; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (n == NULL) + return skb->len; + + t->tcm_handle = n->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + if (TC_U32_KEY(n->handle) == 0) { + struct tc_u_hnode *ht = (struct tc_u_hnode*)fh; + u32 divisor = ht->divisor+1; + RTA_PUT(skb, TCA_U32_DIVISOR, 4, &divisor); + } else { + RTA_PUT(skb, TCA_U32_SEL, + sizeof(n->sel) + n->sel.nkeys*sizeof(struct tc_u32_key), + &n->sel); + if (n->ht_up) { + u32 htid = n->handle & 0xFFFFF000; + RTA_PUT(skb, TCA_U32_HASH, 4, &htid); + } + if (n->res.classid) + RTA_PUT(skb, TCA_U32_CLASSID, 4, &n->res.classid); + if (n->ht_down) + RTA_PUT(skb, TCA_U32_LINK, 4, &n->ht_down->handle); + +#ifdef CONFIG_CLS_U32_MARK + if (n->mark.val || n->mark.mask) + RTA_PUT(skb, TCA_U32_MARK, sizeof(n->mark), &n->mark); +#endif + + if (tcf_exts_dump(skb, &n->exts, &u32_ext_map) < 0) + goto rtattr_failure; + +#ifdef CONFIG_NET_CLS_IND + if(strlen(n->indev)) + RTA_PUT(skb, TCA_U32_INDEV, IFNAMSIZ, n->indev); +#endif +#ifdef CONFIG_CLS_U32_PERF + RTA_PUT(skb, TCA_U32_PCNT, + sizeof(struct tc_u32_pcnt) + n->sel.nkeys*sizeof(u64), + n->pf); +#endif + } + + rta->rta_len = skb->tail - b; + if (TC_U32_KEY(n->handle)) + if (tcf_exts_dump_stats(skb, &n->exts, &u32_ext_map) < 0) + goto rtattr_failure; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tcf_proto_ops cls_u32_ops = { + .next = NULL, + .kind = "u32", + .classify = u32_classify, + .init = u32_init, + .destroy = u32_destroy, + .get = u32_get, + .put = u32_put, + .change = u32_change, + .delete = u32_delete, + .walk = u32_walk, + .dump = u32_dump, + 
.owner = THIS_MODULE, +}; + +static int __init init_u32(void) +{ + printk("u32 classifier\n"); +#ifdef CONFIG_CLS_U32_PERF + printk(" Perfomance counters on\n"); +#endif +#ifdef CONFIG_NET_CLS_POLICE + printk(" OLD policer on \n"); +#endif +#ifdef CONFIG_NET_CLS_IND + printk(" input device check on \n"); +#endif +#ifdef CONFIG_NET_CLS_ACT + printk(" Actions configured \n"); +#endif + return register_tcf_proto_ops(&cls_u32_ops); +} + +static void __exit exit_u32(void) +{ + unregister_tcf_proto_ops(&cls_u32_ops); +} + +module_init(init_u32) +module_exit(exit_u32) +MODULE_LICENSE("GPL"); diff --git a/net/sched/em_cmp.c b/net/sched/em_cmp.c new file mode 100644 index 000000000000..bf1f00f8b1bf --- /dev/null +++ b/net/sched/em_cmp.c @@ -0,0 +1,101 @@ +/* + * net/sched/em_cmp.c Simple packet data comparison ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include + +static inline int cmp_needs_transformation(struct tcf_em_cmp *cmp) +{ + return unlikely(cmp->flags & TCF_EM_CMP_TRANS); +} + +static int em_cmp_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct tcf_em_cmp *cmp = (struct tcf_em_cmp *) em->data; + unsigned char *ptr = tcf_get_base_ptr(skb, cmp->layer) + cmp->off; + u32 val = 0; + + if (!tcf_valid_offset(skb, ptr, cmp->align)) + return 0; + + switch (cmp->align) { + case TCF_EM_ALIGN_U8: + val = *ptr; + break; + + case TCF_EM_ALIGN_U16: + val = *ptr << 8; + val |= *(ptr+1); + + if (cmp_needs_transformation(cmp)) + val = be16_to_cpu(val); + break; + + case TCF_EM_ALIGN_U32: + /* Worth checking boundries? The branching seems + * to get worse. Visit again. */ + val = *ptr << 24; + val |= *(ptr+1) << 16; + val |= *(ptr+2) << 8; + val |= *(ptr+3); + + if (cmp_needs_transformation(cmp)) + val = be32_to_cpu(val); + break; + + default: + return 0; + } + + if (cmp->mask) + val &= cmp->mask; + + switch (cmp->opnd) { + case TCF_EM_OPND_EQ: + return val == cmp->val; + case TCF_EM_OPND_LT: + return val < cmp->val; + case TCF_EM_OPND_GT: + return val > cmp->val; + } + + return 0; +} + +static struct tcf_ematch_ops em_cmp_ops = { + .kind = TCF_EM_CMP, + .datalen = sizeof(struct tcf_em_cmp), + .match = em_cmp_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_cmp_ops.link) +}; + +static int __init init_em_cmp(void) +{ + return tcf_em_register(&em_cmp_ops); +} + +static void __exit exit_em_cmp(void) +{ + tcf_em_unregister(&em_cmp_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_cmp); +module_exit(exit_em_cmp); + diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c new file mode 100644 index 000000000000..f1eeaf65cee5 --- /dev/null +++ b/net/sched/em_meta.c @@ -0,0 +1,661 @@ +/* + * net/sched/em_meta.c Metadata ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Thomas Graf + * + * ========================================================================== + * + * The metadata ematch compares two meta objects where each object + * represents either a meta value stored in the kernel or a static + * value provided by userspace. The objects are not provided by + * userspace itself but rather a definition providing the information + * to build them. Every object is of a certain type which must be + * equal to the object it is being compared to. + * + * The definition of a objects conists of the type (meta type), a + * identifier (meta id) and additional type specific information. + * The meta id is either TCF_META_TYPE_VALUE for values provided by + * userspace or a index to the meta operations table consisting of + * function pointers to type specific meta data collectors returning + * the value of the requested meta value. + * + * lvalue rvalue + * +-----------+ +-----------+ + * | type: INT | | type: INT | + * def | id: INDEV | | id: VALUE | + * | data: | | data: 3 | + * +-----------+ +-----------+ + * | | + * ---> meta_ops[INT][INDEV](...) | + * | | + * ----------- | + * V V + * +-----------+ +-----------+ + * | type: INT | | type: INT | + * obj | id: INDEV | | id: VALUE | + * | data: 2 |<--data got filled out | data: 3 | + * +-----------+ +-----------+ + * | | + * --------------> 2 equals 3 <-------------- + * + * This is a simplified schema, the complexity varies depending + * on the meta type. Obviously, the length of the data must also + * be provided for non-numeric types. + * + * Additionaly, type dependant modifiers such as shift operators + * or mask may be applied to extend the functionaliy. As of now, + * the variable length type supports shifting the byte string to + * the right, eating up any number of octets and thus supporting + * wildcard interface name comparisons such as "ppp%" matching + * ppp0..9. + * + * NOTE: Certain meta values depend on other subsystems and are + * only available if that subsytem is enabled in the kernel. 
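+ *
+ *	(Editor's sketch, not part of the original comment: with the error
+ *	handling stripped, a single match in em_meta_match() boils down to
+ *
+ *		meta_get(skb, info, &meta->lvalue, &l_value);
+ *		meta_get(skb, info, &meta->rvalue, &r_value);
+ *		r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value);
+ *
+ *	after which the configured operand decides the verdict, e.g.
+ *	TCF_EM_OPND_EQ returns !r.)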
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct meta_obj +{ + unsigned long value; + unsigned int len; +}; + +struct meta_value +{ + struct tcf_meta_val hdr; + unsigned long val; + unsigned int len; +}; + +struct meta_match +{ + struct meta_value lvalue; + struct meta_value rvalue; +}; + +static inline int meta_id(struct meta_value *v) +{ + return TCF_META_ID(v->hdr.kind); +} + +static inline int meta_type(struct meta_value *v) +{ + return TCF_META_TYPE(v->hdr.kind); +} + +#define META_COLLECTOR(FUNC) static void meta_##FUNC(struct sk_buff *skb, \ + struct tcf_pkt_info *info, struct meta_value *v, \ + struct meta_obj *dst, int *err) + +/************************************************************************** + * System status & misc + **************************************************************************/ + +META_COLLECTOR(int_random) +{ + get_random_bytes(&dst->value, sizeof(dst->value)); +} + +static inline unsigned long fixed_loadavg(int load) +{ + int rnd_load = load + (FIXED_1/200); + int rnd_frac = ((rnd_load & (FIXED_1-1)) * 100) >> FSHIFT; + + return ((rnd_load >> FSHIFT) * 100) + rnd_frac; +} + +META_COLLECTOR(int_loadavg_0) +{ + dst->value = fixed_loadavg(avenrun[0]); +} + +META_COLLECTOR(int_loadavg_1) +{ + dst->value = fixed_loadavg(avenrun[1]); +} + +META_COLLECTOR(int_loadavg_2) +{ + dst->value = fixed_loadavg(avenrun[2]); +} + +/************************************************************************** + * Device names & indices + **************************************************************************/ + +static inline int int_dev(struct net_device *dev, struct meta_obj *dst) +{ + if (unlikely(dev == NULL)) + return -1; + + dst->value = dev->ifindex; + return 0; +} + +static inline int var_dev(struct net_device *dev, struct meta_obj *dst) +{ + if (unlikely(dev == NULL)) + return -1; + + dst->value = (unsigned long) dev->name; + dst->len = strlen(dev->name); + return 0; +} + +META_COLLECTOR(int_dev) +{ + *err = int_dev(skb->dev, dst); +} + +META_COLLECTOR(var_dev) +{ + *err = var_dev(skb->dev, dst); +} + +META_COLLECTOR(int_indev) +{ + *err = int_dev(skb->input_dev, dst); +} + +META_COLLECTOR(var_indev) +{ + *err = var_dev(skb->input_dev, dst); +} + +META_COLLECTOR(int_realdev) +{ + *err = int_dev(skb->real_dev, dst); +} + +META_COLLECTOR(var_realdev) +{ + *err = var_dev(skb->real_dev, dst); +} + +/************************************************************************** + * skb attributes + **************************************************************************/ + +META_COLLECTOR(int_priority) +{ + dst->value = skb->priority; +} + +META_COLLECTOR(int_protocol) +{ + /* Let userspace take care of the byte ordering */ + dst->value = skb->protocol; +} + +META_COLLECTOR(int_security) +{ + dst->value = skb->security; +} + +META_COLLECTOR(int_pkttype) +{ + dst->value = skb->pkt_type; +} + +META_COLLECTOR(int_pktlen) +{ + dst->value = skb->len; +} + +META_COLLECTOR(int_datalen) +{ + dst->value = skb->data_len; +} + +META_COLLECTOR(int_maclen) +{ + dst->value = skb->mac_len; +} + +/************************************************************************** + * Netfilter + **************************************************************************/ + +#ifdef CONFIG_NETFILTER +META_COLLECTOR(int_nfmark) +{ + dst->value = skb->nfmark; +} +#endif + +/************************************************************************** + * Traffic Control + 
**************************************************************************/ + +META_COLLECTOR(int_tcindex) +{ + dst->value = skb->tc_index; +} + +#ifdef CONFIG_NET_CLS_ACT +META_COLLECTOR(int_tcverd) +{ + dst->value = skb->tc_verd; +} + +META_COLLECTOR(int_tcclassid) +{ + dst->value = skb->tc_classid; +} +#endif + +/************************************************************************** + * Routing + **************************************************************************/ + +#ifdef CONFIG_NET_CLS_ROUTE +META_COLLECTOR(int_rtclassid) +{ + if (unlikely(skb->dst == NULL)) + *err = -1; + else + dst->value = skb->dst->tclassid; +} +#endif + +META_COLLECTOR(int_rtiif) +{ + if (unlikely(skb->dst == NULL)) + *err = -1; + else + dst->value = ((struct rtable*) skb->dst)->fl.iif; +} + +/************************************************************************** + * Meta value collectors assignment table + **************************************************************************/ + +struct meta_ops +{ + void (*get)(struct sk_buff *, struct tcf_pkt_info *, + struct meta_value *, struct meta_obj *, int *); +}; + +/* Meta value operations table listing all meta value collectors and + * assigns them to a type and meta id. */ +static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { + [TCF_META_TYPE_VAR] = { + [TCF_META_ID_DEV] = { .get = meta_var_dev }, + [TCF_META_ID_INDEV] = { .get = meta_var_indev }, + [TCF_META_ID_REALDEV] = { .get = meta_var_realdev } + }, + [TCF_META_TYPE_INT] = { + [TCF_META_ID_RANDOM] = { .get = meta_int_random }, + [TCF_META_ID_LOADAVG_0] = { .get = meta_int_loadavg_0 }, + [TCF_META_ID_LOADAVG_1] = { .get = meta_int_loadavg_1 }, + [TCF_META_ID_LOADAVG_2] = { .get = meta_int_loadavg_2 }, + [TCF_META_ID_DEV] = { .get = meta_int_dev }, + [TCF_META_ID_INDEV] = { .get = meta_int_indev }, + [TCF_META_ID_REALDEV] = { .get = meta_int_realdev }, + [TCF_META_ID_PRIORITY] = { .get = meta_int_priority }, + [TCF_META_ID_PROTOCOL] = { .get = meta_int_protocol }, + [TCF_META_ID_SECURITY] = { .get = meta_int_security }, + [TCF_META_ID_PKTTYPE] = { .get = meta_int_pkttype }, + [TCF_META_ID_PKTLEN] = { .get = meta_int_pktlen }, + [TCF_META_ID_DATALEN] = { .get = meta_int_datalen }, + [TCF_META_ID_MACLEN] = { .get = meta_int_maclen }, +#ifdef CONFIG_NETFILTER + [TCF_META_ID_NFMARK] = { .get = meta_int_nfmark }, +#endif + [TCF_META_ID_TCINDEX] = { .get = meta_int_tcindex }, +#ifdef CONFIG_NET_CLS_ACT + [TCF_META_ID_TCVERDICT] = { .get = meta_int_tcverd }, + [TCF_META_ID_TCCLASSID] = { .get = meta_int_tcclassid }, +#endif +#ifdef CONFIG_NET_CLS_ROUTE + [TCF_META_ID_RTCLASSID] = { .get = meta_int_rtclassid }, +#endif + [TCF_META_ID_RTIIF] = { .get = meta_int_rtiif } + } +}; + +static inline struct meta_ops * meta_ops(struct meta_value *val) +{ + return &__meta_ops[meta_type(val)][meta_id(val)]; +} + +/************************************************************************** + * Type specific operations for TCF_META_TYPE_VAR + **************************************************************************/ + +static int meta_var_compare(struct meta_obj *a, struct meta_obj *b) +{ + int r = a->len - b->len; + + if (r == 0) + r = memcmp((void *) a->value, (void *) b->value, a->len); + + return r; +} + +static int meta_var_change(struct meta_value *dst, struct rtattr *rta) +{ + int len = RTA_PAYLOAD(rta); + + dst->val = (unsigned long) kmalloc(len, GFP_KERNEL); + if (dst->val == 0UL) + return -ENOMEM; + memcpy((void *) dst->val, RTA_DATA(rta), len); + dst->len = len; + return 0; +} + 
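+/* Editor's note, my reading of meta_var_compare()/meta_var_apply_extras()
+ * rather than original documentation: the "ppp%" example from the header
+ * comment works because a left operand such as the interface name "ppp0"
+ * (len 4) configured with shift 1 is trimmed to len 3 by the extras,
+ * which then equals the length of the constant right operand "ppp" and
+ * lets the 3 byte memcmp() succeed for ppp0..ppp9.
+ */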
+static void meta_var_destroy(struct meta_value *v) +{ + if (v->val) + kfree((void *) v->val); +} + +static void meta_var_apply_extras(struct meta_value *v, + struct meta_obj *dst) +{ + int shift = v->hdr.shift; + + if (shift && shift < dst->len) + dst->len -= shift; +} + +static int meta_var_dump(struct sk_buff *skb, struct meta_value *v, int tlv) +{ + if (v->val && v->len) + RTA_PUT(skb, tlv, v->len, (void *) v->val); + return 0; + +rtattr_failure: + return -1; +} + +/************************************************************************** + * Type specific operations for TCF_META_TYPE_INT + **************************************************************************/ + +static int meta_int_compare(struct meta_obj *a, struct meta_obj *b) +{ + /* Let gcc optimize it, the unlikely is not really based on + * some numbers but jump free code for mismatches seems + * more logical. */ + if (unlikely(a == b)) + return 0; + else if (a < b) + return -1; + else + return 1; +} + +static int meta_int_change(struct meta_value *dst, struct rtattr *rta) +{ + if (RTA_PAYLOAD(rta) >= sizeof(unsigned long)) { + dst->val = *(unsigned long *) RTA_DATA(rta); + dst->len = sizeof(unsigned long); + } else if (RTA_PAYLOAD(rta) == sizeof(u32)) { + dst->val = *(u32 *) RTA_DATA(rta); + dst->len = sizeof(u32); + } else + return -EINVAL; + + return 0; +} + +static void meta_int_apply_extras(struct meta_value *v, + struct meta_obj *dst) +{ + if (v->hdr.shift) + dst->value >>= v->hdr.shift; + + if (v->val) + dst->value &= v->val; +} + +static int meta_int_dump(struct sk_buff *skb, struct meta_value *v, int tlv) +{ + if (v->len == sizeof(unsigned long)) + RTA_PUT(skb, tlv, sizeof(unsigned long), &v->val); + else if (v->len == sizeof(u32)) { + u32 d = v->val; + RTA_PUT(skb, tlv, sizeof(d), &d); + } + + return 0; + +rtattr_failure: + return -1; +} + +/************************************************************************** + * Type specific operations table + **************************************************************************/ + +struct meta_type_ops +{ + void (*destroy)(struct meta_value *); + int (*compare)(struct meta_obj *, struct meta_obj *); + int (*change)(struct meta_value *, struct rtattr *); + void (*apply_extras)(struct meta_value *, struct meta_obj *); + int (*dump)(struct sk_buff *, struct meta_value *, int); +}; + +static struct meta_type_ops __meta_type_ops[TCF_META_TYPE_MAX+1] = { + [TCF_META_TYPE_VAR] = { + .destroy = meta_var_destroy, + .compare = meta_var_compare, + .change = meta_var_change, + .apply_extras = meta_var_apply_extras, + .dump = meta_var_dump + }, + [TCF_META_TYPE_INT] = { + .compare = meta_int_compare, + .change = meta_int_change, + .apply_extras = meta_int_apply_extras, + .dump = meta_int_dump + } +}; + +static inline struct meta_type_ops * meta_type_ops(struct meta_value *v) +{ + return &__meta_type_ops[meta_type(v)]; +} + +/************************************************************************** + * Core + **************************************************************************/ + +static inline int meta_get(struct sk_buff *skb, struct tcf_pkt_info *info, + struct meta_value *v, struct meta_obj *dst) +{ + int err = 0; + + if (meta_id(v) == TCF_META_ID_VALUE) { + dst->value = v->val; + dst->len = v->len; + return 0; + } + + meta_ops(v)->get(skb, info, v, dst, &err); + if (err < 0) + return err; + + if (meta_type_ops(v)->apply_extras) + meta_type_ops(v)->apply_extras(v, dst); + + return 0; +} + +static int em_meta_match(struct sk_buff *skb, struct tcf_ematch *m, + struct 
tcf_pkt_info *info) +{ + int r; + struct meta_match *meta = (struct meta_match *) m->data; + struct meta_obj l_value, r_value; + + if (meta_get(skb, info, &meta->lvalue, &l_value) < 0 || + meta_get(skb, info, &meta->rvalue, &r_value) < 0) + return 0; + + r = meta_type_ops(&meta->lvalue)->compare(&l_value, &r_value); + + switch (meta->lvalue.hdr.op) { + case TCF_EM_OPND_EQ: + return !r; + case TCF_EM_OPND_LT: + return r < 0; + case TCF_EM_OPND_GT: + return r > 0; + } + + return 0; +} + +static inline void meta_delete(struct meta_match *meta) +{ + struct meta_type_ops *ops = meta_type_ops(&meta->lvalue); + + if (ops && ops->destroy) { + ops->destroy(&meta->lvalue); + ops->destroy(&meta->rvalue); + } + + kfree(meta); +} + +static inline int meta_change_data(struct meta_value *dst, struct rtattr *rta) +{ + if (rta) { + if (RTA_PAYLOAD(rta) == 0) + return -EINVAL; + + return meta_type_ops(dst)->change(dst, rta); + } + + return 0; +} + +static inline int meta_is_supported(struct meta_value *val) +{ + return (!meta_id(val) || meta_ops(val)->get); +} + +static int em_meta_change(struct tcf_proto *tp, void *data, int len, + struct tcf_ematch *m) +{ + int err = -EINVAL; + struct rtattr *tb[TCA_EM_META_MAX]; + struct tcf_meta_hdr *hdr; + struct meta_match *meta = NULL; + + if (rtattr_parse(tb, TCA_EM_META_MAX, data, len) < 0) + goto errout; + + if (tb[TCA_EM_META_HDR-1] == NULL || + RTA_PAYLOAD(tb[TCA_EM_META_HDR-1]) < sizeof(*hdr)) + goto errout; + hdr = RTA_DATA(tb[TCA_EM_META_HDR-1]); + + if (TCF_META_TYPE(hdr->left.kind) != TCF_META_TYPE(hdr->right.kind) || + TCF_META_TYPE(hdr->left.kind) > TCF_META_TYPE_MAX || + TCF_META_ID(hdr->left.kind) > TCF_META_ID_MAX || + TCF_META_ID(hdr->right.kind) > TCF_META_ID_MAX) + goto errout; + + meta = kmalloc(sizeof(*meta), GFP_KERNEL); + if (meta == NULL) + goto errout; + memset(meta, 0, sizeof(*meta)); + + memcpy(&meta->lvalue.hdr, &hdr->left, sizeof(hdr->left)); + memcpy(&meta->rvalue.hdr, &hdr->right, sizeof(hdr->right)); + + if (!meta_is_supported(&meta->lvalue) || + !meta_is_supported(&meta->rvalue)) { + err = -EOPNOTSUPP; + goto errout; + } + + if (meta_change_data(&meta->lvalue, tb[TCA_EM_META_LVALUE-1]) < 0 || + meta_change_data(&meta->rvalue, tb[TCA_EM_META_RVALUE-1]) < 0) + goto errout; + + m->datalen = sizeof(*meta); + m->data = (unsigned long) meta; + + err = 0; +errout: + if (err && meta) + meta_delete(meta); + return err; +} + +static void em_meta_destroy(struct tcf_proto *tp, struct tcf_ematch *m) +{ + if (m) + meta_delete((struct meta_match *) m->data); +} + +static int em_meta_dump(struct sk_buff *skb, struct tcf_ematch *em) +{ + struct meta_match *meta = (struct meta_match *) em->data; + struct tcf_meta_hdr hdr; + struct meta_type_ops *ops; + + memset(&hdr, 0, sizeof(hdr)); + memcpy(&hdr.left, &meta->lvalue.hdr, sizeof(hdr.left)); + memcpy(&hdr.right, &meta->rvalue.hdr, sizeof(hdr.right)); + + RTA_PUT(skb, TCA_EM_META_HDR, sizeof(hdr), &hdr); + + ops = meta_type_ops(&meta->lvalue); + if (ops->dump(skb, &meta->lvalue, TCA_EM_META_LVALUE) < 0 || + ops->dump(skb, &meta->rvalue, TCA_EM_META_RVALUE) < 0) + goto rtattr_failure; + + return 0; + +rtattr_failure: + return -1; +} + +static struct tcf_ematch_ops em_meta_ops = { + .kind = TCF_EM_META, + .change = em_meta_change, + .match = em_meta_match, + .destroy = em_meta_destroy, + .dump = em_meta_dump, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_meta_ops.link) +}; + +static int __init init_em_meta(void) +{ + return tcf_em_register(&em_meta_ops); +} + +static void __exit exit_em_meta(void) +{ 
+ tcf_em_unregister(&em_meta_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_meta); +module_exit(exit_em_meta); diff --git a/net/sched/em_nbyte.c b/net/sched/em_nbyte.c new file mode 100644 index 000000000000..71ea926a9f09 --- /dev/null +++ b/net/sched/em_nbyte.c @@ -0,0 +1,82 @@ +/* + * net/sched/em_nbyte.c N-Byte ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct nbyte_data +{ + struct tcf_em_nbyte hdr; + char pattern[0]; +}; + +static int em_nbyte_change(struct tcf_proto *tp, void *data, int data_len, + struct tcf_ematch *em) +{ + struct tcf_em_nbyte *nbyte = data; + + if (data_len < sizeof(*nbyte) || + data_len < (sizeof(*nbyte) + nbyte->len)) + return -EINVAL; + + em->datalen = sizeof(*nbyte) + nbyte->len; + em->data = (unsigned long) kmalloc(em->datalen, GFP_KERNEL); + if (em->data == 0UL) + return -ENOBUFS; + + memcpy((void *) em->data, data, em->datalen); + + return 0; +} + +static int em_nbyte_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct nbyte_data *nbyte = (struct nbyte_data *) em->data; + unsigned char *ptr = tcf_get_base_ptr(skb, nbyte->hdr.layer); + + ptr += nbyte->hdr.off; + + if (!tcf_valid_offset(skb, ptr, nbyte->hdr.len)) + return 0; + + return !memcmp(ptr + nbyte->hdr.off, nbyte->pattern, nbyte->hdr.len); +} + +static struct tcf_ematch_ops em_nbyte_ops = { + .kind = TCF_EM_NBYTE, + .change = em_nbyte_change, + .match = em_nbyte_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_nbyte_ops.link) +}; + +static int __init init_em_nbyte(void) +{ + return tcf_em_register(&em_nbyte_ops); +} + +static void __exit exit_em_nbyte(void) +{ + tcf_em_unregister(&em_nbyte_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_nbyte); +module_exit(exit_em_nbyte); diff --git a/net/sched/em_u32.c b/net/sched/em_u32.c new file mode 100644 index 000000000000..34e7e51e601e --- /dev/null +++ b/net/sched/em_u32.c @@ -0,0 +1,63 @@ +/* + * net/sched/em_u32.c U32 Ematch + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Thomas Graf + * Alexey Kuznetsov, + * + * Based on net/sched/cls_u32.c + */ + +#include +#include +#include +#include +#include +#include + +static int em_u32_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + struct tc_u32_key *key = (struct tc_u32_key *) em->data; + unsigned char *ptr = skb->nh.raw; + + if (info) { + if (info->ptr) + ptr = info->ptr; + ptr += (info->nexthdr & key->offmask); + } + + ptr += key->off; + + if (!tcf_valid_offset(skb, ptr, sizeof(u32))) + return 0; + + return !(((*(u32*) ptr) ^ key->val) & key->mask); +} + +static struct tcf_ematch_ops em_u32_ops = { + .kind = TCF_EM_U32, + .datalen = sizeof(struct tc_u32_key), + .match = em_u32_match, + .owner = THIS_MODULE, + .link = LIST_HEAD_INIT(em_u32_ops.link) +}; + +static int __init init_em_u32(void) +{ + return tcf_em_register(&em_u32_ops); +} + +static void __exit exit_em_u32(void) +{ + tcf_em_unregister(&em_u32_ops); +} + +MODULE_LICENSE("GPL"); + +module_init(init_em_u32); +module_exit(exit_em_u32); diff --git a/net/sched/ematch.c b/net/sched/ematch.c new file mode 100644 index 000000000000..ebfe2e7d21bd --- /dev/null +++ b/net/sched/ematch.c @@ -0,0 +1,524 @@ +/* + * net/sched/ematch.c Extended Match API + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Thomas Graf + * + * ========================================================================== + * + * An extended match (ematch) is a small classification tool not worth + * writing a full classifier for. Ematches can be interconnected to form + * a logic expression and get attached to classifiers to extend their + * functionatlity. + * + * The userspace part transforms the logic expressions into an array + * consisting of multiple sequences of interconnected ematches separated + * by markers. Precedence is implemented by a special ematch kind + * referencing a sequence beyond the marker of the current sequence + * causing the current position in the sequence to be pushed onto a stack + * to allow the current position to be overwritten by the position referenced + * in the special ematch. Matching continues in the new sequence until a + * marker is reached causing the position to be restored from the stack. + * + * Example: + * A AND (B1 OR B2) AND C AND D + * + * ------->-PUSH------- + * -->-- / -->-- \ -->-- + * / \ / / \ \ / \ + * +-------+-------+-------+-------+-------+--------+ + * | A AND | B AND | C AND | D END | B1 OR | B2 END | + * +-------+-------+-------+-------+-------+--------+ + * \ / + * --------<-POP--------- + * + * where B is a virtual ematch referencing to sequence starting with B1. + * + * ========================================================================== + * + * How to write an ematch in 60 seconds + * ------------------------------------ + * + * 1) Provide a matcher function: + * static int my_match(struct sk_buff *skb, struct tcf_ematch *m, + * struct tcf_pkt_info *info) + * { + * struct mydata *d = (struct mydata *) m->data; + * + * if (...matching goes here...) 
+ * return 1; + * else + * return 0; + * } + * + * 2) Fill out a struct tcf_ematch_ops: + * static struct tcf_ematch_ops my_ops = { + * .kind = unique id, + * .datalen = sizeof(struct mydata), + * .match = my_match, + * .owner = THIS_MODULE, + * }; + * + * 3) Register/Unregister your ematch: + * static int __init init_my_ematch(void) + * { + * return tcf_em_register(&my_ops); + * } + * + * static void __exit exit_my_ematch(void) + * { + * return tcf_em_unregister(&my_ops); + * } + * + * module_init(init_my_ematch); + * module_exit(exit_my_ematch); + * + * 4) By now you should have two more seconds left, barely enough to + * open up a beer to watch the compilation going. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static LIST_HEAD(ematch_ops); +static DEFINE_RWLOCK(ematch_mod_lock); + +static inline struct tcf_ematch_ops * tcf_em_lookup(u16 kind) +{ + struct tcf_ematch_ops *e = NULL; + + read_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) { + if (kind == e->kind) { + if (!try_module_get(e->owner)) + e = NULL; + read_unlock(&ematch_mod_lock); + return e; + } + } + read_unlock(&ematch_mod_lock); + + return NULL; +} + +/** + * tcf_em_register - register an extended match + * + * @ops: ematch operations lookup table + * + * This function must be called by ematches to announce their presence. + * The given @ops must have kind set to a unique identifier and the + * callback match() must be implemented. All other callbacks are optional + * and a fallback implementation is used instead. + * + * Returns -EEXIST if an ematch of the same kind is already registered. + */ +int tcf_em_register(struct tcf_ematch_ops *ops) +{ + int err = -EEXIST; + struct tcf_ematch_ops *e; + + if (ops->match == NULL) + return -EINVAL; + + write_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) + if (ops->kind == e->kind) + goto errout; + + list_add_tail(&ops->link, &ematch_ops); + err = 0; +errout: + write_unlock(&ematch_mod_lock); + return err; +} + +/** + * tcf_em_unregister - unregister an extended match + * + * @ops: ematch operations lookup table + * + * This function must be called by ematches to announce their disappearance, + * for example when the module gets unloaded. The @ops parameter must be + * the same as the one used for registration. + * + * Returns -ENOENT if no matching ematch was found. + */ +int tcf_em_unregister(struct tcf_ematch_ops *ops) +{ + int err = 0; + struct tcf_ematch_ops *e; + + write_lock(&ematch_mod_lock); + list_for_each_entry(e, &ematch_ops, link) { + if (e == ops) { + list_del(&e->link); + goto out; + } + } + + err = -ENOENT; +out: + write_unlock(&ematch_mod_lock); + return err; +} + +static inline struct tcf_ematch * tcf_em_get_match(struct tcf_ematch_tree *tree, + int index) +{ + return &tree->matches[index]; +} + + +static int tcf_em_validate(struct tcf_proto *tp, + struct tcf_ematch_tree_hdr *tree_hdr, + struct tcf_ematch *em, struct rtattr *rta, int idx) +{ + int err = -EINVAL; + struct tcf_ematch_hdr *em_hdr = RTA_DATA(rta); + int data_len = RTA_PAYLOAD(rta) - sizeof(*em_hdr); + void *data = (void *) em_hdr + sizeof(*em_hdr); + + if (!TCF_EM_REL_VALID(em_hdr->flags)) + goto errout; + + if (em_hdr->kind == TCF_EM_CONTAINER) { + /* Special ematch called "container", carries an index + * referencing an external ematch sequence.
*/ + u32 ref; + + if (data_len < sizeof(ref)) + goto errout; + ref = *(u32 *) data; + + if (ref >= tree_hdr->nmatches) + goto errout; + + /* We do not allow backward jumps to avoid loops; jumps + * to our own position are of course illegal. */ + if (ref <= idx) + goto errout; + + + em->data = ref; + } else { + /* Note: This lookup will increase the module refcnt + * of the ematch module referenced. In case of a failure, + * a destroy function is called by the underlying layer + * which automatically releases the reference again, therefore + * the module MUST not be given back under any circumstances + * here. Be aware, the destroy function assumes that the + * module is held if the ops field is non-zero. */ + em->ops = tcf_em_lookup(em_hdr->kind); + + if (em->ops == NULL) { + err = -ENOENT; + goto errout; + } + + /* ematch module provides expected length of data, so we + * can do a basic sanity check. */ + if (em->ops->datalen && data_len < em->ops->datalen) + goto errout; + + if (em->ops->change) { + err = em->ops->change(tp, data, data_len, em); + if (err < 0) + goto errout; + } else if (data_len > 0) { + /* ematch module doesn't provide its own change + * procedure and expects us to allocate and copy + * the ematch data. + * + * TCF_EM_SIMPLE may be specified, stating that the + * data only consists of a u32 integer and the module + * does not expect a memory reference but rather + * the value carried. */ + if (em_hdr->flags & TCF_EM_SIMPLE) { + if (data_len < sizeof(u32)) + goto errout; + em->data = *(u32 *) data; + } else { + void *v = kmalloc(data_len, GFP_KERNEL); + if (v == NULL) { + err = -ENOBUFS; + goto errout; + } + memcpy(v, data, data_len); + em->data = (unsigned long) v; + } + } + } + + em->matchid = em_hdr->matchid; + em->flags = em_hdr->flags; + em->datalen = data_len; + + err = 0; +errout: + return err; +} + +/** + * tcf_em_tree_validate - validate ematch config TLV and build ematch tree + * + * @tp: classifier kind handle + * @rta: ematch tree configuration TLV + * @tree: destination ematch tree variable to store the resulting + * ematch tree. + * + * This function validates the given configuration TLV @rta and builds an + * ematch tree in @tree. The resulting tree must later be copied into + * the private classifier data using tcf_em_tree_change(). You MUST NOT + * provide the ematch tree variable of the private classifier data directly; + * otherwise the changes would not be locked properly. + * + * Returns a negative error code if the configuration TLV contains errors.
+ */ +int tcf_em_tree_validate(struct tcf_proto *tp, struct rtattr *rta, + struct tcf_ematch_tree *tree) +{ + int idx, list_len, matches_len, err = -EINVAL; + struct rtattr *tb[TCA_EMATCH_TREE_MAX]; + struct rtattr *rt_match, *rt_hdr, *rt_list; + struct tcf_ematch_tree_hdr *tree_hdr; + struct tcf_ematch *em; + + if (rtattr_parse_nested(tb, TCA_EMATCH_TREE_MAX, rta) < 0) + goto errout; + + rt_hdr = tb[TCA_EMATCH_TREE_HDR-1]; + rt_list = tb[TCA_EMATCH_TREE_LIST-1]; + + if (rt_hdr == NULL || rt_list == NULL) + goto errout; + + if (RTA_PAYLOAD(rt_hdr) < sizeof(*tree_hdr) || + RTA_PAYLOAD(rt_list) < sizeof(*rt_match)) + goto errout; + + tree_hdr = RTA_DATA(rt_hdr); + memcpy(&tree->hdr, tree_hdr, sizeof(*tree_hdr)); + + rt_match = RTA_DATA(rt_list); + list_len = RTA_PAYLOAD(rt_list); + matches_len = tree_hdr->nmatches * sizeof(*em); + + tree->matches = kmalloc(matches_len, GFP_KERNEL); + if (tree->matches == NULL) + goto errout; + memset(tree->matches, 0, matches_len); + + /* We do not use rtattr_parse_nested here because the maximum + * number of attributes is unknown. This saves us the allocation + * for a tb buffer which would serve no purpose at all. + * + * The array of rt attributes is parsed in the order as they are + * provided, their type must be incremental from 1 to n. Even + * if it does not serve any real purpose, a failure of sticking + * to this policy will result in parsing failure. */ + for (idx = 0; RTA_OK(rt_match, list_len); idx++) { + err = -EINVAL; + + if (rt_match->rta_type != (idx + 1)) + goto errout_abort; + + if (idx >= tree_hdr->nmatches) + goto errout_abort; + + if (RTA_PAYLOAD(rt_match) < sizeof(struct tcf_ematch_hdr)) + goto errout_abort; + + em = tcf_em_get_match(tree, idx); + + err = tcf_em_validate(tp, tree_hdr, em, rt_match, idx); + if (err < 0) + goto errout_abort; + + rt_match = RTA_NEXT(rt_match, list_len); + } + + /* Check if the number of matches provided by userspace actually + * complies with the array of matches. The number was used for + * the validation of references and a mismatch could lead to + * undefined references during the matching process. */ + if (idx != tree_hdr->nmatches) { + err = -EINVAL; + goto errout_abort; + } + + err = 0; +errout: + return err; + +errout_abort: + tcf_em_tree_destroy(tp, tree); + return err; +} + +/** + * tcf_em_tree_destroy - destroy an ematch tree + * + * @tp: classifier kind handle + * @tree: ematch tree to be deleted + * + * This functions destroys an ematch tree previously created by + * tcf_em_tree_validate()/tcf_em_tree_change(). You must ensure that + * the ematch tree is not in use before calling this function. + */ +void tcf_em_tree_destroy(struct tcf_proto *tp, struct tcf_ematch_tree *tree) +{ + int i; + + if (tree->matches == NULL) + return; + + for (i = 0; i < tree->hdr.nmatches; i++) { + struct tcf_ematch *em = tcf_em_get_match(tree, i); + + if (em->ops) { + if (em->ops->destroy) + em->ops->destroy(tp, em); + else if (!tcf_em_is_simple(em) && em->data) + kfree((void *) em->data); + module_put(em->ops->owner); + } + } + + tree->hdr.nmatches = 0; + kfree(tree->matches); +} + +/** + * tcf_em_tree_dump - dump ematch tree into a rtnl message + * + * @skb: skb holding the rtnl message + * @t: ematch tree to be dumped + * @tlv: TLV type to be used to encapsulate the tree + * + * This function dumps a ematch tree into a rtnl message. It is valid to + * call this function while the ematch tree is in use. + * + * Returns -1 if the skb tailroom is insufficient. 
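/*
 * A user-space sketch of the attribute nesting that tcf_em_tree_validate()
 * above expects: one TCA_EMATCH_TREE_HDR attribute carrying the tree header,
 * plus one TCA_EMATCH_TREE_LIST attribute whose children are the individual
 * ematches, typed consecutively from 1 to n.  The rtattr layout (4-byte
 * header holding len and type, payload padded to 4 bytes, length including
 * the header) follows the usual rtnetlink convention.  The numeric attribute
 * values and the two structs here are stand-ins for illustration; the real
 * definitions live in the pkt_cls headers.
 */
#include <stdio.h>
#include <string.h>

struct rta { unsigned short rta_len, rta_type; };	/* like struct rtattr */
#define ALIGN4(x) (((x) + 3U) & ~3U)

/* Append one attribute; returns the offset of its header for later fixup. */
static size_t put_attr(unsigned char *buf, size_t *off, unsigned short type,
		       const void *payload, unsigned short plen)
{
	size_t start = *off;
	struct rta hdr = { (unsigned short)(sizeof(hdr) + plen), type };

	memcpy(buf + *off, &hdr, sizeof(hdr));
	if (plen)
		memcpy(buf + *off + sizeof(hdr), payload, plen);
	*off += ALIGN4(sizeof(hdr) + plen);
	return start;
}

/* Patch a nested attribute's length to cover everything added since. */
static void end_nested(unsigned char *buf, size_t start, size_t off)
{
	((struct rta *)(buf + start))->rta_len = (unsigned short)(off - start);
}

int main(void)
{
	unsigned char buf[128];
	size_t off = 0;

	struct { unsigned short nmatches, progid; } tree_hdr = { 2, 0 };
	/* stand-in for struct tcf_ematch_hdr; 'kind' value is arbitrary here */
	struct { unsigned short matchid, kind, flags, pad; } em = { 0, 1, 0, 0 };

	put_attr(buf, &off, 1 /* assumed TCA_EMATCH_TREE_HDR */, &tree_hdr, sizeof(tree_hdr));

	size_t list = put_attr(buf, &off, 2 /* assumed TCA_EMATCH_TREE_LIST */, NULL, 0);
	put_attr(buf, &off, 1, &em, sizeof(em));	/* first match: type 1 */
	put_attr(buf, &off, 2, &em, sizeof(em));	/* second match: type 2 */
	end_nested(buf, list, off);			/* list length now covers both */

	printf("config blob is %zu bytes for %u matches\n", off, (unsigned)tree_hdr.nmatches);
	return 0;
}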
+ */ +int tcf_em_tree_dump(struct sk_buff *skb, struct tcf_ematch_tree *tree, int tlv) +{ + int i; + struct rtattr * top_start = (struct rtattr*) skb->tail; + struct rtattr * list_start; + + RTA_PUT(skb, tlv, 0, NULL); + RTA_PUT(skb, TCA_EMATCH_TREE_HDR, sizeof(tree->hdr), &tree->hdr); + + list_start = (struct rtattr *) skb->tail; + RTA_PUT(skb, TCA_EMATCH_TREE_LIST, 0, NULL); + + for (i = 0; i < tree->hdr.nmatches; i++) { + struct rtattr *match_start = (struct rtattr*) skb->tail; + struct tcf_ematch *em = tcf_em_get_match(tree, i); + struct tcf_ematch_hdr em_hdr = { + .kind = em->ops ? em->ops->kind : TCF_EM_CONTAINER, + .matchid = em->matchid, + .flags = em->flags + }; + + RTA_PUT(skb, i+1, sizeof(em_hdr), &em_hdr); + + if (em->ops && em->ops->dump) { + if (em->ops->dump(skb, em) < 0) + goto rtattr_failure; + } else if (tcf_em_is_container(em) || tcf_em_is_simple(em)) { + u32 u = em->data; + RTA_PUT_NOHDR(skb, sizeof(u), &u); + } else if (em->datalen > 0) + RTA_PUT_NOHDR(skb, em->datalen, (void *) em->data); + + match_start->rta_len = skb->tail - (u8*) match_start; + } + + list_start->rta_len = skb->tail - (u8 *) list_start; + top_start->rta_len = skb->tail - (u8 *) top_start; + + return 0; + +rtattr_failure: + return -1; +} + +static inline int tcf_em_match(struct sk_buff *skb, struct tcf_ematch *em, + struct tcf_pkt_info *info) +{ + int r = em->ops->match(skb, em, info); + return tcf_em_is_inverted(em) ? !r : r; +} + +/* Do not use this function directly, use tcf_em_tree_match instead */ +int __tcf_em_tree_match(struct sk_buff *skb, struct tcf_ematch_tree *tree, + struct tcf_pkt_info *info) +{ + int stackp = 0, match_idx = 0, res = 0; + struct tcf_ematch *cur_match; + int stack[CONFIG_NET_EMATCH_STACK]; + +proceed: + while (match_idx < tree->hdr.nmatches) { + cur_match = tcf_em_get_match(tree, match_idx); + + if (tcf_em_is_container(cur_match)) { + if (unlikely(stackp >= CONFIG_NET_EMATCH_STACK)) + goto stack_overflow; + + stack[stackp++] = match_idx; + match_idx = cur_match->data; + goto proceed; + } + + res = tcf_em_match(skb, cur_match, info); + + if (tcf_em_early_end(cur_match, res)) + break; + + match_idx++; + } + +pop_stack: + if (stackp > 0) { + match_idx = stack[--stackp]; + cur_match = tcf_em_get_match(tree, match_idx); + + if (tcf_em_early_end(cur_match, res)) + goto pop_stack; + else { + match_idx++; + goto proceed; + } + } + + return res; + +stack_overflow: + if (net_ratelimit()) + printk("Local stack overflow, increase NET_EMATCH_STACK\n"); + return -1; +} + +EXPORT_SYMBOL(tcf_em_register); +EXPORT_SYMBOL(tcf_em_unregister); +EXPORT_SYMBOL(tcf_em_tree_validate); +EXPORT_SYMBOL(tcf_em_tree_destroy); +EXPORT_SYMBOL(tcf_em_tree_dump); +EXPORT_SYMBOL(__tcf_em_tree_match); diff --git a/net/sched/estimator.c b/net/sched/estimator.c new file mode 100644 index 000000000000..5d3ae03e22a7 --- /dev/null +++ b/net/sched/estimator.c @@ -0,0 +1,197 @@ +/* + * net/sched/estimator.c Simple rate estimator. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
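/*
 * Standalone sketch (my illustration, not part of the patch) of the
 * PUSH/POP walk that __tcf_em_tree_match() above performs, using the
 * "A AND (B1 OR B2) AND C AND D" layout from the ematch.c header comment.
 * Leaf results come from a fixed table instead of real matchers; the
 * container jump, the early-end rules and the stack mirror the kernel logic.
 */
#include <stdio.h>

enum rel { REL_END, REL_AND, REL_OR };

struct em {
	int is_container;	/* if set, 'data' is the index to jump to */
	int data;		/* jump target, or the leaf's match result */
	enum rel rel;		/* relation to the next match in the sequence */
};

static int early_end(const struct em *m, int res)
{
	/* Mirrors tcf_em_early_end(): END always terminates the sequence,
	 * AND can stop early on a failed match, OR on a successful one. */
	return m->rel == REL_END ||
	       (m->rel == REL_AND && !res) ||
	       (m->rel == REL_OR && res);
}

static int tree_match(const struct em *t, int n)
{
	int stack[8], sp = 0, idx = 0, res = 0;

proceed:
	while (idx < n) {
		if (t[idx].is_container) {
			if (sp >= 8)		/* kernel: CONFIG_NET_EMATCH_STACK */
				return -1;
			stack[sp++] = idx;	/* push position, jump forward */
			idx = t[idx].data;
			goto proceed;
		}
		res = t[idx].data;		/* stand-in for the real matcher */
		if (early_end(&t[idx], res))
			break;
		idx++;
	}
pop_stack:
	if (sp > 0) {
		idx = stack[--sp];		/* return to the outer sequence */
		if (early_end(&t[idx], res))
			goto pop_stack;
		idx++;
		goto proceed;
	}
	return res;
}

int main(void)
{
	/* A=1, C=1, D=1, B1=0, B2=1  =>  A AND (B1 OR B2) AND C AND D = 1 */
	const struct em tree[] = {
		{ 0, 1, REL_AND },	/* 0: A */
		{ 1, 4, REL_AND },	/* 1: container, jumps to B1 at index 4 */
		{ 0, 1, REL_AND },	/* 2: C */
		{ 0, 1, REL_END },	/* 3: D, end of the outer sequence */
		{ 0, 0, REL_OR  },	/* 4: B1 */
		{ 0, 1, REL_END },	/* 5: B2, end of the inner sequence */
	};
	printf("tree matches: %d\n", tree_match(tree, 6));
	return 0;
}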
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + This code is NOT intended to be used for statistics collection, + its purpose is to provide a base for statistical multiplexing + for controlled load service. + If you need only statistics, run a user level daemon which + periodically reads byte counters. + + Unfortunately, rate estimation is not a very easy task. + F.e. I did not find a simple way to estimate the current peak rate + and even failed to formulate the problem 8)8) + + So I preferred not to built an estimator into the scheduler, + but run this task separately. + Ideally, it should be kernel thread(s), but for now it runs + from timers, which puts apparent top bounds on the number of rated + flows, has minimal overhead on small, but is enough + to handle controlled load service, sets of aggregates. + + We measure rate over A=(1<next) { + struct tc_stats *st = e->stats; + u64 nbytes; + u32 npackets; + u32 rate; + + spin_lock(e->stats_lock); + nbytes = st->bytes; + npackets = st->packets; + rate = (nbytes - e->last_bytes)<<(7 - idx); + e->last_bytes = nbytes; + e->avbps += ((long)rate - (long)e->avbps) >> e->ewma_log; + st->bps = (e->avbps+0xF)>>5; + + rate = (npackets - e->last_packets)<<(12 - idx); + e->last_packets = npackets; + e->avpps += ((long)rate - (long)e->avpps) >> e->ewma_log; + e->stats->pps = (e->avpps+0x1FF)>>10; + spin_unlock(e->stats_lock); + } + + mod_timer(&elist[idx].timer, jiffies + ((HZ<interval < -2 || parm->interval > 3) + return -EINVAL; + + est = kmalloc(sizeof(*est), GFP_KERNEL); + if (est == NULL) + return -ENOBUFS; + + memset(est, 0, sizeof(*est)); + est->interval = parm->interval + 2; + est->stats = stats; + est->stats_lock = stats_lock; + est->ewma_log = parm->ewma_log; + est->last_bytes = stats->bytes; + est->avbps = stats->bps<<5; + est->last_packets = stats->packets; + est->avpps = stats->pps<<10; + + est->next = elist[est->interval].list; + if (est->next == NULL) { + init_timer(&elist[est->interval].timer); + elist[est->interval].timer.data = est->interval; + elist[est->interval].timer.expires = jiffies + ((HZ<interval)/4); + elist[est->interval].timer.function = est_timer; + add_timer(&elist[est->interval].timer); + } + write_lock_bh(&est_lock); + elist[est->interval].list = est; + write_unlock_bh(&est_lock); + return 0; +} + +void qdisc_kill_estimator(struct tc_stats *stats) +{ + int idx; + struct qdisc_estimator *est, **pest; + + for (idx=0; idx <= EST_MAX_INTERVAL; idx++) { + int killed = 0; + pest = &elist[idx].list; + while ((est=*pest) != NULL) { + if (est->stats != stats) { + pest = &est->next; + continue; + } + + write_lock_bh(&est_lock); + *pest = est->next; + write_unlock_bh(&est_lock); + + kfree(est); + killed++; + } + if (killed && elist[idx].list == NULL) + del_timer(&elist[idx].timer); + } +} + +EXPORT_SYMBOL(qdisc_kill_estimator); +EXPORT_SYMBOL(qdisc_new_estimator); diff --git a/net/sched/gact.c b/net/sched/gact.c new file mode 100644 index 000000000000..a811c89fef7f --- /dev/null +++ b/net/sched/gact.c @@ -0,0 +1,231 @@ +/* + * net/sched/gact.c Generic actions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
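/*
 * User-space sketch of the fixed-point EWMA update performed by est_timer()
 * above.  The shift constants come straight from the code: the byte rate is
 * scaled by 2^5 (st->bps = (avbps + 0xF) >> 5) and smoothed with weight
 * W = 2^-ewma_log.  The sampling period of 2^idx quarter-seconds is my
 * reading of the (partly garbled) timer arithmetic, so treat that part as
 * an assumption; with idx = 2 it works out to one sample per second.
 */
#include <stdio.h>

/* One smoothing step: av += (rate - av) >> ewma_log, i.e. av <- av*(1-W) + rate*W */
static long ewma(long av, long rate, unsigned int ewma_log)
{
	return av + ((rate - av) >> ewma_log);
}

int main(void)
{
	const int idx = 2;			/* assumed: one sample per second */
	const unsigned int ewma_log = 3;	/* W = 1/8 */
	unsigned long long bytes = 0, last_bytes = 0;
	long avbps = 0;

	for (int t = 1; t <= 12; t++) {
		bytes += 125000;		/* a steady 125000 bytes per sample */
		long rate = (long)((bytes - last_bytes) << (7 - idx));
		last_bytes = bytes;
		avbps = ewma(avbps, rate, ewma_log);
		/* same descaling as st->bps in est_timer(); converges to 125000 */
		printf("t=%2d  smoothed rate = %ld\n", t, (avbps + 0xF) >> 5);
	}
	return 0;
}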
+ * + * copyright Jamal Hadi Salim (2002-4) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 + +static u32 idx_gen; +static struct tcf_gact *tcf_gact_ht[MY_TAB_SIZE]; +static DEFINE_RWLOCK(gact_lock); + +/* ovewrride the defaults */ +#define tcf_st tcf_gact +#define tc_st tc_gact +#define tcf_t_lock gact_lock +#define tcf_ht tcf_gact_ht + +#define CONFIG_NET_ACT_INIT 1 +#include + +#ifdef CONFIG_GACT_PROB +static int gact_net_rand(struct tcf_gact *p) +{ + if (net_random()%p->pval) + return p->action; + return p->paction; +} + +static int gact_determ(struct tcf_gact *p) +{ + if (p->bstats.packets%p->pval) + return p->action; + return p->paction; +} + +typedef int (*g_rand)(struct tcf_gact *p); +static g_rand gact_rand[MAX_RAND]= { NULL, gact_net_rand, gact_determ }; +#endif + +static int tcf_gact_init(struct rtattr *rta, struct rtattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct rtattr *tb[TCA_GACT_MAX]; + struct tc_gact *parm; + struct tcf_gact *p; + int ret = 0; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_GACT_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_GACT_PARMS - 1] == NULL || + RTA_PAYLOAD(tb[TCA_GACT_PARMS - 1]) < sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_GACT_PARMS - 1]); + + if (tb[TCA_GACT_PROB-1] != NULL) +#ifdef CONFIG_GACT_PROB + if (RTA_PAYLOAD(tb[TCA_GACT_PROB-1]) < sizeof(struct tc_gact_p)) + return -EINVAL; +#else + return -EOPNOTSUPP; +#endif + + p = tcf_hash_check(parm->index, a, ovr, bind); + if (p == NULL) { + p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_hash_release(p, bind); + return -EEXIST; + } + } + + spin_lock_bh(&p->lock); + p->action = parm->action; +#ifdef CONFIG_GACT_PROB + if (tb[TCA_GACT_PROB-1] != NULL) { + struct tc_gact_p *p_parm = RTA_DATA(tb[TCA_GACT_PROB-1]); + p->paction = p_parm->paction; + p->pval = p_parm->pval; + p->ptype = p_parm->ptype; + } +#endif + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + return ret; +} + +static int +tcf_gact_cleanup(struct tc_action *a, int bind) +{ + struct tcf_gact *p = PRIV(a, gact); + + if (p != NULL) + return tcf_hash_release(p, bind); + return 0; +} + +static int +tcf_gact(struct sk_buff **pskb, struct tc_action *a) +{ + struct tcf_gact *p = PRIV(a, gact); + struct sk_buff *skb = *pskb; + int action = TC_ACT_SHOT; + + spin_lock(&p->lock); +#ifdef CONFIG_GACT_PROB + if (p->ptype && gact_rand[p->ptype] != NULL) + action = gact_rand[p->ptype](p); + else + action = p->action; +#else + action = p->action; +#endif + p->bstats.bytes += skb->len; + p->bstats.packets++; + if (action == TC_ACT_SHOT) + p->qstats.drops++; + p->tm.lastuse = jiffies; + spin_unlock(&p->lock); + + return action; +} + +static int +tcf_gact_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_gact opt; + struct tcf_gact *p = PRIV(a, gact); + struct tcf_t t; + + opt.index = p->index; + opt.refcnt = p->refcnt - ref; + opt.bindcnt = p->bindcnt - bind; + opt.action = p->action; + RTA_PUT(skb, TCA_GACT_PARMS, sizeof(opt), &opt); +#ifdef CONFIG_GACT_PROB + if (p->ptype) { + struct tc_gact_p p_opt; + p_opt.paction = p->paction; + p_opt.pval 
= p->pval; + p_opt.ptype = p->ptype; + RTA_PUT(skb, TCA_GACT_PROB, sizeof(p_opt), &p_opt); + } +#endif + t.install = jiffies_to_clock_t(jiffies - p->tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + t.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_GACT_TM, sizeof(t), &t); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tc_action_ops act_gact_ops = { + .kind = "gact", + .type = TCA_ACT_GACT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_gact, + .dump = tcf_gact_dump, + .cleanup = tcf_gact_cleanup, + .lookup = tcf_hash_search, + .init = tcf_gact_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Generic Classifier actions"); +MODULE_LICENSE("GPL"); + +static int __init +gact_init_module(void) +{ +#ifdef CONFIG_GACT_PROB + printk("GACT probability on\n"); +#else + printk("GACT probability NOT on\n"); +#endif + return tcf_register_action(&act_gact_ops); +} + +static void __exit +gact_cleanup_module(void) +{ + tcf_unregister_action(&act_gact_ops); +} + +module_init(gact_init_module); +module_exit(gact_cleanup_module); diff --git a/net/sched/ipt.c b/net/sched/ipt.c new file mode 100644 index 000000000000..b114d994d523 --- /dev/null +++ b/net/sched/ipt.c @@ -0,0 +1,326 @@ +/* + * net/sched/ipt.c iptables target interface + * + *TODO: Add other tables. For now we only support the ipv4 table targets + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Copyright: Jamal Hadi Salim (2002-4) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 + +static u32 idx_gen; +static struct tcf_ipt *tcf_ipt_ht[MY_TAB_SIZE]; +/* ipt hash table lock */ +static DEFINE_RWLOCK(ipt_lock); + +/* ovewrride the defaults */ +#define tcf_st tcf_ipt +#define tcf_t_lock ipt_lock +#define tcf_ht tcf_ipt_ht + +#define CONFIG_NET_ACT_INIT +#include + +static int +ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) +{ + struct ipt_target *target; + int ret = 0; + + target = ipt_find_target(t->u.user.name, t->u.user.revision); + if (!target) + return -ENOENT; + + DPRINTK("ipt_init_target: found %s\n", target->name); + t->u.kernel.target = target; + + if (t->u.kernel.target->checkentry + && !t->u.kernel.target->checkentry(table, NULL, t->data, + t->u.target_size - sizeof(*t), + hook)) { + DPRINTK("ipt_init_target: check failed for `%s'.\n", + t->u.kernel.target->name); + module_put(t->u.kernel.target->me); + ret = -EINVAL; + } + + return ret; +} + +static void +ipt_destroy_target(struct ipt_entry_target *t) +{ + if (t->u.kernel.target->destroy) + t->u.kernel.target->destroy(t->data, + t->u.target_size - sizeof(*t)); + module_put(t->u.kernel.target->me); +} + +static int +tcf_ipt_release(struct tcf_ipt *p, int bind) +{ + int ret = 0; + if (p) { + if (bind) + p->bindcnt--; + p->refcnt--; + if (p->bindcnt <= 0 && p->refcnt <= 0) { + ipt_destroy_target(p->t); + kfree(p->tname); + kfree(p->t); + tcf_hash_destroy(p); + ret = ACT_P_DELETED; + } + } 
+ return ret; +} + +static int +tcf_ipt_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct rtattr *tb[TCA_IPT_MAX]; + struct tcf_ipt *p; + struct ipt_entry_target *td, *t; + char *tname; + int ret = 0, err; + u32 hook = 0; + u32 index = 0; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_IPT_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_IPT_HOOK-1] == NULL || + RTA_PAYLOAD(tb[TCA_IPT_HOOK-1]) < sizeof(u32)) + return -EINVAL; + if (tb[TCA_IPT_TARG-1] == NULL || + RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < sizeof(*t)) + return -EINVAL; + td = (struct ipt_entry_target *)RTA_DATA(tb[TCA_IPT_TARG-1]); + if (RTA_PAYLOAD(tb[TCA_IPT_TARG-1]) < td->u.target_size) + return -EINVAL; + + if (tb[TCA_IPT_INDEX-1] != NULL && + RTA_PAYLOAD(tb[TCA_IPT_INDEX-1]) >= sizeof(u32)) + index = *(u32 *)RTA_DATA(tb[TCA_IPT_INDEX-1]); + + p = tcf_hash_check(index, a, ovr, bind); + if (p == NULL) { + p = tcf_hash_create(index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_ipt_release(p, bind); + return -EEXIST; + } + } + + hook = *(u32 *)RTA_DATA(tb[TCA_IPT_HOOK-1]); + + err = -ENOMEM; + tname = kmalloc(IFNAMSIZ, GFP_KERNEL); + if (tname == NULL) + goto err1; + if (tb[TCA_IPT_TABLE - 1] == NULL || + rtattr_strlcpy(tname, tb[TCA_IPT_TABLE-1], IFNAMSIZ) >= IFNAMSIZ) + strcpy(tname, "mangle"); + + t = kmalloc(td->u.target_size, GFP_KERNEL); + if (t == NULL) + goto err2; + memcpy(t, td, td->u.target_size); + + if ((err = ipt_init_target(t, tname, hook)) < 0) + goto err3; + + spin_lock_bh(&p->lock); + if (ret != ACT_P_CREATED) { + ipt_destroy_target(p->t); + kfree(p->tname); + kfree(p->t); + } + p->tname = tname; + p->t = t; + p->hook = hook; + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + return ret; + +err3: + kfree(t); +err2: + kfree(tname); +err1: + kfree(p); + return err; +} + +static int +tcf_ipt_cleanup(struct tc_action *a, int bind) +{ + struct tcf_ipt *p = PRIV(a, ipt); + return tcf_ipt_release(p, bind); +} + +static int +tcf_ipt(struct sk_buff **pskb, struct tc_action *a) +{ + int ret = 0, result = 0; + struct tcf_ipt *p = PRIV(a, ipt); + struct sk_buff *skb = *pskb; + + if (skb_cloned(skb)) { + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + return TC_ACT_UNSPEC; + } + + spin_lock(&p->lock); + + p->tm.lastuse = jiffies; + p->bstats.bytes += skb->len; + p->bstats.packets++; + + /* yes, we have to worry about both in and out dev + worry later - danger - this API seems to have changed + from earlier kernels */ + + ret = p->t->u.kernel.target->target(&skb, skb->dev, NULL, + p->hook, p->t->data, NULL); + switch (ret) { + case NF_ACCEPT: + result = TC_ACT_OK; + break; + case NF_DROP: + result = TC_ACT_SHOT; + p->qstats.drops++; + break; + case IPT_CONTINUE: + result = TC_ACT_PIPE; + break; + default: + if (net_ratelimit()) + printk("Bogus netfilter code %d assume ACCEPT\n", ret); + result = TC_POLICE_OK; + break; + } + spin_unlock(&p->lock); + return result; + +} + +static int +tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + struct ipt_entry_target *t; + struct tcf_t tm; + struct tc_cnt c; + unsigned char *b = skb->tail; + struct tcf_ipt *p = PRIV(a, ipt); + + /* for simple targets kernel size == user size + ** user name = target name + ** for foolproof you need to not assume this + */ + + t = kmalloc(p->t->u.user.target_size, GFP_ATOMIC); + if (t == NULL) + goto rtattr_failure; + + c.bindcnt = p->bindcnt - bind; + c.refcnt = p->refcnt - 
ref; + memcpy(t, p->t, p->t->u.user.target_size); + strcpy(t->u.user.name, p->t->u.kernel.target->name); + + DPRINTK("\ttcf_ipt_dump tablename %s length %d\n", p->tname, + strlen(p->tname)); + DPRINTK("\tdump target name %s size %d size user %d " + "data[0] %x data[1] %x\n", p->t->u.kernel.target->name, + p->t->u.target_size, p->t->u.user.target_size, + p->t->data[0], p->t->data[1]); + RTA_PUT(skb, TCA_IPT_TARG, p->t->u.user.target_size, t); + RTA_PUT(skb, TCA_IPT_INDEX, 4, &p->index); + RTA_PUT(skb, TCA_IPT_HOOK, 4, &p->hook); + RTA_PUT(skb, TCA_IPT_CNT, sizeof(struct tc_cnt), &c); + RTA_PUT(skb, TCA_IPT_TABLE, IFNAMSIZ, p->tname); + tm.install = jiffies_to_clock_t(jiffies - p->tm.install); + tm.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + tm.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_IPT_TM, sizeof (tm), &tm); + kfree(t); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + kfree(t); + return -1; +} + +static struct tc_action_ops act_ipt_ops = { + .kind = "ipt", + .type = TCA_ACT_IPT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_ipt, + .dump = tcf_ipt_dump, + .cleanup = tcf_ipt_cleanup, + .lookup = tcf_hash_search, + .init = tcf_ipt_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Iptables target actions"); +MODULE_LICENSE("GPL"); + +static int __init +ipt_init_module(void) +{ + return tcf_register_action(&act_ipt_ops); +} + +static void __exit +ipt_cleanup_module(void) +{ + tcf_unregister_action(&act_ipt_ops); +} + +module_init(ipt_init_module); +module_exit(ipt_cleanup_module); diff --git a/net/sched/mirred.c b/net/sched/mirred.c new file mode 100644 index 000000000000..f309ce336803 --- /dev/null +++ b/net/sched/mirred.c @@ -0,0 +1,276 @@ +/* + * net/sched/mirred.c packet mirroring and redirect actions + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Jamal Hadi Salim (2002-4) + * + * TODO: Add ingress support (and socket redirect support) + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + + +/* use generic hash table */ +#define MY_TAB_SIZE 8 +#define MY_TAB_MASK (MY_TAB_SIZE - 1) +static u32 idx_gen; +static struct tcf_mirred *tcf_mirred_ht[MY_TAB_SIZE]; +static DEFINE_RWLOCK(mirred_lock); + +/* ovewrride the defaults */ +#define tcf_st tcf_mirred +#define tc_st tc_mirred +#define tcf_t_lock mirred_lock +#define tcf_ht tcf_mirred_ht + +#define CONFIG_NET_ACT_INIT 1 +#include + +static inline int +tcf_mirred_release(struct tcf_mirred *p, int bind) +{ + if (p) { + if (bind) + p->bindcnt--; + p->refcnt--; + if(!p->bindcnt && p->refcnt <= 0) { + dev_put(p->dev); + tcf_hash_destroy(p); + return 1; + } + } + return 0; +} + +static int +tcf_mirred_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct rtattr *tb[TCA_MIRRED_MAX]; + struct tc_mirred *parm; + struct tcf_mirred *p; + struct net_device *dev = NULL; + int ret = 0; + int ok_push = 0; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_MIRRED_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_MIRRED_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_MIRRED_PARMS-1]) < sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_MIRRED_PARMS-1]); + + if (parm->ifindex) { + dev = __dev_get_by_index(parm->ifindex); + if (dev == NULL) + return -ENODEV; + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + ok_push = 0; + break; + default: + ok_push = 1; + break; + } + } + + p = tcf_hash_check(parm->index, a, ovr, bind); + if (p == NULL) { + if (!parm->ifindex) + return -EINVAL; + p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_mirred_release(p, bind); + return -EEXIST; + } + } + + spin_lock_bh(&p->lock); + p->action = parm->action; + p->eaction = parm->eaction; + if (parm->ifindex) { + p->ifindex = parm->ifindex; + if (ret != ACT_P_CREATED) + dev_put(p->dev); + p->dev = dev; + dev_hold(dev); + p->ok_push = ok_push; + } + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + + DPRINTK("tcf_mirred_init index %d action %d eaction %d device %s " + "ifindex %d\n", parm->index, parm->action, parm->eaction, + dev->name, parm->ifindex); + return ret; +} + +static int +tcf_mirred_cleanup(struct tc_action *a, int bind) +{ + struct tcf_mirred *p = PRIV(a, mirred); + + if (p != NULL) + return tcf_mirred_release(p, bind); + return 0; +} + +static int +tcf_mirred(struct sk_buff **pskb, struct tc_action *a) +{ + struct tcf_mirred *p = PRIV(a, mirred); + struct net_device *dev; + struct sk_buff *skb2 = NULL; + struct sk_buff *skb = *pskb; + u32 at = G_TC_AT(skb->tc_verd); + + spin_lock(&p->lock); + + dev = p->dev; + p->tm.lastuse = jiffies; + + if (!(dev->flags&IFF_UP) ) { + if (net_ratelimit()) + printk("mirred to Houston: device %s is gone!\n", + dev->name); +bad_mirred: + if (skb2 != NULL) + kfree_skb(skb2); + p->qstats.overlimits++; + p->bstats.bytes += skb->len; + p->bstats.packets++; + spin_unlock(&p->lock); + /* should we be asking for packet to be dropped? 
+ * may make sense for redirect case only + */ + return TC_ACT_SHOT; + } + + skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2 == NULL) + goto bad_mirred; + if (p->eaction != TCA_EGRESS_MIRROR && p->eaction != TCA_EGRESS_REDIR) { + if (net_ratelimit()) + printk("tcf_mirred unknown action %d\n", p->eaction); + goto bad_mirred; + } + + p->bstats.bytes += skb2->len; + p->bstats.packets++; + if (!(at & AT_EGRESS)) + if (p->ok_push) + skb_push(skb2, skb2->dev->hard_header_len); + + /* mirror is always swallowed */ + if (p->eaction != TCA_EGRESS_MIRROR) + skb2->tc_verd = SET_TC_FROM(skb2->tc_verd, at); + + skb2->dev = dev; + skb2->input_dev = skb->dev; + dev_queue_xmit(skb2); + spin_unlock(&p->lock); + return p->action; +} + +static int +tcf_mirred_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_mirred opt; + struct tcf_mirred *p = PRIV(a, mirred); + struct tcf_t t; + + opt.index = p->index; + opt.action = p->action; + opt.refcnt = p->refcnt - ref; + opt.bindcnt = p->bindcnt - bind; + opt.eaction = p->eaction; + opt.ifindex = p->ifindex; + DPRINTK("tcf_mirred_dump index %d action %d eaction %d ifindex %d\n", + p->index, p->action, p->eaction, p->ifindex); + RTA_PUT(skb, TCA_MIRRED_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + t.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_MIRRED_TM, sizeof(t), &t); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct tc_action_ops act_mirred_ops = { + .kind = "mirred", + .type = TCA_ACT_MIRRED, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_mirred, + .dump = tcf_mirred_dump, + .cleanup = tcf_mirred_cleanup, + .lookup = tcf_hash_search, + .init = tcf_mirred_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002)"); +MODULE_DESCRIPTION("Device Mirror/redirect actions"); +MODULE_LICENSE("GPL"); + +static int __init +mirred_init_module(void) +{ + printk("Mirror/redirect action on\n"); + return tcf_register_action(&act_mirred_ops); +} + +static void __exit +mirred_cleanup_module(void) +{ + tcf_unregister_action(&act_mirred_ops); +} + +module_init(mirred_init_module); +module_exit(mirred_cleanup_module); diff --git a/net/sched/pedit.c b/net/sched/pedit.c new file mode 100644 index 000000000000..678be6a645fb --- /dev/null +++ b/net/sched/pedit.c @@ -0,0 +1,288 @@ +/* + * net/sched/pedit.c Generic packet editor + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Jamal Hadi Salim (2002-4) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define PEDIT_DEB 1 + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 +static u32 idx_gen; +static struct tcf_pedit *tcf_pedit_ht[MY_TAB_SIZE]; +static DEFINE_RWLOCK(pedit_lock); + +#define tcf_st tcf_pedit +#define tc_st tc_pedit +#define tcf_t_lock pedit_lock +#define tcf_ht tcf_pedit_ht + +#define CONFIG_NET_ACT_INIT 1 +#include + +static int +tcf_pedit_init(struct rtattr *rta, struct rtattr *est, struct tc_action *a, + int ovr, int bind) +{ + struct rtattr *tb[TCA_PEDIT_MAX]; + struct tc_pedit *parm; + int ret = 0; + struct tcf_pedit *p; + struct tc_pedit_key *keys = NULL; + int ksize; + + if (rta == NULL || rtattr_parse_nested(tb, TCA_PEDIT_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_PEDIT_PARMS - 1] == NULL || + RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_PEDIT_PARMS-1]); + ksize = parm->nkeys * sizeof(struct tc_pedit_key); + if (RTA_PAYLOAD(tb[TCA_PEDIT_PARMS-1]) < sizeof(*parm) + ksize) + return -EINVAL; + + p = tcf_hash_check(parm->index, a, ovr, bind); + if (p == NULL) { + if (!parm->nkeys) + return -EINVAL; + p = tcf_hash_create(parm->index, est, a, sizeof(*p), ovr, bind); + if (p == NULL) + return -ENOMEM; + keys = kmalloc(ksize, GFP_KERNEL); + if (keys == NULL) { + kfree(p); + return -ENOMEM; + } + ret = ACT_P_CREATED; + } else { + if (!ovr) { + tcf_hash_release(p, bind); + return -EEXIST; + } + if (p->nkeys && p->nkeys != parm->nkeys) { + keys = kmalloc(ksize, GFP_KERNEL); + if (keys == NULL) + return -ENOMEM; + } + } + + spin_lock_bh(&p->lock); + p->flags = parm->flags; + p->action = parm->action; + if (keys) { + kfree(p->keys); + p->keys = keys; + p->nkeys = parm->nkeys; + } + memcpy(p->keys, parm->keys, ksize); + spin_unlock_bh(&p->lock); + if (ret == ACT_P_CREATED) + tcf_hash_insert(p); + return ret; +} + +static int +tcf_pedit_cleanup(struct tc_action *a, int bind) +{ + struct tcf_pedit *p = PRIV(a, pedit); + + if (p != NULL) { + struct tc_pedit_key *keys = p->keys; + if (tcf_hash_release(p, bind)) { + kfree(keys); + return 1; + } + } + return 0; +} + +static int +tcf_pedit(struct sk_buff **pskb, struct tc_action *a) +{ + struct tcf_pedit *p = PRIV(a, pedit); + struct sk_buff *skb = *pskb; + int i, munged = 0; + u8 *pptr; + + if (!(skb->tc_verd & TC_OK2MUNGE)) { + /* should we set skb->cloned? 
*/ + if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) { + return p->action; + } + } + + pptr = skb->nh.raw; + + spin_lock(&p->lock); + + p->tm.lastuse = jiffies; + + if (p->nkeys > 0) { + struct tc_pedit_key *tkey = p->keys; + + for (i = p->nkeys; i > 0; i--, tkey++) { + u32 *ptr; + int offset = tkey->off; + + if (tkey->offmask) { + if (skb->len > tkey->at) { + char *j = pptr + tkey->at; + offset += ((*j & tkey->offmask) >> + tkey->shift); + } else { + goto bad; + } + } + + if (offset % 4) { + printk("offset must be on 32 bit boundaries\n"); + goto bad; + } + if (skb->len < 0 || (offset > 0 && offset > skb->len)) { + printk("offset %d cant exceed pkt length %d\n", + offset, skb->len); + goto bad; + } + + ptr = (u32 *)(pptr+offset); + /* just do it, baby */ + *ptr = ((*ptr & tkey->mask) ^ tkey->val); + munged++; + } + + if (munged) + skb->tc_verd = SET_TC_MUNGED(skb->tc_verd); + goto done; + } else { + printk("pedit BUG: index %d\n",p->index); + } + +bad: + p->qstats.overlimits++; +done: + p->bstats.bytes += skb->len; + p->bstats.packets++; + spin_unlock(&p->lock); + return p->action; +} + +static int +tcf_pedit_dump(struct sk_buff *skb, struct tc_action *a,int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_pedit *opt; + struct tcf_pedit *p = PRIV(a, pedit); + struct tcf_t t; + int s; + + s = sizeof(*opt) + p->nkeys * sizeof(struct tc_pedit_key); + + /* netlink spinlocks held above us - must use ATOMIC */ + opt = kmalloc(s, GFP_ATOMIC); + if (opt == NULL) + return -ENOBUFS; + memset(opt, 0, s); + + memcpy(opt->keys, p->keys, p->nkeys * sizeof(struct tc_pedit_key)); + opt->index = p->index; + opt->nkeys = p->nkeys; + opt->flags = p->flags; + opt->action = p->action; + opt->refcnt = p->refcnt - ref; + opt->bindcnt = p->bindcnt - bind; + + +#ifdef PEDIT_DEB + { + /* Debug - get rid of later */ + int i; + struct tc_pedit_key *key = opt->keys; + + for (i=0; inkeys; i++, key++) { + printk( "\n key #%d",i); + printk( " at %d: val %08x mask %08x", + (unsigned int)key->off, + (unsigned int)key->val, + (unsigned int)key->mask); + } + } +#endif + + RTA_PUT(skb, TCA_PEDIT_PARMS, s, opt); + t.install = jiffies_to_clock_t(jiffies - p->tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tm.lastuse); + t.expires = jiffies_to_clock_t(p->tm.expires); + RTA_PUT(skb, TCA_PEDIT_TM, sizeof(t), &t); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static +struct tc_action_ops act_pedit_ops = { + .kind = "pedit", + .type = TCA_ACT_PEDIT, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_pedit, + .dump = tcf_pedit_dump, + .cleanup = tcf_pedit_cleanup, + .lookup = tcf_hash_search, + .init = tcf_pedit_init, + .walk = tcf_generic_walker +}; + +MODULE_AUTHOR("Jamal Hadi Salim(2002-4)"); +MODULE_DESCRIPTION("Generic Packet Editor actions"); +MODULE_LICENSE("GPL"); + +static int __init +pedit_init_module(void) +{ + return tcf_register_action(&act_pedit_ops); +} + +static void __exit +pedit_cleanup_module(void) +{ + tcf_unregister_action(&act_pedit_ops); +} + +module_init(pedit_init_module); +module_exit(pedit_cleanup_module); + diff --git a/net/sched/police.c b/net/sched/police.c new file mode 100644 index 000000000000..c03545faf523 --- /dev/null +++ b/net/sched/police.c @@ -0,0 +1,612 @@ +/* + * net/sched/police.c Input police filter. 
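/*
 * Worked example (illustration only) of the edit rule applied per key by
 * tcf_pedit() above: the 32-bit word at a 4-byte aligned offset is rewritten
 * as  new = (old & mask) ^ val.  Clearing bits in 'mask' and supplying them
 * in 'val' overwrites a field; keeping bits in 'mask' and setting them in
 * 'val' flips them.  Real packet words are in network byte order; the
 * constants here are plain host-order numbers just to show the arithmetic.
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t pedit_word(uint32_t old, uint32_t mask, uint32_t val)
{
	return (old & mask) ^ val;
}

int main(void)
{
	uint32_t word = 0x11223344;

	/* Replace the top byte with 0xaa: mask it out, then XOR the new value in. */
	printf("%08x\n", pedit_word(word, 0x00ffffff, 0xaa000000));	/* aa223344 */

	/* Flip only the lowest bit, leaving everything else untouched. */
	printf("%08x\n", pedit_word(word, 0xffffffff, 0x00000001));	/* 11223345 */
	return 0;
}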
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * J Hadi Salim (action changes) + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define L2T(p,L) ((p)->R_tab->data[(L)>>(p)->R_tab->rate.cell_log]) +#define L2T_P(p,L) ((p)->P_tab->data[(L)>>(p)->P_tab->rate.cell_log]) +#define PRIV(a) ((struct tcf_police *) (a)->priv) + +/* use generic hash table */ +#define MY_TAB_SIZE 16 +#define MY_TAB_MASK 15 +static u32 idx_gen; +static struct tcf_police *tcf_police_ht[MY_TAB_SIZE]; +/* Policer hash table lock */ +static DEFINE_RWLOCK(police_lock); + +/* Each policer is serialized by its individual spinlock */ + +static __inline__ unsigned tcf_police_hash(u32 index) +{ + return index&0xF; +} + +static __inline__ struct tcf_police * tcf_police_lookup(u32 index) +{ + struct tcf_police *p; + + read_lock(&police_lock); + for (p = tcf_police_ht[tcf_police_hash(index)]; p; p = p->next) { + if (p->index == index) + break; + } + read_unlock(&police_lock); + return p; +} + +#ifdef CONFIG_NET_CLS_ACT +static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb, + int type, struct tc_action *a) +{ + struct tcf_police *p; + int err = 0, index = -1, i = 0, s_i = 0, n_i = 0; + struct rtattr *r; + + read_lock(&police_lock); + + s_i = cb->args[0]; + + for (i = 0; i < MY_TAB_SIZE; i++) { + p = tcf_police_ht[tcf_police_hash(i)]; + + for (; p; p = p->next) { + index++; + if (index < s_i) + continue; + a->priv = p; + a->order = index; + r = (struct rtattr*) skb->tail; + RTA_PUT(skb, a->order, 0, NULL); + if (type == RTM_DELACTION) + err = tcf_action_dump_1(skb, a, 0, 1); + else + err = tcf_action_dump_1(skb, a, 0, 0); + if (err < 0) { + index--; + skb_trim(skb, (u8*)r - skb->data); + goto done; + } + r->rta_len = skb->tail - (u8*)r; + n_i++; + } + } +done: + read_unlock(&police_lock); + if (n_i) + cb->args[0] += n_i; + return n_i; + +rtattr_failure: + skb_trim(skb, (u8*)r - skb->data); + goto done; +} + +static inline int +tcf_hash_search(struct tc_action *a, u32 index) +{ + struct tcf_police *p = tcf_police_lookup(index); + + if (p != NULL) { + a->priv = p; + return 1; + } else { + return 0; + } +} +#endif + +static inline u32 tcf_police_new_index(void) +{ + do { + if (++idx_gen == 0) + idx_gen = 1; + } while (tcf_police_lookup(idx_gen)); + + return idx_gen; +} + +void tcf_police_destroy(struct tcf_police *p) +{ + unsigned h = tcf_police_hash(p->index); + struct tcf_police **p1p; + + for (p1p = &tcf_police_ht[h]; *p1p; p1p = &(*p1p)->next) { + if (*p1p == p) { + write_lock_bh(&police_lock); + *p1p = p->next; + write_unlock_bh(&police_lock); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&p->bstats, &p->rate_est); +#endif + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + if (p->P_tab) + qdisc_put_rtab(p->P_tab); + kfree(p); + return; + } + } + BUG_TRAP(0); +} + +#ifdef CONFIG_NET_CLS_ACT +static int tcf_act_police_locate(struct rtattr *rta, struct rtattr *est, + struct tc_action *a, int ovr, int bind) +{ + unsigned h; + int ret = 0, err; + struct rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + struct tcf_police *p; + struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; + + 
if (rta == NULL || rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) + return -EINVAL; + + if (tb[TCA_POLICE_TBF-1] == NULL || + RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) + return -EINVAL; + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if (tb[TCA_POLICE_RESULT-1] != NULL && + RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + return -EINVAL; + if (tb[TCA_POLICE_RESULT-1] != NULL && + RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + return -EINVAL; + + if (parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + a->priv = p; + if (bind) { + p->bindcnt += 1; + p->refcnt += 1; + } + if (ovr) + goto override; + return ret; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return -ENOMEM; + memset(p, 0, sizeof(*p)); + + ret = ACT_P_CREATED; + p->refcnt = 1; + spin_lock_init(&p->lock); + p->stats_lock = &p->lock; + if (bind) + p->bindcnt = 1; +override: + if (parm->rate.rate) { + err = -ENOMEM; + R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + if (R_tab == NULL) + goto failure; + if (parm->peakrate.rate) { + P_tab = qdisc_get_rtab(&parm->peakrate, + tb[TCA_POLICE_PEAKRATE-1]); + if (p->P_tab == NULL) { + qdisc_put_rtab(R_tab); + goto failure; + } + } + } + /* No failure allowed after this point */ + spin_lock_bh(&p->lock); + if (R_tab != NULL) { + qdisc_put_rtab(p->R_tab); + p->R_tab = R_tab; + } + if (P_tab != NULL) { + qdisc_put_rtab(p->P_tab); + p->P_tab = P_tab; + } + + if (tb[TCA_POLICE_RESULT-1]) + p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) { + p->mtu = ~0; + if (p->R_tab) + p->mtu = 255<R_tab->rate.cell_log; + } + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + p->action = parm->action; + +#ifdef CONFIG_NET_ESTIMATOR + if (tb[TCA_POLICE_AVRATE-1]) + p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); + if (est) + gen_replace_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); +#endif + + spin_unlock_bh(&p->lock); + if (ret != ACT_P_CREATED) + return ret; + + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? 
: tcf_police_new_index(); + h = tcf_police_hash(p->index); + write_lock_bh(&police_lock); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + write_unlock_bh(&police_lock); + + a->priv = p; + return ret; + +failure: + if (ret == ACT_P_CREATED) + kfree(p); + return err; +} + +static int tcf_act_police_cleanup(struct tc_action *a, int bind) +{ + struct tcf_police *p = PRIV(a); + + if (p != NULL) + return tcf_police_release(p, bind); + return 0; +} + +static int tcf_act_police(struct sk_buff **pskb, struct tc_action *a) +{ + psched_time_t now; + struct sk_buff *skb = *pskb; + struct tcf_police *p = PRIV(a); + long toks; + long ptoks = 0; + + spin_lock(&p->lock); + + p->bstats.bytes += skb->len; + p->bstats.packets++; + +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; + } +#endif + + if (skb->len <= p->mtu) { + if (p->R_tab == NULL) { + spin_unlock(&p->lock); + return p->result; + } + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + spin_unlock(&p->lock); + return p->result; + } + } + + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; +} + +static int +tcf_act_police_dump(struct sk_buff *skb, struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + struct tcf_police *p = PRIV(a); + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = p->burst; + opt.refcnt = p->refcnt - ref; + opt.bindcnt = p->bindcnt - bind; + if (p->R_tab) + opt.rate = p->R_tab->rate; + else + memset(&opt.rate, 0, sizeof(opt.rate)); + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + if (p->result) + RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate) + RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +MODULE_AUTHOR("Alexey Kuznetsov"); +MODULE_DESCRIPTION("Policing actions"); +MODULE_LICENSE("GPL"); + +static struct tc_action_ops act_police_ops = { + .kind = "police", + .type = TCA_ID_POLICE, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_act_police, + .dump = tcf_act_police_dump, + .cleanup = tcf_act_police_cleanup, + .lookup = tcf_hash_search, + .init = tcf_act_police_locate, + .walk = tcf_generic_walker +}; + +static int __init +police_init_module(void) +{ + return tcf_register_action(&act_police_ops); +} + +static void __exit +police_cleanup_module(void) +{ + tcf_unregister_action(&act_police_ops); +} + +module_init(police_init_module); +module_exit(police_cleanup_module); + +#endif + +struct tcf_police * tcf_police_locate(struct rtattr *rta, struct rtattr *est) +{ + unsigned h; + struct tcf_police *p; + struct rtattr *tb[TCA_POLICE_MAX]; + struct tc_police *parm; + + if (rtattr_parse_nested(tb, TCA_POLICE_MAX, rta) < 0) + return NULL; + + if (tb[TCA_POLICE_TBF-1] == NULL || + RTA_PAYLOAD(tb[TCA_POLICE_TBF-1]) != sizeof(*parm)) + return NULL; + + parm = RTA_DATA(tb[TCA_POLICE_TBF-1]); + + if 
(parm->index && (p = tcf_police_lookup(parm->index)) != NULL) { + p->refcnt++; + return p; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) + return NULL; + + memset(p, 0, sizeof(*p)); + p->refcnt = 1; + spin_lock_init(&p->lock); + p->stats_lock = &p->lock; + if (parm->rate.rate) { + p->R_tab = qdisc_get_rtab(&parm->rate, tb[TCA_POLICE_RATE-1]); + if (p->R_tab == NULL) + goto failure; + if (parm->peakrate.rate) { + p->P_tab = qdisc_get_rtab(&parm->peakrate, + tb[TCA_POLICE_PEAKRATE-1]); + if (p->P_tab == NULL) + goto failure; + } + } + if (tb[TCA_POLICE_RESULT-1]) { + if (RTA_PAYLOAD(tb[TCA_POLICE_RESULT-1]) != sizeof(u32)) + goto failure; + p->result = *(u32*)RTA_DATA(tb[TCA_POLICE_RESULT-1]); + } +#ifdef CONFIG_NET_ESTIMATOR + if (tb[TCA_POLICE_AVRATE-1]) { + if (RTA_PAYLOAD(tb[TCA_POLICE_AVRATE-1]) != sizeof(u32)) + goto failure; + p->ewma_rate = *(u32*)RTA_DATA(tb[TCA_POLICE_AVRATE-1]); + } +#endif + p->toks = p->burst = parm->burst; + p->mtu = parm->mtu; + if (p->mtu == 0) { + p->mtu = ~0; + if (p->R_tab) + p->mtu = 255<R_tab->rate.cell_log; + } + if (p->P_tab) + p->ptoks = L2T_P(p, p->mtu); + PSCHED_GET_TIME(p->t_c); + p->index = parm->index ? : tcf_police_new_index(); + p->action = parm->action; +#ifdef CONFIG_NET_ESTIMATOR + if (est) + gen_new_estimator(&p->bstats, &p->rate_est, p->stats_lock, est); +#endif + h = tcf_police_hash(p->index); + write_lock_bh(&police_lock); + p->next = tcf_police_ht[h]; + tcf_police_ht[h] = p; + write_unlock_bh(&police_lock); + return p; + +failure: + if (p->R_tab) + qdisc_put_rtab(p->R_tab); + kfree(p); + return NULL; +} + +int tcf_police(struct sk_buff *skb, struct tcf_police *p) +{ + psched_time_t now; + long toks; + long ptoks = 0; + + spin_lock(&p->lock); + + p->bstats.bytes += skb->len; + p->bstats.packets++; + +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate && p->rate_est.bps >= p->ewma_rate) { + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; + } +#endif + + if (skb->len <= p->mtu) { + if (p->R_tab == NULL) { + spin_unlock(&p->lock); + return p->result; + } + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, p->t_c, p->burst); + + if (p->P_tab) { + ptoks = toks + p->ptoks; + if (ptoks > (long)L2T_P(p, p->mtu)) + ptoks = (long)L2T_P(p, p->mtu); + ptoks -= L2T_P(p, skb->len); + } + toks += p->toks; + if (toks > (long)p->burst) + toks = p->burst; + toks -= L2T(p, skb->len); + + if ((toks|ptoks) >= 0) { + p->t_c = now; + p->toks = toks; + p->ptoks = ptoks; + spin_unlock(&p->lock); + return p->result; + } + } + + p->qstats.overlimits++; + spin_unlock(&p->lock); + return p->action; +} + +int tcf_police_dump(struct sk_buff *skb, struct tcf_police *p) +{ + unsigned char *b = skb->tail; + struct tc_police opt; + + opt.index = p->index; + opt.action = p->action; + opt.mtu = p->mtu; + opt.burst = p->burst; + if (p->R_tab) + opt.rate = p->R_tab->rate; + else + memset(&opt.rate, 0, sizeof(opt.rate)); + if (p->P_tab) + opt.peakrate = p->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + RTA_PUT(skb, TCA_POLICE_TBF, sizeof(opt), &opt); + if (p->result) + RTA_PUT(skb, TCA_POLICE_RESULT, sizeof(int), &p->result); +#ifdef CONFIG_NET_ESTIMATOR + if (p->ewma_rate) + RTA_PUT(skb, TCA_POLICE_AVRATE, 4, &p->ewma_rate); +#endif + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +int tcf_police_dump_stats(struct sk_buff *skb, struct tcf_police *p) +{ + struct gnet_dump d; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, p->stats_lock, &d) < 
0) + goto errout; + + if (gnet_stats_copy_basic(&d, &p->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(&d, &p->rate_est) < 0 || +#endif + gnet_stats_copy_queue(&d, &p->qstats) < 0) + goto errout; + + if (gnet_stats_finish_copy(&d) < 0) + goto errout; + + return 0; + +errout: + return -1; +} + + +EXPORT_SYMBOL(tcf_police); +EXPORT_SYMBOL(tcf_police_destroy); +EXPORT_SYMBOL(tcf_police_dump); +EXPORT_SYMBOL(tcf_police_dump_stats); +EXPORT_SYMBOL(tcf_police_hash); +EXPORT_SYMBOL(tcf_police_ht); +EXPORT_SYMBOL(tcf_police_locate); +EXPORT_SYMBOL(tcf_police_lookup); +EXPORT_SYMBOL(tcf_police_new_index); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c new file mode 100644 index 000000000000..4323a74eea30 --- /dev/null +++ b/net/sched/sch_api.c @@ -0,0 +1,1296 @@ +/* + * net/sched/sch_api.c Packet scheduler API. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Fixes: + * + * Rani Assaf :980802: JIFFIES and CPU clock sources are repaired. + * Eduardo J. Blanco :990222: kmod support + * Jamal Hadi Salim : 990601: ingress support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, u32 clid, + struct Qdisc *old, struct Qdisc *new); +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event); + +/* + + Short review. + ------------- + + This file consists of two interrelated parts: + + 1. queueing disciplines manager frontend. + 2. traffic classes manager frontend. + + Generally, queueing discipline ("qdisc") is a black box, + which is able to enqueue packets and to dequeue them (when + device is ready to send something) in order and at times + determined by algorithm hidden in it. + + qdisc's are divided to two categories: + - "queues", which have no internal structure visible from outside. + - "schedulers", which split all the packets to "traffic classes", + using "packet classifiers" (look at cls_api.c) + + In turn, classes may have child qdiscs (as rule, queues) + attached to them etc. etc. etc. + + The goal of the routines in this file is to translate + information supplied by user in the form of handles + to more intelligible for kernel form, to make some sanity + checks and part of work, which is common to all qdiscs + and to provide rtnetlink notifications. + + All real intelligent work is done inside qdisc modules. + + + + Every discipline has two major routines: enqueue and dequeue. + + ---dequeue + + dequeue usually returns a skb to send. It is allowed to return NULL, + but it does not mean that queue is empty, it just means that + discipline does not want to send anything this time. + Queue is really empty if q->q.qlen == 0. + For complicated disciplines with multiple queues q->q is not + real packet queue, but however q->q.qlen must be valid. + + ---enqueue + + enqueue returns 0, if packet was enqueued successfully. + If packet (this one or another one) was dropped, it returns + not zero error code. 
+ NET_XMIT_DROP - this packet dropped + Expected action: do not backoff, but wait until queue will clear. + NET_XMIT_CN - probably this packet enqueued, but another one dropped. + Expected action: backoff or ignore + NET_XMIT_POLICED - dropped by police. + Expected action: backoff or error to real-time apps. + + Auxiliary routines: + + ---requeue + + requeues once dequeued packet. It is used for non-standard or + just buggy devices, which can defer output even if dev->tbusy=0. + + ---reset + + returns qdisc to initial state: purge all buffers, clear all + timers, counters (except for statistics) etc. + + ---init + + initializes newly created qdisc. + + ---destroy + + destroys resources allocated by init and during lifetime of qdisc. + + ---change + + changes qdisc parameters. + */ + +/* Protects list of registered TC modules. It is pure SMP lock. */ +static DEFINE_RWLOCK(qdisc_mod_lock); + + +/************************************************ + * Queueing disciplines manipulation. * + ************************************************/ + + +/* The list of all installed queueing disciplines. */ + +static struct Qdisc_ops *qdisc_base; + +/* Register/uregister queueing discipline */ + +int register_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int rc = -EEXIST; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next) + if (!strcmp(qops->id, q->id)) + goto out; + + if (qops->enqueue == NULL) + qops->enqueue = noop_qdisc_ops.enqueue; + if (qops->requeue == NULL) + qops->requeue = noop_qdisc_ops.requeue; + if (qops->dequeue == NULL) + qops->dequeue = noop_qdisc_ops.dequeue; + + qops->next = NULL; + *qp = qops; + rc = 0; +out: + write_unlock(&qdisc_mod_lock); + return rc; +} + +int unregister_qdisc(struct Qdisc_ops *qops) +{ + struct Qdisc_ops *q, **qp; + int err = -ENOENT; + + write_lock(&qdisc_mod_lock); + for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next) + if (q == qops) + break; + if (q) { + *qp = q->next; + q->next = NULL; + err = 0; + } + write_unlock(&qdisc_mod_lock); + return err; +} + +/* We know handle. Find qdisc among all qdisc's attached to device + (root qdisc, all its children, children of children etc.) 
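As an illustration of the list handling in register_qdisc()/unregister_qdisc() above: both walk qdisc_base through a pointer-to-pointer, so the head and interior nodes are handled by the same code path. A toy user-space sketch of that idiom, with invented names and the qdisc_mod_lock locking left out:

#include <string.h>

struct ops {                     /* stand-in for struct Qdisc_ops */
	const char *id;
	struct ops *next;
};

static struct ops *ops_base;     /* stand-in for qdisc_base */

/* Append at the tail unless an entry with the same id already exists. */
static int ops_register(struct ops *o)
{
	struct ops *q, **qp;

	for (qp = &ops_base; (q = *qp) != NULL; qp = &q->next)
		if (strcmp(o->id, q->id) == 0)
			return -1;       /* -EEXIST in the kernel code */
	o->next = NULL;
	*qp = o;                         /* *qp is the tail's next pointer */
	return 0;
}

/* Unlink by walking the same pointer-to-pointer chain. */
static int ops_unregister(struct ops *o)
{
	struct ops *q, **qp;

	for (qp = &ops_base; (q = *qp) != NULL; qp = &q->next)
		if (q == o) {
			*qp = q->next;   /* head and interior nodes alike */
			q->next = NULL;
			return 0;
		}
	return -1;                       /* -ENOENT in the kernel code */
}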
+ */ + +struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) +{ + struct Qdisc *q; + + read_lock_bh(&qdisc_tree_lock); + list_for_each_entry(q, &dev->qdisc_list, list) { + if (q->handle == handle) { + read_unlock_bh(&qdisc_tree_lock); + return q; + } + } + read_unlock_bh(&qdisc_tree_lock); + return NULL; +} + +static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid) +{ + unsigned long cl; + struct Qdisc *leaf; + struct Qdisc_class_ops *cops = p->ops->cl_ops; + + if (cops == NULL) + return NULL; + cl = cops->get(p, classid); + + if (cl == 0) + return NULL; + leaf = cops->leaf(p, cl); + cops->put(p, cl); + return leaf; +} + +/* Find queueing discipline by name */ + +static struct Qdisc_ops *qdisc_lookup_ops(struct rtattr *kind) +{ + struct Qdisc_ops *q = NULL; + + if (kind) { + read_lock(&qdisc_mod_lock); + for (q = qdisc_base; q; q = q->next) { + if (rtattr_strcmp(kind, q->id) == 0) { + if (!try_module_get(q->owner)) + q = NULL; + break; + } + } + read_unlock(&qdisc_mod_lock); + } + return q; +} + +static struct qdisc_rate_table *qdisc_rtab_list; + +struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct rtattr *tab) +{ + struct qdisc_rate_table *rtab; + + for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) { + if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) { + rtab->refcnt++; + return rtab; + } + } + + if (tab == NULL || r->rate == 0 || r->cell_log == 0 || RTA_PAYLOAD(tab) != 1024) + return NULL; + + rtab = kmalloc(sizeof(*rtab), GFP_KERNEL); + if (rtab) { + rtab->rate = *r; + rtab->refcnt = 1; + memcpy(rtab->data, RTA_DATA(tab), 1024); + rtab->next = qdisc_rtab_list; + qdisc_rtab_list = rtab; + } + return rtab; +} + +void qdisc_put_rtab(struct qdisc_rate_table *tab) +{ + struct qdisc_rate_table *rtab, **rtabp; + + if (!tab || --tab->refcnt) + return; + + for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) { + if (rtab == tab) { + *rtabp = rtab->next; + kfree(rtab); + return; + } + } +} + + +/* Allocate an unique handle from space managed by kernel */ + +static u32 qdisc_alloc_handle(struct net_device *dev) +{ + int i = 0x10000; + static u32 autohandle = TC_H_MAKE(0x80000000U, 0); + + do { + autohandle += TC_H_MAKE(0x10000U, 0); + if (autohandle == TC_H_MAKE(TC_H_ROOT, 0)) + autohandle = TC_H_MAKE(0x80000000U, 0); + } while (qdisc_lookup(dev, autohandle) && --i > 0); + + return i>0 ? autohandle : 0; +} + +/* Attach toplevel qdisc to device dev */ + +static struct Qdisc * +dev_graft_qdisc(struct net_device *dev, struct Qdisc *qdisc) +{ + struct Qdisc *oqdisc; + + if (dev->flags & IFF_UP) + dev_deactivate(dev); + + qdisc_lock_tree(dev); + if (qdisc && qdisc->flags&TCQ_F_INGRESS) { + oqdisc = dev->qdisc_ingress; + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) { + /* delete */ + qdisc_reset(oqdisc); + dev->qdisc_ingress = NULL; + } else { /* new */ + dev->qdisc_ingress = qdisc; + } + + } else { + + oqdisc = dev->qdisc_sleeping; + + /* Prune old scheduler */ + if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1) + qdisc_reset(oqdisc); + + /* ... and graft new one */ + if (qdisc == NULL) + qdisc = &noop_qdisc; + dev->qdisc_sleeping = qdisc; + dev->qdisc = &noop_qdisc; + } + + qdisc_unlock_tree(dev); + + if (dev->flags & IFF_UP) + dev_activate(dev); + + return oqdisc; +} + + +/* Graft qdisc "new" to class "classid" of qdisc "parent" or + to device "dev". + + Old qdisc is not destroyed but returned in *old. 
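An aside on qdisc_get_rtab() above: the 1024-byte attribute it copies is a table of 256 precomputed transmission times, indexed by packet length shifted right by cell_log, and the policer earlier in this patch consumes such a table through its L2T() lookups inside a classic token bucket. A rough user-space sketch of both pieces, using microseconds instead of psched ticks and invented names throughout:

#include <stdint.h>

#define RTAB_SIZE 256

struct toy_rtab {
	uint32_t bytes_per_sec;
	int      cell_log;               /* lengths are grouped into 2^cell_log byte cells */
	uint32_t usec[RTAB_SIZE];        /* time to transmit a packet in that cell */
};

static void rtab_build(struct toy_rtab *r, uint32_t bytes_per_sec, int cell_log)
{
	int i;

	r->bytes_per_sec = bytes_per_sec;
	r->cell_log = cell_log;
	for (i = 0; i < RTAB_SIZE; i++) {
		uint64_t len = (uint64_t)(i + 1) << cell_log;   /* round up to the cell */
		r->usec[i] = (uint32_t)(len * 1000000 / bytes_per_sec);
	}
}

/* Counterpart of the L2T() macros: packet length -> transmission time. */
static uint32_t len_to_time(const struct toy_rtab *r, unsigned int len)
{
	unsigned int slot = len >> r->cell_log;

	return r->usec[slot < RTAB_SIZE ? slot : RTAB_SIZE - 1];
}

/* Token bucket in the style of tcf_police(): the credit is stored as time. */
struct toy_bucket {
	int64_t  toks;                   /* remaining credit, usec */
	int64_t  burst;                  /* cap on the credit, usec */
	uint64_t t_c;                    /* timestamp of the last update, usec */
};

static int bucket_conforms(struct toy_bucket *b, const struct toy_rtab *r,
			   unsigned int len, uint64_t now_usec)
{
	int64_t toks = b->toks + (int64_t)(now_usec - b->t_c);

	if (toks > b->burst)
		toks = b->burst;
	toks -= len_to_time(r, len);
	if (toks < 0)
		return 0;                /* over rate: the policer would return p->action */
	b->t_c = now_usec;
	b->toks = toks;
	return 1;                        /* conforming: the policer returns p->result */
}

The peak-rate bucket (ptoks) in tcf_police() follows the same scheme, with the configured MTU playing the role of the burst.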
+ */ + +static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, + u32 classid, + struct Qdisc *new, struct Qdisc **old) +{ + int err = 0; + struct Qdisc *q = *old; + + + if (parent == NULL) { + if (q && q->flags&TCQ_F_INGRESS) { + *old = dev_graft_qdisc(dev, q); + } else { + *old = dev_graft_qdisc(dev, new); + } + } else { + struct Qdisc_class_ops *cops = parent->ops->cl_ops; + + err = -EINVAL; + + if (cops) { + unsigned long cl = cops->get(parent, classid); + if (cl) { + err = cops->graft(parent, cl, new, old); + if (new) + new->parent = classid; + cops->put(parent, cl); + } + } + } + return err; +} + +/* + Allocate and initialize new qdisc. + + Parameters are passed via opt. + */ + +static struct Qdisc * +qdisc_create(struct net_device *dev, u32 handle, struct rtattr **tca, int *errp) +{ + int err; + struct rtattr *kind = tca[TCA_KIND-1]; + void *p = NULL; + struct Qdisc *sch; + struct Qdisc_ops *ops; + int size; + + ops = qdisc_lookup_ops(kind); +#ifdef CONFIG_KMOD + if (ops == NULL && kind != NULL) { + char name[IFNAMSIZ]; + if (rtattr_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) { + /* We dropped the RTNL semaphore in order to + * perform the module load. So, even if we + * succeeded in loading the module we have to + * tell the caller to replay the request. We + * indicate this using -EAGAIN. + * We replay the request because the device may + * go away in the mean time. + */ + rtnl_unlock(); + request_module("sch_%s", name); + rtnl_lock(); + ops = qdisc_lookup_ops(kind); + if (ops != NULL) { + /* We will try again qdisc_lookup_ops, + * so don't keep a reference. + */ + module_put(ops->owner); + err = -EAGAIN; + goto err_out; + } + } + } +#endif + + err = -EINVAL; + if (ops == NULL) + goto err_out; + + /* ensure that the Qdisc and the private data are 32-byte aligned */ + size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); + size += ops->priv_size + QDISC_ALIGN_CONST; + + p = kmalloc(size, GFP_KERNEL); + err = -ENOBUFS; + if (!p) + goto err_out2; + memset(p, 0, size); + sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) + & ~QDISC_ALIGN_CONST); + sch->padded = (char *)sch - (char *)p; + + INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->q); + + if (handle == TC_H_INGRESS) + sch->flags |= TCQ_F_INGRESS; + + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + dev_hold(dev); + atomic_set(&sch->refcnt, 1); + sch->stats_lock = &dev->queue_lock; + if (handle == 0) { + handle = qdisc_alloc_handle(dev); + err = -ENOMEM; + if (handle == 0) + goto err_out3; + } + + if (handle == TC_H_INGRESS) + sch->handle =TC_H_MAKE(TC_H_INGRESS, 0); + else + sch->handle = handle; + + if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS-1])) == 0) { + qdisc_lock_tree(dev); + list_add_tail(&sch->list, &dev->qdisc_list); + qdisc_unlock_tree(dev); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, tca[TCA_RATE-1]); +#endif + return sch; + } +err_out3: + dev_put(dev); +err_out2: + module_put(ops->owner); +err_out: + *errp = err; + if (p) + kfree(p); + return NULL; +} + +static int qdisc_change(struct Qdisc *sch, struct rtattr **tca) +{ + if (tca[TCA_OPTIONS-1]) { + int err; + + if (sch->ops->change == NULL) + return -EINVAL; + err = sch->ops->change(sch, tca[TCA_OPTIONS-1]); + if (err) + return err; + } +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&sch->bstats, &sch->rate_est, + sch->stats_lock, tca[TCA_RATE-1]); +#endif + 
return 0; +} + +struct check_loop_arg +{ + struct qdisc_walker w; + struct Qdisc *p; + int depth; +}; + +static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w); + +static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth) +{ + struct check_loop_arg arg; + + if (q->ops->cl_ops == NULL) + return 0; + + arg.w.stop = arg.w.skip = arg.w.count = 0; + arg.w.fn = check_loop_fn; + arg.depth = depth; + arg.p = p; + q->ops->cl_ops->walk(q, &arg.w); + return arg.w.stop ? -ELOOP : 0; +} + +static int +check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) +{ + struct Qdisc *leaf; + struct Qdisc_class_ops *cops = q->ops->cl_ops; + struct check_loop_arg *arg = (struct check_loop_arg *)w; + + leaf = cops->leaf(q, cl); + if (leaf) { + if (leaf == arg->p || arg->depth > 7) + return -ELOOP; + return check_loop(leaf, arg->p, arg->depth + 1); + } + return 0; +} + +/* + * Delete/get qdisc. + */ + +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + u32 clid = tcm->tcm_parent; + struct Qdisc *q = NULL; + struct Qdisc *p = NULL; + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + if (clid) { + if (clid != TC_H_ROOT) { + if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) { + if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else { /* ingress */ + q = dev->qdisc_ingress; + } + } else { + q = dev->qdisc_sleeping; + } + if (!q) + return -ENOENT; + + if (tcm->tcm_handle && q->handle != tcm->tcm_handle) + return -EINVAL; + } else { + if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + return -ENOENT; + } + + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + + if (n->nlmsg_type == RTM_DELQDISC) { + if (!clid) + return -EINVAL; + if (q->handle == 0) + return -ENOENT; + if ((err = qdisc_graft(dev, p, clid, NULL, &q)) != 0) + return err; + if (q) { + qdisc_notify(skb, n, clid, q, NULL); + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(q); + spin_unlock_bh(&dev->queue_lock); + } + } else { + qdisc_notify(skb, n, clid, NULL, q); + } + return 0; +} + +/* + Create/change qdisc. + */ + +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm; + struct rtattr **tca; + struct net_device *dev; + u32 clid; + struct Qdisc *q, *p; + int err; + +replay: + /* Reinit, just in case something touches this. 
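The replay label works together with the -EAGAIN path in qdisc_create(): the RTNL semaphore has to be dropped around request_module(), and once it has been dropped nothing validated earlier can be trusted, so the whole request is re-parsed from the top. A toy, self-contained version of that retry shape (everything here is a stand-in, not kernel API):

#include <stdio.h>

#define TOY_EAGAIN 11                    /* stands in for -EAGAIN */

static int handler_available;            /* toy state: is the module "loaded"? */

static void toy_unlock(void) { }         /* placeholders for rtnl_unlock()/rtnl_lock() */
static void toy_lock(void)   { }

/* Mimics qdisc_create(): if the handler is missing, drop the lock, "load" it,
 * retake the lock and ask the caller to replay the whole request. */
static int toy_create(void)
{
	if (handler_available)
		return 0;

	toy_unlock();
	handler_available = 1;           /* stands in for request_module("sch_%s", ...) */
	toy_lock();
	return -TOY_EAGAIN;              /* state may have changed meanwhile: replay */
}

int main(void)
{
	int err;

	do {                             /* equivalent of jumping back to "replay:" */
		err = toy_create();
	} while (err == -TOY_EAGAIN);

	printf("created, err=%d\n", err);
	return 0;
}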
*/ + tcm = NLMSG_DATA(n); + tca = arg; + clid = tcm->tcm_parent; + q = p = NULL; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + if (clid) { + if (clid != TC_H_ROOT) { + if (clid != TC_H_INGRESS) { + if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) + return -ENOENT; + q = qdisc_leaf(p, clid); + } else { /*ingress */ + q = dev->qdisc_ingress; + } + } else { + q = dev->qdisc_sleeping; + } + + /* It may be default qdisc, ignore it */ + if (q && q->handle == 0) + q = NULL; + + if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) { + if (tcm->tcm_handle) { + if (q && !(n->nlmsg_flags&NLM_F_REPLACE)) + return -EEXIST; + if (TC_H_MIN(tcm->tcm_handle)) + return -EINVAL; + if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL) + goto create_n_graft; + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + if (q == p || + (p && check_loop(q, p, 0))) + return -ELOOP; + atomic_inc(&q->refcnt); + goto graft; + } else { + if (q == NULL) + goto create_n_graft; + + /* This magic test requires explanation. + * + * We know, that some child q is already + * attached to this parent and have choice: + * either to change it or to create/graft new one. + * + * 1. We are allowed to create/graft only + * if CREATE and REPLACE flags are set. + * + * 2. If EXCL is set, requestor wanted to say, + * that qdisc tcm_handle is not expected + * to exist, so that we choose create/graft too. + * + * 3. The last case is when no flags are set. + * Alas, it is sort of hole in API, we + * cannot decide what to do unambiguously. + * For now we select create/graft, if + * user gave KIND, which does not match existing. + */ + if ((n->nlmsg_flags&NLM_F_CREATE) && + (n->nlmsg_flags&NLM_F_REPLACE) && + ((n->nlmsg_flags&NLM_F_EXCL) || + (tca[TCA_KIND-1] && + rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)))) + goto create_n_graft; + } + } + } else { + if (!tcm->tcm_handle) + return -EINVAL; + q = qdisc_lookup(dev, tcm->tcm_handle); + } + + /* Change qdisc parameters */ + if (q == NULL) + return -ENOENT; + if (n->nlmsg_flags&NLM_F_EXCL) + return -EEXIST; + if (tca[TCA_KIND-1] && rtattr_strcmp(tca[TCA_KIND-1], q->ops->id)) + return -EINVAL; + err = qdisc_change(q, tca); + if (err == 0) + qdisc_notify(skb, n, clid, NULL, q); + return err; + +create_n_graft: + if (!(n->nlmsg_flags&NLM_F_CREATE)) + return -ENOENT; + if (clid == TC_H_INGRESS) + q = qdisc_create(dev, tcm->tcm_parent, tca, &err); + else + q = qdisc_create(dev, tcm->tcm_handle, tca, &err); + if (q == NULL) { + if (err == -EAGAIN) + goto replay; + return err; + } + +graft: + if (1) { + struct Qdisc *old_q = NULL; + err = qdisc_graft(dev, p, clid, q, &old_q); + if (err) { + if (q) { + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(q); + spin_unlock_bh(&dev->queue_lock); + } + return err; + } + qdisc_notify(skb, n, clid, old_q, q); + if (old_q) { + spin_lock_bh(&dev->queue_lock); + qdisc_destroy(old_q); + spin_unlock_bh(&dev->queue_lock); + } + } + return 0; +} + +static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct gnet_dump d; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm_parent = clid; + tcm->tcm_handle = q->handle; + tcm->tcm_info = atomic_read(&q->refcnt); + 
RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (q->ops->dump && q->ops->dump(q, skb) < 0) + goto rtattr_failure; + q->qstats.qlen = q->q.qlen; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, q->stats_lock, &d) < 0) + goto rtattr_failure; + + if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0) + goto rtattr_failure; + + if (gnet_stats_copy_basic(&d, &q->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 || +#endif + gnet_stats_copy_queue(&d, &q->qstats) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto rtattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int qdisc_notify(struct sk_buff *oskb, struct nlmsghdr *n, + u32 clid, struct Qdisc *old, struct Qdisc *new) +{ + struct sk_buff *skb; + u32 pid = oskb ? NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (old && old->handle) { + if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) + goto err_out; + } + if (new) { + if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) + goto err_out; + } + + if (skb->len) + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); + +err_out: + kfree_skb(skb); + return -EINVAL; +} + +static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) +{ + int idx, q_idx; + int s_idx, s_q_idx; + struct net_device *dev; + struct Qdisc *q; + + s_idx = cb->args[0]; + s_q_idx = q_idx = cb->args[1]; + read_lock(&dev_base_lock); + for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { + if (idx < s_idx) + continue; + if (idx > s_idx) + s_q_idx = 0; + read_lock_bh(&qdisc_tree_lock); + q_idx = 0; + list_for_each_entry(q, &dev->qdisc_list, list) { + if (q_idx < s_q_idx) { + q_idx++; + continue; + } + if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { + read_unlock_bh(&qdisc_tree_lock); + goto done; + } + q_idx++; + } + read_unlock_bh(&qdisc_tree_lock); + } + +done: + read_unlock(&dev_base_lock); + + cb->args[0] = idx; + cb->args[1] = q_idx; + + return skb->len; +} + + + +/************************************************ + * Traffic classes manipulation. * + ************************************************/ + + + +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg) +{ + struct tcmsg *tcm = NLMSG_DATA(n); + struct rtattr **tca = arg; + struct net_device *dev; + struct Qdisc *q = NULL; + struct Qdisc_class_ops *cops; + unsigned long cl = 0; + unsigned long new_cl; + u32 pid = tcm->tcm_parent; + u32 clid = tcm->tcm_handle; + u32 qid = TC_H_MAJ(clid); + int err; + + if ((dev = __dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return -ENODEV; + + /* + parent == TC_H_UNSPEC - unspecified parent. + parent == TC_H_ROOT - class is root, which has no parent. + parent == X:0 - parent is root class. + parent == X:Y - parent is a node in hierarchy. + parent == 0:Y - parent is X:Y, where X:0 is qdisc. + + handle == 0:0 - generate handle from kernel pool. + handle == 0:Y - class is X:Y, where X:0 is qdisc. + handle == X:Y - clear. + handle == X:0 - root class. + */ + + /* Step 1. Determine qdisc handle X:0 */ + + if (pid != TC_H_ROOT) { + u32 qid1 = TC_H_MAJ(pid); + + if (qid && qid1) { + /* If both majors are known, they must be identical. 
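The qid/qid1 juggling here is plain 32-bit handle arithmetic: a handle is major:minor, with the qdisc number in the upper 16 bits and the class number in the lower 16. A small standalone sketch of how an unspecified major is completed from the qdisc handle, using local copies of the TC_H_* helpers and made-up values:

#include <stdint.h>
#include <stdio.h>

/* Local copies of the TC_H_* helpers: 16-bit major, 16-bit minor. */
#define H_MAJ(h)         ((h) & 0xFFFF0000U)
#define H_MIN(h)         ((h) & 0x0000FFFFU)
#define H_MAKE(maj, min) (((maj) & 0xFFFF0000U) | ((min) & 0x0000FFFFU))

int main(void)
{
	uint32_t qid    = 0x00010000U;    /* qdisc "1:0" */
	uint32_t parent = H_MAKE(0, 5);   /* request said "0:5": major unspecified */
	uint32_t clid   = H_MAKE(0, 7);   /* request said "0:7" */

	/* Complete the missing majors from the qdisc, as tc_ctl_tclass does. */
	if (parent && !H_MAJ(parent))
		parent = H_MAKE(qid, parent);          /* becomes 1:5 */
	if (clid && !H_MAJ(clid))
		clid = H_MAKE(qid, clid);              /* becomes 1:7 */

	printf("parent %x:%x class %x:%x\n",
	       (unsigned)(H_MAJ(parent) >> 16), (unsigned)H_MIN(parent),
	       (unsigned)(H_MAJ(clid) >> 16), (unsigned)H_MIN(clid));
	return 0;
}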
*/ + if (qid != qid1) + return -EINVAL; + } else if (qid1) { + qid = qid1; + } else if (qid == 0) + qid = dev->qdisc_sleeping->handle; + + /* Now qid is genuine qdisc handle consistent + both with parent and child. + + TC_H_MAJ(pid) still may be unspecified, complete it now. + */ + if (pid) + pid = TC_H_MAKE(qid, pid); + } else { + if (qid == 0) + qid = dev->qdisc_sleeping->handle; + } + + /* OK. Locate qdisc */ + if ((q = qdisc_lookup(dev, qid)) == NULL) + return -ENOENT; + + /* An check that it supports classes */ + cops = q->ops->cl_ops; + if (cops == NULL) + return -EINVAL; + + /* Now try to get class */ + if (clid == 0) { + if (pid == TC_H_ROOT) + clid = qid; + } else + clid = TC_H_MAKE(qid, clid); + + if (clid) + cl = cops->get(q, clid); + + if (cl == 0) { + err = -ENOENT; + if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE)) + goto out; + } else { + switch (n->nlmsg_type) { + case RTM_NEWTCLASS: + err = -EEXIST; + if (n->nlmsg_flags&NLM_F_EXCL) + goto out; + break; + case RTM_DELTCLASS: + err = cops->delete(q, cl); + if (err == 0) + tclass_notify(skb, n, q, cl, RTM_DELTCLASS); + goto out; + case RTM_GETTCLASS: + err = tclass_notify(skb, n, q, cl, RTM_NEWTCLASS); + goto out; + default: + err = -EINVAL; + goto out; + } + } + + new_cl = cl; + err = cops->change(q, clid, pid, tca, &new_cl); + if (err == 0) + tclass_notify(skb, n, q, new_cl, RTM_NEWTCLASS); + +out: + if (cl) + cops->put(q, cl); + + return err; +} + + +static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q, + unsigned long cl, + u32 pid, u32 seq, unsigned flags, int event) +{ + struct tcmsg *tcm; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + struct gnet_dump d; + struct Qdisc_class_ops *cl_ops = q->ops->cl_ops; + + nlh = NLMSG_PUT(skb, pid, seq, event, sizeof(*tcm)); + nlh->nlmsg_flags = flags; + tcm = NLMSG_DATA(nlh); + tcm->tcm_family = AF_UNSPEC; + tcm->tcm_ifindex = q->dev->ifindex; + tcm->tcm_parent = q->handle; + tcm->tcm_handle = q->handle; + tcm->tcm_info = 0; + RTA_PUT(skb, TCA_KIND, IFNAMSIZ, q->ops->id); + if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0) + goto rtattr_failure; + + if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, + TCA_XSTATS, q->stats_lock, &d) < 0) + goto rtattr_failure; + + if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0) + goto rtattr_failure; + + if (gnet_stats_finish_copy(&d) < 0) + goto rtattr_failure; + + nlh->nlmsg_len = skb->tail - b; + return skb->len; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tclass_notify(struct sk_buff *oskb, struct nlmsghdr *n, + struct Qdisc *q, unsigned long cl, int event) +{ + struct sk_buff *skb; + u32 pid = oskb ? 
NETLINK_CB(oskb).pid : 0; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); + if (!skb) + return -ENOBUFS; + + if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) { + kfree_skb(skb); + return -EINVAL; + } + + return rtnetlink_send(skb, pid, RTMGRP_TC, n->nlmsg_flags&NLM_F_ECHO); +} + +struct qdisc_dump_args +{ + struct qdisc_walker w; + struct sk_buff *skb; + struct netlink_callback *cb; +}; + +static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg) +{ + struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg; + + return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid, + a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS); +} + +static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) +{ + int t; + int s_t; + struct net_device *dev; + struct Qdisc *q; + struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh); + struct qdisc_dump_args arg; + + if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm))) + return 0; + if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) + return 0; + + s_t = cb->args[0]; + t = 0; + + read_lock_bh(&qdisc_tree_lock); + list_for_each_entry(q, &dev->qdisc_list, list) { + if (t < s_t || !q->ops->cl_ops || + (tcm->tcm_parent && + TC_H_MAJ(tcm->tcm_parent) != q->handle)) { + t++; + continue; + } + if (t > s_t) + memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); + arg.w.fn = qdisc_class_dump; + arg.skb = skb; + arg.cb = cb; + arg.w.stop = 0; + arg.w.skip = cb->args[1]; + arg.w.count = 0; + q->ops->cl_ops->walk(q, &arg.w); + cb->args[1] = arg.w.count; + if (arg.w.stop) + break; + t++; + } + read_unlock_bh(&qdisc_tree_lock); + + cb->args[0] = t; + + dev_put(dev); + return skb->len; +} + +/* Main classifier routine: scans classifier chain attached + to this qdisc, (optionally) tests for protocol and asks + specific classifiers. 
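tc_classify() below walks the filter chain in order, skips filters whose protocol does not match, and (under CONFIG_NET_CLS_ACT) caps how often an action may request reclassification before the packet is dropped. A stripped-down sketch of that control flow with toy types and verdict values (none of these are the kernel definitions):

#include <stdint.h>
#include <stddef.h>

#define TOY_ETH_P_ALL   0x0003           /* "match every protocol" */
#define TOY_OK          0                /* toy verdicts, not the TC_ACT_* values */
#define TOY_RECLASSIFY  1
#define TOY_SHOT        2
#define TOY_MAX_LOOP    4                /* cap on reclassification, like MAX_REC_LOOP */

struct toy_filter {
	uint16_t protocol;                             /* protocol to match, or "all" */
	int    (*classify)(const void *pkt, uint32_t *classid);
	struct toy_filter *next;
};

/* Returns a verdict >= 0, or -1 if no filter claimed the packet. */
static int toy_classify(const void *pkt, uint16_t proto,
			struct toy_filter *chain, uint32_t *classid)
{
	struct toy_filter *f;
	int loops = 0;

restart:
	for (f = chain; f != NULL; f = f->next) {
		int verdict;

		if (f->protocol != proto && f->protocol != TOY_ETH_P_ALL)
			continue;
		verdict = f->classify(pkt, classid);
		if (verdict < 0)
			continue;                  /* this filter declined, try the next */
		if (verdict == TOY_RECLASSIFY) {
			if (++loops > TOY_MAX_LOOP)
				return TOY_SHOT;   /* runaway loop: drop the packet */
			goto restart;              /* start again from the chain head */
		}
		return verdict;                    /* TOY_OK, TOY_SHOT, class chosen, ... */
	}
	return -1;                                 /* caller falls back to a default class */
}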
+ */ +int tc_classify(struct sk_buff *skb, struct tcf_proto *tp, + struct tcf_result *res) +{ + int err = 0; + u32 protocol = skb->protocol; +#ifdef CONFIG_NET_CLS_ACT + struct tcf_proto *otp = tp; +reclassify: +#endif + protocol = skb->protocol; + + for ( ; tp; tp = tp->next) { + if ((tp->protocol == protocol || + tp->protocol == __constant_htons(ETH_P_ALL)) && + (err = tp->classify(skb, tp, res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + if ( TC_ACT_RECLASSIFY == err) { + __u32 verd = (__u32) G_TC_VERD(skb->tc_verd); + tp = otp; + + if (MAX_REC_LOOP < verd++) { + printk("rule prio %d protocol %02x reclassify is buggy packet dropped\n", + tp->prio&0xffff, ntohs(tp->protocol)); + return TC_ACT_SHOT; + } + skb->tc_verd = SET_TC_VERD(skb->tc_verd,verd); + goto reclassify; + } else { + if (skb->tc_verd) + skb->tc_verd = SET_TC_VERD(skb->tc_verd,0); + return err; + } +#else + + return err; +#endif + } + + } + return -1; +} + +static int psched_us_per_tick = 1; +static int psched_tick_per_us = 1; + +#ifdef CONFIG_PROC_FS +static int psched_show(struct seq_file *seq, void *v) +{ + seq_printf(seq, "%08x %08x %08x %08x\n", + psched_tick_per_us, psched_us_per_tick, + 1000000, HZ); + + return 0; +} + +static int psched_open(struct inode *inode, struct file *file) +{ + return single_open(file, psched_show, PDE(inode)->data); +} + +static struct file_operations psched_fops = { + .owner = THIS_MODULE, + .open = psched_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; +#endif + +#ifdef CONFIG_NET_SCH_CLK_CPU +psched_tdiff_t psched_clock_per_hz; +int psched_clock_scale; +EXPORT_SYMBOL(psched_clock_per_hz); +EXPORT_SYMBOL(psched_clock_scale); + +psched_time_t psched_time_base; +cycles_t psched_time_mark; +EXPORT_SYMBOL(psched_time_mark); +EXPORT_SYMBOL(psched_time_base); + +/* + * Periodically adjust psched_time_base to avoid overflow + * with 32-bit get_cycles(). Safe up to 4GHz CPU. + */ +static void psched_tick(unsigned long); +static struct timer_list psched_timer = TIMER_INITIALIZER(psched_tick, 0, 0); + +static void psched_tick(unsigned long dummy) +{ + if (sizeof(cycles_t) == sizeof(u32)) { + psched_time_t dummy_stamp; + PSCHED_GET_TIME(dummy_stamp); + psched_timer.expires = jiffies + 1*HZ; + add_timer(&psched_timer); + } +} + +int __init psched_calibrate_clock(void) +{ + psched_time_t stamp, stamp1; + struct timeval tv, tv1; + psched_tdiff_t delay; + long rdelay; + unsigned long stop; + + psched_tick(0); + stop = jiffies + HZ/10; + PSCHED_GET_TIME(stamp); + do_gettimeofday(&tv); + while (time_before(jiffies, stop)) { + barrier(); + cpu_relax(); + } + PSCHED_GET_TIME(stamp1); + do_gettimeofday(&tv1); + + delay = PSCHED_TDIFF(stamp1, stamp); + rdelay = tv1.tv_usec - tv.tv_usec; + rdelay += (tv1.tv_sec - tv.tv_sec)*1000000; + if (rdelay > delay) + return -1; + delay /= rdelay; + psched_tick_per_us = delay; + while ((delay>>=1) != 0) + psched_clock_scale++; + psched_us_per_tick = 1<>psched_clock_scale; + return 0; +} +#endif + +static int __init pktsched_init(void) +{ + struct rtnetlink_link *link_p; + +#ifdef CONFIG_NET_SCH_CLK_CPU + if (psched_calibrate_clock() < 0) + return -1; +#elif defined(CONFIG_NET_SCH_CLK_JIFFIES) + psched_tick_per_us = HZ< +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for fput */ +#include +#include + + +extern struct socket *sockfd_lookup(int fd, int *err); /* @@@ fix this */ + +#if 0 /* control */ +#define DPRINTK(format,args...) 
printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +/* + * The ATM queuing discipline provides a framework for invoking classifiers + * (aka "filters"), which in turn select classes of this queuing discipline. + * Each class maps the flow(s) it is handling to a given VC. Multiple classes + * may share the same VC. + * + * When creating a class, VCs are specified by passing the number of the open + * socket descriptor by which the calling process references the VC. The kernel + * keeps the VC open at least until all classes using it are removed. + * + * In this file, most functions are named atm_tc_* to avoid confusion with all + * the atm_* in net/atm. This naming convention differs from what's used in the + * rest of net/sched. + * + * Known bugs: + * - sometimes messes up the IP stack + * - any manipulations besides the few operations described in the README, are + * untested and likely to crash the system + * - should lock the flow while there is data in the queue (?) + */ + + +#define PRIV(sch) qdisc_priv(sch) +#define VCC2FLOW(vcc) ((struct atm_flow_data *) ((vcc)->user_back)) + + +struct atm_flow_data { + struct Qdisc *q; /* FIFO, TBF, etc. */ + struct tcf_proto *filter_list; + struct atm_vcc *vcc; /* VCC; NULL if VCC is closed */ + void (*old_pop)(struct atm_vcc *vcc,struct sk_buff *skb); /* chaining */ + struct atm_qdisc_data *parent; /* parent qdisc */ + struct socket *sock; /* for closing */ + u32 classid; /* x:y type ID */ + int ref; /* reference count */ + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + spinlock_t *stats_lock; + struct atm_flow_data *next; + struct atm_flow_data *excess; /* flow for excess traffic; + NULL to set CLP instead */ + int hdr_len; + unsigned char hdr[0]; /* header data; MUST BE LAST */ +}; + +struct atm_qdisc_data { + struct atm_flow_data link; /* unclassified skbs go here */ + struct atm_flow_data *flows; /* NB: "link" is also on this + list */ + struct tasklet_struct task; /* requeue tasklet */ +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int find_flow(struct atm_qdisc_data *qdisc,struct atm_flow_data *flow) +{ + struct atm_flow_data *walk; + + DPRINTK("find_flow(qdisc %p,flow %p)\n",qdisc,flow); + for (walk = qdisc->flows; walk; walk = walk->next) + if (walk == flow) return 1; + DPRINTK("find_flow: not found\n"); + return 0; +} + + +static __inline__ struct atm_flow_data *lookup_flow(struct Qdisc *sch, + u32 classid) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + for (flow = p->flows; flow; flow = flow->next) + if (flow->classid == classid) break; + return flow; +} + + +static int atm_tc_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + DPRINTK("atm_tc_graft(sch %p,[qdisc %p],flow %p,new %p,old %p)\n",sch, + p,flow,new,old); + if (!find_flow(p,flow)) return -EINVAL; + if (!new) new = &noop_qdisc; + *old = xchg(&flow->q,new); + if (*old) qdisc_reset(*old); + return 0; +} + + +static struct Qdisc *atm_tc_leaf(struct Qdisc *sch,unsigned long cl) +{ + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + + DPRINTK("atm_tc_leaf(sch %p,flow %p)\n",sch,flow); + return flow ? 
flow->q : NULL; +} + + +static unsigned long atm_tc_get(struct Qdisc *sch,u32 classid) +{ + struct atm_qdisc_data *p __attribute__((unused)) = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); + flow = lookup_flow(sch,classid); + if (flow) flow->ref++; + DPRINTK("atm_tc_get: flow %p\n",flow); + return (unsigned long) flow; +} + + +static unsigned long atm_tc_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return atm_tc_get(sch,classid); +} + + +static void destroy_filters(struct atm_flow_data *flow) +{ + struct tcf_proto *filter; + + while ((filter = flow->filter_list)) { + DPRINTK("destroy_filters: destroying filter %p\n",filter); + flow->filter_list = filter->next; + tcf_destroy(filter); + } +} + + +/* + * atm_tc_put handles all destructions, including the ones that are explicitly + * requested (atm_tc_destroy, etc.). The assumption here is that we never drop + * anything that still seems to be in use. + */ + +static void atm_tc_put(struct Qdisc *sch, unsigned long cl) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + struct atm_flow_data **prev; + + DPRINTK("atm_tc_put(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + if (--flow->ref) return; + DPRINTK("atm_tc_put: destroying\n"); + for (prev = &p->flows; *prev; prev = &(*prev)->next) + if (*prev == flow) break; + if (!*prev) { + printk(KERN_CRIT "atm_tc_put: class %p not found\n",flow); + return; + } + *prev = flow->next; + DPRINTK("atm_tc_put: qdisc %p\n",flow->q); + qdisc_destroy(flow->q); + destroy_filters(flow); + if (flow->sock) { + DPRINTK("atm_tc_put: f_count %d\n", + file_count(flow->sock->file)); + flow->vcc->pop = flow->old_pop; + sockfd_put(flow->sock); + } + if (flow->excess) atm_tc_put(sch,(unsigned long) flow->excess); + if (flow != &p->link) kfree(flow); + /* + * If flow == &p->link, the qdisc no longer works at this point and + * needs to be removed. (By the caller of atm_tc_put.) + */ +} + + +static void sch_atm_pop(struct atm_vcc *vcc,struct sk_buff *skb) +{ + struct atm_qdisc_data *p = VCC2FLOW(vcc)->parent; + + D2PRINTK("sch_atm_pop(vcc %p,skb %p,[qdisc %p])\n",vcc,skb,p); + VCC2FLOW(vcc)->old_pop(vcc,skb); + tasklet_schedule(&p->task); +} + +static const u8 llc_oui_ip[] = { + 0xaa, /* DSAP: non-ISO */ + 0xaa, /* SSAP: non-ISO */ + 0x03, /* Ctrl: Unnumbered Information Command PDU */ + 0x00, /* OUI: EtherType */ + 0x00, 0x00, + 0x08, 0x00 }; /* Ethertype IP (0800) */ + +static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) *arg; + struct atm_flow_data *excess = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_ATM_MAX]; + struct socket *sock; + int fd,error,hdr_len; + void *hdr; + + DPRINTK("atm_tc_change(sch %p,[qdisc %p],classid %x,parent %x," + "flow %p,opt %p)\n",sch,p,classid,parent,flow,opt); + /* + * The concept of parents doesn't apply for this qdisc. + */ + if (parent && parent != TC_H_ROOT && parent != sch->handle) + return -EINVAL; + /* + * ATM classes cannot be changed. In order to change properties of the + * ATM connection, that socket needs to be modified directly (via the + * native ATM API. In order to send a flow to a different VC, the old + * class needs to be removed and a new one added. (This may be changed + * later.) 
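atm_tc_change() below reads its parameters from a nested attribute table (TCA_ATM_FD, TCA_ATM_HDR, TCA_ATM_EXCESS) and refuses any attribute whose payload has the wrong size, the same defensive pattern used for TCA_POLICE_RESULT earlier in the patch. A self-contained sketch of that type/length/value validation with a made-up attribute layout (no alignment handling, unlike the real rtattr code):

#include <stdint.h>
#include <string.h>

/* A minimal TLV: 16-bit total length, 16-bit type, then the payload. */
struct toy_attr {
	uint16_t len;                    /* header + payload */
	uint16_t type;
};

#define TOY_ATTR_HDR   ((uint16_t)sizeof(struct toy_attr))
#define TOY_PAYLOAD(a) ((a)->len - TOY_ATTR_HDR)
#define TOY_DATA(a)    ((void *)((char *)(a) + TOY_ATTR_HDR))

enum { TOY_A_FD = 1, TOY_A_HDR = 2, TOY_A_MAX = 2 };

/* Fill tb[type-1] for every well-formed attribute, as the rtattr parser does. */
static int toy_parse(struct toy_attr *tb[], int maxtype, void *buf, int buflen)
{
	char *p = buf;

	memset(tb, 0, sizeof(*tb) * maxtype);
	while (buflen >= (int)TOY_ATTR_HDR) {
		struct toy_attr *a = (struct toy_attr *)p;

		if (a->len < TOY_ATTR_HDR || a->len > buflen)
			return -1;               /* malformed table, reject it */
		if (a->type >= 1 && a->type <= maxtype)
			tb[a->type - 1] = a;
		p += a->len;
		buflen -= a->len;
	}
	return 0;
}

/* Callers then check presence and payload size before dereferencing,
 * in the spirit of the size checks in atm_tc_change(): */
static int toy_get_fd(struct toy_attr *tb[], int *fd)
{
	if (tb[TOY_A_FD - 1] == NULL ||
	    TOY_PAYLOAD(tb[TOY_A_FD - 1]) != sizeof(int))
		return -1;
	memcpy(fd, TOY_DATA(tb[TOY_A_FD - 1]), sizeof(int));
	return 0;
}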
+ */ + if (flow) return -EBUSY; + if (opt == NULL || rtattr_parse_nested(tb, TCA_ATM_MAX, opt)) + return -EINVAL; + if (!tb[TCA_ATM_FD-1] || RTA_PAYLOAD(tb[TCA_ATM_FD-1]) < sizeof(fd)) + return -EINVAL; + fd = *(int *) RTA_DATA(tb[TCA_ATM_FD-1]); + DPRINTK("atm_tc_change: fd %d\n",fd); + if (tb[TCA_ATM_HDR-1]) { + hdr_len = RTA_PAYLOAD(tb[TCA_ATM_HDR-1]); + hdr = RTA_DATA(tb[TCA_ATM_HDR-1]); + } + else { + hdr_len = RFC1483LLC_LEN; + hdr = NULL; /* default LLC/SNAP for IP */ + } + if (!tb[TCA_ATM_EXCESS-1]) excess = NULL; + else { + if (RTA_PAYLOAD(tb[TCA_ATM_EXCESS-1]) != sizeof(u32)) + return -EINVAL; + excess = (struct atm_flow_data *) atm_tc_get(sch, + *(u32 *) RTA_DATA(tb[TCA_ATM_EXCESS-1])); + if (!excess) return -ENOENT; + } + DPRINTK("atm_tc_change: type %d, payload %d, hdr_len %d\n", + opt->rta_type,RTA_PAYLOAD(opt),hdr_len); + if (!(sock = sockfd_lookup(fd,&error))) return error; /* f_count++ */ + DPRINTK("atm_tc_change: f_count %d\n",file_count(sock->file)); + if (sock->ops->family != PF_ATMSVC && sock->ops->family != PF_ATMPVC) { + error = -EPROTOTYPE; + goto err_out; + } + /* @@@ should check if the socket is really operational or we'll crash + on vcc->send */ + if (classid) { + if (TC_H_MAJ(classid ^ sch->handle)) { + DPRINTK("atm_tc_change: classid mismatch\n"); + error = -EINVAL; + goto err_out; + } + if (find_flow(p,flow)) { + error = -EEXIST; + goto err_out; + } + } + else { + int i; + unsigned long cl; + + for (i = 1; i < 0x8000; i++) { + classid = TC_H_MAKE(sch->handle,0x8000 | i); + if (!(cl = atm_tc_get(sch,classid))) break; + atm_tc_put(sch,cl); + } + } + DPRINTK("atm_tc_change: new id %x\n",classid); + flow = kmalloc(sizeof(struct atm_flow_data)+hdr_len,GFP_KERNEL); + DPRINTK("atm_tc_change: flow %p\n",flow); + if (!flow) { + error = -ENOBUFS; + goto err_out; + } + memset(flow,0,sizeof(*flow)); + flow->filter_list = NULL; + if (!(flow->q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + flow->q = &noop_qdisc; + DPRINTK("atm_tc_change: qdisc %p\n",flow->q); + flow->sock = sock; + flow->vcc = ATM_SD(sock); /* speedup */ + flow->vcc->user_back = flow; + DPRINTK("atm_tc_change: vcc %p\n",flow->vcc); + flow->old_pop = flow->vcc->pop; + flow->parent = p; + flow->vcc->pop = sch_atm_pop; + flow->classid = classid; + flow->ref = 1; + flow->excess = excess; + flow->next = p->link.next; + p->link.next = flow; + flow->hdr_len = hdr_len; + if (hdr) + memcpy(flow->hdr,hdr,hdr_len); + else + memcpy(flow->hdr,llc_oui_ip,sizeof(llc_oui_ip)); + *arg = (unsigned long) flow; + return 0; +err_out: + if (excess) atm_tc_put(sch,(unsigned long) excess); + sockfd_put(sock); + return error; +} + + +static int atm_tc_delete(struct Qdisc *sch,unsigned long arg) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + DPRINTK("atm_tc_delete(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + if (!find_flow(PRIV(sch),flow)) return -EINVAL; + if (flow->filter_list || flow == &p->link) return -EBUSY; + /* + * Reference count must be 2: one for "keepalive" (set at class + * creation), and one for the reference held when calling delete. 
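The invariant behind the check below: every class carries one long-lived "keepalive" reference taken at creation, and the caller of delete holds a second, transient one, so at delete time a count below 2 means broken accounting and a count above 2 means somebody else (for example an excess pointer) still uses the class. A toy version of that rule:

#include <stdlib.h>

struct toy_class {
	int ref;                         /* like atm_flow_data.ref */
};

static struct toy_class *toy_create(void)
{
	struct toy_class *c = calloc(1, sizeof(*c));

	if (c)
		c->ref = 1;              /* the "keepalive" reference */
	return c;
}

static void toy_get(struct toy_class *c) { c->ref++; }

static void toy_put(struct toy_class *c)
{
	if (--c->ref == 0)
		free(c);                 /* last reference gone: destroy */
}

/* Delete succeeds only when exactly keepalive + caller references remain. */
static int toy_delete(struct toy_class *c)
{
	if (c->ref < 2)
		return -1;               /* broken accounting, like the EINVAL case */
	if (c->ref > 2)
		return -2;               /* still referenced elsewhere: EBUSY */
	toy_put(c);                      /* drop the keepalive; the caller drops its own */
	return 0;
}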
+ */ + if (flow->ref < 2) { + printk(KERN_ERR "atm_tc_delete: flow->ref == %d\n",flow->ref); + return -EINVAL; + } + if (flow->ref > 2) return -EBUSY; /* catch references via excess, etc.*/ + atm_tc_put(sch,arg); + return 0; +} + + +static void atm_tc_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); + if (walker->stop) return; + for (flow = p->flows; flow; flow = flow->next) { + if (walker->count >= walker->skip) + if (walker->fn(sch,(unsigned long) flow,walker) < 0) { + walker->stop = 1; + break; + } + walker->count++; + } +} + + +static struct tcf_proto **atm_tc_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + + DPRINTK("atm_tc_find_tcf(sch %p,[qdisc %p],flow %p)\n",sch,p,flow); + return flow ? &flow->filter_list : &p->link.filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int atm_tc_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = NULL ; /* @@@ */ + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + D2PRINTK("atm_tc_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + result = TC_POLICE_OK; /* be nice to gcc */ + if (TC_H_MAJ(skb->priority) != sch->handle || + !(flow = (struct atm_flow_data *) atm_tc_get(sch,skb->priority))) + for (flow = p->flows; flow; flow = flow->next) + if (flow->filter_list) { + result = tc_classify(skb,flow->filter_list, + &res); + if (result < 0) continue; + flow = (struct atm_flow_data *) res.class; + if (!flow) flow = lookup_flow(sch,res.classid); + break; + } + if (!flow) flow = &p->link; + else { + if (flow->vcc) + ATM_SKB(skb)->atm_options = flow->vcc->atm_options; + /*@@@ looks good ... but it's not supposed to work :-)*/ +#ifdef CONFIG_NET_CLS_POLICE + switch (result) { + case TC_POLICE_SHOT: + kfree_skb(skb); + break; + case TC_POLICE_RECLASSIFY: + if (flow->excess) flow = flow->excess; + else { + ATM_SKB(skb)->atm_options |= + ATM_ATMOPT_CLP; + break; + } + /* fall through */ + case TC_POLICE_OK: + /* fall through */ + default: + break; + } +#endif + } + if ( +#ifdef CONFIG_NET_CLS_POLICE + result == TC_POLICE_SHOT || +#endif + (ret = flow->q->enqueue(skb,flow->q)) != 0) { + sch->qstats.drops++; + if (flow) flow->qstats.drops++; + return ret; + } + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + flow->bstats.bytes += skb->len; + flow->bstats.packets++; + /* + * Okay, this may seem weird. We pretend we've dropped the packet if + * it goes via ATM. The reason for this is that the outer qdisc + * expects to be able to q->dequeue the packet later on if we return + * success at this place. Also, sch->q.qdisc needs to reflect whether + * there is a packet egligible for dequeuing or not. Note that the + * statistics of the outer qdisc are necessarily wrong because of all + * this. There's currently no correct solution for this. + */ + if (flow == &p->link) { + sch->q.qlen++; + return 0; + } + tasklet_schedule(&p->task); + return NET_XMIT_BYPASS; +} + + +/* + * Dequeue packets and send them over ATM. Note that we quite deliberately + * avoid checking net_device's flow control here, simply because sch_atm + * uses its own channels, which have nothing to do with any CLIP/LANE/or + * non-ATM interfaces. 
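sch_atm_dequeue() below must prepend flow->hdr (the encapsulation chosen at class setup) in front of each datagram, reallocating the buffer when the existing headroom is too small. The same pattern with a plain byte buffer instead of an sk_buff, all names invented:

#include <stdlib.h>
#include <string.h>

struct toy_buf {
	unsigned char *head;             /* start of the allocation */
	unsigned char *data;             /* start of the payload */
	size_t len;                      /* payload length */
};

static size_t toy_headroom(const struct toy_buf *b)
{
	return (size_t)(b->data - b->head);
}

/* Make sure at least "need" bytes can be pushed in front of the payload. */
static int toy_reserve_headroom(struct toy_buf *b, size_t need)
{
	unsigned char *n;

	if (toy_headroom(b) >= need)
		return 0;
	n = malloc(need + b->len);       /* plays the role of skb_realloc_headroom() */
	if (!n)
		return -1;
	memcpy(n + need, b->data, b->len);
	free(b->head);
	b->head = n;
	b->data = n + need;
	return 0;
}

/* Push the header, as skb_push() + memcpy() do in sch_atm_dequeue(). */
static int toy_push_header(struct toy_buf *b, const void *hdr, size_t hdr_len)
{
	if (toy_reserve_headroom(b, hdr_len) < 0)
		return -1;
	b->data -= hdr_len;
	b->len  += hdr_len;
	memcpy(b->data, hdr, hdr_len);
	return 0;
}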
+ */ + + +static void sch_atm_dequeue(unsigned long data) +{ + struct Qdisc *sch = (struct Qdisc *) data; + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + struct sk_buff *skb; + + D2PRINTK("sch_atm_dequeue(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->link.next; flow; flow = flow->next) + /* + * If traffic is properly shaped, this won't generate nasty + * little bursts. Otherwise, it may ... (but that's okay) + */ + while ((skb = flow->q->dequeue(flow->q))) { + if (!atm_may_send(flow->vcc,skb->truesize)) { + (void) flow->q->ops->requeue(skb,flow->q); + break; + } + D2PRINTK("atm_tc_dequeue: sending on class %p\n",flow); + /* remove any LL header somebody else has attached */ + skb_pull(skb,(char *) skb->nh.iph-(char *) skb->data); + if (skb_headroom(skb) < flow->hdr_len) { + struct sk_buff *new; + + new = skb_realloc_headroom(skb,flow->hdr_len); + dev_kfree_skb(skb); + if (!new) continue; + skb = new; + } + D2PRINTK("sch_atm_dequeue: ip %p, data %p\n", + skb->nh.iph,skb->data); + ATM_SKB(skb)->vcc = flow->vcc; + memcpy(skb_push(skb,flow->hdr_len),flow->hdr, + flow->hdr_len); + atomic_add(skb->truesize, + &sk_atm(flow->vcc)->sk_wmem_alloc); + /* atm.atm_options are already set by atm_tc_enqueue */ + (void) flow->vcc->send(flow->vcc,skb); + } +} + + +static struct sk_buff *atm_tc_dequeue(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct sk_buff *skb; + + D2PRINTK("atm_tc_dequeue(sch %p,[qdisc %p])\n",sch,p); + tasklet_schedule(&p->task); + skb = p->link.q->dequeue(p->link.q); + if (skb) sch->q.qlen--; + return skb; +} + + +static int atm_tc_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + int ret; + + D2PRINTK("atm_tc_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + ret = p->link.q->ops->requeue(skb,p->link.q); + if (!ret) { + sch->q.qlen++; + sch->qstats.requeues++; + } else { + sch->qstats.drops++; + p->link.qstats.drops++; + } + return ret; +} + + +static unsigned int atm_tc_drop(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + unsigned int len; + + DPRINTK("atm_tc_drop(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->flows; flow; flow = flow->next) + if (flow->q->ops->drop && (len = flow->q->ops->drop(flow->q))) + return len; + return 0; +} + + +static int atm_tc_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct atm_qdisc_data *p = PRIV(sch); + + DPRINTK("atm_tc_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + p->flows = &p->link; + if(!(p->link.q = qdisc_create_dflt(sch->dev,&pfifo_qdisc_ops))) + p->link.q = &noop_qdisc; + DPRINTK("atm_tc_init: link (%p) qdisc %p\n",&p->link,p->link.q); + p->link.filter_list = NULL; + p->link.vcc = NULL; + p->link.sock = NULL; + p->link.classid = sch->handle; + p->link.ref = 1; + p->link.next = NULL; + tasklet_init(&p->task,sch_atm_dequeue,(unsigned long) sch); + return 0; +} + + +static void atm_tc_reset(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_reset(sch %p,[qdisc %p])\n",sch,p); + for (flow = p->flows; flow; flow = flow->next) qdisc_reset(flow->q); + sch->q.qlen = 0; +} + + +static void atm_tc_destroy(struct Qdisc *sch) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow; + + DPRINTK("atm_tc_destroy(sch %p,[qdisc %p])\n",sch,p); + /* races ? 
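One ordering issue the destroy path does have to get right: sch_atm_pop() and the enqueue path may still schedule the dequeue tasklet while classes are being torn down, which is why atm_tc_destroy() ends with tasklet_kill(). The lifecycle, reduced to its shape (the callback and surrounding function names are placeholders):

#include <linux/interrupt.h>

static struct tasklet_struct toy_task;

static void toy_task_fn(unsigned long data)
{
	/* drain queues to the VCs, as sch_atm_dequeue() does */
}

static void toy_setup(void)
{
	tasklet_init(&toy_task, toy_task_fn, 0);   /* as atm_tc_init() does */
}

static void toy_producer(void)
{
	tasklet_schedule(&toy_task);               /* as sch_atm_pop()/enqueue do */
}

static void toy_teardown(void)
{
	/* Waits until any scheduled or running instance has finished, so by the
	 * time this returns the callback can no longer touch freed state. */
	tasklet_kill(&toy_task);
}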
*/ + while ((flow = p->flows)) { + destroy_filters(flow); + if (flow->ref > 1) + printk(KERN_ERR "atm_destroy: %p->ref = %d\n",flow, + flow->ref); + atm_tc_put(sch,(unsigned long) flow); + if (p->flows == flow) { + printk(KERN_ERR "atm_destroy: putting flow %p didn't " + "kill it\n",flow); + p->flows = flow->next; /* brute force */ + break; + } + } + tasklet_kill(&p->task); +} + + +static int atm_tc_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct atm_qdisc_data *p = PRIV(sch); + struct atm_flow_data *flow = (struct atm_flow_data *) cl; + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("atm_tc_dump_class(sch %p,[qdisc %p],flow %p,skb %p,tcm %p)\n", + sch,p,flow,skb,tcm); + if (!find_flow(p,flow)) return -EINVAL; + tcm->tcm_handle = flow->classid; + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_ATM_HDR,flow->hdr_len,flow->hdr); + if (flow->vcc) { + struct sockaddr_atmpvc pvc; + int state; + + pvc.sap_family = AF_ATMPVC; + pvc.sap_addr.itf = flow->vcc->dev ? flow->vcc->dev->number : -1; + pvc.sap_addr.vpi = flow->vcc->vpi; + pvc.sap_addr.vci = flow->vcc->vci; + RTA_PUT(skb,TCA_ATM_ADDR,sizeof(pvc),&pvc); + state = ATM_VF2VS(flow->vcc->flags); + RTA_PUT(skb,TCA_ATM_STATE,sizeof(state),&state); + } + if (flow->excess) + RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(u32),&flow->classid); + else { + static u32 zero; + + RTA_PUT(skb,TCA_ATM_EXCESS,sizeof(zero),&zero); + } + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} +static int +atm_tc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct atm_flow_data *flow = (struct atm_flow_data *) arg; + + flow->qstats.qlen = flow->q->q.qlen; + + if (gnet_stats_copy_basic(d, &flow->bstats) < 0 || + gnet_stats_copy_queue(d, &flow->qstats) < 0) + return -1; + + return 0; +} + +static int atm_tc_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + return 0; +} + +static struct Qdisc_class_ops atm_class_ops = { + .graft = atm_tc_graft, + .leaf = atm_tc_leaf, + .get = atm_tc_get, + .put = atm_tc_put, + .change = atm_tc_change, + .delete = atm_tc_delete, + .walk = atm_tc_walk, + .tcf_chain = atm_tc_find_tcf, + .bind_tcf = atm_tc_bind_filter, + .unbind_tcf = atm_tc_put, + .dump = atm_tc_dump_class, + .dump_stats = atm_tc_dump_class_stats, +}; + +static struct Qdisc_ops atm_qdisc_ops = { + .next = NULL, + .cl_ops = &atm_class_ops, + .id = "atm", + .priv_size = sizeof(struct atm_qdisc_data), + .enqueue = atm_tc_enqueue, + .dequeue = atm_tc_dequeue, + .requeue = atm_tc_requeue, + .drop = atm_tc_drop, + .init = atm_tc_init, + .reset = atm_tc_reset, + .destroy = atm_tc_destroy, + .change = NULL, + .dump = atm_tc_dump, + .owner = THIS_MODULE, +}; + + +static int __init atm_init(void) +{ + return register_qdisc(&atm_qdisc_ops); +} + +static void __exit atm_exit(void) +{ + unregister_qdisc(&atm_qdisc_ops); +} + +module_init(atm_init) +module_exit(atm_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c new file mode 100644 index 000000000000..d43e3b8cbf6a --- /dev/null +++ b/net/sched/sch_cbq.c @@ -0,0 +1,2124 @@ +/* + * net/sched/sch_cbq.c Class-Based Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Class-Based Queueing (CBQ) algorithm. + ======================================= + + Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource + Management Models for Packet Networks", + IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995 + + [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995 + + [3] Sally Floyd, "Notes on Class-Based Queueing: Setting + Parameters", 1996 + + [4] Sally Floyd and Michael Speer, "Experimental Results + for Class-Based Queueing", 1998, not published. + + ----------------------------------------------------------------------- + + Algorithm skeleton was taken from NS simulator cbq.cc. + If someone wants to check this code against the LBL version, + he should take into account that ONLY the skeleton was borrowed, + the implementation is different. Particularly: + + --- The WRR algorithm is different. Our version looks more + reasonable (I hope) and works when quanta are allowed to be + less than MTU, which is always the case when real time classes + have small rates. Note, that the statement of [3] is + incomplete, delay may actually be estimated even if class + per-round allotment is less than MTU. Namely, if per-round + allotment is W*r_i, and r_1+...+r_k = r < 1 + + delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B + + In the worst case we have IntServ estimate with D = W*r+k*MTU + and C = MTU*r. The proof (if correct at all) is trivial. + + + --- It seems that cbq-2.0 is not very accurate. At least, I cannot + interpret some places, which look like wrong translations + from NS. Anyone is advised to find these differences + and explain to me, why I am wrong 8). + + --- Linux has no EOI event, so that we cannot estimate true class + idle time. Workaround is to consider the next dequeue event + as sign that previous packet is finished. This is wrong because of + internal device queueing, but on a permanently loaded link it is true. + Moreover, combined with clock integrator, this scheme looks + very close to an ideal solution. */ + +struct cbq_sched_data; + + +struct cbq_class +{ + struct cbq_class *next; /* hash table link */ + struct cbq_class *next_alive; /* next class with backlog in this priority band */ + +/* Parameters */ + u32 classid; + unsigned char priority; /* class priority */ + unsigned char priority2; /* priority to be used after overlimit */ + unsigned char ewma_log; /* time constant for idle time calculation */ + unsigned char ovl_strategy; +#ifdef CONFIG_NET_CLS_POLICE + unsigned char police; +#endif + + u32 defmap; + + /* Link-sharing scheduler parameters */ + long maxidle; /* Class parameters: see below. 
*/ + long offtime; + long minidle; + u32 avpkt; + struct qdisc_rate_table *R_tab; + + /* Overlimit strategy parameters */ + void (*overlimit)(struct cbq_class *cl); + long penalty; + + /* General scheduler (WRR) parameters */ + long allot; + long quantum; /* Allotment per WRR round */ + long weight; /* Relative allotment: see below */ + + struct Qdisc *qdisc; /* Ptr to CBQ discipline */ + struct cbq_class *split; /* Ptr to split node */ + struct cbq_class *share; /* Ptr to LS parent in the class tree */ + struct cbq_class *tparent; /* Ptr to tree parent in the class tree */ + struct cbq_class *borrow; /* NULL if class is bandwidth limited; + parent otherwise */ + struct cbq_class *sibling; /* Sibling chain */ + struct cbq_class *children; /* Pointer to children chain */ + + struct Qdisc *q; /* Elementary queueing discipline */ + + +/* Variables */ + unsigned char cpriority; /* Effective priority */ + unsigned char delayed; + unsigned char level; /* level of the class in hierarchy: + 0 for leaf classes, and maximal + level of children + 1 for nodes. + */ + + psched_time_t last; /* Last end of service */ + psched_time_t undertime; + long avgidle; + long deficit; /* Saved deficit for WRR */ + unsigned long penalized; + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + spinlock_t *stats_lock; + struct tc_cbq_xstats xstats; + + struct tcf_proto *filter_list; + + int refcnt; + int filters; + + struct cbq_class *defaults[TC_PRIO_MAX+1]; +}; + +struct cbq_sched_data +{ + struct cbq_class *classes[16]; /* Hash table of all classes */ + int nclasses[TC_CBQ_MAXPRIO+1]; + unsigned quanta[TC_CBQ_MAXPRIO+1]; + + struct cbq_class link; + + unsigned activemask; + struct cbq_class *active[TC_CBQ_MAXPRIO+1]; /* List of all classes + with backlog */ + +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_class *rx_class; +#endif + struct cbq_class *tx_class; + struct cbq_class *tx_borrowed; + int tx_len; + psched_time_t now; /* Cached timestamp */ + psched_time_t now_rt; /* Cached real time */ + unsigned pmask; + + struct timer_list delay_timer; + struct timer_list wd_timer; /* Watchdog timer, + started when CBQ has + backlog, but cannot + transmit just now */ + long wd_expires; + int toplevel; + u32 hgenerator; +}; + + +#define L2T(cl,len) ((cl)->R_tab->data[(len)>>(cl)->R_tab->rate.cell_log]) + + +static __inline__ unsigned cbq_hash(u32 h) +{ + h ^= h>>8; + h ^= h>>4; + return h&0xF; +} + +static __inline__ struct cbq_class * +cbq_class_lookup(struct cbq_sched_data *q, u32 classid) +{ + struct cbq_class *cl; + + for (cl = q->classes[cbq_hash(classid)]; cl; cl = cl->next) + if (cl->classid == classid) + return cl; + return NULL; +} + +#ifdef CONFIG_NET_CLS_POLICE + +static struct cbq_class * +cbq_reclassify(struct sk_buff *skb, struct cbq_class *this) +{ + struct cbq_class *cl, *new; + + for (cl = this->tparent; cl; cl = cl->tparent) + if ((new = cl->defaults[TC_PRIO_BESTEFFORT]) != NULL && new != this) + return new; + + return NULL; +} + +#endif + +/* Classify packet. The procedure is pretty complicated, but + it allows us to combine link sharing and priority scheduling + transparently. + + Namely, you can put link sharing rules (f.e. route based) at root of CBQ, + so that it resolves to split nodes. Then packets are classified + by logical priority, or a more specific classifier may be attached + to the split node. 
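cbq_classify() below resolves a packet in stages: an exact class returned by a filter first, then the split node's defaults[] table indexed by the logical priority, then best effort, and finally the split node itself. A compact sketch of that fallback order with minimal toy structures (not the real ones):

#include <stddef.h>
#include <stdint.h>

#define TOY_PRIO_MAX        15          /* like TC_PRIO_MAX */
#define TOY_PRIO_BESTEFFORT  0          /* like TC_PRIO_BESTEFFORT */

struct toy_class {
	int level;                       /* 0 = leaf */
	struct toy_class *defaults[TOY_PRIO_MAX + 1];
};

/* Resolve a classifier verdict (an exact class, or just a minor number)
 * against a split node. */
static struct toy_class *toy_resolve(struct toy_class *head,
				     struct toy_class *from_filter,
				     uint32_t minor_prio)
{
	struct toy_class *cl = from_filter;

	if (cl == NULL) {
		/* The filter only returned a minor number: treat it as a
		 * logical priority and consult the split node's default map. */
		cl = head->defaults[minor_prio & TOY_PRIO_MAX];
		if (cl == NULL)
			cl = head->defaults[TOY_PRIO_BESTEFFORT];
	}
	/* A result at or above the split node's own level makes no sense:
	 * fall back to the split node, roughly as cbq_classify()'s fallback does. */
	if (cl == NULL || cl->level >= head->level)
		cl = head;
	return cl;
}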
+ */ + +static struct cbq_class * +cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *head = &q->link; + struct cbq_class **defmap; + struct cbq_class *cl = NULL; + u32 prio = skb->priority; + struct tcf_result res; + + /* + * Step 1. If skb->priority points to one of our classes, use it. + */ + if (TC_H_MAJ(prio^sch->handle) == 0 && + (cl = cbq_class_lookup(q, prio)) != NULL) + return cl; + + *qerr = NET_XMIT_DROP; + for (;;) { + int result = 0; + defmap = head->defaults; + + /* + * Step 2+n. Apply classifier. + */ + if (!head->filter_list || (result = tc_classify(skb, head->filter_list, &res)) < 0) + goto fallback; + + if ((cl = (void*)res.class) == NULL) { + if (TC_H_MAJ(res.classid)) + cl = cbq_class_lookup(q, res.classid); + else if ((cl = defmap[res.classid&TC_PRIO_MAX]) == NULL) + cl = defmap[TC_PRIO_BESTEFFORT]; + + if (cl == NULL || cl->level >= head->level) + goto fallback; + } + +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + switch (result) { + case TC_POLICE_RECLASSIFY: + return cbq_reclassify(skb, cl); + case TC_POLICE_SHOT: + return NULL; + default: + break; + } +#endif + if (cl->level == 0) + return cl; + + /* + * Step 3+n. If classifier selected a link sharing class, + * apply agency specific classifier. + * Repeat this procdure until we hit a leaf node. + */ + head = cl; + } + +fallback: + cl = head; + + /* + * Step 4. No success... + */ + if (TC_H_MAJ(prio) == 0 && + !(cl = head->defaults[prio&TC_PRIO_MAX]) && + !(cl = head->defaults[TC_PRIO_BESTEFFORT])) + return head; + + return cl; +} + +/* + A packet has just been enqueued on the empty class. + cbq_activate_class adds it to the tail of active class list + of its priority band. 
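+
+   The active list is circular and linked through cl->next_alive;
+   q->active[prio] always points at the current tail, so the new class
+   is spliced in right behind the old tail and then becomes the tail
+   itself.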
+ */ + +static __inline__ void cbq_activate_class(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + int prio = cl->cpriority; + struct cbq_class *cl_tail; + + cl_tail = q->active[prio]; + q->active[prio] = cl; + + if (cl_tail != NULL) { + cl->next_alive = cl_tail->next_alive; + cl_tail->next_alive = cl; + } else { + cl->next_alive = cl; + q->activemask |= (1<qdisc); + int prio = this->cpriority; + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + + do { + cl = cl_prev->next_alive; + if (cl == this) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + q->activemask &= ~(1<next_alive; + return; + } + } while ((cl_prev = cl) != q->active[prio]); +} + +static void +cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl) +{ + int toplevel = q->toplevel; + + if (toplevel > cl->level && !(cl->q->flags&TCQ_F_THROTTLED)) { + psched_time_t now; + psched_tdiff_t incr; + + PSCHED_GET_TIME(now); + incr = PSCHED_TDIFF(now, q->now_rt); + PSCHED_TADD2(q->now, incr, now); + + do { + if (PSCHED_TLESS(cl->undertime, now)) { + q->toplevel = cl->level; + return; + } + } while ((cl=cl->borrow) != NULL && toplevel > cl->level); + } +} + +static int +cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + int len = skb->len; + int ret; + struct cbq_class *cl = cbq_classify(skb, sch, &ret); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; +#endif + if (cl == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } + +#ifdef CONFIG_NET_CLS_POLICE + cl->q->__parent = sch; +#endif + if ((ret = cl->q->enqueue(skb, cl->q)) == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->bstats.packets++; + sch->bstats.bytes+=len; + cbq_mark_toplevel(q, cl); + if (!cl->next_alive) + cbq_activate_class(cl); + return ret; + } + + sch->qstats.drops++; + cbq_mark_toplevel(q, cl); + cl->qstats.drops++; + return ret; +} + +static int +cbq_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + int ret; + + if ((cl = q->tx_class) == NULL) { + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_CN; + } + q->tx_class = NULL; + + cbq_mark_toplevel(q, cl); + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = cl; + cl->q->__parent = sch; +#endif + if ((ret = cl->q->ops->requeue(skb, cl->q)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->qstats.drops++; + cl->qstats.drops++; + return ret; +} + +/* Overlimit actions */ + +/* TC_CBQ_OVL_CLASSIC: (default) penalize leaf class by adding offtime */ + +static void cbq_ovl_classic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + delay += cl->offtime; + + /* + Class goes to sleep, so that it will have no + chance to work avgidle. Let's forgive it 8) + + BTW cbq-2.0 has a crap in this + place, apparently they forgot to shift it by cl->ewma_log. 
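+
+		   With W = 2^(-ewma_log), and cl->avgidle kept scaled by 1/W,
+		   the code below computes
+
+			delay = (undertime - now) + offtime - (1-W)*(-avgidle)
+
+		   and puts the class to sleep until q->now + delay.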
+ */ + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + if (delay <= 0) + delay = 1; + PSCHED_TADD2(q->now, delay, cl->undertime); + + cl->xstats.overactions++; + cl->delayed = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; + + /* Dirty work! We must schedule wakeups based on + real available rate, rather than leaf rate, + which may be tiny (even zero). + */ + if (q->toplevel == TC_CBQ_MAXLEVEL) { + struct cbq_class *b; + psched_tdiff_t base_delay = q->wd_expires; + + for (b = cl->borrow; b; b = b->borrow) { + delay = PSCHED_TDIFF(b->undertime, q->now); + if (delay < base_delay) { + if (delay <= 0) + delay = 1; + base_delay = delay; + } + } + + q->wd_expires = base_delay; + } +} + +/* TC_CBQ_OVL_RCLASSIC: penalize by offtime classes in hierarchy, when + they go overlimit + */ + +static void cbq_ovl_rclassic(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this = cl; + + do { + if (cl->level > q->toplevel) { + cl = NULL; + break; + } + } while ((cl = cl->borrow) != NULL); + + if (cl == NULL) + cl = this; + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DELAY: delay until it will go to underlimit */ + +static void cbq_ovl_delay(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + psched_tdiff_t delay = PSCHED_TDIFF(cl->undertime, q->now); + + if (!cl->delayed) { + unsigned long sched = jiffies; + + delay += cl->offtime; + if (cl->avgidle < 0) + delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log); + if (cl->avgidle < cl->minidle) + cl->avgidle = cl->minidle; + PSCHED_TADD2(q->now, delay, cl->undertime); + + if (delay > 0) { + sched += PSCHED_US2JIFFIE(delay) + cl->penalty; + cl->penalized = sched; + cl->cpriority = TC_CBQ_MAXPRIO; + q->pmask |= (1<delay_timer) && + (long)(q->delay_timer.expires - sched) > 0) + q->delay_timer.expires = sched; + add_timer(&q->delay_timer); + cl->delayed = 1; + cl->xstats.overactions++; + return; + } + delay = 1; + } + if (q->wd_expires == 0 || q->wd_expires > delay) + q->wd_expires = delay; +} + +/* TC_CBQ_OVL_LOWPRIO: penalize class by lowering its priority band */ + +static void cbq_ovl_lowprio(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + cl->penalized = jiffies + cl->penalty; + + if (cl->cpriority != cl->priority2) { + cl->cpriority = cl->priority2; + q->pmask |= (1<cpriority); + cl->xstats.overactions++; + } + cbq_ovl_classic(cl); +} + +/* TC_CBQ_OVL_DROP: penalize class by dropping */ + +static void cbq_ovl_drop(struct cbq_class *cl) +{ + if (cl->q->ops->drop) + if (cl->q->ops->drop(cl->q)) + cl->qdisc->q.qlen--; + cl->xstats.overactions++; + cbq_ovl_classic(cl); +} + +static void cbq_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static unsigned long cbq_undelay_prio(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + struct cbq_class *cl_prev = q->active[prio]; + unsigned long now = jiffies; + unsigned long sched = now; + + if (cl_prev == NULL) + return now; + + do { + cl = cl_prev->next_alive; + if ((long)(now - cl->penalized) > 0) { + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + cl->cpriority = cl->priority; + cl->delayed = 0; + cbq_activate_class(cl); + + if (cl == q->active[prio]) { + q->active[prio] = cl_prev; + if (cl == q->active[prio]) { + q->active[prio] = NULL; + return 0; + } 
+ } + + cl = cl_prev->next_alive; + } else if ((long)(sched - cl->penalized) > 0) + sched = cl->penalized; + } while ((cl_prev = cl) != q->active[prio]); + + return (long)(sched - now); +} + +static void cbq_undelay(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct cbq_sched_data *q = qdisc_priv(sch); + long delay = 0; + unsigned pmask; + + pmask = q->pmask; + q->pmask = 0; + + while (pmask) { + int prio = ffz(~pmask); + long tmp; + + pmask &= ~(1< 0) { + q->pmask |= 1<delay_timer.expires = jiffies + delay; + add_timer(&q->delay_timer); + } + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + + +#ifdef CONFIG_NET_CLS_POLICE + +static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child) +{ + int len = skb->len; + struct Qdisc *sch = child->__parent; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = q->rx_class; + + q->rx_class = NULL; + + if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) { + + cbq_mark_toplevel(q, cl); + + q->rx_class = cl; + cl->q->__parent = sch; + + if (cl->q->enqueue(skb, cl->q) == 0) { + sch->q.qlen++; + sch->bstats.packets++; + sch->bstats.bytes+=len; + if (!cl->next_alive) + cbq_activate_class(cl); + return 0; + } + sch->qstats.drops++; + return 0; + } + + sch->qstats.drops++; + return -1; +} +#endif + +/* + It is mission critical procedure. + + We "regenerate" toplevel cutoff, if transmitting class + has backlog and it is not regulated. It is not part of + original CBQ description, but looks more reasonable. + Probably, it is wrong. This question needs further investigation. +*/ + +static __inline__ void +cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl, + struct cbq_class *borrowed) +{ + if (cl && q->toplevel >= borrowed->level) { + if (cl->q->q.qlen > 1) { + do { + if (PSCHED_IS_PASTPERFECT(borrowed->undertime)) { + q->toplevel = borrowed->level; + return; + } + } while ((borrowed=borrowed->borrow) != NULL); + } +#if 0 + /* It is not necessary now. Uncommenting it + will save CPU cycles, but decrease fairness. + */ + q->toplevel = TC_CBQ_MAXLEVEL; +#endif + } +} + +static void +cbq_update(struct cbq_sched_data *q) +{ + struct cbq_class *this = q->tx_class; + struct cbq_class *cl = this; + int len = q->tx_len; + + q->tx_class = NULL; + + for ( ; cl; cl = cl->share) { + long avgidle = cl->avgidle; + long idle; + + cl->bstats.packets++; + cl->bstats.bytes += len; + + /* + (now - last) is total time between packet right edges. + (last_pktlen/rate) is "virtual" busy time, so that + + idle = (now - last) - last_pktlen/rate + */ + + idle = PSCHED_TDIFF(q->now, cl->last); + if ((unsigned long)idle > 128*1024*1024) { + avgidle = cl->maxidle; + } else { + idle -= L2T(cl, len); + + /* true_avgidle := (1-W)*true_avgidle + W*idle, + where W=2^{-ewma_log}. But cl->avgidle is scaled: + cl->avgidle == true_avgidle/W, + hence: + */ + avgidle += idle - (avgidle>>cl->ewma_log); + } + + if (avgidle <= 0) { + /* Overlimit or at-limit */ + + if (avgidle < cl->minidle) + avgidle = cl->minidle; + + cl->avgidle = avgidle; + + /* Calculate expected time, when this class + will be allowed to send. + It will occur, when: + (1-W)*true_avgidle + W*delay = 0, i.e. + idle = (1/W - 1)*(-true_avgidle) + or + idle = (1 - W)*(-cl->avgidle); + */ + idle = (-avgidle) - ((-avgidle) >> cl->ewma_log); + + /* + That is not all. + To maintain the rate allocated to the class, + we add to undertime virtual clock, + necessary to complete transmitted packet. 
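+
+			   Altogether the class stays overlimit until
+
+				undertime = now + (1-W)*(-avgidle)
+					    + len/r_class - len/B
+
+			   where len/r_class is L2T(cl, len) and len/B is
+			   L2T(&q->link, len).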
+ (len/phys_bandwidth has been already passed + to the moment of cbq_update) + */ + + idle -= L2T(&q->link, len); + idle += L2T(cl, len); + + PSCHED_AUDIT_TDIFF(idle); + + PSCHED_TADD2(q->now, idle, cl->undertime); + } else { + /* Underlimit */ + + PSCHED_SET_PASTPERFECT(cl->undertime); + if (avgidle > cl->maxidle) + cl->avgidle = cl->maxidle; + else + cl->avgidle = avgidle; + } + cl->last = q->now; + } + + cbq_update_toplevel(q, this, q->tx_borrowed); +} + +static __inline__ struct cbq_class * +cbq_under_limit(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *this_cl = cl; + + if (cl->tparent == NULL) + return cl; + + if (PSCHED_IS_PASTPERFECT(cl->undertime) || + !PSCHED_TLESS(q->now, cl->undertime)) { + cl->delayed = 0; + return cl; + } + + do { + /* It is very suspicious place. Now overlimit + action is generated for not bounded classes + only if link is completely congested. + Though it is in agree with ancestor-only paradigm, + it looks very stupid. Particularly, + it means that this chunk of code will either + never be called or result in strong amplification + of burstiness. Dangerous, silly, and, however, + no another solution exists. + */ + if ((cl = cl->borrow) == NULL) { + this_cl->qstats.overlimits++; + this_cl->overlimit(this_cl); + return NULL; + } + if (cl->level > q->toplevel) + return NULL; + } while (!PSCHED_IS_PASTPERFECT(cl->undertime) && + PSCHED_TLESS(q->now, cl->undertime)); + + cl->delayed = 0; + return cl; +} + +static __inline__ struct sk_buff * +cbq_dequeue_prio(struct Qdisc *sch, int prio) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl_tail, *cl_prev, *cl; + struct sk_buff *skb; + int deficit; + + cl_tail = cl_prev = q->active[prio]; + cl = cl_prev->next_alive; + + do { + deficit = 0; + + /* Start round */ + do { + struct cbq_class *borrow = cl; + + if (cl->q->q.qlen && + (borrow = cbq_under_limit(cl)) == NULL) + goto skip_class; + + if (cl->deficit <= 0) { + /* Class exhausted its allotment per + this round. Switch to the next one. + */ + deficit = 1; + cl->deficit += cl->quantum; + goto next_class; + } + + skb = cl->q->dequeue(cl->q); + + /* Class did not give us any skb :-( + It could occur even if cl->q->q.qlen != 0 + f.e. if cl->q == "tbf" + */ + if (skb == NULL) + goto skip_class; + + cl->deficit -= skb->len; + q->tx_class = cl; + q->tx_borrowed = borrow; + if (borrow != cl) { +#ifndef CBQ_XSTATS_BORROWS_BYTES + borrow->xstats.borrows++; + cl->xstats.borrows++; +#else + borrow->xstats.borrows += skb->len; + cl->xstats.borrows += skb->len; +#endif + } + q->tx_len = skb->len; + + if (cl->deficit <= 0) { + q->active[prio] = cl; + cl = cl->next_alive; + cl->deficit += cl->quantum; + } + return skb; + +skip_class: + if (cl->q->q.qlen == 0 || prio != cl->cpriority) { + /* Class is empty or penalized. + Unlink it from active chain. + */ + cl_prev->next_alive = cl->next_alive; + cl->next_alive = NULL; + + /* Did cl_tail point to it? */ + if (cl == cl_tail) { + /* Repair it! */ + cl_tail = cl_prev; + + /* Was it the last class in this band? */ + if (cl == cl_tail) { + /* Kill the band! 
*/ + q->active[prio] = NULL; + q->activemask &= ~(1<q->q.qlen) + cbq_activate_class(cl); + return NULL; + } + + q->active[prio] = cl_tail; + } + if (cl->q->q.qlen) + cbq_activate_class(cl); + + cl = cl_prev; + } + +next_class: + cl_prev = cl; + cl = cl->next_alive; + } while (cl_prev != cl_tail); + } while (deficit); + + q->active[prio] = cl_prev; + + return NULL; +} + +static __inline__ struct sk_buff * +cbq_dequeue_1(struct Qdisc *sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + unsigned activemask; + + activemask = q->activemask&0xFF; + while (activemask) { + int prio = ffz(~activemask); + activemask &= ~(1<now_rt); + + if (q->tx_class) { + psched_tdiff_t incr2; + /* Time integrator. We calculate EOS time + by adding expected packet transmission time. + If real time is greater, we warp artificial clock, + so that: + + cbq_time = max(real_time, work); + */ + incr2 = L2T(&q->link, q->tx_len); + PSCHED_TADD(q->now, incr2); + cbq_update(q); + if ((incr -= incr2) < 0) + incr = 0; + } + PSCHED_TADD(q->now, incr); + q->now_rt = now; + + for (;;) { + q->wd_expires = 0; + + skb = cbq_dequeue_1(sch); + if (skb) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + /* All the classes are overlimit. + + It is possible, if: + + 1. Scheduler is empty. + 2. Toplevel cutoff inhibited borrowing. + 3. Root class is overlimit. + + Reset 2d and 3d conditions and retry. + + Note, that NS and cbq-2.0 are buggy, peeking + an arbitrary class is appropriate for ancestor-only + sharing, but not for toplevel algorithm. + + Our version is better, but slower, because it requires + two passes, but it is unavoidable with top-level sharing. + */ + + if (q->toplevel == TC_CBQ_MAXLEVEL && + PSCHED_IS_PASTPERFECT(q->link.undertime)) + break; + + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_SET_PASTPERFECT(q->link.undertime); + } + + /* No packets in scheduler or nobody wants to give them to us :-( + Sigh... start watchdog timer in the last case. */ + + if (sch->q.qlen) { + sch->qstats.overlimits++; + if (q->wd_expires) { + long delay = PSCHED_US2JIFFIE(q->wd_expires); + if (delay <= 0) + delay = 1; + mod_timer(&q->wd_timer, jiffies + delay); + sch->flags |= TCQ_F_THROTTLED; + } + } + return NULL; +} + +/* CBQ class maintanance routines */ + +static void cbq_adjust_levels(struct cbq_class *this) +{ + if (this == NULL) + return; + + do { + int level = 0; + struct cbq_class *cl; + + if ((cl = this->children) != NULL) { + do { + if (cl->level > level) + level = cl->level; + } while ((cl = cl->sibling) != this->children); + } + this->level = level+1; + } while ((this = this->tparent) != NULL); +} + +static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio) +{ + struct cbq_class *cl; + unsigned h; + + if (q->quanta[prio] == 0) + return; + + for (h=0; h<16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + /* BUGGGG... Beware! This expression suffer of + arithmetic overflows! 
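+
+			   The intent is
+
+				quantum = allot*weight*nclasses[prio]/quanta[prio],
+
+			   i.e. the allotment scaled by the class weight relative
+			   to the mean weight of its priority band; the product
+			   weight*allot*nclasses may not fit in a long.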
+ */ + if (cl->priority == prio) { + cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/ + q->quanta[prio]; + } + if (cl->quantum <= 0 || cl->quantum>32*cl->qdisc->dev->mtu) { + printk(KERN_WARNING "CBQ: class %08x has bad quantum==%ld, repaired.\n", cl->classid, cl->quantum); + cl->quantum = cl->qdisc->dev->mtu/2 + 1; + } + } + } +} + +static void cbq_sync_defmap(struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + struct cbq_class *split = cl->split; + unsigned h; + int i; + + if (split == NULL) + return; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (split->defaults[i] == cl && !(cl->defmap&(1<defaults[i] = NULL; + } + + for (i=0; i<=TC_PRIO_MAX; i++) { + int level = split->level; + + if (split->defaults[i]) + continue; + + for (h=0; h<16; h++) { + struct cbq_class *c; + + for (c = q->classes[h]; c; c = c->next) { + if (c->split == split && c->level < level && + c->defmap&(1<defaults[i] = c; + level = c->level; + } + } + } + } +} + +static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask) +{ + struct cbq_class *split = NULL; + + if (splitid == 0) { + if ((split = cl->split) == NULL) + return; + splitid = split->classid; + } + + if (split == NULL || split->classid != splitid) { + for (split = cl->tparent; split; split = split->tparent) + if (split->classid == splitid) + break; + } + + if (split == NULL) + return; + + if (cl->split != split) { + cl->defmap = 0; + cbq_sync_defmap(cl); + cl->split = split; + cl->defmap = def&mask; + } else + cl->defmap = (cl->defmap&~mask)|(def&mask); + + cbq_sync_defmap(cl); +} + +static void cbq_unlink_class(struct cbq_class *this) +{ + struct cbq_class *cl, **clp; + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + + for (clp = &q->classes[cbq_hash(this->classid)]; (cl = *clp) != NULL; clp = &cl->next) { + if (cl == this) { + *clp = cl->next; + cl->next = NULL; + break; + } + } + + if (this->tparent) { + clp=&this->sibling; + cl = *clp; + do { + if (cl == this) { + *clp = cl->sibling; + break; + } + clp = &cl->sibling; + } while ((cl = *clp) != this->sibling); + + if (this->tparent->children == this) { + this->tparent->children = this->sibling; + if (this->sibling == this) + this->tparent->children = NULL; + } + } else { + BUG_TRAP(this->sibling == this); + } +} + +static void cbq_link_class(struct cbq_class *this) +{ + struct cbq_sched_data *q = qdisc_priv(this->qdisc); + unsigned h = cbq_hash(this->classid); + struct cbq_class *parent = this->tparent; + + this->sibling = this; + this->next = q->classes[h]; + q->classes[h] = this; + + if (parent == NULL) + return; + + if (parent->children == NULL) { + parent->children = this; + } else { + this->sibling = parent->children->sibling; + parent->children->sibling = this; + } +} + +static unsigned int cbq_drop(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl, *cl_head; + int prio; + unsigned int len; + + for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) { + if ((cl_head = q->active[prio]) == NULL) + continue; + + cl = cl_head; + do { + if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) { + sch->q.qlen--; + return len; + } + } while ((cl = cl->next_alive) != cl_head); + } + return 0; +} + +static void +cbq_reset(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + int prio; + unsigned h; + + q->activemask = 0; + q->pmask = 0; + q->tx_class = NULL; + q->tx_borrowed = NULL; + del_timer(&q->wd_timer); + del_timer(&q->delay_timer); + q->toplevel = TC_CBQ_MAXLEVEL; + 
PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++) + q->active[prio] = NULL; + + for (h = 0; h < 16; h++) { + for (cl = q->classes[h]; cl; cl = cl->next) { + qdisc_reset(cl->q); + + cl->next_alive = NULL; + PSCHED_SET_PASTPERFECT(cl->undertime); + cl->avgidle = cl->maxidle; + cl->deficit = cl->quantum; + cl->cpriority = cl->priority; + } + } + sch->q.qlen = 0; +} + + +static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss) +{ + if (lss->change&TCF_CBQ_LSS_FLAGS) { + cl->share = (lss->flags&TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent; + cl->borrow = (lss->flags&TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent; + } + if (lss->change&TCF_CBQ_LSS_EWMA) + cl->ewma_log = lss->ewma_log; + if (lss->change&TCF_CBQ_LSS_AVPKT) + cl->avpkt = lss->avpkt; + if (lss->change&TCF_CBQ_LSS_MINIDLE) + cl->minidle = -(long)lss->minidle; + if (lss->change&TCF_CBQ_LSS_MAXIDLE) { + cl->maxidle = lss->maxidle; + cl->avgidle = lss->maxidle; + } + if (lss->change&TCF_CBQ_LSS_OFFTIME) + cl->offtime = lss->offtime; + return 0; +} + +static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]--; + q->quanta[cl->priority] -= cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl) +{ + q->nclasses[cl->priority]++; + q->quanta[cl->priority] += cl->weight; + cbq_normalize_quanta(q, cl->priority); +} + +static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr) +{ + struct cbq_sched_data *q = qdisc_priv(cl->qdisc); + + if (wrr->allot) + cl->allot = wrr->allot; + if (wrr->weight) + cl->weight = wrr->weight; + if (wrr->priority) { + cl->priority = wrr->priority-1; + cl->cpriority = cl->priority; + if (cl->priority >= cl->priority2) + cl->priority2 = TC_CBQ_MAXPRIO-1; + } + + cbq_addprio(q, cl); + return 0; +} + +static int cbq_set_overlimit(struct cbq_class *cl, struct tc_cbq_ovl *ovl) +{ + switch (ovl->strategy) { + case TC_CBQ_OVL_CLASSIC: + cl->overlimit = cbq_ovl_classic; + break; + case TC_CBQ_OVL_DELAY: + cl->overlimit = cbq_ovl_delay; + break; + case TC_CBQ_OVL_LOWPRIO: + if (ovl->priority2-1 >= TC_CBQ_MAXPRIO || + ovl->priority2-1 <= cl->priority) + return -EINVAL; + cl->priority2 = ovl->priority2-1; + cl->overlimit = cbq_ovl_lowprio; + break; + case TC_CBQ_OVL_DROP: + cl->overlimit = cbq_ovl_drop; + break; + case TC_CBQ_OVL_RCLASSIC: + cl->overlimit = cbq_ovl_rclassic; + break; + default: + return -EINVAL; + } + cl->penalty = (ovl->penalty*HZ)/1000; + return 0; +} + +#ifdef CONFIG_NET_CLS_POLICE +static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p) +{ + cl->police = p->police; + + if (cl->q->handle) { + if (p->police == TC_POLICE_RECLASSIFY) + cl->q->reshape_fail = cbq_reshape_fail; + else + cl->q->reshape_fail = NULL; + } + return 0; +} +#endif + +static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt) +{ + cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange); + return 0; +} + +static int cbq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_CBQ_MAX]; + struct tc_ratespec *r; + + if (rtattr_parse_nested(tb, TCA_CBQ_MAX, opt) < 0 || + tb[TCA_CBQ_RTAB-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + r = 
RTA_DATA(tb[TCA_CBQ_RATE-1]); + + if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB-1])) == NULL) + return -EINVAL; + + q->link.refcnt = 1; + q->link.sibling = &q->link; + q->link.classid = sch->handle; + q->link.qdisc = sch; + if (!(q->link.q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + q->link.q = &noop_qdisc; + + q->link.priority = TC_CBQ_MAXPRIO-1; + q->link.priority2 = TC_CBQ_MAXPRIO-1; + q->link.cpriority = TC_CBQ_MAXPRIO-1; + q->link.ovl_strategy = TC_CBQ_OVL_CLASSIC; + q->link.overlimit = cbq_ovl_classic; + q->link.allot = psched_mtu(sch->dev); + q->link.quantum = q->link.allot; + q->link.weight = q->link.R_tab->rate.rate; + + q->link.ewma_log = TC_CBQ_DEF_EWMA; + q->link.avpkt = q->link.allot/2; + q->link.minidle = -0x7FFFFFFF; + q->link.stats_lock = &sch->dev->queue_lock; + + init_timer(&q->wd_timer); + q->wd_timer.data = (unsigned long)sch; + q->wd_timer.function = cbq_watchdog; + init_timer(&q->delay_timer); + q->delay_timer.data = (unsigned long)sch; + q->delay_timer.function = cbq_undelay; + q->toplevel = TC_CBQ_MAXLEVEL; + PSCHED_GET_TIME(q->now); + q->now_rt = q->now; + + cbq_link_class(&q->link); + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(&q->link, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + cbq_addprio(q, &q->link); + return 0; +} + +static __inline__ int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + + RTA_PUT(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_lssopt opt; + + opt.flags = 0; + if (cl->borrow == NULL) + opt.flags |= TCF_CBQ_LSS_BOUNDED; + if (cl->share == NULL) + opt.flags |= TCF_CBQ_LSS_ISOLATED; + opt.ewma_log = cl->ewma_log; + opt.level = cl->level; + opt.avpkt = cl->avpkt; + opt.maxidle = cl->maxidle; + opt.minidle = (u32)(-cl->minidle); + opt.offtime = cl->offtime; + opt.change = ~0; + RTA_PUT(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_wrropt opt; + + opt.flags = 0; + opt.allot = cl->allot; + opt.priority = cl->priority+1; + opt.cpriority = cl->cpriority+1; + opt.weight = cl->weight; + RTA_PUT(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_ovl(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_ovl opt; + + opt.strategy = cl->ovl_strategy; + opt.priority2 = cl->priority2+1; + opt.penalty = (cl->penalty*1000)/HZ; + RTA_PUT(skb, TCA_CBQ_OVL_STRATEGY, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static __inline__ int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_fopt opt; + + if (cl->split || cl->defmap) { + opt.split = cl->split ? 
cl->split->classid : 0; + opt.defmap = cl->defmap; + opt.defchange = ~0; + RTA_PUT(skb, TCA_CBQ_FOPT, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +#ifdef CONFIG_NET_CLS_POLICE +static __inline__ int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl) +{ + unsigned char *b = skb->tail; + struct tc_cbq_police opt; + + if (cl->police) { + opt.police = cl->police; + RTA_PUT(skb, TCA_CBQ_POLICE, sizeof(opt), &opt); + } + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} +#endif + +static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl) +{ + if (cbq_dump_lss(skb, cl) < 0 || + cbq_dump_rate(skb, cl) < 0 || + cbq_dump_wrr(skb, cl) < 0 || + cbq_dump_ovl(skb, cl) < 0 || +#ifdef CONFIG_NET_CLS_POLICE + cbq_dump_police(skb, cl) < 0 || +#endif + cbq_dump_fopt(skb, cl) < 0) + return -1; + return 0; +} + +static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, &q->link) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + q->link.xstats.avgidle = q->link.avgidle; + return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats)); +} + +static int +cbq_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + + if (cl->tparent) + tcm->tcm_parent = cl->tparent->classid; + else + tcm->tcm_parent = TC_H_ROOT; + tcm->tcm_handle = cl->classid; + tcm->tcm_info = cl->q->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (cbq_dump_attr(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->qstats.qlen = cl->q->q.qlen; + cl->xstats.avgidle = cl->avgidle; + cl->xstats.undertime = 0; + + if (!PSCHED_IS_PASTPERFECT(cl->undertime)) + cl->xstats.undertime = PSCHED_TDIFF(cl->undertime, q->now); + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || +#endif + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl) { + if (new == NULL) { + if ((new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + } else { +#ifdef CONFIG_NET_CLS_POLICE + if (cl->police == TC_POLICE_RECLASSIFY) + new->reshape_fail = cbq_reshape_fail; +#endif + } + sch_tree_lock(sch); + *old = cl->q; + cl->q = new; + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; + } + return -ENOENT; +} + +static struct Qdisc * +cbq_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class 
*cl = (struct cbq_class*)arg; + + return cl ? cl->q : NULL; +} + +static unsigned long cbq_get(struct Qdisc *sch, u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + cl->refcnt++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_destroy_filters(struct cbq_class *cl) +{ + struct tcf_proto *tp; + + while ((tp = cl->filter_list) != NULL) { + cl->filter_list = tp->next; + tcf_destroy(tp); + } +} + +static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + + BUG_TRAP(!cl->filters); + + cbq_destroy_filters(cl); + qdisc_destroy(cl->q); + qdisc_put_rtab(cl->R_tab); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&cl->bstats, &cl->rate_est); +#endif + if (cl != &q->link) + kfree(cl); +} + +static void +cbq_destroy(struct Qdisc* sch) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl; + unsigned h; + +#ifdef CONFIG_NET_CLS_POLICE + q->rx_class = NULL; +#endif + /* + * Filters must be destroyed first because we don't destroy the + * classes from root to leafs which means that filters can still + * be bound to classes which have been destroyed already. --TGR '04 + */ + for (h = 0; h < 16; h++) + for (cl = q->classes[h]; cl; cl = cl->next) + cbq_destroy_filters(cl); + + for (h = 0; h < 16; h++) { + struct cbq_class *next; + + for (cl = q->classes[h]; cl; cl = next) { + next = cl->next; + cbq_destroy_class(sch, cl); + } + } +} + +static void cbq_put(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + if (--cl->refcnt == 0) { +#ifdef CONFIG_NET_CLS_POLICE + struct cbq_sched_data *q = qdisc_priv(sch); + + spin_lock_bh(&sch->dev->queue_lock); + if (q->rx_class == cl) + q->rx_class = NULL; + spin_unlock_bh(&sch->dev->queue_lock); +#endif + + cbq_destroy_class(sch, cl); + } +} + +static int +cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct rtattr **tca, + unsigned long *arg) +{ + int err; + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)*arg; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_CBQ_MAX]; + struct cbq_class *parent; + struct qdisc_rate_table *rtab = NULL; + + if (opt==NULL || rtattr_parse_nested(tb, TCA_CBQ_MAX, opt)) + return -EINVAL; + + if (tb[TCA_CBQ_OVL_STRATEGY-1] && + RTA_PAYLOAD(tb[TCA_CBQ_OVL_STRATEGY-1]) < sizeof(struct tc_cbq_ovl)) + return -EINVAL; + + if (tb[TCA_CBQ_FOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_FOPT-1]) < sizeof(struct tc_cbq_fopt)) + return -EINVAL; + + if (tb[TCA_CBQ_RATE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_RATE-1]) < sizeof(struct tc_ratespec)) + return -EINVAL; + + if (tb[TCA_CBQ_LSSOPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_LSSOPT-1]) < sizeof(struct tc_cbq_lssopt)) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] && + RTA_PAYLOAD(tb[TCA_CBQ_WRROPT-1]) < sizeof(struct tc_cbq_wrropt)) + return -EINVAL; + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1] && + RTA_PAYLOAD(tb[TCA_CBQ_POLICE-1]) < sizeof(struct tc_cbq_police)) + return -EINVAL; +#endif + + if (cl) { + /* Check parent */ + if (parentid) { + if (cl->tparent && cl->tparent->classid != parentid) + return -EINVAL; + if (!cl->tparent && parentid != TC_H_ROOT) + return -EINVAL; + } + + if (tb[TCA_CBQ_RATE-1]) { + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + } + + /* Change class parameters */ + sch_tree_lock(sch); + + if (cl->next_alive != NULL) + 
cbq_deactivate_class(cl); + + if (rtab) { + rtab = xchg(&cl->R_tab, rtab); + qdisc_put_rtab(rtab); + } + + if (tb[TCA_CBQ_LSSOPT-1]) + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + + if (tb[TCA_CBQ_WRROPT-1]) { + cbq_rmprio(q, cl); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + } + + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); + +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + + if (cl->q->q.qlen) + cbq_activate_class(cl); + + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EINVAL; + + if (tb[TCA_CBQ_WRROPT-1] == NULL || tb[TCA_CBQ_RATE-1] == NULL || + tb[TCA_CBQ_LSSOPT-1] == NULL) + return -EINVAL; + + rtab = qdisc_get_rtab(RTA_DATA(tb[TCA_CBQ_RATE-1]), tb[TCA_CBQ_RTAB-1]); + if (rtab == NULL) + return -EINVAL; + + if (classid) { + err = -EINVAL; + if (TC_H_MAJ(classid^sch->handle) || cbq_class_lookup(q, classid)) + goto failure; + } else { + int i; + classid = TC_H_MAKE(sch->handle,0x8000); + + for (i=0; i<0x8000; i++) { + if (++q->hgenerator >= 0x8000) + q->hgenerator = 1; + if (cbq_class_lookup(q, classid|q->hgenerator) == NULL) + break; + } + err = -ENOSR; + if (i >= 0x8000) + goto failure; + classid = classid|q->hgenerator; + } + + parent = &q->link; + if (parentid) { + parent = cbq_class_lookup(q, parentid); + err = -EINVAL; + if (parent == NULL) + goto failure; + } + + err = -ENOBUFS; + cl = kmalloc(sizeof(*cl), GFP_KERNEL); + if (cl == NULL) + goto failure; + memset(cl, 0, sizeof(*cl)); + cl->R_tab = rtab; + rtab = NULL; + cl->refcnt = 1; + if (!(cl->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + cl->q = &noop_qdisc; + cl->classid = classid; + cl->tparent = parent; + cl->qdisc = sch; + cl->allot = parent->allot; + cl->quantum = cl->allot; + cl->weight = cl->R_tab->rate.rate; + cl->stats_lock = &sch->dev->queue_lock; + + sch_tree_lock(sch); + cbq_link_class(cl); + cl->borrow = cl->tparent; + if (cl->tparent != &q->link) + cl->share = cl->tparent; + cbq_adjust_levels(parent); + cl->minidle = -0x7FFFFFFF; + cbq_set_lss(cl, RTA_DATA(tb[TCA_CBQ_LSSOPT-1])); + cbq_set_wrr(cl, RTA_DATA(tb[TCA_CBQ_WRROPT-1])); + if (cl->ewma_log==0) + cl->ewma_log = q->link.ewma_log; + if (cl->maxidle==0) + cl->maxidle = q->link.maxidle; + if (cl->avpkt==0) + cl->avpkt = q->link.avpkt; + cl->overlimit = cbq_ovl_classic; + if (tb[TCA_CBQ_OVL_STRATEGY-1]) + cbq_set_overlimit(cl, RTA_DATA(tb[TCA_CBQ_OVL_STRATEGY-1])); +#ifdef CONFIG_NET_CLS_POLICE + if (tb[TCA_CBQ_POLICE-1]) + cbq_set_police(cl, RTA_DATA(tb[TCA_CBQ_POLICE-1])); +#endif + if (tb[TCA_CBQ_FOPT-1]) + cbq_set_fopt(cl, RTA_DATA(tb[TCA_CBQ_FOPT-1])); + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + + *arg = (unsigned long)cl; + return 0; + +failure: + qdisc_put_rtab(rtab); + return err; +} + +static int cbq_delete(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class*)arg; + + if (cl->filters || cl->children || cl == &q->link) + return -EBUSY; + + sch_tree_lock(sch); + + if (cl->next_alive) + cbq_deactivate_class(cl); + + if (q->tx_borrowed == cl) + 
q->tx_borrowed = q->tx_class; + if (q->tx_class == cl) { + q->tx_class = NULL; + q->tx_borrowed = NULL; + } +#ifdef CONFIG_NET_CLS_POLICE + if (q->rx_class == cl) + q->rx_class = NULL; +#endif + + cbq_unlink_class(cl); + cbq_adjust_levels(cl->tparent); + cl->defmap = 0; + cbq_sync_defmap(cl); + + cbq_rmprio(q, cl); + sch_tree_unlock(sch); + + if (--cl->refcnt == 0) + cbq_destroy_class(sch, cl); + + return 0; +} + +static struct tcf_proto **cbq_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *cl = (struct cbq_class *)arg; + + if (cl == NULL) + cl = &q->link; + + return &cl->filter_list; +} + +static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + struct cbq_class *p = (struct cbq_class*)parent; + struct cbq_class *cl = cbq_class_lookup(q, classid); + + if (cl) { + if (p && p->level <= cl->level) + return 0; + cl->filters++; + return (unsigned long)cl; + } + return 0; +} + +static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct cbq_class *cl = (struct cbq_class*)arg; + + cl->filters--; +} + +static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct cbq_sched_data *q = qdisc_priv(sch); + unsigned h; + + if (arg->stop) + return; + + for (h = 0; h < 16; h++) { + struct cbq_class *cl; + + for (cl = q->classes[h]; cl; cl = cl->next) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops cbq_class_ops = { + .graft = cbq_graft, + .leaf = cbq_leaf, + .get = cbq_get, + .put = cbq_put, + .change = cbq_change_class, + .delete = cbq_delete, + .walk = cbq_walk, + .tcf_chain = cbq_find_tcf, + .bind_tcf = cbq_bind_filter, + .unbind_tcf = cbq_unbind_filter, + .dump = cbq_dump_class, + .dump_stats = cbq_dump_class_stats, +}; + +static struct Qdisc_ops cbq_qdisc_ops = { + .next = NULL, + .cl_ops = &cbq_class_ops, + .id = "cbq", + .priv_size = sizeof(struct cbq_sched_data), + .enqueue = cbq_enqueue, + .dequeue = cbq_dequeue, + .requeue = cbq_requeue, + .drop = cbq_drop, + .init = cbq_init, + .reset = cbq_reset, + .destroy = cbq_destroy, + .change = NULL, + .dump = cbq_dump, + .dump_stats = cbq_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init cbq_module_init(void) +{ + return register_qdisc(&cbq_qdisc_ops); +} +static void __exit cbq_module_exit(void) +{ + unregister_qdisc(&cbq_qdisc_ops); +} +module_init(cbq_module_init) +module_exit(cbq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c new file mode 100644 index 000000000000..8a3db9d95bab --- /dev/null +++ b/net/sched/sch_dsmark.c @@ -0,0 +1,479 @@ +/* net/sched/sch_dsmark.c - Differentiated Services field marker */ + +/* Written 1998-2000 by Werner Almesberger, EPFL ICA */ + + +#include +#include +#include +#include +#include +#include +#include +#include /* for pkt_sched */ +#include +#include +#include +#include +#include + + +#if 1 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + + +#define PRIV(sch) qdisc_priv(sch) + + +/* + * classid class marking + * ------- ----- ------- + * n/a 0 n/a + * x:0 1 use entry [0] + * ... ... ... 
+ * x:y y>0 y+1 use entry [y] + * ... ... ... + * x:indices-1 indices use entry [indices-1] + * ... ... ... + * x:y y+1 use entry [y & (indices-1)] + * ... ... ... + * 0xffff 0x10000 use entry [indices-1] + */ + + +#define NO_DEFAULT_INDEX (1 << 16) + +struct dsmark_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; + __u8 *mask; /* "owns" the array */ + __u8 *value; + __u16 indices; + __u32 default_index; /* index range is 0...0xffff */ + int set_tc_index; +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int dsmark_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_graft(sch %p,[qdisc %p],new %p,old %p)\n",sch,p,new, + old); + if (!new) + new = &noop_qdisc; + sch_tree_lock(sch); + *old = xchg(&p->q,new); + if (*old) + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); /* @@@ move up ? */ + return 0; +} + + +static struct Qdisc *dsmark_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + return p->q; +} + + +static unsigned long dsmark_get(struct Qdisc *sch,u32 classid) +{ + struct dsmark_qdisc_data *p __attribute__((unused)) = PRIV(sch); + + DPRINTK("dsmark_get(sch %p,[qdisc %p],classid %x)\n",sch,p,classid); + return TC_H_MIN(classid)+1; +} + + +static unsigned long dsmark_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return dsmark_get(sch,classid); +} + + +static void dsmark_put(struct Qdisc *sch, unsigned long cl) +{ +} + + +static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_DSMARK_MAX]; + + DPRINTK("dsmark_change(sch %p,[qdisc %p],classid %x,parent %x)," + "arg 0x%lx\n",sch,p,classid,parent,*arg); + if (*arg > p->indices) + return -ENOENT; + if (!opt || rtattr_parse_nested(tb, TCA_DSMARK_MAX, opt)) + return -EINVAL; + if (tb[TCA_DSMARK_MASK-1]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_MASK-1])) + return -EINVAL; + p->mask[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_MASK-1]); + } + if (tb[TCA_DSMARK_VALUE-1]) { + if (!RTA_PAYLOAD(tb[TCA_DSMARK_VALUE-1])) + return -EINVAL; + p->value[*arg-1] = *(__u8 *) RTA_DATA(tb[TCA_DSMARK_VALUE-1]); + } + return 0; +} + + +static int dsmark_delete(struct Qdisc *sch,unsigned long arg) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + if (!arg || arg > p->indices) + return -EINVAL; + p->mask[arg-1] = 0xff; + p->value[arg-1] = 0; + return 0; +} + + +static void dsmark_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + int i; + + DPRINTK("dsmark_walk(sch %p,[qdisc %p],walker %p)\n",sch,p,walker); + if (walker->stop) + return; + for (i = 0; i < p->indices; i++) { + if (p->mask[i] == 0xff && !p->value[i]) + continue; + if (walker->count >= walker->skip) { + if (walker->fn(sch, i+1, walker) < 0) { + walker->stop = 1; + break; + } + } + walker->count++; + } +} + + +static struct tcf_proto **dsmark_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + return &p->filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int dsmark_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct tcf_result res; + int result; + int ret = NET_XMIT_POLICED; + + 
D2PRINTK("dsmark_enqueue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + if (p->set_tc_index) { + /* FIXME: Safe with non-linear skbs? --RR */ + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + skb->tc_index = ipv4_get_dsfield(skb->nh.iph) + & ~INET_ECN_MASK; + break; + case __constant_htons(ETH_P_IPV6): + skb->tc_index = ipv6_get_dsfield(skb->nh.ipv6h) + & ~INET_ECN_MASK; + break; + default: + skb->tc_index = 0; + break; + }; + } + result = TC_POLICE_OK; /* be nice to gcc */ + if (TC_H_MAJ(skb->priority) == sch->handle) { + skb->tc_index = TC_H_MIN(skb->priority); + } else { + result = tc_classify(skb,p->filter_list,&res); + D2PRINTK("result %d class 0x%04x\n",result,res.classid); + switch (result) { +#ifdef CONFIG_NET_CLS_POLICE + case TC_POLICE_SHOT: + kfree_skb(skb); + break; +#if 0 + case TC_POLICE_RECLASSIFY: + /* FIXME: what to do here ??? */ +#endif +#endif + case TC_POLICE_OK: + skb->tc_index = TC_H_MIN(res.classid); + break; + case TC_POLICE_UNSPEC: + /* fall through */ + default: + if (p->default_index != NO_DEFAULT_INDEX) + skb->tc_index = p->default_index; + break; + }; + } + if ( +#ifdef CONFIG_NET_CLS_POLICE + result == TC_POLICE_SHOT || +#endif + + ((ret = p->q->enqueue(skb,p->q)) != 0)) { + sch->qstats.drops++; + return ret; + } + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + sch->q.qlen++; + return ret; +} + + +static struct sk_buff *dsmark_dequeue(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct sk_buff *skb; + int index; + + D2PRINTK("dsmark_dequeue(sch %p,[qdisc %p])\n",sch,p); + skb = p->q->ops->dequeue(p->q); + if (!skb) + return NULL; + sch->q.qlen--; + index = skb->tc_index & (p->indices-1); + D2PRINTK("index %d->%d\n",skb->tc_index,index); + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + ipv4_change_dsfield(skb->nh.iph, + p->mask[index],p->value[index]); + break; + case __constant_htons(ETH_P_IPV6): + ipv6_change_dsfield(skb->nh.ipv6h, + p->mask[index],p->value[index]); + break; + default: + /* + * Only complain if a change was actually attempted. + * This way, we can send non-IP traffic through dsmark + * and don't need yet another qdisc as a bypass. 
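+			 *
+			 * A mask of 0xff with a value of 0 would leave the field
+			 * unchanged anyway, so such entries pass through silently.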
+ */ + if (p->mask[index] != 0xff || p->value[index]) + printk(KERN_WARNING "dsmark_dequeue: " + "unsupported protocol %d\n", + htons(skb->protocol)); + break; + }; + return skb; +} + + +static int dsmark_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ + int ret; + struct dsmark_qdisc_data *p = PRIV(sch); + + D2PRINTK("dsmark_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,p); + if ((ret = p->q->ops->requeue(skb, p->q)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + return 0; + } + sch->qstats.drops++; + return ret; +} + + +static unsigned int dsmark_drop(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned int len; + + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); + if (!p->q->ops->drop) + return 0; + if (!(len = p->q->ops->drop(p->q))) + return 0; + sch->q.qlen--; + return len; +} + + +static int dsmark_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct rtattr *tb[TCA_DSMARK_MAX]; + __u16 tmp; + + DPRINTK("dsmark_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + if (!opt || + rtattr_parse(tb,TCA_DSMARK_MAX,RTA_DATA(opt),RTA_PAYLOAD(opt)) < 0 || + !tb[TCA_DSMARK_INDICES-1] || + RTA_PAYLOAD(tb[TCA_DSMARK_INDICES-1]) < sizeof(__u16)) + return -EINVAL; + p->indices = *(__u16 *) RTA_DATA(tb[TCA_DSMARK_INDICES-1]); + if (!p->indices) + return -EINVAL; + for (tmp = p->indices; tmp != 1; tmp >>= 1) { + if (tmp & 1) + return -EINVAL; + } + p->default_index = NO_DEFAULT_INDEX; + if (tb[TCA_DSMARK_DEFAULT_INDEX-1]) { + if (RTA_PAYLOAD(tb[TCA_DSMARK_DEFAULT_INDEX-1]) < sizeof(__u16)) + return -EINVAL; + p->default_index = + *(__u16 *) RTA_DATA(tb[TCA_DSMARK_DEFAULT_INDEX-1]); + } + p->set_tc_index = !!tb[TCA_DSMARK_SET_TC_INDEX-1]; + p->mask = kmalloc(p->indices*2,GFP_KERNEL); + if (!p->mask) + return -ENOMEM; + p->value = p->mask+p->indices; + memset(p->mask,0xff,p->indices); + memset(p->value,0,p->indices); + if (!(p->q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops))) + p->q = &noop_qdisc; + DPRINTK("dsmark_init: qdisc %p\n",&p->q); + return 0; +} + + +static void dsmark_reset(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + + DPRINTK("dsmark_reset(sch %p,[qdisc %p])\n",sch,p); + qdisc_reset(p->q); + sch->q.qlen = 0; +} + + +static void dsmark_destroy(struct Qdisc *sch) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + struct tcf_proto *tp; + + DPRINTK("dsmark_destroy(sch %p,[qdisc %p])\n",sch,p); + while (p->filter_list) { + tp = p->filter_list; + p->filter_list = tp->next; + tcf_destroy(tp); + } + qdisc_destroy(p->q); + kfree(p->mask); +} + + +static int dsmark_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + DPRINTK("dsmark_dump_class(sch %p,[qdisc %p],class %ld\n",sch,p,cl); + if (!cl || cl > p->indices) + return -EINVAL; + tcm->tcm_handle = TC_H_MAKE(TC_H_MAJ(sch->handle),cl-1); + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_DSMARK_MASK,1,&p->mask[cl-1]); + RTA_PUT(skb,TCA_DSMARK_VALUE,1,&p->value[cl-1]); + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static int dsmark_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct dsmark_qdisc_data *p = PRIV(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr *) b; + RTA_PUT(skb,TCA_OPTIONS,0,NULL); + RTA_PUT(skb,TCA_DSMARK_INDICES,sizeof(__u16),&p->indices); + if 
(p->default_index != NO_DEFAULT_INDEX) { + __u16 tmp = p->default_index; + + RTA_PUT(skb,TCA_DSMARK_DEFAULT_INDEX, sizeof(__u16), &tmp); + } + if (p->set_tc_index) + RTA_PUT(skb, TCA_DSMARK_SET_TC_INDEX, 0, NULL); + rta->rta_len = skb->tail-b; + return skb->len; + +rtattr_failure: + skb_trim(skb,b-skb->data); + return -1; +} + +static struct Qdisc_class_ops dsmark_class_ops = { + .graft = dsmark_graft, + .leaf = dsmark_leaf, + .get = dsmark_get, + .put = dsmark_put, + .change = dsmark_change, + .delete = dsmark_delete, + .walk = dsmark_walk, + .tcf_chain = dsmark_find_tcf, + .bind_tcf = dsmark_bind_filter, + .unbind_tcf = dsmark_put, + .dump = dsmark_dump_class, +}; + +static struct Qdisc_ops dsmark_qdisc_ops = { + .next = NULL, + .cl_ops = &dsmark_class_ops, + .id = "dsmark", + .priv_size = sizeof(struct dsmark_qdisc_data), + .enqueue = dsmark_enqueue, + .dequeue = dsmark_dequeue, + .requeue = dsmark_requeue, + .drop = dsmark_drop, + .init = dsmark_init, + .reset = dsmark_reset, + .destroy = dsmark_destroy, + .change = NULL, + .dump = dsmark_dump, + .owner = THIS_MODULE, +}; + +static int __init dsmark_module_init(void) +{ + return register_qdisc(&dsmark_qdisc_ops); +} +static void __exit dsmark_module_exit(void) +{ + unregister_qdisc(&dsmark_qdisc_ops); +} +module_init(dsmark_module_init) +module_exit(dsmark_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c new file mode 100644 index 000000000000..4888305c96da --- /dev/null +++ b/net/sched/sch_fifo.c @@ -0,0 +1,212 @@ +/* + * net/sched/sch_fifo.c The simplest FIFO queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* 1 band FIFO pseudo-"scheduler" */ + +struct fifo_sched_data +{ + unsigned limit; +}; + +static int +bfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (sch->qstats.backlog + skb->len <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + sch->qstats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int +bfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff * +bfifo_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->q); + if (skb) + sch->qstats.backlog -= skb->len; + return skb; +} + +static unsigned int +fifo_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + kfree_skb(skb); + return len; + } + return 0; +} + +static void +fifo_reset(struct Qdisc* sch) +{ + skb_queue_purge(&sch->q); + sch->qstats.backlog = 0; +} + +static int +pfifo_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (sch->q.qlen < q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + sch->qstats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail==NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int +pfifo_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + __skb_queue_head(&sch->q, skb); + sch->qstats.requeues++; + return 0; +} + + +static struct sk_buff * +pfifo_dequeue(struct Qdisc* sch) +{ + return __skb_dequeue(&sch->q); +} + +static int fifo_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + + if (opt == NULL) { + unsigned int limit = sch->dev->tx_queue_len ? 
: 1; + + if (sch->ops == &bfifo_qdisc_ops) + q->limit = limit*sch->dev->mtu; + else + q->limit = limit; + } else { + struct tc_fifo_qopt *ctl = RTA_DATA(opt); + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + q->limit = ctl->limit; + } + return 0; +} + +static int fifo_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct fifo_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_fifo_qopt opt; + + opt.limit = q->limit; + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +struct Qdisc_ops pfifo_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "pfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = pfifo_enqueue, + .dequeue = pfifo_dequeue, + .requeue = pfifo_requeue, + .drop = fifo_drop, + .init = fifo_init, + .reset = fifo_reset, + .destroy = NULL, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +struct Qdisc_ops bfifo_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "bfifo", + .priv_size = sizeof(struct fifo_sched_data), + .enqueue = bfifo_enqueue, + .dequeue = bfifo_dequeue, + .requeue = bfifo_requeue, + .drop = fifo_drop, + .init = fifo_init, + .reset = fifo_reset, + .destroy = NULL, + .change = fifo_init, + .dump = fifo_dump, + .owner = THIS_MODULE, +}; + +EXPORT_SYMBOL(bfifo_qdisc_ops); +EXPORT_SYMBOL(pfifo_qdisc_ops); diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c new file mode 100644 index 000000000000..8c01e023f02e --- /dev/null +++ b/net/sched/sch_generic.c @@ -0,0 +1,609 @@ +/* + * net/sched/sch_generic.c Generic packet scheduler routines. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * Jamal Hadi Salim, 990601 + * - Ingress support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Main transmission queue. */ + +/* Main qdisc structure lock. + + However, modifications + to data, participating in scheduling must be additionally + protected with dev->queue_lock spinlock. + + The idea is the following: + - enqueue, dequeue are serialized via top level device + spinlock dev->queue_lock. + - tree walking is protected by read_lock_bh(qdisc_tree_lock) + and this lock is used only in process context. + - updates to tree are made under rtnl semaphore or + from softirq context (__qdisc_destroy rcu-callback) + hence this lock needs local bh disabling. + + qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! + */ +DEFINE_RWLOCK(qdisc_tree_lock); + +void qdisc_lock_tree(struct net_device *dev) +{ + write_lock_bh(&qdisc_tree_lock); + spin_lock_bh(&dev->queue_lock); +} + +void qdisc_unlock_tree(struct net_device *dev) +{ + spin_unlock_bh(&dev->queue_lock); + write_unlock_bh(&qdisc_tree_lock); +} + +/* + dev->queue_lock serializes queue accesses for this device + AND dev->qdisc pointer itself. + + dev->xmit_lock serializes accesses to device driver. + + dev->queue_lock and dev->xmit_lock are mutually exclusive, + if one is grabbed, another must be free. + */ + + +/* Kick device. 
+ Note, that this procedure can be called by a watchdog timer, so that + we do not check dev->tbusy flag here. + + Returns: 0 - queue is empty. + >0 - queue is not empty, but throttled. + <0 - queue is not empty. Device is throttled, if dev->tbusy != 0. + + NOTE: Called under dev->queue_lock with locally disabled BH. +*/ + +int qdisc_restart(struct net_device *dev) +{ + struct Qdisc *q = dev->qdisc; + struct sk_buff *skb; + + /* Dequeue packet */ + if ((skb = q->dequeue(q)) != NULL) { + unsigned nolock = (dev->features & NETIF_F_LLTX); + /* + * When the driver has LLTX set it does its own locking + * in start_xmit. No need to add additional overhead by + * locking again. These checks are worth it because + * even uncongested locks can be quite expensive. + * The driver can do trylock like here too, in case + * of lock congestion it should return -1 and the packet + * will be requeued. + */ + if (!nolock) { + if (!spin_trylock(&dev->xmit_lock)) { + collision: + /* So, someone grabbed the driver. */ + + /* It may be transient configuration error, + when hard_start_xmit() recurses. We detect + it by checking xmit owner and drop the + packet when deadloop is detected. + */ + if (dev->xmit_lock_owner == smp_processor_id()) { + kfree_skb(skb); + if (net_ratelimit()) + printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name); + return -1; + } + __get_cpu_var(netdev_rx_stat).cpu_collision++; + goto requeue; + } + /* Remember that the driver is grabbed by us. */ + dev->xmit_lock_owner = smp_processor_id(); + } + + { + /* And release queue */ + spin_unlock(&dev->queue_lock); + + if (!netif_queue_stopped(dev)) { + int ret; + if (netdev_nit) + dev_queue_xmit_nit(skb, dev); + + ret = dev->hard_start_xmit(skb, dev); + if (ret == NETDEV_TX_OK) { + if (!nolock) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + } + spin_lock(&dev->queue_lock); + return -1; + } + if (ret == NETDEV_TX_LOCKED && nolock) { + spin_lock(&dev->queue_lock); + goto collision; + } + } + + /* NETDEV_TX_BUSY - we need to requeue */ + /* Release the driver */ + if (!nolock) { + dev->xmit_lock_owner = -1; + spin_unlock(&dev->xmit_lock); + } + spin_lock(&dev->queue_lock); + q = dev->qdisc; + } + + /* Device kicked us out :( + This is possible in three cases: + + 0. driver is locked + 1. fastroute is enabled + 2. device cannot determine busy state + before start of transmission (f.e. dialout) + 3. 
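/*
 * Illustrative aside (standalone sketch, not from this patch): the decision
 * qdisc_restart() above makes when the xmit_lock trylock fails. "owner" and
 * "self" are hypothetical stand-ins for dev->xmit_lock_owner and
 * smp_processor_id().
 */
enum tx_action { TX_DROP_DEADLOOP, TX_REQUEUE };

static enum tx_action on_xmit_lock_contention(int owner, int self)
{
	/* hard_start_xmit() recursed back into the queue on this CPU:
	   requeueing would loop forever, so the packet is dropped */
	if (owner == self)
		return TX_DROP_DEADLOOP;

	/* another CPU holds the driver: requeue and reschedule later */
	return TX_REQUEUE;
}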
device is buggy (ppp) + */ + +requeue: + q->ops->requeue(skb, q); + netif_schedule(dev); + return 1; + } + return q->q.qlen; +} + +static void dev_watchdog(unsigned long arg) +{ + struct net_device *dev = (struct net_device *)arg; + + spin_lock(&dev->xmit_lock); + if (dev->qdisc != &noop_qdisc) { + if (netif_device_present(dev) && + netif_running(dev) && + netif_carrier_ok(dev)) { + if (netif_queue_stopped(dev) && + (jiffies - dev->trans_start) > dev->watchdog_timeo) { + printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n", dev->name); + dev->tx_timeout(dev); + } + if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + dev_hold(dev); + } + } + spin_unlock(&dev->xmit_lock); + + dev_put(dev); +} + +static void dev_watchdog_init(struct net_device *dev) +{ + init_timer(&dev->watchdog_timer); + dev->watchdog_timer.data = (unsigned long)dev; + dev->watchdog_timer.function = dev_watchdog; +} + +void __netdev_watchdog_up(struct net_device *dev) +{ + if (dev->tx_timeout) { + if (dev->watchdog_timeo <= 0) + dev->watchdog_timeo = 5*HZ; + if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo)) + dev_hold(dev); + } +} + +static void dev_watchdog_up(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + __netdev_watchdog_up(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +static void dev_watchdog_down(struct net_device *dev) +{ + spin_lock_bh(&dev->xmit_lock); + if (del_timer(&dev->watchdog_timer)) + __dev_put(dev); + spin_unlock_bh(&dev->xmit_lock); +} + +/* "NOOP" scheduler: the best scheduler, recommended for all interfaces + under all circumstances. It is difficult to invent anything faster or + cheaper. + */ + +static int +noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc) +{ + kfree_skb(skb); + return NET_XMIT_CN; +} + +static struct sk_buff * +noop_dequeue(struct Qdisc * qdisc) +{ + return NULL; +} + +static int +noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + if (net_ratelimit()) + printk(KERN_DEBUG "%s deferred output. It is buggy.\n", skb->dev->name); + kfree_skb(skb); + return NET_XMIT_CN; +} + +struct Qdisc_ops noop_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "noop", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .requeue = noop_requeue, + .owner = THIS_MODULE, +}; + +struct Qdisc noop_qdisc = { + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noop_qdisc_ops, + .list = LIST_HEAD_INIT(noop_qdisc.list), +}; + +static struct Qdisc_ops noqueue_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "noqueue", + .priv_size = 0, + .enqueue = noop_enqueue, + .dequeue = noop_dequeue, + .requeue = noop_requeue, + .owner = THIS_MODULE, +}; + +static struct Qdisc noqueue_qdisc = { + .enqueue = NULL, + .dequeue = noop_dequeue, + .flags = TCQ_F_BUILTIN, + .ops = &noqueue_qdisc_ops, + .list = LIST_HEAD_INIT(noqueue_qdisc.list), +}; + + +static const u8 prio2band[TC_PRIO_MAX+1] = + { 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 }; + +/* 3-band FIFO queue: old style, but should be a bit faster than + generic prio+fifo combination. 
+ */ + +static int +pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list = qdisc_priv(qdisc); + + list += prio2band[skb->priority&TC_PRIO_MAX]; + + if (list->qlen < qdisc->dev->tx_queue_len) { + __skb_queue_tail(list, skb); + qdisc->q.qlen++; + qdisc->bstats.bytes += skb->len; + qdisc->bstats.packets++; + return 0; + } + qdisc->qstats.drops++; + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static struct sk_buff * +pfifo_fast_dequeue(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + struct sk_buff *skb; + + for (prio = 0; prio < 3; prio++, list++) { + skb = __skb_dequeue(list); + if (skb) { + qdisc->q.qlen--; + return skb; + } + } + return NULL; +} + +static int +pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc) +{ + struct sk_buff_head *list = qdisc_priv(qdisc); + + list += prio2band[skb->priority&TC_PRIO_MAX]; + + __skb_queue_head(list, skb); + qdisc->q.qlen++; + qdisc->qstats.requeues++; + return 0; +} + +static void +pfifo_fast_reset(struct Qdisc* qdisc) +{ + int prio; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (prio=0; prio < 3; prio++) + skb_queue_purge(list+prio); + qdisc->q.qlen = 0; +} + +static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb) +{ + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = 3; + memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt) +{ + int i; + struct sk_buff_head *list = qdisc_priv(qdisc); + + for (i=0; i<3; i++) + skb_queue_head_init(list+i); + + return 0; +} + +static struct Qdisc_ops pfifo_fast_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "pfifo_fast", + .priv_size = 3 * sizeof(struct sk_buff_head), + .enqueue = pfifo_fast_enqueue, + .dequeue = pfifo_fast_dequeue, + .requeue = pfifo_fast_requeue, + .init = pfifo_fast_init, + .reset = pfifo_fast_reset, + .dump = pfifo_fast_dump, + .owner = THIS_MODULE, +}; + +struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops) +{ + void *p; + struct Qdisc *sch; + int size; + + /* ensure that the Qdisc and the private data are 32-byte aligned */ + size = ((sizeof(*sch) + QDISC_ALIGN_CONST) & ~QDISC_ALIGN_CONST); + size += ops->priv_size + QDISC_ALIGN_CONST; + + p = kmalloc(size, GFP_KERNEL); + if (!p) + return NULL; + memset(p, 0, size); + + sch = (struct Qdisc *)(((unsigned long)p + QDISC_ALIGN_CONST) + & ~QDISC_ALIGN_CONST); + sch->padded = (char *)sch - (char *)p; + + INIT_LIST_HEAD(&sch->list); + skb_queue_head_init(&sch->q); + sch->ops = ops; + sch->enqueue = ops->enqueue; + sch->dequeue = ops->dequeue; + sch->dev = dev; + dev_hold(dev); + sch->stats_lock = &dev->queue_lock; + atomic_set(&sch->refcnt, 1); + if (!ops->init || ops->init(sch, NULL) == 0) + return sch; + + dev_put(dev); + kfree(p); + return NULL; +} + +/* Under dev->queue_lock and BH! 
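/*
 * Illustrative aside (standalone, runnable user-space sketch, not from this
 * patch): how pfifo_fast above maps skb->priority to one of its three bands
 * through prio2band[]. Band 0 is always dequeued first.
 */
#include <stdio.h>

#define TC_PRIO_MAX 15

static const unsigned char prio2band[TC_PRIO_MAX + 1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

int main(void)
{
	unsigned int prio;

	for (prio = 0; prio <= TC_PRIO_MAX; prio++)
		printf("priority %2u -> band %u\n", prio,
		       prio2band[prio & TC_PRIO_MAX]);
	return 0;
}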
*/ + +void qdisc_reset(struct Qdisc *qdisc) +{ + struct Qdisc_ops *ops = qdisc->ops; + + if (ops->reset) + ops->reset(qdisc); +} + +/* this is the rcu callback function to clean up a qdisc when there + * are no further references to it */ + +static void __qdisc_destroy(struct rcu_head *head) +{ + struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); + struct Qdisc_ops *ops = qdisc->ops; + +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); +#endif + write_lock(&qdisc_tree_lock); + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + write_unlock(&qdisc_tree_lock); + module_put(ops->owner); + + dev_put(qdisc->dev); + kfree((char *) qdisc - qdisc->padded); +} + +/* Under dev->queue_lock and BH! */ + +void qdisc_destroy(struct Qdisc *qdisc) +{ + struct list_head cql = LIST_HEAD_INIT(cql); + struct Qdisc *cq, *q, *n; + + if (qdisc->flags & TCQ_F_BUILTIN || + !atomic_dec_and_test(&qdisc->refcnt)) + return; + + if (!list_empty(&qdisc->list)) { + if (qdisc->ops->cl_ops == NULL) + list_del(&qdisc->list); + else + list_move(&qdisc->list, &cql); + } + + /* unlink inner qdiscs from dev->qdisc_list immediately */ + list_for_each_entry(cq, &cql, list) + list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) + if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { + if (q->ops->cl_ops == NULL) + list_del_init(&q->list); + else + list_move_tail(&q->list, &cql); + } + list_for_each_entry_safe(cq, n, &cql, list) + list_del_init(&cq->list); + + call_rcu(&qdisc->q_rcu, __qdisc_destroy); +} + +void dev_activate(struct net_device *dev) +{ + /* No queueing discipline is attached to device; + create default one i.e. pfifo_fast for devices, + which need queueing and noqueue_qdisc for + virtual interfaces + */ + + if (dev->qdisc_sleeping == &noop_qdisc) { + struct Qdisc *qdisc; + if (dev->tx_queue_len) { + qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops); + if (qdisc == NULL) { + printk(KERN_INFO "%s: activation failed\n", dev->name); + return; + } + write_lock_bh(&qdisc_tree_lock); + list_add_tail(&qdisc->list, &dev->qdisc_list); + write_unlock_bh(&qdisc_tree_lock); + } else { + qdisc = &noqueue_qdisc; + } + write_lock_bh(&qdisc_tree_lock); + dev->qdisc_sleeping = qdisc; + write_unlock_bh(&qdisc_tree_lock); + } + + spin_lock_bh(&dev->queue_lock); + rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping); + if (dev->qdisc != &noqueue_qdisc) { + dev->trans_start = jiffies; + dev_watchdog_up(dev); + } + spin_unlock_bh(&dev->queue_lock); +} + +void dev_deactivate(struct net_device *dev) +{ + struct Qdisc *qdisc; + + spin_lock_bh(&dev->queue_lock); + qdisc = dev->qdisc; + dev->qdisc = &noop_qdisc; + + qdisc_reset(qdisc); + + spin_unlock_bh(&dev->queue_lock); + + dev_watchdog_down(dev); + + while (test_bit(__LINK_STATE_SCHED, &dev->state)) + yield(); + + spin_unlock_wait(&dev->xmit_lock); +} + +void dev_init_scheduler(struct net_device *dev) +{ + qdisc_lock_tree(dev); + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; + INIT_LIST_HEAD(&dev->qdisc_list); + qdisc_unlock_tree(dev); + + dev_watchdog_init(dev); +} + +void dev_shutdown(struct net_device *dev) +{ + struct Qdisc *qdisc; + + qdisc_lock_tree(dev); + qdisc = dev->qdisc_sleeping; + dev->qdisc = &noop_qdisc; + dev->qdisc_sleeping = &noop_qdisc; + qdisc_destroy(qdisc); +#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE) + if ((qdisc = dev->qdisc_ingress) != NULL) { + dev->qdisc_ingress = NULL; + qdisc_destroy(qdisc); + } +#endif + 
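/*
 * Illustrative aside (standalone, runnable sketch, not from this patch): the
 * round-up done by qdisc_create_dflt() above so that the Qdisc and its
 * private area are 32-byte aligned. ALIGN_CONST plays the role of
 * QDISC_ALIGN_CONST, assumed here to be alignment minus one (31); the sizes
 * are made up. "padded" is what lets the destroy path recover the original
 * allocation with kfree((char *)qdisc - qdisc->padded).
 */
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define ALIGN_CONST ((uintptr_t)31)

int main(void)
{
	size_t obj_size = 100, priv_size = 40;
	size_t size = ((obj_size + ALIGN_CONST) & ~ALIGN_CONST)
			+ priv_size + ALIGN_CONST;
	char *p = malloc(size);
	char *obj;
	unsigned int padded;

	if (p == NULL)
		return 1;
	obj = (char *)(((uintptr_t)p + ALIGN_CONST) & ~ALIGN_CONST);
	padded = (unsigned int)(obj - p);

	printf("raw %p -> aligned %p (padded %u bytes)\n",
	       (void *)p, (void *)obj, padded);
	free(obj - padded);	/* recovers the original pointer */
	return 0;
}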
BUG_TRAP(!timer_pending(&dev->watchdog_timer)); + qdisc_unlock_tree(dev); +} + +EXPORT_SYMBOL(__netdev_watchdog_up); +EXPORT_SYMBOL(noop_qdisc); +EXPORT_SYMBOL(noop_qdisc_ops); +EXPORT_SYMBOL(qdisc_create_dflt); +EXPORT_SYMBOL(qdisc_destroy); +EXPORT_SYMBOL(qdisc_reset); +EXPORT_SYMBOL(qdisc_restart); +EXPORT_SYMBOL(qdisc_lock_tree); +EXPORT_SYMBOL(qdisc_unlock_tree); diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c new file mode 100644 index 000000000000..25c171c32715 --- /dev/null +++ b/net/sched/sch_gred.c @@ -0,0 +1,630 @@ +/* + * net/sched/sch_gred.c Generic Random Early Detection queue. + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: J Hadi Salim (hadi@cyberus.ca) 1998-2002 + * + * 991129: - Bug fix with grio mode + * - a better sing. AvgQ mode with Grio(WRED) + * - A finer grained VQ dequeue based on sugestion + * from Ren Liu + * - More error checks + * + * + * + * For all the glorious comments look at Alexey's sch_red.c + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if 1 /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) +#endif + +struct gred_sched_data; +struct gred_sched; + +struct gred_sched_data +{ +/* Parameters */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 DP; /* the drop pramaters */ + char Wlog; /* log(W) */ + char Plog; /* random number bits */ + u32 Scell_max; + u32 Rmask; + u32 bytesin; /* bytes seen on virtualQ so far*/ + u32 packetsin; /* packets seen on virtualQ so far*/ + u32 backlog; /* bytes on the virtualQ */ + u32 forced; /* packets dropped for exceeding limits */ + u32 early; /* packets dropped as a warning */ + u32 other; /* packets dropped by invoking drop() */ + u32 pdrop; /* packets dropped because we exceeded physical queue limits */ + char Scell_log; + u8 Stab[256]; + u8 prio; /* the prio of this vq */ + +/* Variables */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + u32 qR; /* Cached random number */ + + psched_time_t qidlestart; /* Start of idle period */ +}; + +struct gred_sched +{ + struct gred_sched_data *tab[MAX_DPs]; + u32 DPs; + u32 def; + u8 initd; + u8 grio; + u8 eqp; +}; + +static int +gred_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + psched_time_t now; + struct gred_sched_data *q=NULL; + struct gred_sched *t= qdisc_priv(sch); + unsigned long qave=0; + int i=0; + + if (!t->initd && skb_queue_len(&sch->q) < (sch->dev->tx_queue_len ? : 1)) { + D2PRINTK("NO GRED Queues setup yet! Enqueued anyway\n"); + goto do_enqueue; + } + + + if ( ((skb->tc_index&0xf) > (t->DPs -1)) || !(q=t->tab[skb->tc_index&0xf])) { + printk("GRED: setting to default (%d)\n ",t->def); + if (!(q=t->tab[t->def])) { + DPRINTK("GRED: setting to default FAILED! dropping!! 
" + "(%d)\n ", t->def); + goto drop; + } + /* fix tc_index? --could be controvesial but needed for + requeueing */ + skb->tc_index=(skb->tc_index&0xfffffff0) | t->def; + } + + D2PRINTK("gred_enqueue virtualQ 0x%x classid %x backlog %d " + "general backlog %d\n",skb->tc_index&0xf,sch->handle,q->backlog, + sch->qstats.backlog); + /* sum up all the qaves of prios <= to ours to get the new qave*/ + if (!t->eqp && t->grio) { + for (i=0;iDPs;i++) { + if ((!t->tab[i]) || (i==q->DP)) + continue; + + if ((t->tab[i]->prio < q->prio) && (PSCHED_IS_PASTPERFECT(t->tab[i]->qidlestart))) + qave +=t->tab[i]->qave; + } + + } + + q->packetsin++; + q->bytesin+=skb->len; + + if (t->eqp && t->grio) { + qave=0; + q->qave=t->tab[t->def]->qave; + q->qidlestart=t->tab[t->def]->qidlestart; + } + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); + PSCHED_SET_PASTPERFECT(q->qidlestart); + + q->qave >>= q->Stab[(us_idle>>q->Scell_log)&0xFF]; + } else { + if (t->eqp) { + q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); + } else { + q->qave += q->backlog - (q->qave >> q->Wlog); + } + + } + + + if (t->eqp && t->grio) + t->tab[t->def]->qave=q->qave; + + if ((q->qave+qave) < q->qth_min) { + q->qcount = -1; +enqueue: + if (q->backlog + skb->len <= q->limit) { + q->backlog += skb->len; +do_enqueue: + __skb_queue_tail(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } else { + q->pdrop++; + } + +drop: + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; + } + if ((q->qave+qave) >= q->qth_max) { + q->qcount = -1; + sch->qstats.overlimits++; + q->forced++; + goto drop; + } + if (++q->qcount) { + if ((((qave+q->qave) - q->qth_min)>>q->Wlog)*q->qcount < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = net_random()&q->Rmask; + sch->qstats.overlimits++; + q->early++; + goto drop; + } + q->qR = net_random()&q->Rmask; + goto enqueue; +} + +static int +gred_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + q= t->tab[(skb->tc_index&0xf)]; +/* error checking here -- probably unnecessary */ + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + q->backlog += skb->len; + return 0; +} + +static struct sk_buff * +gred_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->qstats.backlog -= skb->len; + q= t->tab[(skb->tc_index&0xf)]; + if (q) { + q->backlog -= skb->len; + if (!q->backlog && !t->eqp) + PSCHED_GET_TIME(q->qidlestart); + } else { + D2PRINTK("gred_dequeue: skb has bad tcindex %x\n",skb->tc_index&0xf); + } + return skb; + } + + if (t->eqp) { + q= t->tab[t->def]; + if (!q) + D2PRINTK("no default VQ set: Results will be " + "screwed up\n"); + else + PSCHED_GET_TIME(q->qidlestart); + } + + return NULL; +} + +static unsigned int gred_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + sch->qstats.drops++; + q= t->tab[(skb->tc_index&0xf)]; + if (q) { + q->backlog -= len; + q->other++; + if (!q->backlog && !t->eqp) + PSCHED_GET_TIME(q->qidlestart); + } else { + D2PRINTK("gred_dequeue: skb 
has bad tcindex %x\n",skb->tc_index&0xf); + } + + kfree_skb(skb); + return len; + } + + q=t->tab[t->def]; + if (!q) { + D2PRINTK("no default VQ set: Results might be screwed up\n"); + return 0; + } + + PSCHED_GET_TIME(q->qidlestart); + return 0; + +} + +static void gred_reset(struct Qdisc* sch) +{ + int i; + struct gred_sched_data *q; + struct gred_sched *t= qdisc_priv(sch); + + __skb_queue_purge(&sch->q); + + sch->qstats.backlog = 0; + + for (i=0;iDPs;i++) { + q= t->tab[i]; + if (!q) + continue; + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; + q->backlog = 0; + q->other=0; + q->forced=0; + q->pdrop=0; + q->early=0; + } +} + +static int gred_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct gred_sched *table = qdisc_priv(sch); + struct gred_sched_data *q; + struct tc_gred_qopt *ctl; + struct tc_gred_sopt *sopt; + struct rtattr *tb[TCA_GRED_STAB]; + struct rtattr *tb2[TCA_GRED_DPS]; + int i; + + if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt)) + return -EINVAL; + + if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) { + rtattr_parse_nested(tb2, TCA_GRED_DPS, opt); + + if (tb2[TCA_GRED_DPS-1] == 0) + return -EINVAL; + + sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]); + table->DPs=sopt->DPs; + table->def=sopt->def_DP; + table->grio=sopt->grio; + table->initd=0; + /* probably need to clear all the table DP entries as well */ + return 0; + } + + + if (!table->DPs || tb[TCA_GRED_PARMS-1] == 0 || tb[TCA_GRED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_GRED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_GRED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_GRED_PARMS-1]); + if (ctl->DP > MAX_DPs-1 ) { + /* misbehaving is punished! Put in the default drop probability */ + DPRINTK("\nGRED: DP %u not in the proper range fixed. New DP " + "set to default at %d\n",ctl->DP,table->def); + ctl->DP=table->def; + } + + if (table->tab[ctl->DP] == NULL) { + table->tab[ctl->DP]=kmalloc(sizeof(struct gred_sched_data), + GFP_KERNEL); + if (NULL == table->tab[ctl->DP]) + return -ENOMEM; + memset(table->tab[ctl->DP], 0, (sizeof(struct gred_sched_data))); + } + q= table->tab[ctl->DP]; + + if (table->grio) { + if (ctl->prio <=0) { + if (table->def && table->tab[table->def]) { + DPRINTK("\nGRED: DP %u does not have a prio" + "setting default to %d\n",ctl->DP, + table->tab[table->def]->prio); + q->prio=table->tab[table->def]->prio; + } else { + DPRINTK("\nGRED: DP %u does not have a prio" + " setting default to 8\n",ctl->DP); + q->prio=8; + } + } else { + q->prio=ctl->prio; + } + } else { + q->prio=8; + } + + + q->DP=ctl->DP; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->limit = ctl->limit; + q->Scell_log = ctl->Scell_log; + q->Rmask = ctl->Plog < 32 ? 
((1<<ctl->Plog) - 1) : ~0UL;
+	q->Scell_max = (255<<q->Scell_log);
+	q->qth_min = ctl->qth_min<<ctl->Wlog;
+	q->qth_max = ctl->qth_max<<ctl->Wlog;
+	q->qave=0;
+	q->backlog=0;
+	q->qcount = -1;
+	q->other=0;
+	q->forced=0;
+	q->pdrop=0;
+	q->early=0;
+
+	PSCHED_SET_PASTPERFECT(q->qidlestart);
+	memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
+
+	if (table->initd && table->grio) {
+	/* this looks ugly but it's not in the fast path */
+		for (i=0;i<table->DPs;i++) {
+			if ((!table->tab[i]) || (i==q->DP) )
+				continue;
+			if (table->tab[i]->prio == q->prio ){
+				/* WRED mode detected */
+				table->eqp=1;
+				break;
+			}
+		}
+	}
+
+	if (!table->initd) {
+		table->initd=1;
+		/*
+		the first entry also goes into the default until
+		over-written
+		*/
+
+		if (table->tab[table->def] == NULL) {
+			table->tab[table->def]=
+				kmalloc(sizeof(struct gred_sched_data), GFP_KERNEL);
+			if (NULL == table->tab[table->def])
+				return -ENOMEM;
+
+			memset(table->tab[table->def], 0,
+			       (sizeof(struct gred_sched_data)));
+		}
+		q= table->tab[table->def];
+		q->DP=table->def;
+		q->Wlog = ctl->Wlog;
+		q->Plog = ctl->Plog;
+		q->limit = ctl->limit;
+		q->Scell_log = ctl->Scell_log;
+		q->Rmask = ctl->Plog < 32 ? ((1<<ctl->Plog) - 1) : ~0UL;
+		q->Scell_max = (255<<q->Scell_log);
+		q->qth_min = ctl->qth_min<<ctl->Wlog;
+		q->qth_max = ctl->qth_max<<ctl->Wlog;
+
+		if (table->grio)
+			q->prio=table->tab[ctl->DP]->prio;
+		else
+			q->prio=8;
+
+		q->qcount = -1;
+		PSCHED_SET_PASTPERFECT(q->qidlestart);
+		memcpy(q->Stab, RTA_DATA(tb[TCA_GRED_STAB-1]), 256);
+	}
+	return 0;
+
+}
+
+static int gred_init(struct Qdisc *sch, struct rtattr *opt)
+{
+	struct gred_sched *table = qdisc_priv(sch);
+	struct tc_gred_sopt *sopt;
+	struct rtattr *tb[TCA_GRED_STAB];
+	struct rtattr *tb2[TCA_GRED_DPS];
+
+	if (opt == NULL || rtattr_parse_nested(tb, TCA_GRED_STAB, opt))
+		return -EINVAL;
+
+	if (tb[TCA_GRED_PARMS-1] == 0 && tb[TCA_GRED_STAB-1] == 0) {
+		rtattr_parse_nested(tb2, TCA_GRED_DPS, opt);
+
+		if (tb2[TCA_GRED_DPS-1] == 0)
+			return -EINVAL;
+
+		sopt = RTA_DATA(tb2[TCA_GRED_DPS-1]);
+		table->DPs=sopt->DPs;
+		table->def=sopt->def_DP;
+		table->grio=sopt->grio;
+		table->initd=0;
+		return 0;
+	}
+
+	DPRINTK("\n GRED_INIT error!\n");
+	return -EINVAL;
+}
+
+static int gred_dump(struct Qdisc *sch, struct sk_buff *skb)
+{
+	unsigned long qave;
+	struct rtattr *rta;
+	struct tc_gred_qopt *opt = NULL;
+	struct tc_gred_qopt *dst;
+	struct gred_sched *table = qdisc_priv(sch);
+	struct gred_sched_data *q;
+	int i;
+	unsigned char *b = skb->tail;
+
+	rta = (struct rtattr*)b;
+	RTA_PUT(skb, TCA_OPTIONS, 0, NULL);
+
+	opt=kmalloc(sizeof(struct tc_gred_qopt)*MAX_DPs, GFP_KERNEL);
+
+	if (opt == NULL) {
+		DPRINTK("gred_dump:failed to malloc for %Zd\n",
+			sizeof(struct tc_gred_qopt)*MAX_DPs);
+		goto rtattr_failure;
+	}
+
+	memset(opt, 0, (sizeof(struct tc_gred_qopt))*table->DPs);
+
+	if (!table->initd) {
+		DPRINTK("NO GRED Queues setup!\n");
+	}
+
+	for (i=0;i<MAX_DPs;i++) {
+		dst= &opt[i];
+		q= table->tab[i];
+
+		if (!q) {
+			/* hack -- fix at some point with proper message
+			   This is how we indicate to tc that there is no VQ
+			   at this DP */
+
+			dst->DP=MAX_DPs+i;
+			continue;
+		}
+
+		dst->limit=q->limit;
+		dst->qth_min=q->qth_min>>q->Wlog;
+		dst->qth_max=q->qth_max>>q->Wlog;
+		dst->DP=q->DP;
+		dst->backlog=q->backlog;
+		if (q->qave) {
+			if (table->eqp && table->grio) {
+				q->qidlestart=table->tab[table->def]->qidlestart;
+				q->qave=table->tab[table->def]->qave;
+			}
+			if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) {
+				long idle;
+				psched_time_t now;
+				PSCHED_GET_TIME(now);
+				idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max);
+				qave = q->qave >>
q->Stab[(idle>>q->Scell_log)&0xFF]; + dst->qave = qave >> q->Wlog; + + } else { + dst->qave = q->qave >> q->Wlog; + } + } else { + dst->qave = 0; + } + + + dst->Wlog = q->Wlog; + dst->Plog = q->Plog; + dst->Scell_log = q->Scell_log; + dst->other = q->other; + dst->forced = q->forced; + dst->early = q->early; + dst->pdrop = q->pdrop; + dst->prio = q->prio; + dst->packets=q->packetsin; + dst->bytesin=q->bytesin; + } + + RTA_PUT(skb, TCA_GRED_PARMS, sizeof(struct tc_gred_qopt)*MAX_DPs, opt); + rta->rta_len = skb->tail - b; + + kfree(opt); + return skb->len; + +rtattr_failure: + if (opt) + kfree(opt); + DPRINTK("gred_dump: FAILURE!!!!\n"); + +/* also free the opt struct here */ + skb_trim(skb, b - skb->data); + return -1; +} + +static void gred_destroy(struct Qdisc *sch) +{ + struct gred_sched *table = qdisc_priv(sch); + int i; + + for (i = 0;i < table->DPs; i++) { + if (table->tab[i]) + kfree(table->tab[i]); + } +} + +static struct Qdisc_ops gred_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "gred", + .priv_size = sizeof(struct gred_sched), + .enqueue = gred_enqueue, + .dequeue = gred_dequeue, + .requeue = gred_requeue, + .drop = gred_drop, + .init = gred_init, + .reset = gred_reset, + .destroy = gred_destroy, + .change = gred_change, + .dump = gred_dump, + .owner = THIS_MODULE, +}; + +static int __init gred_module_init(void) +{ + return register_qdisc(&gred_qdisc_ops); +} +static void __exit gred_module_exit(void) +{ + unregister_qdisc(&gred_qdisc_ops); +} +module_init(gred_module_init) +module_exit(gred_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c new file mode 100644 index 000000000000..c26764bc4103 --- /dev/null +++ b/net/sched/sch_hfsc.c @@ -0,0 +1,1822 @@ +/* + * Copyright (c) 2003 Patrick McHardy, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * 2003-10-17 - Ported from altq + */ +/* + * Copyright (c) 1997-1999 Carnegie Mellon University. All Rights Reserved. + * + * Permission to use, copy, modify, and distribute this software and + * its documentation is hereby granted (including for commercial or + * for-profit use), provided that both the copyright notice and this + * permission notice appear in all copies of the software, derivative + * works, or modified versions, and any portions thereof. + * + * THIS SOFTWARE IS EXPERIMENTAL AND IS KNOWN TO HAVE BUGS, SOME OF + * WHICH MAY HAVE SERIOUS CONSEQUENCES. CARNEGIE MELLON PROVIDES THIS + * SOFTWARE IN ITS ``AS IS'' CONDITION, AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + * DAMAGE. 
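/*
 * Illustrative aside (standalone, runnable sketch, not from this patch): the
 * scaled average-queue update used by gred_enqueue() in sch_gred.c above.
 * qave keeps the average backlog scaled by 2^Wlog, so
 * "qave += backlog - (qave >> Wlog)" moves the true average 1/2^Wlog of the
 * way toward the current backlog; the qth_min/qth_max thresholds are compared
 * in the same scaled units. The sample values are made up.
 */
#include <stdio.h>

int main(void)
{
	unsigned long qave = 0;			/* scaled average */
	const unsigned int Wlog = 3;		/* weight = 1/8 */
	const unsigned long backlog[] = { 8000, 8000, 12000, 2000, 0 };
	unsigned int i;

	for (i = 0; i < sizeof(backlog) / sizeof(backlog[0]); i++) {
		qave += backlog[i] - (qave >> Wlog);
		printf("backlog %5lu -> average %lu bytes\n",
		       backlog[i], qave >> Wlog);
	}
	return 0;
}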
+ * + * Carnegie Mellon encourages (but does not require) users of this + * software to return any improvements or extensions that they make, + * and to grant Carnegie Mellon the rights to redistribute these + * changes without encumbrance. + */ +/* + * H-FSC is described in Proceedings of SIGCOMM'97, + * "A Hierarchical Fair Service Curve Algorithm for Link-Sharing, + * Real-Time and Priority Service" + * by Ion Stoica, Hui Zhang, and T. S. Eugene Ng. + * + * Oleg Cherevko added the upperlimit for link-sharing. + * when a class has an upperlimit, the fit-time is computed from the + * upperlimit service curve. the link-sharing scheduler does not schedule + * a class whose fit-time exceeds the current time. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define HFSC_DEBUG 1 + +/* + * kernel internal service curve representation: + * coordinates are given by 64 bit unsigned integers. + * x-axis: unit is clock count. + * y-axis: unit is byte. + * + * The service curve parameters are converted to the internal + * representation. The slope values are scaled to avoid overflow. + * the inverse slope values as well as the y-projection of the 1st + * segment are kept in order to to avoid 64-bit divide operations + * that are expensive on 32-bit architectures. + */ + +struct internal_sc +{ + u64 sm1; /* scaled slope of the 1st segment */ + u64 ism1; /* scaled inverse-slope of the 1st segment */ + u64 dx; /* the x-projection of the 1st segment */ + u64 dy; /* the y-projection of the 1st segment */ + u64 sm2; /* scaled slope of the 2nd segment */ + u64 ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +/* runtime service curve */ +struct runtime_sc +{ + u64 x; /* current starting position on x-axis */ + u64 y; /* current starting position on y-axis */ + u64 sm1; /* scaled slope of the 1st segment */ + u64 ism1; /* scaled inverse-slope of the 1st segment */ + u64 dx; /* the x-projection of the 1st segment */ + u64 dy; /* the y-projection of the 1st segment */ + u64 sm2; /* scaled slope of the 2nd segment */ + u64 ism2; /* scaled inverse-slope of the 2nd segment */ +}; + +enum hfsc_class_flags +{ + HFSC_RSC = 0x1, + HFSC_FSC = 0x2, + HFSC_USC = 0x4 +}; + +struct hfsc_class +{ + u32 classid; /* class id */ + unsigned int refcnt; /* usage count */ + + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + spinlock_t *stats_lock; + unsigned int level; /* class level in hierarchy */ + struct tcf_proto *filter_list; /* filter list */ + unsigned int filter_cnt; /* filter count */ + + struct hfsc_sched *sched; /* scheduler data */ + struct hfsc_class *cl_parent; /* parent class */ + struct list_head siblings; /* sibling classes */ + struct list_head children; /* child classes */ + struct Qdisc *qdisc; /* leaf qdisc */ + + struct rb_node el_node; /* qdisc's eligible tree member */ + struct rb_root vt_tree; /* active children sorted by cl_vt */ + struct rb_node vt_node; /* parent's vt_tree member */ + struct rb_root cf_tree; /* active children sorted by cl_f */ + struct rb_node cf_node; /* parent's cf_heap member */ + struct list_head hlist; /* hash list member */ + struct list_head dlist; /* drop list member */ + + u64 cl_total; /* total work in bytes */ + u64 cl_cumul; /* cumulative work in bytes done by + real-time criteria */ + + u64 cl_d; /* deadline*/ + u64 cl_e; /* 
eligible time */ + u64 cl_vt; /* virtual time */ + u64 cl_f; /* time when this class will fit for + link-sharing, max(myf, cfmin) */ + u64 cl_myf; /* my fit-time (calculated from this + class's own upperlimit curve) */ + u64 cl_myfadj; /* my fit-time adjustment (to cancel + history dependence) */ + u64 cl_cfmin; /* earliest children's fit-time (used + with cl_myf to obtain cl_f) */ + u64 cl_cvtmin; /* minimal virtual time among the + children fit for link-sharing + (monotonic within a period) */ + u64 cl_vtadj; /* intra-period cumulative vt + adjustment */ + u64 cl_vtoff; /* inter-period cumulative vt offset */ + u64 cl_cvtmax; /* max child's vt in the last period */ + u64 cl_cvtoff; /* cumulative cvtmax of all periods */ + u64 cl_pcvtoff; /* parent's cvtoff at initalization + time */ + + struct internal_sc cl_rsc; /* internal real-time service curve */ + struct internal_sc cl_fsc; /* internal fair service curve */ + struct internal_sc cl_usc; /* internal upperlimit service curve */ + struct runtime_sc cl_deadline; /* deadline curve */ + struct runtime_sc cl_eligible; /* eligible curve */ + struct runtime_sc cl_virtual; /* virtual curve */ + struct runtime_sc cl_ulimit; /* upperlimit curve */ + + unsigned long cl_flags; /* which curves are valid */ + unsigned long cl_vtperiod; /* vt period sequence number */ + unsigned long cl_parentperiod;/* parent's vt period sequence number*/ + unsigned long cl_nactive; /* number of active children */ +}; + +#define HFSC_HSIZE 16 + +struct hfsc_sched +{ + u16 defcls; /* default class id */ + struct hfsc_class root; /* root class */ + struct list_head clhash[HFSC_HSIZE]; /* class hash */ + struct rb_root eligible; /* eligible tree */ + struct list_head droplist; /* active leaf class list (for + dropping) */ + struct sk_buff_head requeue; /* requeued packet */ + struct timer_list wd_timer; /* watchdog timer */ +}; + +/* + * macros + */ +#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY +#include +#undef PSCHED_GET_TIME +#define PSCHED_GET_TIME(stamp) \ +do { \ + struct timeval tv; \ + do_gettimeofday(&tv); \ + (stamp) = 1000000ULL * tv.tv_sec + tv.tv_usec; \ +} while (0) +#endif + +#if HFSC_DEBUG +#define ASSERT(cond) \ +do { \ + if (unlikely(!(cond))) \ + printk("assertion %s failed at %s:%i (%s)\n", \ + #cond, __FILE__, __LINE__, __FUNCTION__); \ +} while (0) +#else +#define ASSERT(cond) +#endif /* HFSC_DEBUG */ + +#define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ + + +/* + * eligible tree holds backlogged classes being sorted by their eligible times. + * there is one eligible tree per hfsc instance. 
+ */ + +static void +eltree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->sched->eligible.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, el_node); + if (cl->cl_e >= cl1->cl_e) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->el_node, parent, p); + rb_insert_color(&cl->el_node, &cl->sched->eligible); +} + +static inline void +eltree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->el_node, &cl->sched->eligible); +} + +static inline void +eltree_update(struct hfsc_class *cl) +{ + eltree_remove(cl); + eltree_insert(cl); +} + +/* find the class with the minimum deadline among the eligible classes */ +static inline struct hfsc_class * +eltree_get_mindl(struct hfsc_sched *q, u64 cur_time) +{ + struct hfsc_class *p, *cl = NULL; + struct rb_node *n; + + for (n = rb_first(&q->eligible); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, el_node); + if (p->cl_e > cur_time) + break; + if (cl == NULL || p->cl_d < cl->cl_d) + cl = p; + } + return cl; +} + +/* find the class with minimum eligible time among the eligible classes */ +static inline struct hfsc_class * +eltree_get_minel(struct hfsc_sched *q) +{ + struct rb_node *n; + + n = rb_first(&q->eligible); + if (n == NULL) + return NULL; + return rb_entry(n, struct hfsc_class, el_node); +} + +/* + * vttree holds holds backlogged child classes being sorted by their virtual + * time. each intermediate class has one vttree. + */ +static void +vttree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->vt_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, vt_node); + if (cl->cl_vt >= cl1->cl_vt) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->vt_node, parent, p); + rb_insert_color(&cl->vt_node, &cl->cl_parent->vt_tree); +} + +static inline void +vttree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->vt_node, &cl->cl_parent->vt_tree); +} + +static inline void +vttree_update(struct hfsc_class *cl) +{ + vttree_remove(cl); + vttree_insert(cl); +} + +static inline struct hfsc_class * +vttree_firstfit(struct hfsc_class *cl, u64 cur_time) +{ + struct hfsc_class *p; + struct rb_node *n; + + for (n = rb_first(&cl->vt_tree); n != NULL; n = rb_next(n)) { + p = rb_entry(n, struct hfsc_class, vt_node); + if (p->cl_f <= cur_time) + return p; + } + return NULL; +} + +/* + * get the leaf class with the minimum vt in the hierarchy + */ +static struct hfsc_class * +vttree_get_minvt(struct hfsc_class *cl, u64 cur_time) +{ + /* if root-class's cfmin is bigger than cur_time nothing to do */ + if (cl->cl_cfmin > cur_time) + return NULL; + + while (cl->level > 0) { + cl = vttree_firstfit(cl, cur_time); + if (cl == NULL) + return NULL; + /* + * update parent's cl_cvtmin. 
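/*
 * Illustrative aside (standalone, runnable sketch, not from this patch): what
 * eltree_get_mindl() above computes, with a plain array replacing the rbtree.
 * Among the classes that are already eligible (e <= cur_time) the one with
 * the smallest deadline d wins; ordering the tree by e only lets the kernel
 * stop scanning early.
 */
#include <stdio.h>
#include <stdint.h>

struct toy_class { uint64_t e, d; };

static const struct toy_class *min_deadline(const struct toy_class *c,
					    int n, uint64_t cur_time)
{
	const struct toy_class *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (c[i].e > cur_time)
			continue;	/* not eligible yet */
		if (best == NULL || c[i].d < best->d)
			best = &c[i];
	}
	return best;
}

int main(void)
{
	struct toy_class cls[] = { { 10, 40 }, { 20, 25 }, { 90, 5 } };
	const struct toy_class *cl = min_deadline(cls, 3, 50);

	printf("picked class with e=%llu d=%llu\n",
	       (unsigned long long)cl->e, (unsigned long long)cl->d);
	return 0;
}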
+ */ + if (cl->cl_parent->cl_cvtmin < cl->cl_vt) + cl->cl_parent->cl_cvtmin = cl->cl_vt; + } + return cl; +} + +static void +cftree_insert(struct hfsc_class *cl) +{ + struct rb_node **p = &cl->cl_parent->cf_tree.rb_node; + struct rb_node *parent = NULL; + struct hfsc_class *cl1; + + while (*p != NULL) { + parent = *p; + cl1 = rb_entry(parent, struct hfsc_class, cf_node); + if (cl->cl_f >= cl1->cl_f) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->cf_node, parent, p); + rb_insert_color(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_remove(struct hfsc_class *cl) +{ + rb_erase(&cl->cf_node, &cl->cl_parent->cf_tree); +} + +static inline void +cftree_update(struct hfsc_class *cl) +{ + cftree_remove(cl); + cftree_insert(cl); +} + +/* + * service curve support functions + * + * external service curve parameters + * m: bps + * d: us + * internal service curve parameters + * sm: (bytes/psched_us) << SM_SHIFT + * ism: (psched_us/byte) << ISM_SHIFT + * dx: psched_us + * + * Clock source resolution (CONFIG_NET_SCH_CLK_*) + * JIFFIES: for 48<=HZ<=1534 resolution is between 0.63us and 1.27us. + * CPU: resolution is between 0.5us and 1us. + * GETTIMEOFDAY: resolution is exactly 1us. + * + * sm and ism are scaled in order to keep effective digits. + * SM_SHIFT and ISM_SHIFT are selected to keep at least 4 effective + * digits in decimal using the following table. + * + * Note: We can afford the additional accuracy (altq hfsc keeps at most + * 3 effective digits) thanks to the fact that linux clock is bounded + * much more tightly. + * + * bits/sec 100Kbps 1Mbps 10Mbps 100Mbps 1Gbps + * ------------+------------------------------------------------------- + * bytes/0.5us 6.25e-3 62.5e-3 625e-3 6250e-e 62500e-3 + * bytes/us 12.5e-3 125e-3 1250e-3 12500e-3 125000e-3 + * bytes/1.27us 15.875e-3 158.75e-3 1587.5e-3 15875e-3 158750e-3 + * + * 0.5us/byte 160 16 1.6 0.16 0.016 + * us/byte 80 8 0.8 0.08 0.008 + * 1.27us/byte 63 6.3 0.63 0.063 0.0063 + */ +#define SM_SHIFT 20 +#define ISM_SHIFT 18 + +#define SM_MASK ((1ULL << SM_SHIFT) - 1) +#define ISM_MASK ((1ULL << ISM_SHIFT) - 1) + +static inline u64 +seg_x2y(u64 x, u64 sm) +{ + u64 y; + + /* + * compute + * y = x * sm >> SM_SHIFT + * but divide it for the upper and lower bits to avoid overflow + */ + y = (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT); + return y; +} + +static inline u64 +seg_y2x(u64 y, u64 ism) +{ + u64 x; + + if (y == 0) + x = 0; + else if (ism == HT_INFINITY) + x = HT_INFINITY; + else { + x = (y >> ISM_SHIFT) * ism + + (((y & ISM_MASK) * ism) >> ISM_SHIFT); + } + return x; +} + +/* Convert m (bps) into sm (bytes/psched us) */ +static u64 +m2sm(u32 m) +{ + u64 sm; + + sm = ((u64)m << SM_SHIFT); + sm += PSCHED_JIFFIE2US(HZ) - 1; + do_div(sm, PSCHED_JIFFIE2US(HZ)); + return sm; +} + +/* convert m (bps) into ism (psched us/byte) */ +static u64 +m2ism(u32 m) +{ + u64 ism; + + if (m == 0) + ism = HT_INFINITY; + else { + ism = ((u64)PSCHED_JIFFIE2US(HZ) << ISM_SHIFT); + ism += m - 1; + do_div(ism, m); + } + return ism; +} + +/* convert d (us) into dx (psched us) */ +static u64 +d2dx(u32 d) +{ + u64 dx; + + dx = ((u64)d * PSCHED_JIFFIE2US(HZ)); + dx += 1000000 - 1; + do_div(dx, 1000000); + return dx; +} + +/* convert sm (bytes/psched us) into m (bps) */ +static u32 +sm2m(u64 sm) +{ + u64 m; + + m = (sm * PSCHED_JIFFIE2US(HZ)) >> SM_SHIFT; + return (u32)m; +} + +/* convert dx (psched us) into d (us) */ +static u32 +dx2d(u64 dx) +{ + u64 d; + + d = dx * 1000000; + do_div(d, 
PSCHED_JIFFIE2US(HZ)); + return (u32)d; +} + +static void +sc2isc(struct tc_service_curve *sc, struct internal_sc *isc) +{ + isc->sm1 = m2sm(sc->m1); + isc->ism1 = m2ism(sc->m1); + isc->dx = d2dx(sc->d); + isc->dy = seg_x2y(isc->dx, isc->sm1); + isc->sm2 = m2sm(sc->m2); + isc->ism2 = m2ism(sc->m2); +} + +/* + * initialize the runtime service curve with the given internal + * service curve starting at (x, y). + */ +static void +rtsc_init(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) +{ + rtsc->x = x; + rtsc->y = y; + rtsc->sm1 = isc->sm1; + rtsc->ism1 = isc->ism1; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + rtsc->sm2 = isc->sm2; + rtsc->ism2 = isc->ism2; +} + +/* + * calculate the y-projection of the runtime service curve by the + * given x-projection value + */ +static u64 +rtsc_y2x(struct runtime_sc *rtsc, u64 y) +{ + u64 x; + + if (y < rtsc->y) + x = rtsc->x; + else if (y <= rtsc->y + rtsc->dy) { + /* x belongs to the 1st segment */ + if (rtsc->dy == 0) + x = rtsc->x + rtsc->dx; + else + x = rtsc->x + seg_y2x(y - rtsc->y, rtsc->ism1); + } else { + /* x belongs to the 2nd segment */ + x = rtsc->x + rtsc->dx + + seg_y2x(y - rtsc->y - rtsc->dy, rtsc->ism2); + } + return x; +} + +static u64 +rtsc_x2y(struct runtime_sc *rtsc, u64 x) +{ + u64 y; + + if (x <= rtsc->x) + y = rtsc->y; + else if (x <= rtsc->x + rtsc->dx) + /* y belongs to the 1st segment */ + y = rtsc->y + seg_x2y(x - rtsc->x, rtsc->sm1); + else + /* y belongs to the 2nd segment */ + y = rtsc->y + rtsc->dy + + seg_x2y(x - rtsc->x - rtsc->dx, rtsc->sm2); + return y; +} + +/* + * update the runtime service curve by taking the minimum of the current + * runtime service curve and the service curve starting at (x, y). + */ +static void +rtsc_min(struct runtime_sc *rtsc, struct internal_sc *isc, u64 x, u64 y) +{ + u64 y1, y2, dx, dy; + u32 dsm; + + if (isc->sm1 <= isc->sm2) { + /* service curve is convex */ + y1 = rtsc_x2y(rtsc, x); + if (y1 < y) + /* the current rtsc is smaller */ + return; + rtsc->x = x; + rtsc->y = y; + return; + } + + /* + * service curve is concave + * compute the two y values of the current rtsc + * y1: at x + * y2: at (x + dx) + */ + y1 = rtsc_x2y(rtsc, x); + if (y1 <= y) { + /* rtsc is below isc, no change to rtsc */ + return; + } + + y2 = rtsc_x2y(rtsc, x + isc->dx); + if (y2 >= y + isc->dy) { + /* rtsc is above isc, replace rtsc by isc */ + rtsc->x = x; + rtsc->y = y; + rtsc->dx = isc->dx; + rtsc->dy = isc->dy; + return; + } + + /* + * the two curves intersect + * compute the offsets (dx, dy) using the reverse + * function of seg_x2y() + * seg_x2y(dx, sm1) == seg_x2y(dx, sm2) + (y1 - y) + */ + dx = (y1 - y) << SM_SHIFT; + dsm = isc->sm1 - isc->sm2; + do_div(dx, dsm); + /* + * check if (x, y1) belongs to the 1st segment of rtsc. + * if so, add the offset. + */ + if (rtsc->x + rtsc->dx > x) + dx += rtsc->x + rtsc->dx - x; + dy = seg_x2y(dx, isc->sm1); + + rtsc->x = x; + rtsc->y = y; + rtsc->dx = dx; + rtsc->dy = dy; + return; +} + +static void +init_ed(struct hfsc_class *cl, unsigned int next_len) +{ + u64 cur_time; + + PSCHED_GET_TIME(cur_time); + + /* update the deadline curve */ + rtsc_min(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + + /* + * update the eligible curve. + * for concave, it is equal to the deadline curve. + * for convex, it is a linear curve with slope m2. 
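/*
 * Illustrative aside (standalone, runnable sketch, not from this patch): why
 * seg_x2y() above splits the multiplication. Computing (x * sm) >> SM_SHIFT
 * directly can overflow 64 bits, so the high and low parts of x are scaled
 * separately; for inputs where the naive product still fits, the two forms
 * agree, which this checks.
 */
#include <stdio.h>
#include <stdint.h>

#define SM_SHIFT 20
#define SM_MASK  ((1ULL << SM_SHIFT) - 1)

static uint64_t seg_x2y(uint64_t x, uint64_t sm)
{
	return (x >> SM_SHIFT) * sm + (((x & SM_MASK) * sm) >> SM_SHIFT);
}

int main(void)
{
	uint64_t x = 123456789ULL, sm = 987654ULL;

	printf("split: %llu, naive: %llu\n",
	       (unsigned long long)seg_x2y(x, sm),
	       (unsigned long long)((x * sm) >> SM_SHIFT));
	return 0;
}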
+ */ + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + + /* compute e and d */ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + eltree_insert(cl); +} + +static void +update_ed(struct hfsc_class *cl, unsigned int next_len) +{ + cl->cl_e = rtsc_y2x(&cl->cl_eligible, cl->cl_cumul); + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); + + eltree_update(cl); +} + +static inline void +update_d(struct hfsc_class *cl, unsigned int next_len) +{ + cl->cl_d = rtsc_y2x(&cl->cl_deadline, cl->cl_cumul + next_len); +} + +static inline void +update_cfmin(struct hfsc_class *cl) +{ + struct rb_node *n = rb_first(&cl->cf_tree); + struct hfsc_class *p; + + if (n == NULL) { + cl->cl_cfmin = 0; + return; + } + p = rb_entry(n, struct hfsc_class, cf_node); + cl->cl_cfmin = p->cl_f; +} + +static void +init_vf(struct hfsc_class *cl, unsigned int len) +{ + struct hfsc_class *max_cl; + struct rb_node *n; + u64 vt, f, cur_time; + int go_active; + + cur_time = 0; + go_active = 1; + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + if (go_active && cl->cl_nactive++ == 0) + go_active = 1; + else + go_active = 0; + + if (go_active) { + n = rb_last(&cl->cl_parent->vt_tree); + if (n != NULL) { + max_cl = rb_entry(n, struct hfsc_class,vt_node); + /* + * set vt to the average of the min and max + * classes. if the parent's period didn't + * change, don't decrease vt of the class. + */ + vt = max_cl->cl_vt; + if (cl->cl_parent->cl_cvtmin != 0) + vt = (cl->cl_parent->cl_cvtmin + vt)/2; + + if (cl->cl_parent->cl_vtperiod != + cl->cl_parentperiod || vt > cl->cl_vt) + cl->cl_vt = vt; + } else { + /* + * first child for a new parent backlog period. + * add parent's cvtmax to cvtoff to make a new + * vt (vtoff + vt) larger than the vt in the + * last period for all children. 
+ */ + vt = cl->cl_parent->cl_cvtmax; + cl->cl_parent->cl_cvtoff += vt; + cl->cl_parent->cl_cvtmax = 0; + cl->cl_parent->cl_cvtmin = 0; + cl->cl_vt = 0; + } + + cl->cl_vtoff = cl->cl_parent->cl_cvtoff - + cl->cl_pcvtoff; + + /* update the virtual curve */ + vt = cl->cl_vt + cl->cl_vtoff; + rtsc_min(&cl->cl_virtual, &cl->cl_fsc, vt, + cl->cl_total); + if (cl->cl_virtual.x == vt) { + cl->cl_virtual.x -= cl->cl_vtoff; + cl->cl_vtoff = 0; + } + cl->cl_vtadj = 0; + + cl->cl_vtperiod++; /* increment vt period */ + cl->cl_parentperiod = cl->cl_parent->cl_vtperiod; + if (cl->cl_parent->cl_nactive == 0) + cl->cl_parentperiod++; + cl->cl_f = 0; + + vttree_insert(cl); + cftree_insert(cl); + + if (cl->cl_flags & HFSC_USC) { + /* class has upper limit curve */ + if (cur_time == 0) + PSCHED_GET_TIME(cur_time); + + /* update the ulimit curve */ + rtsc_min(&cl->cl_ulimit, &cl->cl_usc, cur_time, + cl->cl_total); + /* compute myf */ + cl->cl_myf = rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); + cl->cl_myfadj = 0; + } + } + + f = max(cl->cl_myf, cl->cl_cfmin); + if (f != cl->cl_f) { + cl->cl_f = f; + cftree_update(cl); + update_cfmin(cl->cl_parent); + } + } +} + +static void +update_vf(struct hfsc_class *cl, unsigned int len, u64 cur_time) +{ + u64 f; /* , myf_bound, delta; */ + int go_passive = 0; + + if (cl->qdisc->q.qlen == 0 && cl->cl_flags & HFSC_FSC) + go_passive = 1; + + for (; cl->cl_parent != NULL; cl = cl->cl_parent) { + cl->cl_total += len; + + if (!(cl->cl_flags & HFSC_FSC) || cl->cl_nactive == 0) + continue; + + if (go_passive && --cl->cl_nactive == 0) + go_passive = 1; + else + go_passive = 0; + + if (go_passive) { + /* no more active child, going passive */ + + /* update cvtmax of the parent class */ + if (cl->cl_vt > cl->cl_parent->cl_cvtmax) + cl->cl_parent->cl_cvtmax = cl->cl_vt; + + /* remove this class from the vt tree */ + vttree_remove(cl); + + cftree_remove(cl); + update_cfmin(cl->cl_parent); + + continue; + } + + /* + * update vt and f + */ + cl->cl_vt = rtsc_y2x(&cl->cl_virtual, cl->cl_total) + - cl->cl_vtoff + cl->cl_vtadj; + + /* + * if vt of the class is smaller than cvtmin, + * the class was skipped in the past due to non-fit. + * if so, we need to adjust vtadj. + */ + if (cl->cl_vt < cl->cl_parent->cl_cvtmin) { + cl->cl_vtadj += cl->cl_parent->cl_cvtmin - cl->cl_vt; + cl->cl_vt = cl->cl_parent->cl_cvtmin; + } + + /* update the vt tree */ + vttree_update(cl); + + if (cl->cl_flags & HFSC_USC) { + cl->cl_myf = cl->cl_myfadj + rtsc_y2x(&cl->cl_ulimit, + cl->cl_total); +#if 0 + /* + * This code causes classes to stay way under their + * limit when multiple classes are used at gigabit + * speed. needs investigation. -kaber + */ + /* + * if myf lags behind by more than one clock tick + * from the current time, adjust myfadj to prevent + * a rate-limited class from going greedy. + * in a steady state under rate-limiting, myf + * fluctuates within one clock tick. 
+ */ + myf_bound = cur_time - PSCHED_JIFFIE2US(1); + if (cl->cl_myf < myf_bound) { + delta = cur_time - cl->cl_myf; + cl->cl_myfadj += delta; + cl->cl_myf += delta; + } +#endif + } + + f = max(cl->cl_myf, cl->cl_cfmin); + if (f != cl->cl_f) { + cl->cl_f = f; + cftree_update(cl); + update_cfmin(cl->cl_parent); + } + } +} + +static void +set_active(struct hfsc_class *cl, unsigned int len) +{ + if (cl->cl_flags & HFSC_RSC) + init_ed(cl, len); + if (cl->cl_flags & HFSC_FSC) + init_vf(cl, len); + + list_add_tail(&cl->dlist, &cl->sched->droplist); +} + +static void +set_passive(struct hfsc_class *cl) +{ + if (cl->cl_flags & HFSC_RSC) + eltree_remove(cl); + + list_del(&cl->dlist); + + /* + * vttree is now handled in update_vf() so that update_vf(cl, 0, 0) + * needs to be called explicitly to remove a class from vttree. + */ +} + +/* + * hack to get length of first packet in queue. + */ +static unsigned int +qdisc_peek_len(struct Qdisc *sch) +{ + struct sk_buff *skb; + unsigned int len; + + skb = sch->dequeue(sch); + if (skb == NULL) { + if (net_ratelimit()) + printk("qdisc_peek_len: non work-conserving qdisc ?\n"); + return 0; + } + len = skb->len; + if (unlikely(sch->ops->requeue(skb, sch) != NET_XMIT_SUCCESS)) { + if (net_ratelimit()) + printk("qdisc_peek_len: failed to requeue\n"); + return 0; + } + return len; +} + +static void +hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl) +{ + unsigned int len = cl->qdisc->q.qlen; + + qdisc_reset(cl->qdisc); + if (len > 0) { + update_vf(cl, 0, 0); + set_passive(cl); + sch->q.qlen -= len; + } +} + +static void +hfsc_adjust_levels(struct hfsc_class *cl) +{ + struct hfsc_class *p; + unsigned int level; + + do { + level = 0; + list_for_each_entry(p, &cl->children, siblings) { + if (p->level > level) + level = p->level; + } + cl->level = level + 1; + } while ((cl = cl->cl_parent) != NULL); +} + +static inline unsigned int +hfsc_hash(u32 h) +{ + h ^= h >> 8; + h ^= h >> 4; + + return h & (HFSC_HSIZE - 1); +} + +static inline struct hfsc_class * +hfsc_find_class(u32 classid, struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + + list_for_each_entry(cl, &q->clhash[hfsc_hash(classid)], hlist) { + if (cl->classid == classid) + return cl; + } + return NULL; +} + +static void +hfsc_change_rsc(struct hfsc_class *cl, struct tc_service_curve *rsc, + u64 cur_time) +{ + sc2isc(rsc, &cl->cl_rsc); + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, cur_time, cl->cl_cumul); + cl->cl_eligible = cl->cl_deadline; + if (cl->cl_rsc.sm1 <= cl->cl_rsc.sm2) { + cl->cl_eligible.dx = 0; + cl->cl_eligible.dy = 0; + } + cl->cl_flags |= HFSC_RSC; +} + +static void +hfsc_change_fsc(struct hfsc_class *cl, struct tc_service_curve *fsc) +{ + sc2isc(fsc, &cl->cl_fsc); + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); + cl->cl_flags |= HFSC_FSC; +} + +static void +hfsc_change_usc(struct hfsc_class *cl, struct tc_service_curve *usc, + u64 cur_time) +{ + sc2isc(usc, &cl->cl_usc); + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, cur_time, cl->cl_total); + cl->cl_flags |= HFSC_USC; +} + +static int +hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)*arg; + struct hfsc_class *parent = NULL; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct rtattr *tb[TCA_HFSC_MAX]; + struct tc_service_curve *rsc = NULL, *fsc = NULL, *usc = NULL; + u64 cur_time; + + if (opt == NULL || rtattr_parse_nested(tb, 
TCA_HFSC_MAX, opt)) + return -EINVAL; + + if (tb[TCA_HFSC_RSC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_RSC-1]) < sizeof(*rsc)) + return -EINVAL; + rsc = RTA_DATA(tb[TCA_HFSC_RSC-1]); + if (rsc->m1 == 0 && rsc->m2 == 0) + rsc = NULL; + } + + if (tb[TCA_HFSC_FSC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_FSC-1]) < sizeof(*fsc)) + return -EINVAL; + fsc = RTA_DATA(tb[TCA_HFSC_FSC-1]); + if (fsc->m1 == 0 && fsc->m2 == 0) + fsc = NULL; + } + + if (tb[TCA_HFSC_USC-1]) { + if (RTA_PAYLOAD(tb[TCA_HFSC_USC-1]) < sizeof(*usc)) + return -EINVAL; + usc = RTA_DATA(tb[TCA_HFSC_USC-1]); + if (usc->m1 == 0 && usc->m2 == 0) + usc = NULL; + } + + if (cl != NULL) { + if (parentid) { + if (cl->cl_parent && cl->cl_parent->classid != parentid) + return -EINVAL; + if (cl->cl_parent == NULL && parentid != TC_H_ROOT) + return -EINVAL; + } + PSCHED_GET_TIME(cur_time); + + sch_tree_lock(sch); + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, cur_time); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, cur_time); + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) + update_ed(cl, qdisc_peek_len(cl->qdisc)); + if (cl->cl_flags & HFSC_FSC) + update_vf(cl, 0, cur_time); + } + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_replace_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + return 0; + } + + if (parentid == TC_H_ROOT) + return -EEXIST; + + parent = &q->root; + if (parentid) { + parent = hfsc_find_class(parentid, sch); + if (parent == NULL) + return -ENOENT; + } + + if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) + return -EINVAL; + if (hfsc_find_class(classid, sch)) + return -EEXIST; + + if (rsc == NULL && fsc == NULL) + return -EINVAL; + + cl = kmalloc(sizeof(struct hfsc_class), GFP_KERNEL); + if (cl == NULL) + return -ENOBUFS; + memset(cl, 0, sizeof(struct hfsc_class)); + + if (rsc != NULL) + hfsc_change_rsc(cl, rsc, 0); + if (fsc != NULL) + hfsc_change_fsc(cl, fsc); + if (usc != NULL) + hfsc_change_usc(cl, usc, 0); + + cl->refcnt = 1; + cl->classid = classid; + cl->sched = q; + cl->cl_parent = parent; + cl->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (cl->qdisc == NULL) + cl->qdisc = &noop_qdisc; + cl->stats_lock = &sch->dev->queue_lock; + INIT_LIST_HEAD(&cl->children); + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + + sch_tree_lock(sch); + list_add_tail(&cl->hlist, &q->clhash[hfsc_hash(classid)]); + list_add_tail(&cl->siblings, &parent->children); + if (parent->level == 0) + hfsc_purge_queue(sch, parent); + hfsc_adjust_levels(parent); + cl->cl_pcvtoff = parent->cl_cvtoff; + sch_tree_unlock(sch); + +#ifdef CONFIG_NET_ESTIMATOR + if (tca[TCA_RATE-1]) + gen_new_estimator(&cl->bstats, &cl->rate_est, + cl->stats_lock, tca[TCA_RATE-1]); +#endif + *arg = (unsigned long)cl; + return 0; +} + +static void +hfsc_destroy_filters(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} + +static void +hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl) +{ + struct hfsc_sched *q = qdisc_priv(sch); + + hfsc_destroy_filters(&cl->filter_list); + qdisc_destroy(cl->qdisc); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&cl->bstats, &cl->rate_est); +#endif + if (cl != &q->root) + kfree(cl); +} + +static int +hfsc_delete_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl->level > 0 || cl->filter_cnt > 0 || cl == 
&q->root) + return -EBUSY; + + sch_tree_lock(sch); + + list_del(&cl->hlist); + list_del(&cl->siblings); + hfsc_adjust_levels(cl->cl_parent); + hfsc_purge_queue(sch, cl); + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); + + sch_tree_unlock(sch); + return 0; +} + +static struct hfsc_class * +hfsc_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + if (TC_H_MAJ(skb->priority ^ sch->handle) == 0 && + (cl = hfsc_find_class(skb->priority, sch)) != NULL) + if (cl->level == 0) + return cl; + + *qerr = NET_XMIT_DROP; + tcf = q->root.filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + if (result == TC_POLICE_SHOT) + return NULL; +#endif + if ((cl = (struct hfsc_class *)res.class) == NULL) { + if ((cl = hfsc_find_class(res.classid, sch)) == NULL) + break; /* filter selected invalid classid */ + } + + if (cl->level == 0) + return cl; /* hit leaf class */ + + /* apply inner filter chain */ + tcf = cl->filter_list; + } + + /* classification failed, try default class */ + cl = hfsc_find_class(TC_H_MAKE(TC_H_MAJ(sch->handle), q->defcls), sch); + if (cl == NULL || cl->level > 0) + return NULL; + + return cl; +} + +static int +hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + return -ENOENT; + if (cl->level > 0) + return -EINVAL; + if (new == NULL) { + new = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (new == NULL) + new = &noop_qdisc; + } + + sch_tree_lock(sch); + hfsc_purge_queue(sch, cl); + *old = xchg(&cl->qdisc, new); + sch_tree_unlock(sch); + return 0; +} + +static struct Qdisc * +hfsc_class_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl != NULL && cl->level == 0) + return cl->qdisc; + + return NULL; +} + +static unsigned long +hfsc_get_class(struct Qdisc *sch, u32 classid) +{ + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) + cl->refcnt++; + + return (unsigned long)cl; +} + +static void +hfsc_put_class(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (--cl->refcnt == 0) + hfsc_destroy_class(sch, cl); +} + +static unsigned long +hfsc_bind_tcf(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + struct hfsc_class *p = (struct hfsc_class *)parent; + struct hfsc_class *cl = hfsc_find_class(classid, sch); + + if (cl != NULL) { + if (p != NULL && p->level <= cl->level) + return 0; + cl->filter_cnt++; + } + + return (unsigned long)cl; +} + +static void +hfsc_unbind_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + + cl->filter_cnt--; +} + +static struct tcf_proto ** +hfsc_tcf_chain(struct Qdisc *sch, unsigned long arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl = (struct hfsc_class *)arg; + + if (cl == NULL) + cl = &q->root; + + return &cl->filter_list; +} + +static int +hfsc_dump_sc(struct sk_buff *skb, int attr, struct internal_sc *sc) +{ + struct tc_service_curve tsc; + + tsc.m1 = sm2m(sc->sm1); + tsc.d = dx2d(sc->dx); + tsc.m2 = sm2m(sc->sm2); + RTA_PUT(skb, attr, sizeof(tsc), &tsc); + + 
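/*
 * Illustrative aside (standalone, runnable sketch, not from this patch): the
 * 16-bit major / 16-bit minor layout of tc handles that hfsc_classify()
 * above relies on when it matches skb->priority against the qdisc handle and
 * builds the default class id from q->defcls. The macros mirror the usual
 * TC_H_MAJ/TC_H_MIN/TC_H_MAKE semantics; the handle values are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define H_MAJ(h)		((h) & 0xFFFF0000U)
#define H_MIN(h)		((h) & 0x0000FFFFU)
#define H_MAKE(maj, min)	(H_MAJ(maj) | H_MIN(min))

int main(void)
{
	uint32_t qdisc_handle = 0x00010000;	/* "1:" */
	uint16_t defcls = 0x0002;
	uint32_t def_classid = H_MAKE(qdisc_handle, defcls);	/* "1:2" */

	printf("default class id 0x%08x, same major as qdisc: %s\n",
	       (unsigned int)def_classid,
	       H_MAJ(def_classid ^ qdisc_handle) == 0 ? "yes" : "no");
	return 0;
}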
return skb->len; + + rtattr_failure: + return -1; +} + +static inline int +hfsc_dump_curves(struct sk_buff *skb, struct hfsc_class *cl) +{ + if ((cl->cl_flags & HFSC_RSC) && + (hfsc_dump_sc(skb, TCA_HFSC_RSC, &cl->cl_rsc) < 0)) + goto rtattr_failure; + + if ((cl->cl_flags & HFSC_FSC) && + (hfsc_dump_sc(skb, TCA_HFSC_FSC, &cl->cl_fsc) < 0)) + goto rtattr_failure; + + if ((cl->cl_flags & HFSC_USC) && + (hfsc_dump_sc(skb, TCA_HFSC_USC, &cl->cl_usc) < 0)) + goto rtattr_failure; + + return skb->len; + + rtattr_failure: + return -1; +} + +static int +hfsc_dump_class(struct Qdisc *sch, unsigned long arg, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + unsigned char *b = skb->tail; + struct rtattr *rta = (struct rtattr *)b; + + tcm->tcm_parent = cl->cl_parent ? cl->cl_parent->classid : TC_H_ROOT; + tcm->tcm_handle = cl->classid; + if (cl->level == 0) + tcm->tcm_info = cl->qdisc->handle; + + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + if (hfsc_dump_curves(skb, cl) < 0) + goto rtattr_failure; + rta->rta_len = skb->tail - b; + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct hfsc_class *cl = (struct hfsc_class *)arg; + struct tc_hfsc_stats xstats; + + cl->qstats.qlen = cl->qdisc->q.qlen; + xstats.level = cl->level; + xstats.period = cl->cl_vtperiod; + xstats.work = cl->cl_total; + xstats.rtwork = cl->cl_cumul; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || +#ifdef CONFIG_NET_ESTIMATOR + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || +#endif + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &xstats, sizeof(xstats)); +} + + + +static void +hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int i; + + if (arg->stop) + return; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry(cl, &q->clhash[i], hlist) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static void +hfsc_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static void +hfsc_schedule_watchdog(struct Qdisc *sch, u64 cur_time) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + u64 next_time = 0; + long delay; + + if ((cl = eltree_get_minel(q)) != NULL) + next_time = cl->cl_e; + if (q->root.cl_cfmin != 0) { + if (next_time == 0 || next_time > q->root.cl_cfmin) + next_time = q->root.cl_cfmin; + } + ASSERT(next_time != 0); + delay = next_time - cur_time; + delay = PSCHED_US2JIFFIE(delay); + + sch->flags |= TCQ_F_THROTTLED; + mod_timer(&q->wd_timer, jiffies + delay); +} + +static int +hfsc_init_qdisc(struct Qdisc *sch, struct rtattr *opt) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct tc_hfsc_qopt *qopt; + unsigned int i; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(opt); + + sch->stats_lock = &sch->dev->queue_lock; + + q->defcls = qopt->defcls; + for (i = 0; i < HFSC_HSIZE; i++) + INIT_LIST_HEAD(&q->clhash[i]); + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + skb_queue_head_init(&q->requeue); + + q->root.refcnt = 1; + q->root.classid = sch->handle; + q->root.sched = q; + q->root.qdisc = qdisc_create_dflt(sch->dev, 
&pfifo_qdisc_ops); + if (q->root.qdisc == NULL) + q->root.qdisc = &noop_qdisc; + q->root.stats_lock = &sch->dev->queue_lock; + INIT_LIST_HEAD(&q->root.children); + q->root.vt_tree = RB_ROOT; + q->root.cf_tree = RB_ROOT; + + list_add(&q->root.hlist, &q->clhash[hfsc_hash(q->root.classid)]); + + init_timer(&q->wd_timer); + q->wd_timer.function = hfsc_watchdog; + q->wd_timer.data = (unsigned long)sch; + + return 0; +} + +static int +hfsc_change_qdisc(struct Qdisc *sch, struct rtattr *opt) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct tc_hfsc_qopt *qopt; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + qopt = RTA_DATA(opt); + + sch_tree_lock(sch); + q->defcls = qopt->defcls; + sch_tree_unlock(sch); + + return 0; +} + +static void +hfsc_reset_class(struct hfsc_class *cl) +{ + cl->cl_total = 0; + cl->cl_cumul = 0; + cl->cl_d = 0; + cl->cl_e = 0; + cl->cl_vt = 0; + cl->cl_vtadj = 0; + cl->cl_vtoff = 0; + cl->cl_cvtmin = 0; + cl->cl_cvtmax = 0; + cl->cl_cvtoff = 0; + cl->cl_pcvtoff = 0; + cl->cl_vtperiod = 0; + cl->cl_parentperiod = 0; + cl->cl_f = 0; + cl->cl_myf = 0; + cl->cl_myfadj = 0; + cl->cl_cfmin = 0; + cl->cl_nactive = 0; + + cl->vt_tree = RB_ROOT; + cl->cf_tree = RB_ROOT; + qdisc_reset(cl->qdisc); + + if (cl->cl_flags & HFSC_RSC) + rtsc_init(&cl->cl_deadline, &cl->cl_rsc, 0, 0); + if (cl->cl_flags & HFSC_FSC) + rtsc_init(&cl->cl_virtual, &cl->cl_fsc, 0, 0); + if (cl->cl_flags & HFSC_USC) + rtsc_init(&cl->cl_ulimit, &cl->cl_usc, 0, 0); +} + +static void +hfsc_reset_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int i; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry(cl, &q->clhash[i], hlist) + hfsc_reset_class(cl); + } + __skb_queue_purge(&q->requeue); + q->eligible = RB_ROOT; + INIT_LIST_HEAD(&q->droplist); + del_timer(&q->wd_timer); + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen = 0; +} + +static void +hfsc_destroy_qdisc(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl, *next; + unsigned int i; + + for (i = 0; i < HFSC_HSIZE; i++) { + list_for_each_entry_safe(cl, next, &q->clhash[i], hlist) + hfsc_destroy_class(sch, cl); + } + __skb_queue_purge(&q->requeue); + del_timer(&q->wd_timer); +} + +static int +hfsc_dump_qdisc(struct Qdisc *sch, struct sk_buff *skb) +{ + struct hfsc_sched *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_hfsc_qopt qopt; + + qopt.defcls = q->defcls; + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + return skb->len; + + rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int +hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_class *cl; + unsigned int len; + int err; + + cl = hfsc_classify(skb, sch, &err); + if (cl == NULL) { + if (err == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return err; + } + + len = skb->len; + err = cl->qdisc->enqueue(skb, cl->qdisc); + if (unlikely(err != NET_XMIT_SUCCESS)) { + cl->qstats.drops++; + sch->qstats.drops++; + return err; + } + + if (cl->qdisc->q.qlen == 1) + set_active(cl, len); + + cl->bstats.packets++; + cl->bstats.bytes += len; + sch->bstats.packets++; + sch->bstats.bytes += len; + sch->q.qlen++; + + return NET_XMIT_SUCCESS; +} + +static struct sk_buff * +hfsc_dequeue(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + struct sk_buff *skb; + u64 cur_time; + unsigned int next_len; + int realtime = 0; + + if (sch->q.qlen == 0) + return NULL; + if ((skb = 
__skb_dequeue(&q->requeue))) + goto out; + + PSCHED_GET_TIME(cur_time); + + /* + * if there are eligible classes, use real-time criteria. + * find the class with the minimum deadline among + * the eligible classes. + */ + if ((cl = eltree_get_mindl(q, cur_time)) != NULL) { + realtime = 1; + } else { + /* + * use link-sharing criteria + * get the class with the minimum vt in the hierarchy + */ + cl = vttree_get_minvt(&q->root, cur_time); + if (cl == NULL) { + sch->qstats.overlimits++; + hfsc_schedule_watchdog(sch, cur_time); + return NULL; + } + } + + skb = cl->qdisc->dequeue(cl->qdisc); + if (skb == NULL) { + if (net_ratelimit()) + printk("HFSC: Non-work-conserving qdisc ?\n"); + return NULL; + } + + update_vf(cl, skb->len, cur_time); + if (realtime) + cl->cl_cumul += skb->len; + + if (cl->qdisc->q.qlen != 0) { + if (cl->cl_flags & HFSC_RSC) { + /* update ed */ + next_len = qdisc_peek_len(cl->qdisc); + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } + } else { + /* the class becomes passive */ + set_passive(cl); + } + + out: + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen--; + + return skb; +} + +static int +hfsc_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + + __skb_queue_head(&q->requeue, skb); + sch->q.qlen++; + sch->qstats.requeues++; + return NET_XMIT_SUCCESS; +} + +static unsigned int +hfsc_drop(struct Qdisc *sch) +{ + struct hfsc_sched *q = qdisc_priv(sch); + struct hfsc_class *cl; + unsigned int len; + + list_for_each_entry(cl, &q->droplist, dlist) { + if (cl->qdisc->ops->drop != NULL && + (len = cl->qdisc->ops->drop(cl->qdisc)) > 0) { + if (cl->qdisc->q.qlen == 0) { + update_vf(cl, 0, 0); + set_passive(cl); + } else { + list_move_tail(&cl->dlist, &q->droplist); + } + cl->qstats.drops++; + sch->qstats.drops++; + sch->q.qlen--; + return len; + } + } + return 0; +} + +static struct Qdisc_class_ops hfsc_class_ops = { + .change = hfsc_change_class, + .delete = hfsc_delete_class, + .graft = hfsc_graft_class, + .leaf = hfsc_class_leaf, + .get = hfsc_get_class, + .put = hfsc_put_class, + .bind_tcf = hfsc_bind_tcf, + .unbind_tcf = hfsc_unbind_tcf, + .tcf_chain = hfsc_tcf_chain, + .dump = hfsc_dump_class, + .dump_stats = hfsc_dump_class_stats, + .walk = hfsc_walk +}; + +static struct Qdisc_ops hfsc_qdisc_ops = { + .id = "hfsc", + .init = hfsc_init_qdisc, + .change = hfsc_change_qdisc, + .reset = hfsc_reset_qdisc, + .destroy = hfsc_destroy_qdisc, + .dump = hfsc_dump_qdisc, + .enqueue = hfsc_enqueue, + .dequeue = hfsc_dequeue, + .requeue = hfsc_requeue, + .drop = hfsc_drop, + .cl_ops = &hfsc_class_ops, + .priv_size = sizeof(struct hfsc_sched), + .owner = THIS_MODULE +}; + +static int __init +hfsc_init(void) +{ + return register_qdisc(&hfsc_qdisc_ops); +} + +static void __exit +hfsc_cleanup(void) +{ + unregister_qdisc(&hfsc_qdisc_ops); +} + +MODULE_LICENSE("GPL"); +module_init(hfsc_init); +module_exit(hfsc_cleanup); diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c new file mode 100644 index 000000000000..a85935e7d53d --- /dev/null +++ b/net/sched/sch_htb.c @@ -0,0 +1,1759 @@ +/* vim: ts=8 sw=8 + * net/sched/sch_htb.c Hierarchical token bucket, feed tree version + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + * Authors: Martin Devera, + * + * Credits (in time order) for older HTB versions: + * Stef Coene + * HTB support at LARTC mailing list + * Ondrej Kraus, + * found missing INIT_QDISC(htb) + * Vladimir Smelhaus, Aamer Akhter, Bert Hubert + * helped a lot to locate nasty class stall bug + * Andi Kleen, Jamal Hadi, Bert Hubert + * code review and helpful comments on shaping + * Tomasz Wrona, + * created test case so that I was able to fix nasty bug + * Wilfried Weissmann + * spotted bug in dequeue code and helped with fix + * Jiri Fojtasek + * fixed requeue routine + * and many others. thanks. + * + * $Id: sch_htb.c,v 1.25 2003/12/07 11:08:25 devik Exp devik $ + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* HTB algorithm. + Author: devik@cdi.cz + ======================================================================== + HTB is like TBF with multiple classes. It is also similar to CBQ because + it allows to assign priority to each class in hierarchy. + In fact it is another implementation of Floyd's formal sharing. + + Levels: + Each class is assigned level. Leaf has ALWAYS level 0 and root + classes have level TC_HTB_MAXDEPTH-1. Interior nodes has level + one less than their parent. +*/ + +#define HTB_HSIZE 16 /* classid hash size */ +#define HTB_EWMAC 2 /* rate average over HTB_EWMAC*HTB_HSIZE sec */ +#undef HTB_DEBUG /* compile debugging support (activated by tc tool) */ +#define HTB_RATECM 1 /* whether to use rate computer */ +#define HTB_HYSTERESIS 1/* whether to use mode hysteresis for speedup */ +#define HTB_QLOCK(S) spin_lock_bh(&(S)->dev->queue_lock) +#define HTB_QUNLOCK(S) spin_unlock_bh(&(S)->dev->queue_lock) +#define HTB_VER 0x30011 /* major must be matched with number suplied by TC as version */ + +#if HTB_VER >> 16 != TC_HTB_PROTOVER +#error "Mismatched sch_htb.c and pkt_sch.h" +#endif + +/* debugging support; S is subsystem, these are defined: + 0 - netlink messages + 1 - enqueue + 2 - drop & requeue + 3 - dequeue main + 4 - dequeue one prio DRR part + 5 - dequeue class accounting + 6 - class overlimit status computation + 7 - hint tree + 8 - event queue + 10 - rate estimator + 11 - classifier + 12 - fast dequeue cache + + L is level; 0 = none, 1 = basic info, 2 = detailed, 3 = full + q->debug uint32 contains 16 2-bit fields one for subsystem starting + from LSB + */ +#ifdef HTB_DEBUG +#define HTB_DBG_COND(S,L) (((q->debug>>(2*S))&3) >= L) +#define HTB_DBG(S,L,FMT,ARG...) if (HTB_DBG_COND(S,L)) \ + printk(KERN_DEBUG FMT,##ARG) +#define HTB_CHCL(cl) BUG_TRAP((cl)->magic == HTB_CMAGIC) +#define HTB_PASSQ q, +#define HTB_ARGQ struct htb_sched *q, +#define static +#undef __inline__ +#define __inline__ +#undef inline +#define inline +#define HTB_CMAGIC 0xFEFAFEF1 +#define htb_safe_rb_erase(N,R) do { BUG_TRAP((N)->rb_color != -1); \ + if ((N)->rb_color == -1) break; \ + rb_erase(N,R); \ + (N)->rb_color = -1; } while (0) +#else +#define HTB_DBG_COND(S,L) (0) +#define HTB_DBG(S,L,FMT,ARG...) 
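+/*
+ * Non-debug build: HTB_DBG_COND() is constant 0, the remaining debug
+ * macros expand to nothing, and htb_safe_rb_erase() reduces to a plain
+ * rb_erase() without the rb_color sanity check.
+ */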
+#define HTB_PASSQ +#define HTB_ARGQ +#define HTB_CHCL(cl) +#define htb_safe_rb_erase(N,R) rb_erase(N,R) +#endif + + +/* used internaly to keep status of single class */ +enum htb_cmode { + HTB_CANT_SEND, /* class can't send and can't borrow */ + HTB_MAY_BORROW, /* class can't send but may borrow */ + HTB_CAN_SEND /* class can send */ +}; + +/* interior & leaf nodes; props specific to leaves are marked L: */ +struct htb_class +{ +#ifdef HTB_DEBUG + unsigned magic; +#endif + /* general class parameters */ + u32 classid; + struct gnet_stats_basic bstats; + struct gnet_stats_queue qstats; + struct gnet_stats_rate_est rate_est; + struct tc_htb_xstats xstats;/* our special stats */ + int refcnt; /* usage count of this class */ + +#ifdef HTB_RATECM + /* rate measurement counters */ + unsigned long rate_bytes,sum_bytes; + unsigned long rate_packets,sum_packets; +#endif + + /* topology */ + int level; /* our level (see above) */ + struct htb_class *parent; /* parent class */ + struct list_head hlist; /* classid hash list item */ + struct list_head sibling; /* sibling list item */ + struct list_head children; /* children list */ + + union { + struct htb_class_leaf { + struct Qdisc *q; + int prio; + int aprio; + int quantum; + int deficit[TC_HTB_MAXDEPTH]; + struct list_head drop_list; + } leaf; + struct htb_class_inner { + struct rb_root feed[TC_HTB_NUMPRIO]; /* feed trees */ + struct rb_node *ptr[TC_HTB_NUMPRIO]; /* current class ptr */ + /* When class changes from state 1->2 and disconnects from + parent's feed then we lost ptr value and start from the + first child again. Here we store classid of the + last valid ptr (used when ptr is NULL). */ + u32 last_ptr_id[TC_HTB_NUMPRIO]; + } inner; + } un; + struct rb_node node[TC_HTB_NUMPRIO]; /* node for self or feed tree */ + struct rb_node pq_node; /* node for event queue */ + unsigned long pq_key; /* the same type as jiffies global */ + + int prio_activity; /* for which prios are we active */ + enum htb_cmode cmode; /* current mode of the class */ + + /* class attached filters */ + struct tcf_proto *filter_list; + int filter_cnt; + + int warned; /* only one warning about non work conserving .. */ + + /* token bucket parameters */ + struct qdisc_rate_table *rate; /* rate table of the class itself */ + struct qdisc_rate_table *ceil; /* ceiling rate (limits borrows too) */ + long buffer,cbuffer; /* token bucket depth/rate */ + long mbuffer; /* max wait time */ + long tokens,ctokens; /* current number of tokens */ + psched_time_t t_c; /* checkpoint time */ +}; + +/* TODO: maybe compute rate when size is too large .. or drop ? 
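+ *
+ * L2T() ("length to time") below maps a packet size onto transmission
+ * time through the qdisc rate table: slot = size >> cell_log indexes
+ * rate->data[], and any slot above 255 is clamped and counted in
+ * xstats.giants.  Illustrative numbers: with cell_log = 3 a 1000 byte
+ * packet hits slot 1000 >> 3 = 125, while a 3000 byte packet would
+ * overflow to slot 255.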
*/ +static __inline__ long L2T(struct htb_class *cl,struct qdisc_rate_table *rate, + int size) +{ + int slot = size >> rate->rate.cell_log; + if (slot > 255) { + cl->xstats.giants++; + slot = 255; + } + return rate->data[slot]; +} + +struct htb_sched +{ + struct list_head root; /* root classes list */ + struct list_head hash[HTB_HSIZE]; /* hashed by classid */ + struct list_head drops[TC_HTB_NUMPRIO]; /* active leaves (for drops) */ + + /* self list - roots of self generating tree */ + struct rb_root row[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + int row_mask[TC_HTB_MAXDEPTH]; + struct rb_node *ptr[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + u32 last_ptr_id[TC_HTB_MAXDEPTH][TC_HTB_NUMPRIO]; + + /* self wait list - roots of wait PQs per row */ + struct rb_root wait_pq[TC_HTB_MAXDEPTH]; + + /* time of nearest event per level (row) */ + unsigned long near_ev_cache[TC_HTB_MAXDEPTH]; + + /* cached value of jiffies in dequeue */ + unsigned long jiffies; + + /* whether we hit non-work conserving class during this dequeue; we use */ + int nwc_hit; /* this to disable mindelay complaint in dequeue */ + + int defcls; /* class where unclassified flows go to */ + u32 debug; /* subsystem debug levels */ + + /* filters for qdisc itself */ + struct tcf_proto *filter_list; + int filter_cnt; + + int rate2quantum; /* quant = rate / rate2quantum */ + psched_time_t now; /* cached dequeue time */ + struct timer_list timer; /* send delay timer */ +#ifdef HTB_RATECM + struct timer_list rttim; /* rate computer timer */ + int recmp_bucket; /* which hash bucket to recompute next */ +#endif + + /* non shaped skbs; let them go directly thru */ + struct sk_buff_head direct_queue; + int direct_qlen; /* max qlen of above */ + + long direct_pkts; +}; + +/* compute hash of size HTB_HSIZE for given handle */ +static __inline__ int htb_hash(u32 h) +{ +#if HTB_HSIZE != 16 + #error "Declare new hash for your HTB_HSIZE" +#endif + h ^= h>>8; /* stolen from cbq_hash */ + h ^= h>>4; + return h & 0xf; +} + +/* find class in global hash table using given handle */ +static __inline__ struct htb_class *htb_find(u32 handle, struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + struct list_head *p; + if (TC_H_MAJ(handle) != sch->handle) + return NULL; + + list_for_each (p,q->hash+htb_hash(handle)) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (cl->classid == handle) + return cl; + } + return NULL; +} + +/** + * htb_classify - classify a packet into class + * + * It returns NULL if the packet should be dropped or -1 if the packet + * should be passed directly thru. In all other cases leaf class is returned. + * We allow direct class selection by classid in priority. The we examine + * filters in qdisc and in inner nodes (if higher filter points to the inner + * node). If we end up with classid MAJOR:0 we enqueue the skb into special + * internal fifo (direct). These packets then go directly thru. If we still + * have no valid leaf we try to use MAJOR:default leaf. It still unsuccessfull + * then finish and return direct queue. + */ +#define HTB_DIRECT (struct htb_class*)-1 +static inline u32 htb_classid(struct htb_class *cl) +{ + return (cl && cl != HTB_DIRECT) ? 
cl->classid : TC_H_UNSPEC; +} + +static struct htb_class *htb_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl; + struct tcf_result res; + struct tcf_proto *tcf; + int result; + + /* allow to select class by setting skb->priority to valid classid; + note that nfmark can be used too by attaching filter fw with no + rules in it */ + if (skb->priority == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) selected */ + if ((cl = htb_find(skb->priority,sch)) != NULL && cl->level == 0) + return cl; + + *qerr = NET_XMIT_DROP; + tcf = q->filter_list; + while (tcf && (result = tc_classify(skb, tcf, &res)) >= 0) { +#ifdef CONFIG_NET_CLS_ACT + switch (result) { + case TC_ACT_QUEUED: + case TC_ACT_STOLEN: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + } +#elif defined(CONFIG_NET_CLS_POLICE) + if (result == TC_POLICE_SHOT) + return HTB_DIRECT; +#endif + if ((cl = (void*)res.class) == NULL) { + if (res.classid == sch->handle) + return HTB_DIRECT; /* X:0 (direct flow) */ + if ((cl = htb_find(res.classid,sch)) == NULL) + break; /* filter selected invalid classid */ + } + if (!cl->level) + return cl; /* we hit leaf; return it */ + + /* we have got inner class; apply inner filter chain */ + tcf = cl->filter_list; + } + /* classification failed; try to use default class */ + cl = htb_find(TC_H_MAKE(TC_H_MAJ(sch->handle),q->defcls),sch); + if (!cl || cl->level) + return HTB_DIRECT; /* bad default .. this is safe bet */ + return cl; +} + +#ifdef HTB_DEBUG +static void htb_next_rb_node(struct rb_node **n); +#define HTB_DUMTREE(root,memb) if(root) { \ + struct rb_node *n = (root)->rb_node; \ + while (n->rb_left) n = n->rb_left; \ + while (n) { \ + struct htb_class *cl = rb_entry(n, struct htb_class, memb); \ + printk(" %x",cl->classid); htb_next_rb_node (&n); \ + } } + +static void htb_debug_dump (struct htb_sched *q) +{ + int i,p; + printk(KERN_DEBUG "htb*g j=%lu lj=%lu\n",jiffies,q->jiffies); + /* rows */ + for (i=TC_HTB_MAXDEPTH-1;i>=0;i--) { + printk(KERN_DEBUG "htb*r%d m=%x",i,q->row_mask[i]); + for (p=0;prow[i][p].rb_node) continue; + printk(" p%d:",p); + HTB_DUMTREE(q->row[i]+p,node[p]); + } + printk("\n"); + } + /* classes */ + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *l; + list_for_each (l,q->hash+i) { + struct htb_class *cl = list_entry(l,struct htb_class,hlist); + long diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); + printk(KERN_DEBUG "htb*c%x m=%d t=%ld c=%ld pq=%lu df=%ld ql=%d " + "pa=%x f:", + cl->classid,cl->cmode,cl->tokens,cl->ctokens, + cl->pq_node.rb_color==-1?0:cl->pq_key,diff, + cl->level?0:cl->un.leaf.q->q.qlen,cl->prio_activity); + if (cl->level) + for (p=0;pun.inner.feed[p].rb_node) continue; + printk(" p%d a=%x:",p,cl->un.inner.ptr[p]?rb_entry(cl->un.inner.ptr[p], struct htb_class,node[p])->classid:0); + HTB_DUMTREE(cl->un.inner.feed+p,node[p]); + } + printk("\n"); + } + } +} +#endif +/** + * htb_add_to_id_tree - adds class to the round robin list + * + * Routine adds class to the list (actually tree) sorted by classid. + * Make sure that class is not already on such list for given prio. 
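+ * The "list" is really an rbtree keyed on classid: the walk below goes
+ * right while cl->classid is greater than the node's and left otherwise,
+ * so an in-order traversal visits classes in ascending classid order.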
+ */ +static void htb_add_to_id_tree (HTB_ARGQ struct rb_root *root, + struct htb_class *cl,int prio) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + HTB_DBG(7,3,"htb_add_id_tree cl=%X prio=%d\n",cl->classid,prio); +#ifdef HTB_DEBUG + if (cl->node[prio].rb_color != -1) { BUG_TRAP(0); return; } + HTB_CHCL(cl); + if (*p) { + struct htb_class *x = rb_entry(*p,struct htb_class,node[prio]); + HTB_CHCL(x); + } +#endif + while (*p) { + struct htb_class *c; parent = *p; + c = rb_entry(parent, struct htb_class, node[prio]); + HTB_CHCL(c); + if (cl->classid > c->classid) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->node[prio], parent, p); + rb_insert_color(&cl->node[prio], root); +} + +/** + * htb_add_to_wait_tree - adds class to the event queue with delay + * + * The class is added to priority event queue to indicate that class will + * change its mode in cl->pq_key microseconds. Make sure that class is not + * already in the queue. + */ +static void htb_add_to_wait_tree (struct htb_sched *q, + struct htb_class *cl,long delay,int debug_hint) +{ + struct rb_node **p = &q->wait_pq[cl->level].rb_node, *parent = NULL; + HTB_DBG(7,3,"htb_add_wt cl=%X key=%lu\n",cl->classid,cl->pq_key); +#ifdef HTB_DEBUG + if (cl->pq_node.rb_color != -1) { BUG_TRAP(0); return; } + HTB_CHCL(cl); + if ((delay <= 0 || delay > cl->mbuffer) && net_ratelimit()) + printk(KERN_ERR "HTB: suspicious delay in wait_tree d=%ld cl=%X h=%d\n",delay,cl->classid,debug_hint); +#endif + cl->pq_key = q->jiffies + PSCHED_US2JIFFIE(delay); + if (cl->pq_key == q->jiffies) + cl->pq_key++; + + /* update the nearest event cache */ + if (time_after(q->near_ev_cache[cl->level], cl->pq_key)) + q->near_ev_cache[cl->level] = cl->pq_key; + + while (*p) { + struct htb_class *c; parent = *p; + c = rb_entry(parent, struct htb_class, pq_node); + if (time_after_eq(cl->pq_key, c->pq_key)) + p = &parent->rb_right; + else + p = &parent->rb_left; + } + rb_link_node(&cl->pq_node, parent, p); + rb_insert_color(&cl->pq_node, &q->wait_pq[cl->level]); +} + +/** + * htb_next_rb_node - finds next node in binary tree + * + * When we are past last key we return NULL. + * Average complexity is 2 steps per call. + */ +static void htb_next_rb_node(struct rb_node **n) +{ + *n = rb_next(*n); +} + +/** + * htb_add_class_to_row - add class to its row + * + * The class is added to row at priorities marked in mask. + * It does nothing if mask == 0. + */ +static inline void htb_add_class_to_row(struct htb_sched *q, + struct htb_class *cl,int mask) +{ + HTB_DBG(7,2,"htb_addrow cl=%X mask=%X rmask=%X\n", + cl->classid,mask,q->row_mask[cl->level]); + HTB_CHCL(cl); + q->row_mask[cl->level] |= mask; + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + htb_add_to_id_tree(HTB_PASSQ q->row[cl->level]+prio,cl,prio); + } +} + +/** + * htb_remove_class_from_row - removes class from its row + * + * The class is removed from row at priorities marked in mask. + * It does nothing if mask == 0. 
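+ * ffz(~mask) picks the lowest set bit on every pass (e.g. mask 0x5
+ * visits prio 0 and then prio 2).  Whenever a per-prio rbtree becomes
+ * empty its bit is also cleared from q->row_mask[level], letting the
+ * dequeue path skip that priority entirely.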
+ */ +static __inline__ void htb_remove_class_from_row(struct htb_sched *q, + struct htb_class *cl,int mask) +{ + int m = 0; + HTB_CHCL(cl); + while (mask) { + int prio = ffz(~mask); + mask &= ~(1 << prio); + if (q->ptr[cl->level][prio] == cl->node+prio) + htb_next_rb_node(q->ptr[cl->level]+prio); + htb_safe_rb_erase(cl->node + prio,q->row[cl->level]+prio); + if (!q->row[cl->level][prio].rb_node) + m |= 1 << prio; + } + HTB_DBG(7,2,"htb_delrow cl=%X mask=%X rmask=%X maskdel=%X\n", + cl->classid,mask,q->row_mask[cl->level],m); + q->row_mask[cl->level] &= ~m; +} + +/** + * htb_activate_prios - creates active classe's feed chain + * + * The class is connected to ancestors and/or appropriate rows + * for priorities it is participating on. cl->cmode must be new + * (activated) mode. It does nothing if cl->prio_activity == 0. + */ +static void htb_activate_prios(struct htb_sched *q,struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m,mask = cl->prio_activity; + HTB_DBG(7,2,"htb_act_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); + HTB_CHCL(cl); + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + HTB_CHCL(p); + m = mask; while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.feed[prio].rb_node) + /* parent already has its feed in use so that + reset bit in mask as parent is already ok */ + mask &= ~(1 << prio); + + htb_add_to_id_tree(HTB_PASSQ p->un.inner.feed+prio,cl,prio); + } + HTB_DBG(7,3,"htb_act_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", + p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity |= mask; + cl = p; p = cl->parent; + HTB_CHCL(cl); + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_add_class_to_row(q,cl,mask); +} + +/** + * htb_deactivate_prios - remove class from feed chain + * + * cl->cmode must represent old mode (before deactivation). It does + * nothing if cl->prio_activity == 0. Class is removed from all feed + * chains and rows. + */ +static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) +{ + struct htb_class *p = cl->parent; + long m,mask = cl->prio_activity; + HTB_DBG(7,2,"htb_deact_prios cl=%X mask=%lX cmode=%d\n",cl->classid,mask,cl->cmode); + HTB_CHCL(cl); + + while (cl->cmode == HTB_MAY_BORROW && p && mask) { + m = mask; mask = 0; + while (m) { + int prio = ffz(~m); + m &= ~(1 << prio); + + if (p->un.inner.ptr[prio] == cl->node+prio) { + /* we are removing child which is pointed to from + parent feed - forget the pointer but remember + classid */ + p->un.inner.last_ptr_id[prio] = cl->classid; + p->un.inner.ptr[prio] = NULL; + } + + htb_safe_rb_erase(cl->node + prio,p->un.inner.feed + prio); + + if (!p->un.inner.feed[prio].rb_node) + mask |= 1 << prio; + } + HTB_DBG(7,3,"htb_deact_pr_aft p=%X pact=%X mask=%lX pmode=%d\n", + p->classid,p->prio_activity,mask,p->cmode); + p->prio_activity &= ~mask; + cl = p; p = cl->parent; + HTB_CHCL(cl); + } + if (cl->cmode == HTB_CAN_SEND && mask) + htb_remove_class_from_row(q,cl,mask); +} + +/** + * htb_class_mode - computes and returns current class mode + * + * It computes cl's mode at time cl->t_c+diff and returns it. If mode + * is not HTB_CAN_SEND then cl->pq_key is updated to time difference + * from now to time when cl will change its state. + * Also it is worth to note that class mode doesn't change simply + * at cl->{c,}tokens == 0 but there can rather be hysteresis of + * 0 .. -cl->{c,}buffer range. It is meant to limit number of + * mode transitions per time unit. The speed gain is about 1/6. 
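+ * Concretely: with HTB_HYSTERESIS the class only drops to HTB_CANT_SEND
+ * once ctokens would fall below -cbuffer, but must climb back to 0 to
+ * leave that state; symmetrically it needs tokens >= 0 to enter
+ * HTB_CAN_SEND yet keeps that mode down to tokens == -buffer.  Without
+ * hysteresis both thresholds are simply 0.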
+ */ +static __inline__ enum htb_cmode +htb_class_mode(struct htb_class *cl,long *diff) +{ + long toks; + + if ((toks = (cl->ctokens + *diff)) < ( +#if HTB_HYSTERESIS + cl->cmode != HTB_CANT_SEND ? -cl->cbuffer : +#endif + 0)) { + *diff = -toks; + return HTB_CANT_SEND; + } + if ((toks = (cl->tokens + *diff)) >= ( +#if HTB_HYSTERESIS + cl->cmode == HTB_CAN_SEND ? -cl->buffer : +#endif + 0)) + return HTB_CAN_SEND; + + *diff = -toks; + return HTB_MAY_BORROW; +} + +/** + * htb_change_class_mode - changes classe's mode + * + * This should be the only way how to change classe's mode under normal + * cirsumstances. Routine will update feed lists linkage, change mode + * and add class to the wait event queue if appropriate. New mode should + * be different from old one and cl->pq_key has to be valid if changing + * to mode other than HTB_CAN_SEND (see htb_add_to_wait_tree). + */ +static void +htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, long *diff) +{ + enum htb_cmode new_mode = htb_class_mode(cl,diff); + + HTB_CHCL(cl); + HTB_DBG(7,1,"htb_chging_clmode %d->%d cl=%X\n",cl->cmode,new_mode,cl->classid); + + if (new_mode == cl->cmode) + return; + + if (cl->prio_activity) { /* not necessary: speed optimization */ + if (cl->cmode != HTB_CANT_SEND) + htb_deactivate_prios(q,cl); + cl->cmode = new_mode; + if (new_mode != HTB_CANT_SEND) + htb_activate_prios(q,cl); + } else + cl->cmode = new_mode; +} + +/** + * htb_activate - inserts leaf cl into appropriate active feeds + * + * Routine learns (new) priority of leaf and activates feed chain + * for the prio. It can be called on already active leaf safely. + * It also adds leaf into droplist. + */ +static __inline__ void htb_activate(struct htb_sched *q,struct htb_class *cl) +{ + BUG_TRAP(!cl->level && cl->un.leaf.q && cl->un.leaf.q->q.qlen); + HTB_CHCL(cl); + if (!cl->prio_activity) { + cl->prio_activity = 1 << (cl->un.leaf.aprio = cl->un.leaf.prio); + htb_activate_prios(q,cl); + list_add_tail(&cl->un.leaf.drop_list,q->drops+cl->un.leaf.aprio); + } +} + +/** + * htb_deactivate - remove leaf cl from active feeds + * + * Make sure that leaf is active. In the other words it can't be called + * with non-active leaf. It also removes class from the drop list. + */ +static __inline__ void +htb_deactivate(struct htb_sched *q,struct htb_class *cl) +{ + BUG_TRAP(cl->prio_activity); + HTB_CHCL(cl); + htb_deactivate_prios(q,cl); + cl->prio_activity = 0; + list_del_init(&cl->un.leaf.drop_list); +} + +static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + int ret; + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_classify(skb,sch,&ret); + + if (cl == HTB_DIRECT) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen) { + __skb_queue_tail(&q->direct_queue, skb); + q->direct_pkts++; + } +#ifdef CONFIG_NET_CLS_ACT + } else if (!cl) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb (skb); + return ret; +#endif + } else if (cl->un.leaf.q->enqueue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { + sch->qstats.drops++; + cl->qstats.drops++; + return NET_XMIT_DROP; + } else { + cl->bstats.packets++; cl->bstats.bytes += skb->len; + htb_activate (q,cl); + } + + sch->q.qlen++; + sch->bstats.packets++; sch->bstats.bytes += skb->len; + HTB_DBG(1,1,"htb_enq_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); + return NET_XMIT_SUCCESS; +} + +/* TODO: requeuing packet charges it to policers again !! 
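+ * (htb_requeue() pushes the skb through htb_classify() a second time, so
+ * policing filters meter it twice; when the direct queue is full the
+ * requeued skb still goes in at the head and one skb is dropped from the
+ * tail, returning NET_XMIT_CN.)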
*/ +static int htb_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int ret = NET_XMIT_SUCCESS; + struct htb_class *cl = htb_classify(skb,sch, &ret); + struct sk_buff *tskb; + + if (cl == HTB_DIRECT || !cl) { + /* enqueue to helper queue */ + if (q->direct_queue.qlen < q->direct_qlen && cl) { + __skb_queue_head(&q->direct_queue, skb); + } else { + __skb_queue_head(&q->direct_queue, skb); + tskb = __skb_dequeue_tail(&q->direct_queue); + kfree_skb (tskb); + sch->qstats.drops++; + return NET_XMIT_CN; + } + } else if (cl->un.leaf.q->ops->requeue(skb, cl->un.leaf.q) != NET_XMIT_SUCCESS) { + sch->qstats.drops++; + cl->qstats.drops++; + return NET_XMIT_DROP; + } else + htb_activate (q,cl); + + sch->q.qlen++; + sch->qstats.requeues++; + HTB_DBG(1,1,"htb_req_ok cl=%X skb=%p\n",(cl && cl != HTB_DIRECT)?cl->classid:0,skb); + return NET_XMIT_SUCCESS; +} + +static void htb_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + sch->flags &= ~TCQ_F_THROTTLED; + wmb(); + netif_schedule(sch->dev); +} + +#ifdef HTB_RATECM +#define RT_GEN(D,R) R+=D-(R/HTB_EWMAC);D=0 +static void htb_rate_timer(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct htb_sched *q = qdisc_priv(sch); + struct list_head *p; + + /* lock queue so that we can muck with it */ + HTB_QLOCK(sch); + HTB_DBG(10,1,"htb_rttmr j=%ld\n",jiffies); + + q->rttim.expires = jiffies + HZ; + add_timer(&q->rttim); + + /* scan and recompute one bucket at time */ + if (++q->recmp_bucket >= HTB_HSIZE) + q->recmp_bucket = 0; + list_for_each (p,q->hash+q->recmp_bucket) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + HTB_DBG(10,2,"htb_rttmr_cl cl=%X sbyte=%lu spkt=%lu\n", + cl->classid,cl->sum_bytes,cl->sum_packets); + RT_GEN (cl->sum_bytes,cl->rate_bytes); + RT_GEN (cl->sum_packets,cl->rate_packets); + } + HTB_QUNLOCK(sch); +} +#endif + +/** + * htb_charge_class - charges amount "bytes" to leaf and ancestors + * + * Routine assumes that packet "bytes" long was dequeued from leaf cl + * borrowing from "level". It accounts bytes to ceil leaky bucket for + * leaf and all ancestors and to rate bucket for ancestors at levels + * "level" and higher. It also handles possible change of mode resulting + * from the update. Note that mode can also increase here (MAY_BORROW to + * CAN_SEND) because we can use more precise clock that event queue here. + * In such case we remove class from event queue first. 
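+ * The HTB_ACCNT() helper below is the leaky-bucket step itself: refill
+ * the bucket by the elapsed time diff, clamp it to the configured depth
+ * (buffer/cbuffer), subtract the transmit time L2T(cl, rate, bytes) and
+ * never let the level sink below 1 - mbuffer.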
+ */ +static void htb_charge_class(struct htb_sched *q,struct htb_class *cl, + int level,int bytes) +{ + long toks,diff; + enum htb_cmode old_mode; + HTB_DBG(5,1,"htb_chrg_cl cl=%X lev=%d len=%d\n",cl->classid,level,bytes); + +#define HTB_ACCNT(T,B,R) toks = diff + cl->T; \ + if (toks > cl->B) toks = cl->B; \ + toks -= L2T(cl, cl->R, bytes); \ + if (toks <= -cl->mbuffer) toks = 1-cl->mbuffer; \ + cl->T = toks + + while (cl) { + HTB_CHCL(cl); + diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); +#ifdef HTB_DEBUG + if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { + if (net_ratelimit()) + printk(KERN_ERR "HTB: bad diff in charge, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", + cl->classid, diff, +#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY + q->now.tv_sec * 1000000ULL + q->now.tv_usec, + cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, +#else + (unsigned long long) q->now, + (unsigned long long) cl->t_c, +#endif + q->jiffies); + diff = 1000; + } +#endif + if (cl->level >= level) { + if (cl->level == level) cl->xstats.lends++; + HTB_ACCNT (tokens,buffer,rate); + } else { + cl->xstats.borrows++; + cl->tokens += diff; /* we moved t_c; update tokens */ + } + HTB_ACCNT (ctokens,cbuffer,ceil); + cl->t_c = q->now; + HTB_DBG(5,2,"htb_chrg_clp cl=%X diff=%ld tok=%ld ctok=%ld\n",cl->classid,diff,cl->tokens,cl->ctokens); + + old_mode = cl->cmode; diff = 0; + htb_change_class_mode(q,cl,&diff); + if (old_mode != cl->cmode) { + if (old_mode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree (q,cl,diff,1); + } + +#ifdef HTB_RATECM + /* update rate counters */ + cl->sum_bytes += bytes; cl->sum_packets++; +#endif + + /* update byte stats except for leaves which are already updated */ + if (cl->level) { + cl->bstats.bytes += bytes; + cl->bstats.packets++; + } + cl = cl->parent; + } +} + +/** + * htb_do_events - make mode changes to classes at the level + * + * Scans event queue for pending events and applies them. Returns jiffies to + * next pending event (0 for no event in pq). + * Note: Aplied are events whose have cl->pq_key <= jiffies. 
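+ * At most 500 events are processed per call; if more are still pending
+ * the routine warns (rate limited) and asks to be run again in HZ/10.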
+ */ +static long htb_do_events(struct htb_sched *q,int level) +{ + int i; + HTB_DBG(8,1,"htb_do_events l=%d root=%p rmask=%X\n", + level,q->wait_pq[level].rb_node,q->row_mask[level]); + for (i = 0; i < 500; i++) { + struct htb_class *cl; + long diff; + struct rb_node *p = q->wait_pq[level].rb_node; + if (!p) return 0; + while (p->rb_left) p = p->rb_left; + + cl = rb_entry(p, struct htb_class, pq_node); + if (time_after(cl->pq_key, q->jiffies)) { + HTB_DBG(8,3,"htb_do_ev_ret delay=%ld\n",cl->pq_key - q->jiffies); + return cl->pq_key - q->jiffies; + } + htb_safe_rb_erase(p,q->wait_pq+level); + diff = PSCHED_TDIFF_SAFE(q->now, cl->t_c, (u32)cl->mbuffer); +#ifdef HTB_DEBUG + if (diff > cl->mbuffer || diff < 0 || PSCHED_TLESS(q->now, cl->t_c)) { + if (net_ratelimit()) + printk(KERN_ERR "HTB: bad diff in events, cl=%X diff=%lX now=%Lu then=%Lu j=%lu\n", + cl->classid, diff, +#ifdef CONFIG_NET_SCH_CLK_GETTIMEOFDAY + q->now.tv_sec * 1000000ULL + q->now.tv_usec, + cl->t_c.tv_sec * 1000000ULL + cl->t_c.tv_usec, +#else + (unsigned long long) q->now, + (unsigned long long) cl->t_c, +#endif + q->jiffies); + diff = 1000; + } +#endif + htb_change_class_mode(q,cl,&diff); + if (cl->cmode != HTB_CAN_SEND) + htb_add_to_wait_tree (q,cl,diff,2); + } + if (net_ratelimit()) + printk(KERN_WARNING "htb: too many events !\n"); + return HZ/10; +} + +/* Returns class->node+prio from id-tree where classe's id is >= id. NULL + is no such one exists. */ +static struct rb_node * +htb_id_find_next_upper(int prio,struct rb_node *n,u32 id) +{ + struct rb_node *r = NULL; + while (n) { + struct htb_class *cl = rb_entry(n,struct htb_class,node[prio]); + if (id == cl->classid) return n; + + if (id > cl->classid) { + n = n->rb_right; + } else { + r = n; + n = n->rb_left; + } + } + return r; +} + +/** + * htb_lookup_leaf - returns next leaf class in DRR order + * + * Find leaf where current feed pointers points to. 
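+ * The walk is an iterative descent with an explicit stack of
+ * (root, pptr, pid) triples, one per tree level.  A cached ptr that was
+ * invalidated (NULL) but still has a classid hint is recovered through
+ * htb_id_find_next_upper(), which returns the original or next class.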
+ */ +static struct htb_class * +htb_lookup_leaf(HTB_ARGQ struct rb_root *tree,int prio,struct rb_node **pptr,u32 *pid) +{ + int i; + struct { + struct rb_node *root; + struct rb_node **pptr; + u32 *pid; + } stk[TC_HTB_MAXDEPTH],*sp = stk; + + BUG_TRAP(tree->rb_node); + sp->root = tree->rb_node; + sp->pptr = pptr; + sp->pid = pid; + + for (i = 0; i < 65535; i++) { + HTB_DBG(4,2,"htb_lleaf ptr=%p pid=%X\n",*sp->pptr,*sp->pid); + + if (!*sp->pptr && *sp->pid) { + /* ptr was invalidated but id is valid - try to recover + the original or next ptr */ + *sp->pptr = htb_id_find_next_upper(prio,sp->root,*sp->pid); + } + *sp->pid = 0; /* ptr is valid now so that remove this hint as it + can become out of date quickly */ + if (!*sp->pptr) { /* we are at right end; rewind & go up */ + *sp->pptr = sp->root; + while ((*sp->pptr)->rb_left) + *sp->pptr = (*sp->pptr)->rb_left; + if (sp > stk) { + sp--; + BUG_TRAP(*sp->pptr); if(!*sp->pptr) return NULL; + htb_next_rb_node (sp->pptr); + } + } else { + struct htb_class *cl; + cl = rb_entry(*sp->pptr,struct htb_class,node[prio]); + HTB_CHCL(cl); + if (!cl->level) + return cl; + (++sp)->root = cl->un.inner.feed[prio].rb_node; + sp->pptr = cl->un.inner.ptr+prio; + sp->pid = cl->un.inner.last_ptr_id+prio; + } + } + BUG_TRAP(0); + return NULL; +} + +/* dequeues packet at given priority and level; call only if + you are sure that there is active class at prio/level */ +static struct sk_buff * +htb_dequeue_tree(struct htb_sched *q,int prio,int level) +{ + struct sk_buff *skb = NULL; + struct htb_class *cl,*start; + /* look initial class up in the row */ + start = cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio, + q->ptr[level]+prio,q->last_ptr_id[level]+prio); + + do { +next: + BUG_TRAP(cl); + if (!cl) return NULL; + HTB_DBG(4,1,"htb_deq_tr prio=%d lev=%d cl=%X defic=%d\n", + prio,level,cl->classid,cl->un.leaf.deficit[level]); + + /* class can be empty - it is unlikely but can be true if leaf + qdisc drops packets in enqueue routine or if someone used + graft operation on the leaf since last dequeue; + simply deactivate and skip such class */ + if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + struct htb_class *next; + htb_deactivate(q,cl); + + /* row/level might become empty */ + if ((q->row_mask[level] & (1 << prio)) == 0) + return NULL; + + next = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio, + prio,q->ptr[level]+prio,q->last_ptr_id[level]+prio); + + if (cl == start) /* fix start if we just deleted it */ + start = next; + cl = next; + goto next; + } + + if (likely((skb = cl->un.leaf.q->dequeue(cl->un.leaf.q)) != NULL)) + break; + if (!cl->warned) { + printk(KERN_WARNING "htb: class %X isn't work conserving ?!\n",cl->classid); + cl->warned = 1; + } + q->nwc_hit++; + htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + cl = htb_lookup_leaf (HTB_PASSQ q->row[level]+prio,prio,q->ptr[level]+prio, + q->last_ptr_id[level]+prio); + + } while (cl != start); + + if (likely(skb != NULL)) { + if ((cl->un.leaf.deficit[level] -= skb->len) < 0) { + HTB_DBG(4,2,"htb_next_cl oldptr=%p quant_add=%d\n", + level?cl->parent->un.inner.ptr[prio]:q->ptr[0][prio],cl->un.leaf.quantum); + cl->un.leaf.deficit[level] += cl->un.leaf.quantum; + htb_next_rb_node((level?cl->parent->un.inner.ptr:q->ptr[0])+prio); + } + /* this used to be after charge_class but this constelation + gives us slightly better performance */ + if (!cl->un.leaf.q->q.qlen) + htb_deactivate (q,cl); + htb_charge_class (q,cl,level,skb->len); + } + return skb; +} + +static void htb_delay_by(struct Qdisc 
*sch,long delay) +{ + struct htb_sched *q = qdisc_priv(sch); + if (delay <= 0) delay = 1; + if (unlikely(delay > 5*HZ)) { + if (net_ratelimit()) + printk(KERN_INFO "HTB delay %ld > 5sec\n", delay); + delay = 5*HZ; + } + /* why don't use jiffies here ? because expires can be in past */ + mod_timer(&q->timer, q->jiffies + delay); + sch->flags |= TCQ_F_THROTTLED; + sch->qstats.overlimits++; + HTB_DBG(3,1,"htb_deq t_delay=%ld\n",delay); +} + +static struct sk_buff *htb_dequeue(struct Qdisc *sch) +{ + struct sk_buff *skb = NULL; + struct htb_sched *q = qdisc_priv(sch); + int level; + long min_delay; +#ifdef HTB_DEBUG + int evs_used = 0; +#endif + + q->jiffies = jiffies; + HTB_DBG(3,1,"htb_deq dircnt=%d qlen=%d\n",skb_queue_len(&q->direct_queue), + sch->q.qlen); + + /* try to dequeue direct packets as high prio (!) to minimize cpu work */ + if ((skb = __skb_dequeue(&q->direct_queue)) != NULL) { + sch->flags &= ~TCQ_F_THROTTLED; + sch->q.qlen--; + return skb; + } + + if (!sch->q.qlen) goto fin; + PSCHED_GET_TIME(q->now); + + min_delay = LONG_MAX; + q->nwc_hit = 0; + for (level = 0; level < TC_HTB_MAXDEPTH; level++) { + /* common case optimization - skip event handler quickly */ + int m; + long delay; + if (time_after_eq(q->jiffies, q->near_ev_cache[level])) { + delay = htb_do_events(q,level); + q->near_ev_cache[level] = q->jiffies + (delay ? delay : HZ); +#ifdef HTB_DEBUG + evs_used++; +#endif + } else + delay = q->near_ev_cache[level] - q->jiffies; + + if (delay && min_delay > delay) + min_delay = delay; + m = ~q->row_mask[level]; + while (m != (int)(-1)) { + int prio = ffz (m); + m |= 1 << prio; + skb = htb_dequeue_tree(q,prio,level); + if (likely(skb != NULL)) { + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + goto fin; + } + } + } +#ifdef HTB_DEBUG + if (!q->nwc_hit && min_delay >= 10*HZ && net_ratelimit()) { + if (min_delay == LONG_MAX) { + printk(KERN_ERR "HTB: dequeue bug (%d,%lu,%lu), report it please !\n", + evs_used,q->jiffies,jiffies); + htb_debug_dump(q); + } else + printk(KERN_WARNING "HTB: mindelay=%ld, some class has " + "too small rate\n",min_delay); + } +#endif + htb_delay_by (sch,min_delay > 5*HZ ? 
5*HZ : min_delay); +fin: + HTB_DBG(3,1,"htb_deq_end %s j=%lu skb=%p\n",sch->dev->name,q->jiffies,skb); + return skb; +} + +/* try to drop from each class (by prio) until one succeed */ +static unsigned int htb_drop(struct Qdisc* sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int prio; + + for (prio = TC_HTB_NUMPRIO - 1; prio >= 0; prio--) { + struct list_head *p; + list_for_each (p,q->drops+prio) { + struct htb_class *cl = list_entry(p, struct htb_class, + un.leaf.drop_list); + unsigned int len; + if (cl->un.leaf.q->ops->drop && + (len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) { + sch->q.qlen--; + if (!cl->un.leaf.q->q.qlen) + htb_deactivate (q,cl); + return len; + } + } + } + return 0; +} + +/* reset all classes */ +/* always caled under BH & queue lock */ +static void htb_reset(struct Qdisc* sch) +{ + struct htb_sched *q = qdisc_priv(sch); + int i; + HTB_DBG(0,1,"htb_reset sch=%p, handle=%X\n",sch,sch->handle); + + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *p; + list_for_each (p,q->hash+i) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (cl->level) + memset(&cl->un.inner,0,sizeof(cl->un.inner)); + else { + if (cl->un.leaf.q) + qdisc_reset(cl->un.leaf.q); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); + } + cl->prio_activity = 0; + cl->cmode = HTB_CAN_SEND; +#ifdef HTB_DEBUG + cl->pq_node.rb_color = -1; + memset(cl->node,255,sizeof(cl->node)); +#endif + + } + } + sch->flags &= ~TCQ_F_THROTTLED; + del_timer(&q->timer); + __skb_queue_purge(&q->direct_queue); + sch->q.qlen = 0; + memset(q->row,0,sizeof(q->row)); + memset(q->row_mask,0,sizeof(q->row_mask)); + memset(q->wait_pq,0,sizeof(q->wait_pq)); + memset(q->ptr,0,sizeof(q->ptr)); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops+i); +} + +static int htb_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct htb_sched *q = qdisc_priv(sch); + struct rtattr *tb[TCA_HTB_INIT]; + struct tc_htb_glob *gopt; + int i; +#ifdef HTB_DEBUG + printk(KERN_INFO "HTB init, kernel part version %d.%d\n", + HTB_VER >> 16,HTB_VER & 0xffff); +#endif + if (!opt || rtattr_parse_nested(tb, TCA_HTB_INIT, opt) || + tb[TCA_HTB_INIT-1] == NULL || + RTA_PAYLOAD(tb[TCA_HTB_INIT-1]) < sizeof(*gopt)) { + printk(KERN_ERR "HTB: hey probably you have bad tc tool ?\n"); + return -EINVAL; + } + gopt = RTA_DATA(tb[TCA_HTB_INIT-1]); + if (gopt->version != HTB_VER >> 16) { + printk(KERN_ERR "HTB: need tc/htb version %d (minor is %d), you have %d\n", + HTB_VER >> 16,HTB_VER & 0xffff,gopt->version); + return -EINVAL; + } + q->debug = gopt->debug; + HTB_DBG(0,1,"htb_init sch=%p handle=%X r2q=%d\n",sch,sch->handle,gopt->rate2quantum); + + INIT_LIST_HEAD(&q->root); + for (i = 0; i < HTB_HSIZE; i++) + INIT_LIST_HEAD(q->hash+i); + for (i = 0; i < TC_HTB_NUMPRIO; i++) + INIT_LIST_HEAD(q->drops+i); + + init_timer(&q->timer); + skb_queue_head_init(&q->direct_queue); + + q->direct_qlen = sch->dev->tx_queue_len; + if (q->direct_qlen < 2) /* some devices have zero tx_queue_len */ + q->direct_qlen = 2; + q->timer.function = htb_timer; + q->timer.data = (unsigned long)sch; + +#ifdef HTB_RATECM + init_timer(&q->rttim); + q->rttim.function = htb_rate_timer; + q->rttim.data = (unsigned long)sch; + q->rttim.expires = jiffies + HZ; + add_timer(&q->rttim); +#endif + if ((q->rate2quantum = gopt->rate2quantum) < 1) + q->rate2quantum = 1; + q->defcls = gopt->defcls; + + return 0; +} + +static int htb_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct htb_sched *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + struct 
tc_htb_glob gopt; + HTB_DBG(0,1,"htb_dump sch=%p, handle=%X\n",sch,sch->handle); + HTB_QLOCK(sch); + gopt.direct_pkts = q->direct_pkts; + +#ifdef HTB_DEBUG + if (HTB_DBG_COND(0,2)) + htb_debug_dump(q); +#endif + gopt.version = HTB_VER; + gopt.rate2quantum = q->rate2quantum; + gopt.defcls = q->defcls; + gopt.debug = q->debug; + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + RTA_PUT(skb, TCA_HTB_INIT, sizeof(gopt), &gopt); + rta->rta_len = skb->tail - b; + HTB_QUNLOCK(sch); + return skb->len; +rtattr_failure: + HTB_QUNLOCK(sch); + skb_trim(skb, skb->tail - skb->data); + return -1; +} + +static int htb_dump_class(struct Qdisc *sch, unsigned long arg, + struct sk_buff *skb, struct tcmsg *tcm) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = qdisc_priv(sch); +#endif + struct htb_class *cl = (struct htb_class*)arg; + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_htb_opt opt; + + HTB_DBG(0,1,"htb_dump_class handle=%X clid=%X\n",sch->handle,cl->classid); + + HTB_QLOCK(sch); + tcm->tcm_parent = cl->parent ? cl->parent->classid : TC_H_ROOT; + tcm->tcm_handle = cl->classid; + if (!cl->level && cl->un.leaf.q) + tcm->tcm_info = cl->un.leaf.q->handle; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + memset (&opt,0,sizeof(opt)); + + opt.rate = cl->rate->rate; opt.buffer = cl->buffer; + opt.ceil = cl->ceil->rate; opt.cbuffer = cl->cbuffer; + opt.quantum = cl->un.leaf.quantum; opt.prio = cl->un.leaf.prio; + opt.level = cl->level; + RTA_PUT(skb, TCA_HTB_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + HTB_QUNLOCK(sch); + return skb->len; +rtattr_failure: + HTB_QUNLOCK(sch); + skb_trim(skb, b - skb->data); + return -1; +} + +static int +htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, + struct gnet_dump *d) +{ + struct htb_class *cl = (struct htb_class*)arg; + +#ifdef HTB_RATECM + cl->rate_est.bps = cl->rate_bytes/(HTB_EWMAC*HTB_HSIZE); + cl->rate_est.pps = cl->rate_packets/(HTB_EWMAC*HTB_HSIZE); +#endif + + if (!cl->level && cl->un.leaf.q) + cl->qstats.qlen = cl->un.leaf.q->q.qlen; + cl->xstats.tokens = cl->tokens; + cl->xstats.ctokens = cl->ctokens; + + if (gnet_stats_copy_basic(d, &cl->bstats) < 0 || + gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 || + gnet_stats_copy_queue(d, &cl->qstats) < 0) + return -1; + + return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats)); +} + +static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct htb_class *cl = (struct htb_class*)arg; + + if (cl && !cl->level) { + if (new == NULL && (new = qdisc_create_dflt(sch->dev, + &pfifo_qdisc_ops)) == NULL) + return -ENOBUFS; + sch_tree_lock(sch); + if ((*old = xchg(&cl->un.leaf.q, new)) != NULL) { + if (cl->prio_activity) + htb_deactivate (qdisc_priv(sch),cl); + + /* TODO: is it correct ? Why CBQ doesn't do it ? */ + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + } + sch_tree_unlock(sch); + return 0; + } + return -ENOENT; +} + +static struct Qdisc * htb_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_class *cl = (struct htb_class*)arg; + return (cl && !cl->level) ? 
cl->un.leaf.q : NULL; +} + +static unsigned long htb_get(struct Qdisc *sch, u32 classid) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = qdisc_priv(sch); +#endif + struct htb_class *cl = htb_find(classid,sch); + HTB_DBG(0,1,"htb_get clid=%X q=%p cl=%p ref=%d\n",classid,q,cl,cl?cl->refcnt:0); + if (cl) + cl->refcnt++; + return (unsigned long)cl; +} + +static void htb_destroy_filters(struct tcf_proto **fl) +{ + struct tcf_proto *tp; + + while ((tp = *fl) != NULL) { + *fl = tp->next; + tcf_destroy(tp); + } +} + +static void htb_destroy_class(struct Qdisc* sch,struct htb_class *cl) +{ + struct htb_sched *q = qdisc_priv(sch); + HTB_DBG(0,1,"htb_destrycls clid=%X ref=%d\n", cl?cl->classid:0,cl?cl->refcnt:0); + if (!cl->level) { + BUG_TRAP(cl->un.leaf.q); + sch->q.qlen -= cl->un.leaf.q->q.qlen; + qdisc_destroy(cl->un.leaf.q); + } + qdisc_put_rtab(cl->rate); + qdisc_put_rtab(cl->ceil); + + htb_destroy_filters (&cl->filter_list); + + while (!list_empty(&cl->children)) + htb_destroy_class (sch,list_entry(cl->children.next, + struct htb_class,sibling)); + + /* note: this delete may happen twice (see htb_delete) */ + list_del(&cl->hlist); + list_del(&cl->sibling); + + if (cl->prio_activity) + htb_deactivate (q,cl); + + if (cl->cmode != HTB_CAN_SEND) + htb_safe_rb_erase(&cl->pq_node,q->wait_pq+cl->level); + + kfree(cl); +} + +/* always caled under BH & queue lock */ +static void htb_destroy(struct Qdisc* sch) +{ + struct htb_sched *q = qdisc_priv(sch); + HTB_DBG(0,1,"htb_destroy q=%p\n",q); + + del_timer_sync (&q->timer); +#ifdef HTB_RATECM + del_timer_sync (&q->rttim); +#endif + /* This line used to be after htb_destroy_class call below + and surprisingly it worked in 2.4. But it must precede it + because filter need its target class alive to be able to call + unbind_filter on it (without Oops). */ + htb_destroy_filters(&q->filter_list); + + while (!list_empty(&q->root)) + htb_destroy_class (sch,list_entry(q->root.next, + struct htb_class,sibling)); + + __skb_queue_purge(&q->direct_queue); +} + +static int htb_delete(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class*)arg; + HTB_DBG(0,1,"htb_delete q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + + // TODO: why don't allow to delete subtree ? references ? does + // tc subsys quarantee us that in htb_destroy it holds no class + // refs so that we can remove children safely there ? 
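+	// For now the delete is refused (-EBUSY) while the class still has
+	// children or bound filters; the class itself is freed only when its
+	// refcnt drops to zero, either below or later in htb_put().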
+ if (!list_empty(&cl->children) || cl->filter_cnt) + return -EBUSY; + + sch_tree_lock(sch); + + /* delete from hash and active; remainder in destroy_class */ + list_del_init(&cl->hlist); + if (cl->prio_activity) + htb_deactivate (q,cl); + + if (--cl->refcnt == 0) + htb_destroy_class(sch,cl); + + sch_tree_unlock(sch); + return 0; +} + +static void htb_put(struct Qdisc *sch, unsigned long arg) +{ +#ifdef HTB_DEBUG + struct htb_sched *q = qdisc_priv(sch); +#endif + struct htb_class *cl = (struct htb_class*)arg; + HTB_DBG(0,1,"htb_put q=%p cl=%X ref=%d\n",q,cl?cl->classid:0,cl?cl->refcnt:0); + + if (--cl->refcnt == 0) + htb_destroy_class(sch,cl); +} + +static int htb_change_class(struct Qdisc *sch, u32 classid, + u32 parentid, struct rtattr **tca, unsigned long *arg) +{ + int err = -EINVAL; + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class*)*arg,*parent; + struct rtattr *opt = tca[TCA_OPTIONS-1]; + struct qdisc_rate_table *rtab = NULL, *ctab = NULL; + struct rtattr *tb[TCA_HTB_RTAB]; + struct tc_htb_opt *hopt; + + /* extract all subattrs from opt attr */ + if (!opt || rtattr_parse_nested(tb, TCA_HTB_RTAB, opt) || + tb[TCA_HTB_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_HTB_PARMS-1]) < sizeof(*hopt)) + goto failure; + + parent = parentid == TC_H_ROOT ? NULL : htb_find (parentid,sch); + + hopt = RTA_DATA(tb[TCA_HTB_PARMS-1]); + HTB_DBG(0,1,"htb_chg cl=%p(%X), clid=%X, parid=%X, opt/prio=%d, rate=%u, buff=%d, quant=%d\n", cl,cl?cl->classid:0,classid,parentid,(int)hopt->prio,hopt->rate.rate,hopt->buffer,hopt->quantum); + rtab = qdisc_get_rtab(&hopt->rate, tb[TCA_HTB_RTAB-1]); + ctab = qdisc_get_rtab(&hopt->ceil, tb[TCA_HTB_CTAB-1]); + if (!rtab || !ctab) goto failure; + + if (!cl) { /* new class */ + struct Qdisc *new_q; + /* check for valid classid */ + if (!classid || TC_H_MAJ(classid^sch->handle) || htb_find(classid,sch)) + goto failure; + + /* check maximal depth */ + if (parent && parent->parent && parent->parent->level < 2) { + printk(KERN_ERR "htb: tree is too deep\n"); + goto failure; + } + err = -ENOBUFS; + if ((cl = kmalloc(sizeof(*cl), GFP_KERNEL)) == NULL) + goto failure; + + memset(cl, 0, sizeof(*cl)); + cl->refcnt = 1; + INIT_LIST_HEAD(&cl->sibling); + INIT_LIST_HEAD(&cl->hlist); + INIT_LIST_HEAD(&cl->children); + INIT_LIST_HEAD(&cl->un.leaf.drop_list); +#ifdef HTB_DEBUG + cl->magic = HTB_CMAGIC; +#endif + + /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) + so that can't be used inside of sch_tree_lock + -- thanks to Karlis Peisenieks */ + new_q = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + sch_tree_lock(sch); + if (parent && !parent->level) { + /* turn parent into inner node */ + sch->q.qlen -= parent->un.leaf.q->q.qlen; + qdisc_destroy (parent->un.leaf.q); + if (parent->prio_activity) + htb_deactivate (q,parent); + + /* remove from evt list because of level change */ + if (parent->cmode != HTB_CAN_SEND) { + htb_safe_rb_erase(&parent->pq_node,q->wait_pq /*+0*/); + parent->cmode = HTB_CAN_SEND; + } + parent->level = (parent->parent ? parent->parent->level + : TC_HTB_MAXDEPTH) - 1; + memset (&parent->un.inner,0,sizeof(parent->un.inner)); + } + /* leaf (we) needs elementary qdisc */ + cl->un.leaf.q = new_q ? 
new_q : &noop_qdisc; + + cl->classid = classid; cl->parent = parent; + + /* set class to be in HTB_CAN_SEND state */ + cl->tokens = hopt->buffer; + cl->ctokens = hopt->cbuffer; + cl->mbuffer = 60000000; /* 1min */ + PSCHED_GET_TIME(cl->t_c); + cl->cmode = HTB_CAN_SEND; + + /* attach to the hash list and parent's family */ + list_add_tail(&cl->hlist, q->hash+htb_hash(classid)); + list_add_tail(&cl->sibling, parent ? &parent->children : &q->root); +#ifdef HTB_DEBUG + { + int i; + for (i = 0; i < TC_HTB_NUMPRIO; i++) cl->node[i].rb_color = -1; + cl->pq_node.rb_color = -1; + } +#endif + } else sch_tree_lock(sch); + + /* it used to be a nasty bug here, we have to check that node + is really leaf before changing cl->un.leaf ! */ + if (!cl->level) { + cl->un.leaf.quantum = rtab->rate.rate / q->rate2quantum; + if (!hopt->quantum && cl->un.leaf.quantum < 1000) { + printk(KERN_WARNING "HTB: quantum of class %X is small. Consider r2q change.\n", cl->classid); + cl->un.leaf.quantum = 1000; + } + if (!hopt->quantum && cl->un.leaf.quantum > 200000) { + printk(KERN_WARNING "HTB: quantum of class %X is big. Consider r2q change.\n", cl->classid); + cl->un.leaf.quantum = 200000; + } + if (hopt->quantum) + cl->un.leaf.quantum = hopt->quantum; + if ((cl->un.leaf.prio = hopt->prio) >= TC_HTB_NUMPRIO) + cl->un.leaf.prio = TC_HTB_NUMPRIO - 1; + } + + cl->buffer = hopt->buffer; + cl->cbuffer = hopt->cbuffer; + if (cl->rate) qdisc_put_rtab(cl->rate); cl->rate = rtab; + if (cl->ceil) qdisc_put_rtab(cl->ceil); cl->ceil = ctab; + sch_tree_unlock(sch); + + *arg = (unsigned long)cl; + return 0; + +failure: + if (rtab) qdisc_put_rtab(rtab); + if (ctab) qdisc_put_rtab(ctab); + return err; +} + +static struct tcf_proto **htb_find_tcf(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)arg; + struct tcf_proto **fl = cl ? &cl->filter_list : &q->filter_list; + HTB_DBG(0,2,"htb_tcf q=%p clid=%X fref=%d fl=%p\n",q,cl?cl->classid:0,cl?cl->filter_cnt:q->filter_cnt,*fl); + return fl; +} + +static unsigned long htb_bind_filter(struct Qdisc *sch, unsigned long parent, + u32 classid) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = htb_find (classid,sch); + HTB_DBG(0,2,"htb_bind q=%p clid=%X cl=%p fref=%d\n",q,classid,cl,cl?cl->filter_cnt:q->filter_cnt); + /*if (cl && !cl->level) return 0; + The line above used to be there to prevent attaching filters to + leaves. But at least tc_index filter uses this just to get class + for other reasons so that we have to allow for it. + ---- + 19.6.2002 As Werner explained it is ok - bind filter is just + another way to "lock" the class - unlike "get" this lock can + be broken by class during destroy IIUC. 
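+ Note that only filter_cnt is bumped here (not refcnt): htb_delete() refuses to remove a class while its filter_cnt is non-zero.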
+ */ + if (cl) + cl->filter_cnt++; + else + q->filter_cnt++; + return (unsigned long)cl; +} + +static void htb_unbind_filter(struct Qdisc *sch, unsigned long arg) +{ + struct htb_sched *q = qdisc_priv(sch); + struct htb_class *cl = (struct htb_class *)arg; + HTB_DBG(0,2,"htb_unbind q=%p cl=%p fref=%d\n",q,cl,cl?cl->filter_cnt:q->filter_cnt); + if (cl) + cl->filter_cnt--; + else + q->filter_cnt--; +} + +static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct htb_sched *q = qdisc_priv(sch); + int i; + + if (arg->stop) + return; + + for (i = 0; i < HTB_HSIZE; i++) { + struct list_head *p; + list_for_each (p,q->hash+i) { + struct htb_class *cl = list_entry(p,struct htb_class,hlist); + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, (unsigned long)cl, arg) < 0) { + arg->stop = 1; + return; + } + arg->count++; + } + } +} + +static struct Qdisc_class_ops htb_class_ops = { + .graft = htb_graft, + .leaf = htb_leaf, + .get = htb_get, + .put = htb_put, + .change = htb_change_class, + .delete = htb_delete, + .walk = htb_walk, + .tcf_chain = htb_find_tcf, + .bind_tcf = htb_bind_filter, + .unbind_tcf = htb_unbind_filter, + .dump = htb_dump_class, + .dump_stats = htb_dump_class_stats, +}; + +static struct Qdisc_ops htb_qdisc_ops = { + .next = NULL, + .cl_ops = &htb_class_ops, + .id = "htb", + .priv_size = sizeof(struct htb_sched), + .enqueue = htb_enqueue, + .dequeue = htb_dequeue, + .requeue = htb_requeue, + .drop = htb_drop, + .init = htb_init, + .reset = htb_reset, + .destroy = htb_destroy, + .change = NULL /* htb_change */, + .dump = htb_dump, + .owner = THIS_MODULE, +}; + +static int __init htb_module_init(void) +{ + return register_qdisc(&htb_qdisc_ops); +} +static void __exit htb_module_exit(void) +{ + unregister_qdisc(&htb_qdisc_ops); +} +module_init(htb_module_init) +module_exit(htb_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c new file mode 100644 index 000000000000..8edc32a6ad2f --- /dev/null +++ b/net/sched/sch_ingress.c @@ -0,0 +1,436 @@ +/* net/sched/sch_ingress.c - Ingress qdisc + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Jamal Hadi Salim 1999 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#undef DEBUG_INGRESS + +#ifdef DEBUG_INGRESS /* control */ +#define DPRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define DPRINTK(format,args...) +#endif + +#if 0 /* data */ +#define D2PRINTK(format,args...) printk(KERN_DEBUG format,##args) +#else +#define D2PRINTK(format,args...) 
+#endif + + +#define PRIV(sch) qdisc_priv(sch) + + +/* Thanks to Doron Oz for this hack +*/ +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER +static int nf_registered; +#endif +#endif + +struct ingress_qdisc_data { + struct Qdisc *q; + struct tcf_proto *filter_list; +}; + + +/* ------------------------- Class/flow operations ------------------------- */ + + +static int ingress_graft(struct Qdisc *sch,unsigned long arg, + struct Qdisc *new,struct Qdisc **old) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + + DPRINTK("ingress_graft(sch %p,[qdisc %p],new %p,old %p)\n", + sch, p, new, old); + DPRINTK("\n ingress_graft: You cannot add qdiscs to classes"); + return 1; +} + + +static struct Qdisc *ingress_leaf(struct Qdisc *sch, unsigned long arg) +{ + return NULL; +} + + +static unsigned long ingress_get(struct Qdisc *sch,u32 classid) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_get(sch %p,[qdisc %p],classid %x)\n", sch, p, classid); + return TC_H_MIN(classid) + 1; +} + + +static unsigned long ingress_bind_filter(struct Qdisc *sch, + unsigned long parent, u32 classid) +{ + return ingress_get(sch, classid); +} + + +static void ingress_put(struct Qdisc *sch, unsigned long cl) +{ +} + + +static int ingress_change(struct Qdisc *sch, u32 classid, u32 parent, + struct rtattr **tca, unsigned long *arg) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_change(sch %p,[qdisc %p],classid %x,parent %x)," + "arg 0x%lx\n", sch, p, classid, parent, *arg); + DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); + return 0; +} + + + +static void ingress_walk(struct Qdisc *sch,struct qdisc_walker *walker) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_walk(sch %p,[qdisc %p],walker %p)\n", sch, p, walker); + DPRINTK("No effect. sch_ingress doesn't maintain classes at the moment"); +} + + +static struct tcf_proto **ingress_find_tcf(struct Qdisc *sch,unsigned long cl) +{ + struct ingress_qdisc_data *p = PRIV(sch); + + return &p->filter_list; +} + + +/* --------------------------- Qdisc operations ---------------------------- */ + + +static int ingress_enqueue(struct sk_buff *skb,struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + struct tcf_result res; + int result; + + D2PRINTK("ingress_enqueue(skb %p,sch %p,[qdisc %p])\n", skb, sch, p); + result = tc_classify(skb, p->filter_list, &res); + D2PRINTK("result %d class 0x%04x\n", result, res.classid); + /* + * Unlike normal "enqueue" functions, ingress_enqueue returns a + * firewall FW_* code. + */ +#ifdef CONFIG_NET_CLS_ACT + sch->bstats.packets++; + sch->bstats.bytes += skb->len; + switch (result) { + case TC_ACT_SHOT: + result = TC_ACT_SHOT; + sch->qstats.drops++; + break; + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + result = TC_ACT_STOLEN; + break; + case TC_ACT_RECLASSIFY: + case TC_ACT_OK: + case TC_ACT_UNSPEC: + default: + skb->tc_index = TC_H_MIN(res.classid); + result = TC_ACT_OK; + break; + }; +/* backward compat */ +#else +#ifdef CONFIG_NET_CLS_POLICE + switch (result) { + case TC_POLICE_SHOT: + result = NF_DROP; + sch->qstats.drops++; + break; + case TC_POLICE_RECLASSIFY: /* DSCP remarking here ? 
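+ (not implemented - reclassify simply falls through and the frame is accepted, just like TC_POLICE_OK below)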
*/ + case TC_POLICE_OK: + case TC_POLICE_UNSPEC: + default: + sch->bstats.packets++; + sch->bstats.bytes += skb->len; + result = NF_ACCEPT; + break; + }; + +#else + D2PRINTK("Overriding result to ACCEPT\n"); + result = NF_ACCEPT; + sch->bstats.packets++; + sch->bstats.bytes += skb->len; +#endif +#endif + + return result; +} + + +static struct sk_buff *ingress_dequeue(struct Qdisc *sch) +{ +/* + struct ingress_qdisc_data *p = PRIV(sch); + D2PRINTK("ingress_dequeue(sch %p,[qdisc %p])\n",sch,PRIV(p)); +*/ + return NULL; +} + + +static int ingress_requeue(struct sk_buff *skb,struct Qdisc *sch) +{ +/* + struct ingress_qdisc_data *p = PRIV(sch); + D2PRINTK("ingress_requeue(skb %p,sch %p,[qdisc %p])\n",skb,sch,PRIV(p)); +*/ + return 0; +} + +static unsigned int ingress_drop(struct Qdisc *sch) +{ +#ifdef DEBUG_INGRESS + struct ingress_qdisc_data *p = PRIV(sch); +#endif + DPRINTK("ingress_drop(sch %p,[qdisc %p])\n", sch, p); + return 0; +} + +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER +static unsigned int +ing_hook(unsigned int hook, struct sk_buff **pskb, + const struct net_device *indev, + const struct net_device *outdev, + int (*okfn)(struct sk_buff *)) +{ + + struct Qdisc *q; + struct sk_buff *skb = *pskb; + struct net_device *dev = skb->dev; + int fwres=NF_ACCEPT; + + DPRINTK("ing_hook: skb %s dev=%s len=%u\n", + skb->sk ? "(owned)" : "(unowned)", + skb->dev ? (*pskb)->dev->name : "(no dev)", + skb->len); + +/* +revisit later: Use a private since lock dev->queue_lock is also +used on the egress (might slow things for an iota) +*/ + + if (dev->qdisc_ingress) { + spin_lock(&dev->queue_lock); + if ((q = dev->qdisc_ingress) != NULL) + fwres = q->enqueue(skb, q); + spin_unlock(&dev->queue_lock); + } + + return fwres; +} + +/* after ipt_filter */ +static struct nf_hook_ops ing_ops = { + .hook = ing_hook, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_FILTER + 1, +}; + +static struct nf_hook_ops ing6_ops = { + .hook = ing_hook, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_IP6_PRE_ROUTING, + .priority = NF_IP6_PRI_FILTER + 1, +}; + +#endif +#endif + +static int ingress_init(struct Qdisc *sch,struct rtattr *opt) +{ + struct ingress_qdisc_data *p = PRIV(sch); + +/* Make sure either netfilter or preferably CLS_ACT is +* compiled in */ +#ifndef CONFIG_NET_CLS_ACT +#ifndef CONFIG_NETFILTER + printk("You MUST compile classifier actions into the kernel\n"); + return -EINVAL; +#else + printk("Ingress scheduler: Classifier actions prefered over netfilter\n"); +#endif +#endif + +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER + if (!nf_registered) { + if (nf_register_hook(&ing_ops) < 0) { + printk("ingress qdisc registration error \n"); + return -EINVAL; + } + nf_registered++; + + if (nf_register_hook(&ing6_ops) < 0) { + printk("IPv6 ingress qdisc registration error, " \ + "disabling IPv6 support.\n"); + } else + nf_registered++; + } +#endif +#endif + + DPRINTK("ingress_init(sch %p,[qdisc %p],opt %p)\n",sch,p,opt); + p->q = &noop_qdisc; + return 0; +} + + +static void ingress_reset(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + + DPRINTK("ingress_reset(sch %p,[qdisc %p])\n", sch, p); + +/* +#if 0 +*/ +/* for future use */ + qdisc_reset(p->q); +/* +#endif +*/ +} + +/* ------------------------------------------------------------- */ + + +/* ------------------------------------------------------------- */ + +static void ingress_destroy(struct Qdisc *sch) +{ + struct ingress_qdisc_data *p = PRIV(sch); + struct tcf_proto 
*tp; + + DPRINTK("ingress_destroy(sch %p,[qdisc %p])\n", sch, p); + while (p->filter_list) { + tp = p->filter_list; + p->filter_list = tp->next; + tcf_destroy(tp); + } +#if 0 +/* for future use */ + qdisc_destroy(p->q); +#endif +} + + +static int ingress_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + unsigned char *b = skb->tail; + struct rtattr *rta; + + rta = (struct rtattr *) b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + rta->rta_len = skb->tail - b; + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_class_ops ingress_class_ops = { + .graft = ingress_graft, + .leaf = ingress_leaf, + .get = ingress_get, + .put = ingress_put, + .change = ingress_change, + .delete = NULL, + .walk = ingress_walk, + .tcf_chain = ingress_find_tcf, + .bind_tcf = ingress_bind_filter, + .unbind_tcf = ingress_put, + .dump = NULL, +}; + +static struct Qdisc_ops ingress_qdisc_ops = { + .next = NULL, + .cl_ops = &ingress_class_ops, + .id = "ingress", + .priv_size = sizeof(struct ingress_qdisc_data), + .enqueue = ingress_enqueue, + .dequeue = ingress_dequeue, + .requeue = ingress_requeue, + .drop = ingress_drop, + .init = ingress_init, + .reset = ingress_reset, + .destroy = ingress_destroy, + .change = NULL, + .dump = ingress_dump, + .owner = THIS_MODULE, +}; + +static int __init ingress_module_init(void) +{ + int ret = 0; + + if ((ret = register_qdisc(&ingress_qdisc_ops)) < 0) { + printk("Unable to register Ingress qdisc\n"); + return ret; + } + + return ret; +} +static void __exit ingress_module_exit(void) +{ + unregister_qdisc(&ingress_qdisc_ops); +#ifndef CONFIG_NET_CLS_ACT +#ifdef CONFIG_NETFILTER + if (nf_registered) { + nf_unregister_hook(&ing_ops); + if (nf_registered > 1) + nf_unregister_hook(&ing6_ops); + } +#endif +#endif +} +module_init(ingress_module_init) +module_exit(ingress_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c new file mode 100644 index 000000000000..31c29deb139d --- /dev/null +++ b/net/sched/sch_netem.c @@ -0,0 +1,598 @@ +/* + * net/sched/sch_netem.c Network emulator + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Many of the algorithms and ideas for this came from + * NIST Net which is not copyrighted. + * + * Authors: Stephen Hemminger + * Catalin(ux aka Dino) BOIE + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* Network Emulation Queuing algorithm. + ==================================== + + Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based + Network Emulation Tool + [2] Luigi Rizzo, DummyNet for FreeBSD + + ---------------------------------------------------------------- + + This started out as a simple way to delay outgoing packets to + test TCP but has grown to include most of the functionality + of a full blown network emulator like NISTnet. It can delay + packets and add random jitter (and correlation). The random + distribution can be loaded from a table as well to provide + normal, Pareto, or experimental curves. Packet loss, + duplication, and reordering can also be emulated. + + This qdisc does not do classification that can be handled in + layering other disciplines. 
It does not need to do bandwidth + control either since that can be handled by using token + bucket or other rate control. + + The simulator is limited by the Linux timer resolution + and will create packet bursts on the HZ boundary (1ms). +*/ + +struct netem_sched_data { + struct Qdisc *qdisc; + struct sk_buff_head delayed; + struct timer_list timer; + + u32 latency; + u32 loss; + u32 limit; + u32 counter; + u32 gap; + u32 jitter; + u32 duplicate; + + struct crndstate { + unsigned long last; + unsigned long rho; + } delay_cor, loss_cor, dup_cor; + + struct disttable { + u32 size; + s16 table[0]; + } *delay_dist; +}; + +/* Time stamp put into socket buffer control block */ +struct netem_skb_cb { + psched_time_t time_to_send; +}; + +/* init_crandom - initialize correlated random number generator + * Use entropy source for initial seed. + */ +static void init_crandom(struct crndstate *state, unsigned long rho) +{ + state->rho = rho; + state->last = net_random(); +} + +/* get_crandom - correlated random number generator + * Next number depends on last value. + * rho is scaled to avoid floating point. + */ +static unsigned long get_crandom(struct crndstate *state) +{ + u64 value, rho; + unsigned long answer; + + if (state->rho == 0) /* no correllation */ + return net_random(); + + value = net_random(); + rho = (u64)state->rho + 1; + answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; + state->last = answer; + return answer; +} + +/* tabledist - return a pseudo-randomly distributed value with mean mu and + * std deviation sigma. Uses table lookup to approximate the desired + * distribution, and a uniformly-distributed pseudo-random source. + */ +static long tabledist(unsigned long mu, long sigma, + struct crndstate *state, const struct disttable *dist) +{ + long t, x; + unsigned long rnd; + + if (sigma == 0) + return mu; + + rnd = get_crandom(state); + + /* default uniform distribution */ + if (dist == NULL) + return (rnd % (2*sigma)) - sigma + mu; + + t = dist->table[rnd % dist->size]; + x = (sigma % NETEM_DIST_SCALE) * t; + if (x >= 0) + x += NETEM_DIST_SCALE/2; + else + x -= NETEM_DIST_SCALE/2; + + return x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu; +} + +/* Put skb in the private delayed queue. 
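+ The send time is now + tabledist(latency, jitter, ...): with no distribution table loaded the extra delay is roughly uniform in [latency - jitter, latency + jitter), otherwise it follows the loaded table scaled by jitter.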
*/ +static int delay_skb(struct Qdisc *sch, struct sk_buff *skb) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct netem_skb_cb *cb = (struct netem_skb_cb *)skb->cb; + psched_tdiff_t td; + psched_time_t now; + + PSCHED_GET_TIME(now); + td = tabledist(q->latency, q->jitter, &q->delay_cor, q->delay_dist); + PSCHED_TADD2(now, td, cb->time_to_send); + + /* Always queue at tail to keep packets in order */ + if (likely(q->delayed.qlen < q->limit)) { + __skb_queue_tail(&q->delayed, skb); + if (!timer_pending(&q->timer)) { + q->timer.expires = jiffies + PSCHED_US2JIFFIE(td); + add_timer(&q->timer); + } + return NET_XMIT_SUCCESS; + } + + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb2; + int ret; + + pr_debug("netem_enqueue skb=%p @%lu\n", skb, jiffies); + + /* Random packet drop 0 => none, ~0 => all */ + if (q->loss && q->loss >= get_crandom(&q->loss_cor)) { + pr_debug("netem_enqueue: random loss\n"); + sch->qstats.drops++; + kfree_skb(skb); + return 0; /* lie about loss so TCP doesn't know */ + } + + /* Random duplication */ + if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor) + && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) { + pr_debug("netem_enqueue: dup %p\n", skb2); + + if (delay_skb(sch, skb2)) { + sch->q.qlen++; + sch->bstats.bytes += skb2->len; + sch->bstats.packets++; + } else + sch->qstats.drops++; + } + + /* If doing simple delay then gap == 0 so all packets + * go into the delayed holding queue + * otherwise if doing out of order only "1 out of gap" + * packets will be delayed. + */ + if (q->counter < q->gap) { + ++q->counter; + ret = q->qdisc->enqueue(skb, q->qdisc); + } else { + q->counter = 0; + ret = delay_skb(sch, skb); + } + + if (likely(ret == NET_XMIT_SUCCESS)) { + sch->q.qlen++; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + } else + sch->qstats.drops++; + + return ret; +} + +/* Requeue packets but don't change time stamp */ +static int netem_requeue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + } + + return ret; +} + +static unsigned int netem_drop(struct Qdisc* sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned int len; + + if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +/* Dequeue packet. + * Move all packets that are ready to send from the delay holding + * list to the underlying qdisc, then just call dequeue + */ +static struct sk_buff *netem_dequeue(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = q->qdisc->dequeue(q->qdisc); + if (skb) + sch->q.qlen--; + return skb; +} + +static void netem_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc *)arg; + struct netem_sched_data *q = qdisc_priv(sch); + struct net_device *dev = sch->dev; + struct sk_buff *skb; + psched_time_t now; + + pr_debug("netem_watchdog: fired @%lu\n", jiffies); + + spin_lock_bh(&dev->queue_lock); + PSCHED_GET_TIME(now); + + while ((skb = skb_peek(&q->delayed)) != NULL) { + const struct netem_skb_cb *cb + = (const struct netem_skb_cb *)skb->cb; + long delay + = PSCHED_US2JIFFIE(PSCHED_TDIFF(cb->time_to_send, now)); + pr_debug("netem_watchdog: skb %p@%lu %ld\n", + skb, jiffies, delay); + + /* if more time remaining? 
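+ if so, re-arm the watchdog for the head packet and stop scanning; delay_skb() queued the packets in arrival order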
*/ + if (delay > 0) { + mod_timer(&q->timer, jiffies + delay); + break; + } + __skb_unlink(skb, &q->delayed); + + if (q->qdisc->enqueue(skb, q->qdisc)) { + sch->q.qlen--; + sch->qstats.drops++; + } + } + qdisc_run(dev); + spin_unlock_bh(&dev->queue_lock); +} + +static void netem_reset(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + skb_queue_purge(&q->delayed); + + sch->q.qlen = 0; + del_timer_sync(&q->timer); +} + +static int set_fifo_limit(struct Qdisc *q, int limit) +{ + struct rtattr *rta; + int ret = -ENOMEM; + + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (rta) { + rta->rta_type = RTM_NEWQDISC; + rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + + ret = q->ops->change(q, rta); + kfree(rta); + } + return ret; +} + +/* + * Distribution data is a variable size payload containing + * signed 16 bit values. + */ +static int get_dist_table(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + unsigned long n = RTA_PAYLOAD(attr)/sizeof(__s16); + const __s16 *data = RTA_DATA(attr); + struct disttable *d; + int i; + + if (n > 65536) + return -EINVAL; + + d = kmalloc(sizeof(*d) + n*sizeof(d->table[0]), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->size = n; + for (i = 0; i < n; i++) + d->table[i] = data[i]; + + spin_lock_bh(&sch->dev->queue_lock); + d = xchg(&q->delay_dist, d); + spin_unlock_bh(&sch->dev->queue_lock); + + kfree(d); + return 0; +} + +static int get_correlation(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corr *c = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*c)) + return -EINVAL; + + init_crandom(&q->delay_cor, c->delay_corr); + init_crandom(&q->loss_cor, c->loss_corr); + init_crandom(&q->dup_cor, c->dup_corr); + return 0; +} + +static int netem_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + struct tc_netem_qopt *qopt; + int ret; + + if (opt == NULL || RTA_PAYLOAD(opt) < sizeof(*qopt)) + return -EINVAL; + + qopt = RTA_DATA(opt); + ret = set_fifo_limit(q->qdisc, qopt->limit); + if (ret) { + pr_debug("netem: can't set fifo limit\n"); + return ret; + } + + q->latency = qopt->latency; + q->jitter = qopt->jitter; + q->limit = qopt->limit; + q->gap = qopt->gap; + q->loss = qopt->loss; + q->duplicate = qopt->duplicate; + + /* Handle nested options after initial queue options. + * Should have put all options in nested format but too late now. 
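+ * Layout: struct tc_netem_qopt, optionally followed by TCA_NETEM_CORR (struct tc_netem_corr) and TCA_NETEM_DELAY_DIST (a table of signed 16 bit values).
+ * This is roughly what tc encodes for e.g. "tc qdisc add dev eth0 root netem delay 100ms 10ms 25%" (the 25% being the delay correlation).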
+ */ + if (RTA_PAYLOAD(opt) > sizeof(*qopt)) { + struct rtattr *tb[TCA_NETEM_MAX]; + if (rtattr_parse(tb, TCA_NETEM_MAX, + RTA_DATA(opt) + sizeof(*qopt), + RTA_PAYLOAD(opt) - sizeof(*qopt))) + return -EINVAL; + + if (tb[TCA_NETEM_CORR-1]) { + ret = get_correlation(sch, tb[TCA_NETEM_CORR-1]); + if (ret) + return ret; + } + + if (tb[TCA_NETEM_DELAY_DIST-1]) { + ret = get_dist_table(sch, tb[TCA_NETEM_DELAY_DIST-1]); + if (ret) + return ret; + } + } + + + return 0; +} + +static int netem_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct netem_sched_data *q = qdisc_priv(sch); + int ret; + + if (!opt) + return -EINVAL; + + skb_queue_head_init(&q->delayed); + init_timer(&q->timer); + q->timer.function = netem_watchdog; + q->timer.data = (unsigned long) sch; + q->counter = 0; + + q->qdisc = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (!q->qdisc) { + pr_debug("netem: qdisc create failed\n"); + return -ENOMEM; + } + + ret = netem_change(sch, opt); + if (ret) { + pr_debug("netem: change failed\n"); + qdisc_destroy(q->qdisc); + } + return ret; +} + +static void netem_destroy(struct Qdisc *sch) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + del_timer_sync(&q->timer); + qdisc_destroy(q->qdisc); + kfree(q->delay_dist); +} + +static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + const struct netem_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta = (struct rtattr *) b; + struct tc_netem_qopt qopt; + struct tc_netem_corr cor; + + qopt.latency = q->latency; + qopt.jitter = q->jitter; + qopt.limit = q->limit; + qopt.loss = q->loss; + qopt.gap = q->gap; + qopt.duplicate = q->duplicate; + RTA_PUT(skb, TCA_OPTIONS, sizeof(qopt), &qopt); + + cor.delay_corr = q->delay_cor.rho; + cor.loss_corr = q->loss_cor.rho; + cor.dup_corr = q->dup_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORR, sizeof(cor), &cor); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int netem_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct netem_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = xchg(&q->qdisc, new); + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct netem_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long netem_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void netem_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int netem_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int netem_delete(struct Qdisc *sch, unsigned long arg) +{ + return -ENOSYS; +} + +static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **netem_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + +static struct Qdisc_class_ops netem_class_ops 
= { + .graft = netem_graft, + .leaf = netem_leaf, + .get = netem_get, + .put = netem_put, + .change = netem_change_class, + .delete = netem_delete, + .walk = netem_walk, + .tcf_chain = netem_find_tcf, + .dump = netem_dump_class, +}; + +static struct Qdisc_ops netem_qdisc_ops = { + .id = "netem", + .cl_ops = &netem_class_ops, + .priv_size = sizeof(struct netem_sched_data), + .enqueue = netem_enqueue, + .dequeue = netem_dequeue, + .requeue = netem_requeue, + .drop = netem_drop, + .init = netem_init, + .reset = netem_reset, + .destroy = netem_destroy, + .change = netem_change, + .dump = netem_dump, + .owner = THIS_MODULE, +}; + + +static int __init netem_module_init(void) +{ + return register_qdisc(&netem_qdisc_ops); +} +static void __exit netem_module_exit(void) +{ + unregister_qdisc(&netem_qdisc_ops); +} +module_init(netem_module_init) +module_exit(netem_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c new file mode 100644 index 000000000000..3ac0f495bad0 --- /dev/null +++ b/net/sched/sch_prio.c @@ -0,0 +1,444 @@ +/* + * net/sched/sch_prio.c Simple 3-band priority "scheduler". + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * Fixes: 19990609: J Hadi Salim : + * Init -- EINVAL when opt undefined + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +struct prio_sched_data +{ + int bands; + struct tcf_proto *filter_list; + u8 prio2band[TC_PRIO_MAX+1]; + struct Qdisc *queues[TCQ_PRIO_BANDS]; +}; + + +static struct Qdisc * +prio_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr) +{ + struct prio_sched_data *q = qdisc_priv(sch); + u32 band = skb->priority; + struct tcf_result res; + + *qerr = NET_XMIT_DROP; + if (TC_H_MAJ(skb->priority) != sch->handle) { +#ifdef CONFIG_NET_CLS_ACT + switch (tc_classify(skb, q->filter_list, &res)) { + case TC_ACT_STOLEN: + case TC_ACT_QUEUED: + *qerr = NET_XMIT_SUCCESS; + case TC_ACT_SHOT: + return NULL; + }; + + if (!q->filter_list ) { +#else + if (!q->filter_list || tc_classify(skb, q->filter_list, &res)) { +#endif + if (TC_H_MAJ(band)) + band = 0; + return q->queues[q->prio2band[band&TC_PRIO_MAX]]; + } + band = res.classid; + } + band = TC_H_MIN(band) - 1; + if (band > q->bands) + return q->queues[q->prio2band[0]]; + + return q->queues[band]; +} + +static int +prio_enqueue(struct sk_buff *skb, struct Qdisc *sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = prio_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + if ((ret = qdisc->enqueue(skb, qdisc)) == NET_XMIT_SUCCESS) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + sch->q.qlen++; + return NET_XMIT_SUCCESS; + } + sch->qstats.drops++; + return ret; +} + + +static int +prio_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct Qdisc *qdisc; + int ret; + + qdisc = prio_classify(skb, sch, &ret); +#ifdef CONFIG_NET_CLS_ACT + if (qdisc == NULL) { + if (ret == NET_XMIT_DROP) + sch->qstats.drops++; + kfree_skb(skb); + return ret; + } +#endif + + if ((ret = 
qdisc->ops->requeue(skb, qdisc)) == NET_XMIT_SUCCESS) { + sch->q.qlen++; + sch->qstats.requeues++; + return 0; + } + sch->qstats.drops++; + return NET_XMIT_DROP; +} + + +static struct sk_buff * +prio_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + struct Qdisc *qdisc; + + for (prio = 0; prio < q->bands; prio++) { + qdisc = q->queues[prio]; + skb = qdisc->dequeue(qdisc); + if (skb) { + sch->q.qlen--; + return skb; + } + } + return NULL; + +} + +static unsigned int prio_drop(struct Qdisc* sch) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + unsigned int len; + struct Qdisc *qdisc; + + for (prio = q->bands-1; prio >= 0; prio--) { + qdisc = q->queues[prio]; + if ((len = qdisc->ops->drop(qdisc)) != 0) { + sch->q.qlen--; + return len; + } + } + return 0; +} + + +static void +prio_reset(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = qdisc_priv(sch); + + for (prio=0; prio<q->bands; prio++) + qdisc_reset(q->queues[prio]); + sch->q.qlen = 0; +} + +static void +prio_destroy(struct Qdisc* sch) +{ + int prio; + struct prio_sched_data *q = qdisc_priv(sch); + struct tcf_proto *tp; + + while ((tp = q->filter_list) != NULL) { + q->filter_list = tp->next; + tcf_destroy(tp); + } + + for (prio=0; prio<q->bands; prio++) + qdisc_destroy(q->queues[prio]); +} + +static int prio_tune(struct Qdisc *sch, struct rtattr *opt) +{ + struct prio_sched_data *q = qdisc_priv(sch); + struct tc_prio_qopt *qopt = RTA_DATA(opt); + int i; + + if (opt->rta_len < RTA_LENGTH(sizeof(*qopt))) + return -EINVAL; + if (qopt->bands > TCQ_PRIO_BANDS || qopt->bands < 2) + return -EINVAL; + + for (i=0; i<=TC_PRIO_MAX; i++) { + if (qopt->priomap[i] >= qopt->bands) + return -EINVAL; + } + + sch_tree_lock(sch); + q->bands = qopt->bands; + memcpy(q->prio2band, qopt->priomap, TC_PRIO_MAX+1); + + for (i=q->bands; i<TCQ_PRIO_BANDS; i++) { + struct Qdisc *child = xchg(&q->queues[i], &noop_qdisc); + if (child != &noop_qdisc) + qdisc_destroy(child); + } + sch_tree_unlock(sch); + + for (i=0; i<=TC_PRIO_MAX; i++) { + int band = q->prio2band[i]; + if (q->queues[band] == &noop_qdisc) { + struct Qdisc *child; + child = qdisc_create_dflt(sch->dev, &pfifo_qdisc_ops); + if (child) { + sch_tree_lock(sch); + child = xchg(&q->queues[band], child); + + if (child != &noop_qdisc) + qdisc_destroy(child); + sch_tree_unlock(sch); + } + } + } + return 0; +} + +static int prio_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int i; + + for (i=0; i<TCQ_PRIO_BANDS; i++) + q->queues[i] = &noop_qdisc; + + if (opt == NULL) { + return -EINVAL; + } else { + int err; + + if ((err= prio_tune(sch, opt)) != 0) + return err; + } + return 0; +} + +static int prio_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_prio_qopt opt; + + opt.bands = q->bands; + memcpy(&opt.priomap, q->prio2band, TC_PRIO_MAX+1); + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return -EINVAL; + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = q->queues[band]; + q->queues[band] = new; + sch->q.qlen -= (*old)->q.qlen; + qdisc_reset(*old); + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc * +prio_leaf(struct Qdisc *sch, unsigned long arg)
+{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = arg - 1; + + if (band >= q->bands) + return NULL; + + return q->queues[band]; +} + +static unsigned long prio_get(struct Qdisc *sch, u32 classid) +{ + struct prio_sched_data *q = qdisc_priv(sch); + unsigned long band = TC_H_MIN(classid); + + if (band - 1 >= q->bands) + return 0; + return band; +} + +static unsigned long prio_bind(struct Qdisc *sch, unsigned long parent, u32 classid) +{ + return prio_get(sch, classid); +} + + +static void prio_put(struct Qdisc *q, unsigned long cl) +{ + return; +} + +static int prio_change(struct Qdisc *sch, u32 handle, u32 parent, struct rtattr **tca, unsigned long *arg) +{ + unsigned long cl = *arg; + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + +static int prio_delete(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = qdisc_priv(sch); + if (cl - 1 > q->bands) + return -ENOENT; + return 0; +} + + +static int prio_dump_class(struct Qdisc *sch, unsigned long cl, struct sk_buff *skb, + struct tcmsg *tcm) +{ + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl - 1 > q->bands) + return -ENOENT; + tcm->tcm_handle |= TC_H_MIN(cl); + if (q->queues[cl-1]) + tcm->tcm_info = q->queues[cl-1]->handle; + return 0; +} + +static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) +{ + struct prio_sched_data *q = qdisc_priv(sch); + int prio; + + if (arg->stop) + return; + + for (prio = 0; prio < q->bands; prio++) { + if (arg->count < arg->skip) { + arg->count++; + continue; + } + if (arg->fn(sch, prio+1, arg) < 0) { + arg->stop = 1; + break; + } + arg->count++; + } +} + +static struct tcf_proto ** prio_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + struct prio_sched_data *q = qdisc_priv(sch); + + if (cl) + return NULL; + return &q->filter_list; +} + +static struct Qdisc_class_ops prio_class_ops = { + .graft = prio_graft, + .leaf = prio_leaf, + .get = prio_get, + .put = prio_put, + .change = prio_change, + .delete = prio_delete, + .walk = prio_walk, + .tcf_chain = prio_find_tcf, + .bind_tcf = prio_bind, + .unbind_tcf = prio_put, + .dump = prio_dump_class, +}; + +static struct Qdisc_ops prio_qdisc_ops = { + .next = NULL, + .cl_ops = &prio_class_ops, + .id = "prio", + .priv_size = sizeof(struct prio_sched_data), + .enqueue = prio_enqueue, + .dequeue = prio_dequeue, + .requeue = prio_requeue, + .drop = prio_drop, + .init = prio_init, + .reset = prio_reset, + .destroy = prio_destroy, + .change = prio_tune, + .dump = prio_dump, + .owner = THIS_MODULE, +}; + +static int __init prio_module_init(void) +{ + return register_qdisc(&prio_qdisc_ops); +} + +static void __exit prio_module_exit(void) +{ + unregister_qdisc(&prio_qdisc_ops); +} + +module_init(prio_module_init) +module_exit(prio_module_exit) + +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c new file mode 100644 index 000000000000..664d0e47374f --- /dev/null +++ b/net/sched/sch_red.c @@ -0,0 +1,459 @@ +/* + * net/sched/sch_red.c Random Early Detection queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * + * Changes: + * J Hadi Salim 980914: computation fixes + * Alexey Makarenko 990814: qave on idle link was calculated incorrectly. 
+ * J Hadi Salim 980816: ECN support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Random Early Detection (RED) algorithm. + ======================================= + + Source: Sally Floyd and Van Jacobson, "Random Early Detection Gateways + for Congestion Avoidance", 1993, IEEE/ACM Transactions on Networking. + + This file codes a "divisionless" version of RED algorithm + as written down in Fig.17 of the paper. + +Short description. +------------------ + + When a new packet arrives we calculate the average queue length: + + avg = (1-W)*avg + W*current_queue_len, + + W is the filter time constant (chosen as 2^(-Wlog)), it controls + the inertia of the algorithm. To allow larger bursts, W should be + decreased. + + if (avg > th_max) -> packet marked (dropped). + if (avg < th_min) -> packet passes. + if (th_min < avg < th_max) we calculate probability: + + Pb = max_P * (avg - th_min)/(th_max-th_min) + + and mark (drop) packet with this probability. + Pb changes from 0 (at avg==th_min) to max_P (avg==th_max). + max_P should be small (not 1), usually 0.01..0.02 is good value. + + max_P is chosen as a number, so that max_P/(th_max-th_min) + is a negative power of two in order arithmetics to contain + only shifts. + + + Parameters, settable by user: + ----------------------------- + + limit - bytes (must be > qth_max + burst) + + Hard limit on queue length, should be chosen >qth_max + to allow packet bursts. This parameter does not + affect the algorithms behaviour and can be chosen + arbitrarily high (well, less than ram size) + Really, this limit will never be reached + if RED works correctly. + + qth_min - bytes (should be < qth_max/2) + qth_max - bytes (should be at least 2*qth_min and less limit) + Wlog - bits (<32) log(1/W). + Plog - bits (<32) + + Plog is related to max_P by formula: + + max_P = (qth_max-qth_min)/2^Plog; + + F.e. if qth_max=128K and qth_min=32K, then Plog=22 + corresponds to max_P=0.02 + + Scell_log + Stab + + Lookup table for log((1-W)^(t/t_ave). + + +NOTES: + +Upper bound on W. +----------------- + + If you want to allow bursts of L packets of size S, + you should choose W: + + L + 1 - th_min/S < (1-(1-W)^L)/W + + th_min/S = 32 th_min/S = 4 + + log(W) L + -1 33 + -2 35 + -3 39 + -4 46 + -5 57 + -6 75 + -7 101 + -8 135 + -9 190 + etc. 
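+ F.e. to absorb bursts of about 100 packets at th_min/S = 32 you need log(W) = -7 or below, i.e. Wlog >= 7 (first column above).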
+ */ + +struct red_sched_data +{ +/* Parameters */ + u32 limit; /* HARD maximal queue length */ + u32 qth_min; /* Min average length threshold: A scaled */ + u32 qth_max; /* Max average length threshold: A scaled */ + u32 Rmask; + u32 Scell_max; + unsigned char flags; + char Wlog; /* log(W) */ + char Plog; /* random number bits */ + char Scell_log; + u8 Stab[256]; + +/* Variables */ + unsigned long qave; /* Average queue length: A scaled */ + int qcount; /* Packets since last random number generation */ + u32 qR; /* Cached random number */ + + psched_time_t qidlestart; /* Start of idle period */ + struct tc_red_xstats st; +}; + +static int red_ecn_mark(struct sk_buff *skb) +{ + if (skb->nh.raw + 20 > skb->tail) + return 0; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + if (INET_ECN_is_not_ect(skb->nh.iph->tos)) + return 0; + IP_ECN_set_ce(skb->nh.iph); + return 1; + case __constant_htons(ETH_P_IPV6): + if (INET_ECN_is_not_ect(ipv6_get_dsfield(skb->nh.ipv6h))) + return 0; + IP6_ECN_set_ce(skb->nh.ipv6h); + return 1; + default: + return 0; + } +} + +static int +red_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + psched_time_t now; + + if (!PSCHED_IS_PASTPERFECT(q->qidlestart)) { + long us_idle; + int shift; + + PSCHED_GET_TIME(now); + us_idle = PSCHED_TDIFF_SAFE(now, q->qidlestart, q->Scell_max); + PSCHED_SET_PASTPERFECT(q->qidlestart); + +/* + The problem: ideally, average length queue recalcultion should + be done over constant clock intervals. This is too expensive, so that + the calculation is driven by outgoing packets. + When the queue is idle we have to model this clock by hand. + + SF+VJ proposed to "generate" m = idletime/(average_pkt_size/bandwidth) + dummy packets as a burst after idle time, i.e. + + q->qave *= (1-W)^m + + This is an apparently overcomplicated solution (f.e. we have to precompute + a table to make this calculation in reasonable time) + I believe that a simpler model may be used here, + but it is field for experiments. +*/ + shift = q->Stab[us_idle>>q->Scell_log]; + + if (shift) { + q->qave >>= shift; + } else { + /* Approximate initial part of exponent + with linear function: + (1-W)^m ~= 1-mW + ... + + Seems, it is the best solution to + problem of too coarce exponent tabulation. + */ + + us_idle = (q->qave * us_idle)>>q->Scell_log; + if (us_idle < q->qave/2) + q->qave -= us_idle; + else + q->qave >>= 1; + } + } else { + q->qave += sch->qstats.backlog - (q->qave >> q->Wlog); + /* NOTE: + q->qave is fixed point number with point at Wlog. + The formulae above is equvalent to floating point + version: + + qave = qave*(1-W) + sch->qstats.backlog*W; + --ANK (980924) + */ + } + + if (q->qave < q->qth_min) { + q->qcount = -1; +enqueue: + if (sch->qstats.backlog + skb->len <= q->limit) { + __skb_queue_tail(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return NET_XMIT_SUCCESS; + } else { + q->st.pdrop++; + } + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; + } + if (q->qave >= q->qth_max) { + q->qcount = -1; + sch->qstats.overlimits++; +mark: + if (!(q->flags&TC_RED_ECN) || !red_ecn_mark(skb)) { + q->st.early++; + goto drop; + } + q->st.marked++; + goto enqueue; + } + + if (++q->qcount) { + /* The formula used below causes questions. + + OK. qR is random number in the interval 0..Rmask + i.e. 0..(2^Plog). If we used floating point + arithmetics, it would be: (2^Plog)*rnd_num, + where rnd_num is less 1. 
+ + Taking into account, that qave have fixed + point at Wlog, and Plog is related to max_P by + max_P = (qth_max-qth_min)/2^Plog; two lines + below have the following floating point equivalent: + + max_P*(qave - qth_min)/(qth_max-qth_min) < rnd/qcount + + Any questions? --ANK (980924) + */ + if (((q->qave - q->qth_min)>>q->Wlog)*q->qcount < q->qR) + goto enqueue; + q->qcount = 0; + q->qR = net_random()&q->Rmask; + sch->qstats.overlimits++; + goto mark; + } + q->qR = net_random()&q->Rmask; + goto enqueue; + +drop: + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_CN; +} + +static int +red_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + PSCHED_SET_PASTPERFECT(q->qidlestart); + + __skb_queue_head(&sch->q, skb); + sch->qstats.backlog += skb->len; + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff * +red_dequeue(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = qdisc_priv(sch); + + skb = __skb_dequeue(&sch->q); + if (skb) { + sch->qstats.backlog -= skb->len; + return skb; + } + PSCHED_GET_TIME(q->qidlestart); + return NULL; +} + +static unsigned int red_drop(struct Qdisc* sch) +{ + struct sk_buff *skb; + struct red_sched_data *q = qdisc_priv(sch); + + skb = __skb_dequeue_tail(&sch->q); + if (skb) { + unsigned int len = skb->len; + sch->qstats.backlog -= len; + sch->qstats.drops++; + q->st.other++; + kfree_skb(skb); + return len; + } + PSCHED_GET_TIME(q->qidlestart); + return 0; +} + +static void red_reset(struct Qdisc* sch) +{ + struct red_sched_data *q = qdisc_priv(sch); + + __skb_queue_purge(&sch->q); + sch->qstats.backlog = 0; + PSCHED_SET_PASTPERFECT(q->qidlestart); + q->qave = 0; + q->qcount = -1; +} + +static int red_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct red_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_RED_STAB]; + struct tc_red_qopt *ctl; + + if (opt == NULL || + rtattr_parse_nested(tb, TCA_RED_STAB, opt) || + tb[TCA_RED_PARMS-1] == 0 || tb[TCA_RED_STAB-1] == 0 || + RTA_PAYLOAD(tb[TCA_RED_PARMS-1]) < sizeof(*ctl) || + RTA_PAYLOAD(tb[TCA_RED_STAB-1]) < 256) + return -EINVAL; + + ctl = RTA_DATA(tb[TCA_RED_PARMS-1]); + + sch_tree_lock(sch); + q->flags = ctl->flags; + q->Wlog = ctl->Wlog; + q->Plog = ctl->Plog; + q->Rmask = ctl->Plog < 32 ? 
((1<<ctl->Plog) - 1) : ~0UL; + q->Scell_log = ctl->Scell_log; + q->Scell_max = (255<<q->Scell_log); + q->qth_min = ctl->qth_min<<ctl->Wlog; + q->qth_max = ctl->qth_max<<ctl->Wlog; + q->limit = ctl->limit; + memcpy(q->Stab, RTA_DATA(tb[TCA_RED_STAB-1]), 256); + + q->qcount = -1; + if (skb_queue_len(&sch->q) == 0) + PSCHED_SET_PASTPERFECT(q->qidlestart); + sch_tree_unlock(sch); + return 0; +} + +static int red_init(struct Qdisc* sch, struct rtattr *opt) +{ + return red_change(sch, opt); +} + +static int red_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct red_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_red_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + opt.limit = q->limit; + opt.qth_min = q->qth_min>>q->Wlog; + opt.qth_max = q->qth_max>>q->Wlog; + opt.Wlog = q->Wlog; + opt.Plog = q->Plog; + opt.Scell_log = q->Scell_log; + opt.flags = q->flags; + RTA_PUT(skb, TCA_RED_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int red_dump_stats(struct Qdisc *sch, struct gnet_dump *d) +{ + struct red_sched_data *q = qdisc_priv(sch); + + return gnet_stats_copy_app(d, &q->st, sizeof(q->st)); +} + +static struct Qdisc_ops red_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "red", + .priv_size = sizeof(struct red_sched_data), + .enqueue = red_enqueue, + .dequeue = red_dequeue, + .requeue = red_requeue, + .drop = red_drop, + .init = red_init, + .reset = red_reset, + .change = red_change, + .dump = red_dump, + .dump_stats = red_dump_stats, + .owner = THIS_MODULE, +}; + +static int __init red_module_init(void) +{ + return register_qdisc(&red_qdisc_ops); +} +static void __exit red_module_exit(void) +{ + unregister_qdisc(&red_qdisc_ops); +} +module_init(red_module_init) +module_exit(red_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c new file mode 100644 index 000000000000..8734bb7280e3 --- /dev/null +++ b/net/sched/sch_sfq.c @@ -0,0 +1,497 @@ +/* + * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Stochastic Fairness Queuing algorithm. + ======================================= + + Source: + Paul E. McKenney "Stochastic Fairness Queuing", + IEEE INFOCOMM'90 Proceedings, San Francisco, 1990. + + Paul E. McKenney "Stochastic Fairness Queuing", + "Interworking: Research and Experience", v.2, 1991, p.113-131. + + + See also: + M. Shreedhar and George Varghese "Efficient Fair + Queuing using Deficit Round Robin", Proc. SIGCOMM 95. + + + This is not the thing that is usually called (W)FQ nowadays. + It does not use any timestamp mechanism, but instead + processes queues in round-robin order. + + ADVANTAGE: + + - It is very cheap. Both CPU and memory requirements are minimal. + + DRAWBACKS: + + - "Stochastic" -> It is not 100% fair. + When hash collisions occur, several flows are considered as one.
+ + - "Round-robin" -> It introduces larger delays than virtual clock + based schemes, and should not be used for isolating interactive + traffic from non-interactive. It means, that this scheduler + should be used as leaf of CBQ or P3, which put interactive traffic + to higher priority band. + + We still need true WFQ for top level CSZ, but using WFQ + for the best effort traffic is absolutely pointless: + SFQ is superior for this purpose. + + IMPLEMENTATION: + This implementation limits maximal queue length to 128; + maximal mtu to 2^15-1; number of hash buckets to 1024. + The only goal of this restrictions was that all data + fit into one 4K page :-). Struct sfq_sched_data is + organized in anti-cache manner: all the data for a bucket + are scattered over different locations. This is not good, + but it allowed me to put it into 4K. + + It is easy to increase these values, but not in flight. */ + +#define SFQ_DEPTH 128 +#define SFQ_HASH_DIVISOR 1024 + +/* This type should contain at least SFQ_DEPTH*2 values */ +typedef unsigned char sfq_index; + +struct sfq_head +{ + sfq_index next; + sfq_index prev; +}; + +struct sfq_sched_data +{ +/* Parameters */ + int perturb_period; + unsigned quantum; /* Allotment per round: MUST BE >= MTU */ + int limit; + +/* Variables */ + struct timer_list perturb_timer; + int perturbation; + sfq_index tail; /* Index of current slot in round */ + sfq_index max_depth; /* Maximal depth */ + + sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */ + sfq_index next[SFQ_DEPTH]; /* Active slots link */ + short allot[SFQ_DEPTH]; /* Current allotment per slot */ + unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */ + struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */ + struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */ +}; + +static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1) +{ + int pert = q->perturbation; + + /* Have we any rotation primitives? If not, WHY? 
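+ The shift/xor pair below approximates rotating h1 by 'pert' bits before folding it into h; the final mask keeps 10 bits, matching the 1024-entry (SFQ_HASH_DIVISOR) hash table.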
*/ + h ^= (h1<<pert) ^ (h1>>(0x1F - pert)); + h ^= h>>10; + return h & 0x3FF; +} + +static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) +{ + u32 h, h2; + + switch (skb->protocol) { + case __constant_htons(ETH_P_IP): + { + struct iphdr *iph = skb->nh.iph; + h = iph->daddr; + h2 = iph->saddr^iph->protocol; + if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && + (iph->protocol == IPPROTO_TCP || + iph->protocol == IPPROTO_UDP || + iph->protocol == IPPROTO_ESP)) + h2 ^= *(((u32*)iph) + iph->ihl); + break; + } + case __constant_htons(ETH_P_IPV6): + { + struct ipv6hdr *iph = skb->nh.ipv6h; + h = iph->daddr.s6_addr32[3]; + h2 = iph->saddr.s6_addr32[3]^iph->nexthdr; + if (iph->nexthdr == IPPROTO_TCP || + iph->nexthdr == IPPROTO_UDP || + iph->nexthdr == IPPROTO_ESP) + h2 ^= *(u32*)&iph[1]; + break; + } + default: + h = (u32)(unsigned long)skb->dst^skb->protocol; + h2 = (u32)(unsigned long)skb->sk; + } + return sfq_fold_hash(q, h, h2); +} + +static inline void sfq_link(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d = q->qs[x].qlen + SFQ_DEPTH; + + p = d; + n = q->dep[d].next; + q->dep[x].next = n; + q->dep[x].prev = p; + q->dep[p].next = q->dep[n].prev = x; +} + +static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + + if (n == p && q->max_depth == q->qs[x].qlen + 1) + q->max_depth--; + + sfq_link(q, x); +} + +static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x) +{ + sfq_index p, n; + int d; + + n = q->dep[x].next; + p = q->dep[x].prev; + q->dep[p].next = n; + q->dep[n].prev = p; + d = q->qs[x].qlen; + if (q->max_depth < d) + q->max_depth = d; + + sfq_link(q, x); +} + +static unsigned int sfq_drop(struct Qdisc *sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + sfq_index d = q->max_depth; + struct sk_buff *skb; + unsigned int len; + + /* Queue is full! Find the longest slot and + drop a packet from it */ + + if (d > 1) { + sfq_index x = q->dep[d+SFQ_DEPTH].next; + skb = q->qs[x].prev; + len = skb->len; + __skb_unlink(skb, &q->qs[x]); + kfree_skb(skb); + sfq_dec(q, x); + sch->q.qlen--; + sch->qstats.drops++; + return len; + } + + if (d == 1) { + /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1.
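+ In that case drop the packet from the slot that would be served next, unlink that slot from the round robin ring, credit the following slot with one quantum and unhash the emptied slot.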
*/ + d = q->next[q->tail]; + q->next[q->tail] = q->next[d]; + q->allot[q->next[d]] += q->quantum; + skb = q->qs[d].prev; + len = skb->len; + __skb_unlink(skb, &q->qs[d]); + kfree_skb(skb); + sfq_dec(q, d); + sch->q.qlen--; + q->ht[q->hash[d]] = SFQ_DEPTH; + sch->qstats.drops++; + return len; + } + + return 0; +} + +static int +sfq_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_tail(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit-1) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + + sfq_drop(sch); + return NET_XMIT_CN; +} + +static int +sfq_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned hash = sfq_hash(q, skb); + sfq_index x; + + x = q->ht[hash]; + if (x == SFQ_DEPTH) { + q->ht[hash] = x = q->dep[SFQ_DEPTH].next; + q->hash[x] = hash; + } + __skb_queue_head(&q->qs[x], skb); + sfq_inc(q, x); + if (q->qs[x].qlen == 1) { /* The flow is new */ + if (q->tail == SFQ_DEPTH) { /* It is the first flow */ + q->tail = x; + q->next[x] = x; + q->allot[x] = q->quantum; + } else { + q->next[x] = q->next[q->tail]; + q->next[q->tail] = x; + q->tail = x; + } + } + if (++sch->q.qlen < q->limit - 1) { + sch->qstats.requeues++; + return 0; + } + + sch->qstats.drops++; + sfq_drop(sch); + return NET_XMIT_CN; +} + + + + +static struct sk_buff * +sfq_dequeue(struct Qdisc* sch) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + sfq_index a, old_a; + + /* No active slots */ + if (q->tail == SFQ_DEPTH) + return NULL; + + a = old_a = q->next[q->tail]; + + /* Grab packet */ + skb = __skb_dequeue(&q->qs[a]); + sfq_dec(q, a); + sch->q.qlen--; + + /* Is the slot empty? */ + if (q->qs[a].qlen == 0) { + q->ht[q->hash[a]] = SFQ_DEPTH; + a = q->next[a]; + if (a == old_a) { + q->tail = SFQ_DEPTH; + return skb; + } + q->next[q->tail] = a; + q->allot[a] += q->quantum; + } else if ((q->allot[a] -= skb->len) <= 0) { + q->tail = a; + a = q->next[a]; + q->allot[a] += q->quantum; + } + return skb; +} + +static void +sfq_reset(struct Qdisc* sch) +{ + struct sk_buff *skb; + + while ((skb = sfq_dequeue(sch)) != NULL) + kfree_skb(skb); +} + +static void sfq_perturbation(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + struct sfq_sched_data *q = qdisc_priv(sch); + + q->perturbation = net_random()&0x1F; + + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } +} + +static int sfq_change(struct Qdisc *sch, struct rtattr *opt) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + struct tc_sfq_qopt *ctl = RTA_DATA(opt); + + if (opt->rta_len < RTA_LENGTH(sizeof(*ctl))) + return -EINVAL; + + sch_tree_lock(sch); + q->quantum = ctl->quantum ? 
: psched_mtu(sch->dev); + q->perturb_period = ctl->perturb_period*HZ; + if (ctl->limit) + q->limit = min_t(u32, ctl->limit, SFQ_DEPTH); + + while (sch->q.qlen >= q->limit-1) + sfq_drop(sch); + + del_timer(&q->perturb_timer); + if (q->perturb_period) { + q->perturb_timer.expires = jiffies + q->perturb_period; + add_timer(&q->perturb_timer); + } + sch_tree_unlock(sch); + return 0; +} + +static int sfq_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + int i; + + init_timer(&q->perturb_timer); + q->perturb_timer.data = (unsigned long)sch; + q->perturb_timer.function = sfq_perturbation; + + for (i=0; iht[i] = SFQ_DEPTH; + for (i=0; iqs[i]); + q->dep[i+SFQ_DEPTH].next = i+SFQ_DEPTH; + q->dep[i+SFQ_DEPTH].prev = i+SFQ_DEPTH; + } + q->limit = SFQ_DEPTH; + q->max_depth = 0; + q->tail = SFQ_DEPTH; + if (opt == NULL) { + q->quantum = psched_mtu(sch->dev); + q->perturb_period = 0; + } else { + int err = sfq_change(sch, opt); + if (err) + return err; + } + for (i=0; iperturb_timer); +} + +static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct sfq_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct tc_sfq_qopt opt; + + opt.quantum = q->quantum; + opt.perturb_period = q->perturb_period/HZ; + + opt.limit = q->limit; + opt.divisor = SFQ_HASH_DIVISOR; + opt.flows = q->limit; + + RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt); + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static struct Qdisc_ops sfq_qdisc_ops = { + .next = NULL, + .cl_ops = NULL, + .id = "sfq", + .priv_size = sizeof(struct sfq_sched_data), + .enqueue = sfq_enqueue, + .dequeue = sfq_dequeue, + .requeue = sfq_requeue, + .drop = sfq_drop, + .init = sfq_init, + .reset = sfq_reset, + .destroy = sfq_destroy, + .change = NULL, + .dump = sfq_dump, + .owner = THIS_MODULE, +}; + +static int __init sfq_module_init(void) +{ + return register_qdisc(&sfq_qdisc_ops); +} +static void __exit sfq_module_exit(void) +{ + unregister_qdisc(&sfq_qdisc_ops); +} +module_init(sfq_module_init) +module_exit(sfq_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c new file mode 100644 index 000000000000..cb9711ea8c6c --- /dev/null +++ b/net/sched/sch_tbf.c @@ -0,0 +1,543 @@ +/* + * net/sched/sch_tbf.c Token Bucket Filter queue. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + * Dmitry Torokhov - allow attaching inner qdiscs - + * original idea by Martin Devera + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Simple Token Bucket Filter. + ======================================= + + SOURCE. + ------- + + None. + + Description. + ------------ + + A data flow obeys TBF with rate R and depth B, if for any + time interval t_i...t_f the number of transmitted bits + does not exceed B + R*(t_f-t_i). + + Packetized version of this definition: + The sequence of packets of sizes s_i served at moments t_i + obeys TBF, if for any i<=k: + + s_i+....+s_k <= B + R*(t_k - t_i) + + Algorithm. 
+ ---------- + + Let N(t_i) be B/R initially and N(t) grow continuously with time as: + + N(t+delta) = min{B/R, N(t) + delta} + + If the first packet in queue has length S, it may be + transmitted only at the time t_* when S/R <= N(t_*), + and in this case N(t) jumps: + + N(t_* + 0) = N(t_* - 0) - S/R. + + + + Actually, QoS requires two TBF to be applied to a data stream. + One of them controls steady state burst size, another + one with rate P (peak rate) and depth M (equal to link MTU) + limits bursts at a smaller time scale. + + It is easy to see that P>R, and B>M. If P is infinity, this double + TBF is equivalent to a single one. + + When TBF works in reshaping mode, latency is estimated as: + + lat = max ((L-B)/R, (L-M)/P) + + + NOTES. + ------ + + If TBF throttles, it starts a watchdog timer, which will wake it up + when it is ready to transmit. + Note that the minimal timer resolution is 1/HZ. + If no new packets arrive during this period, + or if the device is not awaken by EOI for some previous packet, + TBF can stop its activity for 1/HZ. + + + This means, that with depth B, the maximal rate is + + R_crit = B*HZ + + F.e. for 10Mbit ethernet and HZ=100 the minimal allowed B is ~10Kbytes. + + Note that the peak rate TBF is much more tough: with MTU 1500 + P_crit = 150Kbytes/sec. So, if you need greater peak + rates, use alpha with HZ=1000 :-) + + With classful TBF, limit is just kept for backwards compatibility. + It is passed to the default bfifo qdisc - if the inner qdisc is + changed the limit is not effective anymore. +*/ + +struct tbf_sched_data +{ +/* Parameters */ + u32 limit; /* Maximal length of backlog: bytes */ + u32 buffer; /* Token bucket depth/rate: MUST BE >= MTU/B */ + u32 mtu; + u32 max_size; + struct qdisc_rate_table *R_tab; + struct qdisc_rate_table *P_tab; + +/* Variables */ + long tokens; /* Current number of B tokens */ + long ptokens; /* Current number of P tokens */ + psched_time_t t_c; /* Time check-point */ + struct timer_list wd_timer; /* Watchdog timer */ + struct Qdisc *qdisc; /* Inner qdisc, default - bfifo queue */ +}; + +#define L2T(q,L) ((q)->R_tab->data[(L)>>(q)->R_tab->rate.cell_log]) +#define L2T_P(q,L) ((q)->P_tab->data[(L)>>(q)->P_tab->rate.cell_log]) + +static int tbf_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + int ret; + + if (skb->len > q->max_size) { + sch->qstats.drops++; +#ifdef CONFIG_NET_CLS_POLICE + if (sch->reshape_fail == NULL || sch->reshape_fail(skb, sch)) +#endif + kfree_skb(skb); + + return NET_XMIT_DROP; + } + + if ((ret = q->qdisc->enqueue(skb, q->qdisc)) != 0) { + sch->qstats.drops++; + return ret; + } + + sch->q.qlen++; + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; +} + +static int tbf_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + int ret; + + if ((ret = q->qdisc->ops->requeue(skb, q->qdisc)) == 0) { + sch->q.qlen++; + sch->qstats.requeues++; + } + + return ret; +} + +static unsigned int tbf_drop(struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + unsigned int len; + + if ((len = q->qdisc->ops->drop(q->qdisc)) != 0) { + sch->q.qlen--; + sch->qstats.drops++; + } + return len; +} + +static void tbf_watchdog(unsigned long arg) +{ + struct Qdisc *sch = (struct Qdisc*)arg; + + sch->flags &= ~TCQ_F_THROTTLED; + netif_schedule(sch->dev); +} + +static struct sk_buff *tbf_dequeue(struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + struct sk_buff *skb; + + skb = 
q->qdisc->dequeue(q->qdisc); + + if (skb) { + psched_time_t now; + long toks, delay; + long ptoks = 0; + unsigned int len = skb->len; + + PSCHED_GET_TIME(now); + + toks = PSCHED_TDIFF_SAFE(now, q->t_c, q->buffer); + + if (q->P_tab) { + ptoks = toks + q->ptokens; + if (ptoks > (long)q->mtu) + ptoks = q->mtu; + ptoks -= L2T_P(q, len); + } + toks += q->tokens; + if (toks > (long)q->buffer) + toks = q->buffer; + toks -= L2T(q, len); + + if ((toks|ptoks) >= 0) { + q->t_c = now; + q->tokens = toks; + q->ptokens = ptoks; + sch->q.qlen--; + sch->flags &= ~TCQ_F_THROTTLED; + return skb; + } + + delay = PSCHED_US2JIFFIE(max_t(long, -toks, -ptoks)); + + if (delay == 0) + delay = 1; + + mod_timer(&q->wd_timer, jiffies+delay); + + /* Maybe we have a shorter packet in the queue, + which can be sent now. It sounds cool, + but, however, this is wrong in principle. + We MUST NOT reorder packets under these circumstances. + + Really, if we split the flow into independent + subflows, it would be a very good solution. + This is the main idea of all FQ algorithms + (cf. CSZ, HPFQ, HFSC) + */ + + if (q->qdisc->ops->requeue(skb, q->qdisc) != NET_XMIT_SUCCESS) { + /* When requeue fails skb is dropped */ + sch->q.qlen--; + sch->qstats.drops++; + } + + sch->flags |= TCQ_F_THROTTLED; + sch->qstats.overlimits++; + } + return NULL; +} + +static void tbf_reset(struct Qdisc* sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + qdisc_reset(q->qdisc); + sch->q.qlen = 0; + PSCHED_GET_TIME(q->t_c); + q->tokens = q->buffer; + q->ptokens = q->mtu; + sch->flags &= ~TCQ_F_THROTTLED; + del_timer(&q->wd_timer); +} + +static struct Qdisc *tbf_create_dflt_qdisc(struct net_device *dev, u32 limit) +{ + struct Qdisc *q = qdisc_create_dflt(dev, &bfifo_qdisc_ops); + struct rtattr *rta; + int ret; + + if (q) { + rta = kmalloc(RTA_LENGTH(sizeof(struct tc_fifo_qopt)), GFP_KERNEL); + if (rta) { + rta->rta_type = RTM_NEWQDISC; + rta->rta_len = RTA_LENGTH(sizeof(struct tc_fifo_qopt)); + ((struct tc_fifo_qopt *)RTA_DATA(rta))->limit = limit; + + ret = q->ops->change(q, rta); + kfree(rta); + + if (ret == 0) + return q; + } + qdisc_destroy(q); + } + + return NULL; +} + +static int tbf_change(struct Qdisc* sch, struct rtattr *opt) +{ + int err = -EINVAL; + struct tbf_sched_data *q = qdisc_priv(sch); + struct rtattr *tb[TCA_TBF_PTAB]; + struct tc_tbf_qopt *qopt; + struct qdisc_rate_table *rtab = NULL; + struct qdisc_rate_table *ptab = NULL; + struct Qdisc *child = NULL; + int max_size,n; + + if (rtattr_parse_nested(tb, TCA_TBF_PTAB, opt) || + tb[TCA_TBF_PARMS-1] == NULL || + RTA_PAYLOAD(tb[TCA_TBF_PARMS-1]) < sizeof(*qopt)) + goto done; + + qopt = RTA_DATA(tb[TCA_TBF_PARMS-1]); + rtab = qdisc_get_rtab(&qopt->rate, tb[TCA_TBF_RTAB-1]); + if (rtab == NULL) + goto done; + + if (qopt->peakrate.rate) { + if (qopt->peakrate.rate > qopt->rate.rate) + ptab = qdisc_get_rtab(&qopt->peakrate, tb[TCA_TBF_PTAB-1]); + if (ptab == NULL) + goto done; + } + + for (n = 0; n < 256; n++) + if (rtab->data[n] > qopt->buffer) break; + max_size = (n << qopt->rate.cell_log)-1; + if (ptab) { + int size; + + for (n = 0; n < 256; n++) + if (ptab->data[n] > qopt->mtu) break; + size = (n << qopt->peakrate.cell_log)-1; + if (size < max_size) max_size = size; + } + if (max_size < 0) + goto done; + + if (q->qdisc == &noop_qdisc) { + if ((child = tbf_create_dflt_qdisc(sch->dev, qopt->limit)) == NULL) + goto done; + } + + sch_tree_lock(sch); + if (child) q->qdisc = child; + q->limit = qopt->limit; + q->mtu = qopt->mtu; + q->max_size = max_size; + q->buffer = qopt->buffer; + 
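+	/* Start out with both buckets full so the new parameters take
+	 * effect right away.  The token counters are kept in scheduler
+	 * time units: at 1 Mbit/s, for example, a 1500 byte packet costs
+	 * roughly 12 ms worth of tokens (see the L2T()/L2T_P() lookups
+	 * in tbf_dequeue()).
+	 */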
q->tokens = q->buffer; + q->ptokens = q->mtu; + rtab = xchg(&q->R_tab, rtab); + ptab = xchg(&q->P_tab, ptab); + sch_tree_unlock(sch); + err = 0; +done: + if (rtab) + qdisc_put_rtab(rtab); + if (ptab) + qdisc_put_rtab(ptab); + return err; +} + +static int tbf_init(struct Qdisc* sch, struct rtattr *opt) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (opt == NULL) + return -EINVAL; + + PSCHED_GET_TIME(q->t_c); + init_timer(&q->wd_timer); + q->wd_timer.function = tbf_watchdog; + q->wd_timer.data = (unsigned long)sch; + + q->qdisc = &noop_qdisc; + + return tbf_change(sch, opt); +} + +static void tbf_destroy(struct Qdisc *sch) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + del_timer(&q->wd_timer); + + if (q->P_tab) + qdisc_put_rtab(q->P_tab); + if (q->R_tab) + qdisc_put_rtab(q->R_tab); + + qdisc_destroy(q->qdisc); +} + +static int tbf_dump(struct Qdisc *sch, struct sk_buff *skb) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + unsigned char *b = skb->tail; + struct rtattr *rta; + struct tc_tbf_qopt opt; + + rta = (struct rtattr*)b; + RTA_PUT(skb, TCA_OPTIONS, 0, NULL); + + opt.limit = q->limit; + opt.rate = q->R_tab->rate; + if (q->P_tab) + opt.peakrate = q->P_tab->rate; + else + memset(&opt.peakrate, 0, sizeof(opt.peakrate)); + opt.mtu = q->mtu; + opt.buffer = q->buffer; + RTA_PUT(skb, TCA_TBF_PARMS, sizeof(opt), &opt); + rta->rta_len = skb->tail - b; + + return skb->len; + +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int tbf_dump_class(struct Qdisc *sch, unsigned long cl, + struct sk_buff *skb, struct tcmsg *tcm) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (cl != 1) /* only one class */ + return -ENOENT; + + tcm->tcm_handle |= TC_H_MIN(1); + tcm->tcm_info = q->qdisc->handle; + + return 0; +} + +static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, + struct Qdisc **old) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + + if (new == NULL) + new = &noop_qdisc; + + sch_tree_lock(sch); + *old = xchg(&q->qdisc, new); + qdisc_reset(*old); + sch->q.qlen = 0; + sch_tree_unlock(sch); + + return 0; +} + +static struct Qdisc *tbf_leaf(struct Qdisc *sch, unsigned long arg) +{ + struct tbf_sched_data *q = qdisc_priv(sch); + return q->qdisc; +} + +static unsigned long tbf_get(struct Qdisc *sch, u32 classid) +{ + return 1; +} + +static void tbf_put(struct Qdisc *sch, unsigned long arg) +{ +} + +static int tbf_change_class(struct Qdisc *sch, u32 classid, u32 parentid, + struct rtattr **tca, unsigned long *arg) +{ + return -ENOSYS; +} + +static int tbf_delete(struct Qdisc *sch, unsigned long arg) +{ + return -ENOSYS; +} + +static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) +{ + if (!walker->stop) { + if (walker->count >= walker->skip) + if (walker->fn(sch, 1, walker) < 0) { + walker->stop = 1; + return; + } + walker->count++; + } +} + +static struct tcf_proto **tbf_find_tcf(struct Qdisc *sch, unsigned long cl) +{ + return NULL; +} + +static struct Qdisc_class_ops tbf_class_ops = +{ + .graft = tbf_graft, + .leaf = tbf_leaf, + .get = tbf_get, + .put = tbf_put, + .change = tbf_change_class, + .delete = tbf_delete, + .walk = tbf_walk, + .tcf_chain = tbf_find_tcf, + .dump = tbf_dump_class, +}; + +static struct Qdisc_ops tbf_qdisc_ops = { + .next = NULL, + .cl_ops = &tbf_class_ops, + .id = "tbf", + .priv_size = sizeof(struct tbf_sched_data), + .enqueue = tbf_enqueue, + .dequeue = tbf_dequeue, + .requeue = tbf_requeue, + .drop = tbf_drop, + .init = tbf_init, + .reset = tbf_reset, + .destroy = tbf_destroy, + .change = 
tbf_change, + .dump = tbf_dump, + .owner = THIS_MODULE, +}; + +static int __init tbf_module_init(void) +{ + return register_qdisc(&tbf_qdisc_ops); +} + +static void __exit tbf_module_exit(void) +{ + unregister_qdisc(&tbf_qdisc_ops); +} +module_init(tbf_module_init) +module_exit(tbf_module_exit) +MODULE_LICENSE("GPL"); diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c new file mode 100644 index 000000000000..6cf0342706b5 --- /dev/null +++ b/net/sched/sch_teql.c @@ -0,0 +1,511 @@ +/* net/sched/sch_teql.c "True" (or "trivial") link equalizer. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Alexey Kuznetsov, + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + How to setup it. + ---------------- + + After loading this module you will find a new device teqlN + and new qdisc with the same name. To join a slave to the equalizer + you should just set this qdisc on a device f.e. + + # tc qdisc add dev eth0 root teql0 + # tc qdisc add dev eth1 root teql0 + + That's all. Full PnP 8) + + Applicability. + -------------- + + 1. Slave devices MUST be active devices, i.e., they must raise the tbusy + signal and generate EOI events. If you want to equalize virtual devices + like tunnels, use a normal eql device. + 2. This device puts no limitations on physical slave characteristics + f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-) + Certainly, large difference in link speeds will make the resulting + eqalized link unusable, because of huge packet reordering. + I estimate an upper useful difference as ~10 times. + 3. If the slave requires address resolution, only protocols using + neighbour cache (IPv4/IPv6) will work over the equalized link. + Other protocols are still allowed to use the slave device directly, + which will not break load balancing, though native slave + traffic will have the highest priority. 
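+
+   To stop equalizing a device, just delete the qdisc from it again:
+
+   # tc qdisc del dev eth0 root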
*/ + +struct teql_master +{ + struct Qdisc_ops qops; + struct net_device *dev; + struct Qdisc *slaves; + struct list_head master_list; + struct net_device_stats stats; +}; + +struct teql_sched_data +{ + struct Qdisc *next; + struct teql_master *m; + struct neighbour *ncache; + struct sk_buff_head q; +}; + +#define NEXT_SLAVE(q) (((struct teql_sched_data*)qdisc_priv(q))->next) + +#define FMASK (IFF_BROADCAST|IFF_POINTOPOINT|IFF_BROADCAST) + +/* "teql*" qdisc routines */ + +static int +teql_enqueue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct net_device *dev = sch->dev; + struct teql_sched_data *q = qdisc_priv(sch); + + __skb_queue_tail(&q->q, skb); + if (q->q.qlen <= dev->tx_queue_len) { + sch->bstats.bytes += skb->len; + sch->bstats.packets++; + return 0; + } + + __skb_unlink(skb, &q->q); + kfree_skb(skb); + sch->qstats.drops++; + return NET_XMIT_DROP; +} + +static int +teql_requeue(struct sk_buff *skb, struct Qdisc* sch) +{ + struct teql_sched_data *q = qdisc_priv(sch); + + __skb_queue_head(&q->q, skb); + sch->qstats.requeues++; + return 0; +} + +static struct sk_buff * +teql_dequeue(struct Qdisc* sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + struct sk_buff *skb; + + skb = __skb_dequeue(&dat->q); + if (skb == NULL) { + struct net_device *m = dat->m->dev->qdisc->dev; + if (m) { + dat->m->slaves = sch; + netif_wake_queue(m); + } + } + sch->q.qlen = dat->q.qlen + dat->m->dev->qdisc->q.qlen; + return skb; +} + +static __inline__ void +teql_neigh_release(struct neighbour *n) +{ + if (n) + neigh_release(n); +} + +static void +teql_reset(struct Qdisc* sch) +{ + struct teql_sched_data *dat = qdisc_priv(sch); + + skb_queue_purge(&dat->q); + sch->q.qlen = 0; + teql_neigh_release(xchg(&dat->ncache, NULL)); +} + +static void +teql_destroy(struct Qdisc* sch) +{ + struct Qdisc *q, *prev; + struct teql_sched_data *dat = qdisc_priv(sch); + struct teql_master *master = dat->m; + + if ((prev = master->slaves) != NULL) { + do { + q = NEXT_SLAVE(prev); + if (q == sch) { + NEXT_SLAVE(prev) = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NEXT_SLAVE(q); + if (q == master->slaves) { + master->slaves = NULL; + spin_lock_bh(&master->dev->queue_lock); + qdisc_reset(master->dev->qdisc); + spin_unlock_bh(&master->dev->queue_lock); + } + } + skb_queue_purge(&dat->q); + teql_neigh_release(xchg(&dat->ncache, NULL)); + break; + } + + } while ((prev = q) != master->slaves); + } +} + +static int teql_qdisc_init(struct Qdisc *sch, struct rtattr *opt) +{ + struct net_device *dev = sch->dev; + struct teql_master *m = (struct teql_master*)sch->ops; + struct teql_sched_data *q = qdisc_priv(sch); + + if (dev->hard_header_len > m->dev->hard_header_len) + return -EINVAL; + + if (m->dev == dev) + return -ELOOP; + + q->m = m; + + skb_queue_head_init(&q->q); + + if (m->slaves) { + if (m->dev->flags & IFF_UP) { + if ((m->dev->flags&IFF_POINTOPOINT && !(dev->flags&IFF_POINTOPOINT)) + || (m->dev->flags&IFF_BROADCAST && !(dev->flags&IFF_BROADCAST)) + || (m->dev->flags&IFF_MULTICAST && !(dev->flags&IFF_MULTICAST)) + || dev->mtu < m->dev->mtu) + return -EINVAL; + } else { + if (!(dev->flags&IFF_POINTOPOINT)) + m->dev->flags &= ~IFF_POINTOPOINT; + if (!(dev->flags&IFF_BROADCAST)) + m->dev->flags &= ~IFF_BROADCAST; + if (!(dev->flags&IFF_MULTICAST)) + m->dev->flags &= ~IFF_MULTICAST; + if (dev->mtu < m->dev->mtu) + m->dev->mtu = dev->mtu; + } + q->next = NEXT_SLAVE(m->slaves); + NEXT_SLAVE(m->slaves) = sch; + } else { + q->next = sch; + m->slaves = sch; + m->dev->mtu = dev->mtu; + m->dev->flags = 
(m->dev->flags&~FMASK)|(dev->flags&FMASK); + } + return 0; +} + +/* "teql*" netdevice routines */ + +static int +__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +{ + struct teql_sched_data *q = qdisc_priv(dev->qdisc); + struct neighbour *mn = skb->dst->neighbour; + struct neighbour *n = q->ncache; + + if (mn->tbl == NULL) + return -EINVAL; + if (n && n->tbl == mn->tbl && + memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) { + atomic_inc(&n->refcnt); + } else { + n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev); + if (IS_ERR(n)) + return PTR_ERR(n); + } + if (neigh_event_send(n, skb_res) == 0) { + int err; + read_lock(&n->lock); + err = dev->hard_header(skb, dev, ntohs(skb->protocol), n->ha, NULL, skb->len); + read_unlock(&n->lock); + if (err < 0) { + neigh_release(n); + return -EINVAL; + } + teql_neigh_release(xchg(&q->ncache, n)); + return 0; + } + neigh_release(n); + return (skb_res == NULL) ? -EAGAIN : 1; +} + +static __inline__ int +teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device *dev) +{ + if (dev->hard_header == NULL || + skb->dst == NULL || + skb->dst->neighbour == NULL) + return 0; + return __teql_resolve(skb, skb_res, dev); +} + +static int teql_master_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct teql_master *master = (void*)dev->priv; + struct Qdisc *start, *q; + int busy; + int nores; + int len = skb->len; + struct sk_buff *skb_res = NULL; + + start = master->slaves; + +restart: + nores = 0; + busy = 0; + + if ((q = start) == NULL) + goto drop; + + do { + struct net_device *slave = q->dev; + + if (slave->qdisc_sleeping != q) + continue; + if (netif_queue_stopped(slave) || ! netif_running(slave)) { + busy = 1; + continue; + } + + switch (teql_resolve(skb, skb_res, slave)) { + case 0: + if (spin_trylock(&slave->xmit_lock)) { + slave->xmit_lock_owner = smp_processor_id(); + if (!netif_queue_stopped(slave) && + slave->hard_start_xmit(skb, slave) == 0) { + slave->xmit_lock_owner = -1; + spin_unlock(&slave->xmit_lock); + master->slaves = NEXT_SLAVE(q); + netif_wake_queue(dev); + master->stats.tx_packets++; + master->stats.tx_bytes += len; + return 0; + } + slave->xmit_lock_owner = -1; + spin_unlock(&slave->xmit_lock); + } + if (netif_queue_stopped(dev)) + busy = 1; + break; + case 1: + master->slaves = NEXT_SLAVE(q); + return 0; + default: + nores = 1; + break; + } + __skb_pull(skb, skb->nh.raw - skb->data); + } while ((q = NEXT_SLAVE(q)) != start); + + if (nores && skb_res == NULL) { + skb_res = skb; + goto restart; + } + + if (busy) { + netif_stop_queue(dev); + return 1; + } + master->stats.tx_errors++; + +drop: + master->stats.tx_dropped++; + dev_kfree_skb(skb); + return 0; +} + +static int teql_master_open(struct net_device *dev) +{ + struct Qdisc * q; + struct teql_master *m = (void*)dev->priv; + int mtu = 0xFFFE; + unsigned flags = IFF_NOARP|IFF_MULTICAST; + + if (m->slaves == NULL) + return -EUNATCH; + + flags = FMASK; + + q = m->slaves; + do { + struct net_device *slave = q->dev; + + if (slave == NULL) + return -EUNATCH; + + if (slave->mtu < mtu) + mtu = slave->mtu; + if (slave->hard_header_len > LL_MAX_HEADER) + return -EINVAL; + + /* If all the slaves are BROADCAST, master is BROADCAST + If all the slaves are PtP, master is PtP + Otherwise, master is NBMA. 
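+		   For example, one Ethernet slave (broadcast capable, not
+		   point-to-point) plus one PtP slave clears both flags below,
+		   so the master comes up as NBMA.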
+ */ + if (!(slave->flags&IFF_POINTOPOINT)) + flags &= ~IFF_POINTOPOINT; + if (!(slave->flags&IFF_BROADCAST)) + flags &= ~IFF_BROADCAST; + if (!(slave->flags&IFF_MULTICAST)) + flags &= ~IFF_MULTICAST; + } while ((q = NEXT_SLAVE(q)) != m->slaves); + + m->dev->mtu = mtu; + m->dev->flags = (m->dev->flags&~FMASK) | flags; + netif_start_queue(m->dev); + return 0; +} + +static int teql_master_close(struct net_device *dev) +{ + netif_stop_queue(dev); + return 0; +} + +static struct net_device_stats *teql_master_stats(struct net_device *dev) +{ + struct teql_master *m = (void*)dev->priv; + return &m->stats; +} + +static int teql_master_mtu(struct net_device *dev, int new_mtu) +{ + struct teql_master *m = (void*)dev->priv; + struct Qdisc *q; + + if (new_mtu < 68) + return -EINVAL; + + q = m->slaves; + if (q) { + do { + if (new_mtu > q->dev->mtu) + return -EINVAL; + } while ((q=NEXT_SLAVE(q)) != m->slaves); + } + + dev->mtu = new_mtu; + return 0; +} + +static __init void teql_master_setup(struct net_device *dev) +{ + struct teql_master *master = dev->priv; + struct Qdisc_ops *ops = &master->qops; + + master->dev = dev; + ops->priv_size = sizeof(struct teql_sched_data); + + ops->enqueue = teql_enqueue; + ops->dequeue = teql_dequeue; + ops->requeue = teql_requeue; + ops->init = teql_qdisc_init; + ops->reset = teql_reset; + ops->destroy = teql_destroy; + ops->owner = THIS_MODULE; + + dev->open = teql_master_open; + dev->hard_start_xmit = teql_master_xmit; + dev->stop = teql_master_close; + dev->get_stats = teql_master_stats; + dev->change_mtu = teql_master_mtu; + dev->type = ARPHRD_VOID; + dev->mtu = 1500; + dev->tx_queue_len = 100; + dev->flags = IFF_NOARP; + dev->hard_header_len = LL_MAX_HEADER; + SET_MODULE_OWNER(dev); +} + +static LIST_HEAD(master_dev_list); +static int max_equalizers = 1; +module_param(max_equalizers, int, 0); +MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers"); + +static int __init teql_init(void) +{ + int i; + int err = -ENODEV; + + for (i = 0; i < max_equalizers; i++) { + struct net_device *dev; + struct teql_master *master; + + dev = alloc_netdev(sizeof(struct teql_master), + "teql%d", teql_master_setup); + if (!dev) { + err = -ENOMEM; + break; + } + + if ((err = register_netdev(dev))) { + free_netdev(dev); + break; + } + + master = dev->priv; + + strlcpy(master->qops.id, dev->name, IFNAMSIZ); + err = register_qdisc(&master->qops); + + if (err) { + unregister_netdev(dev); + free_netdev(dev); + break; + } + + list_add_tail(&master->master_list, &master_dev_list); + } + return i ? 0 : err; +} + +static void __exit teql_exit(void) +{ + struct teql_master *master, *nxt; + + list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) { + + list_del(&master->master_list); + + unregister_qdisc(&master->qops); + unregister_netdev(master->dev); + free_netdev(master->dev); + } +} + +module_init(teql_init); +module_exit(teql_exit); + +MODULE_LICENSE("GPL"); diff --git a/net/sctp/Kconfig b/net/sctp/Kconfig new file mode 100644 index 000000000000..9cba49e2ad43 --- /dev/null +++ b/net/sctp/Kconfig @@ -0,0 +1,89 @@ +# +# SCTP configuration +# + +menu "SCTP Configuration (EXPERIMENTAL)" + depends on INET && EXPERIMENTAL + +config IP_SCTP + tristate "The SCTP Protocol (EXPERIMENTAL)" + depends on IPV6 || IPV6=n + select CRYPTO if SCTP_HMAC_SHA1 || SCTP_HMAC_MD5 + select CRYPTO_HMAC if SCTP_HMAC_SHA1 || SCTP_HMAC_MD5 + select CRYPTO_SHA1 if SCTP_HMAC_SHA1 + select CRYPTO_MD5 if SCTP_HMAC_MD5 + ---help--- + Stream Control Transmission Protocol + + From RFC 2960 . 
+ + "SCTP is a reliable transport protocol operating on top of a + connectionless packet network such as IP. It offers the following + services to its users: + + -- acknowledged error-free non-duplicated transfer of user data, + -- data fragmentation to conform to discovered path MTU size, + -- sequenced delivery of user messages within multiple streams, + with an option for order-of-arrival delivery of individual user + messages, + -- optional bundling of multiple user messages into a single SCTP + packet, and + -- network-level fault tolerance through supporting of multi- + homing at either or both ends of an association." + + To compile this protocol support as a module, choose M here: the + module will be called sctp. + + If in doubt, say N. + +config SCTP_DBG_MSG + bool "SCTP: Debug messages" + depends on IP_SCTP + help + If you say Y, this will enable verbose debugging messages. + + If unsure, say N. However, if you are running into problems, use + this option to gather detailed trace information + +config SCTP_DBG_OBJCNT + bool "SCTP: Debug object counts" + depends on IP_SCTP + help + If you say Y, this will enable debugging support for counting the + type of objects that are currently allocated. This is useful for + identifying memory leaks. If the /proc filesystem is enabled this + debug information can be viewed by + 'cat /proc/net/sctp/sctp_dbg_objcnt' + + If unsure, say N + +choice + prompt "SCTP: Cookie HMAC Algorithm" + depends on IP_SCTP + default SCTP_HMAC_MD5 + help + HMAC algorithm to be used during association initialization. It + is strongly recommended to use HMAC-SHA1 or HMAC-MD5. See + configuration for Cryptographic API and enable those algorithms + to make usable by SCTP. + +config SCTP_HMAC_NONE + bool "None" + help + Choosing this disables the use of an HMAC during association + establishment. It is advised to use either HMAC-MD5 or HMAC-SHA1. + +config SCTP_HMAC_SHA1 + bool "HMAC-SHA1" + help + Enable the use of HMAC-SHA1 during association establishment. It + is advised to use either HMAC-MD5 or HMAC-SHA1. + +config SCTP_HMAC_MD5 + bool "HMAC-MD5" + help + Enable the use of HMAC-MD5 during association establishment. It is + advised to use either HMAC-MD5 or HMAC-SHA1. + +endchoice +endmenu diff --git a/net/sctp/Makefile b/net/sctp/Makefile new file mode 100644 index 000000000000..70c828bbe444 --- /dev/null +++ b/net/sctp/Makefile @@ -0,0 +1,17 @@ +# +# Makefile for SCTP support code. +# + +obj-$(CONFIG_IP_SCTP) += sctp.o + +sctp-y := sm_statetable.o sm_statefuns.o sm_sideeffect.o \ + protocol.o endpointola.o associola.o \ + transport.o chunk.o sm_make_chunk.o ulpevent.o \ + inqueue.o outqueue.o ulpqueue.o command.o \ + tsnmap.o bind_addr.o socket.o primitive.o \ + output.o input.o debug.o ssnmap.o proc.o crc32c.o + +sctp-$(CONFIG_SCTP_DBG_OBJCNT) += objcnt.o +sctp-$(CONFIG_SYSCTL) += sysctl.o + +sctp-$(subst m,y,$(CONFIG_IPV6)) += ipv6.o diff --git a/net/sctp/associola.c b/net/sctp/associola.c new file mode 100644 index 000000000000..663843d97a92 --- /dev/null +++ b/net/sctp/associola.c @@ -0,0 +1,1205 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * This module provides the abstraction for an SCTP association. 
+ * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Xingang Guo + * Hui Huang + * Sridhar Samudrala + * Daisy Chang + * Ryan Layer + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* Forward declarations for internal functions. */ +static void sctp_assoc_bh_rcv(struct sctp_association *asoc); + + +/* 1st Level Abstractions. */ + +/* Initialize a new association from provided memory. */ +static struct sctp_association *sctp_association_init(struct sctp_association *asoc, + const struct sctp_endpoint *ep, + const struct sock *sk, + sctp_scope_t scope, + int gfp) +{ + struct sctp_sock *sp; + int i; + + /* Retrieve the SCTP per socket area. */ + sp = sctp_sk((struct sock *)sk); + + /* Init all variables to a known value. */ + memset(asoc, 0, sizeof(struct sctp_association)); + + /* Discarding const is appropriate here. */ + asoc->ep = (struct sctp_endpoint *)ep; + sctp_endpoint_hold(asoc->ep); + + /* Hold the sock. */ + asoc->base.sk = (struct sock *)sk; + sock_hold(asoc->base.sk); + + /* Initialize the common base substructure. */ + asoc->base.type = SCTP_EP_TYPE_ASSOCIATION; + + /* Initialize the object handling fields. */ + atomic_set(&asoc->base.refcnt, 1); + asoc->base.dead = 0; + asoc->base.malloced = 0; + + /* Initialize the bind addr area. */ + sctp_bind_addr_init(&asoc->base.bind_addr, ep->base.bind_addr.port); + rwlock_init(&asoc->base.addr_lock); + + asoc->state = SCTP_STATE_CLOSED; + + /* Set these values from the socket values, a conversion between + * millsecons to seconds/microseconds must also be done. + */ + asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; + asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) + * 1000; + asoc->pmtu = 0; + asoc->frag_point = 0; + + /* Set the association max_retrans and RTO values from the + * socket values. + */ + asoc->max_retrans = sp->assocparams.sasoc_asocmaxrxt; + asoc->rto_initial = msecs_to_jiffies(sp->rtoinfo.srto_initial); + asoc->rto_max = msecs_to_jiffies(sp->rtoinfo.srto_max); + asoc->rto_min = msecs_to_jiffies(sp->rtoinfo.srto_min); + + asoc->overall_error_count = 0; + + /* Initialize the maximum mumber of new data packets that can be sent + * in a burst. + */ + asoc->max_burst = sctp_max_burst; + + /* Copy things from the endpoint. 
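+	 * Each timeout value is inherited from the endpoint and a timer is
+	 * initialized for every timeout type, with its callback taken from
+	 * sctp_timer_events[] and this association as the argument.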
*/ + for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { + asoc->timeouts[i] = ep->timeouts[i]; + init_timer(&asoc->timers[i]); + asoc->timers[i].function = sctp_timer_events[i]; + asoc->timers[i].data = (unsigned long) asoc; + } + + /* Pull default initialization values from the sock options. + * Note: This assumes that the values have already been + * validated in the sock. + */ + asoc->c.sinit_max_instreams = sp->initmsg.sinit_max_instreams; + asoc->c.sinit_num_ostreams = sp->initmsg.sinit_num_ostreams; + asoc->max_init_attempts = sp->initmsg.sinit_max_attempts; + + asoc->max_init_timeo = + msecs_to_jiffies(sp->initmsg.sinit_max_init_timeo); + + /* Allocate storage for the ssnmap after the inbound and outbound + * streams have been negotiated during Init. + */ + asoc->ssnmap = NULL; + + /* Set the local window size for receive. + * This is also the rcvbuf space per association. + * RFC 6 - A SCTP receiver MUST be able to receive a minimum of + * 1500 bytes in one SCTP packet. + */ + if (sk->sk_rcvbuf < SCTP_DEFAULT_MINWINDOW) + asoc->rwnd = SCTP_DEFAULT_MINWINDOW; + else + asoc->rwnd = sk->sk_rcvbuf; + + asoc->a_rwnd = asoc->rwnd; + + asoc->rwnd_over = 0; + + /* Use my own max window until I learn something better. */ + asoc->peer.rwnd = SCTP_DEFAULT_MAXWINDOW; + + /* Set the sndbuf size for transmit. */ + asoc->sndbuf_used = 0; + + init_waitqueue_head(&asoc->wait); + + asoc->c.my_vtag = sctp_generate_tag(ep); + asoc->peer.i.init_tag = 0; /* INIT needs a vtag of 0. */ + asoc->c.peer_vtag = 0; + asoc->c.my_ttag = 0; + asoc->c.peer_ttag = 0; + asoc->c.my_port = ep->base.bind_addr.port; + + asoc->c.initial_tsn = sctp_generate_tsn(ep); + + asoc->next_tsn = asoc->c.initial_tsn; + + asoc->ctsn_ack_point = asoc->next_tsn - 1; + asoc->adv_peer_ack_point = asoc->ctsn_ack_point; + asoc->highest_sacked = asoc->ctsn_ack_point; + asoc->last_cwr_tsn = asoc->ctsn_ack_point; + asoc->unack_data = 0; + + SCTP_DEBUG_PRINTK("myctsnap for %s INIT as 0x%x.\n", + asoc->ep->debug_name, + asoc->ctsn_ack_point); + + /* ADDIP Section 4.1 Asconf Chunk Procedures + * + * When an endpoint has an ASCONF signaled change to be sent to the + * remote endpoint it should do the following: + * ... + * A2) a serial number should be assigned to the chunk. The serial + * number SHOULD be a monotonically increasing number. The serial + * numbers SHOULD be initialized at the start of the + * association to the same value as the initial TSN. + */ + asoc->addip_serial = asoc->c.initial_tsn; + + skb_queue_head_init(&asoc->addip_chunks); + + /* Make an empty list of remote transport addresses. */ + INIT_LIST_HEAD(&asoc->peer.transport_addr_list); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * After the reception of the first data chunk in an + * association the endpoint must immediately respond with a + * sack to acknowledge the data chunk. Subsequent + * acknowledgements should be done as described in Section + * 6.2. + * + * [We implement this by telling a new association that it + * already received one packet.] + */ + asoc->peer.sack_needed = 1; + + /* Assume that the peer recongizes ASCONF until reported otherwise + * via an ERROR chunk. + */ + asoc->peer.asconf_capable = 1; + + /* Create an input queue. */ + sctp_inq_init(&asoc->base.inqueue); + sctp_inq_set_th_handler(&asoc->base.inqueue, + (void (*)(void *))sctp_assoc_bh_rcv, + asoc); + + /* Create an output queue. 
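+	 * together with the ULP queue that delivers events and data to the
+	 * socket; if the ULP queue cannot be initialized, the endpoint and
+	 * socket references taken above are released again at fail_init.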
*/ + sctp_outq_init(asoc, &asoc->outqueue); + + if (!sctp_ulpq_init(&asoc->ulpq, asoc)) + goto fail_init; + + /* Set up the tsn tracking. */ + sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_SIZE, 0); + + asoc->need_ecne = 0; + + asoc->assoc_id = 0; + + /* Assume that peer would support both address types unless we are + * told otherwise. + */ + asoc->peer.ipv4_address = 1; + asoc->peer.ipv6_address = 1; + INIT_LIST_HEAD(&asoc->asocs); + + asoc->autoclose = sp->autoclose; + + asoc->default_stream = sp->default_stream; + asoc->default_ppid = sp->default_ppid; + asoc->default_flags = sp->default_flags; + asoc->default_context = sp->default_context; + asoc->default_timetolive = sp->default_timetolive; + + return asoc; + +fail_init: + sctp_endpoint_put(asoc->ep); + sock_put(asoc->base.sk); + return NULL; +} + +/* Allocate and initialize a new association */ +struct sctp_association *sctp_association_new(const struct sctp_endpoint *ep, + const struct sock *sk, + sctp_scope_t scope, int gfp) +{ + struct sctp_association *asoc; + + asoc = t_new(struct sctp_association, gfp); + if (!asoc) + goto fail; + + if (!sctp_association_init(asoc, ep, sk, scope, gfp)) + goto fail_init; + + asoc->base.malloced = 1; + SCTP_DBG_OBJCNT_INC(assoc); + + return asoc; + +fail_init: + kfree(asoc); +fail: + return NULL; +} + +/* Free this association if possible. There may still be users, so + * the actual deallocation may be delayed. + */ +void sctp_association_free(struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + struct sctp_transport *transport; + struct list_head *pos, *temp; + int i; + + list_del(&asoc->asocs); + + /* Decrement the backlog value for a TCP-style listening socket. */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + sk->sk_ack_backlog--; + + /* Mark as dead, so other users can know this structure is + * going away. + */ + asoc->base.dead = 1; + + /* Dispose of any data lying around in the outqueue. */ + sctp_outq_free(&asoc->outqueue); + + /* Dispose of any pending messages for the upper layer. */ + sctp_ulpq_free(&asoc->ulpq); + + /* Dispose of any pending chunks on the inqueue. */ + sctp_inq_free(&asoc->base.inqueue); + + /* Free ssnmap storage. */ + sctp_ssnmap_free(asoc->ssnmap); + + /* Clean up the bound address list. */ + sctp_bind_addr_free(&asoc->base.bind_addr); + + /* Do we need to go through all of our timers and + * delete them? To be safe we will try to delete all, but we + * should be able to go through and make a guess based + * on our state. + */ + for (i = SCTP_EVENT_TIMEOUT_NONE; i < SCTP_NUM_TIMEOUT_TYPES; ++i) { + if (timer_pending(&asoc->timers[i]) && + del_timer(&asoc->timers[i])) + sctp_association_put(asoc); + } + + /* Free peer's cached cookie. */ + if (asoc->peer.cookie) { + kfree(asoc->peer.cookie); + } + + /* Release the transport structures. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + list_del(pos); + sctp_transport_free(transport); + } + + /* Free any cached ASCONF_ACK chunk. */ + if (asoc->addip_last_asconf_ack) + sctp_chunk_free(asoc->addip_last_asconf_ack); + + /* Free any cached ASCONF chunk. */ + if (asoc->addip_last_asconf) + sctp_chunk_free(asoc->addip_last_asconf); + + sctp_association_put(asoc); +} + +/* Cleanup and free up an association. 
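+ * Called from sctp_association_put() once the last reference is gone;
+ * the association must already have been marked dead by
+ * sctp_association_free().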
*/ +static void sctp_association_destroy(struct sctp_association *asoc) +{ + SCTP_ASSERT(asoc->base.dead, "Assoc is not dead", return); + + sctp_endpoint_put(asoc->ep); + sock_put(asoc->base.sk); + + if (asoc->assoc_id != 0) { + spin_lock_bh(&sctp_assocs_id_lock); + idr_remove(&sctp_assocs_id, asoc->assoc_id); + spin_unlock_bh(&sctp_assocs_id_lock); + } + + if (asoc->base.malloced) { + kfree(asoc); + SCTP_DBG_OBJCNT_DEC(assoc); + } +} + +/* Change the primary destination address for the peer. */ +void sctp_assoc_set_primary(struct sctp_association *asoc, + struct sctp_transport *transport) +{ + asoc->peer.primary_path = transport; + + /* Set a default msg_name for events. */ + memcpy(&asoc->peer.primary_addr, &transport->ipaddr, + sizeof(union sctp_addr)); + + /* If the primary path is changing, assume that the + * user wants to use this new path. + */ + if (transport->active) + asoc->peer.active_path = transport; + + /* + * SFR-CACC algorithm: + * Upon the receipt of a request to change the primary + * destination address, on the data structure for the new + * primary destination, the sender MUST do the following: + * + * 1) If CHANGEOVER_ACTIVE is set, then there was a switch + * to this destination address earlier. The sender MUST set + * CYCLING_CHANGEOVER to indicate that this switch is a + * double switch to the same destination address. + */ + if (transport->cacc.changeover_active) + transport->cacc.cycling_changeover = 1; + + /* 2) The sender MUST set CHANGEOVER_ACTIVE to indicate that + * a changeover has occurred. + */ + transport->cacc.changeover_active = 1; + + /* 3) The sender MUST store the next TSN to be sent in + * next_tsn_at_change. + */ + transport->cacc.next_tsn_at_change = asoc->next_tsn; +} + +/* Add a transport address to an association. */ +struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, + const union sctp_addr *addr, + int gfp) +{ + struct sctp_transport *peer; + struct sctp_sock *sp; + unsigned short port; + + sp = sctp_sk(asoc->base.sk); + + /* AF_INET and AF_INET6 share common port field. */ + port = addr->v4.sin_port; + + /* Set the port if it has not been set yet. */ + if (0 == asoc->peer.port) + asoc->peer.port = port; + + /* Check to see if this is a duplicate. */ + peer = sctp_assoc_lookup_paddr(asoc, addr); + if (peer) + return peer; + + peer = sctp_transport_new(addr, gfp); + if (!peer) + return NULL; + + sctp_transport_set_owner(peer, asoc); + + /* Initialize the pmtu of the transport. */ + sctp_transport_pmtu(peer); + + /* If this is the first transport addr on this association, + * initialize the association PMTU to the peer's PMTU. + * If not and the current association PMTU is higher than the new + * peer's PMTU, reset the association PMTU to the new peer's PMTU. + */ + if (asoc->pmtu) + asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); + else + asoc->pmtu = peer->pmtu; + + SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " + "%d\n", asoc, asoc->pmtu); + + asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + + /* The asoc->peer.port might not be meaningful yet, but + * initialize the packet structure anyway. 
+ */ + sctp_packet_init(&peer->packet, peer, asoc->base.bind_addr.port, + asoc->peer.port); + + /* 7.2.1 Slow-Start + * + * o The initial cwnd before DATA transmission or after a sufficiently + * long idle period MUST be set to + * min(4*MTU, max(2*MTU, 4380 bytes)) + * + * o The initial value of ssthresh MAY be arbitrarily high + * (for example, implementations MAY use the size of the + * receiver advertised window). + */ + peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); + + /* At this point, we may not have the receiver's advertised window, + * so initialize ssthresh to the default value and it will be set + * later when we process the INIT. + */ + peer->ssthresh = SCTP_DEFAULT_MAXWINDOW; + + peer->partial_bytes_acked = 0; + peer->flight_size = 0; + + /* By default, enable heartbeat for peer address. */ + peer->hb_allowed = 1; + + /* Initialize the peer's heartbeat interval based on the + * sock configured value. + */ + peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval); + + /* Set the path max_retrans. */ + peer->max_retrans = sp->paddrparam.spp_pathmaxrxt; + + /* Set the transport's RTO.initial value */ + peer->rto = asoc->rto_initial; + + /* Attach the remote transport to our asoc. */ + list_add_tail(&peer->transports, &asoc->peer.transport_addr_list); + + /* If we do not yet have a primary path, set one. */ + if (!asoc->peer.primary_path) { + sctp_assoc_set_primary(asoc, peer); + asoc->peer.retran_path = peer; + } + + if (asoc->peer.active_path == asoc->peer.retran_path) + asoc->peer.retran_path = peer; + + return peer; +} + +/* Delete a transport address from an association. */ +void sctp_assoc_del_peer(struct sctp_association *asoc, + const union sctp_addr *addr) +{ + struct list_head *pos; + struct list_head *temp; + struct sctp_transport *peer = NULL; + struct sctp_transport *transport; + + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + if (sctp_cmp_addr_exact(addr, &transport->ipaddr)) { + peer = transport; + list_del(pos); + break; + } + } + + /* The address we want delete is not in the association. */ + if (!peer) + return; + + /* Get the first transport of asoc. */ + pos = asoc->peer.transport_addr_list.next; + transport = list_entry(pos, struct sctp_transport, transports); + + /* Update any entries that match the peer to be deleted. */ + if (asoc->peer.primary_path == peer) + sctp_assoc_set_primary(asoc, transport); + if (asoc->peer.active_path == peer) + asoc->peer.active_path = transport; + if (asoc->peer.retran_path == peer) + asoc->peer.retran_path = transport; + if (asoc->peer.last_data_from == peer) + asoc->peer.last_data_from = transport; + + sctp_transport_free(peer); +} + +/* Lookup a transport by address. */ +struct sctp_transport *sctp_assoc_lookup_paddr( + const struct sctp_association *asoc, + const union sctp_addr *address) +{ + struct sctp_transport *t; + struct list_head *pos; + + /* Cycle through all transports searching for a peer address. */ + + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + if (sctp_cmp_addr_exact(address, &t->ipaddr)) + return t; + } + + return NULL; +} + +/* Engage in transport control operations. + * Mark the transport up or down and send a notification to the user. + * Select and update the new active and retran paths. 
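+ * The two active transports heard from most recently become the new
+ * active and retransmit paths; the primary path is preferred while it
+ * is active, and if no usable transport is found at all we camp on the
+ * primary path even though it is inactive.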
+ */ +void sctp_assoc_control_transport(struct sctp_association *asoc, + struct sctp_transport *transport, + sctp_transport_cmd_t command, + sctp_sn_error_t error) +{ + struct sctp_transport *t = NULL; + struct sctp_transport *first; + struct sctp_transport *second; + struct sctp_ulpevent *event; + struct list_head *pos; + int spc_state = 0; + + /* Record the transition on the transport. */ + switch (command) { + case SCTP_TRANSPORT_UP: + transport->active = SCTP_ACTIVE; + spc_state = SCTP_ADDR_AVAILABLE; + break; + + case SCTP_TRANSPORT_DOWN: + transport->active = SCTP_INACTIVE; + spc_state = SCTP_ADDR_UNREACHABLE; + break; + + default: + return; + }; + + /* Generate and send a SCTP_PEER_ADDR_CHANGE notification to the + * user. + */ + event = sctp_ulpevent_make_peer_addr_change(asoc, + (struct sockaddr_storage *) &transport->ipaddr, + 0, spc_state, error, GFP_ATOMIC); + if (event) + sctp_ulpq_tail_event(&asoc->ulpq, event); + + /* Select new active and retran paths. */ + + /* Look for the two most recently used active transports. + * + * This code produces the wrong ordering whenever jiffies + * rolls over, but we still get usable transports, so we don't + * worry about it. + */ + first = NULL; second = NULL; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + + if (!t->active) + continue; + if (!first || t->last_time_heard > first->last_time_heard) { + second = first; + first = t; + } + if (!second || t->last_time_heard > second->last_time_heard) + second = t; + } + + /* RFC 2960 6.4 Multi-Homed SCTP Endpoints + * + * By default, an endpoint should always transmit to the + * primary path, unless the SCTP user explicitly specifies the + * destination transport address (and possibly source + * transport address) to use. + * + * [If the primary is active but not most recent, bump the most + * recently used transport.] + */ + if (asoc->peer.primary_path->active && + first != asoc->peer.primary_path) { + second = first; + first = asoc->peer.primary_path; + } + + /* If we failed to find a usable transport, just camp on the + * primary, even if it is inactive. + */ + if (!first) { + first = asoc->peer.primary_path; + second = asoc->peer.primary_path; + } + + /* Set the active and retran transports. */ + asoc->peer.active_path = first; + asoc->peer.retran_path = second; +} + +/* Hold a reference to an association. */ +void sctp_association_hold(struct sctp_association *asoc) +{ + atomic_inc(&asoc->base.refcnt); +} + +/* Release a reference to an association and cleanup + * if there are no more references. + */ +void sctp_association_put(struct sctp_association *asoc) +{ + if (atomic_dec_and_test(&asoc->base.refcnt)) + sctp_association_destroy(asoc); +} + +/* Allocate the next TSN, Transmission Sequence Number, for the given + * association. + */ +__u32 sctp_association_get_next_tsn(struct sctp_association *asoc) +{ + /* From Section 1.6 Serial Number Arithmetic: + * Transmission Sequence Numbers wrap around when they reach + * 2**32 - 1. That is, the next TSN a DATA chunk MUST use + * after transmitting TSN = 2*32 - 1 is TSN = 0. + */ + __u32 retval = asoc->next_tsn; + asoc->next_tsn++; + asoc->unack_data++; + + return retval; +} + +/* Compare two addresses to see if they match. Wildcard addresses + * only match themselves. 
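+ * The comparison itself is delegated to the address family's cmp_addr
+ * operation; an address with an unknown family never matches.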
+ */ +int sctp_cmp_addr_exact(const union sctp_addr *ss1, + const union sctp_addr *ss2) +{ + struct sctp_af *af; + + af = sctp_get_af_specific(ss1->sa.sa_family); + if (unlikely(!af)) + return 0; + + return af->cmp_addr(ss1, ss2); +} + +/* Return an ecne chunk to get prepended to a packet. + * Note: We are sly and return a shared, prealloced chunk. FIXME: + * No we don't, but we could/should. + */ +struct sctp_chunk *sctp_get_ecne_prepend(struct sctp_association *asoc) +{ + struct sctp_chunk *chunk; + + /* Send ECNE if needed. + * Not being able to allocate a chunk here is not deadly. + */ + if (asoc->need_ecne) + chunk = sctp_make_ecne(asoc, asoc->last_ecne_tsn); + else + chunk = NULL; + + return chunk; +} + +/* + * Find which transport this TSN was sent on. + */ +struct sctp_transport *sctp_assoc_lookup_tsn(struct sctp_association *asoc, + __u32 tsn) +{ + struct sctp_transport *active; + struct sctp_transport *match; + struct list_head *entry, *pos; + struct sctp_transport *transport; + struct sctp_chunk *chunk; + __u32 key = htonl(tsn); + + match = NULL; + + /* + * FIXME: In general, find a more efficient data structure for + * searching. + */ + + /* + * The general strategy is to search each transport's transmitted + * list. Return which transport this TSN lives on. + * + * Let's be hopeful and check the active_path first. + * Another optimization would be to know if there is only one + * outbound path and not have to look for the TSN at all. + * + */ + + active = asoc->peer.active_path; + + list_for_each(entry, &active->transmitted) { + chunk = list_entry(entry, struct sctp_chunk, transmitted_list); + + if (key == chunk->subh.data_hdr->tsn) { + match = active; + goto out; + } + } + + /* If not found, go search all the other transports. */ + list_for_each(pos, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + + if (transport == active) + break; + list_for_each(entry, &transport->transmitted) { + chunk = list_entry(entry, struct sctp_chunk, + transmitted_list); + if (key == chunk->subh.data_hdr->tsn) { + match = transport; + goto out; + } + } + } +out: + return match; +} + +/* Is this the association we are looking for? */ +struct sctp_transport *sctp_assoc_is_match(struct sctp_association *asoc, + const union sctp_addr *laddr, + const union sctp_addr *paddr) +{ + struct sctp_transport *transport; + + sctp_read_lock(&asoc->base.addr_lock); + + if ((asoc->base.bind_addr.port == laddr->v4.sin_port) && + (asoc->peer.port == paddr->v4.sin_port)) { + transport = sctp_assoc_lookup_paddr(asoc, paddr); + if (!transport) + goto out; + + if (sctp_bind_addr_match(&asoc->base.bind_addr, laddr, + sctp_sk(asoc->base.sk))) + goto out; + } + transport = NULL; + +out: + sctp_read_unlock(&asoc->base.addr_lock); + return transport; +} + +/* Do delayed input processing. This is scheduled by sctp_rcv(). */ +static void sctp_assoc_bh_rcv(struct sctp_association *asoc) +{ + struct sctp_endpoint *ep; + struct sctp_chunk *chunk; + struct sock *sk; + struct sctp_inq *inqueue; + int state; + sctp_subtype_t subtype; + int error = 0; + + /* The association should be held so we should be safe. */ + ep = asoc->ep; + sk = asoc->base.sk; + + inqueue = &asoc->base.inqueue; + sctp_association_hold(asoc); + while (NULL != (chunk = sctp_inq_pop(inqueue))) { + state = asoc->state; + subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); + + /* Remember where the last DATA chunk came from so we + * know where to send the SACK. 
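+		 * The transport's last_time_heard stamp is also refreshed just
+		 * below, so the path selection done in
+		 * sctp_assoc_control_transport() sees it as recently used.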
+ */ + if (sctp_chunk_is_data(chunk)) + asoc->peer.last_data_from = chunk->transport; + else + SCTP_INC_STATS(SCTP_MIB_INCTRLCHUNKS); + + if (chunk->transport) + chunk->transport->last_time_heard = jiffies; + + /* Run through the state machine. */ + error = sctp_do_sm(SCTP_EVENT_T_CHUNK, subtype, + state, ep, asoc, chunk, GFP_ATOMIC); + + /* Check to see if the association is freed in response to + * the incoming chunk. If so, get out of the while loop. + */ + if (asoc->base.dead) + break; + + /* If there is an error on chunk, discard this packet. */ + if (error && chunk) + chunk->pdiscard = 1; + } + sctp_association_put(asoc); +} + +/* This routine moves an association from its old sk to a new sk. */ +void sctp_assoc_migrate(struct sctp_association *assoc, struct sock *newsk) +{ + struct sctp_sock *newsp = sctp_sk(newsk); + struct sock *oldsk = assoc->base.sk; + + /* Delete the association from the old endpoint's list of + * associations. + */ + list_del_init(&assoc->asocs); + + /* Decrement the backlog value for a TCP-style socket. */ + if (sctp_style(oldsk, TCP)) + oldsk->sk_ack_backlog--; + + /* Release references to the old endpoint and the sock. */ + sctp_endpoint_put(assoc->ep); + sock_put(assoc->base.sk); + + /* Get a reference to the new endpoint. */ + assoc->ep = newsp->ep; + sctp_endpoint_hold(assoc->ep); + + /* Get a reference to the new sock. */ + assoc->base.sk = newsk; + sock_hold(assoc->base.sk); + + /* Add the association to the new endpoint's list of associations. */ + sctp_endpoint_add_asoc(newsp->ep, assoc); +} + +/* Update an association (possibly from unexpected COOKIE-ECHO processing). */ +void sctp_assoc_update(struct sctp_association *asoc, + struct sctp_association *new) +{ + struct sctp_transport *trans; + struct list_head *pos, *temp; + + /* Copy in new parameters of peer. */ + asoc->c = new->c; + asoc->peer.rwnd = new->peer.rwnd; + asoc->peer.sack_needed = new->peer.sack_needed; + asoc->peer.i = new->peer.i; + sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_SIZE, + asoc->peer.i.initial_tsn); + + /* Remove any peer addresses not present in the new association. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, transports); + if (!sctp_assoc_lookup_paddr(new, &trans->ipaddr)) + sctp_assoc_del_peer(asoc, &trans->ipaddr); + } + + /* If the case is A (association restart), use + * initial_tsn as next_tsn. If the case is B, use + * current next_tsn in case data sent to peer + * has been discarded and needs retransmission. + */ + if (asoc->state >= SCTP_STATE_ESTABLISHED) { + asoc->next_tsn = new->next_tsn; + asoc->ctsn_ack_point = new->ctsn_ack_point; + asoc->adv_peer_ack_point = new->adv_peer_ack_point; + + /* Reinitialize SSN for both local streams + * and peer's streams. + */ + sctp_ssnmap_clear(asoc->ssnmap); + + } else { + /* Add any peer addresses from the new association. */ + list_for_each(pos, &new->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + if (!sctp_assoc_lookup_paddr(asoc, &trans->ipaddr)) + sctp_assoc_add_peer(asoc, &trans->ipaddr, + GFP_ATOMIC); + } + + asoc->ctsn_ack_point = asoc->next_tsn - 1; + asoc->adv_peer_ack_point = asoc->ctsn_ack_point; + if (!asoc->ssnmap) { + /* Move the ssnmap. */ + asoc->ssnmap = new->ssnmap; + new->ssnmap = NULL; + } + } +} + +/* Update the retran path for sending a retransmitted packet. 
+ * Round-robin through the active transports, else round-robin + * through the inactive transports as this is the next best thing + * we can try. + */ +void sctp_assoc_update_retran_path(struct sctp_association *asoc) +{ + struct sctp_transport *t, *next; + struct list_head *head = &asoc->peer.transport_addr_list; + struct list_head *pos; + + /* Find the next transport in a round-robin fashion. */ + t = asoc->peer.retran_path; + pos = &t->transports; + next = NULL; + + while (1) { + /* Skip the head. */ + if (pos->next == head) + pos = head->next; + else + pos = pos->next; + + t = list_entry(pos, struct sctp_transport, transports); + + /* Try to find an active transport. */ + + if (t->active) { + break; + } else { + /* Keep track of the next transport in case + * we don't find any active transport. + */ + if (!next) + next = t; + } + + /* We have exhausted the list, but didn't find any + * other active transports. If so, use the next + * transport. + */ + if (t == asoc->peer.retran_path) { + t = next; + break; + } + } + + asoc->peer.retran_path = t; +} + +/* Choose the transport for sending a SHUTDOWN packet. */ +struct sctp_transport *sctp_assoc_choose_shutdown_transport( + struct sctp_association *asoc) +{ + /* If this is the first time SHUTDOWN is sent, use the active path, + * else use the retran path. If the last SHUTDOWN was sent over the + * retran path, update the retran path and use it. + */ + if (!asoc->shutdown_last_sent_to) + return asoc->peer.active_path; + else { + if (asoc->shutdown_last_sent_to == asoc->peer.retran_path) + sctp_assoc_update_retran_path(asoc); + return asoc->peer.retran_path; + } + +} + +/* Update the association's pmtu and frag_point by going through all the + * transports. This routine is called when a transport's PMTU has changed. + */ +void sctp_assoc_sync_pmtu(struct sctp_association *asoc) +{ + struct sctp_transport *t; + struct list_head *pos; + __u32 pmtu = 0; + + if (!asoc) + return; + + /* Get the lowest pmtu of all the transports. */ + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + if (!pmtu || (t->pmtu < pmtu)) + pmtu = t->pmtu; + } + + if (pmtu) { + struct sctp_sock *sp = sctp_sk(asoc->base.sk); + asoc->pmtu = pmtu; + asoc->frag_point = sctp_frag_point(sp, pmtu); + } + + SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", + __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); +} + +/* Should we send a SACK to update our peer? */ +static inline int sctp_peer_needs_update(struct sctp_association *asoc) +{ + switch (asoc->state) { + case SCTP_STATE_ESTABLISHED: + case SCTP_STATE_SHUTDOWN_PENDING: + case SCTP_STATE_SHUTDOWN_RECEIVED: + case SCTP_STATE_SHUTDOWN_SENT: + if ((asoc->rwnd > asoc->a_rwnd) && + ((asoc->rwnd - asoc->a_rwnd) >= + min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) + return 1; + break; + default: + break; + } + return 0; +} + +/* Increase asoc's rwnd by len and send any window update SACK if needed. 
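[Illustrative sketch] The test in sctp_peer_needs_update() above is a silly-window-avoidance rule: only advertise a larger window once it has grown by at least min(sk_rcvbuf / 2, pmtu). The condition in isolation, on plain integers (a sketch, not the kernel helper):

#include <stdint.h>

static int toy_needs_window_update(uint32_t rwnd, uint32_t a_rwnd,
                                   uint32_t rcvbuf, uint32_t pmtu)
{
        uint32_t threshold = (rcvbuf / 2 < pmtu) ? rcvbuf / 2 : pmtu;

        return rwnd > a_rwnd && (rwnd - a_rwnd) >= threshold;
}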
*/ +void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned len) +{ + struct sctp_chunk *sack; + struct timer_list *timer; + + if (asoc->rwnd_over) { + if (asoc->rwnd_over >= len) { + asoc->rwnd_over -= len; + } else { + asoc->rwnd += (len - asoc->rwnd_over); + asoc->rwnd_over = 0; + } + } else { + asoc->rwnd += len; + } + + SCTP_DEBUG_PRINTK("%s: asoc %p rwnd increased by %d to (%u, %u) " + "- %u\n", __FUNCTION__, asoc, len, asoc->rwnd, + asoc->rwnd_over, asoc->a_rwnd); + + /* Send a window update SACK if the rwnd has increased by at least the + * minimum of the association's PMTU and half of the receive buffer. + * The algorithm used is similar to the one described in + * Section 4.2.3.3 of RFC 1122. + */ + if (sctp_peer_needs_update(asoc)) { + asoc->a_rwnd = asoc->rwnd; + SCTP_DEBUG_PRINTK("%s: Sending window update SACK- asoc: %p " + "rwnd: %u a_rwnd: %u\n", __FUNCTION__, + asoc, asoc->rwnd, asoc->a_rwnd); + sack = sctp_make_sack(asoc); + if (!sack) + return; + + asoc->peer.sack_needed = 0; + + sctp_outq_tail(&asoc->outqueue, sack); + + /* Stop the SACK timer. */ + timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; + if (timer_pending(timer) && del_timer(timer)) + sctp_association_put(asoc); + } +} + +/* Decrease asoc's rwnd by len. */ +void sctp_assoc_rwnd_decrease(struct sctp_association *asoc, unsigned len) +{ + SCTP_ASSERT(asoc->rwnd, "rwnd zero", return); + SCTP_ASSERT(!asoc->rwnd_over, "rwnd_over not zero", return); + if (asoc->rwnd >= len) { + asoc->rwnd -= len; + } else { + asoc->rwnd_over = len - asoc->rwnd; + asoc->rwnd = 0; + } + SCTP_DEBUG_PRINTK("%s: asoc %p rwnd decreased by %d to (%u, %u)\n", + __FUNCTION__, asoc, len, asoc->rwnd, + asoc->rwnd_over); +} + +/* Build the bind address list for the association based on info from the + * local endpoint and the remote peer. + */ +int sctp_assoc_set_bind_addr_from_ep(struct sctp_association *asoc, int gfp) +{ + sctp_scope_t scope; + int flags; + + /* Use scoping rules to determine the subset of addresses from + * the endpoint. + */ + scope = sctp_scope(&asoc->peer.active_path->ipaddr); + flags = (PF_INET6 == asoc->base.sk->sk_family) ? SCTP_ADDR6_ALLOWED : 0; + if (asoc->peer.ipv4_address) + flags |= SCTP_ADDR4_PEERSUPP; + if (asoc->peer.ipv6_address) + flags |= SCTP_ADDR6_PEERSUPP; + + return sctp_bind_addr_copy(&asoc->base.bind_addr, + &asoc->ep->base.bind_addr, + scope, gfp, flags); +} + +/* Build the association's bind address list from the cookie. */ +int sctp_assoc_set_bind_addr_from_cookie(struct sctp_association *asoc, + struct sctp_cookie *cookie, int gfp) +{ + int var_size2 = ntohs(cookie->peer_init->chunk_hdr.length); + int var_size3 = cookie->raw_addr_list_len; + __u8 *raw = (__u8 *)cookie->peer_init + var_size2; + + return sctp_raw_to_bind_addrs(&asoc->base.bind_addr, raw, var_size3, + asoc->ep->base.bind_addr.port, gfp); +} + +/* Lookup laddr in the bind address list of an association. 
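[Illustrative sketch] In the two rwnd helpers above, rwnd_over records how far the peer overran a closed window; freed credit repays that debt before the window reopens. The same bookkeeping on a toy structure:

struct toy_wnd {
        unsigned int rwnd;        /* currently advertised receive window      */
        unsigned int rwnd_over;   /* bytes received past a closed window      */
};

static void toy_rwnd_increase(struct toy_wnd *w, unsigned int len)
{
        if (w->rwnd_over >= len) {
                w->rwnd_over -= len;            /* repay the overrun first */
        } else {
                w->rwnd += len - w->rwnd_over;
                w->rwnd_over = 0;
        }
}

static void toy_rwnd_decrease(struct toy_wnd *w, unsigned int len)
{
        if (w->rwnd >= len) {
                w->rwnd -= len;
        } else {
                w->rwnd_over = len - w->rwnd;   /* window is now overrun */
                w->rwnd = 0;
        }
}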
*/ +int sctp_assoc_lookup_laddr(struct sctp_association *asoc, + const union sctp_addr *laddr) +{ + int found; + + sctp_read_lock(&asoc->base.addr_lock); + if ((asoc->base.bind_addr.port == ntohs(laddr->v4.sin_port)) && + sctp_bind_addr_match(&asoc->base.bind_addr, laddr, + sctp_sk(asoc->base.sk))) { + found = 1; + goto out; + } + + found = 0; +out: + sctp_read_unlock(&asoc->base.addr_lock); + return found; +} diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c new file mode 100644 index 000000000000..f90eadfb60a2 --- /dev/null +++ b/net/sctp/bind_addr.c @@ -0,0 +1,417 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2003 + * Copyright (c) Cisco 1999,2000 + * Copyright (c) Motorola 1999,2000,2001 + * Copyright (c) La Monte H.P. Yarroll 2001 + * + * This file is part of the SCTP kernel reference implementation. + * + * A collection class to handle the storage of transport addresses. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Daisy Chang + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static int sctp_copy_one_addr(struct sctp_bind_addr *, union sctp_addr *, + sctp_scope_t scope, int gfp, int flags); +static void sctp_bind_addr_clean(struct sctp_bind_addr *); + +/* First Level Abstractions. */ + +/* Copy 'src' to 'dest' taking 'scope' into account. Omit addresses + * in 'src' which have a broader scope than 'scope'. + */ +int sctp_bind_addr_copy(struct sctp_bind_addr *dest, + const struct sctp_bind_addr *src, + sctp_scope_t scope, int gfp, int flags) +{ + struct sctp_sockaddr_entry *addr; + struct list_head *pos; + int error = 0; + + /* All addresses share the same port. */ + dest->port = src->port; + + /* Extract the addresses which are relevant for this scope. */ + list_for_each(pos, &src->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + error = sctp_copy_one_addr(dest, &addr->a, scope, + gfp, flags); + if (error < 0) + goto out; + } + + /* If there are no addresses matching the scope and + * this is global scope, try to get a link scope address, with + * the assumption that we must be sitting behind a NAT. 
+ */ + if (list_empty(&dest->address_list) && (SCTP_SCOPE_GLOBAL == scope)) { + list_for_each(pos, &src->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, + list); + error = sctp_copy_one_addr(dest, &addr->a, + SCTP_SCOPE_LINK, gfp, + flags); + if (error < 0) + goto out; + } + } + +out: + if (error) + sctp_bind_addr_clean(dest); + + return error; +} + +/* Initialize the SCTP_bind_addr structure for either an endpoint or + * an association. + */ +void sctp_bind_addr_init(struct sctp_bind_addr *bp, __u16 port) +{ + bp->malloced = 0; + + INIT_LIST_HEAD(&bp->address_list); + bp->port = port; +} + +/* Dispose of the address list. */ +static void sctp_bind_addr_clean(struct sctp_bind_addr *bp) +{ + struct sctp_sockaddr_entry *addr; + struct list_head *pos, *temp; + + /* Empty the bind address list. */ + list_for_each_safe(pos, temp, &bp->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + list_del(pos); + kfree(addr); + SCTP_DBG_OBJCNT_DEC(addr); + } +} + +/* Dispose of an SCTP_bind_addr structure */ +void sctp_bind_addr_free(struct sctp_bind_addr *bp) +{ + /* Empty the bind address list. */ + sctp_bind_addr_clean(bp); + + if (bp->malloced) { + kfree(bp); + SCTP_DBG_OBJCNT_DEC(bind_addr); + } +} + +/* Add an address to the bind address list in the SCTP_bind_addr structure. */ +int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new, + int gfp) +{ + struct sctp_sockaddr_entry *addr; + + /* Add the address to the bind address list. */ + addr = t_new(struct sctp_sockaddr_entry, gfp); + if (!addr) + return -ENOMEM; + + memcpy(&addr->a, new, sizeof(*new)); + + /* Fix up the port if it has not yet been set. + * Both v4 and v6 have the port at the same offset. + */ + if (!addr->a.v4.sin_port) + addr->a.v4.sin_port = bp->port; + + INIT_LIST_HEAD(&addr->list); + list_add_tail(&addr->list, &bp->address_list); + SCTP_DBG_OBJCNT_INC(addr); + + return 0; +} + +/* Delete an address from the bind address list in the SCTP_bind_addr + * structure. + */ +int sctp_del_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *del_addr) +{ + struct list_head *pos, *temp; + struct sctp_sockaddr_entry *addr; + + list_for_each_safe(pos, temp, &bp->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + if (sctp_cmp_addr_exact(&addr->a, del_addr)) { + /* Found the exact match. */ + list_del(pos); + kfree(addr); + SCTP_DBG_OBJCNT_DEC(addr); + + return 0; + } + } + + return -EINVAL; +} + +/* Create a network byte-order representation of all the addresses + * formated as SCTP parameters. + * + * The second argument is the return value for the length. + */ +union sctp_params sctp_bind_addrs_to_raw(const struct sctp_bind_addr *bp, + int *addrs_len, int gfp) +{ + union sctp_params addrparms; + union sctp_params retval; + int addrparms_len; + union sctp_addr_param rawaddr; + int len; + struct sctp_sockaddr_entry *addr; + struct list_head *pos; + struct sctp_af *af; + + addrparms_len = 0; + len = 0; + + /* Allocate enough memory at once. */ + list_for_each(pos, &bp->address_list) { + len += sizeof(union sctp_addr_param); + } + + /* Don't even bother embedding an address if there + * is only one. 
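[Illustrative sketch] sctp_bind_addr_copy() above makes a second pass when the first yields nothing at global scope, on the assumption that the host sits behind a NAT. A compact rendering of that two-pass shape, with the address list as an array and the scope checks reduced to predicates (all names hypothetical):

#include <stddef.h>

typedef int (*toy_in_scope_fn)(int addr);

static size_t toy_filter(const int *src, size_t n, toy_in_scope_fn ok,
                         int *dst)
{
        size_t out = 0;

        for (size_t i = 0; i < n; i++)
                if (ok(src[i]))
                        dst[out++] = src[i];
        return out;
}

static size_t toy_copy_with_fallback(const int *src, size_t n,
                                     toy_in_scope_fn global_ok,
                                     toy_in_scope_fn link_ok, int *dst)
{
        size_t out = toy_filter(src, n, global_ok, dst);

        /* Nothing survived the global-scope filter: assume a NAT and
         * accept link-scope addresses instead. */
        if (out == 0)
                out = toy_filter(src, n, link_ok, dst);
        return out;
}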
+ */ + if (len == sizeof(union sctp_addr_param)) { + retval.v = NULL; + goto end_raw; + } + + retval.v = kmalloc(len, gfp); + if (!retval.v) + goto end_raw; + + addrparms = retval; + + list_for_each(pos, &bp->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + af = sctp_get_af_specific(addr->a.v4.sin_family); + len = af->to_addr_param(&addr->a, &rawaddr); + memcpy(addrparms.v, &rawaddr, len); + addrparms.v += len; + addrparms_len += len; + } + +end_raw: + *addrs_len = addrparms_len; + return retval; +} + +/* + * Create an address list out of the raw address list format (IPv4 and IPv6 + * address parameters). + */ +int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list, + int addrs_len, __u16 port, int gfp) +{ + union sctp_addr_param *rawaddr; + struct sctp_paramhdr *param; + union sctp_addr addr; + int retval = 0; + int len; + struct sctp_af *af; + + /* Convert the raw address to standard address format */ + while (addrs_len) { + param = (struct sctp_paramhdr *)raw_addr_list; + rawaddr = (union sctp_addr_param *)raw_addr_list; + + af = sctp_get_af_specific(param_type2af(param->type)); + if (unlikely(!af)) { + retval = -EINVAL; + sctp_bind_addr_clean(bp); + break; + } + + af->from_addr_param(&addr, rawaddr, port, 0); + retval = sctp_add_bind_addr(bp, &addr, gfp); + if (retval) { + /* Can't finish building the list, clean up. */ + sctp_bind_addr_clean(bp); + break; + } + + len = ntohs(param->length); + addrs_len -= len; + raw_addr_list += len; + } + + return retval; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Does this contain a specified address? Allow wildcarding. */ +int sctp_bind_addr_match(struct sctp_bind_addr *bp, + const union sctp_addr *addr, + struct sctp_sock *opt) +{ + struct sctp_sockaddr_entry *laddr; + struct list_head *pos; + + list_for_each(pos, &bp->address_list) { + laddr = list_entry(pos, struct sctp_sockaddr_entry, list); + if (opt->pf->cmp_addr(&laddr->a, addr, opt)) + return 1; + } + + return 0; +} + +/* Find the first address in the bind address list that is not present in + * the addrs packed array. + */ +union sctp_addr *sctp_find_unmatch_addr(struct sctp_bind_addr *bp, + const union sctp_addr *addrs, + int addrcnt, + struct sctp_sock *opt) +{ + struct sctp_sockaddr_entry *laddr; + union sctp_addr *addr; + void *addr_buf; + struct sctp_af *af; + struct list_head *pos; + int i; + + list_for_each(pos, &bp->address_list) { + laddr = list_entry(pos, struct sctp_sockaddr_entry, list); + + addr_buf = (union sctp_addr *)addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + if (!af) + return NULL; + + if (opt->pf->cmp_addr(&laddr->a, addr, opt)) + break; + + addr_buf += af->sockaddr_len; + } + if (i == addrcnt) + return &laddr->a; + } + + return NULL; +} + +/* Copy out addresses from the global local address list. */ +static int sctp_copy_one_addr(struct sctp_bind_addr *dest, + union sctp_addr *addr, + sctp_scope_t scope, int gfp, int flags) +{ + int error = 0; + + if (sctp_is_any(addr)) { + error = sctp_copy_local_addr_list(dest, scope, gfp, flags); + } else if (sctp_in_scope(addr, scope)) { + /* Now that the address is in scope, check to see if + * the address type is supported by local sock as + * well as the remote peer. 
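[Illustrative sketch] sctp_raw_to_bind_addrs() above is a type-length walk over packed address parameters: read the 16-bit type and length, dispatch by type, advance by the network-order length. A self-contained version of the walk that only counts entries; the explicit bounds check is added here for illustration and the 4-byte header layout is an assumption of the sketch:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static int toy_count_params(const uint8_t *raw, size_t raw_len)
{
        int count = 0;

        while (raw_len >= 4) {          /* 16-bit type + 16-bit length */
                uint16_t len_be;
                size_t len;

                memcpy(&len_be, raw + 2, sizeof(len_be));
                len = ntohs(len_be);
                if (len < 4 || len > raw_len)
                        return -1;      /* malformed parameter length */
                count++;
                raw += len;
                raw_len -= len;
        }
        return count;
}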
+ */ + if ((((AF_INET == addr->sa.sa_family) && + (flags & SCTP_ADDR4_PEERSUPP))) || + (((AF_INET6 == addr->sa.sa_family) && + (flags & SCTP_ADDR6_ALLOWED) && + (flags & SCTP_ADDR6_PEERSUPP)))) + error = sctp_add_bind_addr(dest, addr, gfp); + } + + return error; +} + +/* Is this a wildcard address? */ +int sctp_is_any(const union sctp_addr *addr) +{ + struct sctp_af *af = sctp_get_af_specific(addr->sa.sa_family); + if (!af) + return 0; + return af->is_any(addr); +} + +/* Is 'addr' valid for 'scope'? */ +int sctp_in_scope(const union sctp_addr *addr, sctp_scope_t scope) +{ + sctp_scope_t addr_scope = sctp_scope(addr); + + /* The unusable SCTP addresses will not be considered with + * any defined scopes. + */ + if (SCTP_SCOPE_UNUSABLE == addr_scope) + return 0; + /* + * For INIT and INIT-ACK address list, let L be the level of + * of requested destination address, sender and receiver + * SHOULD include all of its addresses with level greater + * than or equal to L. + */ + if (addr_scope <= scope) + return 1; + + return 0; +} + +/******************************************************************** + * 3rd Level Abstractions + ********************************************************************/ + +/* What is the scope of 'addr'? */ +sctp_scope_t sctp_scope(const union sctp_addr *addr) +{ + struct sctp_af *af; + + af = sctp_get_af_specific(addr->sa.sa_family); + if (!af) + return SCTP_SCOPE_UNUSABLE; + + return af->scope((union sctp_addr *)addr); +} diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c new file mode 100644 index 000000000000..0c2ab7885058 --- /dev/null +++ b/net/sctp/chunk.c @@ -0,0 +1,309 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2003, 2004 + * + * This file is part of the SCTP kernel reference Implementation + * + * This file contains the code relating the the chunk abstraction. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* This file is mostly in anticipation of future work, but initially + * populate with fragment tracking for an outbound message. + */ + +/* Initialize datamsg from memory. 
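[Illustrative sketch] sctp_in_scope() above compares scope values directly, which works only if the constants are ordered from widest to narrowest. A toy rendering of that convention (the ordering below is what the comparison implies, not a copy of the kernel enum):

enum toy_scope {                 /* illustrative ordering: widest first */
        TOY_SCOPE_GLOBAL,
        TOY_SCOPE_PRIVATE,
        TOY_SCOPE_LINK,
        TOY_SCOPE_LOOPBACK,
        TOY_SCOPE_UNUSABLE,
};

static int toy_in_scope(enum toy_scope addr_scope, enum toy_scope want)
{
        if (addr_scope == TOY_SCOPE_UNUSABLE)
                return 0;        /* unusable addresses never qualify */
        /* Qualifies when its scope is at least as wide as the one requested. */
        return addr_scope <= want;
}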
*/ +static void sctp_datamsg_init(struct sctp_datamsg *msg) +{ + atomic_set(&msg->refcnt, 1); + msg->send_failed = 0; + msg->send_error = 0; + msg->can_abandon = 0; + msg->expires_at = 0; + INIT_LIST_HEAD(&msg->chunks); +} + +/* Allocate and initialize datamsg. */ +SCTP_STATIC struct sctp_datamsg *sctp_datamsg_new(int gfp) +{ + struct sctp_datamsg *msg; + msg = kmalloc(sizeof(struct sctp_datamsg), gfp); + if (msg) + sctp_datamsg_init(msg); + SCTP_DBG_OBJCNT_INC(datamsg); + return msg; +} + +/* Final destructruction of datamsg memory. */ +static void sctp_datamsg_destroy(struct sctp_datamsg *msg) +{ + struct list_head *pos, *temp; + struct sctp_chunk *chunk; + struct sctp_sock *sp; + struct sctp_ulpevent *ev; + struct sctp_association *asoc = NULL; + int error = 0, notify; + + /* If we failed, we may need to notify. */ + notify = msg->send_failed ? -1 : 0; + + /* Release all references. */ + list_for_each_safe(pos, temp, &msg->chunks) { + list_del_init(pos); + chunk = list_entry(pos, struct sctp_chunk, frag_list); + /* Check whether we _really_ need to notify. */ + if (notify < 0) { + asoc = chunk->asoc; + if (msg->send_error) + error = msg->send_error; + else + error = asoc->outqueue.error; + + sp = sctp_sk(asoc->base.sk); + notify = sctp_ulpevent_type_enabled(SCTP_SEND_FAILED, + &sp->subscribe); + } + + /* Generate a SEND FAILED event only if enabled. */ + if (notify > 0) { + int sent; + if (chunk->has_tsn) + sent = SCTP_DATA_SENT; + else + sent = SCTP_DATA_UNSENT; + + ev = sctp_ulpevent_make_send_failed(asoc, chunk, sent, + error, GFP_ATOMIC); + if (ev) + sctp_ulpq_tail_event(&asoc->ulpq, ev); + } + + sctp_chunk_put(chunk); + } + + SCTP_DBG_OBJCNT_DEC(datamsg); + kfree(msg); +} + +/* Hold a reference. */ +static void sctp_datamsg_hold(struct sctp_datamsg *msg) +{ + atomic_inc(&msg->refcnt); +} + +/* Release a reference. */ +void sctp_datamsg_put(struct sctp_datamsg *msg) +{ + if (atomic_dec_and_test(&msg->refcnt)) + sctp_datamsg_destroy(msg); +} + +/* Free a message. Really just give up a reference, the + * really free happens in sctp_datamsg_destroy(). + */ +void sctp_datamsg_free(struct sctp_datamsg *msg) +{ + sctp_datamsg_put(msg); +} + +/* Hold on to all the fragments until all chunks have been sent. */ +void sctp_datamsg_track(struct sctp_chunk *chunk) +{ + sctp_chunk_hold(chunk); +} + +/* Assign a chunk to this datamsg. */ +static void sctp_datamsg_assign(struct sctp_datamsg *msg, struct sctp_chunk *chunk) +{ + sctp_datamsg_hold(msg); + chunk->msg = msg; +} + + +/* A data chunk can have a maximum payload of (2^16 - 20). Break + * down any such message into smaller chunks. Opportunistically, fragment + * the chunks down to the current MTU constraints. We may get refragmented + * later if the PMTU changes, but it is _much better_ to fragment immediately + * with a reasonable guess than always doing our fragmentation on the + * soft-interrupt. + */ +struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc, + struct sctp_sndrcvinfo *sinfo, + struct msghdr *msgh, int msg_len) +{ + int max, whole, i, offset, over, err; + int len, first_len; + struct sctp_chunk *chunk; + struct sctp_datamsg *msg; + struct list_head *pos, *temp; + __u8 frag; + + msg = sctp_datamsg_new(GFP_KERNEL); + if (!msg) + return NULL; + + /* Note: Calculate this outside of the loop, so that all fragments + * have the same expiration. 
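[Illustrative sketch] sctp_datamsg_hold()/sctp_datamsg_put() above follow the usual last-put-frees pattern (atomic_dec_and_test in the kernel). The same shape in portable C11, on a toy object rather than the kernel API:

#include <stdatomic.h>
#include <stdlib.h>

struct toy_obj {
        atomic_int refcnt;
};

static struct toy_obj *toy_new(void)
{
        struct toy_obj *o = malloc(sizeof(*o));

        if (o)
                atomic_init(&o->refcnt, 1);     /* creator holds one ref */
        return o;
}

static void toy_hold(struct toy_obj *o)
{
        atomic_fetch_add(&o->refcnt, 1);
}

static void toy_put(struct toy_obj *o)
{
        /* fetch_sub returns the old value: 1 means this was the last ref. */
        if (atomic_fetch_sub(&o->refcnt, 1) == 1)
                free(o);
}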
+ */ + if (sinfo->sinfo_timetolive) { + /* sinfo_timetolive is in milliseconds */ + msg->expires_at = jiffies + + msecs_to_jiffies(sinfo->sinfo_timetolive); + msg->can_abandon = 1; + SCTP_DEBUG_PRINTK("%s: msg:%p expires_at: %ld jiffies:%ld\n", + __FUNCTION__, msg, msg->expires_at, jiffies); + } + + max = asoc->frag_point; + + whole = 0; + first_len = max; + + /* Encourage Cookie-ECHO bundling. */ + if (asoc->state < SCTP_STATE_COOKIE_ECHOED) { + whole = msg_len / (max - SCTP_ARBITRARY_COOKIE_ECHO_LEN); + + /* Account for the DATA to be bundled with the COOKIE-ECHO. */ + if (whole) { + first_len = max - SCTP_ARBITRARY_COOKIE_ECHO_LEN; + msg_len -= first_len; + whole = 1; + } + } + + /* How many full sized? How many bytes leftover? */ + whole += msg_len / max; + over = msg_len % max; + offset = 0; + + if ((whole > 1) || (whole && over)) + SCTP_INC_STATS_USER(SCTP_MIB_FRAGUSRMSGS); + + /* Create chunks for all the full sized DATA chunks. */ + for (i=0, len=first_len; i < whole; i++) { + frag = SCTP_DATA_MIDDLE_FRAG; + + if (0 == i) + frag |= SCTP_DATA_FIRST_FRAG; + + if ((i == (whole - 1)) && !over) + frag |= SCTP_DATA_LAST_FRAG; + + chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0); + + if (!chunk) + goto errout; + err = sctp_user_addto_chunk(chunk, offset, len, msgh->msg_iov); + if (err < 0) + goto errout; + + offset += len; + + /* Put the chunk->skb back into the form expected by send. */ + __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr + - (__u8 *)chunk->skb->data); + + sctp_datamsg_assign(msg, chunk); + list_add_tail(&chunk->frag_list, &msg->chunks); + + /* The first chunk, the first chunk was likely short + * to allow bundling, so reset to full size. + */ + if (0 == i) + len = max; + } + + /* .. now the leftover bytes. */ + if (over) { + if (!whole) + frag = SCTP_DATA_NOT_FRAG; + else + frag = SCTP_DATA_LAST_FRAG; + + chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0); + + if (!chunk) + goto errout; + + err = sctp_user_addto_chunk(chunk, offset, over,msgh->msg_iov); + + /* Put the chunk->skb back into the form expected by send. */ + __skb_pull(chunk->skb, (__u8 *)chunk->chunk_hdr + - (__u8 *)chunk->skb->data); + if (err < 0) + goto errout; + + sctp_datamsg_assign(msg, chunk); + list_add_tail(&chunk->frag_list, &msg->chunks); + } + + return msg; + +errout: + list_for_each_safe(pos, temp, &msg->chunks) { + list_del_init(pos); + chunk = list_entry(pos, struct sctp_chunk, frag_list); + sctp_chunk_free(chunk); + } + sctp_datamsg_free(msg); + return NULL; +} + +/* Check whether this message has expired. */ +int sctp_chunk_abandoned(struct sctp_chunk *chunk) +{ + struct sctp_datamsg *msg = chunk->msg; + + if (!msg->can_abandon) + return 0; + + if (time_after(jiffies, msg->expires_at)) + return 1; + + return 0; +} + +/* This chunk (and consequently entire message) has failed in its sending. */ +void sctp_chunk_fail(struct sctp_chunk *chunk, int error) +{ + chunk->msg->send_failed = 1; + chunk->msg->send_error = error; +} diff --git a/net/sctp/command.c b/net/sctp/command.c new file mode 100644 index 000000000000..3ff804757f4a --- /dev/null +++ b/net/sctp/command.c @@ -0,0 +1,81 @@ +/* SCTP kernel reference Implementation Copyright (C) 1999-2001 + * Cisco, Motorola, and IBM + * Copyright 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions manipulate sctp command sequences. 
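[Illustrative sketch] The arithmetic in sctp_datamsg_from_user() above shortens the first fragment while a COOKIE-ECHO may still be bundled, then splits the rest at frag_point, with the remainder becoming the last fragment. A worked example of that split; every size below, including the cookie reservation, is made up for illustration, and the small-message case the kernel also handles is skipped:

#include <stdio.h>

int main(void)
{
        int msg_len = 5000;     /* user message size in bytes (example)    */
        int max = 1452;         /* association frag_point (example)        */
        int cookie = 200;       /* space reserved for COOKIE-ECHO (example) */
        int first_len = max - cookie;

        /* Message is large enough to need the shortened first fragment. */
        int rest = msg_len - first_len;
        int whole = 1 + rest / max;     /* first fragment + full-sized middles */
        int over = rest % max;          /* leftover bytes become the last frag */

        printf("%d fragments at first/full size plus a %d byte tail\n",
               whole, over);
        return 0;
}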
+ * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +/* Initialize a block of memory as a command sequence. */ +int sctp_init_cmd_seq(sctp_cmd_seq_t *seq) +{ + memset(seq, 0, sizeof(sctp_cmd_seq_t)); + return 1; /* We always succeed. */ +} + +/* Add a command to a sctp_cmd_seq_t. + * Return 0 if the command sequence is full. + */ +int sctp_add_cmd(sctp_cmd_seq_t *seq, sctp_verb_t verb, sctp_arg_t obj) +{ + if (seq->next_free_slot >= SCTP_MAX_NUM_COMMANDS) + goto fail; + + seq->cmds[seq->next_free_slot].verb = verb; + seq->cmds[seq->next_free_slot++].obj = obj; + + return 1; + +fail: + return 0; +} + +/* Return the next command structure in a sctp_cmd_seq. + * Returns NULL at the end of the sequence. + */ +sctp_cmd_t *sctp_next_cmd(sctp_cmd_seq_t *seq) +{ + sctp_cmd_t *retval = NULL; + + if (seq->next_cmd < seq->next_free_slot) + retval = &seq->cmds[seq->next_cmd++]; + + return retval; +} + diff --git a/net/sctp/crc32c.c b/net/sctp/crc32c.c new file mode 100644 index 000000000000..31f05ec8e1d3 --- /dev/null +++ b/net/sctp/crc32c.c @@ -0,0 +1,220 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 International Business Machines, Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * SCTP Checksum functions + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
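[Illustrative sketch] The command sequence in command.c above is a small fixed-capacity array with a write cursor (next_free_slot) and a read cursor (next_cmd): fill it with add, then drain it with next until NULL. An equivalent toy, with the capacity and payload types invented for the sketch:

#include <stddef.h>

#define TOY_MAX_CMDS 14          /* capacity chosen for the sketch */

struct toy_cmd {
        int verb;
        long arg;
};

struct toy_cmd_seq {
        struct toy_cmd cmds[TOY_MAX_CMDS];
        int next_free;           /* write cursor */
        int next_cmd;            /* read cursor  */
};

static int toy_add_cmd(struct toy_cmd_seq *s, int verb, long arg)
{
        if (s->next_free >= TOY_MAX_CMDS)
                return 0;                        /* sequence full */
        s->cmds[s->next_free].verb = verb;
        s->cmds[s->next_free++].arg = arg;
        return 1;
}

static struct toy_cmd *toy_next_cmd(struct toy_cmd_seq *s)
{
        return s->next_cmd < s->next_free ? &s->cmds[s->next_cmd++] : NULL;
}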
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Dinakaran Joseph + * Jon Grimm + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +/* The following code has been taken directly from + * draft-ietf-tsvwg-sctpcsum-03.txt + * + * The code has now been modified specifically for SCTP knowledge. + */ + +#include +#include + +#define CRC32C_POLY 0x1EDC6F41 +#define CRC32C(c,d) (c=(c>>8)^crc_c[(c^(d))&0xFF]) +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +/* Copyright 2001, D. Otis. Use this program, code or tables */ +/* extracted from it, as desired without restriction. */ +/* */ +/* 32 Bit Reflected CRC table generation for SCTP. */ +/* To accommodate serial byte data being shifted out least */ +/* significant bit first, the table's 32 bit words are reflected */ +/* which flips both byte and bit MS and LS positions. The CRC */ +/* is calculated MS bits first from the perspective of the serial*/ +/* stream. The x^32 term is implied and the x^0 term may also */ +/* be shown as +1. The polynomial code used is 0x1EDC6F41. */ +/* Castagnoli93 */ +/* x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+ */ +/* x^11+x^10+x^9+x^8+x^6+x^0 */ +/* Guy Castagnoli Stefan Braeuer and Martin Herrman */ +/* "Optimization of Cyclic Redundancy-Check Codes */ +/* with 24 and 32 Parity Bits", */ +/* IEEE Transactions on Communications, Vol.41, No.6, June 1993 */ +/* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ +static const __u32 crc_c[256] = { + 0x00000000, 0xF26B8303, 0xE13B70F7, 0x1350F3F4, + 0xC79A971F, 0x35F1141C, 0x26A1E7E8, 0xD4CA64EB, + 0x8AD958CF, 0x78B2DBCC, 0x6BE22838, 0x9989AB3B, + 0x4D43CFD0, 0xBF284CD3, 0xAC78BF27, 0x5E133C24, + 0x105EC76F, 0xE235446C, 0xF165B798, 0x030E349B, + 0xD7C45070, 0x25AFD373, 0x36FF2087, 0xC494A384, + 0x9A879FA0, 0x68EC1CA3, 0x7BBCEF57, 0x89D76C54, + 0x5D1D08BF, 0xAF768BBC, 0xBC267848, 0x4E4DFB4B, + 0x20BD8EDE, 0xD2D60DDD, 0xC186FE29, 0x33ED7D2A, + 0xE72719C1, 0x154C9AC2, 0x061C6936, 0xF477EA35, + 0xAA64D611, 0x580F5512, 0x4B5FA6E6, 0xB93425E5, + 0x6DFE410E, 0x9F95C20D, 0x8CC531F9, 0x7EAEB2FA, + 0x30E349B1, 0xC288CAB2, 0xD1D83946, 0x23B3BA45, + 0xF779DEAE, 0x05125DAD, 0x1642AE59, 0xE4292D5A, + 0xBA3A117E, 0x4851927D, 0x5B016189, 0xA96AE28A, + 0x7DA08661, 0x8FCB0562, 0x9C9BF696, 0x6EF07595, + 0x417B1DBC, 0xB3109EBF, 0xA0406D4B, 0x522BEE48, + 0x86E18AA3, 0x748A09A0, 0x67DAFA54, 0x95B17957, + 0xCBA24573, 0x39C9C670, 0x2A993584, 0xD8F2B687, + 0x0C38D26C, 0xFE53516F, 0xED03A29B, 0x1F682198, + 0x5125DAD3, 0xA34E59D0, 0xB01EAA24, 0x42752927, + 0x96BF4DCC, 0x64D4CECF, 0x77843D3B, 0x85EFBE38, + 0xDBFC821C, 0x2997011F, 0x3AC7F2EB, 0xC8AC71E8, + 0x1C661503, 0xEE0D9600, 0xFD5D65F4, 0x0F36E6F7, + 0x61C69362, 0x93AD1061, 0x80FDE395, 0x72966096, + 0xA65C047D, 0x5437877E, 0x4767748A, 0xB50CF789, + 0xEB1FCBAD, 0x197448AE, 0x0A24BB5A, 0xF84F3859, + 0x2C855CB2, 0xDEEEDFB1, 0xCDBE2C45, 0x3FD5AF46, + 0x7198540D, 0x83F3D70E, 0x90A324FA, 0x62C8A7F9, + 0xB602C312, 0x44694011, 0x5739B3E5, 0xA55230E6, + 0xFB410CC2, 0x092A8FC1, 0x1A7A7C35, 0xE811FF36, + 0x3CDB9BDD, 0xCEB018DE, 0xDDE0EB2A, 0x2F8B6829, + 0x82F63B78, 0x709DB87B, 0x63CD4B8F, 0x91A6C88C, + 0x456CAC67, 0xB7072F64, 0xA457DC90, 0x563C5F93, + 0x082F63B7, 0xFA44E0B4, 0xE9141340, 0x1B7F9043, + 
0xCFB5F4A8, 0x3DDE77AB, 0x2E8E845F, 0xDCE5075C, + 0x92A8FC17, 0x60C37F14, 0x73938CE0, 0x81F80FE3, + 0x55326B08, 0xA759E80B, 0xB4091BFF, 0x466298FC, + 0x1871A4D8, 0xEA1A27DB, 0xF94AD42F, 0x0B21572C, + 0xDFEB33C7, 0x2D80B0C4, 0x3ED04330, 0xCCBBC033, + 0xA24BB5A6, 0x502036A5, 0x4370C551, 0xB11B4652, + 0x65D122B9, 0x97BAA1BA, 0x84EA524E, 0x7681D14D, + 0x2892ED69, 0xDAF96E6A, 0xC9A99D9E, 0x3BC21E9D, + 0xEF087A76, 0x1D63F975, 0x0E330A81, 0xFC588982, + 0xB21572C9, 0x407EF1CA, 0x532E023E, 0xA145813D, + 0x758FE5D6, 0x87E466D5, 0x94B49521, 0x66DF1622, + 0x38CC2A06, 0xCAA7A905, 0xD9F75AF1, 0x2B9CD9F2, + 0xFF56BD19, 0x0D3D3E1A, 0x1E6DCDEE, 0xEC064EED, + 0xC38D26C4, 0x31E6A5C7, 0x22B65633, 0xD0DDD530, + 0x0417B1DB, 0xF67C32D8, 0xE52CC12C, 0x1747422F, + 0x49547E0B, 0xBB3FFD08, 0xA86F0EFC, 0x5A048DFF, + 0x8ECEE914, 0x7CA56A17, 0x6FF599E3, 0x9D9E1AE0, + 0xD3D3E1AB, 0x21B862A8, 0x32E8915C, 0xC083125F, + 0x144976B4, 0xE622F5B7, 0xF5720643, 0x07198540, + 0x590AB964, 0xAB613A67, 0xB831C993, 0x4A5A4A90, + 0x9E902E7B, 0x6CFBAD78, 0x7FAB5E8C, 0x8DC0DD8F, + 0xE330A81A, 0x115B2B19, 0x020BD8ED, 0xF0605BEE, + 0x24AA3F05, 0xD6C1BC06, 0xC5914FF2, 0x37FACCF1, + 0x69E9F0D5, 0x9B8273D6, 0x88D28022, 0x7AB90321, + 0xAE7367CA, 0x5C18E4C9, 0x4F48173D, 0xBD23943E, + 0xF36E6F75, 0x0105EC76, 0x12551F82, 0xE03E9C81, + 0x34F4F86A, 0xC69F7B69, 0xD5CF889D, 0x27A40B9E, + 0x79B737BA, 0x8BDCB4B9, 0x988C474D, 0x6AE7C44E, + 0xBE2DA0A5, 0x4C4623A6, 0x5F16D052, 0xAD7D5351, +}; + +__u32 sctp_start_cksum(__u8 *buffer, __u16 length) +{ + __u32 crc32 = ~(__u32) 0; + __u32 i; + + /* Optimize this routine to be SCTP specific, knowing how + * to skip the checksum field of the SCTP header. + */ + + /* Calculate CRC up to the checksum. */ + for (i = 0; i < (sizeof(struct sctphdr) - sizeof(__u32)); i++) + CRC32C(crc32, buffer[i]); + + /* Skip checksum field of the header. */ + for (i = 0; i < sizeof(__u32); i++) + CRC32C(crc32, 0); + + /* Calculate the rest of the CRC. */ + for (i = sizeof(struct sctphdr); i < length ; i++) + CRC32C(crc32, buffer[i]); + + return crc32; +} + +__u32 sctp_update_cksum(__u8 *buffer, __u16 length, __u32 crc32) +{ + __u32 i; + + for (i = 0; i < length ; i++) + CRC32C(crc32, buffer[i]); + + return crc32; +} + +__u32 sctp_update_copy_cksum(__u8 *to, __u8 *from, __u16 length, __u32 crc32) +{ + __u32 i; + __u32 *_to = (__u32 *)to; + __u32 *_from = (__u32 *)from; + + for (i = 0; i < (length/4); i++) { + _to[i] = _from[i]; + CRC32C(crc32, from[i*4]); + CRC32C(crc32, from[i*4+1]); + CRC32C(crc32, from[i*4+2]); + CRC32C(crc32, from[i*4+3]); + } + + return crc32; +} + +__u32 sctp_end_cksum(__u32 crc32) +{ + __u32 result; + __u8 byte0, byte1, byte2, byte3; + + result = ~crc32; + + /* result now holds the negated polynomial remainder; + * since the table and algorithm is "reflected" [williams95]. + * That is, result has the same value as if we mapped the message + * to a polyomial, computed the host-bit-order polynomial + * remainder, performed final negation, then did an end-for-end + * bit-reversal. + * Note that a 32-bit bit-reversal is identical to four inplace + * 8-bit reversals followed by an end-for-end byteswap. + * In other words, the bytes of each bit are in the right order, + * but the bytes have been byteswapped. So we now do an explicit + * byteswap. On a little-endian machine, this byteswap and + * the final ntohl cancel out and could be elided. 
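[Illustrative sketch] For cross-checking the table-driven code above, here is a bit-at-a-time CRC32c over the reflected Castagnoli polynomial (0x1EDC6F41, reflected 0x82F63B78) in standalone C. It includes the final negation; the byte-order fix-up that sctp_end_cksum() performs for the wire format is left out of the sketch:

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32c_bitwise(const uint8_t *buf, size_t len)
{
        uint32_t crc = 0xFFFFFFFFu;

        while (len--) {
                crc ^= *buf++;
                for (int bit = 0; bit < 8; bit++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0x82F63B78u : 0);
        }
        return ~crc;    /* final negation; no byte-order handling here */
}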
+ */ + byte0 = result & 0xff; + byte1 = (result>>8) & 0xff; + byte2 = (result>>16) & 0xff; + byte3 = (result>>24) & 0xff; + + crc32 = ((byte0 << 24) | + (byte1 << 16) | + (byte2 << 8) | + byte3); + return crc32; +} diff --git a/net/sctp/debug.c b/net/sctp/debug.c new file mode 100644 index 000000000000..aa8340373af7 --- /dev/null +++ b/net/sctp/debug.c @@ -0,0 +1,191 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * This file is part of the implementation of the add-IP extension, + * based on June 29, 2001, + * for the SCTP kernel reference Implementation. + * + * This file converts numerical ID value to alphabetical names for SCTP + * terms such as chunk type, parameter time, event type, etc. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Xingang Guo + * Jon Grimm + * Daisy Chang + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include + +#if SCTP_DEBUG +int sctp_debug_flag = 1; /* Initially enable DEBUG */ +#endif /* SCTP_DEBUG */ + +/* These are printable forms of Chunk ID's from section 3.1. */ +static const char *sctp_cid_tbl[SCTP_NUM_BASE_CHUNK_TYPES] = { + "DATA", + "INIT", + "INIT_ACK", + "SACK", + "HEARTBEAT", + "HEARTBEAT_ACK", + "ABORT", + "SHUTDOWN", + "SHUTDOWN_ACK", + "ERROR", + "COOKIE_ECHO", + "COOKIE_ACK", + "ECN_ECNE", + "ECN_CWR", + "SHUTDOWN_COMPLETE", +}; + +/* Lookup "chunk type" debug name. */ +const char *sctp_cname(const sctp_subtype_t cid) +{ + if (cid.chunk < 0) + return "illegal chunk id"; + if (cid.chunk <= SCTP_CID_BASE_MAX) + return sctp_cid_tbl[cid.chunk]; + + switch (cid.chunk) { + case SCTP_CID_ASCONF: + return "ASCONF"; + + case SCTP_CID_ASCONF_ACK: + return "ASCONF_ACK"; + + case SCTP_CID_FWD_TSN: + return "FWD_TSN"; + + default: + return "unknown chunk"; + }; + return "unknown chunk"; +} + +/* These are printable forms of the states. */ +const char *sctp_state_tbl[SCTP_STATE_NUM_STATES] = { + "STATE_EMPTY", + "STATE_CLOSED", + "STATE_COOKIE_WAIT", + "STATE_COOKIE_ECHOED", + "STATE_ESTABLISHED", + "STATE_SHUTDOWN_PENDING", + "STATE_SHUTDOWN_SENT", + "STATE_SHUTDOWN_RECEIVED", + "STATE_SHUTDOWN_ACK_SENT", +}; + +/* Events that could change the state of an association. 
*/ +const char *sctp_evttype_tbl[] = { + "EVENT_T_unknown", + "EVENT_T_CHUNK", + "EVENT_T_TIMEOUT", + "EVENT_T_OTHER", + "EVENT_T_PRIMITIVE" +}; + +/* Return value of a state function */ +const char *sctp_status_tbl[] = { + "DISPOSITION_DISCARD", + "DISPOSITION_CONSUME", + "DISPOSITION_NOMEM", + "DISPOSITION_DELETE_TCB", + "DISPOSITION_ABORT", + "DISPOSITION_VIOLATION", + "DISPOSITION_NOT_IMPL", + "DISPOSITION_ERROR", + "DISPOSITION_BUG" +}; + +/* Printable forms of primitives */ +static const char *sctp_primitive_tbl[SCTP_NUM_PRIMITIVE_TYPES] = { + "PRIMITIVE_ASSOCIATE", + "PRIMITIVE_SHUTDOWN", + "PRIMITIVE_ABORT", + "PRIMITIVE_SEND", + "PRIMITIVE_REQUESTHEARTBEAT", +}; + +/* Lookup primitive debug name. */ +const char *sctp_pname(const sctp_subtype_t id) +{ + if (id.primitive < 0) + return "illegal primitive"; + if (id.primitive <= SCTP_EVENT_PRIMITIVE_MAX) + return sctp_primitive_tbl[id.primitive]; + return "unknown_primitive"; +} + +static const char *sctp_other_tbl[] = { + "NO_PENDING_TSN", + "ICMP_PROTO_UNREACH", +}; + +/* Lookup "other" debug name. */ +const char *sctp_oname(const sctp_subtype_t id) +{ + if (id.other < 0) + return "illegal 'other' event"; + if (id.other <= SCTP_EVENT_OTHER_MAX) + return sctp_other_tbl[id.other]; + return "unknown 'other' event"; +} + +static const char *sctp_timer_tbl[] = { + "TIMEOUT_NONE", + "TIMEOUT_T1_COOKIE", + "TIMEOUT_T1_INIT", + "TIMEOUT_T2_SHUTDOWN", + "TIMEOUT_T3_RTX", + "TIMEOUT_T4_RTO", + "TIMEOUT_T5_SHUTDOWN_GUARD", + "TIMEOUT_HEARTBEAT", + "TIMEOUT_SACK", + "TIMEOUT_AUTOCLOSE", +}; + +/* Lookup timer debug name. */ +const char *sctp_tname(const sctp_subtype_t id) +{ + if (id.timeout < 0) + return "illegal 'timer' event"; + if (id.timeout <= SCTP_EVENT_TIMEOUT_MAX) + return sctp_timer_tbl[id.timeout]; + return "unknown_timer"; +} diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c new file mode 100644 index 000000000000..544b75077dbd --- /dev/null +++ b/net/sctp/endpointola.c @@ -0,0 +1,389 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2002 International Business Machines, Corp. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * This abstraction represents an SCTP endpoint. + * + * This file is part of the implementation of the add-IP extension, + * based on June 29, 2001, + * for the SCTP kernel reference Implementation. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Daisy Chang + * Dajiang Zhang + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include /* get_random_bytes() */ +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep); + +/* + * Initialize the base fields of the endpoint structure. + */ +static struct sctp_endpoint *sctp_endpoint_init(struct sctp_endpoint *ep, + struct sock *sk, int gfp) +{ + struct sctp_sock *sp = sctp_sk(sk); + memset(ep, 0, sizeof(struct sctp_endpoint)); + + /* Initialize the base structure. */ + /* What type of endpoint are we? */ + ep->base.type = SCTP_EP_TYPE_SOCKET; + + /* Initialize the basic object fields. */ + atomic_set(&ep->base.refcnt, 1); + ep->base.dead = 0; + ep->base.malloced = 1; + + /* Create an input queue. */ + sctp_inq_init(&ep->base.inqueue); + + /* Set its top-half handler */ + sctp_inq_set_th_handler(&ep->base.inqueue, + (void (*)(void *))sctp_endpoint_bh_rcv, ep); + + /* Initialize the bind addr area */ + sctp_bind_addr_init(&ep->base.bind_addr, 0); + rwlock_init(&ep->base.addr_lock); + + /* Remember who we are attached to. */ + ep->base.sk = sk; + sock_hold(ep->base.sk); + + /* Create the lists of associations. */ + INIT_LIST_HEAD(&ep->asocs); + + /* Set up the base timeout information. */ + ep->timeouts[SCTP_EVENT_TIMEOUT_NONE] = 0; + ep->timeouts[SCTP_EVENT_TIMEOUT_T1_COOKIE] = + SCTP_DEFAULT_TIMEOUT_T1_COOKIE; + ep->timeouts[SCTP_EVENT_TIMEOUT_T1_INIT] = + SCTP_DEFAULT_TIMEOUT_T1_INIT; + ep->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = + msecs_to_jiffies(sp->rtoinfo.srto_initial); + ep->timeouts[SCTP_EVENT_TIMEOUT_T3_RTX] = 0; + ep->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = 0; + + /* sctpimpguide-05 Section 2.12.2 + * If the 'T5-shutdown-guard' timer is used, it SHOULD be set to the + * recommended value of 5 times 'RTO.Max'. + */ + ep->timeouts[SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD] + = 5 * msecs_to_jiffies(sp->rtoinfo.srto_max); + + ep->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = + SCTP_DEFAULT_TIMEOUT_HEARTBEAT; + ep->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + SCTP_DEFAULT_TIMEOUT_SACK; + ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = + sp->autoclose * HZ; + + /* Use SCTP specific send buffer space queues. */ + sk->sk_write_space = sctp_write_space; + sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); + + /* Initialize the secret key used with cookie. */ + get_random_bytes(&ep->secret_key[0], SCTP_SECRET_SIZE); + ep->last_key = ep->current_key = 0; + ep->key_changed_at = jiffies; + + ep->debug_name = "unnamedEndpoint"; + return ep; +} + +/* Create a sctp_endpoint with all that boring stuff initialized. + * Returns NULL if there isn't enough memory. + */ +struct sctp_endpoint *sctp_endpoint_new(struct sock *sk, int gfp) +{ + struct sctp_endpoint *ep; + + /* Build a local endpoint. */ + ep = t_new(struct sctp_endpoint, gfp); + if (!ep) + goto fail; + if (!sctp_endpoint_init(ep, sk, gfp)) + goto fail_init; + ep->base.malloced = 1; + SCTP_DBG_OBJCNT_INC(ep); + return ep; + +fail_init: + kfree(ep); +fail: + return NULL; +} + +/* Add an association to an endpoint. 
*/ +void sctp_endpoint_add_asoc(struct sctp_endpoint *ep, + struct sctp_association *asoc) +{ + struct sock *sk = ep->base.sk; + + /* Now just add it to our list of asocs */ + list_add_tail(&asoc->asocs, &ep->asocs); + + /* Increment the backlog value for a TCP-style listening socket. */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + sk->sk_ack_backlog++; +} + +/* Free the endpoint structure. Delay cleanup until + * all users have released their reference count on this structure. + */ +void sctp_endpoint_free(struct sctp_endpoint *ep) +{ + ep->base.dead = 1; + sctp_endpoint_put(ep); +} + +/* Final destructor for endpoint. */ +static void sctp_endpoint_destroy(struct sctp_endpoint *ep) +{ + SCTP_ASSERT(ep->base.dead, "Endpoint is not dead", return); + + ep->base.sk->sk_state = SCTP_SS_CLOSED; + + /* Unlink this endpoint, so we can't find it again! */ + sctp_unhash_endpoint(ep); + + /* Free up the HMAC transform. */ + if (sctp_sk(ep->base.sk)->hmac) + sctp_crypto_free_tfm(sctp_sk(ep->base.sk)->hmac); + + /* Cleanup. */ + sctp_inq_free(&ep->base.inqueue); + sctp_bind_addr_free(&ep->base.bind_addr); + + /* Remove and free the port */ + if (sctp_sk(ep->base.sk)->bind_hash) + sctp_put_port(ep->base.sk); + + /* Give up our hold on the sock. */ + if (ep->base.sk) + sock_put(ep->base.sk); + + /* Finally, free up our memory. */ + if (ep->base.malloced) { + kfree(ep); + SCTP_DBG_OBJCNT_DEC(ep); + } +} + +/* Hold a reference to an endpoint. */ +void sctp_endpoint_hold(struct sctp_endpoint *ep) +{ + atomic_inc(&ep->base.refcnt); +} + +/* Release a reference to an endpoint and clean up if there are + * no more references. + */ +void sctp_endpoint_put(struct sctp_endpoint *ep) +{ + if (atomic_dec_and_test(&ep->base.refcnt)) + sctp_endpoint_destroy(ep); +} + +/* Is this the endpoint we are looking for? */ +struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep, + const union sctp_addr *laddr) +{ + struct sctp_endpoint *retval; + + sctp_read_lock(&ep->base.addr_lock); + if (ep->base.bind_addr.port == laddr->v4.sin_port) { + if (sctp_bind_addr_match(&ep->base.bind_addr, laddr, + sctp_sk(ep->base.sk))) { + retval = ep; + goto out; + } + } + + retval = NULL; + +out: + sctp_read_unlock(&ep->base.addr_lock); + return retval; +} + +/* Find the association that goes with this chunk. + * We do a linear search of the associations for this endpoint. + * We return the matching transport address too. + */ +static struct sctp_association *__sctp_endpoint_lookup_assoc( + const struct sctp_endpoint *ep, + const union sctp_addr *paddr, + struct sctp_transport **transport) +{ + int rport; + struct sctp_association *asoc; + struct list_head *pos; + + rport = paddr->v4.sin_port; + + list_for_each(pos, &ep->asocs) { + asoc = list_entry(pos, struct sctp_association, asocs); + if (rport == asoc->peer.port) { + sctp_read_lock(&asoc->base.addr_lock); + *transport = sctp_assoc_lookup_paddr(asoc, paddr); + sctp_read_unlock(&asoc->base.addr_lock); + + if (*transport) + return asoc; + } + } + + *transport = NULL; + return NULL; +} + +/* Lookup association on an endpoint based on a peer address. BH-safe. 
*/ +struct sctp_association *sctp_endpoint_lookup_assoc( + const struct sctp_endpoint *ep, + const union sctp_addr *paddr, + struct sctp_transport **transport) +{ + struct sctp_association *asoc; + + sctp_local_bh_disable(); + asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport); + sctp_local_bh_enable(); + + return asoc; +} + +/* Look for any peeled off association from the endpoint that matches the + * given peer address. + */ +int sctp_endpoint_is_peeled_off(struct sctp_endpoint *ep, + const union sctp_addr *paddr) +{ + struct list_head *pos; + struct sctp_sockaddr_entry *addr; + struct sctp_bind_addr *bp; + + sctp_read_lock(&ep->base.addr_lock); + bp = &ep->base.bind_addr; + list_for_each(pos, &bp->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + if (sctp_has_association(&addr->a, paddr)) { + sctp_read_unlock(&ep->base.addr_lock); + return 1; + } + } + sctp_read_unlock(&ep->base.addr_lock); + + return 0; +} + +/* Do delayed input processing. This is scheduled by sctp_rcv(). + * This may be called on BH or task time. + */ +static void sctp_endpoint_bh_rcv(struct sctp_endpoint *ep) +{ + struct sctp_association *asoc; + struct sock *sk; + struct sctp_transport *transport; + struct sctp_chunk *chunk; + struct sctp_inq *inqueue; + sctp_subtype_t subtype; + sctp_state_t state; + int error = 0; + + if (ep->base.dead) + return; + + asoc = NULL; + inqueue = &ep->base.inqueue; + sk = ep->base.sk; + + while (NULL != (chunk = sctp_inq_pop(inqueue))) { + subtype = SCTP_ST_CHUNK(chunk->chunk_hdr->type); + + /* We might have grown an association since last we + * looked, so try again. + * + * This happens when we've just processed our + * COOKIE-ECHO chunk. + */ + if (NULL == chunk->asoc) { + asoc = sctp_endpoint_lookup_assoc(ep, + sctp_source(chunk), + &transport); + chunk->asoc = asoc; + chunk->transport = transport; + } + + state = asoc ? asoc->state : SCTP_STATE_CLOSED; + + /* Remember where the last DATA chunk came from so we + * know where to send the SACK. + */ + if (asoc && sctp_chunk_is_data(chunk)) + asoc->peer.last_data_from = chunk->transport; + else + SCTP_INC_STATS(SCTP_MIB_INCTRLCHUNKS); + + if (chunk->transport) + chunk->transport->last_time_heard = jiffies; + + error = sctp_do_sm(SCTP_EVENT_T_CHUNK, subtype, state, + ep, asoc, chunk, GFP_ATOMIC); + + if (error && chunk) + chunk->pdiscard = 1; + + /* Check to see if the endpoint is freed in response to + * the incoming chunk. If so, get out of the while loop. + */ + if (!sctp_sk(sk)->ep) + break; + } +} diff --git a/net/sctp/input.c b/net/sctp/input.c new file mode 100644 index 000000000000..b719a77d66b4 --- /dev/null +++ b/net/sctp/input.c @@ -0,0 +1,913 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 International Business Machines, Corp. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions handle all input from the IP layer into SCTP. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. 
+ * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Xingang Guo + * Jon Grimm + * Hui Huang + * Daisy Chang + * Sridhar Samudrala + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include /* For struct list_head */ +#include +#include +#include /* For struct timeval */ +#include +#include +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. */ +static int sctp_rcv_ootb(struct sk_buff *); +static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb, + const union sctp_addr *laddr, + const union sctp_addr *paddr, + struct sctp_transport **transportp); +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(const union sctp_addr *laddr); +static struct sctp_association *__sctp_lookup_association( + const union sctp_addr *local, + const union sctp_addr *peer, + struct sctp_transport **pt); + + +/* Calculate the SCTP checksum of an SCTP packet. */ +static inline int sctp_rcv_checksum(struct sk_buff *skb) +{ + struct sctphdr *sh; + __u32 cmp, val; + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + sh = (struct sctphdr *) skb->h.raw; + cmp = ntohl(sh->checksum); + + val = sctp_start_cksum((__u8 *)sh, skb_headlen(skb)); + + for (; list; list = list->next) + val = sctp_update_cksum((__u8 *)list->data, skb_headlen(list), + val); + + val = sctp_end_cksum(val); + + if (val != cmp) { + /* CRC failure, dump it. */ + SCTP_INC_STATS_BH(SCTP_MIB_CHECKSUMERRORS); + return -1; + } + return 0; +} + +/* The free routine for skbuffs that sctp receives */ +static void sctp_rfree(struct sk_buff *skb) +{ + atomic_sub(sizeof(struct sctp_chunk),&skb->sk->sk_rmem_alloc); + sock_rfree(skb); +} + +/* The ownership wrapper routine to do receive buffer accounting */ +static void sctp_rcv_set_owner_r(struct sk_buff *skb, struct sock *sk) +{ + skb_set_owner_r(skb,sk); + skb->destructor = sctp_rfree; + atomic_add(sizeof(struct sctp_chunk),&sk->sk_rmem_alloc); +} + +/* + * This is the routine which IP calls when receiving an SCTP packet. + */ +int sctp_rcv(struct sk_buff *skb) +{ + struct sock *sk; + struct sctp_association *asoc; + struct sctp_endpoint *ep = NULL; + struct sctp_ep_common *rcvr; + struct sctp_transport *transport = NULL; + struct sctp_chunk *chunk; + struct sctphdr *sh; + union sctp_addr src; + union sctp_addr dest; + int family; + struct sctp_af *af; + int ret = 0; + + if (skb->pkt_type!=PACKET_HOST) + goto discard_it; + + SCTP_INC_STATS_BH(SCTP_MIB_INSCTPPACKS); + + sh = (struct sctphdr *) skb->h.raw; + + /* Pull up the IP and SCTP headers. 
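[Illustrative sketch] sctp_rcv_checksum() above starts the CRC over the linear part of the skb and folds each entry of the frag_list in with sctp_update_cksum() before finalizing and comparing against ntohl(sh->checksum). The same pattern over a toy fragment chain; it assumes the three crc32c.c helpers earlier in this patch are in scope, and that the expected value has already been converted to host order:

struct toy_frag {
        unsigned char *data;
        unsigned short len;
        struct toy_frag *next;
};

static int toy_verify_cksum(unsigned char *head, unsigned short head_len,
                            struct toy_frag *frags,
                            unsigned int expected_host_order)
{
        unsigned int val = sctp_start_cksum(head, head_len);

        for (; frags; frags = frags->next)
                val = sctp_update_cksum(frags->data, frags->len, val);

        return sctp_end_cksum(val) == expected_host_order;  /* 1 = OK */
}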
*/ + __skb_pull(skb, skb->h.raw - skb->data); + if (skb->len < sizeof(struct sctphdr)) + goto discard_it; + if (sctp_rcv_checksum(skb) < 0) + goto discard_it; + + skb_pull(skb, sizeof(struct sctphdr)); + + /* Make sure we at least have chunk headers worth of data left. */ + if (skb->len < sizeof(struct sctp_chunkhdr)) + goto discard_it; + + family = ipver2af(skb->nh.iph->version); + af = sctp_get_af_specific(family); + if (unlikely(!af)) + goto discard_it; + + /* Initialize local addresses for lookups. */ + af->from_skb(&src, skb, 1); + af->from_skb(&dest, skb, 0); + + /* If the packet is to or from a non-unicast address, + * silently discard the packet. + * + * This is not clearly defined in the RFC except in section + * 8.4 - OOTB handling. However, based on the book "Stream Control + * Transmission Protocol" 2.1, "It is important to note that the + * IP address of an SCTP transport address must be a routable + * unicast address. In other words, IP multicast addresses and + * IP broadcast addresses cannot be used in an SCTP transport + * address." + */ + if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL)) + goto discard_it; + + asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); + + /* + * RFC 2960, 8.4 - Handle "Out of the blue" Packets. + * An SCTP packet is called an "out of the blue" (OOTB) + * packet if it is correctly formed, i.e., passed the + * receiver's checksum check, but the receiver is not + * able to identify the association to which this + * packet belongs. + */ + if (!asoc) { + ep = __sctp_rcv_lookup_endpoint(&dest); + if (sctp_rcv_ootb(skb)) { + SCTP_INC_STATS_BH(SCTP_MIB_OUTOFBLUES); + goto discard_release; + } + } + + /* Retrieve the common input handling substructure. */ + rcvr = asoc ? &asoc->base : &ep->base; + sk = rcvr->sk; + + if ((sk) && (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)) { + goto discard_release; + } + + + /* SCTP seems to always need a timestamp right now (FIXME) */ + if (skb->stamp.tv_sec == 0) { + do_gettimeofday(&skb->stamp); + sock_enable_timestamp(sk); + } + + if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family)) + goto discard_release; + + ret = sk_filter(sk, skb, 1); + if (ret) + goto discard_release; + + /* Create an SCTP packet structure. */ + chunk = sctp_chunkify(skb, asoc, sk); + if (!chunk) { + ret = -ENOMEM; + goto discard_release; + } + + sctp_rcv_set_owner_r(skb,sk); + + /* Remember what endpoint is to handle this packet. */ + chunk->rcvr = rcvr; + + /* Remember the SCTP header. */ + chunk->sctp_hdr = sh; + + /* Set the source and destination addresses of the incoming chunk. */ + sctp_init_addrs(chunk, &src, &dest); + + /* Remember where we came from. */ + chunk->transport = transport; + + /* Acquire access to the sock lock. Note: We are safe from other + * bottom halves on this lock, but a user may be in the lock too, + * so check if it is busy. + */ + sctp_bh_lock_sock(sk); + + if (sock_owned_by_user(sk)) + sk_add_backlog(sk, (struct sk_buff *) chunk); + else + sctp_backlog_rcv(sk, (struct sk_buff *) chunk); + + /* Release the sock and any reference counts we took in the + * lookup calls. + */ + sctp_bh_unlock_sock(sk); + if (asoc) + sctp_association_put(asoc); + else + sctp_endpoint_put(ep); + sock_put(sk); + return ret; + +discard_it: + kfree_skb(skb); + return ret; + +discard_release: + /* Release any structures we may be holding. 
*/ + if (asoc) { + sock_put(asoc->base.sk); + sctp_association_put(asoc); + } else { + sock_put(ep->base.sk); + sctp_endpoint_put(ep); + } + + goto discard_it; +} + +/* Handle second half of inbound skb processing. If the sock was busy, + * we may have need to delay processing until later when the sock is + * released (on the backlog). If not busy, we call this routine + * directly from the bottom half. + */ +int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct sctp_chunk *chunk; + struct sctp_inq *inqueue; + + /* One day chunk will live inside the skb, but for + * now this works. + */ + chunk = (struct sctp_chunk *) skb; + inqueue = &chunk->rcvr->inqueue; + + sctp_inq_push(inqueue, chunk); + return 0; +} + +/* Handle icmp frag needed error. */ +void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, + struct sctp_transport *t, __u32 pmtu) +{ + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + printk(KERN_WARNING "%s: Reported pmtu %d too low, " + "using default minimum of %d\n", __FUNCTION__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + pmtu = SCTP_DEFAULT_MINSEGMENT; + } + + if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { + t->pmtu = pmtu; + sctp_assoc_sync_pmtu(asoc); + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); + } +} + +/* + * SCTP Implementer's Guide, 2.37 ICMP handling procedures + * + * ICMP8) If the ICMP code is a "Unrecognized next header type encountered" + * or a "Protocol Unreachable" treat this message as an abort + * with the T bit set. + * + * This function sends an event to the state machine, which will abort the + * association. + * + */ +void sctp_icmp_proto_unreachable(struct sock *sk, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + struct sctp_transport *t) +{ + SCTP_DEBUG_PRINTK("%s\n", __FUNCTION__); + + sctp_do_sm(SCTP_EVENT_T_OTHER, + SCTP_ST_OTHER(SCTP_EVENT_ICMP_PROTO_UNREACH), + asoc->state, asoc->ep, asoc, NULL, + GFP_ATOMIC); + +} + +/* Common lookup code for icmp/icmpv6 error handler. */ +struct sock *sctp_err_lookup(int family, struct sk_buff *skb, + struct sctphdr *sctphdr, + struct sctp_endpoint **epp, + struct sctp_association **app, + struct sctp_transport **tpp) +{ + union sctp_addr saddr; + union sctp_addr daddr; + struct sctp_af *af; + struct sock *sk = NULL; + struct sctp_endpoint *ep = NULL; + struct sctp_association *asoc = NULL; + struct sctp_transport *transport = NULL; + + *app = NULL; *epp = NULL; *tpp = NULL; + + af = sctp_get_af_specific(family); + if (unlikely(!af)) { + return NULL; + } + + /* Initialize local addresses for lookups. */ + af->from_skb(&saddr, skb, 1); + af->from_skb(&daddr, skb, 0); + + /* Look for an association that matches the incoming ICMP error + * packet. + */ + asoc = __sctp_lookup_association(&saddr, &daddr, &transport); + if (!asoc) { + /* If there is no matching association, see if it matches any + * endpoint. This may happen for an ICMP error generated in + * response to an INIT_ACK. + */ + ep = __sctp_rcv_lookup_endpoint(&daddr); + if (!ep) { + return NULL; + } + } + + if (asoc) { + sk = asoc->base.sk; + + if (ntohl(sctphdr->vtag) != asoc->c.peer_vtag) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + goto out; + } + } else + sk = ep->base.sk; + + sctp_bh_lock_sock(sk); + + /* If too many ICMPs get dropped on busy + * servers this needs to be solved differently. 
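sctp_icmp_frag_needed() above refuses to let an ICMP-reported path MTU fall below SCTP_DEFAULT_MINSEGMENT before it resynchronizes the association's PMTU and retransmits. A small stand-alone sketch of that clamp-and-compare step follows; SCTP_MIN_SEGMENT and update_pmtu() are names invented for the sketch, and the retransmit/sync side effects are only hinted at in the return value.

#include <stdint.h>
#include <stdio.h>

#define SCTP_MIN_SEGMENT 512u	/* illustrative floor, not the kernel constant */

/* Clamp a reported PMTU and report whether the cached value changed. */
static int update_pmtu(uint32_t *cached_pmtu, uint32_t reported)
{
	if (reported < SCTP_MIN_SEGMENT) {
		fprintf(stderr, "reported pmtu %u too low, using %u\n",
			reported, SCTP_MIN_SEGMENT);
		reported = SCTP_MIN_SEGMENT;
	}
	if (*cached_pmtu == reported)
		return 0;
	*cached_pmtu = reported;
	return 1;	/* caller would re-sync the association and retransmit */
}

int main(void)
{
	uint32_t pmtu = 1500;

	printf("changed=%d pmtu=%u\n", update_pmtu(&pmtu, 1400), pmtu);
	printf("changed=%d pmtu=%u\n", update_pmtu(&pmtu, 68), pmtu);
	return 0;
}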
+ */ + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + *epp = ep; + *app = asoc; + *tpp = transport; + return sk; + +out: + sock_put(sk); + if (asoc) + sctp_association_put(asoc); + if (ep) + sctp_endpoint_put(ep); + return NULL; +} + +/* Common cleanup code for icmp/icmpv6 error handler. */ +void sctp_err_finish(struct sock *sk, struct sctp_endpoint *ep, + struct sctp_association *asoc) +{ + sctp_bh_unlock_sock(sk); + sock_put(sk); + if (asoc) + sctp_association_put(asoc); + if (ep) + sctp_endpoint_put(ep); +} + +/* + * This routine is called by the ICMP module when it gets some + * sort of error condition. If err < 0 then the socket should + * be closed and the error returned to the user. If err > 0 + * it's just the icmp type << 8 | icmp code. After adjustment + * header points to the first 8 bytes of the sctp header. We need + * to find the appropriate port. + * + * The locking strategy used here is very "optimistic". When + * someone else accesses the socket the ICMP is just dropped + * and for some paths there is no check at all. + * A more general error queue to queue errors for later handling + * is probably better. + * + */ +void sctp_v4_err(struct sk_buff *skb, __u32 info) +{ + struct iphdr *iph = (struct iphdr *)skb->data; + struct sctphdr *sh = (struct sctphdr *)(skb->data + (iph->ihl <<2)); + int type = skb->h.icmph->type; + int code = skb->h.icmph->code; + struct sock *sk; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_transport *transport; + struct inet_sock *inet; + char *saveip, *savesctp; + int err; + + if (skb->len < ((iph->ihl << 2) + 8)) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + + /* Fix up skb to look at the embedded net header. */ + saveip = skb->nh.raw; + savesctp = skb->h.raw; + skb->nh.iph = iph; + skb->h.raw = (char *)sh; + sk = sctp_err_lookup(AF_INET, skb, sh, &ep, &asoc, &transport); + /* Put back, the original pointers. */ + skb->nh.raw = saveip; + skb->h.raw = savesctp; + if (!sk) { + ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); + return; + } + /* Warning: The sock lock is held. Remember to call + * sctp_err_finish! + */ + + switch (type) { + case ICMP_PARAMETERPROB: + err = EPROTO; + break; + case ICMP_DEST_UNREACH: + if (code > NR_ICMP_UNREACH) + goto out_unlock; + + /* PMTU discovery (RFC1191) */ + if (ICMP_FRAG_NEEDED == code) { + sctp_icmp_frag_needed(sk, asoc, transport, info); + goto out_unlock; + } + else { + if (ICMP_PROT_UNREACH == code) { + sctp_icmp_proto_unreachable(sk, ep, asoc, + transport); + goto out_unlock; + } + } + err = icmp_err_convert[code].errno; + break; + case ICMP_TIME_EXCEEDED: + /* Ignore any time exceeded errors due to fragment reassembly + * timeouts. + */ + if (ICMP_EXC_FRAGTIME == code) + goto out_unlock; + + err = EHOSTUNREACH; + break; + default: + goto out_unlock; + } + + inet = inet_sk(sk); + if (!sock_owned_by_user(sk) && inet->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out_unlock: + sctp_err_finish(sk, ep, asoc); +} + +/* + * RFC 2960, 8.4 - Handle "Out of the blue" Packets. + * + * This function scans all the chunks in the OOTB packet to determine if + * the packet should be discarded right away. If a response might be needed + * for this packet, or, if further processing is possible, the packet will + * be queued to a proper inqueue for the next phase of handling. + * + * Output: + * Return 0 - If further processing is needed. 
+ * Return 1 - If the packet can be discarded right away. + */ +int sctp_rcv_ootb(struct sk_buff *skb) +{ + sctp_chunkhdr_t *ch; + __u8 *ch_end; + sctp_errhdr_t *err; + + ch = (sctp_chunkhdr_t *) skb->data; + ch_end = ((__u8 *) ch) + WORD_ROUND(ntohs(ch->length)); + + /* Scan through all the chunks in the packet. */ + while (ch_end > (__u8 *)ch && ch_end < skb->tail) { + + /* RFC 8.4, 2) If the OOTB packet contains an ABORT chunk, the + * receiver MUST silently discard the OOTB packet and take no + * further action. + */ + if (SCTP_CID_ABORT == ch->type) + goto discard; + + /* RFC 8.4, 6) If the packet contains a SHUTDOWN COMPLETE + * chunk, the receiver should silently discard the packet + * and take no further action. + */ + if (SCTP_CID_SHUTDOWN_COMPLETE == ch->type) + goto discard; + + /* RFC 8.4, 7) If the packet contains a "Stale cookie" ERROR + * or a COOKIE ACK the SCTP Packet should be silently + * discarded. + */ + if (SCTP_CID_COOKIE_ACK == ch->type) + goto discard; + + if (SCTP_CID_ERROR == ch->type) { + sctp_walk_errors(err, ch) { + if (SCTP_ERROR_STALE_COOKIE == err->cause) + goto discard; + } + } + + ch = (sctp_chunkhdr_t *) ch_end; + ch_end = ((__u8 *) ch) + WORD_ROUND(ntohs(ch->length)); + } + + return 0; + +discard: + return 1; +} + +/* Insert endpoint into the hash table. */ +static void __sctp_hash_endpoint(struct sctp_endpoint *ep) +{ + struct sctp_ep_common **epp; + struct sctp_ep_common *epb; + struct sctp_hashbucket *head; + + epb = &ep->base; + + epb->hashent = sctp_ep_hashfn(epb->bind_addr.port); + head = &sctp_ep_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + epp = &head->chain; + epb->next = *epp; + if (epb->next) + (*epp)->pprev = &epb->next; + *epp = epb; + epb->pprev = epp; + sctp_write_unlock(&head->lock); +} + +/* Add an endpoint to the hash. Local BH-safe. */ +void sctp_hash_endpoint(struct sctp_endpoint *ep) +{ + sctp_local_bh_disable(); + __sctp_hash_endpoint(ep); + sctp_local_bh_enable(); +} + +/* Remove endpoint from the hash table. */ +static void __sctp_unhash_endpoint(struct sctp_endpoint *ep) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + + epb = &ep->base; + + epb->hashent = sctp_ep_hashfn(epb->bind_addr.port); + + head = &sctp_ep_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + + if (epb->pprev) { + if (epb->next) + epb->next->pprev = epb->pprev; + *epb->pprev = epb->next; + epb->pprev = NULL; + } + + sctp_write_unlock(&head->lock); +} + +/* Remove endpoint from the hash. Local BH-safe. */ +void sctp_unhash_endpoint(struct sctp_endpoint *ep) +{ + sctp_local_bh_disable(); + __sctp_unhash_endpoint(ep); + sctp_local_bh_enable(); +} + +/* Look up an endpoint. */ +static struct sctp_endpoint *__sctp_rcv_lookup_endpoint(const union sctp_addr *laddr) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_endpoint *ep; + int hash; + + hash = sctp_ep_hashfn(laddr->v4.sin_port); + head = &sctp_ep_hashtable[hash]; + read_lock(&head->lock); + for (epb = head->chain; epb; epb = epb->next) { + ep = sctp_ep(epb); + if (sctp_endpoint_is_match(ep, laddr)) + goto hit; + } + + ep = sctp_sk((sctp_get_ctl_sock()))->ep; + epb = &ep->base; + +hit: + sctp_endpoint_hold(ep); + sock_hold(epb->sk); + read_unlock(&head->lock); + return ep; +} + +/* Insert association into the hash table. 
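Both the endpoint hash above and the association hash below chain entries through a next pointer plus a pprev "address of whatever points at me" pointer, which is what lets __sctp_unhash_endpoint()/__sctp_unhash_established() unlink an entry without walking its bucket. Here is a self-contained sketch of that insert/remove idiom with hypothetical node/bucket types, not the kernel's sctp_ep_common or hash bucket structures.

#include <stddef.h>
#include <stdio.h>

struct node {
	struct node *next;
	struct node **pprev;	/* address of the pointer that points at us */
	int port;
};

struct bucket { struct node *chain; };

static void chain_insert(struct bucket *head, struct node *n)
{
	n->next = head->chain;
	if (n->next)
		n->next->pprev = &n->next;
	head->chain = n;
	n->pprev = &head->chain;
}

static void chain_remove(struct node *n)
{
	if (!n->pprev)
		return;			/* not hashed */
	if (n->next)
		n->next->pprev = n->pprev;
	*n->pprev = n->next;
	n->pprev = NULL;
}

int main(void)
{
	struct bucket b = { NULL };
	struct node a1 = { .port = 9899 }, a2 = { .port = 9900 };

	chain_insert(&b, &a1);
	chain_insert(&b, &a2);
	chain_remove(&a2);		/* unlinks without scanning the bucket */
	for (struct node *n = b.chain; n; n = n->next)
		printf("port %d\n", n->port);
	return 0;
}

The payoff of the pprev form is that removal is O(1) even from the head of the chain, since the head pointer itself is reachable through pprev.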
*/ +static void __sctp_hash_established(struct sctp_association *asoc) +{ + struct sctp_ep_common **epp; + struct sctp_ep_common *epb; + struct sctp_hashbucket *head; + + epb = &asoc->base; + + /* Calculate which chain this entry will belong to. */ + epb->hashent = sctp_assoc_hashfn(epb->bind_addr.port, asoc->peer.port); + + head = &sctp_assoc_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + epp = &head->chain; + epb->next = *epp; + if (epb->next) + (*epp)->pprev = &epb->next; + *epp = epb; + epb->pprev = epp; + sctp_write_unlock(&head->lock); +} + +/* Add an association to the hash. Local BH-safe. */ +void sctp_hash_established(struct sctp_association *asoc) +{ + sctp_local_bh_disable(); + __sctp_hash_established(asoc); + sctp_local_bh_enable(); +} + +/* Remove association from the hash table. */ +static void __sctp_unhash_established(struct sctp_association *asoc) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + + epb = &asoc->base; + + epb->hashent = sctp_assoc_hashfn(epb->bind_addr.port, + asoc->peer.port); + + head = &sctp_assoc_hashtable[epb->hashent]; + + sctp_write_lock(&head->lock); + + if (epb->pprev) { + if (epb->next) + epb->next->pprev = epb->pprev; + *epb->pprev = epb->next; + epb->pprev = NULL; + } + + sctp_write_unlock(&head->lock); +} + +/* Remove association from the hash table. Local BH-safe. */ +void sctp_unhash_established(struct sctp_association *asoc) +{ + sctp_local_bh_disable(); + __sctp_unhash_established(asoc); + sctp_local_bh_enable(); +} + +/* Look up an association. */ +static struct sctp_association *__sctp_lookup_association( + const union sctp_addr *local, + const union sctp_addr *peer, + struct sctp_transport **pt) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_association *asoc; + struct sctp_transport *transport; + int hash; + + /* Optimize here for direct hit, only listening connections can + * have wildcards anyways. + */ + hash = sctp_assoc_hashfn(local->v4.sin_port, peer->v4.sin_port); + head = &sctp_assoc_hashtable[hash]; + read_lock(&head->lock); + for (epb = head->chain; epb; epb = epb->next) { + asoc = sctp_assoc(epb); + transport = sctp_assoc_is_match(asoc, local, peer); + if (transport) + goto hit; + } + + read_unlock(&head->lock); + + return NULL; + +hit: + *pt = transport; + sctp_association_hold(asoc); + sock_hold(epb->sk); + read_unlock(&head->lock); + return asoc; +} + +/* Look up an association. BH-safe. */ +SCTP_STATIC +struct sctp_association *sctp_lookup_association(const union sctp_addr *laddr, + const union sctp_addr *paddr, + struct sctp_transport **transportp) +{ + struct sctp_association *asoc; + + sctp_local_bh_disable(); + asoc = __sctp_lookup_association(laddr, paddr, transportp); + sctp_local_bh_enable(); + + return asoc; +} + +/* Is there an association matching the given local and peer addresses? */ +int sctp_has_association(const union sctp_addr *laddr, + const union sctp_addr *paddr) +{ + struct sctp_association *asoc; + struct sctp_transport *transport; + + if ((asoc = sctp_lookup_association(laddr, paddr, &transport))) { + sock_put(asoc->base.sk); + sctp_association_put(asoc); + return 1; + } + + return 0; +} + +/* + * SCTP Implementors Guide, 2.18 Handling of address + * parameters within the INIT or INIT-ACK. 
+ * + * D) When searching for a matching TCB upon reception of an INIT + * or INIT-ACK chunk the receiver SHOULD use not only the + * source address of the packet (containing the INIT or + * INIT-ACK) but the receiver SHOULD also use all valid + * address parameters contained within the chunk. + * + * 2.18.3 Solution description + * + * This new text clearly specifies to an implementor the need + * to look within the INIT or INIT-ACK. Any implementation that + * does not do this, may not be able to establish associations + * in certain circumstances. + * + */ +static struct sctp_association *__sctp_rcv_init_lookup(struct sk_buff *skb, + const union sctp_addr *laddr, struct sctp_transport **transportp) +{ + struct sctp_association *asoc; + union sctp_addr addr; + union sctp_addr *paddr = &addr; + struct sctphdr *sh = (struct sctphdr *) skb->h.raw; + sctp_chunkhdr_t *ch; + union sctp_params params; + sctp_init_chunk_t *init; + struct sctp_transport *transport; + struct sctp_af *af; + + ch = (sctp_chunkhdr_t *) skb->data; + + /* If this is INIT/INIT-ACK look inside the chunk too. */ + switch (ch->type) { + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + break; + default: + return NULL; + } + + /* The code below will attempt to walk the chunk and extract + * parameter information. Before we do that, we need to verify + * that the chunk length doesn't cause overflow. Otherwise, we'll + * walk off the end. + */ + if (WORD_ROUND(ntohs(ch->length)) > skb->len) + return NULL; + + /* + * This code will NOT touch anything inside the chunk--it is + * strictly READ-ONLY. + * + * RFC 2960 3 SCTP packet Format + * + * Multiple chunks can be bundled into one SCTP packet up to + * the MTU size, except for the INIT, INIT ACK, and SHUTDOWN + * COMPLETE chunks. These chunks MUST NOT be bundled with any + * other chunk in a packet. See Section 6.10 for more details + * on chunk bundling. + */ + + /* Find the start of the TLVs and the end of the chunk. This is + * the region we search for address parameters. + */ + init = (sctp_init_chunk_t *)skb->data; + + /* Walk the parameters looking for embedded addresses. */ + sctp_walk_params(params, init, init_hdr.params) { + + /* Note: Ignoring hostname addresses. */ + af = sctp_get_af_specific(param_type2af(params.p->type)); + if (!af) + continue; + + af->from_addr_param(paddr, params.addr, ntohs(sh->source), 0); + + asoc = __sctp_lookup_association(laddr, paddr, &transport); + if (asoc) + return asoc; + } + + return NULL; +} + +/* Lookup an association for an inbound skb. */ +static struct sctp_association *__sctp_rcv_lookup(struct sk_buff *skb, + const union sctp_addr *paddr, + const union sctp_addr *laddr, + struct sctp_transport **transportp) +{ + struct sctp_association *asoc; + + asoc = __sctp_lookup_association(laddr, paddr, transportp); + + /* Further lookup for INIT/INIT-ACK packets. + * SCTP Implementors Guide, 2.18 Handling of address + * parameters within the INIT or INIT-ACK. + */ + if (!asoc) + asoc = __sctp_rcv_init_lookup(skb, laddr, transportp); + + return asoc; +} diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c new file mode 100644 index 000000000000..cedf4351556c --- /dev/null +++ b/net/sctp/inqueue.c @@ -0,0 +1,204 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2002 International Business Machines, Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions are the methods for accessing the SCTP inqueue. 
+ * + * An SCTP inqueue is a queue into which you push SCTP packets + * (which might be bundles or fragments of chunks) and out of which you + * pop SCTP whole chunks. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +/* Initialize an SCTP inqueue. */ +void sctp_inq_init(struct sctp_inq *queue) +{ + skb_queue_head_init(&queue->in); + queue->in_progress = NULL; + + /* Create a task for delivering data. */ + INIT_WORK(&queue->immediate, NULL, NULL); + + queue->malloced = 0; +} + +/* Release the memory associated with an SCTP inqueue. */ +void sctp_inq_free(struct sctp_inq *queue) +{ + struct sctp_chunk *chunk; + + /* Empty the queue. */ + while ((chunk = (struct sctp_chunk *) skb_dequeue(&queue->in)) != NULL) + sctp_chunk_free(chunk); + + /* If there is a packet which is currently being worked on, + * free it as well. + */ + if (queue->in_progress) + sctp_chunk_free(queue->in_progress); + + if (queue->malloced) { + /* Dump the master memory segment. */ + kfree(queue); + } +} + +/* Put a new packet in an SCTP inqueue. + * We assume that packet->sctp_hdr is set and in host byte order. + */ +void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *packet) +{ + /* Directly call the packet handling routine. */ + + /* We are now calling this either from the soft interrupt + * or from the backlog processing. + * Eventually, we should clean up inqueue to not rely + * on the BH related data structures. + */ + skb_queue_tail(&(q->in), (struct sk_buff *) packet); + q->immediate.func(q->immediate.data); +} + +/* Extract a chunk from an SCTP inqueue. + * + * WARNING: If you need to put the chunk on another queue, you need to + * make a shallow copy (clone) of it. + */ +struct sctp_chunk *sctp_inq_pop(struct sctp_inq *queue) +{ + struct sctp_chunk *chunk; + sctp_chunkhdr_t *ch = NULL; + + /* The assumption is that we are safe to process the chunks + * at this time. + */ + + if ((chunk = queue->in_progress)) { + /* There is a packet that we have been working on. + * Any post processing work to do before we move on? + */ + if (chunk->singleton || + chunk->end_of_packet || + chunk->pdiscard) { + sctp_chunk_free(chunk); + chunk = queue->in_progress = NULL; + } else { + /* Nothing to do. Next chunk in the packet, please. */ + ch = (sctp_chunkhdr_t *) chunk->chunk_end; + + /* Force chunk->skb->data to chunk->chunk_end. 
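sctp_inq_pop() here, like the OOTB scan in sctp_rcv_ootb() earlier, steps through chunk headers whose on-the-wire length is rounded up to a 4-byte boundary (WORD_ROUND), and treats a chunk whose declared end runs past the packet tail as a partial chunk to be dropped. A small user-space sketch of that walk, using a toy chunk header rather than the kernel's sctp_chunkhdr_t:

#include <arpa/inet.h>	/* ntohs */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define WORD_ROUND(len) (((len) + 3u) & ~3u)

struct chunkhdr { uint8_t type; uint8_t flags; uint16_t length; /* network order */ };

/* Walk every complete chunk in a packet buffer; stop at a partial or bogus one. */
static void walk_chunks(const uint8_t *pkt, size_t pkt_len)
{
	size_t off = 0;

	while (off + sizeof(struct chunkhdr) <= pkt_len) {
		struct chunkhdr ch;
		memcpy(&ch, pkt + off, sizeof(ch));
		uint16_t len = ntohs(ch.length);

		if (len < sizeof(struct chunkhdr) || off + len > pkt_len) {
			printf("partial or bogus chunk at %zu, dropping\n", off);
			return;
		}
		printf("chunk type %u, length %u (rounded %u)\n",
		       ch.type, len, WORD_ROUND(len));
		off += WORD_ROUND(len);
	}
}

int main(void)
{
	/* Two chunks: type 6 (ABORT) length 4, type 14 (SHUTDOWN COMPLETE) length 4. */
	uint8_t pkt[] = { 6, 0, 0, 4,  14, 0, 0, 4 };

	walk_chunks(pkt, sizeof(pkt));
	return 0;
}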
*/ + skb_pull(chunk->skb, + chunk->chunk_end - chunk->skb->data); + } + } + + /* Do we need to take the next packet out of the queue to process? */ + if (!chunk) { + /* Is the queue empty? */ + if (skb_queue_empty(&queue->in)) + return NULL; + + chunk = queue->in_progress = + (struct sctp_chunk *) skb_dequeue(&queue->in); + + /* This is the first chunk in the packet. */ + chunk->singleton = 1; + ch = (sctp_chunkhdr_t *) chunk->skb->data; + } + + chunk->chunk_hdr = ch; + chunk->chunk_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + /* In the unlikely case of an IP reassembly, the skb could be + * non-linear. If so, update chunk_end so that it doesn't go past + * the skb->tail. + */ + if (unlikely(skb_is_nonlinear(chunk->skb))) { + if (chunk->chunk_end > chunk->skb->tail) + chunk->chunk_end = chunk->skb->tail; + } + skb_pull(chunk->skb, sizeof(sctp_chunkhdr_t)); + chunk->subh.v = NULL; /* Subheader is no longer valid. */ + + if (chunk->chunk_end < chunk->skb->tail) { + /* This is not a singleton */ + chunk->singleton = 0; + } else if (chunk->chunk_end > chunk->skb->tail) { + /* RFC 2960, Section 6.10 Bundling + * + * Partial chunks MUST NOT be placed in an SCTP packet. + * If the receiver detects a partial chunk, it MUST drop + * the chunk. + * + * Since the end of the chunk is past the end of our buffer + * (which contains the whole packet, we can freely discard + * the whole packet. + */ + sctp_chunk_free(chunk); + chunk = queue->in_progress = NULL; + + return NULL; + } else { + /* We are at the end of the packet, so mark the chunk + * in case we need to send a SACK. + */ + chunk->end_of_packet = 1; + } + + SCTP_DEBUG_PRINTK("+++sctp_inq_pop+++ chunk %p[%s]," + " length %d, skb->len %d\n",chunk, + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)), + ntohs(chunk->chunk_hdr->length), chunk->skb->len); + return chunk; +} + +/* Set a top-half handler. + * + * Originally, we the top-half handler was scheduled as a BH. We now + * call the handler directly in sctp_inq_push() at a time that + * we know we are lock safe. + * The intent is that this routine will pull stuff out of the + * inqueue and process it. + */ +void sctp_inq_set_th_handler(struct sctp_inq *q, + void (*callback)(void *), void *arg) +{ + INIT_WORK(&q->immediate, callback, arg); +} + diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c new file mode 100644 index 000000000000..e42c74e3ec1e --- /dev/null +++ b/net/sctp/ipv6.c @@ -0,0 +1,1013 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2002, 2004 + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * Copyright (c) 2002-2003 Intel Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * SCTP over IPv6. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Le Yanqun + * Hui Huang + * La Monte H.P. Yarroll + * Sridhar Samudrala + * Jon Grimm + * Ardelle Fan + * + * Based on: + * linux/net/ipv6/tcp_ipv6.c + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +extern int sctp_inetaddr_event(struct notifier_block *, unsigned long, void *); +static struct notifier_block sctp_inet6addr_notifier = { + .notifier_call = sctp_inetaddr_event, +}; + +/* ICMP error handler. */ +SCTP_STATIC void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct inet6_dev *idev; + struct ipv6hdr *iph = (struct ipv6hdr *)skb->data; + struct sctphdr *sh = (struct sctphdr *)(skb->data + offset); + struct sock *sk; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_transport *transport; + struct ipv6_pinfo *np; + char *saveip, *savesctp; + int err; + + idev = in6_dev_get(skb->dev); + + /* Fix up skb to look at the embedded net header. */ + saveip = skb->nh.raw; + savesctp = skb->h.raw; + skb->nh.ipv6h = iph; + skb->h.raw = (char *)sh; + sk = sctp_err_lookup(AF_INET6, skb, sh, &ep, &asoc, &transport); + /* Put back, the original pointers. */ + skb->nh.raw = saveip; + skb->h.raw = savesctp; + if (!sk) { + ICMP6_INC_STATS_BH(idev, ICMP6_MIB_INERRORS); + goto out; + } + + /* Warning: The sock lock is held. Remember to call + * sctp_err_finish! + */ + + switch (type) { + case ICMPV6_PKT_TOOBIG: + sctp_icmp_frag_needed(sk, asoc, transport, ntohl(info)); + goto out_unlock; + case ICMPV6_PARAMPROB: + if (ICMPV6_UNK_NEXTHDR == code) { + sctp_icmp_proto_unreachable(sk, ep, asoc, transport); + goto out_unlock; + } + break; + default: + break; + } + + np = inet6_sk(sk); + icmpv6_err_convert(type, code, &err); + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else { /* Only an error on timeout */ + sk->sk_err_soft = err; + } + +out_unlock: + sctp_err_finish(sk, ep, asoc); +out: + if (likely(idev != NULL)) + in6_dev_put(idev); +} + +/* Based on tcp_v6_xmit() in tcp_ipv6.c. */ +static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport, + int ipfragok) +{ + struct sock *sk = skb->sk; + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + + fl.proto = sk->sk_protocol; + + /* Fill in the dest address from the route entry passed with the skb + * and the source address from the transport. 
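The IPv6 transmit path in this file has to pick the outgoing interface explicitly when the source address is link-local, because a link-local address is only meaningful together with a scope (interface) id; sctp_v6_xmit() below does this by copying the transport's sin6_scope_id into the flow's oif, falling back to the socket's bound device otherwise. A minimal sketch of just that decision using the standard sockaddr_in6 fields (the flowi plumbing around it is kernel-internal and not reproduced):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/* Outgoing interface index: the link-local scope id if the source needs one,
 * otherwise whatever the socket is bound to (0 meaning "any").
 */
static unsigned int pick_oif(const struct sockaddr_in6 *src,
			     unsigned int bound_ifindex)
{
	if (IN6_IS_ADDR_LINKLOCAL(&src->sin6_addr))
		return src->sin6_scope_id;
	return bound_ifindex;
}

int main(void)
{
	struct sockaddr_in6 src;

	memset(&src, 0, sizeof(src));
	src.sin6_family = AF_INET6;
	src.sin6_scope_id = 2;			/* e.g. an interface index */
	inet_pton(AF_INET6, "fe80::1", &src.sin6_addr);
	printf("oif=%u\n", pick_oif(&src, 0));	/* prints 2 */
	return 0;
}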
+ */ + ipv6_addr_copy(&fl.fl6_dst, &transport->ipaddr.v6.sin6_addr); + ipv6_addr_copy(&fl.fl6_src, &transport->saddr.v6.sin6_addr); + + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + if (ipv6_addr_type(&fl.fl6_src) & IPV6_ADDR_LINKLOCAL) + fl.oif = transport->saddr.v6.sin6_scope_id; + else + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet_sk(sk)->sport; + fl.fl_ip_dport = transport->ipaddr.v6.sin6_port; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + } + + SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, " + "src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " + "dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + __FUNCTION__, skb, skb->len, + NIP6(fl.fl6_src), NIP6(fl.fl6_dst)); + + SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS); + + return ip6_xmit(sk, skb, &fl, np->opt, ipfragok); +} + +/* Returns the dst cache entry for the given source and destination ip + * addresses. + */ +static struct dst_entry *sctp_v6_get_dst(struct sctp_association *asoc, + union sctp_addr *daddr, + union sctp_addr *saddr) +{ + struct dst_entry *dst; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &daddr->v6.sin6_addr); + if (ipv6_addr_type(&daddr->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) + fl.oif = daddr->v6.sin6_scope_id; + + + SCTP_DEBUG_PRINTK("%s: DST=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", + __FUNCTION__, NIP6(fl.fl6_dst)); + + if (saddr) { + ipv6_addr_copy(&fl.fl6_src, &saddr->v6.sin6_addr); + SCTP_DEBUG_PRINTK( + "SRC=%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x - ", + NIP6(fl.fl6_src)); + } + + dst = ip6_route_output(NULL, &fl); + if (dst) { + struct rt6_info *rt; + rt = (struct rt6_info *)dst; + SCTP_DEBUG_PRINTK( + "rt6_dst:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x " + "rt6_src:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + NIP6(rt->rt6i_dst.addr), NIP6(rt->rt6i_src.addr)); + } else { + SCTP_DEBUG_PRINTK("NO ROUTE\n"); + } + + return dst; +} + +/* Returns the number of consecutive initial bits that match in the 2 ipv6 + * addresses. + */ +static inline int sctp_v6_addr_match_len(union sctp_addr *s1, + union sctp_addr *s2) +{ + struct in6_addr *a1 = &s1->v6.sin6_addr; + struct in6_addr *a2 = &s2->v6.sin6_addr; + int i, j; + + for (i = 0; i < 4 ; i++) { + __u32 a1xora2; + + a1xora2 = a1->s6_addr32[i] ^ a2->s6_addr32[i]; + + if ((j = fls(ntohl(a1xora2)))) + return (i * 32 + 32 - j); + } + + return (i*32); +} + +/* Fills in the source address(saddr) based on the destination address(daddr) + * and asoc's bind address list. + */ +static void sctp_v6_get_saddr(struct sctp_association *asoc, + struct dst_entry *dst, + union sctp_addr *daddr, + union sctp_addr *saddr) +{ + struct sctp_bind_addr *bp; + rwlock_t *addr_lock; + struct sctp_sockaddr_entry *laddr; + struct list_head *pos; + sctp_scope_t scope; + union sctp_addr *baddr = NULL; + __u8 matchlen = 0; + __u8 bmatchlen; + + SCTP_DEBUG_PRINTK("%s: asoc:%p dst:%p " + "daddr:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", + __FUNCTION__, asoc, dst, NIP6(daddr->v6.sin6_addr)); + + if (!asoc) { + ipv6_get_saddr(dst, &daddr->v6.sin6_addr,&saddr->v6.sin6_addr); + SCTP_DEBUG_PRINTK("saddr from ipv6_get_saddr: " + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + NIP6(saddr->v6.sin6_addr)); + return; + } + + scope = sctp_scope(daddr); + + bp = &asoc->base.bind_addr; + addr_lock = &asoc->base.addr_lock; + + /* Go through the bind address list and find the best source address + * that matches the scope of the destination address. 
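The source-selection loop here prefers, among bind addresses of an acceptable scope, the one sharing the longest leading prefix with the destination; sctp_v6_addr_match_len() above computes that common prefix length. A portable sketch of the same computation, done byte-wise over struct in6_addr instead of the kernel's word-wise fls() trick, with all function names invented for the sketch:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <stdio.h>

/* Number of leading bits two IPv6 addresses have in common (0..128). */
static int addr_match_len(const struct in6_addr *a1, const struct in6_addr *a2)
{
	for (int i = 0; i < 16; i++) {
		uint8_t x = a1->s6_addr[i] ^ a2->s6_addr[i];

		if (x) {
			int bits = 0;
			while (!(x & 0x80)) {	/* leading equal bits in this byte */
				x <<= 1;
				bits++;
			}
			return i * 8 + bits;
		}
	}
	return 128;
}

int main(void)
{
	struct in6_addr dst, cand1, cand2;

	inet_pton(AF_INET6, "2001:db8:1::10", &dst);
	inet_pton(AF_INET6, "2001:db8:1::1", &cand1);	/* shares a long prefix */
	inet_pton(AF_INET6, "2001:db8:2::1", &cand2);
	printf("cand1 match %d bits, cand2 match %d bits\n",
	       addr_match_len(&dst, &cand1), addr_match_len(&dst, &cand2));
	return 0;
}

The candidate with the larger match length would be chosen as the source address, which is exactly the role baddr/matchlen play in the loop above.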
+ */ + sctp_read_lock(addr_lock); + list_for_each(pos, &bp->address_list) { + laddr = list_entry(pos, struct sctp_sockaddr_entry, list); + if ((laddr->a.sa.sa_family == AF_INET6) && + (scope <= sctp_scope(&laddr->a))) { + bmatchlen = sctp_v6_addr_match_len(daddr, &laddr->a); + if (!baddr || (matchlen < bmatchlen)) { + baddr = &laddr->a; + matchlen = bmatchlen; + } + } + } + + if (baddr) { + memcpy(saddr, baddr, sizeof(union sctp_addr)); + SCTP_DEBUG_PRINTK("saddr: " + "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + NIP6(saddr->v6.sin6_addr)); + } else { + printk(KERN_ERR "%s: asoc:%p Could not find a valid source " + "address for the " + "dest:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n", + __FUNCTION__, asoc, NIP6(daddr->v6.sin6_addr)); + } + + sctp_read_unlock(addr_lock); +} + +/* Make a copy of all potential local addresses. */ +static void sctp_v6_copy_addrlist(struct list_head *addrlist, + struct net_device *dev) +{ + struct inet6_dev *in6_dev; + struct inet6_ifaddr *ifp; + struct sctp_sockaddr_entry *addr; + + read_lock(&addrconf_lock); + if ((in6_dev = __in6_dev_get(dev)) == NULL) { + read_unlock(&addrconf_lock); + return; + } + + read_lock(&in6_dev->lock); + for (ifp = in6_dev->addr_list; ifp; ifp = ifp->if_next) { + /* Add the address to the local list. */ + addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC); + if (addr) { + addr->a.v6.sin6_family = AF_INET6; + addr->a.v6.sin6_port = 0; + addr->a.v6.sin6_addr = ifp->addr; + addr->a.v6.sin6_scope_id = dev->ifindex; + INIT_LIST_HEAD(&addr->list); + list_add_tail(&addr->list, addrlist); + } + } + + read_unlock(&in6_dev->lock); + read_unlock(&addrconf_lock); +} + +/* Initialize a sockaddr_storage from in incoming skb. */ +static void sctp_v6_from_skb(union sctp_addr *addr,struct sk_buff *skb, + int is_saddr) +{ + void *from; + __u16 *port; + struct sctphdr *sh; + + port = &addr->v6.sin6_port; + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_flowinfo = 0; /* FIXME */ + addr->v6.sin6_scope_id = ((struct inet6_skb_parm *)skb->cb)->iif; + + sh = (struct sctphdr *) skb->h.raw; + if (is_saddr) { + *port = ntohs(sh->source); + from = &skb->nh.ipv6h->saddr; + } else { + *port = ntohs(sh->dest); + from = &skb->nh.ipv6h->daddr; + } + ipv6_addr_copy(&addr->v6.sin6_addr, from); +} + +/* Initialize an sctp_addr from a socket. */ +static void sctp_v6_from_sk(union sctp_addr *addr, struct sock *sk) +{ + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = inet_sk(sk)->num; + addr->v6.sin6_addr = inet6_sk(sk)->rcv_saddr; +} + +/* Initialize sk->sk_rcv_saddr from sctp_addr. */ +static void sctp_v6_to_sk_saddr(union sctp_addr *addr, struct sock *sk) +{ + if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) { + inet6_sk(sk)->rcv_saddr.s6_addr32[0] = 0; + inet6_sk(sk)->rcv_saddr.s6_addr32[1] = 0; + inet6_sk(sk)->rcv_saddr.s6_addr32[2] = htonl(0x0000ffff); + inet6_sk(sk)->rcv_saddr.s6_addr32[3] = + addr->v4.sin_addr.s_addr; + } else { + inet6_sk(sk)->rcv_saddr = addr->v6.sin6_addr; + } +} + +/* Initialize sk->sk_daddr from sctp_addr. */ +static void sctp_v6_to_sk_daddr(union sctp_addr *addr, struct sock *sk) +{ + if (addr->sa.sa_family == AF_INET && sctp_sk(sk)->v4mapped) { + inet6_sk(sk)->daddr.s6_addr32[0] = 0; + inet6_sk(sk)->daddr.s6_addr32[1] = 0; + inet6_sk(sk)->daddr.s6_addr32[2] = htonl(0x0000ffff); + inet6_sk(sk)->daddr.s6_addr32[3] = addr->v4.sin_addr.s_addr; + } else { + inet6_sk(sk)->daddr = addr->v6.sin6_addr; + } +} + +/* Initialize a sctp_addr from an address parameter. 
*/ +static void sctp_v6_from_addr_param(union sctp_addr *addr, + union sctp_addr_param *param, + __u16 port, int iif) +{ + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = port; + addr->v6.sin6_flowinfo = 0; /* BUG */ + ipv6_addr_copy(&addr->v6.sin6_addr, ¶m->v6.addr); + addr->v6.sin6_scope_id = iif; +} + +/* Initialize an address parameter from a sctp_addr and return the length + * of the address parameter. + */ +static int sctp_v6_to_addr_param(const union sctp_addr *addr, + union sctp_addr_param *param) +{ + int length = sizeof(sctp_ipv6addr_param_t); + + param->v6.param_hdr.type = SCTP_PARAM_IPV6_ADDRESS; + param->v6.param_hdr.length = ntohs(length); + ipv6_addr_copy(¶m->v6.addr, &addr->v6.sin6_addr); + + return length; +} + +/* Initialize a sctp_addr from a dst_entry. */ +static void sctp_v6_dst_saddr(union sctp_addr *addr, struct dst_entry *dst, + unsigned short port) +{ + struct rt6_info *rt = (struct rt6_info *)dst; + addr->sa.sa_family = AF_INET6; + addr->v6.sin6_port = port; + ipv6_addr_copy(&addr->v6.sin6_addr, &rt->rt6i_src.addr); +} + +/* Compare addresses exactly. + * v4-mapped-v6 is also in consideration. + */ +static int sctp_v6_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2) +{ + if (addr1->sa.sa_family != addr2->sa.sa_family) { + if (addr1->sa.sa_family == AF_INET && + addr2->sa.sa_family == AF_INET6 && + IPV6_ADDR_MAPPED == ipv6_addr_type(&addr2->v6.sin6_addr)) { + if (addr2->v6.sin6_port == addr1->v4.sin_port && + addr2->v6.sin6_addr.s6_addr32[3] == + addr1->v4.sin_addr.s_addr) + return 1; + } + if (addr2->sa.sa_family == AF_INET && + addr1->sa.sa_family == AF_INET6 && + IPV6_ADDR_MAPPED == ipv6_addr_type(&addr1->v6.sin6_addr)) { + if (addr1->v6.sin6_port == addr2->v4.sin_port && + addr1->v6.sin6_addr.s6_addr32[3] == + addr2->v4.sin_addr.s_addr) + return 1; + } + return 0; + } + if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr)) + return 0; + /* If this is a linklocal address, compare the scope_id. */ + if (ipv6_addr_type(&addr1->v6.sin6_addr) & IPV6_ADDR_LINKLOCAL) { + if (addr1->v6.sin6_scope_id && addr2->v6.sin6_scope_id && + (addr1->v6.sin6_scope_id != addr2->v6.sin6_scope_id)) { + return 0; + } + } + + return 1; +} + +/* Initialize addr struct to INADDR_ANY. */ +static void sctp_v6_inaddr_any(union sctp_addr *addr, unsigned short port) +{ + memset(addr, 0x00, sizeof(union sctp_addr)); + addr->v6.sin6_family = AF_INET6; + addr->v6.sin6_port = port; +} + +/* Is this a wildcard address? */ +static int sctp_v6_is_any(const union sctp_addr *addr) +{ + int type; + type = ipv6_addr_type((struct in6_addr *)&addr->v6.sin6_addr); + return IPV6_ADDR_ANY == type; +} + +/* Should this be available for binding? */ +static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) +{ + int type; + struct in6_addr *in6 = (struct in6_addr *)&addr->v6.sin6_addr; + + type = ipv6_addr_type(in6); + if (IPV6_ADDR_ANY == type) + return 1; + if (type == IPV6_ADDR_MAPPED) { + if (sp && !sp->v4mapped) + return 0; + if (sp && ipv6_only_sock(sctp_opt2sk(sp))) + return 0; + sctp_v6_map_v4(addr); + return sctp_get_af_specific(AF_INET)->available(addr, sp); + } + if (!(type & IPV6_ADDR_UNICAST)) + return 0; + + return ipv6_chk_addr(in6, NULL, 0); +} + +/* This function checks if the address is a valid address to be used for + * SCTP. + * + * Output: + * Return 0 - If the address is a non-unicast or an illegal address. + * Return 1 - If the address is a unicast. 
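sctp_v6_cmp_addr() above treats an AF_INET address and an AF_INET6 v4-mapped address (::ffff:a.b.c.d) as the same endpoint when the embedded IPv4 address and the port match. A small sketch of that equivalence check using the standard sockaddr types; the kernel compares its own union sctp_addr rather than these structures, and the helper name here is invented.

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

/* Does a v4-mapped IPv6 endpoint refer to the same host/port as an IPv4 one? */
static int v4mapped_equals_v4(const struct sockaddr_in6 *a6,
			      const struct sockaddr_in *a4)
{
	if (!IN6_IS_ADDR_V4MAPPED(&a6->sin6_addr))
		return 0;
	if (a6->sin6_port != a4->sin_port)
		return 0;
	/* The IPv4 address lives in the last four bytes of the mapped address. */
	return memcmp(&a6->sin6_addr.s6_addr[12], &a4->sin_addr, 4) == 0;
}

int main(void)
{
	struct sockaddr_in6 a6;
	struct sockaddr_in a4;

	memset(&a6, 0, sizeof(a6));
	memset(&a4, 0, sizeof(a4));
	a6.sin6_family = AF_INET6;
	a4.sin_family = AF_INET;
	a6.sin6_port = a4.sin_port = htons(9899);
	inet_pton(AF_INET6, "::ffff:192.0.2.7", &a6.sin6_addr);
	inet_pton(AF_INET, "192.0.2.7", &a4.sin_addr);
	printf("equal=%d\n", v4mapped_equals_v4(&a6, &a4));	/* prints 1 */
	return 0;
}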
+ */ +static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) +{ + int ret = ipv6_addr_type(&addr->v6.sin6_addr); + + /* Support v4-mapped-v6 address. */ + if (ret == IPV6_ADDR_MAPPED) { + /* Note: This routine is used in input, so v4-mapped-v6 + * are disallowed here when there is no sctp_sock. + */ + if (!sp || !sp->v4mapped) + return 0; + if (sp && ipv6_only_sock(sctp_opt2sk(sp))) + return 0; + sctp_v6_map_v4(addr); + return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp); + } + + /* Is this a non-unicast address */ + if (!(ret & IPV6_ADDR_UNICAST)) + return 0; + + return 1; +} + +/* What is the scope of 'addr'? */ +static sctp_scope_t sctp_v6_scope(union sctp_addr *addr) +{ + int v6scope; + sctp_scope_t retval; + + /* The IPv6 scope is really a set of bit fields. + * See IFA_* in . Map to a generic SCTP scope. + */ + + v6scope = ipv6_addr_scope(&addr->v6.sin6_addr); + switch (v6scope) { + case IFA_HOST: + retval = SCTP_SCOPE_LOOPBACK; + break; + case IFA_LINK: + retval = SCTP_SCOPE_LINK; + break; + case IFA_SITE: + retval = SCTP_SCOPE_PRIVATE; + break; + default: + retval = SCTP_SCOPE_GLOBAL; + break; + }; + + return retval; +} + +/* Create and initialize a new sk for the socket to be returned by accept(). */ +static struct sock *sctp_v6_create_accept_sk(struct sock *sk, + struct sctp_association *asoc) +{ + struct inet_sock *inet = inet_sk(sk); + struct sock *newsk; + struct inet_sock *newinet; + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct sctp6_sock *newsctp6sk; + + newsk = sk_alloc(PF_INET6, GFP_KERNEL, sk->sk_prot, 1); + if (!newsk) + goto out; + + sock_init_data(NULL, newsk); + + newsk->sk_type = SOCK_STREAM; + + newsk->sk_prot = sk->sk_prot; + newsk->sk_no_check = sk->sk_no_check; + newsk->sk_reuse = sk->sk_reuse; + + newsk->sk_destruct = inet_sock_destruct; + newsk->sk_family = PF_INET6; + newsk->sk_protocol = IPPROTO_SCTP; + newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + newsk->sk_shutdown = sk->sk_shutdown; + sock_reset_flag(sk, SOCK_ZAPPED); + + newsctp6sk = (struct sctp6_sock *)newsk; + inet_sk(newsk)->pinet6 = &newsctp6sk->inet6; + + newinet = inet_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + /* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname() + * and getpeername(). + */ + newinet->sport = inet->sport; + newnp->saddr = np->saddr; + newnp->rcv_saddr = np->rcv_saddr; + newinet->dport = htons(asoc->peer.port); + sctp_v6_to_sk_daddr(&asoc->peer.primary_addr, newsk); + + /* Init the ipv4 part of the socket since we can have sockets + * using v6 API for ipv4. + */ + newinet->uc_ttl = -1; + newinet->mc_loop = 1; + newinet->mc_ttl = 1; + newinet->mc_index = 0; + newinet->mc_list = NULL; + + if (ipv4_config.no_pmtu_disc) + newinet->pmtudisc = IP_PMTUDISC_DONT; + else + newinet->pmtudisc = IP_PMTUDISC_WANT; + +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet6_sock_nr); + atomic_inc(&inet_sock_nr); +#endif + + if (newsk->sk_prot->init(newsk)) { + sk_common_release(newsk); + newsk = NULL; + } + +out: + return newsk; +} + +/* Map v4 address to mapped v6 address */ +static void sctp_v6_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr) +{ + if (sp->v4mapped && AF_INET == addr->sa.sa_family) + sctp_v4_map_v6(addr); +} + +/* Where did this skb come from? */ +static int sctp_v6_skb_iif(const struct sk_buff *skb) +{ + struct inet6_skb_parm *opt = (struct inet6_skb_parm *) skb->cb; + return opt->iif; +} + +/* Was this packet marked by Explicit Congestion Notification? 
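sctp_v6_is_ce() below answers that question by masking the first 32-bit word of the IPv6 header, where the 8-bit traffic class sits between the version nibble and the 20-bit flow label; bit 20 of the host-order word is the low traffic-class bit, which this code treats as the congestion-experienced mark. A sketch of that bit extraction on a plain integer, assuming the word is supplied in network byte order (names invented for the sketch):

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

/* First 32 bits of an IPv6 header: version(4) | traffic class(8) | flow label(20). */
static int tclass_ce_bit(uint32_t first_word_net_order)
{
	uint32_t w = ntohl(first_word_net_order);
	uint8_t tclass = (w >> 20) & 0xff;

	return tclass & 0x01;		/* the bit sctp_v6_is_ce() tests */
}

int main(void)
{
	/* version 6, traffic class 0x01, flow label 0 */
	uint32_t word = htonl((6u << 28) | (0x01u << 20));

	printf("ce=%d\n", tclass_ce_bit(word));	/* prints 1 */
	return 0;
}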
*/ +static int sctp_v6_is_ce(const struct sk_buff *skb) +{ + return *((__u32 *)(skb->nh.ipv6h)) & htonl(1<<20); +} + +/* Dump the v6 addr to the seq file. */ +static void sctp_v6_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) +{ + seq_printf(seq, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x ", + NIP6(addr->v6.sin6_addr)); +} + +/* Initialize a PF_INET6 socket msg_name. */ +static void sctp_inet6_msgname(char *msgname, int *addr_len) +{ + struct sockaddr_in6 *sin6; + + sin6 = (struct sockaddr_in6 *)msgname; + sin6->sin6_family = AF_INET6; + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; /*FIXME */ + *addr_len = sizeof(struct sockaddr_in6); +} + +/* Initialize a PF_INET msgname from a ulpevent. */ +static void sctp_inet6_event_msgname(struct sctp_ulpevent *event, + char *msgname, int *addrlen) +{ + struct sockaddr_in6 *sin6, *sin6from; + + if (msgname) { + union sctp_addr *addr; + struct sctp_association *asoc; + + asoc = event->asoc; + sctp_inet6_msgname(msgname, addrlen); + sin6 = (struct sockaddr_in6 *)msgname; + sin6->sin6_port = htons(asoc->peer.port); + addr = &asoc->peer.primary_addr; + + /* Note: If we go to a common v6 format, this code + * will change. + */ + + /* Map ipv4 address into v4-mapped-on-v6 address. */ + if (sctp_sk(asoc->base.sk)->v4mapped && + AF_INET == addr->sa.sa_family) { + sctp_v4_map_v6((union sctp_addr *)sin6); + sin6->sin6_addr.s6_addr32[3] = + addr->v4.sin_addr.s_addr; + return; + } + + sin6from = &asoc->peer.primary_addr.v6; + ipv6_addr_copy(&sin6->sin6_addr, &sin6from->sin6_addr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sin6from->sin6_scope_id; + } +} + +/* Initialize a msg_name from an inbound skb. */ +static void sctp_inet6_skb_msgname(struct sk_buff *skb, char *msgname, + int *addr_len) +{ + struct sctphdr *sh; + struct sockaddr_in6 *sin6; + + if (msgname) { + sctp_inet6_msgname(msgname, addr_len); + sin6 = (struct sockaddr_in6 *)msgname; + sh = (struct sctphdr *)skb->h.raw; + sin6->sin6_port = sh->source; + + /* Map ipv4 address into v4-mapped-on-v6 address. */ + if (sctp_sk(skb->sk)->v4mapped && + skb->nh.iph->version == 4) { + sctp_v4_map_v6((union sctp_addr *)sin6); + sin6->sin6_addr.s6_addr32[3] = skb->nh.iph->saddr; + return; + } + + /* Otherwise, just copy the v6 address. */ + ipv6_addr_copy(&sin6->sin6_addr, &skb->nh.ipv6h->saddr); + if (ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) { + struct sctp_ulpevent *ev = sctp_skb2event(skb); + sin6->sin6_scope_id = ev->iif; + } + } +} + +/* Do we support this AF? */ +static int sctp_inet6_af_supported(sa_family_t family, struct sctp_sock *sp) +{ + switch (family) { + case AF_INET6: + return 1; + /* v4-mapped-v6 addresses */ + case AF_INET: + if (!__ipv6_only_sock(sctp_opt2sk(sp)) && sp->v4mapped) + return 1; + default: + return 0; + } +} + +/* Address matching with wildcards allowed. This extra level + * of indirection lets us choose whether a PF_INET6 should + * disallow any v4 addresses if we so choose. + */ +static int sctp_inet6_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2, + struct sctp_sock *opt) +{ + struct sctp_af *af1, *af2; + + af1 = sctp_get_af_specific(addr1->sa.sa_family); + af2 = sctp_get_af_specific(addr2->sa.sa_family); + + if (!af1 || !af2) + return 0; + /* Today, wildcard AF_INET/AF_INET6. 
*/ + if (sctp_is_any(addr1) || sctp_is_any(addr2)) + return 1; + + if (addr1->sa.sa_family != addr2->sa.sa_family) + return 0; + + return af1->cmp_addr(addr1, addr2); +} + +/* Verify that the provided sockaddr looks bindable. Common verification, + * has already been taken care of. + */ +static int sctp_inet6_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + struct sctp_af *af; + + /* ASSERT: address family has already been verified. */ + if (addr->sa.sa_family != AF_INET6) + af = sctp_get_af_specific(addr->sa.sa_family); + else { + struct sock *sk; + int type = ipv6_addr_type(&addr->v6.sin6_addr); + sk = sctp_opt2sk(opt); + if (type & IPV6_ADDR_LINKLOCAL) { + /* Note: Behavior similar to af_inet6.c: + * 1) Overrides previous bound_dev_if + * 2) Destructive even if bind isn't successful. + */ + + if (addr->v6.sin6_scope_id) + sk->sk_bound_dev_if = addr->v6.sin6_scope_id; + if (!sk->sk_bound_dev_if) + return 0; + } + af = opt->pf->af; + } + return af->available(addr, opt); +} + +/* Verify that the provided sockaddr looks bindable. Common verification, + * has already been taken care of. + */ +static int sctp_inet6_send_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + struct sctp_af *af = NULL; + + /* ASSERT: address family has already been verified. */ + if (addr->sa.sa_family != AF_INET6) + af = sctp_get_af_specific(addr->sa.sa_family); + else { + struct sock *sk; + int type = ipv6_addr_type(&addr->v6.sin6_addr); + sk = sctp_opt2sk(opt); + if (type & IPV6_ADDR_LINKLOCAL) { + /* Note: Behavior similar to af_inet6.c: + * 1) Overrides previous bound_dev_if + * 2) Destructive even if bind isn't successful. + */ + + if (addr->v6.sin6_scope_id) + sk->sk_bound_dev_if = addr->v6.sin6_scope_id; + if (!sk->sk_bound_dev_if) + return 0; + } + af = opt->pf->af; + } + + return af != NULL; +} + +/* Fill in Supported Address Type information for INIT and INIT-ACK + * chunks. Note: In the future, we may want to look at sock options + * to determine whether a PF_INET6 socket really wants to have IPV4 + * addresses. + * Returns number of addresses supported. + */ +static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, + __u16 *types) +{ + types[0] = SCTP_PARAM_IPV4_ADDRESS; + types[1] = SCTP_PARAM_IPV6_ADDRESS; + return 2; +} + +static struct proto_ops inet6_seqpacket_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = sctp_poll, + .ioctl = inet6_ioctl, + .listen = sctp_inet_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, +}; + +static struct inet_protosw sctpv6_seqpacket_protosw = { + .type = SOCK_SEQPACKET, + .protocol = IPPROTO_SCTP, + .prot = &sctpv6_prot, + .ops = &inet6_seqpacket_ops, + .capability = -1, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG +}; +static struct inet_protosw sctpv6_stream_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SCTP, + .prot = &sctpv6_prot, + .ops = &inet6_seqpacket_ops, + .capability = -1, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG, +}; + +static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + return sctp_rcv(*pskb) ? 
-1 : 0; +} + +static struct inet6_protocol sctpv6_protocol = { + .handler = sctp6_rcv, + .err_handler = sctp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static struct sctp_af sctp_ipv6_specific = { + .sctp_xmit = sctp_v6_xmit, + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .get_dst = sctp_v6_get_dst, + .get_saddr = sctp_v6_get_saddr, + .copy_addrlist = sctp_v6_copy_addrlist, + .from_skb = sctp_v6_from_skb, + .from_sk = sctp_v6_from_sk, + .to_sk_saddr = sctp_v6_to_sk_saddr, + .to_sk_daddr = sctp_v6_to_sk_daddr, + .from_addr_param = sctp_v6_from_addr_param, + .to_addr_param = sctp_v6_to_addr_param, + .dst_saddr = sctp_v6_dst_saddr, + .cmp_addr = sctp_v6_cmp_addr, + .scope = sctp_v6_scope, + .addr_valid = sctp_v6_addr_valid, + .inaddr_any = sctp_v6_inaddr_any, + .is_any = sctp_v6_is_any, + .available = sctp_v6_available, + .skb_iif = sctp_v6_skb_iif, + .is_ce = sctp_v6_is_ce, + .seq_dump_addr = sctp_v6_seq_dump_addr, + .net_header_len = sizeof(struct ipv6hdr), + .sockaddr_len = sizeof(struct sockaddr_in6), + .sa_family = AF_INET6, +}; + +static struct sctp_pf sctp_pf_inet6_specific = { + .event_msgname = sctp_inet6_event_msgname, + .skb_msgname = sctp_inet6_skb_msgname, + .af_supported = sctp_inet6_af_supported, + .cmp_addr = sctp_inet6_cmp_addr, + .bind_verify = sctp_inet6_bind_verify, + .send_verify = sctp_inet6_send_verify, + .supported_addrs = sctp_inet6_supported_addrs, + .create_accept_sk = sctp_v6_create_accept_sk, + .addr_v4map = sctp_v6_addr_v4map, + .af = &sctp_ipv6_specific, +}; + +/* Initialize IPv6 support and register with inet6 stack. */ +int sctp_v6_init(void) +{ + int rc = proto_register(&sctpv6_prot, 1); + + if (rc) + goto out; + /* Register inet6 protocol. */ + rc = -EAGAIN; + if (inet6_add_protocol(&sctpv6_protocol, IPPROTO_SCTP) < 0) + goto out_unregister_sctp_proto; + + /* Add SCTPv6(UDP and TCP style) to inetsw6 linked list. */ + inet6_register_protosw(&sctpv6_seqpacket_protosw); + inet6_register_protosw(&sctpv6_stream_protosw); + + /* Register the SCTP specific PF_INET6 functions. */ + sctp_register_pf(&sctp_pf_inet6_specific, PF_INET6); + + /* Register the SCTP specific AF_INET6 functions. */ + sctp_register_af(&sctp_ipv6_specific); + + /* Register notifier for inet6 address additions/deletions. */ + register_inet6addr_notifier(&sctp_inet6addr_notifier); + rc = 0; +out: + return rc; +out_unregister_sctp_proto: + proto_unregister(&sctpv6_prot); + goto out; +} + +/* IPv6 specific exit support. */ +void sctp_v6_exit(void) +{ + list_del(&sctp_ipv6_specific.list); + inet6_del_protocol(&sctpv6_protocol, IPPROTO_SCTP); + inet6_unregister_protosw(&sctpv6_seqpacket_protosw); + inet6_unregister_protosw(&sctpv6_stream_protosw); + unregister_inet6addr_notifier(&sctp_inet6addr_notifier); + proto_unregister(&sctpv6_prot); +} diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c new file mode 100644 index 000000000000..0781e5d509fd --- /dev/null +++ b/net/sctp/objcnt.c @@ -0,0 +1,140 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * + * This file is part of the SCTP kernel reference Implementation + * + * Support for memory object debugging. This allows one to monitor the + * object allocations/deallocations for types instrumented for this + * via the proc fs. 
+ * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include + +/* + * Global counters to count raw object allocation counts. + * To add new counters, choose a unique suffix for the variable + * name as the helper macros key off this suffix to make + * life easier for the programmer. + */ + +SCTP_DBG_OBJCNT(sock); +SCTP_DBG_OBJCNT(ep); +SCTP_DBG_OBJCNT(transport); +SCTP_DBG_OBJCNT(assoc); +SCTP_DBG_OBJCNT(bind_addr); +SCTP_DBG_OBJCNT(bind_bucket); +SCTP_DBG_OBJCNT(chunk); +SCTP_DBG_OBJCNT(addr); +SCTP_DBG_OBJCNT(ssnmap); +SCTP_DBG_OBJCNT(datamsg); + +/* An array to make it easy to pretty print the debug information + * to the proc fs. + */ +static sctp_dbg_objcnt_entry_t sctp_dbg_objcnt[] = { + SCTP_DBG_OBJCNT_ENTRY(sock), + SCTP_DBG_OBJCNT_ENTRY(ep), + SCTP_DBG_OBJCNT_ENTRY(assoc), + SCTP_DBG_OBJCNT_ENTRY(transport), + SCTP_DBG_OBJCNT_ENTRY(chunk), + SCTP_DBG_OBJCNT_ENTRY(bind_addr), + SCTP_DBG_OBJCNT_ENTRY(bind_bucket), + SCTP_DBG_OBJCNT_ENTRY(addr), + SCTP_DBG_OBJCNT_ENTRY(ssnmap), + SCTP_DBG_OBJCNT_ENTRY(datamsg), +}; + +/* Callback from procfs to read out objcount information. + * Walk through the entries in the sctp_dbg_objcnt array, dumping + * the raw object counts for each monitored type. + * + * This code was modified from similar code in route.c + */ +static int sctp_dbg_objcnt_read(char *buffer, char **start, off_t offset, + int length, int *eof, void *data) +{ + int len = 0; + off_t pos = 0; + int entries; + int i; + char temp[128]; + + /* How many entries? */ + entries = ARRAY_SIZE(sctp_dbg_objcnt); + + /* Walk the entries and print out the debug information + * for proc fs. + */ + for (i = 0; i < entries; i++) { + pos += 128; + + /* Skip ahead. */ + if (pos <= offset) { + len = 0; + continue; + } + /* Print out each entry. */ + sprintf(temp, "%s: %d", + sctp_dbg_objcnt[i].label, + atomic_read(sctp_dbg_objcnt[i].counter)); + + sprintf(buffer + len, "%-127s\n", temp); + len += 128; + if (pos >= offset+length) + goto done; + } + +done: + *start = buffer + len - (pos - offset); + len = pos - offset; + if (len > length) + len = length; + + return len; +} + +/* Initialize the objcount in the proc filesystem. */ +void sctp_dbg_objcnt_init(void) +{ + create_proc_read_entry("sctp_dbg_objcnt", 0, proc_net_sctp, + sctp_dbg_objcnt_read, NULL); +} + +/* Cleanup the objcount entry in the proc filesystem. 
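The SCTP_DBG_OBJCNT counters above are per-type atomic counters bumped on allocation and dropped on free, exported through procfs so that a leak shows up as a counter that never returns to zero. A minimal sketch of the same idea using C11 atomics; the macro and counter names here are invented for the sketch and are not the kernel's SCTP_DBG_OBJCNT machinery.

#include <stdatomic.h>
#include <stdio.h>

#define DBG_OBJCNT(name)     static atomic_int objcnt_##name
#define DBG_OBJCNT_INC(name) atomic_fetch_add(&objcnt_##name, 1)
#define DBG_OBJCNT_DEC(name) atomic_fetch_sub(&objcnt_##name, 1)

DBG_OBJCNT(assoc);
DBG_OBJCNT(chunk);

int main(void)
{
	DBG_OBJCNT_INC(assoc);
	DBG_OBJCNT_INC(chunk);
	DBG_OBJCNT_INC(chunk);
	DBG_OBJCNT_DEC(chunk);
	printf("assoc: %d\n", atomic_load(&objcnt_assoc));	/* 1 */
	printf("chunk: %d\n", atomic_load(&objcnt_chunk));	/* 1 */
	return 0;
}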
*/ +void sctp_dbg_objcnt_exit(void) +{ + remove_proc_entry("sctp_dbg_objcnt", proc_net_sctp); +} + + diff --git a/net/sctp/output.c b/net/sctp/output.c new file mode 100644 index 000000000000..9013f64f5219 --- /dev/null +++ b/net/sctp/output.c @@ -0,0 +1,646 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions handle output processing. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef TEST_FRAME +#include +#endif /* TEST_FRAME (not defined) */ + +#include /* for sa_family_t */ +#include + +#include +#include + +/* Forward declarations for private helpers. */ +static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk); + +/* Config a packet. + * This appears to be a followup set of initializations. + */ +struct sctp_packet *sctp_packet_config(struct sctp_packet *packet, + __u32 vtag, int ecn_capable) +{ + struct sctp_chunk *chunk = NULL; + + SCTP_DEBUG_PRINTK("%s: packet:%p vtag:0x%x\n", __FUNCTION__, + packet, vtag); + + packet->vtag = vtag; + packet->has_cookie_echo = 0; + packet->has_sack = 0; + packet->ipfragok = 0; + + if (ecn_capable && sctp_packet_empty(packet)) { + chunk = sctp_get_ecne_prepend(packet->transport->asoc); + + /* If there a is a prepend chunk stick it on the list before + * any other chunks get appended. + */ + if (chunk) + sctp_packet_append_chunk(packet, chunk); + } + + return packet; +} + +/* Initialize the packet structure. 
*/ +struct sctp_packet *sctp_packet_init(struct sctp_packet *packet, + struct sctp_transport *transport, + __u16 sport, __u16 dport) +{ + struct sctp_association *asoc = transport->asoc; + size_t overhead; + + SCTP_DEBUG_PRINTK("%s: packet:%p transport:%p\n", __FUNCTION__, + packet, transport); + + packet->transport = transport; + packet->source_port = sport; + packet->destination_port = dport; + skb_queue_head_init(&packet->chunks); + if (asoc) { + struct sctp_sock *sp = sctp_sk(asoc->base.sk); + overhead = sp->pf->af->net_header_len; + } else { + overhead = sizeof(struct ipv6hdr); + } + overhead += sizeof(struct sctphdr); + packet->overhead = overhead; + packet->size = overhead; + packet->vtag = 0; + packet->has_cookie_echo = 0; + packet->has_sack = 0; + packet->ipfragok = 0; + packet->malloced = 0; + return packet; +} + +/* Free a packet. */ +void sctp_packet_free(struct sctp_packet *packet) +{ + struct sctp_chunk *chunk; + + SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); + + while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) + sctp_chunk_free(chunk); + + if (packet->malloced) + kfree(packet); +} + +/* This routine tries to append the chunk to the offered packet. If adding + * the chunk causes the packet to exceed the path MTU and COOKIE_ECHO chunk + * is not present in the packet, it transmits the input packet. + * Data can be bundled with a packet containing a COOKIE_ECHO chunk as long + * as it can fit in the packet, but any more data that does not fit in this + * packet can be sent only after receiving the COOKIE_ACK. + */ +sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval; + int error = 0; + + SCTP_DEBUG_PRINTK("%s: packet:%p chunk:%p\n", __FUNCTION__, + packet, chunk); + + switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) { + case SCTP_XMIT_PMTU_FULL: + if (!packet->has_cookie_echo) { + error = sctp_packet_transmit(packet); + if (error < 0) + chunk->skb->sk->sk_err = -error; + + /* If we have an empty packet, then we can NOT ever + * return PMTU_FULL. + */ + retval = sctp_packet_append_chunk(packet, chunk); + } + break; + + case SCTP_XMIT_RWND_FULL: + case SCTP_XMIT_OK: + case SCTP_XMIT_NAGLE_DELAY: + break; + }; + + return retval; +} + +/* Try to bundle a SACK with the packet. */ +static sctp_xmit_t sctp_packet_bundle_sack(struct sctp_packet *pkt, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval = SCTP_XMIT_OK; + + /* If sending DATA and haven't aleady bundled a SACK, try to + * bundle one in to the packet. + */ + if (sctp_chunk_is_data(chunk) && !pkt->has_sack && + !pkt->has_cookie_echo) { + struct sctp_association *asoc; + asoc = pkt->transport->asoc; + + if (asoc->a_rwnd > asoc->rwnd) { + struct sctp_chunk *sack; + asoc->a_rwnd = asoc->rwnd; + sack = sctp_make_sack(asoc); + if (sack) { + struct timer_list *timer; + retval = sctp_packet_append_chunk(pkt, sack); + asoc->peer.sack_needed = 0; + timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK]; + if (timer_pending(timer) && del_timer(timer)) + sctp_association_put(asoc); + } + } + } + return retval; +} + +/* Append a chunk to the offered packet reporting back any inability to do + * so. 
+ */ +sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval = SCTP_XMIT_OK; + __u16 chunk_len = WORD_ROUND(ntohs(chunk->chunk_hdr->length)); + size_t psize; + size_t pmtu; + int too_big; + + SCTP_DEBUG_PRINTK("%s: packet:%p chunk:%p\n", __FUNCTION__, packet, + chunk); + + retval = sctp_packet_bundle_sack(packet, chunk); + psize = packet->size; + + if (retval != SCTP_XMIT_OK) + goto finish; + + pmtu = ((packet->transport->asoc) ? + (packet->transport->asoc->pmtu) : + (packet->transport->pmtu)); + + too_big = (psize + chunk_len > pmtu); + + /* Decide if we need to fragment or resubmit later. */ + if (too_big) { + /* Both control chunks and data chunks with TSNs are + * non-fragmentable. + */ + if (sctp_packet_empty(packet) || !sctp_chunk_is_data(chunk)) { + /* We no longer do re-fragmentation. + * Just fragment at the IP layer, if we + * actually hit this condition + */ + packet->ipfragok = 1; + goto append; + + } else { + retval = SCTP_XMIT_PMTU_FULL; + goto finish; + } + } + +append: + /* We believe that this chunk is OK to add to the packet (as + * long as we have the cwnd for it). + */ + + /* DATA is a special case since we must examine both rwnd and cwnd + * before we send DATA. + */ + if (sctp_chunk_is_data(chunk)) { + retval = sctp_packet_append_data(packet, chunk); + /* Disallow SACK bundling after DATA. */ + packet->has_sack = 1; + if (SCTP_XMIT_OK != retval) + goto finish; + } else if (SCTP_CID_COOKIE_ECHO == chunk->chunk_hdr->type) + packet->has_cookie_echo = 1; + else if (SCTP_CID_SACK == chunk->chunk_hdr->type) + packet->has_sack = 1; + + /* It is OK to send this chunk. */ + __skb_queue_tail(&packet->chunks, (struct sk_buff *)chunk); + packet->size += chunk_len; + chunk->transport = packet->transport; +finish: + return retval; +} + +/* All packets are sent to the network through this function from + * sctp_outq_tail(). + * + * The return value is a normal kernel error return value. + */ +int sctp_packet_transmit(struct sctp_packet *packet) +{ + struct sctp_transport *tp = packet->transport; + struct sctp_association *asoc = tp->asoc; + struct sctphdr *sh; + __u32 crc32; + struct sk_buff *nskb; + struct sctp_chunk *chunk; + struct sock *sk; + int err = 0; + int padding; /* How much padding do we need? */ + __u8 has_data = 0; + struct dst_entry *dst; + + SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); + + /* Do NOT generate a chunkless packet. */ + chunk = (struct sctp_chunk *)skb_peek(&packet->chunks); + if (unlikely(!chunk)) + return err; + + /* Set up convenience variables... */ + sk = chunk->skb->sk; + + /* Allocate the new skb. */ + nskb = dev_alloc_skb(packet->size); + if (!nskb) + goto nomem; + + /* Make sure the outbound skb has enough header room reserved. */ + skb_reserve(nskb, packet->overhead); + + /* Set the owning socket so that we know where to get the + * destination IP address. + */ + skb_set_owner_w(nskb, sk); + + /* Build the SCTP header. */ + sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); + sh->source = htons(packet->source_port); + sh->dest = htons(packet->destination_port); + + /* From 6.8 Adler-32 Checksum Calculation: + * After the packet is constructed (containing the SCTP common + * header and one or more control or DATA chunks), the + * transmitter shall: + * + * 1) Fill in the proper Verification Tag in the SCTP common + * header and initialize the checksum field to 0's. 
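The append path above comes down to simple arithmetic: the packet already carries the transport header overhead set up in sctp_packet_init(), each chunk occupies its length rounded up to a 4-byte boundary, and the sum must stay within the path MTU or the caller sees SCTP_XMIT_PMTU_FULL. A standalone sketch of that check, assuming WORD_ROUND means "round up to a multiple of 4" and using hypothetical sizes:

#include <stdio.h>
#include <stddef.h>

/* Round a chunk length up to the next multiple of 4, as it will occupy
 * that much space on the wire (the padding is not counted in the chunk's
 * own length field).
 */
static size_t word_round(size_t len)
{
	return (len + 3) & ~(size_t)3;
}

/* Would a chunk of 'chunk_len' bytes still fit under the path MTU, given
 * what is already queued in the packet? Mirrors the psize + chunk_len
 * versus pmtu test above, with hypothetical numbers.
 */
static int chunk_fits(size_t packet_size, size_t chunk_len, size_t pmtu)
{
	return packet_size + word_round(chunk_len) <= pmtu;
}

int main(void)
{
	size_t overhead = 20 + 12;     /* IPv4 header + SCTP common header */
	size_t psize = overhead + 344; /* already-bundled chunks */
	size_t pmtu = 1500;

	printf("357-byte chunk fits: %d (padded to %zu bytes)\n",
	       chunk_fits(psize, 357, pmtu), word_round(357));
	printf("1200-byte chunk fits: %d\n", chunk_fits(psize, 1200, pmtu));
	return 0;
}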
+ */ + sh->vtag = htonl(packet->vtag); + sh->checksum = 0; + + /* 2) Calculate the Adler-32 checksum of the whole packet, + * including the SCTP common header and all the + * chunks. + * + * Note: Adler-32 is no longer applicable, as has been replaced + * by CRC32-C as described in . + */ + crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); + + /** + * 6.10 Bundling + * + * An endpoint bundles chunks by simply including multiple + * chunks in one outbound SCTP packet. ... + */ + + /** + * 3.2 Chunk Field Descriptions + * + * The total length of a chunk (including Type, Length and + * Value fields) MUST be a multiple of 4 bytes. If the length + * of the chunk is not a multiple of 4 bytes, the sender MUST + * pad the chunk with all zero bytes and this padding is not + * included in the chunk length field. The sender should + * never pad with more than 3 bytes. + * + * [This whole comment explains WORD_ROUND() below.] + */ + SCTP_DEBUG_PRINTK("***sctp_transmit_packet***\n"); + while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { + if (sctp_chunk_is_data(chunk)) { + + if (!chunk->has_tsn) { + sctp_chunk_assign_ssn(chunk); + sctp_chunk_assign_tsn(chunk); + + /* 6.3.1 C4) When data is in flight and when allowed + * by rule C5, a new RTT measurement MUST be made each + * round trip. Furthermore, new RTT measurements + * SHOULD be made no more than once per round-trip + * for a given destination transport address. + */ + + if (!tp->rto_pending) { + chunk->rtt_in_progress = 1; + tp->rto_pending = 1; + } + } else + chunk->resent = 1; + + chunk->sent_at = jiffies; + has_data = 1; + } + + padding = WORD_ROUND(chunk->skb->len) - chunk->skb->len; + if (padding) + memset(skb_put(chunk->skb, padding), 0, padding); + + crc32 = sctp_update_copy_cksum(skb_put(nskb, chunk->skb->len), + chunk->skb->data, + chunk->skb->len, crc32); + + SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", + "*** Chunk", chunk, + sctp_cname(SCTP_ST_CHUNK( + chunk->chunk_hdr->type)), + chunk->has_tsn ? "TSN" : "No TSN", + chunk->has_tsn ? + ntohl(chunk->subh.data_hdr->tsn) : 0, + "length", ntohs(chunk->chunk_hdr->length), + "chunk->skb->len", chunk->skb->len, + "rtt_in_progress", chunk->rtt_in_progress); + + /* + * If this is a control chunk, this is our last + * reference. Free data chunks after they've been + * acknowledged or have failed. + */ + if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + } + + /* Perform final transformation on checksum. */ + crc32 = sctp_end_cksum(crc32); + + /* 3) Put the resultant value into the checksum field in the + * common header, and leave the rest of the bits unchanged. + */ + sh->checksum = htonl(crc32); + + /* IP layer ECN support + * From RFC 2481 + * "The ECN-Capable Transport (ECT) bit would be set by the + * data sender to indicate that the end-points of the + * transport protocol are ECN-capable." + * + * Now setting the ECT bit all the time, as it should not cause + * any problems protocol-wise even if our peer ignores it. + * + * Note: The works for IPv6 layer checks this bit too later + * in transmission. See IP6_ECN_flow_xmit(). + */ + INET_ECN_xmit(nskb->sk); + + /* Set up the IP options. */ + /* BUG: not implemented + * For v4 this all lives somewhere in sk->sk_opt... + */ + + /* Dump that on IP! */ + if (asoc && asoc->peer.last_sent_to != tp) { + /* Considering the multiple CPU scenario, this is a + * "correcter" place for last_sent_to. 
--xguo + */ + asoc->peer.last_sent_to = tp; + } + + if (has_data) { + struct timer_list *timer; + unsigned long timeout; + + tp->last_time_used = jiffies; + + /* Restart the AUTOCLOSE timer when sending data. */ + if (sctp_state(asoc, ESTABLISHED) && asoc->autoclose) { + timer = &asoc->timers[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; + timeout = asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE]; + + if (!mod_timer(timer, jiffies + timeout)) + sctp_association_hold(asoc); + } + } + + dst = tp->dst; + /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ + if (!dst || (dst->obsolete > 1)) { + dst_release(dst); + sctp_transport_route(tp, NULL, sctp_sk(sk)); + sctp_assoc_sync_pmtu(asoc); + } + + nskb->dst = dst_clone(tp->dst); + if (!nskb->dst) + goto no_route; + + SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", + nskb->len); + + (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + +out: + packet->size = packet->overhead; + return err; +no_route: + kfree_skb(nskb); + IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); + + /* FIXME: Returning the 'err' will effect all the associations + * associated with a socket, although only one of the paths of the + * association is unreachable. + * The real failure of a transport or association can be passed on + * to the user via notifications. So setting this error may not be + * required. + */ + /* err = -EHOSTUNREACH; */ +err: + /* Control chunks are unreliable so just drop them. DATA chunks + * will get resent or dropped later. + */ + + while ((chunk = (struct sctp_chunk *)__skb_dequeue(&packet->chunks)) != NULL) { + if (!sctp_chunk_is_data(chunk)) + sctp_chunk_free(chunk); + } + goto out; +nomem: + err = -ENOMEM; + goto err; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* This private function handles the specifics of appending DATA chunks. */ +static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, + struct sctp_chunk *chunk) +{ + sctp_xmit_t retval = SCTP_XMIT_OK; + size_t datasize, rwnd, inflight; + struct sctp_transport *transport = packet->transport; + __u32 max_burst_bytes; + struct sctp_association *asoc = transport->asoc; + struct sctp_sock *sp = sctp_sk(asoc->base.sk); + struct sctp_outq *q = &asoc->outqueue; + + /* RFC 2960 6.1 Transmission of DATA Chunks + * + * A) At any given time, the data sender MUST NOT transmit new data to + * any destination transport address if its peer's rwnd indicates + * that the peer has no buffer space (i.e. rwnd is 0, see Section + * 6.2.1). However, regardless of the value of rwnd (including if it + * is 0), the data sender can always have one DATA chunk in flight to + * the receiver if allowed by cwnd (see rule B below). This rule + * allows the sender to probe for a change in rwnd that the sender + * missed due to the SACK having been lost in transit from the data + * receiver to the data sender. + */ + + rwnd = asoc->peer.rwnd; + inflight = asoc->outqueue.outstanding_bytes; + + datasize = sctp_data_size(chunk); + + if (datasize > rwnd) { + if (inflight > 0) { + /* We have (at least) one data chunk in flight, + * so we can't fall back to rule 6.1 B). + */ + retval = SCTP_XMIT_RWND_FULL; + goto finish; + } + } + + /* sctpimpguide-05 2.14.2 + * D) When the time comes for the sender to + * transmit new DATA chunks, the protocol parameter Max.Burst MUST + * first be applied to limit how many new DATA chunks may be sent. 
+ * The limit is applied by adjusting cwnd as follows: + * if ((flightsize + Max.Burst * MTU) < cwnd) + * cwnd = flightsize + Max.Burst * MTU + */ + max_burst_bytes = asoc->max_burst * asoc->pmtu; + if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { + transport->cwnd = transport->flight_size + max_burst_bytes; + SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " + "transport: %p, cwnd: %d, " + "ssthresh: %d, flight_size: %d, " + "pba: %d\n", + __FUNCTION__, transport, + transport->cwnd, + transport->ssthresh, + transport->flight_size, + transport->partial_bytes_acked); + } + + /* RFC 2960 6.1 Transmission of DATA Chunks + * + * B) At any given time, the sender MUST NOT transmit new data + * to a given transport address if it has cwnd or more bytes + * of data outstanding to that transport address. + */ + /* RFC 7.2.4 & the Implementers Guide 2.8. + * + * 3) ... + * When a Fast Retransmit is being performed the sender SHOULD + * ignore the value of cwnd and SHOULD NOT delay retransmission. + */ + if (!chunk->fast_retransmit) + if (transport->flight_size >= transport->cwnd) { + retval = SCTP_XMIT_RWND_FULL; + goto finish; + } + + /* Nagle's algorithm to solve small-packet problem: + * Inhibit the sending of new chunks when new outgoing data arrives + * if any previously transmitted data on the connection remains + * unacknowledged. + */ + if (!sp->nodelay && sctp_packet_empty(packet) && + q->outstanding_bytes && sctp_state(asoc, ESTABLISHED)) { + unsigned len = datasize + q->out_qlen; + + /* Check whether this chunk and all the rest of pending + * data will fit or delay in hopes of bundling a full + * sized packet. + */ + if (len < asoc->pmtu - packet->overhead) { + retval = SCTP_XMIT_NAGLE_DELAY; + goto finish; + } + } + + /* Keep track of how many bytes are in flight over this transport. */ + transport->flight_size += datasize; + + /* Keep track of how many bytes are in flight to the receiver. */ + asoc->outqueue.outstanding_bytes += datasize; + + /* Update our view of the receiver's rwnd. */ + if (datasize < rwnd) + rwnd -= datasize; + else + rwnd = 0; + + asoc->peer.rwnd = rwnd; + /* Has been accepted for transmission. */ + if (!asoc->peer.prsctp_capable) + chunk->msg->can_abandon = 0; + +finish: + return retval; +} diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c new file mode 100644 index 000000000000..1b2d4adc4ddb --- /dev/null +++ b/net/sctp/outqueue.c @@ -0,0 +1,1734 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 Intel Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions implement the sctp_outq class. The outqueue handles + * bundling and queueing of outgoing SCTP chunks. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. 
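sctp_packet_append_data() therefore gates a new DATA chunk on four checks in order: the rwnd probe rule (A), the Max.Burst clamp on cwnd, the cwnd limit (B, skipped during fast retransmit), and a Nagle-style delay for small chunks. The sketch below restates that order with plain integers; the struct, parameter names, and the simplified Nagle condition are stand-ins, not the kernel's types.

#include <stdio.h>

enum xmit { XMIT_OK, XMIT_RWND_FULL, XMIT_NAGLE_DELAY };

struct path {
	unsigned cwnd, flight_size, max_burst, mtu;
};

/* Decide whether one new DATA chunk of 'datasize' bytes may go out now.
 * 'inflight' is bytes outstanding to the peer, 'rwnd' the peer's window,
 * 'queued' other data still waiting locally, 'nodelay' disables Nagle.
 */
static enum xmit may_send_data(struct path *p, unsigned datasize,
			       unsigned inflight, unsigned rwnd,
			       unsigned queued, int nodelay, int fast_rtx)
{
	unsigned burst_limit;

	/* Rule A: with data already in flight, never exceed the peer's
	 * rwnd; with nothing in flight one chunk may still probe a
	 * closed window.
	 */
	if (datasize > rwnd && inflight > 0)
		return XMIT_RWND_FULL;

	/* Max.Burst: clamp cwnd so an idle path cannot dump a huge burst. */
	burst_limit = p->flight_size + p->max_burst * p->mtu;
	if (burst_limit < p->cwnd)
		p->cwnd = burst_limit;

	/* Rule B: respect cwnd, except when fast-retransmitting. */
	if (!fast_rtx && p->flight_size >= p->cwnd)
		return XMIT_RWND_FULL;

	/* Nagle (simplified): hold back a small chunk while other data is
	 * unacked and the pending data would not yet fill an MTU.
	 */
	if (!nodelay && inflight > 0 && datasize + queued < p->mtu)
		return XMIT_NAGLE_DELAY;

	return XMIT_OK;
}

int main(void)
{
	struct path p = { .cwnd = 4380, .flight_size = 0,
			  .max_burst = 4, .mtu = 1500 };

	printf("%d\n", may_send_data(&p, 1000, 0, 8000, 0, 0, 0));   /* 0: OK */
	printf("%d\n", may_send_data(&p, 200, 1000, 8000, 100, 0, 0)); /* 2: Nagle delay */
	return 0;
}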
If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Perry Melange + * Xingang Guo + * Hui Huang + * Sridhar Samudrala + * Jon Grimm + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include /* For struct list_head */ +#include +#include +#include /* For skb_set_owner_w */ + +#include +#include + +/* Declare internal functions here. */ +static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn); +static void sctp_check_transmitted(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + struct sctp_sackhdr *sack, + __u32 highest_new_tsn); + +static void sctp_mark_missing(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + __u32 highest_new_tsn, + int count_of_newacks); + +static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn); + +/* Add data to the front of the queue. */ +static inline void sctp_outq_head_data(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + __skb_queue_head(&q->out, (struct sk_buff *)ch); + q->out_qlen += ch->skb->len; + return; +} + +/* Take data from the front of the queue. */ +static inline struct sctp_chunk *sctp_outq_dequeue_data(struct sctp_outq *q) +{ + struct sctp_chunk *ch; + ch = (struct sctp_chunk *)__skb_dequeue(&q->out); + if (ch) + q->out_qlen -= ch->skb->len; + return ch; +} +/* Add data chunk to the end of the queue. */ +static inline void sctp_outq_tail_data(struct sctp_outq *q, + struct sctp_chunk *ch) +{ + __skb_queue_tail(&q->out, (struct sk_buff *)ch); + q->out_qlen += ch->skb->len; + return; +} + +/* + * SFR-CACC algorithm: + * D) If count_of_newacks is greater than or equal to 2 + * and t was not sent to the current primary then the + * sender MUST NOT increment missing report count for t. + */ +static inline int sctp_cacc_skip_3_1_d(struct sctp_transport *primary, + struct sctp_transport *transport, + int count_of_newacks) +{ + if (count_of_newacks >=2 && transport != primary) + return 1; + return 0; +} + +/* + * SFR-CACC algorithm: + * F) If count_of_newacks is less than 2, let d be the + * destination to which t was sent. If cacc_saw_newack + * is 0 for destination d, then the sender MUST NOT + * increment missing report count for t. + */ +static inline int sctp_cacc_skip_3_1_f(struct sctp_transport *transport, + int count_of_newacks) +{ + if (count_of_newacks < 2 && !transport->cacc.cacc_saw_newack) + return 1; + return 0; +} + +/* + * SFR-CACC algorithm: + * 3.1) If CYCLING_CHANGEOVER is 0, the sender SHOULD + * execute steps C, D, F. 
+ * + * C has been implemented in sctp_outq_sack + */ +static inline int sctp_cacc_skip_3_1(struct sctp_transport *primary, + struct sctp_transport *transport, + int count_of_newacks) +{ + if (!primary->cacc.cycling_changeover) { + if (sctp_cacc_skip_3_1_d(primary, transport, count_of_newacks)) + return 1; + if (sctp_cacc_skip_3_1_f(transport, count_of_newacks)) + return 1; + return 0; + } + return 0; +} + +/* + * SFR-CACC algorithm: + * 3.2) Else if CYCLING_CHANGEOVER is 1, and t is less + * than next_tsn_at_change of the current primary, then + * the sender MUST NOT increment missing report count + * for t. + */ +static inline int sctp_cacc_skip_3_2(struct sctp_transport *primary, __u32 tsn) +{ + if (primary->cacc.cycling_changeover && + TSN_lt(tsn, primary->cacc.next_tsn_at_change)) + return 1; + return 0; +} + +/* + * SFR-CACC algorithm: + * 3) If the missing report count for TSN t is to be + * incremented according to [RFC2960] and + * [SCTP_STEWART-2002], and CHANGEOVER_ACTIVE is set, + * then the sender MUST futher execute steps 3.1 and + * 3.2 to determine if the missing report count for + * TSN t SHOULD NOT be incremented. + * + * 3.3) If 3.1 and 3.2 do not dictate that the missing + * report count for t should not be incremented, then + * the sender SOULD increment missing report count for + * t (according to [RFC2960] and [SCTP_STEWART_2002]). + */ +static inline int sctp_cacc_skip(struct sctp_transport *primary, + struct sctp_transport *transport, + int count_of_newacks, + __u32 tsn) +{ + if (primary->cacc.changeover_active && + (sctp_cacc_skip_3_1(primary, transport, count_of_newacks) + || sctp_cacc_skip_3_2(primary, tsn))) + return 1; + return 0; +} + +/* Initialize an existing sctp_outq. This does the boring stuff. + * You still need to define handlers if you really want to DO + * something with this structure... + */ +void sctp_outq_init(struct sctp_association *asoc, struct sctp_outq *q) +{ + q->asoc = asoc; + skb_queue_head_init(&q->out); + skb_queue_head_init(&q->control); + INIT_LIST_HEAD(&q->retransmit); + INIT_LIST_HEAD(&q->sacked); + INIT_LIST_HEAD(&q->abandoned); + + q->outstanding_bytes = 0; + q->empty = 1; + q->cork = 0; + + q->malloced = 0; + q->out_qlen = 0; +} + +/* Free the outqueue structure and any related pending chunks. + */ +void sctp_outq_teardown(struct sctp_outq *q) +{ + struct sctp_transport *transport; + struct list_head *lchunk, *pos, *temp; + struct sctp_chunk *chunk; + + /* Throw away unacknowledged chunks. */ + list_for_each(pos, &q->asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + while ((lchunk = sctp_list_dequeue(&transport->transmitted)) != NULL) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + /* Mark as part of a failed message. */ + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + } + + /* Throw away chunks that have been gap ACKed. */ + list_for_each_safe(lchunk, temp, &q->sacked) { + list_del_init(lchunk); + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + /* Throw away any chunks in the retransmit queue. */ + list_for_each_safe(lchunk, temp, &q->retransmit) { + list_del_init(lchunk); + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + /* Throw away any chunks that are in the abandoned queue. 
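The TSN_lt()/TSN_lte() comparisons used by the SFR-CACC checks above, and by the SACK processing later in this file, have to keep working when the 32-bit TSN wraps past 2^32. A common way to get that is a signed difference, and the sketch below assumes that form; it illustrates serial-number ordering and is not a copy of the kernel macros.

#include <stdint.h>
#include <stdio.h>

/* "a is earlier than b" in 32-bit serial-number arithmetic: the signed
 * difference is negative as long as the two TSNs are within 2^31 of each
 * other, which holds for any live association.
 */
static int tsn_lt(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

static int tsn_lte(uint32_t a, uint32_t b)
{
	return a == b || tsn_lt(a, b);
}

int main(void)
{
	/* Ordinary case. */
	printf("%d\n", tsn_lt(100, 200));           /* 1 */
	/* Wraparound: 0xfffffff0 was assigned before 0x10. */
	printf("%d\n", tsn_lt(0xfffffff0u, 0x10u)); /* 1 */
	printf("%d\n", tsn_lte(0x10u, 0x10u));      /* 1 */
	return 0;
}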
*/ + list_for_each_safe(lchunk, temp, &q->abandoned) { + list_del_init(lchunk); + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + /* Throw away any leftover data chunks. */ + while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + + /* Mark as send failure. */ + sctp_chunk_fail(chunk, q->error); + sctp_chunk_free(chunk); + } + + q->error = 0; + + /* Throw away any leftover control chunks. */ + while ((chunk = (struct sctp_chunk *) skb_dequeue(&q->control)) != NULL) + sctp_chunk_free(chunk); +} + +/* Free the outqueue structure and any related pending chunks. */ +void sctp_outq_free(struct sctp_outq *q) +{ + /* Throw away leftover chunks. */ + sctp_outq_teardown(q); + + /* If we were kmalloc()'d, free the memory. */ + if (q->malloced) + kfree(q); +} + +/* Put a new chunk in an sctp_outq. */ +int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk) +{ + int error = 0; + + SCTP_DEBUG_PRINTK("sctp_outq_tail(%p, %p[%s])\n", + q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) + : "Illegal Chunk"); + + /* If it is data, queue it up, otherwise, send it + * immediately. + */ + if (SCTP_CID_DATA == chunk->chunk_hdr->type) { + /* Is it OK to queue data chunks? */ + /* From 9. Termination of Association + * + * When either endpoint performs a shutdown, the + * association on each peer will stop accepting new + * data from its user and only deliver data in queue + * at the time of sending or receiving the SHUTDOWN + * chunk. + */ + switch (q->asoc->state) { + case SCTP_STATE_EMPTY: + case SCTP_STATE_CLOSED: + case SCTP_STATE_SHUTDOWN_PENDING: + case SCTP_STATE_SHUTDOWN_SENT: + case SCTP_STATE_SHUTDOWN_RECEIVED: + case SCTP_STATE_SHUTDOWN_ACK_SENT: + /* Cannot send after transport endpoint shutdown */ + error = -ESHUTDOWN; + break; + + default: + SCTP_DEBUG_PRINTK("outqueueing (%p, %p[%s])\n", + q, chunk, chunk && chunk->chunk_hdr ? + sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) + : "Illegal Chunk"); + + sctp_outq_tail_data(q, chunk); + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + SCTP_INC_STATS(SCTP_MIB_OUTUNORDERCHUNKS); + else + SCTP_INC_STATS(SCTP_MIB_OUTORDERCHUNKS); + q->empty = 0; + break; + }; + } else { + __skb_queue_tail(&q->control, (struct sk_buff *) chunk); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } + + if (error < 0) + return error; + + if (!q->cork) + error = sctp_outq_flush(q, 0); + + return error; +} + +/* Insert a chunk into the sorted list based on the TSNs. The retransmit list + * and the abandoned list are in ascending order. + */ +static void sctp_insert_list(struct list_head *head, struct list_head *new) +{ + struct list_head *pos; + struct sctp_chunk *nchunk, *lchunk; + __u32 ntsn, ltsn; + int done = 0; + + nchunk = list_entry(new, struct sctp_chunk, transmitted_list); + ntsn = ntohl(nchunk->subh.data_hdr->tsn); + + list_for_each(pos, head) { + lchunk = list_entry(pos, struct sctp_chunk, transmitted_list); + ltsn = ntohl(lchunk->subh.data_hdr->tsn); + if (TSN_lt(ntsn, ltsn)) { + list_add(new, pos->prev); + done = 1; + break; + } + } + if (!done) + list_add_tail(new, head); +} + +/* Mark all the eligible packets on a transport for retransmission. */ +void sctp_retransmit_mark(struct sctp_outq *q, + struct sctp_transport *transport, + __u8 fast_retransmit) +{ + struct list_head *lchunk, *ltemp; + struct sctp_chunk *chunk; + + /* Walk through the specified transmitted queue. 
*/ + list_for_each_safe(lchunk, ltemp, &transport->transmitted) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + + /* If the chunk is abandoned, move it to abandoned list. */ + if (sctp_chunk_abandoned(chunk)) { + list_del_init(lchunk); + sctp_insert_list(&q->abandoned, lchunk); + continue; + } + + /* If we are doing retransmission due to a fast retransmit, + * only the chunk's that are marked for fast retransmit + * should be added to the retransmit queue. If we are doing + * retransmission due to a timeout or pmtu discovery, only the + * chunks that are not yet acked should be added to the + * retransmit queue. + */ + if ((fast_retransmit && chunk->fast_retransmit) || + (!fast_retransmit && !chunk->tsn_gap_acked)) { + /* RFC 2960 6.2.1 Processing a Received SACK + * + * C) Any time a DATA chunk is marked for + * retransmission (via either T3-rtx timer expiration + * (Section 6.3.3) or via fast retransmit + * (Section 7.2.4)), add the data size of those + * chunks to the rwnd. + */ + q->asoc->peer.rwnd += sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); + transport->flight_size -= sctp_data_size(chunk); + + /* sctpimpguide-05 Section 2.8.2 + * M5) If a T3-rtx timer expires, the + * 'TSN.Missing.Report' of all affected TSNs is set + * to 0. + */ + chunk->tsn_missing_report = 0; + + /* If a chunk that is being used for RTT measurement + * has to be retransmitted, we cannot use this chunk + * anymore for RTT measurements. Reset rto_pending so + * that a new RTT measurement is started when a new + * data chunk is sent. + */ + if (chunk->rtt_in_progress) { + chunk->rtt_in_progress = 0; + transport->rto_pending = 0; + } + + /* Move the chunk to the retransmit queue. The chunks + * on the retransmit queue are always kept in order. + */ + list_del_init(lchunk); + sctp_insert_list(&q->retransmit, lchunk); + } + } + + SCTP_DEBUG_PRINTK("%s: transport: %p, fast_retransmit: %d, " + "cwnd: %d, ssthresh: %d, flight_size: %d, " + "pba: %d\n", __FUNCTION__, + transport, fast_retransmit, + transport->cwnd, transport->ssthresh, + transport->flight_size, + transport->partial_bytes_acked); + +} + +/* Mark all the eligible packets on a transport for retransmission and force + * one packet out. + */ +void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport, + sctp_retransmit_reason_t reason) +{ + int error = 0; + __u8 fast_retransmit = 0; + + switch(reason) { + case SCTP_RTXR_T3_RTX: + sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_T3_RTX); + /* Update the retran path if the T3-rtx timer has expired for + * the current retran path. + */ + if (transport == transport->asoc->peer.retran_path) + sctp_assoc_update_retran_path(transport->asoc); + break; + case SCTP_RTXR_FAST_RTX: + sctp_transport_lower_cwnd(transport, SCTP_LOWER_CWND_FAST_RTX); + fast_retransmit = 1; + break; + case SCTP_RTXR_PMTUD: + default: + break; + } + + sctp_retransmit_mark(q, transport, fast_retransmit); + + /* PR-SCTP A5) Any time the T3-rtx timer expires, on any destination, + * the sender SHOULD try to advance the "Advanced.Peer.Ack.Point" by + * following the procedures outlined in C1 - C5. + */ + sctp_generate_fwdtsn(q, q->asoc->ctsn_ack_point); + + error = sctp_outq_flush(q, /* rtx_timeout */ 1); + + if (error) + q->asoc->base.sk->sk_err = -error; +} + +/* + * Transmit DATA chunks on the retransmit queue. Upon return from + * sctp_outq_flush_rtx() the packet 'pkt' may contain chunks which + * need to be transmitted by the caller. 
+ * We assume that pkt->transport has already been set. + * + * The return value is a normal kernel error return value. + */ +static int sctp_outq_flush_rtx(struct sctp_outq *q, struct sctp_packet *pkt, + int rtx_timeout, int *start_timer) +{ + struct list_head *lqueue; + struct list_head *lchunk, *lchunk1; + struct sctp_transport *transport = pkt->transport; + sctp_xmit_t status; + struct sctp_chunk *chunk, *chunk1; + struct sctp_association *asoc; + int error = 0; + + asoc = q->asoc; + lqueue = &q->retransmit; + + /* RFC 2960 6.3.3 Handle T3-rtx Expiration + * + * E3) Determine how many of the earliest (i.e., lowest TSN) + * outstanding DATA chunks for the address for which the + * T3-rtx has expired will fit into a single packet, subject + * to the MTU constraint for the path corresponding to the + * destination transport address to which the retransmission + * is being sent (this may be different from the address for + * which the timer expires [see Section 6.4]). Call this value + * K. Bundle and retransmit those K DATA chunks in a single + * packet to the destination endpoint. + * + * [Just to be painfully clear, if we are retransmitting + * because a timeout just happened, we should send only ONE + * packet of retransmitted data.] + */ + lchunk = sctp_list_dequeue(lqueue); + + while (lchunk) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + + /* Make sure that Gap Acked TSNs are not retransmitted. A + * simple approach is just to move such TSNs out of the + * way and into a 'transmitted' queue and skip to the + * next chunk. + */ + if (chunk->tsn_gap_acked) { + list_add_tail(lchunk, &transport->transmitted); + lchunk = sctp_list_dequeue(lqueue); + continue; + } + + /* Attempt to append this chunk to the packet. */ + status = sctp_packet_append_chunk(pkt, chunk); + + switch (status) { + case SCTP_XMIT_PMTU_FULL: + /* Send this packet. */ + if ((error = sctp_packet_transmit(pkt)) == 0) + *start_timer = 1; + + /* If we are retransmitting, we should only + * send a single packet. + */ + if (rtx_timeout) { + list_add(lchunk, lqueue); + lchunk = NULL; + } + + /* Bundle lchunk in the next round. */ + break; + + case SCTP_XMIT_RWND_FULL: + /* Send this packet. */ + if ((error = sctp_packet_transmit(pkt)) == 0) + *start_timer = 1; + + /* Stop sending DATA as there is no more room + * at the receiver. + */ + list_add(lchunk, lqueue); + lchunk = NULL; + break; + + case SCTP_XMIT_NAGLE_DELAY: + /* Send this packet. */ + if ((error = sctp_packet_transmit(pkt)) == 0) + *start_timer = 1; + + /* Stop sending DATA because of nagle delay. */ + list_add(lchunk, lqueue); + lchunk = NULL; + break; + + default: + /* The append was successful, so add this chunk to + * the transmitted list. + */ + list_add_tail(lchunk, &transport->transmitted); + + /* Mark the chunk as ineligible for fast retransmit + * after it is retransmitted. + */ + chunk->fast_retransmit = 0; + + *start_timer = 1; + q->empty = 0; + + /* Retrieve a new chunk to bundle. */ + lchunk = sctp_list_dequeue(lqueue); + break; + }; + + /* If we are here due to a retransmit timeout or a fast + * retransmit and if there are any chunks left in the retransmit + * queue that could not fit in the PMTU sized packet, they need * to be marked as ineligible for a subsequent fast retransmit. 
+ */ + if (rtx_timeout && !lchunk) { + list_for_each(lchunk1, lqueue) { + chunk1 = list_entry(lchunk1, struct sctp_chunk, + transmitted_list); + chunk1->fast_retransmit = 0; + } + } + } + + return error; +} + +/* Cork the outqueue so queued chunks are really queued. */ +int sctp_outq_uncork(struct sctp_outq *q) +{ + int error = 0; + if (q->cork) { + q->cork = 0; + error = sctp_outq_flush(q, 0); + } + return error; +} + +/* + * Try to flush an outqueue. + * + * Description: Send everything in q which we legally can, subject to + * congestion limitations. + * * Note: This function can be called from multiple contexts so appropriate + * locking concerns must be made. Today we use the sock lock to protect + * this function. + */ +int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout) +{ + struct sctp_packet *packet; + struct sctp_packet singleton; + struct sctp_association *asoc = q->asoc; + __u16 sport = asoc->base.bind_addr.port; + __u16 dport = asoc->peer.port; + __u32 vtag = asoc->peer.i.init_tag; + struct sk_buff_head *queue; + struct sctp_transport *transport = NULL; + struct sctp_transport *new_transport; + struct sctp_chunk *chunk; + sctp_xmit_t status; + int error = 0; + int start_timer = 0; + + /* These transports have chunks to send. */ + struct list_head transport_list; + struct list_head *ltransport; + + INIT_LIST_HEAD(&transport_list); + packet = NULL; + + /* + * 6.10 Bundling + * ... + * When bundling control chunks with DATA chunks, an + * endpoint MUST place control chunks first in the outbound + * SCTP packet. The transmitter MUST transmit DATA chunks + * within a SCTP packet in increasing order of TSN. + * ... + */ + + queue = &q->control; + while ((chunk = (struct sctp_chunk *)skb_dequeue(queue)) != NULL) { + /* Pick the right transport to use. */ + new_transport = chunk->transport; + + if (!new_transport) { + new_transport = asoc->peer.active_path; + } else if (!new_transport->active) { + /* If the chunk is Heartbeat or Heartbeat Ack, + * send it to chunk->transport, even if it's + * inactive. + * + * 3.3.6 Heartbeat Acknowledgement: + * ... + * A HEARTBEAT ACK is always sent to the source IP + * address of the IP datagram containing the + * HEARTBEAT chunk to which this ack is responding. + * ... + */ + if (chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT && + chunk->chunk_hdr->type != SCTP_CID_HEARTBEAT_ACK) + new_transport = asoc->peer.active_path; + } + + /* Are we switching transports? + * Take care of transport locks. + */ + if (new_transport != transport) { + transport = new_transport; + if (list_empty(&transport->send_ready)) { + list_add_tail(&transport->send_ready, + &transport_list); + } + packet = &transport->packet; + sctp_packet_config(packet, vtag, + asoc->peer.ecn_capable); + } + + switch (chunk->chunk_hdr->type) { + /* + * 6.10 Bundling + * ... + * An endpoint MUST NOT bundle INIT, INIT ACK or SHUTDOWN + * COMPLETE with any other chunks. [Send them immediately.] 
+ */ + case SCTP_CID_INIT: + case SCTP_CID_INIT_ACK: + case SCTP_CID_SHUTDOWN_COMPLETE: + sctp_packet_init(&singleton, transport, sport, dport); + sctp_packet_config(&singleton, vtag, 0); + sctp_packet_append_chunk(&singleton, chunk); + error = sctp_packet_transmit(&singleton); + if (error < 0) + return error; + break; + + case SCTP_CID_ABORT: + case SCTP_CID_SACK: + case SCTP_CID_HEARTBEAT: + case SCTP_CID_HEARTBEAT_ACK: + case SCTP_CID_SHUTDOWN: + case SCTP_CID_SHUTDOWN_ACK: + case SCTP_CID_ERROR: + case SCTP_CID_COOKIE_ECHO: + case SCTP_CID_COOKIE_ACK: + case SCTP_CID_ECN_ECNE: + case SCTP_CID_ECN_CWR: + case SCTP_CID_ASCONF: + case SCTP_CID_ASCONF_ACK: + case SCTP_CID_FWD_TSN: + sctp_packet_transmit_chunk(packet, chunk); + break; + + default: + /* We built a chunk with an illegal type! */ + BUG(); + }; + } + + /* Is it OK to send data chunks? */ + switch (asoc->state) { + case SCTP_STATE_COOKIE_ECHOED: + /* Only allow bundling when this packet has a COOKIE-ECHO + * chunk. + */ + if (!packet || !packet->has_cookie_echo) + break; + + /* fallthru */ + case SCTP_STATE_ESTABLISHED: + case SCTP_STATE_SHUTDOWN_PENDING: + case SCTP_STATE_SHUTDOWN_RECEIVED: + /* + * RFC 2960 6.1 Transmission of DATA Chunks + * + * C) When the time comes for the sender to transmit, + * before sending new DATA chunks, the sender MUST + * first transmit any outstanding DATA chunks which + * are marked for retransmission (limited by the + * current cwnd). + */ + if (!list_empty(&q->retransmit)) { + if (transport == asoc->peer.retran_path) + goto retran; + + /* Switch transports & prepare the packet. */ + + transport = asoc->peer.retran_path; + + if (list_empty(&transport->send_ready)) { + list_add_tail(&transport->send_ready, + &transport_list); + } + + packet = &transport->packet; + sctp_packet_config(packet, vtag, + asoc->peer.ecn_capable); + retran: + error = sctp_outq_flush_rtx(q, packet, + rtx_timeout, &start_timer); + + if (start_timer) + sctp_transport_reset_timers(transport); + + /* This can happen on COOKIE-ECHO resend. Only + * one chunk can get bundled with a COOKIE-ECHO. + */ + if (packet->has_cookie_echo) + goto sctp_flush_out; + + /* Don't send new data if there is still data + * waiting to retransmit. + */ + if (!list_empty(&q->retransmit)) + goto sctp_flush_out; + } + + /* Finally, transmit new packets. */ + start_timer = 0; + queue = &q->out; + + while ((chunk = sctp_outq_dequeue_data(q)) != NULL) { + /* RFC 2960 6.5 Every DATA chunk MUST carry a valid + * stream identifier. + */ + if (chunk->sinfo.sinfo_stream >= + asoc->c.sinit_num_ostreams) { + + /* Mark as failed send. */ + sctp_chunk_fail(chunk, SCTP_ERROR_INV_STRM); + sctp_chunk_free(chunk); + continue; + } + + /* Has this chunk expired? */ + if (sctp_chunk_abandoned(chunk)) { + sctp_chunk_fail(chunk, 0); + sctp_chunk_free(chunk); + continue; + } + + /* If there is a specified transport, use it. + * Otherwise, we want to use the active path. + */ + new_transport = chunk->transport; + if (!new_transport || !new_transport->active) + new_transport = asoc->peer.active_path; + + /* Change packets if necessary. */ + if (new_transport != transport) { + transport = new_transport; + + /* Schedule to have this transport's + * packet flushed. + */ + if (list_empty(&transport->send_ready)) { + list_add_tail(&transport->send_ready, + &transport_list); + } + + packet = &transport->packet; + sctp_packet_config(packet, vtag, + asoc->peer.ecn_capable); + } + + SCTP_DEBUG_PRINTK("sctp_outq_flush(%p, %p[%s]), ", + q, chunk, + chunk && chunk->chunk_hdr ? 
+ sctp_cname(SCTP_ST_CHUNK( + chunk->chunk_hdr->type)) + : "Illegal Chunk"); + + SCTP_DEBUG_PRINTK("TX TSN 0x%x skb->head " + "%p skb->users %d.\n", + ntohl(chunk->subh.data_hdr->tsn), + chunk->skb ?chunk->skb->head : NULL, + chunk->skb ? + atomic_read(&chunk->skb->users) : -1); + + /* Add the chunk to the packet. */ + status = sctp_packet_transmit_chunk(packet, chunk); + + switch (status) { + case SCTP_XMIT_PMTU_FULL: + case SCTP_XMIT_RWND_FULL: + case SCTP_XMIT_NAGLE_DELAY: + /* We could not append this chunk, so put + * the chunk back on the output queue. + */ + SCTP_DEBUG_PRINTK("sctp_outq_flush: could " + "not transmit TSN: 0x%x, status: %d\n", + ntohl(chunk->subh.data_hdr->tsn), + status); + sctp_outq_head_data(q, chunk); + goto sctp_flush_out; + break; + + case SCTP_XMIT_OK: + break; + + default: + BUG(); + } + + /* BUG: We assume that the sctp_packet_transmit() + * call below will succeed all the time and add the + * chunk to the transmitted list and restart the + * timers. + * It is possible that the call can fail under OOM + * conditions. + * + * Is this really a problem? Won't this behave + * like a lost TSN? + */ + list_add_tail(&chunk->transmitted_list, + &transport->transmitted); + + sctp_transport_reset_timers(transport); + + q->empty = 0; + + /* Only let one DATA chunk get bundled with a + * COOKIE-ECHO chunk. + */ + if (packet->has_cookie_echo) + goto sctp_flush_out; + } + break; + + default: + /* Do nothing. */ + break; + } + +sctp_flush_out: + + /* Before returning, examine all the transports touched in + * this call. Right now, we bluntly force clear all the + * transports. Things might change after we implement Nagle. + * But such an examination is still required. + * + * --xguo + */ + while ((ltransport = sctp_list_dequeue(&transport_list)) != NULL ) { + struct sctp_transport *t = list_entry(ltransport, + struct sctp_transport, + send_ready); + packet = &t->packet; + if (!sctp_packet_empty(packet)) + error = sctp_packet_transmit(packet); + } + + return error; +} + +/* Update unack_data based on the incoming SACK chunk */ +static void sctp_sack_update_unack_data(struct sctp_association *assoc, + struct sctp_sackhdr *sack) +{ + sctp_sack_variable_t *frags; + __u16 unack_data; + int i; + + unack_data = assoc->next_tsn - assoc->ctsn_ack_point - 1; + + frags = sack->variable; + for (i = 0; i < ntohs(sack->num_gap_ack_blocks); i++) { + unack_data -= ((ntohs(frags[i].gab.end) - + ntohs(frags[i].gab.start) + 1)); + } + + assoc->unack_data = unack_data; +} + +/* Return the highest new tsn that is acknowledged by the given SACK chunk. */ +static __u32 sctp_highest_new_tsn(struct sctp_sackhdr *sack, + struct sctp_association *asoc) +{ + struct list_head *ltransport, *lchunk; + struct sctp_transport *transport; + struct sctp_chunk *chunk; + __u32 highest_new_tsn, tsn; + struct list_head *transport_list = &asoc->peer.transport_addr_list; + + highest_new_tsn = ntohl(sack->cum_tsn_ack); + + list_for_each(ltransport, transport_list) { + transport = list_entry(ltransport, struct sctp_transport, + transports); + list_for_each(lchunk, &transport->transmitted) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + tsn = ntohl(chunk->subh.data_hdr->tsn); + + if (!chunk->tsn_gap_acked && + TSN_lt(highest_new_tsn, tsn) && + sctp_acked(sack, tsn)) + highest_new_tsn = tsn; + } + } + + return highest_new_tsn; +} + +/* This is where we REALLY process a SACK. + * + * Process the SACK against the outqueue. Mostly, this just frees + * things off the transmitted queue. 
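sctp_sack_update_unack_data() above is pure arithmetic: every TSN from the cumulative ack point (exclusive) up to next_tsn - 1 is outstanding, and each gap ack block removes (end - start + 1) of them. A short worked sketch with hypothetical TSN values (gap block offsets are relative to the cumulative ack, as in the SACK format):

#include <stdio.h>
#include <stdint.h>

struct gap { uint16_t start, end; }; /* offsets from cum_tsn_ack */

/* Count TSNs that have been sent but not yet acknowledged, either
 * cumulatively or inside a gap ack block.
 */
static uint16_t unack_data(uint32_t next_tsn, uint32_t cum_tsn_ack,
			   const struct gap *gaps, int ngaps)
{
	uint16_t unack = (uint16_t)(next_tsn - cum_tsn_ack - 1);
	int i;

	for (i = 0; i < ngaps; i++)
		unack -= gaps[i].end - gaps[i].start + 1;
	return unack;
}

int main(void)
{
	/* Sent 100..119 (next_tsn = 120), cumulatively acked through 104,
	 * one gap block acking 107..109 (offsets 3..5): 15 TSNs are
	 * outstanding, 3 of them gap-acked, so 12 remain unacknowledged.
	 */
	struct gap gaps[] = { { 3, 5 } };

	printf("%u\n", unack_data(120, 104, gaps, 1)); /* 12 */
	return 0;
}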
+ */ +int sctp_outq_sack(struct sctp_outq *q, struct sctp_sackhdr *sack) +{ + struct sctp_association *asoc = q->asoc; + struct sctp_transport *transport; + struct sctp_chunk *tchunk = NULL; + struct list_head *lchunk, *transport_list, *pos, *temp; + sctp_sack_variable_t *frags = sack->variable; + __u32 sack_ctsn, ctsn, tsn; + __u32 highest_tsn, highest_new_tsn; + __u32 sack_a_rwnd; + unsigned outstanding; + struct sctp_transport *primary = asoc->peer.primary_path; + int count_of_newacks = 0; + + /* Grab the association's destination address list. */ + transport_list = &asoc->peer.transport_addr_list; + + sack_ctsn = ntohl(sack->cum_tsn_ack); + + /* + * SFR-CACC algorithm: + * On receipt of a SACK the sender SHOULD execute the + * following statements. + * + * 1) If the cumulative ack in the SACK passes next tsn_at_change + * on the current primary, the CHANGEOVER_ACTIVE flag SHOULD be + * cleared. The CYCLING_CHANGEOVER flag SHOULD also be cleared for + * all destinations. + */ + if (TSN_lte(primary->cacc.next_tsn_at_change, sack_ctsn)) { + primary->cacc.changeover_active = 0; + list_for_each(pos, transport_list) { + transport = list_entry(pos, struct sctp_transport, + transports); + transport->cacc.cycling_changeover = 0; + } + } + + /* + * SFR-CACC algorithm: + * 2) If the SACK contains gap acks and the flag CHANGEOVER_ACTIVE + * is set the receiver of the SACK MUST take the following actions: + * + * A) Initialize the cacc_saw_newack to 0 for all destination + * addresses. + */ + if (sack->num_gap_ack_blocks > 0 && + primary->cacc.changeover_active) { + list_for_each(pos, transport_list) { + transport = list_entry(pos, struct sctp_transport, + transports); + transport->cacc.cacc_saw_newack = 0; + } + } + + /* Get the highest TSN in the sack. */ + highest_tsn = sack_ctsn; + if (sack->num_gap_ack_blocks) + highest_tsn += + ntohs(frags[ntohs(sack->num_gap_ack_blocks) - 1].gab.end); + + if (TSN_lt(asoc->highest_sacked, highest_tsn)) { + highest_new_tsn = highest_tsn; + asoc->highest_sacked = highest_tsn; + } else { + highest_new_tsn = sctp_highest_new_tsn(sack, asoc); + } + + /* Run through the retransmit queue. Credit bytes received + * and free those chunks that we can. + */ + sctp_check_transmitted(q, &q->retransmit, NULL, sack, highest_new_tsn); + sctp_mark_missing(q, &q->retransmit, NULL, highest_new_tsn, 0); + + /* Run through the transmitted queue. + * Credit bytes received and free those chunks which we can. + * + * This is a MASSIVE candidate for optimization. + */ + list_for_each(pos, transport_list) { + transport = list_entry(pos, struct sctp_transport, + transports); + sctp_check_transmitted(q, &transport->transmitted, + transport, sack, highest_new_tsn); + /* + * SFR-CACC algorithm: + * C) Let count_of_newacks be the number of + * destinations for which cacc_saw_newack is set. + */ + if (transport->cacc.cacc_saw_newack) + count_of_newacks ++; + } + + list_for_each(pos, transport_list) { + transport = list_entry(pos, struct sctp_transport, + transports); + sctp_mark_missing(q, &transport->transmitted, transport, + highest_new_tsn, count_of_newacks); + } + + /* Move the Cumulative TSN Ack Point if appropriate. */ + if (TSN_lt(asoc->ctsn_ack_point, sack_ctsn)) + asoc->ctsn_ack_point = sack_ctsn; + + /* Update unack_data field in the assoc. */ + sctp_sack_update_unack_data(asoc, sack); + + ctsn = asoc->ctsn_ack_point; + + /* Throw away stuff rotting on the sack queue. 
*/ + list_for_each_safe(lchunk, temp, &q->sacked) { + tchunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + tsn = ntohl(tchunk->subh.data_hdr->tsn); + if (TSN_lte(tsn, ctsn)) + sctp_chunk_free(tchunk); + } + + /* ii) Set rwnd equal to the newly received a_rwnd minus the + * number of bytes still outstanding after processing the + * Cumulative TSN Ack and the Gap Ack Blocks. + */ + + sack_a_rwnd = ntohl(sack->a_rwnd); + outstanding = q->outstanding_bytes; + + if (outstanding < sack_a_rwnd) + sack_a_rwnd -= outstanding; + else + sack_a_rwnd = 0; + + asoc->peer.rwnd = sack_a_rwnd; + + sctp_generate_fwdtsn(q, sack_ctsn); + + SCTP_DEBUG_PRINTK("%s: sack Cumulative TSN Ack is 0x%x.\n", + __FUNCTION__, sack_ctsn); + SCTP_DEBUG_PRINTK("%s: Cumulative TSN Ack of association, " + "%p is 0x%x. Adv peer ack point: 0x%x\n", + __FUNCTION__, asoc, ctsn, asoc->adv_peer_ack_point); + + /* See if all chunks are acked. + * Make sure the empty queue handler will get run later. + */ + q->empty = skb_queue_empty(&q->out) && skb_queue_empty(&q->control) && + list_empty(&q->retransmit); + if (!q->empty) + goto finish; + + list_for_each(pos, transport_list) { + transport = list_entry(pos, struct sctp_transport, + transports); + q->empty = q->empty && list_empty(&transport->transmitted); + if (!q->empty) + goto finish; + } + + SCTP_DEBUG_PRINTK("sack queue is empty.\n"); +finish: + return q->empty; +} + +/* Is the outqueue empty? */ +int sctp_outq_is_empty(const struct sctp_outq *q) +{ + return q->empty; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Go through a transport's transmitted list or the association's retransmit + * list and move chunks that are acked by the Cumulative TSN Ack to q->sacked. + * The retransmit list will not have an associated transport. + * + * I added coherent debug information output. --xguo + * + * Instead of printing 'sacked' or 'kept' for each TSN on the + * transmitted_queue, we print a range: SACKED: TSN1-TSN2, TSN3, TSN4-TSN5. + * KEPT TSN6-TSN7, etc. + */ +static void sctp_check_transmitted(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + struct sctp_sackhdr *sack, + __u32 highest_new_tsn_in_sack) +{ + struct list_head *lchunk; + struct sctp_chunk *tchunk; + struct list_head tlist; + __u32 tsn; + __u32 sack_ctsn; + __u32 rtt; + __u8 restart_timer = 0; + int bytes_acked = 0; + + /* These state variables are for coherent debug output. --xguo */ + +#if SCTP_DEBUG + __u32 dbg_ack_tsn = 0; /* An ACKed TSN range starts here... */ + __u32 dbg_last_ack_tsn = 0; /* ...and finishes here. */ + __u32 dbg_kept_tsn = 0; /* An un-ACKed range starts here... */ + __u32 dbg_last_kept_tsn = 0; /* ...and finishes here. */ + + /* 0 : The last TSN was ACKed. + * 1 : The last TSN was NOT ACKed (i.e. KEPT). + * -1: We need to initialize. + */ + int dbg_prt_state = -1; +#endif /* SCTP_DEBUG */ + + sack_ctsn = ntohl(sack->cum_tsn_ack); + + INIT_LIST_HEAD(&tlist); + + /* The while loop will skip empty transmitted queues. */ + while (NULL != (lchunk = sctp_list_dequeue(transmitted_queue))) { + tchunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + + if (sctp_chunk_abandoned(tchunk)) { + /* Move the chunk to abandoned list. 
*/ + sctp_insert_list(&q->abandoned, lchunk); + continue; + } + + tsn = ntohl(tchunk->subh.data_hdr->tsn); + if (sctp_acked(sack, tsn)) { + /* If this queue is the retransmit queue, the + * retransmit timer has already reclaimed + * the outstanding bytes for this chunk, so only + * count bytes associated with a transport. + */ + if (transport) { + /* If this chunk is being used for RTT + * measurement, calculate the RTT and update + * the RTO using this value. + * + * 6.3.1 C5) Karn's algorithm: RTT measurements + * MUST NOT be made using packets that were + * retransmitted (and thus for which it is + * ambiguous whether the reply was for the + * first instance of the packet or a later + * instance). + */ + if (!tchunk->tsn_gap_acked && + !tchunk->resent && + tchunk->rtt_in_progress) { + rtt = jiffies - tchunk->sent_at; + sctp_transport_update_rto(transport, + rtt); + } + } + if (TSN_lte(tsn, sack_ctsn)) { + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R3) Whenever a SACK is received + * that acknowledges the DATA chunk + * with the earliest outstanding TSN + * for that address, restart T3-rtx + * timer for that address with its + * current RTO. + */ + restart_timer = 1; + + if (!tchunk->tsn_gap_acked) { + tchunk->tsn_gap_acked = 1; + bytes_acked += sctp_data_size(tchunk); + /* + * SFR-CACC algorithm: + * 2) If the SACK contains gap acks + * and the flag CHANGEOVER_ACTIVE is + * set the receiver of the SACK MUST + * take the following action: + * + * B) For each TSN t being acked that + * has not been acked in any SACK so + * far, set cacc_saw_newack to 1 for + * the destination that the TSN was + * sent to. + */ + if (transport && + sack->num_gap_ack_blocks && + q->asoc->peer.primary_path->cacc. + changeover_active) + transport->cacc.cacc_saw_newack + = 1; + } + + list_add_tail(&tchunk->transmitted_list, + &q->sacked); + } else { + /* RFC2960 7.2.4, sctpimpguide-05 2.8.2 + * M2) Each time a SACK arrives reporting + * 'Stray DATA chunk(s)' record the highest TSN + * reported as newly acknowledged, call this + * value 'HighestTSNinSack'. A newly + * acknowledged DATA chunk is one not + * previously acknowledged in a SACK. + * + * When the SCTP sender of data receives a SACK + * chunk that acknowledges, for the first time, + * the receipt of a DATA chunk, all the still + * unacknowledged DATA chunks whose TSN is + * older than that newly acknowledged DATA + * chunk, are qualified as 'Stray DATA chunks'. + */ + if (!tchunk->tsn_gap_acked) { + tchunk->tsn_gap_acked = 1; + bytes_acked += sctp_data_size(tchunk); + } + list_add_tail(lchunk, &tlist); + } + +#if SCTP_DEBUG + switch (dbg_prt_state) { + case 0: /* last TSN was ACKed */ + if (dbg_last_ack_tsn + 1 == tsn) { + /* This TSN belongs to the + * current ACK range. + */ + break; + } + + if (dbg_last_ack_tsn != dbg_ack_tsn) { + /* Display the end of the + * current range. + */ + SCTP_DEBUG_PRINTK("-%08x", + dbg_last_ack_tsn); + } + + /* Start a new range. */ + SCTP_DEBUG_PRINTK(",%08x", tsn); + dbg_ack_tsn = tsn; + break; + + case 1: /* The last TSN was NOT ACKed. */ + if (dbg_last_kept_tsn != dbg_kept_tsn) { + /* Display the end of current range. */ + SCTP_DEBUG_PRINTK("-%08x", + dbg_last_kept_tsn); + } + + SCTP_DEBUG_PRINTK("\n"); + + /* FALL THROUGH... */ + default: + /* This is the first-ever TSN we examined. */ + /* Start a new range of ACK-ed TSNs. 
*/ + SCTP_DEBUG_PRINTK("ACKed: %08x", tsn); + dbg_prt_state = 0; + dbg_ack_tsn = tsn; + }; + + dbg_last_ack_tsn = tsn; +#endif /* SCTP_DEBUG */ + + } else { + if (tchunk->tsn_gap_acked) { + SCTP_DEBUG_PRINTK("%s: Receiver reneged on " + "data TSN: 0x%x\n", + __FUNCTION__, + tsn); + tchunk->tsn_gap_acked = 0; + + bytes_acked -= sctp_data_size(tchunk); + + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R4) Whenever a SACK is received missing a + * TSN that was previously acknowledged via a + * Gap Ack Block, start T3-rtx for the + * destination address to which the DATA + * chunk was originally + * transmitted if it is not already running. + */ + restart_timer = 1; + } + + list_add_tail(lchunk, &tlist); + +#if SCTP_DEBUG + /* See the above comments on ACK-ed TSNs. */ + switch (dbg_prt_state) { + case 1: + if (dbg_last_kept_tsn + 1 == tsn) + break; + + if (dbg_last_kept_tsn != dbg_kept_tsn) + SCTP_DEBUG_PRINTK("-%08x", + dbg_last_kept_tsn); + + SCTP_DEBUG_PRINTK(",%08x", tsn); + dbg_kept_tsn = tsn; + break; + + case 0: + if (dbg_last_ack_tsn != dbg_ack_tsn) + SCTP_DEBUG_PRINTK("-%08x", + dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK("\n"); + + /* FALL THROUGH... */ + default: + SCTP_DEBUG_PRINTK("KEPT: %08x",tsn); + dbg_prt_state = 1; + dbg_kept_tsn = tsn; + }; + + dbg_last_kept_tsn = tsn; +#endif /* SCTP_DEBUG */ + } + } + +#if SCTP_DEBUG + /* Finish off the last range, displaying its ending TSN. */ + switch (dbg_prt_state) { + case 0: + if (dbg_last_ack_tsn != dbg_ack_tsn) { + SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn); + } else { + SCTP_DEBUG_PRINTK("\n"); + } + break; + + case 1: + if (dbg_last_kept_tsn != dbg_kept_tsn) { + SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn); + } else { + SCTP_DEBUG_PRINTK("\n"); + } + }; +#endif /* SCTP_DEBUG */ + if (transport) { + if (bytes_acked) { + /* 8.2. When an outstanding TSN is acknowledged, + * the endpoint shall clear the error counter of + * the destination transport address to which the + * DATA chunk was last sent. + * The association's overall error counter is + * also cleared. + */ + transport->error_count = 0; + transport->asoc->overall_error_count = 0; + + /* Mark the destination transport address as + * active if it is not so marked. + */ + if (!transport->active) { + sctp_assoc_control_transport( + transport->asoc, + transport, + SCTP_TRANSPORT_UP, + SCTP_RECEIVED_SACK); + } + + sctp_transport_raise_cwnd(transport, sack_ctsn, + bytes_acked); + + transport->flight_size -= bytes_acked; + q->outstanding_bytes -= bytes_acked; + } else { + /* RFC 2960 6.1, sctpimpguide-06 2.15.2 + * When a sender is doing zero window probing, it + * should not timeout the association if it continues + * to receive new packets from the receiver. The + * reason is that the receiver MAY keep its window + * closed for an indefinite time. + * A sender is doing zero window probing when the + * receiver's advertised window is zero, and there is + * only one data chunk in flight to the receiver. + */ + if (!q->asoc->peer.rwnd && + !list_empty(&tlist) && + (sack_ctsn+2 == q->asoc->next_tsn)) { + SCTP_DEBUG_PRINTK("%s: SACK received for zero " + "window probe: %u\n", + __FUNCTION__, sack_ctsn); + q->asoc->overall_error_count = 0; + transport->error_count = 0; + } + } + + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R2) Whenever all outstanding data sent to an address have + * been acknowledged, turn off the T3-rtx timer of that + * address. 
+ */ + if (!transport->flight_size) { + if (timer_pending(&transport->T3_rtx_timer) && + del_timer(&transport->T3_rtx_timer)) { + sctp_transport_put(transport); + } + } else if (restart_timer) { + if (!mod_timer(&transport->T3_rtx_timer, + jiffies + transport->rto)) + sctp_transport_hold(transport); + } + } + + list_splice(&tlist, transmitted_queue); +} + +/* Mark chunks as missing and consequently may get retransmitted. */ +static void sctp_mark_missing(struct sctp_outq *q, + struct list_head *transmitted_queue, + struct sctp_transport *transport, + __u32 highest_new_tsn_in_sack, + int count_of_newacks) +{ + struct sctp_chunk *chunk; + struct list_head *pos; + __u32 tsn; + char do_fast_retransmit = 0; + struct sctp_transport *primary = q->asoc->peer.primary_path; + + list_for_each(pos, transmitted_queue) { + + chunk = list_entry(pos, struct sctp_chunk, transmitted_list); + tsn = ntohl(chunk->subh.data_hdr->tsn); + + /* RFC 2960 7.2.4, sctpimpguide-05 2.8.2 M3) Examine all + * 'Unacknowledged TSN's', if the TSN number of an + * 'Unacknowledged TSN' is smaller than the 'HighestTSNinSack' + * value, increment the 'TSN.Missing.Report' count on that + * chunk if it has NOT been fast retransmitted or marked for + * fast retransmit already. + */ + if (!chunk->fast_retransmit && + !chunk->tsn_gap_acked && + TSN_lt(tsn, highest_new_tsn_in_sack)) { + + /* SFR-CACC may require us to skip marking + * this chunk as missing. + */ + if (!transport || !sctp_cacc_skip(primary, transport, + count_of_newacks, tsn)) { + chunk->tsn_missing_report++; + + SCTP_DEBUG_PRINTK( + "%s: TSN 0x%x missing counter: %d\n", + __FUNCTION__, tsn, + chunk->tsn_missing_report); + } + } + /* + * M4) If any DATA chunk is found to have a + * 'TSN.Missing.Report' + * value larger than or equal to 4, mark that chunk for + * retransmission and start the fast retransmit procedure. + */ + + if (chunk->tsn_missing_report >= 4) { + chunk->fast_retransmit = 1; + do_fast_retransmit = 1; + } + } + + if (transport) { + if (do_fast_retransmit) + sctp_retransmit(q, transport, SCTP_RTXR_FAST_RTX); + + SCTP_DEBUG_PRINTK("%s: transport: %p, cwnd: %d, " + "ssthresh: %d, flight_size: %d, pba: %d\n", + __FUNCTION__, transport, transport->cwnd, + transport->ssthresh, transport->flight_size, + transport->partial_bytes_acked); + } +} + +/* Is the given TSN acked by this packet? */ +static int sctp_acked(struct sctp_sackhdr *sack, __u32 tsn) +{ + int i; + sctp_sack_variable_t *frags; + __u16 gap; + __u32 ctsn = ntohl(sack->cum_tsn_ack); + + if (TSN_lte(tsn, ctsn)) + goto pass; + + /* 3.3.4 Selective Acknowledgement (SACK) (3): + * + * Gap Ack Blocks: + * These fields contain the Gap Ack Blocks. They are repeated + * for each Gap Ack Block up to the number of Gap Ack Blocks + * defined in the Number of Gap Ack Blocks field. All DATA + * chunks with TSNs greater than or equal to (Cumulative TSN + * Ack + Gap Ack Block Start) and less than or equal to + * (Cumulative TSN Ack + Gap Ack Block End) of each Gap Ack + * Block are assumed to have been received correctly. 
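The Gap Ack Block test described above can be sketched in a few lines of standalone C. The flat arrays and helper names below are illustrative stand-ins for the real SACK header layout, assuming gap starts/ends are offsets from the Cumulative TSN Ack and TSN comparisons use wrap-safe serial arithmetic.

/* Minimal sketch of the "is this TSN acked by the SACK?" test: a TSN is
 * acked if it is <= the Cumulative TSN Ack, or if its offset from that
 * cumulative ack falls inside one of the Gap Ack Blocks.
 */
#include <stdio.h>
#include <stdint.h>

/* Serial-number style comparison so TSN wrap-around is handled. */
static int tsn_lte(uint32_t a, uint32_t b)
{
    return a == b || (int32_t)(a - b) < 0;
}

static int tsn_acked(uint32_t tsn, uint32_t cum_tsn_ack,
                     const uint16_t *gap_start, const uint16_t *gap_end,
                     int nblocks)
{
    uint32_t gap;
    int i;

    if (tsn_lte(tsn, cum_tsn_ack))
        return 1;

    gap = tsn - cum_tsn_ack;    /* offset form used by the gap blocks */
    for (i = 0; i < nblocks; i++)
        if (gap_start[i] <= gap && gap <= gap_end[i])
            return 1;
    return 0;
}

int main(void)
{
    uint16_t start[] = { 2, 7 };  /* acks ctsn+2..ctsn+4 and ctsn+7..ctsn+8 */
    uint16_t end[]   = { 4, 8 };
    uint32_t ctsn = 1000;

    printf("%d %d %d\n",
           tsn_acked(1000, ctsn, start, end, 2),  /* 1: covered by ctsn  */
           tsn_acked(1003, ctsn, start, end, 2),  /* 1: inside first gap */
           tsn_acked(1005, ctsn, start, end, 2)); /* 0: reported missing */
    return 0;
}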
+ */ + + frags = sack->variable; + gap = tsn - ctsn; + for (i = 0; i < ntohs(sack->num_gap_ack_blocks); ++i) { + if (TSN_lte(ntohs(frags[i].gab.start), gap) && + TSN_lte(gap, ntohs(frags[i].gab.end))) + goto pass; + } + + return 0; +pass: + return 1; +} + +static inline int sctp_get_skip_pos(struct sctp_fwdtsn_skip *skiplist, + int nskips, __u16 stream) +{ + int i; + + for (i = 0; i < nskips; i++) { + if (skiplist[i].stream == stream) + return i; + } + return i; +} + +/* Create and add a fwdtsn chunk to the outq's control queue if needed. */ +static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 ctsn) +{ + struct sctp_association *asoc = q->asoc; + struct sctp_chunk *ftsn_chunk = NULL; + struct sctp_fwdtsn_skip ftsn_skip_arr[10]; + int nskips = 0; + int skip_pos = 0; + __u32 tsn; + struct sctp_chunk *chunk; + struct list_head *lchunk, *temp; + + /* PR-SCTP C1) Let SackCumAck be the Cumulative TSN ACK carried in the + * received SACK. + * + * If (Advanced.Peer.Ack.Point < SackCumAck), then update + * Advanced.Peer.Ack.Point to be equal to SackCumAck. + */ + if (TSN_lt(asoc->adv_peer_ack_point, ctsn)) + asoc->adv_peer_ack_point = ctsn; + + /* PR-SCTP C2) Try to further advance the "Advanced.Peer.Ack.Point" + * locally, that is, to move "Advanced.Peer.Ack.Point" up as long as + * the chunk next in the out-queue space is marked as "abandoned" as + * shown in the following example: + * + * Assuming that a SACK arrived with the Cumulative TSN ACK 102 + * and the Advanced.Peer.Ack.Point is updated to this value: + * + * out-queue at the end of ==> out-queue after Adv.Ack.Point + * normal SACK processing local advancement + * ... ... + * Adv.Ack.Pt-> 102 acked 102 acked + * 103 abandoned 103 abandoned + * 104 abandoned Adv.Ack.P-> 104 abandoned + * 105 105 + * 106 acked 106 acked + * ... ... + * + * In this example, the data sender successfully advanced the + * "Advanced.Peer.Ack.Point" from 102 to 104 locally. + */ + list_for_each_safe(lchunk, temp, &q->abandoned) { + chunk = list_entry(lchunk, struct sctp_chunk, + transmitted_list); + tsn = ntohl(chunk->subh.data_hdr->tsn); + + /* Remove any chunks in the abandoned queue that are acked by + * the ctsn. + */ + if (TSN_lte(tsn, ctsn)) { + list_del_init(lchunk); + if (!chunk->tsn_gap_acked) { + chunk->transport->flight_size -= + sctp_data_size(chunk); + q->outstanding_bytes -= sctp_data_size(chunk); + } + sctp_chunk_free(chunk); + } else { + if (TSN_lte(tsn, asoc->adv_peer_ack_point+1)) { + asoc->adv_peer_ack_point = tsn; + if (chunk->chunk_hdr->flags & + SCTP_DATA_UNORDERED) + continue; + skip_pos = sctp_get_skip_pos(&ftsn_skip_arr[0], + nskips, + chunk->subh.data_hdr->stream); + ftsn_skip_arr[skip_pos].stream = + chunk->subh.data_hdr->stream; + ftsn_skip_arr[skip_pos].ssn = + chunk->subh.data_hdr->ssn; + if (skip_pos == nskips) + nskips++; + if (nskips == 10) + break; + } else + break; + } + } + + /* PR-SCTP C3) If, after step C1 and C2, the "Advanced.Peer.Ack.Point" + * is greater than the Cumulative TSN ACK carried in the received + * SACK, the data sender MUST send the data receiver a FORWARD TSN + * chunk containing the latest value of the + * "Advanced.Peer.Ack.Point". + * + * C4) For each "abandoned" TSN the sender of the FORWARD TSN SHOULD + * list each stream and sequence number in the forwarded TSN. This + * information will enable the receiver to easily find any + * stranded TSN's waiting on stream reorder queues. 
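A minimal sketch of the C1/C2 advancement illustrated by the table above, assuming a sorted array of outstanding TSNs stands in for the out-queue; this is not the kernel's sctp_generate_fwdtsn().

/* Standalone sketch of PR-SCTP rules C1/C2: lift Advanced.Peer.Ack.Point to
 * the SACK's cumulative ack, then keep advancing it across consecutive
 * "abandoned" TSNs, stopping at the first TSN that is still outstanding.
 */
#include <stdio.h>
#include <stdint.h>

struct out_chunk {
    uint32_t tsn;
    int abandoned;        /* 1 if the message's lifetime expired */
};

static uint32_t advance_peer_ack_point(uint32_t adv_ack_pt, uint32_t sack_cum_ack,
                                       const struct out_chunk *q, int n)
{
    int i;

    /* C1) Advanced.Peer.Ack.Point is never behind the cumulative ack. */
    if ((int32_t)(adv_ack_pt - sack_cum_ack) < 0)
        adv_ack_pt = sack_cum_ack;

    /* C2) Move it forward while the next TSN in queue order is abandoned. */
    for (i = 0; i < n; i++) {
        if (q[i].tsn != adv_ack_pt + 1)
            continue;
        if (!q[i].abandoned)
            break;
        adv_ack_pt = q[i].tsn;
    }
    return adv_ack_pt;
}

int main(void)
{
    /* Mirrors the example above: 103 and 104 abandoned, 105 still in flight. */
    struct out_chunk q[] = {
        { 103, 1 }, { 104, 1 }, { 105, 0 }, { 106, 0 },
    };
    uint32_t adv = advance_peer_ack_point(100, 102, q, 4);

    printf("Advanced.Peer.Ack.Point = %u\n", adv);   /* prints 104 */
    return 0;
}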
Each stream + * SHOULD only be reported once; this means that if multiple + * abandoned messages occur in the same stream then only the + * highest abandoned stream sequence number is reported. If the + * total size of the FORWARD TSN does NOT fit in a single MTU then + * the sender of the FORWARD TSN SHOULD lower the + * Advanced.Peer.Ack.Point to the last TSN that will fit in a + * single MTU. + */ + if (asoc->adv_peer_ack_point > ctsn) + ftsn_chunk = sctp_make_fwdtsn(asoc, asoc->adv_peer_ack_point, + nskips, &ftsn_skip_arr[0]); + + if (ftsn_chunk) { + __skb_queue_tail(&q->control, (struct sk_buff *)ftsn_chunk); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } +} diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c new file mode 100644 index 000000000000..3a7ebfcc1fdb --- /dev/null +++ b/net/sctp/primitive.c @@ -0,0 +1,219 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions implement the SCTP primitive functions from Section 10. + * + * Note that the descriptions from the specification are USER level + * functions--this file is the functions which populate the struct proto + * for SCTP which is the BOTTOM of the sockets interface. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Narasimha Budihal + * Karl Knutson + * Ardelle Fan + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include /* For struct list_head */ +#include +#include +#include /* For struct timeval */ +#include +#include +#include + +#define DECLARE_PRIMITIVE(name) \ +/* This is called in the code as sctp_primitive_ ## name. */ \ +int sctp_primitive_ ## name(struct sctp_association *asoc, \ + void *arg) { \ + int error = 0; \ + sctp_event_t event_type; sctp_subtype_t subtype; \ + sctp_state_t state; \ + struct sctp_endpoint *ep; \ + \ + event_type = SCTP_EVENT_T_PRIMITIVE; \ + subtype = SCTP_ST_PRIMITIVE(SCTP_PRIMITIVE_ ## name); \ + state = asoc ? asoc->state : SCTP_STATE_CLOSED; \ + ep = asoc ? 
asoc->ep : NULL; \ + \ + error = sctp_do_sm(event_type, subtype, state, ep, asoc, \ + arg, GFP_KERNEL); \ + return error; \ +} + +/* 10.1 ULP-to-SCTP + * B) Associate + * + * Format: ASSOCIATE(local SCTP instance name, destination transport addr, + * outbound stream count) + * -> association id [,destination transport addr list] [,outbound stream + * count] + * + * This primitive allows the upper layer to initiate an association to a + * specific peer endpoint. + * + * This version assumes that asoc is fully populated with the initial + * parameters. We then return a traditional kernel indicator of + * success or failure. + */ + +/* This is called in the code as sctp_primitive_ASSOCIATE. */ + +DECLARE_PRIMITIVE(ASSOCIATE) + +/* 10.1 ULP-to-SCTP + * C) Shutdown + * + * Format: SHUTDOWN(association id) + * -> result + * + * Gracefully closes an association. Any locally queued user data + * will be delivered to the peer. The association will be terminated only + * after the peer acknowledges all the SCTP packets sent. A success code + * will be returned on successful termination of the association. If + * attempting to terminate the association results in a failure, an error + * code shall be returned. + */ + +DECLARE_PRIMITIVE(SHUTDOWN); + +/* 10.1 ULP-to-SCTP + * C) Abort + * + * Format: Abort(association id [, cause code]) + * -> result + * + * Ungracefully closes an association. Any locally queued user data + * will be discarded and an ABORT chunk is sent to the peer. A success + * code will be returned on successful abortion of the association. If + * attempting to abort the association results in a failure, an error + * code shall be returned. + */ + +DECLARE_PRIMITIVE(ABORT); + +/* 10.1 ULP-to-SCTP + * E) Send + * + * Format: SEND(association id, buffer address, byte count [,context] + * [,stream id] [,life time] [,destination transport address] + * [,unorder flag] [,no-bundle flag] [,payload protocol-id] ) + * -> result + * + * This is the main method to send user data via SCTP. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o buffer address - the location where the user message to be + * transmitted is stored; + * + * o byte count - The size of the user data in number of bytes; + * + * Optional attributes: + * + * o context - an optional 32 bit integer that will be carried in the + * sending failure notification to the ULP if the transportation of + * this User Message fails. + * + * o stream id - to indicate which stream to send the data on. If not + * specified, stream 0 will be used. + * + * o life time - specifies the life time of the user data. The user data + * will not be sent by SCTP after the life time expires. This + * parameter can be used to avoid efforts to transmit stale + * user messages. SCTP notifies the ULP if the data cannot be + * initiated to transport (i.e. sent to the destination via SCTP's + * send primitive) within the life time variable. However, the + * user data will be transmitted if SCTP has attempted to transmit a + * chunk before the life time expired. + * + * o destination transport address - specified as one of the destination + * transport addresses of the peer endpoint to which this packet + * should be sent. Whenever possible, SCTP should use this destination + * transport address for sending the packets, instead of the current + * primary path. 
+ * + * o unorder flag - this flag, if present, indicates that the user + * would like the data delivered in an unordered fashion to the peer + * (i.e., the U flag is set to 1 on all DATA chunks carrying this + * message). + * + * o no-bundle flag - instructs SCTP not to bundle this user data with + * other outbound DATA chunks. SCTP MAY still bundle even when + * this flag is present, when faced with network congestion. + * + * o payload protocol-id - A 32 bit unsigned integer that is to be + * passed to the peer indicating the type of payload protocol data + * being transmitted. This value is passed as opaque data by SCTP. + */ + +DECLARE_PRIMITIVE(SEND); + +/* 10.1 ULP-to-SCTP + * J) Request Heartbeat + * + * Format: REQUESTHEARTBEAT(association id, destination transport address) + * + * -> result + * + * Instructs the local endpoint to perform a HeartBeat on the specified + * destination transport address of the given association. The returned + * result should indicate whether the transmission of the HEARTBEAT + * chunk to the destination address is successful. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o destination transport address - the transport address of the + * association on which a heartbeat should be issued. + */ + +DECLARE_PRIMITIVE(REQUESTHEARTBEAT); + +/* ADDIP +* 3.1.1 Address Configuration Change Chunk (ASCONF) +* +* This chunk is used to communicate to the remote endpoint one of the +* configuration change requests that MUST be acknowledged. The +* information carried in the ASCONF Chunk uses the form of a +* Type-Length-Value (TLV), as described in "3.2.1 Optional/ +* Variable-length Parameter Format" in RFC2960 [5], forall variable +* parameters. +*/ + +DECLARE_PRIMITIVE(ASCONF); diff --git a/net/sctp/proc.c b/net/sctp/proc.c new file mode 100644 index 000000000000..e42fd8c2916b --- /dev/null +++ b/net/sctp/proc.c @@ -0,0 +1,288 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 2003 International Business Machines, Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#include +#include +#include +#include + +static struct snmp_mib sctp_snmp_list[] = { + SNMP_MIB_ITEM("SctpCurrEstab", SCTP_MIB_CURRESTAB), + SNMP_MIB_ITEM("SctpActiveEstabs", SCTP_MIB_ACTIVEESTABS), + SNMP_MIB_ITEM("SctpPassiveEstabs", SCTP_MIB_PASSIVEESTABS), + SNMP_MIB_ITEM("SctpAborteds", SCTP_MIB_ABORTEDS), + SNMP_MIB_ITEM("SctpShutdowns", SCTP_MIB_SHUTDOWNS), + SNMP_MIB_ITEM("SctpOutOfBlues", SCTP_MIB_OUTOFBLUES), + SNMP_MIB_ITEM("SctpChecksumErrors", SCTP_MIB_CHECKSUMERRORS), + SNMP_MIB_ITEM("SctpOutCtrlChunks", SCTP_MIB_OUTCTRLCHUNKS), + SNMP_MIB_ITEM("SctpOutOrderChunks", SCTP_MIB_OUTORDERCHUNKS), + SNMP_MIB_ITEM("SctpOutUnorderChunks", SCTP_MIB_OUTUNORDERCHUNKS), + SNMP_MIB_ITEM("SctpInCtrlChunks", SCTP_MIB_INCTRLCHUNKS), + SNMP_MIB_ITEM("SctpInOrderChunks", SCTP_MIB_INORDERCHUNKS), + SNMP_MIB_ITEM("SctpInUnorderChunks", SCTP_MIB_INUNORDERCHUNKS), + SNMP_MIB_ITEM("SctpFragUsrMsgs", SCTP_MIB_FRAGUSRMSGS), + SNMP_MIB_ITEM("SctpReasmUsrMsgs", SCTP_MIB_REASMUSRMSGS), + SNMP_MIB_ITEM("SctpOutSCTPPacks", SCTP_MIB_OUTSCTPPACKS), + SNMP_MIB_ITEM("SctpInSCTPPacks", SCTP_MIB_INSCTPPACKS), +}; + +/* Return the current value of a particular entry in the mib by adding its + * per cpu counters. + */ +static unsigned long +fold_field(void *mib[], int nr) +{ + unsigned long res = 0; + int i; + + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_possible(i)) + continue; + res += + *((unsigned long *) (((void *) per_cpu_ptr(mib[0], i)) + + sizeof (unsigned long) * nr)); + res += + *((unsigned long *) (((void *) per_cpu_ptr(mib[1], i)) + + sizeof (unsigned long) * nr)); + } + return res; +} + +/* Display sctp snmp mib statistics(/proc/net/sctp/snmp). */ +static int sctp_snmp_seq_show(struct seq_file *seq, void *v) +{ + int i; + + for (i = 0; sctp_snmp_list[i].name != NULL; i++) + seq_printf(seq, "%-32s\t%ld\n", sctp_snmp_list[i].name, + fold_field((void **)sctp_statistics, + sctp_snmp_list[i].entry)); + + return 0; +} + +/* Initialize the seq file operations for 'snmp' object. */ +static int sctp_snmp_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sctp_snmp_seq_show, NULL); +} + +static struct file_operations sctp_snmp_seq_fops = { + .owner = THIS_MODULE, + .open = sctp_snmp_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Set up the proc fs entry for 'snmp' object. */ +int __init sctp_snmp_proc_init(void) +{ + struct proc_dir_entry *p; + + p = create_proc_entry("snmp", S_IRUGO, proc_net_sctp); + if (!p) + return -ENOMEM; + + p->proc_fops = &sctp_snmp_seq_fops; + + return 0; +} + +/* Cleanup the proc fs entry for 'snmp' object. */ +void sctp_snmp_proc_exit(void) +{ + remove_proc_entry("snmp", proc_net_sctp); +} + +/* Dump local addresses of an association/endpoint. */ +static void sctp_seq_dump_local_addrs(struct seq_file *seq, struct sctp_ep_common *epb) +{ + struct list_head *pos; + struct sctp_sockaddr_entry *laddr; + union sctp_addr *addr; + struct sctp_af *af; + + list_for_each(pos, &epb->bind_addr.address_list) { + laddr = list_entry(pos, struct sctp_sockaddr_entry, list); + addr = (union sctp_addr *)&laddr->a; + af = sctp_get_af_specific(addr->sa.sa_family); + af->seq_dump_addr(seq, addr); + } +} + +/* Dump remote addresses of an association. 
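For readers unfamiliar with the per-CPU MIB layout, a rough userspace sketch of what fold_field() does follows: sum one counter slot across every CPU and across the two per-CPU copies (one updated in BH context, one in process context, assuming the usual paired SNMP per-CPU allocation). The arrays and sizes below are made up.

/* Userspace stand-in for the per-CPU SNMP counters: plain 2-D arrays
 * replace the per_cpu_ptr() lookups, and folding is just a double sum.
 */
#include <stdio.h>

#define NR_CPUS_SIM   4
#define NR_COUNTERS   3

static unsigned long fold_field_sim(unsigned long mib[2][NR_CPUS_SIM][NR_COUNTERS],
                                    int nr)
{
    unsigned long res = 0;
    int cpu;

    for (cpu = 0; cpu < NR_CPUS_SIM; cpu++) {
        res += mib[0][cpu][nr];
        res += mib[1][cpu][nr];
    }
    return res;
}

int main(void)
{
    unsigned long mib[2][NR_CPUS_SIM][NR_COUNTERS] = { 0 };

    mib[0][0][1] = 5;   /* e.g. a counter bumped on CPU 0, first copy  */
    mib[1][2][1] = 3;   /* and on CPU 2, second copy                   */
    printf("counter 1 = %lu\n", fold_field_sim(mib, 1));   /* prints 8 */
    return 0;
}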
*/ +static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_association *assoc) +{ + struct list_head *pos; + struct sctp_transport *transport; + union sctp_addr *addr; + struct sctp_af *af; + + list_for_each(pos, &assoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + addr = (union sctp_addr *)&transport->ipaddr; + af = sctp_get_af_specific(addr->sa.sa_family); + af->seq_dump_addr(seq, addr); + } +} + +/* Display sctp endpoints (/proc/net/sctp/eps). */ +static int sctp_eps_seq_show(struct seq_file *seq, void *v) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_endpoint *ep; + struct sock *sk; + int hash; + + seq_printf(seq, " ENDPT SOCK STY SST HBKT LPORT LADDRS\n"); + for (hash = 0; hash < sctp_ep_hashsize; hash++) { + head = &sctp_ep_hashtable[hash]; + read_lock(&head->lock); + for (epb = head->chain; epb; epb = epb->next) { + ep = sctp_ep(epb); + sk = epb->sk; + seq_printf(seq, "%8p %8p %-3d %-3d %-4d %-5d ", ep, sk, + sctp_sk(sk)->type, sk->sk_state, hash, + epb->bind_addr.port); + sctp_seq_dump_local_addrs(seq, epb); + seq_printf(seq, "\n"); + } + read_unlock(&head->lock); + } + + return 0; +} + +/* Initialize the seq file operations for 'eps' object. */ +static int sctp_eps_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sctp_eps_seq_show, NULL); +} + +static struct file_operations sctp_eps_seq_fops = { + .open = sctp_eps_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Set up the proc fs entry for 'eps' object. */ +int __init sctp_eps_proc_init(void) +{ + struct proc_dir_entry *p; + + p = create_proc_entry("eps", S_IRUGO, proc_net_sctp); + if (!p) + return -ENOMEM; + + p->proc_fops = &sctp_eps_seq_fops; + + return 0; +} + +/* Cleanup the proc fs entry for 'eps' object. */ +void sctp_eps_proc_exit(void) +{ + remove_proc_entry("eps", proc_net_sctp); +} + +/* Display sctp associations (/proc/net/sctp/assocs). */ +static int sctp_assocs_seq_show(struct seq_file *seq, void *v) +{ + struct sctp_hashbucket *head; + struct sctp_ep_common *epb; + struct sctp_association *assoc; + struct sock *sk; + int hash; + + seq_printf(seq, " ASSOC SOCK STY SST ST HBKT LPORT RPORT " + "LADDRS <-> RADDRS\n"); + for (hash = 0; hash < sctp_assoc_hashsize; hash++) { + head = &sctp_assoc_hashtable[hash]; + read_lock(&head->lock); + for (epb = head->chain; epb; epb = epb->next) { + assoc = sctp_assoc(epb); + sk = epb->sk; + seq_printf(seq, + "%8p %8p %-3d %-3d %-2d %-4d %-5d %-5d ", + assoc, sk, sctp_sk(sk)->type, sk->sk_state, + assoc->state, hash, epb->bind_addr.port, + assoc->peer.port); + sctp_seq_dump_local_addrs(seq, epb); + seq_printf(seq, "<-> "); + sctp_seq_dump_remote_addrs(seq, assoc); + seq_printf(seq, "\n"); + } + read_unlock(&head->lock); + } + + return 0; +} + +/* Initialize the seq file operations for 'assocs' object. */ +static int sctp_assocs_seq_open(struct inode *inode, struct file *file) +{ + return single_open(file, sctp_assocs_seq_show, NULL); +} + +static struct file_operations sctp_assocs_seq_fops = { + .open = sctp_assocs_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* Set up the proc fs entry for 'assocs' object. 
*/ +int __init sctp_assocs_proc_init(void) +{ + struct proc_dir_entry *p; + + p = create_proc_entry("assocs", S_IRUGO, proc_net_sctp); + if (!p) + return -ENOMEM; + + p->proc_fops = &sctp_assocs_seq_fops; + + return 0; +} + +/* Cleanup the proc fs entry for 'assocs' object. */ +void sctp_assocs_proc_exit(void) +{ + remove_proc_entry("assocs", proc_net_sctp); +} diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c new file mode 100644 index 000000000000..b9813cf3d91c --- /dev/null +++ b/net/sctp/protocol.c @@ -0,0 +1,1240 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * Initialization/cleanup for SCTP protocol support. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Sridhar Samudrala + * Daisy Chang + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Global data structures. */ +struct sctp_globals sctp_globals; +struct proc_dir_entry *proc_net_sctp; +DEFINE_SNMP_STAT(struct sctp_mib, sctp_statistics); + +struct idr sctp_assocs_id; +DEFINE_SPINLOCK(sctp_assocs_id_lock); + +/* This is the global socket data structure used for responding to + * the Out-of-the-blue (OOTB) packets. A control sock will be created + * for this socket at the initialization time. + */ +static struct socket *sctp_ctl_socket; + +static struct sctp_pf *sctp_pf_inet6_specific; +static struct sctp_pf *sctp_pf_inet_specific; +static struct sctp_af *sctp_af_v4_specific; +static struct sctp_af *sctp_af_v6_specific; + +kmem_cache_t *sctp_chunk_cachep; +kmem_cache_t *sctp_bucket_cachep; + +extern int sctp_snmp_proc_init(void); +extern int sctp_snmp_proc_exit(void); +extern int sctp_eps_proc_init(void); +extern int sctp_eps_proc_exit(void); +extern int sctp_assocs_proc_init(void); +extern int sctp_assocs_proc_exit(void); + +/* Return the address of the control sock. */ +struct sock *sctp_get_ctl_sock(void) +{ + return sctp_ctl_socket->sk; +} + +/* Set up the proc fs entry for the SCTP protocol. 
*/ +static __init int sctp_proc_init(void) +{ + if (!proc_net_sctp) { + struct proc_dir_entry *ent; + ent = proc_mkdir("net/sctp", NULL); + if (ent) { + ent->owner = THIS_MODULE; + proc_net_sctp = ent; + } else + goto out_nomem; + } + + if (sctp_snmp_proc_init()) + goto out_nomem; + if (sctp_eps_proc_init()) + goto out_nomem; + if (sctp_assocs_proc_init()) + goto out_nomem; + + return 0; + +out_nomem: + return -ENOMEM; +} + +/* Clean up the proc fs entry for the SCTP protocol. + * Note: Do not make this __exit as it is used in the init error + * path. + */ +static void sctp_proc_exit(void) +{ + sctp_snmp_proc_exit(); + sctp_eps_proc_exit(); + sctp_assocs_proc_exit(); + + if (proc_net_sctp) { + proc_net_sctp = NULL; + remove_proc_entry("net/sctp", NULL); + } +} + +/* Private helper to extract ipv4 address and stash them in + * the protocol structure. + */ +static void sctp_v4_copy_addrlist(struct list_head *addrlist, + struct net_device *dev) +{ + struct in_device *in_dev; + struct in_ifaddr *ifa; + struct sctp_sockaddr_entry *addr; + + rcu_read_lock(); + if ((in_dev = __in_dev_get(dev)) == NULL) { + rcu_read_unlock(); + return; + } + + for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) { + /* Add the address to the local list. */ + addr = t_new(struct sctp_sockaddr_entry, GFP_ATOMIC); + if (addr) { + addr->a.v4.sin_family = AF_INET; + addr->a.v4.sin_port = 0; + addr->a.v4.sin_addr.s_addr = ifa->ifa_local; + list_add_tail(&addr->list, addrlist); + } + } + + rcu_read_unlock(); +} + +/* Extract our IP addresses from the system and stash them in the + * protocol structure. + */ +static void __sctp_get_local_addr_list(void) +{ + struct net_device *dev; + struct list_head *pos; + struct sctp_af *af; + + read_lock(&dev_base_lock); + for (dev = dev_base; dev; dev = dev->next) { + __list_for_each(pos, &sctp_address_families) { + af = list_entry(pos, struct sctp_af, list); + af->copy_addrlist(&sctp_local_addr_list, dev); + } + } + read_unlock(&dev_base_lock); +} + +static void sctp_get_local_addr_list(void) +{ + unsigned long flags; + + sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags); + __sctp_get_local_addr_list(); + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, flags); +} + +/* Free the existing local addresses. */ +static void __sctp_free_local_addr_list(void) +{ + struct sctp_sockaddr_entry *addr; + struct list_head *pos, *temp; + + list_for_each_safe(pos, temp, &sctp_local_addr_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + list_del(pos); + kfree(addr); + } +} + +/* Free the existing local addresses. */ +static void sctp_free_local_addr_list(void) +{ + unsigned long flags; + + sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags); + __sctp_free_local_addr_list(); + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, flags); +} + +/* Copy the local addresses which are valid for 'scope' into 'bp'. */ +int sctp_copy_local_addr_list(struct sctp_bind_addr *bp, sctp_scope_t scope, + int gfp, int copy_flags) +{ + struct sctp_sockaddr_entry *addr; + int error = 0; + struct list_head *pos; + unsigned long flags; + + sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags); + list_for_each(pos, &sctp_local_addr_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + if (sctp_in_scope(&addr->a, scope)) { + /* Now that the address is in scope, check to see if + * the address type is really supported by the local + * sock as well as the remote peer. 
+ */ + if ((((AF_INET == addr->a.sa.sa_family) && + (copy_flags & SCTP_ADDR4_PEERSUPP))) || + (((AF_INET6 == addr->a.sa.sa_family) && + (copy_flags & SCTP_ADDR6_ALLOWED) && + (copy_flags & SCTP_ADDR6_PEERSUPP)))) { + error = sctp_add_bind_addr(bp, &addr->a, + GFP_ATOMIC); + if (error) + goto end_copy; + } + } + } + +end_copy: + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, flags); + return error; +} + +/* Initialize a sctp_addr from in incoming skb. */ +static void sctp_v4_from_skb(union sctp_addr *addr, struct sk_buff *skb, + int is_saddr) +{ + void *from; + __u16 *port; + struct sctphdr *sh; + + port = &addr->v4.sin_port; + addr->v4.sin_family = AF_INET; + + sh = (struct sctphdr *) skb->h.raw; + if (is_saddr) { + *port = ntohs(sh->source); + from = &skb->nh.iph->saddr; + } else { + *port = ntohs(sh->dest); + from = &skb->nh.iph->daddr; + } + memcpy(&addr->v4.sin_addr.s_addr, from, sizeof(struct in_addr)); +} + +/* Initialize an sctp_addr from a socket. */ +static void sctp_v4_from_sk(union sctp_addr *addr, struct sock *sk) +{ + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = inet_sk(sk)->num; + addr->v4.sin_addr.s_addr = inet_sk(sk)->rcv_saddr; +} + +/* Initialize sk->sk_rcv_saddr from sctp_addr. */ +static void sctp_v4_to_sk_saddr(union sctp_addr *addr, struct sock *sk) +{ + inet_sk(sk)->rcv_saddr = addr->v4.sin_addr.s_addr; +} + +/* Initialize sk->sk_daddr from sctp_addr. */ +static void sctp_v4_to_sk_daddr(union sctp_addr *addr, struct sock *sk) +{ + inet_sk(sk)->daddr = addr->v4.sin_addr.s_addr; +} + +/* Initialize a sctp_addr from an address parameter. */ +static void sctp_v4_from_addr_param(union sctp_addr *addr, + union sctp_addr_param *param, + __u16 port, int iif) +{ + addr->v4.sin_family = AF_INET; + addr->v4.sin_port = port; + addr->v4.sin_addr.s_addr = param->v4.addr.s_addr; +} + +/* Initialize an address parameter from a sctp_addr and return the length + * of the address parameter. + */ +static int sctp_v4_to_addr_param(const union sctp_addr *addr, + union sctp_addr_param *param) +{ + int length = sizeof(sctp_ipv4addr_param_t); + + param->v4.param_hdr.type = SCTP_PARAM_IPV4_ADDRESS; + param->v4.param_hdr.length = ntohs(length); + param->v4.addr.s_addr = addr->v4.sin_addr.s_addr; + + return length; +} + +/* Initialize a sctp_addr from a dst_entry. */ +static void sctp_v4_dst_saddr(union sctp_addr *saddr, struct dst_entry *dst, + unsigned short port) +{ + struct rtable *rt = (struct rtable *)dst; + saddr->v4.sin_family = AF_INET; + saddr->v4.sin_port = port; + saddr->v4.sin_addr.s_addr = rt->rt_src; +} + +/* Compare two addresses exactly. */ +static int sctp_v4_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2) +{ + if (addr1->sa.sa_family != addr2->sa.sa_family) + return 0; + if (addr1->v4.sin_port != addr2->v4.sin_port) + return 0; + if (addr1->v4.sin_addr.s_addr != addr2->v4.sin_addr.s_addr) + return 0; + + return 1; +} + +/* Initialize addr struct to INADDR_ANY. */ +static void sctp_v4_inaddr_any(union sctp_addr *addr, unsigned short port) +{ + addr->v4.sin_family = AF_INET; + addr->v4.sin_addr.s_addr = INADDR_ANY; + addr->v4.sin_port = port; +} + +/* Is this a wildcard address? */ +static int sctp_v4_is_any(const union sctp_addr *addr) +{ + return INADDR_ANY == addr->v4.sin_addr.s_addr; +} + +/* This function checks if the address is a valid address to be used for + * SCTP binding. + * + * Output: + * Return 0 - If the address is a non-unicast or an illegal address. + * Return 1 - If the address is a unicast. 
+ */ +static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) +{ + /* Is this a non-unicast address or a unusable SCTP address? */ + if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) + return 0; + + return 1; +} + +/* Should this be available for binding? */ +static int sctp_v4_available(union sctp_addr *addr, struct sctp_sock *sp) +{ + int ret = inet_addr_type(addr->v4.sin_addr.s_addr); + + /* FIXME: ip_nonlocal_bind sysctl support. */ + + if (addr->v4.sin_addr.s_addr != INADDR_ANY && ret != RTN_LOCAL) + return 0; + return 1; +} + +/* Checking the loopback, private and other address scopes as defined in + * RFC 1918. The IPv4 scoping is based on the draft for SCTP IPv4 + * scoping . + * + * Level 0 - unusable SCTP addresses + * Level 1 - loopback address + * Level 2 - link-local addresses + * Level 3 - private addresses. + * Level 4 - global addresses + * For INIT and INIT-ACK address list, let L be the level of + * of requested destination address, sender and receiver + * SHOULD include all of its addresses with level greater + * than or equal to L. + */ +static sctp_scope_t sctp_v4_scope(union sctp_addr *addr) +{ + sctp_scope_t retval; + + /* Should IPv4 scoping be a sysctl configurable option + * so users can turn it off (default on) for certain + * unconventional networking environments? + */ + + /* Check for unusable SCTP addresses. */ + if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_UNUSABLE; + } else if (LOOPBACK(addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_LOOPBACK; + } else if (IS_IPV4_LINK_ADDRESS(&addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_LINK; + } else if (IS_IPV4_PRIVATE_ADDRESS(&addr->v4.sin_addr.s_addr)) { + retval = SCTP_SCOPE_PRIVATE; + } else { + retval = SCTP_SCOPE_GLOBAL; + } + + return retval; +} + +/* Returns a valid dst cache entry for the given source and destination ip + * addresses. If an association is passed, trys to get a dst entry with a + * source address that matches an address in the bind address list. + */ +static struct dst_entry *sctp_v4_get_dst(struct sctp_association *asoc, + union sctp_addr *daddr, + union sctp_addr *saddr) +{ + struct rtable *rt; + struct flowi fl; + struct sctp_bind_addr *bp; + rwlock_t *addr_lock; + struct sctp_sockaddr_entry *laddr; + struct list_head *pos; + struct dst_entry *dst = NULL; + union sctp_addr dst_saddr; + + memset(&fl, 0x0, sizeof(struct flowi)); + fl.fl4_dst = daddr->v4.sin_addr.s_addr; + fl.proto = IPPROTO_SCTP; + if (asoc) { + fl.fl4_tos = RT_CONN_FLAGS(asoc->base.sk); + fl.oif = asoc->base.sk->sk_bound_dev_if; + } + if (saddr) + fl.fl4_src = saddr->v4.sin_addr.s_addr; + + SCTP_DEBUG_PRINTK("%s: DST:%u.%u.%u.%u, SRC:%u.%u.%u.%u - ", + __FUNCTION__, NIPQUAD(fl.fl4_dst), + NIPQUAD(fl.fl4_src)); + + if (!ip_route_output_key(&rt, &fl)) { + dst = &rt->u.dst; + } + + /* If there is no association or if a source address is passed, no + * more validation is required. + */ + if (!asoc || saddr) + goto out; + + bp = &asoc->base.bind_addr; + addr_lock = &asoc->base.addr_lock; + + if (dst) { + /* Walk through the bind address list and look for a bind + * address that matches the source address of the returned dst. 
+ */ + sctp_read_lock(addr_lock); + list_for_each(pos, &bp->address_list) { + laddr = list_entry(pos, struct sctp_sockaddr_entry, + list); + sctp_v4_dst_saddr(&dst_saddr, dst, bp->port); + if (sctp_v4_cmp_addr(&dst_saddr, &laddr->a)) + goto out_unlock; + } + sctp_read_unlock(addr_lock); + + /* None of the bound addresses match the source address of the + * dst. So release it. + */ + dst_release(dst); + dst = NULL; + } + + /* Walk through the bind address list and try to get a dst that + * matches a bind address as the source address. + */ + sctp_read_lock(addr_lock); + list_for_each(pos, &bp->address_list) { + laddr = list_entry(pos, struct sctp_sockaddr_entry, list); + + if (AF_INET == laddr->a.sa.sa_family) { + fl.fl4_src = laddr->a.v4.sin_addr.s_addr; + if (!ip_route_output_key(&rt, &fl)) { + dst = &rt->u.dst; + goto out_unlock; + } + } + } + +out_unlock: + sctp_read_unlock(addr_lock); +out: + if (dst) + SCTP_DEBUG_PRINTK("rt_dst:%u.%u.%u.%u, rt_src:%u.%u.%u.%u\n", + NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_src)); + else + SCTP_DEBUG_PRINTK("NO ROUTE\n"); + + return dst; +} + +/* For v4, the source address is cached in the route entry(dst). So no need + * to cache it separately and hence this is an empty routine. + */ +static void sctp_v4_get_saddr(struct sctp_association *asoc, + struct dst_entry *dst, + union sctp_addr *daddr, + union sctp_addr *saddr) +{ + struct rtable *rt = (struct rtable *)dst; + + if (rt) { + saddr->v4.sin_family = AF_INET; + saddr->v4.sin_port = asoc->base.bind_addr.port; + saddr->v4.sin_addr.s_addr = rt->rt_src; + } +} + +/* What interface did this skb arrive on? */ +static int sctp_v4_skb_iif(const struct sk_buff *skb) +{ + return ((struct rtable *)skb->dst)->rt_iif; +} + +/* Was this packet marked by Explicit Congestion Notification? */ +static int sctp_v4_is_ce(const struct sk_buff *skb) +{ + return INET_ECN_is_ce(skb->nh.iph->tos); +} + +/* Create and initialize a new sk for the socket returned by accept(). */ +static struct sock *sctp_v4_create_accept_sk(struct sock *sk, + struct sctp_association *asoc) +{ + struct inet_sock *inet = inet_sk(sk); + struct inet_sock *newinet; + struct sock *newsk = sk_alloc(PF_INET, GFP_KERNEL, sk->sk_prot, 1); + + if (!newsk) + goto out; + + sock_init_data(NULL, newsk); + + newsk->sk_type = SOCK_STREAM; + + newsk->sk_no_check = sk->sk_no_check; + newsk->sk_reuse = sk->sk_reuse; + newsk->sk_shutdown = sk->sk_shutdown; + + newsk->sk_destruct = inet_sock_destruct; + newsk->sk_family = PF_INET; + newsk->sk_protocol = IPPROTO_SCTP; + newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; + sock_reset_flag(newsk, SOCK_ZAPPED); + + newinet = inet_sk(newsk); + + /* Initialize sk's sport, dport, rcv_saddr and daddr for + * getsockname() and getpeername() + */ + newinet->sport = inet->sport; + newinet->saddr = inet->saddr; + newinet->rcv_saddr = inet->rcv_saddr; + newinet->dport = htons(asoc->peer.port); + newinet->daddr = asoc->peer.primary_addr.v4.sin_addr.s_addr; + newinet->pmtudisc = inet->pmtudisc; + newinet->id = 0; + + newinet->uc_ttl = -1; + newinet->mc_loop = 1; + newinet->mc_ttl = 1; + newinet->mc_index = 0; + newinet->mc_list = NULL; + +#ifdef INET_REFCNT_DEBUG + atomic_inc(&inet_sock_nr); +#endif + + if (newsk->sk_prot->init(newsk)) { + sk_common_release(newsk); + newsk = NULL; + } + +out: + return newsk; +} + +/* Map address, empty for v4 family */ +static void sctp_v4_addr_v4map(struct sctp_sock *sp, union sctp_addr *addr) +{ + /* Empty */ +} + +/* Dump the v4 addr to the seq file. 
*/ +static void sctp_v4_seq_dump_addr(struct seq_file *seq, union sctp_addr *addr) +{ + seq_printf(seq, "%d.%d.%d.%d ", NIPQUAD(addr->v4.sin_addr)); +} + +/* Event handler for inet address addition/deletion events. + * Basically, whenever there is an event, we re-build our local address list. + */ +int sctp_inetaddr_event(struct notifier_block *this, unsigned long ev, + void *ptr) +{ + unsigned long flags; + + sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags); + __sctp_free_local_addr_list(); + __sctp_get_local_addr_list(); + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, flags); + + return NOTIFY_DONE; +} + +/* + * Initialize the control inode/socket with a control endpoint data + * structure. This endpoint is reserved exclusively for the OOTB processing. + */ +static int sctp_ctl_sock_init(void) +{ + int err; + sa_family_t family; + + if (sctp_get_pf_specific(PF_INET6)) + family = PF_INET6; + else + family = PF_INET; + + err = sock_create_kern(family, SOCK_SEQPACKET, IPPROTO_SCTP, + &sctp_ctl_socket); + if (err < 0) { + printk(KERN_ERR + "SCTP: Failed to create the SCTP control socket.\n"); + return err; + } + sctp_ctl_socket->sk->sk_allocation = GFP_ATOMIC; + inet_sk(sctp_ctl_socket->sk)->uc_ttl = -1; + + return 0; +} + +/* Register address family specific functions. */ +int sctp_register_af(struct sctp_af *af) +{ + switch (af->sa_family) { + case AF_INET: + if (sctp_af_v4_specific) + return 0; + sctp_af_v4_specific = af; + break; + case AF_INET6: + if (sctp_af_v6_specific) + return 0; + sctp_af_v6_specific = af; + break; + default: + return 0; + } + + INIT_LIST_HEAD(&af->list); + list_add_tail(&af->list, &sctp_address_families); + return 1; +} + +/* Get the table of functions for manipulating a particular address + * family. + */ +struct sctp_af *sctp_get_af_specific(sa_family_t family) +{ + switch (family) { + case AF_INET: + return sctp_af_v4_specific; + case AF_INET6: + return sctp_af_v6_specific; + default: + return NULL; + } +} + +/* Common code to initialize a AF_INET msg_name. */ +static void sctp_inet_msgname(char *msgname, int *addr_len) +{ + struct sockaddr_in *sin; + + sin = (struct sockaddr_in *)msgname; + *addr_len = sizeof(struct sockaddr_in); + sin->sin_family = AF_INET; + memset(sin->sin_zero, 0, sizeof(sin->sin_zero)); +} + +/* Copy the primary address of the peer primary address as the msg_name. */ +static void sctp_inet_event_msgname(struct sctp_ulpevent *event, char *msgname, + int *addr_len) +{ + struct sockaddr_in *sin, *sinfrom; + + if (msgname) { + struct sctp_association *asoc; + + asoc = event->asoc; + sctp_inet_msgname(msgname, addr_len); + sin = (struct sockaddr_in *)msgname; + sinfrom = &asoc->peer.primary_addr.v4; + sin->sin_port = htons(asoc->peer.port); + sin->sin_addr.s_addr = sinfrom->sin_addr.s_addr; + } +} + +/* Initialize and copy out a msgname from an inbound skb. */ +static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) +{ + struct sctphdr *sh; + struct sockaddr_in *sin; + + if (msgname) { + sctp_inet_msgname(msgname, len); + sin = (struct sockaddr_in *)msgname; + sh = (struct sctphdr *)skb->h.raw; + sin->sin_port = sh->source; + sin->sin_addr.s_addr = skb->nh.iph->saddr; + } +} + +/* Do we support this AF? */ +static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) +{ + /* PF_INET only supports AF_INET addresses. */ + return (AF_INET == family); +} + +/* Address matching with wildcards allowed. 
*/ +static int sctp_inet_cmp_addr(const union sctp_addr *addr1, + const union sctp_addr *addr2, + struct sctp_sock *opt) +{ + /* PF_INET only supports AF_INET addresses. */ + if (addr1->sa.sa_family != addr2->sa.sa_family) + return 0; + if (INADDR_ANY == addr1->v4.sin_addr.s_addr || + INADDR_ANY == addr2->v4.sin_addr.s_addr) + return 1; + if (addr1->v4.sin_addr.s_addr == addr2->v4.sin_addr.s_addr) + return 1; + + return 0; +} + +/* Verify that provided sockaddr looks bindable. Common verification has + * already been taken care of. + */ +static int sctp_inet_bind_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + return sctp_v4_available(addr, opt); +} + +/* Verify that sockaddr looks sendable. Common verification has already + * been taken care of. + */ +static int sctp_inet_send_verify(struct sctp_sock *opt, union sctp_addr *addr) +{ + return 1; +} + +/* Fill in Supported Address Type information for INIT and INIT-ACK + * chunks. Returns number of addresses supported. + */ +static int sctp_inet_supported_addrs(const struct sctp_sock *opt, + __u16 *types) +{ + types[0] = SCTP_PARAM_IPV4_ADDRESS; + return 1; +} + +/* Wrapper routine that calls the ip transmit routine. */ +static inline int sctp_v4_xmit(struct sk_buff *skb, + struct sctp_transport *transport, int ipfragok) +{ + SCTP_DEBUG_PRINTK("%s: skb:%p, len:%d, " + "src:%u.%u.%u.%u, dst:%u.%u.%u.%u\n", + __FUNCTION__, skb, skb->len, + NIPQUAD(((struct rtable *)skb->dst)->rt_src), + NIPQUAD(((struct rtable *)skb->dst)->rt_dst)); + + SCTP_INC_STATS(SCTP_MIB_OUTSCTPPACKS); + return ip_queue_xmit(skb, ipfragok); +} + +static struct sctp_af sctp_ipv4_specific; + +static struct sctp_pf sctp_pf_inet = { + .event_msgname = sctp_inet_event_msgname, + .skb_msgname = sctp_inet_skb_msgname, + .af_supported = sctp_inet_af_supported, + .cmp_addr = sctp_inet_cmp_addr, + .bind_verify = sctp_inet_bind_verify, + .send_verify = sctp_inet_send_verify, + .supported_addrs = sctp_inet_supported_addrs, + .create_accept_sk = sctp_v4_create_accept_sk, + .addr_v4map = sctp_v4_addr_v4map, + .af = &sctp_ipv4_specific, +}; + +/* Notifier for inetaddr addition/deletion events. */ +static struct notifier_block sctp_inetaddr_notifier = { + .notifier_call = sctp_inetaddr_event, +}; + +/* Socket operations. */ +static struct proto_ops inet_seqpacket_ops = { + .family = PF_INET, + .owner = THIS_MODULE, + .release = inet_release, /* Needs to be wrapped... */ + .bind = inet_bind, + .connect = inet_dgram_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet_getname, /* Semantics are different. */ + .poll = sctp_poll, + .ioctl = inet_ioctl, + .listen = sctp_inet_listen, + .shutdown = inet_shutdown, /* Looks harmless. */ + .setsockopt = sock_common_setsockopt, /* IP_SOL IP_OPTION is a problem. */ + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +/* Registration with AF_INET family. */ +static struct inet_protosw sctp_seqpacket_protosw = { + .type = SOCK_SEQPACKET, + .protocol = IPPROTO_SCTP, + .prot = &sctp_prot, + .ops = &inet_seqpacket_ops, + .capability = -1, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG +}; +static struct inet_protosw sctp_stream_protosw = { + .type = SOCK_STREAM, + .protocol = IPPROTO_SCTP, + .prot = &sctp_prot, + .ops = &inet_seqpacket_ops, + .capability = -1, + .no_check = 0, + .flags = SCTP_PROTOSW_FLAG +}; + +/* Register with IP layer. 
*/ +static struct net_protocol sctp_protocol = { + .handler = sctp_rcv, + .err_handler = sctp_v4_err, + .no_policy = 1, +}; + +/* IPv4 address related functions. */ +static struct sctp_af sctp_ipv4_specific = { + .sctp_xmit = sctp_v4_xmit, + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .get_dst = sctp_v4_get_dst, + .get_saddr = sctp_v4_get_saddr, + .copy_addrlist = sctp_v4_copy_addrlist, + .from_skb = sctp_v4_from_skb, + .from_sk = sctp_v4_from_sk, + .to_sk_saddr = sctp_v4_to_sk_saddr, + .to_sk_daddr = sctp_v4_to_sk_daddr, + .from_addr_param= sctp_v4_from_addr_param, + .to_addr_param = sctp_v4_to_addr_param, + .dst_saddr = sctp_v4_dst_saddr, + .cmp_addr = sctp_v4_cmp_addr, + .addr_valid = sctp_v4_addr_valid, + .inaddr_any = sctp_v4_inaddr_any, + .is_any = sctp_v4_is_any, + .available = sctp_v4_available, + .scope = sctp_v4_scope, + .skb_iif = sctp_v4_skb_iif, + .is_ce = sctp_v4_is_ce, + .seq_dump_addr = sctp_v4_seq_dump_addr, + .net_header_len = sizeof(struct iphdr), + .sockaddr_len = sizeof(struct sockaddr_in), + .sa_family = AF_INET, +}; + +struct sctp_pf *sctp_get_pf_specific(sa_family_t family) { + + switch (family) { + case PF_INET: + return sctp_pf_inet_specific; + case PF_INET6: + return sctp_pf_inet6_specific; + default: + return NULL; + } +} + +/* Register the PF specific function table. */ +int sctp_register_pf(struct sctp_pf *pf, sa_family_t family) +{ + switch (family) { + case PF_INET: + if (sctp_pf_inet_specific) + return 0; + sctp_pf_inet_specific = pf; + break; + case PF_INET6: + if (sctp_pf_inet6_specific) + return 0; + sctp_pf_inet6_specific = pf; + break; + default: + return 0; + } + return 1; +} + +static int __init init_sctp_mibs(void) +{ + sctp_statistics[0] = alloc_percpu(struct sctp_mib); + if (!sctp_statistics[0]) + return -ENOMEM; + sctp_statistics[1] = alloc_percpu(struct sctp_mib); + if (!sctp_statistics[1]) { + free_percpu(sctp_statistics[0]); + return -ENOMEM; + } + return 0; + +} + +static void cleanup_sctp_mibs(void) +{ + free_percpu(sctp_statistics[0]); + free_percpu(sctp_statistics[1]); +} + +/* Initialize the universe into something sensible. */ +SCTP_STATIC __init int sctp_init(void) +{ + int i; + int status = -EINVAL; + unsigned long goal; + int order; + + /* SCTP_DEBUG sanity check. */ + if (!sctp_sanity_check()) + goto out; + + status = proto_register(&sctp_prot, 1); + if (status) + goto out; + + /* Add SCTP to inet_protos hash table. */ + status = -EAGAIN; + if (inet_add_protocol(&sctp_protocol, IPPROTO_SCTP) < 0) + goto err_add_protocol; + + /* Add SCTP(TCP and UDP style) to inetsw linked list. */ + inet_register_protosw(&sctp_seqpacket_protosw); + inet_register_protosw(&sctp_stream_protosw); + + /* Allocate a cache pools. */ + status = -ENOBUFS; + sctp_bucket_cachep = kmem_cache_create("sctp_bind_bucket", + sizeof(struct sctp_bind_bucket), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + + if (!sctp_bucket_cachep) + goto err_bucket_cachep; + + sctp_chunk_cachep = kmem_cache_create("sctp_chunk", + sizeof(struct sctp_chunk), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!sctp_chunk_cachep) + goto err_chunk_cachep; + + /* Allocate and initialise sctp mibs. */ + status = init_sctp_mibs(); + if (status) + goto err_init_mibs; + + /* Initialize proc fs directory. */ + status = sctp_proc_init(); + if (status) + goto err_init_proc; + + /* Initialize object count debugging. */ + sctp_dbg_objcnt_init(); + + /* Initialize the SCTP specific PF functions. */ + sctp_register_pf(&sctp_pf_inet, PF_INET); + /* + * 14. 
Suggested SCTP Protocol Parameter Values + */ + /* The following protocol parameters are RECOMMENDED: */ + /* RTO.Initial - 3 seconds */ + sctp_rto_initial = SCTP_RTO_INITIAL; + /* RTO.Min - 1 second */ + sctp_rto_min = SCTP_RTO_MIN; + /* RTO.Max - 60 seconds */ + sctp_rto_max = SCTP_RTO_MAX; + /* RTO.Alpha - 1/8 */ + sctp_rto_alpha = SCTP_RTO_ALPHA; + /* RTO.Beta - 1/4 */ + sctp_rto_beta = SCTP_RTO_BETA; + + /* Valid.Cookie.Life - 60 seconds */ + sctp_valid_cookie_life = 60 * HZ; + + /* Whether Cookie Preservative is enabled(1) or not(0) */ + sctp_cookie_preserve_enable = 1; + + /* Max.Burst - 4 */ + sctp_max_burst = SCTP_MAX_BURST; + + /* Association.Max.Retrans - 10 attempts + * Path.Max.Retrans - 5 attempts (per destination address) + * Max.Init.Retransmits - 8 attempts + */ + sctp_max_retrans_association = 10; + sctp_max_retrans_path = 5; + sctp_max_retrans_init = 8; + + /* HB.interval - 30 seconds */ + sctp_hb_interval = 30 * HZ; + + /* Implementation specific variables. */ + + /* Initialize default stream count setup information. */ + sctp_max_instreams = SCTP_DEFAULT_INSTREAMS; + sctp_max_outstreams = SCTP_DEFAULT_OUTSTREAMS; + + /* Initialize handle used for association ids. */ + idr_init(&sctp_assocs_id); + + /* Size and allocate the association hash table. + * The methodology is similar to that of the tcp hash tables. + */ + if (num_physpages >= (128 * 1024)) + goal = num_physpages >> (22 - PAGE_SHIFT); + else + goal = num_physpages >> (24 - PAGE_SHIFT); + + for (order = 0; (1UL << order) < goal; order++) + ; + + do { + sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_hashbucket); + if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0) + continue; + sctp_assoc_hashtable = (struct sctp_hashbucket *) + __get_free_pages(GFP_ATOMIC, order); + } while (!sctp_assoc_hashtable && --order > 0); + if (!sctp_assoc_hashtable) { + printk(KERN_ERR "SCTP: Failed association hash alloc.\n"); + status = -ENOMEM; + goto err_ahash_alloc; + } + for (i = 0; i < sctp_assoc_hashsize; i++) { + rwlock_init(&sctp_assoc_hashtable[i].lock); + sctp_assoc_hashtable[i].chain = NULL; + } + + /* Allocate and initialize the endpoint hash table. */ + sctp_ep_hashsize = 64; + sctp_ep_hashtable = (struct sctp_hashbucket *) + kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); + if (!sctp_ep_hashtable) { + printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n"); + status = -ENOMEM; + goto err_ehash_alloc; + } + for (i = 0; i < sctp_ep_hashsize; i++) { + rwlock_init(&sctp_ep_hashtable[i].lock); + sctp_ep_hashtable[i].chain = NULL; + } + + /* Allocate and initialize the SCTP port hash table. */ + do { + sctp_port_hashsize = (1UL << order) * PAGE_SIZE / + sizeof(struct sctp_bind_hashbucket); + if ((sctp_port_hashsize > (64 * 1024)) && order > 0) + continue; + sctp_port_hashtable = (struct sctp_bind_hashbucket *) + __get_free_pages(GFP_ATOMIC, order); + } while (!sctp_port_hashtable && --order > 0); + if (!sctp_port_hashtable) { + printk(KERN_ERR "SCTP: Failed bind hash alloc."); + status = -ENOMEM; + goto err_bhash_alloc; + } + for (i = 0; i < sctp_port_hashsize; i++) { + spin_lock_init(&sctp_port_hashtable[i].lock); + sctp_port_hashtable[i].chain = NULL; + } + + spin_lock_init(&sctp_port_alloc_lock); + sctp_port_rover = sysctl_local_port_range[0] - 1; + + printk(KERN_INFO "SCTP: Hash tables configured " + "(established %d bind %d)\n", + sctp_assoc_hashsize, sctp_port_hashsize); + + /* Disable ADDIP by default. */ + sctp_addip_enable = 0; + + /* Enable PR-SCTP by default. 
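The hash sizing arithmetic above can be illustrated with made-up numbers. The sketch below only reproduces the goal/order/bucket computation and omits the retry-on-allocation-failure part; all constants are stand-ins for the kernel values.

/* Rough illustration of the hash sizing: derive a page "goal" from the
 * amount of memory, round it up to a power-of-two order, then compute how
 * many buckets fit in that many pages, capping at 64K buckets.
 */
#include <stdio.h>

#define PAGE_SIZE_SIM   4096UL
#define BUCKET_SIZE_SIM 32UL    /* stand-in for sizeof(struct sctp_hashbucket) */

int main(void)
{
    unsigned long num_physpages = 256 * 1024;   /* pretend 1GB of 4K pages */
    unsigned long goal, hashsize;
    int order;

    if (num_physpages >= 128 * 1024)
        goal = num_physpages >> 10;   /* the >> (22 - PAGE_SHIFT) case for 4K pages */
    else
        goal = num_physpages >> 12;

    for (order = 0; (1UL << order) < goal; order++)
        ;

    do {
        hashsize = (1UL << order) * PAGE_SIZE_SIM / BUCKET_SIZE_SIM;
    } while (hashsize > 64 * 1024 && --order > 0);

    printf("order=%d buckets=%lu\n", order, hashsize);
    return 0;
}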
*/ + sctp_prsctp_enable = 1; + + sctp_sysctl_register(); + + INIT_LIST_HEAD(&sctp_address_families); + sctp_register_af(&sctp_ipv4_specific); + + status = sctp_v6_init(); + if (status) + goto err_v6_init; + + /* Initialize the control inode/socket for handling OOTB packets. */ + if ((status = sctp_ctl_sock_init())) { + printk (KERN_ERR + "SCTP: Failed to initialize the SCTP control sock.\n"); + goto err_ctl_sock_init; + } + + /* Initialize the local address list. */ + INIT_LIST_HEAD(&sctp_local_addr_list); + spin_lock_init(&sctp_local_addr_lock); + + /* Register notifier for inet address additions/deletions. */ + register_inetaddr_notifier(&sctp_inetaddr_notifier); + + sctp_get_local_addr_list(); + + __unsafe(THIS_MODULE); + status = 0; +out: + return status; +err_add_protocol: + proto_unregister(&sctp_prot); +err_ctl_sock_init: + sctp_v6_exit(); +err_v6_init: + sctp_sysctl_unregister(); + list_del(&sctp_ipv4_specific.list); + free_pages((unsigned long)sctp_port_hashtable, + get_order(sctp_port_hashsize * + sizeof(struct sctp_bind_hashbucket))); +err_bhash_alloc: + kfree(sctp_ep_hashtable); +err_ehash_alloc: + free_pages((unsigned long)sctp_assoc_hashtable, + get_order(sctp_assoc_hashsize * + sizeof(struct sctp_hashbucket))); +err_ahash_alloc: + sctp_dbg_objcnt_exit(); +err_init_proc: + sctp_proc_exit(); + cleanup_sctp_mibs(); +err_init_mibs: + kmem_cache_destroy(sctp_chunk_cachep); +err_chunk_cachep: + kmem_cache_destroy(sctp_bucket_cachep); +err_bucket_cachep: + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); + inet_unregister_protosw(&sctp_seqpacket_protosw); + inet_unregister_protosw(&sctp_stream_protosw); + goto out; +} + +/* Exit handler for the SCTP protocol. */ +SCTP_STATIC __exit void sctp_exit(void) +{ + /* BUG. This should probably do something useful like clean + * up all the remaining associations and all that memory. + */ + + /* Unregister notifier for inet address additions/deletions. */ + unregister_inetaddr_notifier(&sctp_inetaddr_notifier); + + /* Free the local address list. */ + sctp_free_local_addr_list(); + + /* Free the control endpoint. */ + sock_release(sctp_ctl_socket); + + sctp_v6_exit(); + sctp_sysctl_unregister(); + list_del(&sctp_ipv4_specific.list); + + free_pages((unsigned long)sctp_assoc_hashtable, + get_order(sctp_assoc_hashsize * + sizeof(struct sctp_hashbucket))); + kfree(sctp_ep_hashtable); + free_pages((unsigned long)sctp_port_hashtable, + get_order(sctp_port_hashsize * + sizeof(struct sctp_bind_hashbucket))); + + kmem_cache_destroy(sctp_chunk_cachep); + kmem_cache_destroy(sctp_bucket_cachep); + + sctp_dbg_objcnt_exit(); + sctp_proc_exit(); + cleanup_sctp_mibs(); + + inet_del_protocol(&sctp_protocol, IPPROTO_SCTP); + inet_unregister_protosw(&sctp_seqpacket_protosw); + inet_unregister_protosw(&sctp_stream_protosw); + proto_unregister(&sctp_prot); +} + +module_init(sctp_init); +module_exit(sctp_exit); + +MODULE_AUTHOR("Linux Kernel SCTP developers "); +MODULE_DESCRIPTION("Support for the SCTP protocol (RFC2960)"); +MODULE_LICENSE("GPL"); diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c new file mode 100644 index 000000000000..1db12cc18cf7 --- /dev/null +++ b/net/sctp/sm_make_chunk.c @@ -0,0 +1,2766 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2002 Intel Corp. 
+ * + * This file is part of the SCTP kernel reference Implementation + * + * These functions work with the state functions in sctp_sm_statefuns.c + * to implement the state operations. These functions implement the + * steps which require modifying existing data structures. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * C. Robin + * Jon Grimm + * Xingang Guo + * Dajiang Zhang + * Sridhar Samudrala + * Daisy Chang + * Ardelle Fan + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include /* for get_random_bytes */ +#include +#include + +extern kmem_cache_t *sctp_chunk_cachep; + +SCTP_STATIC +struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc, + __u8 type, __u8 flags, int paylen); +static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *init_chunk, + int *cookie_len, + const __u8 *raw_addrs, int addrs_len); +static int sctp_process_param(struct sctp_association *asoc, + union sctp_params param, + const union sctp_addr *peer_addr, + int gfp); + +/* What was the inbound interface for this chunk? */ +int sctp_chunk_iif(const struct sctp_chunk *chunk) +{ + struct sctp_af *af; + int iif = 0; + + af = sctp_get_af_specific(ipver2af(chunk->skb->nh.iph->version)); + if (af) + iif = af->skb_iif(chunk->skb); + + return iif; +} + +/* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 2: The ECN capable field is reserved for future use of + * Explicit Congestion Notification. + */ +static const struct sctp_paramhdr ecap_param = { + SCTP_PARAM_ECN_CAPABLE, + __constant_htons(sizeof(struct sctp_paramhdr)), +}; +static const struct sctp_paramhdr prsctp_param = { + SCTP_PARAM_FWD_TSN_SUPPORT, + __constant_htons(sizeof(struct sctp_paramhdr)), +}; + +/* A helper to initialize to initialize an op error inside a + * provided chunk, as most cause codes will be embedded inside an + * abort chunk. + */ +void sctp_init_cause(struct sctp_chunk *chunk, __u16 cause_code, + const void *payload, size_t paylen) +{ + sctp_errhdr_t err; + int padlen; + __u16 len; + + /* Cause code constants are now defined in network order. 
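+ * That is, a value such as SCTP_ERROR_NO_DATA is already stored as a
+ * big-endian constant (roughly __constant_htons(0x09) for the No User
+ * Data cause), so it can be assigned to err.cause below without a
+ * further htons().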
*/ + err.cause = cause_code; + len = sizeof(sctp_errhdr_t) + paylen; + padlen = len % 4; + err.length = htons(len); + len += padlen; + sctp_addto_chunk(chunk, sizeof(sctp_errhdr_t), &err); + chunk->subh.err_hdr = sctp_addto_chunk(chunk, paylen, payload); +} + +/* 3.3.2 Initiation (INIT) (1) + * + * This chunk is used to initiate a SCTP association between two + * endpoints. The format of the INIT chunk is shown below: + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 1 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Initiate Tag | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Advertised Receiver Window Credit (a_rwnd) | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Number of Outbound Streams | Number of Inbound Streams | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Initial TSN | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / Optional/Variable-Length Parameters / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * + * The INIT chunk contains the following parameters. Unless otherwise + * noted, each parameter MUST only be included once in the INIT chunk. + * + * Fixed Parameters Status + * ---------------------------------------------- + * Initiate Tag Mandatory + * Advertised Receiver Window Credit Mandatory + * Number of Outbound Streams Mandatory + * Number of Inbound Streams Mandatory + * Initial TSN Mandatory + * + * Variable Parameters Status Type Value + * ------------------------------------------------------------- + * IPv4 Address (Note 1) Optional 5 + * IPv6 Address (Note 1) Optional 6 + * Cookie Preservative Optional 9 + * Reserved for ECN Capable (Note 2) Optional 32768 (0x8000) + * Host Name Address (Note 3) Optional 11 + * Supported Address Types (Note 4) Optional 12 + */ +struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc, + const struct sctp_bind_addr *bp, + int gfp, int vparam_len) +{ + sctp_inithdr_t init; + union sctp_params addrs; + size_t chunksize; + struct sctp_chunk *retval = NULL; + int num_types, addrs_len = 0; + struct sctp_sock *sp; + sctp_supported_addrs_param_t sat; + __u16 types[2]; + sctp_adaption_ind_param_t aiparam; + + /* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 1: The INIT chunks can contain multiple addresses that + * can be IPv4 and/or IPv6 in any combination. + */ + retval = NULL; + + /* Convert the provided bind address list to raw format. */ + addrs = sctp_bind_addrs_to_raw(bp, &addrs_len, gfp); + + init.init_tag = htonl(asoc->c.my_vtag); + init.a_rwnd = htonl(asoc->rwnd); + init.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); + init.num_inbound_streams = htons(asoc->c.sinit_max_instreams); + init.initial_tsn = htonl(asoc->c.initial_tsn); + + /* How many address types are needed? */ + sp = sctp_sk(asoc->base.sk); + num_types = sp->pf->supported_addrs(sp, types); + + chunksize = sizeof(init) + addrs_len + SCTP_SAT_LEN(num_types); + chunksize += sizeof(ecap_param); + if (sctp_prsctp_enable) + chunksize += sizeof(prsctp_param); + chunksize += sizeof(aiparam); + chunksize += vparam_len; + + /* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 3: An INIT chunk MUST NOT contain more than one Host + * Name address parameter. 
Moreover, the sender of the INIT + * MUST NOT combine any other address types with the Host Name + * address in the INIT. The receiver of INIT MUST ignore any + * other address types if the Host Name address parameter is + * present in the received INIT chunk. + * + * PLEASE DO NOT FIXME [This version does not support Host Name.] + */ + + retval = sctp_make_chunk(asoc, SCTP_CID_INIT, 0, chunksize); + if (!retval) + goto nodata; + + retval->subh.init_hdr = + sctp_addto_chunk(retval, sizeof(init), &init); + retval->param_hdr.v = + sctp_addto_chunk(retval, addrs_len, addrs.v); + + /* RFC 2960 3.3.2 Initiation (INIT) (1) + * + * Note 4: This parameter, when present, specifies all the + * address types the sending endpoint can support. The absence + * of this parameter indicates that the sending endpoint can + * support any address type. + */ + sat.param_hdr.type = SCTP_PARAM_SUPPORTED_ADDRESS_TYPES; + sat.param_hdr.length = htons(SCTP_SAT_LEN(num_types)); + sctp_addto_chunk(retval, sizeof(sat), &sat); + sctp_addto_chunk(retval, num_types * sizeof(__u16), &types); + + sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); + if (sctp_prsctp_enable) + sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); + aiparam.param_hdr.type = SCTP_PARAM_ADAPTION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaption_ind = htonl(sp->adaption_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); +nodata: + if (addrs.v) + kfree(addrs.v); + return retval; +} + +struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + int gfp, int unkparam_len) +{ + sctp_inithdr_t initack; + struct sctp_chunk *retval; + union sctp_params addrs; + int addrs_len; + sctp_cookie_param_t *cookie; + int cookie_len; + size_t chunksize; + sctp_adaption_ind_param_t aiparam; + + retval = NULL; + + /* Note: there may be no addresses to embed. */ + addrs = sctp_bind_addrs_to_raw(&asoc->base.bind_addr, &addrs_len, gfp); + + initack.init_tag = htonl(asoc->c.my_vtag); + initack.a_rwnd = htonl(asoc->rwnd); + initack.num_outbound_streams = htons(asoc->c.sinit_num_ostreams); + initack.num_inbound_streams = htons(asoc->c.sinit_max_instreams); + initack.initial_tsn = htonl(asoc->c.initial_tsn); + + /* FIXME: We really ought to build the cookie right + * into the packet instead of allocating more fresh memory. + */ + cookie = sctp_pack_cookie(asoc->ep, asoc, chunk, &cookie_len, + addrs.v, addrs_len); + if (!cookie) + goto nomem_cookie; + + /* Calculate the total size of allocation, include the reserved + * space for reporting unknown parameters if it is specified. + */ + chunksize = sizeof(initack) + addrs_len + cookie_len + unkparam_len; + + /* Tell peer that we'll do ECN only if peer advertised such cap. */ + if (asoc->peer.ecn_capable) + chunksize += sizeof(ecap_param); + + /* Tell peer that we'll do PR-SCTP only if peer advertised. */ + if (asoc->peer.prsctp_capable) + chunksize += sizeof(prsctp_param); + + chunksize += sizeof(aiparam); + + /* Now allocate and fill out the chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_INIT_ACK, 0, chunksize); + if (!retval) + goto nomem_chunk; + + /* Per the advice in RFC 2960 6.4, send this reply to + * the source of the INIT packet. 
+ */ + retval->transport = chunk->transport; + retval->subh.init_hdr = + sctp_addto_chunk(retval, sizeof(initack), &initack); + retval->param_hdr.v = sctp_addto_chunk(retval, addrs_len, addrs.v); + sctp_addto_chunk(retval, cookie_len, cookie); + if (asoc->peer.ecn_capable) + sctp_addto_chunk(retval, sizeof(ecap_param), &ecap_param); + if (asoc->peer.prsctp_capable) + sctp_addto_chunk(retval, sizeof(prsctp_param), &prsctp_param); + + aiparam.param_hdr.type = SCTP_PARAM_ADAPTION_LAYER_IND; + aiparam.param_hdr.length = htons(sizeof(aiparam)); + aiparam.adaption_ind = htonl(sctp_sk(asoc->base.sk)->adaption_ind); + sctp_addto_chunk(retval, sizeof(aiparam), &aiparam); + + /* We need to remove the const qualifier at this point. */ + retval->asoc = (struct sctp_association *) asoc; + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it received the DATA or control chunk + * to which it is replying. + * + * [INIT ACK back to where the INIT came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nomem_chunk: + kfree(cookie); +nomem_cookie: + if (addrs.v) + kfree(addrs.v); + return retval; +} + +/* 3.3.11 Cookie Echo (COOKIE ECHO) (10): + * + * This chunk is used only during the initialization of an association. + * It is sent by the initiator of an association to its peer to complete + * the initialization process. This chunk MUST precede any DATA chunk + * sent within the association, but MAY be bundled with one or more DATA + * chunks in the same packet. + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 10 |Chunk Flags | Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * / Cookie / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Chunk Flags: 8 bit + * + * Set to zero on transmit and ignored on receipt. + * + * Length: 16 bits (unsigned integer) + * + * Set to the size of the chunk in bytes, including the 4 bytes of + * the chunk header and the size of the Cookie. + * + * Cookie: variable size + * + * This field must contain the exact cookie received in the + * State Cookie parameter from the previous INIT ACK. + * + * An implementation SHOULD make the cookie as small as possible + * to insure interoperability. + */ +struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + void *cookie; + int cookie_len; + + cookie = asoc->peer.cookie; + cookie_len = asoc->peer.cookie_len; + + /* Build a cookie echo chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len); + if (!retval) + goto nodata; + retval->subh.cookie_hdr = + sctp_addto_chunk(retval, cookie_len, cookie); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [COOKIE ECHO back to where the INIT ACK came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* 3.3.12 Cookie Acknowledgement (COOKIE ACK) (11): + * + * This chunk is used only during the initialization of an + * association. 
It is used to acknowledge the receipt of a COOKIE + * ECHO chunk. This chunk MUST precede any DATA or SACK chunk sent + * within the association, but MAY be bundled with one or more DATA + * chunks or SACK chunk in the same SCTP packet. + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 11 |Chunk Flags | Length = 4 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Chunk Flags: 8 bits + * + * Set to zero on transmit and ignored on receipt. + */ +struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_COOKIE_ACK, 0, 0); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [COOKIE ACK back to where the COOKIE ECHO came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +/* + * Appendix A: Explicit Congestion Notification: + * CWR: + * + * RFC 2481 details a specific bit for a sender to send in the header of + * its next outbound TCP segment to indicate to its peer that it has + * reduced its congestion window. This is termed the CWR bit. For + * SCTP the same indication is made by including the CWR chunk. + * This chunk contains one data element, i.e. the TSN number that + * was sent in the ECNE chunk. This element represents the lowest + * TSN number in the datagram that was originally marked with the + * CE bit. + * + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Chunk Type=13 | Flags=00000000| Chunk Length = 8 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Lowest TSN Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Note: The CWR is considered a Control chunk. + */ +struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc, + const __u32 lowest_tsn, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + sctp_cwrhdr_t cwr; + + cwr.lowest_tsn = htonl(lowest_tsn); + retval = sctp_make_chunk(asoc, SCTP_CID_ECN_CWR, 0, + sizeof(sctp_cwrhdr_t)); + + if (!retval) + goto nodata; + + retval->subh.ecn_cwr_hdr = + sctp_addto_chunk(retval, sizeof(cwr), &cwr); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [Report a reduced congestion window back to where the ECNE + * came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* Make an ECNE chunk. This is a congestion experienced report. 
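+ * An ECNE is normally generated when a packet arrives carrying the IP
+ * CE mark; the peer then answers with the CWR chunk built above
+ * (RFC 2960 Appendix A).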
*/ +struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc, + const __u32 lowest_tsn) +{ + struct sctp_chunk *retval; + sctp_ecnehdr_t ecne; + + ecne.lowest_tsn = htonl(lowest_tsn); + retval = sctp_make_chunk(asoc, SCTP_CID_ECN_ECNE, 0, + sizeof(sctp_ecnehdr_t)); + if (!retval) + goto nodata; + retval->subh.ecne_hdr = + sctp_addto_chunk(retval, sizeof(ecne), &ecne); + +nodata: + return retval; +} + +/* Make a DATA chunk for the given association from the provided + * parameters. However, do not populate the data payload. + */ +struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc, + const struct sctp_sndrcvinfo *sinfo, + int data_len, __u8 flags, __u16 ssn) +{ + struct sctp_chunk *retval; + struct sctp_datahdr dp; + int chunk_len; + + /* We assign the TSN as LATE as possible, not here when + * creating the chunk. + */ + dp.tsn = 0; + dp.stream = htons(sinfo->sinfo_stream); + dp.ppid = sinfo->sinfo_ppid; + + /* Set the flags for an unordered send. */ + if (sinfo->sinfo_flags & MSG_UNORDERED) { + flags |= SCTP_DATA_UNORDERED; + dp.ssn = 0; + } else + dp.ssn = htons(ssn); + + chunk_len = sizeof(dp) + data_len; + retval = sctp_make_chunk(asoc, SCTP_CID_DATA, flags, chunk_len); + if (!retval) + goto nodata; + + retval->subh.data_hdr = sctp_addto_chunk(retval, sizeof(dp), &dp); + memcpy(&retval->sinfo, sinfo, sizeof(struct sctp_sndrcvinfo)); + +nodata: + return retval; +} + +/* Create a selective ackowledgement (SACK) for the given + * association. This reports on which TSN's we've seen to date, + * including duplicates and gaps. + */ +struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc) +{ + struct sctp_chunk *retval; + struct sctp_sackhdr sack; + int len; + __u32 ctsn; + __u16 num_gabs, num_dup_tsns; + struct sctp_tsnmap *map = (struct sctp_tsnmap *)&asoc->peer.tsn_map; + + ctsn = sctp_tsnmap_get_ctsn(map); + SCTP_DEBUG_PRINTK("sackCTSNAck sent: 0x%x.\n", ctsn); + + /* How much room is needed in the chunk? */ + num_gabs = sctp_tsnmap_num_gabs(map); + num_dup_tsns = sctp_tsnmap_num_dups(map); + + /* Initialize the SACK header. */ + sack.cum_tsn_ack = htonl(ctsn); + sack.a_rwnd = htonl(asoc->a_rwnd); + sack.num_gap_ack_blocks = htons(num_gabs); + sack.num_dup_tsns = htons(num_dup_tsns); + + len = sizeof(sack) + + sizeof(struct sctp_gap_ack_block) * num_gabs + + sizeof(__u32) * num_dup_tsns; + + /* Create the chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_SACK, 0, len); + if (!retval) + goto nodata; + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, etc.) to the same destination transport + * address from which it received the DATA or control chunk to + * which it is replying. This rule should also be followed if + * the endpoint is bundling DATA chunks together with the + * reply chunk. + * + * However, when acknowledging multiple DATA chunks received + * in packets from different source addresses in a single + * SACK, the SACK chunk may be transmitted to one of the + * destination transport addresses from which the DATA or + * control chunks being acknowledged were received. + * + * [BUG: We do not implement the following paragraph. + * Perhaps we should remember the last transport we used for a + * SACK and avoid that (if possible) if we have seen any + * duplicates. 
--piggy] + * + * When a receiver of a duplicate DATA chunk sends a SACK to a + * multi- homed endpoint it MAY be beneficial to vary the + * destination address and not use the source address of the + * DATA chunk. The reason being that receiving a duplicate + * from a multi-homed endpoint might indicate that the return + * path (as specified in the source address of the DATA chunk) + * for the SACK is broken. + * + * [Send to the address from which we last received a DATA chunk.] + */ + retval->transport = asoc->peer.last_data_from; + + retval->subh.sack_hdr = + sctp_addto_chunk(retval, sizeof(sack), &sack); + + /* Add the gap ack block information. */ + if (num_gabs) + sctp_addto_chunk(retval, sizeof(__u32) * num_gabs, + sctp_tsnmap_get_gabs(map)); + + /* Add the duplicate TSN information. */ + if (num_dup_tsns) + sctp_addto_chunk(retval, sizeof(__u32) * num_dup_tsns, + sctp_tsnmap_get_dups(map)); + +nodata: + return retval; +} + +/* Make a SHUTDOWN chunk. */ +struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + sctp_shutdownhdr_t shut; + __u32 ctsn; + + ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + shut.cum_tsn_ack = htonl(ctsn); + + retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN, 0, + sizeof(sctp_shutdownhdr_t)); + if (!retval) + goto nodata; + + retval->subh.shutdown_hdr = + sctp_addto_chunk(retval, sizeof(shut), &shut); + + if (chunk) + retval->transport = chunk->transport; +nodata: + return retval; +} + +struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [ACK back to where the SHUTDOWN came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +struct sctp_chunk *sctp_make_shutdown_complete( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_chunk *retval; + __u8 flags = 0; + + /* Maybe set the T-bit if we have no association. */ + flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; + + retval = sctp_make_chunk(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [Report SHUTDOWN COMPLETE back to where the SHUTDOWN ACK + * came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +/* Create an ABORT. Note that we set the T bit if we have no + * association. + */ +struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const size_t hint) +{ + struct sctp_chunk *retval; + __u8 flags = 0; + + /* Maybe set the T-bit if we have no association. */ + flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T; + + retval = sctp_make_chunk(asoc, SCTP_CID_ABORT, flags, hint); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) 
to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [ABORT back to where the offender came from.] + */ + if (retval && chunk) + retval->transport = chunk->transport; + + return retval; +} + +/* Helper to create ABORT with a NO_USER_DATA error. */ +struct sctp_chunk *sctp_make_abort_no_data( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, __u32 tsn) +{ + struct sctp_chunk *retval; + __u32 payload; + + retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + + sizeof(tsn)); + + if (!retval) + goto no_mem; + + /* Put the tsn back into network byte order. */ + payload = htonl(tsn); + sctp_init_cause(retval, SCTP_ERROR_NO_DATA, (const void *)&payload, + sizeof(payload)); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [ABORT back to where the offender came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +no_mem: + return retval; +} + +/* Helper to create ABORT with a SCTP_ERROR_USER_ABORT error. */ +struct sctp_chunk *sctp_make_abort_user(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const struct msghdr *msg) +{ + struct sctp_chunk *retval; + void *payload = NULL, *payoff; + size_t paylen = 0; + struct iovec *iov = NULL; + int iovlen = 0; + + if (msg) { + iov = msg->msg_iov; + iovlen = msg->msg_iovlen; + paylen = get_user_iov_size(iov, iovlen); + } + + retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + paylen); + if (!retval) + goto err_chunk; + + if (paylen) { + /* Put the msg_iov together into payload. */ + payload = kmalloc(paylen, GFP_ATOMIC); + if (!payload) + goto err_payload; + payoff = payload; + + for (; iovlen > 0; --iovlen) { + if (copy_from_user(payoff, iov->iov_base,iov->iov_len)) + goto err_copy; + payoff += iov->iov_len; + iov++; + } + } + + sctp_init_cause(retval, SCTP_ERROR_USER_ABORT, payload, paylen); + + if (paylen) + kfree(payload); + + return retval; + +err_copy: + kfree(payload); +err_payload: + sctp_chunk_free(retval); + retval = NULL; +err_chunk: + return retval; +} + +/* Make an ABORT chunk with a PROTOCOL VIOLATION cause code. */ +struct sctp_chunk *sctp_make_abort_violation( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const __u8 *payload, + const size_t paylen) +{ + struct sctp_chunk *retval; + struct sctp_paramhdr phdr; + + retval = sctp_make_abort(asoc, chunk, sizeof(sctp_errhdr_t) + paylen + + sizeof(sctp_chunkhdr_t)); + if (!retval) + goto end; + + sctp_init_cause(retval, SCTP_ERROR_PROTO_VIOLATION, payload, paylen); + + phdr.type = htons(chunk->chunk_hdr->type); + phdr.length = chunk->chunk_hdr->length; + sctp_addto_chunk(retval, sizeof(sctp_paramhdr_t), &phdr); + +end: + return retval; +} + +/* Make a HEARTBEAT chunk. */ +struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc, + const struct sctp_transport *transport, + const void *payload, const size_t paylen) +{ + struct sctp_chunk *retval = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT, + 0, paylen); + + if (!retval) + goto nodata; + + /* Cast away the 'const', as this is just telling the chunk + * what transport it belongs to. 
+ */ + retval->transport = (struct sctp_transport *) transport; + retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); + +nodata: + return retval; +} + +struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + const void *payload, const size_t paylen) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen); + if (!retval) + goto nodata; + + retval->subh.hbs_hdr = sctp_addto_chunk(retval, paylen, payload); + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, * etc.) to the same destination transport + * address from which it * received the DATA or control chunk + * to which it is replying. + * + * [HBACK back to where the HEARTBEAT came from.] + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* Create an Operation Error chunk with the specified space reserved. + * This routine can be used for containing multiple causes in the chunk. + */ +static struct sctp_chunk *sctp_make_op_error_space( + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + size_t size) +{ + struct sctp_chunk *retval; + + retval = sctp_make_chunk(asoc, SCTP_CID_ERROR, 0, + sizeof(sctp_errhdr_t) + size); + if (!retval) + goto nodata; + + /* RFC 2960 6.4 Multi-homed SCTP Endpoints + * + * An endpoint SHOULD transmit reply chunks (e.g., SACK, + * HEARTBEAT ACK, etc.) to the same destination transport + * address from which it received the DATA or control chunk + * to which it is replying. + * + */ + if (chunk) + retval->transport = chunk->transport; + +nodata: + return retval; +} + +/* Create an Operation Error chunk. */ +struct sctp_chunk *sctp_make_op_error(const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + __u16 cause_code, const void *payload, + size_t paylen) +{ + struct sctp_chunk *retval; + + retval = sctp_make_op_error_space(asoc, chunk, paylen); + if (!retval) + goto nodata; + + sctp_init_cause(retval, cause_code, payload, paylen); + +nodata: + return retval; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Turn an skb into a chunk. + * FIXME: Eventually move the structure directly inside the skb->cb[]. + */ +struct sctp_chunk *sctp_chunkify(struct sk_buff *skb, + const struct sctp_association *asoc, + struct sock *sk) +{ + struct sctp_chunk *retval; + + retval = kmem_cache_alloc(sctp_chunk_cachep, SLAB_ATOMIC); + + if (!retval) + goto nodata; + memset(retval, 0, sizeof(struct sctp_chunk)); + + if (!sk) { + SCTP_DEBUG_PRINTK("chunkifying skb %p w/o an sk\n", skb); + } + + retval->skb = skb; + retval->asoc = (struct sctp_association *)asoc; + retval->resent = 0; + retval->has_tsn = 0; + retval->has_ssn = 0; + retval->rtt_in_progress = 0; + retval->sent_at = 0; + retval->singleton = 1; + retval->end_of_packet = 0; + retval->ecn_ce_done = 0; + retval->pdiscard = 0; + + /* sctpimpguide-05.txt Section 2.8.2 + * M1) Each time a new DATA chunk is transmitted + * set the 'TSN.Missing.Report' count for that TSN to 0. The + * 'TSN.Missing.Report' count will be used to determine missing chunks + * and when to fast retransmit. + */ + retval->tsn_missing_report = 0; + retval->tsn_gap_acked = 0; + retval->fast_retransmit = 0; + + /* If this is a fragmented message, track all fragments + * of the message (for SEND_FAILED). 
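+ * The owning datamsg is attached later, once the chunk becomes part of
+ * an outgoing message; sctp_chunk_free() below drops that reference
+ * again via sctp_datamsg_put().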
+ */ + retval->msg = NULL; + + /* Polish the bead hole. */ + INIT_LIST_HEAD(&retval->transmitted_list); + INIT_LIST_HEAD(&retval->frag_list); + SCTP_DBG_OBJCNT_INC(chunk); + atomic_set(&retval->refcnt, 1); + +nodata: + return retval; +} + +/* Set chunk->source and dest based on the IP header in chunk->skb. */ +void sctp_init_addrs(struct sctp_chunk *chunk, union sctp_addr *src, + union sctp_addr *dest) +{ + memcpy(&chunk->source, src, sizeof(union sctp_addr)); + memcpy(&chunk->dest, dest, sizeof(union sctp_addr)); +} + +/* Extract the source address from a chunk. */ +const union sctp_addr *sctp_source(const struct sctp_chunk *chunk) +{ + /* If we have a known transport, use that. */ + if (chunk->transport) { + return &chunk->transport->ipaddr; + } else { + /* Otherwise, extract it from the IP header. */ + return &chunk->source; + } +} + +/* Create a new chunk, setting the type and flags headers from the + * arguments, reserving enough space for a 'paylen' byte payload. + */ +SCTP_STATIC +struct sctp_chunk *sctp_make_chunk(const struct sctp_association *asoc, + __u8 type, __u8 flags, int paylen) +{ + struct sctp_chunk *retval; + sctp_chunkhdr_t *chunk_hdr; + struct sk_buff *skb; + struct sock *sk; + + /* No need to allocate LL here, as this is only a chunk. */ + skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), + GFP_ATOMIC); + if (!skb) + goto nodata; + + /* Make room for the chunk header. */ + chunk_hdr = (sctp_chunkhdr_t *)skb_put(skb, sizeof(sctp_chunkhdr_t)); + chunk_hdr->type = type; + chunk_hdr->flags = flags; + chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t)); + + sk = asoc ? asoc->base.sk : NULL; + retval = sctp_chunkify(skb, asoc, sk); + if (!retval) { + kfree_skb(skb); + goto nodata; + } + + retval->chunk_hdr = chunk_hdr; + retval->chunk_end = ((__u8 *)chunk_hdr) + sizeof(struct sctp_chunkhdr); + + /* Set the skb to the belonging sock for accounting. */ + skb->sk = sk; + + return retval; +nodata: + return NULL; +} + + +/* Release the memory occupied by a chunk. */ +static void sctp_chunk_destroy(struct sctp_chunk *chunk) +{ + /* Free the chunk skb data and the SCTP_chunk stub itself. */ + dev_kfree_skb(chunk->skb); + + SCTP_DBG_OBJCNT_DEC(chunk); + kmem_cache_free(sctp_chunk_cachep, chunk); +} + +/* Possibly, free the chunk. */ +void sctp_chunk_free(struct sctp_chunk *chunk) +{ + /* Make sure that we are not on any list. */ + skb_unlink((struct sk_buff *) chunk); + list_del_init(&chunk->transmitted_list); + + /* Release our reference on the message tracker. */ + if (chunk->msg) + sctp_datamsg_put(chunk->msg); + + sctp_chunk_put(chunk); +} + +/* Grab a reference to the chunk. */ +void sctp_chunk_hold(struct sctp_chunk *ch) +{ + atomic_inc(&ch->refcnt); +} + +/* Release a reference to the chunk. */ +void sctp_chunk_put(struct sctp_chunk *ch) +{ + if (atomic_dec_and_test(&ch->refcnt)) + sctp_chunk_destroy(ch); +} + +/* Append bytes to the end of a chunk. Will panic if chunk is not big + * enough. + */ +void *sctp_addto_chunk(struct sctp_chunk *chunk, int len, const void *data) +{ + void *target; + void *padding; + int chunklen = ntohs(chunk->chunk_hdr->length); + int padlen = chunklen % 4; + + padding = skb_put(chunk->skb, padlen); + target = skb_put(chunk->skb, len); + + memset(padding, 0, padlen); + memcpy(target, data, len); + + /* Adjust the chunk length field. */ + chunk->chunk_hdr->length = htons(chunklen + padlen + len); + chunk->chunk_end = chunk->skb->tail; + + return target; +} + +/* Append bytes from user space to the end of a chunk. 
Will panic if + * chunk is not big enough. + * Returns a kernel err value. + */ +int sctp_user_addto_chunk(struct sctp_chunk *chunk, int off, int len, + struct iovec *data) +{ + __u8 *target; + int err = 0; + + /* Make room in chunk for data. */ + target = skb_put(chunk->skb, len); + + /* Copy data (whole iovec) into chunk */ + if ((err = memcpy_fromiovecend(target, data, off, len))) + goto out; + + /* Adjust the chunk length field. */ + chunk->chunk_hdr->length = + htons(ntohs(chunk->chunk_hdr->length) + len); + chunk->chunk_end = chunk->skb->tail; + +out: + return err; +} + +/* Helper function to assign a TSN if needed. This assumes that both + * the data_hdr and association have already been assigned. + */ +void sctp_chunk_assign_ssn(struct sctp_chunk *chunk) +{ + __u16 ssn; + __u16 sid; + + if (chunk->has_ssn) + return; + + /* This is the last possible instant to assign a SSN. */ + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { + ssn = 0; + } else { + sid = htons(chunk->subh.data_hdr->stream); + if (chunk->chunk_hdr->flags & SCTP_DATA_LAST_FRAG) + ssn = sctp_ssn_next(&chunk->asoc->ssnmap->out, sid); + else + ssn = sctp_ssn_peek(&chunk->asoc->ssnmap->out, sid); + ssn = htons(ssn); + } + + chunk->subh.data_hdr->ssn = ssn; + chunk->has_ssn = 1; +} + +/* Helper function to assign a TSN if needed. This assumes that both + * the data_hdr and association have already been assigned. + */ +void sctp_chunk_assign_tsn(struct sctp_chunk *chunk) +{ + if (!chunk->has_tsn) { + /* This is the last possible instant to + * assign a TSN. + */ + chunk->subh.data_hdr->tsn = + htonl(sctp_association_get_next_tsn(chunk->asoc)); + chunk->has_tsn = 1; + } +} + +/* Create a CLOSED association to use with an incoming packet. */ +struct sctp_association *sctp_make_temp_asoc(const struct sctp_endpoint *ep, + struct sctp_chunk *chunk, int gfp) +{ + struct sctp_association *asoc; + struct sk_buff *skb; + sctp_scope_t scope; + struct sctp_af *af; + + /* Create the bare association. */ + scope = sctp_scope(sctp_source(chunk)); + asoc = sctp_association_new(ep, ep->base.sk, scope, gfp); + if (!asoc) + goto nodata; + asoc->temp = 1; + skb = chunk->skb; + /* Create an entry for the source address of the packet. */ + af = sctp_get_af_specific(ipver2af(skb->nh.iph->version)); + if (unlikely(!af)) + goto fail; + af->from_skb(&asoc->c.peer_addr, skb, 1); +nodata: + return asoc; + +fail: + sctp_association_free(asoc); + return NULL; +} + +/* Build a cookie representing asoc. + * This INCLUDES the param header needed to put the cookie in the INIT ACK. + */ +static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *init_chunk, + int *cookie_len, + const __u8 *raw_addrs, int addrs_len) +{ + sctp_cookie_param_t *retval; + struct sctp_signed_cookie *cookie; + struct scatterlist sg; + int headersize, bodysize; + unsigned int keylen; + char *key; + + headersize = sizeof(sctp_paramhdr_t) + SCTP_SECRET_SIZE; + bodysize = sizeof(struct sctp_cookie) + + ntohs(init_chunk->chunk_hdr->length) + addrs_len; + + /* Pad out the cookie to a multiple to make the signature + * functions simpler to write. 
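+ * For instance, if SCTP_COOKIE_MULTIPLE were 32, a 70-byte body would
+ * be rounded up to 96 bytes, and it is this padded length that is
+ * signed and later verified.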
+ */ + if (bodysize % SCTP_COOKIE_MULTIPLE) + bodysize += SCTP_COOKIE_MULTIPLE + - (bodysize % SCTP_COOKIE_MULTIPLE); + *cookie_len = headersize + bodysize; + + retval = (sctp_cookie_param_t *)kmalloc(*cookie_len, GFP_ATOMIC); + + if (!retval) { + *cookie_len = 0; + goto nodata; + } + + /* Clear this memory since we are sending this data structure + * out on the network. + */ + memset(retval, 0x00, *cookie_len); + cookie = (struct sctp_signed_cookie *) retval->body; + + /* Set up the parameter header. */ + retval->p.type = SCTP_PARAM_STATE_COOKIE; + retval->p.length = htons(*cookie_len); + + /* Copy the cookie part of the association itself. */ + cookie->c = asoc->c; + /* Save the raw address list length in the cookie. */ + cookie->c.raw_addr_list_len = addrs_len; + + /* Remember PR-SCTP capability. */ + cookie->c.prsctp_capable = asoc->peer.prsctp_capable; + + /* Save adaption indication in the cookie. */ + cookie->c.adaption_ind = asoc->peer.adaption_ind; + + /* Set an expiration time for the cookie. */ + do_gettimeofday(&cookie->c.expiration); + TIMEVAL_ADD(asoc->cookie_life, cookie->c.expiration); + + /* Copy the peer's init packet. */ + memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr, + ntohs(init_chunk->chunk_hdr->length)); + + /* Copy the raw local address list of the association. */ + memcpy((__u8 *)&cookie->c.peer_init[0] + + ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len); + + if (sctp_sk(ep->base.sk)->hmac) { + /* Sign the message. */ + sg.page = virt_to_page(&cookie->c); + sg.offset = (unsigned long)(&cookie->c) % PAGE_SIZE; + sg.length = bodysize; + keylen = SCTP_SECRET_SIZE; + key = (char *)ep->secret_key[ep->current_key]; + + sctp_crypto_hmac(sctp_sk(ep->base.sk)->hmac, key, &keylen, + &sg, 1, cookie->signature); + } + +nodata: + return retval; +} + +/* Unpack the cookie from COOKIE ECHO chunk, recreating the association. */ +struct sctp_association *sctp_unpack_cookie( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, int gfp, + int *error, struct sctp_chunk **errp) +{ + struct sctp_association *retval = NULL; + struct sctp_signed_cookie *cookie; + struct sctp_cookie *bear_cookie; + int headersize, bodysize, fixed_size; + __u8 digest[SCTP_SIGNATURE_SIZE]; + struct scatterlist sg; + unsigned int keylen, len; + char *key; + sctp_scope_t scope; + struct sk_buff *skb = chunk->skb; + + headersize = sizeof(sctp_chunkhdr_t) + SCTP_SECRET_SIZE; + bodysize = ntohs(chunk->chunk_hdr->length) - headersize; + fixed_size = headersize + sizeof(struct sctp_cookie); + + /* Verify that the chunk looks like it even has a cookie. + * There must be enough room for our cookie and our peer's + * INIT chunk. + */ + len = ntohs(chunk->chunk_hdr->length); + if (len < fixed_size + sizeof(struct sctp_chunkhdr)) + goto malformed; + + /* Verify that the cookie has been padded out. */ + if (bodysize % SCTP_COOKIE_MULTIPLE) + goto malformed; + + /* Process the cookie. */ + cookie = chunk->subh.cookie_hdr; + bear_cookie = &cookie->c; + + if (!sctp_sk(ep->base.sk)->hmac) + goto no_hmac; + + /* Check the signature. */ + keylen = SCTP_SECRET_SIZE; + sg.page = virt_to_page(bear_cookie); + sg.offset = (unsigned long)(bear_cookie) % PAGE_SIZE; + sg.length = bodysize; + key = (char *)ep->secret_key[ep->current_key]; + + memset(digest, 0x00, sizeof(digest)); + sctp_crypto_hmac(sctp_sk(ep->base.sk)->hmac, key, &keylen, &sg, + 1, digest); + + if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { + /* Try the previous key. 
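+ * This covers the window around a secret rotation: a cookie that was
+ * signed with the endpoint's previous secret (ep->last_key) is still
+ * accepted.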
*/ + key = (char *)ep->secret_key[ep->last_key]; + memset(digest, 0x00, sizeof(digest)); + sctp_crypto_hmac(sctp_sk(ep->base.sk)->hmac, key, &keylen, + &sg, 1, digest); + + if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) { + /* Yikes! Still bad signature! */ + *error = -SCTP_IERROR_BAD_SIG; + goto fail; + } + } + +no_hmac: + /* IG Section 2.35.2: + * 3) Compare the port numbers and the verification tag contained + * within the COOKIE ECHO chunk to the actual port numbers and the + * verification tag within the SCTP common header of the received + * packet. If these values do not match the packet MUST be silently + * discarded, + */ + if (ntohl(chunk->sctp_hdr->vtag) != bear_cookie->my_vtag) { + *error = -SCTP_IERROR_BAD_TAG; + goto fail; + } + + if (ntohs(chunk->sctp_hdr->source) != bear_cookie->peer_addr.v4.sin_port || + ntohs(chunk->sctp_hdr->dest) != bear_cookie->my_port) { + *error = -SCTP_IERROR_BAD_PORTS; + goto fail; + } + + /* Check to see if the cookie is stale. If there is already + * an association, there is no need to check cookie's expiration + * for init collision case of lost COOKIE ACK. + */ + if (!asoc && tv_lt(bear_cookie->expiration, skb->stamp)) { + __u16 len; + /* + * Section 3.3.10.3 Stale Cookie Error (3) + * + * Cause of error + * --------------- + * Stale Cookie Error: Indicates the receipt of a valid State + * Cookie that has expired. + */ + len = ntohs(chunk->chunk_hdr->length); + *errp = sctp_make_op_error_space(asoc, chunk, len); + if (*errp) { + suseconds_t usecs = (skb->stamp.tv_sec - + bear_cookie->expiration.tv_sec) * 1000000L + + skb->stamp.tv_usec - + bear_cookie->expiration.tv_usec; + + usecs = htonl(usecs); + sctp_init_cause(*errp, SCTP_ERROR_STALE_COOKIE, + &usecs, sizeof(usecs)); + *error = -SCTP_IERROR_STALE_COOKIE; + } else + *error = -SCTP_IERROR_NOMEM; + + goto fail; + } + + /* Make a new base association. */ + scope = sctp_scope(sctp_source(chunk)); + retval = sctp_association_new(ep, ep->base.sk, scope, gfp); + if (!retval) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } + + /* Set up our peer's port number. */ + retval->peer.port = ntohs(chunk->sctp_hdr->source); + + /* Populate the association from the cookie. */ + memcpy(&retval->c, bear_cookie, sizeof(*bear_cookie)); + + if (sctp_assoc_set_bind_addr_from_cookie(retval, bear_cookie, + GFP_ATOMIC) < 0) { + *error = -SCTP_IERROR_NOMEM; + goto fail; + } + + /* Also, add the destination address. */ + if (list_empty(&retval->base.bind_addr.address_list)) { + sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest, + GFP_ATOMIC); + } + + retval->next_tsn = retval->c.initial_tsn; + retval->ctsn_ack_point = retval->next_tsn - 1; + retval->addip_serial = retval->c.initial_tsn; + retval->adv_peer_ack_point = retval->ctsn_ack_point; + retval->peer.prsctp_capable = retval->c.prsctp_capable; + retval->peer.adaption_ind = retval->c.adaption_ind; + + /* The INIT stuff will be done by the side effects. */ + return retval; + +fail: + if (retval) + sctp_association_free(retval); + + return NULL; + +malformed: + /* Yikes! The packet is either corrupt or deliberately + * malformed. + */ + *error = -SCTP_IERROR_MALFORMED; + goto fail; +} + +/******************************************************************** + * 3rd Level Abstractions + ********************************************************************/ + +struct __sctp_missing { + __u32 num_missing; + __u16 type; +} __attribute__((packed)); + +/* + * Report a missing mandatory parameter. 
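+ * The cause payload is a struct __sctp_missing: a 32-bit count of
+ * missing parameters followed by the 16-bit parameter type. This
+ * routine always reports exactly one missing parameter.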
+ */ +static int sctp_process_missing_param(const struct sctp_association *asoc, + sctp_param_t paramtype, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + struct __sctp_missing report; + __u16 len; + + len = WORD_ROUND(sizeof(report)); + + /* Make an ERROR chunk, preparing enough room for + * returning multiple unknown parameters. + */ + if (!*errp) + *errp = sctp_make_op_error_space(asoc, chunk, len); + + if (*errp) { + report.num_missing = htonl(1); + report.type = paramtype; + sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, + &report, sizeof(report)); + } + + /* Stop processing this chunk. */ + return 0; +} + +/* Report an Invalid Mandatory Parameter. */ +static int sctp_process_inv_mandatory(const struct sctp_association *asoc, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + /* Invalid Mandatory Parameter Error has no payload. */ + + if (!*errp) + *errp = sctp_make_op_error_space(asoc, chunk, 0); + + if (*errp) + sctp_init_cause(*errp, SCTP_ERROR_INV_PARAM, NULL, 0); + + /* Stop processing this chunk. */ + return 0; +} + +static int sctp_process_inv_paramlength(const struct sctp_association *asoc, + struct sctp_paramhdr *param, + const struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + char error[] = "The following parameter had invalid length:"; + size_t payload_len = WORD_ROUND(sizeof(error)) + + sizeof(sctp_paramhdr_t); + + + /* Create an error chunk and fill it in with our payload. */ + if (!*errp) + *errp = sctp_make_op_error_space(asoc, chunk, payload_len); + + if (*errp) { + sctp_init_cause(*errp, SCTP_ERROR_PROTO_VIOLATION, error, + sizeof(error)); + sctp_addto_chunk(*errp, sizeof(sctp_paramhdr_t), param); + } + + return 0; +} + + +/* Do not attempt to handle the HOST_NAME parm. However, do + * send back an indicator to the peer. + */ +static int sctp_process_hn_param(const struct sctp_association *asoc, + union sctp_params param, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + __u16 len = ntohs(param.p->length); + + /* Make an ERROR chunk. */ + if (!*errp) + *errp = sctp_make_op_error_space(asoc, chunk, len); + + if (*errp) + sctp_init_cause(*errp, SCTP_ERROR_DNS_FAILED, + param.v, len); + + /* Stop processing this chunk. */ + return 0; +} + +/* RFC 3.2.1 & the Implementers Guide 2.2. + * + * The Parameter Types are encoded such that the + * highest-order two bits specify the action that must be + * taken if the processing endpoint does not recognize the + * Parameter Type. + * + * 00 - Stop processing this SCTP chunk and discard it, + * do not process any further chunks within it. + * + * 01 - Stop processing this SCTP chunk and discard it, + * do not process any further chunks within it, and report + * the unrecognized parameter in an 'Unrecognized + * Parameter Type' (in either an ERROR or in the INIT ACK). + * + * 10 - Skip this parameter and continue processing. + * + * 11 - Skip this parameter and continue processing but + * report the unrecognized parameter in an + * 'Unrecognized Parameter Type' (in either an ERROR or in + * the INIT ACK). 
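+ *
+ * In this implementation the two high-order bits are isolated with
+ * SCTP_PARAM_ACTION_MASK and dispatched on in the switch statement
+ * below.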
+ * + * Return value: + * 0 - discard the chunk + * 1 - continue with the chunk + */ +static int sctp_process_unk_param(const struct sctp_association *asoc, + union sctp_params param, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + int retval = 1; + + switch (param.p->type & SCTP_PARAM_ACTION_MASK) { + case SCTP_PARAM_ACTION_DISCARD: + retval = 0; + break; + case SCTP_PARAM_ACTION_DISCARD_ERR: + retval = 0; + /* Make an ERROR chunk, preparing enough room for + * returning multiple unknown parameters. + */ + if (NULL == *errp) + *errp = sctp_make_op_error_space(asoc, chunk, + ntohs(chunk->chunk_hdr->length)); + + if (*errp) + sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, + param.v, + WORD_ROUND(ntohs(param.p->length))); + + break; + case SCTP_PARAM_ACTION_SKIP: + break; + case SCTP_PARAM_ACTION_SKIP_ERR: + /* Make an ERROR chunk, preparing enough room for + * returning multiple unknown parameters. + */ + if (NULL == *errp) + *errp = sctp_make_op_error_space(asoc, chunk, + ntohs(chunk->chunk_hdr->length)); + + if (*errp) { + sctp_init_cause(*errp, SCTP_ERROR_UNKNOWN_PARAM, + param.v, + WORD_ROUND(ntohs(param.p->length))); + } else { + /* If there is no memory for generating the ERROR + * report as specified, an ABORT will be triggered + * to the peer and the association won't be + * established. + */ + retval = 0; + } + + break; + default: + break; + } + + return retval; +} + +/* Find unrecognized parameters in the chunk. + * Return values: + * 0 - discard the chunk + * 1 - continue with the chunk + */ +static int sctp_verify_param(const struct sctp_association *asoc, + union sctp_params param, + sctp_cid_t cid, + struct sctp_chunk *chunk, + struct sctp_chunk **err_chunk) +{ + int retval = 1; + + /* FIXME - This routine is not looking at each parameter per the + * chunk type, i.e., unrecognized parameters should be further + * identified based on the chunk id. + */ + + switch (param.p->type) { + case SCTP_PARAM_IPV4_ADDRESS: + case SCTP_PARAM_IPV6_ADDRESS: + case SCTP_PARAM_COOKIE_PRESERVATIVE: + case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: + case SCTP_PARAM_STATE_COOKIE: + case SCTP_PARAM_HEARTBEAT_INFO: + case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: + case SCTP_PARAM_ECN_CAPABLE: + case SCTP_PARAM_ADAPTION_LAYER_IND: + break; + + case SCTP_PARAM_HOST_NAME_ADDRESS: + /* Tell the peer, we won't support this param. */ + return sctp_process_hn_param(asoc, param, chunk, err_chunk); + case SCTP_PARAM_FWD_TSN_SUPPORT: + if (sctp_prsctp_enable) + break; + /* Fall Through */ + default: + SCTP_DEBUG_PRINTK("Unrecognized param: %d for chunk %d.\n", + ntohs(param.p->type), cid); + return sctp_process_unk_param(asoc, param, chunk, err_chunk); + + break; + } + return retval; +} + +/* Verify the INIT packet before we process it. */ +int sctp_verify_init(const struct sctp_association *asoc, + sctp_cid_t cid, + sctp_init_chunk_t *peer_init, + struct sctp_chunk *chunk, + struct sctp_chunk **errp) +{ + union sctp_params param; + int has_cookie = 0; + + /* Verify stream values are non-zero. */ + if ((0 == peer_init->init_hdr.num_outbound_streams) || + (0 == peer_init->init_hdr.num_inbound_streams)) { + + sctp_process_inv_mandatory(asoc, chunk, errp); + return 0; + } + + /* Check for missing mandatory parameters. 
*/ + sctp_walk_params(param, peer_init, init_hdr.params) { + + if (SCTP_PARAM_STATE_COOKIE == param.p->type) + has_cookie = 1; + + } /* for (loop through all parameters) */ + + /* There is a possibility that a parameter length was bad and + * in that case we would have stoped walking the parameters. + * The current param.p would point at the bad one. + * Current consensus on the mailing list is to generate a PROTOCOL + * VIOLATION error. We build the ERROR chunk here and let the normal + * error handling code build and send the packet. + */ + if (param.v < (void*)chunk->chunk_end - sizeof(sctp_paramhdr_t)) { + sctp_process_inv_paramlength(asoc, param.p, chunk, errp); + return 0; + } + + /* The only missing mandatory param possible today is + * the state cookie for an INIT-ACK chunk. + */ + if ((SCTP_CID_INIT_ACK == cid) && !has_cookie) { + sctp_process_missing_param(asoc, SCTP_PARAM_STATE_COOKIE, + chunk, errp); + return 0; + } + + /* Find unrecognized parameters. */ + + sctp_walk_params(param, peer_init, init_hdr.params) { + + if (!sctp_verify_param(asoc, param, cid, chunk, errp)) { + if (SCTP_PARAM_HOST_NAME_ADDRESS == param.p->type) + return 0; + else + return 1; + } + + } /* for (loop through all parameters) */ + + return 1; +} + +/* Unpack the parameters in an INIT packet into an association. + * Returns 0 on failure, else success. + * FIXME: This is an association method. + */ +int sctp_process_init(struct sctp_association *asoc, sctp_cid_t cid, + const union sctp_addr *peer_addr, + sctp_init_chunk_t *peer_init, int gfp) +{ + union sctp_params param; + struct sctp_transport *transport; + struct list_head *pos, *temp; + char *cookie; + + /* We must include the address that the INIT packet came from. + * This is the only address that matters for an INIT packet. + * When processing a COOKIE ECHO, we retrieve the from address + * of the INIT from the cookie. + */ + + /* This implementation defaults to making the first transport + * added as the primary transport. The source address seems to + * be a a better choice than any of the embedded addresses. + */ + if (peer_addr) + if(!sctp_assoc_add_peer(asoc, peer_addr, gfp)) + goto nomem; + + /* Process the initialization parameters. */ + + sctp_walk_params(param, peer_init, init_hdr.params) { + + if (!sctp_process_param(asoc, param, peer_addr, gfp)) + goto clean_up; + } + + /* The fixed INIT headers are always in network byte + * order. + */ + asoc->peer.i.init_tag = + ntohl(peer_init->init_hdr.init_tag); + asoc->peer.i.a_rwnd = + ntohl(peer_init->init_hdr.a_rwnd); + asoc->peer.i.num_outbound_streams = + ntohs(peer_init->init_hdr.num_outbound_streams); + asoc->peer.i.num_inbound_streams = + ntohs(peer_init->init_hdr.num_inbound_streams); + asoc->peer.i.initial_tsn = + ntohl(peer_init->init_hdr.initial_tsn); + + /* Apply the upper bounds for output streams based on peer's + * number of inbound streams. + */ + if (asoc->c.sinit_num_ostreams > + ntohs(peer_init->init_hdr.num_inbound_streams)) { + asoc->c.sinit_num_ostreams = + ntohs(peer_init->init_hdr.num_inbound_streams); + } + + if (asoc->c.sinit_max_instreams > + ntohs(peer_init->init_hdr.num_outbound_streams)) { + asoc->c.sinit_max_instreams = + ntohs(peer_init->init_hdr.num_outbound_streams); + } + + /* Copy Initiation tag from INIT to VT_peer in cookie. */ + asoc->c.peer_vtag = asoc->peer.i.init_tag; + + /* Peer Rwnd : Current calculated value of the peer's rwnd. */ + asoc->peer.rwnd = asoc->peer.i.a_rwnd; + + /* Copy cookie in case we need to resend COOKIE-ECHO. 
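+ * At this point asoc->peer.cookie still points into the received
+ * INIT ACK chunk (set in the SCTP_PARAM_STATE_COOKIE case of
+ * sctp_process_param() below), so it is duplicated into memory that
+ * the association owns.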
*/ + cookie = asoc->peer.cookie; + if (cookie) { + asoc->peer.cookie = kmalloc(asoc->peer.cookie_len, gfp); + if (!asoc->peer.cookie) + goto clean_up; + memcpy(asoc->peer.cookie, cookie, asoc->peer.cookie_len); + } + + /* RFC 2960 7.2.1 The initial value of ssthresh MAY be arbitrarily + * high (for example, implementations MAY use the size of the receiver + * advertised window). + */ + list_for_each(pos, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + transport->ssthresh = asoc->peer.i.a_rwnd; + } + + /* Set up the TSN tracking pieces. */ + sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_SIZE, + asoc->peer.i.initial_tsn); + + /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number + * + * The stream sequence number in all the streams shall start + * from 0 when the association is established. Also, when the + * stream sequence number reaches the value 65535 the next + * stream sequence number shall be set to 0. + */ + + /* Allocate storage for the negotiated streams if it is not a temporary * association. + */ + if (!asoc->temp) { + int assoc_id; + int error; + + asoc->ssnmap = sctp_ssnmap_new(asoc->c.sinit_max_instreams, + asoc->c.sinit_num_ostreams, gfp); + if (!asoc->ssnmap) + goto clean_up; + + retry: + if (unlikely(!idr_pre_get(&sctp_assocs_id, gfp))) + goto clean_up; + spin_lock_bh(&sctp_assocs_id_lock); + error = idr_get_new_above(&sctp_assocs_id, (void *)asoc, 1, + &assoc_id); + spin_unlock_bh(&sctp_assocs_id_lock); + if (error == -EAGAIN) + goto retry; + else if (error) + goto clean_up; + + asoc->assoc_id = (sctp_assoc_t) assoc_id; + } + + /* ADDIP Section 4.1 ASCONF Chunk Procedures + * + * When an endpoint has an ASCONF signaled change to be sent to the + * remote endpoint it should do the following: + * ... + * A2) A serial number should be assigned to the Chunk. The serial + * number should be a monotonically increasing number. All serial + * numbers are defined to be initialized at the start of the + * association to the same value as the Initial TSN. + */ + asoc->peer.addip_serial = asoc->peer.i.initial_tsn - 1; + return 1; + +clean_up: + /* Release the transport structures. */ + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, transports); + list_del_init(pos); + sctp_transport_free(transport); + } +nomem: + return 0; +} + + +/* Update asoc with the option described in param. + * + * RFC2960 3.3.2.1 Optional/Variable Length Parameters in INIT + * + * asoc is the association to update. + * param is the variable length parameter to use for update. + * cid tells us if this is an INIT, INIT ACK or COOKIE ECHO. + * If the current packet is an INIT we want to minimize the amount of + * work we do. In particular, we should not build transport + * structures for the addresses. + */ +static int sctp_process_param(struct sctp_association *asoc, + union sctp_params param, + const union sctp_addr *peer_addr, + int gfp) +{ + union sctp_addr addr; + int i; + __u16 sat; + int retval = 1; + sctp_scope_t scope; + time_t stale; + struct sctp_af *af; + + /* We maintain all INIT parameters in network byte order all the + * time. This allows us to not worry about whether the parameters + * came from a fresh INIT, and INIT ACK, or were stored in a cookie. + */ + switch (param.p->type) { + case SCTP_PARAM_IPV6_ADDRESS: + if (PF_INET6 != asoc->base.sk->sk_family) + break; + /* Fall through. 
*/ + case SCTP_PARAM_IPV4_ADDRESS: + af = sctp_get_af_specific(param_type2af(param.p->type)); + af->from_addr_param(&addr, param.addr, asoc->peer.port, 0); + scope = sctp_scope(peer_addr); + if (sctp_in_scope(&addr, scope)) + if (!sctp_assoc_add_peer(asoc, &addr, gfp)) + return 0; + break; + + case SCTP_PARAM_COOKIE_PRESERVATIVE: + if (!sctp_cookie_preserve_enable) + break; + + stale = ntohl(param.life->lifespan_increment); + + /* Suggested Cookie Life span increment's unit is msec, + * (1/1000sec). + */ + asoc->cookie_life.tv_sec += stale / 1000; + asoc->cookie_life.tv_usec += (stale % 1000) * 1000; + break; + + case SCTP_PARAM_HOST_NAME_ADDRESS: + SCTP_DEBUG_PRINTK("unimplemented SCTP_HOST_NAME_ADDRESS\n"); + break; + + case SCTP_PARAM_SUPPORTED_ADDRESS_TYPES: + /* Turn off the default values first so we'll know which + * ones are really set by the peer. + */ + asoc->peer.ipv4_address = 0; + asoc->peer.ipv6_address = 0; + + /* Cycle through address types; avoid divide by 0. */ + sat = ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + if (sat) + sat /= sizeof(__u16); + + for (i = 0; i < sat; ++i) { + switch (param.sat->types[i]) { + case SCTP_PARAM_IPV4_ADDRESS: + asoc->peer.ipv4_address = 1; + break; + + case SCTP_PARAM_IPV6_ADDRESS: + asoc->peer.ipv6_address = 1; + break; + + case SCTP_PARAM_HOST_NAME_ADDRESS: + asoc->peer.hostname_address = 1; + break; + + default: /* Just ignore anything else. */ + break; + }; + } + break; + + case SCTP_PARAM_STATE_COOKIE: + asoc->peer.cookie_len = + ntohs(param.p->length) - sizeof(sctp_paramhdr_t); + asoc->peer.cookie = param.cookie->body; + break; + + case SCTP_PARAM_HEARTBEAT_INFO: + /* Would be odd to receive, but it causes no problems. */ + break; + + case SCTP_PARAM_UNRECOGNIZED_PARAMETERS: + /* Rejected during verify stage. */ + break; + + case SCTP_PARAM_ECN_CAPABLE: + asoc->peer.ecn_capable = 1; + break; + + case SCTP_PARAM_ADAPTION_LAYER_IND: + asoc->peer.adaption_ind = param.aind->adaption_ind; + break; + + case SCTP_PARAM_FWD_TSN_SUPPORT: + if (sctp_prsctp_enable) { + asoc->peer.prsctp_capable = 1; + break; + } + /* Fall Through */ + default: + /* Any unrecognized parameters should have been caught + * and handled by sctp_verify_param() which should be + * called prior to this routine. Simply log the error + * here. + */ + SCTP_DEBUG_PRINTK("Ignoring param: %d for association %p.\n", + ntohs(param.p->type), asoc); + break; + }; + + return retval; +} + +/* Select a new verification tag. */ +__u32 sctp_generate_tag(const struct sctp_endpoint *ep) +{ + /* I believe that this random number generator complies with RFC1750. + * A tag of 0 is reserved for special cases (e.g. INIT). + */ + __u32 x; + + do { + get_random_bytes(&x, sizeof(__u32)); + } while (x == 0); + + return x; +} + +/* Select an initial TSN to send during startup. 
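[Editorial sketch, not part of the patch] sctp_generate_tag() above rejects zero because a zero verification tag is reserved for special cases such as INIT. The same reject-and-retry loop in portable user-space C, with rand() standing in for the kernel's entropy source purely for illustration:

    #include <stdint.h>
    #include <stdlib.h>

    /* Draw a non-zero 32-bit tag; zero is reserved.  rand() is only a
     * stand-in for a real random source. */
    static uint32_t generate_tag(void)
    {
            uint32_t x;

            do {
                    x = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
            } while (x == 0);

            return x;
    }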
*/ +__u32 sctp_generate_tsn(const struct sctp_endpoint *ep) +{ + __u32 retval; + + get_random_bytes(&retval, sizeof(__u32)); + return retval; +} + +/* + * ADDIP 3.1.1 Address Configuration Change Chunk (ASCONF) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0xC1 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Serial Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter #1 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / .... / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter #N | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Address Parameter and other parameter will not be wrapped in this function + */ +static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc, + union sctp_addr *addr, + int vparam_len) +{ + sctp_addiphdr_t asconf; + struct sctp_chunk *retval; + int length = sizeof(asconf) + vparam_len; + union sctp_addr_param addrparam; + int addrlen; + struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); + + addrlen = af->to_addr_param(addr, &addrparam); + if (!addrlen) + return NULL; + length += addrlen; + + /* Create the chunk. */ + retval = sctp_make_chunk(asoc, SCTP_CID_ASCONF, 0, length); + if (!retval) + return NULL; + + asconf.serial = htonl(asoc->addip_serial++); + + retval->subh.addip_hdr = + sctp_addto_chunk(retval, sizeof(asconf), &asconf); + retval->param_hdr.v = + sctp_addto_chunk(retval, addrlen, &addrparam); + + return retval; +} + +/* ADDIP + * 3.2.1 Add IP Address + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0xC001 | Length = Variable | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF-Request Correlation ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * 3.2.2 Delete IP Address + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0xC002 | Length = Variable | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF-Request Correlation ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + */ +struct sctp_chunk *sctp_make_asconf_update_ip(struct sctp_association *asoc, + union sctp_addr *laddr, + struct sockaddr *addrs, + int addrcnt, + __u16 flags) +{ + sctp_addip_param_t param; + struct sctp_chunk *retval; + union sctp_addr_param addr_param; + union sctp_addr *addr; + void *addr_buf; + struct sctp_af *af; + int paramlen = sizeof(param); + int addr_param_len = 0; + int totallen = 0; + int i; + + /* Get total length of all the address parameters. 
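[Editorial sketch, not part of the patch] The ASCONF layout pictured above is a common chunk header (type 0xC1), a 32-bit serial number, then the wrapped Address Parameter and the per-request TLVs, as sctp_make_asconf() assembles. A sketch of serializing just that fixed prefix, with illustrative struct names and everything in network byte order:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <string.h>

    /* Illustrative on-wire prefix: chunk header followed by the ADDIP
     * serial number; the address parameter and TLVs follow afterwards. */
    struct chunk_hdr { uint8_t type; uint8_t flags; uint16_t length; };
    struct addip_hdr { uint32_t serial; };

    static size_t build_asconf_prefix(uint8_t *buf, uint32_t serial,
                                      uint16_t chunk_len)
    {
            struct chunk_hdr ch = { 0xC1, 0, htons(chunk_len) };
            struct addip_hdr ah = { htonl(serial) };

            memcpy(buf, &ch, sizeof(ch));
            memcpy(buf + sizeof(ch), &ah, sizeof(ah));
            return sizeof(ch) + sizeof(ah);
    }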
*/ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + addr_param_len = af->to_addr_param(addr, &addr_param); + + totallen += paramlen; + totallen += addr_param_len; + + addr_buf += af->sockaddr_len; + } + + /* Create an asconf chunk with the required length. */ + retval = sctp_make_asconf(asoc, laddr, totallen); + if (!retval) + return NULL; + + /* Add the address parameters to the asconf chunk. */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + addr_param_len = af->to_addr_param(addr, &addr_param); + param.param_hdr.type = flags; + param.param_hdr.length = htons(paramlen + addr_param_len); + param.crr_id = i; + + sctp_addto_chunk(retval, paramlen, ¶m); + sctp_addto_chunk(retval, addr_param_len, &addr_param); + + addr_buf += af->sockaddr_len; + } + return retval; +} + +/* ADDIP + * 3.2.4 Set Primary IP Address + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type =0xC004 | Length = Variable | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF-Request Correlation ID | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Address Parameter | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Create an ASCONF chunk with Set Primary IP address parameter. + */ +struct sctp_chunk *sctp_make_asconf_set_prim(struct sctp_association *asoc, + union sctp_addr *addr) +{ + sctp_addip_param_t param; + struct sctp_chunk *retval; + int len = sizeof(param); + union sctp_addr_param addrparam; + int addrlen; + struct sctp_af *af = sctp_get_af_specific(addr->v4.sin_family); + + addrlen = af->to_addr_param(addr, &addrparam); + if (!addrlen) + return NULL; + len += addrlen; + + /* Create the chunk and make asconf header. */ + retval = sctp_make_asconf(asoc, addr, len); + if (!retval) + return NULL; + + param.param_hdr.type = SCTP_PARAM_SET_PRIMARY; + param.param_hdr.length = htons(len); + param.crr_id = 0; + + sctp_addto_chunk(retval, sizeof(param), ¶m); + sctp_addto_chunk(retval, addrlen, &addrparam); + + return retval; +} + +/* ADDIP 3.1.2 Address Configuration Acknowledgement Chunk (ASCONF-ACK) + * 0 1 2 3 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Type = 0x80 | Chunk Flags | Chunk Length | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | Serial Number | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter Response#1 | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * \ \ + * / .... / + * \ \ + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * | ASCONF Parameter Response#N | + * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + * + * Create an ASCONF_ACK chunk with enough space for the parameter responses. + */ +static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *asoc, + __u32 serial, int vparam_len) +{ + sctp_addiphdr_t asconf; + struct sctp_chunk *retval; + int length = sizeof(asconf) + vparam_len; + + /* Create the chunk. 
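[Editorial sketch, not part of the patch] sctp_make_asconf_update_ip() above makes two passes over the address list: one to size the chunk, one to emit the ADD/DEL parameters. A small sketch of the sizing pass, assuming made-up fixed per-family lengths in place of sizeof(sctp_addip_param_t) and the real address parameter sizes:

    #include <stddef.h>

    /* Stand-in lengths; only the accumulation pattern matters: every
     * address costs one ASCONF parameter header plus one wrapped
     * address parameter. */
    #define ADDIP_PARAM_HDR_LEN  8
    #define ADDR_PARAM_V4_LEN    8
    #define ADDR_PARAM_V6_LEN   20

    static size_t asconf_vparam_len(const int *is_ipv6, size_t addrcnt)
    {
            size_t total = 0;
            size_t i;

            for (i = 0; i < addrcnt; i++) {
                    total += ADDIP_PARAM_HDR_LEN;
                    total += is_ipv6[i] ? ADDR_PARAM_V6_LEN : ADDR_PARAM_V4_LEN;
            }
            return total;
    }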
*/ + retval = sctp_make_chunk(asoc, SCTP_CID_ASCONF_ACK, 0, length); + if (!retval) + return NULL; + + asconf.serial = htonl(serial); + + retval->subh.addip_hdr = + sctp_addto_chunk(retval, sizeof(asconf), &asconf); + + return retval; +} + +/* Add response parameters to an ASCONF_ACK chunk. */ +static void sctp_add_asconf_response(struct sctp_chunk *chunk, __u32 crr_id, + __u16 err_code, sctp_addip_param_t *asconf_param) +{ + sctp_addip_param_t ack_param; + sctp_errhdr_t err_param; + int asconf_param_len = 0; + int err_param_len = 0; + __u16 response_type; + + if (SCTP_ERROR_NO_ERROR == err_code) { + response_type = SCTP_PARAM_SUCCESS_REPORT; + } else { + response_type = SCTP_PARAM_ERR_CAUSE; + err_param_len = sizeof(err_param); + if (asconf_param) + asconf_param_len = + ntohs(asconf_param->param_hdr.length); + } + + /* Add Success Indication or Error Cause Indication parameter. */ + ack_param.param_hdr.type = response_type; + ack_param.param_hdr.length = htons(sizeof(ack_param) + + err_param_len + + asconf_param_len); + ack_param.crr_id = crr_id; + sctp_addto_chunk(chunk, sizeof(ack_param), &ack_param); + + if (SCTP_ERROR_NO_ERROR == err_code) + return; + + /* Add Error Cause parameter. */ + err_param.cause = err_code; + err_param.length = htons(err_param_len + asconf_param_len); + sctp_addto_chunk(chunk, err_param_len, &err_param); + + /* Add the failed TLV copied from ASCONF chunk. */ + if (asconf_param) + sctp_addto_chunk(chunk, asconf_param_len, asconf_param); +} + +/* Process a asconf parameter. */ +static __u16 sctp_process_asconf_param(struct sctp_association *asoc, + struct sctp_chunk *asconf, + sctp_addip_param_t *asconf_param) +{ + struct sctp_transport *peer; + struct sctp_af *af; + union sctp_addr addr; + struct list_head *pos; + union sctp_addr_param *addr_param; + + addr_param = (union sctp_addr_param *) + ((void *)asconf_param + sizeof(sctp_addip_param_t)); + + af = sctp_get_af_specific(param_type2af(addr_param->v4.param_hdr.type)); + if (unlikely(!af)) + return SCTP_ERROR_INV_PARAM; + + af->from_addr_param(&addr, addr_param, asoc->peer.port, 0); + switch (asconf_param->param_hdr.type) { + case SCTP_PARAM_ADD_IP: + /* ADDIP 4.3 D9) If an endpoint receives an ADD IP address + * request and does not have the local resources to add this + * new address to the association, it MUST return an Error + * Cause TLV set to the new error code 'Operation Refused + * Due to Resource Shortage'. + */ + + peer = sctp_assoc_add_peer(asoc, &addr, GFP_ATOMIC); + if (!peer) + return SCTP_ERROR_RSRC_LOW; + + /* Start the heartbeat timer. */ + if (!mod_timer(&peer->hb_timer, sctp_transport_timeout(peer))) + sctp_transport_hold(peer); + break; + case SCTP_PARAM_DEL_IP: + /* ADDIP 4.3 D7) If a request is received to delete the + * last remaining IP address of a peer endpoint, the receiver + * MUST send an Error Cause TLV with the error cause set to the + * new error code 'Request to Delete Last Remaining IP Address'. + */ + pos = asoc->peer.transport_addr_list.next; + if (pos->next == &asoc->peer.transport_addr_list) + return SCTP_ERROR_DEL_LAST_IP; + + /* ADDIP 4.3 D8) If a request is received to delete an IP + * address which is also the source address of the IP packet + * which contained the ASCONF chunk, the receiver MUST reject + * this request. 
To reject the request the receiver MUST send + * an Error Cause TLV set to the new error code 'Request to + * Delete Source IP Address' + */ + if (sctp_cmp_addr_exact(sctp_source(asconf), &addr)) + return SCTP_ERROR_DEL_SRC_IP; + + sctp_assoc_del_peer(asoc, &addr); + break; + case SCTP_PARAM_SET_PRIMARY: + peer = sctp_assoc_lookup_paddr(asoc, &addr); + if (!peer) + return SCTP_ERROR_INV_PARAM; + + sctp_assoc_set_primary(asoc, peer); + break; + default: + return SCTP_ERROR_INV_PARAM; + break; + } + + return SCTP_ERROR_NO_ERROR; +} + +/* Process an incoming ASCONF chunk with the next expected serial no. and + * return an ASCONF_ACK chunk to be sent in response. + */ +struct sctp_chunk *sctp_process_asconf(struct sctp_association *asoc, + struct sctp_chunk *asconf) +{ + sctp_addiphdr_t *hdr; + union sctp_addr_param *addr_param; + sctp_addip_param_t *asconf_param; + struct sctp_chunk *asconf_ack; + + __u16 err_code; + int length = 0; + int chunk_len = asconf->skb->len; + __u32 serial; + int all_param_pass = 1; + + hdr = (sctp_addiphdr_t *)asconf->skb->data; + serial = ntohl(hdr->serial); + + /* Skip the addiphdr and store a pointer to address parameter. */ + length = sizeof(sctp_addiphdr_t); + addr_param = (union sctp_addr_param *)(asconf->skb->data + length); + chunk_len -= length; + + /* Skip the address parameter and store a pointer to the first + * asconf paramter. + */ + length = ntohs(addr_param->v4.param_hdr.length); + asconf_param = (sctp_addip_param_t *)((void *)addr_param + length); + chunk_len -= length; + + /* create an ASCONF_ACK chunk. + * Based on the definitions of parameters, we know that the size of + * ASCONF_ACK parameters are less than or equal to the twice of ASCONF + * paramters. + */ + asconf_ack = sctp_make_asconf_ack(asoc, serial, chunk_len * 2); + if (!asconf_ack) + goto done; + + /* Process the TLVs contained within the ASCONF chunk. */ + while (chunk_len > 0) { + err_code = sctp_process_asconf_param(asoc, asconf, + asconf_param); + /* ADDIP 4.1 A7) + * If an error response is received for a TLV parameter, + * all TLVs with no response before the failed TLV are + * considered successful if not reported. All TLVs after + * the failed response are considered unsuccessful unless + * a specific success indication is present for the parameter. + */ + if (SCTP_ERROR_NO_ERROR != err_code) + all_param_pass = 0; + + if (!all_param_pass) + sctp_add_asconf_response(asconf_ack, + asconf_param->crr_id, err_code, + asconf_param); + + /* ADDIP 4.3 D11) When an endpoint receiving an ASCONF to add + * an IP address sends an 'Out of Resource' in its response, it + * MUST also fail any subsequent add or delete requests bundled + * in the ASCONF. + */ + if (SCTP_ERROR_RSRC_LOW == err_code) + goto done; + + /* Move to the next ASCONF param. */ + length = ntohs(asconf_param->param_hdr.length); + asconf_param = (sctp_addip_param_t *)((void *)asconf_param + + length); + chunk_len -= length; + } + +done: + asoc->peer.addip_serial++; + + /* If we are sending a new ASCONF_ACK hold a reference to it in assoc + * after freeing the reference to old asconf ack if any. + */ + if (asconf_ack) { + if (asoc->addip_last_asconf_ack) + sctp_chunk_free(asoc->addip_last_asconf_ack); + + sctp_chunk_hold(asconf_ack); + asoc->addip_last_asconf_ack = asconf_ack; + } + + return asconf_ack; +} + +/* Process a asconf parameter that is successfully acked. 
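[Editorial sketch, not part of the patch] sctp_process_asconf() above walks the request TLVs by repeatedly reading a type/length header and advancing by the network-byte-order length. A bounds-checked user-space version of the same walk; names, and the omission of 4-byte padding handling and alignment concerns, are illustrative simplifications:

    #include <arpa/inet.h>
    #include <stddef.h>
    #include <stdint.h>

    struct param_hdr { uint16_t type; uint16_t length; };

    /* Call handle() for each well-formed TLV; stop on the first parameter
     * whose length is shorter than a header or overruns the buffer.
     * Assumes the buffer is suitably aligned for the header struct. */
    static void walk_params(const uint8_t *p, size_t remaining,
                            void (*handle)(const struct param_hdr *))
    {
            while (remaining >= sizeof(struct param_hdr)) {
                    const struct param_hdr *hdr = (const struct param_hdr *)p;
                    size_t len = ntohs(hdr->length);

                    if (len < sizeof(*hdr) || len > remaining)
                            break;
                    handle(hdr);
                    p += len;
                    remaining -= len;
            }
    }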
*/ +static int sctp_asconf_param_success(struct sctp_association *asoc, + sctp_addip_param_t *asconf_param) +{ + struct sctp_af *af; + union sctp_addr addr; + struct sctp_bind_addr *bp = &asoc->base.bind_addr; + union sctp_addr_param *addr_param; + struct list_head *pos; + struct sctp_transport *transport; + int retval = 0; + + addr_param = (union sctp_addr_param *) + ((void *)asconf_param + sizeof(sctp_addip_param_t)); + + /* We have checked the packet before, so we do not check again. */ + af = sctp_get_af_specific(param_type2af(addr_param->v4.param_hdr.type)); + af->from_addr_param(&addr, addr_param, bp->port, 0); + + switch (asconf_param->param_hdr.type) { + case SCTP_PARAM_ADD_IP: + sctp_local_bh_disable(); + sctp_write_lock(&asoc->base.addr_lock); + retval = sctp_add_bind_addr(bp, &addr, GFP_ATOMIC); + sctp_write_unlock(&asoc->base.addr_lock); + sctp_local_bh_enable(); + break; + case SCTP_PARAM_DEL_IP: + sctp_local_bh_disable(); + sctp_write_lock(&asoc->base.addr_lock); + retval = sctp_del_bind_addr(bp, &addr); + sctp_write_unlock(&asoc->base.addr_lock); + sctp_local_bh_enable(); + list_for_each(pos, &asoc->peer.transport_addr_list) { + transport = list_entry(pos, struct sctp_transport, + transports); + sctp_transport_route(transport, NULL, + sctp_sk(asoc->base.sk)); + } + break; + default: + break; + } + + return retval; +} + +/* Get the corresponding ASCONF response error code from the ASCONF_ACK chunk + * for the given asconf parameter. If there is no response for this parameter, + * return the error code based on the third argument 'no_err'. + * ADDIP 4.1 + * A7) If an error response is received for a TLV parameter, all TLVs with no + * response before the failed TLV are considered successful if not reported. + * All TLVs after the failed response are considered unsuccessful unless a + * specific success indication is present for the parameter. + */ +static __u16 sctp_get_asconf_response(struct sctp_chunk *asconf_ack, + sctp_addip_param_t *asconf_param, + int no_err) +{ + sctp_addip_param_t *asconf_ack_param; + sctp_errhdr_t *err_param; + int length; + int asconf_ack_len = asconf_ack->skb->len; + __u16 err_code; + + if (no_err) + err_code = SCTP_ERROR_NO_ERROR; + else + err_code = SCTP_ERROR_REQ_REFUSED; + + /* Skip the addiphdr from the asconf_ack chunk and store a pointer to + * the first asconf_ack parameter. + */ + length = sizeof(sctp_addiphdr_t); + asconf_ack_param = (sctp_addip_param_t *)(asconf_ack->skb->data + + length); + asconf_ack_len -= length; + + while (asconf_ack_len > 0) { + if (asconf_ack_param->crr_id == asconf_param->crr_id) { + switch(asconf_ack_param->param_hdr.type) { + case SCTP_PARAM_SUCCESS_REPORT: + return SCTP_ERROR_NO_ERROR; + case SCTP_PARAM_ERR_CAUSE: + length = sizeof(sctp_addip_param_t); + err_param = (sctp_errhdr_t *) + ((void *)asconf_ack_param + length); + asconf_ack_len -= length; + if (asconf_ack_len > 0) + return err_param->cause; + else + return SCTP_ERROR_INV_PARAM; + break; + default: + return SCTP_ERROR_INV_PARAM; + } + } + + length = ntohs(asconf_ack_param->param_hdr.length); + asconf_ack_param = (sctp_addip_param_t *) + ((void *)asconf_ack_param + length); + asconf_ack_len -= length; + } + + return err_code; +} + +/* Process an incoming ASCONF_ACK chunk against the cached last ASCONF chunk. 
*/ +int sctp_process_asconf_ack(struct sctp_association *asoc, + struct sctp_chunk *asconf_ack) +{ + struct sctp_chunk *asconf = asoc->addip_last_asconf; + union sctp_addr_param *addr_param; + sctp_addip_param_t *asconf_param; + int length = 0; + int asconf_len = asconf->skb->len; + int all_param_pass = 0; + int no_err = 1; + int retval = 0; + __u16 err_code = SCTP_ERROR_NO_ERROR; + + /* Skip the chunkhdr and addiphdr from the last asconf sent and store + * a pointer to address parameter. + */ + length = sizeof(sctp_addip_chunk_t); + addr_param = (union sctp_addr_param *)(asconf->skb->data + length); + asconf_len -= length; + + /* Skip the address parameter in the last asconf sent and store a + * pointer to the first asconf paramter. + */ + length = ntohs(addr_param->v4.param_hdr.length); + asconf_param = (sctp_addip_param_t *)((void *)addr_param + length); + asconf_len -= length; + + /* ADDIP 4.1 + * A8) If there is no response(s) to specific TLV parameter(s), and no + * failures are indicated, then all request(s) are considered + * successful. + */ + if (asconf_ack->skb->len == sizeof(sctp_addiphdr_t)) + all_param_pass = 1; + + /* Process the TLVs contained in the last sent ASCONF chunk. */ + while (asconf_len > 0) { + if (all_param_pass) + err_code = SCTP_ERROR_NO_ERROR; + else { + err_code = sctp_get_asconf_response(asconf_ack, + asconf_param, + no_err); + if (no_err && (SCTP_ERROR_NO_ERROR != err_code)) + no_err = 0; + } + + switch (err_code) { + case SCTP_ERROR_NO_ERROR: + retval = sctp_asconf_param_success(asoc, asconf_param); + break; + + case SCTP_ERROR_RSRC_LOW: + retval = 1; + break; + + case SCTP_ERROR_INV_PARAM: + /* Disable sending this type of asconf parameter in + * future. + */ + asoc->peer.addip_disabled_mask |= + asconf_param->param_hdr.type; + break; + + case SCTP_ERROR_REQ_REFUSED: + case SCTP_ERROR_DEL_LAST_IP: + case SCTP_ERROR_DEL_SRC_IP: + default: + break; + } + + /* Skip the processed asconf parameter and move to the next + * one. + */ + length = ntohs(asconf_param->param_hdr.length); + asconf_param = (sctp_addip_param_t *)((void *)asconf_param + + length); + asconf_len -= length; + } + + /* Free the cached last sent asconf chunk. */ + sctp_chunk_free(asconf); + asoc->addip_last_asconf = NULL; + + /* Send the next asconf chunk from the addip chunk queue. */ + asconf = (struct sctp_chunk *)__skb_dequeue(&asoc->addip_chunks); + if (asconf) { + /* Hold the chunk until an ASCONF_ACK is received. */ + sctp_chunk_hold(asconf); + if (sctp_primitive_ASCONF(asoc, asconf)) + sctp_chunk_free(asconf); + else + asoc->addip_last_asconf = asconf; + } + + return retval; +} + +/* Make a FWD TSN chunk. */ +struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc, + __u32 new_cum_tsn, size_t nstreams, + struct sctp_fwdtsn_skip *skiplist) +{ + struct sctp_chunk *retval = NULL; + struct sctp_fwdtsn_chunk *ftsn_chunk; + struct sctp_fwdtsn_hdr ftsn_hdr; + struct sctp_fwdtsn_skip skip; + size_t hint; + int i; + + hint = (nstreams + 1) * sizeof(__u32); + + /* Maybe set the T-bit if we have no association. 
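[Editorial sketch, not part of the patch] sctp_process_asconf_ack() above reacts to each TLV's result: success applies the address change locally, a resource-shortage error is surfaced to the caller, and an invalid-parameter error disables that request type for the rest of the association. The same decision as a compact sketch; the constants and the bitmask use are stand-ins, not the kernel's:

    #include <stdint.h>

    enum ack_code { ACK_OK, ACK_RSRC_LOW, ACK_INV_PARAM, ACK_OTHER };

    /* Returns non-zero when the caller should treat the exchange as
     * failed, and reports via out-parameters what to do locally. */
    static int handle_ack_code(enum ack_code code, uint16_t param_type,
                               uint16_t *disabled_mask, int *apply_locally)
    {
            *apply_locally = 0;

            switch (code) {
            case ACK_OK:
                    *apply_locally = 1;
                    return 0;
            case ACK_RSRC_LOW:
                    return 1;
            case ACK_INV_PARAM:
                    *disabled_mask |= param_type;
                    return 0;
            default:
                    return 0;
            }
    }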
*/ + retval = sctp_make_chunk(asoc, SCTP_CID_FWD_TSN, 0, hint); + + if (!retval) + return NULL; + + ftsn_chunk = (struct sctp_fwdtsn_chunk *)retval->subh.fwdtsn_hdr; + + ftsn_hdr.new_cum_tsn = htonl(new_cum_tsn); + retval->subh.fwdtsn_hdr = + sctp_addto_chunk(retval, sizeof(ftsn_hdr), &ftsn_hdr); + + for (i = 0; i < nstreams; i++) { + skip.stream = skiplist[i].stream; + skip.ssn = skiplist[i].ssn; + sctp_addto_chunk(retval, sizeof(skip), &skip); + } + + return retval; +} diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c new file mode 100644 index 000000000000..f65fa441952f --- /dev/null +++ b/net/sctp/sm_sideeffect.c @@ -0,0 +1,1395 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions work with the state functions in sctp_sm_statefuns.c + * to implement that state operations. These functions implement the + * steps which require modifying existing data structures. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Hui Huang + * Dajiang Zhang + * Daisy Chang + * Sridhar Samudrala + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int sctp_cmd_interpreter(sctp_event_t event_type, + sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + int gfp); +static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + int gfp); + +/******************************************************************** + * Helper functions + ********************************************************************/ + +/* A helper function for delayed processing of INET ECN CE bit. */ +static void sctp_do_ecn_ce_work(struct sctp_association *asoc, + __u32 lowest_tsn) +{ + /* Save the TSN away for comparison when we receive CWR */ + + asoc->last_ecne_tsn = lowest_tsn; + asoc->need_ecne = 1; +} + +/* Helper function for delayed processing of SCTP ECNE chunk. 
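[Editorial sketch, not part of the patch] sctp_make_fwdtsn() above lays the chunk body out as the new cumulative TSN followed by one (stream, SSN) pair per skipped stream. A user-space sketch of that serialization, assuming host-order inputs and illustrative type names:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <string.h>

    struct fwd_skip { uint16_t stream; uint16_t ssn; };

    /* Write the FWD-TSN payload into buf and return its length; all
     * fields are converted to network byte order here. */
    static size_t build_fwdtsn(uint8_t *buf, uint32_t new_cum_tsn,
                               const struct fwd_skip *skips, size_t n)
    {
            uint32_t tsn = htonl(new_cum_tsn);
            size_t off = 0;
            size_t i;

            memcpy(buf + off, &tsn, sizeof(tsn));
            off += sizeof(tsn);

            for (i = 0; i < n; i++) {
                    struct fwd_skip s = { htons(skips[i].stream),
                                          htons(skips[i].ssn) };

                    memcpy(buf + off, &s, sizeof(s));
                    off += sizeof(s);
            }
            return off;
    }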
*/ +/* RFC 2960 Appendix A + * + * RFC 2481 details a specific bit for a sender to send in + * the header of its next outbound TCP segment to indicate to + * its peer that it has reduced its congestion window. This + * is termed the CWR bit. For SCTP the same indication is made + * by including the CWR chunk. This chunk contains one data + * element, i.e. the TSN number that was sent in the ECNE chunk. + * This element represents the lowest TSN number in the datagram + * that was originally marked with the CE bit. + */ +static struct sctp_chunk *sctp_do_ecn_ecne_work(struct sctp_association *asoc, + __u32 lowest_tsn, + struct sctp_chunk *chunk) +{ + struct sctp_chunk *repl; + + /* Our previously transmitted packet ran into some congestion + * so we should take action by reducing cwnd and ssthresh + * and then ACK our peer that we we've done so by + * sending a CWR. + */ + + /* First, try to determine if we want to actually lower + * our cwnd variables. Only lower them if the ECNE looks more + * recent than the last response. + */ + if (TSN_lt(asoc->last_cwr_tsn, lowest_tsn)) { + struct sctp_transport *transport; + + /* Find which transport's congestion variables + * need to be adjusted. + */ + transport = sctp_assoc_lookup_tsn(asoc, lowest_tsn); + + /* Update the congestion variables. */ + if (transport) + sctp_transport_lower_cwnd(transport, + SCTP_LOWER_CWND_ECNE); + asoc->last_cwr_tsn = lowest_tsn; + } + + /* Always try to quiet the other end. In case of lost CWR, + * resend last_cwr_tsn. + */ + repl = sctp_make_cwr(asoc, asoc->last_cwr_tsn, chunk); + + /* If we run out of memory, it will look like a lost CWR. We'll + * get back in sync eventually. + */ + return repl; +} + +/* Helper function to do delayed processing of ECN CWR chunk. */ +static void sctp_do_ecn_cwr_work(struct sctp_association *asoc, + __u32 lowest_tsn) +{ + /* Turn off ECNE getting auto-prepended to every outgoing + * packet + */ + asoc->need_ecne = 0; +} + +/* Generate SACK if necessary. We call this at the end of a packet. */ +static int sctp_gen_sack(struct sctp_association *asoc, int force, + sctp_cmd_seq_t *commands) +{ + __u32 ctsn, max_tsn_seen; + struct sctp_chunk *sack; + int error = 0; + + if (force) + asoc->peer.sack_needed = 1; + + ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); + + /* From 12.2 Parameters necessary per association (i.e. the TCB): + * + * Ack State : This flag indicates if the next received packet + * : is to be responded to with a SACK. ... + * : When DATA chunks are out of order, SACK's + * : are not delayed (see Section 6). + * + * [This is actually not mentioned in Section 6, but we + * implement it here anyway. --piggy] + */ + if (max_tsn_seen != ctsn) + asoc->peer.sack_needed = 1; + + /* From 6.2 Acknowledgement on Reception of DATA Chunks: + * + * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, + * an acknowledgement SHOULD be generated for at least every + * second packet (not every second DATA chunk) received, and + * SHOULD be generated within 200 ms of the arrival of any + * unacknowledged DATA chunk. ... + */ + if (!asoc->peer.sack_needed) { + /* We will need a SACK for the next packet. */ + asoc->peer.sack_needed = 1; + goto out; + } else { + if (asoc->a_rwnd > asoc->rwnd) + asoc->a_rwnd = asoc->rwnd; + sack = sctp_make_sack(asoc); + if (!sack) + goto nomem; + + asoc->peer.sack_needed = 0; + + error = sctp_outq_tail(&asoc->outqueue, sack); + + /* Stop the SACK timer. 
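[Editorial sketch, not part of the patch] sctp_gen_sack() above sends immediately only when forced, when data arrived out of order (max TSN seen differs from the cumulative TSN), or when a packet is already owed an acknowledgement; otherwise it just arms the flag so the next packet is SACKed, implementing the every-second-packet rule. A compact user-space restatement of that decision with an illustrative field name:

    #include <stdint.h>

    struct sack_state { int sack_needed; };

    /* Returns 1 when a SACK should go out now, 0 when it is deferred to
     * the next packet. */
    static int should_send_sack_now(struct sack_state *s, int force,
                                    uint32_t ctsn, uint32_t max_tsn_seen)
    {
            if (force || max_tsn_seen != ctsn)
                    s->sack_needed = 1;

            if (!s->sack_needed) {
                    s->sack_needed = 1;     /* SACK the next packet */
                    return 0;
            }

            s->sack_needed = 0;
            return 1;
    }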
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + } +out: + return error; +nomem: + error = -ENOMEM; + return error; +} + +/* When the T3-RTX timer expires, it calls this function to create the + * relevant state machine event. + */ +void sctp_generate_t3_rtx_event(unsigned long peer) +{ + int error; + struct sctp_transport *transport = (struct sctp_transport *) peer; + struct sctp_association *asoc = transport->asoc; + + /* Check whether a task is in the sock. */ + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __FUNCTION__); + + /* Try again later. */ + if (!mod_timer(&transport->T3_rtx_timer, jiffies + (HZ/20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + /* Is this transport really dead and just waiting around for + * the timer to let go of the reference? + */ + if (transport->dead) + goto out_unlock; + + /* Run through the state machine. */ + error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX), + asoc->state, + asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + asoc->base.sk->sk_err = -error; + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_transport_put(transport); +} + +/* This is a sa interface for producing timeout events. It works + * for timeouts which use the association as their parameter. + */ +static void sctp_generate_timeout_event(struct sctp_association *asoc, + sctp_event_timeout_t timeout_type) +{ + int error = 0; + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy: timer %d\n", + __FUNCTION__, + timeout_type); + + /* Try again later. */ + if (!mod_timer(&asoc->timers[timeout_type], jiffies + (HZ/20))) + sctp_association_hold(asoc); + goto out_unlock; + } + + /* Is this association really dead and just waiting around for + * the timer to let go of the reference? + */ + if (asoc->base.dead) + goto out_unlock; + + /* Run through the state machine. 
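[Editorial sketch, not part of the patch] The T3-RTX handler above follows a pattern repeated by every timer in this file: if the socket is owned by user context, re-arm the timer for a short delay and keep a reference; if the object is already dead, fall through so only the final put runs. A schematic version with a deliberately made-up callback interface (none of these calls are kernel APIs):

    /* reschedule() mimics mod_timer(): it returns non-zero if the timer
     * was already pending, so a reference is taken only when a new expiry
     * was actually armed. */
    struct timer_ctx {
            int sock_busy;          /* owner is running in user context   */
            int dead;               /* object is only waiting to be freed */
    };

    static void on_timer_expiry(struct timer_ctx *ctx,
                                int (*reschedule)(struct timer_ctx *, long ms),
                                void (*hold)(struct timer_ctx *),
                                void (*run_state_machine)(struct timer_ctx *))
    {
            if (ctx->sock_busy) {
                    if (!reschedule(ctx, 50))   /* roughly HZ/20 */
                            hold(ctx);
                    return;
            }

            if (ctx->dead)
                    return;

            run_state_machine(ctx);
    }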
*/ + error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(timeout_type), + asoc->state, asoc->ep, asoc, + (void *)timeout_type, GFP_ATOMIC); + + if (error) + asoc->base.sk->sk_err = -error; + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_association_put(asoc); +} + +static void sctp_generate_t1_cookie_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_COOKIE); +} + +static void sctp_generate_t1_init_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T1_INIT); +} + +static void sctp_generate_t2_shutdown_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T2_SHUTDOWN); +} + +static void sctp_generate_t4_rto_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_T4_RTO); +} + +static void sctp_generate_t5_shutdown_guard_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *)data; + sctp_generate_timeout_event(asoc, + SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD); + +} /* sctp_generate_t5_shutdown_guard_event() */ + +static void sctp_generate_autoclose_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_AUTOCLOSE); +} + +/* Generate a heart beat event. If the sock is busy, reschedule. Make + * sure that the transport is still valid. + */ +void sctp_generate_heartbeat_event(unsigned long data) +{ + int error = 0; + struct sctp_transport *transport = (struct sctp_transport *) data; + struct sctp_association *asoc = transport->asoc; + + sctp_bh_lock_sock(asoc->base.sk); + if (sock_owned_by_user(asoc->base.sk)) { + SCTP_DEBUG_PRINTK("%s:Sock is busy.\n", __FUNCTION__); + + /* Try again later. */ + if (!mod_timer(&transport->hb_timer, jiffies + (HZ/20))) + sctp_transport_hold(transport); + goto out_unlock; + } + + /* Is this structure just waiting around for us to actually + * get destroyed? + */ + if (transport->dead) + goto out_unlock; + + error = sctp_do_sm(SCTP_EVENT_T_TIMEOUT, + SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT), + asoc->state, asoc->ep, asoc, + transport, GFP_ATOMIC); + + if (error) + asoc->base.sk->sk_err = -error; + +out_unlock: + sctp_bh_unlock_sock(asoc->base.sk); + sctp_transport_put(transport); +} + +/* Inject a SACK Timeout event into the state machine. */ +static void sctp_generate_sack_event(unsigned long data) +{ + struct sctp_association *asoc = (struct sctp_association *) data; + sctp_generate_timeout_event(asoc, SCTP_EVENT_TIMEOUT_SACK); +} + +sctp_timer_event_t *sctp_timer_events[SCTP_NUM_TIMEOUT_TYPES] = { + NULL, + sctp_generate_t1_cookie_event, + sctp_generate_t1_init_event, + sctp_generate_t2_shutdown_event, + NULL, + sctp_generate_t4_rto_event, + sctp_generate_t5_shutdown_guard_event, + sctp_generate_heartbeat_event, + sctp_generate_sack_event, + sctp_generate_autoclose_event, +}; + + +/* RFC 2960 8.2 Path Failure Detection + * + * When its peer endpoint is multi-homed, an endpoint should keep a + * error counter for each of the destination transport addresses of the + * peer endpoint. 
+ * + * Each time the T3-rtx timer expires on any address, or when a + * HEARTBEAT sent to an idle address is not acknowledged within a RTO, + * the error counter of that destination address will be incremented. + * When the value in the error counter exceeds the protocol parameter + * 'Path.Max.Retrans' of that destination address, the endpoint should + * mark the destination transport address as inactive, and a + * notification SHOULD be sent to the upper layer. + * + */ +static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, + struct sctp_transport *transport) +{ + /* The check for association's overall error counter exceeding the + * threshold is done in the state function. + */ + asoc->overall_error_count++; + + if (transport->active && + (transport->error_count++ >= transport->max_retrans)) { + SCTP_DEBUG_PRINTK("transport_strike: transport " + "IP:%d.%d.%d.%d failed.\n", + NIPQUAD(transport->ipaddr.v4.sin_addr)); + sctp_assoc_control_transport(asoc, transport, + SCTP_TRANSPORT_DOWN, + SCTP_FAILED_THRESHOLD); + } + + /* E2) For the destination address for which the timer + * expires, set RTO <- RTO * 2 ("back off the timer"). The + * maximum value discussed in rule C7 above (RTO.max) may be + * used to provide an upper bound to this doubling operation. + */ + transport->rto = min((transport->rto * 2), transport->asoc->rto_max); +} + +/* Worker routine to handle INIT command failure. */ +static void sctp_cmd_init_failed(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + unsigned error) +{ + struct sctp_ulpevent *event; + + event = sctp_ulpevent_make_assoc_change(asoc,0, SCTP_CANT_STR_ASSOC, + (__u16)error, 0, 0, + GFP_ATOMIC); + + if (event) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(event)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + /* SEND_FAILED sent later when cleaning up the association. */ + asoc->outqueue.error = error; + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); +} + +/* Worker routine to handle SCTP_CMD_ASSOC_FAILED. */ +static void sctp_cmd_assoc_failed(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + sctp_event_t event_type, + sctp_subtype_t subtype, + struct sctp_chunk *chunk, + unsigned error) +{ + struct sctp_ulpevent *event; + + /* Cancel any partial delivery in progress. */ + sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + + event = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_LOST, + (__u16)error, 0, 0, + GFP_ATOMIC); + if (event) + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(event)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + /* Set sk_err to ECONNRESET on a 1-1 style socket. */ + if (!sctp_style(asoc->base.sk, UDP)) + asoc->base.sk->sk_err = ECONNRESET; + + /* SEND_FAILED sent later when cleaning up the association. */ + asoc->outqueue.error = error; + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); +} + +/* Process an init chunk (may be real INIT/INIT-ACK or an embedded INIT + * inside the cookie. In reality, this is only used for INIT-ACK processing + * since all other cases use "temporary" associations and can do all + * their work in statefuns directly. + */ +static int sctp_cmd_process_init(sctp_cmd_seq_t *commands, + struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_init_chunk_t *peer_init, int gfp) +{ + int error; + + /* We only process the init as a sideeffect in a single + * case. This is when we process the INIT-ACK. 
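[Editorial sketch, not part of the patch] sctp_do_8_2_transport_strike() above combines the two rules quoted in the comment: bump the per-destination error counter, marking the transport down once it crosses Path.Max.Retrans, and back the RTO off exponentially, capped at RTO.max (rule E2). The arithmetic restated as a standalone sketch, ignoring the active-transport check and using illustrative names and millisecond units:

    /* One "strike" against a destination.  Returns 1 when the transport
     * should be marked inactive. */
    struct dest {
            unsigned int  error_count;
            unsigned int  max_retrans;
            unsigned long rto;          /* ms, illustrative unit */
            unsigned long rto_max;      /* ms */
    };

    static int strike_transport(struct dest *d)
    {
            int mark_down = 0;

            if (d->error_count++ >= d->max_retrans)
                    mark_down = 1;

            d->rto *= 2;                /* back off the timer ... */
            if (d->rto > d->rto_max)
                    d->rto = d->rto_max; /* ... but never past RTO.max */

            return mark_down;
    }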
If we + * fail during INIT processing (due to malloc problems), + * just return the error and stop processing the stack. + */ + if (!sctp_process_init(asoc, chunk->chunk_hdr->type, + sctp_source(chunk), peer_init, gfp)) + error = -ENOMEM; + else + error = 0; + + return error; +} + +/* Helper function to break out starting up of heartbeat timers. */ +static void sctp_cmd_hb_timers_start(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sctp_transport *t; + struct list_head *pos; + + /* Start a heartbeat timer for each transport on the association. + * hold a reference on the transport to make sure none of + * the needed data structures go away. + */ + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + + if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) + sctp_transport_hold(t); + } +} + +static void sctp_cmd_hb_timers_stop(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sctp_transport *t; + struct list_head *pos; + + /* Stop all heartbeat timers. */ + + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + if (del_timer(&t->hb_timer)) + sctp_transport_put(t); + } +} + +/* Helper function to stop any pending T3-RTX timers */ +static void sctp_cmd_t3_rtx_timers_stop(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sctp_transport *t; + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + if (timer_pending(&t->T3_rtx_timer) && + del_timer(&t->T3_rtx_timer)) { + sctp_transport_put(t); + } + } +} + + +/* Helper function to update the heartbeat timer. */ +static void sctp_cmd_hb_timer_update(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_transport *t) +{ + /* Update the heartbeat timer. */ + if (!mod_timer(&t->hb_timer, sctp_transport_timeout(t))) + sctp_transport_hold(t); +} + +/* Helper function to handle the reception of an HEARTBEAT ACK. */ +static void sctp_cmd_transport_on(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_transport *t, + struct sctp_chunk *chunk) +{ + sctp_sender_hb_info_t *hbinfo; + + /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of the + * HEARTBEAT should clear the error counter of the destination + * transport address to which the HEARTBEAT was sent. + * The association's overall error count is also cleared. + */ + t->error_count = 0; + t->asoc->overall_error_count = 0; + + /* Mark the destination transport address as active if it is not so + * marked. + */ + if (!t->active) + sctp_assoc_control_transport(asoc, t, SCTP_TRANSPORT_UP, + SCTP_HEARTBEAT_SUCCESS); + + /* The receiver of the HEARTBEAT ACK should also perform an + * RTT measurement for that destination transport address + * using the time value carried in the HEARTBEAT ACK chunk. + */ + hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; + sctp_transport_update_rto(t, (jiffies - hbinfo->sent_at)); +} + +/* Helper function to do a transport reset at the expiry of the hearbeat + * timer. + */ +static void sctp_cmd_transport_reset(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_transport *t) +{ + sctp_transport_lower_cwnd(t, SCTP_LOWER_CWND_INACTIVE); + + /* Mark one strike against a transport. */ + sctp_do_8_2_transport_strike(asoc, t); +} + +/* Helper function to process the process SACK command. 
*/ +static int sctp_cmd_process_sack(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_sackhdr *sackh) +{ + int err; + + if (sctp_outq_sack(&asoc->outqueue, sackh)) { + /* There are no more TSNs awaiting SACK. */ + err = sctp_do_sm(SCTP_EVENT_T_OTHER, + SCTP_ST_OTHER(SCTP_EVENT_NO_PENDING_TSN), + asoc->state, asoc->ep, asoc, NULL, + GFP_ATOMIC); + } else { + /* Windows may have opened, so we need + * to check if we have DATA to transmit + */ + err = sctp_outq_flush(&asoc->outqueue, 0); + } + + return err; +} + +/* Helper function to set the timeout value for T2-SHUTDOWN timer and to set + * the transport for a shutdown chunk. + */ +static void sctp_cmd_setup_t2(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct sctp_transport *t; + + t = sctp_assoc_choose_shutdown_transport(asoc); + asoc->shutdown_last_sent_to = t; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T2_SHUTDOWN] = t->rto; + chunk->transport = t; +} + +/* Helper function to change the state of an association. */ +static void sctp_cmd_new_state(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + sctp_state_t state) +{ + struct sock *sk = asoc->base.sk; + + asoc->state = state; + + if (sctp_style(sk, TCP)) { + /* Change the sk->sk_state of a TCP-style socket that has + * sucessfully completed a connect() call. + */ + if (sctp_state(asoc, ESTABLISHED) && sctp_sstate(sk, CLOSED)) + sk->sk_state = SCTP_SS_ESTABLISHED; + + /* Set the RCV_SHUTDOWN flag when a SHUTDOWN is received. */ + if (sctp_state(asoc, SHUTDOWN_RECEIVED) && + sctp_sstate(sk, ESTABLISHED)) + sk->sk_shutdown |= RCV_SHUTDOWN; + } + + if (sctp_state(asoc, ESTABLISHED) || + sctp_state(asoc, CLOSED) || + sctp_state(asoc, SHUTDOWN_RECEIVED)) { + /* Wake up any processes waiting in the asoc's wait queue in + * sctp_wait_for_connect() or sctp_wait_for_sndbuf(). + */ + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); + + /* Wake up any processes waiting in the sk's sleep queue of + * a TCP-style or UDP-style peeled-off socket in + * sctp_wait_for_accept() or sctp_wait_for_packet(). + * For a UDP-style socket, the waiters are woken up by the + * notifications. + */ + if (!sctp_style(sk, UDP)) + sk->sk_state_change(sk); + } +} + +/* Helper function to delete an association. */ +static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + + /* If it is a non-temporary association belonging to a TCP-style + * listening socket that is not closed, do not free it so that accept() + * can pick it up later. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING) && + (!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK)) + return; + + sctp_unhash_established(asoc); + sctp_association_free(asoc); +} + +/* + * ADDIP Section 4.1 ASCONF Chunk Procedures + * A4) Start a T-4 RTO timer, using the RTO value of the selected + * destination address (we use active path instead of primary path just + * because primary path may be inactive. + */ +static void sctp_cmd_setup_t4(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct sctp_transport *t; + + t = asoc->peer.active_path; + asoc->timeouts[SCTP_EVENT_TIMEOUT_T4_RTO] = t->rto; + chunk->transport = t; +} + +/* Process an incoming Operation Error Chunk. 
*/ +static void sctp_cmd_process_operr(sctp_cmd_seq_t *cmds, + struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + struct sctp_operr_chunk *operr_chunk; + struct sctp_errhdr *err_hdr; + + operr_chunk = (struct sctp_operr_chunk *)chunk->chunk_hdr; + err_hdr = &operr_chunk->err_hdr; + + switch (err_hdr->cause) { + case SCTP_ERROR_UNKNOWN_CHUNK: + { + struct sctp_chunkhdr *unk_chunk_hdr; + + unk_chunk_hdr = (struct sctp_chunkhdr *)err_hdr->variable; + switch (unk_chunk_hdr->type) { + /* ADDIP 4.1 A9) If the peer responds to an ASCONF with an + * ERROR chunk reporting that it did not recognized the ASCONF + * chunk type, the sender of the ASCONF MUST NOT send any + * further ASCONF chunks and MUST stop its T-4 timer. + */ + case SCTP_CID_ASCONF: + asoc->peer.asconf_capable = 0; + sctp_add_cmd_sf(cmds, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + break; + default: + break; + } + break; + } + default: + break; + } +} + +/* Process variable FWDTSN chunk information. */ +static void sctp_cmd_process_fwdtsn(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk) +{ + struct sctp_fwdtsn_skip *skip; + /* Walk through all the skipped SSNs */ + sctp_walk_fwdtsn(skip, chunk) { + sctp_ulpq_skip(ulpq, ntohs(skip->stream), ntohs(skip->ssn)); + } + + return; +} + +/* Helper function to remove the association non-primary peer + * transports. + */ +static void sctp_cmd_del_non_primary(struct sctp_association *asoc) +{ + struct sctp_transport *t; + struct list_head *pos; + struct list_head *temp; + + list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, transports); + if (!sctp_cmp_addr_exact(&t->ipaddr, + &asoc->peer.primary_addr)) { + sctp_assoc_del_peer(asoc, &t->ipaddr); + } + } + + return; +} + +/* These three macros allow us to pull the debugging code out of the + * main flow of sctp_do_sm() to keep attention focused on the real + * functionality there. + */ +#define DEBUG_PRE \ + SCTP_DEBUG_PRINTK("sctp_do_sm prefn: " \ + "ep %p, %s, %s, asoc %p[%s], %s\n", \ + ep, sctp_evttype_tbl[event_type], \ + (*debug_fn)(subtype), asoc, \ + sctp_state_tbl[state], state_fn->name) + +#define DEBUG_POST \ + SCTP_DEBUG_PRINTK("sctp_do_sm postfn: " \ + "asoc %p, status: %s\n", \ + asoc, sctp_status_tbl[status]) + +#define DEBUG_POST_SFX \ + SCTP_DEBUG_PRINTK("sctp_do_sm post sfx: error %d, asoc %p[%s]\n", \ + error, asoc, \ + sctp_state_tbl[(asoc && sctp_id2assoc(ep->base.sk, \ + sctp_assoc2id(asoc)))?asoc->state:SCTP_STATE_CLOSED]) + +/* + * This is the master state machine processing function. + * + * If you want to understand all of lksctp, this is a + * good place to start. + */ +int sctp_do_sm(sctp_event_t event_type, sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + int gfp) +{ + sctp_cmd_seq_t commands; + const sctp_sm_table_entry_t *state_fn; + sctp_disposition_t status; + int error = 0; + typedef const char *(printfn_t)(sctp_subtype_t); + + static printfn_t *table[] = { + NULL, sctp_cname, sctp_tname, sctp_oname, sctp_pname, + }; + printfn_t *debug_fn __attribute__ ((unused)) = table[event_type]; + + /* Look up the state function, run it, and then process the + * side effects. These three steps are the heart of lksctp. 
+ */ + state_fn = sctp_sm_lookup_event(event_type, state, subtype); + + sctp_init_cmd_seq(&commands); + + DEBUG_PRE; + status = (*state_fn->fn)(ep, asoc, subtype, event_arg, &commands); + DEBUG_POST; + + error = sctp_side_effects(event_type, subtype, state, + ep, asoc, event_arg, status, + &commands, gfp); + DEBUG_POST_SFX; + + return error; +} + +#undef DEBUG_PRE +#undef DEBUG_POST + +/***************************************************************** + * This the master state function side effect processing function. + *****************************************************************/ +static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + int gfp) +{ + int error; + + /* FIXME - Most of the dispositions left today would be categorized + * as "exceptional" dispositions. For those dispositions, it + * may not be proper to run through any of the commands at all. + * For example, the command interpreter might be run only with + * disposition SCTP_DISPOSITION_CONSUME. + */ + if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state, + ep, asoc, + event_arg, status, + commands, gfp))) + goto bail; + + switch (status) { + case SCTP_DISPOSITION_DISCARD: + SCTP_DEBUG_PRINTK("Ignored sctp protocol event - state %d, " + "event_type %d, event_id %d\n", + state, event_type, subtype.chunk); + break; + + case SCTP_DISPOSITION_NOMEM: + /* We ran out of memory, so we need to discard this + * packet. + */ + /* BUG--we should now recover some memory, probably by + * reneging... + */ + error = -ENOMEM; + break; + + case SCTP_DISPOSITION_DELETE_TCB: + /* This should now be a command. */ + break; + + case SCTP_DISPOSITION_CONSUME: + case SCTP_DISPOSITION_ABORT: + /* + * We should no longer have much work to do here as the + * real work has been done as explicit commands above. + */ + break; + + case SCTP_DISPOSITION_VIOLATION: + printk(KERN_ERR "sctp protocol violation state %d " + "chunkid %d\n", state, subtype.chunk); + break; + + case SCTP_DISPOSITION_NOT_IMPL: + printk(KERN_WARNING "sctp unimplemented feature in state %d, " + "event_type %d, event_id %d\n", + state, event_type, subtype.chunk); + break; + + case SCTP_DISPOSITION_BUG: + printk(KERN_ERR "sctp bug in state %d, " + "event_type %d, event_id %d\n", + state, event_type, subtype.chunk); + BUG(); + break; + + default: + printk(KERN_ERR "sctp impossible disposition %d " + "in state %d, event_type %d, event_id %d\n", + status, state, event_type, subtype.chunk); + BUG(); + break; + }; + +bail: + return error; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* This is the side-effect interpreter. 
*/ +static int sctp_cmd_interpreter(sctp_event_t event_type, + sctp_subtype_t subtype, + sctp_state_t state, + struct sctp_endpoint *ep, + struct sctp_association *asoc, + void *event_arg, + sctp_disposition_t status, + sctp_cmd_seq_t *commands, + int gfp) +{ + int error = 0; + int force; + sctp_cmd_t *cmd; + struct sctp_chunk *new_obj; + struct sctp_chunk *chunk = NULL; + struct sctp_packet *packet; + struct list_head *pos; + struct timer_list *timer; + unsigned long timeout; + struct sctp_transport *t; + struct sctp_sackhdr sackh; + int local_cork = 0; + + if (SCTP_EVENT_T_TIMEOUT != event_type) + chunk = (struct sctp_chunk *) event_arg; + + /* Note: This whole file is a huge candidate for rework. + * For example, each command could either have its own handler, so + * the loop would look like: + * while (cmds) + * cmd->handle(x, y, z) + * --jgrimm + */ + while (NULL != (cmd = sctp_next_cmd(commands))) { + switch (cmd->verb) { + case SCTP_CMD_NOP: + /* Do nothing. */ + break; + + case SCTP_CMD_NEW_ASOC: + /* Register a new association. */ + if (local_cork) { + sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + } + asoc = cmd->obj.ptr; + /* Register with the endpoint. */ + sctp_endpoint_add_asoc(ep, asoc); + sctp_hash_established(asoc); + break; + + case SCTP_CMD_UPDATE_ASSOC: + sctp_assoc_update(asoc, cmd->obj.ptr); + break; + + case SCTP_CMD_PURGE_OUTQUEUE: + sctp_outq_teardown(&asoc->outqueue); + break; + + case SCTP_CMD_DELETE_TCB: + if (local_cork) { + sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + } + /* Delete the current association. */ + sctp_cmd_delete_tcb(commands, asoc); + asoc = NULL; + break; + + case SCTP_CMD_NEW_STATE: + /* Enter a new state. */ + sctp_cmd_new_state(commands, asoc, cmd->obj.state); + break; + + case SCTP_CMD_REPORT_TSN: + /* Record the arrival of a TSN. */ + sctp_tsnmap_mark(&asoc->peer.tsn_map, cmd->obj.u32); + break; + + case SCTP_CMD_REPORT_FWDTSN: + /* Move the Cumulattive TSN Ack ahead. */ + sctp_tsnmap_skip(&asoc->peer.tsn_map, cmd->obj.u32); + + /* Abort any in progress partial delivery. */ + sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + break; + + case SCTP_CMD_PROCESS_FWDTSN: + sctp_cmd_process_fwdtsn(&asoc->ulpq, cmd->obj.ptr); + break; + + case SCTP_CMD_GEN_SACK: + /* Generate a Selective ACK. + * The argument tells us whether to just count + * the packet and MAYBE generate a SACK, or + * force a SACK out. + */ + force = cmd->obj.i32; + error = sctp_gen_sack(asoc, force, commands); + break; + + case SCTP_CMD_PROCESS_SACK: + /* Process an inbound SACK. */ + error = sctp_cmd_process_sack(commands, asoc, + cmd->obj.ptr); + break; + + case SCTP_CMD_GEN_INIT_ACK: + /* Generate an INIT ACK chunk. */ + new_obj = sctp_make_init_ack(asoc, chunk, GFP_ATOMIC, + 0); + if (!new_obj) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + break; + + case SCTP_CMD_PEER_INIT: + /* Process a unified INIT from the peer. + * Note: Only used during INIT-ACK processing. If + * there is an error just return to the outter + * layer which will bail. + */ + error = sctp_cmd_process_init(commands, asoc, chunk, + cmd->obj.ptr, gfp); + break; + + case SCTP_CMD_GEN_COOKIE_ECHO: + /* Generate a COOKIE ECHO chunk. */ + new_obj = sctp_make_cookie_echo(asoc, chunk); + if (!new_obj) { + if (cmd->obj.ptr) + sctp_chunk_free(cmd->obj.ptr); + goto nomem; + } + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + + /* If there is an ERROR chunk to be sent along with + * the COOKIE_ECHO, send it, too. 
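[Editorial sketch, not part of the patch] The "--jgrimm" note above suggests the interpreter could become a table of per-command handlers instead of one large switch. A minimal sketch of that shape with hypothetical types; this is the refactoring idea from the comment, not code that exists in the patch:

    #include <stddef.h>

    struct cmd { unsigned int verb; void *arg; };

    typedef int (*cmd_handler_t)(void *asoc, void *arg);

    /* Dispatch each queued command through a verb-indexed handler table;
     * stop on the first handler that reports an error, as the
     * switch-based loop above does. */
    static int run_commands(void *asoc, const struct cmd *cmds, size_t ncmds,
                            const cmd_handler_t *handlers, size_t nhandlers)
    {
            int error = 0;
            size_t i;

            for (i = 0; i < ncmds && !error; i++) {
                    if (cmds[i].verb < nhandlers && handlers[cmds[i].verb])
                            error = handlers[cmds[i].verb](asoc, cmds[i].arg);
            }
            return error;
    }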
+ */ + if (cmd->obj.ptr) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(cmd->obj.ptr)); + + /* FIXME - Eventually come up with a cleaner way to + * enabling COOKIE-ECHO + DATA bundling during + * multihoming stale cookie scenarios, the following + * command plays with asoc->peer.retran_path to + * avoid the problem of sending the COOKIE-ECHO and + * DATA in different paths, which could result + * in the association being ABORTed if the DATA chunk + * is processed first by the server. Checking the + * init error counter simply causes this command + * to be executed only during failed attempts of + * association establishment. + */ + if ((asoc->peer.retran_path != + asoc->peer.primary_path) && + (asoc->counters[SCTP_COUNTER_INIT_ERROR] > 0)) { + sctp_add_cmd_sf(commands, + SCTP_CMD_FORCE_PRIM_RETRAN, + SCTP_NULL()); + } + + break; + + case SCTP_CMD_GEN_SHUTDOWN: + /* Generate SHUTDOWN when in SHUTDOWN_SENT state. + * Reset error counts. + */ + asoc->overall_error_count = 0; + + /* Generate a SHUTDOWN chunk. */ + new_obj = sctp_make_shutdown(asoc, chunk); + if (!new_obj) + goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + break; + + case SCTP_CMD_CHUNK_ULP: + /* Send a chunk to the sockets layer. */ + SCTP_DEBUG_PRINTK("sm_sideff: %s %p, %s %p.\n", + "chunk_up:", cmd->obj.ptr, + "ulpq:", &asoc->ulpq); + sctp_ulpq_tail_data(&asoc->ulpq, cmd->obj.ptr, + GFP_ATOMIC); + break; + + case SCTP_CMD_EVENT_ULP: + /* Send a notification to the sockets layer. */ + SCTP_DEBUG_PRINTK("sm_sideff: %s %p, %s %p.\n", + "event_up:",cmd->obj.ptr, + "ulpq:",&asoc->ulpq); + sctp_ulpq_tail_event(&asoc->ulpq, cmd->obj.ptr); + break; + + case SCTP_CMD_REPLY: + /* If an caller has not already corked, do cork. */ + if (!asoc->outqueue.cork) { + sctp_outq_cork(&asoc->outqueue); + local_cork = 1; + } + /* Send a chunk to our peer. */ + error = sctp_outq_tail(&asoc->outqueue, cmd->obj.ptr); + break; + + case SCTP_CMD_SEND_PKT: + /* Send a full packet to our peer. */ + packet = cmd->obj.ptr; + sctp_packet_transmit(packet); + sctp_ootb_pkt_free(packet); + break; + + case SCTP_CMD_RETRAN: + /* Mark a transport for retransmission. */ + sctp_retransmit(&asoc->outqueue, cmd->obj.transport, + SCTP_RTXR_T3_RTX); + break; + + case SCTP_CMD_TRANSMIT: + /* Kick start transmission. */ + error = sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + break; + + case SCTP_CMD_ECN_CE: + /* Do delayed CE processing. */ + sctp_do_ecn_ce_work(asoc, cmd->obj.u32); + break; + + case SCTP_CMD_ECN_ECNE: + /* Do delayed ECNE processing. */ + new_obj = sctp_do_ecn_ecne_work(asoc, cmd->obj.u32, + chunk); + if (new_obj) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(new_obj)); + break; + + case SCTP_CMD_ECN_CWR: + /* Do delayed CWR processing. 
*/ + sctp_do_ecn_cwr_work(asoc, cmd->obj.u32); + break; + + case SCTP_CMD_SETUP_T2: + sctp_cmd_setup_t2(commands, asoc, cmd->obj.ptr); + break; + + case SCTP_CMD_TIMER_START: + timer = &asoc->timers[cmd->obj.to]; + timeout = asoc->timeouts[cmd->obj.to]; + if (!timeout) + BUG(); + + timer->expires = jiffies + timeout; + sctp_association_hold(asoc); + add_timer(timer); + break; + + case SCTP_CMD_TIMER_RESTART: + timer = &asoc->timers[cmd->obj.to]; + timeout = asoc->timeouts[cmd->obj.to]; + if (!mod_timer(timer, jiffies + timeout)) + sctp_association_hold(asoc); + break; + + case SCTP_CMD_TIMER_STOP: + timer = &asoc->timers[cmd->obj.to]; + if (timer_pending(timer) && del_timer(timer)) + sctp_association_put(asoc); + break; + + case SCTP_CMD_INIT_RESTART: + /* Do the needed accounting and updates + * associated with restarting an initialization + * timer. + */ + asoc->counters[SCTP_COUNTER_INIT_ERROR]++; + asoc->timeouts[cmd->obj.to] *= 2; + if (asoc->timeouts[cmd->obj.to] > + asoc->max_init_timeo) { + asoc->timeouts[cmd->obj.to] = + asoc->max_init_timeo; + } + + /* If we've sent any data bundled with + * COOKIE-ECHO we need to resend. + */ + list_for_each(pos, &asoc->peer.transport_addr_list) { + t = list_entry(pos, struct sctp_transport, + transports); + sctp_retransmit_mark(&asoc->outqueue, t, 0); + } + + sctp_add_cmd_sf(commands, + SCTP_CMD_TIMER_RESTART, + SCTP_TO(cmd->obj.to)); + break; + + case SCTP_CMD_INIT_FAILED: + sctp_cmd_init_failed(commands, asoc, cmd->obj.u32); + break; + + case SCTP_CMD_ASSOC_FAILED: + sctp_cmd_assoc_failed(commands, asoc, event_type, + subtype, chunk, cmd->obj.u32); + break; + + case SCTP_CMD_COUNTER_INC: + asoc->counters[cmd->obj.counter]++; + break; + + case SCTP_CMD_COUNTER_RESET: + asoc->counters[cmd->obj.counter] = 0; + break; + + case SCTP_CMD_REPORT_DUP: + sctp_tsnmap_mark_dup(&asoc->peer.tsn_map, + cmd->obj.u32); + break; + + case SCTP_CMD_REPORT_BAD_TAG: + SCTP_DEBUG_PRINTK("vtag mismatch!\n"); + break; + + case SCTP_CMD_STRIKE: + /* Mark one strike against a transport. */ + sctp_do_8_2_transport_strike(asoc, cmd->obj.transport); + break; + + case SCTP_CMD_TRANSPORT_RESET: + t = cmd->obj.transport; + sctp_cmd_transport_reset(commands, asoc, t); + break; + + case SCTP_CMD_TRANSPORT_ON: + t = cmd->obj.transport; + sctp_cmd_transport_on(commands, asoc, t, chunk); + break; + + case SCTP_CMD_HB_TIMERS_START: + sctp_cmd_hb_timers_start(commands, asoc); + break; + + case SCTP_CMD_HB_TIMER_UPDATE: + t = cmd->obj.transport; + sctp_cmd_hb_timer_update(commands, asoc, t); + break; + + case SCTP_CMD_HB_TIMERS_STOP: + sctp_cmd_hb_timers_stop(commands, asoc); + break; + + case SCTP_CMD_REPORT_ERROR: + error = cmd->obj.error; + break; + + case SCTP_CMD_PROCESS_CTSN: + /* Dummy up a SACK for processing. */ + sackh.cum_tsn_ack = cmd->obj.u32; + sackh.a_rwnd = 0; + sackh.num_gap_ack_blocks = 0; + sackh.num_dup_tsns = 0; + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, + SCTP_SACKH(&sackh)); + break; + + case SCTP_CMD_DISCARD_PACKET: + /* We need to discard the whole packet. 
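SCTP_CMD_INIT_RESTART above doubles the current INIT timeout after each failed establishment attempt but clamps it at asoc->max_init_timeo before restarting the timer. The same backoff rule in isolation, with made-up millisecond values purely for illustration:

    #include <stdio.h>

    /* Double the timeout, but never beyond the configured maximum. */
    static unsigned long next_init_timeout(unsigned long cur, unsigned long max)
    {
        cur *= 2;
        return cur > max ? max : cur;
    }

    int main(void)
    {
        unsigned long t = 3000, max = 60000;   /* milliseconds, illustrative */
        int attempt;

        for (attempt = 1; attempt <= 6; attempt++) {
            t = next_init_timeout(t, max);
            printf("attempt %d: next T1-init timeout %lu ms\n", attempt, t);
        }
        return 0;
    }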
*/ + chunk->pdiscard = 1; + break; + + case SCTP_CMD_RTO_PENDING: + t = cmd->obj.transport; + t->rto_pending = 1; + break; + + case SCTP_CMD_PART_DELIVER: + sctp_ulpq_partial_delivery(&asoc->ulpq, cmd->obj.ptr, + GFP_ATOMIC); + break; + + case SCTP_CMD_RENEGE: + sctp_ulpq_renege(&asoc->ulpq, cmd->obj.ptr, + GFP_ATOMIC); + break; + + case SCTP_CMD_SETUP_T4: + sctp_cmd_setup_t4(commands, asoc, cmd->obj.ptr); + break; + + case SCTP_CMD_PROCESS_OPERR: + sctp_cmd_process_operr(commands, asoc, chunk); + break; + case SCTP_CMD_CLEAR_INIT_TAG: + asoc->peer.i.init_tag = 0; + break; + case SCTP_CMD_DEL_NON_PRIMARY: + sctp_cmd_del_non_primary(asoc); + break; + case SCTP_CMD_T3_RTX_TIMERS_STOP: + sctp_cmd_t3_rtx_timers_stop(commands, asoc); + break; + case SCTP_CMD_FORCE_PRIM_RETRAN: + t = asoc->peer.retran_path; + asoc->peer.retran_path = asoc->peer.primary_path; + error = sctp_outq_uncork(&asoc->outqueue); + local_cork = 0; + asoc->peer.retran_path = t; + break; + default: + printk(KERN_WARNING "Impossible command: %u, %p\n", + cmd->verb, cmd->obj.ptr); + break; + }; + if (error) + break; + } + +out: + if (local_cork) + sctp_outq_uncork(&asoc->outqueue); + return error; +nomem: + error = -ENOMEM; + goto out; +} + diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c new file mode 100644 index 000000000000..278c56a2d076 --- /dev/null +++ b/net/sctp/sm_statefuns.c @@ -0,0 +1,5238 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2002 Intel Corp. + * Copyright (c) 2002 Nokia Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * This is part of the SCTP Linux Kernel Reference Implementation. + * + * These are the state functions for the state machine. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Mathew Kotowsky + * Sridhar Samudrala + * Jon Grimm + * Hui Huang + * Dajiang Zhang + * Daisy Chang + * Ardelle Fan + * Ryan Layer + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sctp_packet *sctp_abort_pkt_new(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + const void *payload, + size_t paylen); +static int sctp_eat_data(const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands); +static struct sctp_packet *sctp_ootb_pkt_new(const struct sctp_association *asoc, + const struct sctp_chunk *chunk); +static void sctp_send_stale_cookie_err(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_chunk *err_chunk); +static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); +static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands); +static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk); + + +/* Small helper function that checks if the chunk length + * is of the appropriate length. The 'required_length' argument + * is set to be the size of a specific chunk we are testing. + * Return Values: 1 = Valid length + * 0 = Invalid length + * + */ +static inline int +sctp_chunk_length_valid(struct sctp_chunk *chunk, + __u16 required_length) +{ + __u16 chunk_length = ntohs(chunk->chunk_hdr->length); + + if (unlikely(chunk_length < required_length)) + return 0; + + return 1; +} + +/********************************************************** + * These are the state functions for handling chunk events. + **********************************************************/ + +/* + * Process the final SHUTDOWN COMPLETE. + * + * Section: 4 (C) (diagram), 9.2 + * Upon reception of the SHUTDOWN COMPLETE chunk the endpoint will verify + * that it is in SHUTDOWN-ACK-SENT state, if it is not the chunk should be + * discarded. If the endpoint is in the SHUTDOWN-ACK-SENT state the endpoint + * should stop the T2-shutdown timer and remove all knowledge of the + * association (and thus the association enters the CLOSED state). + * + * Verification Tag: 8.5.1(C) + * C) Rules for packet carrying SHUTDOWN COMPLETE: + * ... + * - The receiver of a SHUTDOWN COMPLETE shall accept the packet if the + * Verification Tag field of the packet matches its own tag OR it is + * set to its peer's tag and the T bit is set in the Chunk Flags. + * Otherwise, the receiver MUST silently discard the packet and take + * no further action. An endpoint MUST ignore the SHUTDOWN COMPLETE if + * it is not in the SHUTDOWN-ACK-SENT state. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_4_C(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_ulpevent *ev; + + /* RFC 2960 6.10 Bundling + * + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. 
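The 8.5.1(C) rule quoted above accepts a SHUTDOWN COMPLETE when the packet's Verification Tag matches the receiver's own tag, or when it equals the peer's tag and the T bit is set in the chunk flags; everything else is silently discarded. A small standalone predicate stating that rule (this is only a sketch, not the kernel's sctp_vtag_verify_either(), and it assumes the T bit is the low-order chunk flag):

    #include <stdio.h>
    #include <stdint.h>

    #define T_BIT 0x01   /* assumed low-order chunk-flag bit for the sketch */

    /* Accept iff vtag == my_tag, or (vtag == peer_tag && T bit set). */
    static int shutdown_complete_acceptable(uint32_t vtag, uint8_t chunk_flags,
                                            uint32_t my_tag, uint32_t peer_tag)
    {
        if (vtag == my_tag)
            return 1;
        return (chunk_flags & T_BIT) && vtag == peer_tag;
    }

    int main(void)
    {
        printf("%d\n", shutdown_complete_acceptable(42, 0, 42, 7));      /* 1 */
        printf("%d\n", shutdown_complete_acceptable(7, T_BIT, 42, 7));   /* 1 */
        printf("%d\n", shutdown_complete_acceptable(7, 0, 42, 7));       /* 0 */
        return 0;
    }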
+ */ + if (!chunk->singleton) + return SCTP_DISPOSITION_VIOLATION; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* RFC 2960 10.2 SCTP-to-ULP + * + * H) SHUTDOWN COMPLETE notification + * + * When SCTP completes the shutdown procedures (section 9.2) this + * notification is passed to the upper layer. + */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP, + 0, 0, 0, GFP_ATOMIC); + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Upon reception of the SHUTDOWN COMPLETE chunk the endpoint + * will verify that it is in SHUTDOWN-ACK-SENT state, if it is + * not the chunk should be discarded. If the endpoint is in + * the SHUTDOWN-ACK-SENT state the endpoint should stop the + * T2-shutdown timer and remove all knowledge of the + * association (and thus the association enters the CLOSED + * state). + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return SCTP_DISPOSITION_DELETE_TCB; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Respond to a normal INIT chunk. + * We are the side that is being asked for an association. + * + * Section: 5.1 Normal Establishment of an Association, B + * B) "Z" shall respond immediately with an INIT ACK chunk. The + * destination IP address of the INIT ACK MUST be set to the source + * IP address of the INIT to which this INIT ACK is responding. In + * the response, besides filling in other parameters, "Z" must set the + * Verification Tag field to Tag_A, and also provide its own + * Verification Tag (Tag_Z) in the Initiate Tag field. + * + * Verification Tag: Must be 0. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1B_init(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *repl; + struct sctp_association *new_asoc; + struct sctp_chunk *err_chunk; + struct sctp_packet *packet; + sctp_unrecognized_param_t *unk_param; + struct sock *sk; + int len; + + /* 6.10 Bundling + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + * + * IG Section 2.11.2 + * Furthermore, we require that the receiver of an INIT chunk MUST + * enforce these rules by silently discarding an arriving packet + * with an INIT chunk that is bundled with other chunks. + */ + if (!chunk->singleton) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* If the packet is an OOTB packet which is temporarily on the + * control endpoint, respond with an ABORT. + */ + if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + sk = ep->base.sk; + /* If the endpoint is not listening or if the number of associations + * on the TCP-style socket exceed the max backlog, respond with an + * ABORT. 
+ */ + if (!sctp_sstate(sk, LISTENING) || + (sctp_style(sk, TCP) && + sk_acceptq_is_full(sk))) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* 3.1 A packet containing an INIT chunk MUST have a zero Verification + * Tag. + */ + if (chunk->sctp_hdr->vtag != 0) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* Make sure that the INIT chunk has a valid length. + * Normally, this would cause an ABORT with a Protocol Violation + * error, but since we don't have an association, we'll + * just discard the packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Verify the INIT chunk before processing it. */ + err_chunk = NULL; + if (!sctp_verify_init(asoc, chunk->chunk_hdr->type, + (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, + &err_chunk)) { + /* This chunk contains fatal error. It is to be discarded. + * Send an ABORT, with causes if there is any. + */ + if (err_chunk) { + packet = sctp_abort_pkt_new(ep, asoc, arg, + (__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t), + ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + sctp_chunk_free(err_chunk); + + if (packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + return SCTP_DISPOSITION_CONSUME; + } else { + return SCTP_DISPOSITION_NOMEM; + } + } else { + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, + commands); + } + } + + /* Grab the INIT header. */ + chunk->subh.init_hdr = (sctp_inithdr_t *)chunk->skb->data; + + /* Tag the variable length parameters. */ + chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t)); + + new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC); + if (!new_asoc) + goto nomem; + + /* The call, sctp_process_init(), can fail on memory allocation. */ + if (!sctp_process_init(new_asoc, chunk->chunk_hdr->type, + sctp_source(chunk), + (sctp_init_chunk_t *)chunk->chunk_hdr, + GFP_ATOMIC)) + goto nomem_init; + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + + /* B) "Z" shall respond immediately with an INIT ACK chunk. */ + + /* If there are errors need to be reported for unknown parameters, + * make sure to reserve enough room in the INIT ACK for them. + */ + len = 0; + if (err_chunk) + len = ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t); + + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, GFP_ATOMIC) < 0) + goto nomem_ack; + + repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len); + if (!repl) + goto nomem_ack; + + /* If there are errors need to be reported for unknown parameters, + * include them in the outgoing INIT ACK as "Unrecognized parameter" + * parameter. + */ + if (err_chunk) { + /* Get the "Unrecognized parameter" parameter(s) out of the + * ERROR chunk generated by sctp_verify_init(). Since the + * error cause code for "unknown parameter" and the + * "Unrecognized parameter" type is the same, we can + * construct the parameters in INIT ACK by copying the + * ERROR causes over. + */ + unk_param = (sctp_unrecognized_param_t *) + ((__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t)); + /* Replace the cause code with the "Unrecognized parameter" + * parameter type. 
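sctp_sf_do_5_1B_init above only goes on to build an INIT ACK after a series of cheap gate checks: the INIT must be a singleton (not bundled), the socket must be listening with room in the accept queue for TCP-style sockets, the packet's Verification Tag must be zero, and the chunk must be long enough to hold an INIT. A condensed userspace restatement of that gatekeeping; the field and function names here are invented for the sketch, and the OOTB control-endpoint case is left out:

    #include <stdio.h>
    #include <stdint.h>

    struct init_pkt {
        int      singleton;      /* INIT was the only chunk in the packet  */
        uint32_t vtag;           /* Verification Tag from the common header */
        uint16_t chunk_len;      /* length from the chunk header            */
        int      sk_listening;   /* receiving socket is in LISTEN           */
        int      acceptq_full;   /* TCP-style accept backlog exhausted      */
    };

    enum verdict { PROCESS, DISCARD, ABORT };

    static enum verdict check_init(const struct init_pkt *p, uint16_t min_len)
    {
        if (!p->singleton)                        return DISCARD;  /* 6.10 bundling */
        if (!p->sk_listening || p->acceptq_full)  return ABORT;
        if (p->vtag != 0)                         return ABORT;    /* 3.1 zero vtag */
        if (p->chunk_len < min_len)               return DISCARD;  /* too short     */
        return PROCESS;
    }

    int main(void)
    {
        struct init_pkt ok = { 1, 0, 32, 1, 0 };
        printf("verdict: %d\n", check_init(&ok, 20));   /* 0 == PROCESS */
        return 0;
    }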
+ */ + sctp_addto_chunk(repl, len, unk_param); + sctp_chunk_free(err_chunk); + } + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* + * Note: After sending out INIT ACK with the State Cookie parameter, + * "Z" MUST NOT allocate any resources, nor keep any states for the + * new association. Otherwise, "Z" will be vulnerable to resource + * attacks. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return SCTP_DISPOSITION_DELETE_TCB; + +nomem_ack: + if (err_chunk) + sctp_chunk_free(err_chunk); +nomem_init: + sctp_association_free(new_asoc); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Respond to a normal INIT ACK chunk. + * We are the side that is initiating the association. + * + * Section: 5.1 Normal Establishment of an Association, C + * C) Upon reception of the INIT ACK from "Z", "A" shall stop the T1-init + * timer and leave COOKIE-WAIT state. "A" shall then send the State + * Cookie received in the INIT ACK chunk in a COOKIE ECHO chunk, start + * the T1-cookie timer, and enter the COOKIE-ECHOED state. + * + * Note: The COOKIE ECHO chunk can be bundled with any pending outbound + * DATA chunks, but it MUST be the first chunk in the packet and + * until the COOKIE ACK is returned the sender MUST NOT send any + * other packets to the peer. + * + * Verification Tag: 3.3.3 + * If the value of the Initiate Tag in a received INIT ACK chunk is + * found to be 0, the receiver MUST treat it as an error and close the + * association by transmitting an ABORT. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1C_ack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_init_chunk_t *initchunk; + __u32 init_tag; + struct sctp_chunk *err_chunk; + struct sctp_packet *packet; + sctp_disposition_t ret; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the INIT-ACK chunk has a valid length */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_initack_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + /* 6.10 Bundling + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + */ + if (!chunk->singleton) + return SCTP_DISPOSITION_VIOLATION; + + /* Grab the INIT header. */ + chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data; + + init_tag = ntohl(chunk->subh.init_hdr->init_tag); + + /* Verification Tag: 3.3.3 + * If the value of the Initiate Tag in a received INIT ACK + * chunk is found to be 0, the receiver MUST treat it as an + * error and close the association by transmitting an ABORT. + */ + if (!init_tag) { + struct sctp_chunk *reply = sctp_make_abort(asoc, chunk, 0); + if (!reply) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + return SCTP_DISPOSITION_DELETE_TCB; + } + + /* Verify the INIT chunk before processing it. 
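The note above ("Z" MUST NOT allocate any resources, nor keep any states for the new association) is why the function ends with SCTP_CMD_DELETE_TCB: everything needed to rebuild the association later travels inside the State Cookie and is verified when the COOKIE ECHO comes back. A toy illustration of that stateless-cookie idea; the "MAC" below is a trivial checksum for the sketch only and bears no relation to the kernel's actual cookie signing:

    #include <stdio.h>
    #include <stdint.h>

    struct cookie {
        uint32_t my_vtag;
        uint32_t peer_vtag;
        uint32_t expires;    /* lifespan bound, units left abstract here */
        uint32_t mac;        /* toy integrity check, NOT cryptographic   */
    };

    static uint32_t toy_mac(const struct cookie *c, uint32_t secret)
    {
        /* Deliberately simplistic: real code would use a keyed hash. */
        return (c->my_vtag ^ c->peer_vtag ^ c->expires) + secret;
    }

    static struct cookie make_cookie(uint32_t my, uint32_t peer, uint32_t now,
                                     uint32_t secret)
    {
        struct cookie c = { my, peer, now + 60, 0 };
        c.mac = toy_mac(&c, secret);
        return c;              /* sent in the INIT ACK; no TCB is kept */
    }

    static int cookie_valid(const struct cookie *c, uint32_t now, uint32_t secret)
    {
        return c->mac == toy_mac(c, secret) && now <= c->expires;
    }

    int main(void)
    {
        uint32_t secret = 0x5eed;
        struct cookie c = make_cookie(111, 222, 0, secret);

        printf("fresh cookie valid: %d\n", cookie_valid(&c, 10, secret));
        printf("stale cookie valid: %d\n", cookie_valid(&c, 1000, secret));
        return 0;
    }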
*/ + err_chunk = NULL; + if (!sctp_verify_init(asoc, chunk->chunk_hdr->type, + (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, + &err_chunk)) { + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + + /* This chunk contains fatal error. It is to be discarded. + * Send an ABORT, with causes if there is any. + */ + if (err_chunk) { + packet = sctp_abort_pkt_new(ep, asoc, arg, + (__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t), + ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + sctp_chunk_free(err_chunk); + + if (packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, + SCTP_NULL()); + return SCTP_DISPOSITION_CONSUME; + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, + SCTP_NULL()); + return SCTP_DISPOSITION_NOMEM; + } + } else { + ret = sctp_sf_tabort_8_4_8(ep, asoc, type, arg, + commands); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, + SCTP_NULL()); + return ret; + } + } + + /* Tag the variable length parameters. Note that we never + * convert the parameters in an INIT chunk. + */ + chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t)); + + initchunk = (sctp_init_chunk_t *) chunk->chunk_hdr; + + sctp_add_cmd_sf(commands, SCTP_CMD_PEER_INIT, + SCTP_PEER_INIT(initchunk)); + + /* 5.1 C) "A" shall stop the T1-init timer and leave + * COOKIE-WAIT state. "A" shall then ... start the T1-cookie + * timer, and enter the COOKIE-ECHOED state. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_COOKIE_ECHOED)); + + /* 5.1 C) "A" shall then send the State Cookie received in the + * INIT ACK chunk in a COOKIE ECHO chunk, ... + */ + /* If there is any errors to report, send the ERROR chunk generated + * for unknown parameters as well. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_COOKIE_ECHO, + SCTP_CHUNK(err_chunk)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Respond to a normal COOKIE ECHO chunk. + * We are the side that is being asked for an association. + * + * Section: 5.1 Normal Establishment of an Association, D + * D) Upon reception of the COOKIE ECHO chunk, Endpoint "Z" will reply + * with a COOKIE ACK chunk after building a TCB and moving to + * the ESTABLISHED state. A COOKIE ACK chunk may be bundled with + * any pending DATA chunks (and/or SACK chunks), but the COOKIE ACK + * chunk MUST be the first chunk in the packet. + * + * IMPLEMENTATION NOTE: An implementation may choose to send the + * Communication Up notification to the SCTP user upon reception + * of a valid COOKIE ECHO chunk. + * + * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules + * D) Rules for packet carrying a COOKIE ECHO + * + * - When sending a COOKIE ECHO, the endpoint MUST use the value of the + * Initial Tag received in the INIT ACK. + * + * - The receiver of a COOKIE ECHO follows the procedures in Section 5. 
+ * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1D_ce(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_association *new_asoc; + sctp_init_chunk_t *peer_init; + struct sctp_chunk *repl; + struct sctp_ulpevent *ev; + int error = 0; + struct sctp_chunk *err_chk_p; + + /* If the packet is an OOTB packet which is temporarily on the + * control endpoint, respond with an ABORT. + */ + if (ep == sctp_sk((sctp_get_ctl_sock()))->ep) + return sctp_sf_ootb(ep, asoc, type, arg, commands); + + /* Make sure that the COOKIE_ECHO chunk has a valid length. + * In this case, we check that we have enough for at least a + * chunk header. More detailed verification is done + * in sctp_unpack_cookie(). + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* "Decode" the chunk. We have no optional parameters so we + * are in good shape. + */ + chunk->subh.cookie_hdr = + (struct sctp_signed_cookie *)chunk->skb->data; + skb_pull(chunk->skb, + ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t)); + + /* 5.1 D) Upon reception of the COOKIE ECHO chunk, Endpoint + * "Z" will reply with a COOKIE ACK chunk after building a TCB + * and moving to the ESTABLISHED state. + */ + new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error, + &err_chk_p); + + /* FIXME: + * If the re-build failed, what is the proper error path + * from here? + * + * [We should abort the association. --piggy] + */ + if (!new_asoc) { + /* FIXME: Several errors are possible. A bad cookie should + * be silently discarded, but think about logging it too. + */ + switch (error) { + case -SCTP_IERROR_NOMEM: + goto nomem; + + case -SCTP_IERROR_STALE_COOKIE: + sctp_send_stale_cookie_err(ep, asoc, chunk, commands, + err_chk_p); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + case -SCTP_IERROR_BAD_SIG: + default: + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + }; + } + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + SCTP_INC_STATS(SCTP_MIB_PASSIVEESTABS); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); + + if (new_asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSMIT, SCTP_NULL()); + + /* Re-build the bind address for the association is done in + * the sctp_unpack_cookie() already. + */ + /* This is a brand-new association, so these are not yet side + * effects--it is safe to run them here. + */ + peer_init = &chunk->subh.cookie_hdr->c.peer_init[0]; + + if (!sctp_process_init(new_asoc, chunk->chunk_hdr->type, + &chunk->subh.cookie_hdr->c.peer_addr, + peer_init, GFP_ATOMIC)) + goto nomem_init; + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem_repl; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * D) IMPLEMENTATION NOTE: An implementation may choose to + * send the Communication Up notification to the SCTP user + * upon reception of a valid COOKIE ECHO chunk. 
+ */ + ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, SCTP_COMM_UP, 0, + new_asoc->c.sinit_num_ostreams, + new_asoc->c.sinit_max_instreams, + GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaption Layer Indication parameter , SCTP + * delivers this notification to inform the application that of the + * peers requested adaption layer. + */ + if (new_asoc->peer.adaption_ind) { + ev = sctp_ulpevent_make_adaption_indication(new_asoc, + GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + + return SCTP_DISPOSITION_CONSUME; + +nomem_ev: + sctp_chunk_free(repl); +nomem_repl: +nomem_init: + sctp_association_free(new_asoc); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Respond to a normal COOKIE ACK chunk. + * We are the side that is being asked for an association. + * + * RFC 2960 5.1 Normal Establishment of an Association + * + * E) Upon reception of the COOKIE ACK, endpoint "A" will move from the + * COOKIE-ECHOED state to the ESTABLISHED state, stopping the T1-cookie + * timer. It may also notify its ULP about the successful + * establishment of the association with a Communication Up + * notification (see Section 10). + * + * Verification Tag: + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_1E_ca(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Verify that the chunk length for the COOKIE-ACK is OK. + * If we don't do this, any bundled chunks may be junked. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Reset init error count upon receipt of COOKIE-ACK, + * to avoid problems with the managemement of this + * counter in stale cookie situations when a transition back + * from the COOKIE-ECHOED state to the COOKIE-WAIT + * state is performed. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_RESET, + SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR)); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * E) Upon reception of the COOKIE ACK, endpoint "A" will move + * from the COOKIE-ECHOED state to the ESTABLISHED state, + * stopping the T1-cookie timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + SCTP_INC_STATS(SCTP_MIB_ACTIVEESTABS); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); + if (asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSMIT, SCTP_NULL()); + + /* It may also notify its ULP about the successful + * establishment of the association with a Communication Up + * notification (see Section 10). 
+ */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP, + 0, asoc->c.sinit_num_ostreams, + asoc->c.sinit_max_instreams, + GFP_ATOMIC); + + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaption Layer Indication parameter , SCTP + * delivers this notification to inform the application that of the + * peers requested adaption layer. + */ + if (asoc->peer.adaption_ind) { + ev = sctp_ulpevent_make_adaption_indication(asoc, GFP_ATOMIC); + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + + return SCTP_DISPOSITION_CONSUME; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Generate and sendout a heartbeat packet. */ +static sctp_disposition_t sctp_sf_heartbeat(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = (struct sctp_transport *) arg; + struct sctp_chunk *reply; + sctp_sender_hb_info_t hbinfo; + size_t paylen = 0; + + hbinfo.param_hdr.type = SCTP_PARAM_HEARTBEAT_INFO; + hbinfo.param_hdr.length = htons(sizeof(sctp_sender_hb_info_t)); + hbinfo.daddr = transport->ipaddr; + hbinfo.sent_at = jiffies; + + /* Send a heartbeat to our peer. */ + paylen = sizeof(sctp_sender_hb_info_t); + reply = sctp_make_heartbeat(asoc, transport, &hbinfo, paylen); + if (!reply) + return SCTP_DISPOSITION_NOMEM; + + /* Set rto_pending indicating that an RTT measurement + * is started with this heartbeat chunk. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_RTO_PENDING, + SCTP_TRANSPORT(transport)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + return SCTP_DISPOSITION_CONSUME; +} + +/* Generate a HEARTBEAT packet on the given transport. */ +sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = (struct sctp_transport *) arg; + + if (asoc->overall_error_count > asoc->max_retrans) { + /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + + /* Section 3.3.5. + * The Sender-specific Heartbeat Info field should normally include + * information about the sender's current time when this HEARTBEAT + * chunk is sent and the destination transport address to which this + * HEARTBEAT is sent (see Section 8.3). + */ + + if (transport->hb_allowed) { + if (SCTP_DISPOSITION_NOMEM == + sctp_sf_heartbeat(ep, asoc, type, arg, + commands)) + return SCTP_DISPOSITION_NOMEM; + /* Set transport error counter and association error counter + * when sending heartbeat. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_RESET, + SCTP_TRANSPORT(transport)); + } + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMER_UPDATE, + SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process an heartbeat request. + * + * Section: 8.3 Path Heartbeat + * The receiver of the HEARTBEAT should immediately respond with a + * HEARTBEAT ACK that contains the Heartbeat Information field copied + * from the received HEARTBEAT chunk. 
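sctp_sf_heartbeat above stamps jiffies into hbinfo.sent_at and sets rto_pending so that, when the peer echoes the Heartbeat Info back, the sender can take an RTT sample from its own timestamp. The round-trip bookkeeping reduced to its core, using a wall clock instead of jiffies and invented names:

    #include <stdio.h>
    #include <time.h>

    struct hb_info {
        struct timespec sent_at;   /* stamped when the HEARTBEAT goes out */
    };

    static double elapsed_ms(struct timespec a, struct timespec b)
    {
        return (b.tv_sec - a.tv_sec) * 1000.0 +
               (b.tv_nsec - a.tv_nsec) / 1e6;
    }

    int main(void)
    {
        struct hb_info hb;
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &hb.sent_at);  /* send HEARTBEAT    */
        /* ... peer echoes hb back unchanged in a HEARTBEAT ACK ...        */
        clock_gettime(CLOCK_MONOTONIC, &now);         /* ACK arrives       */
        printf("RTT sample: %.3f ms\n", elapsed_ms(hb.sent_at, now));
        return 0;
    }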
+ * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * When receiving an SCTP packet, the endpoint MUST ensure that the + * value in the Verification Tag field of the received SCTP packet + * matches its own Tag. If the received Verification Tag value does not + * match the receiver's own tag value, the receiver shall silently + * discard the packet and shall not process it any further except for + * those cases listed in Section 8.5.1 below. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_beat_8_3(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *reply; + size_t paylen = 0; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the HEARTBEAT chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_heartbeat_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* 8.3 The receiver of the HEARTBEAT should immediately + * respond with a HEARTBEAT ACK that contains the Heartbeat + * Information field copied from the received HEARTBEAT chunk. + */ + chunk->subh.hb_hdr = (sctp_heartbeathdr_t *) chunk->skb->data; + paylen = ntohs(chunk->chunk_hdr->length) - sizeof(sctp_chunkhdr_t); + skb_pull(chunk->skb, paylen); + + reply = sctp_make_heartbeat_ack(asoc, chunk, + chunk->subh.hb_hdr, paylen); + if (!reply) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process the returning HEARTBEAT ACK. + * + * Section: 8.3 Path Heartbeat + * Upon the receipt of the HEARTBEAT ACK, the sender of the HEARTBEAT + * should clear the error counter of the destination transport + * address to which the HEARTBEAT was sent, and mark the destination + * transport address as active if it is not so marked. The endpoint may + * optionally report to the upper layer when an inactive destination + * address is marked as active due to the reception of the latest + * HEARTBEAT ACK. The receiver of the HEARTBEAT ACK must also + * clear the association overall error count as well (as defined + * in section 8.1). + * + * The receiver of the HEARTBEAT ACK should also perform an RTT + * measurement for that destination transport address using the time + * value carried in the HEARTBEAT ACK chunk. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + union sctp_addr from_addr; + struct sctp_transport *link; + sctp_sender_hb_info_t *hbinfo; + unsigned long max_interval; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the HEARTBEAT-ACK chunk has a valid length. 
*/ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_heartbeat_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + hbinfo = (sctp_sender_hb_info_t *) chunk->skb->data; + from_addr = hbinfo->daddr; + link = sctp_assoc_lookup_paddr(asoc, &from_addr); + + /* This should never happen, but lets log it if so. */ + if (!link) { + printk(KERN_WARNING + "%s: Could not find address %d.%d.%d.%d\n", + __FUNCTION__, NIPQUAD(from_addr.v4.sin_addr)); + return SCTP_DISPOSITION_DISCARD; + } + + max_interval = link->hb_interval + link->rto; + + /* Check if the timestamp looks valid. */ + if (time_after(hbinfo->sent_at, jiffies) || + time_after(jiffies, hbinfo->sent_at + max_interval)) { + SCTP_DEBUG_PRINTK("%s: HEARTBEAT ACK with invalid timestamp" + "received for transport: %p\n", + __FUNCTION__, link); + return SCTP_DISPOSITION_DISCARD; + } + + /* 8.3 Upon the receipt of the HEARTBEAT ACK, the sender of + * the HEARTBEAT should clear the error counter of the + * destination transport address to which the HEARTBEAT was + * sent and mark the destination transport address as active if + * it is not so marked. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSPORT_ON, SCTP_TRANSPORT(link)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* Helper function to send out an abort for the restart + * condition. + */ +static int sctp_sf_send_restart_abort(union sctp_addr *ssa, + struct sctp_chunk *init, + sctp_cmd_seq_t *commands) +{ + int len; + struct sctp_packet *pkt; + union sctp_addr_param *addrparm; + struct sctp_errhdr *errhdr; + struct sctp_endpoint *ep; + char buffer[sizeof(struct sctp_errhdr)+sizeof(union sctp_addr_param)]; + struct sctp_af *af = sctp_get_af_specific(ssa->v4.sin_family); + + /* Build the error on the stack. We are way to malloc crazy + * throughout the code today. + */ + errhdr = (struct sctp_errhdr *)buffer; + addrparm = (union sctp_addr_param *)errhdr->variable; + + /* Copy into a parm format. */ + len = af->to_addr_param(ssa, addrparm); + len += sizeof(sctp_errhdr_t); + + errhdr->cause = SCTP_ERROR_RESTART; + errhdr->length = htons(len); + + /* Assign to the control socket. */ + ep = sctp_sk((sctp_get_ctl_sock()))->ep; + + /* Association is NULL since this may be a restart attack and we + * want to send back the attacker's vtag. + */ + pkt = sctp_abort_pkt_new(ep, NULL, init, errhdr, len); + + if (!pkt) + goto out; + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, SCTP_PACKET(pkt)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + /* Discard the rest of the inbound packet. */ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); + +out: + /* Even if there is no memory, treat as a failure so + * the packet will get dropped. + */ + return 0; +} + +/* A restart is occurring, check to make sure no new addresses + * are being added as we may be under a takeover attack. + */ +static int sctp_sf_check_restart_addrs(const struct sctp_association *new_asoc, + const struct sctp_association *asoc, + struct sctp_chunk *init, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *new_addr, *addr; + struct list_head *pos, *pos2; + int found; + + /* Implementor's Guide - Sectin 5.2.2 + * ... + * Before responding the endpoint MUST check to see if the + * unexpected INIT adds new addresses to the association. If new + * addresses are added to the association, the endpoint MUST respond + * with an ABORT.. + */ + + /* Search through all current addresses and make sure + * we aren't adding any new ones. 
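The timestamp check in sctp_sf_backbeat_8_3 above discards a HEARTBEAT ACK whose echoed sent_at is either in the future or older than hb_interval + rto, the widest window in which a genuine reply to the last heartbeat can still arrive. The same window test stated on plain integers, ignoring the counter wrap-around that the kernel's time_after() handles:

    #include <stdio.h>

    /* Accept only if: sent_at <= now <= sent_at + hb_interval + rto. */
    static int hb_ack_fresh(unsigned long sent_at, unsigned long now,
                            unsigned long hb_interval, unsigned long rto)
    {
        unsigned long max_interval = hb_interval + rto;

        if (sent_at > now)                   /* stamped in the future? */
            return 0;
        if (now > sent_at + max_interval)    /* too old to be ours     */
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("%d\n", hb_ack_fresh(100, 150, 30, 40));  /* 1: inside window */
        printf("%d\n", hb_ack_fresh(100, 200, 30, 40));  /* 0: stale         */
        printf("%d\n", hb_ack_fresh(100,  90, 30, 40));  /* 0: future stamp  */
        return 0;
    }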
+ */ + new_addr = NULL; + found = 0; + + list_for_each(pos, &new_asoc->peer.transport_addr_list) { + new_addr = list_entry(pos, struct sctp_transport, transports); + found = 0; + list_for_each(pos2, &asoc->peer.transport_addr_list) { + addr = list_entry(pos2, struct sctp_transport, + transports); + if (sctp_cmp_addr_exact(&new_addr->ipaddr, + &addr->ipaddr)) { + found = 1; + break; + } + } + if (!found) + break; + } + + /* If a new address was added, ABORT the sender. */ + if (!found && new_addr) { + sctp_sf_send_restart_abort(&new_addr->ipaddr, init, commands); + } + + /* Return success if all addresses were found. */ + return found; +} + +/* Populate the verification/tie tags based on overlapping INIT + * scenario. + * + * Note: Do not use in CLOSED or SHUTDOWN-ACK-SENT state. + */ +static void sctp_tietags_populate(struct sctp_association *new_asoc, + const struct sctp_association *asoc) +{ + switch (asoc->state) { + + /* 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State */ + + case SCTP_STATE_COOKIE_WAIT: + new_asoc->c.my_vtag = asoc->c.my_vtag; + new_asoc->c.my_ttag = asoc->c.my_vtag; + new_asoc->c.peer_ttag = 0; + break; + + case SCTP_STATE_COOKIE_ECHOED: + new_asoc->c.my_vtag = asoc->c.my_vtag; + new_asoc->c.my_ttag = asoc->c.my_vtag; + new_asoc->c.peer_ttag = asoc->c.peer_vtag; + break; + + /* 5.2.2 Unexpected INIT in States Other than CLOSED, COOKIE-ECHOED, + * COOKIE-WAIT and SHUTDOWN-ACK-SENT + */ + default: + new_asoc->c.my_ttag = asoc->c.my_vtag; + new_asoc->c.peer_ttag = asoc->c.peer_vtag; + break; + }; + + /* Other parameters for the endpoint SHOULD be copied from the + * existing parameters of the association (e.g. number of + * outbound streams) into the INIT ACK and cookie. + */ + new_asoc->rwnd = asoc->rwnd; + new_asoc->c.sinit_num_ostreams = asoc->c.sinit_num_ostreams; + new_asoc->c.sinit_max_instreams = asoc->c.sinit_max_instreams; + new_asoc->c.initial_tsn = asoc->c.initial_tsn; +} + +/* + * Compare vtag/tietag values to determine unexpected COOKIE-ECHO + * handling action. + * + * RFC 2960 5.2.4 Handle a COOKIE ECHO when a TCB exists. + * + * Returns value representing action to be taken. These action values + * correspond to Action/Description values in RFC 2960, Table 2. + */ +static char sctp_tietags_compare(struct sctp_association *new_asoc, + const struct sctp_association *asoc) +{ + /* In this case, the peer may have restarted. */ + if ((asoc->c.my_vtag != new_asoc->c.my_vtag) && + (asoc->c.peer_vtag != new_asoc->c.peer_vtag) && + (asoc->c.my_vtag == new_asoc->c.my_ttag) && + (asoc->c.peer_vtag == new_asoc->c.peer_ttag)) + return 'A'; + + /* Collision case B. */ + if ((asoc->c.my_vtag == new_asoc->c.my_vtag) && + ((asoc->c.peer_vtag != new_asoc->c.peer_vtag) || + (0 == asoc->c.peer_vtag))) { + return 'B'; + } + + /* Collision case D. */ + if ((asoc->c.my_vtag == new_asoc->c.my_vtag) && + (asoc->c.peer_vtag == new_asoc->c.peer_vtag)) + return 'D'; + + /* Collision case C. */ + if ((asoc->c.my_vtag != new_asoc->c.my_vtag) && + (asoc->c.peer_vtag == new_asoc->c.peer_vtag) && + (0 == new_asoc->c.my_ttag) && + (0 == new_asoc->c.peer_ttag)) + return 'C'; + + /* No match to any of the special cases; discard this packet. */ + return 'E'; +} + +/* Common helper routine for both duplicate and simulataneous INIT + * chunk handling. 
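sctp_tietags_compare above classifies a duplicate COOKIE ECHO into RFC 2960 Table 2 actions 'A' through 'E' purely from four tag comparisons. The same decision table as a standalone function, convenient for reasoning about which sctp_sf_do_dupcook_*() handler would run; the struct layout is invented for the sketch:

    #include <stdio.h>
    #include <stdint.h>

    struct tags {
        uint32_t my_vtag, peer_vtag;   /* tags of the existing association */
        uint32_t my_ttag, peer_ttag;   /* tie-tags carried in the cookie   */
    };

    static char dupcook_action(struct tags cur, struct tags new)
    {
        if (cur.my_vtag != new.my_vtag && cur.peer_vtag != new.peer_vtag &&
            cur.my_vtag == new.my_ttag && cur.peer_vtag == new.peer_ttag)
            return 'A';                       /* peer restarted           */
        if (cur.my_vtag == new.my_vtag &&
            (cur.peer_vtag != new.peer_vtag || cur.peer_vtag == 0))
            return 'B';                       /* setup collision          */
        if (cur.my_vtag == new.my_vtag && cur.peer_vtag == new.peer_vtag)
            return 'D';                       /* both tags match          */
        if (cur.my_vtag != new.my_vtag && cur.peer_vtag == new.peer_vtag &&
            new.my_ttag == 0 && new.peer_ttag == 0)
            return 'C';                       /* our cookie arrived late  */
        return 'E';                           /* no match: discard        */
    }

    int main(void)
    {
        struct tags cur     = { 1, 2, 0, 0 };
        struct tags restart = { 9, 8, 1, 2 };     /* tie-tags echo old tags */

        printf("action: %c\n", dupcook_action(cur, restart));   /* A */
        return 0;
    }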
+ */ +static sctp_disposition_t sctp_sf_do_unexpected_init( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, sctp_cmd_seq_t *commands) +{ + sctp_disposition_t retval; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *repl; + struct sctp_association *new_asoc; + struct sctp_chunk *err_chunk; + struct sctp_packet *packet; + sctp_unrecognized_param_t *unk_param; + int len; + + /* 6.10 Bundling + * An endpoint MUST NOT bundle INIT, INIT ACK or + * SHUTDOWN COMPLETE with any other chunks. + * + * IG Section 2.11.2 + * Furthermore, we require that the receiver of an INIT chunk MUST + * enforce these rules by silently discarding an arriving packet + * with an INIT chunk that is bundled with other chunks. + */ + if (!chunk->singleton) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* 3.1 A packet containing an INIT chunk MUST have a zero Verification + * Tag. + */ + if (chunk->sctp_hdr->vtag != 0) + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + /* Make sure that the INIT chunk has a valid length. + * In this case, we generate a protocol violation since we have + * an association established. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_init_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + /* Grab the INIT header. */ + chunk->subh.init_hdr = (sctp_inithdr_t *) chunk->skb->data; + + /* Tag the variable length parameters. */ + chunk->param_hdr.v = skb_pull(chunk->skb, sizeof(sctp_inithdr_t)); + + /* Verify the INIT chunk before processing it. */ + err_chunk = NULL; + if (!sctp_verify_init(asoc, chunk->chunk_hdr->type, + (sctp_init_chunk_t *)chunk->chunk_hdr, chunk, + &err_chunk)) { + /* This chunk contains fatal error. It is to be discarded. + * Send an ABORT, with causes if there is any. + */ + if (err_chunk) { + packet = sctp_abort_pkt_new(ep, asoc, arg, + (__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t), + ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + if (packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + retval = SCTP_DISPOSITION_CONSUME; + } else { + retval = SCTP_DISPOSITION_NOMEM; + } + goto cleanup; + } else { + return sctp_sf_tabort_8_4_8(ep, asoc, type, arg, + commands); + } + } + + /* + * Other parameters for the endpoint SHOULD be copied from the + * existing parameters of the association (e.g. number of + * outbound streams) into the INIT ACK and cookie. + * FIXME: We are copying parameters from the endpoint not the + * association. + */ + new_asoc = sctp_make_temp_asoc(ep, chunk, GFP_ATOMIC); + if (!new_asoc) + goto nomem; + + /* In the outbound INIT ACK the endpoint MUST copy its current + * Verification Tag and Peers Verification tag into a reserved + * place (local tie-tag and per tie-tag) within the state cookie. + */ + if (!sctp_process_init(new_asoc, chunk->chunk_hdr->type, + sctp_source(chunk), + (sctp_init_chunk_t *)chunk->chunk_hdr, + GFP_ATOMIC)) { + retval = SCTP_DISPOSITION_NOMEM; + goto nomem_init; + } + + /* Make sure no new addresses are being added during the + * restart. Do not do this check for COOKIE-WAIT state, + * since there are no peer addresses to check against. + * Upon return an ABORT will have been sent if needed. 
+ */ + if (!sctp_state(asoc, COOKIE_WAIT)) { + if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, + commands)) { + retval = SCTP_DISPOSITION_CONSUME; + goto cleanup_asoc; + } + } + + sctp_tietags_populate(new_asoc, asoc); + + /* B) "Z" shall respond immediately with an INIT ACK chunk. */ + + /* If there are errors need to be reported for unknown parameters, + * make sure to reserve enough room in the INIT ACK for them. + */ + len = 0; + if (err_chunk) { + len = ntohs(err_chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t); + } + + if (sctp_assoc_set_bind_addr_from_ep(new_asoc, GFP_ATOMIC) < 0) + goto nomem; + + repl = sctp_make_init_ack(new_asoc, chunk, GFP_ATOMIC, len); + if (!repl) + goto nomem; + + /* If there are errors need to be reported for unknown parameters, + * include them in the outgoing INIT ACK as "Unrecognized parameter" + * parameter. + */ + if (err_chunk) { + /* Get the "Unrecognized parameter" parameter(s) out of the + * ERROR chunk generated by sctp_verify_init(). Since the + * error cause code for "unknown parameter" and the + * "Unrecognized parameter" type is the same, we can + * construct the parameters in INIT ACK by copying the + * ERROR causes over. + */ + unk_param = (sctp_unrecognized_param_t *) + ((__u8 *)(err_chunk->chunk_hdr) + + sizeof(sctp_chunkhdr_t)); + /* Replace the cause code with the "Unrecognized parameter" + * parameter type. + */ + sctp_addto_chunk(repl, len, unk_param); + } + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* + * Note: After sending out INIT ACK with the State Cookie parameter, + * "Z" MUST NOT allocate any resources for this new association. + * Otherwise, "Z" will be vulnerable to resource attacks. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + retval = SCTP_DISPOSITION_CONSUME; + +cleanup: + if (err_chunk) + sctp_chunk_free(err_chunk); + return retval; +nomem: + retval = SCTP_DISPOSITION_NOMEM; + goto cleanup; +nomem_init: +cleanup_asoc: + sctp_association_free(new_asoc); + goto cleanup; +} + +/* + * Handle simultanous INIT. + * This means we started an INIT and then we got an INIT request from + * our peer. + * + * Section: 5.2.1 INIT received in COOKIE-WAIT or COOKIE-ECHOED State (Item B) + * This usually indicates an initialization collision, i.e., each + * endpoint is attempting, at about the same time, to establish an + * association with the other endpoint. + * + * Upon receipt of an INIT in the COOKIE-WAIT or COOKIE-ECHOED state, an + * endpoint MUST respond with an INIT ACK using the same parameters it + * sent in its original INIT chunk (including its Verification Tag, + * unchanged). These original parameters are combined with those from the + * newly received INIT chunk. The endpoint shall also generate a State + * Cookie with the INIT ACK. The endpoint uses the parameters sent in its + * INIT to calculate the State Cookie. + * + * After that, the endpoint MUST NOT change its state, the T1-init + * timer shall be left running and the corresponding TCB MUST NOT be + * destroyed. The normal procedures for handling State Cookies when + * a TCB exists will resolve the duplicate INITs to a single association. + * + * For an endpoint that is in the COOKIE-ECHOED state it MUST populate + * its Tie-Tags with the Tag information of itself and its peer (see + * section 5.2.2 for a description of the Tie-Tags). 
+ * + * Verification Tag: Not explicit, but an INIT can not have a valid + * verification tag, so we skip the check. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_2_1_siminit(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Call helper to do the real work for both simulataneous and + * duplicate INIT chunk handling. + */ + return sctp_sf_do_unexpected_init(ep, asoc, type, arg, commands); +} + +/* + * Handle duplicated INIT messages. These are usually delayed + * restransmissions. + * + * Section: 5.2.2 Unexpected INIT in States Other than CLOSED, + * COOKIE-ECHOED and COOKIE-WAIT + * + * Unless otherwise stated, upon reception of an unexpected INIT for + * this association, the endpoint shall generate an INIT ACK with a + * State Cookie. In the outbound INIT ACK the endpoint MUST copy its + * current Verification Tag and peer's Verification Tag into a reserved + * place within the state cookie. We shall refer to these locations as + * the Peer's-Tie-Tag and the Local-Tie-Tag. The outbound SCTP packet + * containing this INIT ACK MUST carry a Verification Tag value equal to + * the Initiation Tag found in the unexpected INIT. And the INIT ACK + * MUST contain a new Initiation Tag (randomly generated see Section + * 5.3.1). Other parameters for the endpoint SHOULD be copied from the + * existing parameters of the association (e.g. number of outbound + * streams) into the INIT ACK and cookie. + * + * After sending out the INIT ACK, the endpoint shall take no further + * actions, i.e., the existing association, including its current state, + * and the corresponding TCB MUST NOT be changed. + * + * Note: Only when a TCB exists and the association is not in a COOKIE- + * WAIT state are the Tie-Tags populated. For a normal association INIT + * (i.e. the endpoint is in a COOKIE-WAIT state), the Tie-Tags MUST be + * set to 0 (indicating that no previous TCB existed). The INIT ACK and + * State Cookie are populated as specified in section 5.2.1. + * + * Verification Tag: Not specified, but an INIT has no way of knowing + * what the verification tag could be, so we ignore it. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_2_2_dupinit(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Call helper to do the real work for both simulataneous and + * duplicate INIT chunk handling. + */ + return sctp_sf_do_unexpected_init(ep, asoc, type, arg, commands); +} + + + +/* Unexpected COOKIE-ECHO handler for peer restart (Table 2, action 'A') + * + * Section 5.2.4 + * A) In this case, the peer may have restarted. + */ +static sctp_disposition_t sctp_sf_do_dupcook_a(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + sctp_init_chunk_t *peer_init; + struct sctp_ulpevent *ev; + struct sctp_chunk *repl; + struct sctp_chunk *err; + sctp_disposition_t disposition; + + /* new_asoc is a brand-new association, so these are not yet + * side effects--it is safe to run them here. 
+ */ + peer_init = &chunk->subh.cookie_hdr->c.peer_init[0]; + + if (!sctp_process_init(new_asoc, chunk->chunk_hdr->type, + sctp_source(chunk), peer_init, + GFP_ATOMIC)) + goto nomem; + + /* Make sure no new addresses are being added during the + * restart. Though this is a pretty complicated attack + * since you'd have to get inside the cookie. + */ + if (!sctp_sf_check_restart_addrs(new_asoc, asoc, chunk, commands)) { + return SCTP_DISPOSITION_CONSUME; + } + + /* If the endpoint is in the SHUTDOWN-ACK-SENT state and recognizes + * the peer has restarted (Action A), it MUST NOT setup a new + * association but instead resend the SHUTDOWN ACK and send an ERROR + * chunk with a "Cookie Received while Shutting Down" error cause to + * its peer. + */ + if (sctp_state(asoc, SHUTDOWN_ACK_SENT)) { + disposition = sctp_sf_do_9_2_reshutack(ep, asoc, + SCTP_ST_CHUNK(chunk->chunk_hdr->type), + chunk, commands); + if (SCTP_DISPOSITION_NOMEM == disposition) + goto nomem; + + err = sctp_make_op_error(asoc, chunk, + SCTP_ERROR_COOKIE_IN_SHUTDOWN, + NULL, 0); + if (err) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err)); + + return SCTP_DISPOSITION_CONSUME; + } + + /* For now, fail any unsent/unacked data. Consider the optional + * choice of resending of this data. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PURGE_OUTQUEUE, SCTP_NULL()); + + /* Update the content of current association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + + /* Report association restart to upper layer. */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_RESTART, 0, + new_asoc->c.sinit_num_ostreams, + new_asoc->c.sinit_max_instreams, + GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + return SCTP_DISPOSITION_CONSUME; + +nomem_ev: + sctp_chunk_free(repl); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'B') + * + * Section 5.2.4 + * B) In this case, both sides may be attempting to start an association + * at about the same time but the peer endpoint started its INIT + * after responding to the local endpoint's INIT + */ +/* This case represents an initialization collision. */ +static sctp_disposition_t sctp_sf_do_dupcook_b(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + sctp_init_chunk_t *peer_init; + struct sctp_ulpevent *ev; + struct sctp_chunk *repl; + + /* new_asoc is a brand-new association, so these are not yet + * side effects--it is safe to run them here. + */ + peer_init = &chunk->subh.cookie_hdr->c.peer_init[0]; + if (!sctp_process_init(new_asoc, chunk->chunk_hdr->type, + sctp_source(chunk), peer_init, + GFP_ATOMIC)) + goto nomem; + + /* Update the content of current association. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_UPDATE_ASSOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, SCTP_NULL()); + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSMIT, SCTP_NULL()); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * D) IMPLEMENTATION NOTE: An implementation may choose to + * send the Communication Up notification to the SCTP user + * upon reception of a valid COOKIE ECHO chunk. + */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_COMM_UP, 0, + new_asoc->c.sinit_num_ostreams, + new_asoc->c.sinit_max_instreams, + GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaption Layer Indication parameter , SCTP + * delivers this notification to inform the application that of the + * peers requested adaption layer. + */ + if (asoc->peer.adaption_ind) { + ev = sctp_ulpevent_make_adaption_indication(asoc, GFP_ATOMIC); + if (!ev) + goto nomem_ev; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + + return SCTP_DISPOSITION_CONSUME; + +nomem_ev: + sctp_chunk_free(repl); +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Unexpected COOKIE-ECHO handler for setup collision (Table 2, action 'C') + * + * Section 5.2.4 + * C) In this case, the local endpoint's cookie has arrived late. + * Before it arrived, the local endpoint sent an INIT and received an + * INIT-ACK and finally sent a COOKIE ECHO with the peer's same tag + * but a new tag of its own. + */ +/* This case represents an initialization collision. */ +static sctp_disposition_t sctp_sf_do_dupcook_c(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + /* The cookie should be silently discarded. + * The endpoint SHOULD NOT change states and should leave + * any timers running. + */ + return SCTP_DISPOSITION_DISCARD; +} + +/* Unexpected COOKIE-ECHO handler lost chunk (Table 2, action 'D') + * + * Section 5.2.4 + * + * D) When both local and remote tags match the endpoint should always + * enter the ESTABLISHED state, if it has not already done so. + */ +/* This case represents an initialization collision. */ +static sctp_disposition_t sctp_sf_do_dupcook_d(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_association *new_asoc) +{ + struct sctp_ulpevent *ev = NULL; + struct sctp_chunk *repl; + + /* Clarification from Implementor's Guide: + * D) When both local and remote tags match the endpoint should + * enter the ESTABLISHED state, if it is in the COOKIE-ECHOED state. + * It should stop any cookie timer that may be running and send + * a COOKIE ACK. + */ + + /* Don't accidentally move back into established state. 
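+	 * The numeric ordering of the sctp_state_t values is what makes this
+	 * work: for an association already in ESTABLISHED or a later state
+	 * (e.g. SHUTDOWN-PENDING) the test below is false, so no second
+	 * COMM_UP event is generated and only the COOKIE ACK is (re)sent.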
*/ + if (asoc->state < SCTP_STATE_ESTABLISHED) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_ESTABLISHED)); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_START, + SCTP_NULL()); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * D) IMPLEMENTATION NOTE: An implementation may choose + * to send the Communication Up notification to the + * SCTP user upon reception of a valid COOKIE + * ECHO chunk. + */ + ev = sctp_ulpevent_make_assoc_change(new_asoc, 0, + SCTP_COMM_UP, 0, + new_asoc->c.sinit_num_ostreams, + new_asoc->c.sinit_max_instreams, + GFP_ATOMIC); + if (!ev) + goto nomem; + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + + /* Sockets API Draft Section 5.3.1.6 + * When a peer sends a Adaption Layer Indication parameter, + * SCTP delivers this notification to inform the application + * that of the peers requested adaption layer. + */ + if (new_asoc->peer.adaption_ind) { + ev = sctp_ulpevent_make_adaption_indication(new_asoc, + GFP_ATOMIC); + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev)); + } + } + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSMIT, SCTP_NULL()); + + repl = sctp_make_cookie_ack(new_asoc, chunk); + if (!repl) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + sctp_add_cmd_sf(commands, SCTP_CMD_TRANSMIT, SCTP_NULL()); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + if (ev) + sctp_ulpevent_free(ev); + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Handle a duplicate COOKIE-ECHO. This usually means a cookie-carrying + * chunk was retransmitted and then delayed in the network. + * + * Section: 5.2.4 Handle a COOKIE ECHO when a TCB exists + * + * Verification Tag: None. Do cookie validation. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_5_2_4_dupcook(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_disposition_t retval; + struct sctp_chunk *chunk = arg; + struct sctp_association *new_asoc; + int error = 0; + char action; + struct sctp_chunk *err_chk_p; + + /* Make sure that the chunk has a valid length from the protocol + * perspective. In this case check to make sure we have at least + * enough for the chunk header. Cookie length verification is + * done later. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* "Decode" the chunk. We have no optional parameters so we + * are in good shape. + */ + chunk->subh.cookie_hdr = (struct sctp_signed_cookie *)chunk->skb->data; + skb_pull(chunk->skb, ntohs(chunk->chunk_hdr->length) - + sizeof(sctp_chunkhdr_t)); + + /* In RFC 2960 5.2.4 3, if both Verification Tags in the State Cookie + * of a duplicate COOKIE ECHO match the Verification Tags of the + * current association, consider the State Cookie valid even if + * the lifespan is exceeded. + */ + new_asoc = sctp_unpack_cookie(ep, asoc, chunk, GFP_ATOMIC, &error, + &err_chk_p); + + /* FIXME: + * If the re-build failed, what is the proper error path + * from here? + * + * [We should abort the association. --piggy] + */ + if (!new_asoc) { + /* FIXME: Several errors are possible. 
A bad cookie should + * be silently discarded, but think about logging it too. + */ + switch (error) { + case -SCTP_IERROR_NOMEM: + goto nomem; + + case -SCTP_IERROR_STALE_COOKIE: + sctp_send_stale_cookie_err(ep, asoc, chunk, commands, + err_chk_p); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + case -SCTP_IERROR_BAD_SIG: + default: + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + }; + } + + /* Compare the tie_tag in cookie with the verification tag of + * current association. + */ + action = sctp_tietags_compare(new_asoc, asoc); + + switch (action) { + case 'A': /* Association restart. */ + retval = sctp_sf_do_dupcook_a(ep, asoc, chunk, commands, + new_asoc); + break; + + case 'B': /* Collision case B. */ + retval = sctp_sf_do_dupcook_b(ep, asoc, chunk, commands, + new_asoc); + break; + + case 'C': /* Collision case C. */ + retval = sctp_sf_do_dupcook_c(ep, asoc, chunk, commands, + new_asoc); + break; + + case 'D': /* Collision case D. */ + retval = sctp_sf_do_dupcook_d(ep, asoc, chunk, commands, + new_asoc); + break; + + default: /* Discard packet for all others. */ + retval = sctp_sf_pdiscard(ep, asoc, type, arg, commands); + break; + }; + + /* Delete the tempory new association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, SCTP_ASOC(new_asoc)); + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return retval; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process an ABORT. (SHUTDOWN-PENDING state) + * + * See sctp_sf_do_9_1_abort(). + */ +sctp_disposition_t sctp_sf_shutdown_pending_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Becasue the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Stop the T5-shutdown guard timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); +} + +/* + * Process an ABORT. (SHUTDOWN-SENT state) + * + * See sctp_sf_do_9_1_abort(). + */ +sctp_disposition_t sctp_sf_shutdown_sent_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Becasue the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. 
+ */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Stop the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + /* Stop the T5-shutdown guard timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return sctp_sf_do_9_1_abort(ep, asoc, type, arg, commands); +} + +/* + * Process an ABORT. (SHUTDOWN-ACK-SENT state) + * + * See sctp_sf_do_9_1_abort(). + */ +sctp_disposition_t sctp_sf_shutdown_ack_sent_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* The same T2 timer, so we should be able to use + * common function with the SHUTDOWN-SENT state. + */ + return sctp_sf_shutdown_sent_abort(ep, asoc, type, arg, commands); +} + +/* + * Handle an Error received in COOKIE_ECHOED state. + * + * Only handle the error type of stale COOKIE Error, the other errors will + * be ignored. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_cookie_echoed_err(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_errhdr_t *err; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ERROR chunk has a valid length. + * The parameter walking depends on this as well. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Process the error here */ + /* FUTURE FIXME: When PR-SCTP related and other optional + * parms are emitted, this will have to change to handle multiple + * errors. + */ + sctp_walk_errors(err, chunk->chunk_hdr) { + if (SCTP_ERROR_STALE_COOKIE == err->cause) + return sctp_sf_do_5_2_6_stale(ep, asoc, type, + arg, commands); + } + + /* It is possible to have malformed error causes, and that + * will cause us to end the walk early. However, since + * we are discarding the packet, there should be no adverse + * affects. + */ + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); +} + +/* + * Handle a Stale COOKIE Error + * + * Section: 5.2.6 Handle Stale COOKIE Error + * If the association is in the COOKIE-ECHOED state, the endpoint may elect + * one of the following three alternatives. + * ... + * 3) Send a new INIT chunk to the endpoint, adding a Cookie + * Preservative parameter requesting an extension to the lifetime of + * the State Cookie. When calculating the time extension, an + * implementation SHOULD use the RTT information measured based on the + * previous COOKIE ECHO / ERROR exchange, and should add no more + * than 1 second beyond the measured RTT, due to long State Cookie + * lifetimes making the endpoint more subject to a replay attack. + * + * Verification Tag: Not explicit, but safe to ignore. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. 
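+ *
+ * For illustration, the handler below turns the reported Measure of
+ * Staleness (usec) into the Suggested Cookie Life-span Increment (msec)
+ * roughly as
+ *
+ *	stale = (stale * 2) / 1000;
+ *	bht.lifespan_increment = htonl(stale);
+ *
+ * i.e. the staleness is doubled, to leave time for the retransmitted
+ * cookie, and converted from usec to msec, as the comments in the body
+ * spell out.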
+ */ +static sctp_disposition_t sctp_sf_do_5_2_6_stale(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + time_t stale; + sctp_cookie_preserve_param_t bht; + sctp_errhdr_t *err; + struct sctp_chunk *reply; + struct sctp_bind_addr *bp; + int attempts; + + attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1; + + if (attempts >= asoc->max_init_attempts) { + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_U32(SCTP_ERROR_STALE_COOKIE)); + return SCTP_DISPOSITION_DELETE_TCB; + } + + err = (sctp_errhdr_t *)(chunk->skb->data); + + /* When calculating the time extension, an implementation + * SHOULD use the RTT information measured based on the + * previous COOKIE ECHO / ERROR exchange, and should add no + * more than 1 second beyond the measured RTT, due to long + * State Cookie lifetimes making the endpoint more subject to + * a replay attack. + * Measure of Staleness's unit is usec. (1/1000000 sec) + * Suggested Cookie Life-span Increment's unit is msec. + * (1/1000 sec) + * In general, if you use the suggested cookie life, the value + * found in the field of measure of staleness should be doubled + * to give ample time to retransmit the new cookie and thus + * yield a higher probability of success on the reattempt. + */ + stale = ntohl(*(suseconds_t *)((u8 *)err + sizeof(sctp_errhdr_t))); + stale = (stale * 2) / 1000; + + bht.param_hdr.type = SCTP_PARAM_COOKIE_PRESERVATIVE; + bht.param_hdr.length = htons(sizeof(bht)); + bht.lifespan_increment = htonl(stale); + + /* Build that new INIT chunk. */ + bp = (struct sctp_bind_addr *) &asoc->base.bind_addr; + reply = sctp_make_init(asoc, bp, GFP_ATOMIC, sizeof(bht)); + if (!reply) + goto nomem; + + sctp_addto_chunk(reply, sizeof(bht), &bht); + + /* Clear peer's init_tag cached in assoc as we are sending a new INIT */ + sctp_add_cmd_sf(commands, SCTP_CMD_CLEAR_INIT_TAG, SCTP_NULL()); + + /* Stop pending T3-rtx and heartbeat timers */ + sctp_add_cmd_sf(commands, SCTP_CMD_T3_RTX_TIMERS_STOP, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL()); + + /* Delete non-primary peer ip addresses since we are transitioning + * back to the COOKIE-WAIT state + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DEL_NON_PRIMARY, SCTP_NULL()); + + /* If we've sent any data bundled with COOKIE-ECHO we will need to + * resend + */ + sctp_add_cmd_sf(commands, SCTP_CMD_RETRAN, + SCTP_TRANSPORT(asoc->peer.primary_path)); + + /* Cast away the const modifier, as we want to just + * rerun it through as a sideffect. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_COUNTER_INC, + SCTP_COUNTER(SCTP_COUNTER_INIT_ERROR)); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_COOKIE)); + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_COOKIE_WAIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process an ABORT. + * + * Section: 9.1 + * After checking the Verification Tag, the receiving endpoint shall + * remove the association from its record, and shall report the + * termination to its upper layer. 
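+ *
+ * (If the ABORT carries at least one error cause, the first cause code
+ * is what gets reported upward, roughly:
+ *
+ *	if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr))
+ *		error = ((sctp_errhdr_t *)chunk->skb->data)->cause;
+ *
+ * otherwise SCTP_ERROR_NO_ERROR is handed to SCTP_CMD_ASSOC_FAILED.)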
+ * + * Verification Tag: 8.5.1 Exceptions in Verification Tag Rules + * B) Rules for packet carrying ABORT: + * + * - The endpoint shall always fill in the Verification Tag field of the + * outbound packet with the destination endpoint's tag value if it + * is known. + * + * - If the ABORT is sent in response to an OOTB packet, the endpoint + * MUST follow the procedure described in Section 8.4. + * + * - The receiver MUST accept the packet if the Verification Tag + * matches either its own tag, OR the tag of its peer. Otherwise, the + * receiver MUST silently discard the packet and take no further + * action. + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_9_1_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + unsigned len; + __u16 error = SCTP_ERROR_NO_ERROR; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Becasue the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* See if we have an error cause code in the chunk. */ + len = ntohs(chunk->chunk_hdr->length); + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) + error = ((sctp_errhdr_t *)chunk->skb->data)->cause; + + /* ASSOC_FAILED will DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, SCTP_U32(error)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + return SCTP_DISPOSITION_ABORT; +} + +/* + * Process an ABORT. (COOKIE-WAIT state) + * + * See sctp_sf_do_9_1_abort() above. + */ +sctp_disposition_t sctp_sf_cookie_wait_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + unsigned len; + __u16 error = SCTP_ERROR_NO_ERROR; + + if (!sctp_vtag_verify_either(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ABORT chunk has a valid length. + * Since this is an ABORT chunk, we have to discard it + * because of the following text: + * RFC 2960, Section 3.3.7 + * If an endpoint receives an ABORT with a format error or for an + * association that doesn't exist, it MUST silently discard it. + * Becasue the length is "invalid", we can't really discard just + * as we do not know its true length. So, to be safe, discard the + * packet. + */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_abort_chunk_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* See if we have an error cause code in the chunk. 
*/ + len = ntohs(chunk->chunk_hdr->length); + if (len >= sizeof(struct sctp_chunkhdr) + sizeof(struct sctp_errhdr)) + error = ((sctp_errhdr_t *)chunk->skb->data)->cause; + + sctp_stop_t1_and_abort(commands, error); + return SCTP_DISPOSITION_ABORT; +} + +/* + * Process an incoming ICMP as an ABORT. (COOKIE-WAIT state) + */ +sctp_disposition_t sctp_sf_cookie_wait_icmp_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_stop_t1_and_abort(commands, SCTP_ERROR_NO_ERROR); + return SCTP_DISPOSITION_ABORT; +} + +/* + * Process an ABORT. (COOKIE-ECHOED state) + */ +sctp_disposition_t sctp_sf_cookie_echoed_abort(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* There is a single T1 timer, so we should be able to use + * common function with the COOKIE-WAIT state. + */ + return sctp_sf_cookie_wait_abort(ep, asoc, type, arg, commands); +} + +/* + * Stop T1 timer and abort association with "INIT failed". + * + * This is common code called by several sctp_sf_*_abort() functions above. + */ +void sctp_stop_t1_and_abort(sctp_cmd_seq_t *commands, __u16 error) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + /* CMD_INIT_FAILED will DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_U32(error)); +} + +/* + * sctp_sf_do_9_2_shut + * + * Section: 9.2 + * Upon the reception of the SHUTDOWN, the peer endpoint shall + * - enter the SHUTDOWN-RECEIVED state, + * + * - stop accepting new data from its SCTP user + * + * - verify, by checking the Cumulative TSN Ack field of the chunk, + * that all its outstanding DATA chunks have been received by the + * SHUTDOWN sender. + * + * Once an endpoint as reached the SHUTDOWN-RECEIVED state it MUST NOT + * send a SHUTDOWN in response to a ULP request. And should discard + * subsequent SHUTDOWN chunks. + * + * If there are still outstanding DATA chunks left, the SHUTDOWN + * receiver shall continue to follow normal data transmission + * procedures defined in Section 6 until all outstanding DATA chunks + * are acknowledged; however, the SHUTDOWN receiver MUST NOT accept + * new data from its SCTP user. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_9_2_shutdown(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_shutdownhdr_t *sdh; + sctp_disposition_t disposition; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, + sizeof(struct sctp_shutdown_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Convert the elaborate header. 
*/ + sdh = (sctp_shutdownhdr_t *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_shutdownhdr_t)); + chunk->subh.shutdown_hdr = sdh; + + /* Upon the reception of the SHUTDOWN, the peer endpoint shall + * - enter the SHUTDOWN-RECEIVED state, + * - stop accepting new data from its SCTP user + * + * [This is implicit in the new state.] + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_RECEIVED)); + disposition = SCTP_DISPOSITION_CONSUME; + + if (sctp_outq_is_empty(&asoc->outqueue)) { + disposition = sctp_sf_do_9_2_shutdown_ack(ep, asoc, type, + arg, commands); + } + + if (SCTP_DISPOSITION_NOMEM == disposition) + goto out; + + /* - verify, by checking the Cumulative TSN Ack field of the + * chunk, that all its outstanding DATA chunks have been + * received by the SHUTDOWN sender. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_CTSN, + SCTP_U32(chunk->subh.shutdown_hdr->cum_tsn_ack)); + + /* API 5.3.1.5 SCTP_SHUTDOWN_EVENT + * When a peer sends a SHUTDOWN, SCTP delivers this notification to + * inform the application that it should cease sending data. + */ + ev = sctp_ulpevent_make_shutdown_event(asoc, 0, GFP_ATOMIC); + if (!ev) { + disposition = SCTP_DISPOSITION_NOMEM; + goto out; + } + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + +out: + return disposition; +} + +/* RFC 2960 9.2 + * If an endpoint is in SHUTDOWN-ACK-SENT state and receives an INIT chunk + * (e.g., if the SHUTDOWN COMPLETE was lost) with source and destination + * transport addresses (either in the IP addresses or in the INIT chunk) + * that belong to this association, it should discard the INIT chunk and + * retransmit the SHUTDOWN ACK chunk. + */ +sctp_disposition_t sctp_sf_do_9_2_reshutack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = (struct sctp_chunk *) arg; + struct sctp_chunk *reply; + + /* Since we are not going to really process this INIT, there + * is no point in verifying chunk boundries. Just generate + * the SHUTDOWN ACK. + */ + reply = sctp_make_shutdown_ack(asoc, chunk); + if (NULL == reply) + goto nomem; + + /* Set the transport for the SHUTDOWN ACK chunk and the timeout for + * the T2-SHUTDOWN timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* and restart the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * sctp_sf_do_ecn_cwr + * + * Section: Appendix A: Explicit Congestion Notification + * + * CWR: + * + * RFC 2481 details a specific bit for a sender to send in the header of + * its next outbound TCP segment to indicate to its peer that it has + * reduced its congestion window. This is termed the CWR bit. For + * SCTP the same indication is made by including the CWR chunk. + * This chunk contains one data element, i.e. the TSN number that + * was sent in the ECNE chunk. This element represents the lowest + * TSN number in the datagram that was originally marked with the + * CE bit. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. 
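+ *
+ * The handler below only treats the CWR as answering our outstanding
+ * ECNE when it covers the last TSN we echoed, i.e.
+ *
+ *	if (TSN_lte(asoc->last_ecne_tsn, ntohl(cwr->lowest_tsn)))
+ *		queue SCTP_CMD_ECN_CWR to stop sending ECNE;
+ *
+ * an older CWR is consumed without any side effect.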
+ */ +sctp_disposition_t sctp_sf_do_ecn_cwr(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_cwrhdr_t *cwr; + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + cwr = (sctp_cwrhdr_t *) chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_cwrhdr_t)); + + cwr->lowest_tsn = ntohl(cwr->lowest_tsn); + + /* Does this CWR ack the last sent congestion notification? */ + if (TSN_lte(asoc->last_ecne_tsn, cwr->lowest_tsn)) { + /* Stop sending ECNE. */ + sctp_add_cmd_sf(commands, + SCTP_CMD_ECN_CWR, + SCTP_U32(cwr->lowest_tsn)); + } + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_sf_do_ecne + * + * Section: Appendix A: Explicit Congestion Notification + * + * ECN-Echo + * + * RFC 2481 details a specific bit for a receiver to send back in its + * TCP acknowledgements to notify the sender of the Congestion + * Experienced (CE) bit having arrived from the network. For SCTP this + * same indication is made by including the ECNE chunk. This chunk + * contains one data element, i.e. the lowest TSN associated with the IP + * datagram marked with the CE bit..... + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_ecne(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_ecnehdr_t *ecne; + struct sctp_chunk *chunk = arg; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_ecne_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + ecne = (sctp_ecnehdr_t *) chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_ecnehdr_t)); + + /* If this is a newer ECNE than the last CWR packet we sent out */ + sctp_add_cmd_sf(commands, SCTP_CMD_ECN_ECNE, + SCTP_U32(ntohl(ecne->lowest_tsn))); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Section: 6.2 Acknowledgement on Reception of DATA Chunks + * + * The SCTP endpoint MUST always acknowledge the reception of each valid + * DATA chunk. + * + * The guidelines on delayed acknowledgement algorithm specified in + * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an + * acknowledgement SHOULD be generated for at least every second packet + * (not every second DATA chunk) received, and SHOULD be generated within + * 200 ms of the arrival of any unacknowledged DATA chunk. In some + * situations it may be beneficial for an SCTP transmitter to be more + * conservative than the algorithms detailed in this document allow. + * However, an SCTP transmitter MUST NOT be more aggressive than the + * following algorithms allow. + * + * A SCTP receiver MUST NOT generate more than one SACK for every + * incoming packet, other than to update the offered window as the + * receiving application consumes new data. 
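+ *
+ * In the handler below this shows up as a delayed SACK whenever the chunk
+ * ends a packet that carried acceptable new data,
+ *
+ *	sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE());
+ *	sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
+ *			SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
+ *
+ * while a packet containing nothing but duplicates forces an immediate
+ * SACK with SCTP_FORCE().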
+ * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + int error; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + error = sctp_eat_data(asoc, chunk, commands ); + switch (error) { + case SCTP_IERROR_NO_ERROR: + break; + case SCTP_IERROR_HIGH_TSN: + case SCTP_IERROR_BAD_STREAM: + goto discard_noforce; + case SCTP_IERROR_DUP_TSN: + case SCTP_IERROR_IGNORE_TSN: + goto discard_force; + case SCTP_IERROR_NO_DATA: + goto consume; + default: + BUG(); + } + + if (asoc->autoclose) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + } + + /* If this is the last chunk in a packet, we need to count it + * toward sack generation. Note that we need to SACK every + * OTHER packet containing data chunks, EVEN IF WE DISCARD + * THEM. We elect to NOT generate SACK's if the chunk fails + * the verification tag test. + * + * RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks + * + * The SCTP endpoint MUST always acknowledge the reception of + * each valid DATA chunk. + * + * The guidelines on delayed acknowledgement algorithm + * specified in Section 4.2 of [RFC2581] SHOULD be followed. + * Specifically, an acknowledgement SHOULD be generated for at + * least every second packet (not every second DATA chunk) + * received, and SHOULD be generated within 200 ms of the + * arrival of any unacknowledged DATA chunk. In some + * situations it may be beneficial for an SCTP transmitter to + * be more conservative than the algorithms detailed in this + * document allow. However, an SCTP transmitter MUST NOT be + * more aggressive than the following algorithms allow. + */ + if (chunk->end_of_packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); + + /* Start the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + } + + return SCTP_DISPOSITION_CONSUME; + +discard_force: + /* RFC 2960 6.2 Acknowledgement on Reception of DATA Chunks + * + * When a packet arrives with duplicate DATA chunk(s) and with + * no new DATA chunk(s), the endpoint MUST immediately send a + * SACK with no delay. If a packet arrives with duplicate + * DATA chunk(s) bundled with new DATA chunks, the endpoint + * MAY immediately send a SACK. Normally receipt of duplicate + * DATA chunks will occur when the original SACK chunk was lost + * and the peer's RTO has expired. The duplicate TSN number(s) + * SHOULD be reported in the SACK as duplicate. + */ + /* In our case, we split the MAY SACK advice up whether or not + * the last chunk is a duplicate.' + */ + if (chunk->end_of_packet) + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + return SCTP_DISPOSITION_DISCARD; + +discard_noforce: + if (chunk->end_of_packet) { + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); + + /* Start the SACK timer. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + } + return SCTP_DISPOSITION_DISCARD; +consume: + return SCTP_DISPOSITION_CONSUME; + +} + +/* + * sctp_sf_eat_data_fast_4_4 + * + * Section: 4 (4) + * (4) In SHUTDOWN-SENT state the endpoint MUST acknowledge any received + * DATA chunks without delay. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_eat_data_fast_4_4(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + int error; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_data_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + error = sctp_eat_data(asoc, chunk, commands ); + switch (error) { + case SCTP_IERROR_NO_ERROR: + case SCTP_IERROR_HIGH_TSN: + case SCTP_IERROR_DUP_TSN: + case SCTP_IERROR_IGNORE_TSN: + case SCTP_IERROR_BAD_STREAM: + break; + case SCTP_IERROR_NO_DATA: + goto consume; + default: + BUG(); + } + + /* Go a head and force a SACK, since we are shutting down. */ + + /* Implementor's Guide. + * + * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately + * respond to each received packet containing one or more DATA chunk(s) + * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer + */ + if (chunk->end_of_packet) { + /* We must delay the chunk creation since the cumulative + * TSN has not been updated yet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + } + +consume: + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Section: 6.2 Processing a Received SACK + * D) Any time a SACK arrives, the endpoint performs the following: + * + * i) If Cumulative TSN Ack is less than the Cumulative TSN Ack Point, + * then drop the SACK. Since Cumulative TSN Ack is monotonically + * increasing, a SACK whose Cumulative TSN Ack is less than the + * Cumulative TSN Ack Point indicates an out-of-order SACK. + * + * ii) Set rwnd equal to the newly received a_rwnd minus the number + * of bytes still outstanding after processing the Cumulative TSN Ack + * and the Gap Ack Blocks. + * + * iii) If the SACK is missing a TSN that was previously + * acknowledged via a Gap Ack Block (e.g., the data receiver + * reneged on the data), then mark the corresponding DATA chunk + * as available for retransmit: Mark it as missing for fast + * retransmit as described in Section 7.2.4 and if no retransmit + * timer is running for the destination address to which the DATA + * chunk was originally transmitted, then T3-rtx is started for + * that destination address. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. 
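+ *
+ * Only rule i) is applied inline below; an out-of-order SACK is dropped
+ * with
+ *
+ *	if (TSN_lt(ctsn, asoc->ctsn_ack_point))
+ *		return SCTP_DISPOSITION_DISCARD;
+ *
+ * and rules ii) and iii) run later as part of the SCTP_CMD_PROCESS_SACK
+ * side effect.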
+ */ +sctp_disposition_t sctp_sf_eat_sack_6_2(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + sctp_sackhdr_t *sackh; + __u32 ctsn; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SACK chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_sack_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* Pull the SACK chunk from the data buffer */ + sackh = sctp_sm_pull_sack(chunk); + /* Was this a bogus SACK? */ + if (!sackh) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + chunk->subh.sack_hdr = sackh; + ctsn = ntohl(sackh->cum_tsn_ack); + + /* i) If Cumulative TSN Ack is less than the Cumulative TSN + * Ack Point, then drop the SACK. Since Cumulative TSN + * Ack is monotonically increasing, a SACK whose + * Cumulative TSN Ack is less than the Cumulative TSN Ack + * Point indicates an out-of-order SACK. + */ + if (TSN_lt(ctsn, asoc->ctsn_ack_point)) { + SCTP_DEBUG_PRINTK("ctsn %x\n", ctsn); + SCTP_DEBUG_PRINTK("ctsn_ack_point %x\n", asoc->ctsn_ack_point); + return SCTP_DISPOSITION_DISCARD; + } + + /* Return this SACK for further processing. */ + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_SACK, SCTP_SACKH(sackh)); + + /* Note: We do the rest of the work on the PROCESS_SACK + * sideeffect. + */ + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Generate an ABORT in response to a packet. + * + * Section: 8.4 Handle "Out of the blue" Packets + * + * 8) The receiver should respond to the sender of the OOTB packet + * with an ABORT. When sending the ABORT, the receiver of the + * OOTB packet MUST fill in the Verification Tag field of the + * outbound packet with the value found in the Verification Tag + * field of the OOTB packet and set the T-bit in the Chunk Flags + * to indicate that no TCB was found. After sending this ABORT, + * the receiver of the OOTB packet shall discard the OOTB packet + * and take no further action. + * + * Verification Tag: + * + * The return value is the disposition of the chunk. +*/ +sctp_disposition_t sctp_sf_tabort_8_4_8(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *abort; + + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (packet) { + /* Make an ABORT. The T bit will be set if the asoc + * is NULL. + */ + abort = sctp_make_abort(asoc, chunk, 0); + if (!abort) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } + + /* Set the skb to the belonging sock for accounting. */ + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + return SCTP_DISPOSITION_CONSUME; + } + + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Received an ERROR chunk from peer. Generate SCTP_REMOTE_ERROR + * event as ULP notification for each cause included in the chunk. + * + * API 5.3.1.3 - SCTP_REMOTE_ERROR + * + * The return value is the disposition of the chunk. 
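+ *
+ * The handler walks the causes one at a time,
+ *
+ *	while (chunk->chunk_end > chunk->skb->data)
+ *		make one SCTP_REMOTE_ERROR ulpevent and queue
+ *		SCTP_CMD_PROCESS_OPERR;
+ *
+ * so a single ERROR chunk can produce several notifications.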
+*/ +sctp_disposition_t sctp_sf_operr_notify(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the ERROR chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_operr_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + while (chunk->chunk_end > chunk->skb->data) { + ev = sctp_ulpevent_make_remote_error(asoc, chunk, 0, + GFP_ATOMIC); + if (!ev) + goto nomem; + + if (!sctp_add_cmd(commands, SCTP_CMD_EVENT_ULP, + SCTP_ULPEVENT(ev))) { + sctp_ulpevent_free(ev); + goto nomem; + } + + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_OPERR, + SCTP_CHUNK(chunk)); + } + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process an inbound SHUTDOWN ACK. + * + * From Section 9.2: + * Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall + * stop the T2-shutdown timer, send a SHUTDOWN COMPLETE chunk to its + * peer, and remove all record of the association. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_2_final(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *reply; + struct sctp_ulpevent *ev; + + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN_ACK chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + /* 10.2 H) SHUTDOWN COMPLETE notification + * + * When SCTP completes the shutdown procedures (section 9.2) this + * notification is passed to the upper layer. + */ + ev = sctp_ulpevent_make_assoc_change(asoc, 0, SCTP_SHUTDOWN_COMP, + 0, 0, 0, GFP_ATOMIC); + if (!ev) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, SCTP_ULPEVENT(ev)); + + /* Upon the receipt of the SHUTDOWN ACK, the SHUTDOWN sender shall + * stop the T2-shutdown timer, + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + /* ...send a SHUTDOWN COMPLETE chunk to its peer, */ + reply = sctp_make_shutdown_complete(asoc, chunk); + if (!reply) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + /* ...and remove all record of the association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + return SCTP_DISPOSITION_DELETE_TCB; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * RFC 2960, 8.4 - Handle "Out of the blue" Packets + * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should + * respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE. 
+ * When sending the SHUTDOWN COMPLETE, the receiver of the OOTB + * packet must fill in the Verification Tag field of the outbound + * packet with the Verification Tag received in the SHUTDOWN ACK and + * set the T-bit in the Chunk Flags to indicate that no TCB was + * found. Otherwise, + * + * 8) The receiver should respond to the sender of the OOTB packet with + * an ABORT. When sending the ABORT, the receiver of the OOTB packet + * MUST fill in the Verification Tag field of the outbound packet + * with the value found in the Verification Tag field of the OOTB + * packet and set the T-bit in the Chunk Flags to indicate that no + * TCB was found. After sending this ABORT, the receiver of the OOTB + * packet shall discard the OOTB packet and take no further action. + */ +sctp_disposition_t sctp_sf_ootb(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sk_buff *skb = chunk->skb; + sctp_chunkhdr_t *ch; + __u8 *ch_end; + int ootb_shut_ack = 0; + + SCTP_INC_STATS(SCTP_MIB_OUTOFBLUES); + + ch = (sctp_chunkhdr_t *) chunk->chunk_hdr; + do { + /* Break out if chunk length is less then minimal. */ + if (ntohs(ch->length) < sizeof(sctp_chunkhdr_t)) + break; + + ch_end = ((__u8 *)ch) + WORD_ROUND(ntohs(ch->length)); + + if (SCTP_CID_SHUTDOWN_ACK == ch->type) + ootb_shut_ack = 1; + + /* RFC 2960, Section 3.3.7 + * Moreover, under any circumstances, an endpoint that + * receives an ABORT MUST NOT respond to that ABORT by + * sending an ABORT of its own. + */ + if (SCTP_CID_ABORT == ch->type) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + ch = (sctp_chunkhdr_t *) ch_end; + } while (ch_end < skb->tail); + + if (ootb_shut_ack) + sctp_sf_shut_8_4_5(ep, asoc, type, arg, commands); + else + sctp_sf_tabort_8_4_8(ep, asoc, type, arg, commands); + + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); +} + +/* + * Handle an "Out of the blue" SHUTDOWN ACK. + * + * Section: 8.4 5) + * 5) If the packet contains a SHUTDOWN ACK chunk, the receiver should + * respond to the sender of the OOTB packet with a SHUTDOWN COMPLETE. + * When sending the SHUTDOWN COMPLETE, the receiver of the OOTB packet + * must fill in the Verification Tag field of the outbound packet with + * the Verification Tag received in the SHUTDOWN ACK and set the + * T-bit in the Chunk Flags to indicate that no TCB was found. + * + * Inputs + * (endpoint, asoc, type, arg, commands) + * + * Outputs + * (sctp_disposition_t) + * + * The return value is the disposition of the chunk. + */ +static sctp_disposition_t sctp_sf_shut_8_4_5(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_packet *packet = NULL; + struct sctp_chunk *chunk = arg; + struct sctp_chunk *shut; + + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (packet) { + /* Make an SHUTDOWN_COMPLETE. + * The T bit will be set if the asoc is NULL. + */ + shut = sctp_make_shutdown_complete(asoc, chunk); + if (!shut) { + sctp_ootb_pkt_free(packet); + return SCTP_DISPOSITION_NOMEM; + } + + /* Set the skb to the belonging sock for accounting. */ + shut->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, shut); + + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + /* If the chunk length is invalid, we don't want to process + * the reset of the packet. 
+ */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + return SCTP_DISPOSITION_CONSUME; + } + + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Handle SHUTDOWN ACK in COOKIE_ECHOED or COOKIE_WAIT state. + * + * Verification Tag: 8.5.1 E) Rules for packet carrying a SHUTDOWN ACK + * If the receiver is in COOKIE-ECHOED or COOKIE-WAIT state the + * procedures in section 8.4 SHOULD be followed, in other words it + * should be treated as an Out Of The Blue packet. + * [This means that we do NOT check the Verification Tag on these + * chunks. --piggy ] + * + */ +sctp_disposition_t sctp_sf_do_8_5_1_E_sa(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Although we do have an association in this case, it corresponds + * to a restarted association. So the packet is treated as an OOTB + * packet and the state function that handles OOTB SHUTDOWN_ACK is + * called with a NULL association. + */ + return sctp_sf_shut_8_4_5(ep, NULL, type, arg, commands); +} + +/* ADDIP Section 4.2 Upon reception of an ASCONF Chunk. */ +sctp_disposition_t sctp_sf_do_asconf(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *asconf_ack = NULL; + sctp_addiphdr_t *hdr; + __u32 serial; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the ASCONF ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(sctp_addip_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + hdr = (sctp_addiphdr_t *)chunk->skb->data; + serial = ntohl(hdr->serial); + + /* ADDIP 4.2 C1) Compare the value of the serial number to the value + * the endpoint stored in a new association variable + * 'Peer-Serial-Number'. + */ + if (serial == asoc->peer.addip_serial + 1) { + /* ADDIP 4.2 C2) If the value found in the serial number is + * equal to the ('Peer-Serial-Number' + 1), the endpoint MUST + * do V1-V5. + */ + asconf_ack = sctp_process_asconf((struct sctp_association *) + asoc, chunk); + if (!asconf_ack) + return SCTP_DISPOSITION_NOMEM; + } else if (serial == asoc->peer.addip_serial) { + /* ADDIP 4.2 C3) If the value found in the serial number is + * equal to the value stored in the 'Peer-Serial-Number' + * IMPLEMENTATION NOTE: As an optimization a receiver may wish + * to save the last ASCONF-ACK for some predetermined period of + * time and instead of re-processing the ASCONF (with the same + * serial number) it may just re-transmit the ASCONF-ACK. + */ + if (asoc->addip_last_asconf_ack) + asconf_ack = asoc->addip_last_asconf_ack; + else + return SCTP_DISPOSITION_DISCARD; + } else { + /* ADDIP 4.2 C4) Otherwise, the ASCONF Chunk is discarded since + * it must be either a stale packet or from an attacker. + */ + return SCTP_DISPOSITION_DISCARD; + } + + /* ADDIP 4.2 C5) In both cases C2 and C3 the ASCONF-ACK MUST be sent + * back to the source address contained in the IP header of the ASCONF + * being responded to. 
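+	 *
+	 * (To recap the serial handling above: a serial equal to
+	 * 'Peer-Serial-Number' + 1 is processed and a fresh ASCONF-ACK built,
+	 * a serial equal to 'Peer-Serial-Number' re-sends the cached
+	 * asoc->addip_last_asconf_ack, and anything else has already been
+	 * discarded as stale or forged.)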
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(asconf_ack)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * ADDIP Section 4.3 General rules for address manipulation + * When building TLV parameters for the ASCONF Chunk that will add or + * delete IP addresses the D0 to D13 rules should be applied: + */ +sctp_disposition_t sctp_sf_do_asconf_ack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *asconf_ack = arg; + struct sctp_chunk *last_asconf = asoc->addip_last_asconf; + struct sctp_chunk *abort; + sctp_addiphdr_t *addip_hdr; + __u32 sent_serial, rcvd_serial; + + if (!sctp_vtag_verify(asconf_ack, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the ADDIP chunk has a valid length. */ + if (!sctp_chunk_length_valid(asconf_ack, sizeof(sctp_addip_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + addip_hdr = (sctp_addiphdr_t *)asconf_ack->skb->data; + rcvd_serial = ntohl(addip_hdr->serial); + + if (last_asconf) { + addip_hdr = (sctp_addiphdr_t *)last_asconf->subh.addip_hdr; + sent_serial = ntohl(addip_hdr->serial); + } else { + sent_serial = asoc->addip_serial - 1; + } + + /* D0) If an endpoint receives an ASCONF-ACK that is greater than or + * equal to the next serial number to be used but no ASCONF chunk is + * outstanding the endpoint MUST ABORT the association. Note that a + * sequence number is greater than if it is no more than 2^^31-1 + * larger than the current sequence number (using serial arithmetic). + */ + if (ADDIP_SERIAL_gte(rcvd_serial, sent_serial + 1) && + !(asoc->addip_last_asconf)) { + abort = sctp_make_abort(asoc, asconf_ack, + sizeof(sctp_errhdr_t)); + if (abort) { + sctp_init_cause(abort, SCTP_ERROR_ASCONF_ACK, NULL, 0); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(abort)); + } + /* We are going to ABORT, so we might as well stop + * processing the rest of the chunks in the packet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_ASCONF_ACK)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_ABORT; + } + + if ((rcvd_serial == sent_serial) && asoc->addip_last_asconf) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + + if (!sctp_process_asconf_ack((struct sctp_association *)asoc, + asconf_ack)) + return SCTP_DISPOSITION_CONSUME; + + abort = sctp_make_abort(asoc, asconf_ack, + sizeof(sctp_errhdr_t)); + if (abort) { + sctp_init_cause(abort, SCTP_ERROR_RSRC_LOW, NULL, 0); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(abort)); + } + /* We are going to ABORT, so we might as well stop + * processing the rest of the chunks in the packet. 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_ASCONF_ACK)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_ABORT; + } + + return SCTP_DISPOSITION_DISCARD; +} + +/* + * PR-SCTP Section 3.6 Receiver Side Implementation of PR-SCTP + * + * When a FORWARD TSN chunk arrives, the data receiver MUST first update + * its cumulative TSN point to the value carried in the FORWARD TSN + * chunk, and then MUST further advance its cumulative TSN point locally + * if possible. + * After the above processing, the data receiver MUST stop reporting any + * missing TSNs earlier than or equal to the new cumulative TSN point. + * + * Verification Tag: 8.5 Verification Tag [Normal verification] + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_fwdtsn_hdr *fwdtsn_hdr; + __u16 len; + __u32 tsn; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the FORWARD_TSN chunk has valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data; + chunk->subh.fwdtsn_hdr = fwdtsn_hdr; + len = ntohs(chunk->chunk_hdr->length); + len -= sizeof(struct sctp_chunkhdr); + skb_pull(chunk->skb, len); + + tsn = ntohl(fwdtsn_hdr->new_cum_tsn); + SCTP_DEBUG_PRINTK("%s: TSN 0x%x.\n", __FUNCTION__, tsn); + + /* The TSN is too high--silently discard the chunk and count on it + * getting retransmitted later. + */ + if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) + goto discard_noforce; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); + if (len > sizeof(struct sctp_fwdtsn_hdr)) + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, + SCTP_CHUNK(chunk)); + + /* Count this as receiving DATA. */ + if (asoc->autoclose) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + } + + /* FIXME: For now send a SACK, but DATA processing may + * send another. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); + /* Start the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); + + return SCTP_DISPOSITION_CONSUME; + +discard_noforce: + return SCTP_DISPOSITION_DISCARD; +} + +sctp_disposition_t sctp_sf_eat_fwd_tsn_fast( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_fwdtsn_hdr *fwdtsn_hdr; + __u16 len; + __u32 tsn; + + if (!sctp_vtag_verify(chunk, asoc)) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_BAD_TAG, + SCTP_NULL()); + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + } + + /* Make sure that the FORWARD_TSN chunk has a valid length. 
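+	 * A valid length here means at least sizeof(struct sctp_fwdtsn_chunk);
+	 * a FORWARD TSN that also carries stream/sequence pairs is longer than
+	 * that, which is what the len > sizeof(struct sctp_fwdtsn_hdr) check
+	 * further down keys off before queueing SCTP_CMD_PROCESS_FWDTSN.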
*/ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_fwdtsn_chunk))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + fwdtsn_hdr = (struct sctp_fwdtsn_hdr *)chunk->skb->data; + chunk->subh.fwdtsn_hdr = fwdtsn_hdr; + len = ntohs(chunk->chunk_hdr->length); + len -= sizeof(struct sctp_chunkhdr); + skb_pull(chunk->skb, len); + + tsn = ntohl(fwdtsn_hdr->new_cum_tsn); + SCTP_DEBUG_PRINTK("%s: TSN 0x%x.\n", __FUNCTION__, tsn); + + /* The TSN is too high--silently discard the chunk and count on it + * getting retransmitted later. + */ + if (sctp_tsnmap_check(&asoc->peer.tsn_map, tsn) < 0) + goto gen_shutdown; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_FWDTSN, SCTP_U32(tsn)); + if (len > sizeof(struct sctp_fwdtsn_hdr)) + sctp_add_cmd_sf(commands, SCTP_CMD_PROCESS_FWDTSN, + SCTP_CHUNK(chunk)); + + /* Go a head and force a SACK, since we are shutting down. */ +gen_shutdown: + /* Implementor's Guide. + * + * While in SHUTDOWN-SENT state, the SHUTDOWN sender MUST immediately + * respond to each received packet containing one or more DATA chunk(s) + * with a SACK, a SHUTDOWN chunk, and restart the T2-shutdown timer + */ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SHUTDOWN, SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process an unknown chunk. + * + * Section: 3.2. Also, 2.1 in the implementor's guide. + * + * Chunk Types are encoded such that the highest-order two bits specify + * the action that must be taken if the processing endpoint does not + * recognize the Chunk Type. + * + * 00 - Stop processing this SCTP packet and discard it, do not process + * any further chunks within it. + * + * 01 - Stop processing this SCTP packet and discard it, do not process + * any further chunks within it, and report the unrecognized + * chunk in an 'Unrecognized Chunk Type'. + * + * 10 - Skip this chunk and continue processing. + * + * 11 - Skip this chunk and continue processing, but report in an ERROR + * Chunk using the 'Unrecognized Chunk Type' cause of error. + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_unk_chunk(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *unk_chunk = arg; + struct sctp_chunk *err_chunk; + sctp_chunkhdr_t *hdr; + + SCTP_DEBUG_PRINTK("Processing the unknown chunk id %d.\n", type.chunk); + + if (!sctp_vtag_verify(unk_chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the chunk has a valid length. + * Since we don't know the chunk type, we use a general + * chunkhdr structure to make a comparison. + */ + if (!sctp_chunk_length_valid(unk_chunk, sizeof(sctp_chunkhdr_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + + switch (type.chunk & SCTP_CID_ACTION_MASK) { + case SCTP_CID_ACTION_DISCARD: + /* Discard the packet. */ + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + break; + case SCTP_CID_ACTION_DISCARD_ERR: + /* Discard the packet. */ + sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Generate an ERROR chunk as response. 
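The 00/01/10/11 table above maps directly onto the top two bits of the chunk type. A small standalone decoder of that rule; the TOY_* constants are assumptions chosen to match the bit layout described in Section 3.2, not values copied from the patch:

#include <stdint.h>
#include <stdio.h>

#define TOY_ACTION_MASK        0xc0   /* top two bits of the chunk type */
#define TOY_ACTION_DISCARD     0x00
#define TOY_ACTION_DISCARD_ERR 0x40
#define TOY_ACTION_SKIP        0x80
#define TOY_ACTION_SKIP_ERR    0xc0

static const char *toy_unknown_chunk_action(uint8_t chunk_type)
{
        switch (chunk_type & TOY_ACTION_MASK) {
        case TOY_ACTION_DISCARD:     return "stop, discard packet";
        case TOY_ACTION_DISCARD_ERR: return "stop, discard, report error";
        case TOY_ACTION_SKIP:        return "skip chunk";
        default:                     return "skip chunk, report error";
        }
}

int main(void)
{
        printf("%s\n", toy_unknown_chunk_action(0x3f));   /* 00xxxxxx */
        printf("%s\n", toy_unknown_chunk_action(0xc1));   /* 11xxxxxx */
        return 0;
}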
*/ + hdr = unk_chunk->chunk_hdr; + err_chunk = sctp_make_op_error(asoc, unk_chunk, + SCTP_ERROR_UNKNOWN_CHUNK, hdr, + WORD_ROUND(ntohs(hdr->length))); + if (err_chunk) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err_chunk)); + } + return SCTP_DISPOSITION_CONSUME; + break; + case SCTP_CID_ACTION_SKIP: + /* Skip the chunk. */ + return SCTP_DISPOSITION_DISCARD; + break; + case SCTP_CID_ACTION_SKIP_ERR: + /* Generate an ERROR chunk as response. */ + hdr = unk_chunk->chunk_hdr; + err_chunk = sctp_make_op_error(asoc, unk_chunk, + SCTP_ERROR_UNKNOWN_CHUNK, hdr, + WORD_ROUND(ntohs(hdr->length))); + if (err_chunk) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err_chunk)); + } + /* Skip the chunk. */ + return SCTP_DISPOSITION_CONSUME; + break; + default: + break; + } + + return SCTP_DISPOSITION_DISCARD; +} + +/* + * Discard the chunk. + * + * Section: 0.2, 5.2.3, 5.2.5, 5.2.6, 6.0, 8.4.6, 8.5.1c, 9.2 + * [Too numerous to mention...] + * Verification Tag: No verification needed. + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_discard_chunk(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("Chunk %d is discarded\n", type.chunk); + return SCTP_DISPOSITION_DISCARD; +} + +/* + * Discard the whole packet. + * + * Section: 8.4 2) + * + * 2) If the OOTB packet contains an ABORT chunk, the receiver MUST + * silently discard the OOTB packet and take no further action. + * Otherwise, + * + * Verification Tag: No verification necessary + * + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_pdiscard(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); + + return SCTP_DISPOSITION_CONSUME; +} + + +/* + * The other end is violating protocol. + * + * Section: Not specified + * Verification Tag: Not specified + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (asoc, reply_msg, msg_up, timers, counters) + * + * We simply tag the chunk as a violation. The state machine will log + * the violation and continue. + */ +sctp_disposition_t sctp_sf_violation(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return SCTP_DISPOSITION_VIOLATION; +} + + +/* + * Handle a protocol violation when the chunk length is invalid. + * "Invalid" length is identified as smaller then the minimal length a + * given chunk can be. For example, a SACK chunk has invalid length + * if it's length is set to be smaller then the size of sctp_sack_chunk_t. + * + * We inform the other end by sending an ABORT with a Protocol Violation + * error code. + * + * Section: Not specified + * Verification Tag: Nothing to do + * Inputs + * (endpoint, asoc, chunk) + * + * Outputs + * (reply_msg, msg_up, counters) + * + * Generate an ABORT chunk and terminate the association. 
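An "invalid" length here means the value in the common chunk header is smaller than the minimum that chunk type requires. A self-contained sketch of that test, with hypothetical names rather than the kernel's sctp_chunk_length_valid:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>

struct toy_chunkhdr {        /* common chunk header on the wire */
        uint8_t  type;
        uint8_t  flags;
        uint16_t length;     /* network order, includes this header */
};

/* Reject a chunk whose declared length is below the minimum its type
 * requires or beyond what is actually left in the packet buffer. */
static int toy_chunk_length_valid(const struct toy_chunkhdr *ch,
                                  size_t required_min, size_t bytes_left)
{
        size_t len = ntohs(ch->length);

        return len >= required_min && len <= bytes_left;
}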
+ */ +sctp_disposition_t sctp_sf_violation_chunklen(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + struct sctp_chunk *abort = NULL; + char err_str[]="The following chunk had invalid length:"; + + /* Make the abort chunk. */ + abort = sctp_make_abort_violation(asoc, chunk, err_str, + sizeof(err_str)); + if (!abort) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + + if (asoc->state <= SCTP_STATE_COOKIE_ECHOED) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_U32(SCTP_ERROR_PROTO_VIOLATION)); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_PROTO_VIOLATION)); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + } + + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET, SCTP_NULL()); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + + return SCTP_DISPOSITION_ABORT; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/*************************************************************************** + * These are the state functions for handling primitive (Section 10) events. + ***************************************************************************/ +/* + * sctp_sf_do_prm_asoc + * + * Section: 10.1 ULP-to-SCTP + * B) Associate + * + * Format: ASSOCIATE(local SCTP instance name, destination transport addr, + * outbound stream count) + * -> association id [,destination transport addr list] [,outbound stream + * count] + * + * This primitive allows the upper layer to initiate an association to a + * specific peer endpoint. + * + * The peer endpoint shall be specified by one of the transport addresses + * which defines the endpoint (see Section 1.4). If the local SCTP + * instance has not been initialized, the ASSOCIATE is considered an + * error. + * [This is not relevant for the kernel implementation since we do all + * initialization at boot time. It we hadn't initialized we wouldn't + * get anywhere near this code.] + * + * An association id, which is a local handle to the SCTP association, + * will be returned on successful establishment of the association. If + * SCTP is not able to open an SCTP association with the peer endpoint, + * an error is returned. + * [In the kernel implementation, the struct sctp_association needs to + * be created BEFORE causing this primitive to run.] + * + * Other association parameters may be returned, including the + * complete destination transport addresses of the peer as well as the + * outbound stream count of the local endpoint. One of the transport + * address from the returned destination addresses will be selected by + * the local endpoint as default primary path for sending SCTP packets + * to this peer. The returned "destination transport addr list" can + * be used by the ULP to change the default primary path or to force + * sending a packet to a specific transport address. [All of this + * stuff happens when the INIT ACK arrives. This is a NON-BLOCKING + * function.] + * + * Mandatory attributes: + * + * o local SCTP instance name - obtained from the INITIALIZE operation. + * [This is the argument asoc.] + * o destination transport addr - specified as one of the transport + * addresses of the peer endpoint with which the association is to be + * established. + * [This is asoc->peer.active_path.] 
+ * o outbound stream count - the number of outbound streams the ULP + * would like to open towards this peer endpoint. + * [BUG: This is not currently implemented.] + * Optional attributes: + * + * None. + * + * The return value is a disposition. + */ +sctp_disposition_t sctp_sf_do_prm_asoc(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *repl; + + /* The comment below says that we enter COOKIE-WAIT AFTER + * sending the INIT, but that doesn't actually work in our + * implementation... + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_COOKIE_WAIT)); + + /* RFC 2960 5.1 Normal Establishment of an Association + * + * A) "A" first sends an INIT chunk to "Z". In the INIT, "A" + * must provide its Verification Tag (Tag_A) in the Initiate + * Tag field. Tag_A SHOULD be a random number in the range of + * 1 to 4294967295 (see 5.3.1 for Tag value selection). ... + */ + + repl = sctp_make_init(asoc, &asoc->base.bind_addr, GFP_ATOMIC, 0); + if (!repl) + goto nomem; + + /* Cast away the const modifier, as we want to just + * rerun it through as a sideffect. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_ASOC, + SCTP_ASOC((struct sctp_association *) asoc)); + + /* After sending the INIT, "A" starts the T1-init timer and + * enters the COOKIE-WAIT state. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Process the SEND primitive. + * + * Section: 10.1 ULP-to-SCTP + * E) Send + * + * Format: SEND(association id, buffer address, byte count [,context] + * [,stream id] [,life time] [,destination transport address] + * [,unorder flag] [,no-bundle flag] [,payload protocol-id] ) + * -> result + * + * This is the main method to send user data via SCTP. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o buffer address - the location where the user message to be + * transmitted is stored; + * + * o byte count - The size of the user data in number of bytes; + * + * Optional attributes: + * + * o context - an optional 32 bit integer that will be carried in the + * sending failure notification to the ULP if the transportation of + * this User Message fails. + * + * o stream id - to indicate which stream to send the data on. If not + * specified, stream 0 will be used. + * + * o life time - specifies the life time of the user data. The user data + * will not be sent by SCTP after the life time expires. This + * parameter can be used to avoid efforts to transmit stale + * user messages. SCTP notifies the ULP if the data cannot be + * initiated to transport (i.e. sent to the destination via SCTP's + * send primitive) within the life time variable. However, the + * user data will be transmitted if SCTP has attempted to transmit a + * chunk before the life time expired. + * + * o destination transport address - specified as one of the destination + * transport addresses of the peer endpoint to which this packet + * should be sent. Whenever possible, SCTP should use this destination + * transport address for sending the packets, instead of the current + * primary path. 
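The RFC 2960 5.1 A) text quoted in this function asks for an Initiate Tag anywhere in 1..4294967295, which amounts to drawing a random 32-bit value and rejecting zero. A hedged userspace sketch of only that step (rand() stands in for a real entropy source; pick_init_tag is a hypothetical helper):

#include <stdint.h>
#include <stdlib.h>
#include <time.h>

/* Illustrative only: pick a non-zero 32-bit verification tag. */
static uint32_t pick_init_tag(void)
{
        uint32_t tag;

        do {
                tag = ((uint32_t)rand() << 16) ^ (uint32_t)rand();
        } while (tag == 0);
        return tag;
}

int main(void)
{
        srand((unsigned)time(NULL));
        return pick_init_tag() != 0 ? 0 : 1;
}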
+ * + * o unorder flag - this flag, if present, indicates that the user + * would like the data delivered in an unordered fashion to the peer + * (i.e., the U flag is set to 1 on all DATA chunks carrying this + * message). + * + * o no-bundle flag - instructs SCTP not to bundle this user data with + * other outbound DATA chunks. SCTP MAY still bundle even when + * this flag is present, when faced with network congestion. + * + * o payload protocol-id - A 32 bit unsigned integer that is to be + * passed to the peer indicating the type of payload protocol data + * being transmitted. This value is passed as opaque data by SCTP. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_prm_send(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Process the SHUTDOWN primitive. + * + * Section: 10.1: + * C) Shutdown + * + * Format: SHUTDOWN(association id) + * -> result + * + * Gracefully closes an association. Any locally queued user data + * will be delivered to the peer. The association will be terminated only + * after the peer acknowledges all the SCTP packets sent. A success code + * will be returned on successful termination of the association. If + * attempting to terminate the association results in a failure, an error + * code shall be returned. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * Optional attributes: + * + * None. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_2_prm_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + int disposition; + + /* From 9.2 Shutdown of an Association + * Upon receipt of the SHUTDOWN primitive from its upper + * layer, the endpoint enters SHUTDOWN-PENDING state and + * remains there until all outstanding data has been + * acknowledged by its peer. The endpoint accepts no new data + * from its upper layer, but retransmits data to the far end + * if necessary to fill gaps. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING)); + + /* sctpimpguide-05 Section 2.12.2 + * The sender of the SHUTDOWN MAY also start an overall guard timer + * 'T5-shutdown-guard' to bound the overall time for shutdown sequence. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + disposition = SCTP_DISPOSITION_CONSUME; + if (sctp_outq_is_empty(&asoc->outqueue)) { + disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type, + arg, commands); + } + return disposition; +} + +/* + * Process the ABORT primitive. + * + * Section: 10.1: + * C) Abort + * + * Format: Abort(association id [, cause code]) + * -> result + * + * Ungracefully closes an association. Any locally queued user data + * will be discarded and an ABORT chunk is sent to the peer. A success code + * will be returned on successful abortion of the association. If + * attempting to abort the association results in a failure, an error + * code shall be returned. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * Optional attributes: + * + * o cause code - reason of the abort to be passed to the peer + * + * None. 
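The SHUTDOWN-primitive handling in this function boils down to: always move to SHUTDOWN-PENDING and arm the T5 guard timer, but only begin the real shutdown once the outqueue has drained. A toy model of that decision, with all names hypothetical:

#include <stdbool.h>

enum toy_state { TOY_ESTABLISHED, TOY_SHUTDOWN_PENDING, TOY_SHUTDOWN_SENT };

struct toy_assoc {
        enum toy_state state;
        unsigned outstanding_bytes;   /* user data not yet acked */
        bool t5_guard_running;
};

/* Returns true when the SHUTDOWN chunk itself may be sent now. */
static bool toy_prm_shutdown(struct toy_assoc *a)
{
        a->state = TOY_SHUTDOWN_PENDING;
        a->t5_guard_running = true;           /* overall guard timer */
        if (a->outstanding_bytes == 0) {
                a->state = TOY_SHUTDOWN_SENT; /* start_shutdown path */
                return true;
        }
        return false;                         /* wait for peer's SACKs */
}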
+ * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_1_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* From 9.1 Abort of an Association + * Upon receipt of the ABORT primitive from its upper + * layer, the endpoint enters CLOSED state and + * discard all outstanding data has been + * acknowledged by its peer. The endpoint accepts no new data + * from its upper layer, but retransmits data to the far end + * if necessary to fill gaps. + */ + struct msghdr *msg = arg; + struct sctp_chunk *abort; + sctp_disposition_t retval; + + retval = SCTP_DISPOSITION_CONSUME; + + /* Generate ABORT chunk to send the peer. */ + abort = sctp_make_abort_user(asoc, NULL, msg); + if (!abort) + retval = SCTP_DISPOSITION_NOMEM; + else + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + + /* Even if we can't send the ABORT due to low memory delete the + * TCB. This is a departure from our typical NOMEM handling. + */ + + /* Delete the established association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_USER_ABORT)); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + + return retval; +} + +/* We tried an illegal operation on an association which is closed. */ +sctp_disposition_t sctp_sf_error_closed(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, SCTP_ERROR(-EINVAL)); + return SCTP_DISPOSITION_CONSUME; +} + +/* We tried an illegal operation on an association which is shutting + * down. + */ +sctp_disposition_t sctp_sf_error_shutdown(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_ERROR, + SCTP_ERROR(-ESHUTDOWN)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_cookie_wait_prm_shutdown + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues a shutdown while in COOKIE_WAIT state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_wait_prm_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + SCTP_INC_STATS(SCTP_MIB_SHUTDOWNS); + + sctp_add_cmd_sf(commands, SCTP_CMD_DELETE_TCB, SCTP_NULL()); + + return SCTP_DISPOSITION_DELETE_TCB; +} + +/* + * sctp_cookie_echoed_prm_shutdown + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explcitly address this issue, but is the route through the + * state table when someone issues a shutdown while in COOKIE_ECHOED state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_echoed_prm_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, sctp_cmd_seq_t *commands) +{ + /* There is a single T1 timer, so we should be able to use + * common function with the COOKIE-WAIT state. 
+ */ + return sctp_sf_cookie_wait_prm_shutdown(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_cookie_wait_prm_abort + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues an abort while in COOKIE_WAIT state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_wait_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct msghdr *msg = arg; + struct sctp_chunk *abort; + sctp_disposition_t retval; + + /* Stop T1-init timer */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT)); + retval = SCTP_DISPOSITION_CONSUME; + + /* Generate ABORT chunk to send the peer */ + abort = sctp_make_abort_user(asoc, NULL, msg); + if (!abort) + retval = SCTP_DISPOSITION_NOMEM; + else + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort)); + + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_CLOSED)); + + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + + /* Even if we can't send the ABORT due to low memory delete the + * TCB. This is a departure from our typical NOMEM handling. + */ + + /* Delete the established association. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_U32(SCTP_ERROR_USER_ABORT)); + + return retval; +} + +/* + * sctp_sf_cookie_echoed_prm_abort + * + * Section: 4 Note: 3 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * The RFC does not explcitly address this issue, but is the route through the + * state table when someone issues an abort while in COOKIE_ECHOED state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_cookie_echoed_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* There is a single T1 timer, so we should be able to use + * common function with the COOKIE-WAIT state. + */ + return sctp_sf_cookie_wait_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_shutdown_pending_prm_abort + * + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues an abort while in SHUTDOWN-PENDING state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_shutdown_pending_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Stop the T5-shutdown guard timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return sctp_sf_do_9_1_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_shutdown_sent_prm_abort + * + * Inputs + * (endpoint, asoc) + * + * The RFC does not explicitly address this issue, but is the route through the + * state table when someone issues an abort while in SHUTDOWN-SENT state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_shutdown_sent_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* Stop the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + /* Stop the T5-shutdown guard timer. 
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + + return sctp_sf_do_9_1_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * sctp_sf_cookie_echoed_prm_abort + * + * Inputs + * (endpoint, asoc) + * + * The RFC does not explcitly address this issue, but is the route through the + * state table when someone issues an abort while in COOKIE_ECHOED state. + * + * Outputs + * (timers) + */ +sctp_disposition_t sctp_sf_shutdown_ack_sent_prm_abort( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + /* The same T2 timer, so we should be able to use + * common function with the SHUTDOWN-SENT state. + */ + return sctp_sf_shutdown_sent_prm_abort(ep, asoc, type, arg, commands); +} + +/* + * Process the REQUESTHEARTBEAT primitive + * + * 10.1 ULP-to-SCTP + * J) Request Heartbeat + * + * Format: REQUESTHEARTBEAT(association id, destination transport address) + * + * -> result + * + * Instructs the local endpoint to perform a HeartBeat on the specified + * destination transport address of the given association. The returned + * result should indicate whether the transmission of the HEARTBEAT + * chunk to the destination address is successful. + * + * Mandatory attributes: + * + * o association id - local handle to the SCTP association + * + * o destination transport address - the transport address of the + * association on which a heartbeat should be issued. + */ +sctp_disposition_t sctp_sf_do_prm_requestheartbeat( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return sctp_sf_heartbeat(ep, asoc, type, (struct sctp_transport *)arg, + commands); +} + +/* + * ADDIP Section 4.1 ASCONF Chunk Procedures + * When an endpoint has an ASCONF signaled change to be sent to the + * remote endpoint it should do A1 to A9 + */ +sctp_disposition_t sctp_sf_do_prm_asconf(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = arg; + + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk)); + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk)); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Ignore the primitive event + * + * The return value is the disposition of the primitive. + */ +sctp_disposition_t sctp_sf_ignore_primitive( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("Primitive type %d is ignored.\n", type.primitive); + return SCTP_DISPOSITION_DISCARD; +} + +/*************************************************************************** + * These are the state functions for the OTHER events. + ***************************************************************************/ + +/* + * Start the shutdown negotiation. + * + * From Section 9.2: + * Once all its outstanding data has been acknowledged, the endpoint + * shall send a SHUTDOWN chunk to its peer including in the Cumulative + * TSN Ack field the last sequential TSN it has received from the peer. + * It shall then start the T2-shutdown timer and enter the SHUTDOWN-SENT + * state. 
If the timer expires, the endpoint must re-send the SHUTDOWN + * with the updated last sequential TSN received from its peer. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_2_start_shutdown( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *reply; + + /* Once all its outstanding data has been acknowledged, the + * endpoint shall send a SHUTDOWN chunk to its peer including + * in the Cumulative TSN Ack field the last sequential TSN it + * has received from the peer. + */ + reply = sctp_make_shutdown(asoc, NULL); + if (!reply) + goto nomem; + + /* Set the transport for the SHUTDOWN chunk and the timeout for the + * T2-shutdown timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* It shall then start the T2-shutdown timer */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + if (asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + /* and enter the SHUTDOWN-SENT state. */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_SENT)); + + /* sctp-implguide 2.10 Issues with Heartbeating and failover + * + * HEARTBEAT ... is discontinued after sending either SHUTDOWN + * or SHUTDOWN-ACK. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL()); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Generate a SHUTDOWN ACK now that everything is SACK'd. + * + * From Section 9.2: + * + * If it has no more outstanding DATA chunks, the SHUTDOWN receiver + * shall send a SHUTDOWN ACK and start a T2-shutdown timer of its own, + * entering the SHUTDOWN-ACK-SENT state. If the timer expires, the + * endpoint must re-send the SHUTDOWN ACK. + * + * The return value is the disposition. + */ +sctp_disposition_t sctp_sf_do_9_2_shutdown_ack( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = (struct sctp_chunk *) arg; + struct sctp_chunk *reply; + + /* There are 2 ways of getting here: + * 1) called in response to a SHUTDOWN chunk + * 2) called when SCTP_EVENT_NO_PENDING_TSN event is issued. + * + * For the case (2), the arg parameter is set to NULL. We need + * to check that we have a chunk before accessing it's fields. + */ + if (chunk) { + if (!sctp_vtag_verify(chunk, asoc)) + return sctp_sf_pdiscard(ep, asoc, type, arg, commands); + + /* Make sure that the SHUTDOWN chunk has a valid length. */ + if (!sctp_chunk_length_valid(chunk, sizeof(struct sctp_shutdown_chunk_t))) + return sctp_sf_violation_chunklen(ep, asoc, type, arg, + commands); + } + + /* If it has no more outstanding DATA chunks, the SHUTDOWN receiver + * shall send a SHUTDOWN ACK ... + */ + reply = sctp_make_shutdown_ack(asoc, chunk); + if (!reply) + goto nomem; + + /* Set the transport for the SHUTDOWN ACK chunk and the timeout for + * the T2-shutdown timer. 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* and start/restart a T2-shutdown timer of its own, */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + + if (asoc->autoclose) + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_AUTOCLOSE)); + + /* Enter the SHUTDOWN-ACK-SENT state. */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_ACK_SENT)); + + /* sctp-implguide 2.10 Issues with Heartbeating and failover + * + * HEARTBEAT ... is discontinued after sending either SHUTDOWN + * or SHUTDOWN-ACK. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_HB_TIMERS_STOP, SCTP_NULL()); + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * Ignore the event defined as other + * + * The return value is the disposition of the event. + */ +sctp_disposition_t sctp_sf_ignore_other(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("The event other type %d is ignored\n", type.other); + return SCTP_DISPOSITION_DISCARD; +} + +/************************************************************ + * These are the state functions for handling timeout events. + ************************************************************/ + +/* + * RTX Timeout + * + * Section: 6.3.3 Handle T3-rtx Expiration + * + * Whenever the retransmission timer T3-rtx expires for a destination + * address, do the following: + * [See below] + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_do_6_3_3_rtx(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_transport *transport = arg; + + if (asoc->overall_error_count >= asoc->max_retrans) { + /* CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + + /* E1) For the destination address for which the timer + * expires, adjust its ssthresh with rules defined in Section + * 7.2.3 and set the cwnd <- MTU. + */ + + /* E2) For the destination address for which the timer + * expires, set RTO <- RTO * 2 ("back off the timer"). The + * maximum value discussed in rule C7 above (RTO.max) may be + * used to provide an upper bound to this doubling operation. + */ + + /* E3) Determine how many of the earliest (i.e., lowest TSN) + * outstanding DATA chunks for the address for which the + * T3-rtx has expired will fit into a single packet, subject + * to the MTU constraint for the path corresponding to the + * destination transport address to which the retransmission + * is being sent (this may be different from the address for + * which the timer expires [see Section 6.4]). Call this + * value K. Bundle and retransmit those K DATA chunks in a + * single packet to the destination endpoint. + * + * Note: Any DATA chunks that were sent to the address for + * which the T3-rtx timer expired but did not fit in one MTU + * (rule E3 above), should be marked for retransmission and + * sent as soon as cwnd allows (normally when a SACK arrives). + */ + + /* NB: Rules E4 and F1 are implicit in R1. 
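Rules E1 and E2 spelled out above have a compact arithmetic core: halve cwnd into ssthresh with a 4*MTU floor (RFC 2960 7.2.3), collapse cwnd to one MTU, and double RTO up to RTO.max. A sketch under those assumptions, using a toy struct rather than the kernel's sctp_transport:

#include <stdint.h>

struct toy_transport {
        uint32_t cwnd, ssthresh, mtu;    /* bytes */
        uint32_t rto, rto_max;           /* milliseconds */
};

/* E1: ssthresh = max(cwnd/2, 4*MTU), cwnd = 1*MTU (RFC 2960 7.2.3).
 * E2: RTO = min(RTO * 2, RTO.max) -- "back off the timer".          */
static void toy_t3_rtx_backoff(struct toy_transport *t)
{
        uint32_t half = t->cwnd / 2;
        uint32_t min_ssthresh = 4 * t->mtu;

        t->ssthresh = half > min_ssthresh ? half : min_ssthresh;
        t->cwnd = t->mtu;
        t->rto = (t->rto * 2 < t->rto_max) ? t->rto * 2 : t->rto_max;
}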
*/ + sctp_add_cmd_sf(commands, SCTP_CMD_RETRAN, SCTP_TRANSPORT(transport)); + + /* Do some failure management (Section 8.2). */ + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* + * Generate delayed SACK on timeout + * + * Section: 6.2 Acknowledgement on Reception of DATA Chunks + * + * The guidelines on delayed acknowledgement algorithm specified in + * Section 4.2 of [RFC2581] SHOULD be followed. Specifically, an + * acknowledgement SHOULD be generated for at least every second packet + * (not every second DATA chunk) received, and SHOULD be generated + * within 200 ms of the arrival of any unacknowledged DATA chunk. In + * some situations it may be beneficial for an SCTP transmitter to be + * more conservative than the algorithms detailed in this document + * allow. However, an SCTP transmitter MUST NOT be more aggressive than + * the following algorithms allow. + */ +sctp_disposition_t sctp_sf_do_6_2_sack(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_FORCE()); + return SCTP_DISPOSITION_CONSUME; +} + +/* + * sctp_sf_t1_timer_expire + * + * Section: 4 Note: 2 + * Verification Tag: + * Inputs + * (endpoint, asoc) + * + * RFC 2960 Section 4 Notes + * 2) If the T1-init timer expires, the endpoint MUST retransmit INIT + * and re-start the T1-init timer without changing state. This MUST + * be repeated up to 'Max.Init.Retransmits' times. After that, the + * endpoint MUST abort the initialization process and report the + * error to SCTP user. + * + * 3) If the T1-cookie timer expires, the endpoint MUST retransmit + * COOKIE ECHO and re-start the T1-cookie timer without changing + * state. This MUST be repeated up to 'Max.Init.Retransmits' times. + * After that, the endpoint MUST abort the initialization process and + * report the error to SCTP user. + * + * Outputs + * (timers, events) + * + */ +sctp_disposition_t sctp_sf_t1_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *repl; + struct sctp_bind_addr *bp; + sctp_event_timeout_t timer = (sctp_event_timeout_t) arg; + int timeout; + int attempts; + + timeout = asoc->timeouts[timer]; + attempts = asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1; + repl = NULL; + + SCTP_DEBUG_PRINTK("Timer T1 expired.\n"); + + if (attempts < asoc->max_init_attempts) { + switch (timer) { + case SCTP_EVENT_TIMEOUT_T1_INIT: + bp = (struct sctp_bind_addr *) &asoc->base.bind_addr; + repl = sctp_make_init(asoc, bp, GFP_ATOMIC, 0); + break; + + case SCTP_EVENT_TIMEOUT_T1_COOKIE: + repl = sctp_make_cookie_echo(asoc, NULL); + break; + + default: + BUG(); + break; + }; + + if (!repl) + goto nomem; + + /* Issue a sideeffect to do the needed accounting. */ + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_RESTART, + SCTP_TO(timer)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(repl)); + } else { + sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED, + SCTP_U32(SCTP_ERROR_NO_ERROR)); + return SCTP_DISPOSITION_DELETE_TCB; + } + + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* RFC2960 9.2 If the timer expires, the endpoint must re-send the SHUTDOWN + * with the updated last sequential TSN received from its peer. 
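The delayed-acknowledgement rule quoted for sctp_sf_do_6_2_sack (ack at least every second packet, and within 200 ms otherwise) reduces to a small decision helper; the 200 ms timer is represented only by a flag in this sketch:

#include <stdbool.h>

struct toy_sack_state {
        int packets_since_sack;   /* DATA-bearing packets not yet acked */
        bool delay_timer_armed;   /* the 200 ms delayed-SACK timer      */
};

/* Returns true when a SACK must go out immediately (every second
 * packet); otherwise the delay timer covers the DATA just received. */
static bool toy_data_packet_received(struct toy_sack_state *s)
{
        if (++s->packets_since_sack >= 2) {
                s->packets_since_sack = 0;
                s->delay_timer_armed = false;
                return true;
        }
        s->delay_timer_armed = true;
        return false;
}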
+ * + * An endpoint should limit the number of retransmissions of the + * SHUTDOWN chunk to the protocol parameter 'Association.Max.Retrans'. + * If this threshold is exceeded the endpoint should destroy the TCB and + * MUST report the peer endpoint unreachable to the upper layer (and + * thus the association enters the CLOSED state). The reception of any + * packet from its peer (i.e. as the peer sends all of its queued DATA + * chunks) should clear the endpoint's retransmission count and restart + * the T2-Shutdown timer, giving its peer ample opportunity to transmit + * all of its queued DATA chunks that have not yet been sent. + */ +sctp_disposition_t sctp_sf_t2_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *reply = NULL; + + SCTP_DEBUG_PRINTK("Timer T2 expired.\n"); + if (asoc->overall_error_count >= asoc->max_retrans) { + /* Note: CMD_ASSOC_FAILED calls CMD_DELETE_TCB. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_DELETE_TCB; + } + + switch (asoc->state) { + case SCTP_STATE_SHUTDOWN_SENT: + reply = sctp_make_shutdown(asoc, NULL); + break; + + case SCTP_STATE_SHUTDOWN_ACK_SENT: + reply = sctp_make_shutdown_ack(asoc, NULL); + break; + + default: + BUG(); + break; + }; + + if (!reply) + goto nomem; + + /* Do some failure management (Section 8.2). */ + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, + SCTP_TRANSPORT(asoc->shutdown_last_sent_to)); + + /* Set the transport for the SHUTDOWN/ACK chunk and the timeout for + * the T2-shutdown timer. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T2, SCTP_CHUNK(reply)); + + /* Restart the T2-shutdown timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN)); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + return SCTP_DISPOSITION_CONSUME; + +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* + * ADDIP Section 4.1 ASCONF CHunk Procedures + * If the T4 RTO timer expires the endpoint should do B1 to B5 + */ +sctp_disposition_t sctp_sf_t4_timer_expire( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *chunk = asoc->addip_last_asconf; + struct sctp_transport *transport = chunk->transport; + + /* ADDIP 4.1 B1) Increment the error counters and perform path failure + * detection on the appropriate destination address as defined in + * RFC2960 [5] section 8.1 and 8.2. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_STRIKE, SCTP_TRANSPORT(transport)); + + /* Reconfig T4 timer and transport. */ + sctp_add_cmd_sf(commands, SCTP_CMD_SETUP_T4, SCTP_CHUNK(chunk)); + + /* ADDIP 4.1 B2) Increment the association error counters and perform + * endpoint failure detection on the association as defined in + * RFC2960 [5] section 8.1 and 8.2. + * association error counter is incremented in SCTP_CMD_STRIKE. 
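The T2 expiry path above either gives up once 'Association.Max.Retrans' strikes have accumulated or resends whichever shutdown chunk this side owns and restarts T2. A compressed illustration; the states and strings are placeholders for the real commands:

enum toy_shut_state { TOY_T2_SHUTDOWN_SENT, TOY_T2_SHUTDOWN_ACK_SENT };

/* Either give up (threshold exceeded) or resend the chunk this side
 * is responsible for and restart the T2-shutdown timer. */
static const char *toy_t2_expired(enum toy_shut_state state,
                                  unsigned error_count,
                                  unsigned max_retrans)
{
        if (error_count >= max_retrans)
                return "destroy TCB, report peer unreachable";
        return state == TOY_T2_SHUTDOWN_SENT ?
               "resend SHUTDOWN, restart T2" :
               "resend SHUTDOWN ACK, restart T2";
}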
+ */ + if (asoc->overall_error_count >= asoc->max_retrans) { + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_NO_ERROR)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_INC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_DISPOSITION_ABORT; + } + + /* ADDIP 4.1 B3) Back-off the destination address RTO value to which + * the ASCONF chunk was sent by doubling the RTO timer value. + * This is done in SCTP_CMD_STRIKE. + */ + + /* ADDIP 4.1 B4) Re-transmit the ASCONF Chunk last sent and if possible + * choose an alternate destination address (please refer to RFC2960 + * [5] section 6.4.1). An endpoint MUST NOT add new parameters to this + * chunk, it MUST be the same (including its serial number) as the last + * ASCONF sent. + */ + sctp_chunk_hold(asoc->addip_last_asconf); + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(asoc->addip_last_asconf)); + + /* ADDIP 4.1 B5) Restart the T-4 RTO timer. Note that if a different + * destination is selected, then the RTO used will be that of the new + * destination address. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_T4_RTO)); + + return SCTP_DISPOSITION_CONSUME; +} + +/* sctpimpguide-05 Section 2.12.2 + * The sender of the SHUTDOWN MAY also start an overall guard timer + * 'T5-shutdown-guard' to bound the overall time for shutdown sequence. + * At the expiration of this timer the sender SHOULD abort the association + * by sending an ABORT chunk. + */ +sctp_disposition_t sctp_sf_t5_timer_expire(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + struct sctp_chunk *reply = NULL; + + SCTP_DEBUG_PRINTK("Timer T5 expired.\n"); + + reply = sctp_make_abort(asoc, NULL, 0); + if (!reply) + goto nomem; + + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(reply)); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_NO_ERROR)); + + return SCTP_DISPOSITION_DELETE_TCB; +nomem: + return SCTP_DISPOSITION_NOMEM; +} + +/* Handle expiration of AUTOCLOSE timer. When the autoclose timer expires, + * the association is automatically closed by starting the shutdown process. + * The work that needs to be done is same as when SHUTDOWN is initiated by + * the user. So this routine looks same as sctp_sf_do_9_2_prm_shutdown(). + */ +sctp_disposition_t sctp_sf_autoclose_timer_expire( + const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + int disposition; + + /* From 9.2 Shutdown of an Association + * Upon receipt of the SHUTDOWN primitive from its upper + * layer, the endpoint enters SHUTDOWN-PENDING state and + * remains there until all outstanding data has been + * acknowledged by its peer. The endpoint accepts no new data + * from its upper layer, but retransmits data to the far end + * if necessary to fill gaps. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE, + SCTP_STATE(SCTP_STATE_SHUTDOWN_PENDING)); + + /* sctpimpguide-05 Section 2.12.2 + * The sender of the SHUTDOWN MAY also start an overall guard timer + * 'T5-shutdown-guard' to bound the overall time for shutdown sequence. 
+ */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_START, + SCTP_TO(SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD)); + disposition = SCTP_DISPOSITION_CONSUME; + if (sctp_outq_is_empty(&asoc->outqueue)) { + disposition = sctp_sf_do_9_2_start_shutdown(ep, asoc, type, + arg, commands); + } + return disposition; +} + +/***************************************************************************** + * These are sa state functions which could apply to all types of events. + ****************************************************************************/ + +/* + * This table entry is not implemented. + * + * Inputs + * (endpoint, asoc, chunk) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_not_impl(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return SCTP_DISPOSITION_NOT_IMPL; +} + +/* + * This table entry represents a bug. + * + * Inputs + * (endpoint, asoc, chunk) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_bug(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + return SCTP_DISPOSITION_BUG; +} + +/* + * This table entry represents the firing of a timer in the wrong state. + * Since timer deletion cannot be guaranteed a timer 'may' end up firing + * when the association is in the wrong state. This event should + * be ignored, so as to prevent any rearming of the timer. + * + * Inputs + * (endpoint, asoc, chunk) + * + * The return value is the disposition of the chunk. + */ +sctp_disposition_t sctp_sf_timer_ignore(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const sctp_subtype_t type, + void *arg, + sctp_cmd_seq_t *commands) +{ + SCTP_DEBUG_PRINTK("Timer %d ignored.\n", type.chunk); + return SCTP_DISPOSITION_CONSUME; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* Pull the SACK chunk based on the SACK header. */ +static struct sctp_sackhdr *sctp_sm_pull_sack(struct sctp_chunk *chunk) +{ + struct sctp_sackhdr *sack; + unsigned int len; + __u16 num_blocks; + __u16 num_dup_tsns; + + /* Protect ourselves from reading too far into + * the skb from a bogus sender. + */ + sack = (struct sctp_sackhdr *) chunk->skb->data; + + num_blocks = ntohs(sack->num_gap_ack_blocks); + num_dup_tsns = ntohs(sack->num_dup_tsns); + len = sizeof(struct sctp_sackhdr); + len += (num_blocks + num_dup_tsns) * sizeof(__u32); + if (len > chunk->skb->len) + return NULL; + + skb_pull(chunk->skb, len); + + return sack; +} + +/* Create an ABORT packet to be sent as a response, with the specified + * error causes. + */ +static struct sctp_packet *sctp_abort_pkt_new(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + struct sctp_chunk *chunk, + const void *payload, + size_t paylen) +{ + struct sctp_packet *packet; + struct sctp_chunk *abort; + + packet = sctp_ootb_pkt_new(asoc, chunk); + + if (packet) { + /* Make an ABORT. + * The T bit will be set if the asoc is NULL. + */ + abort = sctp_make_abort(asoc, chunk, paylen); + if (!abort) { + sctp_ootb_pkt_free(packet); + return NULL; + } + /* Add specified error causes, i.e., payload, to the + * end of the chunk. + */ + sctp_addto_chunk(abort, paylen, payload); + + /* Set the skb to the belonging sock for accounting. 
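sctp_sm_pull_sack above guards against a bogus sender by recomputing the variable-length tail from the two counters before trusting it. The same arithmetic in a standalone form, with hypothetical toy types:

#include <arpa/inet.h>
#include <stddef.h>
#include <stdint.h>

struct toy_sackhdr {                 /* fixed part of a SACK body */
        uint32_t cum_tsn_ack;
        uint32_t a_rwnd;
        uint16_t num_gap_ack_blocks;
        uint16_t num_dup_tsns;
};

/* Only hand back the header if the tail it announces really fits. */
static const struct toy_sackhdr *toy_pull_sack(const void *buf,
                                               size_t buflen)
{
        const struct toy_sackhdr *sack = buf;
        size_t len = sizeof(*sack);

        if (buflen < len)
                return NULL;
        len += (ntohs(sack->num_gap_ack_blocks) +
                ntohs(sack->num_dup_tsns)) * sizeof(uint32_t);
        return len > buflen ? NULL : sack;
}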
*/ + abort->skb->sk = ep->base.sk; + + sctp_packet_append_chunk(packet, abort); + + } + + return packet; +} + +/* Allocate a packet for responding in the OOTB conditions. */ +static struct sctp_packet *sctp_ootb_pkt_new(const struct sctp_association *asoc, + const struct sctp_chunk *chunk) +{ + struct sctp_packet *packet; + struct sctp_transport *transport; + __u16 sport; + __u16 dport; + __u32 vtag; + + /* Get the source and destination port from the inbound packet. */ + sport = ntohs(chunk->sctp_hdr->dest); + dport = ntohs(chunk->sctp_hdr->source); + + /* The V-tag is going to be the same as the inbound packet if no + * association exists, otherwise, use the peer's vtag. + */ + if (asoc) { + vtag = asoc->peer.i.init_tag; + } else { + /* Special case the INIT and stale COOKIE_ECHO as there is no + * vtag yet. + */ + switch(chunk->chunk_hdr->type) { + case SCTP_CID_INIT: + { + sctp_init_chunk_t *init; + + init = (sctp_init_chunk_t *)chunk->chunk_hdr; + vtag = ntohl(init->init_hdr.init_tag); + break; + } + default: + vtag = ntohl(chunk->sctp_hdr->vtag); + break; + } + } + + /* Make a transport for the bucket, Eliza... */ + transport = sctp_transport_new(sctp_source(chunk), GFP_ATOMIC); + if (!transport) + goto nomem; + + /* Cache a route for the transport with the chunk's destination as + * the source address. + */ + sctp_transport_route(transport, (union sctp_addr *)&chunk->dest, + sctp_sk(sctp_get_ctl_sock())); + + packet = sctp_packet_init(&transport->packet, transport, sport, dport); + packet = sctp_packet_config(packet, vtag, 0); + + return packet; + +nomem: + return NULL; +} + +/* Free the packet allocated earlier for responding in the OOTB condition. */ +void sctp_ootb_pkt_free(struct sctp_packet *packet) +{ + sctp_transport_free(packet->transport); +} + +/* Send a stale cookie error when a invalid COOKIE ECHO chunk is found */ +static void sctp_send_stale_cookie_err(const struct sctp_endpoint *ep, + const struct sctp_association *asoc, + const struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands, + struct sctp_chunk *err_chunk) +{ + struct sctp_packet *packet; + + if (err_chunk) { + packet = sctp_ootb_pkt_new(asoc, chunk); + if (packet) { + struct sctp_signed_cookie *cookie; + + /* Override the OOTB vtag from the cookie. */ + cookie = chunk->subh.cookie_hdr; + packet->vtag = cookie->c.peer_vtag; + + /* Set the skb to the belonging sock for accounting. */ + err_chunk->skb->sk = ep->base.sk; + sctp_packet_append_chunk(packet, err_chunk); + sctp_add_cmd_sf(commands, SCTP_CMD_SEND_PKT, + SCTP_PACKET(packet)); + SCTP_INC_STATS(SCTP_MIB_OUTCTRLCHUNKS); + } else + sctp_chunk_free (err_chunk); + } +} + + +/* Process a data chunk */ +static int sctp_eat_data(const struct sctp_association *asoc, + struct sctp_chunk *chunk, + sctp_cmd_seq_t *commands) +{ + sctp_datahdr_t *data_hdr; + struct sctp_chunk *err; + size_t datalen; + sctp_verb_t deliver; + int tmp; + __u32 tsn; + + data_hdr = chunk->subh.data_hdr = (sctp_datahdr_t *)chunk->skb->data; + skb_pull(chunk->skb, sizeof(sctp_datahdr_t)); + + tsn = ntohl(data_hdr->tsn); + SCTP_DEBUG_PRINTK("eat_data: TSN 0x%x.\n", tsn); + + /* ASSERT: Now skb->data is really the user data. */ + + /* Process ECN based congestion. + * + * Since the chunk structure is reused for all chunks within + * a packet, we use ecn_ce_done to track if we've already + * done CE processing for this packet. + * + * We need to do ECN processing even if we plan to discard the + * chunk later. 
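The V-tag choice in sctp_ootb_pkt_new reduces to three cases; here it is as a pure function with an illustrative signature, not a kernel API:

#include <stdint.h>

/* Pick the verification tag for an out-of-the-blue reply: the peer's
 * tag when an association exists, the Initiate Tag when answering an
 * INIT (which carries no usable vtag yet), else echo the packet's. */
static uint32_t ootb_reply_vtag(int have_assoc, uint32_t peer_tag,
                                int chunk_is_init, uint32_t init_tag,
                                uint32_t packet_vtag)
{
        if (have_assoc)
                return peer_tag;
        return chunk_is_init ? init_tag : packet_vtag;
}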
+ */ + + if (!chunk->ecn_ce_done) { + struct sctp_af *af; + chunk->ecn_ce_done = 1; + + af = sctp_get_af_specific( + ipver2af(chunk->skb->nh.iph->version)); + + if (af && af->is_ce(chunk->skb) && asoc->peer.ecn_capable) { + /* Do real work as sideffect. */ + sctp_add_cmd_sf(commands, SCTP_CMD_ECN_CE, + SCTP_U32(tsn)); + } + } + + tmp = sctp_tsnmap_check(&asoc->peer.tsn_map, tsn); + if (tmp < 0) { + /* The TSN is too high--silently discard the chunk and + * count on it getting retransmitted later. + */ + return SCTP_IERROR_HIGH_TSN; + } else if (tmp > 0) { + /* This is a duplicate. Record it. */ + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_DUP, SCTP_U32(tsn)); + return SCTP_IERROR_DUP_TSN; + } + + /* This is a new TSN. */ + + /* Discard if there is no room in the receive window. + * Actually, allow a little bit of overflow (up to a MTU). + */ + datalen = ntohs(chunk->chunk_hdr->length); + datalen -= sizeof(sctp_data_chunk_t); + + deliver = SCTP_CMD_CHUNK_ULP; + + /* Think about partial delivery. */ + if ((datalen >= asoc->rwnd) && (!asoc->ulpq.pd_mode)) { + + /* Even if we don't accept this chunk there is + * memory pressure. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_PART_DELIVER, SCTP_NULL()); + } + + /* Spill over rwnd a little bit. Note: While allowed, this spill over + * seems a bit troublesome in that frag_point varies based on + * PMTU. In cases, such as loopback, this might be a rather + * large spill over. + */ + if (!asoc->rwnd || asoc->rwnd_over || + (datalen > asoc->rwnd + asoc->frag_point)) { + + /* If this is the next TSN, consider reneging to make + * room. Note: Playing nice with a confused sender. A + * malicious sender can still eat up all our buffer + * space and in the future we may want to detect and + * do more drastic reneging. + */ + if (sctp_tsnmap_has_gap(&asoc->peer.tsn_map) && + (sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1) == tsn) { + SCTP_DEBUG_PRINTK("Reneging for tsn:%u\n", tsn); + deliver = SCTP_CMD_RENEGE; + } else { + SCTP_DEBUG_PRINTK("Discard tsn: %u len: %Zd, " + "rwnd: %d\n", tsn, datalen, + asoc->rwnd); + return SCTP_IERROR_IGNORE_TSN; + } + } + + /* + * Section 3.3.10.9 No User Data (9) + * + * Cause of error + * --------------- + * No User Data: This error cause is returned to the originator of a + * DATA chunk if a received DATA chunk has no user data. + */ + if (unlikely(0 == datalen)) { + err = sctp_make_abort_no_data(asoc, chunk, tsn); + if (err) { + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err)); + } + /* We are going to ABORT, so we might as well stop + * processing the rest of the chunks in the packet. + */ + sctp_add_cmd_sf(commands, SCTP_CMD_DISCARD_PACKET,SCTP_NULL()); + sctp_add_cmd_sf(commands, SCTP_CMD_ASSOC_FAILED, + SCTP_U32(SCTP_ERROR_NO_DATA)); + SCTP_INC_STATS(SCTP_MIB_ABORTEDS); + SCTP_DEC_STATS(SCTP_MIB_CURRESTAB); + return SCTP_IERROR_NO_DATA; + } + + /* If definately accepting the DATA chunk, record its TSN, otherwise + * wait for renege processing. + */ + if (SCTP_CMD_CHUNK_ULP == deliver) + sctp_add_cmd_sf(commands, SCTP_CMD_REPORT_TSN, SCTP_U32(tsn)); + + /* Note: Some chunks may get overcounted (if we drop) or overcounted + * if we renege and the chunk arrives again. 
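The window logic in sctp_eat_data allows a bounded spill past rwnd and reneges only for the very next in-sequence TSN. A condensed sketch of that three-way decision; the types are toys and 'slack' stands in for frag_point:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

enum toy_verdict { TOY_DELIVER, TOY_RENEGE, TOY_DROP };

struct toy_window {
        size_t   rwnd;        /* advertised receive window remaining */
        bool     has_gap;     /* holes exist in the TSN map          */
        uint32_t next_tsn;    /* cumulative TSN ack point + 1        */
};

/* Deliver while within rwnd plus a bounded spill-over; when full,
 * renege only for the very next in-sequence TSN, else drop and wait
 * for a retransmission.                                             */
static enum toy_verdict toy_rwnd_check(const struct toy_window *w,
                                       uint32_t tsn, size_t datalen,
                                       size_t slack)
{
        if (w->rwnd && datalen <= w->rwnd + slack)
                return TOY_DELIVER;
        if (w->has_gap && tsn == w->next_tsn)
                return TOY_RENEGE;
        return TOY_DROP;
}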
+ */ + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) + SCTP_INC_STATS(SCTP_MIB_INUNORDERCHUNKS); + else + SCTP_INC_STATS(SCTP_MIB_INORDERCHUNKS); + + /* RFC 2960 6.5 Stream Identifier and Stream Sequence Number + * + * If an endpoint receive a DATA chunk with an invalid stream + * identifier, it shall acknowledge the reception of the DATA chunk + * following the normal procedure, immediately send an ERROR chunk + * with cause set to "Invalid Stream Identifier" (See Section 3.3.10) + * and discard the DATA chunk. + */ + if (ntohs(data_hdr->stream) >= asoc->c.sinit_max_instreams) { + err = sctp_make_op_error(asoc, chunk, SCTP_ERROR_INV_STRM, + &data_hdr->stream, + sizeof(data_hdr->stream)); + if (err) + sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, + SCTP_CHUNK(err)); + return SCTP_IERROR_BAD_STREAM; + } + + /* Send the data up to the user. Note: Schedule the + * SCTP_CMD_CHUNK_ULP cmd before the SCTP_CMD_GEN_SACK, as the SACK + * chunk needs the updated rwnd. + */ + sctp_add_cmd_sf(commands, deliver, SCTP_CHUNK(chunk)); + + return SCTP_IERROR_NO_ERROR; +} diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c new file mode 100644 index 000000000000..8967846f69e8 --- /dev/null +++ b/net/sctp/sm_statetable.c @@ -0,0 +1,1004 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * + * This file is part of the SCTP kernel reference Implementation + * + * These are the state tables for the SCTP state machine. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Hui Huang + * Daisy Chang + * Ardelle Fan + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. 
+ */ + +#include +#include +#include + +static const sctp_sm_table_entry_t +primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES]; +static const sctp_sm_table_entry_t +other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES]; +static const sctp_sm_table_entry_t +timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES]; + +static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(sctp_cid_t cid, + sctp_state_t state); + + +static const sctp_sm_table_entry_t bug = { + .fn = sctp_sf_bug, + .name = "sctp_sf_bug" +}; + +#define DO_LOOKUP(_max, _type, _table) \ + if ((event_subtype._type > (_max))) { \ + printk(KERN_WARNING \ + "sctp table %p possible attack:" \ + " event %d exceeds max %d\n", \ + _table, event_subtype._type, _max); \ + return &bug; \ + } \ + return &_table[event_subtype._type][(int)state]; + +const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, + sctp_state_t state, + sctp_subtype_t event_subtype) +{ + switch (event_type) { + case SCTP_EVENT_T_CHUNK: + return sctp_chunk_event_lookup(event_subtype.chunk, state); + break; + case SCTP_EVENT_T_TIMEOUT: + DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, + timeout_event_table); + break; + + case SCTP_EVENT_T_OTHER: + DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); + break; + + case SCTP_EVENT_T_PRIMITIVE: + DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, + primitive_event_table); + break; + + default: + /* Yikes! We got an illegal event type. */ + return &bug; + }; +} + +#define TYPE_SCTP_DATA { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_eat_data_6_2, .name = "sctp_sf_eat_data_6_2"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_eat_data_6_2, .name = "sctp_sf_eat_data_6_2"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_eat_data_fast_4_4, .name = "sctp_sf_eat_data_fast_4_4"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_DATA */ + +#define TYPE_SCTP_INIT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_do_5_1B_init, .name = "sctp_sf_do_5_1B_init"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_5_2_1_siminit, .name = "sctp_sf_do_5_2_1_siminit"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_5_2_1_siminit, .name = "sctp_sf_do_5_2_1_siminit"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_5_2_2_dupinit, .name = "sctp_sf_do_5_2_2_dupinit"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_5_2_2_dupinit, .name = "sctp_sf_do_5_2_2_dupinit"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_5_2_2_dupinit, .name = "sctp_sf_do_5_2_2_dupinit"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_5_2_2_dupinit, .name = "sctp_sf_do_5_2_2_dupinit"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_do_9_2_reshutack, .name = "sctp_sf_do_9_2_reshutack"}, \ +} /* TYPE_SCTP_INIT */ + +#define TYPE_SCTP_INIT_ACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = 
"sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_5_1C_ack, .name = "sctp_sf_do_5_1C_ack"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_INIT_ACK */ + +#define TYPE_SCTP_SACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_eat_sack_6_2, .name = "sctp_sf_eat_sack_6_2"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_eat_sack_6_2, .name = "sctp_sf_eat_sack_6_2"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_eat_sack_6_2, .name = "sctp_sf_eat_sack_6_2"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_eat_sack_6_2, .name = "sctp_sf_eat_sack_6_2"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_SACK */ + +#define TYPE_SCTP_HEARTBEAT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_beat_8_3, .name = "sctp_sf_beat_8_3"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_beat_8_3, .name = "sctp_sf_beat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_beat_8_3, .name = "sctp_sf_beat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_beat_8_3, .name = "sctp_sf_beat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_beat_8_3, .name = "sctp_sf_beat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + /* This should not happen, but we are nice. 
*/ \ + {.fn = sctp_sf_beat_8_3, .name = "sctp_sf_beat_8_3"}, \ +} /* TYPE_SCTP_HEARTBEAT */ + +#define TYPE_SCTP_HEARTBEAT_ACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_violation, .name = "sctp_sf_violation"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_backbeat_8_3, .name = "sctp_sf_backbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_backbeat_8_3, .name = "sctp_sf_backbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_backbeat_8_3, .name = "sctp_sf_backbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_backbeat_8_3, .name = "sctp_sf_backbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_HEARTBEAT_ACK */ + +#define TYPE_SCTP_ABORT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_pdiscard, .name = "sctp_sf_pdiscard"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_cookie_wait_abort, .name = "sctp_sf_cookie_wait_abort"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_cookie_echoed_abort, \ + .name = "sctp_sf_cookie_echoed_abort"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_9_1_abort, .name = "sctp_sf_do_9_1_abort"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_shutdown_pending_abort, \ + .name = "sctp_sf_shutdown_pending_abort"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_shutdown_sent_abort, \ + .name = "sctp_sf_shutdown_sent_abort"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_9_1_abort, .name = "sctp_sf_do_9_1_abort"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_shutdown_ack_sent_abort, \ + .name = "sctp_sf_shutdown_ack_sent_abort"}, \ +} /* TYPE_SCTP_ABORT */ + +#define TYPE_SCTP_SHUTDOWN { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_9_2_shutdown, .name = "sctp_sf_do_9_2_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_9_2_shutdown_ack, \ + .name = "sctp_sf_do_9_2_shutdown_ack"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_SHUTDOWN */ + +#define TYPE_SCTP_SHUTDOWN_ACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_8_5_1_E_sa, .name = "sctp_sf_do_8_5_1_E_sa"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_8_5_1_E_sa, .name = "sctp_sf_do_8_5_1_E_sa"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_violation, .name = "sctp_sf_violation"}, \ + /* 
SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_violation, .name = "sctp_sf_violation"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_9_2_final, .name = "sctp_sf_do_9_2_final"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_violation, .name = "sctp_sf_violation"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_do_9_2_final, .name = "sctp_sf_do_9_2_final"}, \ +} /* TYPE_SCTP_SHUTDOWN_ACK */ + +#define TYPE_SCTP_ERROR { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_cookie_echoed_err, .name = "sctp_sf_cookie_echoed_err"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_operr_notify, .name = "sctp_sf_operr_notify"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_operr_notify, .name = "sctp_sf_operr_notify"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_operr_notify, .name = "sctp_sf_operr_notify"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_ERROR */ + +#define TYPE_SCTP_COOKIE_ECHO { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_do_5_1D_ce, .name = "sctp_sf_do_5_1D_ce"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_do_5_2_4_dupcook, .name = "sctp_sf_do_5_2_4_dupcook"}, \ +} /* TYPE_SCTP_COOKIE_ECHO */ + +#define TYPE_SCTP_COOKIE_ACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_5_1E_ca, .name = "sctp_sf_do_5_1E_ca"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_COOKIE_ACK */ + +#define TYPE_SCTP_ECN_ECNE { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* 
SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_ecne, .name = "sctp_sf_do_ecne"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ +} /* TYPE_SCTP_ECN_ECNE */ + +#define TYPE_SCTP_ECN_CWR { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_ecn_cwr, .name = "sctp_sf_do_ecn_cwr"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_ecn_cwr, .name = "sctp_sf_do_ecn_cwr"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_ecn_cwr, .name = "sctp_sf_do_ecn_cwr"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ +} /* TYPE_SCTP_ECN_CWR */ + +#define TYPE_SCTP_SHUTDOWN_COMPLETE { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_do_4_C, .name = "sctp_sf_do_4_C"}, \ +} /* TYPE_SCTP_SHUTDOWN_COMPLETE */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. + * + * For base protocol (RFC 2960). 
+ */ +static const sctp_sm_table_entry_t chunk_event_table[SCTP_NUM_BASE_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_DATA, + TYPE_SCTP_INIT, + TYPE_SCTP_INIT_ACK, + TYPE_SCTP_SACK, + TYPE_SCTP_HEARTBEAT, + TYPE_SCTP_HEARTBEAT_ACK, + TYPE_SCTP_ABORT, + TYPE_SCTP_SHUTDOWN, + TYPE_SCTP_SHUTDOWN_ACK, + TYPE_SCTP_ERROR, + TYPE_SCTP_COOKIE_ECHO, + TYPE_SCTP_COOKIE_ACK, + TYPE_SCTP_ECN_ECNE, + TYPE_SCTP_ECN_CWR, + TYPE_SCTP_SHUTDOWN_COMPLETE, +}; /* state_fn_t chunk_event_table[][] */ + +#define TYPE_SCTP_ASCONF { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_asconf, .name = "sctp_sf_do_asconf"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_ASCONF */ + +#define TYPE_SCTP_ASCONF_ACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_asconf_ack, .name = "sctp_sf_do_asconf_ack"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_ASCONF_ACK */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. 
+ */ +static const sctp_sm_table_entry_t addip_chunk_event_table[SCTP_NUM_ADDIP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_ASCONF, + TYPE_SCTP_ASCONF_ACK, +}; /*state_fn_t addip_chunk_event_table[][] */ + +#define TYPE_SCTP_FWD_TSN { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_eat_fwd_tsn, .name = "sctp_sf_eat_fwd_tsn"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_eat_fwd_tsn, .name = "sctp_sf_eat_fwd_tsn"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_eat_fwd_tsn_fast, .name = "sctp_sf_eat_fwd_tsn_fast"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_discard_chunk, .name = "sctp_sf_discard_chunk"}, \ +} /* TYPE_SCTP_FWD_TSN */ + +/* The primary index for this table is the chunk type. + * The secondary index for this table is the state. + */ +static const sctp_sm_table_entry_t prsctp_chunk_event_table[SCTP_NUM_PRSCTP_CHUNK_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_FWD_TSN, +}; /*state_fn_t prsctp_chunk_event_table[][] */ + +static const sctp_sm_table_entry_t +chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = { + /* SCTP_STATE_EMPTY */ + {.fn = sctp_sf_ootb, .name = "sctp_sf_ootb"}, + /* SCTP_STATE_CLOSED */ + {.fn = sctp_sf_tabort_8_4_8, .name = "sctp_sf_tabort_8_4_8"}, + /* SCTP_STATE_COOKIE_WAIT */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, + /* SCTP_STATE_COOKIE_ECHOED */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, + /* SCTP_STATE_ESTABLISHED */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, + /* SCTP_STATE_SHUTDOWN_PENDING */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, + /* SCTP_STATE_SHUTDOWN_SENT */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, + /* SCTP_STATE_SHUTDOWN_RECEIVED */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ + {.fn = sctp_sf_unk_chunk, .name = "sctp_sf_unk_chunk"}, +}; /* chunk unknown */ + + +#define TYPE_SCTP_PRIMITIVE_ASSOCIATE { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_do_prm_asoc, .name = "sctp_sf_do_prm_asoc"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_not_impl, .name = "sctp_sf_not_impl"}, \ +} /* TYPE_SCTP_PRIMITIVE_ASSOCIATE */ + +#define TYPE_SCTP_PRIMITIVE_SHUTDOWN { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + 
{.fn = sctp_sf_cookie_wait_prm_shutdown, \ + .name = "sctp_sf_cookie_wait_prm_shutdown"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_cookie_echoed_prm_shutdown, \ + .name = "sctp_sf_cookie_echoed_prm_shutdown"},\ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_9_2_prm_shutdown, \ + .name = "sctp_sf_do_9_2_prm_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_ignore_primitive, .name = "sctp_sf_ignore_primitive"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_ignore_primitive, .name = "sctp_sf_ignore_primitive"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_ignore_primitive, .name = "sctp_sf_ignore_primitive"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_ignore_primitive, .name = "sctp_sf_ignore_primitive"}, \ +} /* TYPE_SCTP_PRIMITIVE_SHUTDOWN */ + +#define TYPE_SCTP_PRIMITIVE_ABORT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_cookie_wait_prm_abort, \ + .name = "sctp_sf_cookie_wait_prm_abort"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_cookie_echoed_prm_abort, \ + .name = "sctp_sf_cookie_echoed_prm_abort"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_9_1_prm_abort, \ + .name = "sctp_sf_do_9_1_prm_abort"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_shutdown_pending_prm_abort, \ + .name = "sctp_sf_shutdown_pending_prm_abort"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_shutdown_sent_prm_abort, \ + .name = "sctp_sf_shutdown_sent_prm_abort"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_9_1_prm_abort, \ + .name = "sctp_sf_do_9_1_prm_abort"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_shutdown_ack_sent_prm_abort, \ + .name = "sctp_sf_shutdown_ack_sent_prm_abort"}, \ +} /* TYPE_SCTP_PRIMITIVE_ABORT */ + +#define TYPE_SCTP_PRIMITIVE_SEND { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_prm_send, .name = "sctp_sf_do_prm_send"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_prm_send, .name = "sctp_sf_do_prm_send"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_prm_send, .name = "sctp_sf_do_prm_send"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ +} /* TYPE_SCTP_PRIMITIVE_SEND */ + +#define TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = 
sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_do_prm_requestheartbeat, \ + .name = "sctp_sf_do_prm_requestheartbeat"}, \ +} /* TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT */ + +#define TYPE_SCTP_PRIMITIVE_ASCONF { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_error_closed, .name = "sctp_sf_error_closed"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_prm_asconf, .name = "sctp_sf_do_prm_asconf"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_error_shutdown, .name = "sctp_sf_error_shutdown"}, \ +} /* TYPE_SCTP_PRIMITIVE_ASCONF */ + +/* The primary index for this table is the primitive type. + * The secondary index for this table is the state. + */ +static const sctp_sm_table_entry_t primitive_event_table[SCTP_NUM_PRIMITIVE_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_PRIMITIVE_ASSOCIATE, + TYPE_SCTP_PRIMITIVE_SHUTDOWN, + TYPE_SCTP_PRIMITIVE_ABORT, + TYPE_SCTP_PRIMITIVE_SEND, + TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT, + TYPE_SCTP_PRIMITIVE_ASCONF, +}; + +#define TYPE_SCTP_OTHER_NO_PENDING_TSN { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_9_2_start_shutdown, \ + .name = "sctp_sf_do_9_2_start_shutdown"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_9_2_shutdown_ack, \ + .name = "sctp_sf_do_9_2_shutdown_ack"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ +} + +#define TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_cookie_wait_icmp_abort, \ + .name = "sctp_sf_cookie_wait_icmp_abort"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /*
SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_ignore_other, .name = "sctp_sf_ignore_other"}, \ +} + +static const sctp_sm_table_entry_t other_event_table[SCTP_NUM_OTHER_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_OTHER_NO_PENDING_TSN, + TYPE_SCTP_OTHER_ICMP_PROTO_UNREACH, +}; + +#define TYPE_SCTP_EVENT_TIMEOUT_NONE { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T1_INIT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_t1_timer_expire, .name = "sctp_sf_t1_timer_expire"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = 
"sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_t2_timer_expire, .name = "sctp_sf_t2_timer_expire"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_t2_timer_expire, .name = "sctp_sf_t2_timer_expire"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T3_RTX { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_do_6_3_3_rtx, .name = "sctp_sf_do_6_3_3_rtx"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_6_3_3_rtx, .name = "sctp_sf_do_6_3_3_rtx"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_6_3_3_rtx, .name = "sctp_sf_do_6_3_3_rtx"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_do_6_3_3_rtx, .name = "sctp_sf_do_6_3_3_rtx"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T4_RTO { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_t4_timer_expire, .name = "sctp_sf_t4_timer_expire"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_t5_timer_expire, .name = "sctp_sf_t5_timer_expire"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_t5_timer_expire, .name = "sctp_sf_t5_timer_expire"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = 
sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_sendbeat_8_3, .name = "sctp_sf_sendbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_sendbeat_8_3, .name = "sctp_sf_sendbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_sendbeat_8_3, .name = "sctp_sf_sendbeat_8_3"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_SACK { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_bug, .name = "sctp_sf_bug"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_do_6_2_sack, .name = "sctp_sf_do_6_2_sack"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_do_6_2_sack, .name = "sctp_sf_do_6_2_sack"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_do_6_2_sack, .name = "sctp_sf_do_6_2_sack"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +#define TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE { \ + /* SCTP_STATE_EMPTY */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_CLOSED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_WAIT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_COOKIE_ECHOED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_ESTABLISHED */ \ + {.fn = sctp_sf_autoclose_timer_expire, \ + .name = "sctp_sf_autoclose_timer_expire"}, \ + /* SCTP_STATE_SHUTDOWN_PENDING */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_RECEIVED */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ + /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \ + {.fn = sctp_sf_timer_ignore, .name = "sctp_sf_timer_ignore"}, \ +} + +static const sctp_sm_table_entry_t timeout_event_table[SCTP_NUM_TIMEOUT_TYPES][SCTP_STATE_NUM_STATES] = { + TYPE_SCTP_EVENT_TIMEOUT_NONE, + TYPE_SCTP_EVENT_TIMEOUT_T1_COOKIE, + TYPE_SCTP_EVENT_TIMEOUT_T1_INIT, + TYPE_SCTP_EVENT_TIMEOUT_T2_SHUTDOWN, + TYPE_SCTP_EVENT_TIMEOUT_T3_RTX, + TYPE_SCTP_EVENT_TIMEOUT_T4_RTO, + TYPE_SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD, + TYPE_SCTP_EVENT_TIMEOUT_HEARTBEAT, + TYPE_SCTP_EVENT_TIMEOUT_SACK, + TYPE_SCTP_EVENT_TIMEOUT_AUTOCLOSE, +}; + +static const sctp_sm_table_entry_t *sctp_chunk_event_lookup(sctp_cid_t cid, + sctp_state_t state) +{ + if (state > SCTP_STATE_MAX) + return &bug; + + if (cid >= 0 && cid <= SCTP_CID_BASE_MAX) + return &chunk_event_table[cid][state]; + + if (sctp_prsctp_enable) { + if (cid == SCTP_CID_FWD_TSN) + return &prsctp_chunk_event_table[0][state]; + } + + if (sctp_addip_enable) { + if (cid == 
SCTP_CID_ASCONF) + return &addip_chunk_event_table[0][state]; + + if (cid == SCTP_CID_ASCONF_ACK) + return &addip_chunk_event_table[1][state]; + } + + return &chunk_event_table_unknown[state]; +} diff --git a/net/sctp/socket.c b/net/sctp/socket.c new file mode 100644 index 000000000000..e8c210182571 --- /dev/null +++ b/net/sctp/socket.c @@ -0,0 +1,4797 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 Intel Corp. + * Copyright (c) 2001-2002 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions interface with the sockets layer to implement the + * SCTP Extensions for the Sockets API. + * + * Note that the descriptions from the specification are USER level + * functions--this file is the functions which populate the struct proto + * for SCTP which is the BOTTOM of the sockets interface. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Narasimha Budihal + * Karl Knutson + * Jon Grimm + * Xingang Guo + * Daisy Chang + * Sridhar Samudrala + * Inaky Perez-Gonzalez + * Ardelle Fan + * Ryan Layer + * Anup Pemmaiah + * Kevin Gao + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include /* for sa_family_t */ +#include +#include +#include + +/* WARNING: Please do not remove the SCTP_STATIC attribute to + * any of the functions below as they are used to export functions + * used by a project regression testsuite. + */ + +/* Forward declarations for internal helper functions. 
*/ +static int sctp_writeable(struct sock *sk); +static void sctp_wfree(struct sk_buff *skb); +static int sctp_wait_for_sndbuf(struct sctp_association *, long *timeo_p, + size_t msg_len); +static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p); +static int sctp_wait_for_connect(struct sctp_association *, long *timeo_p); +static int sctp_wait_for_accept(struct sock *sk, long timeo); +static void sctp_wait_for_close(struct sock *sk, long timeo); +static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, + union sctp_addr *addr, int len); +static int sctp_bindx_add(struct sock *, struct sockaddr *, int); +static int sctp_bindx_rem(struct sock *, struct sockaddr *, int); +static int sctp_send_asconf_add_ip(struct sock *, struct sockaddr *, int); +static int sctp_send_asconf_del_ip(struct sock *, struct sockaddr *, int); +static int sctp_send_asconf(struct sctp_association *asoc, + struct sctp_chunk *chunk); +static int sctp_do_bind(struct sock *, union sctp_addr *, int); +static int sctp_autobind(struct sock *sk); +static void sctp_sock_migrate(struct sock *, struct sock *, + struct sctp_association *, sctp_socket_type_t); +static char *sctp_hmac_alg = SCTP_COOKIE_HMAC_ALG; + +extern kmem_cache_t *sctp_bucket_cachep; + +/* Get the sndbuf space available at the time on the association. */ +static inline int sctp_wspace(struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + int amt = 0; + + amt = sk->sk_sndbuf - asoc->sndbuf_used; + if (amt < 0) + amt = 0; + return amt; +} + +/* Increment the used sndbuf space count of the corresponding association by + * the size of the outgoing data chunk. + * Also, set the skb destructor for sndbuf accounting later. + * + * Since it is always 1-1 between chunk and skb, and also a new skb is always + * allocated for chunk bundling in sctp_packet_transmit(), we can use the + * destructor in the data chunk skb for the purpose of the sndbuf space + * tracking. + */ +static inline void sctp_set_owner_w(struct sctp_chunk *chunk) +{ + struct sctp_association *asoc = chunk->asoc; + struct sock *sk = asoc->base.sk; + + /* The sndbuf space is tracked per association. */ + sctp_association_hold(asoc); + + chunk->skb->destructor = sctp_wfree; + /* Save the chunk pointer in skb for sctp_wfree to use later. */ + *((struct sctp_chunk **)(chunk->skb->cb)) = chunk; + + asoc->sndbuf_used += SCTP_DATA_SNDSIZE(chunk); + sk->sk_wmem_queued += SCTP_DATA_SNDSIZE(chunk); +} + +/* Verify that this is a valid address. */ +static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, + int len) +{ + struct sctp_af *af; + + /* Verify basic sockaddr. */ + af = sctp_sockaddr_af(sctp_sk(sk), addr, len); + if (!af) + return -EINVAL; + + /* Is this a valid SCTP address? */ + if (!af->addr_valid(addr, sctp_sk(sk))) + return -EINVAL; + + if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) + return -EINVAL; + + return 0; +} + +/* Look up the association by its id. If this is not a UDP-style + * socket, the ID field is always ignored. + */ +struct sctp_association *sctp_id2assoc(struct sock *sk, sctp_assoc_t id) +{ + struct sctp_association *asoc = NULL; + + /* If this is not a UDP-style socket, assoc id should be ignored. */ + if (!sctp_style(sk, UDP)) { + /* Return NULL if the socket state is not ESTABLISHED. It + * could be a TCP-style listening socket or a socket which + * hasn't yet called connect() to establish an association. 
+ */ + if (!sctp_sstate(sk, ESTABLISHED)) + return NULL; + + /* Get the first and the only association from the list. */ + if (!list_empty(&sctp_sk(sk)->ep->asocs)) + asoc = list_entry(sctp_sk(sk)->ep->asocs.next, + struct sctp_association, asocs); + return asoc; + } + + /* Otherwise this is a UDP-style socket. */ + if (!id || (id == (sctp_assoc_t)-1)) + return NULL; + + spin_lock_bh(&sctp_assocs_id_lock); + asoc = (struct sctp_association *)idr_find(&sctp_assocs_id, (int)id); + spin_unlock_bh(&sctp_assocs_id_lock); + + if (!asoc || (asoc->base.sk != sk) || asoc->base.dead) + return NULL; + + return asoc; +} + +/* Look up the transport from an address and an assoc id. If both address and + * id are specified, the associations matching the address and the id should be + * the same. + */ +static struct sctp_transport *sctp_addr_id2transport(struct sock *sk, + struct sockaddr_storage *addr, + sctp_assoc_t id) +{ + struct sctp_association *addr_asoc = NULL, *id_asoc = NULL; + struct sctp_transport *transport; + union sctp_addr *laddr = (union sctp_addr *)addr; + + laddr->v4.sin_port = ntohs(laddr->v4.sin_port); + addr_asoc = sctp_endpoint_lookup_assoc(sctp_sk(sk)->ep, + (union sctp_addr *)addr, + &transport); + laddr->v4.sin_port = htons(laddr->v4.sin_port); + + if (!addr_asoc) + return NULL; + + id_asoc = sctp_id2assoc(sk, id); + if (id_asoc && (id_asoc != addr_asoc)) + return NULL; + + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + (union sctp_addr *)addr); + + return transport; +} + +/* API 3.1.2 bind() - UDP Style Syntax + * The syntax of bind() is, + * + * ret = bind(int sd, struct sockaddr *addr, int addrlen); + * + * sd - the socket descriptor returned by socket(). + * addr - the address structure (struct sockaddr_in or struct + * sockaddr_in6 [RFC 2553]), + * addr_len - the size of the address structure. + */ +SCTP_STATIC int sctp_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + int retval = 0; + + sctp_lock_sock(sk); + + SCTP_DEBUG_PRINTK("sctp_bind(sk: %p, uaddr: %p, addr_len: %d)\n", + sk, uaddr, addr_len); + + /* Disallow binding twice. */ + if (!sctp_sk(sk)->ep->base.bind_addr.port) + retval = sctp_do_bind(sk, (union sctp_addr *)uaddr, + addr_len); + else + retval = -EINVAL; + + sctp_release_sock(sk); + + return retval; +} + +static long sctp_get_port_local(struct sock *, union sctp_addr *); + +/* Verify this is a valid sockaddr. */ +static struct sctp_af *sctp_sockaddr_af(struct sctp_sock *opt, + union sctp_addr *addr, int len) +{ + struct sctp_af *af; + + /* Check minimum size. */ + if (len < sizeof (struct sockaddr)) + return NULL; + + /* Does this PF support this AF? */ + if (!opt->pf->af_supported(addr->sa.sa_family, opt)) + return NULL; + + /* If we get this far, af is valid. */ + af = sctp_get_af_specific(addr->sa.sa_family); + + if (len < af->sockaddr_len) + return NULL; + + return af; +} + +/* Bind a local address either to an endpoint or to an association. */ +SCTP_STATIC int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + struct sctp_bind_addr *bp = &ep->base.bind_addr; + struct sctp_af *af; + unsigned short snum; + int ret = 0; + + SCTP_DEBUG_PRINTK("sctp_do_bind(sk: %p, newaddr: %p, len: %d)\n", + sk, addr, len); + + /* Common sockaddr verification. */ + af = sctp_sockaddr_af(sp, addr, len); + if (!af) + return -EINVAL; + + /* PF specific bind() address verification. 
*/ + if (!sp->pf->bind_verify(sp, addr)) + return -EADDRNOTAVAIL; + + snum= ntohs(addr->v4.sin_port); + + SCTP_DEBUG_PRINTK("sctp_do_bind: port: %d, new port: %d\n", + bp->port, snum); + + /* We must either be unbound, or bind to the same port. */ + if (bp->port && (snum != bp->port)) { + SCTP_DEBUG_PRINTK("sctp_do_bind:" + " New port %d does not match existing port " + "%d.\n", snum, bp->port); + return -EINVAL; + } + + if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE)) + return -EACCES; + + /* Make sure we are allowed to bind here. + * The function sctp_get_port_local() does duplicate address + * detection. + */ + if ((ret = sctp_get_port_local(sk, addr))) { + if (ret == (long) sk) { + /* This endpoint has a conflicting address. */ + return -EINVAL; + } else { + return -EADDRINUSE; + } + } + + /* Refresh ephemeral port. */ + if (!bp->port) + bp->port = inet_sk(sk)->num; + + /* Add the address to the bind address list. */ + sctp_local_bh_disable(); + sctp_write_lock(&ep->base.addr_lock); + + /* Use GFP_ATOMIC since BHs are disabled. */ + addr->v4.sin_port = ntohs(addr->v4.sin_port); + ret = sctp_add_bind_addr(bp, addr, GFP_ATOMIC); + addr->v4.sin_port = htons(addr->v4.sin_port); + sctp_write_unlock(&ep->base.addr_lock); + sctp_local_bh_enable(); + + /* Copy back into socket for getsockname() use. */ + if (!ret) { + inet_sk(sk)->sport = htons(inet_sk(sk)->num); + af->to_sk_saddr(addr, sk); + } + + return ret; +} + + /* ADDIP Section 4.1.1 Congestion Control of ASCONF Chunks + * + * R1) One and only one ASCONF Chunk MAY be in transit and unacknowledged + * at any one time. If a sender, after sending an ASCONF chunk, decides + * it needs to transfer another ASCONF Chunk, it MUST wait until the + * ASCONF-ACK Chunk returns from the previous ASCONF Chunk before sending a + * subsequent ASCONF. Note this restriction binds each side, so at any + * time two ASCONF may be in-transit on any given association (one sent + * from each endpoint). + */ +static int sctp_send_asconf(struct sctp_association *asoc, + struct sctp_chunk *chunk) +{ + int retval = 0; + + /* If there is an outstanding ASCONF chunk, queue it for later + * transmission. + */ + if (asoc->addip_last_asconf) { + __skb_queue_tail(&asoc->addip_chunks, (struct sk_buff *)chunk); + goto out; + } + + /* Hold the chunk until an ASCONF_ACK is received. */ + sctp_chunk_hold(chunk); + retval = sctp_primitive_ASCONF(asoc, chunk); + if (retval) + sctp_chunk_free(chunk); + else + asoc->addip_last_asconf = chunk; + +out: + return retval; +} + +/* Add a list of addresses as bind addresses to local endpoint or + * association. + * + * Basically run through each address specified in the addrs/addrcnt + * array/length pair, determine if it is IPv6 or IPv4 and call + * sctp_do_bind() on it. + * + * If any of them fails, then the operation will be reversed and the + * ones that were added will be removed. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. + */ +int sctp_bindx_add(struct sock *sk, struct sockaddr *addrs, int addrcnt) +{ + int cnt; + int retval = 0; + void *addr_buf; + struct sockaddr *sa_addr; + struct sctp_af *af; + + SCTP_DEBUG_PRINTK("sctp_bindx_add (sk: %p, addrs: %p, addrcnt: %d)\n", + sk, addrs, addrcnt); + + addr_buf = addrs; + for (cnt = 0; cnt < addrcnt; cnt++) { + /* The list may contain either IPv4 or IPv6 address; + * determine the address length for walking thru the list. 
+ */ + sa_addr = (struct sockaddr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa_family); + if (!af) { + retval = -EINVAL; + goto err_bindx_add; + } + + retval = sctp_do_bind(sk, (union sctp_addr *)sa_addr, + af->sockaddr_len); + + addr_buf += af->sockaddr_len; + +err_bindx_add: + if (retval < 0) { + /* Failed. Cleanup the ones that have been added */ + if (cnt > 0) + sctp_bindx_rem(sk, addrs, cnt); + return retval; + } + } + + return retval; +} + +/* Send an ASCONF chunk with Add IP address parameters to all the peers of the + * associations that are part of the endpoint indicating that a list of local + * addresses are added to the endpoint. + * + * If any of the addresses is already in the bind address list of the + * association, we do not send the chunk for that association. But it will not + * affect other associations. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. + */ +static int sctp_send_asconf_add_ip(struct sock *sk, + struct sockaddr *addrs, + int addrcnt) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_bind_addr *bp; + struct sctp_chunk *chunk; + struct sctp_sockaddr_entry *laddr; + union sctp_addr *addr; + void *addr_buf; + struct sctp_af *af; + struct list_head *pos; + struct list_head *p; + int i; + int retval = 0; + + if (!sctp_addip_enable) + return retval; + + sp = sctp_sk(sk); + ep = sp->ep; + + SCTP_DEBUG_PRINTK("%s: (sk: %p, addrs: %p, addrcnt: %d)\n", + __FUNCTION__, sk, addrs, addrcnt); + + list_for_each(pos, &ep->asocs) { + asoc = list_entry(pos, struct sctp_association, asocs); + + if (!asoc->peer.asconf_capable) + continue; + + if (asoc->peer.addip_disabled_mask & SCTP_PARAM_ADD_IP) + continue; + + if (!sctp_state(asoc, ESTABLISHED)) + continue; + + /* Check if any address in the packed array of addresses is + * in the bind address list of the association. If so, + * do not send the asconf chunk to its peer, but continue with + * other associations. + */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + addr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(addr->v4.sin_family); + if (!af) { + retval = -EINVAL; + goto out; + } + + if (sctp_assoc_lookup_laddr(asoc, addr)) + break; + + addr_buf += af->sockaddr_len; + } + if (i < addrcnt) + continue; + + /* Use the first address in bind addr list of association as + * Address Parameter of ASCONF CHUNK. + */ + sctp_read_lock(&asoc->base.addr_lock); + bp = &asoc->base.bind_addr; + p = bp->address_list.next; + laddr = list_entry(p, struct sctp_sockaddr_entry, list); + sctp_read_unlock(&asoc->base.addr_lock); + + chunk = sctp_make_asconf_update_ip(asoc, &laddr->a, addrs, + addrcnt, SCTP_PARAM_ADD_IP); + if (!chunk) { + retval = -ENOMEM; + goto out; + } + + retval = sctp_send_asconf(asoc, chunk); + + /* FIXME: After sending the add address ASCONF chunk, we + * cannot append the address to the association's binding + * address list, because the new address may be used as the + * source of a message sent to the peer before the ASCONF + * chunk is received by the peer. So we should wait until + * ASCONF_ACK is received. + */ + } + +out: + return retval; +} + +/* Remove a list of addresses from bind addresses list. Do not remove the + * last address. + * + * Basically run through each address specified in the addrs/addrcnt + * array/length pair, determine if it is IPv6 or IPv4 and call + * sctp_del_bind() on it. 
+ * + * If any of them fails, then the operation will be reversed and the + * ones that were removed will be added back. + * + * At least one address has to be left; if only one address is + * available, the operation will return -EBUSY. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. + */ +int sctp_bindx_rem(struct sock *sk, struct sockaddr *addrs, int addrcnt) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + int cnt; + struct sctp_bind_addr *bp = &ep->base.bind_addr; + int retval = 0; + union sctp_addr saveaddr; + void *addr_buf; + struct sockaddr *sa_addr; + struct sctp_af *af; + + SCTP_DEBUG_PRINTK("sctp_bindx_rem (sk: %p, addrs: %p, addrcnt: %d)\n", + sk, addrs, addrcnt); + + addr_buf = addrs; + for (cnt = 0; cnt < addrcnt; cnt++) { + /* If the bind address list is empty or if there is only one + * bind address, there is nothing more to be removed (we need + * at least one address here). + */ + if (list_empty(&bp->address_list) || + (sctp_list_single_entry(&bp->address_list))) { + retval = -EBUSY; + goto err_bindx_rem; + } + + /* The list may contain either IPv4 or IPv6 address; + * determine the address length to copy the address to + * saveaddr. + */ + sa_addr = (struct sockaddr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa_family); + if (!af) { + retval = -EINVAL; + goto err_bindx_rem; + } + memcpy(&saveaddr, sa_addr, af->sockaddr_len); + saveaddr.v4.sin_port = ntohs(saveaddr.v4.sin_port); + if (saveaddr.v4.sin_port != bp->port) { + retval = -EINVAL; + goto err_bindx_rem; + } + + /* FIXME - There is probably a need to check if sk->sk_saddr and + * sk->sk_rcv_addr are currently set to one of the addresses to + * be removed. This is something which needs to be looked into + * when we are fixing the outstanding issues with multi-homing + * socket routing and failover schemes. Refer to comments in + * sctp_do_bind(). -daisy + */ + sctp_local_bh_disable(); + sctp_write_lock(&ep->base.addr_lock); + + retval = sctp_del_bind_addr(bp, &saveaddr); + + sctp_write_unlock(&ep->base.addr_lock); + sctp_local_bh_enable(); + + addr_buf += af->sockaddr_len; +err_bindx_rem: + if (retval < 0) { + /* Failed. Add the ones that has been removed back */ + if (cnt > 0) + sctp_bindx_add(sk, addrs, cnt); + return retval; + } + } + + return retval; +} + +/* Send an ASCONF chunk with Delete IP address parameters to all the peers of + * the associations that are part of the endpoint indicating that a list of + * local addresses are removed from the endpoint. + * + * If any of the addresses is already in the bind address list of the + * association, we do not send the chunk for that association. But it will not + * affect other associations. + * + * Only sctp_setsockopt_bindx() is supposed to call this function. 
+ */ +static int sctp_send_asconf_del_ip(struct sock *sk, + struct sockaddr *addrs, + int addrcnt) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_bind_addr *bp; + struct sctp_chunk *chunk; + union sctp_addr *laddr; + void *addr_buf; + struct sctp_af *af; + struct list_head *pos; + int i; + int retval = 0; + + if (!sctp_addip_enable) + return retval; + + sp = sctp_sk(sk); + ep = sp->ep; + + SCTP_DEBUG_PRINTK("%s: (sk: %p, addrs: %p, addrcnt: %d)\n", + __FUNCTION__, sk, addrs, addrcnt); + + list_for_each(pos, &ep->asocs) { + asoc = list_entry(pos, struct sctp_association, asocs); + + if (!asoc->peer.asconf_capable) + continue; + + if (asoc->peer.addip_disabled_mask & SCTP_PARAM_DEL_IP) + continue; + + if (!sctp_state(asoc, ESTABLISHED)) + continue; + + /* Check if any address in the packed array of addresses is + * not present in the bind address list of the association. + * If so, do not send the asconf chunk to its peer, but + * continue with other associations. + */ + addr_buf = addrs; + for (i = 0; i < addrcnt; i++) { + laddr = (union sctp_addr *)addr_buf; + af = sctp_get_af_specific(laddr->v4.sin_family); + if (!af) { + retval = -EINVAL; + goto out; + } + + if (!sctp_assoc_lookup_laddr(asoc, laddr)) + break; + + addr_buf += af->sockaddr_len; + } + if (i < addrcnt) + continue; + + /* Find one address in the association's bind address list + * that is not in the packed array of addresses. This is to + * make sure that we do not delete all the addresses in the + * association. + */ + sctp_read_lock(&asoc->base.addr_lock); + bp = &asoc->base.bind_addr; + laddr = sctp_find_unmatch_addr(bp, (union sctp_addr *)addrs, + addrcnt, sp); + sctp_read_unlock(&asoc->base.addr_lock); + if (!laddr) + continue; + + chunk = sctp_make_asconf_update_ip(asoc, laddr, addrs, addrcnt, + SCTP_PARAM_DEL_IP); + if (!chunk) { + retval = -ENOMEM; + goto out; + } + + retval = sctp_send_asconf(asoc, chunk); + + /* FIXME: After sending the delete address ASCONF chunk, we + * cannot remove the addresses from the association's bind + * address list, because there maybe some packet send to + * the delete addresses, so we should wait until ASCONF_ACK + * packet is received. + */ + } +out: + return retval; +} + +/* Helper for tunneling sctp_bindx() requests through sctp_setsockopt() + * + * API 8.1 + * int sctp_bindx(int sd, struct sockaddr *addrs, int addrcnt, + * int flags); + * + * If sd is an IPv4 socket, the addresses passed must be IPv4 addresses. + * If the sd is an IPv6 socket, the addresses passed can either be IPv4 + * or IPv6 addresses. + * + * A single address may be specified as INADDR_ANY or IN6ADDR_ANY, see + * Section 3.1.2 for this usage. + * + * addrs is a pointer to an array of one or more socket addresses. Each + * address is contained in its appropriate structure (i.e. struct + * sockaddr_in or struct sockaddr_in6) the family of the address type + * must be used to distengish the address length (note that this + * representation is termed a "packed array" of addresses). The caller + * specifies the number of addresses in the array with addrcnt. + * + * On success, sctp_bindx() returns 0. On failure, sctp_bindx() returns + * -1, and sets errno to the appropriate error code. + * + * For SCTP, the port given in each socket address must be the same, or + * sctp_bindx() will fail, setting errno to EINVAL. 
+ * + * The flags parameter is formed from the bitwise OR of zero or more of + * the following currently defined flags: + * + * SCTP_BINDX_ADD_ADDR + * + * SCTP_BINDX_REM_ADDR + * + * SCTP_BINDX_ADD_ADDR directs SCTP to add the given addresses to the + * association, and SCTP_BINDX_REM_ADDR directs SCTP to remove the given + * addresses from the association. The two flags are mutually exclusive; + * if both are given, sctp_bindx() will fail with EINVAL. A caller may + * not remove all addresses from an association; sctp_bindx() will + * reject such an attempt with EINVAL. + * + * An application can use sctp_bindx(SCTP_BINDX_ADD_ADDR) to associate + * additional addresses with an endpoint after calling bind(). Or use + * sctp_bindx(SCTP_BINDX_REM_ADDR) to remove some addresses a listening + * socket is associated with so that no new association accepted will be + * associated with those addresses. If the endpoint supports dynamic + * address a SCTP_BINDX_REM_ADDR or SCTP_BINDX_ADD_ADDR may cause a + * endpoint to send the appropriate message to the peer to change the + * peers address lists. + * + * Adding and removing addresses from a connected association is + * optional functionality. Implementations that do not support this + * functionality should return EOPNOTSUPP. + * + * Basically do nothing but copying the addresses from user to kernel + * land and invoking either sctp_bindx_add() or sctp_bindx_rem() on the sk. + * This is used for tunneling the sctp_bindx() request through sctp_setsockopt() * from userspace. + * + * We don't use copy_from_user() for optimization: we first do the + * sanity checks (buffer size -fast- and access check-healthy + * pointer); if all of those succeed, then we can alloc the memory + * (expensive operation) needed to copy the data to kernel. Then we do + * the copying without checking the user space area + * (__copy_from_user()). + * + * On exit there is no need to do sockfd_put(), sys_setsockopt() does + * it. + * + * sk The sk of the socket + * addrs The pointer to the addresses in user land + * addrssize Size of the addrs buffer + * op Operation to perform (add or remove, see the flags of + * sctp_bindx) + * + * Returns 0 if ok, <0 errno code on error. + */ +SCTP_STATIC int sctp_setsockopt_bindx(struct sock* sk, + struct sockaddr __user *addrs, + int addrs_size, int op) +{ + struct sockaddr *kaddrs; + int err; + int addrcnt = 0; + int walk_size = 0; + struct sockaddr *sa_addr; + void *addr_buf; + struct sctp_af *af; + + SCTP_DEBUG_PRINTK("sctp_setsocktopt_bindx: sk %p addrs %p" + " addrs_size %d opt %d\n", sk, addrs, addrs_size, op); + + if (unlikely(addrs_size <= 0)) + return -EINVAL; + + /* Check the user passed a healthy pointer. */ + if (unlikely(!access_ok(VERIFY_READ, addrs, addrs_size))) + return -EFAULT; + + /* Alloc space for the address array in kernel memory. */ + kaddrs = (struct sockaddr *)kmalloc(addrs_size, GFP_KERNEL); + if (unlikely(!kaddrs)) + return -ENOMEM; + + if (__copy_from_user(kaddrs, addrs, addrs_size)) { + kfree(kaddrs); + return -EFAULT; + } + + /* Walk through the addrs buffer and count the number of addresses. */ + addr_buf = kaddrs; + while (walk_size < addrs_size) { + sa_addr = (struct sockaddr *)addr_buf; + af = sctp_get_af_specific(sa_addr->sa_family); + + /* If the address family is not supported or if this address + * causes the address buffer to overflow return EINVAL. 
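A minimal userspace sketch of the sctp_bindx() tunnelling described above: two IPv4 addresses are packed back to back and handed to setsockopt() with SCTP_SOCKOPT_BINDX_ADD at the SOL_SCTP level, both of which appear in the option switch further down. The netinet/sctp.h header (e.g. the one shipped with lksctp-tools) is an assumption of the sketch, not part of this patch.

        #include <string.h>
        #include <arpa/inet.h>
        #include <netinet/in.h>
        #include <netinet/sctp.h>   /* assumed userspace header for the SCTP constants */
        #include <sys/socket.h>

        /* Add two IPv4 addresses to an already bound SCTP endpoint. */
        static int add_two_addrs(int sd, const char *ip1, const char *ip2,
                                 unsigned short port)
        {
                struct sockaddr_in packed[2];   /* the "packed array" of addresses */

                memset(packed, 0, sizeof(packed));
                packed[0].sin_family = AF_INET;
                packed[0].sin_port   = htons(port);     /* must match the bound port */
                inet_pton(AF_INET, ip1, &packed[0].sin_addr);

                packed[1] = packed[0];
                inet_pton(AF_INET, ip2, &packed[1].sin_addr);

                /* optlen is the size of the whole address buffer. */
                return setsockopt(sd, SOL_SCTP, SCTP_SOCKOPT_BINDX_ADD,
                                  packed, sizeof(packed));
        }

lksctp-tools normally hides this behind its sctp_bindx() wrapper; calling setsockopt() directly only illustrates what the kernel side above expects to find in the buffer.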
+ */ + if (!af || (walk_size + af->sockaddr_len) > addrs_size) { + kfree(kaddrs); + return -EINVAL; + } + addrcnt++; + addr_buf += af->sockaddr_len; + walk_size += af->sockaddr_len; + } + + /* Do the work. */ + switch (op) { + case SCTP_BINDX_ADD_ADDR: + err = sctp_bindx_add(sk, kaddrs, addrcnt); + if (err) + goto out; + err = sctp_send_asconf_add_ip(sk, kaddrs, addrcnt); + break; + + case SCTP_BINDX_REM_ADDR: + err = sctp_bindx_rem(sk, kaddrs, addrcnt); + if (err) + goto out; + err = sctp_send_asconf_del_ip(sk, kaddrs, addrcnt); + break; + + default: + err = -EINVAL; + break; + }; + +out: + kfree(kaddrs); + + return err; +} + +/* API 3.1.4 close() - UDP Style Syntax + * Applications use close() to perform graceful shutdown (as described in + * Section 10.1 of [SCTP]) on ALL the associations currently represented + * by a UDP-style socket. + * + * The syntax is + * + * ret = close(int sd); + * + * sd - the socket descriptor of the associations to be closed. + * + * To gracefully shutdown a specific association represented by the + * UDP-style socket, an application should use the sendmsg() call, + * passing no user data, but including the appropriate flag in the + * ancillary data (see Section xxxx). + * + * If sd in the close() call is a branched-off socket representing only + * one association, the shutdown is performed on that association only. + * + * 4.1.6 close() - TCP Style Syntax + * + * Applications use close() to gracefully close down an association. + * + * The syntax is: + * + * int close(int sd); + * + * sd - the socket descriptor of the association to be closed. + * + * After an application calls close() on a socket descriptor, no further + * socket operations will succeed on that descriptor. + * + * API 7.1.4 SO_LINGER + * + * An application using the TCP-style socket can use this option to + * perform the SCTP ABORT primitive. The linger option structure is: + * + * struct linger { + * int l_onoff; // option on/off + * int l_linger; // linger time + * }; + * + * To enable the option, set l_onoff to 1. If the l_linger value is set + * to 0, calling close() is the same as the ABORT primitive. If the + * value is set to a negative value, the setsockopt() call will return + * an error. If the value is set to a positive value linger_time, the + * close() can be blocked for at most linger_time ms. If the graceful + * shutdown phase does not finish during this period, close() will + * return but the graceful shutdown phase continues in the system. + */ +SCTP_STATIC void sctp_close(struct sock *sk, long timeout) +{ + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct list_head *pos, *temp; + + SCTP_DEBUG_PRINTK("sctp_close(sk: 0x%p, timeout:%ld)\n", sk, timeout); + + sctp_lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + ep = sctp_sk(sk)->ep; + + /* Walk all associations on a socket, not on an endpoint. */ + list_for_each_safe(pos, temp, &ep->asocs) { + asoc = list_entry(pos, struct sctp_association, asocs); + + if (sctp_style(sk, TCP)) { + /* A closed association can still be in the list if + * it belongs to a TCP-style listening socket that is + * not yet accepted. If so, free it. If not, send an + * ABORT or SHUTDOWN based on the linger options. 
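A short sketch of the SO_LINGER behaviour documented above for TCP-style sockets, using only the ordinary sockets API: with l_onoff set and l_linger zero, a later close() maps to the ABORT primitive instead of a graceful SHUTDOWN.

        #include <sys/socket.h>
        #include <unistd.h>

        /* Make close() abort the association instead of shutting it down. */
        static void abort_on_close(int sd)
        {
                struct linger lg = { .l_onoff = 1, .l_linger = 0 };

                setsockopt(sd, SOL_SOCKET, SO_LINGER, &lg, sizeof(lg));
                close(sd);      /* sends ABORT per the comment above */
        }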
+ */ + if (sctp_state(asoc, CLOSED)) { + sctp_unhash_established(asoc); + sctp_association_free(asoc); + + } else if (sock_flag(sk, SOCK_LINGER) && + !sk->sk_lingertime) + sctp_primitive_ABORT(asoc, NULL); + else + sctp_primitive_SHUTDOWN(asoc, NULL); + } else + sctp_primitive_SHUTDOWN(asoc, NULL); + } + + /* Clean up any skbs sitting on the receive queue. */ + sctp_queue_purge_ulpevents(&sk->sk_receive_queue); + sctp_queue_purge_ulpevents(&sctp_sk(sk)->pd_lobby); + + /* On a TCP-style socket, block for at most linger_time if set. */ + if (sctp_style(sk, TCP) && timeout) + sctp_wait_for_close(sk, timeout); + + /* This will run the backlog queue. */ + sctp_release_sock(sk); + + /* Supposedly, no process has access to the socket, but + * the net layers still may. + */ + sctp_local_bh_disable(); + sctp_bh_lock_sock(sk); + + /* Hold the sock, since sk_common_release() will put sock_put() + * and we have just a little more cleanup. + */ + sock_hold(sk); + sk_common_release(sk); + + sctp_bh_unlock_sock(sk); + sctp_local_bh_enable(); + + sock_put(sk); + + SCTP_DBG_OBJCNT_DEC(sock); +} + +/* Handle EPIPE error. */ +static int sctp_error(struct sock *sk, int flags, int err) +{ + if (err == -EPIPE) + err = sock_error(sk) ? : -EPIPE; + if (err == -EPIPE && !(flags & MSG_NOSIGNAL)) + send_sig(SIGPIPE, current, 0); + return err; +} + +/* API 3.1.3 sendmsg() - UDP Style Syntax + * + * An application uses sendmsg() and recvmsg() calls to transmit data to + * and receive data from its peer. + * + * ssize_t sendmsg(int socket, const struct msghdr *message, + * int flags); + * + * socket - the socket descriptor of the endpoint. + * message - pointer to the msghdr structure which contains a single + * user message and possibly some ancillary data. + * + * See Section 5 for complete description of the data + * structures. + * + * flags - flags sent or received with the user message, see Section + * 5 for complete description of the flags. + * + * Note: This function could use a rewrite especially when explicit + * connect support comes in. + */ +/* BUG: We do not implement the equivalent of sk_stream_wait_memory(). */ + +SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *, sctp_cmsgs_t *); + +SCTP_STATIC int sctp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t msg_len) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *new_asoc=NULL, *asoc=NULL; + struct sctp_transport *transport, *chunk_tp; + struct sctp_chunk *chunk; + union sctp_addr to; + struct sockaddr *msg_name = NULL; + struct sctp_sndrcvinfo default_sinfo = { 0 }; + struct sctp_sndrcvinfo *sinfo; + struct sctp_initmsg *sinit; + sctp_assoc_t associd = 0; + sctp_cmsgs_t cmsgs = { NULL }; + int err; + sctp_scope_t scope; + long timeo; + __u16 sinfo_flags = 0; + struct sctp_datamsg *datamsg; + struct list_head *pos; + int msg_flags = msg->msg_flags; + + SCTP_DEBUG_PRINTK("sctp_sendmsg(sk: %p, msg: %p, msg_len: %zu)\n", + sk, msg, msg_len); + + err = 0; + sp = sctp_sk(sk); + ep = sp->ep; + + SCTP_DEBUG_PRINTK("Using endpoint: %s.\n", ep->debug_name); + + /* We cannot send a message over a TCP-style listening socket. */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) { + err = -EPIPE; + goto out_nounlock; + } + + /* Parse out the SCTP CMSGs. */ + err = sctp_msghdr_parse(msg, &cmsgs); + + if (err) { + SCTP_DEBUG_PRINTK("msghdr parse err = %x\n", err); + goto out_nounlock; + } + + /* Fetch the destination address for this packet. 
This + * address only selects the association--it is not necessarily + * the address we will send to. + * For a peeled-off socket, msg_name is ignored. + */ + if (!sctp_style(sk, UDP_HIGH_BANDWIDTH) && msg->msg_name) { + int msg_namelen = msg->msg_namelen; + + err = sctp_verify_addr(sk, (union sctp_addr *)msg->msg_name, + msg_namelen); + if (err) + return err; + + if (msg_namelen > sizeof(to)) + msg_namelen = sizeof(to); + memcpy(&to, msg->msg_name, msg_namelen); + SCTP_DEBUG_PRINTK("Just memcpy'd. msg_name is " + "0x%x:%u.\n", + to.v4.sin_addr.s_addr, to.v4.sin_port); + + to.v4.sin_port = ntohs(to.v4.sin_port); + msg_name = msg->msg_name; + } + + sinfo = cmsgs.info; + sinit = cmsgs.init; + + /* Did the user specify SNDRCVINFO? */ + if (sinfo) { + sinfo_flags = sinfo->sinfo_flags; + associd = sinfo->sinfo_assoc_id; + } + + SCTP_DEBUG_PRINTK("msg_len: %zu, sinfo_flags: 0x%x\n", + msg_len, sinfo_flags); + + /* MSG_EOF or MSG_ABORT cannot be set on a TCP-style socket. */ + if (sctp_style(sk, TCP) && (sinfo_flags & (MSG_EOF | MSG_ABORT))) { + err = -EINVAL; + goto out_nounlock; + } + + /* If MSG_EOF is set, no data can be sent. Disallow sending zero + * length messages when MSG_EOF|MSG_ABORT is not set. + * If MSG_ABORT is set, the message length could be non zero with + * the msg_iov set to the user abort reason. + */ + if (((sinfo_flags & MSG_EOF) && (msg_len > 0)) || + (!(sinfo_flags & (MSG_EOF|MSG_ABORT)) && (msg_len == 0))) { + err = -EINVAL; + goto out_nounlock; + } + + /* If MSG_ADDR_OVER is set, there must be an address + * specified in msg_name. + */ + if ((sinfo_flags & MSG_ADDR_OVER) && (!msg->msg_name)) { + err = -EINVAL; + goto out_nounlock; + } + + transport = NULL; + + SCTP_DEBUG_PRINTK("About to look up association.\n"); + + sctp_lock_sock(sk); + + /* If a msg_name has been specified, assume this is to be used. */ + if (msg_name) { + /* Look for a matching association on the endpoint. */ + asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); + if (!asoc) { + /* If we could not find a matching association on the + * endpoint, make sure that it is not a TCP-style + * socket that already has an association or there is + * no peeled-off association on another socket. + */ + if ((sctp_style(sk, TCP) && + sctp_sstate(sk, ESTABLISHED)) || + sctp_endpoint_is_peeled_off(ep, &to)) { + err = -EADDRNOTAVAIL; + goto out_unlock; + } + } + } else { + asoc = sctp_id2assoc(sk, associd); + if (!asoc) { + err = -EPIPE; + goto out_unlock; + } + } + + if (asoc) { + SCTP_DEBUG_PRINTK("Just looked up association: %p.\n", asoc); + + /* We cannot send a message on a TCP-style SCTP_SS_ESTABLISHED + * socket that has an association in CLOSED state. This can + * happen when an accepted socket has an association that is + * already CLOSED. + */ + if (sctp_state(asoc, CLOSED) && sctp_style(sk, TCP)) { + err = -EPIPE; + goto out_unlock; + } + + if (sinfo_flags & MSG_EOF) { + SCTP_DEBUG_PRINTK("Shutting down association: %p\n", + asoc); + sctp_primitive_SHUTDOWN(asoc, NULL); + err = 0; + goto out_unlock; + } + if (sinfo_flags & MSG_ABORT) { + SCTP_DEBUG_PRINTK("Aborting association: %p\n", asoc); + sctp_primitive_ABORT(asoc, msg); + err = 0; + goto out_unlock; + } + } + + /* Do we need to create the association? */ + if (!asoc) { + SCTP_DEBUG_PRINTK("There is no association yet.\n"); + + if (sinfo_flags & (MSG_EOF | MSG_ABORT)) { + err = -EINVAL; + goto out_unlock; + } + + /* Check for invalid stream against the stream counts, + * either the default or the user specified stream counts. 
+ */ + if (sinfo) { + if (!sinit || (sinit && !sinit->sinit_num_ostreams)) { + /* Check against the defaults. */ + if (sinfo->sinfo_stream >= + sp->initmsg.sinit_num_ostreams) { + err = -EINVAL; + goto out_unlock; + } + } else { + /* Check against the requested. */ + if (sinfo->sinfo_stream >= + sinit->sinit_num_ostreams) { + err = -EINVAL; + goto out_unlock; + } + } + } + + /* + * API 3.1.2 bind() - UDP Style Syntax + * If a bind() or sctp_bindx() is not called prior to a + * sendmsg() call that initiates a new association, the + * system picks an ephemeral port and will choose an address + * set equivalent to binding with a wildcard address. + */ + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) { + err = -EAGAIN; + goto out_unlock; + } + } + + scope = sctp_scope(&to); + new_asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!new_asoc) { + err = -ENOMEM; + goto out_unlock; + } + asoc = new_asoc; + + /* If the SCTP_INIT ancillary data is specified, set all + * the association init values accordingly. + */ + if (sinit) { + if (sinit->sinit_num_ostreams) { + asoc->c.sinit_num_ostreams = + sinit->sinit_num_ostreams; + } + if (sinit->sinit_max_instreams) { + asoc->c.sinit_max_instreams = + sinit->sinit_max_instreams; + } + if (sinit->sinit_max_attempts) { + asoc->max_init_attempts + = sinit->sinit_max_attempts; + } + if (sinit->sinit_max_init_timeo) { + asoc->max_init_timeo = + msecs_to_jiffies(sinit->sinit_max_init_timeo); + } + } + + /* Prime the peer's transport structures. */ + transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL); + if (!transport) { + err = -ENOMEM; + goto out_free; + } + err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL); + if (err < 0) { + err = -ENOMEM; + goto out_free; + } + } + + /* ASSERT: we have a valid association at this point. */ + SCTP_DEBUG_PRINTK("We have a valid association.\n"); + + if (!sinfo) { + /* If the user didn't specify SNDRCVINFO, make up one with + * some defaults. + */ + default_sinfo.sinfo_stream = asoc->default_stream; + default_sinfo.sinfo_flags = asoc->default_flags; + default_sinfo.sinfo_ppid = asoc->default_ppid; + default_sinfo.sinfo_context = asoc->default_context; + default_sinfo.sinfo_timetolive = asoc->default_timetolive; + default_sinfo.sinfo_assoc_id = sctp_assoc2id(asoc); + sinfo = &default_sinfo; + } + + /* API 7.1.7, the sndbuf size per association bounds the + * maximum size of data that can be sent in a single send call. + */ + if (msg_len > sk->sk_sndbuf) { + err = -EMSGSIZE; + goto out_free; + } + + /* If fragmentation is disabled and the message length exceeds the + * association fragmentation point, return EMSGSIZE. The I-D + * does not specify what this error is, but this looks like + * a great fit. + */ + if (sctp_sk(sk)->disable_fragments && (msg_len > asoc->frag_point)) { + err = -EMSGSIZE; + goto out_free; + } + + if (sinfo) { + /* Check for invalid stream. */ + if (sinfo->sinfo_stream >= asoc->c.sinit_num_ostreams) { + err = -EINVAL; + goto out_free; + } + } + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + if (!sctp_wspace(asoc)) { + err = sctp_wait_for_sndbuf(asoc, &timeo, msg_len); + if (err) + goto out_free; + } + + /* If an address is passed with the sendto/sendmsg call, it is used + * to override the primary destination address in the TCP model, or + * when MSG_ADDR_OVER flag is set in the UDP model. 
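To illustrate the sendmsg() path being implemented here, a hedged userspace sketch that sends one message on stream 1 of a UDP-style socket, passing a struct sctp_sndrcvinfo as ancillary data. The IPPROTO_SCTP/SCTP_SNDRCV cmsg naming follows the sockets-API draft and is assumed here; only struct sctp_sndrcvinfo itself is visible in this hunk.

        #include <string.h>
        #include <netinet/in.h>
        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>
        #include <sys/uio.h>

        static ssize_t send_on_stream1(int sd, struct sockaddr_in *to,
                                       const char *buf, size_t len)
        {
                struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };
                char cbuf[CMSG_SPACE(sizeof(struct sctp_sndrcvinfo))];
                struct msghdr msg;
                struct cmsghdr *cmsg;
                struct sctp_sndrcvinfo *sinfo;

                memset(&msg, 0, sizeof(msg));
                memset(cbuf, 0, sizeof(cbuf));
                msg.msg_name       = to;            /* selects the association */
                msg.msg_namelen    = sizeof(*to);
                msg.msg_iov        = &iov;
                msg.msg_iovlen     = 1;
                msg.msg_control    = cbuf;
                msg.msg_controllen = sizeof(cbuf);

                cmsg = CMSG_FIRSTHDR(&msg);
                cmsg->cmsg_level = IPPROTO_SCTP;    /* assumed cmsg level */
                cmsg->cmsg_type  = SCTP_SNDRCV;     /* assumed cmsg type */
                cmsg->cmsg_len   = CMSG_LEN(sizeof(*sinfo));
                sinfo = (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg);
                sinfo->sinfo_stream = 1;            /* checked against the stream count */

                return sendmsg(sd, &msg, 0);
        }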
+ */ + if ((sctp_style(sk, TCP) && msg_name) || + (sinfo_flags & MSG_ADDR_OVER)) { + chunk_tp = sctp_assoc_lookup_paddr(asoc, &to); + if (!chunk_tp) { + err = -EINVAL; + goto out_free; + } + } else + chunk_tp = NULL; + + /* Auto-connect, if we aren't connected already. */ + if (sctp_state(asoc, CLOSED)) { + err = sctp_primitive_ASSOCIATE(asoc, NULL); + if (err < 0) + goto out_free; + SCTP_DEBUG_PRINTK("We associated primitively.\n"); + } + + /* Break the message into multiple chunks of maximum size. */ + datamsg = sctp_datamsg_from_user(asoc, sinfo, msg, msg_len); + if (!datamsg) { + err = -ENOMEM; + goto out_free; + } + + /* Now send the (possibly) fragmented message. */ + list_for_each(pos, &datamsg->chunks) { + chunk = list_entry(pos, struct sctp_chunk, frag_list); + sctp_datamsg_track(chunk); + + /* Do accounting for the write space. */ + sctp_set_owner_w(chunk); + + chunk->transport = chunk_tp; + + /* Send it to the lower layers. Note: all chunks + * must either fail or succeed. The lower layer + * works that way today. Keep it that way or this + * breaks. + */ + err = sctp_primitive_SEND(asoc, chunk); + /* Did the lower layer accept the chunk? */ + if (err) + sctp_chunk_free(chunk); + SCTP_DEBUG_PRINTK("We sent primitively.\n"); + } + + sctp_datamsg_free(datamsg); + if (err) + goto out_free; + else + err = msg_len; + + /* If we are already past ASSOCIATE, the lower + * layers are responsible for association cleanup. + */ + goto out_unlock; + +out_free: + if (new_asoc) + sctp_association_free(asoc); +out_unlock: + sctp_release_sock(sk); + +out_nounlock: + return sctp_error(sk, msg_flags, err); + +#if 0 +do_sock_err: + if (msg_len) + err = msg_len; + else + err = sock_error(sk); + goto out; + +do_interrupted: + if (msg_len) + err = msg_len; + goto out; +#endif /* 0 */ +} + +/* This is an extended version of skb_pull() that removes the data from the + * start of a skb even when data is spread across the list of skb's in the + * frag_list. len specifies the total amount of data that needs to be removed. + * when 'len' bytes could be removed from the skb, it returns 0. + * If 'len' exceeds the total skb length, it returns the no. of bytes that + * could not be removed. + */ +static int sctp_skb_pull(struct sk_buff *skb, int len) +{ + struct sk_buff *list; + int skb_len = skb_headlen(skb); + int rlen; + + if (len <= skb_len) { + __skb_pull(skb, len); + return 0; + } + len -= skb_len; + __skb_pull(skb, skb_len); + + for (list = skb_shinfo(skb)->frag_list; list; list = list->next) { + rlen = sctp_skb_pull(list, len); + skb->len -= (len-rlen); + skb->data_len -= (len-rlen); + + if (!rlen) + return 0; + + len = rlen; + } + + return len; +} + +/* API 3.1.3 recvmsg() - UDP Style Syntax + * + * ssize_t recvmsg(int socket, struct msghdr *message, + * int flags); + * + * socket - the socket descriptor of the endpoint. + * message - pointer to the msghdr structure which contains a single + * user message and possibly some ancillary data. + * + * See Section 5 for complete description of the data + * structures. + * + * flags - flags sent or received with the user message, see Section + * 5 for complete description of the flags. 
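A small sketch of the matching recvmsg() usage: notifications arrive with MSG_NOTIFICATION set, and a read that does not consume the whole message comes back without MSG_EOR, exactly as the implementation below arranges (MSG_NOTIFICATION is assumed to be visible to userspace through the SCTP headers).

        #include <stdio.h>
        #include <sys/socket.h>
        #include <sys/uio.h>

        static void drain_one(int sd)
        {
                char buf[256];
                struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
                struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
                ssize_t n = recvmsg(sd, &msg, 0);

                if (n <= 0)
                        return;
                if (msg.msg_flags & MSG_NOTIFICATION)
                        printf("notification, %zd bytes\n", n);
                else if (!(msg.msg_flags & MSG_EOR))
                        printf("partial message, call recvmsg() again\n");
                else
                        printf("complete %zd-byte message\n", n);
        }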
+ */ +static struct sk_buff *sctp_skb_recv_datagram(struct sock *, int, int, int *); + +SCTP_STATIC int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t len, int noblock, + int flags, int *addr_len) +{ + struct sctp_ulpevent *event = NULL; + struct sctp_sock *sp = sctp_sk(sk); + struct sk_buff *skb; + int copied; + int err = 0; + int skb_len; + + SCTP_DEBUG_PRINTK("sctp_recvmsg(%s: %p, %s: %p, %s: %zd, %s: %d, %s: " + "0x%x, %s: %p)\n", "sk", sk, "msghdr", msg, + "len", len, "knoblauch", noblock, + "flags", flags, "addr_len", addr_len); + + sctp_lock_sock(sk); + + if (sctp_style(sk, TCP) && !sctp_sstate(sk, ESTABLISHED)) { + err = -ENOTCONN; + goto out; + } + + skb = sctp_skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out; + + /* Get the total length of the skb including any skb's in the + * frag_list. + */ + skb_len = skb->len; + + copied = skb_len; + if (copied > len) + copied = len; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + + event = sctp_skb2event(skb); + + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + if (sctp_ulpevent_is_notification(event)) { + msg->msg_flags |= MSG_NOTIFICATION; + sp->pf->event_msgname(event, msg->msg_name, addr_len); + } else { + sp->pf->skb_msgname(skb, msg->msg_name, addr_len); + } + + /* Check if we allow SCTP_SNDRCVINFO. */ + if (sp->subscribe.sctp_data_io_event) + sctp_ulpevent_read_sndrcvinfo(event, msg); +#if 0 + /* FIXME: we should be calling IP/IPv6 layers. */ + if (sk->sk_protinfo.af_inet.cmsg_flags) + ip_cmsg_recv(msg, skb); +#endif + + err = copied; + + /* If skb's length exceeds the user's buffer, update the skb and + * push it back to the receive_queue so that the next call to + * recvmsg() will return the remaining data. Don't set MSG_EOR. + */ + if (skb_len > copied) { + msg->msg_flags &= ~MSG_EOR; + if (flags & MSG_PEEK) + goto out_free; + sctp_skb_pull(skb, copied); + skb_queue_head(&sk->sk_receive_queue, skb); + + /* When only partial message is copied to the user, increase + * rwnd by that amount. If all the data in the skb is read, + * rwnd is updated when the event is freed. + */ + sctp_assoc_rwnd_increase(event->asoc, copied); + goto out; + } else if ((event->msg_flags & MSG_NOTIFICATION) || + (event->msg_flags & MSG_EOR)) + msg->msg_flags |= MSG_EOR; + else + msg->msg_flags &= ~MSG_EOR; + +out_free: + if (flags & MSG_PEEK) { + /* Release the skb reference acquired after peeking the skb in + * sctp_skb_recv_datagram(). + */ + kfree_skb(skb); + } else { + /* Free the event which includes releasing the reference to + * the owner of the skb, freeing the skb and updating the + * rwnd. + */ + sctp_ulpevent_free(event); + } +out: + sctp_release_sock(sk); + return err; +} + +/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) + * + * This option is a on/off flag. If enabled no SCTP message + * fragmentation will be performed. Instead if a message being sent + * exceeds the current PMTU size, the message will NOT be sent and + * instead a error will be indicated to the user. + */ +static int sctp_setsockopt_disable_fragments(struct sock *sk, + char __user *optval, int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->disable_fragments = (val == 0) ? 
0 : 1; + + return 0; +} + +static int sctp_setsockopt_events(struct sock *sk, char __user *optval, + int optlen) +{ + if (optlen != sizeof(struct sctp_event_subscribe)) + return -EINVAL; + if (copy_from_user(&sctp_sk(sk)->subscribe, optval, optlen)) + return -EFAULT; + return 0; +} + +/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) + * + * This socket option is applicable to the UDP-style socket only. When + * set it will cause associations that are idle for more than the + * specified number of seconds to automatically close. An association + * being idle is defined an association that has NOT sent or received + * user data. The special value of '0' indicates that no automatic + * close of any associations should be performed. The option expects an + * integer defining the number of seconds of idle time before an + * association is closed. + */ +static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, + int optlen) +{ + struct sctp_sock *sp = sctp_sk(sk); + + /* Applicable to UDP-style socket only */ + if (sctp_style(sk, TCP)) + return -EOPNOTSUPP; + if (optlen != sizeof(int)) + return -EINVAL; + if (copy_from_user(&sp->autoclose, optval, optlen)) + return -EFAULT; + + sp->ep->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; + return 0; +} + +/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) + * + * Applications can enable or disable heartbeats for any peer address of + * an association, modify an address's heartbeat interval, force a + * heartbeat to be sent immediately, and adjust the address's maximum + * number of retransmissions sent before an address is considered + * unreachable. The following structure is used to access and modify an + * address's parameters: + * + * struct sctp_paddrparams { + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * }; + * + * spp_assoc_id - (UDP style socket) This is filled in the application, + * and identifies the association for this query. + * spp_address - This specifies which address is of interest. + * spp_hbinterval - This contains the value of the heartbeat interval, + * in milliseconds. A value of 0, when modifying the + * parameter, specifies that the heartbeat on this + * address should be disabled. A value of UINT32_MAX + * (4294967295), when modifying the parameter, + * specifies that a heartbeat should be sent + * immediately to the peer address, and the current + * interval should remain unchanged. + * spp_pathmaxrxt - This contains the maximum number of + * retransmissions before this address shall be + * considered unreachable. + */ +static int sctp_setsockopt_peer_addr_params(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_paddrparams params; + struct sctp_transport *trans; + int error; + + if (optlen != sizeof(struct sctp_paddrparams)) + return -EINVAL; + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + /* + * API 7. Socket Options (setting the default value for the endpoint) + * All options that support specific settings on an association by + * filling in either an association id variable or a sockaddr_storage + * SHOULD also support setting of the same value for the entire endpoint + * (i.e. future associations). To accomplish this the following logic is + * used when setting one of these options: + + * c) If neither the sockaddr_storage or association identification is + * set i.e. 
the sockaddr_storage is set to all 0's (INADDR_ANY) and + * the association identification is 0, the settings are a default + * and to be applied to the endpoint (all future associations). + */ + + /* update default value for endpoint (all future associations) */ + if (!params.spp_assoc_id && + sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + /* Manual heartbeat on an endpoint is invalid. */ + if (0xffffffff == params.spp_hbinterval) + return -EINVAL; + else if (params.spp_hbinterval) + sctp_sk(sk)->paddrparam.spp_hbinterval = + params.spp_hbinterval; + if (params.spp_pathmaxrxt) + sctp_sk(sk)->paddrparam.spp_pathmaxrxt = + params.spp_pathmaxrxt; + return 0; + } + + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) + return -EINVAL; + + /* Applications can enable or disable heartbeats for any peer address + * of an association, modify an address's heartbeat interval, force a + * heartbeat to be sent immediately, and adjust the address's maximum + * number of retransmissions sent before an address is considered + * unreachable. + * + * The value of the heartbeat interval, in milliseconds. A value of + * UINT32_MAX (4294967295), when modifying the parameter, specifies + * that a heartbeat should be sent immediately to the peer address, + * and the current interval should remain unchanged. + */ + if (0xffffffff == params.spp_hbinterval) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } else { + /* The value of the heartbeat interval, in milliseconds. A value of 0, + * when modifying the parameter, specifies that the heartbeat on this + * address should be disabled. + */ + if (params.spp_hbinterval) { + trans->hb_allowed = 1; + trans->hb_interval = + msecs_to_jiffies(params.spp_hbinterval); + } else + trans->hb_allowed = 0; + } + + /* spp_pathmaxrxt contains the maximum number of retransmissions + * before this address shall be considered unreachable. + */ + if (params.spp_pathmaxrxt) + trans->max_retrans = params.spp_pathmaxrxt; + + return 0; +} + +/* 7.1.3 Initialization Parameters (SCTP_INITMSG) + * + * Applications can specify protocol parameters for the default association + * initialization. The option name argument to setsockopt() and getsockopt() + * is SCTP_INITMSG. + * + * Setting initialization parameters is effective only on an unconnected + * socket (for UDP-style sockets only future associations are effected + * by the change). With TCP-style sockets, this option is inherited by + * sockets derived from a listener socket. + */ +static int sctp_setsockopt_initmsg(struct sock *sk, char __user *optval, int optlen) +{ + struct sctp_initmsg sinit; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_initmsg)) + return -EINVAL; + if (copy_from_user(&sinit, optval, optlen)) + return -EFAULT; + + if (sinit.sinit_num_ostreams) + sp->initmsg.sinit_num_ostreams = sinit.sinit_num_ostreams; + if (sinit.sinit_max_instreams) + sp->initmsg.sinit_max_instreams = sinit.sinit_max_instreams; + if (sinit.sinit_max_attempts) + sp->initmsg.sinit_max_attempts = sinit.sinit_max_attempts; + if (sinit.sinit_max_init_timeo) + sp->initmsg.sinit_max_init_timeo = sinit.sinit_max_init_timeo; + + return 0; +} + +/* + * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) + * + * Applications that wish to use the sendto() system call may wish to + * specify a default set of parameters that would normally be supplied + * through the inclusion of ancillary data. 
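A hedged sketch of the SCTP_INITMSG handler just above: only non-zero fields are applied, so requesting a larger outbound stream count for future associations is a one-field operation (netinet/sctp.h is an assumed userspace header).

        #include <string.h>
        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>

        static int request_ostreams(int sd, unsigned short n)
        {
                struct sctp_initmsg im;

                memset(&im, 0, sizeof(im));     /* zero fields stay unchanged */
                im.sinit_num_ostreams = n;

                return setsockopt(sd, SOL_SCTP, SCTP_INITMSG, &im, sizeof(im));
        }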
This socket option allows + * such an application to set the default sctp_sndrcvinfo structure. + * The application that wishes to use this socket option simply passes + * in to this call the sctp_sndrcvinfo structure defined in Section + * 5.2.2) The input parameters accepted by this call include + * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, + * sinfo_timetolive. The user must provide the sinfo_assoc_id field in + * to this call if the caller is using the UDP model. + */ +static int sctp_setsockopt_default_send_param(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_sndrcvinfo info; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_sndrcvinfo)) + return -EINVAL; + if (copy_from_user(&info, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); + if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + asoc->default_stream = info.sinfo_stream; + asoc->default_flags = info.sinfo_flags; + asoc->default_ppid = info.sinfo_ppid; + asoc->default_context = info.sinfo_context; + asoc->default_timetolive = info.sinfo_timetolive; + } else { + sp->default_stream = info.sinfo_stream; + sp->default_flags = info.sinfo_flags; + sp->default_ppid = info.sinfo_ppid; + sp->default_context = info.sinfo_context; + sp->default_timetolive = info.sinfo_timetolive; + } + + return 0; +} + +/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) + * + * Requests that the local SCTP stack use the enclosed peer address as + * the association primary. The enclosed address must be one of the + * association peer's addresses. + */ +static int sctp_setsockopt_primary_addr(struct sock *sk, char __user *optval, + int optlen) +{ + struct sctp_prim prim; + struct sctp_transport *trans; + + if (optlen != sizeof(struct sctp_prim)) + return -EINVAL; + + if (copy_from_user(&prim, optval, sizeof(struct sctp_prim))) + return -EFAULT; + + trans = sctp_addr_id2transport(sk, &prim.ssp_addr, prim.ssp_assoc_id); + if (!trans) + return -EINVAL; + + sctp_assoc_set_primary(trans->asoc, trans); + + return 0; +} + +/* + * 7.1.5 SCTP_NODELAY + * + * Turn on/off any Nagle-like algorithm. This means that packets are + * generally sent as soon as possible and no unnecessary delays are + * introduced, at the cost of more packets in the network. Expects an + * integer boolean flag. + */ +static int sctp_setsockopt_nodelay(struct sock *sk, char __user *optval, + int optlen) +{ + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + + sctp_sk(sk)->nodelay = (val == 0) ? 0 : 1; + return 0; +} + +/* + * + * 7.1.1 SCTP_RTOINFO + * + * The protocol parameters used to initialize and bound retransmission + * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access + * and modify these parameters. + * All parameters are time values, in milliseconds. A value of 0, when + * modifying the parameters, indicates that the current value should not + * be changed. 
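A sketch of the SCTP_DEFAULT_SEND_PARAM handler above: with sinfo_assoc_id left at zero on a UDP-style socket, the values become endpoint-wide defaults for later sendmsg() calls that carry no ancillary data.

        #include <string.h>
        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>

        static int set_default_stream(int sd, unsigned short stream)
        {
                struct sctp_sndrcvinfo info;

                memset(&info, 0, sizeof(info));
                info.sinfo_stream = stream;     /* sinfo_assoc_id == 0: endpoint default */

                return setsockopt(sd, SOL_SCTP, SCTP_DEFAULT_SEND_PARAM,
                                  &info, sizeof(info));
        }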
+ * + */ +static int sctp_setsockopt_rtoinfo(struct sock *sk, char __user *optval, int optlen) { + struct sctp_rtoinfo rtoinfo; + struct sctp_association *asoc; + + if (optlen != sizeof (struct sctp_rtoinfo)) + return -EINVAL; + + if (copy_from_user(&rtoinfo, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); + + /* Set the values to the specific association */ + if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + if (rtoinfo.srto_initial != 0) + asoc->rto_initial = + msecs_to_jiffies(rtoinfo.srto_initial); + if (rtoinfo.srto_max != 0) + asoc->rto_max = msecs_to_jiffies(rtoinfo.srto_max); + if (rtoinfo.srto_min != 0) + asoc->rto_min = msecs_to_jiffies(rtoinfo.srto_min); + } else { + /* If there is no association or the association-id = 0 + * set the values to the endpoint. + */ + struct sctp_sock *sp = sctp_sk(sk); + + if (rtoinfo.srto_initial != 0) + sp->rtoinfo.srto_initial = rtoinfo.srto_initial; + if (rtoinfo.srto_max != 0) + sp->rtoinfo.srto_max = rtoinfo.srto_max; + if (rtoinfo.srto_min != 0) + sp->rtoinfo.srto_min = rtoinfo.srto_min; + } + + return 0; +} + +/* + * + * 7.1.2 SCTP_ASSOCINFO + * + * This option is used to tune the the maximum retransmission attempts + * of the association. + * Returns an error if the new association retransmission value is + * greater than the sum of the retransmission value of the peer. + * See [SCTP] for more information. + * + */ +static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int optlen) +{ + + struct sctp_assocparams assocparams; + struct sctp_association *asoc; + + if (optlen != sizeof(struct sctp_assocparams)) + return -EINVAL; + if (copy_from_user(&assocparams, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); + + if (!asoc && assocparams.sasoc_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Set the values to the specific association */ + if (asoc) { + if (assocparams.sasoc_asocmaxrxt != 0) + asoc->max_retrans = assocparams.sasoc_asocmaxrxt; + if (assocparams.sasoc_cookie_life != 0) { + asoc->cookie_life.tv_sec = + assocparams.sasoc_cookie_life / 1000; + asoc->cookie_life.tv_usec = + (assocparams.sasoc_cookie_life % 1000) + * 1000; + } + } else { + /* Set the values to the endpoint */ + struct sctp_sock *sp = sctp_sk(sk); + + if (assocparams.sasoc_asocmaxrxt != 0) + sp->assocparams.sasoc_asocmaxrxt = + assocparams.sasoc_asocmaxrxt; + if (assocparams.sasoc_cookie_life != 0) + sp->assocparams.sasoc_cookie_life = + assocparams.sasoc_cookie_life; + } + return 0; +} + +/* + * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) + * + * This socket option is a boolean flag which turns on or off mapped V4 + * addresses. If this option is turned on and the socket is type + * PF_INET6, then IPv4 addresses will be mapped to V6 representation. + * If this option is turned off, then no mapping will be done of V4 + * addresses and a user will receive both PF_INET6 and PF_INET type + * addresses on the socket. 
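A sketch of SCTP_RTOINFO as handled above; the values are in milliseconds and a zero field leaves the corresponding parameter untouched.

        #include <string.h>
        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>

        static int tune_rto(int sd, unsigned int init_ms, unsigned int min_ms,
                            unsigned int max_ms)
        {
                struct sctp_rtoinfo rto;

                memset(&rto, 0, sizeof(rto));   /* srto_assoc_id == 0: endpoint */
                rto.srto_initial = init_ms;     /* 0 would mean "unchanged" */
                rto.srto_min     = min_ms;
                rto.srto_max     = max_ms;

                return setsockopt(sd, SOL_SCTP, SCTP_RTOINFO, &rto, sizeof(rto));
        }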
+ */ +static int sctp_setsockopt_mappedv4(struct sock *sk, char __user *optval, int optlen) +{ + int val; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + if (val) + sp->v4mapped = 1; + else + sp->v4mapped = 0; + + return 0; +} + +/* + * 7.1.17 Set the maximum fragrmentation size (SCTP_MAXSEG) + * + * This socket option specifies the maximum size to put in any outgoing + * SCTP chunk. If a message is larger than this size it will be + * fragmented by SCTP into the specified size. Note that the underlying + * SCTP implementation may fragment into smaller sized chunks when the + * PMTU of the underlying association is smaller than the value set by + * the user. + */ +static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optlen) +{ + struct sctp_association *asoc; + struct list_head *pos; + struct sctp_sock *sp = sctp_sk(sk); + int val; + + if (optlen < sizeof(int)) + return -EINVAL; + if (get_user(val, (int __user *)optval)) + return -EFAULT; + if ((val < 8) || (val > SCTP_MAX_CHUNK_LEN)) + return -EINVAL; + sp->user_frag = val; + + if (val) { + /* Update the frag_point of the existing associations. */ + list_for_each(pos, &(sp->ep->asocs)) { + asoc = list_entry(pos, struct sctp_association, asocs); + asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + } + } + + return 0; +} + + +/* + * 7.1.9 Set Peer Primary Address (SCTP_SET_PEER_PRIMARY_ADDR) + * + * Requests that the peer mark the enclosed address as the association + * primary. The enclosed address must be one of the association's + * locally bound addresses. The following structure is used to make a + * set primary request: + */ +static int sctp_setsockopt_peer_primary_addr(struct sock *sk, char __user *optval, + int optlen) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc = NULL; + struct sctp_setpeerprim prim; + struct sctp_chunk *chunk; + int err; + + sp = sctp_sk(sk); + ep = sp->ep; + + if (!sctp_addip_enable) + return -EPERM; + + if (optlen != sizeof(struct sctp_setpeerprim)) + return -EINVAL; + + if (copy_from_user(&prim, optval, optlen)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, prim.sspp_assoc_id); + if (!asoc) + return -EINVAL; + + if (!asoc->peer.asconf_capable) + return -EPERM; + + if (asoc->peer.addip_disabled_mask & SCTP_PARAM_SET_PRIMARY) + return -EPERM; + + if (!sctp_state(asoc, ESTABLISHED)) + return -ENOTCONN; + + if (!sctp_assoc_lookup_laddr(asoc, (union sctp_addr *)&prim.sspp_addr)) + return -EADDRNOTAVAIL; + + /* Create an ASCONF chunk with SET_PRIMARY parameter */ + chunk = sctp_make_asconf_set_prim(asoc, + (union sctp_addr *)&prim.sspp_addr); + if (!chunk) + return -ENOMEM; + + err = sctp_send_asconf(asoc, chunk); + + SCTP_DEBUG_PRINTK("We set peer primary addr primitively.\n"); + + return err; +} + +static int sctp_setsockopt_adaption_layer(struct sock *sk, char __user *optval, + int optlen) +{ + __u32 val; + + if (optlen < sizeof(__u32)) + return -EINVAL; + if (copy_from_user(&val, optval, sizeof(__u32))) + return -EFAULT; + + sctp_sk(sk)->adaption_ind = val; + + return 0; +} + +/* API 6.2 setsockopt(), getsockopt() + * + * Applications use setsockopt() and getsockopt() to set or retrieve + * socket options. Socket options are used to change the default + * behavior of sockets calls. They are described in Section 7. 
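A one-call sketch of the SCTP_MAXSEG handler above, which caps the chunk size of existing and future associations on the socket; the kernel rejects values below 8 or above SCTP_MAX_CHUNK_LEN.

        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>

        static int cap_chunk_size(int sd, int bytes)
        {
                return setsockopt(sd, SOL_SCTP, SCTP_MAXSEG, &bytes, sizeof(bytes));
        }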
+ * + * The syntax is: + * + * ret = getsockopt(int sd, int level, int optname, void __user *optval, + * int __user *optlen); + * ret = setsockopt(int sd, int level, int optname, const void __user *optval, + * int optlen); + * + * sd - the socket descript. + * level - set to IPPROTO_SCTP for all SCTP options. + * optname - the option name. + * optval - the buffer to store the value of the option. + * optlen - the size of the buffer. + */ +SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, + char __user *optval, int optlen) +{ + int retval = 0; + + SCTP_DEBUG_PRINTK("sctp_setsockopt(sk: %p... optname: %d)\n", + sk, optname); + + /* I can hardly begin to describe how wrong this is. This is + * so broken as to be worse than useless. The API draft + * REALLY is NOT helpful here... I am not convinced that the + * semantics of setsockopt() with a level OTHER THAN SOL_SCTP + * are at all well-founded. + */ + if (level != SOL_SCTP) { + struct sctp_af *af = sctp_sk(sk)->pf->af; + retval = af->setsockopt(sk, level, optname, optval, optlen); + goto out_nounlock; + } + + sctp_lock_sock(sk); + + switch (optname) { + case SCTP_SOCKOPT_BINDX_ADD: + /* 'optlen' is the size of the addresses buffer. */ + retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, + optlen, SCTP_BINDX_ADD_ADDR); + break; + + case SCTP_SOCKOPT_BINDX_REM: + /* 'optlen' is the size of the addresses buffer. */ + retval = sctp_setsockopt_bindx(sk, (struct sockaddr __user *)optval, + optlen, SCTP_BINDX_REM_ADDR); + break; + + case SCTP_DISABLE_FRAGMENTS: + retval = sctp_setsockopt_disable_fragments(sk, optval, optlen); + break; + + case SCTP_EVENTS: + retval = sctp_setsockopt_events(sk, optval, optlen); + break; + + case SCTP_AUTOCLOSE: + retval = sctp_setsockopt_autoclose(sk, optval, optlen); + break; + + case SCTP_PEER_ADDR_PARAMS: + retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); + break; + + case SCTP_INITMSG: + retval = sctp_setsockopt_initmsg(sk, optval, optlen); + break; + case SCTP_DEFAULT_SEND_PARAM: + retval = sctp_setsockopt_default_send_param(sk, optval, + optlen); + break; + case SCTP_PRIMARY_ADDR: + retval = sctp_setsockopt_primary_addr(sk, optval, optlen); + break; + case SCTP_SET_PEER_PRIMARY_ADDR: + retval = sctp_setsockopt_peer_primary_addr(sk, optval, optlen); + break; + case SCTP_NODELAY: + retval = sctp_setsockopt_nodelay(sk, optval, optlen); + break; + case SCTP_RTOINFO: + retval = sctp_setsockopt_rtoinfo(sk, optval, optlen); + break; + case SCTP_ASSOCINFO: + retval = sctp_setsockopt_associnfo(sk, optval, optlen); + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + retval = sctp_setsockopt_mappedv4(sk, optval, optlen); + break; + case SCTP_MAXSEG: + retval = sctp_setsockopt_maxseg(sk, optval, optlen); + break; + case SCTP_ADAPTION_LAYER: + retval = sctp_setsockopt_adaption_layer(sk, optval, optlen); + break; + + default: + retval = -ENOPROTOOPT; + break; + }; + + sctp_release_sock(sk); + +out_nounlock: + return retval; +} + +/* API 3.1.6 connect() - UDP Style Syntax + * + * An application may use the connect() call in the UDP model to initiate an + * association without sending data. + * + * The syntax is: + * + * ret = connect(int sd, const struct sockaddr *nam, socklen_t len); + * + * sd: the socket descriptor to have a new association added to. + * + * nam: the address structure (either struct sockaddr_in or struct + * sockaddr_in6 defined in RFC2553 [7]). + * + * len: the size of the address. 
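A sketch of the UDP-style connect() described above: on a SOCK_SEQPACKET SCTP socket it starts an association without sending data, auto-binding an ephemeral port if bind() was never called.

        #include <string.h>
        #include <arpa/inet.h>
        #include <netinet/in.h>
        #include <sys/socket.h>

        static int start_assoc(int sd, const char *ip, unsigned short port)
        {
                struct sockaddr_in peer;

                memset(&peer, 0, sizeof(peer));
                peer.sin_family = AF_INET;
                peer.sin_port   = htons(port);
                inet_pton(AF_INET, ip, &peer.sin_addr);

                return connect(sd, (struct sockaddr *)&peer, sizeof(peer));
        }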
+ */ +SCTP_STATIC int sctp_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sctp_association *asoc; + struct sctp_transport *transport; + union sctp_addr to; + struct sctp_af *af; + sctp_scope_t scope; + long timeo; + int err = 0; + + sctp_lock_sock(sk); + + SCTP_DEBUG_PRINTK("%s - sk: %p, sockaddr: %p, addr_len: %d)\n", + __FUNCTION__, sk, uaddr, addr_len); + + sp = sctp_sk(sk); + ep = sp->ep; + + /* connect() cannot be done on a socket that is already in ESTABLISHED + * state - UDP-style peeled off socket or a TCP-style socket that + * is already connected. + * It cannot be done even on a TCP-style listening socket. + */ + if (sctp_sstate(sk, ESTABLISHED) || + (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING))) { + err = -EISCONN; + goto out_unlock; + } + + err = sctp_verify_addr(sk, (union sctp_addr *)uaddr, addr_len); + if (err) + goto out_unlock; + + if (addr_len > sizeof(to)) + addr_len = sizeof(to); + memcpy(&to, uaddr, addr_len); + to.v4.sin_port = ntohs(to.v4.sin_port); + + asoc = sctp_endpoint_lookup_assoc(ep, &to, &transport); + if (asoc) { + if (asoc->state >= SCTP_STATE_ESTABLISHED) + err = -EISCONN; + else + err = -EALREADY; + goto out_unlock; + } + + /* If we could not find a matching association on the endpoint, + * make sure that there is no peeled-off association matching the + * peer address even on another socket. + */ + if (sctp_endpoint_is_peeled_off(ep, &to)) { + err = -EADDRNOTAVAIL; + goto out_unlock; + } + + /* If a bind() or sctp_bindx() is not called prior to a connect() + * call, the system picks an ephemeral port and will choose an address + * set equivalent to binding with a wildcard address. + */ + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) { + err = -EAGAIN; + goto out_unlock; + } + } + + scope = sctp_scope(&to); + asoc = sctp_association_new(ep, sk, scope, GFP_KERNEL); + if (!asoc) { + err = -ENOMEM; + goto out_unlock; + } + + /* Prime the peer's transport structures. */ + transport = sctp_assoc_add_peer(asoc, &to, GFP_KERNEL); + if (!transport) { + sctp_association_free(asoc); + goto out_unlock; + } + err = sctp_assoc_set_bind_addr_from_ep(asoc, GFP_KERNEL); + if (err < 0) { + sctp_association_free(asoc); + goto out_unlock; + } + + err = sctp_primitive_ASSOCIATE(asoc, NULL); + if (err < 0) { + sctp_association_free(asoc); + goto out_unlock; + } + + /* Initialize sk's dport and daddr for getpeername() */ + inet_sk(sk)->dport = htons(asoc->peer.port); + af = sctp_get_af_specific(to.sa.sa_family); + af->to_sk_daddr(&to, sk); + + timeo = sock_sndtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK); + err = sctp_wait_for_connect(asoc, &timeo); + +out_unlock: + sctp_release_sock(sk); + + return err; +} + +/* FIXME: Write comments. */ +SCTP_STATIC int sctp_disconnect(struct sock *sk, int flags) +{ + return -EOPNOTSUPP; /* STUB */ +} + +/* 4.1.4 accept() - TCP Style Syntax + * + * Applications use accept() call to remove an established SCTP + * association from the accept queue of the endpoint. A new socket + * descriptor will be returned from accept() to represent the newly + * formed association. 
+ */ +SCTP_STATIC struct sock *sctp_accept(struct sock *sk, int flags, int *err) +{ + struct sctp_sock *sp; + struct sctp_endpoint *ep; + struct sock *newsk = NULL; + struct sctp_association *asoc; + long timeo; + int error = 0; + + sctp_lock_sock(sk); + + sp = sctp_sk(sk); + ep = sp->ep; + + if (!sctp_style(sk, TCP)) { + error = -EOPNOTSUPP; + goto out; + } + + if (!sctp_sstate(sk, LISTENING)) { + error = -EINVAL; + goto out; + } + + timeo = sock_rcvtimeo(sk, sk->sk_socket->file->f_flags & O_NONBLOCK); + + error = sctp_wait_for_accept(sk, timeo); + if (error) + goto out; + + /* We treat the list of associations on the endpoint as the accept + * queue and pick the first association on the list. + */ + asoc = list_entry(ep->asocs.next, struct sctp_association, asocs); + + newsk = sp->pf->create_accept_sk(sk, asoc); + if (!newsk) { + error = -ENOMEM; + goto out; + } + + /* Populate the fields of the newsk from the oldsk and migrate the + * asoc to the newsk. + */ + sctp_sock_migrate(sk, newsk, asoc, SCTP_SOCKET_TCP); + +out: + sctp_release_sock(sk); + *err = error; + return newsk; +} + +/* The SCTP ioctl handler. */ +SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) +{ + return -ENOIOCTLCMD; +} + +/* This is the function which gets called during socket creation to + * initialized the SCTP-specific portion of the sock. + * The sock structure should already be zero-filled memory. + */ +SCTP_STATIC int sctp_init_sock(struct sock *sk) +{ + struct sctp_endpoint *ep; + struct sctp_sock *sp; + + SCTP_DEBUG_PRINTK("sctp_init_sock(sk: %p)\n", sk); + + sp = sctp_sk(sk); + + /* Initialize the SCTP per socket area. */ + switch (sk->sk_type) { + case SOCK_SEQPACKET: + sp->type = SCTP_SOCKET_UDP; + break; + case SOCK_STREAM: + sp->type = SCTP_SOCKET_TCP; + break; + default: + return -ESOCKTNOSUPPORT; + } + + /* Initialize default send parameters. These parameters can be + * modified with the SCTP_DEFAULT_SEND_PARAM socket option. + */ + sp->default_stream = 0; + sp->default_ppid = 0; + sp->default_flags = 0; + sp->default_context = 0; + sp->default_timetolive = 0; + + /* Initialize default setup parameters. These parameters + * can be modified with the SCTP_INITMSG socket option or + * overridden by the SCTP_INIT CMSG. + */ + sp->initmsg.sinit_num_ostreams = sctp_max_outstreams; + sp->initmsg.sinit_max_instreams = sctp_max_instreams; + sp->initmsg.sinit_max_attempts = sctp_max_retrans_init; + sp->initmsg.sinit_max_init_timeo = jiffies_to_msecs(sctp_rto_max); + + /* Initialize default RTO related parameters. These parameters can + * be modified for with the SCTP_RTOINFO socket option. + */ + sp->rtoinfo.srto_initial = jiffies_to_msecs(sctp_rto_initial); + sp->rtoinfo.srto_max = jiffies_to_msecs(sctp_rto_max); + sp->rtoinfo.srto_min = jiffies_to_msecs(sctp_rto_min); + + /* Initialize default association related parameters. These parameters + * can be modified with the SCTP_ASSOCINFO socket option. + */ + sp->assocparams.sasoc_asocmaxrxt = sctp_max_retrans_association; + sp->assocparams.sasoc_number_peer_destinations = 0; + sp->assocparams.sasoc_peer_rwnd = 0; + sp->assocparams.sasoc_local_rwnd = 0; + sp->assocparams.sasoc_cookie_life = + jiffies_to_msecs(sctp_valid_cookie_life); + + /* Initialize default event subscriptions. By default, all the + * options are off. + */ + memset(&sp->subscribe, 0, sizeof(struct sctp_event_subscribe)); + + /* Default Peer Address Parameters. 
These defaults can + * be modified via SCTP_PEER_ADDR_PARAMS + */ + sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); + sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; + + /* If enabled no SCTP message fragmentation will be performed. + * Configure through SCTP_DISABLE_FRAGMENTS socket option. + */ + sp->disable_fragments = 0; + + /* Turn on/off any Nagle-like algorithm. */ + sp->nodelay = 1; + + /* Enable by default. */ + sp->v4mapped = 1; + + /* Auto-close idle associations after the configured + * number of seconds. A value of 0 disables this + * feature. Configure through the SCTP_AUTOCLOSE socket option, + * for UDP-style sockets only. + */ + sp->autoclose = 0; + + /* User specified fragmentation limit. */ + sp->user_frag = 0; + + sp->adaption_ind = 0; + + sp->pf = sctp_get_pf_specific(sk->sk_family); + + /* Control variables for partial data delivery. */ + sp->pd_mode = 0; + skb_queue_head_init(&sp->pd_lobby); + + /* Create a per socket endpoint structure. Even if we + * change the data structure relationships, this may still + * be useful for storing pre-connect address information. + */ + ep = sctp_endpoint_new(sk, GFP_KERNEL); + if (!ep) + return -ENOMEM; + + sp->ep = ep; + sp->hmac = NULL; + + SCTP_DBG_OBJCNT_INC(sock); + return 0; +} + +/* Cleanup any SCTP per socket resources. */ +SCTP_STATIC int sctp_destroy_sock(struct sock *sk) +{ + struct sctp_endpoint *ep; + + SCTP_DEBUG_PRINTK("sctp_destroy_sock(sk: %p)\n", sk); + + /* Release our hold on the endpoint. */ + ep = sctp_sk(sk)->ep; + sctp_endpoint_free(ep); + + return 0; +} + +/* API 4.1.7 shutdown() - TCP Style Syntax + * int shutdown(int socket, int how); + * + * sd - the socket descriptor of the association to be closed. + * how - Specifies the type of shutdown. The values are + * as follows: + * SHUT_RD + * Disables further receive operations. No SCTP + * protocol action is taken. + * SHUT_WR + * Disables further send operations, and initiates + * the SCTP shutdown sequence. + * SHUT_RDWR + * Disables further send and receive operations + * and initiates the SCTP shutdown sequence. + */ +SCTP_STATIC void sctp_shutdown(struct sock *sk, int how) +{ + struct sctp_endpoint *ep; + struct sctp_association *asoc; + + if (!sctp_style(sk, TCP)) + return; + + if (how & SEND_SHUTDOWN) { + ep = sctp_sk(sk)->ep; + if (!list_empty(&ep->asocs)) { + asoc = list_entry(ep->asocs.next, + struct sctp_association, asocs); + sctp_primitive_SHUTDOWN(asoc, NULL); + } + } +} + +/* 7.2.1 Association Status (SCTP_STATUS) + + * Applications can retrieve current status information about an + * association, including association state, peer receiver window size, + * number of unacked data chunks, and number of data chunks pending + * receipt. This information is read-only. 
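A hedged sketch of the SCTP_STATUS query documented above: the handler reads the association id out of the buffer passed to getsockopt() and fills the rest of struct sctp_status in place.

        #include <string.h>
        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>

        static int assoc_state(int sd, sctp_assoc_t id)
        {
                struct sctp_status st;
                socklen_t len = sizeof(st);     /* must be exactly sizeof(st) */

                memset(&st, 0, sizeof(st));
                st.sstat_assoc_id = id;

                if (getsockopt(sd, SOL_SCTP, SCTP_STATUS, &st, &len) < 0)
                        return -1;
                return st.sstat_state;          /* rwnd, stream counts, primary also filled */
        }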
+ */ +static int sctp_getsockopt_sctp_status(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_status status; + struct sctp_association *asoc = NULL; + struct sctp_transport *transport; + sctp_assoc_t associd; + int retval = 0; + + if (len != sizeof(status)) { + retval = -EINVAL; + goto out; + } + + if (copy_from_user(&status, optval, sizeof(status))) { + retval = -EFAULT; + goto out; + } + + associd = status.sstat_assoc_id; + asoc = sctp_id2assoc(sk, associd); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + transport = asoc->peer.primary_path; + + status.sstat_assoc_id = sctp_assoc2id(asoc); + status.sstat_state = asoc->state; + status.sstat_rwnd = asoc->peer.rwnd; + status.sstat_unackdata = asoc->unack_data; + + status.sstat_penddata = sctp_tsnmap_pending(&asoc->peer.tsn_map); + status.sstat_instrms = asoc->c.sinit_max_instreams; + status.sstat_outstrms = asoc->c.sinit_num_ostreams; + status.sstat_fragmentation_point = asoc->frag_point; + status.sstat_primary.spinfo_assoc_id = sctp_assoc2id(transport->asoc); + memcpy(&status.sstat_primary.spinfo_address, + &(transport->ipaddr), sizeof(union sctp_addr)); + /* Map ipv4 address into v4-mapped-on-v6 address. */ + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + (union sctp_addr *)&status.sstat_primary.spinfo_address); + status.sstat_primary.spinfo_state = transport->active; + status.sstat_primary.spinfo_cwnd = transport->cwnd; + status.sstat_primary.spinfo_srtt = transport->srtt; + status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); + status.sstat_primary.spinfo_mtu = transport->pmtu; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + SCTP_DEBUG_PRINTK("sctp_getsockopt_sctp_status(%d): %d %d %d\n", + len, status.sstat_state, status.sstat_rwnd, + status.sstat_assoc_id); + + if (copy_to_user(optval, &status, len)) { + retval = -EFAULT; + goto out; + } + +out: + return (retval); +} + + +/* 7.2.2 Peer Address Information (SCTP_GET_PEER_ADDR_INFO) + * + * Applications can retrieve information about a specific peer address + * of an association, including its reachability state, congestion + * window, and retransmission timer values. This information is + * read-only. + */ +static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_paddrinfo pinfo; + struct sctp_transport *transport; + int retval = 0; + + if (len != sizeof(pinfo)) { + retval = -EINVAL; + goto out; + } + + if (copy_from_user(&pinfo, optval, sizeof(pinfo))) { + retval = -EFAULT; + goto out; + } + + transport = sctp_addr_id2transport(sk, &pinfo.spinfo_address, + pinfo.spinfo_assoc_id); + if (!transport) + return -EINVAL; + + pinfo.spinfo_assoc_id = sctp_assoc2id(transport->asoc); + pinfo.spinfo_state = transport->active; + pinfo.spinfo_cwnd = transport->cwnd; + pinfo.spinfo_srtt = transport->srtt; + pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); + pinfo.spinfo_mtu = transport->pmtu; + + if (put_user(len, optlen)) { + retval = -EFAULT; + goto out; + } + + if (copy_to_user(optval, &pinfo, len)) { + retval = -EFAULT; + goto out; + } + +out: + return (retval); +} + +/* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) + * + * This option is a on/off flag. If enabled no SCTP message + * fragmentation will be performed. Instead if a message being sent + * exceeds the current PMTU size, the message will NOT be sent and + * instead a error will be indicated to the user. 
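Similarly, a sketch of the SCTP_GET_PEER_ADDR_INFO query implemented above: the caller supplies the association id and peer address and reads back the per-transport state.

        #include <string.h>
        #include <netinet/in.h>
        #include <netinet/sctp.h>   /* assumed userspace header */
        #include <sys/socket.h>

        static unsigned int path_srtt(int sd, sctp_assoc_t id,
                                      const struct sockaddr_in *peer)
        {
                struct sctp_paddrinfo pi;
                socklen_t len = sizeof(pi);

                memset(&pi, 0, sizeof(pi));
                pi.spinfo_assoc_id = id;
                memcpy(&pi.spinfo_address, peer, sizeof(*peer));

                if (getsockopt(sd, SOL_SCTP, SCTP_GET_PEER_ADDR_INFO,
                               &pi, &len) < 0)
                        return 0;
                return pi.spinfo_srtt;          /* smoothed RTT for this transport */
        }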
+ */ +static int sctp_getsockopt_disable_fragments(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = (sctp_sk(sk)->disable_fragments == 1); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +/* 7.1.15 Set notification and ancillary events (SCTP_EVENTS) + * + * This socket option is used to specify various notifications and + * ancillary data the user wishes to receive. + */ +static int sctp_getsockopt_events(struct sock *sk, int len, char __user *optval, + int __user *optlen) +{ + if (len != sizeof(struct sctp_event_subscribe)) + return -EINVAL; + if (copy_to_user(optval, &sctp_sk(sk)->subscribe, len)) + return -EFAULT; + return 0; +} + +/* 7.1.8 Automatic Close of associations (SCTP_AUTOCLOSE) + * + * This socket option is applicable to the UDP-style socket only. When + * set it will cause associations that are idle for more than the + * specified number of seconds to automatically close. An association + * being idle is defined an association that has NOT sent or received + * user data. The special value of '0' indicates that no automatic + * close of any associations should be performed. The option expects an + * integer defining the number of seconds of idle time before an + * association is closed. + */ +static int sctp_getsockopt_autoclose(struct sock *sk, int len, char __user *optval, int __user *optlen) +{ + /* Applicable to UDP-style socket only */ + if (sctp_style(sk, TCP)) + return -EOPNOTSUPP; + if (len != sizeof(int)) + return -EINVAL; + if (copy_to_user(optval, &sctp_sk(sk)->autoclose, len)) + return -EFAULT; + return 0; +} + +/* Helper routine to branch off an association to a new socket. */ +SCTP_STATIC int sctp_do_peeloff(struct sctp_association *asoc, + struct socket **sockp) +{ + struct sock *sk = asoc->base.sk; + struct socket *sock; + int err = 0; + + /* An association cannot be branched off from an already peeled-off + * socket, nor is this supported for tcp style sockets. + */ + if (!sctp_style(sk, UDP)) + return -EINVAL; + + /* Create a new socket. */ + err = sock_create(sk->sk_family, SOCK_SEQPACKET, IPPROTO_SCTP, &sock); + if (err < 0) + return err; + + /* Populate the fields of the newsk from the oldsk and migrate the + * asoc to the newsk. + */ + sctp_sock_migrate(sk, sock->sk, asoc, SCTP_SOCKET_UDP_HIGH_BANDWIDTH); + *sockp = sock; + + return err; +} + +static int sctp_getsockopt_peeloff(struct sock *sk, int len, char __user *optval, int __user *optlen) +{ + sctp_peeloff_arg_t peeloff; + struct socket *newsock; + int retval = 0; + struct sctp_association *asoc; + + if (len != sizeof(sctp_peeloff_arg_t)) + return -EINVAL; + if (copy_from_user(&peeloff, optval, len)) + return -EFAULT; + + asoc = sctp_id2assoc(sk, peeloff.associd); + if (!asoc) { + retval = -EINVAL; + goto out; + } + + SCTP_DEBUG_PRINTK("%s: sk: %p asoc: %p\n", __FUNCTION__, sk, asoc); + + retval = sctp_do_peeloff(asoc, &newsock); + if (retval < 0) + goto out; + + /* Map the socket to an unused fd that can be returned to the user. */ + retval = sock_map_fd(newsock); + if (retval < 0) { + sock_release(newsock); + goto out; + } + + SCTP_DEBUG_PRINTK("%s: sk: %p asoc: %p newsk: %p sd: %d\n", + __FUNCTION__, sk, asoc, newsock->sk, retval); + + /* Return the fd mapped to the new socket. 
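+ * The new descriptor is handed back to the caller in the 'sd' member
+ * of the sctp_peeloff_arg_t copied out to optval below, while the
+ * peeled-off association now lives on newsock.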
*/ + peeloff.sd = retval; + if (copy_to_user(optval, &peeloff, len)) + retval = -EFAULT; + +out: + return retval; +} + +/* 7.1.13 Peer Address Parameters (SCTP_PEER_ADDR_PARAMS) + * + * Applications can enable or disable heartbeats for any peer address of + * an association, modify an address's heartbeat interval, force a + * heartbeat to be sent immediately, and adjust the address's maximum + * number of retransmissions sent before an address is considered + * unreachable. The following structure is used to access and modify an + * address's parameters: + * + * struct sctp_paddrparams { + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * }; + * + * spp_assoc_id - (UDP style socket) This is filled in the application, + * and identifies the association for this query. + * spp_address - This specifies which address is of interest. + * spp_hbinterval - This contains the value of the heartbeat interval, + * in milliseconds. A value of 0, when modifying the + * parameter, specifies that the heartbeat on this + * address should be disabled. A value of UINT32_MAX + * (4294967295), when modifying the parameter, + * specifies that a heartbeat should be sent + * immediately to the peer address, and the current + * interval should remain unchanged. + * spp_pathmaxrxt - This contains the maximum number of + * retransmissions before this address shall be + * considered unreachable. + */ +static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_paddrparams params; + struct sctp_transport *trans; + + if (len != sizeof(struct sctp_paddrparams)) + return -EINVAL; + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* If no association id is specified retrieve the default value + * for the endpoint that will be used for all future associations + */ + if (!params.spp_assoc_id && + sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; + params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; + + goto done; + } + + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) + return -EINVAL; + + /* The value of the heartbeat interval, in milliseconds. A value of 0, + * when modifying the parameter, specifies that the heartbeat on this + * address should be disabled. + */ + if (!trans->hb_allowed) + params.spp_hbinterval = 0; + else + params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); + + /* spp_pathmaxrxt contains the maximum number of retransmissions + * before this address shall be considered unreachable. + */ + params.spp_pathmaxrxt = trans->max_retrans; + +done: + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + +/* 7.1.3 Initialization Parameters (SCTP_INITMSG) + * + * Applications can specify protocol parameters for the default association + * initialization. The option name argument to setsockopt() and getsockopt() + * is SCTP_INITMSG. + * + * Setting initialization parameters is effective only on an unconnected + * socket (for UDP-style sockets only future associations are effected + * by the change). With TCP-style sockets, this option is inherited by + * sockets derived from a listener socket. 
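+ *
+ * A minimal query sketch, assuming an open SCTP descriptor 'sd'
+ * (hypothetical name):
+ *
+ *	struct sctp_initmsg im;
+ *	socklen_t len = sizeof(im);
+ *	getsockopt(sd, SOL_SCTP, SCTP_INITMSG, &im, &len);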
+ */ +static int sctp_getsockopt_initmsg(struct sock *sk, int len, char __user *optval, int __user *optlen) +{ + if (len != sizeof(struct sctp_initmsg)) + return -EINVAL; + if (copy_to_user(optval, &sctp_sk(sk)->initmsg, len)) + return -EFAULT; + return 0; +} + +static int sctp_getsockopt_peer_addrs_num(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + sctp_assoc_t id; + struct sctp_association *asoc; + struct list_head *pos; + int cnt = 0; + + if (len != sizeof(sctp_assoc_t)) + return -EINVAL; + + if (copy_from_user(&id, optval, sizeof(sctp_assoc_t))) + return -EFAULT; + + /* For UDP-style sockets, id specifies the association to query. */ + asoc = sctp_id2assoc(sk, id); + if (!asoc) + return -EINVAL; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + cnt ++; + } + + return cnt; +} + +static int sctp_getsockopt_peer_addrs(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_association *asoc; + struct list_head *pos; + int cnt = 0; + struct sctp_getaddrs getaddrs; + struct sctp_transport *from; + void __user *to; + union sctp_addr temp; + struct sctp_sock *sp = sctp_sk(sk); + int addrlen; + + if (len != sizeof(struct sctp_getaddrs)) + return -EINVAL; + + if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) + return -EFAULT; + + if (getaddrs.addr_num <= 0) return -EINVAL; + + /* For UDP-style sockets, id specifies the association to query. */ + asoc = sctp_id2assoc(sk, getaddrs.assoc_id); + if (!asoc) + return -EINVAL; + + to = (void __user *)getaddrs.addrs; + list_for_each(pos, &asoc->peer.transport_addr_list) { + from = list_entry(pos, struct sctp_transport, transports); + memcpy(&temp, &from->ipaddr, sizeof(temp)); + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); + addrlen = sctp_get_af_specific(sk->sk_family)->sockaddr_len; + temp.v4.sin_port = htons(temp.v4.sin_port); + if (copy_to_user(to, &temp, addrlen)) + return -EFAULT; + to += addrlen ; + cnt ++; + if (cnt >= getaddrs.addr_num) break; + } + getaddrs.addr_num = cnt; + if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs))) + return -EFAULT; + + return 0; +} + +static int sctp_getsockopt_local_addrs_num(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + sctp_assoc_t id; + struct sctp_bind_addr *bp; + struct sctp_association *asoc; + struct list_head *pos; + struct sctp_sockaddr_entry *addr; + rwlock_t *addr_lock; + unsigned long flags; + int cnt = 0; + + if (len != sizeof(sctp_assoc_t)) + return -EINVAL; + + if (copy_from_user(&id, optval, sizeof(sctp_assoc_t))) + return -EFAULT; + + /* + * For UDP-style sockets, id specifies the association to query. + * If the id field is set to the value '0' then the locally bound + * addresses are returned without regard to any particular + * association. + */ + if (0 == id) { + bp = &sctp_sk(sk)->ep->base.bind_addr; + addr_lock = &sctp_sk(sk)->ep->base.addr_lock; + } else { + asoc = sctp_id2assoc(sk, id); + if (!asoc) + return -EINVAL; + bp = &asoc->base.bind_addr; + addr_lock = &asoc->base.addr_lock; + } + + sctp_read_lock(addr_lock); + + /* If the endpoint is bound to 0.0.0.0 or ::0, count the valid + * addresses from the global local address list. 
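+ * A wildcard bind means the endpoint effectively owns every entry on
+ * the global local address list, so those entries are counted
+ * instead, skipping IPv6 entries when the socket itself is PF_INET.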
+ */ + if (sctp_list_single_entry(&bp->address_list)) { + addr = list_entry(bp->address_list.next, + struct sctp_sockaddr_entry, list); + if (sctp_is_any(&addr->a)) { + sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags); + list_for_each(pos, &sctp_local_addr_list) { + addr = list_entry(pos, + struct sctp_sockaddr_entry, + list); + if ((PF_INET == sk->sk_family) && + (AF_INET6 == addr->a.sa.sa_family)) + continue; + cnt++; + } + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, + flags); + } else { + cnt = 1; + } + goto done; + } + + list_for_each(pos, &bp->address_list) { + cnt ++; + } + +done: + sctp_read_unlock(addr_lock); + return cnt; +} + +/* Helper function that copies local addresses to user and returns the number + * of addresses copied. + */ +static int sctp_copy_laddrs_to_user(struct sock *sk, __u16 port, int max_addrs, + void __user *to) +{ + struct list_head *pos; + struct sctp_sockaddr_entry *addr; + unsigned long flags; + union sctp_addr temp; + int cnt = 0; + int addrlen; + + sctp_spin_lock_irqsave(&sctp_local_addr_lock, flags); + list_for_each(pos, &sctp_local_addr_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + if ((PF_INET == sk->sk_family) && + (AF_INET6 == addr->a.sa.sa_family)) + continue; + memcpy(&temp, &addr->a, sizeof(temp)); + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sctp_sk(sk), + &temp); + addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + temp.v4.sin_port = htons(port); + if (copy_to_user(to, &temp, addrlen)) { + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, + flags); + return -EFAULT; + } + to += addrlen; + cnt ++; + if (cnt >= max_addrs) break; + } + sctp_spin_unlock_irqrestore(&sctp_local_addr_lock, flags); + + return cnt; +} + +static int sctp_getsockopt_local_addrs(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_bind_addr *bp; + struct sctp_association *asoc; + struct list_head *pos; + int cnt = 0; + struct sctp_getaddrs getaddrs; + struct sctp_sockaddr_entry *addr; + void __user *to; + union sctp_addr temp; + struct sctp_sock *sp = sctp_sk(sk); + int addrlen; + rwlock_t *addr_lock; + int err = 0; + + if (len != sizeof(struct sctp_getaddrs)) + return -EINVAL; + + if (copy_from_user(&getaddrs, optval, sizeof(struct sctp_getaddrs))) + return -EFAULT; + + if (getaddrs.addr_num <= 0) return -EINVAL; + /* + * For UDP-style sockets, id specifies the association to query. + * If the id field is set to the value '0' then the locally bound + * addresses are returned without regard to any particular + * association. + */ + if (0 == getaddrs.assoc_id) { + bp = &sctp_sk(sk)->ep->base.bind_addr; + addr_lock = &sctp_sk(sk)->ep->base.addr_lock; + } else { + asoc = sctp_id2assoc(sk, getaddrs.assoc_id); + if (!asoc) + return -EINVAL; + bp = &asoc->base.bind_addr; + addr_lock = &asoc->base.addr_lock; + } + + to = getaddrs.addrs; + + sctp_read_lock(addr_lock); + + /* If the endpoint is bound to 0.0.0.0 or ::0, get the valid + * addresses from the global local address list. 
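+ * This mirrors the counting logic above: for a wildcard bind, the
+ * per-family filtered global list is copied out to the user via
+ * sctp_copy_laddrs_to_user().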
+ */ + if (sctp_list_single_entry(&bp->address_list)) { + addr = list_entry(bp->address_list.next, + struct sctp_sockaddr_entry, list); + if (sctp_is_any(&addr->a)) { + cnt = sctp_copy_laddrs_to_user(sk, bp->port, + getaddrs.addr_num, to); + if (cnt < 0) { + err = cnt; + goto unlock; + } + goto copy_getaddrs; + } + } + + list_for_each(pos, &bp->address_list) { + addr = list_entry(pos, struct sctp_sockaddr_entry, list); + memcpy(&temp, &addr->a, sizeof(temp)); + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, &temp); + addrlen = sctp_get_af_specific(temp.sa.sa_family)->sockaddr_len; + temp.v4.sin_port = htons(temp.v4.sin_port); + if (copy_to_user(to, &temp, addrlen)) { + err = -EFAULT; + goto unlock; + } + to += addrlen; + cnt ++; + if (cnt >= getaddrs.addr_num) break; + } + +copy_getaddrs: + getaddrs.addr_num = cnt; + if (copy_to_user(optval, &getaddrs, sizeof(struct sctp_getaddrs))) + err = -EFAULT; + +unlock: + sctp_read_unlock(addr_lock); + return err; +} + +/* 7.1.10 Set Primary Address (SCTP_PRIMARY_ADDR) + * + * Requests that the local SCTP stack use the enclosed peer address as + * the association primary. The enclosed address must be one of the + * association peer's addresses. + */ +static int sctp_getsockopt_primary_addr(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + struct sctp_prim prim; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_prim)) + return -EINVAL; + + if (copy_from_user(&prim, optval, sizeof(struct sctp_prim))) + return -EFAULT; + + asoc = sctp_id2assoc(sk, prim.ssp_assoc_id); + if (!asoc) + return -EINVAL; + + if (!asoc->peer.primary_path) + return -ENOTCONN; + + asoc->peer.primary_path->ipaddr.v4.sin_port = + htons(asoc->peer.primary_path->ipaddr.v4.sin_port); + memcpy(&prim.ssp_addr, &asoc->peer.primary_path->ipaddr, + sizeof(union sctp_addr)); + asoc->peer.primary_path->ipaddr.v4.sin_port = + ntohs(asoc->peer.primary_path->ipaddr.v4.sin_port); + + sctp_get_pf_specific(sk->sk_family)->addr_v4map(sp, + (union sctp_addr *)&prim.ssp_addr); + + if (copy_to_user(optval, &prim, sizeof(struct sctp_prim))) + return -EFAULT; + + return 0; +} + +/* + * 7.1.11 Set Adaption Layer Indicator (SCTP_ADAPTION_LAYER) + * + * Requests that the local endpoint set the specified Adaption Layer + * Indication parameter for all future INIT and INIT-ACK exchanges. + */ +static int sctp_getsockopt_adaption_layer(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + __u32 val; + + if (len < sizeof(__u32)) + return -EINVAL; + + len = sizeof(__u32); + val = sctp_sk(sk)->adaption_ind; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +/* + * + * 7.1.14 Set default send parameters (SCTP_DEFAULT_SEND_PARAM) + * + * Applications that wish to use the sendto() system call may wish to + * specify a default set of parameters that would normally be supplied + * through the inclusion of ancillary data. This socket option allows + * such an application to set the default sctp_sndrcvinfo structure. + + + * The application that wishes to use this socket option simply passes + * in to this call the sctp_sndrcvinfo structure defined in Section + * 5.2.2) The input parameters accepted by this call include + * sinfo_stream, sinfo_flags, sinfo_ppid, sinfo_context, + * sinfo_timetolive. The user must provide the sinfo_assoc_id field in + * to this call if the caller is using the UDP model. 
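+ *
+ * A minimal getsockopt() sketch, assuming an open SCTP descriptor
+ * 'sd' and association id 'aid' (hypothetical names):
+ *
+ *	struct sctp_sndrcvinfo info = { .sinfo_assoc_id = aid };
+ *	socklen_t len = sizeof(info);
+ *	getsockopt(sd, SOL_SCTP, SCTP_DEFAULT_SEND_PARAM, &info, &len);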
+ * + * For getsockopt, it get the default sctp_sndrcvinfo structure. + */ +static int sctp_getsockopt_default_send_param(struct sock *sk, + int len, char __user *optval, + int __user *optlen) +{ + struct sctp_sndrcvinfo info; + struct sctp_association *asoc; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_sndrcvinfo)) + return -EINVAL; + if (copy_from_user(&info, optval, sizeof(struct sctp_sndrcvinfo))) + return -EFAULT; + + asoc = sctp_id2assoc(sk, info.sinfo_assoc_id); + if (!asoc && info.sinfo_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + info.sinfo_stream = asoc->default_stream; + info.sinfo_flags = asoc->default_flags; + info.sinfo_ppid = asoc->default_ppid; + info.sinfo_context = asoc->default_context; + info.sinfo_timetolive = asoc->default_timetolive; + } else { + info.sinfo_stream = sp->default_stream; + info.sinfo_flags = sp->default_flags; + info.sinfo_ppid = sp->default_ppid; + info.sinfo_context = sp->default_context; + info.sinfo_timetolive = sp->default_timetolive; + } + + if (copy_to_user(optval, &info, sizeof(struct sctp_sndrcvinfo))) + return -EFAULT; + + return 0; +} + +/* + * + * 7.1.5 SCTP_NODELAY + * + * Turn on/off any Nagle-like algorithm. This means that packets are + * generally sent as soon as possible and no unnecessary delays are + * introduced, at the cost of more packets in the network. Expects an + * integer boolean flag. + */ + +static int sctp_getsockopt_nodelay(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = (sctp_sk(sk)->nodelay == 1); + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + return 0; +} + +/* + * + * 7.1.1 SCTP_RTOINFO + * + * The protocol parameters used to initialize and bound retransmission + * timeout (RTO) are tunable. sctp_rtoinfo structure is used to access + * and modify these parameters. + * All parameters are time values, in milliseconds. A value of 0, when + * modifying the parameters, indicates that the current value should not + * be changed. + * + */ +static int sctp_getsockopt_rtoinfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) { + struct sctp_rtoinfo rtoinfo; + struct sctp_association *asoc; + + if (len != sizeof (struct sctp_rtoinfo)) + return -EINVAL; + + if (copy_from_user(&rtoinfo, optval, sizeof (struct sctp_rtoinfo))) + return -EFAULT; + + asoc = sctp_id2assoc(sk, rtoinfo.srto_assoc_id); + + if (!asoc && rtoinfo.srto_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Values corresponding to the specific association. */ + if (asoc) { + rtoinfo.srto_initial = jiffies_to_msecs(asoc->rto_initial); + rtoinfo.srto_max = jiffies_to_msecs(asoc->rto_max); + rtoinfo.srto_min = jiffies_to_msecs(asoc->rto_min); + } else { + /* Values corresponding to the endpoint. */ + struct sctp_sock *sp = sctp_sk(sk); + + rtoinfo.srto_initial = sp->rtoinfo.srto_initial; + rtoinfo.srto_max = sp->rtoinfo.srto_max; + rtoinfo.srto_min = sp->rtoinfo.srto_min; + } + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &rtoinfo, len)) + return -EFAULT; + + return 0; +} + +/* + * + * 7.1.2 SCTP_ASSOCINFO + * + * This option is used to tune the the maximum retransmission attempts + * of the association. + * Returns an error if the new association retransmission value is + * greater than the sum of the retransmission value of the peer. + * See [SCTP] for more information. 
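+ *
+ * A minimal query sketch, assuming an open SCTP descriptor 'sd' and
+ * association id 'aid' (hypothetical names):
+ *
+ *	struct sctp_assocparams ap = { .sasoc_assoc_id = aid };
+ *	socklen_t len = sizeof(ap);
+ *	getsockopt(sd, SOL_SCTP, SCTP_ASSOCINFO, &ap, &len);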
+ * + */ +static int sctp_getsockopt_associnfo(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + + struct sctp_assocparams assocparams; + struct sctp_association *asoc; + struct list_head *pos; + int cnt = 0; + + if (len != sizeof (struct sctp_assocparams)) + return -EINVAL; + + if (copy_from_user(&assocparams, optval, + sizeof (struct sctp_assocparams))) + return -EFAULT; + + asoc = sctp_id2assoc(sk, assocparams.sasoc_assoc_id); + + if (!asoc && assocparams.sasoc_assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + /* Values correspoinding to the specific association */ + if (assocparams.sasoc_assoc_id != 0) { + assocparams.sasoc_asocmaxrxt = asoc->max_retrans; + assocparams.sasoc_peer_rwnd = asoc->peer.rwnd; + assocparams.sasoc_local_rwnd = asoc->a_rwnd; + assocparams.sasoc_cookie_life = (asoc->cookie_life.tv_sec + * 1000) + + (asoc->cookie_life.tv_usec + / 1000); + + list_for_each(pos, &asoc->peer.transport_addr_list) { + cnt ++; + } + + assocparams.sasoc_number_peer_destinations = cnt; + } else { + /* Values corresponding to the endpoint */ + struct sctp_sock *sp = sctp_sk(sk); + + assocparams.sasoc_asocmaxrxt = sp->assocparams.sasoc_asocmaxrxt; + assocparams.sasoc_peer_rwnd = sp->assocparams.sasoc_peer_rwnd; + assocparams.sasoc_local_rwnd = sp->assocparams.sasoc_local_rwnd; + assocparams.sasoc_cookie_life = + sp->assocparams.sasoc_cookie_life; + assocparams.sasoc_number_peer_destinations = + sp->assocparams. + sasoc_number_peer_destinations; + } + + if (put_user(len, optlen)) + return -EFAULT; + + if (copy_to_user(optval, &assocparams, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.16 Set/clear IPv4 mapped addresses (SCTP_I_WANT_MAPPED_V4_ADDR) + * + * This socket option is a boolean flag which turns on or off mapped V4 + * addresses. If this option is turned on and the socket is type + * PF_INET6, then IPv4 addresses will be mapped to V6 representation. + * If this option is turned off, then no mapping will be done of V4 + * addresses and a user will receive both PF_INET6 and PF_INET type + * addresses on the socket. + */ +static int sctp_getsockopt_mappedv4(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + struct sctp_sock *sp = sctp_sk(sk); + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + val = sp->v4mapped; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +/* + * 7.1.17 Set the maximum fragrmentation size (SCTP_MAXSEG) + * + * This socket option specifies the maximum size to put in any outgoing + * SCTP chunk. If a message is larger than this size it will be + * fragmented by SCTP into the specified size. Note that the underlying + * SCTP implementation may fragment into smaller sized chunks when the + * PMTU of the underlying association is smaller than the value set by + * the user. + */ +static int sctp_getsockopt_maxseg(struct sock *sk, int len, + char __user *optval, int __user *optlen) +{ + int val; + + if (len < sizeof(int)) + return -EINVAL; + + len = sizeof(int); + + val = sctp_sk(sk)->user_frag; + if (put_user(len, optlen)) + return -EFAULT; + if (copy_to_user(optval, &val, len)) + return -EFAULT; + + return 0; +} + +SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen) +{ + int retval = 0; + int len; + + SCTP_DEBUG_PRINTK("sctp_getsockopt(sk: %p, ...)\n", sk); + + /* I can hardly begin to describe how wrong this is. 
This is + * so broken as to be worse than useless. The API draft + * REALLY is NOT helpful here... I am not convinced that the + * semantics of getsockopt() with a level OTHER THAN SOL_SCTP + * are at all well-founded. + */ + if (level != SOL_SCTP) { + struct sctp_af *af = sctp_sk(sk)->pf->af; + + retval = af->getsockopt(sk, level, optname, optval, optlen); + return retval; + } + + if (get_user(len, optlen)) + return -EFAULT; + + sctp_lock_sock(sk); + + switch (optname) { + case SCTP_STATUS: + retval = sctp_getsockopt_sctp_status(sk, len, optval, optlen); + break; + case SCTP_DISABLE_FRAGMENTS: + retval = sctp_getsockopt_disable_fragments(sk, len, optval, + optlen); + break; + case SCTP_EVENTS: + retval = sctp_getsockopt_events(sk, len, optval, optlen); + break; + case SCTP_AUTOCLOSE: + retval = sctp_getsockopt_autoclose(sk, len, optval, optlen); + break; + case SCTP_SOCKOPT_PEELOFF: + retval = sctp_getsockopt_peeloff(sk, len, optval, optlen); + break; + case SCTP_PEER_ADDR_PARAMS: + retval = sctp_getsockopt_peer_addr_params(sk, len, optval, + optlen); + break; + case SCTP_INITMSG: + retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); + break; + case SCTP_GET_PEER_ADDRS_NUM: + retval = sctp_getsockopt_peer_addrs_num(sk, len, optval, + optlen); + break; + case SCTP_GET_LOCAL_ADDRS_NUM: + retval = sctp_getsockopt_local_addrs_num(sk, len, optval, + optlen); + break; + case SCTP_GET_PEER_ADDRS: + retval = sctp_getsockopt_peer_addrs(sk, len, optval, + optlen); + break; + case SCTP_GET_LOCAL_ADDRS: + retval = sctp_getsockopt_local_addrs(sk, len, optval, + optlen); + break; + case SCTP_DEFAULT_SEND_PARAM: + retval = sctp_getsockopt_default_send_param(sk, len, + optval, optlen); + break; + case SCTP_PRIMARY_ADDR: + retval = sctp_getsockopt_primary_addr(sk, len, optval, optlen); + break; + case SCTP_NODELAY: + retval = sctp_getsockopt_nodelay(sk, len, optval, optlen); + break; + case SCTP_RTOINFO: + retval = sctp_getsockopt_rtoinfo(sk, len, optval, optlen); + break; + case SCTP_ASSOCINFO: + retval = sctp_getsockopt_associnfo(sk, len, optval, optlen); + break; + case SCTP_I_WANT_MAPPED_V4_ADDR: + retval = sctp_getsockopt_mappedv4(sk, len, optval, optlen); + break; + case SCTP_MAXSEG: + retval = sctp_getsockopt_maxseg(sk, len, optval, optlen); + break; + case SCTP_GET_PEER_ADDR_INFO: + retval = sctp_getsockopt_peer_addr_info(sk, len, optval, + optlen); + break; + case SCTP_ADAPTION_LAYER: + retval = sctp_getsockopt_adaption_layer(sk, len, optval, + optlen); + break; + default: + retval = -ENOPROTOOPT; + break; + }; + + sctp_release_sock(sk); + return retval; +} + +static void sctp_hash(struct sock *sk) +{ + /* STUB */ +} + +static void sctp_unhash(struct sock *sk) +{ + /* STUB */ +} + +/* Check if port is acceptable. Possibly find first available port. + * + * The port hash table (contained in the 'global' SCTP protocol storage + * returned by struct sctp_protocol *sctp_get_protocol()). The hash + * table is an array of 4096 lists (sctp_bind_hashbucket). Each + * list (the list number is the port number hashed out, so as you + * would expect from a hash function, all the ports in a given list have + * such a number that hashes out to the same list number; you were + * expecting that, right?); so each list has a set of ports, with a + * link to the socket (struct sock) that uses it, the port number and + * a fastreuse flag (FIXME: NPI ipg). 
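+ *
+ * In short: sctp_phashfn(port) selects one of the buckets, the
+ * bucket's chain is searched for a sctp_bind_bucket whose ->port
+ * matches, and the sockets sharing that port hang off the bucket's
+ * ->owner list.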
+ */ +static struct sctp_bind_bucket *sctp_bucket_create( + struct sctp_bind_hashbucket *head, unsigned short snum); + +static long sctp_get_port_local(struct sock *sk, union sctp_addr *addr) +{ + struct sctp_bind_hashbucket *head; /* hash list */ + struct sctp_bind_bucket *pp; /* hash list port iterator */ + unsigned short snum; + int ret; + + /* NOTE: Remember to put this back to net order. */ + addr->v4.sin_port = ntohs(addr->v4.sin_port); + snum = addr->v4.sin_port; + + SCTP_DEBUG_PRINTK("sctp_get_port() begins, snum=%d\n", snum); + sctp_local_bh_disable(); + + if (snum == 0) { + /* Search for an available port. + * + * 'sctp_port_rover' was the last port assigned, so + * we start to search from 'sctp_port_rover + + * 1'. What we do is first check if port 'rover' is + * already in the hash table; if not, we use that; if + * it is, we try next. + */ + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int remaining = (high - low) + 1; + int rover; + int index; + + sctp_spin_lock(&sctp_port_alloc_lock); + rover = sctp_port_rover; + do { + rover++; + if ((rover < low) || (rover > high)) + rover = low; + index = sctp_phashfn(rover); + head = &sctp_port_hashtable[index]; + sctp_spin_lock(&head->lock); + for (pp = head->chain; pp; pp = pp->next) + if (pp->port == rover) + goto next; + break; + next: + sctp_spin_unlock(&head->lock); + } while (--remaining > 0); + sctp_port_rover = rover; + sctp_spin_unlock(&sctp_port_alloc_lock); + + /* Exhausted local port range during search? */ + ret = 1; + if (remaining <= 0) + goto fail; + + /* OK, here is the one we will use. HEAD (the port + * hash table list entry) is non-NULL and we hold it's + * mutex. + */ + snum = rover; + } else { + /* We are given an specific port number; we verify + * that it is not being used. If it is used, we will + * exahust the search in the hash list corresponding + * to the port number (snum) - we detect that with the + * port iterator, pp being NULL. + */ + head = &sctp_port_hashtable[sctp_phashfn(snum)]; + sctp_spin_lock(&head->lock); + for (pp = head->chain; pp; pp = pp->next) { + if (pp->port == snum) + goto pp_found; + } + } + pp = NULL; + goto pp_not_found; +pp_found: + if (!hlist_empty(&pp->owner)) { + /* We had a port hash table hit - there is an + * available port (pp != NULL) and it is being + * used by other socket (pp->owner not empty); that other + * socket is going to be sk2. + */ + int reuse = sk->sk_reuse; + struct sock *sk2; + struct hlist_node *node; + + SCTP_DEBUG_PRINTK("sctp_get_port() found a possible match\n"); + if (pp->fastreuse && sk->sk_reuse) + goto success; + + /* Run through the list of sockets bound to the port + * (pp->port) [via the pointers bind_next and + * bind_pprev in the struct sock *sk2 (pp->sk)]. On each one, + * we get the endpoint they describe and run through + * the endpoint's list of IP (v4 or v6) addresses, + * comparing each of the addresses with the address of + * the socket sk. If we find a match, then that means + * that this port/socket (sk) combination are already + * in an endpoint. + */ + sk_for_each_bound(sk2, node, &pp->owner) { + struct sctp_endpoint *ep2; + ep2 = sctp_sk(sk2)->ep; + + if (reuse && sk2->sk_reuse) + continue; + + if (sctp_bind_addr_match(&ep2->base.bind_addr, addr, + sctp_sk(sk))) { + ret = (long)sk2; + goto fail_unlock; + } + } + SCTP_DEBUG_PRINTK("sctp_get_port(): Found a match\n"); + } +pp_not_found: + /* If there was a hash table miss, create a new port. 
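+ * That is, allocate a sctp_bind_bucket for snum; its fastreuse flag
+ * is then derived from SO_REUSEADDR on this socket below.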
*/ + ret = 1; + if (!pp && !(pp = sctp_bucket_create(head, snum))) + goto fail_unlock; + + /* In either case (hit or miss), make sure fastreuse is 1 only + * if sk->sk_reuse is too (that is, if the caller requested + * SO_REUSEADDR on this socket -sk-). + */ + if (hlist_empty(&pp->owner)) + pp->fastreuse = sk->sk_reuse ? 1 : 0; + else if (pp->fastreuse && !sk->sk_reuse) + pp->fastreuse = 0; + + /* We are set, so fill up all the data in the hash table + * entry, tie the socket list information with the rest of the + * sockets FIXME: Blurry, NPI (ipg). + */ +success: + inet_sk(sk)->num = snum; + if (!sctp_sk(sk)->bind_hash) { + sk_add_bind_node(sk, &pp->owner); + sctp_sk(sk)->bind_hash = pp; + } + ret = 0; + +fail_unlock: + sctp_spin_unlock(&head->lock); + +fail: + sctp_local_bh_enable(); + addr->v4.sin_port = htons(addr->v4.sin_port); + return ret; +} + +/* Assign a 'snum' port to the socket. If snum == 0, an ephemeral + * port is requested. + */ +static int sctp_get_port(struct sock *sk, unsigned short snum) +{ + long ret; + union sctp_addr addr; + struct sctp_af *af = sctp_sk(sk)->pf->af; + + /* Set up a dummy address struct from the sk. */ + af->from_sk(&addr, sk); + addr.v4.sin_port = htons(snum); + + /* Note: sk->sk_num gets filled in if ephemeral port request. */ + ret = sctp_get_port_local(sk, &addr); + + return (ret ? 1 : 0); +} + +/* + * 3.1.3 listen() - UDP Style Syntax + * + * By default, new associations are not accepted for UDP style sockets. + * An application uses listen() to mark a socket as being able to + * accept new associations. + */ +SCTP_STATIC int sctp_seqpacket_listen(struct sock *sk, int backlog) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + + /* Only UDP style sockets that are not peeled off are allowed to + * listen(). + */ + if (!sctp_style(sk, UDP)) + return -EINVAL; + + /* If backlog is zero, disable listening. */ + if (!backlog) { + if (sctp_sstate(sk, CLOSED)) + return 0; + + sctp_unhash_endpoint(ep); + sk->sk_state = SCTP_SS_CLOSED; + } + + /* Return if we are already listening. */ + if (sctp_sstate(sk, LISTENING)) + return 0; + + /* + * If a bind() or sctp_bindx() is not called prior to a listen() + * call that allows new associations to be accepted, the system + * picks an ephemeral port and will choose an address set equivalent + * to binding with a wildcard address. + * + * This is not currently spelled out in the SCTP sockets + * extensions draft, but follows the practice as seen in TCP + * sockets. + */ + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) + return -EAGAIN; + } + sk->sk_state = SCTP_SS_LISTENING; + sctp_hash_endpoint(ep); + return 0; +} + +/* + * 4.1.3 listen() - TCP Style Syntax + * + * Applications uses listen() to ready the SCTP endpoint for accepting + * inbound associations. + */ +SCTP_STATIC int sctp_stream_listen(struct sock *sk, int backlog) +{ + struct sctp_sock *sp = sctp_sk(sk); + struct sctp_endpoint *ep = sp->ep; + + /* If backlog is zero, disable listening. */ + if (!backlog) { + if (sctp_sstate(sk, CLOSED)) + return 0; + + sctp_unhash_endpoint(ep); + sk->sk_state = SCTP_SS_CLOSED; + } + + if (sctp_sstate(sk, LISTENING)) + return 0; + + /* + * If a bind() or sctp_bindx() is not called prior to a listen() + * call that allows new associations to be accepted, the system + * picks an ephemeral port and will choose an address set equivalent + * to binding with a wildcard address. 
+ * + * This is not currently spelled out in the SCTP sockets + * extensions draft, but follows the practice as seen in TCP + * sockets. + */ + if (!ep->base.bind_addr.port) { + if (sctp_autobind(sk)) + return -EAGAIN; + } + sk->sk_state = SCTP_SS_LISTENING; + sk->sk_max_ack_backlog = backlog; + sctp_hash_endpoint(ep); + return 0; +} + +/* + * Move a socket to LISTENING state. + */ +int sctp_inet_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + struct crypto_tfm *tfm=NULL; + int err = -EINVAL; + + if (unlikely(backlog < 0)) + goto out; + + sctp_lock_sock(sk); + + if (sock->state != SS_UNCONNECTED) + goto out; + + /* Allocate HMAC for generating cookie. */ + if (sctp_hmac_alg) { + tfm = sctp_crypto_alloc_tfm(sctp_hmac_alg, 0); + if (!tfm) { + err = -ENOSYS; + goto out; + } + } + + switch (sock->type) { + case SOCK_SEQPACKET: + err = sctp_seqpacket_listen(sk, backlog); + break; + case SOCK_STREAM: + err = sctp_stream_listen(sk, backlog); + break; + default: + break; + }; + if (err) + goto cleanup; + + /* Store away the transform reference. */ + sctp_sk(sk)->hmac = tfm; +out: + sctp_release_sock(sk); + return err; +cleanup: + if (tfm) + sctp_crypto_free_tfm(tfm); + goto out; +} + +/* + * This function is done by modeling the current datagram_poll() and the + * tcp_poll(). Note that, based on these implementations, we don't + * lock the socket in this function, even though it seems that, + * ideally, locking or some other mechanisms can be used to ensure + * the integrity of the counters (sndbuf and wmem_queued) used + * in this place. We assume that we don't need locks either until proven + * otherwise. + * + * Another thing to note is that we include the Async I/O support + * here, again, by modeling the current TCP/UDP code. We don't have + * a good way to test with it yet. + */ +unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + struct sctp_sock *sp = sctp_sk(sk); + unsigned int mask; + + poll_wait(file, sk->sk_sleep, wait); + + /* A TCP-style listening socket becomes readable when the accept queue + * is not empty. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + return (!list_empty(&sp->ep->asocs)) ? + (POLLIN | POLLRDNORM) : 0; + + mask = 0; + + /* Is there any exceptional events? */ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) + mask |= POLLERR; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* Is it readable? Reconsider this code with TCP-style support. */ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + /* The association is either gone or not ready. */ + if (!sctp_style(sk, UDP) && sctp_sstate(sk, CLOSED)) + return mask; + + /* Is it writable? */ + if (sctp_writeable(sk)) { + mask |= POLLOUT | POLLWRNORM; + } else { + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + /* + * Since the socket is not locked, the buffer + * might be made available after the writeable check and + * before the bit is set. This could cause a lost I/O + * signal. tcp_poll() has a race breaker for this race + * condition. Based on their implementation, we put + * in the following code to cover it as well. 
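+ * Re-checking writeability after SOCK_ASYNC_NOSPACE has been set
+ * closes the window in which buffer space could have been freed
+ * between the first check and the bit being set.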
+ */ + if (sctp_writeable(sk)) + mask |= POLLOUT | POLLWRNORM; + } + return mask; +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +static struct sctp_bind_bucket *sctp_bucket_create( + struct sctp_bind_hashbucket *head, unsigned short snum) +{ + struct sctp_bind_bucket *pp; + + pp = kmem_cache_alloc(sctp_bucket_cachep, SLAB_ATOMIC); + SCTP_DBG_OBJCNT_INC(bind_bucket); + if (pp) { + pp->port = snum; + pp->fastreuse = 0; + INIT_HLIST_HEAD(&pp->owner); + if ((pp->next = head->chain) != NULL) + pp->next->pprev = &pp->next; + head->chain = pp; + pp->pprev = &head->chain; + } + return pp; +} + +/* Caller must hold hashbucket lock for this tb with local BH disabled */ +static void sctp_bucket_destroy(struct sctp_bind_bucket *pp) +{ + if (hlist_empty(&pp->owner)) { + if (pp->next) + pp->next->pprev = pp->pprev; + *(pp->pprev) = pp->next; + kmem_cache_free(sctp_bucket_cachep, pp); + SCTP_DBG_OBJCNT_DEC(bind_bucket); + } +} + +/* Release this socket's reference to a local port. */ +static inline void __sctp_put_port(struct sock *sk) +{ + struct sctp_bind_hashbucket *head = + &sctp_port_hashtable[sctp_phashfn(inet_sk(sk)->num)]; + struct sctp_bind_bucket *pp; + + sctp_spin_lock(&head->lock); + pp = sctp_sk(sk)->bind_hash; + __sk_del_bind_node(sk); + sctp_sk(sk)->bind_hash = NULL; + inet_sk(sk)->num = 0; + sctp_bucket_destroy(pp); + sctp_spin_unlock(&head->lock); +} + +void sctp_put_port(struct sock *sk) +{ + sctp_local_bh_disable(); + __sctp_put_port(sk); + sctp_local_bh_enable(); +} + +/* + * The system picks an ephemeral port and choose an address set equivalent + * to binding with a wildcard address. + * One of those addresses will be the primary address for the association. + * This automatically enables the multihoming capability of SCTP. + */ +static int sctp_autobind(struct sock *sk) +{ + union sctp_addr autoaddr; + struct sctp_af *af; + unsigned short port; + + /* Initialize a local sockaddr structure to INADDR_ANY. */ + af = sctp_sk(sk)->pf->af; + + port = htons(inet_sk(sk)->num); + af->inaddr_any(&autoaddr, port); + + return sctp_do_bind(sk, &autoaddr, af->sockaddr_len); +} + +/* Parse out IPPROTO_SCTP CMSG headers. Perform only minimal validation. + * + * From RFC 2292 + * 4.2 The cmsghdr Structure * + * + * When ancillary data is sent or received, any number of ancillary data + * objects can be specified by the msg_control and msg_controllen members of + * the msghdr structure, because each object is preceded by + * a cmsghdr structure defining the object's length (the cmsg_len member). + * Historically Berkeley-derived implementations have passed only one object + * at a time, but this API allows multiple objects to be + * passed in a single call to sendmsg() or recvmsg(). The following example + * shows two ancillary data objects in a control buffer. 
+ * + * |<--------------------------- msg_controllen -------------------------->| + * | | + * + * |<----- ancillary data object ----->|<----- ancillary data object ----->| + * + * |<---------- CMSG_SPACE() --------->|<---------- CMSG_SPACE() --------->| + * | | | + * + * |<---------- cmsg_len ---------->| |<--------- cmsg_len ----------->| | + * + * |<--------- CMSG_LEN() --------->| |<-------- CMSG_LEN() ---------->| | + * | | | | | + * + * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ + * |cmsg_|cmsg_|cmsg_|XX| |XX|cmsg_|cmsg_|cmsg_|XX| |XX| + * + * |len |level|type |XX|cmsg_data[]|XX|len |level|type |XX|cmsg_data[]|XX| + * + * +-----+-----+-----+--+-----------+--+-----+-----+-----+--+-----------+--+ + * ^ + * | + * + * msg_control + * points here + */ +SCTP_STATIC int sctp_msghdr_parse(const struct msghdr *msg, + sctp_cmsgs_t *cmsgs) +{ + struct cmsghdr *cmsg; + + for (cmsg = CMSG_FIRSTHDR(msg); + cmsg != NULL; + cmsg = CMSG_NXTHDR((struct msghdr*)msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + /* Should we parse this header or ignore? */ + if (cmsg->cmsg_level != IPPROTO_SCTP) + continue; + + /* Strictly check lengths following example in SCM code. */ + switch (cmsg->cmsg_type) { + case SCTP_INIT: + /* SCTP Socket API Extension + * 5.2.1 SCTP Initiation Structure (SCTP_INIT) + * + * This cmsghdr structure provides information for + * initializing new SCTP associations with sendmsg(). + * The SCTP_INITMSG socket option uses this same data + * structure. This structure is not used for + * recvmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ---------------------- + * IPPROTO_SCTP SCTP_INIT struct sctp_initmsg + */ + if (cmsg->cmsg_len != + CMSG_LEN(sizeof(struct sctp_initmsg))) + return -EINVAL; + cmsgs->init = (struct sctp_initmsg *)CMSG_DATA(cmsg); + break; + + case SCTP_SNDRCV: + /* SCTP Socket API Extension + * 5.2.2 SCTP Header Information Structure(SCTP_SNDRCV) + * + * This cmsghdr structure specifies SCTP options for + * sendmsg() and describes SCTP header information + * about a received message through recvmsg(). + * + * cmsg_level cmsg_type cmsg_data[] + * ------------ ------------ ---------------------- + * IPPROTO_SCTP SCTP_SNDRCV struct sctp_sndrcvinfo + */ + if (cmsg->cmsg_len != + CMSG_LEN(sizeof(struct sctp_sndrcvinfo))) + return -EINVAL; + + cmsgs->info = + (struct sctp_sndrcvinfo *)CMSG_DATA(cmsg); + + /* Minimally, validate the sinfo_flags. */ + if (cmsgs->info->sinfo_flags & + ~(MSG_UNORDERED | MSG_ADDR_OVER | + MSG_ABORT | MSG_EOF)) + return -EINVAL; + break; + + default: + return -EINVAL; + }; + } + return 0; +} + +/* + * Wait for a packet.. + * Note: This function is the same function as in core/datagram.c + * with a few modifications to make lksctp work. + */ +static int sctp_wait_for_packet(struct sock * sk, int *err, long *timeo_p) +{ + int error; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + /* Socket errors? */ + error = sock_error(sk); + if (error) + goto out; + + if (!skb_queue_empty(&sk->sk_receive_queue)) + goto ready; + + /* Socket shut down? */ + if (sk->sk_shutdown & RCV_SHUTDOWN) + goto out; + + /* Sequenced packets can come disconnected. If so we report the + * problem. + */ + error = -ENOTCONN; + + /* Is there a good reason to think that we may receive some data? */ + if (list_empty(&sctp_sk(sk)->ep->asocs) && !sctp_sstate(sk, LISTENING)) + goto out; + + /* Handle signals. 
*/ + if (signal_pending(current)) + goto interrupted; + + /* Let another process have a go. Since we are going to sleep + * anyway. Note: This may cause odd behaviors if the message + * does not fit in the user's buffer, but this seems to be the + * only way to honor MSG_DONTWAIT realistically. + */ + sctp_release_sock(sk); + *timeo_p = schedule_timeout(*timeo_p); + sctp_lock_sock(sk); + +ready: + finish_wait(sk->sk_sleep, &wait); + return 0; + +interrupted: + error = sock_intr_errno(*timeo_p); + +out: + finish_wait(sk->sk_sleep, &wait); + *err = error; + return error; +} + +/* Receive a datagram. + * Note: This is pretty much the same routine as in core/datagram.c + * with a few changes to make lksctp work. + */ +static struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, + int noblock, int *err) +{ + int error; + struct sk_buff *skb; + long timeo; + + /* Caller is allowed not to check sk->sk_err before calling. */ + error = sock_error(sk); + if (error) + goto no_packet; + + timeo = sock_rcvtimeo(sk, noblock); + + SCTP_DEBUG_PRINTK("Timeout: timeo: %ld, MAX: %ld.\n", + timeo, MAX_SCHEDULE_TIMEOUT); + + do { + /* Again only user level code calls this function, + * so nothing interrupt level + * will suddenly eat the receive_queue. + * + * Look at current nfs client by the way... + * However, this function was corrent in any case. 8) + */ + if (flags & MSG_PEEK) { + unsigned long cpu_flags; + + sctp_spin_lock_irqsave(&sk->sk_receive_queue.lock, + cpu_flags); + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + atomic_inc(&skb->users); + sctp_spin_unlock_irqrestore(&sk->sk_receive_queue.lock, + cpu_flags); + } else { + skb = skb_dequeue(&sk->sk_receive_queue); + } + + if (skb) + return skb; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + /* User doesn't want to wait. */ + error = -EAGAIN; + if (!timeo) + goto no_packet; + } while (sctp_wait_for_packet(sk, err, &timeo) == 0); + + return NULL; + +no_packet: + *err = error; + return NULL; +} + +/* If sndbuf has changed, wake up per association sndbuf waiters. */ +static void __sctp_write_space(struct sctp_association *asoc) +{ + struct sock *sk = asoc->base.sk; + struct socket *sock = sk->sk_socket; + + if ((sctp_wspace(asoc) > 0) && sock) { + if (waitqueue_active(&asoc->wait)) + wake_up_interruptible(&asoc->wait); + + if (sctp_writeable(sk)) { + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + + /* Note that we try to include the Async I/O support + * here by modeling from the current TCP/UDP code. + * We have not tested with it yet. + */ + if (sock->fasync_list && + !(sk->sk_shutdown & SEND_SHUTDOWN)) + sock_wake_async(sock, 2, POLL_OUT); + } + } +} + +/* Do accounting for the sndbuf space. + * Decrement the used sndbuf space of the corresponding association by the + * data size which was just transmitted(freed). + */ +static void sctp_wfree(struct sk_buff *skb) +{ + struct sctp_association *asoc; + struct sctp_chunk *chunk; + struct sock *sk; + + /* Get the saved chunk pointer. */ + chunk = *((struct sctp_chunk **)(skb->cb)); + asoc = chunk->asoc; + sk = asoc->base.sk; + asoc->sndbuf_used -= SCTP_DATA_SNDSIZE(chunk); + sk->sk_wmem_queued -= SCTP_DATA_SNDSIZE(chunk); + __sctp_write_space(asoc); + + sctp_association_put(asoc); +} + +/* Helper function to wait for space in the sndbuf. 
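+ * The association is pinned with sctp_association_hold() for the
+ * duration, and the socket lock is dropped around each sleep and
+ * re-acquired afterwards.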
*/ +static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p, + size_t msg_len) +{ + struct sock *sk = asoc->base.sk; + int err = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + SCTP_DEBUG_PRINTK("wait_for_sndbuf: asoc=%p, timeo=%ld, msg_len=%zu\n", + asoc, (long)(*timeo_p), msg_len); + + /* Increment the association's refcnt. */ + sctp_association_hold(asoc); + + /* Wait on the association specific sndbuf space. */ + for (;;) { + prepare_to_wait_exclusive(&asoc->wait, &wait, + TASK_INTERRUPTIBLE); + if (!*timeo_p) + goto do_nonblock; + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || + asoc->base.dead) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; + if (msg_len <= sctp_wspace(asoc)) + break; + + /* Let another process have a go. Since we are going + * to sleep anyway. + */ + sctp_release_sock(sk); + current_timeo = schedule_timeout(current_timeo); + sctp_lock_sock(sk); + + *timeo_p = current_timeo; + } + +out: + finish_wait(&asoc->wait, &wait); + + /* Release the association's refcnt. */ + sctp_association_put(asoc); + + return err; + +do_error: + err = -EPIPE; + goto out; + +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; + +do_nonblock: + err = -EAGAIN; + goto out; +} + +/* If socket sndbuf has changed, wake up all per association waiters. */ +void sctp_write_space(struct sock *sk) +{ + struct sctp_association *asoc; + struct list_head *pos; + + /* Wake up the tasks in each wait queue. */ + list_for_each(pos, &((sctp_sk(sk))->ep->asocs)) { + asoc = list_entry(pos, struct sctp_association, asocs); + __sctp_write_space(asoc); + } +} + +/* Is there any sndbuf space available on the socket? + * + * Note that wmem_queued is the sum of the send buffers on all of the + * associations on the same socket. For a UDP-style socket with + * multiple associations, it is possible for it to be "unwriteable" + * prematurely. I assume that this is acceptable because + * a premature "unwriteable" is better than an accidental "writeable" which + * would cause an unwanted block under certain circumstances. For the 1-1 + * UDP-style sockets or TCP-style sockets, this code should work. + * - Daisy + */ +static int sctp_writeable(struct sock *sk) +{ + int amt = 0; + + amt = sk->sk_sndbuf - sk->sk_wmem_queued; + if (amt < 0) + amt = 0; + return amt; +} + +/* Wait for an association to go into ESTABLISHED state. If timeout is 0, + * returns immediately with EINPROGRESS. + */ +static int sctp_wait_for_connect(struct sctp_association *asoc, long *timeo_p) +{ + struct sock *sk = asoc->base.sk; + int err = 0; + long current_timeo = *timeo_p; + DEFINE_WAIT(wait); + + SCTP_DEBUG_PRINTK("%s: asoc=%p, timeo=%ld\n", __FUNCTION__, asoc, + (long)(*timeo_p)); + + /* Increment the association's refcnt. */ + sctp_association_hold(asoc); + + for (;;) { + prepare_to_wait_exclusive(&asoc->wait, &wait, + TASK_INTERRUPTIBLE); + if (!*timeo_p) + goto do_nonblock; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_err || asoc->state >= SCTP_STATE_SHUTDOWN_PENDING || + asoc->base.dead) + goto do_error; + if (signal_pending(current)) + goto do_interrupted; + + if (sctp_state(asoc, ESTABLISHED)) + break; + + /* Let another process have a go. Since we are going + * to sleep anyway. + */ + sctp_release_sock(sk); + current_timeo = schedule_timeout(current_timeo); + sctp_lock_sock(sk); + + *timeo_p = current_timeo; + } + +out: + finish_wait(&asoc->wait, &wait); + + /* Release the association's refcnt. 
*/ + sctp_association_put(asoc); + + return err; + +do_error: + if (asoc->counters[SCTP_COUNTER_INIT_ERROR] + 1 >= + asoc->max_init_attempts) + err = -ETIMEDOUT; + else + err = -ECONNREFUSED; + goto out; + +do_interrupted: + err = sock_intr_errno(*timeo_p); + goto out; + +do_nonblock: + err = -EINPROGRESS; + goto out; +} + +static int sctp_wait_for_accept(struct sock *sk, long timeo) +{ + struct sctp_endpoint *ep; + int err = 0; + DEFINE_WAIT(wait); + + ep = sctp_sk(sk)->ep; + + + for (;;) { + prepare_to_wait_exclusive(sk->sk_sleep, &wait, + TASK_INTERRUPTIBLE); + + if (list_empty(&ep->asocs)) { + sctp_release_sock(sk); + timeo = schedule_timeout(timeo); + sctp_lock_sock(sk); + } + + err = -EINVAL; + if (!sctp_sstate(sk, LISTENING)) + break; + + err = 0; + if (!list_empty(&ep->asocs)) + break; + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + break; + + err = -EAGAIN; + if (!timeo) + break; + } + + finish_wait(sk->sk_sleep, &wait); + + return err; +} + +void sctp_wait_for_close(struct sock *sk, long timeout) +{ + DEFINE_WAIT(wait); + + do { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + if (list_empty(&sctp_sk(sk)->ep->asocs)) + break; + sctp_release_sock(sk); + timeout = schedule_timeout(timeout); + sctp_lock_sock(sk); + } while (!signal_pending(current) && timeout); + + finish_wait(sk->sk_sleep, &wait); +} + +/* Populate the fields of the newsk from the oldsk and migrate the assoc + * and its messages to the newsk. + */ +static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk, + struct sctp_association *assoc, + sctp_socket_type_t type) +{ + struct sctp_sock *oldsp = sctp_sk(oldsk); + struct sctp_sock *newsp = sctp_sk(newsk); + struct sctp_bind_bucket *pp; /* hash list port iterator */ + struct sctp_endpoint *newep = newsp->ep; + struct sk_buff *skb, *tmp; + struct sctp_ulpevent *event; + + /* Migrate socket buffer sizes and all the socket level options to the + * new socket. + */ + newsk->sk_sndbuf = oldsk->sk_sndbuf; + newsk->sk_rcvbuf = oldsk->sk_rcvbuf; + /* Brute force copy old sctp opt. */ + inet_sk_copy_descendant(newsk, oldsk); + + /* Restore the ep value that was overwritten with the above structure + * copy. + */ + newsp->ep = newep; + newsp->hmac = NULL; + + /* Hook this new socket in to the bind_hash list. */ + pp = sctp_sk(oldsk)->bind_hash; + sk_add_bind_node(newsk, &pp->owner); + sctp_sk(newsk)->bind_hash = pp; + inet_sk(newsk)->num = inet_sk(oldsk)->num; + + /* Move any messages in the old socket's receive queue that are for the + * peeled off association to the new socket's receive queue. + */ + sctp_skb_for_each(skb, &oldsk->sk_receive_queue, tmp) { + event = sctp_skb2event(skb); + if (event->asoc == assoc) { + __skb_unlink(skb, skb->list); + __skb_queue_tail(&newsk->sk_receive_queue, skb); + } + } + + /* Clean up any messages pending delivery due to partial + * delivery. Three cases: + * 1) No partial deliver; no work. + * 2) Peeling off partial delivery; keep pd_lobby in new pd_lobby. + * 3) Peeling off non-partial delivery; move pd_lobby to receive_queue. + */ + skb_queue_head_init(&newsp->pd_lobby); + sctp_sk(newsk)->pd_mode = assoc->ulpq.pd_mode; + + if (sctp_sk(oldsk)->pd_mode) { + struct sk_buff_head *queue; + + /* Decide which queue to move pd_lobby skbs to. */ + if (assoc->ulpq.pd_mode) { + queue = &newsp->pd_lobby; + } else + queue = &newsk->sk_receive_queue; + + /* Walk through the pd_lobby, looking for skbs that + * need moved to the new socket. 
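+ * Only events belonging to the migrating association are moved;
+ * everything else stays queued on the old socket.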
+ */ + sctp_skb_for_each(skb, &oldsp->pd_lobby, tmp) { + event = sctp_skb2event(skb); + if (event->asoc == assoc) { + __skb_unlink(skb, skb->list); + __skb_queue_tail(queue, skb); + } + } + + /* Clear up any skbs waiting for the partial + * delivery to finish. + */ + if (assoc->ulpq.pd_mode) + sctp_clear_pd(oldsk); + + } + + /* Set the type of socket to indicate that it is peeled off from the + * original UDP-style socket or created with the accept() call on a + * TCP-style socket.. + */ + newsp->type = type; + + /* Migrate the association to the new socket. */ + sctp_assoc_migrate(assoc, newsk); + + /* If the association on the newsk is already closed before accept() + * is called, set RCV_SHUTDOWN flag. + */ + if (sctp_state(assoc, CLOSED) && sctp_style(newsk, TCP)) + newsk->sk_shutdown |= RCV_SHUTDOWN; + + newsk->sk_state = SCTP_SS_ESTABLISHED; +} + +/* This proto struct describes the ULP interface for SCTP. */ +struct proto sctp_prot = { + .name = "SCTP", + .owner = THIS_MODULE, + .close = sctp_close, + .connect = sctp_connect, + .disconnect = sctp_disconnect, + .accept = sctp_accept, + .ioctl = sctp_ioctl, + .init = sctp_init_sock, + .destroy = sctp_destroy_sock, + .shutdown = sctp_shutdown, + .setsockopt = sctp_setsockopt, + .getsockopt = sctp_getsockopt, + .sendmsg = sctp_sendmsg, + .recvmsg = sctp_recvmsg, + .bind = sctp_bind, + .backlog_rcv = sctp_backlog_rcv, + .hash = sctp_hash, + .unhash = sctp_unhash, + .get_port = sctp_get_port, + .obj_size = sizeof(struct sctp_sock), +}; + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +struct proto sctpv6_prot = { + .name = "SCTPv6", + .owner = THIS_MODULE, + .close = sctp_close, + .connect = sctp_connect, + .disconnect = sctp_disconnect, + .accept = sctp_accept, + .ioctl = sctp_ioctl, + .init = sctp_init_sock, + .destroy = sctp_destroy_sock, + .shutdown = sctp_shutdown, + .setsockopt = sctp_setsockopt, + .getsockopt = sctp_getsockopt, + .sendmsg = sctp_sendmsg, + .recvmsg = sctp_recvmsg, + .bind = sctp_bind, + .backlog_rcv = sctp_backlog_rcv, + .hash = sctp_hash, + .unhash = sctp_unhash, + .get_port = sctp_get_port, + .obj_size = sizeof(struct sctp6_sock), +}; +#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ diff --git a/net/sctp/ssnmap.c b/net/sctp/ssnmap.c new file mode 100644 index 000000000000..e627d2b451b6 --- /dev/null +++ b/net/sctp/ssnmap.c @@ -0,0 +1,131 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 2003 International Business Machines, Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions manipulate sctp SSN tracker. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +#define MAX_KMALLOC_SIZE 131072 + +static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, + __u16 out); + +/* Storage size needed for map includes 2 headers and then the + * specific needs of in or out streams. + */ +static inline size_t sctp_ssnmap_size(__u16 in, __u16 out) +{ + return sizeof(struct sctp_ssnmap) + (in + out) * sizeof(__u16); +} + + +/* Create a new sctp_ssnmap. + * Allocate room to store at least 'len' contiguous TSNs. + */ +struct sctp_ssnmap *sctp_ssnmap_new(__u16 in, __u16 out, int gfp) +{ + struct sctp_ssnmap *retval; + int size; + + size = sctp_ssnmap_size(in, out); + if (size <= MAX_KMALLOC_SIZE) + retval = kmalloc(size, gfp); + else + retval = (struct sctp_ssnmap *) + __get_free_pages(gfp, get_order(size)); + if (!retval) + goto fail; + + if (!sctp_ssnmap_init(retval, in, out)) + goto fail_map; + + retval->malloced = 1; + SCTP_DBG_OBJCNT_INC(ssnmap); + + return retval; + +fail_map: + if (size <= MAX_KMALLOC_SIZE) + kfree(retval); + else + free_pages((unsigned long)retval, get_order(size)); +fail: + return NULL; +} + + +/* Initialize a block of memory as a ssnmap. */ +static struct sctp_ssnmap *sctp_ssnmap_init(struct sctp_ssnmap *map, __u16 in, + __u16 out) +{ + memset(map, 0x00, sctp_ssnmap_size(in, out)); + + /* Start 'in' stream just after the map header. */ + map->in.ssn = (__u16 *)&map[1]; + map->in.len = in; + + /* Start 'out' stream just after 'in'. */ + map->out.ssn = &map->in.ssn[in]; + map->out.len = out; + + return map; +} + +/* Clear out the ssnmap streams. */ +void sctp_ssnmap_clear(struct sctp_ssnmap *map) +{ + size_t size; + + size = (map->in.len + map->out.len) * sizeof(__u16); + memset(map->in.ssn, 0x00, size); +} + +/* Dispose of a ssnmap. */ +void sctp_ssnmap_free(struct sctp_ssnmap *map) +{ + if (map && map->malloced) { + int size; + + size = sctp_ssnmap_size(map->in.len, map->out.len); + if (size <= MAX_KMALLOC_SIZE) + kfree(map); + else + free_pages((unsigned long)map, get_order(size)); + SCTP_DBG_OBJCNT_DEC(ssnmap); + } +} diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c new file mode 100644 index 000000000000..89fa20c73a5c --- /dev/null +++ b/net/sctp/sysctl.c @@ -0,0 +1,251 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2002, 2004 + * Copyright (c) 2002 Intel Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * Sysctl related interfaces for SCTP. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. 
If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Mingqin Liu + * Jon Grimm + * Ardelle Fan + * Ryan Layer + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include + +static ctl_handler sctp_sysctl_jiffies_ms; +static long rto_timer_min = 1; +static long rto_timer_max = 86400000; /* One day */ + +static ctl_table sctp_table[] = { + { + .ctl_name = NET_SCTP_RTO_INITIAL, + .procname = "rto_initial", + .data = &sctp_rto_initial, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &rto_timer_min, + .extra2 = &rto_timer_max + }, + { + .ctl_name = NET_SCTP_RTO_MIN, + .procname = "rto_min", + .data = &sctp_rto_min, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &rto_timer_min, + .extra2 = &rto_timer_max + }, + { + .ctl_name = NET_SCTP_RTO_MAX, + .procname = "rto_max", + .data = &sctp_rto_max, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &rto_timer_min, + .extra2 = &rto_timer_max + }, + { + .ctl_name = NET_SCTP_VALID_COOKIE_LIFE, + .procname = "valid_cookie_life", + .data = &sctp_valid_cookie_life, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &rto_timer_min, + .extra2 = &rto_timer_max + }, + { + .ctl_name = NET_SCTP_MAX_BURST, + .procname = "max_burst", + .data = &sctp_max_burst, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_ASSOCIATION_MAX_RETRANS, + .procname = "association_max_retrans", + .data = &sctp_max_retrans_association, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_PATH_MAX_RETRANS, + .procname = "path_max_retrans", + .data = &sctp_max_retrans_path, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_MAX_INIT_RETRANSMITS, + .procname = "max_init_retransmits", + .data = &sctp_max_retrans_init, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_HB_INTERVAL, + .procname = "hb_interval", + .data = &sctp_hb_interval, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &rto_timer_min, + .extra2 = &rto_timer_max + }, + { + .ctl_name = NET_SCTP_PRESERVE_ENABLE, + .procname = "cookie_preserve_enable", + .data = &sctp_cookie_preserve_enable, + .maxlen = sizeof(long), + .mode = 0644, + .proc_handler = &proc_doulongvec_ms_jiffies_minmax, + .strategy = &sctp_sysctl_jiffies_ms, + .extra1 = &rto_timer_min, + .extra2 = &rto_timer_max + }, + { + .ctl_name = NET_SCTP_RTO_ALPHA, + .procname = "rto_alpha_exp_divisor", + .data = &sctp_rto_alpha, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_RTO_BETA, + .procname = 
"rto_beta_exp_divisor", + .data = &sctp_rto_beta, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_ADDIP_ENABLE, + .procname = "addip_enable", + .data = &sctp_addip_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = NET_SCTP_PRSCTP_ENABLE, + .procname = "prsctp_enable", + .data = &sctp_prsctp_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table sctp_net_table[] = { + { + .ctl_name = NET_SCTP, + .procname = "sctp", + .mode = 0555, + .child = sctp_table + }, + { .ctl_name = 0 } +}; + +static ctl_table sctp_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = sctp_net_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * sctp_sysctl_header; + +/* Sysctl registration. */ +void sctp_sysctl_register(void) +{ + sctp_sysctl_header = register_sysctl_table(sctp_root_table, 0); +} + +/* Sysctl deregistration. */ +void sctp_sysctl_unregister(void) +{ + unregister_sysctl_table(sctp_sysctl_header); +} + +/* Strategy function to convert jiffies to milliseconds. */ +static int sctp_sysctl_jiffies_ms(ctl_table *table, int __user *name, int nlen, + void __user *oldval, size_t __user *oldlenp, + void __user *newval, size_t newlen, void **context) { + + if (oldval) { + size_t olen; + + if (oldlenp) { + if (get_user(olen, oldlenp)) + return -EFAULT; + + if (olen != sizeof (int)) + return -EINVAL; + } + if (put_user((*(int *)(table->data) * 1000) / HZ, + (int __user *)oldval) || + (oldlenp && put_user(sizeof (int), oldlenp))) + return -EFAULT; + } + if (newval && newlen) { + int new; + + if (newlen != sizeof (int)) + return -EINVAL; + + if (get_user(new, (int __user *)newval)) + return -EFAULT; + + *(int *)(table->data) = (new * HZ) / 1000; + } + return 1; +} diff --git a/net/sctp/transport.c b/net/sctp/transport.c new file mode 100644 index 000000000000..f30882e1e96a --- /dev/null +++ b/net/sctp/transport.c @@ -0,0 +1,514 @@ +/* SCTP kernel reference Implementation + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001-2003 International Business Machines Corp. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This file is part of the SCTP kernel reference Implementation + * + * This module provides the abstraction for an SCTP tranport representing + * a remote transport address. For local transport addresses, we just use + * union sctp_addr. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. 
+ * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Karl Knutson + * Jon Grimm + * Xingang Guo + * Hui Huang + * Sridhar Samudrala + * Ardelle Fan + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +/* 1st Level Abstractions. */ + +/* Initialize a new transport from provided memory. */ +static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, + const union sctp_addr *addr, + int gfp) +{ + /* Copy in the address. */ + peer->ipaddr = *addr; + peer->af_specific = sctp_get_af_specific(addr->sa.sa_family); + peer->asoc = NULL; + + peer->dst = NULL; + memset(&peer->saddr, 0, sizeof(union sctp_addr)); + + /* From 6.3.1 RTO Calculation: + * + * C1) Until an RTT measurement has been made for a packet sent to the + * given destination transport address, set RTO to the protocol + * parameter 'RTO.Initial'. + */ + peer->rtt = 0; + peer->rto = sctp_rto_initial; + peer->rttvar = 0; + peer->srtt = 0; + peer->rto_pending = 0; + + peer->last_time_heard = jiffies; + peer->last_time_used = jiffies; + peer->last_time_ecne_reduced = jiffies; + + peer->active = SCTP_ACTIVE; + peer->hb_allowed = 0; + + /* Initialize the default path max_retrans. */ + peer->max_retrans = sctp_max_retrans_path; + peer->error_count = 0; + + INIT_LIST_HEAD(&peer->transmitted); + INIT_LIST_HEAD(&peer->send_ready); + INIT_LIST_HEAD(&peer->transports); + + /* Set up the retransmission timer. */ + init_timer(&peer->T3_rtx_timer); + peer->T3_rtx_timer.function = sctp_generate_t3_rtx_event; + peer->T3_rtx_timer.data = (unsigned long)peer; + + /* Set up the heartbeat timer. */ + init_timer(&peer->hb_timer); + peer->hb_interval = SCTP_DEFAULT_TIMEOUT_HEARTBEAT; + peer->hb_timer.function = sctp_generate_heartbeat_event; + peer->hb_timer.data = (unsigned long)peer; + + atomic_set(&peer->refcnt, 1); + peer->dead = 0; + + peer->malloced = 0; + + /* Initialize the state information for SFR-CACC */ + peer->cacc.changeover_active = 0; + peer->cacc.cycling_changeover = 0; + peer->cacc.next_tsn_at_change = 0; + peer->cacc.cacc_saw_newack = 0; + + return peer; +} + +/* Allocate and initialize a new transport. */ +struct sctp_transport *sctp_transport_new(const union sctp_addr *addr, int gfp) +{ + struct sctp_transport *transport; + + transport = t_new(struct sctp_transport, gfp); + if (!transport) + goto fail; + + if (!sctp_transport_init(transport, addr, gfp)) + goto fail_init; + + transport->malloced = 1; + SCTP_DBG_OBJCNT_INC(transport); + + return transport; + +fail_init: + kfree(transport); + +fail: + return NULL; +} + +/* This transport is no longer needed. Free up if possible, or + * delay until it last reference count. + */ +void sctp_transport_free(struct sctp_transport *transport) +{ + transport->dead = 1; + + /* Try to delete the heartbeat timer. */ + if (del_timer(&transport->hb_timer)) + sctp_transport_put(transport); + + /* Delete the T3_rtx timer if it's active. + * There is no point in not doing this now and letting + * structure hang around in memory since we know + * the tranport is going away. 
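[Editor's aside] sctp_transport_new()/sctp_transport_init() below follow the usual allocate-then-initialize pattern with goto-based unwinding, and the resulting object lives under a reference count: hold() bumps it, put() drops it, and the destructor runs when it reaches zero. A compressed userspace sketch of that lifecycle, with an invented 'xport' type and C11 atomics standing in for the kernel's atomic_t:

/* Hedged sketch: allocate + init with goto cleanup, plus hold/put
 * reference counting. Names are illustrative, not the kernel API. */
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct xport {
	atomic_int refcnt;
	int        dead;
	/* ... address, timers, RTO state ... */
};

static struct xport *xport_init(struct xport *t)
{
	memset(t, 0, sizeof(*t));
	atomic_store(&t->refcnt, 1);	/* caller owns the first reference */
	return t;
}

static struct xport *xport_new(void)
{
	struct xport *t = malloc(sizeof(*t));

	if (!t)
		goto fail;
	if (!xport_init(t))		/* mirrors the init-can-fail pattern */
		goto fail_init;
	return t;

fail_init:
	free(t);
fail:
	return NULL;
}

static void xport_hold(struct xport *t) { atomic_fetch_add(&t->refcnt, 1); }

static void xport_put(struct xport *t)
{
	/* last reference dropped: destroy */
	if (atomic_fetch_sub(&t->refcnt, 1) == 1)
		free(t);
}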
+ */ + if (timer_pending(&transport->T3_rtx_timer) && + del_timer(&transport->T3_rtx_timer)) + sctp_transport_put(transport); + + + sctp_transport_put(transport); +} + +/* Destroy the transport data structure. + * Assumes there are no more users of this structure. + */ +static void sctp_transport_destroy(struct sctp_transport *transport) +{ + SCTP_ASSERT(transport->dead, "Transport is not dead", return); + + if (transport->asoc) + sctp_association_put(transport->asoc); + + sctp_packet_free(&transport->packet); + + dst_release(transport->dst); + kfree(transport); + SCTP_DBG_OBJCNT_DEC(transport); +} + +/* Start T3_rtx timer if it is not already running and update the heartbeat + * timer. This routine is called every time a DATA chunk is sent. + */ +void sctp_transport_reset_timers(struct sctp_transport *transport) +{ + /* RFC 2960 6.3.2 Retransmission Timer Rules + * + * R1) Every time a DATA chunk is sent to any address(including a + * retransmission), if the T3-rtx timer of that address is not running + * start it running so that it will expire after the RTO of that + * address. + */ + + if (!timer_pending(&transport->T3_rtx_timer)) + if (!mod_timer(&transport->T3_rtx_timer, + jiffies + transport->rto)) + sctp_transport_hold(transport); + + /* When a data chunk is sent, reset the heartbeat interval. */ + if (!mod_timer(&transport->hb_timer, + sctp_transport_timeout(transport))) + sctp_transport_hold(transport); +} + +/* This transport has been assigned to an association. + * Initialize fields from the association or from the sock itself. + * Register the reference count in the association. + */ +void sctp_transport_set_owner(struct sctp_transport *transport, + struct sctp_association *asoc) +{ + transport->asoc = asoc; + sctp_association_hold(asoc); +} + +/* Initialize the pmtu of a transport. */ +void sctp_transport_pmtu(struct sctp_transport *transport) +{ + struct dst_entry *dst; + + dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); + + if (dst) { + transport->pmtu = dst_mtu(dst); + dst_release(dst); + } else + transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; +} + +/* Caches the dst entry and source address for a transport's destination + * address. + */ +void sctp_transport_route(struct sctp_transport *transport, + union sctp_addr *saddr, struct sctp_sock *opt) +{ + struct sctp_association *asoc = transport->asoc; + struct sctp_af *af = transport->af_specific; + union sctp_addr *daddr = &transport->ipaddr; + struct dst_entry *dst; + + dst = af->get_dst(asoc, daddr, saddr); + + if (saddr) + memcpy(&transport->saddr, saddr, sizeof(union sctp_addr)); + else + af->get_saddr(asoc, dst, daddr, &transport->saddr); + + transport->dst = dst; + if (dst) { + transport->pmtu = dst_mtu(dst); + + /* Initialize sk->sk_rcv_saddr, if the transport is the + * association's active path for getsockname(). + */ + if (asoc && (transport == asoc->peer.active_path)) + af->to_sk_saddr(&transport->saddr, asoc->base.sk); + } else + transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; +} + +/* Hold a reference to a transport. */ +void sctp_transport_hold(struct sctp_transport *transport) +{ + atomic_inc(&transport->refcnt); +} + +/* Release a reference to a transport and clean up + * if there are no more references. + */ +void sctp_transport_put(struct sctp_transport *transport) +{ + if (atomic_dec_and_test(&transport->refcnt)) + sctp_transport_destroy(transport); +} + +/* Update transport's RTO based on the newly calculated RTT. 
*/ +void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) +{ + /* Check for valid transport. */ + SCTP_ASSERT(tp, "NULL transport", return); + + /* We should not be doing any RTO updates unless rto_pending is set. */ + SCTP_ASSERT(tp->rto_pending, "rto_pending not set", return); + + if (tp->rttvar || tp->srtt) { + /* 6.3.1 C3) When a new RTT measurement R' is made, set + * RTTVAR <- (1 - RTO.Beta) * RTTVAR + RTO.Beta * |SRTT - R'| + * SRTT <- (1 - RTO.Alpha) * SRTT + RTO.Alpha * R' + */ + + /* Note: The above algorithm has been rewritten to + * express rto_beta and rto_alpha as inverse powers + * of two. + * For example, assuming the default value of RTO.Alpha of + * 1/8, rto_alpha would be expressed as 3. + */ + tp->rttvar = tp->rttvar - (tp->rttvar >> sctp_rto_beta) + + ((abs(tp->srtt - rtt)) >> sctp_rto_beta); + tp->srtt = tp->srtt - (tp->srtt >> sctp_rto_alpha) + + (rtt >> sctp_rto_alpha); + } else { + /* 6.3.1 C2) When the first RTT measurement R is made, set + * SRTT <- R, RTTVAR <- R/2. + */ + tp->srtt = rtt; + tp->rttvar = rtt >> 1; + } + + /* 6.3.1 G1) Whenever RTTVAR is computed, if RTTVAR = 0, then + * adjust RTTVAR <- G, where G is the CLOCK GRANULARITY. + */ + if (tp->rttvar == 0) + tp->rttvar = SCTP_CLOCK_GRANULARITY; + + /* 6.3.1 C3) After the computation, update RTO <- SRTT + 4 * RTTVAR. */ + tp->rto = tp->srtt + (tp->rttvar << 2); + + /* 6.3.1 C6) Whenever RTO is computed, if it is less than RTO.Min + * seconds then it is rounded up to RTO.Min seconds. + */ + if (tp->rto < tp->asoc->rto_min) + tp->rto = tp->asoc->rto_min; + + /* 6.3.1 C7) A maximum value may be placed on RTO provided it is + * at least RTO.max seconds. + */ + if (tp->rto > tp->asoc->rto_max) + tp->rto = tp->asoc->rto_max; + + tp->rtt = rtt; + + /* Reset rto_pending so that a new RTT measurement is started when a + * new data chunk is sent. + */ + tp->rto_pending = 0; + + SCTP_DEBUG_PRINTK("%s: transport: %p, rtt: %d, srtt: %d " + "rttvar: %d, rto: %d\n", __FUNCTION__, + tp, rtt, tp->srtt, tp->rttvar, tp->rto); +} + +/* This routine updates the transport's cwnd and partial_bytes_acked + * parameters based on the bytes acked in the received SACK. + */ +void sctp_transport_raise_cwnd(struct sctp_transport *transport, + __u32 sack_ctsn, __u32 bytes_acked) +{ + __u32 cwnd, ssthresh, flight_size, pba, pmtu; + + cwnd = transport->cwnd; + flight_size = transport->flight_size; + + /* The appropriate cwnd increase algorithm is performed if, and only + * if the cumulative TSN has advanced and the congestion window is + * being fully utilized. + */ + if ((transport->asoc->ctsn_ack_point >= sack_ctsn) || + (flight_size < cwnd)) + return; + + ssthresh = transport->ssthresh; + pba = transport->partial_bytes_acked; + pmtu = transport->asoc->pmtu; + + if (cwnd <= ssthresh) { + /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less + * than or equal to ssthresh an SCTP endpoint MUST use the + * slow start algorithm to increase cwnd only if the current + * congestion window is being fully utilized and an incoming + * SACK advances the Cumulative TSN Ack Point. Only when these + * two conditions are met can the cwnd be increased otherwise + * the cwnd MUST not be increased. If these conditions are met + * then cwnd MUST be increased by at most the lesser of + * 1) the total size of the previously outstanding DATA + * chunk(s) acknowledged, and 2) the destination's path MTU. 
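[Editor's aside] sctp_transport_update_rto() above implements RFC 2960 6.3.1 with RTO.Alpha = 1/8 and RTO.Beta = 1/4 expressed as right shifts (3 and 2). A standalone sketch of the same smoothing, clamped to illustrative rto_min/rto_max constants rather than the stack's defaults:

/* Hedged sketch of the SRTT/RTTVAR/RTO update from RFC 2960 6.3.1,
 * using the same shift-based alpha/beta as the code above. */
#include <stdio.h>

#define RTO_ALPHA_SHIFT 3	/* alpha = 1/8 */
#define RTO_BETA_SHIFT  2	/* beta  = 1/4 */
#define RTO_MIN   100		/* ms, illustrative */
#define RTO_MAX 60000		/* ms, illustrative */

struct rto_state { unsigned srtt, rttvar, rto; };

static void rto_update(struct rto_state *s, unsigned rtt)
{
	if (s->srtt || s->rttvar) {
		/* C3: RTTVAR <- 3/4 RTTVAR + 1/4 |SRTT - R'|
		 *     SRTT   <- 7/8 SRTT   + 1/8 R'          */
		unsigned delta = s->srtt > rtt ? s->srtt - rtt : rtt - s->srtt;

		s->rttvar = s->rttvar - (s->rttvar >> RTO_BETA_SHIFT)
			    + (delta >> RTO_BETA_SHIFT);
		s->srtt   = s->srtt - (s->srtt >> RTO_ALPHA_SHIFT)
			    + (rtt >> RTO_ALPHA_SHIFT);
	} else {
		/* C2: first measurement: SRTT <- R, RTTVAR <- R/2 */
		s->srtt = rtt;
		s->rttvar = rtt >> 1;
	}
	if (!s->rttvar)			/* G1: clock-granularity floor */
		s->rttvar = 1;

	s->rto = s->srtt + (s->rttvar << 2);	/* RTO <- SRTT + 4*RTTVAR */
	if (s->rto < RTO_MIN) s->rto = RTO_MIN;
	if (s->rto > RTO_MAX) s->rto = RTO_MAX;
}

int main(void)
{
	struct rto_state s = { 0, 0, 3000 };
	unsigned samples[] = { 120, 150, 110, 400 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		rto_update(&s, samples[i]);
		printf("rtt=%u srtt=%u rttvar=%u rto=%u\n",
		       samples[i], s.srtt, s.rttvar, s.rto);
	}
	return 0;
}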
+ */ + if (bytes_acked > pmtu) + cwnd += pmtu; + else + cwnd += bytes_acked; + SCTP_DEBUG_PRINTK("%s: SLOW START: transport: %p, " + "bytes_acked: %d, cwnd: %d, ssthresh: %d, " + "flight_size: %d, pba: %d\n", + __FUNCTION__, + transport, bytes_acked, cwnd, + ssthresh, flight_size, pba); + } else { + /* RFC 2960 7.2.2 Whenever cwnd is greater than ssthresh, + * upon each SACK arrival that advances the Cumulative TSN Ack + * Point, increase partial_bytes_acked by the total number of + * bytes of all new chunks acknowledged in that SACK including + * chunks acknowledged by the new Cumulative TSN Ack and by + * Gap Ack Blocks. + * + * When partial_bytes_acked is equal to or greater than cwnd + * and before the arrival of the SACK the sender had cwnd or + * more bytes of data outstanding (i.e., before arrival of the + * SACK, flightsize was greater than or equal to cwnd), + * increase cwnd by MTU, and reset partial_bytes_acked to + * (partial_bytes_acked - cwnd). + */ + pba += bytes_acked; + if (pba >= cwnd) { + cwnd += pmtu; + pba = ((cwnd < pba) ? (pba - cwnd) : 0); + } + SCTP_DEBUG_PRINTK("%s: CONGESTION AVOIDANCE: " + "transport: %p, bytes_acked: %d, cwnd: %d, " + "ssthresh: %d, flight_size: %d, pba: %d\n", + __FUNCTION__, + transport, bytes_acked, cwnd, + ssthresh, flight_size, pba); + } + + transport->cwnd = cwnd; + transport->partial_bytes_acked = pba; +} + +/* This routine is used to lower the transport's cwnd when congestion is + * detected. + */ +void sctp_transport_lower_cwnd(struct sctp_transport *transport, + sctp_lower_cwnd_t reason) +{ + switch (reason) { + case SCTP_LOWER_CWND_T3_RTX: + /* RFC 2960 Section 7.2.3, sctpimpguide + * When the T3-rtx timer expires on an address, SCTP should + * perform slow start by: + * ssthresh = max(cwnd/2, 4*MTU) + * cwnd = 1*MTU + * partial_bytes_acked = 0 + */ + transport->ssthresh = max(transport->cwnd/2, + 4*transport->asoc->pmtu); + transport->cwnd = transport->asoc->pmtu; + break; + + case SCTP_LOWER_CWND_FAST_RTX: + /* RFC 2960 7.2.4 Adjust the ssthresh and cwnd of the + * destination address(es) to which the missing DATA chunks + * were last sent, according to the formula described in + * Section 7.2.3. + * + * RFC 2960 7.2.3, sctpimpguide Upon detection of packet + * losses from SACK (see Section 7.2.4), An endpoint + * should do the following: + * ssthresh = max(cwnd/2, 4*MTU) + * cwnd = ssthresh + * partial_bytes_acked = 0 + */ + transport->ssthresh = max(transport->cwnd/2, + 4*transport->asoc->pmtu); + transport->cwnd = transport->ssthresh; + break; + + case SCTP_LOWER_CWND_ECNE: + /* RFC 2481 Section 6.1.2. + * If the sender receives an ECN-Echo ACK packet + * then the sender knows that congestion was encountered in the + * network on the path from the sender to the receiver. The + * indication of congestion should be treated just as a + * congestion loss in non-ECN Capable TCP. That is, the TCP + * source halves the congestion window "cwnd" and reduces the + * slow start threshold "ssthresh". + * A critical condition is that TCP does not react to + * congestion indications more than once every window of + * data (or more loosely more than once every round-trip time). 
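[Editor's aside] The cwnd growth rules quoted above (RFC 2960 7.2.1/7.2.2) reduce to: in slow start, add min(bytes_acked, PMTU) per SACK that advances the cumulative TSN while the window is fully used; in congestion avoidance, accumulate bytes into partial_bytes_acked and add one PMTU each time it reaches cwnd. A compact sketch of just that arithmetic, with illustrative starting values:

/* Hedged sketch of the cwnd increase logic mirrored from
 * sctp_transport_raise_cwnd above; all state is plain integers. */
#include <stdio.h>
#include <stdint.h>

struct cwnd_state {
	uint32_t cwnd, ssthresh, pba, pmtu, flight;
};

static void raise_cwnd(struct cwnd_state *c, uint32_t bytes_acked,
		       int ctsn_advanced)
{
	/* Only grow when the ack moved the cumulative TSN forward and the
	 * window was fully utilized. */
	if (!ctsn_advanced || c->flight < c->cwnd)
		return;

	if (c->cwnd <= c->ssthresh) {
		/* slow start: at most one PMTU per SACK */
		c->cwnd += bytes_acked > c->pmtu ? c->pmtu : bytes_acked;
	} else {
		/* congestion avoidance: one PMTU per cwnd worth of acks */
		c->pba += bytes_acked;
		if (c->pba >= c->cwnd) {
			c->cwnd += c->pmtu;
			c->pba = c->pba > c->cwnd ? c->pba - c->cwnd : 0;
		}
	}
}

int main(void)
{
	struct cwnd_state c = { .cwnd = 4380, .ssthresh = 8760,
				.pba = 0, .pmtu = 1500, .flight = 9000 };

	for (int i = 0; i < 6; i++) {
		raise_cwnd(&c, 3000, 1);
		printf("sack %d: cwnd=%u pba=%u\n", i, c.cwnd, c.pba);
	}
	return 0;
}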
+ */ + if ((jiffies - transport->last_time_ecne_reduced) > + transport->rtt) { + transport->ssthresh = max(transport->cwnd/2, + 4*transport->asoc->pmtu); + transport->cwnd = transport->ssthresh; + transport->last_time_ecne_reduced = jiffies; + } + break; + + case SCTP_LOWER_CWND_INACTIVE: + /* RFC 2960 Section 7.2.1, sctpimpguide + * When the endpoint does not transmit data on a given + * transport address, the cwnd of the transport address + * should be adjusted to max(cwnd/2, 4*MTU) per RTO. + * NOTE: Although the draft recommends that this check needs + * to be done every RTO interval, we do it every hearbeat + * interval. + */ + if ((jiffies - transport->last_time_used) > transport->rto) + transport->cwnd = max(transport->cwnd/2, + 4*transport->asoc->pmtu); + break; + }; + + transport->partial_bytes_acked = 0; + SCTP_DEBUG_PRINTK("%s: transport: %p reason: %d cwnd: " + "%d ssthresh: %d\n", __FUNCTION__, + transport, reason, + transport->cwnd, transport->ssthresh); +} + +/* What is the next timeout value for this transport? */ +unsigned long sctp_transport_timeout(struct sctp_transport *t) +{ + unsigned long timeout; + timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); + timeout += jiffies; + return timeout; +} diff --git a/net/sctp/tsnmap.c b/net/sctp/tsnmap.c new file mode 100644 index 000000000000..ac4fae161bc7 --- /dev/null +++ b/net/sctp/tsnmap.c @@ -0,0 +1,417 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * + * This file is part of the SCTP kernel reference Implementation + * + * These functions manipulate sctp tsn mapping array. + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * La Monte H.P. Yarroll + * Jon Grimm + * Karl Knutson + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include + +static void sctp_tsnmap_update(struct sctp_tsnmap *map); +static void sctp_tsnmap_find_gap_ack(__u8 *map, __u16 off, + __u16 len, __u16 base, + int *started, __u16 *start, + int *ended, __u16 *end); + +/* Initialize a block of memory as a tsnmap. */ +struct sctp_tsnmap *sctp_tsnmap_init(struct sctp_tsnmap *map, __u16 len, + __u32 initial_tsn) +{ + map->tsn_map = map->raw_map; + map->overflow_map = map->tsn_map + len; + map->len = len; + + /* Clear out a TSN ack status. 
*/ + memset(map->tsn_map, 0x00, map->len + map->len); + + /* Keep track of TSNs represented by tsn_map. */ + map->base_tsn = initial_tsn; + map->overflow_tsn = initial_tsn + map->len; + map->cumulative_tsn_ack_point = initial_tsn - 1; + map->max_tsn_seen = map->cumulative_tsn_ack_point; + map->malloced = 0; + map->num_dup_tsns = 0; + + return map; +} + +/* Test the tracking state of this TSN. + * Returns: + * 0 if the TSN has not yet been seen + * >0 if the TSN has been seen (duplicate) + * <0 if the TSN is invalid (too large to track) + */ +int sctp_tsnmap_check(const struct sctp_tsnmap *map, __u32 tsn) +{ + __s32 gap; + int dup; + + /* Calculate the index into the mapping arrays. */ + gap = tsn - map->base_tsn; + + /* Verify that we can hold this TSN. */ + if (gap >= (/* base */ map->len + /* overflow */ map->len)) { + dup = -1; + goto out; + } + + /* Honk if we've already seen this TSN. + * We have three cases: + * 1. The TSN is ancient or belongs to a previous tsn_map. + * 2. The TSN is already marked in the tsn_map. + * 3. The TSN is already marked in the tsn_map_overflow. + */ + if (gap < 0 || + (gap < map->len && map->tsn_map[gap]) || + (gap >= map->len && map->overflow_map[gap - map->len])) + dup = 1; + else + dup = 0; + +out: + return dup; +} + + +/* Mark this TSN as seen. */ +void sctp_tsnmap_mark(struct sctp_tsnmap *map, __u32 tsn) +{ + __s32 gap; + + /* Vacuously mark any TSN which precedes the map base or + * exceeds the end of the map. + */ + if (TSN_lt(tsn, map->base_tsn)) + return; + if (!TSN_lt(tsn, map->base_tsn + map->len + map->len)) + return; + + /* Bump the max. */ + if (TSN_lt(map->max_tsn_seen, tsn)) + map->max_tsn_seen = tsn; + + /* Assert: TSN is in range. */ + gap = tsn - map->base_tsn; + + /* Mark the TSN as received. */ + if (gap < map->len) + map->tsn_map[gap]++; + else + map->overflow_map[gap - map->len]++; + + /* Go fixup any internal TSN mapping variables including + * cumulative_tsn_ack_point. + */ + sctp_tsnmap_update(map); +} + + +/* Initialize a Gap Ack Block iterator from memory being provided. */ +SCTP_STATIC void sctp_tsnmap_iter_init(const struct sctp_tsnmap *map, + struct sctp_tsnmap_iter *iter) +{ + /* Only start looking one past the Cumulative TSN Ack Point. */ + iter->start = map->cumulative_tsn_ack_point + 1; +} + +/* Get the next Gap Ack Blocks. Returns 0 if there was not another block + * to get. + */ +SCTP_STATIC int sctp_tsnmap_next_gap_ack(const struct sctp_tsnmap *map, + struct sctp_tsnmap_iter *iter, + __u16 *start, __u16 *end) +{ + int started, ended; + __u16 _start, _end, offset; + + /* We haven't found a gap yet. */ + started = ended = 0; + + /* If there are no more gap acks possible, get out fast. */ + if (TSN_lte(map->max_tsn_seen, iter->start)) + return 0; + + /* Search the first mapping array. */ + if (iter->start - map->base_tsn < map->len) { + + offset = iter->start - map->base_tsn; + sctp_tsnmap_find_gap_ack(map->tsn_map, offset, map->len, 0, + &started, &_start, &ended, &_end); + } + + /* Do we need to check the overflow map? */ + if (!ended) { + /* Fix up where we'd like to start searching in the + * overflow map. + */ + if (iter->start - map->base_tsn < map->len) + offset = 0; + else + offset = iter->start - map->base_tsn - map->len; + + /* Search the overflow map. */ + sctp_tsnmap_find_gap_ack(map->overflow_map, + offset, + map->len, + map->len, + &started, &_start, + &ended, &_end); + } + + /* The Gap Ack Block happens to end at the end of the + * overflow map. 
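[Editor's aside] sctp_tsnmap_check()/sctp_tsnmap_mark() above track received TSNs in two adjacent byte arrays of map->len entries each (base map plus overflow map), indexed by gap = tsn - base_tsn; a negative or already-set gap is a duplicate, and a gap past both windows cannot be held. A tiny sketch of that two-window bookkeeping, with an 8-entry window for illustration:

/* Hedged sketch of the two-window TSN tracking: one byte per TSN,
 * split into a base half and an overflow half of MAP_LEN entries each. */
#include <stdio.h>
#include <stdint.h>

#define MAP_LEN 8	/* tiny window for illustration */

struct tsnmap {
	uint8_t  map[2 * MAP_LEN];	/* [0,len) base, [len,2len) overflow */
	uint32_t base_tsn;
};

/* <0: out of range, >0: duplicate (ancient or already seen), 0: new */
static int tsnmap_check(const struct tsnmap *m, uint32_t tsn)
{
	int32_t gap = (int32_t)(tsn - m->base_tsn);

	if (gap >= 2 * MAP_LEN)
		return -1;
	if (gap < 0 || m->map[gap])
		return 1;
	return 0;
}

static void tsnmap_mark(struct tsnmap *m, uint32_t tsn)
{
	int32_t gap = (int32_t)(tsn - m->base_tsn);

	if (gap >= 0 && gap < 2 * MAP_LEN)
		m->map[gap] = 1;
}

int main(void)
{
	struct tsnmap m = { .base_tsn = 1000 };
	uint32_t tsns[] = { 1000, 1002, 1002, 999, 1000 + 2 * MAP_LEN };

	for (unsigned i = 0; i < sizeof(tsns) / sizeof(tsns[0]); i++) {
		printf("tsn %u -> check %d\n", tsns[i], tsnmap_check(&m, tsns[i]));
		tsnmap_mark(&m, tsns[i]);
	}
	return 0;
}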
+ */ + if (started && !ended) { + ended++; + _end = map->len + map->len - 1; + } + + /* If we found a Gap Ack Block, return the start and end and + * bump the iterator forward. + */ + if (ended) { + /* Fix up the start and end based on the + * Cumulative TSN Ack offset into the map. + */ + int gap = map->cumulative_tsn_ack_point - + map->base_tsn; + + *start = _start - gap; + *end = _end - gap; + + /* Move the iterator forward. */ + iter->start = map->cumulative_tsn_ack_point + *end + 1; + } + + return ended; +} + +/* Mark this and any lower TSN as seen. */ +void sctp_tsnmap_skip(struct sctp_tsnmap *map, __u32 tsn) +{ + __s32 gap; + + /* Vacuously mark any TSN which precedes the map base or + * exceeds the end of the map. + */ + if (TSN_lt(tsn, map->base_tsn)) + return; + if (!TSN_lt(tsn, map->base_tsn + map->len + map->len)) + return; + + /* Bump the max. */ + if (TSN_lt(map->max_tsn_seen, tsn)) + map->max_tsn_seen = tsn; + + /* Assert: TSN is in range. */ + gap = tsn - map->base_tsn + 1; + + /* Mark the TSNs as received. */ + if (gap <= map->len) + memset(map->tsn_map, 0x01, gap); + else { + memset(map->tsn_map, 0x01, map->len); + memset(map->overflow_map, 0x01, (gap - map->len)); + } + + /* Go fixup any internal TSN mapping variables including + * cumulative_tsn_ack_point. + */ + sctp_tsnmap_update(map); +} + +/******************************************************************** + * 2nd Level Abstractions + ********************************************************************/ + +/* This private helper function updates the tsnmap buffers and + * the Cumulative TSN Ack Point. + */ +static void sctp_tsnmap_update(struct sctp_tsnmap *map) +{ + __u32 ctsn; + + ctsn = map->cumulative_tsn_ack_point; + do { + ctsn++; + if (ctsn == map->overflow_tsn) { + /* Now tsn_map must have been all '1's, + * so we swap the map and check the overflow table + */ + __u8 *tmp = map->tsn_map; + memset(tmp, 0, map->len); + map->tsn_map = map->overflow_map; + map->overflow_map = tmp; + + /* Update the tsn_map boundaries. */ + map->base_tsn += map->len; + map->overflow_tsn += map->len; + } + } while (map->tsn_map[ctsn - map->base_tsn]); + + map->cumulative_tsn_ack_point = ctsn - 1; /* Back up one. */ +} + +/* How many data chunks are we missing from our peer? + */ +__u16 sctp_tsnmap_pending(struct sctp_tsnmap *map) +{ + __u32 cum_tsn = map->cumulative_tsn_ack_point; + __u32 max_tsn = map->max_tsn_seen; + __u32 base_tsn = map->base_tsn; + __u16 pending_data; + __s32 gap, start, end, i; + + pending_data = max_tsn - cum_tsn; + gap = max_tsn - base_tsn; + + if (gap <= 0 || gap >= (map->len + map->len)) + goto out; + + start = ((cum_tsn >= base_tsn) ? (cum_tsn - base_tsn + 1) : 0); + end = ((gap > map->len ) ? map->len : gap + 1); + + for (i = start; i < end; i++) { + if (map->tsn_map[i]) + pending_data--; + } + + if (gap >= map->len) { + start = 0; + end = gap - map->len + 1; + for (i = start; i < end; i++) { + if (map->overflow_map[i]) + pending_data--; + } + } + +out: + return pending_data; +} + +/* This is a private helper for finding Gap Ack Blocks. It searches a + * single array for the start and end of a Gap Ack Block. + * + * The flags "started" and "ended" tell is if we found the beginning + * or (respectively) the end of a Gap Ack Block. + */ +static void sctp_tsnmap_find_gap_ack(__u8 *map, __u16 off, + __u16 len, __u16 base, + int *started, __u16 *start, + int *ended, __u16 *end) +{ + int i = off; + + /* Look through the entire array, but break out + * early if we have found the end of the Gap Ack Block. 
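[Editor's aside] sctp_tsnmap_find_gap_ack() below scans one byte map for a run of received TSNs and reports its start and end as offsets, which the iterator then rebases against the cumulative TSN ack point. A self-contained sketch that extracts every such run from a small map (the two-array split and TSN rebasing of the real code are omitted):

/* Hedged sketch of gap-ack-block extraction: report each run of 1s
 * past the cumulative ack point as a (start, end) pair of offsets. */
#include <stdio.h>
#include <stdint.h>

static int next_gap(const uint8_t *map, int len, int *pos,
		    int *start, int *end)
{
	int i = *pos;

	while (i < len && !map[i])	/* skip the hole */
		i++;
	if (i == len)
		return 0;		/* no further blocks */
	*start = i;
	while (i < len && map[i])	/* extend the run */
		i++;
	*end = i - 1;
	*pos = i;
	return 1;
}

int main(void)
{
	/* offsets relative to the cumulative TSN ack point */
	uint8_t map[] = { 0, 1, 1, 0, 0, 1, 0, 1, 1, 1 };
	int pos = 0, start, end;

	while (next_gap(map, sizeof(map), &pos, &start, &end))
		printf("gap ack block: %d-%d\n", start + 1, end + 1);
	return 0;
}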
+ */ + + /* Also, stop looking past the maximum TSN seen. */ + + /* Look for the start. */ + if (!(*started)) { + for (; i < len; i++) { + if (map[i]) { + (*started)++; + *start = base + i; + break; + } + } + } + + /* Look for the end. */ + if (*started) { + /* We have found the start, let's find the + * end. If we find the end, break out. + */ + for (; i < len; i++) { + if (!map[i]) { + (*ended)++; + *end = base + i - 1; + break; + } + } + } +} + +/* Renege that we have seen a TSN. */ +void sctp_tsnmap_renege(struct sctp_tsnmap *map, __u32 tsn) +{ + __s32 gap; + + if (TSN_lt(tsn, map->base_tsn)) + return; + if (!TSN_lt(tsn, map->base_tsn + map->len + map->len)) + return; + + /* Assert: TSN is in range. */ + gap = tsn - map->base_tsn; + + /* Pretend we never saw the TSN. */ + if (gap < map->len) + map->tsn_map[gap] = 0; + else + map->overflow_map[gap - map->len] = 0; +} + +/* How many gap ack blocks do we have recorded? */ +__u16 sctp_tsnmap_num_gabs(struct sctp_tsnmap *map) +{ + struct sctp_tsnmap_iter iter; + int gabs = 0; + + /* Refresh the gap ack information. */ + if (sctp_tsnmap_has_gap(map)) { + sctp_tsnmap_iter_init(map, &iter); + while (sctp_tsnmap_next_gap_ack(map, &iter, + &map->gabs[gabs].start, + &map->gabs[gabs].end)) { + + map->gabs[gabs].start = htons(map->gabs[gabs].start); + map->gabs[gabs].end = htons(map->gabs[gabs].end); + gabs++; + if (gabs >= SCTP_MAX_GABS) + break; + } + } + return gabs; +} diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c new file mode 100644 index 000000000000..17d0ff534735 --- /dev/null +++ b/net/sctp/ulpevent.c @@ -0,0 +1,942 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * These functions manipulate an sctp event. The struct ulpevent is used + * to carry notifications and data to the ULP (sockets). + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * La Monte H.P. Yarroll + * Ardelle Fan + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include + +static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, + struct sctp_association *asoc); +static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); + +/* Stub skb destructor. 
*/ +static void sctp_stub_rfree(struct sk_buff *skb) +{ +/* WARNING: This function is just a warning not to use the + * skb destructor. If the skb is shared, we may get the destructor + * callback on some processor that does not own the sock_lock. This + * was occuring with PACKET socket applications that were monitoring + * our skbs. We can't take the sock_lock, because we can't risk + * recursing if we do really own the sock lock. Instead, do all + * of our rwnd manipulation while we own the sock_lock outright. + */ +} + +/* Initialize an ULP event from an given skb. */ +SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) +{ + memset(event, 0, sizeof(struct sctp_ulpevent)); + event->msg_flags = msg_flags; +} + +/* Create a new sctp_ulpevent. */ +SCTP_STATIC struct sctp_ulpevent *sctp_ulpevent_new(int size, int msg_flags, + int gfp) +{ + struct sctp_ulpevent *event; + struct sk_buff *skb; + + skb = alloc_skb(size, gfp); + if (!skb) + goto fail; + + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, msg_flags); + + return event; + +fail: + return NULL; +} + +/* Is this a MSG_NOTIFICATION? */ +int sctp_ulpevent_is_notification(const struct sctp_ulpevent *event) +{ + return MSG_NOTIFICATION == (event->msg_flags & MSG_NOTIFICATION); +} + +/* Hold the association in case the msg_name needs read out of + * the association. + */ +static inline void sctp_ulpevent_set_owner(struct sctp_ulpevent *event, + const struct sctp_association *asoc) +{ + struct sk_buff *skb; + + /* Cast away the const, as we are just wanting to + * bump the reference count. + */ + sctp_association_hold((struct sctp_association *)asoc); + skb = sctp_event2skb(event); + skb->sk = asoc->base.sk; + event->asoc = (struct sctp_association *)asoc; + skb->destructor = sctp_stub_rfree; +} + +/* A simple destructor to give up the reference to the association. */ +static inline void sctp_ulpevent_release_owner(struct sctp_ulpevent *event) +{ + sctp_association_put(event->asoc); +} + +/* Create and initialize an SCTP_ASSOC_CHANGE event. + * + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * Communication notifications inform the ULP that an SCTP association + * has either begun or ended. The identifier for a new association is + * provided by this notification. + * + * Note: There is no field checking here. If a field is unused it will be + * zero'd out. + */ +struct sctp_ulpevent *sctp_ulpevent_make_assoc_change( + const struct sctp_association *asoc, + __u16 flags, __u16 state, __u16 error, __u16 outbound, + __u16 inbound, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_assoc_change *sac; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_assoc_change), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + skb = sctp_event2skb(event); + sac = (struct sctp_assoc_change *) + skb_put(skb, sizeof(struct sctp_assoc_change)); + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_type: + * It should be SCTP_ASSOC_CHANGE. + */ + sac->sac_type = SCTP_ASSOC_CHANGE; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_state: 32 bits (signed integer) + * This field holds one of a number of values that communicate the + * event that happened to the association. + */ + sac->sac_state = state; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_flags: 16 bits (unsigned integer) + * Currently unused. 
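[Editor's aside] The notifications built below (SCTP_ASSOC_CHANGE and friends) reach the application as messages flagged MSG_NOTIFICATION on recvmsg(), after the events have been enabled with the SCTP_EVENTS socket option. A sketch of the consuming side, assuming the lksctp-tools userspace header <netinet/sctp.h>; error handling is trimmed for brevity:

/* Hedged sketch of a userspace receiver for SCTP_ASSOC_CHANGE,
 * matching the layout documented in the kernel code below. */
#include <stdio.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <netinet/sctp.h>

static void handle_notification(const char *buf, size_t len)
{
	const union sctp_notification *sn = (const union sctp_notification *)buf;

	if (len >= sizeof(struct sctp_assoc_change) &&
	    sn->sn_header.sn_type == SCTP_ASSOC_CHANGE) {
		const struct sctp_assoc_change *sac = &sn->sn_assoc_change;

		printf("assoc %d: state %d error %d (%hu out / %hu in streams)\n",
		       (int)sac->sac_assoc_id, sac->sac_state, sac->sac_error,
		       sac->sac_outbound_streams, sac->sac_inbound_streams);
	}
}

int recv_one(int sd)
{
	char buf[2048];
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n = recvmsg(sd, &msg, 0);

	if (n > 0 && (msg.msg_flags & MSG_NOTIFICATION))
		handle_notification(buf, (size_t)n);
	return (int)n;
}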
+ */ + sac->sac_flags = 0; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_length: sizeof (__u32) + * This field is the total length of the notification data, including + * the notification header. + */ + sac->sac_length = sizeof(struct sctp_assoc_change); + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_error: 32 bits (signed integer) + * + * If the state was reached due to a error condition (e.g. + * COMMUNICATION_LOST) any relevant error information is available in + * this field. This corresponds to the protocol error codes defined in + * [SCTP]. + */ + sac->sac_error = error; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_outbound_streams: 16 bits (unsigned integer) + * sac_inbound_streams: 16 bits (unsigned integer) + * + * The maximum number of streams allowed in each direction are + * available in sac_outbound_streams and sac_inbound streams. + */ + sac->sac_outbound_streams = outbound; + sac->sac_inbound_streams = inbound; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * sac_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + sac->sac_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* Create and initialize an SCTP_PEER_ADDR_CHANGE event. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * When a destination address on a multi-homed peer encounters a change + * an interface details event is sent. + */ +struct sctp_ulpevent *sctp_ulpevent_make_peer_addr_change( + const struct sctp_association *asoc, + const struct sockaddr_storage *aaddr, + int flags, int state, int error, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_paddr_change *spc; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_paddr_change), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + spc = (struct sctp_paddr_change *) + skb_put(skb, sizeof(struct sctp_paddr_change)); + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_type: + * + * It should be SCTP_PEER_ADDR_CHANGE. + */ + spc->spc_type = SCTP_PEER_ADDR_CHANGE; + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_length: sizeof (__u32) + * + * This field is the total length of the notification data, including + * the notification header. + */ + spc->spc_length = sizeof(struct sctp_paddr_change); + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_flags: 16 bits (unsigned integer) + * Currently unused. + */ + spc->spc_flags = 0; + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_state: 32 bits (signed integer) + * + * This field holds one of a number of values that communicate the + * event that happened to the address. + */ + spc->spc_state = state; + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_error: 32 bits (signed integer) + * + * If the state was reached due to any error condition (e.g. + * ADDRESS_UNREACHABLE) any relevant error information is available in + * this field. 
+ */ + spc->spc_error = error; + + /* Socket Extensions for SCTP + * 5.3.1.1 SCTP_ASSOC_CHANGE + * + * spc_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + spc->spc_assoc_id = sctp_assoc2id(asoc); + + /* Sockets API Extensions for SCTP + * Section 5.3.1.2 SCTP_PEER_ADDR_CHANGE + * + * spc_aaddr: sizeof (struct sockaddr_storage) + * + * The affected address field, holds the remote peer's address that is + * encountering the change of state. + */ + memcpy(&spc->spc_aaddr, aaddr, sizeof(struct sockaddr_storage)); + + /* Map ipv4 address into v4-mapped-on-v6 address. */ + sctp_get_pf_specific(asoc->base.sk->sk_family)->addr_v4map( + sctp_sk(asoc->base.sk), + (union sctp_addr *)&spc->spc_aaddr); + + return event; + +fail: + return NULL; +} + +/* Create and initialize an SCTP_REMOTE_ERROR notification. + * + * Note: This assumes that the chunk->skb->data already points to the + * operation error payload. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * A remote peer may send an Operational Error message to its peer. + * This message indicates a variety of error conditions on an + * association. The entire error TLV as it appears on the wire is + * included in a SCTP_REMOTE_ERROR event. Please refer to the SCTP + * specification [SCTP] and any extensions for a list of possible + * error formats. + */ +struct sctp_ulpevent *sctp_ulpevent_make_remote_error( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_remote_error *sre; + struct sk_buff *skb; + sctp_errhdr_t *ch; + __u16 cause; + int elen; + + ch = (sctp_errhdr_t *)(chunk->skb->data); + cause = ch->cause; + elen = WORD_ROUND(ntohs(ch->length)) - sizeof(sctp_errhdr_t); + + /* Pull off the ERROR header. */ + skb_pull(chunk->skb, sizeof(sctp_errhdr_t)); + + /* Copy the skb to a new skb with room for us to prepend + * notification with. + */ + skb = skb_copy_expand(chunk->skb, sizeof(struct sctp_remote_error), + 0, gfp); + + /* Pull off the rest of the cause TLV from the chunk. */ + skb_pull(chunk->skb, elen); + if (!skb) + goto fail; + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION); + + sre = (struct sctp_remote_error *) + skb_push(skb, sizeof(struct sctp_remote_error)); + + /* Trim the buffer to the right length. */ + skb_trim(skb, sizeof(struct sctp_remote_error) + elen); + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_type: + * It should be SCTP_REMOTE_ERROR. + */ + sre->sre_type = SCTP_REMOTE_ERROR; + + /* + * Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_flags: 16 bits (unsigned integer) + * Currently unused. + */ + sre->sre_flags = 0; + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_length: sizeof (__u32) + * + * This field is the total length of the notification data, + * including the notification header. + */ + sre->sre_length = skb->len; + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_error: 16 bits (unsigned integer) + * This value represents one of the Operational Error causes defined in + * the SCTP specification, in network byte order. 
+ */ + sre->sre_error = cause; + + /* Socket Extensions for SCTP + * 5.3.1.3 SCTP_REMOTE_ERROR + * + * sre_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + sre->sre_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* Create and initialize a SCTP_SEND_FAILED notification. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.4 SCTP_SEND_FAILED + */ +struct sctp_ulpevent *sctp_ulpevent_make_send_failed( + const struct sctp_association *asoc, struct sctp_chunk *chunk, + __u16 flags, __u32 error, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_send_failed *ssf; + struct sk_buff *skb; + + /* Pull off any padding. */ + int len = ntohs(chunk->chunk_hdr->length); + + /* Make skb with more room so we can prepend notification. */ + skb = skb_copy_expand(chunk->skb, + sizeof(struct sctp_send_failed), /* headroom */ + 0, /* tailroom */ + gfp); + if (!skb) + goto fail; + + /* Pull off the common chunk header and DATA header. */ + skb_pull(skb, sizeof(struct sctp_data_chunk)); + len -= sizeof(struct sctp_data_chunk); + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + sctp_ulpevent_init(event, MSG_NOTIFICATION); + + ssf = (struct sctp_send_failed *) + skb_push(skb, sizeof(struct sctp_send_failed)); + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_type: + * It should be SCTP_SEND_FAILED. + */ + ssf->ssf_type = SCTP_SEND_FAILED; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_flags: 16 bits (unsigned integer) + * The flag value will take one of the following values + * + * SCTP_DATA_UNSENT - Indicates that the data was never put on + * the wire. + * + * SCTP_DATA_SENT - Indicates that the data was put on the wire. + * Note that this does not necessarily mean that the + * data was (or was not) successfully delivered. + */ + ssf->ssf_flags = flags; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_length: sizeof (__u32) + * This field is the total length of the notification data, including + * the notification header. + */ + ssf->ssf_length = sizeof(struct sctp_send_failed) + len; + skb_trim(skb, ssf->ssf_length); + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_error: 16 bits (unsigned integer) + * This value represents the reason why the send failed, and if set, + * will be a SCTP protocol error code as defined in [SCTP] section + * 3.3.10. + */ + ssf->ssf_error = error; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_info: sizeof (struct sctp_sndrcvinfo) + * The original send information associated with the undelivered + * message. + */ + memcpy(&ssf->ssf_info, &chunk->sinfo, sizeof(struct sctp_sndrcvinfo)); + + /* Per TSVWG discussion with Randy. Allow the application to + * ressemble a fragmented message. + */ + ssf->ssf_info.sinfo_flags = chunk->chunk_hdr->flags; + + /* Socket Extensions for SCTP + * 5.3.1.4 SCTP_SEND_FAILED + * + * ssf_assoc_id: sizeof (sctp_assoc_t) + * The association id field, sf_assoc_id, holds the identifier for the + * association. All notifications for a given association have the + * same association identifier. For TCP style socket, this field is + * ignored. 
+ */ + sctp_ulpevent_set_owner(event, asoc); + ssf->ssf_assoc_id = sctp_assoc2id(asoc); + return event; + +fail: + return NULL; +} + +/* Create and initialize a SCTP_SHUTDOWN_EVENT notification. + * + * Socket Extensions for SCTP - draft-01 + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + */ +struct sctp_ulpevent *sctp_ulpevent_make_shutdown_event( + const struct sctp_association *asoc, + __u16 flags, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_shutdown_event *sse; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_shutdown_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + sse = (struct sctp_shutdown_event *) + skb_put(skb, sizeof(struct sctp_shutdown_event)); + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_type + * It should be SCTP_SHUTDOWN_EVENT + */ + sse->sse_type = SCTP_SHUTDOWN_EVENT; + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_flags: 16 bits (unsigned integer) + * Currently unused. + */ + sse->sse_flags = 0; + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_length: sizeof (__u32) + * This field is the total length of the notification data, including + * the notification header. + */ + sse->sse_length = sizeof(struct sctp_shutdown_event); + + /* Socket Extensions for SCTP + * 5.3.1.5 SCTP_SHUTDOWN_EVENT + * + * sse_assoc_id: sizeof (sctp_assoc_t) + * The association id field, holds the identifier for the association. + * All notifications for a given association have the same association + * identifier. For TCP style socket, this field is ignored. + */ + sctp_ulpevent_set_owner(event, asoc); + sse->sse_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* Create and initialize a SCTP_ADAPTION_INDICATION notification. + * + * Socket Extensions for SCTP + * 5.3.1.6 SCTP_ADAPTION_INDICATION + */ +struct sctp_ulpevent *sctp_ulpevent_make_adaption_indication( + const struct sctp_association *asoc, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_adaption_event *sai; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_adaption_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + sai = (struct sctp_adaption_event *) + skb_put(skb, sizeof(struct sctp_adaption_event)); + + sai->sai_type = SCTP_ADAPTION_INDICATION; + sai->sai_flags = 0; + sai->sai_length = sizeof(struct sctp_adaption_event); + sai->sai_adaption_ind = asoc->peer.adaption_ind; + sctp_ulpevent_set_owner(event, asoc); + sai->sai_assoc_id = sctp_assoc2id(asoc); + + return event; + +fail: + return NULL; +} + +/* A message has been received. Package this message as a notification + * to pass it to the upper layers. Go ahead and calculate the sndrcvinfo + * even if filtered out later. + * + * Socket Extensions for SCTP + * 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) + */ +struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, + struct sctp_chunk *chunk, + int gfp) +{ + struct sctp_ulpevent *event = NULL; + struct sk_buff *skb; + size_t padding, len; + + /* Clone the original skb, sharing the data. */ + skb = skb_clone(chunk->skb, gfp); + if (!skb) + goto fail; + + /* First calculate the padding, so we don't inadvertently + * pass up the wrong length to the user. + * + * RFC 2960 - Section 3.2 Chunk Field Descriptions + * + * The total length of a chunk(including Type, Length and Value fields) + * MUST be a multiple of 4 bytes. 
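[Editor's aside] The padding rule quoted in sctp_ulpevent_make_rcvmsg() below is what the trim compensates for: chunk lengths are rounded up to a multiple of 4 on the wire (the kernel uses its WORD_ROUND macro for this), and the pad bytes must not be handed to the user. The rounding itself, as a standalone sketch:

/* Hedged sketch of 4-byte chunk rounding;
 * padding = rounded length - real length. */
#include <stdio.h>
#include <stddef.h>

static size_t word_round(size_t len)
{
	return (len + 3) & ~(size_t)3;
}

int main(void)
{
	for (size_t len = 16; len <= 20; len++)
		printf("chunk len %zu -> on-wire %zu (pad %zu)\n",
		       len, word_round(len), word_round(len) - len);
	return 0;
}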
If the length of the chunk is not a + * multiple of 4 bytes, the sender MUST pad the chunk with all zero + * bytes and this padding is not included in the chunk length field. + * The sender should never pad with more than 3 bytes. The receiver + * MUST ignore the padding bytes. + */ + len = ntohs(chunk->chunk_hdr->length); + padding = WORD_ROUND(len) - len; + + /* Fixup cloned skb with just this chunks data. */ + skb_trim(skb, chunk->chunk_end - padding - skb->data); + + /* Embed the event fields inside the cloned skb. */ + event = sctp_skb2event(skb); + + /* Initialize event with flags 0. */ + sctp_ulpevent_init(event, 0); + + sctp_ulpevent_receive_data(event, asoc); + + event->stream = ntohs(chunk->subh.data_hdr->stream); + event->ssn = ntohs(chunk->subh.data_hdr->ssn); + event->ppid = chunk->subh.data_hdr->ppid; + if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED) { + event->flags |= MSG_UNORDERED; + event->cumtsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); + } + event->tsn = ntohl(chunk->subh.data_hdr->tsn); + event->msg_flags |= chunk->chunk_hdr->flags; + event->iif = sctp_chunk_iif(chunk); + +fail: + return event; +} + +/* Create a partial delivery related event. + * + * 5.3.1.7 SCTP_PARTIAL_DELIVERY_EVENT + * + * When a receiver is engaged in a partial delivery of a + * message this notification will be used to indicate + * various events. + */ +struct sctp_ulpevent *sctp_ulpevent_make_pdapi( + const struct sctp_association *asoc, __u32 indication, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_pdapi_event *pd; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_pdapi_event), + MSG_NOTIFICATION, gfp); + if (!event) + goto fail; + + skb = sctp_event2skb(event); + pd = (struct sctp_pdapi_event *) + skb_put(skb, sizeof(struct sctp_pdapi_event)); + + /* pdapi_type + * It should be SCTP_PARTIAL_DELIVERY_EVENT + * + * pdapi_flags: 16 bits (unsigned integer) + * Currently unused. + */ + pd->pdapi_type = SCTP_PARTIAL_DELIVERY_EVENT; + pd->pdapi_flags = 0; + + /* pdapi_length: 32 bits (unsigned integer) + * + * This field is the total length of the notification data, including + * the notification header. It will generally be sizeof (struct + * sctp_pdapi_event). + */ + pd->pdapi_length = sizeof(struct sctp_pdapi_event); + + /* pdapi_indication: 32 bits (unsigned integer) + * + * This field holds the indication being sent to the application. + */ + pd->pdapi_indication = indication; + + /* pdapi_assoc_id: sizeof (sctp_assoc_t) + * + * The association id field, holds the identifier for the association. + */ + sctp_ulpevent_set_owner(event, asoc); + pd->pdapi_assoc_id = sctp_assoc2id(asoc); + + return event; +fail: + return NULL; +} + +/* Return the notification type, assuming this is a notification + * event. + */ +__u16 sctp_ulpevent_get_notification_type(const struct sctp_ulpevent *event) +{ + union sctp_notification *notification; + struct sk_buff *skb; + + skb = sctp_event2skb((struct sctp_ulpevent *)event); + notification = (union sctp_notification *) skb->data; + return notification->sn_header.sn_type; +} + +/* Copy out the sndrcvinfo into a msghdr. 
*/ +void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, + struct msghdr *msghdr) +{ + struct sctp_sndrcvinfo sinfo; + + if (sctp_ulpevent_is_notification(event)) + return; + + /* Sockets API Extensions for SCTP + * Section 5.2.2 SCTP Header Information Structure (SCTP_SNDRCV) + * + * sinfo_stream: 16 bits (unsigned integer) + * + * For recvmsg() the SCTP stack places the message's stream number in + * this value. + */ + sinfo.sinfo_stream = event->stream; + /* sinfo_ssn: 16 bits (unsigned integer) + * + * For recvmsg() this value contains the stream sequence number that + * the remote endpoint placed in the DATA chunk. For fragmented + * messages this is the same number for all deliveries of the message + * (if more than one recvmsg() is needed to read the message). + */ + sinfo.sinfo_ssn = event->ssn; + /* sinfo_ppid: 32 bits (unsigned integer) + * + * In recvmsg() this value is + * the same information that was passed by the upper layer in the peer + * application. Please note that byte order issues are NOT accounted + * for and this information is passed opaquely by the SCTP stack from + * one end to the other. + */ + sinfo.sinfo_ppid = event->ppid; + /* sinfo_flags: 16 bits (unsigned integer) + * + * This field may contain any of the following flags and is composed of + * a bitwise OR of these values. + * + * recvmsg() flags: + * + * MSG_UNORDERED - This flag is present when the message was sent + * non-ordered. + */ + sinfo.sinfo_flags = event->flags; + /* sinfo_tsn: 32 bit (unsigned integer) + * + * For the receiving side, this field holds a TSN that was + * assigned to one of the SCTP Data Chunks. + */ + sinfo.sinfo_tsn = event->tsn; + /* sinfo_cumtsn: 32 bit (unsigned integer) + * + * This field will hold the current cumulative TSN as + * known by the underlying SCTP layer. Note this field is + * ignored when sending and only valid for a receive + * operation when sinfo_flags are set to MSG_UNORDERED. + */ + sinfo.sinfo_cumtsn = event->cumtsn; + /* sinfo_assoc_id: sizeof (sctp_assoc_t) + * + * The association handle field, sinfo_assoc_id, holds the identifier + * for the association announced in the COMMUNICATION_UP notification. + * All notifications for a given association have the same identifier. + * Ignored for one-to-one style sockets. + */ + sinfo.sinfo_assoc_id = sctp_assoc2id(event->asoc); + + /* These fields are not used while receiving. */ + sinfo.sinfo_context = 0; + sinfo.sinfo_timetolive = 0; + + put_cmsg(msghdr, IPPROTO_SCTP, SCTP_SNDRCV, + sizeof(struct sctp_sndrcvinfo), (void *)&sinfo); +} + +/* Do accounting for bytes received and hold a reference to the association + * for each skb. + */ +static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, + struct sctp_association *asoc) +{ + struct sk_buff *skb, *frag; + + skb = sctp_event2skb(event); + /* Set the owner and charge rwnd for bytes received. */ + sctp_ulpevent_set_owner(event, asoc); + sctp_assoc_rwnd_decrease(asoc, skb_headlen(skb)); + + if (!skb->data_len) + return; + + /* Note: Not clearing the entire event struct as this is just a + * fragment of the real event. However, we still need to do rwnd + * accounting. + * In general, the skb passed from IP can have only 1 level of + * fragments. But we allow multiple levels of fragments. + */ + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + sctp_ulpevent_receive_data(sctp_skb2event(frag), asoc); + } +} + +/* Do accounting for bytes just read by user and release the references to + * the association. 
+ */ +static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) +{ + struct sk_buff *skb, *frag; + + /* Current stack structures assume that the rcv buffer is + * per socket. For UDP style sockets this is not true as + * multiple associations may be on a single UDP-style socket. + * Use the local private area of the skb to track the owning + * association. + */ + + skb = sctp_event2skb(event); + sctp_assoc_rwnd_increase(event->asoc, skb_headlen(skb)); + + if (!skb->data_len) + goto done; + + /* Don't forget the fragments. */ + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* NOTE: skb_shinfos are recursive. Although IP returns + * skb's with only 1 level of fragments, SCTP reassembly can + * increase the levels. + */ + sctp_ulpevent_release_data(sctp_skb2event(frag)); + } + +done: + sctp_ulpevent_release_owner(event); +} + +/* Free a ulpevent that has an owner. It includes releasing the reference + * to the owner, updating the rwnd in case of a DATA event and freeing the + * skb. + * See comments in sctp_stub_rfree(). + */ +void sctp_ulpevent_free(struct sctp_ulpevent *event) +{ + if (sctp_ulpevent_is_notification(event)) + sctp_ulpevent_release_owner(event); + else + sctp_ulpevent_release_data(event); + + kfree_skb(sctp_event2skb(event)); +} + +/* Purge the skb lists holding ulpevents. */ +void sctp_queue_purge_ulpevents(struct sk_buff_head *list) +{ + struct sk_buff *skb; + while ((skb = skb_dequeue(list)) != NULL) + sctp_ulpevent_free(sctp_skb2event(skb)); +} diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c new file mode 100644 index 000000000000..d5dd2cf7ac4a --- /dev/null +++ b/net/sctp/ulpqueue.c @@ -0,0 +1,864 @@ +/* SCTP kernel reference Implementation + * (C) Copyright IBM Corp. 2001, 2004 + * Copyright (c) 1999-2000 Cisco, Inc. + * Copyright (c) 1999-2001 Motorola, Inc. + * Copyright (c) 2001 Intel Corp. + * Copyright (c) 2001 Nokia, Inc. + * Copyright (c) 2001 La Monte H.P. Yarroll + * + * This abstraction carries sctp events to the ULP (sockets). + * + * The SCTP reference implementation is free software; + * you can redistribute it and/or modify it under the terms of + * the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * The SCTP reference implementation is distributed in the hope that it + * will be useful, but WITHOUT ANY WARRANTY; without even the implied + * ************************ + * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with GNU CC; see the file COPYING. If not, write to + * the Free Software Foundation, 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + * + * Please send any bug reports or fixes you make to the + * email address(es): + * lksctp developers + * + * Or submit a bug report through the following website: + * http://www.sf.net/projects/lksctp + * + * Written or modified by: + * Jon Grimm + * La Monte H.P. Yarroll + * Sridhar Samudrala + * + * Any bugs reported given to us we will try to fix... any fixes shared will + * be incorporated into the next SCTP release. + */ + +#include +#include +#include +#include +#include +#include + +/* Forward declarations for internal helpers. 
*/ +static struct sctp_ulpevent * sctp_ulpq_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *); +static struct sctp_ulpevent * sctp_ulpq_order(struct sctp_ulpq *, + struct sctp_ulpevent *); + +/* 1st Level Abstractions */ + +/* Initialize a ULP queue from a block of memory. */ +struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, + struct sctp_association *asoc) +{ + memset(ulpq, 0, sizeof(struct sctp_ulpq)); + + ulpq->asoc = asoc; + skb_queue_head_init(&ulpq->reasm); + skb_queue_head_init(&ulpq->lobby); + ulpq->pd_mode = 0; + ulpq->malloced = 0; + + return ulpq; +} + + +/* Flush the reassembly and ordering queues. */ +static void sctp_ulpq_flush(struct sctp_ulpq *ulpq) +{ + struct sk_buff *skb; + struct sctp_ulpevent *event; + + while ((skb = __skb_dequeue(&ulpq->lobby)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } + + while ((skb = __skb_dequeue(&ulpq->reasm)) != NULL) { + event = sctp_skb2event(skb); + sctp_ulpevent_free(event); + } + +} + +/* Dispose of a ulpqueue. */ +void sctp_ulpq_free(struct sctp_ulpq *ulpq) +{ + sctp_ulpq_flush(ulpq); + if (ulpq->malloced) + kfree(ulpq); +} + +/* Process an incoming DATA chunk. */ +int sctp_ulpq_tail_data(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, + int gfp) +{ + struct sk_buff_head temp; + sctp_data_chunk_t *hdr; + struct sctp_ulpevent *event; + + hdr = (sctp_data_chunk_t *) chunk->chunk_hdr; + + /* Create an event from the incoming chunk. */ + event = sctp_ulpevent_make_rcvmsg(chunk->asoc, chunk, gfp); + if (!event) + return -ENOMEM; + + /* Do reassembly if needed. */ + event = sctp_ulpq_reasm(ulpq, event); + + /* Do ordering if needed. */ + if ((event) && (event->msg_flags & MSG_EOR)){ + /* Create a temporary list to collect chunks on. */ + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + + event = sctp_ulpq_order(ulpq, event); + } + + /* Send event to the ULP. */ + if (event) + sctp_ulpq_tail_event(ulpq, event); + + return 0; +} + +/* Add a new event for propagation to the ULP. */ +/* Clear the partial delivery mode for this socket. Note: This + * assumes that no association is currently in partial delivery mode. + */ +int sctp_clear_pd(struct sock *sk) +{ + struct sctp_sock *sp = sctp_sk(sk); + + sp->pd_mode = 0; + if (!skb_queue_empty(&sp->pd_lobby)) { + struct list_head *list; + sctp_skb_list_tail(&sp->pd_lobby, &sk->sk_receive_queue); + list = (struct list_head *)&sctp_sk(sk)->pd_lobby; + INIT_LIST_HEAD(list); + return 1; + } + return 0; +} + +/* Clear the pd_mode and restart any pending messages waiting for delivery. */ +static int sctp_ulpq_clear_pd(struct sctp_ulpq *ulpq) +{ + ulpq->pd_mode = 0; + return sctp_clear_pd(ulpq->asoc->base.sk); +} + + + +int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sctp_ulpevent *event) +{ + struct sock *sk = ulpq->asoc->base.sk; + struct sk_buff_head *queue; + int clear_pd = 0; + + /* If the socket is just going to throw this away, do not + * even try to deliver it. + */ + if (sock_flag(sk, SOCK_DEAD) || (sk->sk_shutdown & RCV_SHUTDOWN)) + goto out_free; + + /* Check if the user wishes to receive this event. */ + if (!sctp_ulpevent_is_enabled(event, &sctp_sk(sk)->subscribe)) + goto out_free; + + /* If we are in partial delivery mode, post to the lobby until + * partial delivery is cleared, unless, of course _this_ is + * the association the cause of the partial delivery. 
+ */ + + if (!sctp_sk(sk)->pd_mode) { + queue = &sk->sk_receive_queue; + } else if (ulpq->pd_mode) { + if (event->msg_flags & MSG_NOTIFICATION) + queue = &sctp_sk(sk)->pd_lobby; + else { + clear_pd = event->msg_flags & MSG_EOR; + queue = &sk->sk_receive_queue; + } + } else + queue = &sctp_sk(sk)->pd_lobby; + + + /* If we are harvesting multiple skbs they will be + * collected on a list. + */ + if (sctp_event2skb(event)->list) + sctp_skb_list_tail(sctp_event2skb(event)->list, queue); + else + __skb_queue_tail(queue, sctp_event2skb(event)); + + /* Did we just complete partial delivery and need to get + * rolling again? Move pending data to the receive + * queue. + */ + if (clear_pd) + sctp_ulpq_clear_pd(ulpq); + + if (queue == &sk->sk_receive_queue) + sk->sk_data_ready(sk, 0); + return 1; + +out_free: + if (sctp_event2skb(event)->list) + sctp_queue_purge_ulpevents(sctp_event2skb(event)->list); + else + sctp_ulpevent_free(event); + return 0; +} + +/* 2nd Level Abstractions */ + +/* Helper function to store chunks that need to be reassembled. */ +static inline void sctp_ulpq_store_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *pos; + struct sctp_ulpevent *cevent; + __u32 tsn, ctsn; + + tsn = event->tsn; + + /* See if it belongs at the end. */ + pos = skb_peek_tail(&ulpq->reasm); + if (!pos) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + /* Short circuit just dropping it at the end. */ + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + if (TSN_lt(ctsn, tsn)) { + __skb_queue_tail(&ulpq->reasm, sctp_event2skb(event)); + return; + } + + /* Find the right place in this list. We store them by TSN. */ + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + if (TSN_lt(tsn, ctsn)) + break; + } + + /* Insert before pos. */ + __skb_insert(sctp_event2skb(event), pos->prev, pos, &ulpq->reasm); + +} + +/* Helper function to return an event corresponding to the reassembled + * datagram. + * This routine creates a re-assembled skb given the first and last skb's + * as stored in the reassembly queue. The skb's may be non-linear if the sctp + * payload was fragmented on the way and ip had to reassemble them. + * We add the rest of skb's to the first skb's fraglist. + */ +static struct sctp_ulpevent *sctp_make_reassembled_event(struct sk_buff *f_frag, struct sk_buff *l_frag) +{ + struct sk_buff *pos; + struct sctp_ulpevent *event; + struct sk_buff *pnext, *last; + struct sk_buff *list = skb_shinfo(f_frag)->frag_list; + + /* Store the pointer to the 2nd skb */ + if (f_frag == l_frag) + pos = NULL; + else + pos = f_frag->next; + + /* Get the last skb in the f_frag's frag_list if present. */ + for (last = list; list; last = list, list = list->next); + + /* Add the list of remaining fragments to the first fragments + * frag_list. + */ + if (last) + last->next = pos; + else + skb_shinfo(f_frag)->frag_list = pos; + + /* Remove the first fragment from the reassembly queue. */ + __skb_unlink(f_frag, f_frag->list); + while (pos) { + + pnext = pos->next; + + /* Update the len and data_len fields of the first fragment. */ + f_frag->len += pos->len; + f_frag->data_len += pos->len; + + /* Remove the fragment from the reassembly queue. */ + __skb_unlink(pos, pos->list); + + /* Break if we have reached the last fragment. 
*/ + if (pos == l_frag) + break; + pos->next = pnext; + pos = pnext; + }; + + event = sctp_skb2event(f_frag); + SCTP_INC_STATS(SCTP_MIB_REASMUSRMSGS); + + return event; +} + + +/* Helper function to check if an incoming chunk has filled up the last + * missing fragment in a SCTP datagram and return the corresponding event. + */ +static inline struct sctp_ulpevent *sctp_ulpq_retrieve_reassembled(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos; + struct sctp_ulpevent *cevent; + struct sk_buff *first_frag = NULL; + __u32 ctsn, next_tsn; + struct sctp_ulpevent *retval = NULL; + + /* Initialized to 0 just to avoid compiler warning message. Will + * never be used with this value. It is referenced only after it + * is set when we find the first fragment of a message. + */ + next_tsn = 0; + + /* The chunks are held in the reasm queue sorted by TSN. + * Walk through the queue sequentially and look for a sequence of + * fragmented chunks that complete a datagram. + * 'first_frag' and next_tsn are reset when we find a chunk which + * is the first fragment of a datagram. Once these 2 fields are set + * we expect to find the remaining middle fragments and the last + * fragment in order. If not, first_frag is reset to NULL and we + * start the next pass when we find another first fragment. + */ + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + first_frag = pos; + next_tsn = ctsn + 1; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if ((first_frag) && (ctsn == next_tsn)) + next_tsn++; + else + first_frag = NULL; + break; + + case SCTP_DATA_LAST_FRAG: + if (first_frag && (ctsn == next_tsn)) + goto found; + else + first_frag = NULL; + break; + }; + + } +done: + return retval; +found: + retval = sctp_make_reassembled_event(first_frag, pos); + if (retval) + retval->msg_flags |= MSG_EOR; + goto done; +} + +/* Retrieve the next set of fragments of a partial message. */ +static inline struct sctp_ulpevent *sctp_ulpq_retrieve_partial(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos, *last_frag, *first_frag; + struct sctp_ulpevent *cevent; + __u32 ctsn, next_tsn; + int is_last; + struct sctp_ulpevent *retval; + + /* The chunks are held in the reasm queue sorted by TSN. + * Walk through the queue sequentially and look for the first + * sequence of fragmented chunks. + */ + + if (skb_queue_empty(&ulpq->reasm)) + return NULL; + + last_frag = first_frag = NULL; + retval = NULL; + next_tsn = 0; + is_last = 0; + + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) { + first_frag = pos; + next_tsn = ctsn + 1; + last_frag = pos; + } else if (next_tsn == ctsn) + next_tsn++; + else + goto done; + break; + case SCTP_DATA_LAST_FRAG: + if (!first_frag) + first_frag = pos; + else if (ctsn != next_tsn) + goto done; + last_frag = pos; + is_last = 1; + goto done; + default: + return NULL; + }; + } + + /* We have the reassembled event. There is no need to look + * further. + */ +done: + retval = sctp_make_reassembled_event(first_frag, last_frag); + if (retval && is_last) + retval->msg_flags |= MSG_EOR; + + return retval; +} + + +/* Helper function to reassemble chunks. Hold chunks on the reasm queue that + * need reassembling. 
+ */ +static struct sctp_ulpevent *sctp_ulpq_reasm(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sctp_ulpevent *retval = NULL; + + /* Check if this is part of a fragmented message. */ + if (SCTP_DATA_NOT_FRAG == (event->msg_flags & SCTP_DATA_FRAG_MASK)) { + event->msg_flags |= MSG_EOR; + return event; + } + + sctp_ulpq_store_reasm(ulpq, event); + if (!ulpq->pd_mode) + retval = sctp_ulpq_retrieve_reassembled(ulpq); + else { + __u32 ctsn, ctsnap; + + /* Do not even bother unless this is the next tsn to + * be delivered. + */ + ctsn = event->tsn; + ctsnap = sctp_tsnmap_get_ctsn(&ulpq->asoc->peer.tsn_map); + if (TSN_lte(ctsn, ctsnap)) + retval = sctp_ulpq_retrieve_partial(ulpq); + } + + return retval; +} + +/* Retrieve the first part (sequential fragments) for partial delivery. */ +static inline struct sctp_ulpevent *sctp_ulpq_retrieve_first(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos, *last_frag, *first_frag; + struct sctp_ulpevent *cevent; + __u32 ctsn, next_tsn; + struct sctp_ulpevent *retval; + + /* The chunks are held in the reasm queue sorted by TSN. + * Walk through the queue sequentially and look for a sequence of + * fragmented chunks that start a datagram. + */ + + if (skb_queue_empty(&ulpq->reasm)) + return NULL; + + last_frag = first_frag = NULL; + retval = NULL; + next_tsn = 0; + + skb_queue_walk(&ulpq->reasm, pos) { + cevent = sctp_skb2event(pos); + ctsn = cevent->tsn; + + switch (cevent->msg_flags & SCTP_DATA_FRAG_MASK) { + case SCTP_DATA_FIRST_FRAG: + if (!first_frag) { + first_frag = pos; + next_tsn = ctsn + 1; + last_frag = pos; + } else + goto done; + break; + + case SCTP_DATA_MIDDLE_FRAG: + if (!first_frag) + return NULL; + if (ctsn == next_tsn) { + next_tsn++; + last_frag = pos; + } else + goto done; + break; + default: + return NULL; + }; + } + + /* We have the reassembled event. There is no need to look + * further. + */ +done: + retval = sctp_make_reassembled_event(first_frag, last_frag); + return retval; +} + +/* Helper function to gather skbs that have possibly become + * ordered by an an incoming chunk. + */ +static inline void sctp_ulpq_retrieve_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *pos, *tmp; + struct sctp_ulpevent *cevent; + struct sctp_stream *in; + __u16 sid, csid; + __u16 ssn, cssn; + + sid = event->stream; + ssn = event->ssn; + in = &ulpq->asoc->ssnmap->in; + + /* We are holding the chunks by stream, by SSN. */ + sctp_skb_for_each(pos, &ulpq->lobby, tmp) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + /* Have we gone too far? */ + if (csid > sid) + break; + + /* Have we not gone far enough? */ + if (csid < sid) + continue; + + if (cssn != sctp_ssn_peek(in, sid)) + break; + + /* Found it, so mark in the ssnmap. */ + sctp_ssn_next(in, sid); + + __skb_unlink(pos, pos->list); + + /* Attach all gathered skbs to the event. */ + __skb_queue_tail(sctp_event2skb(event)->list, pos); + } +} + +/* Helper function to store chunks needing ordering. 
*/ +static inline void sctp_ulpq_store_ordered(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + struct sk_buff *pos; + struct sctp_ulpevent *cevent; + __u16 sid, csid; + __u16 ssn, cssn; + + pos = skb_peek_tail(&ulpq->lobby); + if (!pos) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + sid = event->stream; + ssn = event->ssn; + + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + if (sid > csid) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + if ((sid == csid) && SSN_lt(cssn, ssn)) { + __skb_queue_tail(&ulpq->lobby, sctp_event2skb(event)); + return; + } + + /* Find the right place in this list. We store them by + * stream ID and then by SSN. + */ + skb_queue_walk(&ulpq->lobby, pos) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + if (csid > sid) + break; + if (csid == sid && SSN_lt(ssn, cssn)) + break; + } + + + /* Insert before pos. */ + __skb_insert(sctp_event2skb(event), pos->prev, pos, &ulpq->lobby); + +} + +static struct sctp_ulpevent *sctp_ulpq_order(struct sctp_ulpq *ulpq, + struct sctp_ulpevent *event) +{ + __u16 sid, ssn; + struct sctp_stream *in; + + /* Check if this message needs ordering. */ + if (SCTP_DATA_UNORDERED & event->msg_flags) + return event; + + /* Note: The stream ID must be verified before this routine. */ + sid = event->stream; + ssn = event->ssn; + in = &ulpq->asoc->ssnmap->in; + + /* Is this the expected SSN for this stream ID? */ + if (ssn != sctp_ssn_peek(in, sid)) { + /* We've received something out of order, so find where it + * needs to be placed. We order by stream and then by SSN. + */ + sctp_ulpq_store_ordered(ulpq, event); + return NULL; + } + + /* Mark that the next chunk has been found. */ + sctp_ssn_next(in, sid); + + /* Go find any other chunks that were waiting for + * ordering. + */ + sctp_ulpq_retrieve_ordered(ulpq, event); + + return event; +} + +/* Helper function to gather skbs that have possibly become + * ordered by forward tsn skipping their dependencies. + */ +static inline void sctp_ulpq_reap_ordered(struct sctp_ulpq *ulpq) +{ + struct sk_buff *pos, *tmp; + struct sctp_ulpevent *cevent; + struct sctp_ulpevent *event = NULL; + struct sctp_stream *in; + struct sk_buff_head temp; + __u16 csid, cssn; + + in = &ulpq->asoc->ssnmap->in; + + /* We are holding the chunks by stream, by SSN. */ + sctp_skb_for_each(pos, &ulpq->lobby, tmp) { + cevent = (struct sctp_ulpevent *) pos->cb; + csid = cevent->stream; + cssn = cevent->ssn; + + if (cssn != sctp_ssn_peek(in, csid)) + break; + + /* Found it, so mark in the ssnmap. */ + sctp_ssn_next(in, csid); + + __skb_unlink(pos, pos->list); + if (!event) { + /* Create a temporary list to collect chunks on. */ + event = sctp_skb2event(pos); + skb_queue_head_init(&temp); + __skb_queue_tail(&temp, sctp_event2skb(event)); + } else { + /* Attach all gathered skbs to the event. */ + __skb_queue_tail(sctp_event2skb(event)->list, pos); + } + } + + /* Send event to the ULP. */ + if (event) + sctp_ulpq_tail_event(ulpq, event); +} + +/* Skip over an SSN. */ +void sctp_ulpq_skip(struct sctp_ulpq *ulpq, __u16 sid, __u16 ssn) +{ + struct sctp_stream *in; + + /* Note: The stream ID must be verified before this routine. */ + in = &ulpq->asoc->ssnmap->in; + + /* Is this an old SSN? If so ignore. */ + if (SSN_lt(ssn, sctp_ssn_peek(in, sid))) + return; + + /* Mark that we are no longer expecting this SSN or lower. 
*/ + sctp_ssn_skip(in, sid, ssn); + + /* Go find any other chunks that were waiting for + * ordering and deliver them if needed. + */ + sctp_ulpq_reap_ordered(ulpq); + return; +} + +/* Renege 'needed' bytes from the ordering queue. */ +static __u16 sctp_ulpq_renege_order(struct sctp_ulpq *ulpq, __u16 needed) +{ + __u16 freed = 0; + __u32 tsn; + struct sk_buff *skb; + struct sctp_ulpevent *event; + struct sctp_tsnmap *tsnmap; + + tsnmap = &ulpq->asoc->peer.tsn_map; + + while ((skb = __skb_dequeue_tail(&ulpq->lobby)) != NULL) { + freed += skb_headlen(skb); + event = sctp_skb2event(skb); + tsn = event->tsn; + + sctp_ulpevent_free(event); + sctp_tsnmap_renege(tsnmap, tsn); + if (freed >= needed) + return freed; + } + + return freed; +} + +/* Renege 'needed' bytes from the reassembly queue. */ +static __u16 sctp_ulpq_renege_frags(struct sctp_ulpq *ulpq, __u16 needed) +{ + __u16 freed = 0; + __u32 tsn; + struct sk_buff *skb; + struct sctp_ulpevent *event; + struct sctp_tsnmap *tsnmap; + + tsnmap = &ulpq->asoc->peer.tsn_map; + + /* Walk backwards through the list, reneges the newest tsns. */ + while ((skb = __skb_dequeue_tail(&ulpq->reasm)) != NULL) { + freed += skb_headlen(skb); + event = sctp_skb2event(skb); + tsn = event->tsn; + + sctp_ulpevent_free(event); + sctp_tsnmap_renege(tsnmap, tsn); + if (freed >= needed) + return freed; + } + + return freed; +} + +/* Partial deliver the first message as there is pressure on rwnd. */ +void sctp_ulpq_partial_delivery(struct sctp_ulpq *ulpq, + struct sctp_chunk *chunk, int gfp) +{ + struct sctp_ulpevent *event; + struct sctp_association *asoc; + + asoc = ulpq->asoc; + + /* Are we already in partial delivery mode? */ + if (!sctp_sk(asoc->base.sk)->pd_mode) { + + /* Is partial delivery possible? */ + event = sctp_ulpq_retrieve_first(ulpq); + /* Send event to the ULP. */ + if (event) { + sctp_ulpq_tail_event(ulpq, event); + sctp_sk(asoc->base.sk)->pd_mode = 1; + ulpq->pd_mode = 1; + return; + } + } +} + +/* Renege some packets to make room for an incoming chunk. */ +void sctp_ulpq_renege(struct sctp_ulpq *ulpq, struct sctp_chunk *chunk, + int gfp) +{ + struct sctp_association *asoc; + __u16 needed, freed; + + asoc = ulpq->asoc; + + if (chunk) { + needed = ntohs(chunk->chunk_hdr->length); + needed -= sizeof(sctp_data_chunk_t); + } else + needed = SCTP_DEFAULT_MAXWINDOW; + + freed = 0; + + if (skb_queue_empty(&asoc->base.sk->sk_receive_queue)) { + freed = sctp_ulpq_renege_order(ulpq, needed); + if (freed < needed) { + freed += sctp_ulpq_renege_frags(ulpq, needed - freed); + } + } + /* If able to free enough room, accept this chunk. */ + if (chunk && (freed >= needed)) { + __u32 tsn; + tsn = ntohl(chunk->subh.data_hdr->tsn); + sctp_tsnmap_mark(&asoc->peer.tsn_map, tsn); + sctp_ulpq_tail_data(ulpq, chunk, gfp); + + sctp_ulpq_partial_delivery(ulpq, chunk, gfp); + } + + return; +} + + + +/* Notify the application if an association is aborted and in + * partial delivery mode. Send up any pending received messages. + */ +void sctp_ulpq_abort_pd(struct sctp_ulpq *ulpq, int gfp) +{ + struct sctp_ulpevent *ev = NULL; + struct sock *sk; + + if (!ulpq->pd_mode) + return; + + sk = ulpq->asoc->base.sk; + if (sctp_ulpevent_type_enabled(SCTP_PARTIAL_DELIVERY_EVENT, + &sctp_sk(sk)->subscribe)) + ev = sctp_ulpevent_make_pdapi(ulpq->asoc, + SCTP_PARTIAL_DELIVERY_ABORTED, + gfp); + if (ev) + __skb_queue_tail(&sk->sk_receive_queue, sctp_event2skb(ev)); + + /* If there is data waiting, send it up the socket now. 
*/ + if (sctp_ulpq_clear_pd(ulpq) || ev) + sk->sk_data_ready(sk, 0); +} diff --git a/net/socket.c b/net/socket.c new file mode 100644 index 000000000000..2cd44990d8d3 --- /dev/null +++ b/net/socket.c @@ -0,0 +1,2088 @@ +/* + * NET An implementation of the SOCKET network access protocol. + * + * Version: @(#)socket.c 1.1.93 18/02/95 + * + * Authors: Orest Zborowski, + * Ross Biro, + * Fred N. van Kempen, + * + * Fixes: + * Anonymous : NOTSOCK/BADF cleanup. Error fix in + * shutdown() + * Alan Cox : verify_area() fixes + * Alan Cox : Removed DDI + * Jonathan Kamens : SOCK_DGRAM reconnect bug + * Alan Cox : Moved a load of checks to the very + * top level. + * Alan Cox : Move address structures to/from user + * mode above the protocol layers. + * Rob Janssen : Allow 0 length sends. + * Alan Cox : Asynchronous I/O support (cribbed from the + * tty drivers). + * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) + * Jeff Uphoff : Made max number of sockets command-line + * configurable. + * Matti Aarnio : Made the number of sockets dynamic, + * to be allocated when needed, and mr. + * Uphoff's max is used as max to be + * allowed to allocate. + * Linus : Argh. removed all the socket allocation + * altogether: it's in the inode now. + * Alan Cox : Made sock_alloc()/sock_release() public + * for NetROM and future kernel nfsd type + * stuff. + * Alan Cox : sendmsg/recvmsg basics. + * Tom Dyas : Export net symbols. + * Marcin Dalecki : Fixed problems with CONFIG_NET="n". + * Alan Cox : Added thread locking to sys_* calls + * for sockets. May have errors at the + * moment. + * Kevin Buhr : Fixed the dumb errors in the above. + * Andi Kleen : Some small cleanups, optimizations, + * and fixed a copy_from_user() bug. + * Tigran Aivazian : sys_send(args) calls sys_sendto(args, NULL, 0) + * Tigran Aivazian : Made listen(2) backlog sanity checks + * protocol-independent + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * + * This module is effectively the top level interface to the BSD socket + * paradigm. 
+ * + * Based upon Swansea University Computer Society NET3.039 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_NET_RADIO +#include /* Note : will define WIRELESS_EXT */ +#endif /* CONFIG_NET_RADIO */ + +#include +#include + +#include + +#include +#include + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare); +static ssize_t sock_aio_read(struct kiocb *iocb, char __user *buf, + size_t size, loff_t pos); +static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *buf, + size_t size, loff_t pos); +static int sock_mmap(struct file *file, struct vm_area_struct * vma); + +static int sock_close(struct inode *inode, struct file *file); +static unsigned int sock_poll(struct file *file, + struct poll_table_struct *wait); +static long sock_ioctl(struct file *file, + unsigned int cmd, unsigned long arg); +static int sock_fasync(int fd, struct file *filp, int on); +static ssize_t sock_readv(struct file *file, const struct iovec *vector, + unsigned long count, loff_t *ppos); +static ssize_t sock_writev(struct file *file, const struct iovec *vector, + unsigned long count, loff_t *ppos); +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more); + + +/* + * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear + * in the operation structures but are done directly via the socketcall() multiplexor. + */ + +static struct file_operations socket_file_ops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .aio_read = sock_aio_read, + .aio_write = sock_aio_write, + .poll = sock_poll, + .unlocked_ioctl = sock_ioctl, + .mmap = sock_mmap, + .open = sock_no_open, /* special open code to disallow open via /proc */ + .release = sock_close, + .fasync = sock_fasync, + .readv = sock_readv, + .writev = sock_writev, + .sendpage = sock_sendpage +}; + +/* + * The protocol list. Each protocol is registered in here. + */ + +static struct net_proto_family *net_families[NPROTO]; + +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) +static atomic_t net_family_lockct = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(net_family_lock); + +/* The strategy is: modifications net_family vector are short, do not + sleep and veeery rare, but read access should be free of any exclusive + locks. + */ + +static void net_family_write_lock(void) +{ + spin_lock(&net_family_lock); + while (atomic_read(&net_family_lockct) != 0) { + spin_unlock(&net_family_lock); + + yield(); + + spin_lock(&net_family_lock); + } +} + +static __inline__ void net_family_write_unlock(void) +{ + spin_unlock(&net_family_lock); +} + +static __inline__ void net_family_read_lock(void) +{ + atomic_inc(&net_family_lockct); + spin_unlock_wait(&net_family_lock); +} + +static __inline__ void net_family_read_unlock(void) +{ + atomic_dec(&net_family_lockct); +} + +#else +#define net_family_write_lock() do { } while(0) +#define net_family_write_unlock() do { } while(0) +#define net_family_read_lock() do { } while(0) +#define net_family_read_unlock() do { } while(0) +#endif + + +/* + * Statistics counters of the socket lists + */ + +static DEFINE_PER_CPU(int, sockets_in_use) = 0; + +/* + * Support routines. Move socket addresses back and forth across the kernel/user + * divide and look after the messy bits. 
+ */ + +#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - + 16 for IP, 16 for IPX, + 24 for IPv6, + about 80 for AX.25 + must be at least one bigger than + the AF_UNIX size (see net/unix/af_unix.c + :unix_mkname()). + */ + +/** + * move_addr_to_kernel - copy a socket address into kernel space + * @uaddr: Address in user space + * @kaddr: Address in kernel space + * @ulen: Length in user space + * + * The address is copied into kernel space. If the provided address is + * too long an error code of -EINVAL is returned. If the copy gives + * invalid addresses -EFAULT is returned. On a success 0 is returned. + */ + +int move_addr_to_kernel(void __user *uaddr, int ulen, void *kaddr) +{ + if(ulen<0||ulen>MAX_SOCK_ADDR) + return -EINVAL; + if(ulen==0) + return 0; + if(copy_from_user(kaddr,uaddr,ulen)) + return -EFAULT; + return 0; +} + +/** + * move_addr_to_user - copy an address to user space + * @kaddr: kernel space address + * @klen: length of address in kernel + * @uaddr: user space address + * @ulen: pointer to user length field + * + * The value pointed to by ulen on entry is the buffer length available. + * This is overwritten with the buffer space used. -EINVAL is returned + * if an overlong buffer is specified or a negative buffer size. -EFAULT + * is returned if either the buffer or the length field are not + * accessible. + * After copying the data up to the limit the user specifies, the true + * length of the data is written over the length limit the user + * specified. Zero is returned for a success. + */ + +int move_addr_to_user(void *kaddr, int klen, void __user *uaddr, int __user *ulen) +{ + int err; + int len; + + if((err=get_user(len, ulen))) + return err; + if(len>klen) + len=klen; + if(len<0 || len> MAX_SOCK_ADDR) + return -EINVAL; + if(len) + { + if(copy_to_user(uaddr,kaddr,len)) + return -EFAULT; + } + /* + * "fromlen shall refer to the value before truncation.." 
+ * 1003.1g + */ + return __put_user(klen, ulen); +} + +#define SOCKFS_MAGIC 0x534F434B + +static kmem_cache_t * sock_inode_cachep; + +static struct inode *sock_alloc_inode(struct super_block *sb) +{ + struct socket_alloc *ei; + ei = (struct socket_alloc *)kmem_cache_alloc(sock_inode_cachep, SLAB_KERNEL); + if (!ei) + return NULL; + init_waitqueue_head(&ei->socket.wait); + + ei->socket.fasync_list = NULL; + ei->socket.state = SS_UNCONNECTED; + ei->socket.flags = 0; + ei->socket.ops = NULL; + ei->socket.sk = NULL; + ei->socket.file = NULL; + ei->socket.flags = 0; + + return &ei->vfs_inode; +} + +static void sock_destroy_inode(struct inode *inode) +{ + kmem_cache_free(sock_inode_cachep, + container_of(inode, struct socket_alloc, vfs_inode)); +} + +static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct socket_alloc *ei = (struct socket_alloc *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&ei->vfs_inode); +} + +static int init_inodecache(void) +{ + sock_inode_cachep = kmem_cache_create("sock_inode_cache", + sizeof(struct socket_alloc), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (sock_inode_cachep == NULL) + return -ENOMEM; + return 0; +} + +static struct super_operations sockfs_ops = { + .alloc_inode = sock_alloc_inode, + .destroy_inode =sock_destroy_inode, + .statfs = simple_statfs, +}; + +static struct super_block *sockfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_pseudo(fs_type, "socket:", &sockfs_ops, SOCKFS_MAGIC); +} + +static struct vfsmount *sock_mnt; + +static struct file_system_type sock_fs_type = { + .name = "sockfs", + .get_sb = sockfs_get_sb, + .kill_sb = kill_anon_super, +}; +static int sockfs_delete_dentry(struct dentry *dentry) +{ + return 1; +} +static struct dentry_operations sockfs_dentry_operations = { + .d_delete = sockfs_delete_dentry, +}; + +/* + * Obtains the first available file descriptor and sets it up for use. + * + * This function creates file structure and maps it to fd space + * of current process. On success it returns file descriptor + * and file struct implicitly stored in sock->file. + * Note that another thread may close file descriptor before we return + * from this function. We use the fact that now we do not refer + * to socket after mapping. If one day we will need it, this + * function will increment ref. count on file by 1. + * + * In any case returned fd MAY BE not valid! + * This race condition is unavoidable + * with shared fd spaces, we cannot solve it inside kernel, + * but we take care of internal coherence yet. + */ + +int sock_map_fd(struct socket *sock) +{ + int fd; + struct qstr this; + char name[32]; + + /* + * Find a file descriptor suitable for return to the user. 
+ */ + + fd = get_unused_fd(); + if (fd >= 0) { + struct file *file = get_empty_filp(); + + if (!file) { + put_unused_fd(fd); + fd = -ENFILE; + goto out; + } + + sprintf(name, "[%lu]", SOCK_INODE(sock)->i_ino); + this.name = name; + this.len = strlen(name); + this.hash = SOCK_INODE(sock)->i_ino; + + file->f_dentry = d_alloc(sock_mnt->mnt_sb->s_root, &this); + if (!file->f_dentry) { + put_filp(file); + put_unused_fd(fd); + fd = -ENOMEM; + goto out; + } + file->f_dentry->d_op = &sockfs_dentry_operations; + d_add(file->f_dentry, SOCK_INODE(sock)); + file->f_vfsmnt = mntget(sock_mnt); + file->f_mapping = file->f_dentry->d_inode->i_mapping; + + sock->file = file; + file->f_op = SOCK_INODE(sock)->i_fop = &socket_file_ops; + file->f_mode = FMODE_READ | FMODE_WRITE; + file->f_flags = O_RDWR; + file->f_pos = 0; + fd_install(fd, file); + } + +out: + return fd; +} + +/** + * sockfd_lookup - Go from a file number to its socket slot + * @fd: file handle + * @err: pointer to an error code return + * + * The file handle passed in is locked and the socket it is bound + * too is returned. If an error occurs the err pointer is overwritten + * with a negative errno code and NULL is returned. The function checks + * for both invalid handles and passing a handle which is not a socket. + * + * On a success the socket object pointer is returned. + */ + +struct socket *sockfd_lookup(int fd, int *err) +{ + struct file *file; + struct inode *inode; + struct socket *sock; + + if (!(file = fget(fd))) + { + *err = -EBADF; + return NULL; + } + + inode = file->f_dentry->d_inode; + if (!S_ISSOCK(inode->i_mode)) { + *err = -ENOTSOCK; + fput(file); + return NULL; + } + + sock = SOCKET_I(inode); + if (sock->file != file) { + printk(KERN_ERR "socki_lookup: socket file changed!\n"); + sock->file = file; + } + return sock; +} + +/** + * sock_alloc - allocate a socket + * + * Allocate a new inode and socket object. The two are bound together + * and initialised. The socket is then returned. If we are out of inodes + * NULL is returned. + */ + +static struct socket *sock_alloc(void) +{ + struct inode * inode; + struct socket * sock; + + inode = new_inode(sock_mnt->mnt_sb); + if (!inode) + return NULL; + + sock = SOCKET_I(inode); + + inode->i_mode = S_IFSOCK|S_IRWXUGO; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + + get_cpu_var(sockets_in_use)++; + put_cpu_var(sockets_in_use); + return sock; +} + +/* + * In theory you can't get an open on this inode, but /proc provides + * a back door. Remember to keep it shut otherwise you'll let the + * creepy crawlies in. + */ + +static int sock_no_open(struct inode *irrelevant, struct file *dontcare) +{ + return -ENXIO; +} + +struct file_operations bad_sock_fops = { + .owner = THIS_MODULE, + .open = sock_no_open, +}; + +/** + * sock_release - close a socket + * @sock: socket to close + * + * The socket is released from the protocol stack if it has a release + * callback, and the inode is then released if the socket is bound to + * an inode not a file. 
+ */ + +void sock_release(struct socket *sock) +{ + if (sock->ops) { + struct module *owner = sock->ops->owner; + + sock->ops->release(sock); + sock->ops = NULL; + module_put(owner); + } + + if (sock->fasync_list) + printk(KERN_ERR "sock_release: fasync list not empty!\n"); + + get_cpu_var(sockets_in_use)--; + put_cpu_var(sockets_in_use); + if (!sock->file) { + iput(SOCK_INODE(sock)); + return; + } + sock->file=NULL; +} + +static inline int __sock_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size) +{ + struct sock_iocb *si = kiocb_to_siocb(iocb); + int err; + + si->sock = sock; + si->scm = NULL; + si->msg = msg; + si->size = size; + + err = security_socket_sendmsg(sock, msg, size); + if (err) + return err; + + return sock->ops->sendmsg(iocb, sock, msg, size); +} + +int sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_sendmsg(&iocb, sock, msg, size); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +int kernel_sendmsg(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + mm_segment_t oldfs = get_fs(); + int result; + + set_fs(KERNEL_DS); + /* + * the following is safe, since for compiler definitions of kvec and + * iovec are identical, yielding the same in-core layout and alignment + */ + msg->msg_iov = (struct iovec *)vec, + msg->msg_iovlen = num; + result = sock_sendmsg(sock, msg, size); + set_fs(oldfs); + return result; +} + +static inline int __sock_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + int err; + struct sock_iocb *si = kiocb_to_siocb(iocb); + + si->sock = sock; + si->scm = NULL; + si->msg = msg; + si->size = size; + si->flags = flags; + + err = security_socket_recvmsg(sock, msg, size, flags); + if (err) + return err; + + return sock->ops->recvmsg(iocb, sock, msg, size, flags); +} + +int sock_recvmsg(struct socket *sock, struct msghdr *msg, + size_t size, int flags) +{ + struct kiocb iocb; + struct sock_iocb siocb; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + ret = __sock_recvmsg(&iocb, sock, msg, size, flags); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +int kernel_recvmsg(struct socket *sock, struct msghdr *msg, + struct kvec *vec, size_t num, + size_t size, int flags) +{ + mm_segment_t oldfs = get_fs(); + int result; + + set_fs(KERNEL_DS); + /* + * the following is safe, since for compiler definitions of kvec and + * iovec are identical, yielding the same in-core layout and alignment + */ + msg->msg_iov = (struct iovec *)vec, + msg->msg_iovlen = num; + result = sock_recvmsg(sock, msg, size, flags); + set_fs(oldfs); + return result; +} + +static void sock_aio_dtor(struct kiocb *iocb) +{ + kfree(iocb->private); +} + +/* + * Read data from a socket. ubuf is a user mode pointer. We make sure the user + * area ubuf...ubuf+size-1 is writable before asking the protocol. 
+ */ + +static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, + size_t size, loff_t pos) +{ + struct sock_iocb *x, siocb; + struct socket *sock; + int flags; + + if (pos != 0) + return -ESPIPE; + if (size==0) /* Match SYS5 behaviour */ + return 0; + + if (is_sync_kiocb(iocb)) + x = &siocb; + else { + x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); + if (!x) + return -ENOMEM; + iocb->ki_dtor = sock_aio_dtor; + } + iocb->private = x; + x->kiocb = iocb; + sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode); + + x->async_msg.msg_name = NULL; + x->async_msg.msg_namelen = 0; + x->async_msg.msg_iov = &x->async_iov; + x->async_msg.msg_iovlen = 1; + x->async_msg.msg_control = NULL; + x->async_msg.msg_controllen = 0; + x->async_iov.iov_base = ubuf; + x->async_iov.iov_len = size; + flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + + return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); +} + + +/* + * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 + * is readable by the user process. + */ + +static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, + size_t size, loff_t pos) +{ + struct sock_iocb *x, siocb; + struct socket *sock; + + if (pos != 0) + return -ESPIPE; + if(size==0) /* Match SYS5 behaviour */ + return 0; + + if (is_sync_kiocb(iocb)) + x = &siocb; + else { + x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); + if (!x) + return -ENOMEM; + iocb->ki_dtor = sock_aio_dtor; + } + iocb->private = x; + x->kiocb = iocb; + sock = SOCKET_I(iocb->ki_filp->f_dentry->d_inode); + + x->async_msg.msg_name = NULL; + x->async_msg.msg_namelen = 0; + x->async_msg.msg_iov = &x->async_iov; + x->async_msg.msg_iovlen = 1; + x->async_msg.msg_control = NULL; + x->async_msg.msg_controllen = 0; + x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (sock->type == SOCK_SEQPACKET) + x->async_msg.msg_flags |= MSG_EOR; + x->async_iov.iov_base = (void __user *)ubuf; + x->async_iov.iov_len = size; + + return __sock_sendmsg(iocb, sock, &x->async_msg, size); +} + +ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) +{ + struct socket *sock; + int flags; + + sock = SOCKET_I(file->f_dentry->d_inode); + + flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (more) + flags |= MSG_MORE; + + return sock->ops->sendpage(sock, page, offset, size, flags); +} + +static int sock_readv_writev(int type, struct inode * inode, + struct file * file, const struct iovec * iov, + long count, size_t size) +{ + struct msghdr msg; + struct socket *sock; + + sock = SOCKET_I(inode); + + msg.msg_name = NULL; + msg.msg_namelen = 0; + msg.msg_control = NULL; + msg.msg_controllen = 0; + msg.msg_iov = (struct iovec *) iov; + msg.msg_iovlen = count; + msg.msg_flags = (file->f_flags & O_NONBLOCK) ? 
MSG_DONTWAIT : 0; + + /* read() does a VERIFY_WRITE */ + if (type == VERIFY_WRITE) + return sock_recvmsg(sock, &msg, size, msg.msg_flags); + + if (sock->type == SOCK_SEQPACKET) + msg.msg_flags |= MSG_EOR; + + return sock_sendmsg(sock, &msg, size); +} + +static ssize_t sock_readv(struct file *file, const struct iovec *vector, + unsigned long count, loff_t *ppos) +{ + size_t tot_len = 0; + int i; + for (i = 0 ; i < count ; i++) + tot_len += vector[i].iov_len; + return sock_readv_writev(VERIFY_WRITE, file->f_dentry->d_inode, + file, vector, count, tot_len); +} + +static ssize_t sock_writev(struct file *file, const struct iovec *vector, + unsigned long count, loff_t *ppos) +{ + size_t tot_len = 0; + int i; + for (i = 0 ; i < count ; i++) + tot_len += vector[i].iov_len; + return sock_readv_writev(VERIFY_READ, file->f_dentry->d_inode, + file, vector, count, tot_len); +} + + +/* + * Atomic setting of ioctl hooks to avoid race + * with module unload. + */ + +static DECLARE_MUTEX(br_ioctl_mutex); +static int (*br_ioctl_hook)(unsigned int cmd, void __user *arg) = NULL; + +void brioctl_set(int (*hook)(unsigned int, void __user *)) +{ + down(&br_ioctl_mutex); + br_ioctl_hook = hook; + up(&br_ioctl_mutex); +} +EXPORT_SYMBOL(brioctl_set); + +static DECLARE_MUTEX(vlan_ioctl_mutex); +static int (*vlan_ioctl_hook)(void __user *arg); + +void vlan_ioctl_set(int (*hook)(void __user *)) +{ + down(&vlan_ioctl_mutex); + vlan_ioctl_hook = hook; + up(&vlan_ioctl_mutex); +} +EXPORT_SYMBOL(vlan_ioctl_set); + +static DECLARE_MUTEX(dlci_ioctl_mutex); +static int (*dlci_ioctl_hook)(unsigned int, void __user *); + +void dlci_ioctl_set(int (*hook)(unsigned int, void __user *)) +{ + down(&dlci_ioctl_mutex); + dlci_ioctl_hook = hook; + up(&dlci_ioctl_mutex); +} +EXPORT_SYMBOL(dlci_ioctl_set); + +/* + * With an ioctl, arg may well be a user mode pointer, but we don't know + * what to do with it - that's up to the protocol still. 
+ */ + +static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + struct socket *sock; + void __user *argp = (void __user *)arg; + int pid, err; + + sock = SOCKET_I(file->f_dentry->d_inode); + if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15)) { + err = dev_ioctl(cmd, argp); + } else +#ifdef WIRELESS_EXT + if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { + err = dev_ioctl(cmd, argp); + } else +#endif /* WIRELESS_EXT */ + switch (cmd) { + case FIOSETOWN: + case SIOCSPGRP: + err = -EFAULT; + if (get_user(pid, (int __user *)argp)) + break; + err = f_setown(sock->file, pid, 1); + break; + case FIOGETOWN: + case SIOCGPGRP: + err = put_user(sock->file->f_owner.pid, (int __user *)argp); + break; + case SIOCGIFBR: + case SIOCSIFBR: + case SIOCBRADDBR: + case SIOCBRDELBR: + err = -ENOPKG; + if (!br_ioctl_hook) + request_module("bridge"); + + down(&br_ioctl_mutex); + if (br_ioctl_hook) + err = br_ioctl_hook(cmd, argp); + up(&br_ioctl_mutex); + break; + case SIOCGIFVLAN: + case SIOCSIFVLAN: + err = -ENOPKG; + if (!vlan_ioctl_hook) + request_module("8021q"); + + down(&vlan_ioctl_mutex); + if (vlan_ioctl_hook) + err = vlan_ioctl_hook(argp); + up(&vlan_ioctl_mutex); + break; + case SIOCGIFDIVERT: + case SIOCSIFDIVERT: + /* Convert this to call through a hook */ + err = divert_ioctl(cmd, argp); + break; + case SIOCADDDLCI: + case SIOCDELDLCI: + err = -ENOPKG; + if (!dlci_ioctl_hook) + request_module("dlci"); + + if (dlci_ioctl_hook) { + down(&dlci_ioctl_mutex); + err = dlci_ioctl_hook(cmd, argp); + up(&dlci_ioctl_mutex); + } + break; + default: + err = sock->ops->ioctl(sock, cmd, arg); + break; + } + return err; +} + +int sock_create_lite(int family, int type, int protocol, struct socket **res) +{ + int err; + struct socket *sock = NULL; + + err = security_socket_create(family, type, protocol, 1); + if (err) + goto out; + + sock = sock_alloc(); + if (!sock) { + err = -ENOMEM; + goto out; + } + + security_socket_post_create(sock, family, type, protocol, 1); + sock->type = type; +out: + *res = sock; + return err; +} + +/* No kernel lock held - perfect */ +static unsigned int sock_poll(struct file *file, poll_table * wait) +{ + struct socket *sock; + + /* + * We can't return errors to poll, so it's either yes or no. + */ + sock = SOCKET_I(file->f_dentry->d_inode); + return sock->ops->poll(file, sock, wait); +} + +static int sock_mmap(struct file * file, struct vm_area_struct * vma) +{ + struct socket *sock = SOCKET_I(file->f_dentry->d_inode); + + return sock->ops->mmap(file, sock, vma); +} + +int sock_close(struct inode *inode, struct file *filp) +{ + /* + * It was possible the inode is NULL we were + * closing an unfinished socket. + */ + + if (!inode) + { + printk(KERN_DEBUG "sock_close: NULL inode\n"); + return 0; + } + sock_fasync(-1, filp, 0); + sock_release(SOCKET_I(inode)); + return 0; +} + +/* + * Update the socket async list + * + * Fasync_list locking strategy. + * + * 1. fasync_list is modified only under process context socket lock + * i.e. under semaphore. + * 2. fasync_list is used under read_lock(&sk->sk_callback_lock) + * or under socket lock. + * 3. fasync_list can be used from softirq context, so that + * modification under socket lock have to be enhanced with + * write_lock_bh(&sk->sk_callback_lock). 
+ * --ANK (990710) + */ + +static int sock_fasync(int fd, struct file *filp, int on) +{ + struct fasync_struct *fa, *fna=NULL, **prev; + struct socket *sock; + struct sock *sk; + + if (on) + { + fna=(struct fasync_struct *)kmalloc(sizeof(struct fasync_struct), GFP_KERNEL); + if(fna==NULL) + return -ENOMEM; + } + + sock = SOCKET_I(filp->f_dentry->d_inode); + + if ((sk=sock->sk) == NULL) { + kfree(fna); + return -EINVAL; + } + + lock_sock(sk); + + prev=&(sock->fasync_list); + + for (fa=*prev; fa!=NULL; prev=&fa->fa_next,fa=*prev) + if (fa->fa_file==filp) + break; + + if(on) + { + if(fa!=NULL) + { + write_lock_bh(&sk->sk_callback_lock); + fa->fa_fd=fd; + write_unlock_bh(&sk->sk_callback_lock); + + kfree(fna); + goto out; + } + fna->fa_file=filp; + fna->fa_fd=fd; + fna->magic=FASYNC_MAGIC; + fna->fa_next=sock->fasync_list; + write_lock_bh(&sk->sk_callback_lock); + sock->fasync_list=fna; + write_unlock_bh(&sk->sk_callback_lock); + } + else + { + if (fa!=NULL) + { + write_lock_bh(&sk->sk_callback_lock); + *prev=fa->fa_next; + write_unlock_bh(&sk->sk_callback_lock); + kfree(fa); + } + } + +out: + release_sock(sock->sk); + return 0; +} + +/* This function may be called only under socket lock or callback_lock */ + +int sock_wake_async(struct socket *sock, int how, int band) +{ + if (!sock || !sock->fasync_list) + return -1; + switch (how) + { + case 1: + + if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags)) + break; + goto call_kill; + case 2: + if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags)) + break; + /* fall through */ + case 0: + call_kill: + __kill_fasync(sock->fasync_list, SIGIO, band); + break; + case 3: + __kill_fasync(sock->fasync_list, SIGURG, band); + } + return 0; +} + +static int __sock_create(int family, int type, int protocol, struct socket **res, int kern) +{ + int err; + struct socket *sock; + + /* + * Check protocol is in range + */ + if (family < 0 || family >= NPROTO) + return -EAFNOSUPPORT; + if (type < 0 || type >= SOCK_MAX) + return -EINVAL; + + /* Compatibility. + + This uglymoron is moved from INET layer to here to avoid + deadlock in module load. + */ + if (family == PF_INET && type == SOCK_PACKET) { + static int warned; + if (!warned) { + warned = 1; + printk(KERN_INFO "%s uses obsolete (PF_INET,SOCK_PACKET)\n", current->comm); + } + family = PF_PACKET; + } + + err = security_socket_create(family, type, protocol, kern); + if (err) + return err; + +#if defined(CONFIG_KMOD) + /* Attempt to load a protocol module if the find failed. + * + * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user + * requested real, full-featured networking support upon configuration. + * Otherwise module support will break! + */ + if (net_families[family]==NULL) + { + request_module("net-pf-%d",family); + } +#endif + + net_family_read_lock(); + if (net_families[family] == NULL) { + err = -EAFNOSUPPORT; + goto out; + } + +/* + * Allocate the socket and allow the family to set things up. if + * the protocol is 0, the family is instructed to select an appropriate + * default. + */ + + if (!(sock = sock_alloc())) { + printk(KERN_WARNING "socket: no more sockets\n"); + err = -ENFILE; /* Not exactly a match, but its the + closest posix thing */ + goto out; + } + + sock->type = type; + + /* + * We will call the ->create function, that possibly is in a loadable + * module, so we have to bump that loadable module refcnt first. 
+ */ + err = -EAFNOSUPPORT; + if (!try_module_get(net_families[family]->owner)) + goto out_release; + + if ((err = net_families[family]->create(sock, protocol)) < 0) + goto out_module_put; + /* + * Now to bump the refcnt of the [loadable] module that owns this + * socket at sock_release time we decrement its refcnt. + */ + if (!try_module_get(sock->ops->owner)) { + sock->ops = NULL; + goto out_module_put; + } + /* + * Now that we're done with the ->create function, the [loadable] + * module can have its refcnt decremented + */ + module_put(net_families[family]->owner); + *res = sock; + security_socket_post_create(sock, family, type, protocol, kern); + +out: + net_family_read_unlock(); + return err; +out_module_put: + module_put(net_families[family]->owner); +out_release: + sock_release(sock); + goto out; +} + +int sock_create(int family, int type, int protocol, struct socket **res) +{ + return __sock_create(family, type, protocol, res, 0); +} + +int sock_create_kern(int family, int type, int protocol, struct socket **res) +{ + return __sock_create(family, type, protocol, res, 1); +} + +asmlinkage long sys_socket(int family, int type, int protocol) +{ + int retval; + struct socket *sock; + + retval = sock_create(family, type, protocol, &sock); + if (retval < 0) + goto out; + + retval = sock_map_fd(sock); + if (retval < 0) + goto out_release; + +out: + /* It may be already another descriptor 8) Not kernel problem. */ + return retval; + +out_release: + sock_release(sock); + return retval; +} + +/* + * Create a pair of connected sockets. + */ + +asmlinkage long sys_socketpair(int family, int type, int protocol, int __user *usockvec) +{ + struct socket *sock1, *sock2; + int fd1, fd2, err; + + /* + * Obtain the first socket and check if the underlying protocol + * supports the socketpair call. + */ + + err = sock_create(family, type, protocol, &sock1); + if (err < 0) + goto out; + + err = sock_create(family, type, protocol, &sock2); + if (err < 0) + goto out_release_1; + + err = sock1->ops->socketpair(sock1, sock2); + if (err < 0) + goto out_release_both; + + fd1 = fd2 = -1; + + err = sock_map_fd(sock1); + if (err < 0) + goto out_release_both; + fd1 = err; + + err = sock_map_fd(sock2); + if (err < 0) + goto out_close_1; + fd2 = err; + + /* fd1 and fd2 may be already another descriptors. + * Not kernel problem. + */ + + err = put_user(fd1, &usockvec[0]); + if (!err) + err = put_user(fd2, &usockvec[1]); + if (!err) + return 0; + + sys_close(fd2); + sys_close(fd1); + return err; + +out_close_1: + sock_release(sock2); + sys_close(fd1); + return err; + +out_release_both: + sock_release(sock2); +out_release_1: + sock_release(sock1); +out: + return err; +} + + +/* + * Bind a name to a socket. Nothing much to do here since it's + * the protocol's responsibility to handle the local address. + * + * We move the socket address to kernel space before we call + * the protocol layer (having also checked the address is ok). + */ + +asmlinkage long sys_bind(int fd, struct sockaddr __user *umyaddr, int addrlen) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int err; + + if((sock = sockfd_lookup(fd,&err))!=NULL) + { + if((err=move_addr_to_kernel(umyaddr,addrlen,address))>=0) { + err = security_socket_bind(sock, (struct sockaddr *)address, addrlen); + if (err) { + sockfd_put(sock); + return err; + } + err = sock->ops->bind(sock, (struct sockaddr *)address, addrlen); + } + sockfd_put(sock); + } + return err; +} + + +/* + * Perform a listen. 
Basically, we allow the protocol to do anything + * necessary for a listen, and if that works, we mark the socket as + * ready for listening. + */ + +int sysctl_somaxconn = SOMAXCONN; + +asmlinkage long sys_listen(int fd, int backlog) +{ + struct socket *sock; + int err; + + if ((sock = sockfd_lookup(fd, &err)) != NULL) { + if ((unsigned) backlog > sysctl_somaxconn) + backlog = sysctl_somaxconn; + + err = security_socket_listen(sock, backlog); + if (err) { + sockfd_put(sock); + return err; + } + + err=sock->ops->listen(sock, backlog); + sockfd_put(sock); + } + return err; +} + + +/* + * For accept, we attempt to create a new socket, set up the link + * with the client, wake up the client, then return the new + * connected fd. We collect the address of the connector in kernel + * space and move it to user at the very end. This is unclean because + * we open the socket then return an error. + * + * 1003.1g adds the ability to recvmsg() to query connection pending + * status to recvmsg. We need to add that support in a way thats + * clean when we restucture accept also. + */ + +asmlinkage long sys_accept(int fd, struct sockaddr __user *upeer_sockaddr, int __user *upeer_addrlen) +{ + struct socket *sock, *newsock; + int err, len; + char address[MAX_SOCK_ADDR]; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + + err = -ENFILE; + if (!(newsock = sock_alloc())) + goto out_put; + + newsock->type = sock->type; + newsock->ops = sock->ops; + + err = security_socket_accept(sock, newsock); + if (err) + goto out_release; + + /* + * We don't need try_module_get here, as the listening socket (sock) + * has the protocol module (sock->ops->owner) held. + */ + __module_get(newsock->ops->owner); + + err = sock->ops->accept(sock, newsock, sock->file->f_flags); + if (err < 0) + goto out_release; + + if (upeer_sockaddr) { + if(newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 2)<0) { + err = -ECONNABORTED; + goto out_release; + } + err = move_addr_to_user(address, len, upeer_sockaddr, upeer_addrlen); + if (err < 0) + goto out_release; + } + + /* File flags are not inherited via accept() unlike another OSes. */ + + if ((err = sock_map_fd(newsock)) < 0) + goto out_release; + + security_socket_post_accept(sock, newsock); + +out_put: + sockfd_put(sock); +out: + return err; +out_release: + sock_release(newsock); + goto out_put; +} + + +/* + * Attempt to connect to a socket with the server address. The address + * is in user space so we verify it is OK and move it to kernel space. + * + * For 1003.1g we need to add clean support for a bind to AF_UNSPEC to + * break bindings + * + * NOTE: 1003.1g draft 6.3 is broken with respect to AX.25/NetROM and + * other SEQPACKET protocols that take time to connect() as it doesn't + * include the -EINPROGRESS status for such sockets. + */ + +asmlinkage long sys_connect(int fd, struct sockaddr __user *uservaddr, int addrlen) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + err = move_addr_to_kernel(uservaddr, addrlen, address); + if (err < 0) + goto out_put; + + err = security_socket_connect(sock, (struct sockaddr *)address, addrlen); + if (err) + goto out_put; + + err = sock->ops->connect(sock, (struct sockaddr *) address, addrlen, + sock->file->f_flags); +out_put: + sockfd_put(sock); +out: + return err; +} + +/* + * Get the local address ('name') of a socket object. Move the obtained + * name to user space. 
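sys_listen() clamps the backlog to sysctl_somaxconn, and sys_accept() hands back a freshly allocated socket that inherits the listener's type and ops, so the usual server loop maps one-to-one onto the functions above. The sketch below is an illustrative user-space counterpart, not part of this patch; the port number is arbitrary and error handling is omitted.

// Illustrative user-space sketch, not part of this patch: the server-side
// sequence that ends up in sys_bind()/sys_listen()/sys_accept() above.
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
    int lfd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    addr.sin_port = htons(5555);                        // arbitrary example port

    bind(lfd, (struct sockaddr *)&addr, sizeof(addr));  // sys_bind
    listen(lfd, 128);                                   // backlog clamped to somaxconn
    for (;;) {
        int cfd = accept(lfd, NULL, NULL);              // sys_accept: a new socket
        if (cfd < 0)
            break;
        write(cfd, "hi\n", 3);
        close(cfd);
    }
    close(lfd);
    return 0;
}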
+ */ + +asmlinkage long sys_getsockname(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int len, err; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + + err = security_socket_getsockname(sock); + if (err) + goto out_put; + + err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0); + if (err) + goto out_put; + err = move_addr_to_user(address, len, usockaddr, usockaddr_len); + +out_put: + sockfd_put(sock); +out: + return err; +} + +/* + * Get the remote address ('name') of a socket object. Move the obtained + * name to user space. + */ + +asmlinkage long sys_getpeername(int fd, struct sockaddr __user *usockaddr, int __user *usockaddr_len) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int len, err; + + if ((sock = sockfd_lookup(fd, &err))!=NULL) + { + err = security_socket_getpeername(sock); + if (err) { + sockfd_put(sock); + return err; + } + + err = sock->ops->getname(sock, (struct sockaddr *)address, &len, 1); + if (!err) + err=move_addr_to_user(address,len, usockaddr, usockaddr_len); + sockfd_put(sock); + } + return err; +} + +/* + * Send a datagram to a given address. We move the address into kernel + * space and check the user space data area is readable before invoking + * the protocol. + */ + +asmlinkage long sys_sendto(int fd, void __user * buff, size_t len, unsigned flags, + struct sockaddr __user *addr, int addr_len) +{ + struct socket *sock; + char address[MAX_SOCK_ADDR]; + int err; + struct msghdr msg; + struct iovec iov; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + iov.iov_base=buff; + iov.iov_len=len; + msg.msg_name=NULL; + msg.msg_iov=&iov; + msg.msg_iovlen=1; + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_namelen=0; + if(addr) + { + err = move_addr_to_kernel(addr, addr_len, address); + if (err < 0) + goto out_put; + msg.msg_name=address; + msg.msg_namelen=addr_len; + } + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + msg.msg_flags = flags; + err = sock_sendmsg(sock, &msg, len); + +out_put: + sockfd_put(sock); +out: + return err; +} + +/* + * Send a datagram down a socket. + */ + +asmlinkage long sys_send(int fd, void __user * buff, size_t len, unsigned flags) +{ + return sys_sendto(fd, buff, len, flags, NULL, 0); +} + +/* + * Receive a frame from the socket and optionally record the address of the + * sender. We verify the buffers are writable and if needed move the + * sender address from kernel to user space. + */ + +asmlinkage long sys_recvfrom(int fd, void __user * ubuf, size_t size, unsigned flags, + struct sockaddr __user *addr, int __user *addr_len) +{ + struct socket *sock; + struct iovec iov; + struct msghdr msg; + char address[MAX_SOCK_ADDR]; + int err,err2; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + + msg.msg_control=NULL; + msg.msg_controllen=0; + msg.msg_iovlen=1; + msg.msg_iov=&iov; + iov.iov_len=size; + iov.iov_base=ubuf; + msg.msg_name=address; + msg.msg_namelen=MAX_SOCK_ADDR; + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err=sock_recvmsg(sock, &msg, size, flags); + + if(err >= 0 && addr != NULL) + { + err2=move_addr_to_user(address, msg.msg_namelen, addr, addr_len); + if(err2<0) + err=err2; + } + sockfd_put(sock); +out: + return err; +} + +/* + * Receive a datagram from a socket. 
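sys_sendto() and sys_recvfrom() do no protocol work themselves: they wrap the buffer in a single-entry iovec, fill in a struct msghdr, and hand it to sock_sendmsg()/sock_recvmsg(). The user-space sketch below (illustration only, not part of this patch; the loopback port is arbitrary) shows the same equivalence from the other side of the syscall boundary.

// Illustrative user-space sketch, not part of this patch: sendto() and an
// equivalent one-iovec sendmsg(), mirroring what sys_sendto() builds in-kernel.
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    int fd = socket(AF_INET, SOCK_DGRAM, 0);
    struct sockaddr_in dst;
    char payload[] = "ping";

    memset(&dst, 0, sizeof(dst));
    dst.sin_family = AF_INET;
    dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    dst.sin_port = htons(9999);                  // arbitrary example port

    sendto(fd, payload, sizeof(payload), 0,
           (struct sockaddr *)&dst, sizeof(dst));

    struct iovec iov = { .iov_base = payload, .iov_len = sizeof(payload) };
    struct msghdr msg = {
        .msg_name = &dst, .msg_namelen = sizeof(dst),
        .msg_iov = &iov, .msg_iovlen = 1,
    };
    sendmsg(fd, &msg, 0);                        // same datagram as the sendto() above

    close(fd);
    return 0;
}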
+ */ + +asmlinkage long sys_recv(int fd, void __user * ubuf, size_t size, unsigned flags) +{ + return sys_recvfrom(fd, ubuf, size, flags, NULL, NULL); +} + +/* + * Set a socket option. Because we don't know the option lengths we have + * to pass the user mode parameter for the protocols to sort out. + */ + +asmlinkage long sys_setsockopt(int fd, int level, int optname, char __user *optval, int optlen) +{ + int err; + struct socket *sock; + + if (optlen < 0) + return -EINVAL; + + if ((sock = sockfd_lookup(fd, &err))!=NULL) + { + err = security_socket_setsockopt(sock,level,optname); + if (err) { + sockfd_put(sock); + return err; + } + + if (level == SOL_SOCKET) + err=sock_setsockopt(sock,level,optname,optval,optlen); + else + err=sock->ops->setsockopt(sock, level, optname, optval, optlen); + sockfd_put(sock); + } + return err; +} + +/* + * Get a socket option. Because we don't know the option lengths we have + * to pass a user mode parameter for the protocols to sort out. + */ + +asmlinkage long sys_getsockopt(int fd, int level, int optname, char __user *optval, int __user *optlen) +{ + int err; + struct socket *sock; + + if ((sock = sockfd_lookup(fd, &err))!=NULL) + { + err = security_socket_getsockopt(sock, level, + optname); + if (err) { + sockfd_put(sock); + return err; + } + + if (level == SOL_SOCKET) + err=sock_getsockopt(sock,level,optname,optval,optlen); + else + err=sock->ops->getsockopt(sock, level, optname, optval, optlen); + sockfd_put(sock); + } + return err; +} + + +/* + * Shutdown a socket. + */ + +asmlinkage long sys_shutdown(int fd, int how) +{ + int err; + struct socket *sock; + + if ((sock = sockfd_lookup(fd, &err))!=NULL) + { + err = security_socket_shutdown(sock, how); + if (err) { + sockfd_put(sock); + return err; + } + + err=sock->ops->shutdown(sock, how); + sockfd_put(sock); + } + return err; +} + +/* A couple of helpful macros for getting the address of the 32/64 bit + * fields which are the same type (int / unsigned) on our platforms. + */ +#define COMPAT_MSG(msg, member) ((MSG_CMSG_COMPAT & flags) ? 
&msg##_compat->member : &msg->member) +#define COMPAT_NAMELEN(msg) COMPAT_MSG(msg, msg_namelen) +#define COMPAT_FLAGS(msg) COMPAT_MSG(msg, msg_flags) + + +/* + * BSD sendmsg interface + */ + +asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags) +{ + struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; + struct socket *sock; + char address[MAX_SOCK_ADDR]; + struct iovec iovstack[UIO_FASTIOV], *iov = iovstack; + unsigned char ctl[sizeof(struct cmsghdr) + 20]; /* 20 is size of ipv6_pktinfo */ + unsigned char *ctl_buf = ctl; + struct msghdr msg_sys; + int err, ctl_len, iov_size, total_len; + + err = -EFAULT; + if (MSG_CMSG_COMPAT & flags) { + if (get_compat_msghdr(&msg_sys, msg_compat)) + return -EFAULT; + } else if (copy_from_user(&msg_sys, msg, sizeof(struct msghdr))) + return -EFAULT; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + + /* do not move before msg_sys is valid */ + err = -EMSGSIZE; + if (msg_sys.msg_iovlen > UIO_MAXIOV) + goto out_put; + + /* Check whether to allocate the iovec area*/ + err = -ENOMEM; + iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); + if (msg_sys.msg_iovlen > UIO_FASTIOV) { + iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); + if (!iov) + goto out_put; + } + + /* This will also move the address data into kernel space */ + if (MSG_CMSG_COMPAT & flags) { + err = verify_compat_iovec(&msg_sys, iov, address, VERIFY_READ); + } else + err = verify_iovec(&msg_sys, iov, address, VERIFY_READ); + if (err < 0) + goto out_freeiov; + total_len = err; + + err = -ENOBUFS; + + if (msg_sys.msg_controllen > INT_MAX) + goto out_freeiov; + ctl_len = msg_sys.msg_controllen; + if ((MSG_CMSG_COMPAT & flags) && ctl_len) { + err = cmsghdr_from_user_compat_to_kern(&msg_sys, ctl, sizeof(ctl)); + if (err) + goto out_freeiov; + ctl_buf = msg_sys.msg_control; + } else if (ctl_len) { + if (ctl_len > sizeof(ctl)) + { + ctl_buf = sock_kmalloc(sock->sk, ctl_len, GFP_KERNEL); + if (ctl_buf == NULL) + goto out_freeiov; + } + err = -EFAULT; + /* + * Careful! Before this, msg_sys.msg_control contains a user pointer. + * Afterwards, it will be a kernel pointer. Thus the compiler-assisted + * checking falls down on this. 
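For sys_sendmsg(), control data travels through msg_control: small payloads land in the on-stack buffer sized for an ipv6_pktinfo cmsg, anything larger is sock_kmalloc()'d before the copy_from_user(). From user space that area is laid out with the CMSG_* macros; the sketch below (illustration only, not part of this patch) passes a descriptor over an AF_UNIX socketpair with SCM_RIGHTS, a classic consumer of this path.

// Illustrative user-space sketch, not part of this patch: building the
// msg_control area that sys_sendmsg() copies in, here SCM_RIGHTS over AF_UNIX.
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
    int sv[2];
    socketpair(AF_UNIX, SOCK_STREAM, 0, sv);     // sys_socketpair

    int fd_to_pass = 1;                          // pass stdout, for example
    char byte = 'x';
    struct iovec iov = { .iov_base = &byte, .iov_len = 1 };
    union {
        char buf[CMSG_SPACE(sizeof(int))];
        struct cmsghdr align;                    // forces cmsg alignment
    } ctl;
    memset(&ctl, 0, sizeof(ctl));

    struct msghdr msg = {
        .msg_iov = &iov, .msg_iovlen = 1,
        .msg_control = ctl.buf, .msg_controllen = sizeof(ctl.buf),
    };
    struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

    cmsg->cmsg_level = SOL_SOCKET;
    cmsg->cmsg_type = SCM_RIGHTS;
    cmsg->cmsg_len = CMSG_LEN(sizeof(int));
    memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

    sendmsg(sv[0], &msg, 0);                     // control data copied in-kernel
    close(sv[0]);
    close(sv[1]);
    return 0;
}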
+ */ + if (copy_from_user(ctl_buf, (void __user *) msg_sys.msg_control, ctl_len)) + goto out_freectl; + msg_sys.msg_control = ctl_buf; + } + msg_sys.msg_flags = flags; + + if (sock->file->f_flags & O_NONBLOCK) + msg_sys.msg_flags |= MSG_DONTWAIT; + err = sock_sendmsg(sock, &msg_sys, total_len); + +out_freectl: + if (ctl_buf != ctl) + sock_kfree_s(sock->sk, ctl_buf, ctl_len); +out_freeiov: + if (iov != iovstack) + sock_kfree_s(sock->sk, iov, iov_size); +out_put: + sockfd_put(sock); +out: + return err; +} + +/* + * BSD recvmsg interface + */ + +asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned int flags) +{ + struct compat_msghdr __user *msg_compat = (struct compat_msghdr __user *)msg; + struct socket *sock; + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov=iovstack; + struct msghdr msg_sys; + unsigned long cmsg_ptr; + int err, iov_size, total_len, len; + + /* kernel mode address */ + char addr[MAX_SOCK_ADDR]; + + /* user mode address pointers */ + struct sockaddr __user *uaddr; + int __user *uaddr_len; + + if (MSG_CMSG_COMPAT & flags) { + if (get_compat_msghdr(&msg_sys, msg_compat)) + return -EFAULT; + } else + if (copy_from_user(&msg_sys,msg,sizeof(struct msghdr))) + return -EFAULT; + + sock = sockfd_lookup(fd, &err); + if (!sock) + goto out; + + err = -EMSGSIZE; + if (msg_sys.msg_iovlen > UIO_MAXIOV) + goto out_put; + + /* Check whether to allocate the iovec area*/ + err = -ENOMEM; + iov_size = msg_sys.msg_iovlen * sizeof(struct iovec); + if (msg_sys.msg_iovlen > UIO_FASTIOV) { + iov = sock_kmalloc(sock->sk, iov_size, GFP_KERNEL); + if (!iov) + goto out_put; + } + + /* + * Save the user-mode address (verify_iovec will change the + * kernel msghdr to use the kernel address space) + */ + + uaddr = (void __user *) msg_sys.msg_name; + uaddr_len = COMPAT_NAMELEN(msg); + if (MSG_CMSG_COMPAT & flags) { + err = verify_compat_iovec(&msg_sys, iov, addr, VERIFY_WRITE); + } else + err = verify_iovec(&msg_sys, iov, addr, VERIFY_WRITE); + if (err < 0) + goto out_freeiov; + total_len=err; + + cmsg_ptr = (unsigned long)msg_sys.msg_control; + msg_sys.msg_flags = 0; + if (MSG_CMSG_COMPAT & flags) + msg_sys.msg_flags = MSG_CMSG_COMPAT; + + if (sock->file->f_flags & O_NONBLOCK) + flags |= MSG_DONTWAIT; + err = sock_recvmsg(sock, &msg_sys, total_len, flags); + if (err < 0) + goto out_freeiov; + len = err; + + if (uaddr != NULL) { + err = move_addr_to_user(addr, msg_sys.msg_namelen, uaddr, uaddr_len); + if (err < 0) + goto out_freeiov; + } + err = __put_user(msg_sys.msg_flags, COMPAT_FLAGS(msg)); + if (err) + goto out_freeiov; + if (MSG_CMSG_COMPAT & flags) + err = __put_user((unsigned long)msg_sys.msg_control-cmsg_ptr, + &msg_compat->msg_controllen); + else + err = __put_user((unsigned long)msg_sys.msg_control-cmsg_ptr, + &msg->msg_controllen); + if (err) + goto out_freeiov; + err = len; + +out_freeiov: + if (iov != iovstack) + sock_kfree_s(sock->sk, iov, iov_size); +out_put: + sockfd_put(sock); +out: + return err; +} + +#ifdef __ARCH_WANT_SYS_SOCKETCALL + +/* Argument list sizes for sys_socketcall */ +#define AL(x) ((x) * sizeof(unsigned long)) +static unsigned char nargs[18]={AL(0),AL(3),AL(3),AL(3),AL(2),AL(3), + AL(3),AL(3),AL(4),AL(4),AL(4),AL(6), + AL(6),AL(2),AL(5),AL(5),AL(3),AL(3)}; +#undef AL + +/* + * System call vectors. + * + * Argument checking cleaned up. Saved 20% in size. + * This function doesn't need to set the kernel lock because + * it is set by the callees. 
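The nargs[] table gives, per multiplexed call number, how many bytes of arguments sys_socketcall() must copy_from_user() before dispatching. The standalone sketch below (illustration only, not part of this patch; it assumes the <linux/net.h> numbering where SYS_SOCKET is 1 and SYS_RECVMSG is 17) spells the table out with the call names attached.

// Illustrative standalone sketch, not part of this patch: what the AL()/nargs[]
// table above encodes -- the argument-block size sys_socketcall() copies from
// user space for each multiplexed call.
#include <stdio.h>

#define AL(x) ((x) * sizeof(unsigned long))

int main(void)
{
    // Argument counts per call number, in the assumed <linux/net.h> order:
    // (unused), socket, bind, connect, listen, accept, getsockname,
    // getpeername, socketpair, send, recv, sendto, recvfrom, shutdown,
    // setsockopt, getsockopt, sendmsg, recvmsg.
    static const unsigned char argc_per_call[18] = {
        0, 3, 3, 3, 2, 3, 3, 3, 4, 4, 4, 6, 6, 2, 5, 5, 3, 3,
    };

    // e.g. SYS_SENDTO (call 11) passes six longs: fd, buf, len, flags, addr, addrlen.
    printf("sendto copies %zu bytes of arguments\n", AL(argc_per_call[11]));
    return 0;
}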
+ */ + +asmlinkage long sys_socketcall(int call, unsigned long __user *args) +{ + unsigned long a[6]; + unsigned long a0,a1; + int err; + + if(call<1||call>SYS_RECVMSG) + return -EINVAL; + + /* copy_from_user should be SMP safe. */ + if (copy_from_user(a, args, nargs[call])) + return -EFAULT; + + a0=a[0]; + a1=a[1]; + + switch(call) + { + case SYS_SOCKET: + err = sys_socket(a0,a1,a[2]); + break; + case SYS_BIND: + err = sys_bind(a0,(struct sockaddr __user *)a1, a[2]); + break; + case SYS_CONNECT: + err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]); + break; + case SYS_LISTEN: + err = sys_listen(a0,a1); + break; + case SYS_ACCEPT: + err = sys_accept(a0,(struct sockaddr __user *)a1, (int __user *)a[2]); + break; + case SYS_GETSOCKNAME: + err = sys_getsockname(a0,(struct sockaddr __user *)a1, (int __user *)a[2]); + break; + case SYS_GETPEERNAME: + err = sys_getpeername(a0, (struct sockaddr __user *)a1, (int __user *)a[2]); + break; + case SYS_SOCKETPAIR: + err = sys_socketpair(a0,a1, a[2], (int __user *)a[3]); + break; + case SYS_SEND: + err = sys_send(a0, (void __user *)a1, a[2], a[3]); + break; + case SYS_SENDTO: + err = sys_sendto(a0,(void __user *)a1, a[2], a[3], + (struct sockaddr __user *)a[4], a[5]); + break; + case SYS_RECV: + err = sys_recv(a0, (void __user *)a1, a[2], a[3]); + break; + case SYS_RECVFROM: + err = sys_recvfrom(a0, (void __user *)a1, a[2], a[3], + (struct sockaddr __user *)a[4], (int __user *)a[5]); + break; + case SYS_SHUTDOWN: + err = sys_shutdown(a0,a1); + break; + case SYS_SETSOCKOPT: + err = sys_setsockopt(a0, a1, a[2], (char __user *)a[3], a[4]); + break; + case SYS_GETSOCKOPT: + err = sys_getsockopt(a0, a1, a[2], (char __user *)a[3], (int __user *)a[4]); + break; + case SYS_SENDMSG: + err = sys_sendmsg(a0, (struct msghdr __user *) a1, a[2]); + break; + case SYS_RECVMSG: + err = sys_recvmsg(a0, (struct msghdr __user *) a1, a[2]); + break; + default: + err = -EINVAL; + break; + } + return err; +} + +#endif /* __ARCH_WANT_SYS_SOCKETCALL */ + +/* + * This function is called by a protocol handler that wants to + * advertise its address family, and have it linked into the + * SOCKET module. + */ + +int sock_register(struct net_proto_family *ops) +{ + int err; + + if (ops->family >= NPROTO) { + printk(KERN_CRIT "protocol %d >= NPROTO(%d)\n", ops->family, NPROTO); + return -ENOBUFS; + } + net_family_write_lock(); + err = -EEXIST; + if (net_families[ops->family] == NULL) { + net_families[ops->family]=ops; + err = 0; + } + net_family_write_unlock(); + printk(KERN_INFO "NET: Registered protocol family %d\n", + ops->family); + return err; +} + +/* + * This function is called by a protocol handler that wants to + * remove its address family, and have it unlinked from the + * SOCKET module. + */ + +int sock_unregister(int family) +{ + if (family < 0 || family >= NPROTO) + return -1; + + net_family_write_lock(); + net_families[family]=NULL; + net_family_write_unlock(); + printk(KERN_INFO "NET: Unregistered protocol family %d\n", + family); + return 0; +} + + +extern void sk_init(void); + +void __init sock_init(void) +{ + /* + * Initialize sock SLAB cache. + */ + + sk_init(); + +#ifdef SLAB_SKB + /* + * Initialize skbuff SLAB cache + */ + skb_init(); +#endif + + /* + * Initialize the protocols module. + */ + + init_inodecache(); + register_filesystem(&sock_fs_type); + sock_mnt = kern_mount(&sock_fs_type); + /* The real protocol initialization is performed when + * do_initcalls is run. 
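sock_register() simply claims a slot in net_families[] under the family write lock, and __sock_create() later pins the owning module before calling ->create(). The fragment below is a hedged in-kernel sketch of how a protocol family plugs in; PF_EXAMPLE, example_ops, example_create and the slot number are invented for illustration and are not part of this patch.

// Hedged in-kernel sketch, not part of this patch: a hypothetical address
// family claiming a net_families[] slot via sock_register().
#include <linux/module.h>
#include <linux/net.h>

#define PF_EXAMPLE 27                    // hypothetical: an unused slot < NPROTO

static struct proto_ops example_ops;     // would be filled with the family's handlers

static int example_create(struct socket *sock, int protocol)
{
    sock->ops = &example_ops;            // __sock_create() pins example_ops.owner next
    return 0;                            // a real family would also allocate a struct sock
}

static struct net_proto_family example_family = {
    .family = PF_EXAMPLE,
    .create = example_create,
    .owner  = THIS_MODULE,
};

static int __init example_init(void)
{
    return sock_register(&example_family);   // -EEXIST if the slot is already taken
}

static void __exit example_exit(void)
{
    sock_unregister(PF_EXAMPLE);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");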
+ */ + +#ifdef CONFIG_NETFILTER + netfilter_init(); +#endif +} + +#ifdef CONFIG_PROC_FS +void socket_seq_show(struct seq_file *seq) +{ + int cpu; + int counter = 0; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + counter += per_cpu(sockets_in_use, cpu); + + /* It can be negative, by the way. 8) */ + if (counter < 0) + counter = 0; + + seq_printf(seq, "sockets: used %d\n", counter); +} +#endif /* CONFIG_PROC_FS */ + +/* ABI emulation layers need these two */ +EXPORT_SYMBOL(move_addr_to_kernel); +EXPORT_SYMBOL(move_addr_to_user); +EXPORT_SYMBOL(sock_create); +EXPORT_SYMBOL(sock_create_kern); +EXPORT_SYMBOL(sock_create_lite); +EXPORT_SYMBOL(sock_map_fd); +EXPORT_SYMBOL(sock_recvmsg); +EXPORT_SYMBOL(sock_register); +EXPORT_SYMBOL(sock_release); +EXPORT_SYMBOL(sock_sendmsg); +EXPORT_SYMBOL(sock_unregister); +EXPORT_SYMBOL(sock_wake_async); +EXPORT_SYMBOL(sockfd_lookup); +EXPORT_SYMBOL(kernel_sendmsg); +EXPORT_SYMBOL(kernel_recvmsg); diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile new file mode 100644 index 000000000000..46a2ce00a29b --- /dev/null +++ b/net/sunrpc/Makefile @@ -0,0 +1,15 @@ +# +# Makefile for Linux kernel SUN RPC +# + + +obj-$(CONFIG_SUNRPC) += sunrpc.o +obj-$(CONFIG_SUNRPC_GSS) += auth_gss/ + +sunrpc-y := clnt.o xprt.o sched.o \ + auth.o auth_null.o auth_unix.o \ + svc.o svcsock.o svcauth.o svcauth_unix.o \ + pmap_clnt.o timer.o xdr.o \ + sunrpc_syms.o cache.o rpc_pipe.o +sunrpc-$(CONFIG_PROC_FS) += stats.o +sunrpc-$(CONFIG_SYSCTL) += sysctl.o diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c new file mode 100644 index 000000000000..9bcec9b927b9 --- /dev/null +++ b/net/sunrpc/auth.c @@ -0,0 +1,395 @@ +/* + * linux/net/sunrpc/auth.c + * + * Generic RPC client authentication API. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_authops * auth_flavors[RPC_AUTH_MAXFLAVOR] = { + &authnull_ops, /* AUTH_NULL */ + &authunix_ops, /* AUTH_UNIX */ + NULL, /* others can be loadable modules */ +}; + +static u32 +pseudoflavor_to_flavor(u32 flavor) { + if (flavor >= RPC_AUTH_MAXFLAVOR) + return RPC_AUTH_GSS; + return flavor; +} + +int +rpcauth_register(struct rpc_authops *ops) +{ + rpc_authflavor_t flavor; + + if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + if (auth_flavors[flavor] != NULL) + return -EPERM; /* what else? */ + auth_flavors[flavor] = ops; + return 0; +} + +int +rpcauth_unregister(struct rpc_authops *ops) +{ + rpc_authflavor_t flavor; + + if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) + return -EINVAL; + if (auth_flavors[flavor] != ops) + return -EPERM; /* what else? 
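pseudoflavor_to_flavor() is the hinge between RPC security "pseudoflavors" and the small auth_flavors[] table: anything outside the real-flavor range is treated as an RPCSEC_GSS pseudoflavor and routed to the GSS ops, which later recover the mechanism and service from it. The standalone sketch below restates that mapping; the numeric constants are copied from the sunrpc headers of this era and should be read as assumptions, not as part of this patch.

// Illustrative standalone sketch, not part of this patch: the pseudoflavor
// mapping above, with constants assumed from the era's sunrpc headers.
#include <stdio.h>

enum {
    RPC_AUTH_UNIX      = 1,
    RPC_AUTH_GSS       = 6,
    RPC_AUTH_MAXFLAVOR = 8,
    RPC_AUTH_GSS_KRB5  = 390003,   // a GSS pseudoflavor
};

static unsigned int pseudoflavor_to_flavor(unsigned int flavor)
{
    // Anything outside the small "real flavor" range is a GSS pseudoflavor,
    // so it is routed to the RPC_AUTH_GSS ops.
    if (flavor >= RPC_AUTH_MAXFLAVOR)
        return RPC_AUTH_GSS;
    return flavor;
}

int main(void)
{
    printf("%u -> %u\n", RPC_AUTH_UNIX, pseudoflavor_to_flavor(RPC_AUTH_UNIX));         // 1 -> 1
    printf("%u -> %u\n", RPC_AUTH_GSS_KRB5, pseudoflavor_to_flavor(RPC_AUTH_GSS_KRB5)); // 390003 -> 6
    return 0;
}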
*/ + auth_flavors[flavor] = NULL; + return 0; +} + +struct rpc_auth * +rpcauth_create(rpc_authflavor_t pseudoflavor, struct rpc_clnt *clnt) +{ + struct rpc_auth *auth; + struct rpc_authops *ops; + u32 flavor = pseudoflavor_to_flavor(pseudoflavor); + + if (flavor >= RPC_AUTH_MAXFLAVOR || !(ops = auth_flavors[flavor])) + return NULL; + auth = ops->create(clnt, pseudoflavor); + if (!auth) + return NULL; + if (clnt->cl_auth) + rpcauth_destroy(clnt->cl_auth); + clnt->cl_auth = auth; + return auth; +} + +void +rpcauth_destroy(struct rpc_auth *auth) +{ + if (!atomic_dec_and_test(&auth->au_count)) + return; + auth->au_ops->destroy(auth); +} + +static DEFINE_SPINLOCK(rpc_credcache_lock); + +/* + * Initialize RPC credential cache + */ +int +rpcauth_init_credcache(struct rpc_auth *auth, unsigned long expire) +{ + struct rpc_cred_cache *new; + int i; + + new = (struct rpc_cred_cache *)kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + return -ENOMEM; + for (i = 0; i < RPC_CREDCACHE_NR; i++) + INIT_HLIST_HEAD(&new->hashtable[i]); + new->expire = expire; + new->nextgc = jiffies + (expire >> 1); + auth->au_credcache = new; + return 0; +} + +/* + * Destroy a list of credentials + */ +static inline +void rpcauth_destroy_credlist(struct hlist_head *head) +{ + struct rpc_cred *cred; + + while (!hlist_empty(head)) { + cred = hlist_entry(head->first, struct rpc_cred, cr_hash); + hlist_del_init(&cred->cr_hash); + put_rpccred(cred); + } +} + +/* + * Clear the RPC credential cache, and delete those credentials + * that are not referenced. + */ +void +rpcauth_free_credcache(struct rpc_auth *auth) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + HLIST_HEAD(free); + struct hlist_node *pos, *next; + struct rpc_cred *cred; + int i; + + spin_lock(&rpc_credcache_lock); + for (i = 0; i < RPC_CREDCACHE_NR; i++) { + hlist_for_each_safe(pos, next, &cache->hashtable[i]) { + cred = hlist_entry(pos, struct rpc_cred, cr_hash); + __hlist_del(&cred->cr_hash); + hlist_add_head(&cred->cr_hash, &free); + } + } + spin_unlock(&rpc_credcache_lock); + rpcauth_destroy_credlist(&free); +} + +static void +rpcauth_prune_expired(struct rpc_auth *auth, struct rpc_cred *cred, struct hlist_head *free) +{ + if (atomic_read(&cred->cr_count) != 1) + return; + if (time_after(jiffies, cred->cr_expire + auth->au_credcache->expire)) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (!(cred->cr_flags & RPCAUTH_CRED_UPTODATE)) { + __hlist_del(&cred->cr_hash); + hlist_add_head(&cred->cr_hash, free); + } +} + +/* + * Remove stale credentials. Avoid sleeping inside the loop. 
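The credential cache is a small hash table of hlist heads indexed by uid, and both the explicit free and the periodic GC follow the same discipline: unhash victims onto a private list while holding rpc_credcache_lock, then release them only after the lock is dropped, since put_rpccred() may end up sleeping. The standalone sketch below (illustration only, not part of this patch) shows that pattern with ordinary user-space types.

// Illustrative standalone sketch, not part of this patch: the
// "collect victims under the lock, release them after dropping it" pattern
// used by rpcauth_free_credcache()/rpcauth_gc_credcache(), with a plain
// singly linked list and a pthread mutex standing in for hlist + spinlock.
#include <pthread.h>
#include <stdlib.h>

struct cred {
    struct cred *next;
    int refcount;          // 1 == only the cache holds it
    int expired;
};

static struct cred *cache;
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;

static void cred_release(struct cred *c)
{
    free(c);               // stand-in for put_rpccred(); may sleep in-kernel
}

static void gc_credcache(void)
{
    struct cred *victims = NULL, **pp, *c;

    pthread_mutex_lock(&cache_lock);
    for (pp = &cache; (c = *pp) != NULL; ) {
        if (c->refcount == 1 && c->expired) {
            *pp = c->next;           // unhash under the lock...
            c->next = victims;
            victims = c;
        } else {
            pp = &c->next;
        }
    }
    pthread_mutex_unlock(&cache_lock);

    while (victims) {                // ...but free outside it
        c = victims;
        victims = c->next;
        cred_release(c);
    }
}

int main(void)
{
    for (int i = 0; i < 3; i++) {
        struct cred *c = calloc(1, sizeof(*c));
        c->refcount = 1;
        c->expired = (i != 0);       // two stale entries, one live
        c->next = cache;
        cache = c;
    }
    gc_credcache();                  // leaves exactly one entry behind
    return 0;
}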
+ */ +static void +rpcauth_gc_credcache(struct rpc_auth *auth, struct hlist_head *free) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + struct hlist_node *pos, *next; + struct rpc_cred *cred; + int i; + + dprintk("RPC: gc'ing RPC credentials for auth %p\n", auth); + for (i = 0; i < RPC_CREDCACHE_NR; i++) { + hlist_for_each_safe(pos, next, &cache->hashtable[i]) { + cred = hlist_entry(pos, struct rpc_cred, cr_hash); + rpcauth_prune_expired(auth, cred, free); + } + } + cache->nextgc = jiffies + cache->expire; +} + +/* + * Look up a process' credentials in the authentication cache + */ +struct rpc_cred * +rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, + int taskflags) +{ + struct rpc_cred_cache *cache = auth->au_credcache; + HLIST_HEAD(free); + struct hlist_node *pos, *next; + struct rpc_cred *new = NULL, + *cred = NULL; + int nr = 0; + + if (!(taskflags & RPC_TASK_ROOTCREDS)) + nr = acred->uid & RPC_CREDCACHE_MASK; +retry: + spin_lock(&rpc_credcache_lock); + if (time_before(cache->nextgc, jiffies)) + rpcauth_gc_credcache(auth, &free); + hlist_for_each_safe(pos, next, &cache->hashtable[nr]) { + struct rpc_cred *entry; + entry = hlist_entry(pos, struct rpc_cred, cr_hash); + if (entry->cr_ops->crmatch(acred, entry, taskflags)) { + hlist_del(&entry->cr_hash); + cred = entry; + break; + } + rpcauth_prune_expired(auth, entry, &free); + } + if (new) { + if (cred) + hlist_add_head(&new->cr_hash, &free); + else + cred = new; + } + if (cred) { + hlist_add_head(&cred->cr_hash, &cache->hashtable[nr]); + get_rpccred(cred); + } + spin_unlock(&rpc_credcache_lock); + + rpcauth_destroy_credlist(&free); + + if (!cred) { + new = auth->au_ops->crcreate(auth, acred, taskflags); + if (!IS_ERR(new)) { +#ifdef RPC_DEBUG + new->cr_magic = RPCAUTH_CRED_MAGIC; +#endif + goto retry; + } else + cred = new; + } + + return (struct rpc_cred *) cred; +} + +struct rpc_cred * +rpcauth_lookupcred(struct rpc_auth *auth, int taskflags) +{ + struct auth_cred acred = { + .uid = current->fsuid, + .gid = current->fsgid, + .group_info = current->group_info, + }; + struct rpc_cred *ret; + + dprintk("RPC: looking up %s cred\n", + auth->au_ops->au_name); + get_group_info(acred.group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, taskflags); + put_group_info(acred.group_info); + return ret; +} + +struct rpc_cred * +rpcauth_bindcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_auth; + struct auth_cred acred = { + .uid = current->fsuid, + .gid = current->fsgid, + .group_info = current->group_info, + }; + struct rpc_cred *ret; + + dprintk("RPC: %4d looking up %s cred\n", + task->tk_pid, task->tk_auth->au_ops->au_name); + get_group_info(acred.group_info); + ret = auth->au_ops->lookup_cred(auth, &acred, task->tk_flags); + if (!IS_ERR(ret)) + task->tk_msg.rpc_cred = ret; + else + task->tk_status = PTR_ERR(ret); + put_group_info(acred.group_info); + return ret; +} + +void +rpcauth_holdcred(struct rpc_task *task) +{ + dprintk("RPC: %4d holding %s cred %p\n", + task->tk_pid, task->tk_auth->au_ops->au_name, task->tk_msg.rpc_cred); + if (task->tk_msg.rpc_cred) + get_rpccred(task->tk_msg.rpc_cred); +} + +void +put_rpccred(struct rpc_cred *cred) +{ + cred->cr_expire = jiffies; + if (!atomic_dec_and_test(&cred->cr_count)) + return; + cred->cr_ops->crdestroy(cred); +} + +void +rpcauth_unbindcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d releasing %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, 
cred); + + put_rpccred(cred); + task->tk_msg.rpc_cred = NULL; +} + +u32 * +rpcauth_marshcred(struct rpc_task *task, u32 *p) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d marshaling %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + return cred->cr_ops->crmarshal(task, p); +} + +u32 * +rpcauth_checkverf(struct rpc_task *task, u32 *p) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d validating %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + return cred->cr_ops->crvalidate(task, p); +} + +int +rpcauth_wrap_req(struct rpc_task *task, kxdrproc_t encode, void *rqstp, + u32 *data, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d using %s cred %p to wrap rpc data\n", + task->tk_pid, cred->cr_ops->cr_name, cred); + if (cred->cr_ops->crwrap_req) + return cred->cr_ops->crwrap_req(task, encode, rqstp, data, obj); + /* By default, we encode the arguments normally. */ + return encode(rqstp, data, obj); +} + +int +rpcauth_unwrap_resp(struct rpc_task *task, kxdrproc_t decode, void *rqstp, + u32 *data, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + + dprintk("RPC: %4d using %s cred %p to unwrap rpc data\n", + task->tk_pid, cred->cr_ops->cr_name, cred); + if (cred->cr_ops->crunwrap_resp) + return cred->cr_ops->crunwrap_resp(task, decode, rqstp, + data, obj); + /* By default, we decode the arguments normally. */ + return decode(rqstp, data, obj); +} + +int +rpcauth_refreshcred(struct rpc_task *task) +{ + struct rpc_auth *auth = task->tk_auth; + struct rpc_cred *cred = task->tk_msg.rpc_cred; + int err; + + dprintk("RPC: %4d refreshing %s cred %p\n", + task->tk_pid, auth->au_ops->au_name, cred); + err = cred->cr_ops->crrefresh(task); + if (err < 0) + task->tk_status = err; + return err; +} + +void +rpcauth_invalcred(struct rpc_task *task) +{ + dprintk("RPC: %4d invalidating %s cred %p\n", + task->tk_pid, task->tk_auth->au_ops->au_name, task->tk_msg.rpc_cred); + spin_lock(&rpc_credcache_lock); + if (task->tk_msg.rpc_cred) + task->tk_msg.rpc_cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + spin_unlock(&rpc_credcache_lock); +} + +int +rpcauth_uptodatecred(struct rpc_task *task) +{ + return !(task->tk_msg.rpc_cred) || + (task->tk_msg.rpc_cred->cr_flags & RPCAUTH_CRED_UPTODATE); +} diff --git a/net/sunrpc/auth_gss/Makefile b/net/sunrpc/auth_gss/Makefile new file mode 100644 index 000000000000..fe1b874084bc --- /dev/null +++ b/net/sunrpc/auth_gss/Makefile @@ -0,0 +1,18 @@ +# +# Makefile for Linux kernel rpcsec_gss implementation +# + +obj-$(CONFIG_SUNRPC_GSS) += auth_rpcgss.o + +auth_rpcgss-objs := auth_gss.o gss_generic_token.o \ + gss_mech_switch.o svcauth_gss.o gss_krb5_crypto.o + +obj-$(CONFIG_RPCSEC_GSS_KRB5) += rpcsec_gss_krb5.o + +rpcsec_gss_krb5-objs := gss_krb5_mech.o gss_krb5_seal.o gss_krb5_unseal.o \ + gss_krb5_seqnum.o + +obj-$(CONFIG_RPCSEC_GSS_SPKM3) += rpcsec_gss_spkm3.o + +rpcsec_gss_spkm3-objs := gss_spkm3_mech.o gss_spkm3_seal.o gss_spkm3_unseal.o \ + gss_spkm3_token.o diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c new file mode 100644 index 000000000000..a33b627cbef4 --- /dev/null +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -0,0 +1,1152 @@ +/* + * linux/net/sunrpc/auth_gss.c + * + * RPCSEC_GSS client authentication. + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Dug Song + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id$ + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct rpc_authops authgss_ops; + +static struct rpc_credops gss_credops; + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +#define NFS_NGROUPS 16 + +#define GSS_CRED_EXPIRE (60 * HZ) /* XXX: reasonable? */ +#define GSS_CRED_SLACK 1024 /* XXX: unused */ +/* length of a krb5 verifier (48), plus data added before arguments when + * using integrity (two 4-byte integers): */ +#define GSS_VERF_SLACK 56 + +/* XXX this define must match the gssd define +* as it is passed to gssd to signal the use of +* machine creds should be part of the shared rpc interface */ + +#define CA_RUN_AS_MACHINE 0x00000200 + +/* dump the buffer in `emacs-hexl' style */ +#define isprint(c) ((c > 0x1f) && (c < 0x7f)) + +static DEFINE_RWLOCK(gss_ctx_lock); + +struct gss_auth { + struct rpc_auth rpc_auth; + struct gss_api_mech *mech; + enum rpc_gss_svc service; + struct list_head upcalls; + struct rpc_clnt *client; + struct dentry *dentry; + char path[48]; + spinlock_t lock; +}; + +static void gss_destroy_ctx(struct gss_cl_ctx *); +static struct rpc_pipe_ops gss_upcall_ops; + +void +print_hexl(u32 *p, u_int length, u_int offset) +{ + u_int i, j, jm; + u8 c, *cp; + + dprintk("RPC: print_hexl: length %d\n",length); + dprintk("\n"); + cp = (u8 *) p; + + for (i = 0; i < length; i += 0x10) { + dprintk(" %04x: ", (u_int)(i + offset)); + jm = length - i; + jm = jm > 16 ? 16 : jm; + + for (j = 0; j < jm; j++) { + if ((j % 2) == 1) + dprintk("%02x ", (u_int)cp[i+j]); + else + dprintk("%02x", (u_int)cp[i+j]); + } + for (; j < 16; j++) { + if ((j % 2) == 1) + dprintk(" "); + else + dprintk(" "); + } + dprintk(" "); + + for (j = 0; j < jm; j++) { + c = cp[i+j]; + c = isprint(c) ? 
c : '.'; + dprintk("%c", c); + } + dprintk("\n"); + } +} + +EXPORT_SYMBOL(print_hexl); + +static inline struct gss_cl_ctx * +gss_get_ctx(struct gss_cl_ctx *ctx) +{ + atomic_inc(&ctx->count); + return ctx; +} + +static inline void +gss_put_ctx(struct gss_cl_ctx *ctx) +{ + if (atomic_dec_and_test(&ctx->count)) + gss_destroy_ctx(ctx); +} + +static void +gss_cred_set_ctx(struct rpc_cred *cred, struct gss_cl_ctx *ctx) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *old; + write_lock(&gss_ctx_lock); + old = gss_cred->gc_ctx; + gss_cred->gc_ctx = ctx; + cred->cr_flags |= RPCAUTH_CRED_UPTODATE; + write_unlock(&gss_ctx_lock); + if (old) + gss_put_ctx(old); +} + +static int +gss_cred_is_uptodate_ctx(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + int res = 0; + + read_lock(&gss_ctx_lock); + if ((cred->cr_flags & RPCAUTH_CRED_UPTODATE) && gss_cred->gc_ctx) + res = 1; + read_unlock(&gss_ctx_lock); + return res; +} + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, size_t len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static inline const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *dest) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + dest->data = kmalloc(len, GFP_KERNEL); + if (unlikely(dest->data == NULL)) + return ERR_PTR(-ENOMEM); + dest->len = len; + memcpy(dest->data, p, len); + return q; +} + +static struct gss_cl_ctx * +gss_cred_get_ctx(struct rpc_cred *cred) +{ + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, gc_base); + struct gss_cl_ctx *ctx = NULL; + + read_lock(&gss_ctx_lock); + if (gss_cred->gc_ctx) + ctx = gss_get_ctx(gss_cred->gc_ctx); + read_unlock(&gss_ctx_lock); + return ctx; +} + +static struct gss_cl_ctx * +gss_alloc_context(void) +{ + struct gss_cl_ctx *ctx; + + ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); + if (ctx != NULL) { + memset(ctx, 0, sizeof(*ctx)); + ctx->gc_proc = RPC_GSS_PROC_DATA; + ctx->gc_seq = 1; /* NetApp 6.4R1 doesn't accept seq. no. 0 */ + spin_lock_init(&ctx->gc_seq_lock); + atomic_set(&ctx->count,1); + } + return ctx; +} + +#define GSSD_MIN_TIMEOUT (60 * 60) +static const void * +gss_fill_context(const void *p, const void *end, struct gss_cl_ctx *ctx, struct gss_api_mech *gm) +{ + const void *q; + unsigned int seclen; + unsigned int timeout; + u32 window_size; + int ret; + + /* First unsigned int gives the lifetime (in seconds) of the cred */ + p = simple_get_bytes(p, end, &timeout, sizeof(timeout)); + if (IS_ERR(p)) + goto err; + if (timeout == 0) + timeout = GSSD_MIN_TIMEOUT; + ctx->gc_expiry = jiffies + (unsigned long)timeout * HZ * 3 / 4; + /* Sequence number window. 
Determines the maximum number of simultaneous requests */ + p = simple_get_bytes(p, end, &window_size, sizeof(window_size)); + if (IS_ERR(p)) + goto err; + ctx->gc_win = window_size; + /* gssd signals an error by passing ctx->gc_win = 0: */ + if (ctx->gc_win == 0) { + /* in which case, p points to an error code which we ignore */ + p = ERR_PTR(-EACCES); + goto err; + } + /* copy the opaque wire context */ + p = simple_get_netobj(p, end, &ctx->gc_wire_ctx); + if (IS_ERR(p)) + goto err; + /* import the opaque security context */ + p = simple_get_bytes(p, end, &seclen, sizeof(seclen)); + if (IS_ERR(p)) + goto err; + q = (const void *)((const char *)p + seclen); + if (unlikely(q > end || q < p)) { + p = ERR_PTR(-EFAULT); + goto err; + } + ret = gss_import_sec_context(p, seclen, gm, &ctx->gc_gss_ctx); + if (ret < 0) { + p = ERR_PTR(ret); + goto err; + } + return q; +err: + dprintk("RPC: gss_fill_context returning %ld\n", -PTR_ERR(p)); + return p; +} + + +struct gss_upcall_msg { + atomic_t count; + uid_t uid; + struct rpc_pipe_msg msg; + struct list_head list; + struct gss_auth *auth; + struct rpc_wait_queue rpc_waitqueue; + wait_queue_head_t waitqueue; + struct gss_cl_ctx *ctx; +}; + +static void +gss_release_msg(struct gss_upcall_msg *gss_msg) +{ + if (!atomic_dec_and_test(&gss_msg->count)) + return; + BUG_ON(!list_empty(&gss_msg->list)); + if (gss_msg->ctx != NULL) + gss_put_ctx(gss_msg->ctx); + kfree(gss_msg); +} + +static struct gss_upcall_msg * +__gss_find_upcall(struct gss_auth *gss_auth, uid_t uid) +{ + struct gss_upcall_msg *pos; + list_for_each_entry(pos, &gss_auth->upcalls, list) { + if (pos->uid != uid) + continue; + atomic_inc(&pos->count); + dprintk("RPC: gss_find_upcall found msg %p\n", pos); + return pos; + } + dprintk("RPC: gss_find_upcall found nothing\n"); + return NULL; +} + +/* Try to add a upcall to the pipefs queue. + * If an upcall owned by our uid already exists, then we return a reference + * to that upcall instead of adding the new upcall. 
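gss_add_msg() allocates the upcall message before taking the lock, then either links it or throws it away in favour of an upcall already in flight for the same uid, so gssd sees at most one request per user and every waiting task shares the same message and wait queue. The standalone sketch below (illustration only, not part of this patch) shows the same allocate-first, insert-or-reuse-under-the-lock pattern with plain pthread locking.

// Illustrative standalone sketch, not part of this patch: one in-flight
// upcall per uid, in the style of gss_add_msg()/__gss_find_upcall().
#include <pthread.h>
#include <stdlib.h>

struct upcall {
    struct upcall *next;
    unsigned int uid;
    int refcount;
};

static struct upcall *upcalls;
static pthread_mutex_t upcall_lock = PTHREAD_MUTEX_INITIALIZER;

static struct upcall *get_upcall_for_uid(unsigned int uid)
{
    // Allocate before taking the lock (kmalloc(GFP_KERNEL) may sleep).
    struct upcall *fresh = calloc(1, sizeof(*fresh));
    struct upcall *pos, *found = NULL;

    if (!fresh)
        return NULL;
    fresh->uid = uid;
    fresh->refcount = 1;             // caller's reference

    pthread_mutex_lock(&upcall_lock);
    for (pos = upcalls; pos; pos = pos->next)
        if (pos->uid == uid) {
            pos->refcount++;         // reuse the upcall already in flight
            found = pos;
            break;
        }
    if (!found) {
        fresh->next = upcalls;       // first caller for this uid: queue it
        upcalls = fresh;
        fresh->refcount++;           // extra reference held by the list
        found = fresh;
    }
    pthread_mutex_unlock(&upcall_lock);

    if (found != fresh)
        free(fresh);                 // lost the race: discard our allocation
    return found;
}

int main(void)
{
    struct upcall *a = get_upcall_for_uid(1000);
    struct upcall *b = get_upcall_for_uid(1000);   // same object, refcount bumped
    return a != b;                                 // exits 0
}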
+ */ +static inline struct gss_upcall_msg * +gss_add_msg(struct gss_auth *gss_auth, struct gss_upcall_msg *gss_msg) +{ + struct gss_upcall_msg *old; + + spin_lock(&gss_auth->lock); + old = __gss_find_upcall(gss_auth, gss_msg->uid); + if (old == NULL) { + atomic_inc(&gss_msg->count); + list_add(&gss_msg->list, &gss_auth->upcalls); + } else + gss_msg = old; + spin_unlock(&gss_auth->lock); + return gss_msg; +} + +static void +__gss_unhash_msg(struct gss_upcall_msg *gss_msg) +{ + if (list_empty(&gss_msg->list)) + return; + list_del_init(&gss_msg->list); + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); + wake_up_all(&gss_msg->waitqueue); + atomic_dec(&gss_msg->count); +} + +static void +gss_unhash_msg(struct gss_upcall_msg *gss_msg) +{ + struct gss_auth *gss_auth = gss_msg->auth; + + spin_lock(&gss_auth->lock); + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); +} + +static void +gss_upcall_callback(struct rpc_task *task) +{ + struct gss_cred *gss_cred = container_of(task->tk_msg.rpc_cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_msg = gss_cred->gc_upcall; + + BUG_ON(gss_msg == NULL); + if (gss_msg->ctx) + gss_cred_set_ctx(task->tk_msg.rpc_cred, gss_get_ctx(gss_msg->ctx)); + else + task->tk_status = gss_msg->msg.errno; + spin_lock(&gss_msg->auth->lock); + gss_cred->gc_upcall = NULL; + rpc_wake_up_status(&gss_msg->rpc_waitqueue, gss_msg->msg.errno); + spin_unlock(&gss_msg->auth->lock); + gss_release_msg(gss_msg); +} + +static inline struct gss_upcall_msg * +gss_alloc_msg(struct gss_auth *gss_auth, uid_t uid) +{ + struct gss_upcall_msg *gss_msg; + + gss_msg = kmalloc(sizeof(*gss_msg), GFP_KERNEL); + if (gss_msg != NULL) { + memset(gss_msg, 0, sizeof(*gss_msg)); + INIT_LIST_HEAD(&gss_msg->list); + rpc_init_wait_queue(&gss_msg->rpc_waitqueue, "RPCSEC_GSS upcall waitq"); + init_waitqueue_head(&gss_msg->waitqueue); + atomic_set(&gss_msg->count, 1); + gss_msg->msg.data = &gss_msg->uid; + gss_msg->msg.len = sizeof(gss_msg->uid); + gss_msg->uid = uid; + gss_msg->auth = gss_auth; + } + return gss_msg; +} + +static struct gss_upcall_msg * +gss_setup_upcall(struct rpc_clnt *clnt, struct gss_auth *gss_auth, struct rpc_cred *cred) +{ + struct gss_upcall_msg *gss_new, *gss_msg; + + gss_new = gss_alloc_msg(gss_auth, cred->cr_uid); + if (gss_new == NULL) + return ERR_PTR(-ENOMEM); + gss_msg = gss_add_msg(gss_auth, gss_new); + if (gss_msg == gss_new) { + int res = rpc_queue_upcall(gss_auth->dentry->d_inode, &gss_new->msg); + if (res) { + gss_unhash_msg(gss_new); + gss_msg = ERR_PTR(res); + } + } else + gss_release_msg(gss_new); + return gss_msg; +} + +static inline int +gss_refresh_upcall(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_auth *gss_auth = container_of(task->tk_client->cl_auth, + struct gss_auth, rpc_auth); + struct gss_cred *gss_cred = container_of(cred, + struct gss_cred, gc_base); + struct gss_upcall_msg *gss_msg; + int err = 0; + + dprintk("RPC: %4u gss_refresh_upcall for uid %u\n", task->tk_pid, cred->cr_uid); + gss_msg = gss_setup_upcall(task->tk_client, gss_auth, cred); + if (IS_ERR(gss_msg)) { + err = PTR_ERR(gss_msg); + goto out; + } + spin_lock(&gss_auth->lock); + if (gss_cred->gc_upcall != NULL) + rpc_sleep_on(&gss_cred->gc_upcall->rpc_waitqueue, task, NULL, NULL); + else if (gss_msg->ctx == NULL && gss_msg->msg.errno >= 0) { + task->tk_timeout = 0; + gss_cred->gc_upcall = gss_msg; + /* gss_upcall_callback will release the reference to gss_upcall_msg */ + atomic_inc(&gss_msg->count); + 
rpc_sleep_on(&gss_msg->rpc_waitqueue, task, gss_upcall_callback, NULL); + } else + err = gss_msg->msg.errno; + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); +out: + dprintk("RPC: %4u gss_refresh_upcall for uid %u result %d\n", task->tk_pid, + cred->cr_uid, err); + return err; +} + +static inline int +gss_create_upcall(struct gss_auth *gss_auth, struct gss_cred *gss_cred) +{ + struct rpc_cred *cred = &gss_cred->gc_base; + struct gss_upcall_msg *gss_msg; + DEFINE_WAIT(wait); + int err = 0; + + dprintk("RPC: gss_upcall for uid %u\n", cred->cr_uid); + gss_msg = gss_setup_upcall(gss_auth->client, gss_auth, cred); + if (IS_ERR(gss_msg)) { + err = PTR_ERR(gss_msg); + goto out; + } + for (;;) { + prepare_to_wait(&gss_msg->waitqueue, &wait, TASK_INTERRUPTIBLE); + spin_lock(&gss_auth->lock); + if (gss_msg->ctx != NULL || gss_msg->msg.errno < 0) { + spin_unlock(&gss_auth->lock); + break; + } + spin_unlock(&gss_auth->lock); + if (signalled()) { + err = -ERESTARTSYS; + goto out_intr; + } + schedule(); + } + if (gss_msg->ctx) + gss_cred_set_ctx(cred, gss_get_ctx(gss_msg->ctx)); + else + err = gss_msg->msg.errno; +out_intr: + finish_wait(&gss_msg->waitqueue, &wait); + gss_release_msg(gss_msg); +out: + dprintk("RPC: gss_create_upcall for uid %u result %d\n", cred->cr_uid, err); + return err; +} + +static ssize_t +gss_pipe_upcall(struct file *filp, struct rpc_pipe_msg *msg, + char __user *dst, size_t buflen) +{ + char *data = (char *)msg->data + msg->copied; + ssize_t mlen = msg->len; + ssize_t left; + + if (mlen > buflen) + mlen = buflen; + left = copy_to_user(dst, data, mlen); + if (left < 0) { + msg->errno = left; + return left; + } + mlen -= left; + msg->copied += mlen; + msg->errno = 0; + return mlen; +} + +#define MSG_BUF_MAXSIZE 1024 + +static ssize_t +gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen) +{ + const void *p, *end; + void *buf; + struct rpc_clnt *clnt; + struct gss_auth *gss_auth; + struct rpc_cred *cred; + struct gss_upcall_msg *gss_msg; + struct gss_cl_ctx *ctx; + uid_t uid; + int err = -EFBIG; + + if (mlen > MSG_BUF_MAXSIZE) + goto out; + err = -ENOMEM; + buf = kmalloc(mlen, GFP_KERNEL); + if (!buf) + goto out; + + clnt = RPC_I(filp->f_dentry->d_inode)->private; + err = -EFAULT; + if (copy_from_user(buf, src, mlen)) + goto err; + + end = (const void *)((char *)buf + mlen); + p = simple_get_bytes(buf, end, &uid, sizeof(uid)); + if (IS_ERR(p)) { + err = PTR_ERR(p); + goto err; + } + + err = -ENOMEM; + ctx = gss_alloc_context(); + if (ctx == NULL) + goto err; + err = 0; + gss_auth = container_of(clnt->cl_auth, struct gss_auth, rpc_auth); + p = gss_fill_context(p, end, ctx, gss_auth->mech); + if (IS_ERR(p)) { + err = PTR_ERR(p); + if (err != -EACCES) + goto err_put_ctx; + } + spin_lock(&gss_auth->lock); + gss_msg = __gss_find_upcall(gss_auth, uid); + if (gss_msg) { + if (err == 0 && gss_msg->ctx == NULL) + gss_msg->ctx = gss_get_ctx(ctx); + gss_msg->msg.errno = err; + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); + } else { + struct auth_cred acred = { .uid = uid }; + spin_unlock(&gss_auth->lock); + cred = rpcauth_lookup_credcache(clnt->cl_auth, &acred, 0); + if (IS_ERR(cred)) { + err = PTR_ERR(cred); + goto err_put_ctx; + } + gss_cred_set_ctx(cred, gss_get_ctx(ctx)); + } + gss_put_ctx(ctx); + kfree(buf); + dprintk("RPC: gss_pipe_downcall returning length %Zu\n", mlen); + return mlen; +err_put_ctx: + gss_put_ctx(ctx); +err: + kfree(buf); +out: + dprintk("RPC: gss_pipe_downcall returning %d\n", err); + return err; 
+} + +static void +gss_pipe_release(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_clnt *clnt; + struct rpc_auth *auth; + struct gss_auth *gss_auth; + + clnt = rpci->private; + auth = clnt->cl_auth; + gss_auth = container_of(auth, struct gss_auth, rpc_auth); + spin_lock(&gss_auth->lock); + while (!list_empty(&gss_auth->upcalls)) { + struct gss_upcall_msg *gss_msg; + + gss_msg = list_entry(gss_auth->upcalls.next, + struct gss_upcall_msg, list); + gss_msg->msg.errno = -EPIPE; + atomic_inc(&gss_msg->count); + __gss_unhash_msg(gss_msg); + spin_unlock(&gss_auth->lock); + gss_release_msg(gss_msg); + spin_lock(&gss_auth->lock); + } + spin_unlock(&gss_auth->lock); +} + +static void +gss_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct gss_upcall_msg *gss_msg = container_of(msg, struct gss_upcall_msg, msg); + static unsigned long ratelimit; + + if (msg->errno < 0) { + dprintk("RPC: gss_pipe_destroy_msg releasing msg %p\n", + gss_msg); + atomic_inc(&gss_msg->count); + gss_unhash_msg(gss_msg); + if (msg->errno == -ETIMEDOUT || msg->errno == -EPIPE) { + unsigned long now = jiffies; + if (time_after(now, ratelimit)) { + printk(KERN_WARNING "RPC: AUTH_GSS upcall timed out.\n" + "Please check user daemon is running!\n"); + ratelimit = now + 15*HZ; + } + } + gss_release_msg(gss_msg); + } +} + +/* + * NOTE: we have the opportunity to use different + * parameters based on the input flavor (which must be a pseudoflavor) + */ +static struct rpc_auth * +gss_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + struct gss_auth *gss_auth; + struct rpc_auth * auth; + + dprintk("RPC: creating GSS authenticator for client %p\n",clnt); + + if (!try_module_get(THIS_MODULE)) + return NULL; + if (!(gss_auth = kmalloc(sizeof(*gss_auth), GFP_KERNEL))) + goto out_dec; + gss_auth->client = clnt; + gss_auth->mech = gss_mech_get_by_pseudoflavor(flavor); + if (!gss_auth->mech) { + printk(KERN_WARNING "%s: Pseudoflavor %d not found!", + __FUNCTION__, flavor); + goto err_free; + } + gss_auth->service = gss_pseudoflavor_to_service(gss_auth->mech, flavor); + /* FIXME: Will go away once privacy support is merged in */ + if (gss_auth->service == RPC_GSS_SVC_PRIVACY) + gss_auth->service = RPC_GSS_SVC_INTEGRITY; + INIT_LIST_HEAD(&gss_auth->upcalls); + spin_lock_init(&gss_auth->lock); + auth = &gss_auth->rpc_auth; + auth->au_cslack = GSS_CRED_SLACK >> 2; + auth->au_rslack = GSS_VERF_SLACK >> 2; + auth->au_ops = &authgss_ops; + auth->au_flavor = flavor; + atomic_set(&auth->au_count, 1); + + if (rpcauth_init_credcache(auth, GSS_CRED_EXPIRE) < 0) + goto err_put_mech; + + snprintf(gss_auth->path, sizeof(gss_auth->path), "%s/%s", + clnt->cl_pathname, + gss_auth->mech->gm_name); + gss_auth->dentry = rpc_mkpipe(gss_auth->path, clnt, &gss_upcall_ops, RPC_PIPE_WAIT_FOR_OPEN); + if (IS_ERR(gss_auth->dentry)) + goto err_put_mech; + + return auth; +err_put_mech: + gss_mech_put(gss_auth->mech); +err_free: + kfree(gss_auth); +out_dec: + module_put(THIS_MODULE); + return NULL; +} + +static void +gss_destroy(struct rpc_auth *auth) +{ + struct gss_auth *gss_auth; + + dprintk("RPC: destroying GSS authenticator %p flavor %d\n", + auth, auth->au_flavor); + + gss_auth = container_of(auth, struct gss_auth, rpc_auth); + rpc_unlink(gss_auth->path); + gss_mech_put(gss_auth->mech); + + rpcauth_free_credcache(auth); + kfree(gss_auth); + module_put(THIS_MODULE); +} + +/* gss_destroy_cred (and gss_destroy_ctx) are used to clean up after failure + * to create a new cred or context, so they check that things have been + * 
allocated before freeing them. */ +static void +gss_destroy_ctx(struct gss_cl_ctx *ctx) +{ + dprintk("RPC: gss_destroy_ctx\n"); + + if (ctx->gc_gss_ctx) + gss_delete_sec_context(&ctx->gc_gss_ctx); + + kfree(ctx->gc_wire_ctx.data); + kfree(ctx); +} + +static void +gss_destroy_cred(struct rpc_cred *rc) +{ + struct gss_cred *cred = container_of(rc, struct gss_cred, gc_base); + + dprintk("RPC: gss_destroy_cred \n"); + + if (cred->gc_ctx) + gss_put_ctx(cred->gc_ctx); + kfree(cred); +} + +/* + * Lookup RPCSEC_GSS cred for the current process + */ +static struct rpc_cred * +gss_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) +{ + return rpcauth_lookup_credcache(auth, acred, taskflags); +} + +static struct rpc_cred * +gss_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int taskflags) +{ + struct gss_auth *gss_auth = container_of(auth, struct gss_auth, rpc_auth); + struct gss_cred *cred = NULL; + int err = -ENOMEM; + + dprintk("RPC: gss_create_cred for uid %d, flavor %d\n", + acred->uid, auth->au_flavor); + + if (!(cred = kmalloc(sizeof(*cred), GFP_KERNEL))) + goto out_err; + + memset(cred, 0, sizeof(*cred)); + atomic_set(&cred->gc_count, 1); + cred->gc_uid = acred->uid; + /* + * Note: in order to force a call to call_refresh(), we deliberately + * fail to flag the credential as RPCAUTH_CRED_UPTODATE. + */ + cred->gc_flags = 0; + cred->gc_base.cr_ops = &gss_credops; + cred->gc_service = gss_auth->service; + err = gss_create_upcall(gss_auth, cred); + if (err < 0) + goto out_err; + + return &cred->gc_base; + +out_err: + dprintk("RPC: gss_create_cred failed with error %d\n", err); + if (cred) gss_destroy_cred(&cred->gc_base); + return ERR_PTR(err); +} + +static int +gss_match(struct auth_cred *acred, struct rpc_cred *rc, int taskflags) +{ + struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base); + + /* Don't match with creds that have expired. */ + if (gss_cred->gc_ctx && time_after(jiffies, gss_cred->gc_ctx->gc_expiry)) + return 0; + return (rc->cr_uid == acred->uid); +} + +/* +* Marshal credentials. +* Maybe we should keep a cached credential for performance reasons. 
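gss_marshal() below emits the RPCSEC_GSS credential as flavor, length, then {version, gss procedure, sequence number, service, opaque context handle}, and follows it with a verifier holding a MIC computed over everything from the xid to the end of the credential. The standalone sketch that follows (illustration only, not part of this patch) reproduces just the field order with plain big-endian stores; the RPC_AUTH_GSS and RPC_GSS_VERSION values are taken from the sunrpc headers of this era and should be read as assumptions.

// Illustrative standalone sketch, not part of this patch: the field order
// gss_marshal() emits, using plain stores in place of xdr_encode_*().
#include <arpa/inet.h>
#include <stdint.h>
#include <string.h>

#define RPC_AUTH_GSS    6   // assumed value
#define RPC_GSS_VERSION 1   // assumed value

// Returns the number of 32-bit words written into 'out'.
static size_t encode_gss_cred(uint32_t *out, uint32_t proc, uint32_t seq,
                              uint32_t service,
                              const void *wire_ctx, uint32_t ctx_len)
{
    uint32_t *p = out, *cred_len;

    *p++ = htonl(RPC_AUTH_GSS);          // credential flavor
    cred_len = p++;                      // body length, patched below
    *p++ = htonl(RPC_GSS_VERSION);
    *p++ = htonl(proc);                  // RPC_GSS_PROC_DATA, _INIT, ...
    *p++ = htonl(seq);                   // per-context sequence number
    *p++ = htonl(service);               // none / integrity / privacy
    *p++ = htonl(ctx_len);               // opaque context handle from gssd
    memset(p, 0, (ctx_len + 3) & ~3u);   // XDR pads opaques to 4 bytes
    memcpy(p, wire_ctx, ctx_len);
    p += (ctx_len + 3) >> 2;
    *cred_len = htonl((uint32_t)((p - (cred_len + 1)) * 4));

    // gss_marshal() then appends the verifier: the AUTH_GSS flavor word plus
    // a MIC over everything from the xid up to the end of this credential.
    return (size_t)(p - out);
}

int main(void)
{
    uint32_t buf[64];
    unsigned char ctx[8] = "ctxdata";    // stand-in for gssd's opaque handle

    return encode_gss_cred(buf, 0 /* data proc */, 1, 1 /* service */,
                           ctx, sizeof(ctx)) == 0;
}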
+*/ +static u32 * +gss_marshal(struct rpc_task *task, u32 *p) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 *cred_len; + struct rpc_rqst *req = task->tk_rqstp; + u32 maj_stat = 0; + struct xdr_netobj mic; + struct kvec iov; + struct xdr_buf verf_buf; + + dprintk("RPC: %4u gss_marshal\n", task->tk_pid); + + *p++ = htonl(RPC_AUTH_GSS); + cred_len = p++; + + spin_lock(&ctx->gc_seq_lock); + req->rq_seqno = ctx->gc_seq++; + spin_unlock(&ctx->gc_seq_lock); + + *p++ = htonl((u32) RPC_GSS_VERSION); + *p++ = htonl((u32) ctx->gc_proc); + *p++ = htonl((u32) req->rq_seqno); + *p++ = htonl((u32) gss_cred->gc_service); + p = xdr_encode_netobj(p, &ctx->gc_wire_ctx); + *cred_len = htonl((p - (cred_len + 1)) << 2); + + /* We compute the checksum for the verifier over the xdr-encoded bytes + * starting with the xid and ending at the end of the credential: */ + iov.iov_base = req->rq_snd_buf.head[0].iov_base; + if (task->tk_client->cl_xprt->stream) + /* See clnt.c:call_header() */ + iov.iov_base += 4; + iov.iov_len = (u8 *)p - (u8 *)iov.iov_base; + xdr_buf_from_iov(&iov, &verf_buf); + + /* set verifier flavor*/ + *p++ = htonl(RPC_AUTH_GSS); + + mic.data = (u8 *)(p + 1); + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, + &verf_buf, &mic); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) { + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + } else if (maj_stat != 0) { + printk("gss_marshal: gss_get_mic FAILED (%d)\n", maj_stat); + goto out_put_ctx; + } + p = xdr_encode_opaque(p, NULL, mic.len); + gss_put_ctx(ctx); + return p; +out_put_ctx: + gss_put_ctx(ctx); + return NULL; +} + +/* +* Refresh credentials. XXX - finish +*/ +static int +gss_refresh(struct rpc_task *task) +{ + + if (!gss_cred_is_uptodate_ctx(task->tk_msg.rpc_cred)) + return gss_refresh_upcall(task); + return 0; +} + +static u32 * +gss_validate(struct rpc_task *task, u32 *p) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 seq, qop_state; + struct kvec iov; + struct xdr_buf verf_buf; + struct xdr_netobj mic; + u32 flav,len; + u32 maj_stat; + + dprintk("RPC: %4u gss_validate\n", task->tk_pid); + + flav = ntohl(*p++); + if ((len = ntohl(*p++)) > RPC_MAX_AUTH_SIZE) + goto out_bad; + if (flav != RPC_AUTH_GSS) + goto out_bad; + seq = htonl(task->tk_rqstp->rq_seqno); + iov.iov_base = &seq; + iov.iov_len = sizeof(seq); + xdr_buf_from_iov(&iov, &verf_buf); + mic.data = (u8 *)p; + mic.len = len; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &verf_buf, &mic, &qop_state); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat) + goto out_bad; + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + /* verifier data, flavor, length: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 2; + break; + case RPC_GSS_SVC_INTEGRITY: + /* verifier data, flavor, length, length, sequence number: */ + task->tk_auth->au_rslack = XDR_QUADLEN(len) + 4; + break; + case RPC_GSS_SVC_PRIVACY: + goto out_bad; + } + gss_put_ctx(ctx); + dprintk("RPC: %4u GSS gss_validate: gss_verify_mic succeeded.\n", + task->tk_pid); + return p + XDR_QUADLEN(len); +out_bad: + gss_put_ctx(ctx); + dprintk("RPC: %4u gss_validate failed.\n", task->tk_pid); + return NULL; +} + +static inline int +gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + kxdrproc_t 
encode, struct rpc_rqst *rqstp, u32 *p, void *obj) +{ + struct xdr_buf *snd_buf = &rqstp->rq_snd_buf; + struct xdr_buf integ_buf; + u32 *integ_len = NULL; + struct xdr_netobj mic; + u32 offset, *q; + struct kvec *iov; + u32 maj_stat = 0; + int status = -EIO; + + integ_len = p++; + offset = (u8 *)p - (u8 *)snd_buf->head[0].iov_base; + *p++ = htonl(rqstp->rq_seqno); + + status = encode(rqstp, p, obj); + if (status) + return status; + + if (xdr_buf_subsegment(snd_buf, &integ_buf, + offset, snd_buf->len - offset)) + return status; + *integ_len = htonl(integ_buf.len); + + /* guess whether we're in the head or the tail: */ + if (snd_buf->page_len || snd_buf->tail[0].iov_len) + iov = snd_buf->tail; + else + iov = snd_buf->head; + p = iov->iov_base + iov->iov_len; + mic.data = (u8 *)(p + 1); + + maj_stat = gss_get_mic(ctx->gc_gss_ctx, + GSS_C_QOP_DEFAULT, &integ_buf, &mic); + status = -EIO; /* XXX? */ + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + else if (maj_stat) + return status; + q = xdr_encode_opaque(p, NULL, mic.len); + + offset = (u8 *)q - (u8 *)p; + iov->iov_len += offset; + snd_buf->len += offset; + return 0; +} + +static int +gss_wrap_req(struct rpc_task *task, + kxdrproc_t encode, void *rqstp, u32 *p, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + int status = -EIO; + + dprintk("RPC: %4u gss_wrap_req\n", task->tk_pid); + if (ctx->gc_proc != RPC_GSS_PROC_DATA) { + /* The spec seems a little ambiguous here, but I think that not + * wrapping context destruction requests makes the most sense. + */ + status = encode(rqstp, p, obj); + goto out; + } + switch (gss_cred->gc_service) { + case RPC_GSS_SVC_NONE: + status = encode(rqstp, p, obj); + break; + case RPC_GSS_SVC_INTEGRITY: + status = gss_wrap_req_integ(cred, ctx, encode, + rqstp, p, obj); + break; + case RPC_GSS_SVC_PRIVACY: + break; + } +out: + gss_put_ctx(ctx); + dprintk("RPC: %4u gss_wrap_req returning %d\n", task->tk_pid, status); + return status; +} + +static inline int +gss_unwrap_resp_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx, + struct rpc_rqst *rqstp, u32 **p) +{ + struct xdr_buf *rcv_buf = &rqstp->rq_rcv_buf; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + u32 data_offset, mic_offset; + u32 integ_len; + u32 maj_stat; + int status = -EIO; + + integ_len = ntohl(*(*p)++); + if (integ_len & 3) + return status; + data_offset = (u8 *)(*p) - (u8 *)rcv_buf->head[0].iov_base; + mic_offset = integ_len + data_offset; + if (mic_offset > rcv_buf->len) + return status; + if (ntohl(*(*p)++) != rqstp->rq_seqno) + return status; + + if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, + mic_offset - data_offset)) + return status; + + if (xdr_buf_read_netobj(rcv_buf, &mic, mic_offset)) + return status; + + maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, + &mic, NULL); + if (maj_stat == GSS_S_CONTEXT_EXPIRED) + cred->cr_flags &= ~RPCAUTH_CRED_UPTODATE; + if (maj_stat != GSS_S_COMPLETE) + return status; + return 0; +} + +static int +gss_unwrap_resp(struct rpc_task *task, + kxdrproc_t decode, void *rqstp, u32 *p, void *obj) +{ + struct rpc_cred *cred = task->tk_msg.rpc_cred; + struct gss_cred *gss_cred = container_of(cred, struct gss_cred, + gc_base); + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + int status = -EIO; + + if (ctx->gc_proc != RPC_GSS_PROC_DATA) + goto out_decode; + switch (gss_cred->gc_service) { + case 
RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + status = gss_unwrap_resp_integ(cred, ctx, rqstp, &p); + if (status) + goto out; + break; + case RPC_GSS_SVC_PRIVACY: + break; + } +out_decode: + status = decode(rqstp, p, obj); +out: + gss_put_ctx(ctx); + dprintk("RPC: %4u gss_unwrap_resp returning %d\n", task->tk_pid, + status); + return status; +} + +static struct rpc_authops authgss_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_GSS, +#ifdef RPC_DEBUG + .au_name = "RPCSEC_GSS", +#endif + .create = gss_create, + .destroy = gss_destroy, + .lookup_cred = gss_lookup_cred, + .crcreate = gss_create_cred +}; + +static struct rpc_credops gss_credops = { + .cr_name = "AUTH_GSS", + .crdestroy = gss_destroy_cred, + .crmatch = gss_match, + .crmarshal = gss_marshal, + .crrefresh = gss_refresh, + .crvalidate = gss_validate, + .crwrap_req = gss_wrap_req, + .crunwrap_resp = gss_unwrap_resp, +}; + +static struct rpc_pipe_ops gss_upcall_ops = { + .upcall = gss_pipe_upcall, + .downcall = gss_pipe_downcall, + .destroy_msg = gss_pipe_destroy_msg, + .release_pipe = gss_pipe_release, +}; + +/* + * Initialize RPCSEC_GSS module + */ +static int __init init_rpcsec_gss(void) +{ + int err = 0; + + err = rpcauth_register(&authgss_ops); + if (err) + goto out; + err = gss_svc_init(); + if (err) + goto out_unregister; + return 0; +out_unregister: + rpcauth_unregister(&authgss_ops); +out: + return err; +} + +static void __exit exit_rpcsec_gss(void) +{ + gss_svc_shutdown(); + rpcauth_unregister(&authgss_ops); +} + +MODULE_LICENSE("GPL"); +module_init(init_rpcsec_gss) +module_exit(exit_rpcsec_gss) diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c new file mode 100644 index 000000000000..826df44e7fca --- /dev/null +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -0,0 +1,235 @@ +/* + * linux/net/sunrpc/gss_generic_token.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/generic/util_token.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
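The module init above uses the usual register-then-unwind pattern: if the second setup step fails, the first registration is rolled back before returning the error. A toy sketch of that control flow, with hypothetical functions standing in for rpcauth_register() and gss_svc_init().

#include <stdio.h>

static int register_auth(void)    { return 0; }
static void unregister_auth(void) { }
static int init_svc(void)         { return -1; }	/* pretend this fails */

static int init_module_sketch(void)
{
	int err;

	err = register_auth();
	if (err)
		goto out;
	err = init_svc();
	if (err)
		goto out_unregister;
	return 0;
out_unregister:
	unregister_auth();	/* undo the first step on failure */
out:
	return err;
}

int main(void)
{
	printf("init returned %d\n", init_module_sketch());
	return 0;
}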
+ */ + +#include +#include +#include +#include +#include +#include + + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + + +/* TWRITE_STR from gssapiP_generic.h */ +#define TWRITE_STR(ptr, str, len) \ + memcpy((ptr), (char *) (str), (len)); \ + (ptr) += (len); + +/* XXXX this code currently makes the assumption that a mech oid will + never be longer than 127 bytes. This assumption is not inherent in + the interfaces, so the code can be fixed if the OSI namespace + balloons unexpectedly. */ + +/* Each token looks like this: + +0x60 tag for APPLICATION 0, SEQUENCE + (constructed, definite-length) + possible multiple bytes, need to parse/generate + 0x06 tag for OBJECT IDENTIFIER + compile-time constant string (assume 1 byte) + compile-time constant string + the ANY containing the application token + bytes 0,1 are the token type + bytes 2,n are the token data + +For the purposes of this abstraction, the token "header" consists of +the sequence tag and length octets, the mech OID DER encoding, and the +first two inner bytes, which indicate the token type. The token +"body" consists of everything else. + +*/ + +static int +der_length_size( int length) +{ + if (length < (1<<7)) + return(1); + else if (length < (1<<8)) + return(2); +#if (SIZEOF_INT == 2) + else + return(3); +#else + else if (length < (1<<16)) + return(3); + else if (length < (1<<24)) + return(4); + else + return(5); +#endif +} + +static void +der_write_length(unsigned char **buf, int length) +{ + if (length < (1<<7)) { + *(*buf)++ = (unsigned char) length; + } else { + *(*buf)++ = (unsigned char) (der_length_size(length)+127); +#if (SIZEOF_INT > 2) + if (length >= (1<<24)) + *(*buf)++ = (unsigned char) (length>>24); + if (length >= (1<<16)) + *(*buf)++ = (unsigned char) ((length>>16)&0xff); +#endif + if (length >= (1<<8)) + *(*buf)++ = (unsigned char) ((length>>8)&0xff); + *(*buf)++ = (unsigned char) (length&0xff); + } +} + +/* returns decoded length, or < 0 on failure. Advances buf and + decrements bufsize */ + +static int +der_read_length(unsigned char **buf, int *bufsize) +{ + unsigned char sf; + int ret; + + if (*bufsize < 1) + return(-1); + sf = *(*buf)++; + (*bufsize)--; + if (sf & 0x80) { + if ((sf &= 0x7f) > ((*bufsize)-1)) + return(-1); + if (sf > SIZEOF_INT) + return (-1); + ret = 0; + for (; sf; sf--) { + ret = (ret<<8) + (*(*buf)++); + (*bufsize)--; + } + } else { + ret = sf; + } + + return(ret); +} + +/* returns the length of a token, given the mech oid and the body size */ + +int +g_token_size(struct xdr_netobj *mech, unsigned int body_size) +{ + /* set body_size to sequence contents size */ + body_size += 4 + (int) mech->len; /* NEED overflow check */ + return(1 + der_length_size(body_size) + body_size); +} + +EXPORT_SYMBOL(g_token_size); + +/* fills in a buffer with the token header. The buffer is assumed to + be the right size. buf is advanced past the token header */ + +void +g_make_token_header(struct xdr_netobj *mech, int body_size, unsigned char **buf) +{ + *(*buf)++ = 0x60; + der_write_length(buf, 4 + mech->len + body_size); + *(*buf)++ = 0x06; + *(*buf)++ = (unsigned char) mech->len; + TWRITE_STR(*buf, mech->data, ((int) mech->len)); +} + +EXPORT_SYMBOL(g_make_token_header); + +/* + * Given a buffer containing a token, reads and verifies the token, + * leaving buf advanced past the token header, and setting body_size + * to the number of remaining bytes. 
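The der_write_length()/der_read_length() pair above implements DER definite lengths: values below 128 take one octet, longer values use 0x80 plus the count of length octets followed by the big-endian length. A stand-alone sketch of the writer side, covering the one-, two- and three-octet cases only.

#include <stdio.h>

static unsigned char *write_len(unsigned char *p, unsigned int len)
{
	if (len < 0x80) {
		*p++ = (unsigned char)len;		/* short form */
	} else if (len < 0x100) {
		*p++ = 0x81;				/* one length octet follows */
		*p++ = (unsigned char)len;
	} else {
		*p++ = 0x82;				/* two length octets follow */
		*p++ = (unsigned char)(len >> 8);
		*p++ = (unsigned char)(len & 0xff);
	}
	return p;
}

int main(void)
{
	unsigned char buf[4];
	unsigned char *end = write_len(buf, 300);	/* expect 82 01 2c */

	for (unsigned char *q = buf; q < end; q++)
		printf("%02x ", *q);
	printf("\n");
	return 0;
}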
Returns 0 on success, + * G_BAD_TOK_HEADER for a variety of errors, and G_WRONG_MECH if the + * mechanism in the token does not match the mech argument. buf and + * *body_size are left unmodified on error. + */ +u32 +g_verify_token_header(struct xdr_netobj *mech, int *body_size, + unsigned char **buf_in, int toksize) +{ + unsigned char *buf = *buf_in; + int seqsize; + struct xdr_netobj toid; + int ret = 0; + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x60) + return(G_BAD_TOK_HEADER); + + if ((seqsize = der_read_length(&buf, &toksize)) < 0) + return(G_BAD_TOK_HEADER); + + if (seqsize != toksize) + return(G_BAD_TOK_HEADER); + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + if (*buf++ != 0x06) + return(G_BAD_TOK_HEADER); + + if ((toksize-=1) < 0) + return(G_BAD_TOK_HEADER); + toid.len = *buf++; + + if ((toksize-=toid.len) < 0) + return(G_BAD_TOK_HEADER); + toid.data = buf; + buf+=toid.len; + + if (! g_OID_equal(&toid, mech)) + ret = G_WRONG_MECH; + + /* G_WRONG_MECH is not returned immediately because it's more important + to return G_BAD_TOK_HEADER if the token header is in fact bad */ + + if ((toksize-=2) < 0) + return(G_BAD_TOK_HEADER); + + if (ret) + return(ret); + + if (!ret) { + *buf_in = buf; + *body_size = toksize; + } + + return(ret); +} + +EXPORT_SYMBOL(g_verify_token_header); + diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c new file mode 100644 index 000000000000..24c21f2a33a7 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -0,0 +1,209 @@ +/* + * linux/net/sunrpc/gss_krb5_crypto.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * Bruce Fields + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
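A sketch of the framing that g_make_token_header() emits and g_verify_token_header() checks: the 0x60 application tag, the DER length of the contents, the 0x06 OID tag and the mechanism OID, followed by the token body. The body size below assumes a 24-byte krb5 MIC header (2 token-id bytes plus 22 more); the OID bytes are the Kerberos v5 mechanism OID.

#include <stdio.h>
#include <string.h>

int main(void)
{
	const unsigned char oid[9] = { 0x2a, 0x86, 0x48, 0x86, 0xf7,
				       0x12, 0x01, 0x02, 0x02 };
	unsigned char tok[64];
	unsigned int body = 24;		/* token id + krb5 MIC header */
	unsigned char *p = tok;

	*p++ = 0x60;			/* APPLICATION 0, constructed */
	*p++ = (unsigned char)(2 + sizeof(oid) + body);	/* OID TLV + body */
	*p++ = 0x06;			/* OBJECT IDENTIFIER */
	*p++ = sizeof(oid);
	memcpy(p, oid, sizeof(oid));
	p += sizeof(oid);

	printf("header is %d bytes, declared contents %d bytes\n",
	       (int)(p - tok), tok[1]);
	return 0;
}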
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +u32 +krb5_encrypt( + struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length) +{ + u32 ret = -EINVAL; + struct scatterlist sg[1]; + u8 local_iv[16] = {0}; + + dprintk("RPC: krb5_encrypt: input data:\n"); + print_hexl((u32 *)in, length, 0); + + if (length % crypto_tfm_alg_blocksize(tfm) != 0) + goto out; + + if (crypto_tfm_alg_ivsize(tfm) > 16) { + dprintk("RPC: gss_k5encrypt: tfm iv size to large %d\n", + crypto_tfm_alg_ivsize(tfm)); + goto out; + } + + if (iv) + memcpy(local_iv, iv, crypto_tfm_alg_ivsize(tfm)); + + memcpy(out, in, length); + sg[0].page = virt_to_page(out); + sg[0].offset = offset_in_page(out); + sg[0].length = length; + + ret = crypto_cipher_encrypt_iv(tfm, sg, sg, length, local_iv); + + dprintk("RPC: krb5_encrypt: output data:\n"); + print_hexl((u32 *)out, length, 0); +out: + dprintk("RPC: krb5_encrypt returns %d\n",ret); + return(ret); +} + +EXPORT_SYMBOL(krb5_encrypt); + +u32 +krb5_decrypt( + struct crypto_tfm *tfm, + void * iv, + void * in, + void * out, + int length) +{ + u32 ret = -EINVAL; + struct scatterlist sg[1]; + u8 local_iv[16] = {0}; + + dprintk("RPC: krb5_decrypt: input data:\n"); + print_hexl((u32 *)in, length, 0); + + if (length % crypto_tfm_alg_blocksize(tfm) != 0) + goto out; + + if (crypto_tfm_alg_ivsize(tfm) > 16) { + dprintk("RPC: gss_k5decrypt: tfm iv size to large %d\n", + crypto_tfm_alg_ivsize(tfm)); + goto out; + } + if (iv) + memcpy(local_iv,iv, crypto_tfm_alg_ivsize(tfm)); + + memcpy(out, in, length); + sg[0].page = virt_to_page(out); + sg[0].offset = offset_in_page(out); + sg[0].length = length; + + ret = crypto_cipher_decrypt_iv(tfm, sg, sg, length, local_iv); + + dprintk("RPC: krb5_decrypt: output_data:\n"); + print_hexl((u32 *)out, length, 0); +out: + dprintk("RPC: gss_k5decrypt returns %d\n",ret); + return(ret); +} + +EXPORT_SYMBOL(krb5_decrypt); + +static void +buf_to_sg(struct scatterlist *sg, char *ptr, int len) { + sg->page = virt_to_page(ptr); + sg->offset = offset_in_page(ptr); + sg->length = len; +} + +/* checksum the plaintext data and hdrlen bytes of the token header */ +s32 +make_checksum(s32 cksumtype, char *header, int hdrlen, struct xdr_buf *body, + struct xdr_netobj *cksum) +{ + char *cksumname; + struct crypto_tfm *tfm = NULL; /* XXX add to ctx? */ + struct scatterlist sg[1]; + u32 code = GSS_S_FAILURE; + int len, thislen, offset; + int i; + + switch (cksumtype) { + case CKSUMTYPE_RSA_MD5: + cksumname = "md5"; + break; + default: + dprintk("RPC: krb5_make_checksum:" + " unsupported checksum %d", cksumtype); + goto out; + } + if (!(tfm = crypto_alloc_tfm(cksumname, 0))) + goto out; + cksum->len = crypto_tfm_alg_digestsize(tfm); + if ((cksum->data = kmalloc(cksum->len, GFP_KERNEL)) == NULL) + goto out; + + crypto_digest_init(tfm); + buf_to_sg(sg, header, hdrlen); + crypto_digest_update(tfm, sg, 1); + if (body->head[0].iov_len) { + buf_to_sg(sg, body->head[0].iov_base, body->head[0].iov_len); + crypto_digest_update(tfm, sg, 1); + } + + len = body->page_len; + if (len != 0) { + offset = body->page_base & (PAGE_CACHE_SIZE - 1); + i = body->page_base >> PAGE_CACHE_SHIFT; + thislen = PAGE_CACHE_SIZE - offset; + do { + if (thislen > len) + thislen = len; + sg->page = body->pages[i]; + sg->offset = offset; + sg->length = thislen; + kmap(sg->page); /* XXX kmap_atomic? 
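make_checksum() above digests the xdr_buf in three sections: head, pages, tail. The page section is walked page by page, starting part-way into the first page. A user-space sketch of just that arithmetic, with made-up page_base/page_len values and a fixed 4 KB page size.

#include <stdio.h>

#define PAGE_SIZE  4096u
#define PAGE_SHIFT 12

int main(void)
{
	unsigned int page_base = 5000;	/* hypothetical xdr_buf values */
	unsigned int len = 9000;
	unsigned int i = page_base >> PAGE_SHIFT;
	unsigned int offset = page_base & (PAGE_SIZE - 1);
	unsigned int thislen = PAGE_SIZE - offset;

	while (len != 0) {
		if (thislen > len)
			thislen = len;
		printf("digest page %u, offset %u, length %u\n",
		       i, offset, thislen);
		len -= thislen;
		i++;
		offset = 0;		/* later pages start at offset 0 */
		thislen = PAGE_SIZE;
	}
	return 0;
}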
*/ + crypto_digest_update(tfm, sg, 1); + kunmap(sg->page); + len -= thislen; + i++; + offset = 0; + thislen = PAGE_CACHE_SIZE; + } while(len != 0); + } + if (body->tail[0].iov_len) { + buf_to_sg(sg, body->tail[0].iov_base, body->tail[0].iov_len); + crypto_digest_update(tfm, sg, 1); + } + crypto_digest_final(tfm, cksum->data); + code = 0; +out: + if (tfm) + crypto_free_tfm(tfm); + return code; +} + +EXPORT_SYMBOL(make_checksum); diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c new file mode 100644 index 000000000000..cf726510df8e --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -0,0 +1,275 @@ +/* + * linux/net/sunrpc/gss_krb5_mech.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, int len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) +{ + const void *q; + unsigned int len; + + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + res->data = kmalloc(len, GFP_KERNEL); + if (unlikely(res->data == NULL)) + return ERR_PTR(-ENOMEM); + memcpy(res->data, p, len); + res->len = len; + return q; +} + +static inline const void * +get_key(const void *p, const void *end, struct crypto_tfm **res) +{ + struct xdr_netobj key; + int alg, alg_mode; + char *alg_name; + + p = simple_get_bytes(p, end, &alg, sizeof(alg)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + + switch (alg) { + case ENCTYPE_DES_CBC_RAW: + alg_name = "des"; + alg_mode = CRYPTO_TFM_MODE_CBC; + break; + default: + dprintk("RPC: get_key: unsupported algorithm %d\n", alg); + goto out_err_free_key; + } + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + goto out_err_free_key; + if (crypto_cipher_setkey(*res, key.data, key.len)) + goto out_err_free_tfm; + + kfree(key.data); + return p; + +out_err_free_tfm: + crypto_free_tfm(*res); +out_err_free_key: + kfree(key.data); + p = ERR_PTR(-EINVAL); +out_err: + return p; +} + +static int +gss_import_sec_context_kerberos(const void *p, + size_t len, + struct gss_ctx *ctx_id) +{ + const void *end = (const void *)((const char *)p + len); + struct krb5_ctx *ctx; + + if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL))) + goto out_err; + memset(ctx, 0, sizeof(*ctx)); + + p = simple_get_bytes(p, end, &ctx->initiate, sizeof(ctx->initiate)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->seed_init, sizeof(ctx->seed_init)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, ctx->seed, sizeof(ctx->seed)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->signalg, sizeof(ctx->signalg)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->sealalg, sizeof(ctx->sealalg)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->endtime, sizeof(ctx->endtime)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_bytes(p, end, &ctx->seq_send, sizeof(ctx->seq_send)); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) + goto out_err_free_ctx; + p = get_key(p, end, &ctx->enc); + if (IS_ERR(p)) + goto out_err_free_mech; + p = get_key(p, end, &ctx->seq); + if (IS_ERR(p)) + goto out_err_free_key1; + if (p != end) { + p = ERR_PTR(-EFAULT); + goto out_err_free_key2; + } + + ctx_id->internal_ctx_id = ctx; + dprintk("RPC: Succesfully imported new context.\n"); + return 0; + +out_err_free_key2: + crypto_free_tfm(ctx->seq); +out_err_free_key1: + crypto_free_tfm(ctx->enc); +out_err_free_mech: + kfree(ctx->mech_used.data); +out_err_free_ctx: + kfree(ctx); +out_err: + return PTR_ERR(p); +} + +static void +gss_delete_sec_context_kerberos(void *internal_ctx) { + struct 
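The context import above parses the downcall blob with a cursor that is bounds-checked on every read. A user-space sketch of the simple_get_bytes() pattern, with NULL standing in for the kernel's ERR_PTR return.

#include <stdio.h>
#include <string.h>

static const void *get_bytes(const void *p, const void *end, void *res,
			     size_t len)
{
	const char *q = (const char *)p + len;

	/* refuse reads that run past the end of the buffer (or wrap) */
	if (q > (const char *)end || q < (const char *)p)
		return NULL;
	memcpy(res, p, len);
	return q;
}

int main(void)
{
	unsigned char blob[8] = { 1, 0, 0, 0, 0xde, 0xad, 0xbe, 0xef };
	const void *end = blob + sizeof(blob);
	unsigned int first;
	const void *p = get_bytes(blob, end, &first, sizeof(first));

	if (!p)
		return 1;
	printf("first field %u, consumed %td of %zu bytes\n",
	       first, (const char *)p - (const char *)blob, sizeof(blob));
	return 0;
}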
krb5_ctx *kctx = internal_ctx; + + if (kctx->seq) + crypto_free_tfm(kctx->seq); + if (kctx->enc) + crypto_free_tfm(kctx->enc); + if (kctx->mech_used.data) + kfree(kctx->mech_used.data); + kfree(kctx); +} + +static u32 +gss_verify_mic_kerberos(struct gss_ctx *ctx, + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate) { + u32 maj_stat = 0; + int qop_state; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + + maj_stat = krb5_read_token(kctx, mic_token, message, &qop_state, + KG_TOK_MIC_MSG); + if (!maj_stat && qop_state) + *qstate = qop_state; + + dprintk("RPC: gss_verify_mic_kerberos returning %d\n", maj_stat); + return maj_stat; +} + +static u32 +gss_get_mic_kerberos(struct gss_ctx *ctx, + u32 qop, + struct xdr_buf *message, + struct xdr_netobj *mic_token) { + u32 err = 0; + struct krb5_ctx *kctx = ctx->internal_ctx_id; + + err = krb5_make_token(kctx, qop, message, mic_token, KG_TOK_MIC_MSG); + + dprintk("RPC: gss_get_mic_kerberos returning %d\n",err); + + return err; +} + +static struct gss_api_ops gss_kerberos_ops = { + .gss_import_sec_context = gss_import_sec_context_kerberos, + .gss_get_mic = gss_get_mic_kerberos, + .gss_verify_mic = gss_verify_mic_kerberos, + .gss_delete_sec_context = gss_delete_sec_context_kerberos, +}; + +static struct pf_desc gss_kerberos_pfs[] = { + [0] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5, + .service = RPC_GSS_SVC_NONE, + .name = "krb5", + }, + [1] = { + .pseudoflavor = RPC_AUTH_GSS_KRB5I, + .service = RPC_GSS_SVC_INTEGRITY, + .name = "krb5i", + }, +}; + +static struct gss_api_mech gss_kerberos_mech = { + .gm_name = "krb5", + .gm_owner = THIS_MODULE, + .gm_ops = &gss_kerberos_ops, + .gm_pf_num = ARRAY_SIZE(gss_kerberos_pfs), + .gm_pfs = gss_kerberos_pfs, +}; + +static int __init init_kerberos_module(void) +{ + int status; + + status = gss_mech_register(&gss_kerberos_mech); + if (status) + printk("Failed to register kerberos gss mechanism!\n"); + return status; +} + +static void __exit cleanup_kerberos_module(void) +{ + gss_mech_unregister(&gss_kerberos_mech); +} + +MODULE_LICENSE("GPL"); +module_init(init_kerberos_module); +module_exit(cleanup_kerberos_module); diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c new file mode 100644 index 000000000000..afeeb8715a77 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -0,0 +1,176 @@ +/* + * linux/net/sunrpc/gss_krb5_seal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5seal.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. 
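Each mechanism exports a gss_api_ops table and callers dispatch through it, as the krb5 ops structure above shows. A toy sketch of that function-pointer dispatch; the checksum here is deliberately trivial and is not a real MIC, and the type names are illustrative, not the kernel structures.

#include <stdio.h>

struct mech_ops {
	unsigned int (*get_mic)(const char *msg);
	unsigned int (*verify_mic)(const char *msg, unsigned int mic);
};

static unsigned int toy_get_mic(const char *msg)
{
	unsigned int sum = 0;

	while (*msg)
		sum += (unsigned char)*msg++;
	return sum;
}

static unsigned int toy_verify_mic(const char *msg, unsigned int mic)
{
	return toy_get_mic(msg) == mic ? 0 : 1;	/* 0 plays GSS_S_COMPLETE */
}

static const struct mech_ops toy_mech = {
	.get_mic    = toy_get_mic,
	.verify_mic = toy_verify_mic,
};

int main(void)
{
	unsigned int mic = toy_mech.get_mic("hello");

	printf("verify returned %u\n", toy_mech.verify_mic("hello", mic));
	return 0;
}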
+ * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static inline int +gss_krb5_padding(int blocksize, int length) { + /* Most of the code is block-size independent but in practice we + * use only 8: */ + BUG_ON(blocksize != 8); + return 8 - (length & 7); +} + +u32 +krb5_make_token(struct krb5_ctx *ctx, int qop_req, + struct xdr_buf *text, struct xdr_netobj *token, + int toktype) +{ + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + int blocksize = 0, tmsglen; + unsigned char *ptr, *krb5_hdr, *msg_start; + s32 now; + + dprintk("RPC: gss_krb5_seal\n"); + + now = get_seconds(); + + if (qop_req != 0) + goto out_err; + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_krb5_seal: ctx->signalg %d not" + " supported\n", ctx->signalg); + goto out_err; + } + if (ctx->sealalg != SEAL_ALG_NONE && ctx->sealalg != SEAL_ALG_DES) { + dprintk("RPC: gss_krb5_seal: ctx->sealalg %d not supported\n", + ctx->sealalg); + goto out_err; + } + + if (toktype == KG_TOK_WRAP_MSG) { + blocksize = crypto_tfm_alg_blocksize(ctx->enc); + tmsglen = blocksize + text->len + + gss_krb5_padding(blocksize, blocksize + text->len); + } else { + tmsglen = 0; + } + + token->len = g_token_size(&ctx->mech_used, 22 + tmsglen); + + ptr = token->data; + g_make_token_header(&ctx->mech_used, 22 + tmsglen, &ptr); + + *ptr++ = (unsigned char) ((toktype>>8)&0xff); + *ptr++ = (unsigned char) (toktype&0xff); + + /* ptr now at byte 2 of header described in rfc 1964, section 1.2.1: */ + krb5_hdr = ptr - 2; + msg_start = krb5_hdr + 24; + + *(u16 *)(krb5_hdr + 2) = htons(ctx->signalg); + memset(krb5_hdr + 4, 0xff, 4); + if (toktype == KG_TOK_WRAP_MSG) + *(u16 
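gss_krb5_padding() above always pads to an 8-byte block and always adds at least one byte, so a length already on a block boundary gets a full extra block. A quick stand-alone check of that rule.

#include <stdio.h>

static int pad8(int length)
{
	return 8 - (length & 7);	/* 1..8 bytes of padding */
}

int main(void)
{
	for (int len = 14; len <= 17; len++)
		printf("len %2d -> pad %d -> total %2d\n",
		       len, pad8(len), len + pad8(len));
	return 0;
}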
*)(krb5_hdr + 4) = htons(ctx->sealalg); + + if (toktype == KG_TOK_WRAP_MSG) { + /* XXX removing support for now */ + goto out_err; + } else { /* Sign only. */ + if (make_checksum(checksum_type, krb5_hdr, 8, text, + &md5cksum)) + goto out_err; + } + + switch (ctx->signalg) { + case SGN_ALG_DES_MAC_MD5: + if (krb5_encrypt(ctx->seq, NULL, md5cksum.data, + md5cksum.data, md5cksum.len)) + goto out_err; + memcpy(krb5_hdr + 16, + md5cksum.data + md5cksum.len - KRB5_CKSUM_LENGTH, + KRB5_CKSUM_LENGTH); + + dprintk("RPC: make_seal_token: cksum data: \n"); + print_hexl((u32 *) (krb5_hdr + 16), KRB5_CKSUM_LENGTH, 0); + break; + default: + BUG(); + } + + kfree(md5cksum.data); + + if ((krb5_make_seq_num(ctx->seq, ctx->initiate ? 0 : 0xff, + ctx->seq_send, krb5_hdr + 16, krb5_hdr + 8))) + goto out_err; + + ctx->seq_send++; + + return ((ctx->endtime < now) ? GSS_S_CONTEXT_EXPIRED : GSS_S_COMPLETE); +out_err: + if (md5cksum.data) kfree(md5cksum.data); + return GSS_S_FAILURE; +} diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c new file mode 100644 index 000000000000..c53ead39118d --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -0,0 +1,88 @@ +/* + * linux/net/sunrpc/gss_krb5_seqnum.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/util_seqnum.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. 
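krb5_make_seq_num() in the next file encrypts an 8-byte plaintext: the 32-bit sequence number least-significant byte first, then the direction byte repeated four times (0x00 when the context initiator sends, 0xff for the acceptor). A sketch of just that layout, with a made-up sequence number.

#include <stdio.h>

int main(void)
{
	unsigned int seqnum = 0x01020304;	/* illustrative value */
	unsigned char direction = 0x00;		/* initiator */
	unsigned char plain[8];

	plain[0] = seqnum & 0xff;
	plain[1] = (seqnum >> 8) & 0xff;
	plain[2] = (seqnum >> 16) & 0xff;
	plain[3] = (seqnum >> 24) & 0xff;
	plain[4] = plain[5] = plain[6] = plain[7] = direction;

	for (int i = 0; i < 8; i++)
		printf("%02x ", plain[i]);
	printf("\n");			/* 04 03 02 01 00 00 00 00 */
	return 0;
}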
+ */ + +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +s32 +krb5_make_seq_num(struct crypto_tfm *key, + int direction, + s32 seqnum, + unsigned char *cksum, unsigned char *buf) +{ + unsigned char plain[8]; + + plain[0] = (unsigned char) (seqnum & 0xff); + plain[1] = (unsigned char) ((seqnum >> 8) & 0xff); + plain[2] = (unsigned char) ((seqnum >> 16) & 0xff); + plain[3] = (unsigned char) ((seqnum >> 24) & 0xff); + + plain[4] = direction; + plain[5] = direction; + plain[6] = direction; + plain[7] = direction; + + return krb5_encrypt(key, cksum, plain, buf, 8); +} + +s32 +krb5_get_seq_num(struct crypto_tfm *key, + unsigned char *cksum, + unsigned char *buf, + int *direction, s32 * seqnum) +{ + s32 code; + unsigned char plain[8]; + + dprintk("RPC: krb5_get_seq_num:\n"); + + if ((code = krb5_decrypt(key, cksum, buf, plain, 8))) + return code; + + if ((plain[4] != plain[5]) || (plain[4] != plain[6]) + || (plain[4] != plain[7])) + return (s32)KG_BAD_SEQ; + + *direction = plain[4]; + + *seqnum = ((plain[0]) | + (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); + + return (0); +} diff --git a/net/sunrpc/auth_gss/gss_krb5_unseal.c b/net/sunrpc/auth_gss/gss_krb5_unseal.c new file mode 100644 index 000000000000..8767fc53183d --- /dev/null +++ b/net/sunrpc/auth_gss/gss_krb5_unseal.c @@ -0,0 +1,202 @@ +/* + * linux/net/sunrpc/gss_krb5_unseal.c + * + * Adapted from MIT Kerberos 5-1.2.1 lib/gssapi/krb5/k5unseal.c + * + * Copyright (c) 2000 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + */ + +/* + * Copyright 1993 by OpenVision Technologies, Inc. + * + * Permission to use, copy, modify, distribute, and sell this software + * and its documentation for any purpose is hereby granted without fee, + * provided that the above copyright notice appears in all copies and + * that both that copyright notice and this permission notice appear in + * supporting documentation, and that the name of OpenVision not be used + * in advertising or publicity pertaining to distribution of the software + * without specific, written prior permission. OpenVision makes no + * representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied warranty. + * + * OPENVISION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO + * EVENT SHALL OPENVISION BE LIABLE FOR ANY SPECIAL, INDIRECT OR + * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF + * USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * Copyright (C) 1998 by the FundsXpress, INC. + * + * All rights reserved. + * + * Export of this software from the United States of America may require + * a specific license from the United States Government. It is the + * responsibility of any person or organization contemplating export to + * obtain such a license before exporting. + * + * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and + * distribute this software and its documentation for any purpose and + * without fee is hereby granted, provided that the above copyright + * notice appear in all copies and that both that copyright notice and + * this permission notice appear in supporting documentation, and that + * the name of FundsXpress. 
not be used in advertising or publicity pertaining + * to distribution of the software without specific, written prior + * permission. FundsXpress makes no representations about the suitability of + * this software for any purpose. It is provided "as is" without express + * or implied warranty. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND WITHOUT ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED + * WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + + +/* message_buffer is an input if toktype is MIC and an output if it is WRAP: + * If toktype is MIC: read_token is a mic token, and message_buffer is the + * data that the mic was supposedly taken over. + * If toktype is WRAP: read_token is a wrap token, and message_buffer is used + * to return the decrypted data. + */ + +/* XXX will need to change prototype and/or just split into a separate function + * when we add privacy (because read_token will be in pages too). */ +u32 +krb5_read_token(struct krb5_ctx *ctx, + struct xdr_netobj *read_token, + struct xdr_buf *message_buffer, + int *qop_state, int toktype) +{ + int signalg; + int sealalg; + s32 checksum_type; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + s32 now; + int direction; + s32 seqnum; + unsigned char *ptr = (unsigned char *)read_token->data; + int bodysize; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + + dprintk("RPC: krb5_read_token\n"); + + if (g_verify_token_header(&ctx->mech_used, &bodysize, &ptr, + read_token->len)) + goto out; + + if ((*ptr++ != ((toktype>>8)&0xff)) || (*ptr++ != (toktype&0xff))) + goto out; + + /* XXX sanity-check bodysize?? */ + + if (toktype == KG_TOK_WRAP_MSG) { + /* XXX gone */ + goto out; + } + + /* get the sign and seal algorithms */ + + signalg = ptr[0] + (ptr[1] << 8); + sealalg = ptr[2] + (ptr[3] << 8); + + /* Sanity checks */ + + if ((ptr[4] != 0xff) || (ptr[5] != 0xff)) + goto out; + + if (((toktype != KG_TOK_WRAP_MSG) && (sealalg != 0xffff)) || + ((toktype == KG_TOK_WRAP_MSG) && (sealalg == 0xffff))) + goto out; + + /* in the current spec, there is only one valid seal algorithm per + key type, so a simple comparison is ok */ + + if ((toktype == KG_TOK_WRAP_MSG) && !(sealalg == ctx->sealalg)) + goto out; + + /* there are several mappings of seal algorithms to sign algorithms, + but few enough that we can try them all. */ + + if ((ctx->sealalg == SEAL_ALG_NONE && signalg > 1) || + (ctx->sealalg == SEAL_ALG_1 && signalg != SGN_ALG_3) || + (ctx->sealalg == SEAL_ALG_DES3KD && + signalg != SGN_ALG_HMAC_SHA1_DES3_KD)) + goto out; + + /* compute the checksum of the message */ + + /* initialize the the cksum */ + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + switch (signalg) { + case SGN_ALG_DES_MAC_MD5: + ret = make_checksum(checksum_type, ptr - 2, 8, + message_buffer, &md5cksum); + if (ret) + goto out; + + ret = krb5_encrypt(ctx->seq, NULL, md5cksum.data, + md5cksum.data, 16); + if (ret) + goto out; + + if (memcmp(md5cksum.data + 8, ptr + 14, 8)) { + ret = GSS_S_BAD_SIG; + goto out; + } + break; + default: + ret = GSS_S_DEFECTIVE_TOKEN; + goto out; + } + + /* it got through unscathed. 
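krb5_read_token() below pulls the algorithm fields out of the token header as 16-bit values stored least-significant byte first, right after the two token-type bytes, and then sanity-checks the 0xff filler. A small sketch of that decoding with header bytes for a MIC token (sign algorithm 0x0000, DES MAC MD5; seal algorithm 0xffff, none).

#include <stdio.h>

int main(void)
{
	/* hypothetical bytes following TOK_ID in a MIC token header */
	unsigned char hdr[6] = { 0x00, 0x00, 0xff, 0xff, 0xff, 0xff };
	int signalg = hdr[0] + (hdr[1] << 8);
	int sealalg = hdr[2] + (hdr[3] << 8);

	printf("signalg %d, sealalg 0x%04x, filler ok: %d\n",
	       signalg, sealalg, hdr[4] == 0xff && hdr[5] == 0xff);
	return 0;
}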
Make sure the context is unexpired */ + + if (qop_state) + *qop_state = GSS_C_QOP_DEFAULT; + + now = get_seconds(); + + ret = GSS_S_CONTEXT_EXPIRED; + if (now > ctx->endtime) + goto out; + + /* do sequencing checks */ + + ret = GSS_S_BAD_SIG; + if ((ret = krb5_get_seq_num(ctx->seq, ptr + 14, ptr + 6, &direction, + &seqnum))) + goto out; + + if ((ctx->initiate && direction != 0xff) || + (!ctx->initiate && direction != 0)) + goto out; + + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) kfree(md5cksum.data); + return ret; +} diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c new file mode 100644 index 000000000000..9dfb68377d69 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -0,0 +1,301 @@ +/* + * linux/net/sunrpc/gss_mech_switch.c + * + * Copyright (c) 2001 The Regents of the University of Michigan. + * All rights reserved. + * + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static LIST_HEAD(registered_mechs); +static DEFINE_SPINLOCK(registered_mechs_lock); + +static void +gss_mech_free(struct gss_api_mech *gm) +{ + struct pf_desc *pf; + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + pf = &gm->gm_pfs[i]; + if (pf->auth_domain_name) + kfree(pf->auth_domain_name); + pf->auth_domain_name = NULL; + } +} + +static inline char * +make_auth_domain_name(char *name) +{ + static char *prefix = "gss/"; + char *new; + + new = kmalloc(strlen(name) + strlen(prefix) + 1, GFP_KERNEL); + if (new) { + strcpy(new, prefix); + strcat(new, name); + } + return new; +} + +static int +gss_mech_svc_setup(struct gss_api_mech *gm) +{ + struct pf_desc *pf; + int i, status; + + for (i = 0; i < gm->gm_pf_num; i++) { + pf = &gm->gm_pfs[i]; + pf->auth_domain_name = make_auth_domain_name(pf->name); + status = -ENOMEM; + if (pf->auth_domain_name == NULL) + goto out; + status = svcauth_gss_register_pseudoflavor(pf->pseudoflavor, + pf->auth_domain_name); + if (status) + goto out; + } + return 0; +out: + gss_mech_free(gm); + return status; +} + +int +gss_mech_register(struct gss_api_mech *gm) +{ + int status; + + status = gss_mech_svc_setup(gm); + if (status) + return status; + spin_lock(®istered_mechs_lock); + list_add(&gm->gm_list, ®istered_mechs); + spin_unlock(®istered_mechs_lock); + dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); + return 0; +} + +EXPORT_SYMBOL(gss_mech_register); + +void +gss_mech_unregister(struct gss_api_mech *gm) +{ + spin_lock(®istered_mechs_lock); + list_del(&gm->gm_list); + spin_unlock(®istered_mechs_lock); + dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); + gss_mech_free(gm); +} + +EXPORT_SYMBOL(gss_mech_unregister); + +struct gss_api_mech * +gss_mech_get(struct gss_api_mech *gm) +{ + __module_get(gm->gm_owner); + return gm; +} + +EXPORT_SYMBOL(gss_mech_get); + +struct gss_api_mech * +gss_mech_get_by_name(const char *name) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (0 == strcmp(name, pos->gm_name)) { + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + } + spin_unlock(®istered_mechs_lock); + return gm; + +} + +EXPORT_SYMBOL(gss_mech_get_by_name); + +static inline int +mech_supports_pseudoflavor(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return 1; + } + return 0; +} + +struct gss_api_mech * +gss_mech_get_by_pseudoflavor(u32 pseudoflavor) +{ + struct gss_api_mech *pos, *gm = NULL; + + spin_lock(®istered_mechs_lock); + list_for_each_entry(pos, ®istered_mechs, gm_list) { + if (!mech_supports_pseudoflavor(pos, pseudoflavor)) { + module_put(pos->gm_owner); + continue; + } + if (try_module_get(pos->gm_owner)) + gm = pos; + break; + } + spin_unlock(®istered_mechs_lock); + return gm; +} + +EXPORT_SYMBOL(gss_mech_get_by_pseudoflavor); + +u32 +gss_pseudoflavor_to_service(struct gss_api_mech *gm, u32 pseudoflavor) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + if (gm->gm_pfs[i].pseudoflavor == pseudoflavor) + return gm->gm_pfs[i].service; + } + return 0; +} + +EXPORT_SYMBOL(gss_pseudoflavor_to_service); + +char * +gss_service_to_auth_domain_name(struct gss_api_mech *gm, u32 service) +{ + int i; + + for (i = 0; i < gm->gm_pf_num; i++) { + 
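gss_pseudoflavor_to_service() below is a linear scan of the mechanism's pf_desc table, mapping an RPC pseudoflavor to a GSS service level. A sketch of that lookup; the pseudoflavor numbers follow the usual krb5/krb5i assignments but should be read as illustrative, and the service codes are stand-ins rather than the kernel constants.

#include <stdio.h>

struct pf {
	unsigned int pseudoflavor;
	unsigned int service;
	const char  *name;
};

static const struct pf pfs[] = {
	{ 390003, 1, "krb5"  },		/* plays RPC_GSS_SVC_NONE */
	{ 390004, 2, "krb5i" },		/* plays RPC_GSS_SVC_INTEGRITY */
};

static unsigned int pf_to_service(unsigned int flavor)
{
	for (unsigned int i = 0; i < sizeof(pfs) / sizeof(pfs[0]); i++)
		if (pfs[i].pseudoflavor == flavor)
			return pfs[i].service;
	return 0;			/* unknown pseudoflavor */
}

int main(void)
{
	printf("service for 390004: %u\n", pf_to_service(390004));
	return 0;
}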
if (gm->gm_pfs[i].service == service) + return gm->gm_pfs[i].auth_domain_name; + } + return NULL; +} + +EXPORT_SYMBOL(gss_service_to_auth_domain_name); + +void +gss_mech_put(struct gss_api_mech * gm) +{ + module_put(gm->gm_owner); +} + +EXPORT_SYMBOL(gss_mech_put); + +/* The mech could probably be determined from the token instead, but it's just + * as easy for now to pass it in. */ +int +gss_import_sec_context(const void *input_token, size_t bufsize, + struct gss_api_mech *mech, + struct gss_ctx **ctx_id) +{ + if (!(*ctx_id = kmalloc(sizeof(**ctx_id), GFP_KERNEL))) + return GSS_S_FAILURE; + memset(*ctx_id, 0, sizeof(**ctx_id)); + (*ctx_id)->mech_type = gss_mech_get(mech); + + return mech->gm_ops + ->gss_import_sec_context(input_token, bufsize, *ctx_id); +} + +/* gss_get_mic: compute a mic over message and return mic_token. */ + +u32 +gss_get_mic(struct gss_ctx *context_handle, + u32 qop, + struct xdr_buf *message, + struct xdr_netobj *mic_token) +{ + return context_handle->mech_type->gm_ops + ->gss_get_mic(context_handle, + qop, + message, + mic_token); +} + +/* gss_verify_mic: check whether the provided mic_token verifies message. */ + +u32 +gss_verify_mic(struct gss_ctx *context_handle, + struct xdr_buf *message, + struct xdr_netobj *mic_token, + u32 *qstate) +{ + return context_handle->mech_type->gm_ops + ->gss_verify_mic(context_handle, + message, + mic_token, + qstate); +} + +/* gss_delete_sec_context: free all resources associated with context_handle. + * Note this differs from the RFC 2744-specified prototype in that we don't + * bother returning an output token, since it would never be used anyway. */ + +u32 +gss_delete_sec_context(struct gss_ctx **context_handle) +{ + dprintk("RPC: gss_delete_sec_context deleting %p\n", + *context_handle); + + if (!*context_handle) + return(GSS_S_NO_CONTEXT); + if ((*context_handle)->internal_ctx_id != 0) + (*context_handle)->mech_type->gm_ops + ->gss_delete_sec_context((*context_handle) + ->internal_ctx_id); + if ((*context_handle)->mech_type) + gss_mech_put((*context_handle)->mech_type); + kfree(*context_handle); + *context_handle=NULL; + return GSS_S_COMPLETE; +} diff --git a/net/sunrpc/auth_gss/gss_spkm3_mech.c b/net/sunrpc/auth_gss/gss_spkm3_mech.c new file mode 100644 index 000000000000..dad05994c3eb --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_mech.c @@ -0,0 +1,300 @@ +/* + * linux/net/sunrpc/gss_spkm3_mech.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * J. Bruce Fields + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
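gss_delete_sec_context() below takes a struct gss_ctx ** so it can free the context and clear the caller's handle in one step, leaving no stale pointer behind. A toy illustration of that double-pointer teardown pattern; the types are made up for the example.

#include <stdio.h>
#include <stdlib.h>

struct toy_ctx {
	int id;
};

static void delete_ctx(struct toy_ctx **handle)
{
	if (!*handle)
		return;
	free(*handle);
	*handle = NULL;		/* the caller's pointer is cleared too */
}

int main(void)
{
	struct toy_ctx *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return 1;
	ctx->id = 1;
	delete_ctx(&ctx);
	printf("handle after delete: %p\n", (void *)ctx);	/* (nil) */
	return 0;
}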
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static const void * +simple_get_bytes(const void *p, const void *end, void *res, int len) +{ + const void *q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + memcpy(res, p, len); + return q; +} + +static const void * +simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res) +{ + const void *q; + unsigned int len; + p = simple_get_bytes(p, end, &len, sizeof(len)); + if (IS_ERR(p)) + return p; + res->len = len; + if (len == 0) { + res->data = NULL; + return p; + } + q = (const void *)((const char *)p + len); + if (unlikely(q > end || q < p)) + return ERR_PTR(-EFAULT); + res->data = kmalloc(len, GFP_KERNEL); + if (unlikely(res->data == NULL)) + return ERR_PTR(-ENOMEM); + memcpy(res->data, p, len); + return q; +} + +static inline const void * +get_key(const void *p, const void *end, struct crypto_tfm **res, int *resalg) +{ + struct xdr_netobj key = { 0 }; + int alg_mode,setkey = 0; + char *alg_name; + + p = simple_get_bytes(p, end, resalg, sizeof(*resalg)); + if (IS_ERR(p)) + goto out_err; + p = simple_get_netobj(p, end, &key); + if (IS_ERR(p)) + goto out_err; + + switch (*resalg) { + case NID_des_cbc: + alg_name = "des"; + alg_mode = CRYPTO_TFM_MODE_CBC; + setkey = 1; + break; + case NID_md5: + if (key.len == 0) { + dprintk("RPC: SPKM3 get_key: NID_md5 zero Key length\n"); + } + alg_name = "md5"; + alg_mode = 0; + setkey = 0; + break; + default: + dprintk("RPC: SPKM3 get_key: unsupported algorithm %d", *resalg); + goto out_err_free_key; + } + if (!(*res = crypto_alloc_tfm(alg_name, alg_mode))) + goto out_err_free_key; + if (setkey) { + if (crypto_cipher_setkey(*res, key.data, key.len)) + goto out_err_free_tfm; + } + + if(key.len > 0) + kfree(key.data); + return p; + +out_err_free_tfm: + crypto_free_tfm(*res); +out_err_free_key: + if(key.len > 0) + kfree(key.data); + p = ERR_PTR(-EINVAL); +out_err: + return p; +} + +static int +gss_import_sec_context_spkm3(const void *p, size_t len, + struct gss_ctx *ctx_id) +{ + const void *end = (const void *)((const char *)p + len); + struct spkm3_ctx *ctx; + + if (!(ctx = kmalloc(sizeof(*ctx), GFP_KERNEL))) + goto out_err; + memset(ctx, 0, sizeof(*ctx)); + + p = simple_get_netobj(p, end, &ctx->ctx_id); + if (IS_ERR(p)) + goto out_err_free_ctx; + + p = simple_get_bytes(p, end, &ctx->qop, sizeof(ctx->qop)); + if (IS_ERR(p)) + goto out_err_free_ctx_id; + + p = simple_get_netobj(p, end, &ctx->mech_used); + if (IS_ERR(p)) + goto out_err_free_mech; + + p = simple_get_bytes(p, end, &ctx->ret_flags, sizeof(ctx->ret_flags)); + if (IS_ERR(p)) + goto out_err_free_mech; + + p = simple_get_bytes(p, end, &ctx->req_flags, sizeof(ctx->req_flags)); + if (IS_ERR(p)) + goto out_err_free_mech; + + p = simple_get_netobj(p, end, &ctx->share_key); + if (IS_ERR(p)) + goto out_err_free_s_key; + + p = 
get_key(p, end, &ctx->derived_conf_key, &ctx->conf_alg); + if (IS_ERR(p)) + goto out_err_free_s_key; + + p = get_key(p, end, &ctx->derived_integ_key, &ctx->intg_alg); + if (IS_ERR(p)) + goto out_err_free_key1; + + p = simple_get_bytes(p, end, &ctx->keyestb_alg, sizeof(ctx->keyestb_alg)); + if (IS_ERR(p)) + goto out_err_free_key2; + + p = simple_get_bytes(p, end, &ctx->owf_alg, sizeof(ctx->owf_alg)); + if (IS_ERR(p)) + goto out_err_free_key2; + + if (p != end) + goto out_err_free_key2; + + ctx_id->internal_ctx_id = ctx; + + dprintk("Succesfully imported new spkm context.\n"); + return 0; + +out_err_free_key2: + crypto_free_tfm(ctx->derived_integ_key); +out_err_free_key1: + crypto_free_tfm(ctx->derived_conf_key); +out_err_free_s_key: + kfree(ctx->share_key.data); +out_err_free_mech: + kfree(ctx->mech_used.data); +out_err_free_ctx_id: + kfree(ctx->ctx_id.data); +out_err_free_ctx: + kfree(ctx); +out_err: + return PTR_ERR(p); +} + +static void +gss_delete_sec_context_spkm3(void *internal_ctx) { + struct spkm3_ctx *sctx = internal_ctx; + + if(sctx->derived_integ_key) + crypto_free_tfm(sctx->derived_integ_key); + if(sctx->derived_conf_key) + crypto_free_tfm(sctx->derived_conf_key); + if(sctx->share_key.data) + kfree(sctx->share_key.data); + if(sctx->mech_used.data) + kfree(sctx->mech_used.data); + kfree(sctx); +} + +static u32 +gss_verify_mic_spkm3(struct gss_ctx *ctx, + struct xdr_buf *signbuf, + struct xdr_netobj *checksum, + u32 *qstate) { + u32 maj_stat = 0; + int qop_state = 0; + struct spkm3_ctx *sctx = ctx->internal_ctx_id; + + dprintk("RPC: gss_verify_mic_spkm3 calling spkm3_read_token\n"); + maj_stat = spkm3_read_token(sctx, checksum, signbuf, &qop_state, + SPKM_MIC_TOK); + + if (!maj_stat && qop_state) + *qstate = qop_state; + + dprintk("RPC: gss_verify_mic_spkm3 returning %d\n", maj_stat); + return maj_stat; +} + +static u32 +gss_get_mic_spkm3(struct gss_ctx *ctx, + u32 qop, + struct xdr_buf *message_buffer, + struct xdr_netobj *message_token) { + u32 err = 0; + struct spkm3_ctx *sctx = ctx->internal_ctx_id; + + dprintk("RPC: gss_get_mic_spkm3\n"); + + err = spkm3_make_token(sctx, qop, message_buffer, + message_token, SPKM_MIC_TOK); + return err; +} + +static struct gss_api_ops gss_spkm3_ops = { + .gss_import_sec_context = gss_import_sec_context_spkm3, + .gss_get_mic = gss_get_mic_spkm3, + .gss_verify_mic = gss_verify_mic_spkm3, + .gss_delete_sec_context = gss_delete_sec_context_spkm3, +}; + +static struct pf_desc gss_spkm3_pfs[] = { + {RPC_AUTH_GSS_SPKM, 0, RPC_GSS_SVC_NONE, "spkm3"}, + {RPC_AUTH_GSS_SPKMI, 0, RPC_GSS_SVC_INTEGRITY, "spkm3i"}, +}; + +static struct gss_api_mech gss_spkm3_mech = { + .gm_name = "spkm3", + .gm_owner = THIS_MODULE, + .gm_ops = &gss_spkm3_ops, + .gm_pf_num = ARRAY_SIZE(gss_spkm3_pfs), + .gm_pfs = gss_spkm3_pfs, +}; + +static int __init init_spkm3_module(void) +{ + int status; + + status = gss_mech_register(&gss_spkm3_mech); + if (status) + printk("Failed to register spkm3 gss mechanism!\n"); + return 0; +} + +static void __exit cleanup_spkm3_module(void) +{ + gss_mech_unregister(&gss_spkm3_mech); +} + +MODULE_LICENSE("GPL"); +module_init(init_spkm3_module); +module_exit(cleanup_spkm3_module); diff --git a/net/sunrpc/auth_gss/gss_spkm3_seal.c b/net/sunrpc/auth_gss/gss_spkm3_seal.c new file mode 100644 index 000000000000..25339868d462 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_seal.c @@ -0,0 +1,132 @@ +/* + * linux/net/sunrpc/gss_spkm3_seal.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * spkm3_make_token() + * + * Only SPKM_MIC_TOK with md5 intg-alg is supported + */ + +u32 +spkm3_make_token(struct spkm3_ctx *ctx, int qop_req, + struct xdr_buf * text, struct xdr_netobj * token, + int toktype) +{ + s32 checksum_type; + char tokhdrbuf[25]; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + struct xdr_netobj mic_hdr = {.len = 0, .data = tokhdrbuf}; + int tmsglen, tokenlen = 0; + unsigned char *ptr; + s32 now; + int ctxelen = 0, ctxzbit = 0; + int md5elen = 0, md5zbit = 0; + + dprintk("RPC: spkm3_make_token\n"); + + now = jiffies; + if (qop_req != 0) + goto out_err; + + if (ctx->ctx_id.len != 16) { + dprintk("RPC: spkm3_make_token BAD ctx_id.len %d\n", + ctx->ctx_id.len); + goto out_err; + } + + switch (ctx->intg_alg) { + case NID_md5: + checksum_type = CKSUMTYPE_RSA_MD5; + break; + default: + dprintk("RPC: gss_spkm3_seal: ctx->signalg %d not" + " supported\n", ctx->intg_alg); + goto out_err; + } + /* XXX since we don't support WRAP, perhaps we don't care... 
*/ + if (ctx->conf_alg != NID_cast5_cbc) { + dprintk("RPC: gss_spkm3_seal: ctx->sealalg %d not supported\n", + ctx->conf_alg); + goto out_err; + } + + if (toktype == SPKM_MIC_TOK) { + tmsglen = 0; + /* Calculate checksum over the mic-header */ + asn1_bitstring_len(&ctx->ctx_id, &ctxelen, &ctxzbit); + spkm3_mic_header(&mic_hdr.data, &mic_hdr.len, ctx->ctx_id.data, + ctxelen, ctxzbit); + + if (make_checksum(checksum_type, mic_hdr.data, mic_hdr.len, + text, &md5cksum)) + goto out_err; + + asn1_bitstring_len(&md5cksum, &md5elen, &md5zbit); + tokenlen = 10 + ctxelen + 1 + 2 + md5elen + 1; + + /* Create token header using generic routines */ + token->len = g_token_size(&ctx->mech_used, tokenlen + tmsglen); + + ptr = token->data; + g_make_token_header(&ctx->mech_used, tokenlen + tmsglen, &ptr); + + spkm3_make_mic_token(&ptr, tokenlen, &mic_hdr, &md5cksum, md5elen, md5zbit); + } else if (toktype == SPKM_WRAP_TOK) { /* Not Supported */ + dprintk("RPC: gss_spkm3_seal: SPKM_WRAP_TOK not supported\n"); + goto out_err; + } + kfree(md5cksum.data); + + /* XXX need to implement sequence numbers, and ctx->expired */ + + return GSS_S_COMPLETE; +out_err: + if (md5cksum.data) + kfree(md5cksum.data); + token->data = NULL; + token->len = 0; + return GSS_S_FAILURE; +} diff --git a/net/sunrpc/auth_gss/gss_spkm3_token.c b/net/sunrpc/auth_gss/gss_spkm3_token.c new file mode 100644 index 000000000000..46c08a0710f6 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_token.c @@ -0,0 +1,266 @@ +/* + * linux/net/sunrpc/gss_spkm3_token.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + */ + +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * asn1_bitstring_len() + * + * calculate the asn1 bitstring length of the xdr_netobject + */ +void +asn1_bitstring_len(struct xdr_netobj *in, int *enclen, int *zerobits) +{ + int i, zbit = 0,elen = in->len; + char *ptr; + + ptr = &in->data[in->len -1]; + + /* count trailing 0's */ + for(i = in->len; i > 0; i--) { + if (*ptr == 0) { + ptr--; + elen--; + } else + break; + } + + /* count number of 0 bits in final octet */ + ptr = &in->data[elen - 1]; + for(i = 0; i < 8; i++) { + short mask = 0x01; + + if (!((mask << i) & *ptr)) + zbit++; + else + break; + } + *enclen = elen; + *zerobits = zbit; +} + +/* + * decode_asn1_bitstring() + * + * decode a bitstring into a buffer of the expected length. + * enclen = bit string length + * explen = expected length (define in rfc) + */ +int +decode_asn1_bitstring(struct xdr_netobj *out, char *in, int enclen, int explen) +{ + if (!(out->data = kmalloc(explen,GFP_KERNEL))) + return 0; + out->len = explen; + memset(out->data, 0, explen); + memcpy(out->data, in, enclen); + return 1; +} + +/* + * SPKMInnerContextToken choice SPKM_MIC asn1 token layout + * + * contextid is always 16 bytes plain data. max asn1 bitstring len = 17. + * + * tokenlen = pos[0] to end of token (max pos[45] with MD5 cksum) + * + * pos value + * ---------- + * [0] a4 SPKM-MIC tag + * [1] ?? innertoken length (max 44) + * + * + * tok_hdr piece of checksum data starts here + * + * the maximum mic-header len = 9 + 17 = 26 + * mic-header + * ---------- + * [2] 30 SEQUENCE tag + * [3] ?? mic-header length: (max 23) = TokenID + ContextID + * + * TokenID - all fields constant and can be hardcoded + * ------- + * [4] 02 Type 2 + * [5] 02 Length 2 + * [6][7] 01 01 TokenID (SPKM_MIC_TOK) + * + * ContextID - encoded length not constant, calculated + * --------- + * [8] 03 Type 3 + * [9] ?? encoded length + * [10] ?? ctxzbit + * [11] contextid + * + * mic_header piece of checksum data ends here. + * + * int-cksum - encoded length not constant, calculated + * --------- + * [??] 03 Type 3 + * [??] ?? encoded length + * [??] ?? md5zbit + * [??] 
int-cksum (NID_md5 = 16) + * + * maximum SPKM-MIC innercontext token length = + * 10 + encoded contextid_size(17 max) + 2 + encoded + * cksum_size (17 maxfor NID_md5) = 46 + */ + +/* + * spkm3_mic_header() + * + * Prepare the SPKM_MIC_TOK mic-header for check-sum calculation + * elen: 16 byte context id asn1 bitstring encoded length + */ +void +spkm3_mic_header(unsigned char **hdrbuf, unsigned int *hdrlen, unsigned char *ctxdata, int elen, int zbit) +{ + char *hptr = *hdrbuf; + char *top = *hdrbuf; + + *(u8 *)hptr++ = 0x30; + *(u8 *)hptr++ = elen + 7; /* on the wire header length */ + + /* tokenid */ + *(u8 *)hptr++ = 0x02; + *(u8 *)hptr++ = 0x02; + *(u8 *)hptr++ = 0x01; + *(u8 *)hptr++ = 0x01; + + /* coniextid */ + *(u8 *)hptr++ = 0x03; + *(u8 *)hptr++ = elen + 1; /* add 1 to include zbit */ + *(u8 *)hptr++ = zbit; + memcpy(hptr, ctxdata, elen); + hptr += elen; + *hdrlen = hptr - top; +} + +/* + * spkm3_mic_innercontext_token() + * + * *tokp points to the beginning of the SPKM_MIC token described + * in rfc 2025, section 3.2.1: + * + */ +void +spkm3_make_mic_token(unsigned char **tokp, int toklen, struct xdr_netobj *mic_hdr, struct xdr_netobj *md5cksum, int md5elen, int md5zbit) +{ + unsigned char *ict = *tokp; + + *(u8 *)ict++ = 0xa4; + *(u8 *)ict++ = toklen - 2; + memcpy(ict, mic_hdr->data, mic_hdr->len); + ict += mic_hdr->len; + + *(u8 *)ict++ = 0x03; + *(u8 *)ict++ = md5elen + 1; /* add 1 to include zbit */ + *(u8 *)ict++ = md5zbit; + memcpy(ict, md5cksum->data, md5elen); +} + +u32 +spkm3_verify_mic_token(unsigned char **tokp, int *mic_hdrlen, unsigned char **cksum) +{ + struct xdr_netobj spkm3_ctx_id = {.len =0, .data = NULL}; + unsigned char *ptr = *tokp; + int ctxelen; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + + /* spkm3 innercontext token preamble */ + if ((ptr[0] != 0xa4) || (ptr[2] != 0x30)) { + dprintk("RPC: BAD SPKM ictoken preamble\n"); + goto out; + } + + *mic_hdrlen = ptr[3]; + + /* token type */ + if ((ptr[4] != 0x02) || (ptr[5] != 0x02)) { + dprintk("RPC: BAD asn1 SPKM3 token type\n"); + goto out; + } + + /* only support SPKM_MIC_TOK */ + if((ptr[6] != 0x01) || (ptr[7] != 0x01)) { + dprintk("RPC: ERROR unsupported SPKM3 token \n"); + goto out; + } + + /* contextid */ + if (ptr[8] != 0x03) { + dprintk("RPC: BAD SPKM3 asn1 context-id type\n"); + goto out; + } + + ctxelen = ptr[9]; + if (ctxelen > 17) { /* length includes asn1 zbit octet */ + dprintk("RPC: BAD SPKM3 contextid len %d\n", ctxelen); + goto out; + } + + /* ignore ptr[10] */ + + if(!decode_asn1_bitstring(&spkm3_ctx_id, &ptr[11], ctxelen - 1, 16)) + goto out; + + /* + * in the current implementation: the optional int-alg is not present + * so the default int-alg (md5) is used the optional snd-seq field is + * also not present + */ + + if (*mic_hdrlen != 6 + ctxelen) { + dprintk("RPC: BAD SPKM_ MIC_TOK header len %d: we only support default int-alg (should be absent) and do not support snd-seq\n", *mic_hdrlen); + goto out; + } + /* checksum */ + *cksum = (&ptr[10] + ctxelen); /* ctxelen includes ptr[10] */ + + ret = GSS_S_COMPLETE; +out: + if (spkm3_ctx_id.data) + kfree(spkm3_ctx_id.data); + return ret; +} + diff --git a/net/sunrpc/auth_gss/gss_spkm3_unseal.c b/net/sunrpc/auth_gss/gss_spkm3_unseal.c new file mode 100644 index 000000000000..65ce81bf0bc4 --- /dev/null +++ b/net/sunrpc/auth_gss/gss_spkm3_unseal.c @@ -0,0 +1,128 @@ +/* + * linux/net/sunrpc/gss_spkm3_unseal.c + * + * Copyright (c) 2003 The Regents of the University of Michigan. + * All rights reserved. 
+ * + * Andy Adamson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* + * spkm3_read_token() + * + * only SPKM_MIC_TOK with md5 intg-alg is supported + */ +u32 +spkm3_read_token(struct spkm3_ctx *ctx, + struct xdr_netobj *read_token, /* checksum */ + struct xdr_buf *message_buffer, /* signbuf */ + int *qop_state, int toktype) +{ + s32 code; + struct xdr_netobj wire_cksum = {.len =0, .data = NULL}; + struct xdr_netobj md5cksum = {.len = 0, .data = NULL}; + unsigned char *ptr = (unsigned char *)read_token->data; + unsigned char *cksum; + int bodysize, md5elen; + int mic_hdrlen; + u32 ret = GSS_S_DEFECTIVE_TOKEN; + + dprintk("RPC: spkm3_read_token read_token->len %d\n", read_token->len); + + if (g_verify_token_header((struct xdr_netobj *) &ctx->mech_used, + &bodysize, &ptr, read_token->len)) + goto out; + + /* decode the token */ + + if (toktype == SPKM_MIC_TOK) { + + if ((ret = spkm3_verify_mic_token(&ptr, &mic_hdrlen, &cksum))) + goto out; + + if (*cksum++ != 0x03) { + dprintk("RPC: spkm3_read_token BAD checksum type\n"); + goto out; + } + md5elen = *cksum++; + cksum++; /* move past the zbit */ + + if(!decode_asn1_bitstring(&wire_cksum, cksum, md5elen - 1, 16)) + goto out; + + /* HARD CODED FOR MD5 */ + + /* compute the checksum of the message. 
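+ * For illustration, assume a 16 byte context id and an MD5 checksum, with
+ * both z-bit octets equal to zero; the innercontext token being parsed here
+ * is then the 46 byte sequence
+ *
+ *	a4 2c 30 17 02 02 01 01 03 11 00 <16 byte ctxid>
+ *	03 11 00 <16 byte md5 checksum>
+ *
+ * so that: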
+ * ptr + 2 = start of header piece of checksum + * mic_hdrlen + 2 = length of header piece of checksum + */ + ret = GSS_S_DEFECTIVE_TOKEN; + code = make_checksum(CKSUMTYPE_RSA_MD5, ptr + 2, + mic_hdrlen + 2, + message_buffer, &md5cksum); + + if (code) + goto out; + + dprintk("RPC: spkm3_read_token: digest wire_cksum.len %d:\n", + wire_cksum.len); + dprintk(" md5cksum.data\n"); + print_hexl((u32 *) md5cksum.data, 16, 0); + dprintk(" cksum.data:\n"); + print_hexl((u32 *) wire_cksum.data, wire_cksum.len, 0); + + ret = GSS_S_BAD_SIG; + code = memcmp(md5cksum.data, wire_cksum.data, wire_cksum.len); + if (code) + goto out; + + } else { + dprintk("RPC: BAD or UNSUPPORTED SPKM3 token type: %d\n",toktype); + goto out; + } + + /* XXX: need to add expiration and sequencing */ + ret = GSS_S_COMPLETE; +out: + if (md5cksum.data) + kfree(md5cksum.data); + if (wire_cksum.data) + kfree(wire_cksum.data); + return ret; +} diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c new file mode 100644 index 000000000000..5c8fe3bfc494 --- /dev/null +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -0,0 +1,1080 @@ +/* + * Neil Brown + * J. Bruce Fields + * Andy Adamson + * Dug Song + * + * RPCSEC_GSS server authentication. + * This implements RPCSEC_GSS as defined in rfc2203 (rpcsec_gss) and rfc2078 + * (gssapi) + * + * The RPCSEC_GSS involves three stages: + * 1/ context creation + * 2/ data exchange + * 3/ context destruction + * + * Context creation is handled largely by upcalls to user-space. + * In particular, GSS_Accept_sec_context is handled by an upcall + * Data exchange is handled entirely within the kernel + * In particular, GSS_GetMIC, GSS_VerifyMIC, GSS_Seal, GSS_Unseal are in-kernel. + * Context destruction is handled in-kernel + * GSS_Delete_sec_context is in-kernel + * + * Context creation is initiated by a RPCSEC_GSS_INIT request arriving. + * The context handle and gss_token are used as a key into the rpcsec_init cache. + * The content of this cache includes some of the outputs of GSS_Accept_sec_context, + * being major_status, minor_status, context_handle, reply_token. + * These are sent back to the client. + * Sequence window management is handled by the kernel. The window size if currently + * a compile time constant. + * + * When user-space is happy that a context is established, it places an entry + * in the rpcsec_context cache. The key for this cache is the context_handle. + * The content includes: + * uid/gidlist - for determining access rights + * mechanism type + * mechanism specific information, such as a key + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +/* The rpcsec_init cache is used for mapping RPCSEC_GSS_{,CONT_}INIT requests + * into replies. + * + * Key is context handle (\x if empty) and gss_token. + * Content is major_status minor_status (integers) context_handle, reply_token. 
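+ * For illustration only (all values hypothetical), one exchange over the
+ * cache channel, as produced by rsi_request() and consumed by rsi_parse()
+ * below, looks like:
+ *
+ *	upcall to user space:   <in_handle> <in_token>
+ *	reply from user space:  <in_handle> <in_token> <expiry>
+ *	                        <major_status> <minor_status>
+ *	                        <out_handle> <out_token>
+ *
+ * (the reply is a single line, wrapped here for readability), with the
+ * handles and tokens hex encoded (\x prefix) by qword_addhex() and read
+ * back with qword_get().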
+ * + */ + +static int netobj_equal(struct xdr_netobj *a, struct xdr_netobj *b) +{ + return a->len == b->len && 0 == memcmp(a->data, b->data, a->len); +} + +#define RSI_HASHBITS 6 +#define RSI_HASHMAX (1<in_handle.data); + kfree(rsii->in_token.data); + kfree(rsii->out_handle.data); + kfree(rsii->out_token.data); +} + +static void rsi_put(struct cache_head *item, struct cache_detail *cd) +{ + struct rsi *rsii = container_of(item, struct rsi, h); + if (cache_put(item, cd)) { + rsi_free(rsii); + kfree(rsii); + } +} + +static inline int rsi_hash(struct rsi *item) +{ + return hash_mem(item->in_handle.data, item->in_handle.len, RSI_HASHBITS) + ^ hash_mem(item->in_token.data, item->in_token.len, RSI_HASHBITS); +} + +static inline int rsi_match(struct rsi *item, struct rsi *tmp) +{ + return netobj_equal(&item->in_handle, &tmp->in_handle) + && netobj_equal(&item->in_token, &tmp->in_token); +} + +static int dup_to_netobj(struct xdr_netobj *dst, char *src, int len) +{ + dst->len = len; + dst->data = (len ? kmalloc(len, GFP_KERNEL) : NULL); + if (dst->data) + memcpy(dst->data, src, len); + if (len && !dst->data) + return -ENOMEM; + return 0; +} + +static inline int dup_netobj(struct xdr_netobj *dst, struct xdr_netobj *src) +{ + return dup_to_netobj(dst, src->data, src->len); +} + +static inline void rsi_init(struct rsi *new, struct rsi *item) +{ + new->out_handle.data = NULL; + new->out_handle.len = 0; + new->out_token.data = NULL; + new->out_token.len = 0; + new->in_handle.len = item->in_handle.len; + item->in_handle.len = 0; + new->in_token.len = item->in_token.len; + item->in_token.len = 0; + new->in_handle.data = item->in_handle.data; + item->in_handle.data = NULL; + new->in_token.data = item->in_token.data; + item->in_token.data = NULL; +} + +static inline void rsi_update(struct rsi *new, struct rsi *item) +{ + BUG_ON(new->out_handle.data || new->out_token.data); + new->out_handle.len = item->out_handle.len; + item->out_handle.len = 0; + new->out_token.len = item->out_token.len; + item->out_token.len = 0; + new->out_handle.data = item->out_handle.data; + item->out_handle.data = NULL; + new->out_token.data = item->out_token.data; + item->out_token.data = NULL; + + new->major_status = item->major_status; + new->minor_status = item->minor_status; +} + +static void rsi_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + struct rsi *rsii = container_of(h, struct rsi, h); + + qword_addhex(bpp, blen, rsii->in_handle.data, rsii->in_handle.len); + qword_addhex(bpp, blen, rsii->in_token.data, rsii->in_token.len); + (*bpp)[-1] = '\n'; +} + + +static int rsi_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* context token expiry major minor context token */ + char *buf = mesg; + char *ep; + int len; + struct rsi rsii, *rsip = NULL; + time_t expiry; + int status = -EINVAL; + + memset(&rsii, 0, sizeof(rsii)); + /* handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.in_handle, buf, len)) + goto out; + + /* token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.in_token, buf, len)) + goto out; + + rsii.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* major/minor */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + if (len == 0) { + goto out; + } else { + rsii.major_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + len = 
qword_get(&mesg, buf, mlen); + if (len <= 0) + goto out; + rsii.minor_status = simple_strtoul(buf, &ep, 10); + if (*ep) + goto out; + + /* out_handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_handle, buf, len)) + goto out; + + /* out_token */ + len = qword_get(&mesg, buf, mlen); + status = -EINVAL; + if (len < 0) + goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsii.out_token, buf, len)) + goto out; + } + rsii.h.expiry_time = expiry; + rsip = rsi_lookup(&rsii, 1); + status = 0; +out: + rsi_free(&rsii); + if (rsip) + rsi_put(&rsip->h, &rsi_cache); + return status; +} + +static struct cache_detail rsi_cache = { + .hash_size = RSI_HASHMAX, + .hash_table = rsi_table, + .name = "auth.rpcsec.init", + .cache_put = rsi_put, + .cache_request = rsi_request, + .cache_parse = rsi_parse, +}; + +static DefineSimpleCacheLookup(rsi, 0) + +/* + * The rpcsec_context cache is used to store a context that is + * used in data exchange. + * The key is a context handle. The content is: + * uid, gidlist, mechanism, service-set, mech-specific-data + */ + +#define RSC_HASHBITS 10 +#define RSC_HASHMAX (1<handle.data); + if (rsci->mechctx) + gss_delete_sec_context(&rsci->mechctx); + if (rsci->cred.cr_group_info) + put_group_info(rsci->cred.cr_group_info); +} + +static void rsc_put(struct cache_head *item, struct cache_detail *cd) +{ + struct rsc *rsci = container_of(item, struct rsc, h); + + if (cache_put(item, cd)) { + rsc_free(rsci); + kfree(rsci); + } +} + +static inline int +rsc_hash(struct rsc *rsci) +{ + return hash_mem(rsci->handle.data, rsci->handle.len, RSC_HASHBITS); +} + +static inline int +rsc_match(struct rsc *new, struct rsc *tmp) +{ + return netobj_equal(&new->handle, &tmp->handle); +} + +static inline void +rsc_init(struct rsc *new, struct rsc *tmp) +{ + new->handle.len = tmp->handle.len; + tmp->handle.len = 0; + new->handle.data = tmp->handle.data; + tmp->handle.data = NULL; + new->mechctx = NULL; + new->cred.cr_group_info = NULL; +} + +static inline void +rsc_update(struct rsc *new, struct rsc *tmp) +{ + new->mechctx = tmp->mechctx; + tmp->mechctx = NULL; + memset(&new->seqdata, 0, sizeof(new->seqdata)); + spin_lock_init(&new->seqdata.sd_lock); + new->cred = tmp->cred; + tmp->cred.cr_group_info = NULL; +} + +static int rsc_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* contexthandle expiry [ uid gid N mechname ...mechdata... 
] */ + char *buf = mesg; + int len, rv; + struct rsc rsci, *rscp = NULL; + time_t expiry; + int status = -EINVAL; + + memset(&rsci, 0, sizeof(rsci)); + /* context handle */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) goto out; + status = -ENOMEM; + if (dup_to_netobj(&rsci.handle, buf, len)) + goto out; + + rsci.h.flags = 0; + /* expiry */ + expiry = get_expiry(&mesg); + status = -EINVAL; + if (expiry == 0) + goto out; + + /* uid, or NEGATIVE */ + rv = get_int(&mesg, &rsci.cred.cr_uid); + if (rv == -EINVAL) + goto out; + if (rv == -ENOENT) + set_bit(CACHE_NEGATIVE, &rsci.h.flags); + else { + int N, i; + struct gss_api_mech *gm; + + /* gid */ + if (get_int(&mesg, &rsci.cred.cr_gid)) + goto out; + + /* number of additional gid's */ + if (get_int(&mesg, &N)) + goto out; + status = -ENOMEM; + rsci.cred.cr_group_info = groups_alloc(N); + if (rsci.cred.cr_group_info == NULL) + goto out; + + /* gid's */ + status = -EINVAL; + for (i=0; ih, &rsc_cache); + return status; +} + +static struct cache_detail rsc_cache = { + .hash_size = RSC_HASHMAX, + .hash_table = rsc_table, + .name = "auth.rpcsec.context", + .cache_put = rsc_put, + .cache_parse = rsc_parse, +}; + +static DefineSimpleCacheLookup(rsc, 0); + +static struct rsc * +gss_svc_searchbyctx(struct xdr_netobj *handle) +{ + struct rsc rsci; + struct rsc *found; + + memset(&rsci, 0, sizeof(rsci)); + if (dup_to_netobj(&rsci.handle, handle->data, handle->len)) + return NULL; + found = rsc_lookup(&rsci, 0); + rsc_free(&rsci); + if (!found) + return NULL; + if (cache_check(&rsc_cache, &found->h, NULL)) + return NULL; + return found; +} + +/* Implements sequence number algorithm as specified in RFC 2203. */ +static int +gss_check_seq_num(struct rsc *rsci, int seq_num) +{ + struct gss_svc_seq_data *sd = &rsci->seqdata; + + spin_lock(&sd->sd_lock); + if (seq_num > sd->sd_max) { + if (seq_num >= sd->sd_max + GSS_SEQ_WIN) { + memset(sd->sd_win,0,sizeof(sd->sd_win)); + sd->sd_max = seq_num; + } else while (sd->sd_max < seq_num) { + sd->sd_max++; + __clear_bit(sd->sd_max % GSS_SEQ_WIN, sd->sd_win); + } + __set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win); + goto ok; + } else if (seq_num <= sd->sd_max - GSS_SEQ_WIN) { + goto drop; + } + /* sd_max - GSS_SEQ_WIN < seq_num <= sd_max */ + if (__test_and_set_bit(seq_num % GSS_SEQ_WIN, sd->sd_win)) + goto drop; +ok: + spin_unlock(&sd->sd_lock); + return 1; +drop: + spin_unlock(&sd->sd_lock); + return 0; +} + +static inline u32 round_up_to_quad(u32 i) +{ + return (i + 3 ) & ~3; +} + +static inline int +svc_safe_getnetobj(struct kvec *argv, struct xdr_netobj *o) +{ + int l; + + if (argv->iov_len < 4) + return -1; + o->len = ntohl(svc_getu32(argv)); + l = round_up_to_quad(o->len); + if (argv->iov_len < l) + return -1; + o->data = argv->iov_base; + argv->iov_base += l; + argv->iov_len -= l; + return 0; +} + +static inline int +svc_safe_putnetobj(struct kvec *resv, struct xdr_netobj *o) +{ + u32 *p; + + if (resv->iov_len + 4 > PAGE_SIZE) + return -1; + svc_putu32(resv, htonl(o->len)); + p = resv->iov_base + resv->iov_len; + resv->iov_len += round_up_to_quad(o->len); + if (resv->iov_len > PAGE_SIZE) + return -1; + memcpy(p, o->data, o->len); + memset((u8 *)p + o->len, 0, round_up_to_quad(o->len) - o->len); + return 0; +} + +/* Verify the checksum on the header and return SVC_OK on success. + * Otherwise, return SVC_DROP (in the case of a bad sequence number) + * or return SVC_DENIED and indicate error in authp. 
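+ * To illustrate the sequence window check used here (window size assumed
+ * to be 32 for the arithmetic; the real size is GSS_SEQ_WIN):
+ *
+ *	gss_check_seq_num(rsci, 1)   -> ok,   sd_max = 1
+ *	gss_check_seq_num(rsci, 5)   -> ok,   sd_max = 5
+ *	gss_check_seq_num(rsci, 3)   -> ok    (inside window, bit now set)
+ *	gss_check_seq_num(rsci, 3)   -> drop  (replay: bit already set)
+ *	gss_check_seq_num(rsci, 100) -> ok    (window reset, sd_max = 100)
+ *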
+ */ +static int +gss_verify_header(struct svc_rqst *rqstp, struct rsc *rsci, + u32 *rpcstart, struct rpc_gss_wire_cred *gc, u32 *authp) +{ + struct gss_ctx *ctx_id = rsci->mechctx; + struct xdr_buf rpchdr; + struct xdr_netobj checksum; + u32 flavor = 0; + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec iov; + + /* data to compute the checksum over: */ + iov.iov_base = rpcstart; + iov.iov_len = (u8 *)argv->iov_base - (u8 *)rpcstart; + xdr_buf_from_iov(&iov, &rpchdr); + + *authp = rpc_autherr_badverf; + if (argv->iov_len < 4) + return SVC_DENIED; + flavor = ntohl(svc_getu32(argv)); + if (flavor != RPC_AUTH_GSS) + return SVC_DENIED; + if (svc_safe_getnetobj(argv, &checksum)) + return SVC_DENIED; + + if (rqstp->rq_deferred) /* skip verification of revisited request */ + return SVC_OK; + if (gss_verify_mic(ctx_id, &rpchdr, &checksum, NULL) + != GSS_S_COMPLETE) { + *authp = rpcsec_gsserr_credproblem; + return SVC_DENIED; + } + + if (gc->gc_seq > MAXSEQ) { + dprintk("RPC: svcauth_gss: discarding request with large sequence number %d\n", + gc->gc_seq); + *authp = rpcsec_gsserr_ctxproblem; + return SVC_DENIED; + } + if (!gss_check_seq_num(rsci, gc->gc_seq)) { + dprintk("RPC: svcauth_gss: discarding request with old sequence number %d\n", + gc->gc_seq); + return SVC_DROP; + } + return SVC_OK; +} + +static int +gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) +{ + u32 xdr_seq; + u32 maj_stat; + struct xdr_buf verf_data; + struct xdr_netobj mic; + u32 *p; + struct kvec iov; + + svc_putu32(rqstp->rq_res.head, htonl(RPC_AUTH_GSS)); + xdr_seq = htonl(seq); + + iov.iov_base = &xdr_seq; + iov.iov_len = sizeof(xdr_seq); + xdr_buf_from_iov(&iov, &verf_data); + p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; + mic.data = (u8 *)(p + 1); + maj_stat = gss_get_mic(ctx_id, 0, &verf_data, &mic); + if (maj_stat != GSS_S_COMPLETE) + return -1; + *p++ = htonl(mic.len); + memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len); + p += XDR_QUADLEN(mic.len); + if (!xdr_ressize_check(rqstp, p)) + return -1; + return 0; +} + +struct gss_domain { + struct auth_domain h; + u32 pseudoflavor; +}; + +static struct auth_domain * +find_gss_auth_domain(struct gss_ctx *ctx, u32 svc) +{ + char *name; + + name = gss_service_to_auth_domain_name(ctx->mech_type, svc); + if (!name) + return NULL; + return auth_domain_find(name); +} + +int +svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name) +{ + struct gss_domain *new; + struct auth_domain *test; + int stat = -ENOMEM; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out; + cache_init(&new->h.h); + new->h.name = kmalloc(strlen(name) + 1, GFP_KERNEL); + if (!new->h.name) + goto out_free_dom; + strcpy(new->h.name, name); + new->h.flavour = RPC_AUTH_GSS; + new->pseudoflavor = pseudoflavor; + new->h.h.expiry_time = NEVER; + + test = auth_domain_lookup(&new->h, 1); + if (test == &new->h) { + BUG_ON(atomic_dec_and_test(&new->h.h.refcnt)); + } else { /* XXX Duplicate registration? */ + auth_domain_put(&new->h); + goto out; + } + return 0; + +out_free_dom: + kfree(new); +out: + return stat; +} + +EXPORT_SYMBOL(svcauth_gss_register_pseudoflavor); + +static inline int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + u32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} + +/* It would be nice if this bit of code could be shared with the client. 
+ * Obstacles: + * The client shouldn't malloc(), would have to pass in own memory. + * The server uses base of head iovec as read pointer, while the + * client uses separate pointer. */ +static int +unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +{ + int stat = -EINVAL; + u32 integ_len, maj_stat; + struct xdr_netobj mic; + struct xdr_buf integ_buf; + + integ_len = ntohl(svc_getu32(&buf->head[0])); + if (integ_len & 3) + goto out; + if (integ_len > buf->len) + goto out; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + BUG(); + /* copy out mic... */ + if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) + BUG(); + if (mic.len > RPC_MAX_AUTH_SIZE) + goto out; + mic.data = kmalloc(mic.len, GFP_KERNEL); + if (!mic.data) + goto out; + if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) + goto out; + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + if (maj_stat != GSS_S_COMPLETE) + goto out; + if (ntohl(svc_getu32(&buf->head[0])) != seq) + goto out; + stat = 0; +out: + return stat; +} + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* pointer to the beginning of the procedure-specific results, + * which may be encrypted/checksummed in svcauth_gss_release: */ + u32 *body_start; + struct rsc *rsci; +}; + +static int +svcauth_gss_set_client(struct svc_rqst *rqstp) +{ + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rsc *rsci = svcdata->rsci; + struct rpc_gss_wire_cred *gc = &svcdata->clcred; + + rqstp->rq_client = find_gss_auth_domain(rsci->mechctx, gc->gc_svc); + if (rqstp->rq_client == NULL) + return SVC_DENIED; + return SVC_OK; +} + +/* + * Accept an rpcsec packet. + * If context establishment, punt to user space + * If data exchange, verify/decrypt + * If context destruction, handle here + * In the context establishment and destruction case we encode + * response here and return SVC_COMPLETE. + */ +static int +svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + u32 crlen; + struct xdr_netobj tmpobj; + struct gss_svc_data *svcdata = rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc; + struct rsc *rsci = NULL; + struct rsi *rsip, rsikey; + u32 *rpcstart; + u32 *reject_stat = resv->iov_base + resv->iov_len; + int ret; + + dprintk("RPC: svcauth_gss: argv->iov_len = %zd\n",argv->iov_len); + + *authp = rpc_autherr_badcred; + if (!svcdata) + svcdata = kmalloc(sizeof(*svcdata), GFP_KERNEL); + if (!svcdata) + goto auth_err; + rqstp->rq_auth_data = svcdata; + svcdata->body_start = NULL; + svcdata->rsci = NULL; + gc = &svcdata->clcred; + + /* start of rpc packet is 7 u32's back from here: + * xid direction rpcversion prog vers proc flavour + */ + rpcstart = argv->iov_base; + rpcstart -= 7; + + /* credential is: + * version(==1), proc(0,1,2,3), seq, service (1,2,3), handle + * at least 5 u32s, and is preceeded by length, so that makes 6. + */ + + if (argv->iov_len < 5 * 4) + goto auth_err; + crlen = ntohl(svc_getu32(argv)); + if (ntohl(svc_getu32(argv)) != RPC_GSS_VERSION) + goto auth_err; + gc->gc_proc = ntohl(svc_getu32(argv)); + gc->gc_seq = ntohl(svc_getu32(argv)); + gc->gc_svc = ntohl(svc_getu32(argv)); + if (svc_safe_getnetobj(argv, &gc->gc_ctx)) + goto auth_err; + if (crlen != round_up_to_quad(gc->gc_ctx.len) + 5 * 4) + goto auth_err; + + if ((gc->gc_proc != RPC_GSS_PROC_DATA) && (rqstp->rq_proc != 0)) + goto auth_err; + + /* + * We've successfully parsed the credential. 
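+ * For illustration (handle length hypothetical): a PROC_DATA request with
+ * a 12 byte context handle carries a credential body of
+ *
+ *	5 * 4 + round_up_to_quad(12) = 32 bytes
+ *
+ * i.e. version, gc_proc, gc_seq, gc_svc, then the handle as a netobj
+ * (length word plus 12 bytes of data), which is exactly what the crlen
+ * test above insisted on.
+ *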
Let's check out the + * verifier. An AUTH_NULL verifier is allowed (and required) for + * INIT and CONTINUE_INIT requests. AUTH_RPCSEC_GSS is required for + * PROC_DATA and PROC_DESTROY. + * + * AUTH_NULL verifier is 0 (AUTH_NULL), 0 (length). + * AUTH_RPCSEC_GSS verifier is: + * 6 (AUTH_RPCSEC_GSS), length, checksum. + * checksum is calculated over rpcheader from xid up to here. + */ + *authp = rpc_autherr_badverf; + switch (gc->gc_proc) { + case RPC_GSS_PROC_INIT: + case RPC_GSS_PROC_CONTINUE_INIT: + if (argv->iov_len < 2 * 4) + goto auth_err; + if (ntohl(svc_getu32(argv)) != RPC_AUTH_NULL) + goto auth_err; + if (ntohl(svc_getu32(argv)) != 0) + goto auth_err; + break; + case RPC_GSS_PROC_DATA: + case RPC_GSS_PROC_DESTROY: + *authp = rpcsec_gsserr_credproblem; + rsci = gss_svc_searchbyctx(&gc->gc_ctx); + if (!rsci) + goto auth_err; + switch (gss_verify_header(rqstp, rsci, rpcstart, gc, authp)) { + case SVC_OK: + break; + case SVC_DENIED: + goto auth_err; + case SVC_DROP: + goto drop; + } + break; + default: + *authp = rpc_autherr_rejectedcred; + goto auth_err; + } + + /* now act upon the command: */ + switch (gc->gc_proc) { + case RPC_GSS_PROC_INIT: + case RPC_GSS_PROC_CONTINUE_INIT: + *authp = rpc_autherr_badcred; + if (gc->gc_proc == RPC_GSS_PROC_INIT && gc->gc_ctx.len != 0) + goto auth_err; + memset(&rsikey, 0, sizeof(rsikey)); + if (dup_netobj(&rsikey.in_handle, &gc->gc_ctx)) + goto drop; + *authp = rpc_autherr_badverf; + if (svc_safe_getnetobj(argv, &tmpobj)) { + kfree(rsikey.in_handle.data); + goto auth_err; + } + if (dup_netobj(&rsikey.in_token, &tmpobj)) { + kfree(rsikey.in_handle.data); + goto drop; + } + + rsip = rsi_lookup(&rsikey, 0); + rsi_free(&rsikey); + if (!rsip) { + goto drop; + } + switch(cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) { + case -EAGAIN: + goto drop; + case -ENOENT: + goto drop; + case 0: + rsci = gss_svc_searchbyctx(&rsip->out_handle); + if (!rsci) { + goto drop; + } + if (gss_write_verf(rqstp, rsci->mechctx, GSS_SEQ_WIN)) + goto drop; + if (resv->iov_len + 4 > PAGE_SIZE) + goto drop; + svc_putu32(resv, rpc_success); + if (svc_safe_putnetobj(resv, &rsip->out_handle)) + goto drop; + if (resv->iov_len + 3 * 4 > PAGE_SIZE) + goto drop; + svc_putu32(resv, htonl(rsip->major_status)); + svc_putu32(resv, htonl(rsip->minor_status)); + svc_putu32(resv, htonl(GSS_SEQ_WIN)); + if (svc_safe_putnetobj(resv, &rsip->out_token)) + goto drop; + rqstp->rq_client = NULL; + } + goto complete; + case RPC_GSS_PROC_DESTROY: + set_bit(CACHE_NEGATIVE, &rsci->h.flags); + if (resv->iov_len + 4 > PAGE_SIZE) + goto drop; + svc_putu32(resv, rpc_success); + goto complete; + case RPC_GSS_PROC_DATA: + *authp = rpcsec_gsserr_ctxproblem; + if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) + goto auth_err; + rqstp->rq_cred = rsci->cred; + get_group_info(rsci->cred.cr_group_info); + *authp = rpc_autherr_badcred; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto auth_err; + /* placeholders for length and seq. 
number: */ + svcdata->body_start = resv->iov_base + resv->iov_len; + svc_putu32(resv, 0); + svc_putu32(resv, 0); + break; + case RPC_GSS_SVC_PRIVACY: + /* currently unsupported */ + default: + goto auth_err; + } + svcdata->rsci = rsci; + cache_get(&rsci->h); + ret = SVC_OK; + goto out; + } +auth_err: + /* Restore write pointer to original value: */ + xdr_ressize_check(rqstp, reject_stat); + ret = SVC_DENIED; + goto out; +complete: + ret = SVC_COMPLETE; + goto out; +drop: + ret = SVC_DROP; +out: + if (rsci) + rsc_put(&rsci->h, &rsc_cache); + return ret; +} + +static int +svcauth_gss_release(struct svc_rqst *rqstp) +{ + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + struct kvec *resv; + u32 *p; + int integ_offset, integ_len; + int stat = -EINVAL; + + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. */ + if (gsd->body_start == NULL) + goto out; + /* normally not set till svc_send, but we need it here: */ + resbuf->len = resbuf->head[0].iov_len + + resbuf->page_len + resbuf->tail[0].iov_len; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + p = gsd->body_start; + gsd->body_start = NULL; + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* don't wrap in failure case: */ + /* Note: counting on not getting here if call was not even + * accepted! */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + goto out; + } + p++; + integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; + integ_len = resbuf->len - integ_offset; + BUG_ON(integ_len % 4); + *p++ = htonl(integ_len); + *p++ = htonl(gc->gc_seq); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, + integ_len)) + BUG(); + if (resbuf->page_len == 0 + && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE + < PAGE_SIZE) { + BUG_ON(resbuf->tail[0].iov_len); + /* Use head for everything */ + resv = &resbuf->head[0]; + } else if (resbuf->tail[0].iov_base == NULL) { + /* copied from nfsd4_encode_read */ + svc_take_page(rqstp); + resbuf->tail[0].iov_base = page_address(rqstp + ->rq_respages[rqstp->rq_resused-1]); + rqstp->rq_restailpage = rqstp->rq_resused-1; + resbuf->tail[0].iov_len = 0; + resv = &resbuf->tail[0]; + } else { + resv = &resbuf->tail[0]; + } + mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; + if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + goto out_err; + svc_putu32(resv, htonl(mic.len)); + memset(mic.data + mic.len, 0, + round_up_to_quad(mic.len) - mic.len); + resv->iov_len += XDR_QUADLEN(mic.len) << 2; + /* not strictly required: */ + resbuf->len += XDR_QUADLEN(mic.len) << 2; + BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: + default: + goto out_err; + } + +out: + stat = 0; +out_err: + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + if (gsd->rsci) + rsc_put(&gsd->rsci->h, &rsc_cache); + gsd->rsci = NULL; + + return stat; +} + +static void +svcauth_gss_domain_release(struct auth_domain *dom) +{ + struct gss_domain *gd = container_of(dom, struct gss_domain, h); + + kfree(dom->name); + kfree(gd); +} + +static struct auth_ops svcauthops_gss = { + .name = "rpcsec_gss", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_GSS, + .accept = 
svcauth_gss_accept, + .release = svcauth_gss_release, + .domain_release = svcauth_gss_domain_release, + .set_client = svcauth_gss_set_client, +}; + +int +gss_svc_init(void) +{ + int rv = svc_auth_register(RPC_AUTH_GSS, &svcauthops_gss); + if (rv == 0) { + cache_register(&rsc_cache); + cache_register(&rsi_cache); + } + return rv; +} + +void +gss_svc_shutdown(void) +{ + cache_unregister(&rsc_cache); + cache_unregister(&rsi_cache); + svc_auth_unregister(RPC_AUTH_GSS); +} diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c new file mode 100644 index 000000000000..9b72d3abf823 --- /dev/null +++ b/net/sunrpc/auth_null.c @@ -0,0 +1,143 @@ +/* + * linux/net/sunrpc/auth_null.c + * + * AUTH_NULL authentication. Really :-) + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_auth null_auth; +static struct rpc_cred null_cred; + +static struct rpc_auth * +nul_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + atomic_inc(&null_auth.au_count); + return &null_auth; +} + +static void +nul_destroy(struct rpc_auth *auth) +{ +} + +/* + * Lookup NULL creds for current process + */ +static struct rpc_cred * +nul_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return get_rpccred(&null_cred); +} + +/* + * Destroy cred handle. + */ +static void +nul_destroy_cred(struct rpc_cred *cred) +{ +} + +/* + * Match cred handle against current process + */ +static int +nul_match(struct auth_cred *acred, struct rpc_cred *cred, int taskflags) +{ + return 1; +} + +/* + * Marshal credential. + */ +static u32 * +nul_marshal(struct rpc_task *task, u32 *p) +{ + *p++ = htonl(RPC_AUTH_NULL); + *p++ = 0; + *p++ = htonl(RPC_AUTH_NULL); + *p++ = 0; + + return p; +} + +/* + * Refresh credential. 
This is a no-op for AUTH_NULL + */ +static int +nul_refresh(struct rpc_task *task) +{ + task->tk_msg.rpc_cred->cr_flags |= RPCAUTH_CRED_UPTODATE; + return 0; +} + +static u32 * +nul_validate(struct rpc_task *task, u32 *p) +{ + rpc_authflavor_t flavor; + u32 size; + + flavor = ntohl(*p++); + if (flavor != RPC_AUTH_NULL) { + printk("RPC: bad verf flavor: %u\n", flavor); + return NULL; + } + + size = ntohl(*p++); + if (size != 0) { + printk("RPC: bad verf size: %u\n", size); + return NULL; + } + + return p; +} + +struct rpc_authops authnull_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_NULL, +#ifdef RPC_DEBUG + .au_name = "NULL", +#endif + .create = nul_create, + .destroy = nul_destroy, + .lookup_cred = nul_lookup_cred, +}; + +static +struct rpc_auth null_auth = { + .au_cslack = 4, + .au_rslack = 2, + .au_ops = &authnull_ops, +}; + +static +struct rpc_credops null_credops = { + .cr_name = "AUTH_NULL", + .crdestroy = nul_destroy_cred, + .crmatch = nul_match, + .crmarshal = nul_marshal, + .crrefresh = nul_refresh, + .crvalidate = nul_validate, +}; + +static +struct rpc_cred null_cred = { + .cr_ops = &null_credops, + .cr_count = ATOMIC_INIT(1), + .cr_flags = RPCAUTH_CRED_UPTODATE, +#ifdef RPC_DEBUG + .cr_magic = RPCAUTH_CRED_MAGIC, +#endif +}; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c new file mode 100644 index 000000000000..4ff297a9b15b --- /dev/null +++ b/net/sunrpc/auth_unix.c @@ -0,0 +1,242 @@ +/* + * linux/net/sunrpc/auth_unix.c + * + * UNIX-style authentication; no AUTH_SHORT support + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include + +#define NFS_NGROUPS 16 + +struct unx_cred { + struct rpc_cred uc_base; + gid_t uc_gid; + gid_t uc_gids[NFS_NGROUPS]; +}; +#define uc_uid uc_base.cr_uid +#define uc_count uc_base.cr_count +#define uc_flags uc_base.cr_flags +#define uc_expire uc_base.cr_expire + +#define UNX_CRED_EXPIRE (60 * HZ) + +#define UNX_WRITESLACK (21 + (UNX_MAXNODENAME >> 2)) + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_AUTH +#endif + +static struct rpc_auth unix_auth; +static struct rpc_cred_cache unix_cred_cache; +static struct rpc_credops unix_credops; + +static struct rpc_auth * +unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) +{ + dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); + if (atomic_inc_return(&unix_auth.au_count) == 0) + unix_cred_cache.nextgc = jiffies + (unix_cred_cache.expire >> 1); + return &unix_auth; +} + +static void +unx_destroy(struct rpc_auth *auth) +{ + dprintk("RPC: destroying UNIX authenticator %p\n", auth); + rpcauth_free_credcache(auth); +} + +/* + * Lookup AUTH_UNIX creds for current process + */ +static struct rpc_cred * +unx_lookup_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + return rpcauth_lookup_credcache(auth, acred, flags); +} + +static struct rpc_cred * +unx_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) +{ + struct unx_cred *cred; + int i; + + dprintk("RPC: allocating UNIX cred for uid %d gid %d\n", + acred->uid, acred->gid); + + if (!(cred = (struct unx_cred *) kmalloc(sizeof(*cred), GFP_KERNEL))) + return ERR_PTR(-ENOMEM); + + atomic_set(&cred->uc_count, 1); + cred->uc_flags = RPCAUTH_CRED_UPTODATE; + if (flags & RPC_TASK_ROOTCREDS) { + cred->uc_uid = 0; + cred->uc_gid = 0; + cred->uc_gids[0] = NOGROUP; + } else { + int groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + + cred->uc_uid = acred->uid; + cred->uc_gid = acred->gid; + 
for (i = 0; i < groups; i++) + cred->uc_gids[i] = GROUP_AT(acred->group_info, i); + if (i < NFS_NGROUPS) + cred->uc_gids[i] = NOGROUP; + } + cred->uc_base.cr_ops = &unix_credops; + + return (struct rpc_cred *) cred; +} + +static void +unx_destroy_cred(struct rpc_cred *cred) +{ + kfree(cred); +} + +/* + * Match credentials against current process creds. + * The root_override argument takes care of cases where the caller may + * request root creds (e.g. for NFS swapping). + */ +static int +unx_match(struct auth_cred *acred, struct rpc_cred *rcred, int taskflags) +{ + struct unx_cred *cred = (struct unx_cred *) rcred; + int i; + + if (!(taskflags & RPC_TASK_ROOTCREDS)) { + int groups; + + if (cred->uc_uid != acred->uid + || cred->uc_gid != acred->gid) + return 0; + + groups = acred->group_info->ngroups; + if (groups > NFS_NGROUPS) + groups = NFS_NGROUPS; + for (i = 0; i < groups ; i++) + if (cred->uc_gids[i] != GROUP_AT(acred->group_info, i)) + return 0; + return 1; + } + return (cred->uc_uid == 0 + && cred->uc_gid == 0 + && cred->uc_gids[0] == (gid_t) NOGROUP); +} + +/* + * Marshal credentials. + * Maybe we should keep a cached credential for performance reasons. + */ +static u32 * +unx_marshal(struct rpc_task *task, u32 *p) +{ + struct rpc_clnt *clnt = task->tk_client; + struct unx_cred *cred = (struct unx_cred *) task->tk_msg.rpc_cred; + u32 *base, *hold; + int i; + + *p++ = htonl(RPC_AUTH_UNIX); + base = p++; + *p++ = htonl(jiffies/HZ); + + /* + * Copy the UTS nodename captured when the client was created. + */ + p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen); + + *p++ = htonl((u32) cred->uc_uid); + *p++ = htonl((u32) cred->uc_gid); + hold = p++; + for (i = 0; i < 16 && cred->uc_gids[i] != (gid_t) NOGROUP; i++) + *p++ = htonl((u32) cred->uc_gids[i]); + *hold = htonl(p - hold - 1); /* gid array length */ + *base = htonl((p - base - 1) << 2); /* cred length */ + + *p++ = htonl(RPC_AUTH_NULL); + *p++ = htonl(0); + + return p; +} + +/* + * Refresh credentials. 
This is a no-op for AUTH_UNIX + */ +static int +unx_refresh(struct rpc_task *task) +{ + task->tk_msg.rpc_cred->cr_flags |= RPCAUTH_CRED_UPTODATE; + return 0; +} + +static u32 * +unx_validate(struct rpc_task *task, u32 *p) +{ + rpc_authflavor_t flavor; + u32 size; + + flavor = ntohl(*p++); + if (flavor != RPC_AUTH_NULL && + flavor != RPC_AUTH_UNIX && + flavor != RPC_AUTH_SHORT) { + printk("RPC: bad verf flavor: %u\n", flavor); + return NULL; + } + + size = ntohl(*p++); + if (size > RPC_MAX_AUTH_SIZE) { + printk("RPC: giant verf size: %u\n", size); + return NULL; + } + task->tk_auth->au_rslack = (size >> 2) + 2; + p += (size >> 2); + + return p; +} + +struct rpc_authops authunix_ops = { + .owner = THIS_MODULE, + .au_flavor = RPC_AUTH_UNIX, +#ifdef RPC_DEBUG + .au_name = "UNIX", +#endif + .create = unx_create, + .destroy = unx_destroy, + .lookup_cred = unx_lookup_cred, + .crcreate = unx_create_cred, +}; + +static +struct rpc_cred_cache unix_cred_cache = { + .expire = UNX_CRED_EXPIRE, +}; + +static +struct rpc_auth unix_auth = { + .au_cslack = UNX_WRITESLACK, + .au_rslack = 2, /* assume AUTH_NULL verf */ + .au_ops = &authunix_ops, + .au_count = ATOMIC_INIT(0), + .au_credcache = &unix_cred_cache, +}; + +static +struct rpc_credops unix_credops = { + .cr_name = "AUTH_UNIX", + .crdestroy = unx_destroy_cred, + .crmatch = unx_match, + .crmarshal = unx_marshal, + .crrefresh = unx_refresh, + .crvalidate = unx_validate, +}; diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c new file mode 100644 index 000000000000..900f5bc7e336 --- /dev/null +++ b/net/sunrpc/cache.c @@ -0,0 +1,1189 @@ +/* + * net/sunrpc/cache.c + * + * Generic code for various authentication-related caches + * used by sunrpc clients and servers. + * + * Copyright (C) 2002 Neil Brown + * + * Released under terms in GPL version 2. See COPYING. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_CACHE + +static void cache_defer_req(struct cache_req *req, struct cache_head *item); +static void cache_revisit_request(struct cache_head *item); + +void cache_init(struct cache_head *h) +{ + time_t now = get_seconds(); + h->next = NULL; + h->flags = 0; + atomic_set(&h->refcnt, 1); + h->expiry_time = now + CACHE_NEW_EXPIRY; + h->last_refresh = now; +} + + +static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h); +/* + * This is the generic cache management routine for all + * the authentication caches. + * It checks the currency of a cache item and will (later) + * initiate an upcall to fill it if needed. 
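+ * A typical caller does roughly (cf. the RPCSEC_GSS init path in
+ * svcauth_gss_accept() above; sketch only):
+ *
+ *	switch (cache_check(&rsi_cache, &rsip->h, &rqstp->rq_chandle)) {
+ *	case -EAGAIN:
+ *	case -ENOENT:	drop or defer the request;
+ *	case 0:		use the now-valid entry;
+ *	}
+ *
+ * Callers that cannot defer (e.g. gss_svc_searchbyctx()) pass a NULL
+ * cache_req and never see -EAGAIN.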
+ * + * + * Returns 0 if the cache_head can be used, or cache_puts it and returns + * -EAGAIN if upcall is pending, + * -ENOENT if cache entry was negative + */ +int cache_check(struct cache_detail *detail, + struct cache_head *h, struct cache_req *rqstp) +{ + int rv; + long refresh_age, age; + + /* First decide return status as best we can */ + if (!test_bit(CACHE_VALID, &h->flags) || + h->expiry_time < get_seconds()) + rv = -EAGAIN; + else if (detail->flush_time > h->last_refresh) + rv = -EAGAIN; + else { + /* entry is valid */ + if (test_bit(CACHE_NEGATIVE, &h->flags)) + rv = -ENOENT; + else rv = 0; + } + + /* now see if we want to start an upcall */ + refresh_age = (h->expiry_time - h->last_refresh); + age = get_seconds() - h->last_refresh; + + if (rqstp == NULL) { + if (rv == -EAGAIN) + rv = -ENOENT; + } else if (rv == -EAGAIN || age > refresh_age/2) { + dprintk("Want update, refage=%ld, age=%ld\n", refresh_age, age); + if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { + switch (cache_make_upcall(detail, h)) { + case -EINVAL: + clear_bit(CACHE_PENDING, &h->flags); + if (rv == -EAGAIN) { + set_bit(CACHE_NEGATIVE, &h->flags); + cache_fresh(detail, h, get_seconds()+CACHE_NEW_EXPIRY); + rv = -ENOENT; + } + break; + + case -EAGAIN: + clear_bit(CACHE_PENDING, &h->flags); + cache_revisit_request(h); + break; + } + } + } + + if (rv == -EAGAIN) + cache_defer_req(rqstp, h); + + if (rv && h) + detail->cache_put(h, detail); + return rv; +} + +static void queue_loose(struct cache_detail *detail, struct cache_head *ch); + +void cache_fresh(struct cache_detail *detail, + struct cache_head *head, time_t expiry) +{ + + head->expiry_time = expiry; + head->last_refresh = get_seconds(); + if (!test_and_set_bit(CACHE_VALID, &head->flags)) + cache_revisit_request(head); + if (test_and_clear_bit(CACHE_PENDING, &head->flags)) + queue_loose(detail, head); +} + +/* + * caches need to be periodically cleaned. + * For this we maintain a list of cache_detail and + * a current pointer into that list and into the table + * for that entry. + * + * Each time clean_cache is called it finds the next non-empty entry + * in the current table and walks the list in that entry + * looking for entries that can be removed. + * + * An entry gets removed if: + * - The expiry is before current time + * - The last_refresh time is before the flush_time for that cache + * + * later we might drop old entries with non-NEVER expiry if that table + * is getting 'full' for some definition of 'full' + * + * The question of "how often to scan a table" is an interesting one + * and is answered in part by the use of the "nextcheck" field in the + * cache_detail. + * When a scan of a table begins, the nextcheck field is set to a time + * that is well into the future. + * While scanning, if an expiry time is found that is earlier than the + * current nextcheck time, nextcheck is set to that expiry time. + * If the flush_time is ever set to a time earlier than the nextcheck + * time, the nextcheck time is then set to that flush_time. + * + * A table is then only scanned if the current time is at least + * the nextcheck time. 
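+ * For example (times hypothetical): when a scan of a table begins,
+ * nextcheck is pushed out to now + 30 minutes; if the scan then walks past
+ * an entry that expires in 90 seconds, nextcheck is pulled back to just
+ * after that expiry, so the table is revisited in about 90 seconds rather
+ * than half an hour.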
+ * + */ + +static LIST_HEAD(cache_list); +static DEFINE_SPINLOCK(cache_list_lock); +static struct cache_detail *current_detail; +static int current_index; + +static struct file_operations cache_file_operations; +static struct file_operations content_file_operations; +static struct file_operations cache_flush_operations; + +static void do_cache_clean(void *data); +static DECLARE_WORK(cache_cleaner, do_cache_clean, NULL); + +void cache_register(struct cache_detail *cd) +{ + cd->proc_ent = proc_mkdir(cd->name, proc_net_rpc); + if (cd->proc_ent) { + struct proc_dir_entry *p; + cd->proc_ent->owner = THIS_MODULE; + cd->channel_ent = cd->content_ent = NULL; + + p = create_proc_entry("flush", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->flush_ent = p; + if (p) { + p->proc_fops = &cache_flush_operations; + p->owner = THIS_MODULE; + p->data = cd; + } + + if (cd->cache_request || cd->cache_parse) { + p = create_proc_entry("channel", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->channel_ent = p; + if (p) { + p->proc_fops = &cache_file_operations; + p->owner = THIS_MODULE; + p->data = cd; + } + } + if (cd->cache_show) { + p = create_proc_entry("content", S_IFREG|S_IRUSR|S_IWUSR, + cd->proc_ent); + cd->content_ent = p; + if (p) { + p->proc_fops = &content_file_operations; + p->owner = THIS_MODULE; + p->data = cd; + } + } + } + rwlock_init(&cd->hash_lock); + INIT_LIST_HEAD(&cd->queue); + spin_lock(&cache_list_lock); + cd->nextcheck = 0; + cd->entries = 0; + atomic_set(&cd->readers, 0); + cd->last_close = 0; + cd->last_warn = -1; + list_add(&cd->others, &cache_list); + spin_unlock(&cache_list_lock); + + /* start the cleaning process */ + schedule_work(&cache_cleaner); +} + +int cache_unregister(struct cache_detail *cd) +{ + cache_purge(cd); + spin_lock(&cache_list_lock); + write_lock(&cd->hash_lock); + if (cd->entries || atomic_read(&cd->inuse)) { + write_unlock(&cd->hash_lock); + spin_unlock(&cache_list_lock); + return -EBUSY; + } + if (current_detail == cd) + current_detail = NULL; + list_del_init(&cd->others); + write_unlock(&cd->hash_lock); + spin_unlock(&cache_list_lock); + if (cd->proc_ent) { + if (cd->flush_ent) + remove_proc_entry("flush", cd->proc_ent); + if (cd->channel_ent) + remove_proc_entry("channel", cd->proc_ent); + if (cd->content_ent) + remove_proc_entry("content", cd->proc_ent); + + cd->proc_ent = NULL; + remove_proc_entry(cd->name, proc_net_rpc); + } + if (list_empty(&cache_list)) { + /* module must be being unloaded so its safe to kill the worker */ + cancel_delayed_work(&cache_cleaner); + flush_scheduled_work(); + } + return 0; +} + +/* clean cache tries to find something to clean + * and cleans it. + * It returns 1 if it cleaned something, + * 0 if it didn't find anything this time + * -1 if it fell off the end of the list. 
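+ * Both callers below rely on this: do_cache_clean() reschedules itself a
+ * few jiffies later while cache_clean() keeps finding work, and backs off
+ * to 30*HZ once a full pass returns -1; cache_flush() simply loops until
+ * -1 is returned, and then once more to be safe.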
+ */ +static int cache_clean(void) +{ + int rv = 0; + struct list_head *next; + + spin_lock(&cache_list_lock); + + /* find a suitable table if we don't already have one */ + while (current_detail == NULL || + current_index >= current_detail->hash_size) { + if (current_detail) + next = current_detail->others.next; + else + next = cache_list.next; + if (next == &cache_list) { + current_detail = NULL; + spin_unlock(&cache_list_lock); + return -1; + } + current_detail = list_entry(next, struct cache_detail, others); + if (current_detail->nextcheck > get_seconds()) + current_index = current_detail->hash_size; + else { + current_index = 0; + current_detail->nextcheck = get_seconds()+30*60; + } + } + + /* find a non-empty bucket in the table */ + while (current_detail && + current_index < current_detail->hash_size && + current_detail->hash_table[current_index] == NULL) + current_index++; + + /* find a cleanable entry in the bucket and clean it, or set to next bucket */ + + if (current_detail && current_index < current_detail->hash_size) { + struct cache_head *ch, **cp; + struct cache_detail *d; + + write_lock(¤t_detail->hash_lock); + + /* Ok, now to clean this strand */ + + cp = & current_detail->hash_table[current_index]; + ch = *cp; + for (; ch; cp= & ch->next, ch= *cp) { + if (current_detail->nextcheck > ch->expiry_time) + current_detail->nextcheck = ch->expiry_time+1; + if (ch->expiry_time >= get_seconds() + && ch->last_refresh >= current_detail->flush_time + ) + continue; + if (test_and_clear_bit(CACHE_PENDING, &ch->flags)) + queue_loose(current_detail, ch); + + if (atomic_read(&ch->refcnt) == 1) + break; + } + if (ch) { + *cp = ch->next; + ch->next = NULL; + current_detail->entries--; + rv = 1; + } + write_unlock(¤t_detail->hash_lock); + d = current_detail; + if (!ch) + current_index ++; + spin_unlock(&cache_list_lock); + if (ch) + d->cache_put(ch, d); + } else + spin_unlock(&cache_list_lock); + + return rv; +} + +/* + * We want to regularly clean the cache, so we need to schedule some work ... + */ +static void do_cache_clean(void *data) +{ + int delay = 5; + if (cache_clean() == -1) + delay = 30*HZ; + + if (list_empty(&cache_list)) + delay = 0; + + if (delay) + schedule_delayed_work(&cache_cleaner, delay); +} + + +/* + * Clean all caches promptly. This just calls cache_clean + * repeatedly until we are sure that every cache has had a chance to + * be fully cleaned + */ +void cache_flush(void) +{ + while (cache_clean() != -1) + cond_resched(); + while (cache_clean() != -1) + cond_resched(); +} + +void cache_purge(struct cache_detail *detail) +{ + detail->flush_time = LONG_MAX; + detail->nextcheck = get_seconds(); + cache_flush(); + detail->flush_time = 1; +} + + + +/* + * Deferral and Revisiting of Requests. + * + * If a cache lookup finds a pending entry, we + * need to defer the request and revisit it later. + * All deferred requests are stored in a hash table, + * indexed by "struct cache_head *". + * As it may be wasteful to store a whole request + * structure, we allow the request to provide a + * deferred form, which must contain a + * 'struct cache_deferred_req' + * This cache_deferred_req contains a method to allow + * it to be revisited when cache info is available + */ + +#define DFR_HASHSIZE (PAGE_SIZE/sizeof(struct list_head)) +#define DFR_HASH(item) ((((long)item)>>4 ^ (((long)item)>>13)) % DFR_HASHSIZE) + +#define DFR_MAX 300 /* ??? 
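+ *	i.e. at most 300 requests are kept deferred at any time; once the
+ *	limit is exceeded, cache_defer_req() below drops either the most
+ *	recently or the least recently deferred request at random, revisiting
+ *	it immediately with its second argument set to 1, rather than
+ *	queueing without bound.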
*/ + +static DEFINE_SPINLOCK(cache_defer_lock); +static LIST_HEAD(cache_defer_list); +static struct list_head cache_defer_hash[DFR_HASHSIZE]; +static int cache_defer_cnt; + +static void cache_defer_req(struct cache_req *req, struct cache_head *item) +{ + struct cache_deferred_req *dreq; + int hash = DFR_HASH(item); + + dreq = req->defer(req); + if (dreq == NULL) + return; + + dreq->item = item; + dreq->recv_time = get_seconds(); + + spin_lock(&cache_defer_lock); + + list_add(&dreq->recent, &cache_defer_list); + + if (cache_defer_hash[hash].next == NULL) + INIT_LIST_HEAD(&cache_defer_hash[hash]); + list_add(&dreq->hash, &cache_defer_hash[hash]); + + /* it is in, now maybe clean up */ + dreq = NULL; + if (++cache_defer_cnt > DFR_MAX) { + /* too much in the cache, randomly drop + * first or last + */ + if (net_random()&1) + dreq = list_entry(cache_defer_list.next, + struct cache_deferred_req, + recent); + else + dreq = list_entry(cache_defer_list.prev, + struct cache_deferred_req, + recent); + list_del(&dreq->recent); + list_del(&dreq->hash); + cache_defer_cnt--; + } + spin_unlock(&cache_defer_lock); + + if (dreq) { + /* there was one too many */ + dreq->revisit(dreq, 1); + } + if (test_bit(CACHE_VALID, &item->flags)) { + /* must have just been validated... */ + cache_revisit_request(item); + } +} + +static void cache_revisit_request(struct cache_head *item) +{ + struct cache_deferred_req *dreq; + struct list_head pending; + + struct list_head *lp; + int hash = DFR_HASH(item); + + INIT_LIST_HEAD(&pending); + spin_lock(&cache_defer_lock); + + lp = cache_defer_hash[hash].next; + if (lp) { + while (lp != &cache_defer_hash[hash]) { + dreq = list_entry(lp, struct cache_deferred_req, hash); + lp = lp->next; + if (dreq->item == item) { + list_del(&dreq->hash); + list_move(&dreq->recent, &pending); + cache_defer_cnt--; + } + } + } + spin_unlock(&cache_defer_lock); + + while (!list_empty(&pending)) { + dreq = list_entry(pending.next, struct cache_deferred_req, recent); + list_del_init(&dreq->recent); + dreq->revisit(dreq, 0); + } +} + +void cache_clean_deferred(void *owner) +{ + struct cache_deferred_req *dreq, *tmp; + struct list_head pending; + + + INIT_LIST_HEAD(&pending); + spin_lock(&cache_defer_lock); + + list_for_each_entry_safe(dreq, tmp, &cache_defer_list, recent) { + if (dreq->owner == owner) { + list_del(&dreq->hash); + list_move(&dreq->recent, &pending); + cache_defer_cnt--; + } + } + spin_unlock(&cache_defer_lock); + + while (!list_empty(&pending)) { + dreq = list_entry(pending.next, struct cache_deferred_req, recent); + list_del_init(&dreq->recent); + dreq->revisit(dreq, 1); + } +} + +/* + * communicate with user-space + * + * We have a magic /proc file - /proc/sunrpc/cache + * On read, you get a full request, or block + * On write, an update request is processed + * Poll works if anything to read, and always allows write + * + * Implemented by linked list of requests. Each open file has + * a ->private that also exists in this list. New request are added + * to the end and may wakeup and preceding readers. + * New readers are added to the head. If, on read, an item is found with + * CACHE_UPCALLING clear, we free it from the list. 
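+ * A minimal user-space consumer of such a channel (sketch only; the path
+ * shown is the one the rsi cache above ends up with under /proc/net/rpc)
+ * would do roughly:
+ *
+ *	fd = open("/proc/net/rpc/auth.rpcsec.init/channel", O_RDWR);
+ *	for (;;) {
+ *		poll fd for POLLIN;
+ *		read(fd, buf, sizeof(buf));	one whole request per read
+ *		resolve it in user space;
+ *		write(fd, reply, replylen);	handed to ->cache_parse()
+ *	}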
+ * + */ + +static DEFINE_SPINLOCK(queue_lock); +static DECLARE_MUTEX(queue_io_sem); + +struct cache_queue { + struct list_head list; + int reader; /* if 0, then request */ +}; +struct cache_request { + struct cache_queue q; + struct cache_head *item; + char * buf; + int len; + int readers; +}; +struct cache_reader { + struct cache_queue q; + int offset; /* if non-0, we have a refcnt on next request */ +}; + +static ssize_t +cache_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) +{ + struct cache_reader *rp = filp->private_data; + struct cache_request *rq; + struct cache_detail *cd = PDE(filp->f_dentry->d_inode)->data; + int err; + + if (count == 0) + return 0; + + down(&queue_io_sem); /* protect against multiple concurrent + * readers on this file */ + again: + spin_lock(&queue_lock); + /* need to find next request */ + while (rp->q.list.next != &cd->queue && + list_entry(rp->q.list.next, struct cache_queue, list) + ->reader) { + struct list_head *next = rp->q.list.next; + list_move(&rp->q.list, next); + } + if (rp->q.list.next == &cd->queue) { + spin_unlock(&queue_lock); + up(&queue_io_sem); + if (rp->offset) + BUG(); + return 0; + } + rq = container_of(rp->q.list.next, struct cache_request, q.list); + if (rq->q.reader) BUG(); + if (rp->offset == 0) + rq->readers++; + spin_unlock(&queue_lock); + + if (rp->offset == 0 && !test_bit(CACHE_PENDING, &rq->item->flags)) { + err = -EAGAIN; + spin_lock(&queue_lock); + list_move(&rp->q.list, &rq->q.list); + spin_unlock(&queue_lock); + } else { + if (rp->offset + count > rq->len) + count = rq->len - rp->offset; + err = -EFAULT; + if (copy_to_user(buf, rq->buf + rp->offset, count)) + goto out; + rp->offset += count; + if (rp->offset >= rq->len) { + rp->offset = 0; + spin_lock(&queue_lock); + list_move(&rp->q.list, &rq->q.list); + spin_unlock(&queue_lock); + } + err = 0; + } + out: + if (rp->offset == 0) { + /* need to release rq */ + spin_lock(&queue_lock); + rq->readers--; + if (rq->readers == 0 && + !test_bit(CACHE_PENDING, &rq->item->flags)) { + list_del(&rq->q.list); + spin_unlock(&queue_lock); + cd->cache_put(rq->item, cd); + kfree(rq->buf); + kfree(rq); + } else + spin_unlock(&queue_lock); + } + if (err == -EAGAIN) + goto again; + up(&queue_io_sem); + return err ? err : count; +} + +static char write_buf[8192]; /* protected by queue_io_sem */ + +static ssize_t +cache_write(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) +{ + int err; + struct cache_detail *cd = PDE(filp->f_dentry->d_inode)->data; + + if (count == 0) + return 0; + if (count >= sizeof(write_buf)) + return -EINVAL; + + down(&queue_io_sem); + + if (copy_from_user(write_buf, buf, count)) { + up(&queue_io_sem); + return -EFAULT; + } + write_buf[count] = '\0'; + if (cd->cache_parse) + err = cd->cache_parse(cd, write_buf, count); + else + err = -EINVAL; + + up(&queue_io_sem); + return err ? 
err : count; +} + +static DECLARE_WAIT_QUEUE_HEAD(queue_wait); + +static unsigned int +cache_poll(struct file *filp, poll_table *wait) +{ + unsigned int mask; + struct cache_reader *rp = filp->private_data; + struct cache_queue *cq; + struct cache_detail *cd = PDE(filp->f_dentry->d_inode)->data; + + poll_wait(filp, &queue_wait, wait); + + /* alway allow write */ + mask = POLL_OUT | POLLWRNORM; + + if (!rp) + return mask; + + spin_lock(&queue_lock); + + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + mask |= POLLIN | POLLRDNORM; + break; + } + spin_unlock(&queue_lock); + return mask; +} + +static int +cache_ioctl(struct inode *ino, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + int len = 0; + struct cache_reader *rp = filp->private_data; + struct cache_queue *cq; + struct cache_detail *cd = PDE(ino)->data; + + if (cmd != FIONREAD || !rp) + return -EINVAL; + + spin_lock(&queue_lock); + + /* only find the length remaining in current request, + * or the length of the next request + */ + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + struct cache_request *cr = + container_of(cq, struct cache_request, q); + len = cr->len - rp->offset; + break; + } + spin_unlock(&queue_lock); + + return put_user(len, (int __user *)arg); +} + +static int +cache_open(struct inode *inode, struct file *filp) +{ + struct cache_reader *rp = NULL; + + nonseekable_open(inode, filp); + if (filp->f_mode & FMODE_READ) { + struct cache_detail *cd = PDE(inode)->data; + + rp = kmalloc(sizeof(*rp), GFP_KERNEL); + if (!rp) + return -ENOMEM; + rp->offset = 0; + rp->q.reader = 1; + atomic_inc(&cd->readers); + spin_lock(&queue_lock); + list_add(&rp->q.list, &cd->queue); + spin_unlock(&queue_lock); + } + filp->private_data = rp; + return 0; +} + +static int +cache_release(struct inode *inode, struct file *filp) +{ + struct cache_reader *rp = filp->private_data; + struct cache_detail *cd = PDE(inode)->data; + + if (rp) { + spin_lock(&queue_lock); + if (rp->offset) { + struct cache_queue *cq; + for (cq= &rp->q; &cq->list != &cd->queue; + cq = list_entry(cq->list.next, struct cache_queue, list)) + if (!cq->reader) { + container_of(cq, struct cache_request, q) + ->readers--; + break; + } + rp->offset = 0; + } + list_del(&rp->q.list); + spin_unlock(&queue_lock); + + filp->private_data = NULL; + kfree(rp); + + cd->last_close = get_seconds(); + atomic_dec(&cd->readers); + } + return 0; +} + + + +static struct file_operations cache_file_operations = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = cache_read, + .write = cache_write, + .poll = cache_poll, + .ioctl = cache_ioctl, /* for FIONREAD */ + .open = cache_open, + .release = cache_release, +}; + + +static void queue_loose(struct cache_detail *detail, struct cache_head *ch) +{ + struct cache_queue *cq; + spin_lock(&queue_lock); + list_for_each_entry(cq, &detail->queue, list) + if (!cq->reader) { + struct cache_request *cr = container_of(cq, struct cache_request, q); + if (cr->item != ch) + continue; + if (cr->readers != 0) + break; + list_del(&cr->q.list); + spin_unlock(&queue_lock); + detail->cache_put(cr->item, detail); + kfree(cr->buf); + kfree(cr); + return; + } + spin_unlock(&queue_lock); +} + +/* + * Support routines for text-based upcalls. + * Fields are separated by spaces. 
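As a concrete illustration of the field quoting that the qword_add/qword_addhex helpers below implement, a user-space writer could escape one field like this (emit_field is a hypothetical helper, not part of the kernel code):

/* Illustrative encoder for a single upcall field: space, tab, newline and
 * backslash are replaced by a backslash plus three octal digits, mirroring
 * qword_add() below; fields are joined with spaces. */
#include <stdio.h>

static void emit_field(FILE *out, const char *s)
{
	for (; *s; s++) {
		unsigned char c = *s;

		if (c == ' ' || c == '\t' || c == '\n' || c == '\\')
			fprintf(out, "\\%03o", c);
		else
			fputc(c, out);
	}
	fputc(' ', out);	/* fields are separated by spaces */
}

int main(void)
{
	emit_field(stdout, "host with spaces");
	emit_field(stdout, "12345");
	fputc('\n', stdout);	/* record is terminated with a newline */
	return 0;
}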
+ * Fields are either mangled to quote space tab newline slosh with slosh + * or a hexified with a leading \x + * Record is terminated with newline. + * + */ + +void qword_add(char **bpp, int *lp, char *str) +{ + char *bp = *bpp; + int len = *lp; + char c; + + if (len < 0) return; + + while ((c=*str++) && len) + switch(c) { + case ' ': + case '\t': + case '\n': + case '\\': + if (len >= 4) { + *bp++ = '\\'; + *bp++ = '0' + ((c & 0300)>>6); + *bp++ = '0' + ((c & 0070)>>3); + *bp++ = '0' + ((c & 0007)>>0); + } + len -= 4; + break; + default: + *bp++ = c; + len--; + } + if (c || len <1) len = -1; + else { + *bp++ = ' '; + len--; + } + *bpp = bp; + *lp = len; +} + +void qword_addhex(char **bpp, int *lp, char *buf, int blen) +{ + char *bp = *bpp; + int len = *lp; + + if (len < 0) return; + + if (len > 2) { + *bp++ = '\\'; + *bp++ = 'x'; + len -= 2; + while (blen && len >= 2) { + unsigned char c = *buf++; + *bp++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); + *bp++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); + len -= 2; + blen--; + } + } + if (blen || len<1) len = -1; + else { + *bp++ = ' '; + len--; + } + *bpp = bp; + *lp = len; +} + +static void warn_no_listener(struct cache_detail *detail) +{ + if (detail->last_warn != detail->last_close) { + detail->last_warn = detail->last_close; + if (detail->warn_no_listener) + detail->warn_no_listener(detail); + } +} + +/* + * register an upcall request to user-space. + * Each request is at most one page long. + */ +static int cache_make_upcall(struct cache_detail *detail, struct cache_head *h) +{ + + char *buf; + struct cache_request *crq; + char *bp; + int len; + + if (detail->cache_request == NULL) + return -EINVAL; + + if (atomic_read(&detail->readers) == 0 && + detail->last_close < get_seconds() - 30) { + warn_no_listener(detail); + return -EINVAL; + } + + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) + return -EAGAIN; + + crq = kmalloc(sizeof (*crq), GFP_KERNEL); + if (!crq) { + kfree(buf); + return -EAGAIN; + } + + bp = buf; len = PAGE_SIZE; + + detail->cache_request(detail, h, &bp, &len); + + if (len < 0) { + kfree(buf); + kfree(crq); + return -EAGAIN; + } + crq->q.reader = 0; + crq->item = cache_get(h); + crq->buf = buf; + crq->len = PAGE_SIZE - len; + crq->readers = 0; + spin_lock(&queue_lock); + list_add_tail(&crq->q.list, &detail->queue); + spin_unlock(&queue_lock); + wake_up(&queue_wait); + return 0; +} + +/* + * parse a message from user-space and pass it + * to an appropriate cache + * Messages are, like requests, separated into fields by + * spaces and dequotes as \xHEXSTRING or embedded \nnn octal + * + * Message is + * reply cachename expiry key ... content.... + * + * key and content are both parsed by cache + */ + +#define isodigit(c) (isdigit(c) && c <= '7') +int qword_get(char **bpp, char *dest, int bufsize) +{ + /* return bytes copied, or -1 on error */ + char *bp = *bpp; + int len = 0; + + while (*bp == ' ') bp++; + + if (bp[0] == '\\' && bp[1] == 'x') { + /* HEX STRING */ + bp += 2; + while (isxdigit(bp[0]) && isxdigit(bp[1]) && len < bufsize) { + int byte = isdigit(*bp) ? *bp-'0' : toupper(*bp)-'A'+10; + bp++; + byte <<= 4; + byte |= isdigit(*bp) ? 
*bp-'0' : toupper(*bp)-'A'+10; + *dest++ = byte; + bp++; + len++; + } + } else { + /* text with \nnn octal quoting */ + while (*bp != ' ' && *bp != '\n' && *bp && len < bufsize-1) { + if (*bp == '\\' && + isodigit(bp[1]) && (bp[1] <= '3') && + isodigit(bp[2]) && + isodigit(bp[3])) { + int byte = (*++bp -'0'); + bp++; + byte = (byte << 3) | (*bp++ - '0'); + byte = (byte << 3) | (*bp++ - '0'); + *dest++ = byte; + len++; + } else { + *dest++ = *bp++; + len++; + } + } + } + + if (*bp != ' ' && *bp != '\n' && *bp != '\0') + return -1; + while (*bp == ' ') bp++; + *bpp = bp; + *dest = '\0'; + return len; +} + + +/* + * support /proc/sunrpc/cache/$CACHENAME/content + * as a seqfile. + * We call ->cache_show passing NULL for the item to + * get a header, then pass each real item in the cache + */ + +struct handle { + struct cache_detail *cd; +}; + +static void *c_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + unsigned hash, entry; + struct cache_head *ch; + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + + read_lock(&cd->hash_lock); + if (!n--) + return SEQ_START_TOKEN; + hash = n >> 32; + entry = n & ((1LL<<32) - 1); + + for (ch=cd->hash_table[hash]; ch; ch=ch->next) + if (!entry--) + return ch; + n &= ~((1LL<<32) - 1); + do { + hash++; + n += 1LL<<32; + } while(hash < cd->hash_size && + cd->hash_table[hash]==NULL); + if (hash >= cd->hash_size) + return NULL; + *pos = n+1; + return cd->hash_table[hash]; +} + +static void *c_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cache_head *ch = p; + int hash = (*pos >> 32); + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + if (p == SEQ_START_TOKEN) + hash = 0; + else if (ch->next == NULL) { + hash++; + *pos += 1LL<<32; + } else { + ++*pos; + return ch->next; + } + *pos &= ~((1LL<<32) - 1); + while (hash < cd->hash_size && + cd->hash_table[hash] == NULL) { + hash++; + *pos += 1LL<<32; + } + if (hash >= cd->hash_size) + return NULL; + ++*pos; + return cd->hash_table[hash]; +} + +static void c_stop(struct seq_file *m, void *p) +{ + struct cache_detail *cd = ((struct handle*)m->private)->cd; + read_unlock(&cd->hash_lock); +} + +static int c_show(struct seq_file *m, void *p) +{ + struct cache_head *cp = p; + struct cache_detail *cd = ((struct handle*)m->private)->cd; + + if (p == SEQ_START_TOKEN) + return cd->cache_show(m, cd, NULL); + + ifdebug(CACHE) + seq_printf(m, "# expiry=%ld refcnt=%d\n", + cp->expiry_time, atomic_read(&cp->refcnt)); + cache_get(cp); + if (cache_check(cd, cp, NULL)) + /* cache_check does a cache_put on failure */ + seq_printf(m, "# "); + else + cache_put(cp, cd); + + return cd->cache_show(m, cd, cp); +} + +static struct seq_operations cache_content_op = { + .start = c_start, + .next = c_next, + .stop = c_stop, + .show = c_show, +}; + +static int content_open(struct inode *inode, struct file *file) +{ + int res; + struct handle *han; + struct cache_detail *cd = PDE(inode)->data; + + han = kmalloc(sizeof(*han), GFP_KERNEL); + if (han == NULL) + return -ENOMEM; + + han->cd = cd; + + res = seq_open(file, &cache_content_op); + if (res) + kfree(han); + else + ((struct seq_file *)file->private_data)->private = han; + + return res; +} +static int content_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = (struct seq_file *)file->private_data; + struct handle *han = m->private; + kfree(han); + m->private = NULL; + return seq_release(inode, file); +} + +static struct file_operations content_file_operations = { + .open = content_open, + .read = seq_read, + .llseek 
= seq_lseek, + .release = content_release, +}; + +static ssize_t read_flush(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(file->f_dentry->d_inode)->data; + char tbuf[20]; + unsigned long p = *ppos; + int len; + + sprintf(tbuf, "%lu\n", cd->flush_time); + len = strlen(tbuf); + if (p >= len) + return 0; + len -= p; + if (len > count) len = count; + if (copy_to_user(buf, (void*)(tbuf+p), len)) + len = -EFAULT; + else + *ppos += len; + return len; +} + +static ssize_t write_flush(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + struct cache_detail *cd = PDE(file->f_dentry->d_inode)->data; + char tbuf[20]; + char *ep; + long flushtime; + if (*ppos || count > sizeof(tbuf)-1) + return -EINVAL; + if (copy_from_user(tbuf, buf, count)) + return -EFAULT; + tbuf[count] = 0; + flushtime = simple_strtoul(tbuf, &ep, 0); + if (*ep && *ep != '\n') + return -EINVAL; + + cd->flush_time = flushtime; + cd->nextcheck = get_seconds(); + cache_flush(); + + *ppos += count; + return count; +} + +static struct file_operations cache_flush_operations = { + .open = nonseekable_open, + .read = read_flush, + .write = write_flush, +}; diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c new file mode 100644 index 000000000000..02bc029d46fe --- /dev/null +++ b/net/sunrpc/clnt.c @@ -0,0 +1,1085 @@ +/* + * linux/net/sunrpc/rpcclnt.c + * + * This file contains the high-level RPC interface. + * It is modeled as a finite state machine to support both synchronous + * and asynchronous requests. + * + * - RPC header generation and argument serialization. + * - Credential refresh. + * - TCP connect handling. + * - Retry of operation when it is suspected the operation failed because + * of uid squashing on the server, or when the credentials were stale + * and need to be refreshed, or when a packet was damaged in transit. + * This may be have to be moved to the VFS layer. + * + * NB: BSD uses a more intelligent approach to guessing when a request + * or reply has been lost by keeping the RTO estimate for each procedure. + * We currently make do with a constant timeout value. 
+ * + * Copyright (C) 1992,1993 Rick Sladkey + * Copyright (C) 1995,1996 Olaf Kirch + */ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + + +#define RPC_SLACK_SPACE (1024) /* total overkill */ + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_CALL +#endif + +static DECLARE_WAIT_QUEUE_HEAD(destroy_wait); + + +static void call_start(struct rpc_task *task); +static void call_reserve(struct rpc_task *task); +static void call_reserveresult(struct rpc_task *task); +static void call_allocate(struct rpc_task *task); +static void call_encode(struct rpc_task *task); +static void call_decode(struct rpc_task *task); +static void call_bind(struct rpc_task *task); +static void call_transmit(struct rpc_task *task); +static void call_status(struct rpc_task *task); +static void call_refresh(struct rpc_task *task); +static void call_refreshresult(struct rpc_task *task); +static void call_timeout(struct rpc_task *task); +static void call_connect(struct rpc_task *task); +static void call_connect_status(struct rpc_task *task); +static u32 * call_header(struct rpc_task *task); +static u32 * call_verify(struct rpc_task *task); + + +static int +rpc_setup_pipedir(struct rpc_clnt *clnt, char *dir_name) +{ + static uint32_t clntid; + int error; + + if (dir_name == NULL) + return 0; + for (;;) { + snprintf(clnt->cl_pathname, sizeof(clnt->cl_pathname), + "%s/clnt%x", dir_name, + (unsigned int)clntid++); + clnt->cl_pathname[sizeof(clnt->cl_pathname) - 1] = '\0'; + clnt->cl_dentry = rpc_mkdir(clnt->cl_pathname, clnt); + if (!IS_ERR(clnt->cl_dentry)) + return 0; + error = PTR_ERR(clnt->cl_dentry); + if (error != -EEXIST) { + printk(KERN_INFO "RPC: Couldn't create pipefs entry %s, error %d\n", + clnt->cl_pathname, error); + return error; + } + } +} + +/* + * Create an RPC client + * FIXME: This should also take a flags argument (as in task->tk_flags). + * It's called (among others) from pmap_create_client, which may in + * turn be called by an async task. In this case, rpciod should not be + * made to sleep too long. 
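As a rough sketch of how a caller ties this together, mirroring the pmap_create() and rpc_call() usage in net/sunrpc/pmap_clnt.c later in this patch: create a transport, wrap it in a client, and issue a synchronous call. The program table and the version and procedure numbers (example_program, EXAMPLE_VERS, EXAMPLE_PROC) are assumptions standing in for whatever the caller defines, just as the portmapper defines pmap_program and PMAP_GETPORT.

/* Hedged sketch only: the program table and numbers are placeholders;
 * compare pmap_create() and rpc_call() further down in this patch. */
#include <linux/err.h>
#include <linux/in.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>

#define EXAMPLE_PROC	1	/* hypothetical procedure number */
#define EXAMPLE_VERS	2	/* hypothetical program version */

extern struct rpc_program example_program;	/* assumed, like pmap_program */

static int example_call(struct sockaddr_in *srvaddr, void *argp, void *resp)
{
	struct rpc_xprt *xprt;
	struct rpc_clnt *clnt;
	int status;

	xprt = xprt_create_proto(IPPROTO_UDP, srvaddr, NULL);
	if (IS_ERR(xprt))
		return PTR_ERR(xprt);

	clnt = rpc_create_client(xprt, "example-server", &example_program,
				 EXAMPLE_VERS, RPC_AUTH_UNIX);
	if (IS_ERR(clnt)) {
		xprt_destroy(xprt);
		return PTR_ERR(clnt);
	}
	clnt->cl_softrtry = 1;	/* give up instead of retrying forever */

	status = rpc_call(clnt, EXAMPLE_PROC, argp, resp, 0);

	rpc_shutdown_client(clnt);
	return status;
}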
+ */ +struct rpc_clnt * +rpc_create_client(struct rpc_xprt *xprt, char *servname, + struct rpc_program *program, u32 vers, + rpc_authflavor_t flavor) +{ + struct rpc_version *version; + struct rpc_clnt *clnt = NULL; + int err; + int len; + + dprintk("RPC: creating %s client for %s (xprt %p)\n", + program->name, servname, xprt); + + err = -EINVAL; + if (!xprt) + goto out_err; + if (vers >= program->nrvers || !(version = program->version[vers])) + goto out_err; + + err = -ENOMEM; + clnt = (struct rpc_clnt *) kmalloc(sizeof(*clnt), GFP_KERNEL); + if (!clnt) + goto out_err; + memset(clnt, 0, sizeof(*clnt)); + atomic_set(&clnt->cl_users, 0); + atomic_set(&clnt->cl_count, 1); + clnt->cl_parent = clnt; + + clnt->cl_server = clnt->cl_inline_name; + len = strlen(servname) + 1; + if (len > sizeof(clnt->cl_inline_name)) { + char *buf = kmalloc(len, GFP_KERNEL); + if (buf != 0) + clnt->cl_server = buf; + else + len = sizeof(clnt->cl_inline_name); + } + strlcpy(clnt->cl_server, servname, len); + + clnt->cl_xprt = xprt; + clnt->cl_procinfo = version->procs; + clnt->cl_maxproc = version->nrprocs; + clnt->cl_protname = program->name; + clnt->cl_pmap = &clnt->cl_pmap_default; + clnt->cl_port = xprt->addr.sin_port; + clnt->cl_prog = program->number; + clnt->cl_vers = version->number; + clnt->cl_prot = xprt->prot; + clnt->cl_stats = program->stats; + rpc_init_wait_queue(&clnt->cl_pmap_default.pm_bindwait, "bindwait"); + + if (!clnt->cl_port) + clnt->cl_autobind = 1; + + clnt->cl_rtt = &clnt->cl_rtt_default; + rpc_init_rtt(&clnt->cl_rtt_default, xprt->timeout.to_initval); + + err = rpc_setup_pipedir(clnt, program->pipe_dir_name); + if (err < 0) + goto out_no_path; + + err = -ENOMEM; + if (!rpcauth_create(flavor, clnt)) { + printk(KERN_INFO "RPC: Couldn't create auth handle (flavor %u)\n", + flavor); + goto out_no_auth; + } + + /* save the nodename */ + clnt->cl_nodelen = strlen(system_utsname.nodename); + if (clnt->cl_nodelen > UNX_MAXNODENAME) + clnt->cl_nodelen = UNX_MAXNODENAME; + memcpy(clnt->cl_nodename, system_utsname.nodename, clnt->cl_nodelen); + return clnt; + +out_no_auth: + rpc_rmdir(clnt->cl_pathname); +out_no_path: + if (clnt->cl_server != clnt->cl_inline_name) + kfree(clnt->cl_server); + kfree(clnt); +out_err: + return ERR_PTR(err); +} + +/* + * This function clones the RPC client structure. It allows us to share the + * same transport while varying parameters such as the authentication + * flavour. + */ +struct rpc_clnt * +rpc_clone_client(struct rpc_clnt *clnt) +{ + struct rpc_clnt *new; + + new = (struct rpc_clnt *)kmalloc(sizeof(*new), GFP_KERNEL); + if (!new) + goto out_no_clnt; + memcpy(new, clnt, sizeof(*new)); + atomic_set(&new->cl_count, 1); + atomic_set(&new->cl_users, 0); + new->cl_parent = clnt; + atomic_inc(&clnt->cl_count); + /* Duplicate portmapper */ + rpc_init_wait_queue(&new->cl_pmap_default.pm_bindwait, "bindwait"); + /* Turn off autobind on clones */ + new->cl_autobind = 0; + new->cl_oneshot = 0; + new->cl_dead = 0; + rpc_init_rtt(&new->cl_rtt_default, clnt->cl_xprt->timeout.to_initval); + if (new->cl_auth) + atomic_inc(&new->cl_auth->au_count); + return new; +out_no_clnt: + printk(KERN_INFO "RPC: out of memory in %s\n", __FUNCTION__); + return ERR_PTR(-ENOMEM); +} + +/* + * Properly shut down an RPC client, terminating all outstanding + * requests. Note that we must be certain that cl_oneshot and + * cl_dead are cleared, or else the client would be destroyed + * when the last task releases it. 
+ */
+int
+rpc_shutdown_client(struct rpc_clnt *clnt)
+{
+	dprintk("RPC: shutting down %s client for %s, tasks=%d\n",
+			clnt->cl_protname, clnt->cl_server,
+			atomic_read(&clnt->cl_users));
+
+	while (atomic_read(&clnt->cl_users) > 0) {
+		/* Don't let rpc_release_client destroy us */
+		clnt->cl_oneshot = 0;
+		clnt->cl_dead = 0;
+		rpc_killall_tasks(clnt);
+		sleep_on_timeout(&destroy_wait, 1*HZ);
+	}
+
+	if (atomic_read(&clnt->cl_users) < 0) {
+		printk(KERN_ERR "RPC: rpc_shutdown_client clnt %p tasks=%d\n",
+				clnt, atomic_read(&clnt->cl_users));
+#ifdef RPC_DEBUG
+		rpc_show_tasks();
+#endif
+		BUG();
+	}
+
+	return rpc_destroy_client(clnt);
+}
+
+/*
+ * Delete an RPC client
+ */
+int
+rpc_destroy_client(struct rpc_clnt *clnt)
+{
+	if (!atomic_dec_and_test(&clnt->cl_count))
+		return 1;
+	BUG_ON(atomic_read(&clnt->cl_users) != 0);
+
+	dprintk("RPC: destroying %s client for %s\n",
+			clnt->cl_protname, clnt->cl_server);
+	if (clnt->cl_auth) {
+		rpcauth_destroy(clnt->cl_auth);
+		clnt->cl_auth = NULL;
+	}
+	if (clnt->cl_parent != clnt) {
+		rpc_destroy_client(clnt->cl_parent);
+		goto out_free;
+	}
+	if (clnt->cl_pathname[0])
+		rpc_rmdir(clnt->cl_pathname);
+	if (clnt->cl_xprt) {
+		xprt_destroy(clnt->cl_xprt);
+		clnt->cl_xprt = NULL;
+	}
+	if (clnt->cl_server != clnt->cl_inline_name)
+		kfree(clnt->cl_server);
+out_free:
+	kfree(clnt);
+	return 0;
+}
+
+/*
+ * Release an RPC client
+ */
+void
+rpc_release_client(struct rpc_clnt *clnt)
+{
+	dprintk("RPC: rpc_release_client(%p, %d)\n",
+			clnt, atomic_read(&clnt->cl_users));
+
+	if (!atomic_dec_and_test(&clnt->cl_users))
+		return;
+	wake_up(&destroy_wait);
+	if (clnt->cl_oneshot || clnt->cl_dead)
+		rpc_destroy_client(clnt);
+}
+
+/*
+ * Default callback for async RPC calls
+ */
+static void
+rpc_default_callback(struct rpc_task *task)
+{
+}
+
+/*
+ * Export the signal mask handling for asynchronous code that
+ * sleeps on RPC calls
+ */
+
+void rpc_clnt_sigmask(struct rpc_clnt *clnt, sigset_t *oldset)
+{
+	unsigned long sigallow = sigmask(SIGKILL);
+	unsigned long irqflags;
+
+	/* Turn off various signals */
+	if (clnt->cl_intr) {
+		struct k_sigaction *action = current->sighand->action;
+		if (action[SIGINT-1].sa.sa_handler == SIG_DFL)
+			sigallow |= sigmask(SIGINT);
+		if (action[SIGQUIT-1].sa.sa_handler == SIG_DFL)
+			sigallow |= sigmask(SIGQUIT);
+	}
+	spin_lock_irqsave(&current->sighand->siglock, irqflags);
+	*oldset = current->blocked;
+	siginitsetinv(&current->blocked, sigallow & ~oldset->sig[0]);
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+}
+
+void rpc_clnt_sigunmask(struct rpc_clnt *clnt, sigset_t *oldset)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&current->sighand->siglock, irqflags);
+	current->blocked = *oldset;
+	recalc_sigpending();
+	spin_unlock_irqrestore(&current->sighand->siglock, irqflags);
+}
+
+/*
+ * New rpc_call implementation
+ */
+int rpc_call_sync(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
+{
+	struct rpc_task *task;
+	sigset_t oldset;
+	int status;
+
+	/* If this client is slain all further I/O fails */
+	if (clnt->cl_dead)
+		return -EIO;
+
+	BUG_ON(flags & RPC_TASK_ASYNC);
+
+	rpc_clnt_sigmask(clnt, &oldset);
+
+	status = -ENOMEM;
+	task = rpc_new_task(clnt, NULL, flags);
+	if (task == NULL)
+		goto out;
+
+	rpc_call_setup(task, msg, 0);
+
+	/* Set up the call info struct and execute the task */
+	if (task->tk_status == 0)
+		status = rpc_execute(task);
+	else {
+		status = task->tk_status;
+		rpc_release_task(task);
+	}
+
+out:
+	rpc_clnt_sigunmask(clnt, &oldset);
+
+	return status;
+}
+
+/*
+ *
New rpc_call implementation + */ +int +rpc_call_async(struct rpc_clnt *clnt, struct rpc_message *msg, int flags, + rpc_action callback, void *data) +{ + struct rpc_task *task; + sigset_t oldset; + int status; + + /* If this client is slain all further I/O fails */ + if (clnt->cl_dead) + return -EIO; + + flags |= RPC_TASK_ASYNC; + + rpc_clnt_sigmask(clnt, &oldset); + + /* Create/initialize a new RPC task */ + if (!callback) + callback = rpc_default_callback; + status = -ENOMEM; + if (!(task = rpc_new_task(clnt, callback, flags))) + goto out; + task->tk_calldata = data; + + rpc_call_setup(task, msg, 0); + + /* Set up the call info struct and execute the task */ + status = task->tk_status; + if (status == 0) + rpc_execute(task); + else + rpc_release_task(task); + +out: + rpc_clnt_sigunmask(clnt, &oldset); + + return status; +} + + +void +rpc_call_setup(struct rpc_task *task, struct rpc_message *msg, int flags) +{ + task->tk_msg = *msg; + task->tk_flags |= flags; + /* Bind the user cred */ + if (task->tk_msg.rpc_cred != NULL) + rpcauth_holdcred(task); + else + rpcauth_bindcred(task); + + if (task->tk_status == 0) + task->tk_action = call_start; + else + task->tk_action = NULL; +} + +void +rpc_setbufsize(struct rpc_clnt *clnt, unsigned int sndsize, unsigned int rcvsize) +{ + struct rpc_xprt *xprt = clnt->cl_xprt; + + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + RPC_SLACK_SPACE; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + RPC_SLACK_SPACE; + if (xprt_connected(xprt)) + xprt_sock_setbufsize(xprt); +} + +/* + * Return size of largest payload RPC client can support, in bytes + * + * For stream transports, this is one RPC record fragment (see RFC + * 1831), as we don't support multi-record requests yet. For datagram + * transports, this is the size of an IP packet minus the IP, UDP, and + * RPC header sizes. + */ +size_t rpc_max_payload(struct rpc_clnt *clnt) +{ + return clnt->cl_xprt->max_payload; +} +EXPORT_SYMBOL(rpc_max_payload); + +/* + * Restart an (async) RPC call. Usually called from within the + * exit handler. + */ +void +rpc_restart_call(struct rpc_task *task) +{ + if (RPC_ASSASSINATED(task)) + return; + + task->tk_action = call_start; +} + +/* + * 0. Initial state + * + * Other FSM states can be visited zero or more times, but + * this state is visited exactly once for each RPC. + */ +static void +call_start(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + dprintk("RPC: %4d call_start %s%d proc %d (%s)\n", task->tk_pid, + clnt->cl_protname, clnt->cl_vers, task->tk_msg.rpc_proc->p_proc, + (RPC_IS_ASYNC(task) ? "async" : "sync")); + + /* Increment call count */ + task->tk_msg.rpc_proc->p_count++; + clnt->cl_stats->rpccnt++; + task->tk_action = call_reserve; +} + +/* + * 1. Reserve an RPC call slot + */ +static void +call_reserve(struct rpc_task *task) +{ + dprintk("RPC: %4d call_reserve\n", task->tk_pid); + + if (!rpcauth_uptodatecred(task)) { + task->tk_action = call_refresh; + return; + } + + task->tk_status = 0; + task->tk_action = call_reserveresult; + xprt_reserve(task); +} + +/* + * 1b. Grok the result of xprt_reserve() + */ +static void +call_reserveresult(struct rpc_task *task) +{ + int status = task->tk_status; + + dprintk("RPC: %4d call_reserveresult (status %d)\n", + task->tk_pid, task->tk_status); + + /* + * After a call to xprt_reserve(), we must have either + * a request slot or else an error status. 
+ */ + task->tk_status = 0; + if (status >= 0) { + if (task->tk_rqstp) { + task->tk_action = call_allocate; + return; + } + + printk(KERN_ERR "%s: status=%d, but no request slot, exiting\n", + __FUNCTION__, status); + rpc_exit(task, -EIO); + return; + } + + /* + * Even though there was an error, we may have acquired + * a request slot somehow. Make sure not to leak it. + */ + if (task->tk_rqstp) { + printk(KERN_ERR "%s: status=%d, request allocated anyway\n", + __FUNCTION__, status); + xprt_release(task); + } + + switch (status) { + case -EAGAIN: /* woken up; retry */ + task->tk_action = call_reserve; + return; + case -EIO: /* probably a shutdown */ + break; + default: + printk(KERN_ERR "%s: unrecognized error %d, exiting\n", + __FUNCTION__, status); + break; + } + rpc_exit(task, status); +} + +/* + * 2. Allocate the buffer. For details, see sched.c:rpc_malloc. + * (Note: buffer memory is freed in rpc_task_release). + */ +static void +call_allocate(struct rpc_task *task) +{ + unsigned int bufsiz; + + dprintk("RPC: %4d call_allocate (status %d)\n", + task->tk_pid, task->tk_status); + task->tk_action = call_bind; + if (task->tk_buffer) + return; + + /* FIXME: compute buffer requirements more exactly using + * auth->au_wslack */ + bufsiz = task->tk_msg.rpc_proc->p_bufsiz + RPC_SLACK_SPACE; + + if (rpc_malloc(task, bufsiz << 1) != NULL) + return; + printk(KERN_INFO "RPC: buffer allocation failed for task %p\n", task); + + if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) { + xprt_release(task); + task->tk_action = call_reserve; + rpc_delay(task, HZ>>4); + return; + } + + rpc_exit(task, -ERESTARTSYS); +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +call_encode(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + struct xdr_buf *sndbuf = &req->rq_snd_buf; + struct xdr_buf *rcvbuf = &req->rq_rcv_buf; + unsigned int bufsiz; + kxdrproc_t encode; + int status; + u32 *p; + + dprintk("RPC: %4d call_encode (status %d)\n", + task->tk_pid, task->tk_status); + + /* Default buffer setup */ + bufsiz = task->tk_bufsize >> 1; + sndbuf->head[0].iov_base = (void *)task->tk_buffer; + sndbuf->head[0].iov_len = bufsiz; + sndbuf->tail[0].iov_len = 0; + sndbuf->page_len = 0; + sndbuf->len = 0; + sndbuf->buflen = bufsiz; + rcvbuf->head[0].iov_base = (void *)((char *)task->tk_buffer + bufsiz); + rcvbuf->head[0].iov_len = bufsiz; + rcvbuf->tail[0].iov_len = 0; + rcvbuf->page_len = 0; + rcvbuf->len = 0; + rcvbuf->buflen = bufsiz; + + /* Encode header and provided arguments */ + encode = task->tk_msg.rpc_proc->p_encode; + if (!(p = call_header(task))) { + printk(KERN_INFO "RPC: call_header failed, exit EIO\n"); + rpc_exit(task, -EIO); + return; + } + if (encode && (status = rpcauth_wrap_req(task, encode, req, p, + task->tk_msg.rpc_argp)) < 0) { + printk(KERN_WARNING "%s: can't encode arguments: %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); + } +} + +/* + * 4. Get the server port number if not yet set + */ +static void +call_bind(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + + dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? "is" : "is not")); + + task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_connect; + + if (!clnt->cl_port) { + task->tk_action = call_connect; + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_getport(task, clnt); + } +} + +/* + * 4a. 
Connect to the RPC server (TCP case) + */ +static void +call_connect(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + dprintk("RPC: %4d call_connect status %d\n", + task->tk_pid, task->tk_status); + + if (xprt_connected(clnt->cl_xprt)) { + task->tk_action = call_transmit; + return; + } + task->tk_action = call_connect_status; + if (task->tk_status < 0) + return; + xprt_connect(task); +} + +/* + * 4b. Sort out connect result + */ +static void +call_connect_status(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + int status = task->tk_status; + + task->tk_status = 0; + if (status >= 0) { + clnt->cl_stats->netreconn++; + task->tk_action = call_transmit; + return; + } + + /* Something failed: we may have to rebind */ + if (clnt->cl_autobind) + clnt->cl_port = 0; + switch (status) { + case -ENOTCONN: + case -ETIMEDOUT: + case -EAGAIN: + task->tk_action = (clnt->cl_port == 0) ? call_bind : call_connect; + break; + default: + rpc_exit(task, -EIO); + } +} + +/* + * 5. Transmit the RPC request, and wait for reply + */ +static void +call_transmit(struct rpc_task *task) +{ + dprintk("RPC: %4d call_transmit (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_action = call_status; + if (task->tk_status < 0) + return; + task->tk_status = xprt_prepare_transmit(task); + if (task->tk_status != 0) + return; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + if (!task->tk_rqstp->rq_bytes_sent) + call_encode(task); + if (task->tk_status < 0) + return; + xprt_transmit(task); + if (task->tk_status < 0) + return; + if (!task->tk_msg.rpc_proc->p_decode) { + task->tk_action = NULL; + rpc_wake_up_task(task); + } +} + +/* + * 6. Sort out the RPC call status + */ +static void +call_status(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + int status; + + if (req->rq_received > 0 && !req->rq_bytes_sent) + task->tk_status = req->rq_received; + + dprintk("RPC: %4d call_status (status %d)\n", + task->tk_pid, task->tk_status); + + status = task->tk_status; + if (status >= 0) { + task->tk_action = call_decode; + return; + } + + task->tk_status = 0; + switch(status) { + case -ETIMEDOUT: + task->tk_action = call_timeout; + break; + case -ECONNREFUSED: + case -ENOTCONN: + req->rq_bytes_sent = 0; + if (clnt->cl_autobind) + clnt->cl_port = 0; + task->tk_action = call_bind; + break; + case -EAGAIN: + task->tk_action = call_transmit; + break; + case -EIO: + /* shutdown or soft timeout */ + rpc_exit(task, status); + break; + default: + if (clnt->cl_chatty) + printk("%s: RPC call returned error %d\n", + clnt->cl_protname, -status); + rpc_exit(task, status); + break; + } +} + +/* + * 6a. Handle RPC timeout + * We do not release the request slot, so we keep using the + * same XID for all retransmits. 
+ */ +static void +call_timeout(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + + if (xprt_adjust_timeout(task->tk_rqstp) == 0) { + dprintk("RPC: %4d call_timeout (minor)\n", task->tk_pid); + goto retry; + } + + dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); + if (RPC_IS_SOFT(task)) { + if (clnt->cl_chatty) + printk(KERN_NOTICE "%s: server %s not responding, timed out\n", + clnt->cl_protname, clnt->cl_server); + rpc_exit(task, -EIO); + return; + } + + if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN)) { + task->tk_flags |= RPC_CALL_MAJORSEEN; + printk(KERN_NOTICE "%s: server %s not responding, still trying\n", + clnt->cl_protname, clnt->cl_server); + } + if (clnt->cl_autobind) + clnt->cl_port = 0; + +retry: + clnt->cl_stats->rpcretrans++; + task->tk_action = call_bind; + task->tk_status = 0; +} + +/* + * 7. Decode the RPC reply + */ +static void +call_decode(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + kxdrproc_t decode = task->tk_msg.rpc_proc->p_decode; + u32 *p; + + dprintk("RPC: %4d call_decode (status %d)\n", + task->tk_pid, task->tk_status); + + if (clnt->cl_chatty && (task->tk_flags & RPC_CALL_MAJORSEEN)) { + printk(KERN_NOTICE "%s: server %s OK\n", + clnt->cl_protname, clnt->cl_server); + task->tk_flags &= ~RPC_CALL_MAJORSEEN; + } + + if (task->tk_status < 12) { + if (!RPC_IS_SOFT(task)) { + task->tk_action = call_bind; + clnt->cl_stats->rpcretrans++; + goto out_retry; + } + printk(KERN_WARNING "%s: too small RPC reply size (%d bytes)\n", + clnt->cl_protname, task->tk_status); + rpc_exit(task, -EIO); + return; + } + + req->rq_rcv_buf.len = req->rq_private_buf.len; + + /* Check that the softirq receive buffer is valid */ + WARN_ON(memcmp(&req->rq_rcv_buf, &req->rq_private_buf, + sizeof(req->rq_rcv_buf)) != 0); + + /* Verify the RPC header */ + if (!(p = call_verify(task))) { + if (task->tk_action == NULL) + return; + goto out_retry; + } + + task->tk_action = NULL; + + if (decode) + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + dprintk("RPC: %4d call_decode result %d\n", task->tk_pid, + task->tk_status); + return; +out_retry: + req->rq_received = req->rq_private_buf.len = 0; + task->tk_status = 0; +} + +/* + * 8. Refresh the credentials if rejected by the server + */ +static void +call_refresh(struct rpc_task *task) +{ + dprintk("RPC: %4d call_refresh\n", task->tk_pid); + + xprt_release(task); /* Must do to obtain new XID */ + task->tk_action = call_refreshresult; + task->tk_status = 0; + task->tk_client->cl_stats->rpcauthrefresh++; + rpcauth_refreshcred(task); +} + +/* + * 8a. Process the results of a credential refresh + */ +static void +call_refreshresult(struct rpc_task *task) +{ + int status = task->tk_status; + dprintk("RPC: %4d call_refreshresult (status %d)\n", + task->tk_pid, task->tk_status); + + task->tk_status = 0; + task->tk_action = call_reserve; + if (status >= 0 && rpcauth_uptodatecred(task)) + return; + if (status == -EACCES) { + rpc_exit(task, -EACCES); + return; + } + task->tk_action = call_refresh; + if (status != -ETIMEDOUT) + rpc_delay(task, 3*HZ); + return; +} + +/* + * Call header serialization + */ +static u32 * +call_header(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_xprt *xprt = clnt->cl_xprt; + struct rpc_rqst *req = task->tk_rqstp; + u32 *p = req->rq_svec[0].iov_base; + + /* FIXME: check buffer size? 
*/ + if (xprt->stream) + *p++ = 0; /* fill in later */ + *p++ = req->rq_xid; /* XID */ + *p++ = htonl(RPC_CALL); /* CALL */ + *p++ = htonl(RPC_VERSION); /* RPC version */ + *p++ = htonl(clnt->cl_prog); /* program number */ + *p++ = htonl(clnt->cl_vers); /* program version */ + *p++ = htonl(task->tk_msg.rpc_proc->p_proc); /* procedure */ + return rpcauth_marshcred(task, p); +} + +/* + * Reply header verification + */ +static u32 * +call_verify(struct rpc_task *task) +{ + struct kvec *iov = &task->tk_rqstp->rq_rcv_buf.head[0]; + int len = task->tk_rqstp->rq_rcv_buf.len >> 2; + u32 *p = iov->iov_base, n; + int error = -EACCES; + + if ((len -= 3) < 0) + goto out_overflow; + p += 1; /* skip XID */ + + if ((n = ntohl(*p++)) != RPC_REPLY) { + printk(KERN_WARNING "call_verify: not an RPC reply: %x\n", n); + goto out_retry; + } + if ((n = ntohl(*p++)) != RPC_MSG_ACCEPTED) { + if (--len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_AUTH_ERROR: + break; + case RPC_MISMATCH: + printk(KERN_WARNING "%s: RPC call version mismatch!\n", __FUNCTION__); + goto out_eio; + default: + printk(KERN_WARNING "%s: RPC call rejected, unknown error: %x\n", __FUNCTION__, n); + goto out_eio; + } + if (--len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_AUTH_REJECTEDCRED: + case RPC_AUTH_REJECTEDVERF: + case RPCSEC_GSS_CREDPROBLEM: + case RPCSEC_GSS_CTXPROBLEM: + if (!task->tk_cred_retry) + break; + task->tk_cred_retry--; + dprintk("RPC: %4d call_verify: retry stale creds\n", + task->tk_pid); + rpcauth_invalcred(task); + task->tk_action = call_refresh; + return NULL; + case RPC_AUTH_BADCRED: + case RPC_AUTH_BADVERF: + /* possibly garbled cred/verf? */ + if (!task->tk_garb_retry) + break; + task->tk_garb_retry--; + dprintk("RPC: %4d call_verify: retry garbled creds\n", + task->tk_pid); + task->tk_action = call_bind; + return NULL; + case RPC_AUTH_TOOWEAK: + printk(KERN_NOTICE "call_verify: server requires stronger " + "authentication.\n"); + break; + default: + printk(KERN_WARNING "call_verify: unknown auth error: %x\n", n); + error = -EIO; + } + dprintk("RPC: %4d call_verify: call rejected %d\n", + task->tk_pid, n); + goto out_err; + } + if (!(p = rpcauth_checkverf(task, p))) { + printk(KERN_WARNING "call_verify: auth check failed\n"); + goto out_retry; /* bad verifier, retry */ + } + len = p - (u32 *)iov->iov_base - 1; + if (len < 0) + goto out_overflow; + switch ((n = ntohl(*p++))) { + case RPC_SUCCESS: + return p; + case RPC_PROG_UNAVAIL: + printk(KERN_WARNING "RPC: call_verify: program %u is unsupported by server %s\n", + (unsigned int)task->tk_client->cl_prog, + task->tk_client->cl_server); + goto out_eio; + case RPC_PROG_MISMATCH: + printk(KERN_WARNING "RPC: call_verify: program %u, version %u unsupported by server %s\n", + (unsigned int)task->tk_client->cl_prog, + (unsigned int)task->tk_client->cl_vers, + task->tk_client->cl_server); + goto out_eio; + case RPC_PROC_UNAVAIL: + printk(KERN_WARNING "RPC: call_verify: proc %p unsupported by program %u, version %u on server %s\n", + task->tk_msg.rpc_proc, + task->tk_client->cl_prog, + task->tk_client->cl_vers, + task->tk_client->cl_server); + goto out_eio; + case RPC_GARBAGE_ARGS: + dprintk("RPC: %4d %s: server saw garbage\n", task->tk_pid, __FUNCTION__); + break; /* retry */ + default: + printk(KERN_WARNING "call_verify: server accept status: %x\n", n); + /* Also retry */ + } + +out_retry: + task->tk_client->cl_stats->rpcgarbage++; + if (task->tk_garb_retry) { + task->tk_garb_retry--; + dprintk(KERN_WARNING "RPC %s: retrying 
%4d\n", __FUNCTION__, task->tk_pid); + task->tk_action = call_bind; + return NULL; + } + printk(KERN_WARNING "RPC %s: retry failed, exit EIO\n", __FUNCTION__); +out_eio: + error = -EIO; +out_err: + rpc_exit(task, error); + return NULL; +out_overflow: + printk(KERN_WARNING "RPC %s: server reply was truncated.\n", __FUNCTION__); + goto out_retry; +} diff --git a/net/sunrpc/pmap_clnt.c b/net/sunrpc/pmap_clnt.c new file mode 100644 index 000000000000..d0b1d2c34a4d --- /dev/null +++ b/net/sunrpc/pmap_clnt.c @@ -0,0 +1,298 @@ +/* + * linux/net/sunrpc/pmap.c + * + * Portmapper client. + * + * Copyright (C) 1996, Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef RPC_DEBUG +# define RPCDBG_FACILITY RPCDBG_PMAP +#endif + +#define PMAP_SET 1 +#define PMAP_UNSET 2 +#define PMAP_GETPORT 3 + +static struct rpc_procinfo pmap_procedures[]; +static struct rpc_clnt * pmap_create(char *, struct sockaddr_in *, int); +static void pmap_getport_done(struct rpc_task *); +static struct rpc_program pmap_program; +static DEFINE_SPINLOCK(pmap_lock); + +/* + * Obtain the port for a given RPC service on a given host. This one can + * be called for an ongoing RPC request. + */ +void +rpc_getport(struct rpc_task *task, struct rpc_clnt *clnt) +{ + struct rpc_portmap *map = clnt->cl_pmap; + struct sockaddr_in *sap = &clnt->cl_xprt->addr; + struct rpc_message msg = { + .rpc_proc = &pmap_procedures[PMAP_GETPORT], + .rpc_argp = map, + .rpc_resp = &clnt->cl_port, + .rpc_cred = NULL + }; + struct rpc_clnt *pmap_clnt; + struct rpc_task *child; + + dprintk("RPC: %4d rpc_getport(%s, %d, %d, %d)\n", + task->tk_pid, clnt->cl_server, + map->pm_prog, map->pm_vers, map->pm_prot); + + spin_lock(&pmap_lock); + if (map->pm_binding) { + rpc_sleep_on(&map->pm_bindwait, task, NULL, NULL); + spin_unlock(&pmap_lock); + return; + } + map->pm_binding = 1; + spin_unlock(&pmap_lock); + + pmap_clnt = pmap_create(clnt->cl_server, sap, map->pm_prot); + if (IS_ERR(pmap_clnt)) { + task->tk_status = PTR_ERR(pmap_clnt); + goto bailout; + } + task->tk_status = 0; + + /* + * Note: rpc_new_child will release client after a failure. + */ + if (!(child = rpc_new_child(pmap_clnt, task))) + goto bailout; + + /* Setup the call info struct */ + rpc_call_setup(child, &msg, 0); + + /* ... 
and run the child task */ + rpc_run_child(task, child, pmap_getport_done); + return; + +bailout: + spin_lock(&pmap_lock); + map->pm_binding = 0; + rpc_wake_up(&map->pm_bindwait); + spin_unlock(&pmap_lock); + task->tk_status = -EIO; + task->tk_action = NULL; +} + +#ifdef CONFIG_ROOT_NFS +int +rpc_getport_external(struct sockaddr_in *sin, __u32 prog, __u32 vers, int prot) +{ + struct rpc_portmap map = { + .pm_prog = prog, + .pm_vers = vers, + .pm_prot = prot, + .pm_port = 0 + }; + struct rpc_clnt *pmap_clnt; + char hostname[32]; + int status; + + dprintk("RPC: rpc_getport_external(%u.%u.%u.%u, %d, %d, %d)\n", + NIPQUAD(sin->sin_addr.s_addr), prog, vers, prot); + + sprintf(hostname, "%u.%u.%u.%u", NIPQUAD(sin->sin_addr.s_addr)); + pmap_clnt = pmap_create(hostname, sin, prot); + if (IS_ERR(pmap_clnt)) + return PTR_ERR(pmap_clnt); + + /* Setup the call info struct */ + status = rpc_call(pmap_clnt, PMAP_GETPORT, &map, &map.pm_port, 0); + + if (status >= 0) { + if (map.pm_port != 0) + return map.pm_port; + status = -EACCES; + } + return status; +} +#endif + +static void +pmap_getport_done(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_portmap *map = clnt->cl_pmap; + + dprintk("RPC: %4d pmap_getport_done(status %d, port %d)\n", + task->tk_pid, task->tk_status, clnt->cl_port); + if (task->tk_status < 0) { + /* Make the calling task exit with an error */ + task->tk_action = NULL; + } else if (clnt->cl_port == 0) { + /* Program not registered */ + task->tk_status = -EACCES; + task->tk_action = NULL; + } else { + /* byte-swap port number first */ + clnt->cl_port = htons(clnt->cl_port); + clnt->cl_xprt->addr.sin_port = clnt->cl_port; + } + spin_lock(&pmap_lock); + map->pm_binding = 0; + rpc_wake_up(&map->pm_bindwait); + spin_unlock(&pmap_lock); +} + +/* + * Set or unset a port registration with the local portmapper. + * port == 0 means unregister, port != 0 means register. + */ +int +rpc_register(u32 prog, u32 vers, int prot, unsigned short port, int *okay) +{ + struct sockaddr_in sin; + struct rpc_portmap map; + struct rpc_clnt *pmap_clnt; + int error = 0; + + dprintk("RPC: registering (%d, %d, %d, %d) with portmapper.\n", + prog, vers, prot, port); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK); + pmap_clnt = pmap_create("localhost", &sin, IPPROTO_UDP); + if (IS_ERR(pmap_clnt)) { + error = PTR_ERR(pmap_clnt); + dprintk("RPC: couldn't create pmap client. Error = %d\n", error); + return error; + } + + map.pm_prog = prog; + map.pm_vers = vers; + map.pm_prot = prot; + map.pm_port = port; + + error = rpc_call(pmap_clnt, port? 
PMAP_SET : PMAP_UNSET, + &map, okay, 0); + + if (error < 0) { + printk(KERN_WARNING + "RPC: failed to contact portmap (errno %d).\n", + error); + } + dprintk("RPC: registration status %d/%d\n", error, *okay); + + /* Client deleted automatically because cl_oneshot == 1 */ + return error; +} + +static struct rpc_clnt * +pmap_create(char *hostname, struct sockaddr_in *srvaddr, int proto) +{ + struct rpc_xprt *xprt; + struct rpc_clnt *clnt; + + /* printk("pmap: create xprt\n"); */ + xprt = xprt_create_proto(proto, srvaddr, NULL); + if (IS_ERR(xprt)) + return (struct rpc_clnt *)xprt; + xprt->addr.sin_port = htons(RPC_PMAP_PORT); + + /* printk("pmap: create clnt\n"); */ + clnt = rpc_create_client(xprt, hostname, + &pmap_program, RPC_PMAP_VERSION, + RPC_AUTH_UNIX); + if (IS_ERR(clnt)) { + xprt_destroy(xprt); + } else { + clnt->cl_softrtry = 1; + clnt->cl_chatty = 1; + clnt->cl_oneshot = 1; + } + return clnt; +} + +/* + * XDR encode/decode functions for PMAP + */ +static int +xdr_encode_mapping(struct rpc_rqst *req, u32 *p, struct rpc_portmap *map) +{ + dprintk("RPC: xdr_encode_mapping(%d, %d, %d, %d)\n", + map->pm_prog, map->pm_vers, map->pm_prot, map->pm_port); + *p++ = htonl(map->pm_prog); + *p++ = htonl(map->pm_vers); + *p++ = htonl(map->pm_prot); + *p++ = htonl(map->pm_port); + + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + return 0; +} + +static int +xdr_decode_port(struct rpc_rqst *req, u32 *p, unsigned short *portp) +{ + *portp = (unsigned short) ntohl(*p++); + return 0; +} + +static int +xdr_decode_bool(struct rpc_rqst *req, u32 *p, unsigned int *boolp) +{ + *boolp = (unsigned int) ntohl(*p++); + return 0; +} + +static struct rpc_procinfo pmap_procedures[] = { +[PMAP_SET] = { + .p_proc = PMAP_SET, + .p_encode = (kxdrproc_t) xdr_encode_mapping, + .p_decode = (kxdrproc_t) xdr_decode_bool, + .p_bufsiz = 4, + .p_count = 1, + }, +[PMAP_UNSET] = { + .p_proc = PMAP_UNSET, + .p_encode = (kxdrproc_t) xdr_encode_mapping, + .p_decode = (kxdrproc_t) xdr_decode_bool, + .p_bufsiz = 4, + .p_count = 1, + }, +[PMAP_GETPORT] = { + .p_proc = PMAP_GETPORT, + .p_encode = (kxdrproc_t) xdr_encode_mapping, + .p_decode = (kxdrproc_t) xdr_decode_port, + .p_bufsiz = 4, + .p_count = 1, + }, +}; + +static struct rpc_version pmap_version2 = { + .number = 2, + .nrprocs = 4, + .procs = pmap_procedures +}; + +static struct rpc_version * pmap_version[] = { + NULL, + NULL, + &pmap_version2 +}; + +static struct rpc_stat pmap_stats; + +static struct rpc_program pmap_program = { + .name = "portmap", + .number = RPC_PMAP_PROGRAM, + .nrvers = ARRAY_SIZE(pmap_version), + .version = pmap_version, + .stats = &pmap_stats, +}; diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c new file mode 100644 index 000000000000..554f224c0445 --- /dev/null +++ b/net/sunrpc/rpc_pipe.c @@ -0,0 +1,838 @@ +/* + * net/sunrpc/rpc_pipe.c + * + * Userland/kernel interface for rpcauth_gss. 
+ * Code shamelessly plagiarized from fs/nfsd/nfsctl.c + * and fs/driverfs/inode.c + * + * Copyright (c) 2002, Trond Myklebust + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct vfsmount *rpc_mount; +static int rpc_mount_count; + +static struct file_system_type rpc_pipe_fs_type; + + +static kmem_cache_t *rpc_inode_cachep; + +#define RPC_UPCALL_TIMEOUT (30*HZ) + +static void +__rpc_purge_upcall(struct inode *inode, int err) +{ + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_pipe_msg *msg; + + while (!list_empty(&rpci->pipe)) { + msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); + list_del_init(&msg->list); + msg->errno = err; + rpci->ops->destroy_msg(msg); + } + while (!list_empty(&rpci->in_upcall)) { + msg = list_entry(rpci->pipe.next, struct rpc_pipe_msg, list); + list_del_init(&msg->list); + msg->errno = err; + rpci->ops->destroy_msg(msg); + } + rpci->pipelen = 0; + wake_up(&rpci->waitq); +} + +static void +rpc_timeout_upcall_queue(void *data) +{ + struct rpc_inode *rpci = (struct rpc_inode *)data; + struct inode *inode = &rpci->vfs_inode; + + down(&inode->i_sem); + if (rpci->nreaders == 0 && !list_empty(&rpci->pipe)) + __rpc_purge_upcall(inode, -ETIMEDOUT); + up(&inode->i_sem); +} + +int +rpc_queue_upcall(struct inode *inode, struct rpc_pipe_msg *msg) +{ + struct rpc_inode *rpci = RPC_I(inode); + int res = 0; + + down(&inode->i_sem); + if (rpci->nreaders) { + list_add_tail(&msg->list, &rpci->pipe); + rpci->pipelen += msg->len; + } else if (rpci->flags & RPC_PIPE_WAIT_FOR_OPEN) { + if (list_empty(&rpci->pipe)) + schedule_delayed_work(&rpci->queue_timeout, + RPC_UPCALL_TIMEOUT); + list_add_tail(&msg->list, &rpci->pipe); + rpci->pipelen += msg->len; + } else + res = -EPIPE; + up(&inode->i_sem); + wake_up(&rpci->waitq); + return res; +} + +static void +rpc_close_pipes(struct inode *inode) +{ + struct rpc_inode *rpci = RPC_I(inode); + + cancel_delayed_work(&rpci->queue_timeout); + flush_scheduled_work(); + down(&inode->i_sem); + if (rpci->ops != NULL) { + rpci->nreaders = 0; + __rpc_purge_upcall(inode, -EPIPE); + rpci->nwriters = 0; + if (rpci->ops->release_pipe) + rpci->ops->release_pipe(inode); + rpci->ops = NULL; + } + up(&inode->i_sem); +} + +static inline void +rpc_inode_setowner(struct inode *inode, void *private) +{ + RPC_I(inode)->private = private; +} + +static struct inode * +rpc_alloc_inode(struct super_block *sb) +{ + struct rpc_inode *rpci; + rpci = (struct rpc_inode *)kmem_cache_alloc(rpc_inode_cachep, SLAB_KERNEL); + if (!rpci) + return NULL; + return &rpci->vfs_inode; +} + +static void +rpc_destroy_inode(struct inode *inode) +{ + kmem_cache_free(rpc_inode_cachep, RPC_I(inode)); +} + +static int +rpc_pipe_open(struct inode *inode, struct file *filp) +{ + struct rpc_inode *rpci = RPC_I(inode); + int res = -ENXIO; + + down(&inode->i_sem); + if (rpci->ops != NULL) { + if (filp->f_mode & FMODE_READ) + rpci->nreaders ++; + if (filp->f_mode & FMODE_WRITE) + rpci->nwriters ++; + res = 0; + } + up(&inode->i_sem); + return res; +} + +static int +rpc_pipe_release(struct inode *inode, struct file *filp) +{ + struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + struct rpc_pipe_msg *msg; + + down(&inode->i_sem); + if (rpci->ops == NULL) + goto out; + msg = (struct rpc_pipe_msg *)filp->private_data; + if (msg != NULL) { + msg->errno = -EPIPE; + list_del_init(&msg->list); + rpci->ops->destroy_msg(msg); + } + if 
(filp->f_mode & FMODE_WRITE) + rpci->nwriters --; + if (filp->f_mode & FMODE_READ) + rpci->nreaders --; + if (!rpci->nreaders) + __rpc_purge_upcall(inode, -EPIPE); + if (rpci->ops->release_pipe) + rpci->ops->release_pipe(inode); +out: + up(&inode->i_sem); + return 0; +} + +static ssize_t +rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + struct rpc_pipe_msg *msg; + int res = 0; + + down(&inode->i_sem); + if (rpci->ops == NULL) { + res = -EPIPE; + goto out_unlock; + } + msg = filp->private_data; + if (msg == NULL) { + if (!list_empty(&rpci->pipe)) { + msg = list_entry(rpci->pipe.next, + struct rpc_pipe_msg, + list); + list_move(&msg->list, &rpci->in_upcall); + rpci->pipelen -= msg->len; + filp->private_data = msg; + msg->copied = 0; + } + if (msg == NULL) + goto out_unlock; + } + /* NOTE: it is up to the callback to update msg->copied */ + res = rpci->ops->upcall(filp, msg, buf, len); + if (res < 0 || msg->len == msg->copied) { + filp->private_data = NULL; + list_del_init(&msg->list); + rpci->ops->destroy_msg(msg); + } +out_unlock: + up(&inode->i_sem); + return res; +} + +static ssize_t +rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *offset) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct rpc_inode *rpci = RPC_I(inode); + int res; + + down(&inode->i_sem); + res = -EPIPE; + if (rpci->ops != NULL) + res = rpci->ops->downcall(filp, buf, len); + up(&inode->i_sem); + return res; +} + +static unsigned int +rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) +{ + struct rpc_inode *rpci; + unsigned int mask = 0; + + rpci = RPC_I(filp->f_dentry->d_inode); + poll_wait(filp, &rpci->waitq, wait); + + mask = POLLOUT | POLLWRNORM; + if (rpci->ops == NULL) + mask |= POLLERR | POLLHUP; + if (!list_empty(&rpci->pipe)) + mask |= POLLIN | POLLRDNORM; + return mask; +} + +static int +rpc_pipe_ioctl(struct inode *ino, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct rpc_inode *rpci = RPC_I(filp->f_dentry->d_inode); + int len; + + switch (cmd) { + case FIONREAD: + if (rpci->ops == NULL) + return -EPIPE; + len = rpci->pipelen; + if (filp->private_data) { + struct rpc_pipe_msg *msg; + msg = (struct rpc_pipe_msg *)filp->private_data; + len += msg->len - msg->copied; + } + return put_user(len, (int __user *)arg); + default: + return -EINVAL; + } +} + +static struct file_operations rpc_pipe_fops = { + .owner = THIS_MODULE, + .llseek = no_llseek, + .read = rpc_pipe_read, + .write = rpc_pipe_write, + .poll = rpc_pipe_poll, + .ioctl = rpc_pipe_ioctl, + .open = rpc_pipe_open, + .release = rpc_pipe_release, +}; + +static int +rpc_show_info(struct seq_file *m, void *v) +{ + struct rpc_clnt *clnt = m->private; + + seq_printf(m, "RPC server: %s\n", clnt->cl_server); + seq_printf(m, "service: %s (%d) version %d\n", clnt->cl_protname, + clnt->cl_prog, clnt->cl_vers); + seq_printf(m, "address: %u.%u.%u.%u\n", + NIPQUAD(clnt->cl_xprt->addr.sin_addr.s_addr)); + seq_printf(m, "protocol: %s\n", + clnt->cl_xprt->prot == IPPROTO_UDP ? 
"udp" : "tcp"); + return 0; +} + +static int +rpc_info_open(struct inode *inode, struct file *file) +{ + struct rpc_clnt *clnt; + int ret = single_open(file, rpc_show_info, NULL); + + if (!ret) { + struct seq_file *m = file->private_data; + down(&inode->i_sem); + clnt = RPC_I(inode)->private; + if (clnt) { + atomic_inc(&clnt->cl_users); + m->private = clnt; + } else { + single_release(inode, file); + ret = -EINVAL; + } + up(&inode->i_sem); + } + return ret; +} + +static int +rpc_info_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + struct rpc_clnt *clnt = (struct rpc_clnt *)m->private; + + if (clnt) + rpc_release_client(clnt); + return single_release(inode, file); +} + +static struct file_operations rpc_info_operations = { + .owner = THIS_MODULE, + .open = rpc_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = rpc_info_release, +}; + + +/* + * We have a single directory with 1 node in it. + */ +enum { + RPCAUTH_Root = 1, + RPCAUTH_lockd, + RPCAUTH_mount, + RPCAUTH_nfs, + RPCAUTH_portmap, + RPCAUTH_statd, + RPCAUTH_RootEOF +}; + +/* + * Description of fs contents. + */ +struct rpc_filelist { + char *name; + struct file_operations *i_fop; + int mode; +}; + +static struct rpc_filelist files[] = { + [RPCAUTH_lockd] = { + .name = "lockd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_mount] = { + .name = "mount", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_nfs] = { + .name = "nfs", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_portmap] = { + .name = "portmap", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, + [RPCAUTH_statd] = { + .name = "statd", + .mode = S_IFDIR | S_IRUGO | S_IXUGO, + }, +}; + +enum { + RPCAUTH_info = 2, + RPCAUTH_EOF +}; + +static struct rpc_filelist authfiles[] = { + [RPCAUTH_info] = { + .name = "info", + .i_fop = &rpc_info_operations, + .mode = S_IFREG | S_IRUSR, + }, +}; + +static int +rpc_get_mount(void) +{ + return simple_pin_fs("rpc_pipefs", &rpc_mount, &rpc_mount_count); +} + +static void +rpc_put_mount(void) +{ + simple_release_fs(&rpc_mount, &rpc_mount_count); +} + +static int +rpc_lookup_parent(char *path, struct nameidata *nd) +{ + if (path[0] == '\0') + return -ENOENT; + if (rpc_get_mount()) { + printk(KERN_WARNING "%s: %s failed to mount " + "pseudofilesystem \n", __FILE__, __FUNCTION__); + return -ENODEV; + } + nd->mnt = mntget(rpc_mount); + nd->dentry = dget(rpc_mount->mnt_root); + nd->last_type = LAST_ROOT; + nd->flags = LOOKUP_PARENT; + nd->depth = 0; + + if (path_walk(path, nd)) { + printk(KERN_WARNING "%s: %s failed to find path %s\n", + __FILE__, __FUNCTION__, path); + rpc_put_mount(); + return -ENOENT; + } + return 0; +} + +static void +rpc_release_path(struct nameidata *nd) +{ + path_release(nd); + rpc_put_mount(); +} + +static struct inode * +rpc_get_inode(struct super_block *sb, int mode) +{ + struct inode *inode = new_inode(sb); + if (!inode) + return NULL; + inode->i_mode = mode; + inode->i_uid = inode->i_gid = 0; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + switch(mode & S_IFMT) { + case S_IFDIR: + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inode->i_nlink++; + default: + break; + } + return inode; +} + +/* + * FIXME: This probably has races. 
+ */ +static void +rpc_depopulate(struct dentry *parent) +{ + struct inode *dir = parent->d_inode; + struct list_head *pos, *next; + struct dentry *dentry, *dvec[10]; + int n = 0; + + down(&dir->i_sem); +repeat: + spin_lock(&dcache_lock); + list_for_each_safe(pos, next, &parent->d_subdirs) { + dentry = list_entry(pos, struct dentry, d_child); + spin_lock(&dentry->d_lock); + if (!d_unhashed(dentry)) { + dget_locked(dentry); + __d_drop(dentry); + spin_unlock(&dentry->d_lock); + dvec[n++] = dentry; + if (n == ARRAY_SIZE(dvec)) + break; + } else + spin_unlock(&dentry->d_lock); + } + spin_unlock(&dcache_lock); + if (n) { + do { + dentry = dvec[--n]; + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + simple_unlink(dir, dentry); + } + dput(dentry); + } while (n); + goto repeat; + } + up(&dir->i_sem); +} + +static int +rpc_populate(struct dentry *parent, + struct rpc_filelist *files, + int start, int eof) +{ + struct inode *inode, *dir = parent->d_inode; + void *private = RPC_I(dir)->private; + struct dentry *dentry; + int mode, i; + + down(&dir->i_sem); + for (i = start; i < eof; i++) { + dentry = d_alloc_name(parent, files[i].name); + if (!dentry) + goto out_bad; + mode = files[i].mode; + inode = rpc_get_inode(dir->i_sb, mode); + if (!inode) { + dput(dentry); + goto out_bad; + } + inode->i_ino = i; + if (files[i].i_fop) + inode->i_fop = files[i].i_fop; + if (private) + rpc_inode_setowner(inode, private); + if (S_ISDIR(mode)) + dir->i_nlink++; + d_add(dentry, inode); + } + up(&dir->i_sem); + return 0; +out_bad: + up(&dir->i_sem); + printk(KERN_WARNING "%s: %s failed to populate directory %s\n", + __FILE__, __FUNCTION__, parent->d_name.name); + return -ENOMEM; +} + +static int +__rpc_mkdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + + inode = rpc_get_inode(dir->i_sb, S_IFDIR | S_IRUSR | S_IXUSR); + if (!inode) + goto out_err; + inode->i_ino = iunique(dir->i_sb, 100); + d_instantiate(dentry, inode); + dir->i_nlink++; + inode_dir_notify(dir, DN_CREATE); + rpc_get_mount(); + return 0; +out_err: + printk(KERN_WARNING "%s: %s failed to allocate inode for dentry %s\n", + __FILE__, __FUNCTION__, dentry->d_name.name); + return -ENOMEM; +} + +static int +__rpc_rmdir(struct inode *dir, struct dentry *dentry) +{ + int error; + + shrink_dcache_parent(dentry); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + } + if ((error = simple_rmdir(dir, dentry)) != 0) + return error; + if (!error) { + inode_dir_notify(dir, DN_DELETE); + d_drop(dentry); + rpc_put_mount(); + } + return 0; +} + +static struct dentry * +rpc_lookup_negative(char *path, struct nameidata *nd) +{ + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, nd)) != 0) + return ERR_PTR(error); + dir = nd->dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd->last, nd->dentry); + if (IS_ERR(dentry)) + goto out_err; + if (dentry->d_inode) { + dput(dentry); + dentry = ERR_PTR(-EEXIST); + goto out_err; + } + return dentry; +out_err: + up(&dir->i_sem); + rpc_release_path(nd); + return dentry; +} + + +struct dentry * +rpc_mkdir(char *path, struct rpc_clnt *rpc_client) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + dentry = rpc_lookup_negative(path, &nd); + if (IS_ERR(dentry)) + return dentry; + dir = nd.dentry->d_inode; + if ((error = __rpc_mkdir(dir, dentry)) != 0) + goto err_dput; + RPC_I(dentry->d_inode)->private = 
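/*
 * rpc_depopulate() drains the directory in fixed-size batches: it detaches
 * up to ten dentries while holding dcache_lock, drops the lock, tears the
 * batch down, then goes back for more.  A user-space sketch of that pattern,
 * with a pthread mutex standing in for the spinlock (illustrative only).
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { struct node *next; int id; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void drain(void)
{
    struct node *batch[10];
    int n;

    for (;;) {
        n = 0;
        pthread_mutex_lock(&list_lock);
        while (head && n < 10) {            /* detach at most ten entries under the lock */
            batch[n++] = head;
            head = head->next;
        }
        pthread_mutex_unlock(&list_lock);
        if (!n)
            break;                          /* nothing collected: we are done */
        while (n)                           /* expensive teardown runs with the lock dropped */
            free(batch[--n]);
    }
}

int main(void)
{
    for (int i = 0; i < 25; i++) {
        struct node *nd = malloc(sizeof(*nd));
        nd->id = i;
        nd->next = head;
        head = nd;
    }
    drain();
    printf("list drained\n");
    return 0;
}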
rpc_client; + error = rpc_populate(dentry, authfiles, + RPCAUTH_info, RPCAUTH_EOF); + if (error) + goto err_depopulate; +out: + up(&dir->i_sem); + rpc_release_path(&nd); + return dentry; +err_depopulate: + rpc_depopulate(dentry); + __rpc_rmdir(dir, dentry); +err_dput: + dput(dentry); + printk(KERN_WARNING "%s: %s() failed to create directory %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, error); + dentry = ERR_PTR(error); + goto out; +} + +int +rpc_rmdir(char *path) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + rpc_depopulate(dentry); + error = __rpc_rmdir(dir, dentry); + dput(dentry); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; +} + +struct dentry * +rpc_mkpipe(char *path, void *private, struct rpc_pipe_ops *ops, int flags) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir, *inode; + struct rpc_inode *rpci; + + dentry = rpc_lookup_negative(path, &nd); + if (IS_ERR(dentry)) + return dentry; + dir = nd.dentry->d_inode; + inode = rpc_get_inode(dir->i_sb, S_IFSOCK | S_IRUSR | S_IWUSR); + if (!inode) + goto err_dput; + inode->i_ino = iunique(dir->i_sb, 100); + inode->i_fop = &rpc_pipe_fops; + d_instantiate(dentry, inode); + rpci = RPC_I(inode); + rpci->private = private; + rpci->flags = flags; + rpci->ops = ops; + inode_dir_notify(dir, DN_CREATE); +out: + up(&dir->i_sem); + rpc_release_path(&nd); + return dentry; +err_dput: + dput(dentry); + dentry = ERR_PTR(-ENOMEM); + printk(KERN_WARNING "%s: %s() failed to create pipe %s (errno = %d)\n", + __FILE__, __FUNCTION__, path, -ENOMEM); + goto out; +} + +int +rpc_unlink(char *path) +{ + struct nameidata nd; + struct dentry *dentry; + struct inode *dir; + int error; + + if ((error = rpc_lookup_parent(path, &nd)) != 0) + return error; + dir = nd.dentry->d_inode; + down(&dir->i_sem); + dentry = lookup_hash(&nd.last, nd.dentry); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_release; + } + d_drop(dentry); + if (dentry->d_inode) { + rpc_close_pipes(dentry->d_inode); + rpc_inode_setowner(dentry->d_inode, NULL); + error = simple_unlink(dir, dentry); + } + dput(dentry); + inode_dir_notify(dir, DN_DELETE); +out_release: + up(&dir->i_sem); + rpc_release_path(&nd); + return error; +} + +/* + * populate the filesystem + */ +static struct super_operations s_ops = { + .alloc_inode = rpc_alloc_inode, + .destroy_inode = rpc_destroy_inode, + .statfs = simple_statfs, +}; + +#define RPCAUTH_GSSMAGIC 0x67596969 + +static int +rpc_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + struct dentry *root; + + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = RPCAUTH_GSSMAGIC; + sb->s_op = &s_ops; + sb->s_time_gran = 1; + + inode = rpc_get_inode(sb, S_IFDIR | 0755); + if (!inode) + return -ENOMEM; + root = d_alloc_root(inode); + if (!root) { + iput(inode); + return -ENOMEM; + } + if (rpc_populate(root, files, RPCAUTH_Root + 1, RPCAUTH_RootEOF)) + goto out; + sb->s_root = root; + return 0; +out: + d_genocide(root); + dput(root); + return -ENOMEM; +} + +static struct super_block * +rpc_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return get_sb_single(fs_type, flags, data, rpc_fill_super); +} + +static 
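/*
 * rpc_fill_super() stamps the superblock with RPCAUTH_GSSMAGIC, so user
 * space can check via statfs() that a directory really sits on rpc_pipefs.
 * A small sketch of that check; the mount point is an assumed example.
 */
#include <stdio.h>
#include <sys/vfs.h>

#define EX_RPCAUTH_GSSMAGIC 0x67596969UL    /* value set in rpc_fill_super() */

int main(void)
{
    struct statfs st;
    const char *mnt = "/var/lib/nfs/rpc_pipefs";    /* assumed mount point */

    if (statfs(mnt, &st) < 0) {
        perror("statfs");
        return 1;
    }
    printf("%s is %srpc_pipefs (f_type 0x%lx)\n", mnt,
           (unsigned long)st.f_type == EX_RPCAUTH_GSSMAGIC ? "" : "NOT ",
           (unsigned long)st.f_type);
    return 0;
}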
struct file_system_type rpc_pipe_fs_type = { + .owner = THIS_MODULE, + .name = "rpc_pipefs", + .get_sb = rpc_get_sb, + .kill_sb = kill_litter_super, +}; + +static void +init_once(void * foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct rpc_inode *rpci = (struct rpc_inode *) foo; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) { + inode_init_once(&rpci->vfs_inode); + rpci->private = NULL; + rpci->nreaders = 0; + rpci->nwriters = 0; + INIT_LIST_HEAD(&rpci->in_upcall); + INIT_LIST_HEAD(&rpci->pipe); + rpci->pipelen = 0; + init_waitqueue_head(&rpci->waitq); + INIT_WORK(&rpci->queue_timeout, rpc_timeout_upcall_queue, rpci); + rpci->ops = NULL; + } +} + +int register_rpc_pipefs(void) +{ + rpc_inode_cachep = kmem_cache_create("rpc_inode_cache", + sizeof(struct rpc_inode), + 0, SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, + init_once, NULL); + if (!rpc_inode_cachep) + return -ENOMEM; + register_filesystem(&rpc_pipe_fs_type); + return 0; +} + +void unregister_rpc_pipefs(void) +{ + if (kmem_cache_destroy(rpc_inode_cachep)) + printk(KERN_WARNING "RPC: unable to free inode cache\n"); + unregister_filesystem(&rpc_pipe_fs_type); +} diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c new file mode 100644 index 000000000000..c06614d0e31d --- /dev/null +++ b/net/sunrpc/sched.c @@ -0,0 +1,1119 @@ +/* + * linux/net/sunrpc/sched.c + * + * Scheduling for synchronous and asynchronous RPC requests. + * + * Copyright (C) 1996 Olaf Kirch, + * + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef RPC_DEBUG +#define RPCDBG_FACILITY RPCDBG_SCHED +#define RPC_TASK_MAGIC_ID 0xf00baa +static int rpc_task_id; +#endif + +/* + * RPC slabs and memory pools + */ +#define RPC_BUFFER_MAXSIZE (2048) +#define RPC_BUFFER_POOLSIZE (8) +#define RPC_TASK_POOLSIZE (8) +static kmem_cache_t *rpc_task_slabp; +static kmem_cache_t *rpc_buffer_slabp; +static mempool_t *rpc_task_mempool; +static mempool_t *rpc_buffer_mempool; + +static void __rpc_default_timer(struct rpc_task *task); +static void rpciod_killall(void); +static void rpc_free(struct rpc_task *task); + +static void rpc_async_schedule(void *); + +/* + * RPC tasks that create another task (e.g. for contacting the portmapper) + * will wait on this queue for their child's completion + */ +static RPC_WAITQ(childq, "childq"); + +/* + * RPC tasks sit here while waiting for conditions to improve. + */ +static RPC_WAITQ(delay_queue, "delayq"); + +/* + * All RPC tasks are linked into this list + */ +static LIST_HEAD(all_tasks); + +/* + * rpciod-related stuff + */ +static DECLARE_MUTEX(rpciod_sema); +static unsigned int rpciod_users; +static struct workqueue_struct *rpciod_workqueue; + +/* + * Spinlock for other critical sections of code. + */ +static DEFINE_SPINLOCK(rpc_sched_lock); + +/* + * Disable the timer for a given RPC task. Should be called with + * queue->lock and bh_disabled in order to avoid races within + * rpc_run_timer(). + */ +static inline void +__rpc_disable_timer(struct rpc_task *task) +{ + dprintk("RPC: %4d disabling timer\n", task->tk_pid); + task->tk_timeout_fn = NULL; + task->tk_timeout = 0; +} + +/* + * Run a timeout function. + * We use the callback in order to allow __rpc_wake_up_task() + * and friends to disable the timer synchronously on SMP systems + * without calling del_timer_sync(). 
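/*
 * register_rpc_pipefs() creates a slab cache whose constructor (init_once)
 * runs once per object, when the slab page is first populated, not on every
 * allocation; a freed rpc_inode therefore comes back with its lists and wait
 * queue still initialised.  A tiny user-space pool that mimics that
 * "construct once, reuse many times" contract (illustrative only).
 */
#include <stdio.h>

struct obj {
    int constructed;    /* survives free / re-alloc, like the ctor-set fields */
    int users;
};

#define POOL_SIZE 4
static struct obj pool[POOL_SIZE];
static int in_use[POOL_SIZE];

static void init_once_obj(struct obj *o)
{
    o->constructed = 1;     /* one-time setup lives here */
    o->users = 0;
}

static struct obj *cache_alloc(void)
{
    for (int i = 0; i < POOL_SIZE; i++) {
        if (!in_use[i]) {
            if (!pool[i].constructed)
                init_once_obj(&pool[i]);    /* first use of this slot only */
            in_use[i] = 1;
            return &pool[i];
        }
    }
    return NULL;
}

static void cache_free(struct obj *o)
{
    in_use[o - pool] = 0;   /* object stays constructed for the next user */
}

int main(void)
{
    struct obj *a = cache_alloc();
    cache_free(a);
    struct obj *b = cache_alloc();          /* same slot, constructor not re-run */
    printf("reused slot, constructed=%d\n", b->constructed);
    return 0;
}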
The latter could cause a + * deadlock if called while we're holding spinlocks... + */ +static void rpc_run_timer(struct rpc_task *task) +{ + void (*callback)(struct rpc_task *); + + callback = task->tk_timeout_fn; + task->tk_timeout_fn = NULL; + if (callback && RPC_IS_QUEUED(task)) { + dprintk("RPC: %4d running timer\n", task->tk_pid); + callback(task); + } + smp_mb__before_clear_bit(); + clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + smp_mb__after_clear_bit(); +} + +/* + * Set up a timer for the current task. + */ +static inline void +__rpc_add_timer(struct rpc_task *task, rpc_action timer) +{ + if (!task->tk_timeout) + return; + + dprintk("RPC: %4d setting alarm for %lu ms\n", + task->tk_pid, task->tk_timeout * 1000 / HZ); + + if (timer) + task->tk_timeout_fn = timer; + else + task->tk_timeout_fn = __rpc_default_timer; + set_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate); + mod_timer(&task->tk_timer, jiffies + task->tk_timeout); +} + +/* + * Delete any timer for the current task. Because we use del_timer_sync(), + * this function should never be called while holding queue->lock. + */ +static void +rpc_delete_timer(struct rpc_task *task) +{ + if (RPC_IS_QUEUED(task)) + return; + if (test_and_clear_bit(RPC_TASK_HAS_TIMER, &task->tk_runstate)) { + del_singleshot_timer_sync(&task->tk_timer); + dprintk("RPC: %4d deleting timer\n", task->tk_pid); + } +} + +/* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + INIT_LIST_HEAD(&task->u.tk_wait.links); + q = &queue->tasks[task->tk_priority]; + if (unlikely(task->tk_priority > queue->maxpriority)) + q = &queue->tasks[queue->maxpriority]; + list_for_each_entry(t, q, u.tk_wait.list) { + if (t->tk_cookie == task->tk_cookie) { + list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + return; + } + } + list_add_tail(&task->u.tk_wait.list, q); +} + +/* + * Add new request to wait queue. + * + * Swapper tasks always get inserted at the head of the queue. + * This should avoid many nasty memory deadlocks and hopefully + * improve overall performance. + * Everyone else gets appended to the queue to ensure proper FIFO behavior. + */ +static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, struct rpc_task *task) +{ + BUG_ON (RPC_IS_QUEUED(task)); + + if (RPC_IS_PRIORITY(queue)) + __rpc_add_wait_queue_priority(queue, task); + else if (RPC_IS_SWAPPER(task)) + list_add(&task->u.tk_wait.list, &queue->tasks[0]); + else + list_add_tail(&task->u.tk_wait.list, &queue->tasks[0]); + task->u.tk_wait.rpc_waitq = queue; + rpc_set_queued(task); + + dprintk("RPC: %4d added to queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +} + +/* + * Remove request from a priority queue. + */ +static void __rpc_remove_wait_queue_priority(struct rpc_task *task) +{ + struct rpc_task *t; + + if (!list_empty(&task->u.tk_wait.links)) { + t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); + list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); + list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); + } + list_del(&task->u.tk_wait.list); +} + +/* + * Remove request from queue. + * Note: must be called with spin lock held. 
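/*
 * __rpc_add_wait_queue_priority() keeps one list entry per cookie and hangs
 * later tasks with the same cookie off that entry's ->links list, so a whole
 * batch from one submitter can be woken together.  A compact user-space
 * model of that two-level structure; list ordering is simplified here (the
 * kernel appends to preserve FIFO order), and the names are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>

struct ex_task {
    unsigned long cookie;
    struct ex_task *next_leader;    /* main per-priority list */
    struct ex_task *next_sibling;   /* tasks sharing this leader's cookie */
};

static struct ex_task *queue;       /* a single priority level, for brevity */

static void enqueue(struct ex_task *t)
{
    for (struct ex_task *l = queue; l; l = l->next_leader) {
        if (l->cookie == t->cookie) {       /* batch with an existing leader */
            t->next_sibling = l->next_sibling;
            l->next_sibling = t;
            return;
        }
    }
    t->next_leader = queue;                 /* new cookie: becomes a leader */
    queue = t;
}

int main(void)
{
    unsigned long cookies[] = { 1, 2, 1, 1, 2 };
    for (int i = 0; i < 5; i++) {
        struct ex_task *t = calloc(1, sizeof(*t));
        t->cookie = cookies[i];
        enqueue(t);
    }
    for (struct ex_task *l = queue; l; l = l->next_leader) {
        int n = 1;
        for (struct ex_task *s = l->next_sibling; s; s = s->next_sibling)
            n++;
        printf("cookie %lu: %d queued task(s)\n", l->cookie, n);
    }
    return 0;
}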
+ */ +static void __rpc_remove_wait_queue(struct rpc_task *task) +{ + struct rpc_wait_queue *queue; + queue = task->u.tk_wait.rpc_waitq; + + if (RPC_IS_PRIORITY(queue)) + __rpc_remove_wait_queue_priority(task); + else + list_del(&task->u.tk_wait.list); + dprintk("RPC: %4d removed from queue %p \"%s\"\n", + task->tk_pid, queue, rpc_qname(queue)); +} + +static inline void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) +{ + queue->priority = priority; + queue->count = 1 << (priority * 2); +} + +static inline void rpc_set_waitqueue_cookie(struct rpc_wait_queue *queue, unsigned long cookie) +{ + queue->cookie = cookie; + queue->nr = RPC_BATCH_COUNT; +} + +static inline void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) +{ + rpc_set_waitqueue_priority(queue, queue->maxpriority); + rpc_set_waitqueue_cookie(queue, 0); +} + +static void __rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname, int maxprio) +{ + int i; + + spin_lock_init(&queue->lock); + for (i = 0; i < ARRAY_SIZE(queue->tasks); i++) + INIT_LIST_HEAD(&queue->tasks[i]); + queue->maxpriority = maxprio; + rpc_reset_waitqueue_priority(queue); +#ifdef RPC_DEBUG + queue->name = qname; +#endif +} + +void rpc_init_priority_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, RPC_PRIORITY_HIGH); +} + +void rpc_init_wait_queue(struct rpc_wait_queue *queue, const char *qname) +{ + __rpc_init_priority_wait_queue(queue, qname, 0); +} +EXPORT_SYMBOL(rpc_init_wait_queue); + +/* + * Make an RPC task runnable. + * + * Note: If the task is ASYNC, this must be called with + * the spinlock held to protect the wait queue operation. + */ +static void rpc_make_runnable(struct rpc_task *task) +{ + int do_ret; + + BUG_ON(task->tk_timeout_fn); + do_ret = rpc_test_and_set_running(task); + rpc_clear_queued(task); + if (do_ret) + return; + if (RPC_IS_ASYNC(task)) { + int status; + + INIT_WORK(&task->u.tk_work, rpc_async_schedule, (void *)task); + status = queue_work(task->tk_workqueue, &task->u.tk_work); + if (status < 0) { + printk(KERN_WARNING "RPC: failed to add task to queue: error: %d!\n", status); + task->tk_status = status; + return; + } + } else + wake_up(&task->u.tk_wait.waitq); +} + +/* + * Place a newly initialized task on the workqueue. + */ +static inline void +rpc_schedule_run(struct rpc_task *task) +{ + /* Don't run a child twice! */ + if (RPC_IS_ACTIVATED(task)) + return; + task->tk_active = 1; + rpc_make_runnable(task); +} + +/* + * Prepare for sleeping on a wait queue. + * By always appending tasks to the list we ensure FIFO behavior. + * NB: An RPC task will only receive interrupt-driven events as long + * as it's on a wait queue. + */ +static void __rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) +{ + dprintk("RPC: %4d sleep_on(queue \"%s\" time %ld)\n", task->tk_pid, + rpc_qname(q), jiffies); + + if (!RPC_IS_ASYNC(task) && !RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive synchronous task put to sleep!\n"); + return; + } + + /* Mark the task as being activated if so needed */ + if (!RPC_IS_ACTIVATED(task)) + task->tk_active = 1; + + __rpc_add_wait_queue(q, task); + + BUG_ON(task->tk_callback != NULL); + task->tk_callback = action; + __rpc_add_timer(task, timer); +} + +void rpc_sleep_on(struct rpc_wait_queue *q, struct rpc_task *task, + rpc_action action, rpc_action timer) +{ + /* + * Protect the queue operations. 
+ */ + spin_lock_bh(&q->lock); + __rpc_sleep_on(q, task, action, timer); + spin_unlock_bh(&q->lock); +} + +/** + * __rpc_do_wake_up_task - wake up a single rpc_task + * @task: task to be woken up + * + * Caller must hold queue->lock, and have cleared the task queued flag. + */ +static void __rpc_do_wake_up_task(struct rpc_task *task) +{ + dprintk("RPC: %4d __rpc_wake_up_task (now %ld)\n", task->tk_pid, jiffies); + +#ifdef RPC_DEBUG + BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +#endif + /* Has the task been executed yet? If not, we cannot wake it up! */ + if (!RPC_IS_ACTIVATED(task)) { + printk(KERN_ERR "RPC: Inactive task (%p) being woken up!\n", task); + return; + } + + __rpc_disable_timer(task); + __rpc_remove_wait_queue(task); + + rpc_make_runnable(task); + + dprintk("RPC: __rpc_wake_up_task done\n"); +} + +/* + * Wake up the specified task + */ +static void __rpc_wake_up_task(struct rpc_task *task) +{ + if (rpc_start_wakeup(task)) { + if (RPC_IS_QUEUED(task)) + __rpc_do_wake_up_task(task); + rpc_finish_wakeup(task); + } +} + +/* + * Default timeout handler if none specified by user + */ +static void +__rpc_default_timer(struct rpc_task *task) +{ + dprintk("RPC: %d timeout (default timer)\n", task->tk_pid); + task->tk_status = -ETIMEDOUT; + rpc_wake_up_task(task); +} + +/* + * Wake up the specified task + */ +void rpc_wake_up_task(struct rpc_task *task) +{ + if (rpc_start_wakeup(task)) { + if (RPC_IS_QUEUED(task)) { + struct rpc_wait_queue *queue = task->u.tk_wait.rpc_waitq; + + spin_lock_bh(&queue->lock); + __rpc_do_wake_up_task(task); + spin_unlock_bh(&queue->lock); + } + rpc_finish_wakeup(task); + } +} + +/* + * Wake up the next task on a priority queue. + */ +static struct rpc_task * __rpc_wake_up_next_priority(struct rpc_wait_queue *queue) +{ + struct list_head *q; + struct rpc_task *task; + + /* + * Service a batch of tasks from a single cookie. + */ + q = &queue->tasks[queue->priority]; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + if (queue->cookie == task->tk_cookie) { + if (--queue->nr) + goto out; + list_move_tail(&task->u.tk_wait.list, q); + } + /* + * Check if we need to switch queues. + */ + if (--queue->count) + goto new_cookie; + } + + /* + * Service the next queue. + */ + do { + if (q == &queue->tasks[0]) + q = &queue->tasks[queue->maxpriority]; + else + q = q - 1; + if (!list_empty(q)) { + task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + goto new_queue; + } + } while (q != &queue->tasks[queue->priority]); + + rpc_reset_waitqueue_priority(queue); + return NULL; + +new_queue: + rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); +new_cookie: + rpc_set_waitqueue_cookie(queue, task->tk_cookie); +out: + __rpc_wake_up_task(task); + return task; +} + +/* + * Wake up the next task on the wait queue. 
+ */ +struct rpc_task * rpc_wake_up_next(struct rpc_wait_queue *queue) +{ + struct rpc_task *task = NULL; + + dprintk("RPC: wake_up_next(%p \"%s\")\n", queue, rpc_qname(queue)); + spin_lock_bh(&queue->lock); + if (RPC_IS_PRIORITY(queue)) + task = __rpc_wake_up_next_priority(queue); + else { + task_for_first(task, &queue->tasks[0]) + __rpc_wake_up_task(task); + } + spin_unlock_bh(&queue->lock); + + return task; +} + +/** + * rpc_wake_up - wake up all rpc_tasks + * @queue: rpc_wait_queue on which the tasks are sleeping + * + * Grabs queue->lock + */ +void rpc_wake_up(struct rpc_wait_queue *queue) +{ + struct rpc_task *task; + + struct list_head *head; + spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } + spin_unlock_bh(&queue->lock); +} + +/** + * rpc_wake_up_status - wake up all rpc_tasks and set their status value. + * @queue: rpc_wait_queue on which the tasks are sleeping + * @status: status value to set + * + * Grabs queue->lock + */ +void rpc_wake_up_status(struct rpc_wait_queue *queue, int status) +{ + struct list_head *head; + struct rpc_task *task; + + spin_lock_bh(&queue->lock); + head = &queue->tasks[queue->maxpriority]; + for (;;) { + while (!list_empty(head)) { + task = list_entry(head->next, struct rpc_task, u.tk_wait.list); + task->tk_status = status; + __rpc_wake_up_task(task); + } + if (head == &queue->tasks[0]) + break; + head--; + } + spin_unlock_bh(&queue->lock); +} + +/* + * Run a task at a later time + */ +static void __rpc_atrun(struct rpc_task *); +void +rpc_delay(struct rpc_task *task, unsigned long delay) +{ + task->tk_timeout = delay; + rpc_sleep_on(&delay_queue, task, NULL, __rpc_atrun); +} + +static void +__rpc_atrun(struct rpc_task *task) +{ + task->tk_status = 0; + rpc_wake_up_task(task); +} + +/* + * This is the RPC `scheduler' (or rather, the finite state machine). + */ +static int __rpc_execute(struct rpc_task *task) +{ + int status = 0; + + dprintk("RPC: %4d rpc_execute flgs %x\n", + task->tk_pid, task->tk_flags); + + BUG_ON(RPC_IS_QUEUED(task)); + + restarted: + while (1) { + /* + * Garbage collection of pending timers... + */ + rpc_delete_timer(task); + + /* + * Execute any pending callback. + */ + if (RPC_DO_CALLBACK(task)) { + /* Define a callback save pointer */ + void (*save_callback)(struct rpc_task *); + + /* + * If a callback exists, save it, reset it, + * call it. + * The save is needed to stop from resetting + * another callback set within the callback handler + * - Dave + */ + save_callback=task->tk_callback; + task->tk_callback=NULL; + lock_kernel(); + save_callback(task); + unlock_kernel(); + } + + /* + * Perform the next FSM step. + * tk_action may be NULL when the task has been killed + * by someone else. + */ + if (!RPC_IS_QUEUED(task)) { + if (!task->tk_action) + break; + lock_kernel(); + task->tk_action(task); + unlock_kernel(); + } + + /* + * Lockless check for whether task is sleeping or not. + */ + if (!RPC_IS_QUEUED(task)) + continue; + rpc_clear_running(task); + if (RPC_IS_ASYNC(task)) { + /* Careful! we may have raced... 
*/ + if (RPC_IS_QUEUED(task)) + return 0; + if (rpc_test_and_set_running(task)) + return 0; + continue; + } + + /* sync task: sleep here */ + dprintk("RPC: %4d sync task going to sleep\n", task->tk_pid); + if (RPC_TASK_UNINTERRUPTIBLE(task)) { + __wait_event(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task)); + } else { + __wait_event_interruptible(task->u.tk_wait.waitq, !RPC_IS_QUEUED(task), status); + /* + * When a sync task receives a signal, it exits with + * -ERESTARTSYS. In order to catch any callbacks that + * clean up after sleeping on some queue, we don't + * break the loop here, but go around once more. + */ + if (status == -ERESTARTSYS) { + dprintk("RPC: %4d got signal\n", task->tk_pid); + task->tk_flags |= RPC_TASK_KILLED; + rpc_exit(task, -ERESTARTSYS); + rpc_wake_up_task(task); + } + } + rpc_set_running(task); + dprintk("RPC: %4d sync task resuming\n", task->tk_pid); + } + + if (task->tk_exit) { + lock_kernel(); + task->tk_exit(task); + unlock_kernel(); + /* If tk_action is non-null, the user wants us to restart */ + if (task->tk_action) { + if (!RPC_ASSASSINATED(task)) { + /* Release RPC slot and buffer memory */ + if (task->tk_rqstp) + xprt_release(task); + rpc_free(task); + goto restarted; + } + printk(KERN_ERR "RPC: dead task tries to walk away.\n"); + } + } + + dprintk("RPC: %4d exit() = %d\n", task->tk_pid, task->tk_status); + status = task->tk_status; + + /* Release all resources associated with the task */ + rpc_release_task(task); + return status; +} + +/* + * User-visible entry point to the scheduler. + * + * This may be called recursively if e.g. an async NFS task updates + * the attributes and finds that dirty pages must be flushed. + * NOTE: Upon exit of this function the task is guaranteed to be + * released. In particular note that tk_release() will have + * been called, so your task memory may have been freed. + */ +int +rpc_execute(struct rpc_task *task) +{ + BUG_ON(task->tk_active); + + task->tk_active = 1; + rpc_set_running(task); + return __rpc_execute(task); +} + +static void rpc_async_schedule(void *arg) +{ + __rpc_execute((struct rpc_task *)arg); +} + +/* + * Allocate memory for RPC purposes. + * + * We try to ensure that some NFS reads and writes can always proceed + * by using a mempool when allocating 'small' buffers. + * In order to avoid memory starvation triggering more writebacks of + * NFS requests, we use GFP_NOFS rather than GFP_KERNEL. 
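/*
 * __rpc_execute() above is a small finite state machine: each tk_action
 * either puts the task to sleep on a queue or points tk_action at the next
 * step, and a NULL action ends the task.  A stripped-down, single-threaded
 * sketch of that driver loop, with no queues, timers or signals; everything
 * here is illustrative.
 */
#include <stdio.h>

struct mini_task;
typedef void (*mini_action)(struct mini_task *);

struct mini_task {
    mini_action action;     /* plays the role of tk_action */
    int status;             /* plays the role of tk_status */
    int step;
};

static void step_done(struct mini_task *t)
{
    t->status = 0;
    t->action = NULL;       /* NULL action terminates the loop */
}

static void step_work(struct mini_task *t)
{
    printf("step %d\n", t->step++);
    t->action = (t->step < 3) ? step_work : step_done;
}

static int mini_execute(struct mini_task *t)
{
    while (t->action)       /* run FSM steps until the task finishes */
        t->action(t);
    return t->status;
}

int main(void)
{
    struct mini_task t = { .action = step_work };
    return mini_execute(&t);
}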
+ */ +void * +rpc_malloc(struct rpc_task *task, size_t size) +{ + int gfp; + + if (task->tk_flags & RPC_TASK_SWAPPER) + gfp = GFP_ATOMIC; + else + gfp = GFP_NOFS; + + if (size > RPC_BUFFER_MAXSIZE) { + task->tk_buffer = kmalloc(size, gfp); + if (task->tk_buffer) + task->tk_bufsize = size; + } else { + task->tk_buffer = mempool_alloc(rpc_buffer_mempool, gfp); + if (task->tk_buffer) + task->tk_bufsize = RPC_BUFFER_MAXSIZE; + } + return task->tk_buffer; +} + +static void +rpc_free(struct rpc_task *task) +{ + if (task->tk_buffer) { + if (task->tk_bufsize == RPC_BUFFER_MAXSIZE) + mempool_free(task->tk_buffer, rpc_buffer_mempool); + else + kfree(task->tk_buffer); + task->tk_buffer = NULL; + task->tk_bufsize = 0; + } +} + +/* + * Creation and deletion of RPC task structures + */ +void rpc_init_task(struct rpc_task *task, struct rpc_clnt *clnt, rpc_action callback, int flags) +{ + memset(task, 0, sizeof(*task)); + init_timer(&task->tk_timer); + task->tk_timer.data = (unsigned long) task; + task->tk_timer.function = (void (*)(unsigned long)) rpc_run_timer; + task->tk_client = clnt; + task->tk_flags = flags; + task->tk_exit = callback; + + /* Initialize retry counters */ + task->tk_garb_retry = 2; + task->tk_cred_retry = 2; + + task->tk_priority = RPC_PRIORITY_NORMAL; + task->tk_cookie = (unsigned long)current; + + /* Initialize workqueue for async tasks */ + task->tk_workqueue = rpciod_workqueue; + if (!RPC_IS_ASYNC(task)) + init_waitqueue_head(&task->u.tk_wait.waitq); + + if (clnt) { + atomic_inc(&clnt->cl_users); + if (clnt->cl_softrtry) + task->tk_flags |= RPC_TASK_SOFT; + if (!clnt->cl_intr) + task->tk_flags |= RPC_TASK_NOINTR; + } + +#ifdef RPC_DEBUG + task->tk_magic = RPC_TASK_MAGIC_ID; + task->tk_pid = rpc_task_id++; +#endif + /* Add to global list of all tasks */ + spin_lock(&rpc_sched_lock); + list_add_tail(&task->tk_task, &all_tasks); + spin_unlock(&rpc_sched_lock); + + dprintk("RPC: %4d new task procpid %d\n", task->tk_pid, + current->pid); +} + +static struct rpc_task * +rpc_alloc_task(void) +{ + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); +} + +static void +rpc_default_free_task(struct rpc_task *task) +{ + dprintk("RPC: %4d freeing task\n", task->tk_pid); + mempool_free(task, rpc_task_mempool); +} + +/* + * Create a new task for the specified client. We have to + * clean up after an allocation failure, as the client may + * have specified "oneshot". + */ +struct rpc_task * +rpc_new_task(struct rpc_clnt *clnt, rpc_action callback, int flags) +{ + struct rpc_task *task; + + task = rpc_alloc_task(); + if (!task) + goto cleanup; + + rpc_init_task(task, clnt, callback, flags); + + /* Replace tk_release */ + task->tk_release = rpc_default_free_task; + + dprintk("RPC: %4d allocated task\n", task->tk_pid); + task->tk_flags |= RPC_TASK_DYNAMIC; +out: + return task; + +cleanup: + /* Check whether to release the client */ + if (clnt) { + printk("rpc_new_task: failed, users=%d, oneshot=%d\n", + atomic_read(&clnt->cl_users), clnt->cl_oneshot); + atomic_inc(&clnt->cl_users); /* pretend we were used ... 
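/*
 * rpc_malloc() serves buffers up to RPC_BUFFER_MAXSIZE from a mempool so
 * small NFS requests can always make progress, and falls back to a plain
 * allocation for oversized buffers.  A user-space analogue of that
 * "reserved pool for the common size, heap for the rest" policy; note that
 * rpc_free() keys off tk_bufsize instead of a pointer check, and the real
 * mempool can sleep for a slot rather than fail (illustrative only).
 */
#include <stdio.h>
#include <stdlib.h>

#define BUF_MAX    2048     /* mirrors RPC_BUFFER_MAXSIZE */
#define POOL_SLOTS    8     /* mirrors RPC_BUFFER_POOLSIZE */

static char pool[POOL_SLOTS][BUF_MAX];
static int pool_used[POOL_SLOTS];

static void *buf_alloc(size_t size, size_t *got)
{
    if (size > BUF_MAX) {                   /* large request: ordinary allocation */
        *got = size;
        return malloc(size);
    }
    for (int i = 0; i < POOL_SLOTS; i++)    /* small request: hand out a reserved slot */
        if (!pool_used[i]) {
            pool_used[i] = 1;
            *got = BUF_MAX;                 /* pool buffers are always full-size */
            return pool[i];
        }
    *got = 0;
    return NULL;                            /* reserve exhausted */
}

static void buf_free(void *p)
{
    if (p >= (void *)pool && p < (void *)(pool + POOL_SLOTS))
        pool_used[((char (*)[BUF_MAX])p) - pool] = 0;   /* back to the reserve */
    else
        free(p);
}

int main(void)
{
    size_t got;
    void *small = buf_alloc(512, &got);
    printf("small buffer got %zu bytes\n", got);
    buf_free(small);
    void *big = buf_alloc(16384, &got);
    printf("big buffer got %zu bytes\n", got);
    buf_free(big);
    return 0;
}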
*/ + rpc_release_client(clnt); + } + goto out; +} + +void rpc_release_task(struct rpc_task *task) +{ + dprintk("RPC: %4d release task\n", task->tk_pid); + +#ifdef RPC_DEBUG + BUG_ON(task->tk_magic != RPC_TASK_MAGIC_ID); +#endif + + /* Remove from global task list */ + spin_lock(&rpc_sched_lock); + list_del(&task->tk_task); + spin_unlock(&rpc_sched_lock); + + BUG_ON (RPC_IS_QUEUED(task)); + task->tk_active = 0; + + /* Synchronously delete any running timer */ + rpc_delete_timer(task); + + /* Release resources */ + if (task->tk_rqstp) + xprt_release(task); + if (task->tk_msg.rpc_cred) + rpcauth_unbindcred(task); + rpc_free(task); + if (task->tk_client) { + rpc_release_client(task->tk_client); + task->tk_client = NULL; + } + +#ifdef RPC_DEBUG + task->tk_magic = 0; +#endif + if (task->tk_release) + task->tk_release(task); +} + +/** + * rpc_find_parent - find the parent of a child task. + * @child: child task + * + * Checks that the parent task is still sleeping on the + * queue 'childq'. If so returns a pointer to the parent. + * Upon failure returns NULL. + * + * Caller must hold childq.lock + */ +static inline struct rpc_task *rpc_find_parent(struct rpc_task *child) +{ + struct rpc_task *task, *parent; + struct list_head *le; + + parent = (struct rpc_task *) child->tk_calldata; + task_for_each(task, le, &childq.tasks[0]) + if (task == parent) + return parent; + + return NULL; +} + +static void rpc_child_exit(struct rpc_task *child) +{ + struct rpc_task *parent; + + spin_lock_bh(&childq.lock); + if ((parent = rpc_find_parent(child)) != NULL) { + parent->tk_status = child->tk_status; + __rpc_wake_up_task(parent); + } + spin_unlock_bh(&childq.lock); +} + +/* + * Note: rpc_new_task releases the client after a failure. + */ +struct rpc_task * +rpc_new_child(struct rpc_clnt *clnt, struct rpc_task *parent) +{ + struct rpc_task *task; + + task = rpc_new_task(clnt, NULL, RPC_TASK_ASYNC | RPC_TASK_CHILD); + if (!task) + goto fail; + task->tk_exit = rpc_child_exit; + task->tk_calldata = parent; + return task; + +fail: + parent->tk_status = -ENOMEM; + return NULL; +} + +void rpc_run_child(struct rpc_task *task, struct rpc_task *child, rpc_action func) +{ + spin_lock_bh(&childq.lock); + /* N.B. Is it possible for the child to have already finished? */ + __rpc_sleep_on(&childq, task, func, NULL); + rpc_schedule_run(child); + spin_unlock_bh(&childq.lock); +} + +/* + * Kill all tasks for the given client. + * XXX: kill their descendants as well? + */ +void rpc_killall_tasks(struct rpc_clnt *clnt) +{ + struct rpc_task *rovr; + struct list_head *le; + + dprintk("RPC: killing all tasks for client %p\n", clnt); + + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&rpc_sched_lock); + alltask_for_each(rovr, le, &all_tasks) { + if (! RPC_IS_ACTIVATED(rovr)) + continue; + if (!clnt || rovr->tk_client == clnt) { + rovr->tk_flags |= RPC_TASK_KILLED; + rpc_exit(rovr, -EIO); + rpc_wake_up_task(rovr); + } + } + spin_unlock(&rpc_sched_lock); +} + +static DECLARE_MUTEX_LOCKED(rpciod_running); + +static void rpciod_killall(void) +{ + unsigned long flags; + + while (!list_empty(&all_tasks)) { + clear_thread_flag(TIF_SIGPENDING); + rpc_killall_tasks(NULL); + flush_workqueue(rpciod_workqueue); + if (!list_empty(&all_tasks)) { + dprintk("rpciod_killall: waiting for tasks to exit\n"); + yield(); + } + } + + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); +} + +/* + * Start up the rpciod process if it's not already running. 
+ */ +int +rpciod_up(void) +{ + struct workqueue_struct *wq; + int error = 0; + + down(&rpciod_sema); + dprintk("rpciod_up: users %d\n", rpciod_users); + rpciod_users++; + if (rpciod_workqueue) + goto out; + /* + * If there's no pid, we should be the first user. + */ + if (rpciod_users > 1) + printk(KERN_WARNING "rpciod_up: no workqueue, %d users??\n", rpciod_users); + /* + * Create the rpciod thread and wait for it to start. + */ + error = -ENOMEM; + wq = create_workqueue("rpciod"); + if (wq == NULL) { + printk(KERN_WARNING "rpciod_up: create workqueue failed, error=%d\n", error); + rpciod_users--; + goto out; + } + rpciod_workqueue = wq; + error = 0; +out: + up(&rpciod_sema); + return error; +} + +void +rpciod_down(void) +{ + down(&rpciod_sema); + dprintk("rpciod_down sema %d\n", rpciod_users); + if (rpciod_users) { + if (--rpciod_users) + goto out; + } else + printk(KERN_WARNING "rpciod_down: no users??\n"); + + if (!rpciod_workqueue) { + dprintk("rpciod_down: Nothing to do!\n"); + goto out; + } + rpciod_killall(); + + destroy_workqueue(rpciod_workqueue); + rpciod_workqueue = NULL; + out: + up(&rpciod_sema); +} + +#ifdef RPC_DEBUG +void rpc_show_tasks(void) +{ + struct list_head *le; + struct rpc_task *t; + + spin_lock(&rpc_sched_lock); + if (list_empty(&all_tasks)) { + spin_unlock(&rpc_sched_lock); + return; + } + printk("-pid- proc flgs status -client- -prog- --rqstp- -timeout " + "-rpcwait -action- --exit--\n"); + alltask_for_each(t, le, &all_tasks) { + const char *rpc_waitq = "none"; + + if (RPC_IS_QUEUED(t)) + rpc_waitq = rpc_qname(t->u.tk_wait.rpc_waitq); + + printk("%05d %04d %04x %06d %8p %6d %8p %08ld %8s %8p %8p\n", + t->tk_pid, + (t->tk_msg.rpc_proc ? t->tk_msg.rpc_proc->p_proc : -1), + t->tk_flags, t->tk_status, + t->tk_client, + (t->tk_client ? t->tk_client->cl_prog : 0), + t->tk_rqstp, t->tk_timeout, + rpc_waitq, + t->tk_action, t->tk_exit); + } + spin_unlock(&rpc_sched_lock); +} +#endif + +void +rpc_destroy_mempool(void) +{ + if (rpc_buffer_mempool) + mempool_destroy(rpc_buffer_mempool); + if (rpc_task_mempool) + mempool_destroy(rpc_task_mempool); + if (rpc_task_slabp && kmem_cache_destroy(rpc_task_slabp)) + printk(KERN_INFO "rpc_task: not all structures were freed\n"); + if (rpc_buffer_slabp && kmem_cache_destroy(rpc_buffer_slabp)) + printk(KERN_INFO "rpc_buffers: not all structures were freed\n"); +} + +int +rpc_init_mempool(void) +{ + rpc_task_slabp = kmem_cache_create("rpc_tasks", + sizeof(struct rpc_task), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!rpc_task_slabp) + goto err_nomem; + rpc_buffer_slabp = kmem_cache_create("rpc_buffers", + RPC_BUFFER_MAXSIZE, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!rpc_buffer_slabp) + goto err_nomem; + rpc_task_mempool = mempool_create(RPC_TASK_POOLSIZE, + mempool_alloc_slab, + mempool_free_slab, + rpc_task_slabp); + if (!rpc_task_mempool) + goto err_nomem; + rpc_buffer_mempool = mempool_create(RPC_BUFFER_POOLSIZE, + mempool_alloc_slab, + mempool_free_slab, + rpc_buffer_slabp); + if (!rpc_buffer_mempool) + goto err_nomem; + return 0; +err_nomem: + rpc_destroy_mempool(); + return -ENOMEM; +} diff --git a/net/sunrpc/stats.c b/net/sunrpc/stats.c new file mode 100644 index 000000000000..9b67dc19944c --- /dev/null +++ b/net/sunrpc/stats.c @@ -0,0 +1,175 @@ +/* + * linux/net/sunrpc/stats.c + * + * procfs-based user access to generic RPC statistics. The stats files + * reside in /proc/net/rpc. + * + * The read routines assume that the buffer passed in is just big enough. 
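/*
 * rpciod_up()/rpciod_down() are just a reference count around one shared
 * worker: the first caller creates the workqueue, the last caller destroys
 * it, all serialised by rpciod_sema.  The same pattern in user-space C with
 * a pthread mutex standing in for the semaphore (illustrative only).
 */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t worker_sem = PTHREAD_MUTEX_INITIALIZER;
static unsigned int worker_users;
static char *worker;    /* stand-in for rpciod_workqueue */

static int worker_up(void)
{
    pthread_mutex_lock(&worker_sem);
    worker_users++;
    if (!worker)
        worker = malloc(1);     /* "create_workqueue" on first use */
    pthread_mutex_unlock(&worker_sem);
    return worker ? 0 : -1;
}

static void worker_down(void)
{
    pthread_mutex_lock(&worker_sem);
    if (worker_users && --worker_users == 0 && worker) {
        free(worker);           /* "destroy_workqueue" when the last user leaves */
        worker = NULL;
    }
    pthread_mutex_unlock(&worker_sem);
}

int main(void)
{
    worker_up();
    worker_up();
    worker_down();              /* one user remains: worker stays alive */
    printf("after one down: worker %s\n", worker ? "alive" : "gone");
    worker_down();
    printf("after last down: worker %s\n", worker ? "alive" : "gone");
    return 0;
}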
+ * If you implement an RPC service that has its own stats routine which + * appends the generic RPC stats, make sure you don't exceed the PAGE_SIZE + * limit. + * + * Copyright (C) 1995, 1996, 1997 Olaf Kirch + */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_MISC + +struct proc_dir_entry *proc_net_rpc = NULL; + +/* + * Get RPC client stats + */ +static int rpc_proc_show(struct seq_file *seq, void *v) { + const struct rpc_stat *statp = seq->private; + const struct rpc_program *prog = statp->program; + int i, j; + + seq_printf(seq, + "net %d %d %d %d\n", + statp->netcnt, + statp->netudpcnt, + statp->nettcpcnt, + statp->nettcpconn); + seq_printf(seq, + "rpc %d %d %d\n", + statp->rpccnt, + statp->rpcretrans, + statp->rpcauthrefresh); + + for (i = 0; i < prog->nrvers; i++) { + const struct rpc_version *vers = prog->version[i]; + if (!vers) + continue; + seq_printf(seq, "proc%d %d", + vers->number, vers->nrprocs); + for (j = 0; j < vers->nrprocs; j++) + seq_printf(seq, " %d", + vers->procs[j].p_count); + seq_putc(seq, '\n'); + } + return 0; +} + +static int rpc_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rpc_proc_show, PDE(inode)->data); +} + +static struct file_operations rpc_proc_fops = { + .owner = THIS_MODULE, + .open = rpc_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +/* + * Get RPC server stats + */ +void svc_seq_show(struct seq_file *seq, const struct svc_stat *statp) { + const struct svc_program *prog = statp->program; + const struct svc_procedure *proc; + const struct svc_version *vers; + int i, j; + + seq_printf(seq, + "net %d %d %d %d\n", + statp->netcnt, + statp->netudpcnt, + statp->nettcpcnt, + statp->nettcpconn); + seq_printf(seq, + "rpc %d %d %d %d %d\n", + statp->rpccnt, + statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt, + statp->rpcbadfmt, + statp->rpcbadauth, + statp->rpcbadclnt); + + for (i = 0; i < prog->pg_nvers; i++) { + if (!(vers = prog->pg_vers[i]) || !(proc = vers->vs_proc)) + continue; + seq_printf(seq, "proc%d %d", i, vers->vs_nproc); + for (j = 0; j < vers->vs_nproc; j++, proc++) + seq_printf(seq, " %d", proc->pc_count); + seq_putc(seq, '\n'); + } +} + +/* + * Register/unregister RPC proc files + */ +static inline struct proc_dir_entry * +do_register(const char *name, void *data, struct file_operations *fops) +{ + struct proc_dir_entry *ent; + + rpc_proc_init(); + dprintk("RPC: registering /proc/net/rpc/%s\n", name); + + ent = create_proc_entry(name, 0, proc_net_rpc); + if (ent) { + ent->proc_fops = fops; + ent->data = data; + } + return ent; +} + +struct proc_dir_entry * +rpc_proc_register(struct rpc_stat *statp) +{ + return do_register(statp->program->name, statp, &rpc_proc_fops); +} + +void +rpc_proc_unregister(const char *name) +{ + remove_proc_entry(name, proc_net_rpc); +} + +struct proc_dir_entry * +svc_proc_register(struct svc_stat *statp, struct file_operations *fops) +{ + return do_register(statp->program->pg_name, statp, fops); +} + +void +svc_proc_unregister(const char *name) +{ + remove_proc_entry(name, proc_net_rpc); +} + +void +rpc_proc_init(void) +{ + dprintk("RPC: registering /proc/net/rpc\n"); + if (!proc_net_rpc) { + struct proc_dir_entry *ent; + ent = proc_mkdir("rpc", proc_net); + if (ent) { + ent->owner = THIS_MODULE; + proc_net_rpc = ent; + } + } +} + +void +rpc_proc_exit(void) +{ + dprintk("RPC: unregistering /proc/net/rpc\n"); + if (proc_net_rpc) { + proc_net_rpc = NULL; + 
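/*
 * rpc_proc_show() emits per-program client statistics as plain "net ...",
 * "rpc ..." and "procN ..." lines under /proc/net/rpc/<program name>.  A
 * small reader that pulls the call and retransmit counters out of the "rpc"
 * line; it assumes the NFS client has registered its stats under the name
 * "nfs" (illustrative only).
 */
#include <stdio.h>

int main(void)
{
    FILE *f = fopen("/proc/net/rpc/nfs", "r");  /* assumed program name */
    char line[512];
    unsigned int calls, retrans, authrefresh;

    if (!f) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), f)) {
        /* matches the format string in rpc_proc_show(): "rpc %d %d %d" */
        if (sscanf(line, "rpc %u %u %u", &calls, &retrans, &authrefresh) == 3)
            printf("calls=%u retrans=%u authrefresh=%u\n",
                   calls, retrans, authrefresh);
    }
    fclose(f);
    return 0;
}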
remove_proc_entry("net/rpc", NULL); + } +} + diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c new file mode 100644 index 000000000000..d4f26bf9e732 --- /dev/null +++ b/net/sunrpc/sunrpc_syms.c @@ -0,0 +1,185 @@ +/* + * linux/net/sunrpc/sunrpc_syms.c + * + * Symbols exported by the sunrpc module. + * + * Copyright (C) 1997 Olaf Kirch + */ + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + + +/* RPC scheduler */ +EXPORT_SYMBOL(rpc_execute); +EXPORT_SYMBOL(rpc_init_task); +EXPORT_SYMBOL(rpc_sleep_on); +EXPORT_SYMBOL(rpc_wake_up_next); +EXPORT_SYMBOL(rpc_wake_up_task); +EXPORT_SYMBOL(rpc_new_child); +EXPORT_SYMBOL(rpc_run_child); +EXPORT_SYMBOL(rpciod_down); +EXPORT_SYMBOL(rpciod_up); +EXPORT_SYMBOL(rpc_new_task); +EXPORT_SYMBOL(rpc_wake_up_status); +EXPORT_SYMBOL(rpc_release_task); + +/* RPC client functions */ +EXPORT_SYMBOL(rpc_create_client); +EXPORT_SYMBOL(rpc_clone_client); +EXPORT_SYMBOL(rpc_destroy_client); +EXPORT_SYMBOL(rpc_shutdown_client); +EXPORT_SYMBOL(rpc_release_client); +EXPORT_SYMBOL(rpc_killall_tasks); +EXPORT_SYMBOL(rpc_call_sync); +EXPORT_SYMBOL(rpc_call_async); +EXPORT_SYMBOL(rpc_call_setup); +EXPORT_SYMBOL(rpc_clnt_sigmask); +EXPORT_SYMBOL(rpc_clnt_sigunmask); +EXPORT_SYMBOL(rpc_delay); +EXPORT_SYMBOL(rpc_restart_call); +EXPORT_SYMBOL(rpc_setbufsize); +EXPORT_SYMBOL(rpc_unlink); +EXPORT_SYMBOL(rpc_wake_up); +EXPORT_SYMBOL(rpc_queue_upcall); +EXPORT_SYMBOL(rpc_mkpipe); + +/* Client transport */ +EXPORT_SYMBOL(xprt_create_proto); +EXPORT_SYMBOL(xprt_destroy); +EXPORT_SYMBOL(xprt_set_timeout); +EXPORT_SYMBOL(xprt_udp_slot_table_entries); +EXPORT_SYMBOL(xprt_tcp_slot_table_entries); + +/* Client credential cache */ +EXPORT_SYMBOL(rpcauth_register); +EXPORT_SYMBOL(rpcauth_unregister); +EXPORT_SYMBOL(rpcauth_create); +EXPORT_SYMBOL(rpcauth_lookupcred); +EXPORT_SYMBOL(rpcauth_lookup_credcache); +EXPORT_SYMBOL(rpcauth_free_credcache); +EXPORT_SYMBOL(rpcauth_init_credcache); +EXPORT_SYMBOL(put_rpccred); + +/* RPC server stuff */ +EXPORT_SYMBOL(svc_create); +EXPORT_SYMBOL(svc_create_thread); +EXPORT_SYMBOL(svc_exit_thread); +EXPORT_SYMBOL(svc_destroy); +EXPORT_SYMBOL(svc_drop); +EXPORT_SYMBOL(svc_process); +EXPORT_SYMBOL(svc_recv); +EXPORT_SYMBOL(svc_wake_up); +EXPORT_SYMBOL(svc_makesock); +EXPORT_SYMBOL(svc_reserve); +EXPORT_SYMBOL(svc_auth_register); +EXPORT_SYMBOL(auth_domain_lookup); +EXPORT_SYMBOL(svc_authenticate); +EXPORT_SYMBOL(svc_set_client); + +/* RPC statistics */ +#ifdef CONFIG_PROC_FS +EXPORT_SYMBOL(rpc_proc_register); +EXPORT_SYMBOL(rpc_proc_unregister); +EXPORT_SYMBOL(svc_proc_register); +EXPORT_SYMBOL(svc_proc_unregister); +EXPORT_SYMBOL(svc_seq_show); +#endif + +/* caching... 
*/ +EXPORT_SYMBOL(auth_domain_find); +EXPORT_SYMBOL(auth_domain_put); +EXPORT_SYMBOL(auth_unix_add_addr); +EXPORT_SYMBOL(auth_unix_forget_old); +EXPORT_SYMBOL(auth_unix_lookup); +EXPORT_SYMBOL(cache_check); +EXPORT_SYMBOL(cache_flush); +EXPORT_SYMBOL(cache_purge); +EXPORT_SYMBOL(cache_fresh); +EXPORT_SYMBOL(cache_init); +EXPORT_SYMBOL(cache_register); +EXPORT_SYMBOL(cache_unregister); +EXPORT_SYMBOL(qword_add); +EXPORT_SYMBOL(qword_addhex); +EXPORT_SYMBOL(qword_get); +EXPORT_SYMBOL(svcauth_unix_purge); +EXPORT_SYMBOL(unix_domain_find); + +/* Generic XDR */ +EXPORT_SYMBOL(xdr_encode_string); +EXPORT_SYMBOL(xdr_decode_string); +EXPORT_SYMBOL(xdr_decode_string_inplace); +EXPORT_SYMBOL(xdr_decode_netobj); +EXPORT_SYMBOL(xdr_encode_netobj); +EXPORT_SYMBOL(xdr_encode_pages); +EXPORT_SYMBOL(xdr_inline_pages); +EXPORT_SYMBOL(xdr_shift_buf); +EXPORT_SYMBOL(xdr_buf_from_iov); +EXPORT_SYMBOL(xdr_buf_subsegment); +EXPORT_SYMBOL(xdr_buf_read_netobj); +EXPORT_SYMBOL(read_bytes_from_xdr_buf); + +/* Debugging symbols */ +#ifdef RPC_DEBUG +EXPORT_SYMBOL(rpc_debug); +EXPORT_SYMBOL(nfs_debug); +EXPORT_SYMBOL(nfsd_debug); +EXPORT_SYMBOL(nlm_debug); +#endif + +extern int register_rpc_pipefs(void); +extern void unregister_rpc_pipefs(void); + +static int __init +init_sunrpc(void) +{ + int err = register_rpc_pipefs(); + if (err) + goto out; + err = rpc_init_mempool() != 0; + if (err) + goto out; +#ifdef RPC_DEBUG + rpc_register_sysctl(); +#endif +#ifdef CONFIG_PROC_FS + rpc_proc_init(); +#endif + cache_register(&auth_domain_cache); + cache_register(&ip_map_cache); +out: + return err; +} + +static void __exit +cleanup_sunrpc(void) +{ + unregister_rpc_pipefs(); + rpc_destroy_mempool(); + cache_unregister(&auth_domain_cache); + cache_unregister(&ip_map_cache); +#ifdef RPC_DEBUG + rpc_unregister_sysctl(); +#endif +#ifdef CONFIG_PROC_FS + rpc_proc_exit(); +#endif +} +MODULE_LICENSE("GPL"); +module_init(init_sunrpc); +module_exit(cleanup_sunrpc); diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c new file mode 100644 index 000000000000..bb2d99f33315 --- /dev/null +++ b/net/sunrpc/svc.c @@ -0,0 +1,490 @@ +/* + * linux/net/sunrpc/svc.c + * + * High-level RPC service routines + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_SVCDSP +#define RPC_PARANOIA 1 + +/* + * Create an RPC service + */ +struct svc_serv * +svc_create(struct svc_program *prog, unsigned int bufsize) +{ + struct svc_serv *serv; + int vers; + unsigned int xdrsize; + + if (!(serv = (struct svc_serv *) kmalloc(sizeof(*serv), GFP_KERNEL))) + return NULL; + memset(serv, 0, sizeof(*serv)); + serv->sv_program = prog; + serv->sv_nrthreads = 1; + serv->sv_stats = prog->pg_stats; + serv->sv_bufsz = bufsize? 
bufsize : 4096; + prog->pg_lovers = prog->pg_nvers-1; + xdrsize = 0; + for (vers=0; verspg_nvers ; vers++) + if (prog->pg_vers[vers]) { + prog->pg_hivers = vers; + if (prog->pg_lovers > vers) + prog->pg_lovers = vers; + if (prog->pg_vers[vers]->vs_xdrsize > xdrsize) + xdrsize = prog->pg_vers[vers]->vs_xdrsize; + } + serv->sv_xdrsize = xdrsize; + INIT_LIST_HEAD(&serv->sv_threads); + INIT_LIST_HEAD(&serv->sv_sockets); + INIT_LIST_HEAD(&serv->sv_tempsocks); + INIT_LIST_HEAD(&serv->sv_permsocks); + spin_lock_init(&serv->sv_lock); + + serv->sv_name = prog->pg_name; + + /* Remove any stale portmap registrations */ + svc_register(serv, 0, 0); + + return serv; +} + +/* + * Destroy an RPC service + */ +void +svc_destroy(struct svc_serv *serv) +{ + struct svc_sock *svsk; + + dprintk("RPC: svc_destroy(%s, %d)\n", + serv->sv_program->pg_name, + serv->sv_nrthreads); + + if (serv->sv_nrthreads) { + if (--(serv->sv_nrthreads) != 0) { + svc_sock_update_bufs(serv); + return; + } + } else + printk("svc_destroy: no threads for serv=%p!\n", serv); + + while (!list_empty(&serv->sv_tempsocks)) { + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, + sk_list); + svc_delete_socket(svsk); + } + while (!list_empty(&serv->sv_permsocks)) { + svsk = list_entry(serv->sv_permsocks.next, + struct svc_sock, + sk_list); + svc_delete_socket(svsk); + } + + cache_clean_deferred(serv); + + /* Unregister service with the portmapper */ + svc_register(serv, 0, 0); + kfree(serv); +} + +/* + * Allocate an RPC server's buffer space. + * We allocate pages and place them in rq_argpages. + */ +static int +svc_init_buffer(struct svc_rqst *rqstp, unsigned int size) +{ + int pages; + int arghi; + + if (size > RPCSVC_MAXPAYLOAD) + size = RPCSVC_MAXPAYLOAD; + pages = 2 + (size+ PAGE_SIZE -1) / PAGE_SIZE; + rqstp->rq_argused = 0; + rqstp->rq_resused = 0; + arghi = 0; + if (pages > RPCSVC_MAXPAGES) + BUG(); + while (pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) + break; + rqstp->rq_argpages[arghi++] = p; + pages--; + } + rqstp->rq_arghi = arghi; + return ! 
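/*
 * svc_create() makes a single pass over prog->pg_vers[] to find the lowest
 * and highest implemented versions and the largest per-version XDR buffer
 * size.  The same scan in a stand-alone form; the structures and numbers
 * here are simplified stand-ins.
 */
#include <stdio.h>

struct ex_version { unsigned int xdrsize; };

int main(void)
{
    const struct ex_version v2 = { 1024 }, v3 = { 2200 };
    const struct ex_version *vers[4] = { NULL, NULL, &v2, &v3 };
    unsigned int nvers = 4, lovers = nvers - 1, hivers = 0, xdrsize = 0;

    for (unsigned int i = 0; i < nvers; i++) {
        if (!vers[i])
            continue;               /* version not implemented */
        hivers = i;                 /* highest implemented version so far */
        if (lovers > i)
            lovers = i;             /* lowest implemented version */
        if (vers[i]->xdrsize > xdrsize)
            xdrsize = vers[i]->xdrsize; /* worst-case argument size */
    }
    printf("versions %u..%u, xdrsize %u\n", lovers, hivers, xdrsize);
    return 0;
}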
pages; +} + +/* + * Release an RPC server buffer + */ +static void +svc_release_buffer(struct svc_rqst *rqstp) +{ + while (rqstp->rq_arghi) + put_page(rqstp->rq_argpages[--rqstp->rq_arghi]); + while (rqstp->rq_resused) { + if (rqstp->rq_respages[--rqstp->rq_resused] == NULL) + continue; + put_page(rqstp->rq_respages[rqstp->rq_resused]); + } + rqstp->rq_argused = 0; +} + +/* + * Create a server thread + */ +int +svc_create_thread(svc_thread_fn func, struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + int error = -ENOMEM; + + rqstp = kmalloc(sizeof(*rqstp), GFP_KERNEL); + if (!rqstp) + goto out; + + memset(rqstp, 0, sizeof(*rqstp)); + init_waitqueue_head(&rqstp->rq_wait); + + if (!(rqstp->rq_argp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) + || !(rqstp->rq_resp = (u32 *) kmalloc(serv->sv_xdrsize, GFP_KERNEL)) + || !svc_init_buffer(rqstp, serv->sv_bufsz)) + goto out_thread; + + serv->sv_nrthreads++; + rqstp->rq_server = serv; + error = kernel_thread((int (*)(void *)) func, rqstp, 0); + if (error < 0) + goto out_thread; + svc_sock_update_bufs(serv); + error = 0; +out: + return error; + +out_thread: + svc_exit_thread(rqstp); + goto out; +} + +/* + * Destroy an RPC server thread + */ +void +svc_exit_thread(struct svc_rqst *rqstp) +{ + struct svc_serv *serv = rqstp->rq_server; + + svc_release_buffer(rqstp); + if (rqstp->rq_resp) + kfree(rqstp->rq_resp); + if (rqstp->rq_argp) + kfree(rqstp->rq_argp); + if (rqstp->rq_auth_data) + kfree(rqstp->rq_auth_data); + kfree(rqstp); + + /* Release the server */ + if (serv) + svc_destroy(serv); +} + +/* + * Register an RPC service with the local portmapper. + * To unregister a service, call this routine with + * proto and port == 0. + */ +int +svc_register(struct svc_serv *serv, int proto, unsigned short port) +{ + struct svc_program *progp; + unsigned long flags; + int i, error = 0, dummy; + + progp = serv->sv_program; + + dprintk("RPC: svc_register(%s, %s, %d)\n", + progp->pg_name, proto == IPPROTO_UDP? "udp" : "tcp", port); + + if (!port) + clear_thread_flag(TIF_SIGPENDING); + + for (i = 0; i < progp->pg_nvers; i++) { + if (progp->pg_vers[i] == NULL) + continue; + error = rpc_register(progp->pg_prog, i, proto, port, &dummy); + if (error < 0) + break; + if (port && !dummy) { + error = -EACCES; + break; + } + } + + if (!port) { + spin_lock_irqsave(¤t->sighand->siglock, flags); + recalc_sigpending(); + spin_unlock_irqrestore(¤t->sighand->siglock, flags); + } + + return error; +} + +/* + * Process the RPC request. + */ +int +svc_process(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + struct svc_program *progp; + struct svc_version *versp = NULL; /* compiler food */ + struct svc_procedure *procp = NULL; + struct kvec * argv = &rqstp->rq_arg.head[0]; + struct kvec * resv = &rqstp->rq_res.head[0]; + kxdrproc_t xdr; + u32 *statp; + u32 dir, prog, vers, proc, + auth_stat, rpc_stat; + int auth_res; + u32 *accept_statp; + + rpc_stat = rpc_success; + + if (argv->iov_len < 6*4) + goto err_short_len; + + /* setup response xdr_buf. + * Initially it has just one page + */ + svc_take_page(rqstp); /* must succeed */ + resv->iov_base = page_address(rqstp->rq_respages[0]); + resv->iov_len = 0; + rqstp->rq_res.pages = rqstp->rq_respages+1; + rqstp->rq_res.len = 0; + rqstp->rq_res.page_base = 0; + rqstp->rq_res.page_len = 0; + rqstp->rq_res.tail[0].iov_len = 0; + /* tcp needs a space for the record length... 
*/ + if (rqstp->rq_prot == IPPROTO_TCP) + svc_putu32(resv, 0); + + rqstp->rq_xid = svc_getu32(argv); + svc_putu32(resv, rqstp->rq_xid); + + dir = ntohl(svc_getu32(argv)); + vers = ntohl(svc_getu32(argv)); + + /* First words of reply: */ + svc_putu32(resv, xdr_one); /* REPLY */ + + if (dir != 0) /* direction != CALL */ + goto err_bad_dir; + if (vers != 2) /* RPC version number */ + goto err_bad_rpc; + + /* Save position in case we later decide to reject: */ + accept_statp = resv->iov_base + resv->iov_len; + + svc_putu32(resv, xdr_zero); /* ACCEPT */ + + rqstp->rq_prog = prog = ntohl(svc_getu32(argv)); /* program number */ + rqstp->rq_vers = vers = ntohl(svc_getu32(argv)); /* version number */ + rqstp->rq_proc = proc = ntohl(svc_getu32(argv)); /* procedure number */ + + progp = serv->sv_program; + /* + * Decode auth data, and add verifier to reply buffer. + * We do this before anything else in order to get a decent + * auth verifier. + */ + auth_res = svc_authenticate(rqstp, &auth_stat); + /* Also give the program a chance to reject this call: */ + if (auth_res == SVC_OK) { + auth_stat = rpc_autherr_badcred; + auth_res = progp->pg_authenticate(rqstp); + } + switch (auth_res) { + case SVC_OK: + break; + case SVC_GARBAGE: + rpc_stat = rpc_garbage_args; + goto err_bad; + case SVC_SYSERR: + rpc_stat = rpc_system_err; + goto err_bad; + case SVC_DENIED: + goto err_bad_auth; + case SVC_DROP: + goto dropit; + case SVC_COMPLETE: + goto sendit; + } + + if (prog != progp->pg_prog) + goto err_bad_prog; + + if (vers >= progp->pg_nvers || + !(versp = progp->pg_vers[vers])) + goto err_bad_vers; + + procp = versp->vs_proc + proc; + if (proc >= versp->vs_nproc || !procp->pc_func) + goto err_bad_proc; + rqstp->rq_server = serv; + rqstp->rq_procinfo = procp; + + /* Syntactic check complete */ + serv->sv_stats->rpccnt++; + + /* Build the reply header. */ + statp = resv->iov_base +resv->iov_len; + svc_putu32(resv, rpc_success); /* RPC_SUCCESS */ + + /* Bump per-procedure stats counter */ + procp->pc_count++; + + /* Initialize storage for argp and resp */ + memset(rqstp->rq_argp, 0, procp->pc_argsize); + memset(rqstp->rq_resp, 0, procp->pc_ressize); + + /* un-reserve some of the out-queue now that we have a + * better idea of reply size + */ + if (procp->pc_xdrressize) + svc_reserve(rqstp, procp->pc_xdrressize<<2); + + /* Call the function that processes the request. 
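/*
 * svc_process() builds its reply by appending 32-bit XDR words: the xid,
 * REPLY, MSG_ACCEPTED, the verifier that svc_authenticate() chose, and then
 * the accept status.  A stand-alone sketch that lays out that accepted-reply
 * header per RFC 1831, assuming a null (AUTH_NULL) verifier; the xid value
 * is a placeholder.
 */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <arpa/inet.h>  /* htonl() */

static unsigned char *put_xdr_u32(unsigned char *p, uint32_t v)
{
    uint32_t be = htonl(v);
    memcpy(p, &be, 4);      /* XDR integers are 4-byte big-endian */
    return p + 4;
}

int main(void)
{
    unsigned char buf[64], *p = buf;
    uint32_t xid = 0x12345678;      /* would be copied from the matching call */

    p = put_xdr_u32(p, xid);
    p = put_xdr_u32(p, 1);  /* msg_type = REPLY */
    p = put_xdr_u32(p, 0);  /* reply_stat = MSG_ACCEPTED */
    p = put_xdr_u32(p, 0);  /* verifier flavor = AUTH_NULL (assumed) */
    p = put_xdr_u32(p, 0);  /* verifier length = 0 */
    p = put_xdr_u32(p, 0);  /* accept_stat = SUCCESS, i.e. rpc_success */
    printf("reply header is %ld bytes before the procedure results\n",
           (long)(p - buf));
    return 0;
}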
*/ + if (!versp->vs_dispatch) { + /* Decode arguments */ + xdr = procp->pc_decode; + if (xdr && !xdr(rqstp, argv->iov_base, rqstp->rq_argp)) + goto err_garbage; + + *statp = procp->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); + + /* Encode reply */ + if (*statp == rpc_success && (xdr = procp->pc_encode) + && !xdr(rqstp, resv->iov_base+resv->iov_len, rqstp->rq_resp)) { + dprintk("svc: failed to encode reply\n"); + /* serv->sv_stats->rpcsystemerr++; */ + *statp = rpc_system_err; + } + } else { + dprintk("svc: calling dispatcher\n"); + if (!versp->vs_dispatch(rqstp, statp)) { + /* Release reply info */ + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + goto dropit; + } + } + + /* Check RPC status result */ + if (*statp != rpc_success) + resv->iov_len = ((void*)statp) - resv->iov_base + 4; + + /* Release reply info */ + if (procp->pc_release) + procp->pc_release(rqstp, NULL, rqstp->rq_resp); + + if (procp->pc_encode == NULL) + goto dropit; + + sendit: + if (svc_authorise(rqstp)) + goto dropit; + return svc_send(rqstp); + + dropit: + svc_authorise(rqstp); /* doesn't hurt to call this twice */ + dprintk("svc: svc_process dropit\n"); + svc_drop(rqstp); + return 0; + +err_short_len: +#ifdef RPC_PARANOIA + printk("svc: short len %Zd, dropping request\n", argv->iov_len); +#endif + goto dropit; /* drop request */ + +err_bad_dir: +#ifdef RPC_PARANOIA + printk("svc: bad direction %d, dropping request\n", dir); +#endif + serv->sv_stats->rpcbadfmt++; + goto dropit; /* drop request */ + +err_bad_rpc: + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_zero); /* RPC_MISMATCH */ + svc_putu32(resv, xdr_two); /* Only RPCv2 supported */ + svc_putu32(resv, xdr_two); + goto sendit; + +err_bad_auth: + dprintk("svc: authentication failed (%d)\n", ntohl(auth_stat)); + serv->sv_stats->rpcbadauth++; + /* Restore write pointer to location of accept status: */ + xdr_ressize_check(rqstp, accept_statp); + svc_putu32(resv, xdr_one); /* REJECT */ + svc_putu32(resv, xdr_one); /* AUTH_ERROR */ + svc_putu32(resv, auth_stat); /* status */ + goto sendit; + +err_bad_prog: +#ifdef RPC_PARANOIA + if (prog != 100227 || progp->pg_prog != 100003) + printk("svc: unknown program %d (me %d)\n", prog, progp->pg_prog); + /* else it is just a Solaris client seeing if ACLs are supported */ +#endif + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_prog_unavail); + goto sendit; + +err_bad_vers: +#ifdef RPC_PARANOIA + printk("svc: unknown version (%d)\n", vers); +#endif + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_prog_mismatch); + svc_putu32(resv, htonl(progp->pg_lovers)); + svc_putu32(resv, htonl(progp->pg_hivers)); + goto sendit; + +err_bad_proc: +#ifdef RPC_PARANOIA + printk("svc: unknown procedure (%d)\n", proc); +#endif + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_proc_unavail); + goto sendit; + +err_garbage: +#ifdef RPC_PARANOIA + printk("svc: failed to decode args\n"); +#endif + rpc_stat = rpc_garbage_args; +err_bad: + serv->sv_stats->rpcbadfmt++; + svc_putu32(resv, rpc_stat); + goto sendit; +} diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c new file mode 100644 index 000000000000..bde8147ef2db --- /dev/null +++ b/net/sunrpc/svcauth.c @@ -0,0 +1,216 @@ +/* + * linux/net/sunrpc/svcauth.c + * + * The generic interface for RPC authentication on the server side. 
+ * + * Copyright (C) 1995, 1996 Olaf Kirch + * + * CHANGES + * 19-Apr-2000 Chris Evans - Security fix + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_AUTH + + +/* + * Table of authenticators + */ +extern struct auth_ops svcauth_null; +extern struct auth_ops svcauth_unix; + +static DEFINE_SPINLOCK(authtab_lock); +static struct auth_ops *authtab[RPC_AUTH_MAXFLAVOR] = { + [0] = &svcauth_null, + [1] = &svcauth_unix, +}; + +int +svc_authenticate(struct svc_rqst *rqstp, u32 *authp) +{ + rpc_authflavor_t flavor; + struct auth_ops *aops; + + *authp = rpc_auth_ok; + + flavor = ntohl(svc_getu32(&rqstp->rq_arg.head[0])); + + dprintk("svc: svc_authenticate (%d)\n", flavor); + + spin_lock(&authtab_lock); + if (flavor >= RPC_AUTH_MAXFLAVOR || !(aops = authtab[flavor]) + || !try_module_get(aops->owner)) { + spin_unlock(&authtab_lock); + *authp = rpc_autherr_badcred; + return SVC_DENIED; + } + spin_unlock(&authtab_lock); + + rqstp->rq_authop = aops; + return aops->accept(rqstp, authp); +} + +int svc_set_client(struct svc_rqst *rqstp) +{ + return rqstp->rq_authop->set_client(rqstp); +} + +/* A request, which was authenticated, has now executed. + * Time to finalise the the credentials and verifier + * and release and resources + */ +int svc_authorise(struct svc_rqst *rqstp) +{ + struct auth_ops *aops = rqstp->rq_authop; + int rv = 0; + + rqstp->rq_authop = NULL; + + if (aops) { + rv = aops->release(rqstp); + module_put(aops->owner); + } + return rv; +} + +int +svc_auth_register(rpc_authflavor_t flavor, struct auth_ops *aops) +{ + int rv = -EINVAL; + spin_lock(&authtab_lock); + if (flavor < RPC_AUTH_MAXFLAVOR && authtab[flavor] == NULL) { + authtab[flavor] = aops; + rv = 0; + } + spin_unlock(&authtab_lock); + return rv; +} + +void +svc_auth_unregister(rpc_authflavor_t flavor) +{ + spin_lock(&authtab_lock); + if (flavor < RPC_AUTH_MAXFLAVOR) + authtab[flavor] = NULL; + spin_unlock(&authtab_lock); +} +EXPORT_SYMBOL(svc_auth_unregister); + +/************************************************** + * cache for domain name to auth_domain + * Entries are only added by flavours which will normally + * have a structure that 'inherits' from auth_domain. + * e.g. when an IP -> domainname is given to auth_unix, + * and the domain name doesn't exist, it will create a + * auth_unix_domain and add it to this hash table. + * If it finds the name does exist, but isn't AUTH_UNIX, + * it will complain. + */ + +/* + * Auth auth_domain cache is somewhat different to other caches, + * largely because the entries are possibly of different types: + * each auth flavour has it's own type. + * One consequence of this that DefineCacheLookup cannot + * allocate a new structure as it cannot know the size. + * Notice that the "INIT" code fragment is quite different + * from other caches. When auth_domain_lookup might be + * creating a new domain, the new domain is passed in + * complete and it is used as-is rather than being copied into + * another structure. 
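/*
 * svc_authenticate() above is a bounds-checked table dispatch: the flavor
 * word from the call indexes authtab[] and the matching auth_ops->accept()
 * handler runs, with unknown flavors rejected as badcred.  A minimal
 * stand-alone model of that dispatch; the types and handlers here are
 * invented for illustration.
 */
#include <stdio.h>

#define EX_MAXFLAVOR 8

struct ex_auth_ops {
    const char *name;
    int (*accept)(void);
};

static int null_accept(void) { return 0; }
static int unix_accept(void) { return 0; }

static struct ex_auth_ops null_ops = { "null", null_accept };
static struct ex_auth_ops unix_ops = { "unix", unix_accept };

static struct ex_auth_ops *ex_authtab[EX_MAXFLAVOR] = {
    [0] = &null_ops,    /* AUTH_NULL */
    [1] = &unix_ops,    /* AUTH_UNIX */
};

static int authenticate(unsigned int flavor)
{
    struct ex_auth_ops *aops;

    if (flavor >= EX_MAXFLAVOR || !(aops = ex_authtab[flavor]))
        return -1;      /* unknown flavor: reject, as with rpc_autherr_badcred */
    printf("dispatching to %s\n", aops->name);
    return aops->accept();
}

int main(void)
{
    authenticate(1);    /* AUTH_UNIX: handled */
    authenticate(7);    /* unregistered flavor: rejected */
    return 0;
}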
+ */ +#define DN_HASHBITS 6 +#define DN_HASHMAX (1<flavour]->domain_release(dom); +} + + +struct cache_detail auth_domain_cache = { + .hash_size = DN_HASHMAX, + .hash_table = auth_domain_table, + .name = "auth.domain", + .cache_put = auth_domain_drop, +}; + +void auth_domain_put(struct auth_domain *dom) +{ + auth_domain_drop(&dom->h, &auth_domain_cache); +} + +static inline int auth_domain_hash(struct auth_domain *item) +{ + return hash_str(item->name, DN_HASHBITS); +} +static inline int auth_domain_match(struct auth_domain *tmp, struct auth_domain *item) +{ + return strcmp(tmp->name, item->name) == 0; +} + +struct auth_domain * +auth_domain_lookup(struct auth_domain *item, int set) +{ + struct auth_domain *tmp = NULL; + struct cache_head **hp, **head; + head = &auth_domain_cache.hash_table[auth_domain_hash(item)]; + + if (set) + write_lock(&auth_domain_cache.hash_lock); + else + read_lock(&auth_domain_cache.hash_lock); + for (hp=head; *hp != NULL; hp = &tmp->h.next) { + tmp = container_of(*hp, struct auth_domain, h); + if (!auth_domain_match(tmp, item)) + continue; + if (!set) { + cache_get(&tmp->h); + goto out_noset; + } + *hp = tmp->h.next; + tmp->h.next = NULL; + auth_domain_drop(&tmp->h, &auth_domain_cache); + goto out_set; + } + /* Didn't find anything */ + if (!set) + goto out_nada; + auth_domain_cache.entries++; +out_set: + item->h.next = *head; + *head = &item->h; + cache_get(&item->h); + write_unlock(&auth_domain_cache.hash_lock); + cache_fresh(&auth_domain_cache, &item->h, item->h.expiry_time); + cache_get(&item->h); + return item; +out_nada: + tmp = NULL; +out_noset: + read_unlock(&auth_domain_cache.hash_lock); + return tmp; +} + +struct auth_domain *auth_domain_find(char *name) +{ + struct auth_domain *rv, ad; + + ad.name = name; + rv = auth_domain_lookup(&ad, 0); + return rv; +} diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c new file mode 100644 index 000000000000..2b99b4028d31 --- /dev/null +++ b/net/sunrpc/svcauth_unix.c @@ -0,0 +1,502 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define RPCDBG_FACILITY RPCDBG_AUTH + + +/* + * AUTHUNIX and AUTHNULL credentials are both handled here. + * AUTHNULL is treated just like AUTHUNIX except that the uid/gid + * are always nobody (-2). i.e. we do the same IP address checks for + * AUTHNULL as for AUTHUNIX, and that is done here. 
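+ *
+ * The IP address check itself is driven by the "auth.unix.ip" cache
+ * defined below: user space maps a <class, address> pair to an auth
+ * domain by writing a line into that cache, e.g. (illustrative values;
+ * exact token order as parsed by ip_map_parse() below):
+ *
+ *	nfsd 192.0.2.1 <expiry> <domainname>
+ *
+ * where an empty domainname makes the entry negative.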
+ */ + + +static char *strdup(char *s) +{ + char *rv = kmalloc(strlen(s)+1, GFP_KERNEL); + if (rv) + strcpy(rv, s); + return rv; +} + +struct unix_domain { + struct auth_domain h; + int addr_changes; + /* other stuff later */ +}; + +struct auth_domain *unix_domain_find(char *name) +{ + struct auth_domain *rv, ud; + struct unix_domain *new; + + ud.name = name; + + rv = auth_domain_lookup(&ud, 0); + + foundit: + if (rv && rv->flavour != RPC_AUTH_UNIX) { + auth_domain_put(rv); + return NULL; + } + if (rv) + return rv; + + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + cache_init(&new->h.h); + new->h.name = strdup(name); + new->h.flavour = RPC_AUTH_UNIX; + new->addr_changes = 0; + new->h.h.expiry_time = NEVER; + + rv = auth_domain_lookup(&new->h, 2); + if (rv == &new->h) { + if (atomic_dec_and_test(&new->h.h.refcnt)) BUG(); + } else { + auth_domain_put(&new->h); + goto foundit; + } + + return rv; +} + +static void svcauth_unix_domain_release(struct auth_domain *dom) +{ + struct unix_domain *ud = container_of(dom, struct unix_domain, h); + + kfree(dom->name); + kfree(ud); +} + + +/************************************************** + * cache for IP address to unix_domain + * as needed by AUTH_UNIX + */ +#define IP_HASHBITS 8 +#define IP_HASHMAX (1<flags) && + !test_bit(CACHE_NEGATIVE, &item->flags)) + auth_domain_put(&im->m_client->h); + kfree(im); + } +} + +static inline int ip_map_hash(struct ip_map *item) +{ + return hash_str(item->m_class, IP_HASHBITS) ^ + hash_long((unsigned long)item->m_addr.s_addr, IP_HASHBITS); +} +static inline int ip_map_match(struct ip_map *item, struct ip_map *tmp) +{ + return strcmp(tmp->m_class, item->m_class) == 0 + && tmp->m_addr.s_addr == item->m_addr.s_addr; +} +static inline void ip_map_init(struct ip_map *new, struct ip_map *item) +{ + strcpy(new->m_class, item->m_class); + new->m_addr.s_addr = item->m_addr.s_addr; +} +static inline void ip_map_update(struct ip_map *new, struct ip_map *item) +{ + cache_get(&item->m_client->h.h); + new->m_client = item->m_client; + new->m_add_change = item->m_add_change; +} + +static void ip_map_request(struct cache_detail *cd, + struct cache_head *h, + char **bpp, int *blen) +{ + char text_addr[20]; + struct ip_map *im = container_of(h, struct ip_map, h); + __u32 addr = im->m_addr.s_addr; + + snprintf(text_addr, 20, "%u.%u.%u.%u", + ntohl(addr) >> 24 & 0xff, + ntohl(addr) >> 16 & 0xff, + ntohl(addr) >> 8 & 0xff, + ntohl(addr) >> 0 & 0xff); + + qword_add(bpp, blen, im->m_class); + qword_add(bpp, blen, text_addr); + (*bpp)[-1] = '\n'; +} + +static struct ip_map *ip_map_lookup(struct ip_map *, int); + +static int ip_map_parse(struct cache_detail *cd, + char *mesg, int mlen) +{ + /* class ipaddress [domainname] */ + /* should be safe just to use the start of the input buffer + * for scratch: */ + char *buf = mesg; + int len; + int b1,b2,b3,b4; + char c; + struct ip_map ipm, *ipmp; + struct auth_domain *dom; + time_t expiry; + + if (mesg[mlen-1] != '\n') + return -EINVAL; + mesg[mlen-1] = 0; + + /* class */ + len = qword_get(&mesg, ipm.m_class, sizeof(ipm.m_class)); + if (len <= 0) return -EINVAL; + + /* ip address */ + len = qword_get(&mesg, buf, mlen); + if (len <= 0) return -EINVAL; + + if (sscanf(buf, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) + return -EINVAL; + + expiry = get_expiry(&mesg); + if (expiry ==0) + return -EINVAL; + + /* domainname, or empty for NEGATIVE */ + len = qword_get(&mesg, buf, mlen); + if (len < 0) return -EINVAL; + + if (len) { + dom = unix_domain_find(buf); + if (dom 
== NULL) + return -ENOENT; + } else + dom = NULL; + + ipm.m_addr.s_addr = + htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); + ipm.h.flags = 0; + if (dom) { + ipm.m_client = container_of(dom, struct unix_domain, h); + ipm.m_add_change = ipm.m_client->addr_changes; + } else + set_bit(CACHE_NEGATIVE, &ipm.h.flags); + ipm.h.expiry_time = expiry; + + ipmp = ip_map_lookup(&ipm, 1); + if (ipmp) + ip_map_put(&ipmp->h, &ip_map_cache); + if (dom) + auth_domain_put(dom); + if (!ipmp) + return -ENOMEM; + cache_flush(); + return 0; +} + +static int ip_map_show(struct seq_file *m, + struct cache_detail *cd, + struct cache_head *h) +{ + struct ip_map *im; + struct in_addr addr; + char *dom = "-no-domain-"; + + if (h == NULL) { + seq_puts(m, "#class IP domain\n"); + return 0; + } + im = container_of(h, struct ip_map, h); + /* class addr domain */ + addr = im->m_addr; + + if (test_bit(CACHE_VALID, &h->flags) && + !test_bit(CACHE_NEGATIVE, &h->flags)) + dom = im->m_client->h.name; + + seq_printf(m, "%s %d.%d.%d.%d %s\n", + im->m_class, + htonl(addr.s_addr) >> 24 & 0xff, + htonl(addr.s_addr) >> 16 & 0xff, + htonl(addr.s_addr) >> 8 & 0xff, + htonl(addr.s_addr) >> 0 & 0xff, + dom + ); + return 0; +} + + +struct cache_detail ip_map_cache = { + .hash_size = IP_HASHMAX, + .hash_table = ip_table, + .name = "auth.unix.ip", + .cache_put = ip_map_put, + .cache_request = ip_map_request, + .cache_parse = ip_map_parse, + .cache_show = ip_map_show, +}; + +static DefineSimpleCacheLookup(ip_map, 0) + + +int auth_unix_add_addr(struct in_addr addr, struct auth_domain *dom) +{ + struct unix_domain *udom; + struct ip_map ip, *ipmp; + + if (dom->flavour != RPC_AUTH_UNIX) + return -EINVAL; + udom = container_of(dom, struct unix_domain, h); + strcpy(ip.m_class, "nfsd"); + ip.m_addr = addr; + ip.m_client = udom; + ip.m_add_change = udom->addr_changes+1; + ip.h.flags = 0; + ip.h.expiry_time = NEVER; + + ipmp = ip_map_lookup(&ip, 1); + + if (ipmp) { + ip_map_put(&ipmp->h, &ip_map_cache); + return 0; + } else + return -ENOMEM; +} + +int auth_unix_forget_old(struct auth_domain *dom) +{ + struct unix_domain *udom; + + if (dom->flavour != RPC_AUTH_UNIX) + return -EINVAL; + udom = container_of(dom, struct unix_domain, h); + udom->addr_changes++; + return 0; +} + +struct auth_domain *auth_unix_lookup(struct in_addr addr) +{ + struct ip_map key, *ipm; + struct auth_domain *rv; + + strcpy(key.m_class, "nfsd"); + key.m_addr = addr; + + ipm = ip_map_lookup(&key, 0); + + if (!ipm) + return NULL; + if (cache_check(&ip_map_cache, &ipm->h, NULL)) + return NULL; + + if ((ipm->m_client->addr_changes - ipm->m_add_change) >0) { + if (test_and_set_bit(CACHE_NEGATIVE, &ipm->h.flags) == 0) + auth_domain_put(&ipm->m_client->h); + rv = NULL; + } else { + rv = &ipm->m_client->h; + cache_get(&rv->h); + } + ip_map_put(&ipm->h, &ip_map_cache); + return rv; +} + +void svcauth_unix_purge(void) +{ + cache_purge(&ip_map_cache); + cache_purge(&auth_domain_cache); +} + +static int +svcauth_unix_set_client(struct svc_rqst *rqstp) +{ + struct ip_map key, *ipm; + + rqstp->rq_client = NULL; + if (rqstp->rq_proc == 0) + return SVC_OK; + + strcpy(key.m_class, rqstp->rq_server->sv_program->pg_class); + key.m_addr = rqstp->rq_addr.sin_addr; + + ipm = ip_map_lookup(&key, 0); + + if (ipm == NULL) + return SVC_DENIED; + + switch (cache_check(&ip_map_cache, &ipm->h, &rqstp->rq_chandle)) { + default: + BUG(); + case -EAGAIN: + return SVC_DROP; + case -ENOENT: + return SVC_DENIED; + case 0: + rqstp->rq_client = &ipm->m_client->h; + cache_get(&rqstp->rq_client->h); + ip_map_put(&ipm->h, 
&ip_map_cache); + break; + } + return SVC_OK; +} + +static int +svcauth_null_accept(struct svc_rqst *rqstp, u32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_cred *cred = &rqstp->rq_cred; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; + + if (argv->iov_len < 3*4) + return SVC_GARBAGE; + + if (svc_getu32(argv) != 0) { + dprintk("svc: bad null cred\n"); + *authp = rpc_autherr_badcred; + return SVC_DENIED; + } + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { + dprintk("svc: bad null verf\n"); + *authp = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* Signal that mapping to nobody uid/gid is required */ + cred->cr_uid = (uid_t) -1; + cred->cr_gid = (gid_t) -1; + cred->cr_group_info = groups_alloc(0); + if (cred->cr_group_info == NULL) + return SVC_DROP; /* kmalloc failure - client must retry */ + + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + + return SVC_OK; +} + +static int +svcauth_null_release(struct svc_rqst *rqstp) +{ + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + + return 0; /* don't drop */ +} + + +struct auth_ops svcauth_null = { + .name = "null", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_NULL, + .accept = svcauth_null_accept, + .release = svcauth_null_release, + .set_client = svcauth_unix_set_client, +}; + + +static int +svcauth_unix_accept(struct svc_rqst *rqstp, u32 *authp) +{ + struct kvec *argv = &rqstp->rq_arg.head[0]; + struct kvec *resv = &rqstp->rq_res.head[0]; + struct svc_cred *cred = &rqstp->rq_cred; + u32 slen, i; + int len = argv->iov_len; + + cred->cr_group_info = NULL; + rqstp->rq_client = NULL; + + if ((len -= 3*4) < 0) + return SVC_GARBAGE; + + svc_getu32(argv); /* length */ + svc_getu32(argv); /* time stamp */ + slen = XDR_QUADLEN(ntohl(svc_getu32(argv))); /* machname length */ + if (slen > 64 || (len -= (slen + 3)*4) < 0) + goto badcred; + argv->iov_base = (void*)((u32*)argv->iov_base + slen); /* skip machname */ + argv->iov_len -= slen*4; + + cred->cr_uid = ntohl(svc_getu32(argv)); /* uid */ + cred->cr_gid = ntohl(svc_getu32(argv)); /* gid */ + slen = ntohl(svc_getu32(argv)); /* gids length */ + if (slen > 16 || (len -= (slen + 2)*4) < 0) + goto badcred; + cred->cr_group_info = groups_alloc(slen); + if (cred->cr_group_info == NULL) + return SVC_DROP; + for (i = 0; i < slen; i++) + GROUP_AT(cred->cr_group_info, i) = ntohl(svc_getu32(argv)); + + if (svc_getu32(argv) != RPC_AUTH_NULL || svc_getu32(argv) != 0) { + *authp = rpc_autherr_badverf; + return SVC_DENIED; + } + + /* Put NULL verifier */ + svc_putu32(resv, RPC_AUTH_NULL); + svc_putu32(resv, 0); + + return SVC_OK; + +badcred: + *authp = rpc_autherr_badcred; + return SVC_DENIED; +} + +static int +svcauth_unix_release(struct svc_rqst *rqstp) +{ + /* Verifier (such as it is) is already in place. 
+ */ + if (rqstp->rq_client) + auth_domain_put(rqstp->rq_client); + rqstp->rq_client = NULL; + if (rqstp->rq_cred.cr_group_info) + put_group_info(rqstp->rq_cred.cr_group_info); + rqstp->rq_cred.cr_group_info = NULL; + + return 0; +} + + +struct auth_ops svcauth_unix = { + .name = "unix", + .owner = THIS_MODULE, + .flavour = RPC_AUTH_UNIX, + .accept = svcauth_unix_accept, + .release = svcauth_unix_release, + .domain_release = svcauth_unix_domain_release, + .set_client = svcauth_unix_set_client, +}; + diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c new file mode 100644 index 000000000000..05907035bc96 --- /dev/null +++ b/net/sunrpc/svcsock.c @@ -0,0 +1,1585 @@ +/* + * linux/net/sunrpc/svcsock.c + * + * These are the RPC server socket internals. + * + * The server scheduling algorithm does not always distribute the load + * evenly when servicing a single client. May need to modify the + * svc_sock_enqueue procedure... + * + * TCP support is largely untested and may be a little slow. The problem + * is that we currently do two separate recvfrom's, one for the 4-byte + * record length, and the second for the actual record. This could possibly + * be improved by always reading a minimum size of around 100 bytes and + * tucking any superfluous bytes away in a temporary store. Still, that + * leaves write requests out in the rain. An alternative may be to peek at + * the first skb in the queue, and if it matches the next TCP sequence + * number, to extract the record marker. Yuck. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* SMP locking strategy: + * + * svc_serv->sv_lock protects most stuff for that service. + * + * Some flags can be set to certain values at any time + * providing that certain rules are followed: + * + * SK_BUSY can be set to 0 at any time. + * svc_sock_enqueue must be called afterwards + * SK_CONN, SK_DATA, can be set or cleared at any time. + * after a set, svc_sock_enqueue must be called. + * after a clear, the socket must be read/accepted + * if this succeeds, it must be set again. + * SK_CLOSE can set at any time. It is never cleared. + * + */ + +#define RPCDBG_FACILITY RPCDBG_SVCSOCK + + +static struct svc_sock *svc_setup_socket(struct svc_serv *, struct socket *, + int *errp, int pmap_reg); +static void svc_udp_data_ready(struct sock *, int); +static int svc_udp_recvfrom(struct svc_rqst *); +static int svc_udp_sendto(struct svc_rqst *); + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk); +static int svc_deferred_recv(struct svc_rqst *rqstp); +static struct cache_deferred_req *svc_defer(struct cache_req *req); + +/* + * Queue up an idle server thread. Must have serv->sv_lock held. + * Note: this is really a stack rather than a queue, so that we only + * use as many different threads as we need, and the rest don't polute + * the cache. + */ +static inline void +svc_serv_enqueue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_add(&rqstp->rq_list, &serv->sv_threads); +} + +/* + * Dequeue an nfsd thread. Must have serv->sv_lock held. 
+ */ +static inline void +svc_serv_dequeue(struct svc_serv *serv, struct svc_rqst *rqstp) +{ + list_del(&rqstp->rq_list); +} + +/* + * Release an skbuff after use + */ +static inline void +svc_release_skb(struct svc_rqst *rqstp) +{ + struct sk_buff *skb = rqstp->rq_skbuff; + struct svc_deferred_req *dr = rqstp->rq_deferred; + + if (skb) { + rqstp->rq_skbuff = NULL; + + dprintk("svc: service %p, releasing skb %p\n", rqstp, skb); + skb_free_datagram(rqstp->rq_sock->sk_sk, skb); + } + if (dr) { + rqstp->rq_deferred = NULL; + kfree(dr); + } +} + +/* + * Any space to write? + */ +static inline unsigned long +svc_sock_wspace(struct svc_sock *svsk) +{ + int wspace; + + if (svsk->sk_sock->type == SOCK_STREAM) + wspace = sk_stream_wspace(svsk->sk_sk); + else + wspace = sock_wspace(svsk->sk_sk); + + return wspace; +} + +/* + * Queue up a socket with data pending. If there are idle nfsd + * processes, wake 'em up. + * + */ +static void +svc_sock_enqueue(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + struct svc_rqst *rqstp; + + if (!(svsk->sk_flags & + ( (1<sk_flags)) + return; + + spin_lock_bh(&serv->sv_lock); + + if (!list_empty(&serv->sv_threads) && + !list_empty(&serv->sv_sockets)) + printk(KERN_ERR + "svc_sock_enqueue: threads and sockets both waiting??\n"); + + if (test_bit(SK_DEAD, &svsk->sk_flags)) { + /* Don't enqueue dead sockets */ + dprintk("svc: socket %p is dead, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + if (test_bit(SK_BUSY, &svsk->sk_flags)) { + /* Don't enqueue socket while daemon is receiving */ + dprintk("svc: socket %p busy, not enqueued\n", svsk->sk_sk); + goto out_unlock; + } + + set_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + if (((svsk->sk_reserved + serv->sv_bufsz)*2 + > svc_sock_wspace(svsk)) + && !test_bit(SK_CLOSE, &svsk->sk_flags) + && !test_bit(SK_CONN, &svsk->sk_flags)) { + /* Don't enqueue while not enough space for reply */ + dprintk("svc: socket %p no space, %d*2 > %ld, not enqueued\n", + svsk->sk_sk, svsk->sk_reserved+serv->sv_bufsz, + svc_sock_wspace(svsk)); + goto out_unlock; + } + clear_bit(SOCK_NOSPACE, &svsk->sk_sock->flags); + + /* Mark socket as busy. It will remain in this state until the + * server has processed all pending data and put the socket back + * on the idle list. + */ + set_bit(SK_BUSY, &svsk->sk_flags); + + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: socket %p served by daemon %p\n", + svsk->sk_sk, rqstp); + svc_serv_dequeue(serv, rqstp); + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_sock_enqueue: server %p, rq_sock=%p!\n", + rqstp, rqstp->rq_sock); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + wake_up(&rqstp->rq_wait); + } else { + dprintk("svc: socket %p put into queue\n", svsk->sk_sk); + list_add_tail(&svsk->sk_ready, &serv->sv_sockets); + } + +out_unlock: + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Dequeue the first socket. Must be called with the serv->sv_lock held. + */ +static inline struct svc_sock * +svc_sock_dequeue(struct svc_serv *serv) +{ + struct svc_sock *svsk; + + if (list_empty(&serv->sv_sockets)) + return NULL; + + svsk = list_entry(serv->sv_sockets.next, + struct svc_sock, sk_ready); + list_del_init(&svsk->sk_ready); + + dprintk("svc: socket %p dequeued, inuse=%d\n", + svsk->sk_sk, svsk->sk_inuse); + + return svsk; +} + +/* + * Having read something from a socket, check whether it + * needs to be re-enqueued. 
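+ * Each sk_recvfrom implementation below clears SK_DATA before it
+ * attempts a read, sets it again whenever more input may still be
+ * pending, and finally calls svc_sock_received() so the socket can be
+ * picked up by another thread, roughly:
+ *
+ *	clear_bit(SK_DATA, &svsk->sk_flags);
+ *	... read from the socket ...
+ *	set_bit(SK_DATA, &svsk->sk_flags);	(if more may be queued)
+ *	svc_sock_received(svsk);
+ *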
+ * Note: SK_DATA only gets cleared when a read-attempt finds + * no (or insufficient) data. + */ +static inline void +svc_sock_received(struct svc_sock *svsk) +{ + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); +} + + +/** + * svc_reserve - change the space reserved for the reply to a request. + * @rqstp: The request in question + * @space: new max space to reserve + * + * Each request reserves some space on the output queue of the socket + * to make sure the reply fits. This function reduces that reserved + * space to be the amount of space used already, plus @space. + * + */ +void svc_reserve(struct svc_rqst *rqstp, int space) +{ + space += rqstp->rq_res.head[0].iov_len; + + if (space < rqstp->rq_reserved) { + struct svc_sock *svsk = rqstp->rq_sock; + spin_lock_bh(&svsk->sk_server->sv_lock); + svsk->sk_reserved -= (rqstp->rq_reserved - space); + rqstp->rq_reserved = space; + spin_unlock_bh(&svsk->sk_server->sv_lock); + + svc_sock_enqueue(svsk); + } +} + +/* + * Release a socket after use. + */ +static inline void +svc_sock_put(struct svc_sock *svsk) +{ + struct svc_serv *serv = svsk->sk_server; + + spin_lock_bh(&serv->sv_lock); + if (!--(svsk->sk_inuse) && test_bit(SK_DEAD, &svsk->sk_flags)) { + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: releasing dead socket\n"); + sock_release(svsk->sk_sock); + kfree(svsk); + } + else + spin_unlock_bh(&serv->sv_lock); +} + +static void +svc_sock_release(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + + svc_release_skb(rqstp); + + svc_free_allpages(rqstp); + rqstp->rq_res.page_len = 0; + rqstp->rq_res.page_base = 0; + + + /* Reset response buffer and release + * the reservation. + * But first, check that enough space was reserved + * for the reply, otherwise we have a bug! 
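+ *
+ * To make the accounting concrete (numbers purely for illustration):
+ * a request starts with rq_reserved = serv->sv_bufsz, say 32768 bytes,
+ * all of which is counted in svsk->sk_reserved.  If the handler knows
+ * its reply is small it may call svc_reserve(rqstp, 512); with 100
+ * bytes already in the reply head that shrinks rq_reserved to 612 and
+ * releases the remaining 32156 bytes from sk_reserved, letting further
+ * requests be queued on the same socket sooner.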
+ */ + if ((rqstp->rq_res.len) > rqstp->rq_reserved) + printk(KERN_ERR "RPC request reserved %d but used %d\n", + rqstp->rq_reserved, + rqstp->rq_res.len); + + rqstp->rq_res.head[0].iov_len = 0; + svc_reserve(rqstp, 0); + rqstp->rq_sock = NULL; + + svc_sock_put(svsk); +} + +/* + * External function to wake up a server waiting for data + */ +void +svc_wake_up(struct svc_serv *serv) +{ + struct svc_rqst *rqstp; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_threads)) { + rqstp = list_entry(serv->sv_threads.next, + struct svc_rqst, + rq_list); + dprintk("svc: daemon %p woken up.\n", rqstp); + /* + svc_serv_dequeue(serv, rqstp); + rqstp->rq_sock = NULL; + */ + wake_up(&rqstp->rq_wait); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Generic sendto routine + */ +static int +svc_sendto(struct svc_rqst *rqstp, struct xdr_buf *xdr) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct socket *sock = svsk->sk_sock; + int slen; + char buffer[CMSG_SPACE(sizeof(struct in_pktinfo))]; + struct cmsghdr *cmh = (struct cmsghdr *)buffer; + struct in_pktinfo *pki = (struct in_pktinfo *)CMSG_DATA(cmh); + int len = 0; + int result; + int size; + struct page **ppage = xdr->pages; + size_t base = xdr->page_base; + unsigned int pglen = xdr->page_len; + unsigned int flags = MSG_MORE; + + slen = xdr->len; + + if (rqstp->rq_prot == IPPROTO_UDP) { + /* set the source and destination */ + struct msghdr msg; + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_iov = NULL; + msg.msg_iovlen = 0; + msg.msg_flags = MSG_MORE; + + msg.msg_control = cmh; + msg.msg_controllen = sizeof(buffer); + cmh->cmsg_len = CMSG_LEN(sizeof(*pki)); + cmh->cmsg_level = SOL_IP; + cmh->cmsg_type = IP_PKTINFO; + pki->ipi_ifindex = 0; + pki->ipi_spec_dst.s_addr = rqstp->rq_daddr; + + if (sock_sendmsg(sock, &msg, 0) < 0) + goto out; + } + + /* send head */ + if (slen == xdr->head[0].iov_len) + flags = 0; + len = sock->ops->sendpage(sock, rqstp->rq_respages[0], 0, xdr->head[0].iov_len, flags); + if (len != xdr->head[0].iov_len) + goto out; + slen -= xdr->head[0].iov_len; + if (slen == 0) + goto out; + + /* send page data */ + size = PAGE_SIZE - base < pglen ? PAGE_SIZE - base : pglen; + while (pglen > 0) { + if (slen == size) + flags = 0; + result = sock->ops->sendpage(sock, *ppage, base, size, flags); + if (result > 0) + len += result; + if (result != size) + goto out; + slen -= size; + pglen -= size; + size = PAGE_SIZE < pglen ? PAGE_SIZE : pglen; + base = 0; + ppage++; + } + /* send tail */ + if (xdr->tail[0].iov_len) { + result = sock->ops->sendpage(sock, rqstp->rq_respages[rqstp->rq_restailpage], + ((unsigned long)xdr->tail[0].iov_base)& (PAGE_SIZE-1), + xdr->tail[0].iov_len, 0); + + if (result > 0) + len += result; + } +out: + dprintk("svc: socket %p sendto([%p %Zu... ], %d) = %d (addr %x)\n", + rqstp->rq_sock, xdr->head[0].iov_base, xdr->head[0].iov_len, xdr->len, len, + rqstp->rq_addr.sin_addr.s_addr); + + return len; +} + +/* + * Check input queue length + */ +static int +svc_recv_available(struct svc_sock *svsk) +{ + mm_segment_t oldfs; + struct socket *sock = svsk->sk_sock; + int avail, err; + + oldfs = get_fs(); set_fs(KERNEL_DS); + err = sock->ops->ioctl(sock, TIOCINQ, (unsigned long) &avail); + set_fs(oldfs); + + return (err >= 0)? avail : err; +} + +/* + * Generic recvfrom routine. 
+ */ +static int +svc_recvfrom(struct svc_rqst *rqstp, struct kvec *iov, int nr, int buflen) +{ + struct msghdr msg; + struct socket *sock; + int len, alen; + + rqstp->rq_addrlen = sizeof(rqstp->rq_addr); + sock = rqstp->rq_sock->sk_sock; + + msg.msg_name = &rqstp->rq_addr; + msg.msg_namelen = sizeof(rqstp->rq_addr); + msg.msg_control = NULL; + msg.msg_controllen = 0; + + msg.msg_flags = MSG_DONTWAIT; + + len = kernel_recvmsg(sock, &msg, iov, nr, buflen, MSG_DONTWAIT); + + /* sock_recvmsg doesn't fill in the name/namelen, so we must.. + * possibly we should cache this in the svc_sock structure + * at accept time. FIXME + */ + alen = sizeof(rqstp->rq_addr); + sock->ops->getname(sock, (struct sockaddr *)&rqstp->rq_addr, &alen, 1); + + dprintk("svc: socket %p recvfrom(%p, %Zu) = %d\n", + rqstp->rq_sock, iov[0].iov_base, iov[0].iov_len, len); + + return len; +} + +/* + * Set socket snd and rcv buffer lengths + */ +static inline void +svc_sock_setbufsize(struct socket *sock, unsigned int snd, unsigned int rcv) +{ +#if 0 + mm_segment_t oldfs; + oldfs = get_fs(); set_fs(KERNEL_DS); + sock_setsockopt(sock, SOL_SOCKET, SO_SNDBUF, + (char*)&snd, sizeof(snd)); + sock_setsockopt(sock, SOL_SOCKET, SO_RCVBUF, + (char*)&rcv, sizeof(rcv)); +#else + /* sock_setsockopt limits use to sysctl_?mem_max, + * which isn't acceptable. Until that is made conditional + * on not having CAP_SYS_RESOURCE or similar, we go direct... + * DaveM said I could! + */ + lock_sock(sock->sk); + sock->sk->sk_sndbuf = snd * 2; + sock->sk->sk_rcvbuf = rcv * 2; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK|SOCK_RCVBUF_LOCK; + release_sock(sock->sk); +#endif +} +/* + * INET callback when data has been received on the socket. + */ +static void +svc_udp_data_ready(struct sock *sk, int count) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); + + if (!svsk) + goto out; + dprintk("svc: socket %p(inet %p), count=%d, busy=%d\n", + svsk, sk, count, test_bit(SK_BUSY, &svsk->sk_flags)); + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); +} + +/* + * INET callback when space is newly available on the socket. + */ +static void +svc_write_space(struct sock *sk) +{ + struct svc_sock *svsk = (struct svc_sock *)(sk->sk_user_data); + + if (svsk) { + dprintk("svc: socket %p(inet %p), write_space busy=%d\n", + svsk, sk, test_bit(SK_BUSY, &svsk->sk_flags)); + svc_sock_enqueue(svsk); + } + + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) { + printk(KERN_WARNING "RPC svc_write_space: some sleeping on %p\n", + svsk); + wake_up_interruptible(sk->sk_sleep); + } +} + +/* + * Receive a datagram from a UDP socket. + */ +extern int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb); + +static int +svc_udp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + struct sk_buff *skb; + int err, len; + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* udp sockets need large rcvbuf as all pending + * requests are still in that buffer. sndbuf must + * also be large enough that there is enough space + * for one reply per thread. 
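+ *
+ * For example (numbers are illustrative only): with 8 nfsd threads and
+ * sv_bufsz of 32768 bytes, the call below asks for (8+3)*32768, i.e.
+ * about 360 kB, in each direction; svc_sock_setbufsize() then doubles
+ * that value when storing it in sk_sndbuf/sk_rcvbuf.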
+ */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + (serv->sv_nrthreads+3) * serv->sv_bufsz); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { + svc_sock_received(svsk); + return svc_deferred_recv(rqstp); + } + + clear_bit(SK_DATA, &svsk->sk_flags); + while ((skb = skb_recv_datagram(svsk->sk_sk, 0, 1, &err)) == NULL) { + if (err == -EAGAIN) { + svc_sock_received(svsk); + return err; + } + /* possibly an icmp error */ + dprintk("svc: recvfrom returned error %d\n", -err); + } + if (skb->stamp.tv_sec == 0) { + skb->stamp.tv_sec = xtime.tv_sec; + skb->stamp.tv_usec = xtime.tv_nsec * 1000; + /* Don't enable netstamp, sunrpc doesn't + need that much accuracy */ + } + svsk->sk_sk->sk_stamp = skb->stamp; + set_bit(SK_DATA, &svsk->sk_flags); /* there may be more data... */ + + /* + * Maybe more packets - kick another thread ASAP. + */ + svc_sock_received(svsk); + + len = skb->len - sizeof(struct udphdr); + rqstp->rq_arg.len = len; + + rqstp->rq_prot = IPPROTO_UDP; + + /* Get sender address */ + rqstp->rq_addr.sin_family = AF_INET; + rqstp->rq_addr.sin_port = skb->h.uh->source; + rqstp->rq_addr.sin_addr.s_addr = skb->nh.iph->saddr; + rqstp->rq_daddr = skb->nh.iph->daddr; + + if (skb_is_nonlinear(skb)) { + /* we have to copy */ + local_bh_disable(); + if (csum_partial_copy_to_xdr(&rqstp->rq_arg, skb)) { + local_bh_enable(); + /* checksum error */ + skb_free_datagram(svsk->sk_sk, skb); + return 0; + } + local_bh_enable(); + skb_free_datagram(svsk->sk_sk, skb); + } else { + /* we can use it in-place */ + rqstp->rq_arg.head[0].iov_base = skb->data + sizeof(struct udphdr); + rqstp->rq_arg.head[0].iov_len = len; + if (skb->ip_summed != CHECKSUM_UNNECESSARY) { + if ((unsigned short)csum_fold(skb_checksum(skb, 0, skb->len, skb->csum))) { + skb_free_datagram(svsk->sk_sk, skb); + return 0; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + } + rqstp->rq_skbuff = skb; + } + + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + rqstp->rq_argused += (rqstp->rq_arg.page_len + PAGE_SIZE - 1)/ PAGE_SIZE; + } + + if (serv->sv_stats) + serv->sv_stats->netudpcnt++; + + return len; +} + +static int +svc_udp_sendto(struct svc_rqst *rqstp) +{ + int error; + + error = svc_sendto(rqstp, &rqstp->rq_res); + if (error == -ECONNREFUSED) + /* ICMP error on earlier request. */ + error = svc_sendto(rqstp, &rqstp->rq_res); + + return error; +} + +static void +svc_udp_init(struct svc_sock *svsk) +{ + svsk->sk_sk->sk_data_ready = svc_udp_data_ready; + svsk->sk_sk->sk_write_space = svc_write_space; + svsk->sk_recvfrom = svc_udp_recvfrom; + svsk->sk_sendto = svc_udp_sendto; + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_udp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_DATA, &svsk->sk_flags); /* might have come in before data_ready set up */ + set_bit(SK_CHNGBUF, &svsk->sk_flags); +} + +/* + * A data_ready event on a listening socket means there's a connection + * pending. Do not use state_change as a substitute for it. 
+ */ +static void +svc_tcp_listen_data_ready(struct sock *sk, int count_unused) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (listen) state change %d\n", + sk, sk->sk_state); + + if (sk->sk_state != TCP_LISTEN) { + /* + * This callback may called twice when a new connection + * is established as a child socket inherits everything + * from a parent LISTEN socket. + * 1) data_ready method of the parent socket will be called + * when one of child sockets become ESTABLISHED. + * 2) data_ready method of the child socket may be called + * when it receives data before the socket is accepted. + * In case of 2, we should ignore it silently. + */ + goto out; + } + if (!(svsk = (struct svc_sock *) sk->sk_user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_all(sk->sk_sleep); +} + +/* + * A state change on a connected socket means it's dying or dead. + */ +static void +svc_tcp_state_change(struct sock *sk) +{ + struct svc_sock *svsk; + + dprintk("svc: socket %p TCP (connected) state change %d (svsk %p)\n", + sk, sk->sk_state, sk->sk_user_data); + + if (!(svsk = (struct svc_sock *) sk->sk_user_data)) { + printk("svc: socket %p: no user data\n", sk); + goto out; + } + set_bit(SK_CLOSE, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible_all(sk->sk_sleep); +} + +static void +svc_tcp_data_ready(struct sock *sk, int count) +{ + struct svc_sock * svsk; + + dprintk("svc: socket %p TCP data ready (svsk %p)\n", + sk, sk->sk_user_data); + if (!(svsk = (struct svc_sock *)(sk->sk_user_data))) + goto out; + set_bit(SK_DATA, &svsk->sk_flags); + svc_sock_enqueue(svsk); + out: + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); +} + +/* + * Accept a TCP connection + */ +static void +svc_tcp_accept(struct svc_sock *svsk) +{ + struct sockaddr_in sin; + struct svc_serv *serv = svsk->sk_server; + struct socket *sock = svsk->sk_sock; + struct socket *newsock; + struct proto_ops *ops; + struct svc_sock *newsvsk; + int err, slen; + + dprintk("svc: tcp_accept %p sock %p\n", svsk, sock); + if (!sock) + return; + + err = sock_create_lite(PF_INET, SOCK_STREAM, IPPROTO_TCP, &newsock); + if (err) { + if (err == -ENOMEM) + printk(KERN_WARNING "%s: no more sockets!\n", + serv->sv_name); + return; + } + + dprintk("svc: tcp_accept %p allocated\n", newsock); + newsock->ops = ops = sock->ops; + + clear_bit(SK_CONN, &svsk->sk_flags); + if ((err = ops->accept(sock, newsock, O_NONBLOCK)) < 0) { + if (err != -EAGAIN && net_ratelimit()) + printk(KERN_WARNING "%s: accept failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + set_bit(SK_CONN, &svsk->sk_flags); + svc_sock_enqueue(svsk); + + slen = sizeof(sin); + err = ops->getname(newsock, (struct sockaddr *) &sin, &slen, 1); + if (err < 0) { + if (net_ratelimit()) + printk(KERN_WARNING "%s: peername failed (err %d)!\n", + serv->sv_name, -err); + goto failed; /* aborted connection or whatever */ + } + + /* Ideally, we would want to reject connections from unauthorized + * hosts here, but when we get encription, the IP of the host won't + * tell us anything. For now just warn about unpriv connections. 
+ */ + if (ntohs(sin.sin_port) >= 1024) { + dprintk(KERN_WARNING + "%s: connect from unprivileged port: %u.%u.%u.%u:%d\n", + serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + } + + dprintk("%s: connect from %u.%u.%u.%u:%04x\n", serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port)); + + /* make sure that a write doesn't block forever when + * low on memory + */ + newsock->sk->sk_sndtimeo = HZ*30; + + if (!(newsvsk = svc_setup_socket(serv, newsock, &err, 0))) + goto failed; + + + /* make sure that we don't have too many active connections. + * If we have, something must be dropped. + * + * There's no point in trying to do random drop here for + * DoS prevention. The NFS clients does 1 reconnect in 15 + * seconds. An attacker can easily beat that. + * + * The only somewhat efficient mechanism would be if drop + * old connections from the same IP first. But right now + * we don't even record the client IP in svc_sock. + */ + if (serv->sv_tmpcnt > (serv->sv_nrthreads+3)*20) { + struct svc_sock *svsk = NULL; + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + if (net_ratelimit()) { + /* Try to help the admin */ + printk(KERN_NOTICE "%s: too many open TCP " + "sockets, consider increasing the " + "number of nfsd threads\n", + serv->sv_name); + printk(KERN_NOTICE "%s: last TCP connect from " + "%u.%u.%u.%u:%d\n", + serv->sv_name, + NIPQUAD(sin.sin_addr.s_addr), + ntohs(sin.sin_port)); + } + /* + * Always select the oldest socket. It's not fair, + * but so is life + */ + svsk = list_entry(serv->sv_tempsocks.prev, + struct svc_sock, + sk_list); + set_bit(SK_CLOSE, &svsk->sk_flags); + svsk->sk_inuse ++; + } + spin_unlock_bh(&serv->sv_lock); + + if (svsk) { + svc_sock_enqueue(svsk); + svc_sock_put(svsk); + } + + } + + if (serv->sv_stats) + serv->sv_stats->nettcpconn++; + + return; + +failed: + sock_release(newsock); + return; +} + +/* + * Receive data from a TCP socket. + */ +static int +svc_tcp_recvfrom(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk = rqstp->rq_sock; + struct svc_serv *serv = svsk->sk_server; + int len; + struct kvec vec[RPCSVC_MAXPAGES]; + int pnum, vlen; + + dprintk("svc: tcp_recv %p data %d conn %d close %d\n", + svsk, test_bit(SK_DATA, &svsk->sk_flags), + test_bit(SK_CONN, &svsk->sk_flags), + test_bit(SK_CLOSE, &svsk->sk_flags)); + + if ((rqstp->rq_deferred = svc_deferred_dequeue(svsk))) { + svc_sock_received(svsk); + return svc_deferred_recv(rqstp); + } + + if (test_bit(SK_CLOSE, &svsk->sk_flags)) { + svc_delete_socket(svsk); + return 0; + } + + if (test_bit(SK_CONN, &svsk->sk_flags)) { + svc_tcp_accept(svsk); + svc_sock_received(svsk); + return 0; + } + + if (test_and_clear_bit(SK_CHNGBUF, &svsk->sk_flags)) + /* sndbuf needs to have room for one request + * per thread, otherwise we can stall even when the + * network isn't a bottleneck. + * rcvbuf just needs to be able to hold a few requests. + * Normally they will be removed from the queue + * as soon a a complete request arrives. + */ + svc_sock_setbufsize(svsk->sk_sock, + (serv->sv_nrthreads+3) * serv->sv_bufsz, + 3 * serv->sv_bufsz); + + clear_bit(SK_DATA, &svsk->sk_flags); + + /* Receive data. If we haven't got the record length yet, get + * the next four bytes. Otherwise try to gobble up as much as + * possible up to the complete record length. 
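+ *
+ * The four length bytes are the RPC record mark: the top bit flags the
+ * last fragment and the low 31 bits give the fragment length.  E.g. a
+ * 1024-byte record arrives as the marker 0x80000400 followed by the
+ * 1024 data bytes; the code below insists on the last-fragment bit
+ * being set (fragmented records are not handled) and then masks it off
+ * to obtain the length.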
+ */ + if (svsk->sk_tcplen < 4) { + unsigned long want = 4 - svsk->sk_tcplen; + struct kvec iov; + + iov.iov_base = ((char *) &svsk->sk_reclen) + svsk->sk_tcplen; + iov.iov_len = want; + if ((len = svc_recvfrom(rqstp, &iov, 1, want)) < 0) + goto error; + svsk->sk_tcplen += len; + + if (len < want) { + dprintk("svc: short recvfrom while reading record length (%d of %lu)\n", + len, want); + svc_sock_received(svsk); + return -EAGAIN; /* record header not complete */ + } + + svsk->sk_reclen = ntohl(svsk->sk_reclen); + if (!(svsk->sk_reclen & 0x80000000)) { + /* FIXME: technically, a record can be fragmented, + * and non-terminal fragments will not have the top + * bit set in the fragment length header. + * But apparently no known nfs clients send fragmented + * records. */ + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (non-terminal)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + svsk->sk_reclen &= 0x7fffffff; + dprintk("svc: TCP record, %d bytes\n", svsk->sk_reclen); + if (svsk->sk_reclen > serv->sv_bufsz) { + printk(KERN_NOTICE "RPC: bad TCP reclen 0x%08lx (large)\n", + (unsigned long) svsk->sk_reclen); + goto err_delete; + } + } + + /* Check whether enough data is available */ + len = svc_recv_available(svsk); + if (len < 0) + goto error; + + if (len < svsk->sk_reclen) { + dprintk("svc: incomplete TCP record (%d of %d)\n", + len, svsk->sk_reclen); + svc_sock_received(svsk); + return -EAGAIN; /* record not complete */ + } + len = svsk->sk_reclen; + set_bit(SK_DATA, &svsk->sk_flags); + + vec[0] = rqstp->rq_arg.head[0]; + vlen = PAGE_SIZE; + pnum = 1; + while (vlen < len) { + vec[pnum].iov_base = page_address(rqstp->rq_argpages[rqstp->rq_argused++]); + vec[pnum].iov_len = PAGE_SIZE; + pnum++; + vlen += PAGE_SIZE; + } + + /* Now receive data */ + len = svc_recvfrom(rqstp, vec, pnum, len); + if (len < 0) + goto error; + + dprintk("svc: TCP complete record (%d bytes)\n", len); + rqstp->rq_arg.len = len; + rqstp->rq_arg.page_base = 0; + if (len <= rqstp->rq_arg.head[0].iov_len) { + rqstp->rq_arg.head[0].iov_len = len; + rqstp->rq_arg.page_len = 0; + } else { + rqstp->rq_arg.page_len = len - rqstp->rq_arg.head[0].iov_len; + } + + rqstp->rq_skbuff = NULL; + rqstp->rq_prot = IPPROTO_TCP; + + /* Reset TCP read info */ + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + svc_sock_received(svsk); + if (serv->sv_stats) + serv->sv_stats->nettcpcnt++; + + return len; + + err_delete: + svc_delete_socket(svsk); + return -EAGAIN; + + error: + if (len == -EAGAIN) { + dprintk("RPC: TCP recvfrom got EAGAIN\n"); + svc_sock_received(svsk); + } else { + printk(KERN_NOTICE "%s: recvfrom returned errno %d\n", + svsk->sk_server->sv_name, -len); + svc_sock_received(svsk); + } + + return len; +} + +/* + * Send out data on TCP socket. + */ +static int +svc_tcp_sendto(struct svc_rqst *rqstp) +{ + struct xdr_buf *xbufp = &rqstp->rq_res; + int sent; + u32 reclen; + + /* Set up the first element of the reply kvec. + * Any other kvecs that may be in use have been taken + * care of by the server implementation itself. 
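+ *
+ * The first four bytes of the head become the TCP record mark for the
+ * reply: the last-fragment bit ORed with the record length excluding
+ * the mark itself.  For a 132-byte rq_res, say, that is
+ * htonl(0x80000000 | 128) == htonl(0x80000080).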
+ */ + reclen = htonl(0x80000000|((xbufp->len ) - 4)); + memcpy(xbufp->head[0].iov_base, &reclen, 4); + + if (test_bit(SK_DEAD, &rqstp->rq_sock->sk_flags)) + return -ENOTCONN; + + sent = svc_sendto(rqstp, &rqstp->rq_res); + if (sent != xbufp->len) { + printk(KERN_NOTICE "rpc-srv/tcp: %s: %s %d when sending %d bytes - shutting down socket\n", + rqstp->rq_sock->sk_server->sv_name, + (sent<0)?"got error":"sent only", + sent, xbufp->len); + svc_delete_socket(rqstp->rq_sock); + sent = -EAGAIN; + } + return sent; +} + +static void +svc_tcp_init(struct svc_sock *svsk) +{ + struct sock *sk = svsk->sk_sk; + struct tcp_sock *tp = tcp_sk(sk); + + svsk->sk_recvfrom = svc_tcp_recvfrom; + svsk->sk_sendto = svc_tcp_sendto; + + if (sk->sk_state == TCP_LISTEN) { + dprintk("setting up TCP socket for listening\n"); + sk->sk_data_ready = svc_tcp_listen_data_ready; + set_bit(SK_CONN, &svsk->sk_flags); + } else { + dprintk("setting up TCP socket for reading\n"); + sk->sk_state_change = svc_tcp_state_change; + sk->sk_data_ready = svc_tcp_data_ready; + sk->sk_write_space = svc_write_space; + + svsk->sk_reclen = 0; + svsk->sk_tcplen = 0; + + tp->nonagle = 1; /* disable Nagle's algorithm */ + + /* initialise setting must have enough space to + * receive and respond to one request. + * svc_tcp_recvfrom will re-adjust if necessary + */ + svc_sock_setbufsize(svsk->sk_sock, + 3 * svsk->sk_server->sv_bufsz, + 3 * svsk->sk_server->sv_bufsz); + + set_bit(SK_CHNGBUF, &svsk->sk_flags); + set_bit(SK_DATA, &svsk->sk_flags); + if (sk->sk_state != TCP_ESTABLISHED) + set_bit(SK_CLOSE, &svsk->sk_flags); + } +} + +void +svc_sock_update_bufs(struct svc_serv *serv) +{ + /* + * The number of server threads has changed. Update + * rcvbuf and sndbuf accordingly on all sockets + */ + struct list_head *le; + + spin_lock_bh(&serv->sv_lock); + list_for_each(le, &serv->sv_permsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + list_for_each(le, &serv->sv_tempsocks) { + struct svc_sock *svsk = + list_entry(le, struct svc_sock, sk_list); + set_bit(SK_CHNGBUF, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); +} + +/* + * Receive the next request on any socket. + */ +int +svc_recv(struct svc_serv *serv, struct svc_rqst *rqstp, long timeout) +{ + struct svc_sock *svsk =NULL; + int len; + int pages; + struct xdr_buf *arg; + DECLARE_WAITQUEUE(wait, current); + + dprintk("svc: server %p waiting for data (to = %ld)\n", + rqstp, timeout); + + if (rqstp->rq_sock) + printk(KERN_ERR + "svc_recv: service %p, socket not NULL!\n", + rqstp); + if (waitqueue_active(&rqstp->rq_wait)) + printk(KERN_ERR + "svc_recv: service %p, wait queue active!\n", + rqstp); + + /* Initialize the buffers */ + /* first reclaim pages that were moved to response list */ + svc_pushback_allpages(rqstp); + + /* now allocate needed pages. 
If we get a failure, sleep briefly */ + pages = 2 + (serv->sv_bufsz + PAGE_SIZE -1) / PAGE_SIZE; + while (rqstp->rq_arghi < pages) { + struct page *p = alloc_page(GFP_KERNEL); + if (!p) { + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ/2); + continue; + } + rqstp->rq_argpages[rqstp->rq_arghi++] = p; + } + + /* Make arg->head point to first page and arg->pages point to rest */ + arg = &rqstp->rq_arg; + arg->head[0].iov_base = page_address(rqstp->rq_argpages[0]); + arg->head[0].iov_len = PAGE_SIZE; + rqstp->rq_argused = 1; + arg->pages = rqstp->rq_argpages + 1; + arg->page_base = 0; + /* save at least one page for response */ + arg->page_len = (pages-2)*PAGE_SIZE; + arg->len = (pages-1)*PAGE_SIZE; + arg->tail[0].iov_len = 0; + + try_to_freeze(PF_FREEZE); + if (signalled()) + return -EINTR; + + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&serv->sv_tempsocks)) { + svsk = list_entry(serv->sv_tempsocks.next, + struct svc_sock, sk_list); + /* apparently the "standard" is that clients close + * idle connections after 5 minutes, servers after + * 6 minutes + * http://www.connectathon.org/talks96/nfstcp.pdf + */ + if (get_seconds() - svsk->sk_lastrecv < 6*60 + || test_bit(SK_BUSY, &svsk->sk_flags)) + svsk = NULL; + } + if (svsk) { + set_bit(SK_BUSY, &svsk->sk_flags); + set_bit(SK_CLOSE, &svsk->sk_flags); + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + } else if ((svsk = svc_sock_dequeue(serv)) != NULL) { + rqstp->rq_sock = svsk; + svsk->sk_inuse++; + rqstp->rq_reserved = serv->sv_bufsz; + svsk->sk_reserved += rqstp->rq_reserved; + } else { + /* No data pending. Go to sleep */ + svc_serv_enqueue(serv, rqstp); + + /* + * We have to be able to interrupt this wait + * to bring down the daemons ... + */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&rqstp->rq_wait, &wait); + spin_unlock_bh(&serv->sv_lock); + + schedule_timeout(timeout); + + try_to_freeze(PF_FREEZE); + + spin_lock_bh(&serv->sv_lock); + remove_wait_queue(&rqstp->rq_wait, &wait); + + if (!(svsk = rqstp->rq_sock)) { + svc_serv_dequeue(serv, rqstp); + spin_unlock_bh(&serv->sv_lock); + dprintk("svc: server %p, no data yet\n", rqstp); + return signalled()? -EINTR : -EAGAIN; + } + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: server %p, socket %p, inuse=%d\n", + rqstp, svsk, svsk->sk_inuse); + len = svsk->sk_recvfrom(rqstp); + dprintk("svc: got len=%d\n", len); + + /* No data, incomplete (TCP) read, or accept() */ + if (len == 0 || len == -EAGAIN) { + rqstp->rq_res.len = 0; + svc_sock_release(rqstp); + return -EAGAIN; + } + svsk->sk_lastrecv = get_seconds(); + if (test_bit(SK_TEMP, &svsk->sk_flags)) { + /* push active sockets to end of list */ + spin_lock_bh(&serv->sv_lock); + if (!list_empty(&svsk->sk_list)) + list_move_tail(&svsk->sk_list, &serv->sv_tempsocks); + spin_unlock_bh(&serv->sv_lock); + } + + rqstp->rq_secure = ntohs(rqstp->rq_addr.sin_port) < 1024; + rqstp->rq_chandle.defer = svc_defer; + + if (serv->sv_stats) + serv->sv_stats->netcnt++; + return len; +} + +/* + * Drop request + */ +void +svc_drop(struct svc_rqst *rqstp) +{ + dprintk("svc: socket %p dropped request\n", rqstp->rq_sock); + svc_sock_release(rqstp); +} + +/* + * Return reply to client. 
+ */ +int +svc_send(struct svc_rqst *rqstp) +{ + struct svc_sock *svsk; + int len; + struct xdr_buf *xb; + + if ((svsk = rqstp->rq_sock) == NULL) { + printk(KERN_WARNING "NULL socket pointer in %s:%d\n", + __FILE__, __LINE__); + return -EFAULT; + } + + /* release the receive skb before sending the reply */ + svc_release_skb(rqstp); + + /* calculate over-all length */ + xb = & rqstp->rq_res; + xb->len = xb->head[0].iov_len + + xb->page_len + + xb->tail[0].iov_len; + + /* Grab svsk->sk_sem to serialize outgoing data. */ + down(&svsk->sk_sem); + if (test_bit(SK_DEAD, &svsk->sk_flags)) + len = -ENOTCONN; + else + len = svsk->sk_sendto(rqstp); + up(&svsk->sk_sem); + svc_sock_release(rqstp); + + if (len == -ECONNREFUSED || len == -ENOTCONN || len == -EAGAIN) + return 0; + return len; +} + +/* + * Initialize socket for RPC use and create svc_sock struct + * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF. + */ +static struct svc_sock * +svc_setup_socket(struct svc_serv *serv, struct socket *sock, + int *errp, int pmap_register) +{ + struct svc_sock *svsk; + struct sock *inet; + + dprintk("svc: svc_setup_socket %p\n", sock); + if (!(svsk = kmalloc(sizeof(*svsk), GFP_KERNEL))) { + *errp = -ENOMEM; + return NULL; + } + memset(svsk, 0, sizeof(*svsk)); + + inet = sock->sk; + + /* Register socket with portmapper */ + if (*errp >= 0 && pmap_register) + *errp = svc_register(serv, inet->sk_protocol, + ntohs(inet_sk(inet)->sport)); + + if (*errp < 0) { + kfree(svsk); + return NULL; + } + + set_bit(SK_BUSY, &svsk->sk_flags); + inet->sk_user_data = svsk; + svsk->sk_sock = sock; + svsk->sk_sk = inet; + svsk->sk_ostate = inet->sk_state_change; + svsk->sk_odata = inet->sk_data_ready; + svsk->sk_owspace = inet->sk_write_space; + svsk->sk_server = serv; + svsk->sk_lastrecv = get_seconds(); + INIT_LIST_HEAD(&svsk->sk_deferred); + INIT_LIST_HEAD(&svsk->sk_ready); + sema_init(&svsk->sk_sem, 1); + + /* Initialize the socket */ + if (sock->type == SOCK_DGRAM) + svc_udp_init(svsk); + else + svc_tcp_init(svsk); + + spin_lock_bh(&serv->sv_lock); + if (!pmap_register) { + set_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_tempsocks); + serv->sv_tmpcnt++; + } else { + clear_bit(SK_TEMP, &svsk->sk_flags); + list_add(&svsk->sk_list, &serv->sv_permsocks); + } + spin_unlock_bh(&serv->sv_lock); + + dprintk("svc: svc_setup_socket created %p (inet %p)\n", + svsk, svsk->sk_sk); + + clear_bit(SK_BUSY, &svsk->sk_flags); + svc_sock_enqueue(svsk); + return svsk; +} + +/* + * Create socket for RPC service. + */ +static int +svc_create_socket(struct svc_serv *serv, int protocol, struct sockaddr_in *sin) +{ + struct svc_sock *svsk; + struct socket *sock; + int error; + int type; + + dprintk("svc: svc_create_socket(%s, %d, %u.%u.%u.%u:%d)\n", + serv->sv_program->pg_name, protocol, + NIPQUAD(sin->sin_addr.s_addr), + ntohs(sin->sin_port)); + + if (protocol != IPPROTO_UDP && protocol != IPPROTO_TCP) { + printk(KERN_WARNING "svc: only UDP and TCP " + "sockets supported\n"); + return -EINVAL; + } + type = (protocol == IPPROTO_UDP)? 
SOCK_DGRAM : SOCK_STREAM; + + if ((error = sock_create_kern(PF_INET, type, protocol, &sock)) < 0) + return error; + + if (sin != NULL) { + if (type == SOCK_STREAM) + sock->sk->sk_reuse = 1; /* allow address reuse */ + error = sock->ops->bind(sock, (struct sockaddr *) sin, + sizeof(*sin)); + if (error < 0) + goto bummer; + } + + if (protocol == IPPROTO_TCP) { + if ((error = sock->ops->listen(sock, 64)) < 0) + goto bummer; + } + + if ((svsk = svc_setup_socket(serv, sock, &error, 1)) != NULL) + return 0; + +bummer: + dprintk("svc: svc_create_socket error = %d\n", -error); + sock_release(sock); + return error; +} + +/* + * Remove a dead socket + */ +void +svc_delete_socket(struct svc_sock *svsk) +{ + struct svc_serv *serv; + struct sock *sk; + + dprintk("svc: svc_delete_socket(%p)\n", svsk); + + serv = svsk->sk_server; + sk = svsk->sk_sk; + + sk->sk_state_change = svsk->sk_ostate; + sk->sk_data_ready = svsk->sk_odata; + sk->sk_write_space = svsk->sk_owspace; + + spin_lock_bh(&serv->sv_lock); + + list_del_init(&svsk->sk_list); + list_del_init(&svsk->sk_ready); + if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags)) + if (test_bit(SK_TEMP, &svsk->sk_flags)) + serv->sv_tmpcnt--; + + if (!svsk->sk_inuse) { + spin_unlock_bh(&serv->sv_lock); + sock_release(svsk->sk_sock); + kfree(svsk); + } else { + spin_unlock_bh(&serv->sv_lock); + dprintk(KERN_NOTICE "svc: server socket destroy delayed\n"); + /* svsk->sk_server = NULL; */ + } +} + +/* + * Make a socket for nfsd and lockd + */ +int +svc_makesock(struct svc_serv *serv, int protocol, unsigned short port) +{ + struct sockaddr_in sin; + + dprintk("svc: creating socket proto = %d\n", protocol); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = INADDR_ANY; + sin.sin_port = htons(port); + return svc_create_socket(serv, protocol, &sin); +} + +/* + * Handle defer and revisit of requests + */ + +static void svc_revisit(struct cache_deferred_req *dreq, int too_many) +{ + struct svc_deferred_req *dr = container_of(dreq, struct svc_deferred_req, handle); + struct svc_serv *serv = dreq->owner; + struct svc_sock *svsk; + + if (too_many) { + svc_sock_put(dr->svsk); + kfree(dr); + return; + } + dprintk("revisit queued\n"); + svsk = dr->svsk; + dr->svsk = NULL; + spin_lock_bh(&serv->sv_lock); + list_add(&dr->handle.recent, &svsk->sk_deferred); + spin_unlock_bh(&serv->sv_lock); + set_bit(SK_DEFERRED, &svsk->sk_flags); + svc_sock_enqueue(svsk); + svc_sock_put(svsk); +} + +static struct cache_deferred_req * +svc_defer(struct cache_req *req) +{ + struct svc_rqst *rqstp = container_of(req, struct svc_rqst, rq_chandle); + int size = sizeof(struct svc_deferred_req) + (rqstp->rq_arg.len); + struct svc_deferred_req *dr; + + if (rqstp->rq_arg.page_len) + return NULL; /* if more than a page, give up FIXME */ + if (rqstp->rq_deferred) { + dr = rqstp->rq_deferred; + rqstp->rq_deferred = NULL; + } else { + int skip = rqstp->rq_arg.len - rqstp->rq_arg.head[0].iov_len; + /* FIXME maybe discard if size too large */ + dr = kmalloc(size, GFP_KERNEL); + if (dr == NULL) + return NULL; + + dr->handle.owner = rqstp->rq_server; + dr->prot = rqstp->rq_prot; + dr->addr = rqstp->rq_addr; + dr->argslen = rqstp->rq_arg.len >> 2; + memcpy(dr->args, rqstp->rq_arg.head[0].iov_base-skip, dr->argslen<<2); + } + spin_lock_bh(&rqstp->rq_server->sv_lock); + rqstp->rq_sock->sk_inuse++; + dr->svsk = rqstp->rq_sock; + spin_unlock_bh(&rqstp->rq_server->sv_lock); + + dr->handle.revisit = svc_revisit; + return &dr->handle; +} + +/* + * recv data from a deferred request into an active one + */ +static int 
svc_deferred_recv(struct svc_rqst *rqstp) +{ + struct svc_deferred_req *dr = rqstp->rq_deferred; + + rqstp->rq_arg.head[0].iov_base = dr->args; + rqstp->rq_arg.head[0].iov_len = dr->argslen<<2; + rqstp->rq_arg.page_len = 0; + rqstp->rq_arg.len = dr->argslen<<2; + rqstp->rq_prot = dr->prot; + rqstp->rq_addr = dr->addr; + return dr->argslen<<2; +} + + +static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk) +{ + struct svc_deferred_req *dr = NULL; + struct svc_serv *serv = svsk->sk_server; + + if (!test_bit(SK_DEFERRED, &svsk->sk_flags)) + return NULL; + spin_lock_bh(&serv->sv_lock); + clear_bit(SK_DEFERRED, &svsk->sk_flags); + if (!list_empty(&svsk->sk_deferred)) { + dr = list_entry(svsk->sk_deferred.next, + struct svc_deferred_req, + handle.recent); + list_del_init(&dr->handle.recent); + set_bit(SK_DEFERRED, &svsk->sk_flags); + } + spin_unlock_bh(&serv->sv_lock); + return dr; +} diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c new file mode 100644 index 000000000000..1b9616a12e24 --- /dev/null +++ b/net/sunrpc/sysctl.c @@ -0,0 +1,193 @@ +/* + * linux/net/sunrpc/sysctl.c + * + * Sysctl interface to sunrpc module. + * + * I would prefer to register the sunrpc table below sys/net, but that's + * impossible at the moment. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* + * Declare the debug flags here + */ +unsigned int rpc_debug; +unsigned int nfs_debug; +unsigned int nfsd_debug; +unsigned int nlm_debug; + +#ifdef RPC_DEBUG + +static struct ctl_table_header *sunrpc_table_header; +static ctl_table sunrpc_table[]; + +void +rpc_register_sysctl(void) +{ + if (!sunrpc_table_header) { + sunrpc_table_header = register_sysctl_table(sunrpc_table, 1); +#ifdef CONFIG_PROC_FS + if (sunrpc_table[0].de) + sunrpc_table[0].de->owner = THIS_MODULE; +#endif + } + +} + +void +rpc_unregister_sysctl(void) +{ + if (sunrpc_table_header) { + unregister_sysctl_table(sunrpc_table_header); + sunrpc_table_header = NULL; + } +} + +static int +proc_dodebug(ctl_table *table, int write, struct file *file, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char tmpbuf[20], c, *s; + char __user *p; + unsigned int value; + size_t left, len; + + if ((*ppos && !write) || !*lenp) { + *lenp = 0; + return 0; + } + + left = *lenp; + + if (write) { + if (!access_ok(VERIFY_READ, buffer, left)) + return -EFAULT; + p = buffer; + while (left && __get_user(c, p) >= 0 && isspace(c)) + left--, p++; + if (!left) + goto done; + + if (left > sizeof(tmpbuf) - 1) + return -EINVAL; + if (copy_from_user(tmpbuf, p, left)) + return -EFAULT; + tmpbuf[left] = '\0'; + + for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--) + value = 10 * value + (*s - '0'); + if (*s && !isspace(*s)) + return -EINVAL; + while (left && isspace(*s)) + left--, s++; + *(unsigned int *) table->data = value; + /* Display the RPC tasks on writing to rpc_debug */ + if (table->ctl_name == CTL_RPCDEBUG) { + rpc_show_tasks(); + } + } else { + if (!access_ok(VERIFY_WRITE, buffer, left)) + return -EFAULT; + len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data); + if (len > left) + len = left; + if (__copy_to_user(buffer, tmpbuf, len)) + return -EFAULT; + if ((left -= len) > 0) { + if (put_user('\n', (char __user *)buffer + len)) + return -EFAULT; + left--; + } + } + +done: + *lenp -= left; + *ppos += *lenp; + return 0; +} + +static unsigned int min_slot_table_size = RPC_MIN_SLOT_TABLE; +static unsigned int max_slot_table_size = RPC_MAX_SLOT_TABLE; + 
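+/*
+ * The tables below surface the debug flags as /proc/sys/sunrpc/<name>
+ * files handled by proc_dodebug() above.  A user-space sketch of how
+ * such a flag might be set (illustrative only; the path follows from
+ * the "sunrpc" parent entry and the "rpc_debug" child below):
+ *
+ *	#include <stdio.h>
+ *
+ *	int main(void)
+ *	{
+ *		FILE *f = fopen("/proc/sys/sunrpc/rpc_debug", "w");
+ *		if (!f)
+ *			return 1;
+ *		fprintf(f, "0\n");
+ *		return fclose(f) ? 1 : 0;
+ *	}
+ *
+ * proc_dodebug() parses the decimal value written and, for rpc_debug,
+ * also dumps the pending RPC tasks via rpc_show_tasks().
+ */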
+static ctl_table debug_table[] = { + { + .ctl_name = CTL_RPCDEBUG, + .procname = "rpc_debug", + .data = &rpc_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_NFSDEBUG, + .procname = "nfs_debug", + .data = &nfs_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_NFSDDEBUG, + .procname = "nfsd_debug", + .data = &nfsd_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_NLMDEBUG, + .procname = "nlm_debug", + .data = &nlm_debug, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, + { + .ctl_name = CTL_SLOTTABLE_UDP, + .procname = "udp_slot_table_entries", + .data = &xprt_udp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { + .ctl_name = CTL_SLOTTABLE_TCP, + .procname = "tcp_slot_table_entries", + .data = &xprt_tcp_slot_table_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_slot_table_size, + .extra2 = &max_slot_table_size + }, + { .ctl_name = 0 } +}; + +static ctl_table sunrpc_table[] = { + { + .ctl_name = CTL_SUNRPC, + .procname = "sunrpc", + .mode = 0555, + .child = debug_table + }, + { .ctl_name = 0 } +}; + +#endif diff --git a/net/sunrpc/timer.c b/net/sunrpc/timer.c new file mode 100644 index 000000000000..bcbdf6430d5c --- /dev/null +++ b/net/sunrpc/timer.c @@ -0,0 +1,107 @@ +/* + * linux/net/sunrpc/timer.c + * + * Estimate RPC request round trip time. + * + * Based on packet round-trip and variance estimator algorithms described + * in appendix A of "Congestion Avoidance and Control" by Van Jacobson + * and Michael J. Karels (ACM Computer Communication Review; Proceedings + * of the Sigcomm '88 Symposium in Stanford, CA, August, 1988). + * + * This RTT estimator is used only for RPC over datagram protocols. + * + * Copyright (C) 2002 Trond Myklebust + */ + +#include + +#include +#include + +#include +#include +#include + +#define RPC_RTO_MAX (60*HZ) +#define RPC_RTO_INIT (HZ/5) +#define RPC_RTO_MIN (HZ/10) + +void +rpc_init_rtt(struct rpc_rtt *rt, unsigned long timeo) +{ + unsigned long init = 0; + unsigned i; + + rt->timeo = timeo; + + if (timeo > RPC_RTO_INIT) + init = (timeo - RPC_RTO_INIT) << 3; + for (i = 0; i < 5; i++) { + rt->srtt[i] = init; + rt->sdrtt[i] = RPC_RTO_INIT; + rt->ntimeouts[i] = 0; + } +} + +/* + * NB: When computing the smoothed RTT and standard deviation, + * be careful not to produce negative intermediate results. + */ +void +rpc_update_rtt(struct rpc_rtt *rt, unsigned timer, long m) +{ + long *srtt, *sdrtt; + + if (timer-- == 0) + return; + + /* jiffies wrapped; ignore this one */ + if (m < 0) + return; + + if (m == 0) + m = 1L; + + srtt = (long *)&rt->srtt[timer]; + m -= *srtt >> 3; + *srtt += m; + + if (m < 0) + m = -m; + + sdrtt = (long *)&rt->sdrtt[timer]; + m -= *sdrtt >> 2; + *sdrtt += m; + + /* Set lower bound on the variance */ + if (*sdrtt < RPC_RTO_MIN) + *sdrtt = RPC_RTO_MIN; +} + +/* + * Estimate rto for an nfs rpc sent via. an unreliable datagram. + * Use the mean and mean deviation of rtt for the appropriate type of rpc + * for the frequent rpcs and a default for the others. + * The justification for doing "other" this way is that these rpcs + * happen so infrequently that timer est. 
would probably be stale. + * Also, since many of these rpcs are + * non-idempotent, a conservative timeout is desired. + * getattr, lookup, + * read, write, commit - A+4D + * other - timeo + */ + +unsigned long +rpc_calc_rto(struct rpc_rtt *rt, unsigned timer) +{ + unsigned long res; + + if (timer-- == 0) + return rt->timeo; + + res = ((rt->srtt[timer] + 7) >> 3) + rt->sdrtt[timer]; + if (res > RPC_RTO_MAX) + res = RPC_RTO_MAX; + + return res; +} diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c new file mode 100644 index 000000000000..4484931018eb --- /dev/null +++ b/net/sunrpc/xdr.c @@ -0,0 +1,917 @@ +/* + * linux/net/sunrpc/xdr.c + * + * Generic XDR support. + * + * Copyright (C) 1995, 1996 Olaf Kirch + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * XDR functions for basic NFS types + */ +u32 * +xdr_encode_netobj(u32 *p, const struct xdr_netobj *obj) +{ + unsigned int quadlen = XDR_QUADLEN(obj->len); + + p[quadlen] = 0; /* zero trailing bytes */ + *p++ = htonl(obj->len); + memcpy(p, obj->data, obj->len); + return p + XDR_QUADLEN(obj->len); +} + +u32 * +xdr_decode_netobj(u32 *p, struct xdr_netobj *obj) +{ + unsigned int len; + + if ((len = ntohl(*p++)) > XDR_MAX_NETOBJ) + return NULL; + obj->len = len; + obj->data = (u8 *) p; + return p + XDR_QUADLEN(len); +} + +/** + * xdr_encode_opaque_fixed - Encode fixed length opaque data + * @p - pointer to current position in XDR buffer. + * @ptr - pointer to data to encode (or NULL) + * @nbytes - size of data. + * + * Copy the array of data of length nbytes at ptr to the XDR buffer + * at position p, then align to the next 32-bit boundary by padding + * with zero bytes (see RFC1832). + * Note: if ptr is NULL, only the padding is performed. + * + * Returns the updated current XDR buffer position + * + */ +u32 *xdr_encode_opaque_fixed(u32 *p, const void *ptr, unsigned int nbytes) +{ + if (likely(nbytes != 0)) { + unsigned int quadlen = XDR_QUADLEN(nbytes); + unsigned int padding = (quadlen << 2) - nbytes; + + if (ptr != NULL) + memcpy(p, ptr, nbytes); + if (padding != 0) + memset((char *)p + nbytes, 0, padding); + p += quadlen; + } + return p; +} +EXPORT_SYMBOL(xdr_encode_opaque_fixed); + +/** + * xdr_encode_opaque - Encode variable length opaque data + * @p - pointer to current position in XDR buffer. + * @ptr - pointer to data to encode (or NULL) + * @nbytes - size of data. 
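Stepping back to timer.c for a moment: rpc_update_rtt() keeps the smoothed RTT scaled by 8 and the mean deviation scaled by 4, and rpc_calc_rto() combines them as srtt/8 + sdrtt, i.e. the A + 4D rule mentioned above. A userspace sketch of that arithmetic (illustrative only; the per-timer arrays and the HZ-based RPC_RTO_MIN/RPC_RTO_MAX clamps are omitted, and negative samples are clamped rather than ignored):

/* Sketch of the Jacobson/Karels update used by rpc_update_rtt() and the
 * RTO computed by rpc_calc_rto(). 'm' is a new RTT sample; srtt is kept
 * scaled by 8, sdrtt (mean deviation) by 4. */
#include <stdio.h>

struct rtt_est {
        long srtt;      /* smoothed RTT << 3 */
        long sdrtt;     /* smoothed mean deviation << 2 */
};

static void rtt_update(struct rtt_est *rt, long m)
{
        if (m <= 0)
                m = 1;          /* the kernel skips negative samples */
        m -= rt->srtt >> 3;
        rt->srtt += m;          /* srtt += (sample - srtt/8) */
        if (m < 0)
                m = -m;
        m -= rt->sdrtt >> 2;
        rt->sdrtt += m;         /* sdrtt += (|err| - sdrtt/4) */
}

static long rtt_rto(const struct rtt_est *rt)
{
        return ((rt->srtt + 7) >> 3) + rt->sdrtt;  /* ~ A + 4D */
}

int main(void)
{
        struct rtt_est rt = { .srtt = 0, .sdrtt = 25 };  /* arbitrary start */
        long samples[] = { 100, 110, 90, 400, 105 };

        for (unsigned i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
                rtt_update(&rt, samples[i]);
                printf("sample=%ld rto=%ld\n", samples[i], rtt_rto(&rt));
        }
        return 0;
}

The outlier sample inflates the deviation term quickly, which is exactly why the conservative A + 4D bound is used for the retransmit timer.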
+ * + * Returns the updated current XDR buffer position + */ +u32 *xdr_encode_opaque(u32 *p, const void *ptr, unsigned int nbytes) +{ + *p++ = htonl(nbytes); + return xdr_encode_opaque_fixed(p, ptr, nbytes); +} +EXPORT_SYMBOL(xdr_encode_opaque); + +u32 * +xdr_encode_string(u32 *p, const char *string) +{ + return xdr_encode_array(p, string, strlen(string)); +} + +u32 * +xdr_decode_string(u32 *p, char **sp, int *lenp, int maxlen) +{ + unsigned int len; + char *string; + + if ((len = ntohl(*p++)) > maxlen) + return NULL; + if (lenp) + *lenp = len; + if ((len % 4) != 0) { + string = (char *) p; + } else { + string = (char *) (p - 1); + memmove(string, p, len); + } + string[len] = '\0'; + *sp = string; + return p + XDR_QUADLEN(len); +} + +u32 * +xdr_decode_string_inplace(u32 *p, char **sp, int *lenp, int maxlen) +{ + unsigned int len; + + if ((len = ntohl(*p++)) > maxlen) + return NULL; + *lenp = len; + *sp = (char *) p; + return p + XDR_QUADLEN(len); +} + +void +xdr_encode_pages(struct xdr_buf *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct kvec *tail = xdr->tail; + u32 *p; + + xdr->pages = pages; + xdr->page_base = base; + xdr->page_len = len; + + p = (u32 *)xdr->head[0].iov_base + XDR_QUADLEN(xdr->head[0].iov_len); + tail->iov_base = p; + tail->iov_len = 0; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + *p = 0; + tail->iov_base = (char *)p + (len & 3); + tail->iov_len = pad; + len += pad; + } + xdr->buflen += len; + xdr->len += len; +} + +void +xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, + struct page **pages, unsigned int base, unsigned int len) +{ + struct kvec *head = xdr->head; + struct kvec *tail = xdr->tail; + char *buf = (char *)head->iov_base; + unsigned int buflen = head->iov_len; + + head->iov_len = offset; + + xdr->pages = pages; + xdr->page_base = base; + xdr->page_len = len; + + tail->iov_base = buf + offset; + tail->iov_len = buflen - offset; + + xdr->buflen += len; +} + +void +xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, + skb_reader_t *desc, + skb_read_actor_t copy_actor) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int ret; + + len = xdr->head[0].iov_len; + if (base < len) { + len -= base; + ret = copy_actor(desc, (char *)xdr->head[0].iov_base + base, len); + if (ret != len || !desc->count) + return; + base = 0; + } else + base -= len; + + if (pglen == 0) + goto copy_tail; + if (base >= pglen) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + do { + char *kaddr; + + len = PAGE_CACHE_SIZE; + kaddr = kmap_atomic(*ppage, KM_SKB_SUNRPC_DATA); + if (base) { + len -= base; + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr + base, len); + base = 0; + } else { + if (pglen < len) + len = pglen; + ret = copy_actor(desc, kaddr, len); + } + flush_dcache_page(*ppage); + kunmap_atomic(kaddr, KM_SKB_SUNRPC_DATA); + if (ret != len || !desc->count) + return; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) + copy_actor(desc, (char *)xdr->tail[0].iov_base + base, len - base); +} + + +int +xdr_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, + struct xdr_buf *xdr, unsigned int base, int msgflags) +{ + struct page **ppage = xdr->pages; + unsigned int len, pglen = xdr->page_len; + int err, ret = 0; + ssize_t (*sendpage)(struct socket *, struct page *, int, 
size_t, int); + + len = xdr->head[0].iov_len; + if (base < len || (addr != NULL && base == 0)) { + struct kvec iov = { + .iov_base = xdr->head[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_name = addr, + .msg_namelen = addrlen, + .msg_flags = msgflags, + }; + if (xdr->len > len) + msg.msg_flags |= MSG_MORE; + + if (iov.iov_len != 0) + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + else + err = kernel_sendmsg(sock, &msg, NULL, 0, 0); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != iov.iov_len) + goto out; + base = 0; + } else + base -= len; + + if (pglen == 0) + goto copy_tail; + if (base >= pglen) { + base -= pglen; + goto copy_tail; + } + if (base || xdr->page_base) { + pglen -= base; + base += xdr->page_base; + ppage += base >> PAGE_CACHE_SHIFT; + base &= ~PAGE_CACHE_MASK; + } + + sendpage = sock->ops->sendpage ? : sock_no_sendpage; + do { + int flags = msgflags; + + len = PAGE_CACHE_SIZE; + if (base) + len -= base; + if (pglen < len) + len = pglen; + + if (pglen != len || xdr->tail[0].iov_len != 0) + flags |= MSG_MORE; + + /* Hmm... We might be dealing with highmem pages */ + if (PageHighMem(*ppage)) + sendpage = sock_no_sendpage; + err = sendpage(sock, *ppage, base, len, flags); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + if (err != len) + goto out; + base = 0; + ppage++; + } while ((pglen -= len) != 0); +copy_tail: + len = xdr->tail[0].iov_len; + if (base < len) { + struct kvec iov = { + .iov_base = xdr->tail[0].iov_base + base, + .iov_len = len - base, + }; + struct msghdr msg = { + .msg_flags = msgflags, + }; + err = kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + if (ret == 0) + ret = err; + else if (err > 0) + ret += err; + } +out: + return ret; +} + + +/* + * Helper routines for doing 'memmove' like operations on a struct xdr_buf + * + * _shift_data_right_pages + * @pages: vector of pages containing both the source and dest memory area. + * @pgto_base: page vector address of destination + * @pgfrom_base: page vector address of source + * @len: number of bytes to copy + * + * Note: the addresses pgto_base and pgfrom_base are both calculated in + * the same way: + * if a memory area starts at byte 'base' in page 'pages[i]', + * then its address is given as (i << PAGE_CACHE_SHIFT) + base + * Also note: pgfrom_base must be < pgto_base, but the memory areas + * they point to may overlap. + */ +static void +_shift_data_right_pages(struct page **pages, size_t pgto_base, + size_t pgfrom_base, size_t len) +{ + struct page **pgfrom, **pgto; + char *vfrom, *vto; + size_t copy; + + BUG_ON(pgto_base <= pgfrom_base); + + pgto_base += len; + pgfrom_base += len; + + pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT); + pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT); + + pgto_base &= ~PAGE_CACHE_MASK; + pgfrom_base &= ~PAGE_CACHE_MASK; + + do { + /* Are any pointers crossing a page boundary? 
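The encoders at the top of xdr.c all work in 4-byte XDR quad units: a variable-length opaque is a 32-bit length followed by the bytes, zero-padded to the next quad boundary. A flat-buffer userspace sketch of the xdr_encode_opaque()/xdr_decode_netobj() shape shown earlier (illustrative names; no bounds checking, as in the originals):

/* Sketch: XDR variable-length opaque = 32-bit length + data, zero-padded
 * to a 4-byte boundary. Mirrors xdr_encode_opaque() above, including the
 * trick of zeroing the final quad before copying the payload over it. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define QUADLEN(n)      (((n) + 3) >> 2)

static uint32_t *encode_opaque(uint32_t *p, const void *data, uint32_t nbytes)
{
        uint32_t quads = QUADLEN(nbytes);

        p[quads] = 0;                   /* zero the trailing pad bytes */
        *p++ = htonl(nbytes);
        memcpy(p, data, nbytes);
        return p + quads;
}

static uint32_t *decode_opaque(uint32_t *p, const void **data, uint32_t *nbytes)
{
        *nbytes = ntohl(*p++);
        *data = p;
        return p + QUADLEN(*nbytes);
}

int main(void)
{
        uint32_t buf[8], len;
        const void *out;
        uint32_t *end = encode_opaque(buf, "hello", 5);

        decode_opaque(buf, &out, &len);
        printf("%zu bytes on the wire, payload len %u\n",
               (size_t)((char *)end - (char *)buf), len);  /* 12, 5 */
        return 0;
}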
*/ + if (pgto_base == 0) { + flush_dcache_page(*pgto); + pgto_base = PAGE_CACHE_SIZE; + pgto--; + } + if (pgfrom_base == 0) { + pgfrom_base = PAGE_CACHE_SIZE; + pgfrom--; + } + + copy = len; + if (copy > pgto_base) + copy = pgto_base; + if (copy > pgfrom_base) + copy = pgfrom_base; + pgto_base -= copy; + pgfrom_base -= copy; + + vto = kmap_atomic(*pgto, KM_USER0); + vfrom = kmap_atomic(*pgfrom, KM_USER1); + memmove(vto + pgto_base, vfrom + pgfrom_base, copy); + kunmap_atomic(vfrom, KM_USER1); + kunmap_atomic(vto, KM_USER0); + + } while ((len -= copy) != 0); + flush_dcache_page(*pgto); +} + +/* + * _copy_to_pages + * @pages: array of pages + * @pgbase: page vector address of destination + * @p: pointer to source data + * @len: length + * + * Copies data from an arbitrary memory location into an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len) +{ + struct page **pgto; + char *vto; + size_t copy; + + pgto = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vto = kmap_atomic(*pgto, KM_USER0); + memcpy(vto + pgbase, p, copy); + kunmap_atomic(vto, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + flush_dcache_page(*pgto); + pgbase = 0; + pgto++; + } + p += copy; + + } while ((len -= copy) != 0); + flush_dcache_page(*pgto); +} + +/* + * _copy_from_pages + * @p: pointer to destination + * @pages: array of pages + * @pgbase: offset of source data + * @len: length + * + * Copies data into an arbitrary memory location from an array of pages + * The copy is assumed to be non-overlapping. + */ +static void +_copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len) +{ + struct page **pgfrom; + char *vfrom; + size_t copy; + + pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT); + pgbase &= ~PAGE_CACHE_MASK; + + do { + copy = PAGE_CACHE_SIZE - pgbase; + if (copy > len) + copy = len; + + vfrom = kmap_atomic(*pgfrom, KM_USER0); + memcpy(p, vfrom + pgbase, copy); + kunmap_atomic(vfrom, KM_USER0); + + pgbase += copy; + if (pgbase == PAGE_CACHE_SIZE) { + pgbase = 0; + pgfrom++; + } + p += copy; + + } while ((len -= copy) != 0); +} + +/* + * xdr_shrink_bufhead + * @buf: xdr_buf + * @len: bytes to remove from buf->head[0] + * + * Shrinks XDR buffer's header kvec buf->head[0] by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the inlined pages and/or the tail. + */ +static void +xdr_shrink_bufhead(struct xdr_buf *buf, size_t len) +{ + struct kvec *head, *tail; + size_t copy, offs; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + head = buf->head; + BUG_ON (len > head->iov_len); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove((char *)tail->iov_base + len, + tail->iov_base, copy); + } + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > pglen) + copy = pglen; + offs = len - copy; + if (offs >= tail->iov_len) + copy = 0; + else if (copy > tail->iov_len - offs) + copy = tail->iov_len - offs; + if (copy != 0) + _copy_from_pages((char *)tail->iov_base + offs, + buf->pages, + buf->page_base + pglen + offs - len, + copy); + /* Do we also need to copy data from the head into the tail ? 
*/ + if (len > pglen) { + offs = copy = len - pglen; + if (copy > tail->iov_len) + copy = tail->iov_len; + memcpy(tail->iov_base, + (char *)head->iov_base + + head->iov_len - offs, + copy); + } + } + /* Now handle pages */ + if (pglen != 0) { + if (pglen > len) + _shift_data_right_pages(buf->pages, + buf->page_base + len, + buf->page_base, + pglen - len); + copy = len; + if (len > pglen) + copy = pglen; + _copy_to_pages(buf->pages, buf->page_base, + (char *)head->iov_base + head->iov_len - len, + copy); + } + head->iov_len -= len; + buf->buflen -= len; + /* Have we truncated the message? */ + if (buf->len > buf->buflen) + buf->len = buf->buflen; +} + +/* + * xdr_shrink_pagelen + * @buf: xdr_buf + * @len: bytes to remove from buf->pages + * + * Shrinks XDR buffer's page array buf->pages by + * 'len' bytes. The extra data is not lost, but is instead + * moved into the tail. + */ +static void +xdr_shrink_pagelen(struct xdr_buf *buf, size_t len) +{ + struct kvec *tail; + size_t copy; + char *p; + unsigned int pglen = buf->page_len; + + tail = buf->tail; + BUG_ON (len > pglen); + + /* Shift the tail first */ + if (tail->iov_len != 0) { + p = (char *)tail->iov_base + len; + if (tail->iov_len > len) { + copy = tail->iov_len - len; + memmove(p, tail->iov_base, copy); + } else + buf->buflen -= len; + /* Copy from the inlined pages into the tail */ + copy = len; + if (copy > tail->iov_len) + copy = tail->iov_len; + _copy_from_pages((char *)tail->iov_base, + buf->pages, buf->page_base + pglen - len, + copy); + } + buf->page_len -= len; + buf->buflen -= len; + /* Have we truncated the message? */ + if (buf->len > buf->buflen) + buf->len = buf->buflen; +} + +void +xdr_shift_buf(struct xdr_buf *buf, size_t len) +{ + xdr_shrink_bufhead(buf, len); +} + +/** + * xdr_init_encode - Initialize a struct xdr_stream for sending data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer in which to encode data + * @p: current pointer inside XDR buffer + * + * Note: at the moment the RPC client only passes the length of our + * scratch buffer in the xdr_buf's header kvec. Previously this + * meant we needed to call xdr_adjust_iovec() after encoding the + * data. With the new scheme, the xdr_stream manages the details + * of the buffer length, and takes care of adjusting the kvec + * length for us. + */ +void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p) +{ + struct kvec *iov = buf->head; + + xdr->buf = buf; + xdr->iov = iov; + xdr->end = (uint32_t *)((char *)iov->iov_base + iov->iov_len); + buf->len = iov->iov_len = (char *)p - (char *)iov->iov_base; + xdr->p = p; +} +EXPORT_SYMBOL(xdr_init_encode); + +/** + * xdr_reserve_space - Reserve buffer space for sending + * @xdr: pointer to xdr_stream + * @nbytes: number of bytes to reserve + * + * Checks that we have enough buffer space to encode 'nbytes' more + * bytes of data. If so, update the total xdr_buf length, and + * adjust the length of the current kvec. 
+ */ +uint32_t * xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + uint32_t *p = xdr->p; + uint32_t *q; + + /* align nbytes on the next 32-bit boundary */ + nbytes += 3; + nbytes &= ~3; + q = p + (nbytes >> 2); + if (unlikely(q > xdr->end || q < p)) + return NULL; + xdr->p = q; + xdr->iov->iov_len += nbytes; + xdr->buf->len += nbytes; + return p; +} +EXPORT_SYMBOL(xdr_reserve_space); + +/** + * xdr_write_pages - Insert a list of pages into an XDR buffer for sending + * @xdr: pointer to xdr_stream + * @pages: list of pages + * @base: offset of first byte + * @len: length of data in bytes + * + */ +void xdr_write_pages(struct xdr_stream *xdr, struct page **pages, unsigned int base, + unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov = buf->tail; + buf->pages = pages; + buf->page_base = base; + buf->page_len = len; + + iov->iov_base = (char *)xdr->p; + iov->iov_len = 0; + xdr->iov = iov; + + if (len & 3) { + unsigned int pad = 4 - (len & 3); + + BUG_ON(xdr->p >= xdr->end); + iov->iov_base = (char *)xdr->p + (len & 3); + iov->iov_len += pad; + len += pad; + *xdr->p++ = 0; + } + buf->buflen += len; + buf->len += len; +} +EXPORT_SYMBOL(xdr_write_pages); + +/** + * xdr_init_decode - Initialize an xdr_stream for decoding data. + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer from which to decode data + * @p: current pointer inside XDR buffer + */ +void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, uint32_t *p) +{ + struct kvec *iov = buf->head; + unsigned int len = iov->iov_len; + + if (len > buf->len) + len = buf->len; + xdr->buf = buf; + xdr->iov = iov; + xdr->p = p; + xdr->end = (uint32_t *)((char *)iov->iov_base + len); +} +EXPORT_SYMBOL(xdr_init_decode); + +/** + * xdr_inline_decode - Retrieve non-page XDR data to decode + * @xdr: pointer to xdr_stream struct + * @nbytes: number of bytes of data to decode + * + * Check if the input buffer is long enough to enable us to decode + * 'nbytes' more bytes of data starting at the current position. + * If so return the current pointer, then update the current + * pointer position. + */ +uint32_t * xdr_inline_decode(struct xdr_stream *xdr, size_t nbytes) +{ + uint32_t *p = xdr->p; + uint32_t *q = p + XDR_QUADLEN(nbytes); + + if (unlikely(q > xdr->end || q < p)) + return NULL; + xdr->p = q; + return p; +} +EXPORT_SYMBOL(xdr_inline_decode); + +/** + * xdr_read_pages - Ensure page-based XDR data to decode is aligned at current pointer position + * @xdr: pointer to xdr_stream struct + * @len: number of bytes of page data + * + * Moves data beyond the current pointer position from the XDR head[] buffer + * into the page list. Any data that lies beyond current position + "len" + * bytes is moved into the XDR tail[]. The current pointer is then + * repositioned at the beginning of the XDR tail. + */ +void xdr_read_pages(struct xdr_stream *xdr, unsigned int len) +{ + struct xdr_buf *buf = xdr->buf; + struct kvec *iov; + ssize_t shift; + unsigned int end; + int padding; + + /* Realign pages to current pointer position */ + iov = buf->head; + shift = iov->iov_len + (char *)iov->iov_base - (char *)xdr->p; + if (shift > 0) + xdr_shrink_bufhead(buf, shift); + + /* Truncate page data and move it into the tail */ + if (buf->page_len > len) + xdr_shrink_pagelen(buf, buf->page_len - len); + padding = (XDR_QUADLEN(len) << 2) - len; + xdr->iov = iov = buf->tail; + /* Compute remaining message length. 
*/ + end = iov->iov_len; + shift = buf->buflen - buf->len; + if (shift < end) + end -= shift; + else if (shift > 0) + end = 0; + /* + * Position current pointer at beginning of tail, and + * set remaining message length. + */ + xdr->p = (uint32_t *)((char *)iov->iov_base + padding); + xdr->end = (uint32_t *)((char *)iov->iov_base + end); +} +EXPORT_SYMBOL(xdr_read_pages); + +static struct kvec empty_iov = {.iov_base = NULL, .iov_len = 0}; + +void +xdr_buf_from_iov(struct kvec *iov, struct xdr_buf *buf) +{ + buf->head[0] = *iov; + buf->tail[0] = empty_iov; + buf->page_len = 0; + buf->buflen = buf->len = iov->iov_len; +} + +/* Sets subiov to the intersection of iov with the buffer of length len + * starting base bytes after iov. Indicates empty intersection by setting + * length of subiov to zero. Decrements len by length of subiov, sets base + * to zero (or decrements it by length of iov if subiov is empty). */ +static void +iov_subsegment(struct kvec *iov, struct kvec *subiov, int *base, int *len) +{ + if (*base > iov->iov_len) { + subiov->iov_base = NULL; + subiov->iov_len = 0; + *base -= iov->iov_len; + } else { + subiov->iov_base = iov->iov_base + *base; + subiov->iov_len = min(*len, (int)iov->iov_len - *base); + *base = 0; + } + *len -= subiov->iov_len; +} + +/* Sets subbuf to the portion of buf of length len beginning base bytes + * from the start of buf. Returns -1 if base of length are out of bounds. */ +int +xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, + int base, int len) +{ + int i; + + subbuf->buflen = subbuf->len = len; + iov_subsegment(buf->head, subbuf->head, &base, &len); + + if (base < buf->page_len) { + i = (base + buf->page_base) >> PAGE_CACHE_SHIFT; + subbuf->pages = &buf->pages[i]; + subbuf->page_base = (base + buf->page_base) & ~PAGE_CACHE_MASK; + subbuf->page_len = min((int)buf->page_len - base, len); + len -= subbuf->page_len; + base = 0; + } else { + base -= buf->page_len; + subbuf->page_len = 0; + } + + iov_subsegment(buf->tail, subbuf->tail, &base, &len); + if (base || len) + return -1; + return 0; +} + +/* obj is assumed to point to allocated memory of size at least len: */ +int +read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len) +{ + struct xdr_buf subbuf; + int this_len; + int status; + + status = xdr_buf_subsegment(buf, &subbuf, base, len); + if (status) + goto out; + this_len = min(len, (int)subbuf.head[0].iov_len); + memcpy(obj, subbuf.head[0].iov_base, this_len); + len -= this_len; + obj += this_len; + this_len = min(len, (int)subbuf.page_len); + if (this_len) + _copy_from_pages(obj, subbuf.pages, subbuf.page_base, this_len); + len -= this_len; + obj += this_len; + this_len = min(len, (int)subbuf.tail[0].iov_len); + memcpy(obj, subbuf.tail[0].iov_base, this_len); +out: + return status; +} + +static int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + u32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} + +/* If the netobj starting offset bytes from the start of xdr_buf is contained + * entirely in the head or the tail, set object to point to it; otherwise + * try to find space for it at the end of the tail, copy it there, and + * set obj to point to it. 
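xdr_buf_subsegment() and read_bytes_from_xdr_buf() above deal with the fact that an XDR message is split across the head kvec, the page array and the tail kvec, so a logical byte range may straddle segment boundaries. A two-segment userspace sketch of the same walk (hypothetical names; the page-array middle segment is left out):

/* Sketch: copy 'len' bytes starting 'base' bytes into a message stored
 * as discontiguous segments, the way read_bytes_from_xdr_buf() walks
 * head, pages and tail above. */
#include <stdio.h>
#include <string.h>

struct seg {
        const char *base;
        size_t len;
};

static size_t copy_from_segs(const struct seg *segs, int nsegs,
                             size_t base, char *out, size_t len)
{
        size_t copied = 0;

        for (int i = 0; i < nsegs && len; i++) {
                if (base >= segs[i].len) {      /* range starts past this segment */
                        base -= segs[i].len;
                        continue;
                }
                size_t this_len = segs[i].len - base;
                if (this_len > len)
                        this_len = len;
                memcpy(out + copied, segs[i].base + base, this_len);
                copied += this_len;
                len -= this_len;
                base = 0;
        }
        return copied;  /* less than requested means the range ran off the end */
}

int main(void)
{
        struct seg segs[] = {
                { "RPC head ", 9 },     /* plays the role of buf->head[0] */
                { "and tail",  8 },     /* plays the role of buf->tail[0] */
        };
        char out[32] = { 0 };

        copy_from_segs(segs, 2, 4, out, 10);    /* spans both segments */
        printf("%s\n", out);                    /* "head and t" */
        return 0;
}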
*/ +int +xdr_buf_read_netobj(struct xdr_buf *buf, struct xdr_netobj *obj, int offset) +{ + u32 tail_offset = buf->head[0].iov_len + buf->page_len; + u32 obj_end_offset; + + if (read_u32_from_xdr_buf(buf, offset, &obj->len)) + goto out; + obj_end_offset = offset + 4 + obj->len; + + if (obj_end_offset <= buf->head[0].iov_len) { + /* The obj is contained entirely in the head: */ + obj->data = buf->head[0].iov_base + offset + 4; + } else if (offset + 4 >= tail_offset) { + if (obj_end_offset - tail_offset + > buf->tail[0].iov_len) + goto out; + /* The obj is contained entirely in the tail: */ + obj->data = buf->tail[0].iov_base + + offset - tail_offset + 4; + } else { + /* use end of tail as storage for obj: + * (We don't copy to the beginning because then we'd have + * to worry about doing a potentially overlapping copy. + * This assumes the object is at most half the length of the + * tail.) */ + if (obj->len > buf->tail[0].iov_len) + goto out; + obj->data = buf->tail[0].iov_base + buf->tail[0].iov_len - + obj->len; + if (read_bytes_from_xdr_buf(buf, offset + 4, + obj->data, obj->len)) + goto out; + + } + return 0; +out: + return -1; +} diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c new file mode 100644 index 000000000000..c74a6bb94074 --- /dev/null +++ b/net/sunrpc/xprt.c @@ -0,0 +1,1678 @@ +/* + * linux/net/sunrpc/xprt.c + * + * This is a generic RPC call interface supporting congestion avoidance, + * and asynchronous calls. + * + * The interface works like this: + * + * - When a process places a call, it allocates a request slot if + * one is available. Otherwise, it sleeps on the backlog queue + * (xprt_reserve). + * - Next, the caller puts together the RPC message, stuffs it into + * the request struct, and calls xprt_call(). + * - xprt_call transmits the message and installs the caller on the + * socket's wait list. At the same time, it installs a timer that + * is run after the packet's timeout has expired. + * - When a packet arrives, the data_ready handler walks the list of + * pending requests for that socket. If a matching XID is found, the + * caller is woken up, and the timer removed. + * - When no reply arrives within the timeout interval, the timer is + * fired by the kernel and runs xprt_timer(). It either adjusts the + * timeout values (minor timeout) or wakes up the caller with a status + * of -ETIMEDOUT. + * - When the caller receives a notification from RPC that a reply arrived, + * it should release the RPC slot, and process the reply. + * If the call timed out, it may choose to retry the operation by + * adjusting the initial timeout value, and simply calling rpc_call + * again. + * + * Support for async RPC is done through a set of RPC-specific scheduling + * primitives that `transparently' work for processes as well as async + * tasks that rely on callbacks. + * + * Copyright (C) 1995-1997, Olaf Kirch + * + * TCP callback races fixes (C) 1998 Red Hat Software + * TCP send fixes (C) 1998 Red Hat Software + * TCP NFS related read + write fixes + * (C) 1999 Dave Airlie, University of Limerick, Ireland + * + * Rewrite of larges part of the code in order to stabilize TCP stuff. + * Fix behaviour when socket buffer is full. 
+ * (C) 1999 Trond Myklebust + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * Local variables + */ + +#ifdef RPC_DEBUG +# undef RPC_DEBUG_DATA +# define RPCDBG_FACILITY RPCDBG_XPRT +#endif + +#define XPRT_MAX_BACKOFF (8) +#define XPRT_IDLE_TIMEOUT (5*60*HZ) +#define XPRT_MAX_RESVPORT (800) + +/* + * Local functions + */ +static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); +static inline void do_xprt_reserve(struct rpc_task *); +static void xprt_disconnect(struct rpc_xprt *); +static void xprt_connect_status(struct rpc_task *task); +static struct rpc_xprt * xprt_setup(int proto, struct sockaddr_in *ap, + struct rpc_timeout *to); +static struct socket *xprt_create_socket(struct rpc_xprt *, int, int); +static void xprt_bind_socket(struct rpc_xprt *, struct socket *); +static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); + +static int xprt_clear_backlog(struct rpc_xprt *xprt); + +#ifdef RPC_DEBUG_DATA +/* + * Print the buffer contents (first 128 bytes only--just enough for + * diropres return). + */ +static void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + u8 *buf = (u8 *) packet; + int j; + + dprintk("RPC: %s\n", msg); + for (j = 0; j < count && j < 128; j += 4) { + if (!(j & 31)) { + if (j) + dprintk("\n"); + dprintk("0x%04x ", j); + } + dprintk("%02x%02x%02x%02x ", + buf[j], buf[j+1], buf[j+2], buf[j+3]); + } + dprintk("\n"); +} +#else +static inline void +xprt_pktdump(char *msg, u32 *packet, unsigned int count) +{ + /* NOP */ +} +#endif + +/* + * Look up RPC transport given an INET socket + */ +static inline struct rpc_xprt * +xprt_from_sock(struct sock *sk) +{ + return (struct rpc_xprt *) sk->sk_user_data; +} + +/* + * Serialize write access to sockets, in order to prevent different + * requests from interfering with each other. + * Also prevents TCP socket connects from colliding with writes. 
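The serialization described here is implemented by __xprt_lock_write()/__xprt_release_write() below as a non-blocking try-lock: a single bit is tested and set atomically, and a task that loses the race is parked on a wait queue rather than spinning. A plain C11 sketch of that shape (illustrative only; it leaves out the congestion-window coupling and uses a counter where the real code uses rpc_sleep_on() queues):

/* Sketch: serialize writers with an atomic try-lock bit; losers queue.
 * Mirrors the shape of the XPRT_LOCKED handling that follows. */
#include <stdatomic.h>
#include <stdio.h>

struct xmit_lock {
        atomic_flag locked;
        int waiters;            /* stand-in for xprt->sending/resend queues */
};

static int xmit_trylock(struct xmit_lock *l)
{
        if (atomic_flag_test_and_set(&l->locked)) {
                l->waiters++;   /* would sleep on the rpc wait queue */
                return 0;
        }
        return 1;
}

static void xmit_unlock(struct xmit_lock *l)
{
        atomic_flag_clear(&l->locked);
        if (l->waiters) {       /* would wake the next queued task */
                l->waiters--;
                printf("wake next writer\n");
        }
}

int main(void)
{
        struct xmit_lock l = { .locked = ATOMIC_FLAG_INIT, .waiters = 0 };

        printf("first:  %d\n", xmit_trylock(&l));       /* 1: got the lock */
        printf("second: %d\n", xmit_trylock(&l));       /* 0: queued */
        xmit_unlock(&l);
        return 0;
}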
+ */ +static int +__xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) { + if (task == xprt->snd_task) + return 1; + if (task == NULL) + return 0; + goto out_sleep; + } + if (xprt->nocong || __xprt_get_cong(xprt, task)) { + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return 1; + } + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->sockstate); + smp_mb__after_clear_bit(); +out_sleep: + dprintk("RPC: %4d failed to lock socket %p\n", task->tk_pid, xprt); + task->tk_timeout = 0; + task->tk_status = -EAGAIN; + if (req && req->rq_ntrans) + rpc_sleep_on(&xprt->resend, task, NULL, NULL); + else + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + return 0; +} + +static inline int +xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + int retval; + + spin_lock_bh(&xprt->sock_lock); + retval = __xprt_lock_write(xprt, task); + spin_unlock_bh(&xprt->sock_lock); + return retval; +} + + +static void +__xprt_lock_write_next(struct rpc_xprt *xprt) +{ + struct rpc_task *task; + + if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + return; + if (!xprt->nocong && RPCXPRT_CONGESTED(xprt)) + goto out_unlock; + task = rpc_wake_up_next(&xprt->resend); + if (!task) { + task = rpc_wake_up_next(&xprt->sending); + if (!task) + goto out_unlock; + } + if (xprt->nocong || __xprt_get_cong(xprt, task)) { + struct rpc_rqst *req = task->tk_rqstp; + xprt->snd_task = task; + if (req) { + req->rq_bytes_sent = 0; + req->rq_ntrans++; + } + return; + } +out_unlock: + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->sockstate); + smp_mb__after_clear_bit(); +} + +/* + * Releases the socket for use by other requests. + */ +static void +__xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + if (xprt->snd_task == task) { + xprt->snd_task = NULL; + smp_mb__before_clear_bit(); + clear_bit(XPRT_LOCKED, &xprt->sockstate); + smp_mb__after_clear_bit(); + __xprt_lock_write_next(xprt); + } +} + +static inline void +xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) +{ + spin_lock_bh(&xprt->sock_lock); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->sock_lock); +} + +/* + * Write data to socket. + */ +static inline int +xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + struct socket *sock = xprt->sock; + struct xdr_buf *xdr = &req->rq_snd_buf; + struct sockaddr *addr = NULL; + int addrlen = 0; + unsigned int skip; + int result; + + if (!sock) + return -ENOTCONN; + + xprt_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); + + /* For UDP, we need to provide an address */ + if (!xprt->stream) { + addr = (struct sockaddr *) &xprt->addr; + addrlen = sizeof(xprt->addr); + } + /* Dont repeat bytes */ + skip = req->rq_bytes_sent; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + result = xdr_sendpages(sock, addr, addrlen, xdr, skip, MSG_DONTWAIT); + + dprintk("RPC: xprt_sendmsg(%d) = %d\n", xdr->len - skip, result); + + if (result >= 0) + return result; + + switch (result) { + case -ECONNREFUSED: + /* When the server has died, an ICMP port unreachable message + * prompts ECONNREFUSED. 
+ */ + case -EAGAIN: + break; + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + /* connection broken */ + if (xprt->stream) + result = -ENOTCONN; + break; + default: + printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result); + } + return result; +} + +/* + * Van Jacobson congestion avoidance. Check if the congestion window + * overflowed. Put the task to sleep if this is the case. + */ +static int +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (req->rq_cong) + return 1; + dprintk("RPC: %4d xprt_cwnd_limited cong = %ld cwnd = %ld\n", + task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) + return 0; + req->rq_cong = 1; + xprt->cong += RPC_CWNDSCALE; + return 1; +} + +/* + * Adjust the congestion window, and wake up the next task + * that has been sleeping due to congestion + */ +static void +__xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + if (!req->rq_cong) + return; + req->rq_cong = 0; + xprt->cong -= RPC_CWNDSCALE; + __xprt_lock_write_next(xprt); +} + +/* + * Adjust RPC congestion window + * We use a time-smoothed congestion estimator to avoid heavy oscillation. + */ +static void +xprt_adjust_cwnd(struct rpc_xprt *xprt, int result) +{ + unsigned long cwnd; + + cwnd = xprt->cwnd; + if (result >= 0 && cwnd <= xprt->cong) { + /* The (cwnd >> 1) term makes sure + * the result gets rounded properly. */ + cwnd += (RPC_CWNDSCALE * RPC_CWNDSCALE + (cwnd >> 1)) / cwnd; + if (cwnd > RPC_MAXCWND(xprt)) + cwnd = RPC_MAXCWND(xprt); + __xprt_lock_write_next(xprt); + } else if (result == -ETIMEDOUT) { + cwnd >>= 1; + if (cwnd < RPC_CWNDSCALE) + cwnd = RPC_CWNDSCALE; + } + dprintk("RPC: cong %ld, cwnd was %ld, now %ld\n", + xprt->cong, xprt->cwnd, cwnd); + xprt->cwnd = cwnd; +} + +/* + * Reset the major timeout value + */ +static void xprt_reset_majortimeo(struct rpc_rqst *req) +{ + struct rpc_timeout *to = &req->rq_xprt->timeout; + + req->rq_majortimeo = req->rq_timeout; + if (to->to_exponential) + req->rq_majortimeo <<= to->to_retries; + else + req->rq_majortimeo += to->to_increment * to->to_retries; + if (req->rq_majortimeo > to->to_maxval || req->rq_majortimeo == 0) + req->rq_majortimeo = to->to_maxval; + req->rq_majortimeo += jiffies; +} + +/* + * Adjust timeout values etc for next retransmit + */ +int xprt_adjust_timeout(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_timeout *to = &xprt->timeout; + int status = 0; + + if (time_before(jiffies, req->rq_majortimeo)) { + if (to->to_exponential) + req->rq_timeout <<= 1; + else + req->rq_timeout += to->to_increment; + if (to->to_maxval && req->rq_timeout >= to->to_maxval) + req->rq_timeout = to->to_maxval; + req->rq_retries++; + pprintk("RPC: %lu retrans\n", jiffies); + } else { + req->rq_timeout = to->to_initval; + req->rq_retries = 0; + xprt_reset_majortimeo(req); + /* Reset the RTT counters == "slow start" */ + spin_lock_bh(&xprt->sock_lock); + rpc_init_rtt(req->rq_task->tk_client->cl_rtt, to->to_initval); + spin_unlock_bh(&xprt->sock_lock); + pprintk("RPC: %lu timeout\n", jiffies); + status = -ETIMEDOUT; + } + + if (req->rq_timeout == 0) { + printk(KERN_WARNING "xprt_adjust_timeout: rq_timeout = 0!\n"); + req->rq_timeout = 5 * HZ; + } + return status; +} + +/* + * Close down a transport socket + */ +static void +xprt_close(struct rpc_xprt *xprt) +{ + struct socket *sock = xprt->sock; + struct sock *sk = xprt->inet; + + if (!sk) + return; + + write_lock_bh(&sk->sk_callback_lock); + xprt->inet = NULL; + 
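xprt_adjust_cwnd() above grows the window by roughly one request slot per full window of successful replies and halves it on timeout, the classic additive-increase/multiplicative-decrease shape; the kernel additionally requires the window to actually be full (cwnd <= cong) before growing it. A userspace sketch with assumed constants (the real RPC_CWNDSCALE/RPC_MAXCWND values live in the sunrpc headers, not in this file):

/* Sketch of the AIMD update in xprt_adjust_cwnd(): on success grow by
 * SCALE*SCALE/cwnd (about one slot per full window, rounded), on timeout
 * halve, never dropping below one slot. SCALE and MAXCWND are assumed
 * values for illustration. */
#include <stdio.h>

#define SCALE   256UL                   /* stands in for RPC_CWNDSCALE */
#define MAXCWND (16UL * SCALE)          /* stands in for RPC_MAXCWND() */

static unsigned long adjust_cwnd(unsigned long cwnd, int timed_out)
{
        if (!timed_out) {
                cwnd += (SCALE * SCALE + (cwnd >> 1)) / cwnd;   /* rounded */
                if (cwnd > MAXCWND)
                        cwnd = MAXCWND;
        } else {
                cwnd >>= 1;
                if (cwnd < SCALE)
                        cwnd = SCALE;
        }
        return cwnd;
}

int main(void)
{
        unsigned long cwnd = SCALE;

        for (int i = 0; i < 8; i++)
                printf("reply %d: cwnd=%lu\n", i, cwnd = adjust_cwnd(cwnd, 0));
        printf("timeout: cwnd=%lu\n", adjust_cwnd(cwnd, 1));
        return 0;
}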
xprt->sock = NULL; + + sk->sk_user_data = NULL; + sk->sk_data_ready = xprt->old_data_ready; + sk->sk_state_change = xprt->old_state_change; + sk->sk_write_space = xprt->old_write_space; + write_unlock_bh(&sk->sk_callback_lock); + + sk->sk_no_check = 0; + + sock_release(sock); +} + +static void +xprt_socket_autoclose(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + + xprt_disconnect(xprt); + xprt_close(xprt); + xprt_release_write(xprt, NULL); +} + +/* + * Mark a transport as disconnected + */ +static void +xprt_disconnect(struct rpc_xprt *xprt) +{ + dprintk("RPC: disconnected transport %p\n", xprt); + spin_lock_bh(&xprt->sock_lock); + xprt_clear_connected(xprt); + rpc_wake_up_status(&xprt->pending, -ENOTCONN); + spin_unlock_bh(&xprt->sock_lock); +} + +/* + * Used to allow disconnection when we've been idle + */ +static void +xprt_init_autodisconnect(unsigned long data) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)data; + + spin_lock(&xprt->sock_lock); + if (!list_empty(&xprt->recv) || xprt->shutdown) + goto out_abort; + if (test_and_set_bit(XPRT_LOCKED, &xprt->sockstate)) + goto out_abort; + spin_unlock(&xprt->sock_lock); + /* Let keventd close the socket */ + if (test_bit(XPRT_CONNECTING, &xprt->sockstate) != 0) + xprt_release_write(xprt, NULL); + else + schedule_work(&xprt->task_cleanup); + return; +out_abort: + spin_unlock(&xprt->sock_lock); +} + +static void xprt_socket_connect(void *args) +{ + struct rpc_xprt *xprt = (struct rpc_xprt *)args; + struct socket *sock = xprt->sock; + int status = -EIO; + + if (xprt->shutdown || xprt->addr.sin_port == 0) + goto out; + + /* + * Start by resetting any existing state + */ + xprt_close(xprt); + sock = xprt_create_socket(xprt, xprt->prot, xprt->resvport); + if (sock == NULL) { + /* couldn't create socket or bind to reserved port; + * this is likely a permanent error, so cause an abort */ + goto out; + } + xprt_bind_socket(xprt, sock); + xprt_sock_setbufsize(xprt); + + status = 0; + if (!xprt->stream) + goto out; + + /* + * Tell the socket layer to start connecting... + */ + status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, + sizeof(xprt->addr), O_NONBLOCK); + dprintk("RPC: %p connect status %d connected %d sock state %d\n", + xprt, -status, xprt_connected(xprt), sock->sk->sk_state); + if (status < 0) { + switch (status) { + case -EINPROGRESS: + case -EALREADY: + goto out_clear; + } + } +out: + if (status < 0) + rpc_wake_up_status(&xprt->pending, status); + else + rpc_wake_up(&xprt->pending); +out_clear: + smp_mb__before_clear_bit(); + clear_bit(XPRT_CONNECTING, &xprt->sockstate); + smp_mb__after_clear_bit(); +} + +/* + * Attempt to connect a TCP socket. + * + */ +void xprt_connect(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + dprintk("RPC: %4d xprt_connect xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? 
"is" : "is not")); + + if (xprt->shutdown) { + task->tk_status = -EIO; + return; + } + if (!xprt->addr.sin_port) { + task->tk_status = -EIO; + return; + } + if (!xprt_lock_write(xprt, task)) + return; + if (xprt_connected(xprt)) + goto out_write; + + if (task->tk_rqstp) + task->tk_rqstp->rq_bytes_sent = 0; + + task->tk_timeout = RPC_CONNECT_TIMEOUT; + rpc_sleep_on(&xprt->pending, task, xprt_connect_status, NULL); + if (!test_and_set_bit(XPRT_CONNECTING, &xprt->sockstate)) { + /* Note: if we are here due to a dropped connection + * we delay reconnecting by RPC_REESTABLISH_TIMEOUT/HZ + * seconds + */ + if (xprt->sock != NULL) + schedule_delayed_work(&xprt->sock_connect, + RPC_REESTABLISH_TIMEOUT); + else + schedule_work(&xprt->sock_connect); + } + return; + out_write: + xprt_release_write(xprt, task); +} + +/* + * We arrive here when awoken from waiting on connection establishment. + */ +static void +xprt_connect_status(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + if (task->tk_status >= 0) { + dprintk("RPC: %4d xprt_connect_status: connection established\n", + task->tk_pid); + return; + } + + /* if soft mounted, just cause this RPC to fail */ + if (RPC_IS_SOFT(task)) + task->tk_status = -EIO; + + switch (task->tk_status) { + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + return; + case -ETIMEDOUT: + dprintk("RPC: %4d xprt_connect_status: timed out\n", + task->tk_pid); + break; + default: + printk(KERN_ERR "RPC: error %d connecting to server %s\n", + -task->tk_status, task->tk_client->cl_server); + } + xprt_release_write(xprt, task); +} + +/* + * Look up the RPC request corresponding to a reply, and then lock it. + */ +static inline struct rpc_rqst * +xprt_lookup_rqst(struct rpc_xprt *xprt, u32 xid) +{ + struct list_head *pos; + struct rpc_rqst *req = NULL; + + list_for_each(pos, &xprt->recv) { + struct rpc_rqst *entry = list_entry(pos, struct rpc_rqst, rq_list); + if (entry->rq_xid == xid) { + req = entry; + break; + } + } + return req; +} + +/* + * Complete reply received. + * The TCP code relies on us to remove the request from xprt->pending. + */ +static void +xprt_complete_rqst(struct rpc_xprt *xprt, struct rpc_rqst *req, int copied) +{ + struct rpc_task *task = req->rq_task; + struct rpc_clnt *clnt = task->tk_client; + + /* Adjust congestion window */ + if (!xprt->nocong) { + unsigned timer = task->tk_msg.rpc_proc->p_timer; + xprt_adjust_cwnd(xprt, copied); + __xprt_put_cong(xprt, req); + if (timer) { + if (req->rq_ntrans == 1) + rpc_update_rtt(clnt->cl_rtt, timer, + (long)jiffies - req->rq_xtime); + rpc_set_timeo(clnt->cl_rtt, timer, req->rq_ntrans - 1); + } + } + +#ifdef RPC_PROFILE + /* Profile only reads for now */ + if (copied > 1024) { + static unsigned long nextstat; + static unsigned long pkt_rtt, pkt_len, pkt_cnt; + + pkt_cnt++; + pkt_len += req->rq_slen + copied; + pkt_rtt += jiffies - req->rq_xtime; + if (time_before(nextstat, jiffies)) { + printk("RPC: %lu %ld cwnd\n", jiffies, xprt->cwnd); + printk("RPC: %ld %ld %ld %ld stat\n", + jiffies, pkt_cnt, pkt_len, pkt_rtt); + pkt_rtt = pkt_len = pkt_cnt = 0; + nextstat = jiffies + 5 * HZ; + } + } +#endif + + dprintk("RPC: %4d has input (%d bytes)\n", task->tk_pid, copied); + list_del_init(&req->rq_list); + req->rq_received = req->rq_private_buf.len = copied; + + /* ... and wake up the process. 
*/ + rpc_wake_up_task(task); + return; +} + +static size_t +skb_read_bits(skb_reader_t *desc, void *to, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, to, len)) + return 0; + desc->count -= len; + desc->offset += len; + return len; +} + +static size_t +skb_read_and_csum_bits(skb_reader_t *desc, void *to, size_t len) +{ + unsigned int csum2, pos; + + if (len > desc->count) + len = desc->count; + pos = desc->offset; + csum2 = skb_copy_and_csum_bits(desc->skb, pos, to, len, 0); + desc->csum = csum_block_add(desc->csum, csum2, pos); + desc->count -= len; + desc->offset += len; + return len; +} + +/* + * We have set things up such that we perform the checksum of the UDP + * packet in parallel with the copies into the RPC client iovec. -DaveM + */ +int +csum_partial_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) +{ + skb_reader_t desc; + + desc.skb = skb; + desc.offset = sizeof(struct udphdr); + desc.count = skb->len - desc.offset; + + if (skb->ip_summed == CHECKSUM_UNNECESSARY) + goto no_checksum; + + desc.csum = csum_partial(skb->data, desc.offset, skb->csum); + xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_and_csum_bits); + if (desc.offset != skb->len) { + unsigned int csum2; + csum2 = skb_checksum(skb, desc.offset, skb->len - desc.offset, 0); + desc.csum = csum_block_add(desc.csum, csum2, desc.offset); + } + if (desc.count) + return -1; + if ((unsigned short)csum_fold(desc.csum)) + return -1; + return 0; +no_checksum: + xdr_partial_copy_from_skb(xdr, 0, &desc, skb_read_bits); + if (desc.count) + return -1; + return 0; +} + +/* + * Input handler for RPC replies. Called from a bottom half and hence + * atomic. + */ +static void +udp_data_ready(struct sock *sk, int len) +{ + struct rpc_task *task; + struct rpc_xprt *xprt; + struct rpc_rqst *rovr; + struct sk_buff *skb; + int err, repsize, copied; + u32 _xid, *xp; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: udp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: udp_data_ready request not found!\n"); + goto out; + } + + dprintk("RPC: udp_data_ready client %p\n", xprt); + + if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) + goto out; + + if (xprt->shutdown) + goto dropit; + + repsize = skb->len - sizeof(struct udphdr); + if (repsize < 4) { + printk("RPC: impossible RPC reply size %d!\n", repsize); + goto dropit; + } + + /* Copy the XID from the skb... */ + xp = skb_header_pointer(skb, sizeof(struct udphdr), + sizeof(_xid), &_xid); + if (xp == NULL) + goto dropit; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->sock_lock); + rovr = xprt_lookup_rqst(xprt, *xp); + if (!rovr) + goto out_unlock; + task = rovr->rq_task; + + dprintk("RPC: %4d received reply\n", task->tk_pid); + + if ((copied = rovr->rq_private_buf.buflen) > repsize) + copied = repsize; + + /* Suck it into the iovec, verify checksum if not done by hw. */ + if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) + goto out_unlock; + + /* Something worked... */ + dst_confirm(skb->dst); + + xprt_complete_rqst(xprt, rovr, copied); + + out_unlock: + spin_unlock(&xprt->sock_lock); + dropit: + skb_free_datagram(sk, skb); + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Copy from an skb into memory and shrink the skb. 
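csum_partial_copy_to_xdr() above folds the UDP checksum into the copy loop so the payload is only touched once; the arithmetic underneath is the ordinary 16-bit one's-complement Internet checksum. A standalone sketch of that checksum (the textbook algorithm, not the kernel's incremental csum_* helpers):

/* Sketch: 16-bit one's-complement Internet checksum, the arithmetic that
 * csum_partial()/csum_fold() implement incrementally while copying.
 * A received datagram verifies if checksumming data+checksum yields 0. */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t inet_checksum(const uint8_t *data, size_t len)
{
        uint32_t sum = 0;

        for (size_t i = 0; i + 1 < len; i += 2)
                sum += (uint32_t)data[i] << 8 | data[i + 1];
        if (len & 1)                            /* odd trailing byte */
                sum += (uint32_t)data[len - 1] << 8;
        while (sum >> 16)                       /* fold carries back in */
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        uint8_t pkt[] = { 0x45, 0x00, 0x00, 0x1c, 0xde, 0xad, 0xbe, 0xef };

        printf("checksum = 0x%04x\n", inet_checksum(pkt, sizeof(pkt)));
        return 0;
}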
+ */ +static inline size_t +tcp_copy_data(skb_reader_t *desc, void *p, size_t len) +{ + if (len > desc->count) + len = desc->count; + if (skb_copy_bits(desc->skb, desc->offset, p, len)) + return 0; + desc->offset += len; + desc->count -= len; + return len; +} + +/* + * TCP read fragment marker + */ +static inline void +tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + p = ((char *) &xprt->tcp_recm) + xprt->tcp_offset; + len = sizeof(xprt->tcp_recm) - xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); + if (xprt->tcp_reclen & 0x80000000) + xprt->tcp_flags |= XPRT_LAST_FRAG; + else + xprt->tcp_flags &= ~XPRT_LAST_FRAG; + xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_flags &= ~XPRT_COPY_RECM; + xprt->tcp_offset = 0; + /* Sanity check of the record length */ + if (xprt->tcp_reclen < 4) { + printk(KERN_ERR "RPC: Invalid TCP record fragment length\n"); + xprt_disconnect(xprt); + } + dprintk("RPC: reading TCP record fragment of length %d\n", + xprt->tcp_reclen); +} + +static void +tcp_check_recm(struct rpc_xprt *xprt) +{ + if (xprt->tcp_offset == xprt->tcp_reclen) { + xprt->tcp_flags |= XPRT_COPY_RECM; + xprt->tcp_offset = 0; + if (xprt->tcp_flags & XPRT_LAST_FRAG) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + xprt->tcp_flags |= XPRT_COPY_XID; + xprt->tcp_copied = 0; + } + } +} + +/* + * TCP read xid + */ +static inline void +tcp_read_xid(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len, used; + char *p; + + len = sizeof(xprt->tcp_xid) - xprt->tcp_offset; + dprintk("RPC: reading XID (%Zu bytes)\n", len); + p = ((char *) &xprt->tcp_xid) + xprt->tcp_offset; + used = tcp_copy_data(desc, p, len); + xprt->tcp_offset += used; + if (used != len) + return; + xprt->tcp_flags &= ~XPRT_COPY_XID; + xprt->tcp_flags |= XPRT_COPY_DATA; + xprt->tcp_copied = 4; + dprintk("RPC: reading reply for XID %08x\n", + ntohl(xprt->tcp_xid)); + tcp_check_recm(xprt); +} + +/* + * TCP read and complete request + */ +static inline void +tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + struct rpc_rqst *req; + struct xdr_buf *rcvbuf; + size_t len; + + /* Find and lock the request corresponding to this xid */ + spin_lock(&xprt->sock_lock); + req = xprt_lookup_rqst(xprt, xprt->tcp_xid); + if (!req) { + xprt->tcp_flags &= ~XPRT_COPY_DATA; + dprintk("RPC: XID %08x request not found!\n", + ntohl(xprt->tcp_xid)); + spin_unlock(&xprt->sock_lock); + return; + } + + rcvbuf = &req->rq_private_buf; + len = desc->count; + if (len > xprt->tcp_reclen - xprt->tcp_offset) { + skb_reader_t my_desc; + + len = xprt->tcp_reclen - xprt->tcp_offset; + memcpy(&my_desc, desc, sizeof(my_desc)); + my_desc.count = len; + xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + &my_desc, tcp_copy_data); + desc->count -= len; + desc->offset += len; + } else + xdr_partial_copy_from_skb(rcvbuf, xprt->tcp_copied, + desc, tcp_copy_data); + xprt->tcp_copied += len; + xprt->tcp_offset += len; + + if (xprt->tcp_copied == req->rq_private_buf.buflen) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + else if (xprt->tcp_offset == xprt->tcp_reclen) { + if (xprt->tcp_flags & XPRT_LAST_FRAG) + xprt->tcp_flags &= ~XPRT_COPY_DATA; + } + + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { + dprintk("RPC: %4d received reply complete\n", + req->rq_task->tk_pid); + xprt_complete_rqst(xprt, req, xprt->tcp_copied); + } + spin_unlock(&xprt->sock_lock); + tcp_check_recm(xprt); +} + +/* + * TCP discard extra bytes from a short 
read + */ +static inline void +tcp_read_discard(struct rpc_xprt *xprt, skb_reader_t *desc) +{ + size_t len; + + len = xprt->tcp_reclen - xprt->tcp_offset; + if (len > desc->count) + len = desc->count; + desc->count -= len; + desc->offset += len; + xprt->tcp_offset += len; + tcp_check_recm(xprt); +} + +/* + * TCP record receive routine + * We first have to grab the record marker, then the XID, then the data. + */ +static int +tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct rpc_xprt *xprt = rd_desc->arg.data; + skb_reader_t desc = { + .skb = skb, + .offset = offset, + .count = len, + .csum = 0 + }; + + dprintk("RPC: tcp_data_recv\n"); + do { + /* Read in a new fragment marker if necessary */ + /* Can we ever really expect to get completely empty fragments? */ + if (xprt->tcp_flags & XPRT_COPY_RECM) { + tcp_read_fraghdr(xprt, &desc); + continue; + } + /* Read in the xid if necessary */ + if (xprt->tcp_flags & XPRT_COPY_XID) { + tcp_read_xid(xprt, &desc); + continue; + } + /* Read in the request data */ + if (xprt->tcp_flags & XPRT_COPY_DATA) { + tcp_read_request(xprt, &desc); + continue; + } + /* Skip over any trailing bytes on short reads */ + tcp_read_discard(xprt, &desc); + } while (desc.count); + dprintk("RPC: tcp_data_recv done\n"); + return len - desc.count; +} + +static void tcp_data_ready(struct sock *sk, int bytes) +{ + struct rpc_xprt *xprt; + read_descriptor_t rd_desc; + + read_lock(&sk->sk_callback_lock); + dprintk("RPC: tcp_data_ready...\n"); + if (!(xprt = xprt_from_sock(sk))) { + printk("RPC: tcp_data_ready socket info not found!\n"); + goto out; + } + if (xprt->shutdown) + goto out; + + /* We use rd_desc to pass struct xprt to tcp_data_recv */ + rd_desc.arg.data = xprt; + rd_desc.count = 65536; + tcp_read_sock(sk, &rd_desc, tcp_data_recv); +out: + read_unlock(&sk->sk_callback_lock); +} + +static void +tcp_state_change(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk))) + goto out; + dprintk("RPC: tcp_state_change client %p...\n", xprt); + dprintk("RPC: state %x conn %d dead %d zapped %d\n", + sk->sk_state, xprt_connected(xprt), + sock_flag(sk, SOCK_DEAD), + sock_flag(sk, SOCK_ZAPPED)); + + switch (sk->sk_state) { + case TCP_ESTABLISHED: + spin_lock_bh(&xprt->sock_lock); + if (!xprt_test_and_set_connected(xprt)) { + /* Reset TCP record info */ + xprt->tcp_offset = 0; + xprt->tcp_reclen = 0; + xprt->tcp_copied = 0; + xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + rpc_wake_up(&xprt->pending); + } + spin_unlock_bh(&xprt->sock_lock); + break; + case TCP_SYN_SENT: + case TCP_SYN_RECV: + break; + default: + if (xprt_test_and_clear_connected(xprt)) + rpc_wake_up_status(&xprt->pending, -ENOTCONN); + break; + } + out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * Called when more output buffer space is available for this socket. + * We try not to wake our writers until they can make "significant" + * progress, otherwise we'll waste resources thrashing sock_sendmsg + * with a bunch of small requests. 
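On the stream side every RPC is framed with the record marker that tcp_read_fraghdr() parses above and that xprt_transmit() writes further down: a 32-bit big-endian word whose top bit flags the last fragment and whose low 31 bits carry the fragment length. A tiny encode/decode sketch (userspace, illustrative names):

/* Sketch: the RPC-over-TCP record marker handled by tcp_read_fraghdr()
 * and written in xprt_transmit(): 31-bit fragment length plus a "last
 * fragment" flag in the top bit, sent in network byte order. */
#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>

#define LAST_FRAG       0x80000000u

static uint32_t marker_encode(uint32_t frag_len, int last)
{
        return htonl((last ? LAST_FRAG : 0) | (frag_len & 0x7fffffffu));
}

static void marker_decode(uint32_t wire, uint32_t *frag_len, int *last)
{
        uint32_t host = ntohl(wire);

        *last = (host & LAST_FRAG) != 0;
        *frag_len = host & 0x7fffffffu;
}

int main(void)
{
        uint32_t len;
        int last;

        marker_decode(marker_encode(1024, 1), &len, &last);
        printf("fragment of %u bytes, last=%d\n", len, last);
        return 0;
}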
+ */ +static void +xprt_write_space(struct sock *sk) +{ + struct rpc_xprt *xprt; + struct socket *sock; + + read_lock(&sk->sk_callback_lock); + if (!(xprt = xprt_from_sock(sk)) || !(sock = sk->sk_socket)) + goto out; + if (xprt->shutdown) + goto out; + + /* Wait until we have enough socket memory */ + if (xprt->stream) { + /* from net/core/stream.c:sk_stream_write_space */ + if (sk_stream_wspace(sk) < sk_stream_min_wspace(sk)) + goto out; + } else { + /* from net/core/sock.c:sock_def_write_space */ + if (!sock_writeable(sk)) + goto out; + } + + if (!test_and_clear_bit(SOCK_NOSPACE, &sock->flags)) + goto out; + + spin_lock_bh(&xprt->sock_lock); + if (xprt->snd_task) + rpc_wake_up_task(xprt->snd_task); + spin_unlock_bh(&xprt->sock_lock); +out: + read_unlock(&sk->sk_callback_lock); +} + +/* + * RPC receive timeout handler. + */ +static void +xprt_timer(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + spin_lock(&xprt->sock_lock); + if (req->rq_received) + goto out; + + xprt_adjust_cwnd(req->rq_xprt, -ETIMEDOUT); + __xprt_put_cong(xprt, req); + + dprintk("RPC: %4d xprt_timer (%s request)\n", + task->tk_pid, req ? "pending" : "backlogged"); + + task->tk_status = -ETIMEDOUT; +out: + task->tk_timeout = 0; + rpc_wake_up_task(task); + spin_unlock(&xprt->sock_lock); +} + +/* + * Place the actual RPC call. + * We have to copy the iovec because sendmsg fiddles with its contents. + */ +int +xprt_prepare_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int err = 0; + + dprintk("RPC: %4d xprt_prepare_transmit\n", task->tk_pid); + + if (xprt->shutdown) + return -EIO; + + spin_lock_bh(&xprt->sock_lock); + if (req->rq_received && !req->rq_bytes_sent) { + err = req->rq_received; + goto out_unlock; + } + if (!__xprt_lock_write(xprt, task)) { + err = -EAGAIN; + goto out_unlock; + } + + if (!xprt_connected(xprt)) { + err = -ENOTCONN; + goto out_unlock; + } +out_unlock: + spin_unlock_bh(&xprt->sock_lock); + return err; +} + +void +xprt_transmit(struct rpc_task *task) +{ + struct rpc_clnt *clnt = task->tk_client; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status, retry = 0; + + + dprintk("RPC: %4d xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); + + /* set up everything as needed. */ + /* Write the record marker */ + if (xprt->stream) { + u32 *marker = req->rq_svec[0].iov_base; + + *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); + } + + smp_rmb(); + if (!req->rq_received) { + if (list_empty(&req->rq_list)) { + spin_lock_bh(&xprt->sock_lock); + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + /* Add request to the receive list */ + list_add_tail(&req->rq_list, &xprt->recv); + spin_unlock_bh(&xprt->sock_lock); + xprt_reset_majortimeo(req); + } + } else if (!req->rq_bytes_sent) + return; + + /* Continue transmitting the packet/record. We must be careful + * to cope with writespace callbacks arriving _after_ we have + * called xprt_sendmsg(). + */ + while (1) { + req->rq_xtime = jiffies; + status = xprt_sendmsg(xprt, req); + + if (status < 0) + break; + + if (xprt->stream) { + req->rq_bytes_sent += status; + + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. 
*/ + if (req->rq_bytes_sent >= req->rq_slen) { + req->rq_bytes_sent = 0; + goto out_receive; + } + } else { + if (status >= req->rq_slen) + goto out_receive; + status = -EAGAIN; + break; + } + + dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); + + status = -EAGAIN; + if (retry++ > 50) + break; + } + + /* Note: at this point, task->tk_sleeping has not yet been set, + * hence there is no danger of the waking up task being put on + * schedq, and being picked up by a parallel run of rpciod(). + */ + task->tk_status = status; + + switch (status) { + case -EAGAIN: + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with xprt_write_space */ + spin_lock_bh(&xprt->sock_lock); + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) { + task->tk_timeout = req->rq_timeout; + rpc_sleep_on(&xprt->pending, task, NULL, NULL); + } + spin_unlock_bh(&xprt->sock_lock); + return; + } + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); + return; + case -ECONNREFUSED: + task->tk_timeout = RPC_REESTABLISH_TIMEOUT; + rpc_sleep_on(&xprt->sending, task, NULL, NULL); + case -ENOTCONN: + return; + default: + if (xprt->stream) + xprt_disconnect(xprt); + } + xprt_release_write(xprt, task); + return; + out_receive: + dprintk("RPC: %4d xmit complete\n", task->tk_pid); + /* Set the task's receive timeout value */ + spin_lock_bh(&xprt->sock_lock); + if (!xprt->nocong) { + int timer = task->tk_msg.rpc_proc->p_timer; + task->tk_timeout = rpc_calc_rto(clnt->cl_rtt, timer); + task->tk_timeout <<= rpc_ntimeo(clnt->cl_rtt, timer) + req->rq_retries; + if (task->tk_timeout > xprt->timeout.to_maxval || task->tk_timeout == 0) + task->tk_timeout = xprt->timeout.to_maxval; + } else + task->tk_timeout = req->rq_timeout; + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (!req->rq_received) + rpc_sleep_on(&xprt->pending, task, NULL, xprt_timer); + __xprt_release_write(xprt, task); + spin_unlock_bh(&xprt->sock_lock); +} + +/* + * Reserve an RPC call slot. 
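The record marker written at the top of xprt_transmit() is standard ONC RPC record marking for stream transports: a 4-byte big-endian word whose high bit flags the last fragment and whose low 31 bits carry the fragment length, which is why the code stores rq_slen minus the marker itself with 0x80000000 ORed in. A minimal user-space sketch of encoding and decoding such a marker (the helper names are illustrative only, not part of this code):

#include <stdint.h>
#include <arpa/inet.h>

#define RPC_LAST_FRAG	0x80000000UL

/* Build the 4-byte header that precedes an RPC fragment on a TCP stream. */
static uint32_t rpc_marker_encode(uint32_t frag_len, int last_frag)
{
	return htonl((last_frag ? RPC_LAST_FRAG : 0) | (frag_len & 0x7fffffffUL));
}

/* Split a received marker back into fragment length and last-fragment bit. */
static void rpc_marker_decode(uint32_t marker, uint32_t *frag_len, int *last_frag)
{
	uint32_t host = ntohl(marker);

	*last_frag = (host & RPC_LAST_FRAG) != 0;
	*frag_len  = host & 0x7fffffffUL;
}

Because xprt_transmit() always sets the high bit, every request goes out as a single, final fragment; the receive path (tcp_read_fraghdr() and friends) still has to cope with arbitrary fragment boundaries coming back from the server.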
+ */ +static inline void +do_xprt_reserve(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + task->tk_status = 0; + if (task->tk_rqstp) + return; + if (!list_empty(&xprt->free)) { + struct rpc_rqst *req = list_entry(xprt->free.next, struct rpc_rqst, rq_list); + list_del_init(&req->rq_list); + task->tk_rqstp = req; + xprt_request_init(task, xprt); + return; + } + dprintk("RPC: waiting for request slot\n"); + task->tk_status = -EAGAIN; + task->tk_timeout = 0; + rpc_sleep_on(&xprt->backlog, task, NULL, NULL); +} + +void +xprt_reserve(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + + task->tk_status = -EIO; + if (!xprt->shutdown) { + spin_lock(&xprt->xprt_lock); + do_xprt_reserve(task); + spin_unlock(&xprt->xprt_lock); + if (task->tk_rqstp) + del_timer_sync(&xprt->timer); + } +} + +/* + * Allocate a 'unique' XID + */ +static inline u32 xprt_alloc_xid(struct rpc_xprt *xprt) +{ + return xprt->xid++; +} + +static inline void xprt_init_xid(struct rpc_xprt *xprt) +{ + get_random_bytes(&xprt->xid, sizeof(xprt->xid)); +} + +/* + * Initialize RPC request + */ +static void +xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) +{ + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = xprt->timeout.to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_xid = xprt_alloc_xid(xprt); + dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +/* + * Release an RPC call slot + */ +void +xprt_release(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req; + + if (!(req = task->tk_rqstp)) + return; + spin_lock_bh(&xprt->sock_lock); + __xprt_release_write(xprt, task); + __xprt_put_cong(xprt, req); + if (!list_empty(&req->rq_list)) + list_del(&req->rq_list); + xprt->last_used = jiffies; + if (list_empty(&xprt->recv) && !xprt->shutdown) + mod_timer(&xprt->timer, xprt->last_used + XPRT_IDLE_TIMEOUT); + spin_unlock_bh(&xprt->sock_lock); + task->tk_rqstp = NULL; + memset(req, 0, sizeof(*req)); /* mark unused */ + + dprintk("RPC: %4d release request %p\n", task->tk_pid, req); + + spin_lock(&xprt->xprt_lock); + list_add(&req->rq_list, &xprt->free); + xprt_clear_backlog(xprt); + spin_unlock(&xprt->xprt_lock); +} + +/* + * Set default timeout parameters + */ +static void +xprt_default_timeout(struct rpc_timeout *to, int proto) +{ + if (proto == IPPROTO_UDP) + xprt_set_timeout(to, 5, 5 * HZ); + else + xprt_set_timeout(to, 5, 60 * HZ); +} + +/* + * Set constant timeout + */ +void +xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) +{ + to->to_initval = + to->to_increment = incr; + to->to_maxval = incr * retr; + to->to_retries = retr; + to->to_exponential = 0; +} + +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + +/* + * Initialize an RPC client + */ +static struct rpc_xprt * +xprt_setup(int proto, struct sockaddr_in *ap, struct rpc_timeout *to) +{ + struct rpc_xprt *xprt; + unsigned int entries; + size_t slot_table_size; + struct rpc_rqst *req; + + dprintk("RPC: setting up %s transport...\n", + proto == IPPROTO_UDP? "UDP" : "TCP"); + + entries = (proto == IPPROTO_TCP)? + xprt_tcp_slot_table_entries : xprt_udp_slot_table_entries; + + if ((xprt = kmalloc(sizeof(struct rpc_xprt), GFP_KERNEL)) == NULL) + return ERR_PTR(-ENOMEM); + memset(xprt, 0, sizeof(*xprt)); /* Nnnngh! 
*/ + xprt->max_reqs = entries; + slot_table_size = entries * sizeof(xprt->slot[0]); + xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); + if (xprt->slot == NULL) { + kfree(xprt); + return ERR_PTR(-ENOMEM); + } + memset(xprt->slot, 0, slot_table_size); + + xprt->addr = *ap; + xprt->prot = proto; + xprt->stream = (proto == IPPROTO_TCP)? 1 : 0; + if (xprt->stream) { + xprt->cwnd = RPC_MAXCWND(xprt); + xprt->nocong = 1; + xprt->max_payload = (1U << 31) - 1; + } else { + xprt->cwnd = RPC_INITCWND; + xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); + } + spin_lock_init(&xprt->sock_lock); + spin_lock_init(&xprt->xprt_lock); + init_waitqueue_head(&xprt->cong_wait); + + INIT_LIST_HEAD(&xprt->free); + INIT_LIST_HEAD(&xprt->recv); + INIT_WORK(&xprt->sock_connect, xprt_socket_connect, xprt); + INIT_WORK(&xprt->task_cleanup, xprt_socket_autoclose, xprt); + init_timer(&xprt->timer); + xprt->timer.function = xprt_init_autodisconnect; + xprt->timer.data = (unsigned long) xprt; + xprt->last_used = jiffies; + xprt->port = XPRT_MAX_RESVPORT; + + /* Set timeout parameters */ + if (to) { + xprt->timeout = *to; + } else + xprt_default_timeout(&xprt->timeout, xprt->prot); + + rpc_init_wait_queue(&xprt->pending, "xprt_pending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->resend, "xprt_resend"); + rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); + + /* initialize free list */ + for (req = &xprt->slot[entries-1]; req >= &xprt->slot[0]; req--) + list_add(&req->rq_list, &xprt->free); + + xprt_init_xid(xprt); + + /* Check whether we want to use a reserved port */ + xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + + dprintk("RPC: created transport %p with %u slots\n", xprt, + xprt->max_reqs); + + return xprt; +} + +/* + * Bind to a reserved port + */ +static inline int xprt_bindresvport(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sockaddr_in myaddr = { + .sin_family = AF_INET, + }; + int err, port; + + /* Were we already bound to a given port? 
Try to reuse it */ + port = xprt->port; + do { + myaddr.sin_port = htons(port); + err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, + sizeof(myaddr)); + if (err == 0) { + xprt->port = port; + return 0; + } + if (--port == 0) + port = XPRT_MAX_RESVPORT; + } while (err == -EADDRINUSE && port != xprt->port); + + printk("RPC: Can't bind to reserved port (%d).\n", -err); + return err; +} + +static void +xprt_bind_socket(struct rpc_xprt *xprt, struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (xprt->inet) + return; + + write_lock_bh(&sk->sk_callback_lock); + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + if (xprt->prot == IPPROTO_UDP) { + sk->sk_data_ready = udp_data_ready; + sk->sk_no_check = UDP_CSUM_NORCV; + xprt_set_connected(xprt); + } else { + tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ + sk->sk_data_ready = tcp_data_ready; + sk->sk_state_change = tcp_state_change; + xprt_clear_connected(xprt); + } + sk->sk_write_space = xprt_write_space; + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + write_unlock_bh(&sk->sk_callback_lock); + + return; +} + +/* + * Set socket buffer length + */ +void +xprt_sock_setbufsize(struct rpc_xprt *xprt) +{ + struct sock *sk = xprt->inet; + + if (xprt->stream) + return; + if (xprt->rcvsize) { + sk->sk_userlocks |= SOCK_RCVBUF_LOCK; + sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; + } + if (xprt->sndsize) { + sk->sk_userlocks |= SOCK_SNDBUF_LOCK; + sk->sk_sndbuf = xprt->sndsize * xprt->max_reqs * 2; + sk->sk_write_space(sk); + } +} + +/* + * Datastream sockets are created here, but xprt_connect will create + * and connect stream sockets. + */ +static struct socket * xprt_create_socket(struct rpc_xprt *xprt, int proto, int resvport) +{ + struct socket *sock; + int type, err; + + dprintk("RPC: xprt_create_socket(%s %d)\n", + (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + + type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + + if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { + printk("RPC: can't create socket (%d).\n", -err); + return NULL; + } + + /* If the caller has the capability, bind to a reserved port */ + if (resvport && xprt_bindresvport(xprt, sock) < 0) { + printk("RPC: can't bind to reserved port.\n"); + goto failed; + } + + return sock; + +failed: + sock_release(sock); + return NULL; +} + +/* + * Create an RPC client transport given the protocol and peer address. + */ +struct rpc_xprt * +xprt_create_proto(int proto, struct sockaddr_in *sap, struct rpc_timeout *to) +{ + struct rpc_xprt *xprt; + + xprt = xprt_setup(proto, sap, to); + if (IS_ERR(xprt)) + dprintk("RPC: xprt_create_proto failed\n"); + else + dprintk("RPC: xprt_create_proto created xprt %p\n", xprt); + return xprt; +} + +/* + * Prepare for transport shutdown. + */ +static void +xprt_shutdown(struct rpc_xprt *xprt) +{ + xprt->shutdown = 1; + rpc_wake_up(&xprt->sending); + rpc_wake_up(&xprt->resend); + rpc_wake_up(&xprt->pending); + rpc_wake_up(&xprt->backlog); + wake_up(&xprt->cong_wait); + del_timer_sync(&xprt->timer); +} + +/* + * Clear the xprt backlog queue + */ +static int +xprt_clear_backlog(struct rpc_xprt *xprt) { + rpc_wake_up_next(&xprt->backlog); + wake_up(&xprt->cong_wait); + return 1; +} + +/* + * Destroy an RPC transport, killing off all requests. 
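xprt_bindresvport() hunts for a free privileged port by counting down from the transport's last known port and wrapping back to XPRT_MAX_RESVPORT, giving up only once it has walked all the way around. A rough user-space equivalent, kept deliberately simple (the function name and the 1023 wrap point are illustrative choices, not the kernel's constants):

#include <errno.h>
#include <string.h>
#include <netinet/in.h>
#include <sys/socket.h>

/* Try to bind sockfd to some reserved port, scanning downward from 'start'. */
static int bind_resv_port(int sockfd, unsigned short start)
{
	struct sockaddr_in sin;
	unsigned short port = start;

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;

	do {
		sin.sin_port = htons(port);
		if (bind(sockfd, (struct sockaddr *)&sin, sizeof(sin)) == 0)
			return port;	/* success: caller can cache it for reuse */
		if (--port == 0)
			port = 1023;	/* wrap, as the kernel does with XPRT_MAX_RESVPORT */
	} while (errno == EADDRINUSE && port != start);

	return -1;
}

As in the kernel version, this only succeeds for a caller allowed to bind below 1024 (CAP_NET_BIND_SERVICE), which is why xprt_setup() records resvport from capable() and xprt_create_socket() makes the reserved bind conditional on it.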
+ */ +int +xprt_destroy(struct rpc_xprt *xprt) +{ + dprintk("RPC: destroying transport %p\n", xprt); + xprt_shutdown(xprt); + xprt_disconnect(xprt); + xprt_close(xprt); + kfree(xprt->slot); + kfree(xprt); + + return 0; +} diff --git a/net/sysctl_net.c b/net/sysctl_net.c new file mode 100644 index 000000000000..3f6e31069c54 --- /dev/null +++ b/net/sysctl_net.c @@ -0,0 +1,65 @@ +/* -*- linux-c -*- + * sysctl_net.c: sysctl interface to net subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net directories for each protocol family. [MS] + * + * $Log: sysctl_net.c,v $ + * Revision 1.2 1996/05/08 20:24:40 shaver + * Added bits for NET_BRIDGE and the NET_IPV4_ARP stuff and + * NET_IPV4_IP_FORWARD. + * + * + */ + +#include +#include +#include + +#ifdef CONFIG_INET +extern struct ctl_table ipv4_table[]; +#endif + +extern struct ctl_table core_table[]; + +#ifdef CONFIG_NET +extern struct ctl_table ether_table[]; +#endif + +#ifdef CONFIG_TR +extern struct ctl_table tr_table[]; +#endif + +struct ctl_table net_table[] = { + { + .ctl_name = NET_CORE, + .procname = "core", + .mode = 0555, + .child = core_table, + }, +#ifdef CONFIG_NET + { + .ctl_name = NET_ETHER, + .procname = "ethernet", + .mode = 0555, + .child = ether_table, + }, +#endif +#ifdef CONFIG_INET + { + .ctl_name = NET_IPV4, + .procname = "ipv4", + .mode = 0555, + .child = ipv4_table + }, +#endif +#ifdef CONFIG_TR + { + .ctl_name = NET_TR, + .procname = "token-ring", + .mode = 0555, + .child = tr_table, + }, +#endif + { 0 }, +}; diff --git a/net/unix/Makefile b/net/unix/Makefile new file mode 100644 index 000000000000..b852a2bde9a8 --- /dev/null +++ b/net/unix/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the Linux unix domain socket layer. +# + +obj-$(CONFIG_UNIX) += unix.o + +unix-y := af_unix.o garbage.o +unix-$(CONFIG_SYSCTL) += sysctl_net_unix.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c new file mode 100644 index 000000000000..acc73fe68698 --- /dev/null +++ b/net/unix/af_unix.c @@ -0,0 +1,2098 @@ +/* + * NET4: Implementation of BSD Unix domain sockets. + * + * Authors: Alan Cox, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Version: $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $ + * + * Fixes: + * Linus Torvalds : Assorted bug cures. + * Niibe Yutaka : async I/O support. + * Carsten Paeth : PF_UNIX check, address fixes. + * Alan Cox : Limit size of allocated blocks. + * Alan Cox : Fixed the stupid socketpair bug. + * Alan Cox : BSD compatibility fine tuning. + * Alan Cox : Fixed a bug in connect when interrupted. + * Alan Cox : Sorted out a proper draft version of + * file descriptor passing hacked up from + * Mike Shaver's work. + * Marty Leisner : Fixes to fd passing + * Nick Nevin : recvmsg bugfix. + * Alan Cox : Started proper garbage collector + * Heiko EiBfeldt : Missing verify_area check + * Alan Cox : Started POSIXisms + * Andreas Schwab : Replace inode by dentry for proper + * reference counting + * Kirk Petersen : Made this a module + * Christoph Rohland : Elegant non-blocking accept/connect algorithm. + * Lots of bug fixes. + * Alexey Kuznetosv : Repaired (I hope) bugs introduces + * by above two patches. + * Andrea Arcangeli : If possible we block in connect(2) + * if the max backlog of the listen socket + * is been reached. 
This won't break + * old apps and it will avoid huge amount + * of socks hashed (this for unix_gc() + * performances reasons). + * Security fix that limits the max + * number of socks to 2*max_files and + * the number of skb queueable in the + * dgram receiver. + * Artur Skawina : Hash function optimizations + * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8) + * Malcolm Beattie : Set peercred for socketpair + * Michal Ostrowski : Module initialization cleanup. + * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT, + * the core infrastructure is doing that + * for all net proto families now (2.5.69+) + * + * + * Known differences from reference BSD that was tested: + * + * [TO FIX] + * ECONNREFUSED is not returned from one end of a connected() socket to the + * other the moment one end closes. + * fstat() doesn't return st_dev=0, and give the blksize as high water mark + * and a fake inode identifier (nor the BSD first socket fstat twice bug). + * [NOT TO FIX] + * accept() returns a path name even if the connecting socket has closed + * in the meantime (BSD loses the path and gives up). + * accept() returns 0 length path for an unbound connector. BSD returns 16 + * and a null first byte in the path (but not for gethost/peername - BSD bug ??) + * socketpair(...SOCK_RAW..) doesn't panic the kernel. + * BSD af_unix apparently has connect forgetting to block properly. + * (need to check this with the POSIX spec in detail) + * + * Differences from 2.0.0-11-... (ANK) + * Bug fixes and improvements. + * - client shutdown killed server socket. + * - removed all useless cli/sti pairs. + * + * Semantic changes/extensions. + * - generic control message passing. + * - SCM_CREDENTIALS control message. + * - "Abstract" (not FS based) socket bindings. + * Abstract names are sequences of bytes (not zero terminated) + * started by 0, so that this name space does not intersect + * with BSD names. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int sysctl_unix_max_dgram_qlen = 10; + +struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; +DEFINE_RWLOCK(unix_table_lock); +static atomic_t unix_nr_socks = ATOMIC_INIT(0); + +#define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) + +#define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE) + +/* + * SMP locking strategy: + * hash table is protected with rwlock unix_table_lock + * each socket state is protected by separate rwlock. 
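The header comment's distinction between filesystem and "abstract" names is the key to the hashing below: a path name is NUL-terminated and backed by an inode, while an abstract name starts with a zero byte and is identified purely by the bytes covered by the address length (unix_mkname() further down makes exactly this split). A small user-space sketch of building both address flavours (names and helper functions are purely illustrative):

#include <stddef.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

/* Filesystem binding: bind() will create a socket inode at this path. */
static socklen_t unix_path_addr(struct sockaddr_un *sun, const char *path)
{
	memset(sun, 0, sizeof(*sun));
	sun->sun_family = AF_UNIX;
	strncpy(sun->sun_path, path, sizeof(sun->sun_path) - 1);
	return offsetof(struct sockaddr_un, sun_path) + strlen(sun->sun_path) + 1;
}

/* Abstract binding: leading NUL, name is length-delimited, no inode at all. */
static socklen_t unix_abstract_addr(struct sockaddr_un *sun,
				    const char *name, size_t namelen)
{
	memset(sun, 0, sizeof(*sun));
	sun->sun_family = AF_UNIX;
	sun->sun_path[0] = '\0';
	memcpy(sun->sun_path + 1, name, namelen);
	return offsetof(struct sockaddr_un, sun_path) + 1 + namelen;
}

The returned address lengths matter: for abstract sockets every byte up to that length is part of the name, which is why unix_mkname() hashes len bytes with csum_partial() instead of calling strlen().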
+ */ + +static inline unsigned unix_hash_fold(unsigned hash) +{ + hash ^= hash>>16; + hash ^= hash>>8; + return hash&(UNIX_HASH_SIZE-1); +} + +#define unix_peer(sk) (unix_sk(sk)->peer) + +static inline int unix_our_peer(struct sock *sk, struct sock *osk) +{ + return unix_peer(osk) == sk; +} + +static inline int unix_may_send(struct sock *sk, struct sock *osk) +{ + return (unix_peer(osk) == NULL || unix_our_peer(sk, osk)); +} + +static struct sock *unix_peer_get(struct sock *s) +{ + struct sock *peer; + + unix_state_rlock(s); + peer = unix_peer(s); + if (peer) + sock_hold(peer); + unix_state_runlock(s); + return peer; +} + +static inline void unix_release_addr(struct unix_address *addr) +{ + if (atomic_dec_and_test(&addr->refcnt)) + kfree(addr); +} + +/* + * Check unix socket name: + * - should be not zero length. + * - if started by not zero, should be NULL terminated (FS object) + * - if started by zero, it is abstract name. + */ + +static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp) +{ + if (len <= sizeof(short) || len > sizeof(*sunaddr)) + return -EINVAL; + if (!sunaddr || sunaddr->sun_family != AF_UNIX) + return -EINVAL; + if (sunaddr->sun_path[0]) { + /* + * This may look like an off by one error but it is a bit more + * subtle. 108 is the longest valid AF_UNIX path for a binding. + * sun_path[108] doesnt as such exist. However in kernel space + * we are guaranteed that it is a valid memory location in our + * kernel address buffer. + */ + ((char *)sunaddr)[len]=0; + len = strlen(sunaddr->sun_path)+1+sizeof(short); + return len; + } + + *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0)); + return len; +} + +static void __unix_remove_socket(struct sock *sk) +{ + sk_del_node_init(sk); +} + +static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) +{ + BUG_TRAP(sk_unhashed(sk)); + sk_add_node(sk, list); +} + +static inline void unix_remove_socket(struct sock *sk) +{ + write_lock(&unix_table_lock); + __unix_remove_socket(sk); + write_unlock(&unix_table_lock); +} + +static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) +{ + write_lock(&unix_table_lock); + __unix_insert_socket(list, sk); + write_unlock(&unix_table_lock); +} + +static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, + int len, int type, unsigned hash) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &unix_socket_table[hash ^ type]) { + struct unix_sock *u = unix_sk(s); + + if (u->addr->len == len && + !memcmp(u->addr->name, sunname, len)) + goto found; + } + s = NULL; +found: + return s; +} + +static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, + int len, int type, + unsigned hash) +{ + struct sock *s; + + read_lock(&unix_table_lock); + s = __unix_find_socket_byname(sunname, len, type, hash); + if (s) + sock_hold(s); + read_unlock(&unix_table_lock); + return s; +} + +static struct sock *unix_find_socket_byinode(struct inode *i) +{ + struct sock *s; + struct hlist_node *node; + + read_lock(&unix_table_lock); + sk_for_each(s, node, + &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { + struct dentry *dentry = unix_sk(s)->dentry; + + if(dentry && dentry->d_inode == i) + { + sock_hold(s); + goto found; + } + } + s = NULL; +found: + read_unlock(&unix_table_lock); + return s; +} + +static inline int unix_writable(struct sock *sk) +{ + return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf; +} + +static void unix_write_space(struct sock *sk) +{ + 
read_lock(&sk->sk_callback_lock); + if (unix_writable(sk)) { + if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + wake_up_interruptible(sk->sk_sleep); + sk_wake_async(sk, 2, POLL_OUT); + } + read_unlock(&sk->sk_callback_lock); +} + +/* When dgram socket disconnects (or changes its peer), we clear its receive + * queue of packets arrived from previous peer. First, it allows to do + * flow control based only on wmem_alloc; second, sk connected to peer + * may receive messages only from that peer. */ +static void unix_dgram_disconnected(struct sock *sk, struct sock *other) +{ + if (skb_queue_len(&sk->sk_receive_queue)) { + skb_queue_purge(&sk->sk_receive_queue); + wake_up_interruptible_all(&unix_sk(sk)->peer_wait); + + /* If one link of bidirectional dgram pipe is disconnected, + * we signal error. Messages are lost. Do not make this, + * when peer was not connected to us. + */ + if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) { + other->sk_err = ECONNRESET; + other->sk_error_report(other); + } + } +} + +static void unix_sock_destructor(struct sock *sk) +{ + struct unix_sock *u = unix_sk(sk); + + skb_queue_purge(&sk->sk_receive_queue); + + BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc)); + BUG_TRAP(sk_unhashed(sk)); + BUG_TRAP(!sk->sk_socket); + if (!sock_flag(sk, SOCK_DEAD)) { + printk("Attempt to release alive unix socket: %p\n", sk); + return; + } + + if (u->addr) + unix_release_addr(u->addr); + + atomic_dec(&unix_nr_socks); +#ifdef UNIX_REFCNT_DEBUG + printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks)); +#endif +} + +static int unix_release_sock (struct sock *sk, int embrion) +{ + struct unix_sock *u = unix_sk(sk); + struct dentry *dentry; + struct vfsmount *mnt; + struct sock *skpair; + struct sk_buff *skb; + int state; + + unix_remove_socket(sk); + + /* Clear state */ + unix_state_wlock(sk); + sock_orphan(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + dentry = u->dentry; + u->dentry = NULL; + mnt = u->mnt; + u->mnt = NULL; + state = sk->sk_state; + sk->sk_state = TCP_CLOSE; + unix_state_wunlock(sk); + + wake_up_interruptible_all(&u->peer_wait); + + skpair=unix_peer(sk); + + if (skpair!=NULL) { + if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + unix_state_wlock(skpair); + /* No more writes */ + skpair->sk_shutdown = SHUTDOWN_MASK; + if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) + skpair->sk_err = ECONNRESET; + unix_state_wunlock(skpair); + skpair->sk_state_change(skpair); + read_lock(&skpair->sk_callback_lock); + sk_wake_async(skpair,1,POLL_HUP); + read_unlock(&skpair->sk_callback_lock); + } + sock_put(skpair); /* It may now die */ + unix_peer(sk) = NULL; + } + + /* Try to flush out this socket. Throw out buffers at least */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (state==TCP_LISTEN) + unix_release_sock(skb->sk, 1); + /* passed fds are erased in the kfree_skb hook */ + kfree_skb(skb); + } + + if (dentry) { + dput(dentry); + mntput(mnt); + } + + sock_put(sk); + + /* ---- Socket is dead now and most probably destroyed ---- */ + + /* + * Fixme: BSD difference: In BSD all sockets connected to use get + * ECONNRESET and we die on the spot. In Linux we behave + * like files and pipes do and wait for the last + * dereference. + * + * Can't we simply set sock->err? + * + * What the above comment does talk about? 
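The effect of unix_dgram_disconnected() is visible from user space: if the peer of a connected datagram socket disconnects or re-targets its association while datagrams are still sitting unread in its queue, those datagrams are purged and the socket left behind is handed a pending ECONNRESET. A hedged sketch of how the leftover end would notice (assuming 'fd' started out as one half of a connected pair, e.g. from socketpair(); the wrapper name is illustrative):

#include <errno.h>
#include <sys/socket.h>

/* Returns 1 if the connected datagram peer has dropped the association
 * with messages outstanding, as signalled via unix_dgram_disconnected(). */
static int dgram_peer_gone(int fd)
{
	char buf[64];

	if (recv(fd, buf, sizeof(buf), MSG_DONTWAIT) < 0)
		return errno == ECONNRESET;
	return 0;
}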
--ANK(980817) + */ + + if (atomic_read(&unix_tot_inflight)) + unix_gc(); /* Garbage collect fds */ + + return 0; +} + +static int unix_listen(struct socket *sock, int backlog) +{ + int err; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + + err = -EOPNOTSUPP; + if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET) + goto out; /* Only stream/seqpacket sockets accept */ + err = -EINVAL; + if (!u->addr) + goto out; /* No listens on an unbound socket */ + unix_state_wlock(sk); + if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) + goto out_unlock; + if (backlog > sk->sk_max_ack_backlog) + wake_up_interruptible_all(&u->peer_wait); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + /* set credentials so connect can copy them */ + sk->sk_peercred.pid = current->tgid; + sk->sk_peercred.uid = current->euid; + sk->sk_peercred.gid = current->egid; + err = 0; + +out_unlock: + unix_state_wunlock(sk); +out: + return err; +} + +static int unix_release(struct socket *); +static int unix_bind(struct socket *, struct sockaddr *, int); +static int unix_stream_connect(struct socket *, struct sockaddr *, + int addr_len, int flags); +static int unix_socketpair(struct socket *, struct socket *); +static int unix_accept(struct socket *, struct socket *, int); +static int unix_getname(struct socket *, struct sockaddr *, int *, int); +static unsigned int unix_poll(struct file *, struct socket *, poll_table *); +static int unix_ioctl(struct socket *, unsigned int, unsigned long); +static int unix_shutdown(struct socket *, int); +static int unix_stream_sendmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t); +static int unix_stream_recvmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t, int); +static int unix_dgram_sendmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t); +static int unix_dgram_recvmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t, int); +static int unix_dgram_connect(struct socket *, struct sockaddr *, + int, int); +static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, + struct msghdr *, size_t); + +static struct proto_ops unix_stream_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = unix_release, + .bind = unix_bind, + .connect = unix_stream_connect, + .socketpair = unix_socketpair, + .accept = unix_accept, + .getname = unix_getname, + .poll = unix_poll, + .ioctl = unix_ioctl, + .listen = unix_listen, + .shutdown = unix_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = unix_stream_sendmsg, + .recvmsg = unix_stream_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto_ops unix_dgram_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = unix_release, + .bind = unix_bind, + .connect = unix_dgram_connect, + .socketpair = unix_socketpair, + .accept = sock_no_accept, + .getname = unix_getname, + .poll = datagram_poll, + .ioctl = unix_ioctl, + .listen = sock_no_listen, + .shutdown = unix_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = unix_dgram_sendmsg, + .recvmsg = unix_dgram_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto_ops unix_seqpacket_ops = { + .family = PF_UNIX, + .owner = THIS_MODULE, + .release = unix_release, + .bind = unix_bind, + .connect = unix_stream_connect, + .socketpair = unix_socketpair, + .accept = unix_accept, + .getname = unix_getname, + .poll = datagram_poll, + .ioctl = 
unix_ioctl, + .listen = unix_listen, + .shutdown = unix_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = unix_seqpacket_sendmsg, + .recvmsg = unix_dgram_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct proto unix_proto = { + .name = "UNIX", + .owner = THIS_MODULE, + .obj_size = sizeof(struct unix_sock), +}; + +static struct sock * unix_create1(struct socket *sock) +{ + struct sock *sk = NULL; + struct unix_sock *u; + + if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + goto out; + + sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); + if (!sk) + goto out; + + atomic_inc(&unix_nr_socks); + + sock_init_data(sock,sk); + + sk->sk_write_space = unix_write_space; + sk->sk_max_ack_backlog = sysctl_unix_max_dgram_qlen; + sk->sk_destruct = unix_sock_destructor; + u = unix_sk(sk); + u->dentry = NULL; + u->mnt = NULL; + rwlock_init(&u->lock); + atomic_set(&u->inflight, sock ? 0 : -1); + init_MUTEX(&u->readsem); /* single task reading lock */ + init_waitqueue_head(&u->peer_wait); + unix_insert_socket(unix_sockets_unbound, sk); +out: + return sk; +} + +static int unix_create(struct socket *sock, int protocol) +{ + if (protocol && protocol != PF_UNIX) + return -EPROTONOSUPPORT; + + sock->state = SS_UNCONNECTED; + + switch (sock->type) { + case SOCK_STREAM: + sock->ops = &unix_stream_ops; + break; + /* + * Believe it or not BSD has AF_UNIX, SOCK_RAW though + * nothing uses it. + */ + case SOCK_RAW: + sock->type=SOCK_DGRAM; + case SOCK_DGRAM: + sock->ops = &unix_dgram_ops; + break; + case SOCK_SEQPACKET: + sock->ops = &unix_seqpacket_ops; + break; + default: + return -ESOCKTNOSUPPORT; + } + + return unix_create1(sock) ? 0 : -ENOMEM; +} + +static int unix_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + sock->sk = NULL; + + return unix_release_sock (sk, 0); +} + +static int unix_autobind(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + static u32 ordernum = 1; + struct unix_address * addr; + int err; + + down(&u->readsem); + + err = 0; + if (u->addr) + goto out; + + err = -ENOMEM; + addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + if (!addr) + goto out; + + memset(addr, 0, sizeof(*addr) + sizeof(short) + 16); + addr->name->sun_family = AF_UNIX; + atomic_set(&addr->refcnt, 1); + +retry: + addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); + addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); + + write_lock(&unix_table_lock); + ordernum = (ordernum+1)&0xFFFFF; + + if (__unix_find_socket_byname(addr->name, addr->len, sock->type, + addr->hash)) { + write_unlock(&unix_table_lock); + /* Sanity yield. It is unusual case, but yet... 
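unix_create() is where a plain socket(2) call for PF_UNIX ends up, and the mapping from socket type to ops table, including the BSD-compatible quirk of accepting SOCK_RAW, is directly observable from user space. A small illustration (return values deliberately unchecked; this is a sketch, not a test program):

#include <sys/socket.h>

static void unix_socket_types(void)
{
	int a = socket(AF_UNIX, SOCK_STREAM,    0);	/* unix_stream_ops */
	int b = socket(AF_UNIX, SOCK_DGRAM,     0);	/* unix_dgram_ops */
	int c = socket(AF_UNIX, SOCK_SEQPACKET, 0);	/* unix_seqpacket_ops */
	int d = socket(AF_UNIX, SOCK_RAW,       0);	/* quietly turned into SOCK_DGRAM */
	int e = socket(AF_UNIX, SOCK_RDM,       0);	/* fails with ESOCKTNOSUPPORT */
	int f = socket(AF_UNIX, SOCK_STREAM,    7);	/* fails with EPROTONOSUPPORT: protocol must be 0 (or PF_UNIX) */

	(void)a; (void)b; (void)c; (void)d; (void)e; (void)f;
}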
*/ + if (!(ordernum&0xFF)) + yield(); + goto retry; + } + addr->hash ^= sk->sk_type; + + __unix_remove_socket(sk); + u->addr = addr; + __unix_insert_socket(&unix_socket_table[addr->hash], sk); + write_unlock(&unix_table_lock); + err = 0; + +out: up(&u->readsem); + return err; +} + +static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, + int type, unsigned hash, int *error) +{ + struct sock *u; + struct nameidata nd; + int err = 0; + + if (sunname->sun_path[0]) { + err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); + if (err) + goto fail; + err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); + if (err) + goto put_fail; + + err = -ECONNREFUSED; + if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) + goto put_fail; + u=unix_find_socket_byinode(nd.dentry->d_inode); + if (!u) + goto put_fail; + + if (u->sk_type == type) + touch_atime(nd.mnt, nd.dentry); + + path_release(&nd); + + err=-EPROTOTYPE; + if (u->sk_type != type) { + sock_put(u); + goto fail; + } + } else { + err = -ECONNREFUSED; + u=unix_find_socket_byname(sunname, len, type, hash); + if (u) { + struct dentry *dentry; + dentry = unix_sk(u)->dentry; + if (dentry) + touch_atime(unix_sk(u)->mnt, dentry); + } else + goto fail; + } + return u; + +put_fail: + path_release(&nd); +fail: + *error=err; + return NULL; +} + + +static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + struct dentry * dentry = NULL; + struct nameidata nd; + int err; + unsigned hash; + struct unix_address *addr; + struct hlist_head *list; + + err = -EINVAL; + if (sunaddr->sun_family != AF_UNIX) + goto out; + + if (addr_len==sizeof(short)) { + err = unix_autobind(sock); + goto out; + } + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; + + down(&u->readsem); + + err = -EINVAL; + if (u->addr) + goto out_up; + + err = -ENOMEM; + addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL); + if (!addr) + goto out_up; + + memcpy(addr->name, sunaddr, addr_len); + addr->len = addr_len; + addr->hash = hash ^ sk->sk_type; + atomic_set(&addr->refcnt, 1); + + if (sunaddr->sun_path[0]) { + unsigned int mode; + err = 0; + /* + * Get the parent directory, calculate the hash for last + * component. + */ + err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd); + if (err) + goto out_mknod_parent; + /* + * Yucky last component or no last component at all? + * (foo/., foo/.., /////) + */ + err = -EEXIST; + if (nd.last_type != LAST_NORM) + goto out_mknod; + /* + * Lock the directory. + */ + down(&nd.dentry->d_inode->i_sem); + /* + * Do the final lookup. + */ + dentry = lookup_hash(&nd.last, nd.dentry); + err = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_mknod_unlock; + err = -ENOENT; + /* + * Special case - lookup gave negative, but... we had foo/bar/ + * From the vfs_mknod() POV we just have a negative dentry - + * all is fine. Let's be bastards - you had / on the end, you've + * been asking for (non-existent) directory. -ENOENT for you. + */ + if (nd.last.name[nd.last.len] && !dentry->d_inode) + goto out_mknod_dput; + /* + * All right, let's create it. 
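unix_bind() treats an address that carries nothing but the family (addr_len == sizeof(short)) as a request to autobind, which is what unix_autobind() services by generating a five-hex-digit abstract name. A user-space sketch of triggering it and reading the generated name back (function name illustrative, error handling omitted):

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

/* Bind with only sun_family filled in and no name bytes at all:
 * the kernel picks an abstract name of the form "\0XXXXX". */
static void autobind_demo(int fd)
{
	struct sockaddr_un sun;
	struct sockaddr_un got;
	socklen_t len = sizeof(got);

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNIX;
	bind(fd, (struct sockaddr *)&sun, sizeof(sa_family_t));

	getsockname(fd, (struct sockaddr *)&got, &len);
	/* got.sun_path[0] is '\0'; the generated hex digits follow it. */
	printf("autobound, address length %u\n", (unsigned)len);
}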
+ */ + mode = S_IFSOCK | + (SOCK_INODE(sock)->i_mode & ~current->fs->umask); + err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0); + if (err) + goto out_mknod_dput; + up(&nd.dentry->d_inode->i_sem); + dput(nd.dentry); + nd.dentry = dentry; + + addr->hash = UNIX_HASH_SIZE; + } + + write_lock(&unix_table_lock); + + if (!sunaddr->sun_path[0]) { + err = -EADDRINUSE; + if (__unix_find_socket_byname(sunaddr, addr_len, + sk->sk_type, hash)) { + unix_release_addr(addr); + goto out_unlock; + } + + list = &unix_socket_table[addr->hash]; + } else { + list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; + u->dentry = nd.dentry; + u->mnt = nd.mnt; + } + + err = 0; + __unix_remove_socket(sk); + u->addr = addr; + __unix_insert_socket(list, sk); + +out_unlock: + write_unlock(&unix_table_lock); +out_up: + up(&u->readsem); +out: + return err; + +out_mknod_dput: + dput(dentry); +out_mknod_unlock: + up(&nd.dentry->d_inode->i_sem); +out_mknod: + path_release(&nd); +out_mknod_parent: + if (err==-EEXIST) + err=-EADDRINUSE; + unix_release_addr(addr); + goto out_up; +} + +static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, + int alen, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr; + struct sock *other; + unsigned hash; + int err; + + if (addr->sa_family != AF_UNSPEC) { + err = unix_mkname(sunaddr, alen, &hash); + if (err < 0) + goto out; + alen = err; + + if (test_bit(SOCK_PASSCRED, &sock->flags) && + !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) + goto out; + + other=unix_find_other(sunaddr, alen, sock->type, hash, &err); + if (!other) + goto out; + + unix_state_wlock(sk); + + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + + err = security_unix_may_send(sk->sk_socket, other->sk_socket); + if (err) + goto out_unlock; + + } else { + /* + * 1003.1g breaking connected state with AF_UNSPEC + */ + other = NULL; + unix_state_wlock(sk); + } + + /* + * If it was connected, reconnect. 
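unix_dgram_connect() also implements the 1003.1g convention that connecting to an address whose family is AF_UNSPEC dissolves an existing datagram association instead of forming a new one. A short user-space sketch of using it (the wrapper name is only illustrative):

#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>

/* Break the peer association of a connected AF_UNIX datagram socket. */
static int dgram_disconnect(int fd)
{
	struct sockaddr_un sun;

	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_UNSPEC;
	return connect(fd, (struct sockaddr *)&sun, sizeof(sun));
}

After this returns, the old peer has been released (and unix_dgram_disconnected() has purged anything still queued from it), and the socket is free to connect() elsewhere or fall back to sendto() with explicit addresses.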
+ */ + if (unix_peer(sk)) { + struct sock *old_peer = unix_peer(sk); + unix_peer(sk)=other; + unix_state_wunlock(sk); + + if (other != old_peer) + unix_dgram_disconnected(sk, old_peer); + sock_put(old_peer); + } else { + unix_peer(sk)=other; + unix_state_wunlock(sk); + } + return 0; + +out_unlock: + unix_state_wunlock(sk); + sock_put(other); +out: + return err; +} + +static long unix_wait_for_peer(struct sock *other, long timeo) +{ + struct unix_sock *u = unix_sk(other); + int sched; + DEFINE_WAIT(wait); + + prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE); + + sched = !sock_flag(other, SOCK_DEAD) && + !(other->sk_shutdown & RCV_SHUTDOWN) && + (skb_queue_len(&other->sk_receive_queue) > + other->sk_max_ack_backlog); + + unix_state_runlock(other); + + if (sched) + timeo = schedule_timeout(timeo); + + finish_wait(&u->peer_wait, &wait); + return timeo; +} + +static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk), *newu, *otheru; + struct sock *newsk = NULL; + struct sock *other = NULL; + struct sk_buff *skb = NULL; + unsigned hash; + int st; + int err; + long timeo; + + err = unix_mkname(sunaddr, addr_len, &hash); + if (err < 0) + goto out; + addr_len = err; + + if (test_bit(SOCK_PASSCRED, &sock->flags) + && !u->addr && (err = unix_autobind(sock)) != 0) + goto out; + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + + /* First of all allocate resources. + If we will make it after state is locked, + we will have to recheck all again in any case. + */ + + err = -ENOMEM; + + /* create new sock for complete connection */ + newsk = unix_create1(NULL); + if (newsk == NULL) + goto out; + + /* Allocate skb for sending to listening sock */ + skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); + if (skb == NULL) + goto out; + +restart: + /* Find listening sock. */ + other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err); + if (!other) + goto out; + + /* Latch state of peer */ + unix_state_rlock(other); + + /* Apparently VFS overslept socket death. Retry. */ + if (sock_flag(other, SOCK_DEAD)) { + unix_state_runlock(other); + sock_put(other); + goto restart; + } + + err = -ECONNREFUSED; + if (other->sk_state != TCP_LISTEN) + goto out_unlock; + + if (skb_queue_len(&other->sk_receive_queue) > + other->sk_max_ack_backlog) { + err = -EAGAIN; + if (!timeo) + goto out_unlock; + + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out; + sock_put(other); + goto restart; + } + + /* Latch our state. + + It is tricky place. We need to grab write lock and cannot + drop lock on peer. It is dangerous because deadlock is + possible. Connect to self case and simultaneous + attempt to connect are eliminated by checking socket + state. other is TCP_LISTEN, if sk is TCP_LISTEN we + check this before attempt to grab lock. + + Well, and we have to recheck the state after socket locked. + */ + st = sk->sk_state; + + switch (st) { + case TCP_CLOSE: + /* This is ok... 
continue with connect */ + break; + case TCP_ESTABLISHED: + /* Socket is already connected */ + err = -EISCONN; + goto out_unlock; + default: + err = -EINVAL; + goto out_unlock; + } + + unix_state_wlock(sk); + + if (sk->sk_state != st) { + unix_state_wunlock(sk); + unix_state_runlock(other); + sock_put(other); + goto restart; + } + + err = security_unix_stream_connect(sock, other->sk_socket, newsk); + if (err) { + unix_state_wunlock(sk); + goto out_unlock; + } + + /* The way is open! Fastly set all the necessary fields... */ + + sock_hold(sk); + unix_peer(newsk) = sk; + newsk->sk_state = TCP_ESTABLISHED; + newsk->sk_type = sk->sk_type; + newsk->sk_peercred.pid = current->tgid; + newsk->sk_peercred.uid = current->euid; + newsk->sk_peercred.gid = current->egid; + newu = unix_sk(newsk); + newsk->sk_sleep = &newu->peer_wait; + otheru = unix_sk(other); + + /* copy address information from listening to new sock*/ + if (otheru->addr) { + atomic_inc(&otheru->addr->refcnt); + newu->addr = otheru->addr; + } + if (otheru->dentry) { + newu->dentry = dget(otheru->dentry); + newu->mnt = mntget(otheru->mnt); + } + + /* Set credentials */ + sk->sk_peercred = other->sk_peercred; + + sock_hold(newsk); + unix_peer(sk) = newsk; + sock->state = SS_CONNECTED; + sk->sk_state = TCP_ESTABLISHED; + + unix_state_wunlock(sk); + + /* take ten and and send info to listening sock */ + spin_lock(&other->sk_receive_queue.lock); + __skb_queue_tail(&other->sk_receive_queue, skb); + /* Undo artificially decreased inflight after embrion + * is installed to listening socket. */ + atomic_inc(&newu->inflight); + spin_unlock(&other->sk_receive_queue.lock); + unix_state_runlock(other); + other->sk_data_ready(other, 0); + sock_put(other); + return 0; + +out_unlock: + if (other) + unix_state_runlock(other); + +out: + if (skb) + kfree_skb(skb); + if (newsk) + unix_release_sock(newsk, 0); + if (other) + sock_put(other); + return err; +} + +static int unix_socketpair(struct socket *socka, struct socket *sockb) +{ + struct sock *ska=socka->sk, *skb = sockb->sk; + + /* Join our sockets back to back */ + sock_hold(ska); + sock_hold(skb); + unix_peer(ska)=skb; + unix_peer(skb)=ska; + ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid; + ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid; + ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid; + + if (ska->sk_type != SOCK_DGRAM) { + ska->sk_state = TCP_ESTABLISHED; + skb->sk_state = TCP_ESTABLISHED; + socka->state = SS_CONNECTED; + sockb->state = SS_CONNECTED; + } + return 0; +} + +static int unix_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct sock *tsk; + struct sk_buff *skb; + int err; + + err = -EOPNOTSUPP; + if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET) + goto out; + + err = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out; + + /* If socket state is TCP_LISTEN it cannot change (for now...), + * so that no locks are necessary. + */ + + skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); + if (!skb) { + /* This means receive shutdown. 
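The connect, listen and socketpair paths all fill in sk_peercred with the pid/uid/gid of the task that created the endpoint, which is what SO_PEERCRED later reports back to user space. A quick sketch of checking that plumbing (glibc wants _GNU_SOURCE for struct ucred; error paths trimmed):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/socket.h>

static void peercred_demo(void)
{
	int fds[2];
	struct ucred cred;
	socklen_t len = sizeof(cred);

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) == 0 &&
	    getsockopt(fds[0], SOL_SOCKET, SO_PEERCRED, &cred, &len) == 0)
		printf("peer pid=%d uid=%u gid=%u\n",
		       (int)cred.pid, (unsigned)cred.uid, (unsigned)cred.gid);
}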
*/ + if (err == 0) + err = -EINVAL; + goto out; + } + + tsk = skb->sk; + skb_free_datagram(sk, skb); + wake_up_interruptible(&unix_sk(sk)->peer_wait); + + /* attach accepted sock to socket */ + unix_state_wlock(tsk); + newsock->state = SS_CONNECTED; + sock_graft(tsk, newsock); + unix_state_wunlock(tsk); + return 0; + +out: + return err; +} + + +static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer) +{ + struct sock *sk = sock->sk; + struct unix_sock *u; + struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; + int err = 0; + + if (peer) { + sk = unix_peer_get(sk); + + err = -ENOTCONN; + if (!sk) + goto out; + err = 0; + } else { + sock_hold(sk); + } + + u = unix_sk(sk); + unix_state_rlock(sk); + if (!u->addr) { + sunaddr->sun_family = AF_UNIX; + sunaddr->sun_path[0] = 0; + *uaddr_len = sizeof(short); + } else { + struct unix_address *addr = u->addr; + + *uaddr_len = addr->len; + memcpy(sunaddr, addr->name, *uaddr_len); + } + unix_state_runlock(sk); + sock_put(sk); +out: + return err; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + skb->destructor = sock_wfree; + UNIXCB(skb).fp = NULL; + + for (i=scm->fp->count-1; i>=0; i--) + unix_notinflight(scm->fp->fp[i]); +} + +static void unix_destruct_fds(struct sk_buff *skb) +{ + struct scm_cookie scm; + memset(&scm, 0, sizeof(scm)); + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + +static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + for (i=scm->fp->count-1; i>=0; i--) + unix_inflight(scm->fp->fp[i]); + UNIXCB(skb).fp = scm->fp; + skb->destructor = unix_destruct_fds; + scm->fp = NULL; +} + +/* + * Send AF_UNIX data. 
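unix_attach_fds() and unix_detach_fds() are the kernel half of descriptor passing: the file references arrive via an SCM_RIGHTS control message built by the sender. The canonical user-space counterpart, sending one descriptor alongside a one-byte payload, looks roughly like this (a sketch with minimal error handling; the function name is illustrative):

#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Pass one open file descriptor across an AF_UNIX socket. */
static ssize_t send_fd(int sock, int fd_to_pass)
{
	char payload = 'x';
	struct iovec iov = { .iov_base = &payload, .iov_len = 1 };
	union {
		struct cmsghdr align;		/* ensures correct alignment */
		char buf[CMSG_SPACE(sizeof(int))];
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),
	};
	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);

	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	return sendmsg(sock, &msg, 0);
}

On the send side scm_send() turns the control message into the scm_cookie whose fp list unix_attach_fds() pins with unix_inflight(); on the receive side unix_detach_fds() hands the references back out through scm_recv().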
+ */ + +static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + struct sockaddr_un *sunaddr=msg->msg_name; + struct sock *other = NULL; + int namelen = 0; /* fake GCC */ + int err; + unsigned hash; + struct sk_buff *skb; + long timeo; + struct scm_cookie tmp_scm; + + if (NULL == siocb->scm) + siocb->scm = &tmp_scm; + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out; + + if (msg->msg_namelen) { + err = unix_mkname(sunaddr, msg->msg_namelen, &hash); + if (err < 0) + goto out; + namelen = err; + } else { + sunaddr = NULL; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out; + } + + if (test_bit(SOCK_PASSCRED, &sock->flags) + && !u->addr && (err = unix_autobind(sock)) != 0) + goto out; + + err = -EMSGSIZE; + if (len > sk->sk_sndbuf - 32) + goto out; + + skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err); + if (skb==NULL) + goto out; + + memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + if (siocb->scm->fp) + unix_attach_fds(siocb->scm, skb); + + skb->h.raw = skb->data; + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + if (err) + goto out_free; + + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); + +restart: + if (!other) { + err = -ECONNRESET; + if (sunaddr == NULL) + goto out_free; + + other = unix_find_other(sunaddr, namelen, sk->sk_type, + hash, &err); + if (other==NULL) + goto out_free; + } + + unix_state_rlock(other); + err = -EPERM; + if (!unix_may_send(sk, other)) + goto out_unlock; + + if (sock_flag(other, SOCK_DEAD)) { + /* + * Check with 1003.1g - what should + * datagram error + */ + unix_state_runlock(other); + sock_put(other); + + err = 0; + unix_state_wlock(sk); + if (unix_peer(sk) == other) { + unix_peer(sk)=NULL; + unix_state_wunlock(sk); + + unix_dgram_disconnected(sk, other); + sock_put(other); + err = -ECONNREFUSED; + } else { + unix_state_wunlock(sk); + } + + other = NULL; + if (err) + goto out_free; + goto restart; + } + + err = -EPIPE; + if (other->sk_shutdown & RCV_SHUTDOWN) + goto out_unlock; + + if (sk->sk_type != SOCK_SEQPACKET) { + err = security_unix_may_send(sk->sk_socket, other->sk_socket); + if (err) + goto out_unlock; + } + + if (unix_peer(other) != sk && + (skb_queue_len(&other->sk_receive_queue) > + other->sk_max_ack_backlog)) { + if (!timeo) { + err = -EAGAIN; + goto out_unlock; + } + + timeo = unix_wait_for_peer(other, timeo); + + err = sock_intr_errno(timeo); + if (signal_pending(current)) + goto out_free; + + goto restart; + } + + skb_queue_tail(&other->sk_receive_queue, skb); + unix_state_runlock(other); + other->sk_data_ready(other, len); + sock_put(other); + scm_destroy(siocb->scm); + return len; + +out_unlock: + unix_state_runlock(other); +out_free: + kfree_skb(skb); +out: + if (other) + sock_put(other); + scm_destroy(siocb->scm); + return err; +} + + +static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock_iocb *siocb = kiocb_to_siocb(kiocb); + struct sock *sk = sock->sk; + struct sock *other = NULL; + struct sockaddr_un *sunaddr=msg->msg_name; + int err,size; + struct sk_buff *skb; + int sent=0; + struct scm_cookie tmp_scm; + + if (NULL == siocb->scm) + siocb->scm = &tmp_scm; + err = scm_send(sock, msg, siocb->scm); + if (err < 0) + return err; + + err 
= -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out_err; + + if (msg->msg_namelen) { + err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; + goto out_err; + } else { + sunaddr = NULL; + err = -ENOTCONN; + other = unix_peer_get(sk); + if (!other) + goto out_err; + } + + if (sk->sk_shutdown & SEND_SHUTDOWN) + goto pipe_err; + + while(sent < len) + { + /* + * Optimisation for the fact that under 0.01% of X messages typically + * need breaking up. + */ + + size=len-sent; + + /* Keep two messages in the pipe so it schedules better */ + if (size > sk->sk_sndbuf / 2 - 64) + size = sk->sk_sndbuf / 2 - 64; + + if (size > SKB_MAX_ALLOC) + size = SKB_MAX_ALLOC; + + /* + * Grab a buffer + */ + + skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); + + if (skb==NULL) + goto out_err; + + /* + * If you pass two values to the sock_alloc_send_skb + * it tries to grab the large buffer with GFP_NOFS + * (which can fail easily), and if it fails grab the + * fallback size buffer which is under a page and will + * succeed. [Alan] + */ + size = min_t(int, size, skb_tailroom(skb)); + + memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); + if (siocb->scm->fp) + unix_attach_fds(siocb->scm, skb); + + if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) { + kfree_skb(skb); + goto out_err; + } + + unix_state_rlock(other); + + if (sock_flag(other, SOCK_DEAD) || + (other->sk_shutdown & RCV_SHUTDOWN)) + goto pipe_err_free; + + skb_queue_tail(&other->sk_receive_queue, skb); + unix_state_runlock(other); + other->sk_data_ready(other, size); + sent+=size; + } + sock_put(other); + + scm_destroy(siocb->scm); + siocb->scm = NULL; + + return sent; + +pipe_err_free: + unix_state_runlock(other); + kfree_skb(skb); +pipe_err: + if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL)) + send_sig(SIGPIPE,current,0); + err = -EPIPE; +out_err: + if (other) + sock_put(other); + scm_destroy(siocb->scm); + siocb->scm = NULL; + return sent ? 
: err; +} + +static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + int err; + struct sock *sk = sock->sk; + + err = sock_error(sk); + if (err) + return err; + + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + + if (msg->msg_namelen) + msg->msg_namelen = 0; + + return unix_dgram_sendmsg(kiocb, sock, msg, len); +} + +static void unix_copy_addr(struct msghdr *msg, struct sock *sk) +{ + struct unix_sock *u = unix_sk(sk); + + msg->msg_namelen = 0; + if (u->addr) { + msg->msg_namelen = u->addr->len; + memcpy(msg->msg_name, u->addr->name, u->addr->len); + } +} + +static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(iocb); + struct scm_cookie tmp_scm; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + int noblock = flags & MSG_DONTWAIT; + struct sk_buff *skb; + int err; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + msg->msg_namelen = 0; + + down(&u->readsem); + + skb = skb_recv_datagram(sk, flags, noblock, &err); + if (!skb) + goto out_unlock; + + wake_up_interruptible(&u->peer_wait); + + if (msg->msg_name) + unix_copy_addr(msg, skb->sk); + + if (size > skb->len) + size = skb->len; + else if (size < skb->len) + msg->msg_flags |= MSG_TRUNC; + + err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size); + if (err) + goto out_free; + + if (!siocb->scm) { + siocb->scm = &tmp_scm; + memset(&tmp_scm, 0, sizeof(tmp_scm)); + } + siocb->scm->creds = *UNIXCREDS(skb); + + if (!(flags & MSG_PEEK)) + { + if (UNIXCB(skb).fp) + unix_detach_fds(siocb->scm, skb); + } + else + { + /* It is questionable: on PEEK we could: + - do not return fds - good, but too simple 8) + - return fds, and do not return them on read (old strategy, + apparently wrong) + - clone fds (I chose it for now, it is the most universal + solution) + + POSIX 1003.1g does not actually define this clearly + at all. POSIX 1003.1g doesn't define a lot of things + clearly however! + + */ + if (UNIXCB(skb).fp) + siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + } + err = size; + + scm_recv(sock, msg, siocb->scm, flags); + +out_free: + skb_free_datagram(sk,skb); +out_unlock: + up(&u->readsem); +out: + return err; +} + +/* + * Sleep until data has arrive. But check for races.. 
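unix_dgram_recvmsg() copies at most the requested size and, when the datagram was larger, quietly drops the tail and sets MSG_TRUNC in msg_flags, so a receiver that cares about whole messages has to look at that flag. A small sketch of doing so (wrapper name illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>

/* Receive one datagram; fail instead of silently accepting a truncated one. */
static ssize_t recv_whole_dgram(int fd, void *buf, size_t len)
{
	struct iovec iov = { .iov_base = buf, .iov_len = len };
	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1 };
	ssize_t n = recvmsg(fd, &msg, 0);

	if (n >= 0 && (msg.msg_flags & MSG_TRUNC))
		return -1;	/* the datagram did not fit in 'len' bytes */
	return n;
}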
+ */ + +static long unix_stream_data_wait(struct sock * sk, long timeo) +{ + DEFINE_WAIT(wait); + + unix_state_rlock(sk); + + for (;;) { + prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); + + if (skb_queue_len(&sk->sk_receive_queue) || + sk->sk_err || + (sk->sk_shutdown & RCV_SHUTDOWN) || + signal_pending(current) || + !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + unix_state_runlock(sk); + timeo = schedule_timeout(timeo); + unix_state_rlock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + } + + finish_wait(sk->sk_sleep, &wait); + unix_state_runlock(sk); + return timeo; +} + + + +static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock_iocb *siocb = kiocb_to_siocb(iocb); + struct scm_cookie tmp_scm; + struct sock *sk = sock->sk; + struct unix_sock *u = unix_sk(sk); + struct sockaddr_un *sunaddr=msg->msg_name; + int copied = 0; + int check_creds = 0; + int target; + int err = 0; + long timeo; + + err = -EINVAL; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + err = -EOPNOTSUPP; + if (flags&MSG_OOB) + goto out; + + target = sock_rcvlowat(sk, flags&MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT); + + msg->msg_namelen = 0; + + /* Lock the socket to prevent queue disordering + * while sleeps in memcpy_tomsg + */ + + if (!siocb->scm) { + siocb->scm = &tmp_scm; + memset(&tmp_scm, 0, sizeof(tmp_scm)); + } + + down(&u->readsem); + + do + { + int chunk; + struct sk_buff *skb; + + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb==NULL) + { + if (copied >= target) + break; + + /* + * POSIX 1003.1g mandates this order. + */ + + if ((err = sock_error(sk)) != 0) + break; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + err = -EAGAIN; + if (!timeo) + break; + up(&u->readsem); + + timeo = unix_stream_data_wait(sk, timeo); + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + down(&u->readsem); + continue; + } + + if (check_creds) { + /* Never glue messages from different writers */ + if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } else { + /* Copy credentials */ + siocb->scm->creds = *UNIXCREDS(skb); + check_creds = 1; + } + + /* Copy address just once */ + if (sunaddr) + { + unix_copy_addr(msg, skb->sk); + sunaddr = NULL; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (copied == 0) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + /* Mark read part of skb as used */ + if (!(flags & MSG_PEEK)) + { + skb_pull(skb, chunk); + + if (UNIXCB(skb).fp) + unix_detach_fds(siocb->scm, skb); + + /* put the skb back if we didn't use it up.. */ + if (skb->len) + { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + + kfree_skb(skb); + + if (siocb->scm->fp) + break; + } + else + { + /* It is questionable, see note in unix_dgram_recvmsg. + */ + if (UNIXCB(skb).fp) + siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); + + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + + up(&u->readsem); + scm_recv(sock, msg, siocb->scm, flags); +out: + return copied ? 
: err; +} + +static int unix_shutdown(struct socket *sock, int mode) +{ + struct sock *sk = sock->sk; + struct sock *other; + + mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); + + if (mode) { + unix_state_wlock(sk); + sk->sk_shutdown |= mode; + other=unix_peer(sk); + if (other) + sock_hold(other); + unix_state_wunlock(sk); + sk->sk_state_change(sk); + + if (other && + (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) { + + int peer_mode = 0; + + if (mode&RCV_SHUTDOWN) + peer_mode |= SEND_SHUTDOWN; + if (mode&SEND_SHUTDOWN) + peer_mode |= RCV_SHUTDOWN; + unix_state_wlock(other); + other->sk_shutdown |= peer_mode; + unix_state_wunlock(other); + other->sk_state_change(other); + read_lock(&other->sk_callback_lock); + if (peer_mode == SHUTDOWN_MASK) + sk_wake_async(other,1,POLL_HUP); + else if (peer_mode & RCV_SHUTDOWN) + sk_wake_async(other,1,POLL_IN); + read_unlock(&other->sk_callback_lock); + } + if (other) + sock_put(other); + } + return 0; +} + +static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + long amount=0; + int err; + + switch(cmd) + { + case SIOCOUTQ: + amount = atomic_read(&sk->sk_wmem_alloc); + err = put_user(amount, (int __user *)arg); + break; + case SIOCINQ: + { + struct sk_buff *skb; + + if (sk->sk_state == TCP_LISTEN) { + err = -EINVAL; + break; + } + + spin_lock(&sk->sk_receive_queue.lock); + if (sk->sk_type == SOCK_STREAM || + sk->sk_type == SOCK_SEQPACKET) { + skb_queue_walk(&sk->sk_receive_queue, skb) + amount += skb->len; + } else { + skb = skb_peek(&sk->sk_receive_queue); + if (skb) + amount=skb->len; + } + spin_unlock(&sk->sk_receive_queue.lock); + err = put_user(amount, (int __user *)arg); + break; + } + + default: + err = dev_ioctl(cmd, (void __user *)arg); + break; + } + return err; +} + +static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + /* exceptional events? */ + if (sk->sk_err) + mask |= POLLERR; + if (sk->sk_shutdown == SHUTDOWN_MASK) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue) || + (sk->sk_shutdown & RCV_SHUTDOWN)) + mask |= POLLIN | POLLRDNORM; + + /* Connection-based need to check for termination and startup */ + if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE) + mask |= POLLHUP; + + /* + * we set writable also when the other side has shut down the + * connection. This prevents stuck sockets. + */ + if (unix_writable(sk)) + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + + return mask; +} + + +#ifdef CONFIG_PROC_FS +static struct sock *unix_seq_idx(int *iter, loff_t pos) +{ + loff_t off = 0; + struct sock *s; + + for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) { + if (off == pos) + return s; + ++off; + } + return NULL; +} + + +static void *unix_seq_start(struct seq_file *seq, loff_t *pos) +{ + read_lock(&unix_table_lock); + return *pos ? 
unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); +} + +static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + + if (v == (void *)1) + return first_unix_socket(seq->private); + return next_unix_socket(seq->private, v); +} + +static void unix_seq_stop(struct seq_file *seq, void *v) +{ + read_unlock(&unix_table_lock); +} + +static int unix_seq_show(struct seq_file *seq, void *v) +{ + + if (v == (void *)1) + seq_puts(seq, "Num RefCount Protocol Flags Type St " + "Inode Path\n"); + else { + struct sock *s = v; + struct unix_sock *u = unix_sk(s); + unix_state_rlock(s); + + seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu", + s, + atomic_read(&s->sk_refcnt), + 0, + s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0, + s->sk_type, + s->sk_socket ? + (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) : + (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING), + sock_i_ino(s)); + + if (u->addr) { + int i, len; + seq_putc(seq, ' '); + + i = 0; + len = u->addr->len - sizeof(short); + if (!UNIX_ABSTRACT(s)) + len--; + else { + seq_putc(seq, '@'); + i++; + } + for ( ; i < len; i++) + seq_putc(seq, u->addr->name->sun_path[i]); + } + unix_state_runlock(s); + seq_putc(seq, '\n'); + } + + return 0; +} + +static struct seq_operations unix_seq_ops = { + .start = unix_seq_start, + .next = unix_seq_next, + .stop = unix_seq_stop, + .show = unix_seq_show, +}; + + +static int unix_seq_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int rc = -ENOMEM; + int *iter = kmalloc(sizeof(int), GFP_KERNEL); + + if (!iter) + goto out; + + rc = seq_open(file, &unix_seq_ops); + if (rc) + goto out_kfree; + + seq = file->private_data; + seq->private = iter; + *iter = 0; +out: + return rc; +out_kfree: + kfree(iter); + goto out; +} + +static struct file_operations unix_seq_fops = { + .owner = THIS_MODULE, + .open = unix_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +#endif + +static struct net_proto_family unix_family_ops = { + .family = PF_UNIX, + .create = unix_create, + .owner = THIS_MODULE, +}; + +#ifdef CONFIG_SYSCTL +extern void unix_sysctl_register(void); +extern void unix_sysctl_unregister(void); +#else +static inline void unix_sysctl_register(void) {} +static inline void unix_sysctl_unregister(void) {} +#endif + +static int __init af_unix_init(void) +{ + int rc = -1; + struct sk_buff *dummy_skb; + + if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) { + printk(KERN_CRIT "%s: panic\n", __FUNCTION__); + goto out; + } + + rc = proto_register(&unix_proto, 1); + if (rc != 0) { + printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n", + __FUNCTION__); + goto out; + } + + sock_register(&unix_family_ops); +#ifdef CONFIG_PROC_FS + proc_net_fops_create("unix", 0, &unix_seq_fops); +#endif + unix_sysctl_register(); +out: + return rc; +} + +static void __exit af_unix_exit(void) +{ + sock_unregister(PF_UNIX); + unix_sysctl_unregister(); + proc_net_remove("unix"); + proto_unregister(&unix_proto); +} + +module_init(af_unix_init); +module_exit(af_unix_exit); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_UNIX); diff --git a/net/unix/garbage.c b/net/unix/garbage.c new file mode 100644 index 000000000000..4bd95c8f5934 --- /dev/null +++ b/net/unix/garbage.c @@ -0,0 +1,312 @@ +/* + * NET3: Garbage Collector For AF_UNIX sockets + * + * Garbage Collector: + * Copyright (C) Barak A. Pearlmutter. + * Released under the GPL version 2 or later. 
+ * + * Chopped about by Alan Cox 22/3/96 to make it fit the AF_UNIX socket problem. + * If it doesn't work blame me, it worked when Barak sent it. + * + * Assumptions: + * + * - object w/ a bit + * - free list + * + * Current optimizations: + * + * - explicit stack instead of recursion + * - tail recurse on first born instead of immediate push/pop + * - we gather the stuff that should not be killed into tree + * and stack is just a path from root to the current pointer. + * + * Future optimizations: + * + * - don't just push entire root set; process in place + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Fixes: + * Alan Cox 07 Sept 1997 Vmalloc internal stack as needed. + * Cope with changing max_files. + * Al Viro 11 Oct 1998 + * Graph may have cycles. That is, we can send the descriptor + * of foo to bar and vice versa. Current code chokes on that. + * Fix: move SCM_RIGHTS ones into the separate list and then + * skb_free() them all instead of doing explicit fput's. + * Another problem: since fput() may block somebody may + * create a new unix_socket when we are in the middle of sweep + * phase. Fix: revert the logic wrt MARKED. Mark everything + * upon the beginning and unmark non-junk ones. + * + * [12 Oct 1998] AAARGH! New code purges all SCM_RIGHTS + * sent to connect()'ed but still not accept()'ed sockets. + * Fixed. Old code had slightly different problem here: + * extra fput() in situation when we passed the descriptor via + * such socket and closed it (descriptor). That would happen on + * each unix_gc() until the accept(). Since the struct file in + * question would go to the free list and might be reused... + * That might be the reason of random oopses on filp_close() + * in unrelated processes. + * + * AV 28 Feb 1999 + * Kill the explicit allocation of stack. Now we keep the tree + * with root in dummy + pointer (gc_current) to one of the nodes. + * Stack is represented as path from gc_current to dummy. Unmark + * now means "add to tree". Push == "make it a son of gc_current". + * Pop == "move gc_current to parent". We keep only pointers to + * parents (->gc_tree). + * AV 1 Mar 1999 + * Damn. Added missing check for ->dead in listen queues scanning. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* Internal data structures and random procedures: */ + +#define GC_HEAD ((struct sock *)(-1)) +#define GC_ORPHAN ((struct sock *)(-3)) + +static struct sock *gc_current = GC_HEAD; /* stack of objects to mark */ + +atomic_t unix_tot_inflight = ATOMIC_INIT(0); + + +static struct sock *unix_get_socket(struct file *filp) +{ + struct sock *u_sock = NULL; + struct inode *inode = filp->f_dentry->d_inode; + + /* + * Socket ? + */ + if (S_ISSOCK(inode->i_mode)) { + struct socket * sock = SOCKET_I(inode); + struct sock * s = sock->sk; + + /* + * PF_UNIX ? + */ + if (s && sock->ops && sock->ops->family == PF_UNIX) + u_sock = s; + } + return u_sock; +} + +/* + * Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. 
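+ *
+ * A rough sketch of the expected call pattern (the callers are the
+ * af_unix.c paths that attach SCM_RIGHTS descriptors to an outgoing
+ * skb and detach them again on receive, e.g. unix_detach_fds()):
+ *
+ *	unix_inflight(fp);	- descriptor queued inside an skb
+ *	...
+ *	unix_notinflight(fp);	- descriptor delivered, or skb freed
+ *
+ * so unix_tot_inflight counts struct file references that are
+ * currently held by queued SCM_RIGHTS messages.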
+ */ + +void unix_inflight(struct file *fp) +{ + struct sock *s = unix_get_socket(fp); + if(s) { + atomic_inc(&unix_sk(s)->inflight); + atomic_inc(&unix_tot_inflight); + } +} + +void unix_notinflight(struct file *fp) +{ + struct sock *s = unix_get_socket(fp); + if(s) { + atomic_dec(&unix_sk(s)->inflight); + atomic_dec(&unix_tot_inflight); + } +} + + +/* + * Garbage Collector Support Functions + */ + +static inline struct sock *pop_stack(void) +{ + struct sock *p = gc_current; + gc_current = unix_sk(p)->gc_tree; + return p; +} + +static inline int empty_stack(void) +{ + return gc_current == GC_HEAD; +} + +static void maybe_unmark_and_push(struct sock *x) +{ + struct unix_sock *u = unix_sk(x); + + if (u->gc_tree != GC_ORPHAN) + return; + sock_hold(x); + u->gc_tree = gc_current; + gc_current = x; +} + + +/* The external entry point: unix_gc() */ + +void unix_gc(void) +{ + static DECLARE_MUTEX(unix_gc_sem); + int i; + struct sock *s; + struct sk_buff_head hitlist; + struct sk_buff *skb; + + /* + * Avoid a recursive GC. + */ + + if (down_trylock(&unix_gc_sem)) + return; + + read_lock(&unix_table_lock); + + forall_unix_sockets(i, s) + { + unix_sk(s)->gc_tree = GC_ORPHAN; + } + /* + * Everything is now marked + */ + + /* Invariant to be maintained: + - everything unmarked is either: + -- (a) on the stack, or + -- (b) has all of its children unmarked + - everything on the stack is always unmarked + - nothing is ever pushed onto the stack twice, because: + -- nothing previously unmarked is ever pushed on the stack + */ + + /* + * Push root set + */ + + forall_unix_sockets(i, s) + { + int open_count = 0; + + /* + * If all instances of the descriptor are not + * in flight we are in use. + * + * Special case: when socket s is embrion, it may be + * hashed but still not in queue of listening socket. + * In this case (see unix_create1()) we set artificial + * negative inflight counter to close race window. + * It is trick of course and dirty one. + */ + if (s->sk_socket && s->sk_socket->file) + open_count = file_count(s->sk_socket->file); + if (open_count > atomic_read(&unix_sk(s)->inflight)) + maybe_unmark_and_push(s); + } + + /* + * Mark phase + */ + + while (!empty_stack()) + { + struct sock *x = pop_stack(); + struct sock *sk; + + spin_lock(&x->sk_receive_queue.lock); + skb = skb_peek(&x->sk_receive_queue); + + /* + * Loop through all but first born + */ + + while (skb && skb != (struct sk_buff *)&x->sk_receive_queue) { + /* + * Do we have file descriptors ? + */ + if(UNIXCB(skb).fp) + { + /* + * Process the descriptors of this socket + */ + int nfd=UNIXCB(skb).fp->count; + struct file **fp = UNIXCB(skb).fp->fp; + while(nfd--) + { + /* + * Get the socket the fd matches if + * it indeed does so + */ + if((sk=unix_get_socket(*fp++))!=NULL) + { + maybe_unmark_and_push(sk); + } + } + } + /* We have to scan not-yet-accepted ones too */ + if (x->sk_state == TCP_LISTEN) + maybe_unmark_and_push(skb->sk); + skb=skb->next; + } + spin_unlock(&x->sk_receive_queue.lock); + sock_put(x); + } + + skb_queue_head_init(&hitlist); + + forall_unix_sockets(i, s) + { + struct unix_sock *u = unix_sk(s); + + if (u->gc_tree == GC_ORPHAN) { + struct sk_buff *nextsk; + + spin_lock(&s->sk_receive_queue.lock); + skb = skb_peek(&s->sk_receive_queue); + while (skb && + skb != (struct sk_buff *)&s->sk_receive_queue) { + nextsk=skb->next; + /* + * Do we have file descriptors ? 
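+ *
+ * Any skb that still carries SCM_RIGHTS descriptors at this point
+ * belongs to a socket that stayed GC_ORPHAN through the mark phase,
+ * i.e. one that is reachable only via in-flight descriptors.  Moving
+ * the skb onto the private hitlist, and purging that list once
+ * unix_table_lock has been dropped, releases the embedded struct
+ * file references and breaks the reference cycle.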
+ */ + if(UNIXCB(skb).fp) + { + __skb_unlink(skb, skb->list); + __skb_queue_tail(&hitlist,skb); + } + skb=nextsk; + } + spin_unlock(&s->sk_receive_queue.lock); + } + u->gc_tree = GC_ORPHAN; + } + read_unlock(&unix_table_lock); + + /* + * Here we are. Hitlist is filled. Die. + */ + + __skb_queue_purge(&hitlist); + up(&unix_gc_sem); +} diff --git a/net/unix/sysctl_net_unix.c b/net/unix/sysctl_net_unix.c new file mode 100644 index 000000000000..c974dac4580a --- /dev/null +++ b/net/unix/sysctl_net_unix.c @@ -0,0 +1,60 @@ +/* + * NET4: Sysctl interface to net af_unix subsystem. + * + * Authors: Mike Shaver. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include + +extern int sysctl_unix_max_dgram_qlen; + +static ctl_table unix_table[] = { + { + .ctl_name = NET_UNIX_MAX_DGRAM_QLEN, + .procname = "max_dgram_qlen", + .data = &sysctl_unix_max_dgram_qlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = 0 } +}; + +static ctl_table unix_net_table[] = { + { + .ctl_name = NET_UNIX, + .procname = "unix", + .mode = 0555, + .child = unix_table + }, + { .ctl_name = 0 } +}; + +static ctl_table unix_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = unix_net_table + }, + { .ctl_name = 0 } +}; + +static struct ctl_table_header * unix_sysctl_header; + +void unix_sysctl_register(void) +{ + unix_sysctl_header = register_sysctl_table(unix_root_table, 0); +} + +void unix_sysctl_unregister(void) +{ + unregister_sysctl_table(unix_sysctl_header); +} + diff --git a/net/wanrouter/Makefile b/net/wanrouter/Makefile new file mode 100644 index 000000000000..9f188ab3dcd0 --- /dev/null +++ b/net/wanrouter/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Linux WAN router layer. +# + +obj-$(CONFIG_WAN_ROUTER) += wanrouter.o + +wanrouter-objs := wanproc.o wanmain.o diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c new file mode 100644 index 000000000000..d93b19faaab7 --- /dev/null +++ b/net/wanrouter/af_wanpipe.c @@ -0,0 +1,2611 @@ +/***************************************************************************** +* af_wanpipe.c WANPIPE(tm) Secure Socket Layer. +* +* Author: Nenad Corbic +* +* Copyright: (c) 2000 Sangoma Technologies Inc. +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version +* 2 of the License, or (at your option) any later version. +* ============================================================================ +* Due Credit: +* Wanpipe socket layer is based on Packet and +* the X25 socket layers. The above sockets were +* used for the specific use of Sangoma Technoloiges +* API programs. +* Packet socket Authors: Ross Biro, Fred N. van Kempen and +* Alan Cox. +* X25 socket Author: Jonathan Naylor. +* ============================================================================ +* Mar 15, 2002 Arnaldo C. Melo o Use wp_sk()->num, as it isnt anymore in sock +* Apr 25, 2000 Nenad Corbic o Added the ability to send zero length packets. +* Mar 13, 2000 Nenad Corbic o Added a tx buffer check via ioctl call. +* Mar 06, 2000 Nenad Corbic o Fixed the corrupt sock lcn problem. +* Server and client applicaton can run +* simultaneously without conflicts. 
+* Feb 29, 2000 Nenad Corbic o Added support for PVC protocols, such as +* CHDLC, Frame Relay and HDLC API. +* Jan 17, 2000 Nenad Corbic o Initial version, based on AF_PACKET socket. +* X25API support only. +* +******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_INET +#include +#endif + +#define SLOW_BACKOFF 0.1*HZ +#define FAST_BACKOFF 0.01*HZ + +//#define PRINT_DEBUG +#ifdef PRINT_DEBUG + #define DBG_PRINTK(format, a...) printk(format, ## a) +#else + #define DBG_PRINTK(format, a...) +#endif + + +/* SECURE SOCKET IMPLEMENTATION + * + * TRANSMIT: + * + * When the user sends a packet via send() system call + * the wanpipe_sendmsg() function is executed. + * + * Each packet is enqueud into sk->sk_write_queue transmit + * queue. When the packet is enqueued, a delayed transmit + * timer is triggerd which acts as a Bottom Half hander. + * + * wanpipe_delay_transmit() function (BH), dequeues packets + * from the sk->sk_write_queue transmit queue and sends it + * to the deriver via dev->hard_start_xmit(skb, dev) function. + * Note, this function is actual a function pointer of if_send() + * routine in the wanpipe driver. + * + * X25API GUARANTEED DELIVERY: + * + * In order to provide 100% guaranteed packet delivery, + * an atomic 'packet_sent' counter is implemented. Counter + * is incremented for each packet enqueued + * into sk->sk_write_queue. Counter is decremented each + * time wanpipe_delayed_transmit() function successfuly + * passes the packet to the driver. Before each send(), a poll + * routine checks the sock resources The maximum value of + * packet sent counter is 1, thus if one packet is queued, the + * application will block until that packet is passed to the + * driver. + * + * RECEIVE: + * + * Wanpipe device drivers call the socket bottom half + * function, wanpipe_rcv() to queue the incoming packets + * into an AF_WANPIPE socket queue. Based on wanpipe_rcv() + * return code, the driver knows whether the packet was + * successfully queued. If the socket queue is full, + * protocol flow control is used by the driver, if any, + * to slow down the traffic until the sock queue is free. + * + * Every time a packet arrives into a socket queue the + * socket wakes up processes which are waiting to receive + * data. + * + * If the socket queue is full, the driver sets a block + * bit which signals the socket to kick the wanpipe driver + * bottom half hander when the socket queue is partialy + * empty. wanpipe_recvmsg() function performs this action. + * + * In case of x25api, packets will never be dropped, since + * flow control is available. + * + * In case of streaming protocols like CHDLC, packets will + * be dropped but the statistics will be generated. + */ + + +/* The code below is used to test memory leaks. It prints out + * a message every time kmalloc and kfree system calls get executed. 
+ * If the calls match there is no leak :) + */ + +/***********FOR DEBUGGING PURPOSES********************************************* +#define KMEM_SAFETYZONE 8 + +static void * dbg_kmalloc(unsigned int size, int prio, int line) { + void * v = kmalloc(size,prio); + printk(KERN_INFO "line %d kmalloc(%d,%d) = %p\n",line,size,prio,v); + return v; +} +static void dbg_kfree(void * v, int line) { + printk(KERN_INFO "line %d kfree(%p)\n",line,v); + kfree(v); +} + +#define kmalloc(x,y) dbg_kmalloc(x,y,__LINE__) +#define kfree(x) dbg_kfree(x,__LINE__) +******************************************************************************/ + + +/* List of all wanpipe sockets. */ +HLIST_HEAD(wanpipe_sklist); +static DEFINE_RWLOCK(wanpipe_sklist_lock); + +atomic_t wanpipe_socks_nr; +static unsigned long wanpipe_tx_critical; + +#if 0 +/* Private wanpipe socket structures. */ +struct wanpipe_opt +{ + void *mbox; /* Mail box */ + void *card; /* Card bouded to */ + struct net_device *dev; /* Bounded device */ + unsigned short lcn; /* Binded LCN */ + unsigned char svc; /* 0=pvc, 1=svc */ + unsigned char timer; /* flag for delayed transmit*/ + struct timer_list tx_timer; + unsigned poll_cnt; + unsigned char force; /* Used to force sock release */ + atomic_t packet_sent; +}; +#endif + +static int sk_count; +extern struct proto_ops wanpipe_ops; +static unsigned long find_free_critical; + +static void wanpipe_unlink_driver(struct sock *sk); +static void wanpipe_link_driver(struct net_device *dev, struct sock *sk); +static void wanpipe_wakeup_driver(struct sock *sk); +static int execute_command(struct sock *, unsigned char, unsigned int); +static int check_dev(struct net_device *dev, sdla_t *card); +struct net_device *wanpipe_find_free_dev(sdla_t *card); +static void wanpipe_unlink_card (struct sock *); +static int wanpipe_link_card (struct sock *); +static struct sock *wanpipe_make_new(struct sock *); +static struct sock *wanpipe_alloc_socket(void); +static inline int get_atomic_device(struct net_device *dev); +static int wanpipe_exec_cmd(struct sock *, int, unsigned int); +static int get_ioctl_cmd (struct sock *, void *); +static int set_ioctl_cmd (struct sock *, void *); +static void release_device(struct net_device *dev); +static void wanpipe_kill_sock_timer (unsigned long data); +static void wanpipe_kill_sock_irq (struct sock *); +static void wanpipe_kill_sock_accept (struct sock *); +static int wanpipe_do_bind(struct sock *sk, struct net_device *dev, + int protocol); +struct sock * get_newsk_from_skb (struct sk_buff *); +static int wanpipe_debug (struct sock *, void *); +static void wanpipe_delayed_transmit (unsigned long data); +static void release_driver(struct sock *); +static void start_cleanup_timer (struct sock *); +static void check_write_queue(struct sock *); +static int check_driver_busy (struct sock *); + +/*============================================================ + * wanpipe_rcv + * + * Wanpipe socket bottom half handler. This function + * is called by the WANPIPE device drivers to queue a + * incoming packet into the socket receive queue. + * Once the packet is queued, all processes waiting to + * read are woken up. + * + * During socket bind, this function is bounded into + * WANPIPE driver private. 
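+ *
+ * The return value matters to the caller: -ENOMEM tells the driver
+ * that the socket receive queue is full, so it can apply protocol
+ * flow control (or, for streaming protocols such as CHDLC, drop the
+ * frame and update its statistics), as described in the RECEIVE
+ * notes above.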
+ *===========================================================*/ + +static int wanpipe_rcv(struct sk_buff *skb, struct net_device *dev, + struct sock *sk) +{ + struct wan_sockaddr_ll *sll = (struct wan_sockaddr_ll*)skb->cb; + wanpipe_common_t *chan = dev->priv; + /* + * When we registered the protocol we saved the socket in the data + * field for just this event. + */ + + skb->dev = dev; + + sll->sll_family = AF_WANPIPE; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + sll->sll_ifindex = dev->ifindex; + sll->sll_halen = 0; + + if (dev->hard_header_parse) + sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr); + + /* + * WAN_PACKET_DATA : Data which should be passed up the receive queue. + * WAN_PACKET_ASYC : Asynchronous data like place call, which should + * be passed up the listening sock. + * WAN_PACKET_ERR : Asynchronous data like clear call or restart + * which should go into an error queue. + */ + switch (skb->pkt_type){ + + case WAN_PACKET_DATA: + if (sock_queue_rcv_skb(sk,skb)<0){ + return -ENOMEM; + } + break; + case WAN_PACKET_CMD: + sk->sk_state = chan->state; + /* Bug fix: update Mar6. + * Do not set the sock lcn number here, since + * cmd is not guaranteed to be executed on the + * board, thus Lcn could be wrong */ + sk->sk_data_ready(sk, skb->len); + kfree_skb(skb); + break; + case WAN_PACKET_ERR: + sk->sk_state = chan->state; + if (sock_queue_err_skb(sk,skb)<0){ + return -ENOMEM; + } + break; + default: + printk(KERN_INFO "wansock: BH Illegal Packet Type Dropping\n"); + kfree_skb(skb); + break; + } + +//?????????????????????? +// if (sk->sk_state == WANSOCK_DISCONNECTED){ +// if (sk->sk_zapped) { +// //printk(KERN_INFO "wansock: Disconnected, killing early\n"); +// wanpipe_unlink_driver(sk); +// sk->sk_bound_dev_if = 0; +// } +// } + + return 0; +} + +/*============================================================ + * wanpipe_listen_rcv + * + * Wanpipe LISTEN socket bottom half handler. This function + * is called by the WANPIPE device drivers to queue an + * incoming call into the socket listening queue. + * Once the packet is queued, the waiting accept() process + * is woken up. + * + * During socket bind, this function is bounded into + * WANPIPE driver private. + * + * IMPORTANT NOTE: + * The accept call() is waiting for an skb packet + * which contains a pointer to a device structure. + * + * When we do a bind to a device structre, we + * bind a newly created socket into "chan->sk". Thus, + * when accept receives the skb packet, it will know + * from which dev it came form, and in turn it will know + * the address of the new sock. + * + * NOTE: This function gets called from driver ISR. 
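+ *
+ * Since this can run in interrupt context, nothing below may block:
+ * the new mail box is allocated with GFP_ATOMIC, and a receive queue
+ * that is already full is reported back to the driver as -ENOMEM
+ * rather than waited on.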
+ *===========================================================*/ + +static int wanpipe_listen_rcv (struct sk_buff *skb, struct sock *sk) +{ + wanpipe_opt *wp = wp_sk(sk), *newwp; + struct wan_sockaddr_ll *sll = (struct wan_sockaddr_ll*)skb->cb; + struct sock *newsk; + struct net_device *dev; + sdla_t *card; + mbox_cmd_t *mbox_ptr; + wanpipe_common_t *chan; + + /* Find a free device, if none found, all svc's are busy + */ + + card = (sdla_t*)wp->card; + if (!card){ + printk(KERN_INFO "wansock: LISTEN ERROR, No Card\n"); + return -ENODEV; + } + + dev = wanpipe_find_free_dev(card); + if (!dev){ + printk(KERN_INFO "wansock: LISTEN ERROR, No Free Device\n"); + return -ENODEV; + } + + chan=dev->priv; + chan->state = WANSOCK_CONNECTING; + + /* Allocate a new sock, which accept will bind + * and pass up to the user + */ + if ((newsk = wanpipe_make_new(sk)) == NULL){ + release_device(dev); + return -ENOMEM; + } + + + /* Initialize the new sock structure + */ + newsk->sk_bound_dev_if = dev->ifindex; + newwp = wp_sk(newsk); + newwp->card = wp->card; + + /* Insert the sock into the main wanpipe + * sock list. + */ + atomic_inc(&wanpipe_socks_nr); + + /* Allocate and fill in the new Mail Box. Then + * bind the mail box to the sock. It will be + * used by the ioctl call to read call information + * and to execute commands. + */ + if ((mbox_ptr = kmalloc(sizeof(mbox_cmd_t), GFP_ATOMIC)) == NULL) { + wanpipe_kill_sock_irq (newsk); + release_device(dev); + return -ENOMEM; + } + memset(mbox_ptr, 0, sizeof(mbox_cmd_t)); + memcpy(mbox_ptr,skb->data,skb->len); + + /* Register the lcn on which incoming call came + * from. Thus, if we have to clear it, we know + * which lcn to clear + */ + + newwp->lcn = mbox_ptr->cmd.lcn; + newwp->mbox = (void *)mbox_ptr; + + DBG_PRINTK(KERN_INFO "NEWSOCK : Device %s, bind to lcn %i\n", + dev->name,mbox_ptr->cmd.lcn); + + chan->lcn = mbox_ptr->cmd.lcn; + card->u.x.svc_to_dev_map[(chan->lcn%MAX_X25_LCN)] = dev; + + sock_reset_flag(newsk, SOCK_ZAPPED); + newwp->num = htons(X25_PROT); + + if (wanpipe_do_bind(newsk, dev, newwp->num)) { + wanpipe_kill_sock_irq (newsk); + release_device(dev); + return -EINVAL; + } + newsk->sk_state = WANSOCK_CONNECTING; + + + /* Fill in the standard sock address info */ + + sll->sll_family = AF_WANPIPE; + sll->sll_hatype = dev->type; + sll->sll_protocol = skb->protocol; + sll->sll_pkttype = skb->pkt_type; + sll->sll_ifindex = dev->ifindex; + sll->sll_halen = 0; + + skb->dev = dev; + sk->sk_ack_backlog++; + + /* We must do this manually, since the sock_queue_rcv_skb() + * function sets the skb->dev to NULL. However, we use + * the dev field in the accept function.*/ + if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= + (unsigned)sk->sk_rcvbuf) { + + wanpipe_unlink_driver(newsk); + wanpipe_kill_sock_irq (newsk); + --sk->sk_ack_backlog; + return -ENOMEM; + } + + skb_set_owner_r(skb, sk); + skb_queue_tail(&sk->sk_receive_queue, skb); + sk->sk_data_ready(sk, skb->len); + + return 0; +} + + + +/*============================================================ + * wanpipe_make_new + * + * Create a new sock, and allocate a wanpipe private + * structure to it. Also, copy the important data + * from the original sock to the new sock. + * + * This function is used by wanpipe_listen_rcv() listen + * bottom half handler. A copy of the listening sock + * is created using this function. 
+ * + *===========================================================*/ + +static struct sock *wanpipe_make_new(struct sock *osk) +{ + struct sock *sk; + + if (osk->sk_type != SOCK_RAW) + return NULL; + + if ((sk = wanpipe_alloc_socket()) == NULL) + return NULL; + + sk->sk_type = osk->sk_type; + sk->sk_socket = osk->sk_socket; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + wp_sk(sk)->num = wp_sk(osk)->num; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = WANSOCK_CONNECTING; + sk->sk_sleep = osk->sk_sleep; + + if (sock_flag(osk, SOCK_DBG)) + sock_set_flag(sk, SOCK_DBG); + + return sk; +} + +/* + * FIXME: wanpipe_opt has to include a sock in its definition and stop using + * sk_protinfo, but this code is not even compilable now, so lets leave it for + * later. + */ +static struct proto wanpipe_proto = { + .name = "WANPIPE", + .owner = THIS_MODULE, + .obj_size = sizeof(struct sock), +}; + +/*============================================================ + * wanpipe_make_new + * + * Allocate memory for the a new sock, and sock + * private data. + * + * Increment the module use count. + * + * This function is used by wanpipe_create() and + * wanpipe_make_new() functions. + * + *===========================================================*/ + +static struct sock *wanpipe_alloc_socket(void) +{ + struct sock *sk; + struct wanpipe_opt *wan_opt; + + if ((sk = sk_alloc(PF_WANPIPE, GFP_ATOMIC, &wanpipe_proto, 1)) == NULL) + return NULL; + + if ((wan_opt = kmalloc(sizeof(struct wanpipe_opt), GFP_ATOMIC)) == NULL) { + sk_free(sk); + return NULL; + } + memset(wan_opt, 0x00, sizeof(struct wanpipe_opt)); + + wp_sk(sk) = wan_opt; + + /* Use timer to send data to the driver. This will act + * as a BH handler for sendmsg functions */ + init_timer(&wan_opt->tx_timer); + wan_opt->tx_timer.data = (unsigned long)sk; + wan_opt->tx_timer.function = wanpipe_delayed_transmit; + + sock_init_data(NULL, sk); + return sk; +} + + +/*============================================================ + * wanpipe_sendmsg + * + * This function implements a sendto() system call, + * for AF_WANPIPE socket family. + * During socket bind() sk->sk_bound_dev_if is initialized + * to a correct network device. This number is used + * to find a network device to which the packet should + * be passed to. + * + * Each packet is queued into sk->sk_write_queue and + * delayed transmit bottom half handler is marked for + * execution. + * + * A socket must be in WANSOCK_CONNECTED state before + * a packet is queued into sk->sk_write_queue. 
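+ *
+ * Rough data flow for one send (see the code below and
+ * wanpipe_delayed_transmit()):
+ *
+ *	skb_queue_tail(&sk->sk_write_queue, skb);
+ *	atomic_inc(&wp->packet_sent);
+ *	if (!test_and_set_bit(0, &wp->timer))
+ *		mod_timer(&wp->tx_timer, jiffies + 1);
+ *
+ * The timer handler then dequeues the skb and passes it to
+ * dev->hard_start_xmit() from bottom-half context.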
+ *===========================================================*/ + +static int wanpipe_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, int len) +{ + wanpipe_opt *wp; + struct sock *sk = sock->sk; + struct wan_sockaddr_ll *saddr=(struct wan_sockaddr_ll *)msg->msg_name; + struct sk_buff *skb; + struct net_device *dev; + unsigned short proto; + unsigned char *addr; + int ifindex, err, reserve = 0; + + + if (!sock_flag(sk, SOCK_ZAPPED)) + return -ENETDOWN; + + if (sk->sk_state != WANSOCK_CONNECTED) + return -ENOTCONN; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) + return(-EINVAL); + + /* it was <=, now one can send + * zero length packets */ + if (len < sizeof(x25api_hdr_t)) + return -EINVAL; + + wp = wp_sk(sk); + + if (saddr == NULL) { + ifindex = sk->sk_bound_dev_if; + proto = wp->num; + addr = NULL; + + }else{ + if (msg->msg_namelen < sizeof(struct wan_sockaddr_ll)){ + return -EINVAL; + } + + ifindex = sk->sk_bound_dev_if; + proto = saddr->sll_protocol; + addr = saddr->sll_addr; + } + + dev = dev_get_by_index(ifindex); + if (dev == NULL){ + printk(KERN_INFO "wansock: Send failed, dev index: %i\n",ifindex); + return -ENXIO; + } + dev_put(dev); + + if (sock->type == SOCK_RAW) + reserve = dev->hard_header_len; + + if (len > dev->mtu+reserve){ + return -EMSGSIZE; + } + + skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev), + msg->msg_flags & MSG_DONTWAIT, &err); + + if (skb==NULL){ + goto out_unlock; + } + + skb_reserve(skb, LL_RESERVED_SPACE(dev)); + skb->nh.raw = skb->data; + + /* Returns -EFAULT on error */ + err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); + if (err){ + goto out_free; + } + + if (dev->hard_header) { + int res; + err = -EINVAL; + res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len); + if (res<0){ + goto out_free; + } + } + + skb->protocol = proto; + skb->dev = dev; + skb->priority = sk->sk_priority; + skb->pkt_type = WAN_PACKET_DATA; + + err = -ENETDOWN; + if (!(dev->flags & IFF_UP)) + goto out_free; + + if (atomic_read(&sk->sk_wmem_alloc) + skb->truesize > + (unsigned int)sk->sk_sndbuf){ + kfree_skb(skb); + return -ENOBUFS; + } + + skb_queue_tail(&sk->sk_write_queue,skb); + atomic_inc(&wp->packet_sent); + + if (!(test_and_set_bit(0, &wp->timer))) + mod_timer(&wp->tx_timer, jiffies + 1); + + return(len); + +out_free: + kfree_skb(skb); +out_unlock: + return err; +} + +/*============================================================ + * wanpipe_delayed_tarnsmit + * + * Transmit bottom half handler. It dequeues packets + * from sk->sk_write_queue and passes them to the + * driver. If the driver is busy, the packet is + * re-enqueued. + * + * Packet Sent counter is decremented on successful + * transmission. + *===========================================================*/ + + +static void wanpipe_delayed_transmit (unsigned long data) +{ + struct sock *sk=(struct sock *)data; + struct sk_buff *skb; + wanpipe_opt *wp = wp_sk(sk); + struct net_device *dev = wp->dev; + sdla_t *card = (sdla_t*)wp->card; + + if (!card || !dev){ + clear_bit(0, &wp->timer); + DBG_PRINTK(KERN_INFO "wansock: Transmit delay, no dev or card\n"); + return; + } + + if (sk->sk_state != WANSOCK_CONNECTED || !sock_flag(sk, SOCK_ZAPPED)) { + clear_bit(0, &wp->timer); + DBG_PRINTK(KERN_INFO "wansock: Tx Timer, State not CONNECTED\n"); + return; + } + + /* If driver is executing command, we must offload + * the board by not sending data. 
Otherwise a + * pending command will never get a free buffer + * to execute */ + if (atomic_read(&card->u.x.command_busy)){ + wp->tx_timer.expires = jiffies + SLOW_BACKOFF; + add_timer(&wp->tx_timer); + DBG_PRINTK(KERN_INFO "wansock: Tx Timer, command bys BACKOFF\n"); + return; + } + + + if (test_and_set_bit(0,&wanpipe_tx_critical)){ + printk(KERN_INFO "WanSock: Tx timer critical %s\n",dev->name); + wp->tx_timer.expires = jiffies + SLOW_BACKOFF; + add_timer(&wp->tx_timer); + return; + } + + /* Check for a packet in the fifo and send */ + if ((skb = skb_dequeue(&sk->sk_write_queue)) != NULL){ + + if (dev->hard_start_xmit(skb, dev) != 0){ + + /* Driver failed to transmit, re-enqueue + * the packet and retry again later */ + skb_queue_head(&sk->sk_write_queue,skb); + clear_bit(0,&wanpipe_tx_critical); + return; + }else{ + + /* Packet Sent successful. Check for more packets + * if more packets, re-trigger the transmit routine + * other wise exit + */ + atomic_dec(&wp->packet_sent); + + if (skb_peek(&sk->sk_write_queue) == NULL) { + /* If there is nothing to send, kick + * the poll routine, which will trigger + * the application to send more data */ + sk->sk_data_ready(sk, 0); + clear_bit(0, &wp->timer); + }else{ + /* Reschedule as fast as possible */ + wp->tx_timer.expires = jiffies + 1; + add_timer(&wp->tx_timer); + } + } + } + clear_bit(0,&wanpipe_tx_critical); +} + +/*============================================================ + * execute_command + * + * Execute x25api commands. The atomic variable + * chan->command is used to indicate to the driver that + * command is pending for execution. The acutal command + * structure is placed into a sock mbox structure + * (wp_sk(sk)->mbox). + * + * The sock private structure, mbox is + * used as shared memory between sock and the driver. + * Driver uses the sock mbox to execute the command + * and return the result. + * + * For all command except PLACE CALL, the function + * waits for the result. PLACE CALL can be ether + * blocking or nonblocking. The user sets this option + * via ioctl call. 
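+ *
+ * The mbox result field is primed with the sentinel value 0x7F
+ * ("command still pending"); the caller then sleeps
+ * TASK_INTERRUPTIBLE on sk->sk_sleep until the driver overwrites
+ * the result or a signal arrives.  A non-blocking PLACE CALL is
+ * flagged to the driver by setting the top bit of the command
+ * byte (cmd |= 0x80).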
+ *===========================================================*/ + + +static int execute_command(struct sock *sk, unsigned char cmd, unsigned int flags) +{ + wanpipe_opt *wp = wp_sk(sk); + struct net_device *dev; + wanpipe_common_t *chan=NULL; + int err=0; + DECLARE_WAITQUEUE(wait, current); + + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (dev == NULL){ + printk(KERN_INFO "wansock: Exec failed no dev %i\n", + sk->sk_bound_dev_if); + return -ENODEV; + } + dev_put(dev); + + if ((chan=dev->priv) == NULL){ + printk(KERN_INFO "wansock: Exec cmd failed no priv area\n"); + return -ENODEV; + } + + if (atomic_read(&chan->command)){ + printk(KERN_INFO "wansock: ERROR: Command already running %x, %s\n", + atomic_read(&chan->command),dev->name); + return -EINVAL; + } + + if (!wp->mbox) { + printk(KERN_INFO "wansock: In execute without MBOX\n"); + return -EINVAL; + } + + ((mbox_cmd_t*)wp->mbox)->cmd.command = cmd; + ((mbox_cmd_t*)wp->mbox)->cmd.lcn = wp->lcn; + ((mbox_cmd_t*)wp->mbox)->cmd.result = 0x7F; + + + if (flags & O_NONBLOCK){ + cmd |= 0x80; + atomic_set(&chan->command, cmd); + }else{ + atomic_set(&chan->command, cmd); + } + + add_wait_queue(sk->sk_sleep,&wait); + current->state = TASK_INTERRUPTIBLE; + for (;;){ + if (((mbox_cmd_t*)wp->mbox)->cmd.result != 0x7F) { + err = 0; + break; + } + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep,&wait); + + return err; +} + +/*============================================================ + * wanpipe_destroy_timer + * + * Used by wanpipe_release, to delay release of + * the socket. + *===========================================================*/ + +static void wanpipe_destroy_timer(unsigned long data) +{ + struct sock *sk=(struct sock *)data; + wanpipe_opt *wp = wp_sk(sk); + + if ((!atomic_read(&sk->sk_wmem_alloc) && + !atomic_read(&sk->sk_rmem_alloc)) || + (++wp->force == 5)) { + + if (atomic_read(&sk->sk_wmem_alloc) || + atomic_read(&sk->sk_rmem_alloc)) + printk(KERN_INFO "wansock: Warning, Packet Discarded due to sock shutdown!\n"); + + kfree(wp); + wp_sk(sk) = NULL; + + if (atomic_read(&sk->sk_refcnt) != 1) { + atomic_set(&sk->sk_refcnt, 1); + DBG_PRINTK(KERN_INFO "wansock: Error, wrong reference count: %i ! :delay.\n", + atomic_read(&sk->sk_refcnt)); + } + sock_put(sk); + atomic_dec(&wanpipe_socks_nr); + return; + } + + sk->sk_timer.expires = jiffies + 5 * HZ; + add_timer(&sk->sk_timer); + printk(KERN_INFO "wansock: packet sk destroy delayed\n"); +} + +/*============================================================ + * wanpipe_unlink_driver + * + * When the socket is released, this function is + * used to remove links that bind the sock and the + * driver together. 
+ *===========================================================*/ +static void wanpipe_unlink_driver (struct sock *sk) +{ + struct net_device *dev; + wanpipe_common_t *chan=NULL; + + sock_reset_flag(sk, SOCK_ZAPPED); + sk->sk_state = WANSOCK_DISCONNECTED; + wp_sk(sk)->dev = NULL; + + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev){ + printk(KERN_INFO "wansock: No dev on release\n"); + return; + } + dev_put(dev); + + if ((chan = dev->priv) == NULL){ + printk(KERN_INFO "wansock: No Priv Area on release\n"); + return; + } + + set_bit(0,&chan->common_critical); + chan->sk=NULL; + chan->func=NULL; + chan->mbox=NULL; + chan->tx_timer=NULL; + clear_bit(0,&chan->common_critical); + release_device(dev); + + return; +} + +/*============================================================ + * wanpipe_link_driver + * + * Upon successful bind(), sock is linked to a driver + * by binding in the wanpipe_rcv() bottom half handler + * to the driver function pointer, as well as sock and + * sock mailbox addresses. This way driver can pass + * data up the socket. + *===========================================================*/ + +static void wanpipe_link_driver(struct net_device *dev, struct sock *sk) +{ + wanpipe_opt *wp = wp_sk(sk); + wanpipe_common_t *chan = dev->priv; + if (!chan) + return; + set_bit(0,&chan->common_critical); + chan->sk=sk; + chan->func=wanpipe_rcv; + chan->mbox = wp->mbox; + chan->tx_timer = &wp->tx_timer; + wp->dev = dev; + sock_set_flag(sk, SOCK_ZAPPED); + clear_bit(0,&chan->common_critical); +} + + +/*============================================================ + * release_device + * + * During sock release, clear a critical bit, which + * marks the device a being taken. + *===========================================================*/ + + +static void release_device(struct net_device *dev) +{ + wanpipe_common_t *chan=dev->priv; + clear_bit(0,(void*)&chan->rw_bind); +} + +/*============================================================ + * wanpipe_release + * + * Close a PACKET socket. This is fairly simple. We + * immediately go to 'closed' state and remove our + * protocol entry in the device list. + *===========================================================*/ + +static int wanpipe_release(struct socket *sock) +{ + wanpipe_opt *wp; + struct sock *sk = sock->sk; + + if (!sk) + return 0; + + wp = wp_sk(sk); + check_write_queue(sk); + + /* Kill the tx timer, if we don't kill it now, the timer + * will run after we kill the sock. Timer code will + * try to access the sock which has been killed and cause + * kernel panic */ + + del_timer(&wp->tx_timer); + + /* + * Unhook packet receive handler. + */ + + if (wp->num == htons(X25_PROT) && + sk->sk_state != WANSOCK_DISCONNECTED && sock_flag(sk, SOCK_ZAPPED)) { + struct net_device *dev = dev_get_by_index(sk->sk_bound_dev_if); + wanpipe_common_t *chan; + if (dev){ + chan=dev->priv; + atomic_set(&chan->disconnect,1); + DBG_PRINTK(KERN_INFO "wansock: Sending Clear Indication %i\n", + sk->sk_state); + dev_put(dev); + } + } + + set_bit(1,&wanpipe_tx_critical); + write_lock(&wanpipe_sklist_lock); + sk_del_node_init(sk); + write_unlock(&wanpipe_sklist_lock); + clear_bit(1,&wanpipe_tx_critical); + + + + release_driver(sk); + + + /* + * Now the socket is dead. No more input will appear. + */ + + sk->sk_state_change(sk); /* It is useless. Just for sanity. 
*/ + + sock->sk = NULL; + sk->sk_socket = NULL; + sock_set_flag(sk, SOCK_DEAD); + + /* Purge queues */ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&sk->sk_error_queue); + + if (atomic_read(&sk->sk_rmem_alloc) || + atomic_read(&sk->sk_wmem_alloc)) { + del_timer(&sk->sk_timer); + printk(KERN_INFO "wansock: Killing in Timer R %i , W %i\n", + atomic_read(&sk->sk_rmem_alloc), + atomic_read(&sk->sk_wmem_alloc)); + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.expires = jiffies + HZ; + sk->sk_timer.function = wanpipe_destroy_timer; + add_timer(&sk->sk_timer); + return 0; + } + + kfree(wp); + wp_sk(sk) = NULL; + + if (atomic_read(&sk->sk_refcnt) != 1) { + DBG_PRINTK(KERN_INFO "wansock: Error, wrong reference count: %i !:release.\n", + atomic_read(&sk->sk_refcnt)); + atomic_set(&sk->sk_refcnt, 1); + } + sock_put(sk); + atomic_dec(&wanpipe_socks_nr); + return 0; +} + +/*============================================================ + * check_write_queue + * + * During sock shutdown, if the sock state is + * WANSOCK_CONNECTED and there is transmit data + * pending. Wait until data is released + * before proceeding. + *===========================================================*/ + +static void check_write_queue(struct sock *sk) +{ + + if (sk->sk_state != WANSOCK_CONNECTED) + return; + + if (!atomic_read(&sk->sk_wmem_alloc)) + return; + + printk(KERN_INFO "wansock: MAJOR ERROR, Data lost on sock release !!!\n"); + +} + +/*============================================================ + * release_driver + * + * This function is called during sock shutdown, to + * release any resources and links that bind the sock + * to the driver. It also changes the state of the + * sock to WANSOCK_DISCONNECTED + *===========================================================*/ + +static void release_driver(struct sock *sk) +{ + wanpipe_opt *wp; + struct sk_buff *skb=NULL; + struct sock *deadsk=NULL; + + if (sk->sk_state == WANSOCK_LISTEN || + sk->sk_state == WANSOCK_BIND_LISTEN) { + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if ((deadsk = get_newsk_from_skb(skb))){ + DBG_PRINTK (KERN_INFO "wansock: RELEASE: FOUND DEAD SOCK\n"); + sock_set_flag(deadsk, SOCK_DEAD); + start_cleanup_timer(deadsk); + } + kfree_skb(skb); + } + if (sock_flag(sk, SOCK_ZAPPED)) + wanpipe_unlink_card(sk); + }else{ + if (sock_flag(sk, SOCK_ZAPPED)) + wanpipe_unlink_driver(sk); + } + sk->sk_state = WANSOCK_DISCONNECTED; + sk->sk_bound_dev_if = 0; + sock_reset_flag(sk, SOCK_ZAPPED); + wp = wp_sk(sk); + + if (wp && wp->mbox) { + kfree(wp->mbox); + wp->mbox = NULL; + } +} + +/*============================================================ + * start_cleanup_timer + * + * If new incoming call's are pending but the socket + * is being released, start the timer which will + * envoke the kill routines for pending socks. + *===========================================================*/ + + +static void start_cleanup_timer (struct sock *sk) +{ + del_timer(&sk->sk_timer); + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.expires = jiffies + HZ; + sk->sk_timer.function = wanpipe_kill_sock_timer; + add_timer(&sk->sk_timer); +} + + +/*============================================================ + * wanpipe_kill_sock + * + * This is a function which performs actual killing + * of the sock. It releases socket resources, + * and unlinks the sock from the driver. 
+ *===========================================================*/ + +static void wanpipe_kill_sock_timer (unsigned long data) +{ + + struct sock *sk = (struct sock *)data; + struct sock **skp; + + if (!sk) + return; + + /* This function can be called from interrupt. We must use + * appropriate locks */ + + if (test_bit(1,&wanpipe_tx_critical)){ + sk->sk_timer.expires = jiffies + 10; + add_timer(&sk->sk_timer); + return; + } + + write_lock(&wanpipe_sklist_lock); + sk_del_node_init(sk); + write_unlock(&wanpipe_sklist_lock); + + + if (wp_sk(sk)->num == htons(X25_PROT) && + sk->sk_state != WANSOCK_DISCONNECTED) { + struct net_device *dev = dev_get_by_index(sk->sk_bound_dev_if); + wanpipe_common_t *chan; + if (dev){ + chan=dev->priv; + atomic_set(&chan->disconnect,1); + dev_put(dev); + } + } + + release_driver(sk); + + sk->sk_socket = NULL; + + /* Purge queues */ + skb_queue_purge(&sk->sk_receive_queue); + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&sk->sk_error_queue); + + if (atomic_read(&sk->sk_rmem_alloc) || + atomic_read(&sk->sk_wmem_alloc)) { + del_timer(&sk->sk_timer); + printk(KERN_INFO "wansock: Killing SOCK in Timer\n"); + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.expires = jiffies + HZ; + sk->sk_timer.function = wanpipe_destroy_timer; + add_timer(&sk->sk_timer); + return; + } + + if (wp_sk(sk)) { + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; + } + + if (atomic_read(&sk->sk_refcnt) != 1) { + atomic_set(&sk->sk_refcnt, 1); + DBG_PRINTK(KERN_INFO "wansock: Error, wrong reference count: %i ! :timer.\n", + atomic_read(&sk->sk_refcnt)); + } + sock_put(sk); + atomic_dec(&wanpipe_socks_nr); + return; +} + +static void wanpipe_kill_sock_accept (struct sock *sk) +{ + + struct sock **skp; + + if (!sk) + return; + + /* This function can be called from interrupt. We must use + * appropriate locks */ + + write_lock(&wanpipe_sklist_lock); + sk_del_node_init(sk); + write_unlock(&wanpipe_sklist_lock); + + sk->sk_socket = NULL; + + + if (wp_sk(sk)) { + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; + } + + if (atomic_read(&sk->sk_refcnt) != 1) { + atomic_set(&sk->sk_refcnt, 1); + DBG_PRINTK(KERN_INFO "wansock: Error, wrong reference count: %i ! :timer.\n", + atomic_read(&sk->sk_refcnt)); + } + sock_put(sk); + atomic_dec(&wanpipe_socks_nr); + return; +} + + +static void wanpipe_kill_sock_irq (struct sock *sk) +{ + + if (!sk) + return; + + sk->sk_socket = NULL; + + if (wp_sk(sk)) { + kfree(wp_sk(sk)); + wp_sk(sk) = NULL; + } + + if (atomic_read(&sk->sk_refcnt) != 1) { + atomic_set(&sk->sk_refcnt, 1); + DBG_PRINTK(KERN_INFO "wansock: Error, wrong reference count: %i !:listen.\n", + atomic_read(&sk->sk_refcnt)); + } + sock_put(sk); + atomic_dec(&wanpipe_socks_nr); +} + + +/*============================================================ + * wanpipe_do_bind + * + * Bottom half of the binding system call. + * Once the wanpipe_bind() function checks the + * legality of the call, this function binds the + * sock to the driver. 
+ *===========================================================*/ + +static int wanpipe_do_bind(struct sock *sk, struct net_device *dev, + int protocol) +{ + wanpipe_opt *wp = wp_sk(sk); + wanpipe_common_t *chan=NULL; + int err=0; + + if (sock_flag(sk, SOCK_ZAPPED)) { + err = -EALREADY; + goto bind_unlock_exit; + } + + wp->num = protocol; + + if (protocol == 0){ + release_device(dev); + err = -EINVAL; + goto bind_unlock_exit; + } + + if (dev) { + if (dev->flags&IFF_UP) { + chan=dev->priv; + sk->sk_state = chan->state; + + if (wp->num == htons(X25_PROT) && + sk->sk_state != WANSOCK_DISCONNECTED && + sk->sk_state != WANSOCK_CONNECTING) { + DBG_PRINTK(KERN_INFO + "wansock: Binding to Device not DISCONNECTED %i\n", + sk->sk_state); + release_device(dev); + err = -EAGAIN; + goto bind_unlock_exit; + } + + wanpipe_link_driver(dev,sk); + sk->sk_bound_dev_if = dev->ifindex; + + /* X25 Specific option */ + if (wp->num == htons(X25_PROT)) + wp_sk(sk)->svc = chan->svc; + + } else { + sk->sk_err = ENETDOWN; + sk->sk_error_report(sk); + release_device(dev); + err = -EINVAL; + } + } else { + err = -ENODEV; + } +bind_unlock_exit: + /* FIXME where is this lock */ + + return err; +} + +/*============================================================ + * wanpipe_bind + * + * BIND() System call, which is bound to the AF_WANPIPE + * operations structure. It checks for correct wanpipe + * card name, and cross references interface names with + * the card names. Thus, interface name must belong to + * the actual card. + *===========================================================*/ + + +static int wanpipe_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct wan_sockaddr_ll *sll = (struct wan_sockaddr_ll*)uaddr; + struct sock *sk=sock->sk; + wanpipe_opt *wp = wp_sk(sk); + struct net_device *dev = NULL; + sdla_t *card=NULL; + char name[15]; + + /* + * Check legality + */ + + if (addr_len < sizeof(struct wan_sockaddr_ll)){ + printk(KERN_INFO "wansock: Address length error\n"); + return -EINVAL; + } + if (sll->sll_family != AF_WANPIPE){ + printk(KERN_INFO "wansock: Illegal family name specified.\n"); + return -EINVAL; + } + + card = wanpipe_find_card (sll->sll_card); + if (!card){ + printk(KERN_INFO "wansock: Wanpipe card not found: %s\n",sll->sll_card); + return -ENODEV; + }else{ + wp_sk(sk)->card = (void *)card; + } + + if (!strcmp(sll->sll_device,"svc_listen")){ + + /* Bind a sock to a card structure for listening + */ + int err=0; + + /* This is x25 specific area if protocol doesn't + * match, return error */ + if (sll->sll_protocol != htons(X25_PROT)) + return -EINVAL; + + err= wanpipe_link_card (sk); + if (err < 0) + return err; + + if (sll->sll_protocol) + wp->num = sll->sll_protocol; + sk->sk_state = WANSOCK_BIND_LISTEN; + return 0; + + }else if (!strcmp(sll->sll_device,"svc_connect")){ + + /* This is x25 specific area if protocol doesn't + * match, return error */ + if (sll->sll_protocol != htons(X25_PROT)) + return -EINVAL; + + /* Find a free device + */ + dev = wanpipe_find_free_dev(card); + if (dev == NULL){ + DBG_PRINTK(KERN_INFO "wansock: No free network devices for card %s\n", + card->devname); + return -EINVAL; + } + }else{ + /* Bind a socket to a interface name + * This is used by PVC mostly + */ + strlcpy(name,sll->sll_device,sizeof(name)); + dev = dev_get_by_name(name); + if (dev == NULL){ + printk(KERN_INFO "wansock: Failed to get Dev from name: %s,\n", + name); + return -ENODEV; + } + + dev_put(dev); + + if (check_dev(dev, card)){ + printk(KERN_INFO "wansock: Device %s, 
doesn't belong to card %s\n", + dev->name, card->devname); + return -EINVAL; + } + if (get_atomic_device (dev)) + return -EINVAL; + } + + return wanpipe_do_bind(sk, dev, sll->sll_protocol ? : wp->num); +} + +/*============================================================ + * get_atomic_device + * + * Sets a bit atomically which indicates that + * the interface is taken. This avoids race conditions. + *===========================================================*/ + + +static inline int get_atomic_device(struct net_device *dev) +{ + wanpipe_common_t *chan = dev->priv; + if (!test_and_set_bit(0,(void *)&chan->rw_bind)){ + return 0; + } + return 1; +} + +/*============================================================ + * check_dev + * + * Check that device name belongs to a particular card. + *===========================================================*/ + +static int check_dev(struct net_device *dev, sdla_t *card) +{ + struct net_device* tmp_dev; + + for (tmp_dev = card->wandev.dev; tmp_dev; + tmp_dev = *((struct net_device **)tmp_dev->priv)) { + if (tmp_dev->ifindex == dev->ifindex){ + return 0; + } + } + return 1; +} + +/*============================================================ + * wanpipe_find_free_dev + * + * Find a free network interface. If found set atomic + * bit indicating that the interface is taken. + * X25API Specific. + *===========================================================*/ + +struct net_device *wanpipe_find_free_dev(sdla_t *card) +{ + struct net_device* dev; + volatile wanpipe_common_t *chan; + + if (test_and_set_bit(0,&find_free_critical)){ + printk(KERN_INFO "CRITICAL in Find Free\n"); + } + + for (dev = card->wandev.dev; dev; + dev = *((struct net_device **)dev->priv)) { + chan = dev->priv; + if (!chan) + continue; + if (chan->usedby == API && chan->svc){ + if (!get_atomic_device (dev)){ + if (chan->state != WANSOCK_DISCONNECTED){ + release_device(dev); + }else{ + clear_bit(0,&find_free_critical); + return dev; + } + } + } + } + clear_bit(0,&find_free_critical); + return NULL; +} + +/*============================================================ + * wanpipe_create + * + * SOCKET() System call. It allocates a sock structure + * and adds the socket to the wanpipe_sk_list. + * Crates AF_WANPIPE socket. + *===========================================================*/ + +static int wanpipe_create(struct socket *sock, int protocol) +{ + struct sock *sk; + + //FIXME: This checks for root user, SECURITY ? + //if (!capable(CAP_NET_RAW)) + // return -EPERM; + + if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW) + return -ESOCKTNOSUPPORT; + + sock->state = SS_UNCONNECTED; + + if ((sk = wanpipe_alloc_socket()) == NULL) + return -ENOBUFS; + + sk->sk_reuse = 1; + sock->ops = &wanpipe_ops; + sock_init_data(sock,sk); + + sock_reset_flag(sk, SOCK_ZAPPED); + sk->sk_family = PF_WANPIPE; + wp_sk(sk)->num = protocol; + sk->sk_state = WANSOCK_DISCONNECTED; + sk->sk_ack_backlog = 0; + sk->sk_bound_dev_if = 0; + + atomic_inc(&wanpipe_socks_nr); + + /* We must disable interrupts because the ISR + * can also change the list */ + set_bit(1,&wanpipe_tx_critical); + write_lock(&wanpipe_sklist_lock); + sk_add_node(sk, &wanpipe_sklist); + write_unlock(&wanpipe_sklist_lock); + clear_bit(1,&wanpipe_tx_critical); + + return(0); +} + + +/*============================================================ + * wanpipe_recvmsg + * + * Pull a packet from our receive queue and hand it + * to the user. If necessary we block. 
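+ *
+ * MSG_OOB reads pull from sk->sk_error_queue, which is where
+ * wanpipe_rcv() parks asynchronous WAN_PACKET_ERR events (clear
+ * call, restart); ordinary reads go through skb_recv_datagram().
+ * After a packet has been dequeued, wanpipe_wakeup_driver() gives a
+ * driver that blocked on a full receive queue a chance to resume
+ * delivery.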
+ *===========================================================*/ + +static int wanpipe_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, int len, int flags) +{ + struct sock *sk = sock->sk; + struct sk_buff *skb; + int copied, err=-ENOBUFS; + + + /* + * If the address length field is there to be filled in, we fill + * it in now. + */ + + msg->msg_namelen = sizeof(struct wan_sockaddr_ll); + + /* + * Call the generic datagram receiver. This handles all sorts + * of horrible races and re-entrancy so we can forget about it + * in the protocol layers. + * + * Now it will return ENETDOWN, if device have just gone down, + * but then it will block. + */ + + if (flags & MSG_OOB){ + skb = skb_dequeue(&sk->sk_error_queue); + }else{ + skb=skb_recv_datagram(sk,flags,1,&err); + } + /* + * An error occurred so return it. Because skb_recv_datagram() + * handles the blocking we don't see and worry about blocking + * retries. + */ + + if(skb==NULL) + goto out; + + /* + * You lose any data beyond the buffer you gave. If it worries a + * user program they can ask the device for its MTU anyway. + */ + + copied = skb->len; + if (copied > len) + { + copied=len; + msg->msg_flags|=MSG_TRUNC; + } + + wanpipe_wakeup_driver(sk); + + /* We can't use skb_copy_datagram here */ + err = memcpy_toiovec(msg->msg_iov, skb->data, copied); + if (err) + goto out_free; + + sock_recv_timestamp(msg, sk, skb); + + if (msg->msg_name) + memcpy(msg->msg_name, skb->cb, msg->msg_namelen); + + /* + * Free or return the buffer as appropriate. Again this + * hides all the races and re-entrancy issues from us. + */ + err = (flags&MSG_TRUNC) ? skb->len : copied; + +out_free: + skb_free_datagram(sk, skb); +out: + return err; +} + + +/*============================================================ + * wanpipe_wakeup_driver + * + * If socket receive buffer is full and driver cannot + * pass data up the sock, it sets a packet_block flag. + * This function check that flag and if sock receive + * queue has room it kicks the driver BH handler. + * + * This way, driver doesn't have to poll the sock + * receive queue. + *===========================================================*/ + +static void wanpipe_wakeup_driver(struct sock *sk) +{ + struct net_device *dev = NULL; + wanpipe_common_t *chan=NULL; + + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) + return; + + dev_put(dev); + + if ((chan = dev->priv) == NULL) + return; + + if (atomic_read(&chan->receive_block)){ + if (atomic_read(&sk->sk_rmem_alloc) < + ((unsigned)sk->sk_rcvbuf * 0.9)) { + printk(KERN_INFO "wansock: Queuing task for wanpipe\n"); + atomic_set(&chan->receive_block,0); + wanpipe_queue_tq(&chan->wanpipe_task); + wanpipe_mark_bh(); + } + } +} + +/*============================================================ + * wanpipe_getname + * + * I don't know what to do with this yet. + * User can use this function to get sock address + * information. Not very useful for Sangoma's purposes. 
+ *===========================================================*/ + + +static int wanpipe_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct net_device *dev; + struct sock *sk = sock->sk; + struct wan_sockaddr_ll *sll = (struct wan_sockaddr_ll*)uaddr; + + sll->sll_family = AF_WANPIPE; + sll->sll_ifindex = sk->sk_bound_dev_if; + sll->sll_protocol = wp_sk(sk)->num; + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (dev) { + sll->sll_hatype = dev->type; + sll->sll_halen = dev->addr_len; + memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len); + } else { + sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */ + sll->sll_halen = 0; + } + *uaddr_len = sizeof(*sll); + + dev_put(dev); + + return 0; +} + +/*============================================================ + * wanpipe_notifier + * + * If driver turns off network interface, this function + * will be envoked. Currently I treate it as a + * call disconnect. More thought should go into this + * function. + * + * FIXME: More thought should go into this function. + * + *===========================================================*/ + +static int wanpipe_notifier(struct notifier_block *this, unsigned long msg, void *data) +{ + struct sock *sk; + hlist_node *node; + struct net_device *dev = (struct net_device *)data; + + sk_for_each(sk, node, &wanpipe_sklist) { + struct wanpipe_opt *po = wp_sk(sk); + + if (!po) + continue; + if (dev == NULL) + continue; + + switch (msg) { + case NETDEV_DOWN: + case NETDEV_UNREGISTER: + if (dev->ifindex == sk->sk_bound_dev_if) { + printk(KERN_INFO "wansock: Device down %s\n",dev->name); + if (sock_flag(sk, SOCK_ZAPPED)) { + wanpipe_unlink_driver(sk); + sk->sk_err = ENETDOWN; + sk->sk_error_report(sk); + } + + if (msg == NETDEV_UNREGISTER) { + printk(KERN_INFO "wansock: Unregistering Device: %s\n", + dev->name); + wanpipe_unlink_driver(sk); + sk->sk_bound_dev_if = 0; + } + } + break; + case NETDEV_UP: + if (dev->ifindex == sk->sk_bound_dev_if && + po->num && !sock_flag(sk, SOCK_ZAPPED)) { + printk(KERN_INFO "wansock: Registering Device: %s\n", + dev->name); + wanpipe_link_driver(dev,sk); + } + break; + } + } + return NOTIFY_DONE; +} + +/*============================================================ + * wanpipe_ioctl + * + * Execute a user commands, and set socket options. + * + * FIXME: More thought should go into this function. 
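[Editorial sketch] wanpipe_notifier() above follows the standard 2.6-era netdevice notifier pattern: a notifier_block whose callback receives the event code and the affected struct net_device. The fragment below is a generic, hypothetical illustration of that pattern (names prefixed example_ are invented); it is a kernel-code sketch, not standalone runnable, and registration/unregistration happen at module init/cleanup as in this file.

#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/notifier.h>

/* Hypothetical notifier: log interface state changes, the same events
 * wanpipe_notifier() reacts to for its bound sockets. */
static int example_netdev_event(struct notifier_block *this,
				unsigned long msg, void *data)
{
	struct net_device *dev = data;

	switch (msg) {
	case NETDEV_UP:
		printk(KERN_INFO "example: %s is up\n", dev->name);
		break;
	case NETDEV_DOWN:
	case NETDEV_UNREGISTER:
		printk(KERN_INFO "example: %s is going away\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_notifier) at init time,
 * unregister_netdevice_notifier(&example_netdev_notifier) on cleanup. */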
+ * + *===========================================================*/ + +static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + int err; + + switch(cmd) + { + case SIOCGSTAMP: + return sock_get_timestamp(sk, (struct timeval __user *)arg); + + case SIOC_WANPIPE_CHECK_TX: + + return atomic_read(&sk->sk_wmem_alloc); + + case SIOC_WANPIPE_SOCK_STATE: + + if (sk->sk_state == WANSOCK_CONNECTED) + return 0; + + return 1; + + + case SIOC_WANPIPE_GET_CALL_DATA: + + return get_ioctl_cmd (sk,(void*)arg); + + case SIOC_WANPIPE_SET_CALL_DATA: + + return set_ioctl_cmd (sk,(void*)arg); + + case SIOC_WANPIPE_ACCEPT_CALL: + case SIOC_WANPIPE_CLEAR_CALL: + case SIOC_WANPIPE_RESET_CALL: + + if ((err=set_ioctl_cmd(sk,(void*)arg)) < 0) + return err; + + err=wanpipe_exec_cmd(sk,cmd,0); + get_ioctl_cmd(sk,(void*)arg); + return err; + + case SIOC_WANPIPE_DEBUG: + + return wanpipe_debug(sk,(void*)arg); + + case SIOC_WANPIPE_SET_NONBLOCK: + + if (sk->sk_state != WANSOCK_DISCONNECTED) + return -EINVAL; + + sock->file->f_flags |= O_NONBLOCK; + return 0; + +#ifdef CONFIG_INET + case SIOCADDRT: + case SIOCDELRT: + case SIOCDARP: + case SIOCGARP: + case SIOCSARP: + case SIOCDRARP: + case SIOCGRARP: + case SIOCSRARP: + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCSIFFLAGS: + return inet_dgram_ops.ioctl(sock, cmd, arg); +#endif + + default: + return dev_ioctl(cmd,(void __user *) arg); + } + /*NOTREACHED*/ +} + +/*============================================================ + * wanpipe_debug + * + * This function will pass up information about all + * active sockets. + * + * FIXME: More thought should go into this function. 
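[Editorial sketch] The ioctl switch above forwards SIOCGSTAMP to sock_get_timestamp(), so userspace can ask for the receive timestamp of the last packet delivered on the socket. A minimal userspace call site, assuming only standard headers:

#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/time.h>
#include <linux/sockios.h>	/* SIOCGSTAMP */

/* Fetch the receive timestamp of the last packet seen on this socket,
 * the SIOCGSTAMP case handled by wanpipe_ioctl() above. */
static void print_last_rx_stamp(int fd)
{
	struct timeval tv;

	if (ioctl(fd, SIOCGSTAMP, &tv) < 0) {
		perror("ioctl(SIOCGSTAMP)");
		return;
	}
	printf("last packet received at %ld.%06ld\n",
	       (long)tv.tv_sec, (long)tv.tv_usec);
}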
+ * + *===========================================================*/ + +static int wanpipe_debug (struct sock *origsk, void *arg) +{ + struct sock *sk; + struct hlist_node *node; + struct net_device *dev = NULL; + wanpipe_common_t *chan=NULL; + int cnt=0, err=0; + wan_debug_t *dbg_data = (wan_debug_t *)arg; + + sk_for_each(sk, node, &wanpipe_sklist) { + wanpipe_opt *wp = wp_sk(sk); + + if (sk == origsk){ + continue; + } + + if ((err=put_user(1, &dbg_data->debug[cnt].free))) + return err; + if ((err = put_user(sk->sk_state, + &dbg_data->debug[cnt].state_sk))) + return err; + if ((err = put_user(sk->sk_rcvbuf, + &dbg_data->debug[cnt].rcvbuf))) + return err; + if ((err = put_user(atomic_read(&sk->sk_rmem_alloc), + &dbg_data->debug[cnt].rmem))) + return err; + if ((err = put_user(atomic_read(&sk->sk_wmem_alloc), + &dbg_data->debug[cnt].wmem))) + return err; + if ((err = put_user(sk->sk_sndbuf, + &dbg_data->debug[cnt].sndbuf))) + return err; + if ((err=put_user(sk_count, &dbg_data->debug[cnt].sk_count))) + return err; + if ((err=put_user(wp->poll_cnt, &dbg_data->debug[cnt].poll_cnt))) + return err; + if ((err = put_user(sk->sk_bound_dev_if, + &dbg_data->debug[cnt].bound))) + return err; + + if (sk->sk_bound_dev_if) { + dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) + continue; + + chan=dev->priv; + dev_put(dev); + + if ((err=put_user(chan->state, &dbg_data->debug[cnt].d_state))) + return err; + if ((err=put_user(chan->svc, &dbg_data->debug[cnt].svc))) + return err; + + if ((err=put_user(atomic_read(&chan->command), + &dbg_data->debug[cnt].command))) + return err; + + + if (wp){ + sdla_t *card = (sdla_t*)wp->card; + + if (card){ + if ((err=put_user(atomic_read(&card->u.x.command_busy), + &dbg_data->debug[cnt].cmd_busy))) + return err; + } + + if ((err=put_user(wp->lcn, + &dbg_data->debug[cnt].lcn))) + return err; + + if (wp->mbox) { + if ((err=put_user(1, &dbg_data->debug[cnt].mbox))) + return err; + } + } + + if ((err=put_user(atomic_read(&chan->receive_block), + &dbg_data->debug[cnt].rblock))) + return err; + + if (copy_to_user(dbg_data->debug[cnt].name, dev->name, strlen(dev->name))) + return -EFAULT; + } + + if (++cnt == MAX_NUM_DEBUG) + break; + } + return 0; +} + +/*============================================================ + * get_ioctl_cmd + * + * Pass up the contents of socket MBOX to the user. 
+ *===========================================================*/ + +static int get_ioctl_cmd (struct sock *sk, void *arg) +{ + x25api_t *usr_data = (x25api_t *)arg; + mbox_cmd_t *mbox_ptr; + int err; + + if (usr_data == NULL) + return -EINVAL; + + if (!wp_sk(sk)->mbox) { + return -EINVAL; + } + + mbox_ptr = (mbox_cmd_t *)wp_sk(sk)->mbox; + + if ((err=put_user(mbox_ptr->cmd.qdm, &usr_data->hdr.qdm))) + return err; + if ((err=put_user(mbox_ptr->cmd.cause, &usr_data->hdr.cause))) + return err; + if ((err=put_user(mbox_ptr->cmd.diagn, &usr_data->hdr.diagn))) + return err; + if ((err=put_user(mbox_ptr->cmd.length, &usr_data->hdr.length))) + return err; + if ((err=put_user(mbox_ptr->cmd.result, &usr_data->hdr.result))) + return err; + if ((err=put_user(mbox_ptr->cmd.lcn, &usr_data->hdr.lcn))) + return err; + + if (mbox_ptr->cmd.length > 0){ + if (mbox_ptr->cmd.length > X25_MAX_DATA) + return -EINVAL; + + if (copy_to_user(usr_data->data, mbox_ptr->data, mbox_ptr->cmd.length)){ + printk(KERN_INFO "wansock: Copy failed !!!\n"); + return -EFAULT; + } + } + return 0; +} + +/*============================================================ + * set_ioctl_cmd + * + * Before command can be execute, socket MBOX must + * be created, and initialized with user data. + *===========================================================*/ + +static int set_ioctl_cmd (struct sock *sk, void *arg) +{ + x25api_t *usr_data = (x25api_t *)arg; + mbox_cmd_t *mbox_ptr; + int err; + + if (!wp_sk(sk)->mbox) { + void *mbox_ptr; + struct net_device *dev = dev_get_by_index(sk->sk_bound_dev_if); + if (!dev) + return -ENODEV; + + dev_put(dev); + + if ((mbox_ptr = kmalloc(sizeof(mbox_cmd_t), GFP_ATOMIC)) == NULL) + return -ENOMEM; + + memset(mbox_ptr, 0, sizeof(mbox_cmd_t)); + wp_sk(sk)->mbox = mbox_ptr; + + wanpipe_link_driver(dev,sk); + } + + mbox_ptr = (mbox_cmd_t*)wp_sk(sk)->mbox; + memset(mbox_ptr, 0, sizeof(mbox_cmd_t)); + + if (usr_data == NULL){ + return 0; + } + if ((err=get_user(mbox_ptr->cmd.qdm, &usr_data->hdr.qdm))) + return err; + if ((err=get_user(mbox_ptr->cmd.cause, &usr_data->hdr.cause))) + return err; + if ((err=get_user(mbox_ptr->cmd.diagn, &usr_data->hdr.diagn))) + return err; + if ((err=get_user(mbox_ptr->cmd.length, &usr_data->hdr.length))) + return err; + if ((err=get_user(mbox_ptr->cmd.result, &usr_data->hdr.result))) + return err; + + if (mbox_ptr->cmd.length > 0){ + if (mbox_ptr->cmd.length > X25_MAX_DATA) + return -EINVAL; + + if (copy_from_user(mbox_ptr->data, usr_data->data, mbox_ptr->cmd.length)){ + printk(KERN_INFO "Copy failed\n"); + return -EFAULT; + } + } + return 0; +} + + +/*====================================================================== + * wanpipe_poll + * + * Datagram poll: Again totally generic. This also handles + * sequenced packet sockets providing the socket receive queue + * is only ever holding data ready to receive. + * + * Note: when you _don't_ use this routine for this protocol, + * and you use a different write policy from sock_writeable() + * then please supply your own write_space callback. + *=====================================================================*/ + +unsigned int wanpipe_poll(struct file * file, struct socket *sock, poll_table *wait) +{ + struct sock *sk = sock->sk; + unsigned int mask; + + ++wp_sk(sk)->poll_cnt; + + poll_wait(file, sk->sk_sleep, wait); + mask = 0; + + /* exceptional events? 
*/ + if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) { + mask |= POLLPRI; + return mask; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLHUP; + + /* readable? */ + if (!skb_queue_empty(&sk->sk_receive_queue)) { + mask |= POLLIN | POLLRDNORM; + } + + /* connection hasn't started yet */ + if (sk->sk_state == WANSOCK_CONNECTING) { + return mask; + } + + if (sk->sk_state == WANSOCK_DISCONNECTED) { + mask = POLLPRI; + return mask; + } + + /* This check blocks the user process if there is + * a packet already queued in the socket write queue. + * This option is only for X25API protocol, for other + * protocol like chdlc enable streaming mode, + * where multiple packets can be pending in the socket + * transmit queue */ + + if (wp_sk(sk)->num == htons(X25_PROT)) { + if (atomic_read(&wp_sk(sk)->packet_sent)) + return mask; + } + + /* writable? */ + if (sock_writeable(sk)){ + mask |= POLLOUT | POLLWRNORM | POLLWRBAND; + }else{ + set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); + } + + return mask; +} + +/*====================================================================== + * wanpipe_listen + * + * X25API Specific function. Set a socket into LISTENING MODE. + *=====================================================================*/ + + +static int wanpipe_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + + /* This is x25 specific area if protocol doesn't + * match, return error */ + if (wp_sk(sk)->num != htons(X25_PROT)) + return -EINVAL; + + if (sk->sk_state == WANSOCK_BIND_LISTEN) { + + sk->sk_max_ack_backlog = backlog; + sk->sk_state = WANSOCK_LISTEN; + return 0; + }else{ + printk(KERN_INFO "wansock: Listening sock was not binded\n"); + } + + return -EINVAL; +} + +/*====================================================================== + * wanpipe_link_card + * + * Connects the listening socket to the driver + *=====================================================================*/ + +static int wanpipe_link_card (struct sock *sk) +{ + sdla_t *card = (sdla_t*)wp_sk(sk)->card; + + if (!card) + return -ENOMEM; + + if ((card->sk != NULL) || (card->func != NULL)){ + printk(KERN_INFO "wansock: Listening queue is already established\n"); + return -EINVAL; + } + + card->sk=sk; + card->func=wanpipe_listen_rcv; + sock_set_flag(sk, SOCK_ZAPPED); + + return 0; +} + +/*====================================================================== + * wanpipe_listen + * + * X25API Specific function. Disconnect listening socket from + * the driver. + *=====================================================================*/ + +static void wanpipe_unlink_card (struct sock *sk) +{ + sdla_t *card = (sdla_t*)wp_sk(sk)->card; + + if (card){ + card->sk=NULL; + card->func=NULL; + } +} + +/*====================================================================== + * wanpipe_exec_cmd + * + * Ioctl function calls this function to execute user command. + * Connect() sytem call also calls this function to execute + * place call. This function blocks until command is executed. 
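[Editorial sketch] The mask built by wanpipe_poll() above maps socket conditions onto the usual poll bits: POLLPRI for pending error/OOB data, POLLHUP on receive shutdown, POLLIN/POLLRDNORM when the receive queue is non-empty and POLLOUT/POLLWRNORM/POLLWRBAND when the socket is writable. A small userspace wait loop that consumes exactly those bits (the timeout value is illustrative):

#include <poll.h>
#include <stdio.h>

/* Wait for readability, writability or an exceptional condition,
 * matching the mask assembled by wanpipe_poll(). */
static int wait_for_event(int fd, int timeout_ms)
{
	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT | POLLPRI };
	int rc = poll(&pfd, 1, timeout_ms);

	if (rc <= 0)
		return rc;	/* 0 = timeout, <0 = error */

	if (pfd.revents & POLLPRI)
		printf("exception/OOB pending (see the MSG_OOB path in recvmsg)\n");
	if (pfd.revents & POLLHUP)	/* reported even though not requested */
		printf("connection shut down\n");
	if (pfd.revents & POLLIN)
		printf("data ready to read\n");
	if (pfd.revents & POLLOUT)
		printf("socket writable\n");
	return rc;
}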
+ *=====================================================================*/ + +static int wanpipe_exec_cmd(struct sock *sk, int cmd, unsigned int flags) +{ + int err = -EINVAL; + wanpipe_opt *wp = wp_sk(sk); + mbox_cmd_t *mbox_ptr = (mbox_cmd_t*)wp->mbox; + + if (!mbox_ptr){ + printk(KERN_INFO "NO MBOX PTR !!!!!\n"); + return -EINVAL; + } + + /* This is x25 specific area if protocol doesn't + * match, return error */ + if (wp->num != htons(X25_PROT)) + return -EINVAL; + + + switch (cmd){ + + case SIOC_WANPIPE_ACCEPT_CALL: + + if (sk->sk_state != WANSOCK_CONNECTING) { + err = -EHOSTDOWN; + break; + } + + err = execute_command(sk,X25_ACCEPT_CALL,0); + if (err < 0) + break; + + /* Update. Mar6 2000. + * Do not set the sock lcn number here, since + * it is done in wanpipe_listen_rcv(). + */ + if (sk->sk_state == WANSOCK_CONNECTED) { + wp->lcn = ((mbox_cmd_t*)wp->mbox)->cmd.lcn; + DBG_PRINTK(KERN_INFO "\nwansock: Accept OK %i\n", + wp->lcn); + err = 0; + + }else{ + DBG_PRINTK (KERN_INFO "\nwansock: Accept Failed %i\n", + wp->lcn); + wp->lcn = 0; + err = -ECONNREFUSED; + } + break; + + case SIOC_WANPIPE_CLEAR_CALL: + + if (sk->sk_state == WANSOCK_DISCONNECTED) { + err = -EINVAL; + break; + } + + + /* Check if data buffers are pending for transmission, + * if so, check whether user wants to wait until data + * is transmitted, or clear a call and drop packets */ + + if (atomic_read(&sk->sk_wmem_alloc) || + check_driver_busy(sk)) { + mbox_cmd_t *mbox = wp->mbox; + if (mbox->cmd.qdm & 0x80){ + mbox->cmd.result = 0x35; + err = -EAGAIN; + break; + } + } + + sk->sk_state = WANSOCK_DISCONNECTING; + + err = execute_command(sk,X25_CLEAR_CALL,0); + if (err < 0) + break; + + err = -ECONNREFUSED; + if (sk->sk_state == WANSOCK_DISCONNECTED) { + DBG_PRINTK(KERN_INFO "\nwansock: CLEAR OK %i\n", + wp->lcn); + wp->lcn = 0; + err = 0; + } + break; + + case SIOC_WANPIPE_RESET_CALL: + + if (sk->sk_state != WANSOCK_CONNECTED) { + err = -EINVAL; + break; + } + + + /* Check if data buffers are pending for transmission, + * if so, check whether user wants to wait until data + * is transmitted, or reset a call and drop packets */ + + if (atomic_read(&sk->sk_wmem_alloc) || + check_driver_busy(sk)) { + mbox_cmd_t *mbox = wp->mbox; + if (mbox->cmd.qdm & 0x80){ + mbox->cmd.result = 0x35; + err = -EAGAIN; + break; + } + } + + + err = execute_command(sk, X25_RESET,0); + if (err < 0) + break; + + err = mbox_ptr->cmd.result; + break; + + + case X25_PLACE_CALL: + + err=execute_command(sk,X25_PLACE_CALL,flags); + if (err < 0) + break; + + if (sk->sk_state == WANSOCK_CONNECTED) { + + wp->lcn = ((mbox_cmd_t*)wp->mbox)->cmd.lcn; + + DBG_PRINTK(KERN_INFO "\nwansock: PLACE CALL OK %i\n", + wp->lcn); + err = 0; + + } else if (sk->sk_state == WANSOCK_CONNECTING && + (flags & O_NONBLOCK)) { + wp->lcn = ((mbox_cmd_t*)wp->mbox)->cmd.lcn; + DBG_PRINTK(KERN_INFO "\nwansock: Place Call OK: Waiting %i\n", + wp->lcn); + + err = 0; + + }else{ + DBG_PRINTK(KERN_INFO "\nwansock: Place call Failed\n"); + err = -ECONNREFUSED; + } + + break; + + default: + return -EINVAL; + } + + return err; +} + +static int check_driver_busy (struct sock *sk) +{ + struct net_device *dev = dev_get_by_index(sk->sk_bound_dev_if); + wanpipe_common_t *chan; + + if (!dev) + return 0; + + dev_put(dev); + + if ((chan=dev->priv) == NULL) + return 0; + + return atomic_read(&chan->driver_busy); +} + + +/*====================================================================== + * wanpipe_accept + * + * ACCEPT() System call. X25API Specific function. 
+ * For each incoming call, create a new socket and + * return it to the user. + *=====================================================================*/ + +static int wanpipe_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk; + struct sock *newsk; + struct sk_buff *skb; + DECLARE_WAITQUEUE(wait, current); + int err=0; + + if (newsock->sk != NULL){ + wanpipe_kill_sock_accept(newsock->sk); + newsock->sk=NULL; + } + + if ((sk = sock->sk) == NULL) + return -EINVAL; + + if (sk->sk_type != SOCK_RAW) + return -EOPNOTSUPP; + + if (sk->sk_state != WANSOCK_LISTEN) + return -EINVAL; + + if (wp_sk(sk)->num != htons(X25_PROT)) + return -EINVAL; + + add_wait_queue(sk->sk_sleep,&wait); + current->state = TASK_INTERRUPTIBLE; + for (;;){ + skb = skb_dequeue(&sk->sk_receive_queue); + if (skb){ + err=0; + break; + } + if (signal_pending(current)) { + err = -ERESTARTSYS; + break; + } + schedule(); + } + current->state = TASK_RUNNING; + remove_wait_queue(sk->sk_sleep,&wait); + + if (err != 0) + return err; + + newsk = get_newsk_from_skb(skb); + if (!newsk){ + return -EINVAL; + } + + set_bit(1,&wanpipe_tx_critical); + write_lock(&wanpipe_sklist_lock); + sk_add_node(newsk, &wanpipe_sklist); + write_unlock(&wanpipe_sklist_lock); + clear_bit(1,&wanpipe_tx_critical); + + newsk->sk_socket = newsock; + newsk->sk_sleep = &newsock->wait; + + /* Now attach up the new socket */ + sk->sk_ack_backlog--; + newsock->sk = newsk; + + kfree_skb(skb); + + DBG_PRINTK(KERN_INFO "\nwansock: ACCEPT Got LCN %i\n", + wp_sk(newsk)->lcn); + return 0; +} + +/*====================================================================== + * get_newsk_from_skb + * + * Accept() uses this function to get the address of the new + * socket structure. + *=====================================================================*/ + +struct sock * get_newsk_from_skb (struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + wanpipe_common_t *chan; + + if (!dev){ + return NULL; + } + + if ((chan = dev->priv) == NULL){ + return NULL; + } + + if (!chan->sk){ + return NULL; + } + return (struct sock *)chan->sk; +} + +/*====================================================================== + * wanpipe_connect + * + * CONNECT() System Call. X25API specific function + * Check the state of the sock, and execute PLACE_CALL command. + * Connect can ether block or return without waiting for connection, + * if specified by user. 
+ *=====================================================================*/ + +static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct wan_sockaddr_ll *addr = (struct wan_sockaddr_ll*)uaddr; + struct net_device *dev; + int err; + + if (wp_sk(sk)->num != htons(X25_PROT)) + return -EINVAL; + + if (sk->sk_state == WANSOCK_CONNECTED) + return -EISCONN; /* No reconnect on a seqpacket socket */ + + if (sk->sk_state != WAN_DISCONNECTED) { + printk(KERN_INFO "wansock: Trying to connect on channel NON DISCONNECT\n"); + return -ECONNREFUSED; + } + + sk->sk_state = WANSOCK_DISCONNECTED; + sock->state = SS_UNCONNECTED; + + if (addr_len != sizeof(struct wan_sockaddr_ll)) + return -EINVAL; + + if (addr->sll_family != AF_WANPIPE) + return -EINVAL; + + if ((dev = dev_get_by_index(sk->sk_bound_dev_if)) == NULL) + return -ENETUNREACH; + + dev_put(dev); + + if (!sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ + return -EINVAL; + + sock->state = SS_CONNECTING; + sk->sk_state = WANSOCK_CONNECTING; + + if (!wp_sk(sk)->mbox) { + if (wp_sk (sk)->svc) + return -EINVAL; + else { + int err; + if ((err=set_ioctl_cmd(sk,NULL)) < 0) + return err; + } + } + + if ((err=wanpipe_exec_cmd(sk, X25_PLACE_CALL,flags)) != 0){ + sock->state = SS_UNCONNECTED; + sk->sk_state = WANSOCK_CONNECTED; + return err; + } + + if (sk->sk_state != WANSOCK_CONNECTED && (flags & O_NONBLOCK)) { + return 0; + } + + if (sk->sk_state != WANSOCK_CONNECTED) { + sock->state = SS_UNCONNECTED; + return -ECONNREFUSED; + } + + sock->state = SS_CONNECTED; + return 0; +} + +struct proto_ops wanpipe_ops = { + .family = PF_WANPIPE, + .owner = THIS_MODULE, + .release = wanpipe_release, + .bind = wanpipe_bind, + .connect = wanpipe_connect, + .socketpair = sock_no_socketpair, + .accept = wanpipe_accept, + .getname = wanpipe_getname, + .poll = wanpipe_poll, + .ioctl = wanpipe_ioctl, + .listen = wanpipe_listen, + .shutdown = sock_no_shutdown, + .setsockopt = sock_no_setsockopt, + .getsockopt = sock_no_getsockopt, + .sendmsg = wanpipe_sendmsg, + .recvmsg = wanpipe_recvmsg +}; + +static struct net_proto_family wanpipe_family_ops = { + .family = PF_WANPIPE, + .create = wanpipe_create, + .owner = THIS_MODULE, +}; + +struct notifier_block wanpipe_netdev_notifier = { + .notifier_call = wanpipe_notifier, +}; + + +#ifdef MODULE +void cleanup_module(void) +{ + printk(KERN_INFO "wansock: Cleaning up \n"); + unregister_netdevice_notifier(&wanpipe_netdev_notifier); + sock_unregister(PF_WANPIPE); + proto_unregister(&wanpipe_proto); +} + +int init_module(void) +{ + int rc; + + printk(KERN_INFO "wansock: Registering Socket \n"); + + rc = proto_register(&wanpipe_proto, 0); + if (rc != 0) + goto out; + + sock_register(&wanpipe_family_ops); + register_netdevice_notifier(&wanpipe_netdev_notifier); +out: + return rc; +} +#endif +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_WANPIPE); diff --git a/net/wanrouter/patchlevel b/net/wanrouter/patchlevel new file mode 100644 index 000000000000..c043eea7767e --- /dev/null +++ b/net/wanrouter/patchlevel @@ -0,0 +1 @@ +2.2.1 diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c new file mode 100644 index 000000000000..956c17f6c548 --- /dev/null +++ b/net/wanrouter/wanmain.c @@ -0,0 +1,888 @@ +/***************************************************************************** +* wanmain.c WAN Multiprotocol Router Module. Main code. 
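[Editorial sketch] The #ifdef MODULE init_module()/cleanup_module() pair above predates the module_init()/module_exit() convention that wanmain.c below already uses. A hedged sketch of the same registration sequence in that style, with error unwinding added, is shown here; wanpipe_proto, wanpipe_family_ops and wanpipe_netdev_notifier are the objects defined in the code above, everything else is illustrative.

#include <linux/init.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <net/sock.h>

static int __init wanpipe_sock_init(void)
{
	int rc;

	rc = proto_register(&wanpipe_proto, 0);
	if (rc)
		return rc;

	rc = sock_register(&wanpipe_family_ops);
	if (rc) {
		proto_unregister(&wanpipe_proto);
		return rc;
	}

	register_netdevice_notifier(&wanpipe_netdev_notifier);
	return 0;
}

static void __exit wanpipe_sock_exit(void)
{
	unregister_netdevice_notifier(&wanpipe_netdev_notifier);
	sock_unregister(PF_WANPIPE);
	proto_unregister(&wanpipe_proto);
}

module_init(wanpipe_sock_init);
module_exit(wanpipe_sock_exit);

The point of the unwinding is simply that a failed sock_register() should not leave the proto slab cache registered, something the original init_module() above does not guard against.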
+* +* This module is completely hardware-independent and provides +* the following common services for the WAN Link Drivers: +* o WAN device managenment (registering, unregistering) +* o Network interface management +* o Physical connection management (dial-up, incoming calls) +* o Logical connection management (switched virtual circuits) +* o Protocol encapsulation/decapsulation +* +* Author: Gideon Hack +* +* Copyright: (c) 1995-1999 Sangoma Technologies Inc. +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version +* 2 of the License, or (at your option) any later version. +* ============================================================================ +* Nov 24, 2000 Nenad Corbic Updated for 2.4.X kernels +* Nov 07, 2000 Nenad Corbic Fixed the Mulit-Port PPP for kernels 2.2.16 and +* greater. +* Aug 2, 2000 Nenad Corbic Block the Multi-Port PPP from running on +* kernels 2.2.16 or greater. The SyncPPP +* has changed. +* Jul 13, 2000 Nenad Corbic Added SyncPPP support +* Added extra debugging in device_setup(). +* Oct 01, 1999 Gideon Hack Update for s514 PCI card +* Dec 27, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) +* Jan 16, 1997 Gene Kozin router_devlist made public +* Jan 31, 1997 Alan Cox Hacked it about a bit for 2.1 +* Jun 27, 1997 Alan Cox realigned with vendor code +* Oct 15, 1997 Farhan Thawar changed wan_encapsulate to add a pad byte of 0 +* Apr 20, 1998 Alan Cox Fixed 2.1 symbols +* May 17, 1998 K. Baranowski Fixed SNAP encapsulation in wan_encapsulate +* Dec 15, 1998 Arnaldo Melo support for firmwares of up to 128000 bytes +* check wandev->setup return value +* Dec 22, 1998 Arnaldo Melo vmalloc/vfree used in device_setup to allocate +* kernel memory and copy configuration data to +* kernel space (for big firmwares) +* Jun 02, 1999 Gideon Hack Updates for Linux 2.0.X and 2.2.X kernels. +*****************************************************************************/ + +#include +#include /* offsetof(), etc. */ +#include /* return codes */ +#include +#include +#include /* support for loadable modules */ +#include /* kmalloc(), kfree() */ +#include /* verify_area(), etc. */ +#include /* inline mem*, str* functions */ + +#include /* htons(), etc. */ +#include /* WAN router API definitions */ + +#include /* vmalloc, vfree */ +#include /* copy_to/from_user */ +#include /* __initfunc et al. 
*/ +#include + +#define KMEM_SAFETYZONE 8 + +/***********FOR DEBUGGING PURPOSES********************************************* +static void * dbg_kmalloc(unsigned int size, int prio, int line) { + int i = 0; + void * v = kmalloc(size+sizeof(unsigned int)+2*KMEM_SAFETYZONE*8,prio); + char * c1 = v; + c1 += sizeof(unsigned int); + *((unsigned int *)v) = size; + + for (i = 0; i < KMEM_SAFETYZONE; i++) { + c1[0] = 'D'; c1[1] = 'E'; c1[2] = 'A'; c1[3] = 'D'; + c1[4] = 'B'; c1[5] = 'E'; c1[6] = 'E'; c1[7] = 'F'; + c1 += 8; + } + c1 += size; + for (i = 0; i < KMEM_SAFETYZONE; i++) { + c1[0] = 'M'; c1[1] = 'U'; c1[2] = 'N'; c1[3] = 'G'; + c1[4] = 'W'; c1[5] = 'A'; c1[6] = 'L'; c1[7] = 'L'; + c1 += 8; + } + v = ((char *)v) + sizeof(unsigned int) + KMEM_SAFETYZONE*8; + printk(KERN_INFO "line %d kmalloc(%d,%d) = %p\n",line,size,prio,v); + return v; +} +static void dbg_kfree(void * v, int line) { + unsigned int * sp = (unsigned int *)(((char *)v) - (sizeof(unsigned int) + KMEM_SAFETYZONE*8)); + unsigned int size = *sp; + char * c1 = ((char *)v) - KMEM_SAFETYZONE*8; + int i = 0; + for (i = 0; i < KMEM_SAFETYZONE; i++) { + if ( c1[0] != 'D' || c1[1] != 'E' || c1[2] != 'A' || c1[3] != 'D' + || c1[4] != 'B' || c1[5] != 'E' || c1[6] != 'E' || c1[7] != 'F') { + printk(KERN_INFO "kmalloced block at %p has been corrupted (underrun)!\n",v); + printk(KERN_INFO " %4x: %2x %2x %2x %2x %2x %2x %2x %2x\n", i*8, + c1[0],c1[1],c1[2],c1[3],c1[4],c1[5],c1[6],c1[7] ); + } + c1 += 8; + } + c1 += size; + for (i = 0; i < KMEM_SAFETYZONE; i++) { + if ( c1[0] != 'M' || c1[1] != 'U' || c1[2] != 'N' || c1[3] != 'G' + || c1[4] != 'W' || c1[5] != 'A' || c1[6] != 'L' || c1[7] != 'L' + ) { + printk(KERN_INFO "kmalloced block at %p has been corrupted (overrun):\n",v); + printk(KERN_INFO " %4x: %2x %2x %2x %2x %2x %2x %2x %2x\n", i*8, + c1[0],c1[1],c1[2],c1[3],c1[4],c1[5],c1[6],c1[7] ); + } + c1 += 8; + } + printk(KERN_INFO "line %d kfree(%p)\n",line,v); + v = ((char *)v) - (sizeof(unsigned int) + KMEM_SAFETYZONE*8); + kfree(v); +} + +#define kmalloc(x,y) dbg_kmalloc(x,y,__LINE__) +#define kfree(x) dbg_kfree(x,__LINE__) +*****************************************************************************/ + +/* + * Function Prototypes + */ + +/* + * WAN device IOCTL handlers + */ + +static int wanrouter_device_setup(struct wan_device *wandev, + wandev_conf_t __user *u_conf); +static int wanrouter_device_stat(struct wan_device *wandev, + wandev_stat_t __user *u_stat); +static int wanrouter_device_shutdown(struct wan_device *wandev); +static int wanrouter_device_new_if(struct wan_device *wandev, + wanif_conf_t __user *u_conf); +static int wanrouter_device_del_if(struct wan_device *wandev, + char __user *u_name); + +/* + * Miscellaneous + */ + +static struct wan_device *wanrouter_find_device(char *name); +static int wanrouter_delete_interface(struct wan_device *wandev, char *name); +void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); +void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags); + + + +/* + * Global Data + */ + +static char wanrouter_fullname[] = "Sangoma WANPIPE Router"; +static char wanrouter_copyright[] = "(c) 1995-2000 Sangoma Technologies Inc."; +static char wanrouter_modname[] = ROUTER_NAME; /* short module name */ +struct wan_device* wanrouter_router_devlist; /* list of registered devices */ + +/* + * Organize Unique Identifiers for encapsulation/decapsulation + */ + +static unsigned char wanrouter_oui_ether[] = { 0x00, 0x00, 0x00 }; +#if 0 +static unsigned char wanrouter_oui_802_2[] = { 0x00, 0x80, 
0xC2 }; +#endif + +static int __init wanrouter_init(void) +{ + int err; + + printk(KERN_INFO "%s v%u.%u %s\n", + wanrouter_fullname, ROUTER_VERSION, ROUTER_RELEASE, + wanrouter_copyright); + + err = wanrouter_proc_init(); + if (err) + printk(KERN_INFO "%s: can't create entry in proc filesystem!\n", + wanrouter_modname); + + return err; +} + +static void __exit wanrouter_cleanup (void) +{ + wanrouter_proc_cleanup(); +} + +/* + * This is just plain dumb. We should move the bugger to drivers/net/wan, + * slap it first in directory and make it module_init(). The only reason + * for subsys_initcall() here is that net goes after drivers (why, BTW?) + */ +subsys_initcall(wanrouter_init); +module_exit(wanrouter_cleanup); + +/* + * Kernel APIs + */ + +/* + * Register WAN device. + * o verify device credentials + * o create an entry for the device in the /proc/net/router directory + * o initialize internally maintained fields of the wan_device structure + * o link device data space to a singly-linked list + * o if it's the first device, then start kernel 'thread' + * o increment module use count + * + * Return: + * 0 Ok + * < 0 error. + * + * Context: process + */ + + +int register_wan_device(struct wan_device *wandev) +{ + int err, namelen; + + if ((wandev == NULL) || (wandev->magic != ROUTER_MAGIC) || + (wandev->name == NULL)) + return -EINVAL; + + namelen = strlen(wandev->name); + if (!namelen || (namelen > WAN_DRVNAME_SZ)) + return -EINVAL; + + if (wanrouter_find_device(wandev->name)) + return -EEXIST; + +#ifdef WANDEBUG + printk(KERN_INFO "%s: registering WAN device %s\n", + wanrouter_modname, wandev->name); +#endif + + /* + * Register /proc directory entry + */ + err = wanrouter_proc_add(wandev); + if (err) { + printk(KERN_INFO + "%s: can't create /proc/net/router/%s entry!\n", + wanrouter_modname, wandev->name); + return err; + } + + /* + * Initialize fields of the wan_device structure maintained by the + * router and update local data. + */ + + wandev->ndev = 0; + wandev->dev = NULL; + wandev->next = wanrouter_router_devlist; + wanrouter_router_devlist = wandev; + return 0; +} + +/* + * Unregister WAN device. + * o shut down device + * o unlink device data space from the linked list + * o delete device entry in the /proc/net/router directory + * o decrement module use count + * + * Return: 0 Ok + * <0 error. + * Context: process + */ + + +int unregister_wan_device(char *name) +{ + struct wan_device *wandev, *prev; + + if (name == NULL) + return -EINVAL; + + for (wandev = wanrouter_router_devlist, prev = NULL; + wandev && strcmp(wandev->name, name); + prev = wandev, wandev = wandev->next) + ; + if (wandev == NULL) + return -ENODEV; + +#ifdef WANDEBUG + printk(KERN_INFO "%s: unregistering WAN device %s\n", + wanrouter_modname, name); +#endif + + if (wandev->state != WAN_UNCONFIGURED) + wanrouter_device_shutdown(wandev); + + if (prev) + prev->next = wandev->next; + else + wanrouter_router_devlist = wandev->next; + + wanrouter_proc_delete(wandev); + return 0; +} + +/* + * Encapsulate packet. + * + * Return: encapsulation header size + * < 0 - unsupported Ethertype + * + * Notes: + * 1. This function may be called on interrupt context. 
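[Editorial sketch] register_wan_device()/unregister_wan_device() above manage the router's device list as a plain singly linked list: registration prepends after a duplicate-name check, unregistration walks the list keeping a prev pointer and unlinks by name. A toy, userspace-runnable analog of that bookkeeping (names such as toy_dev and "wanpipe1" are invented for illustration):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_dev {
	char name[16];
	struct toy_dev *next;
};

static struct toy_dev *devlist;	/* like wanrouter_router_devlist */

static struct toy_dev *toy_find(const char *name)
{
	struct toy_dev *d;

	for (d = devlist; d && strcmp(d->name, name); d = d->next)
		;
	return d;
}

static int toy_register(const char *name)
{
	struct toy_dev *d;

	if (toy_find(name))
		return -1;		/* -EEXIST in the kernel version */
	d = calloc(1, sizeof(*d));
	if (!d)
		return -1;
	snprintf(d->name, sizeof(d->name), "%s", name);
	d->next = devlist;		/* prepend, like wandev->next = devlist */
	devlist = d;
	return 0;
}

static int toy_unregister(const char *name)
{
	struct toy_dev *d = devlist, *prev = NULL;

	while (d && strcmp(d->name, name)) {
		prev = d;
		d = d->next;
	}
	if (!d)
		return -1;		/* -ENODEV */
	if (prev)
		prev->next = d->next;
	else
		devlist = d->next;
	free(d);
	return 0;
}

int main(void)
{
	toy_register("wanpipe1");
	toy_register("wanpipe2");
	toy_unregister("wanpipe1");
	printf("head of list: %s\n", devlist ? devlist->name : "(empty)");
	return 0;
}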
+ */ + + +int wanrouter_encapsulate(struct sk_buff *skb, struct net_device *dev, + unsigned short type) +{ + int hdr_len = 0; + + switch (type) { + case ETH_P_IP: /* IP datagram encapsulation */ + hdr_len += 1; + skb_push(skb, 1); + skb->data[0] = NLPID_IP; + break; + + case ETH_P_IPX: /* SNAP encapsulation */ + case ETH_P_ARP: + hdr_len += 7; + skb_push(skb, 7); + skb->data[0] = 0; + skb->data[1] = NLPID_SNAP; + memcpy(&skb->data[2], wanrouter_oui_ether, + sizeof(wanrouter_oui_ether)); + *((unsigned short*)&skb->data[5]) = htons(type); + break; + + default: /* Unknown packet type */ + printk(KERN_INFO + "%s: unsupported Ethertype 0x%04X on interface %s!\n", + wanrouter_modname, type, dev->name); + hdr_len = -EINVAL; + } + return hdr_len; +} + + +/* + * Decapsulate packet. + * + * Return: Ethertype (in network order) + * 0 unknown encapsulation + * + * Notes: + * 1. This function may be called on interrupt context. + */ + + +unsigned short wanrouter_type_trans(struct sk_buff *skb, struct net_device *dev) +{ + int cnt = skb->data[0] ? 0 : 1; /* there may be a pad present */ + unsigned short ethertype; + + switch (skb->data[cnt]) { + case NLPID_IP: /* IP datagramm */ + ethertype = htons(ETH_P_IP); + cnt += 1; + break; + + case NLPID_SNAP: /* SNAP encapsulation */ + if (memcmp(&skb->data[cnt + 1], wanrouter_oui_ether, + sizeof(wanrouter_oui_ether))){ + printk(KERN_INFO + "%s: unsupported SNAP OUI %02X-%02X-%02X " + "on interface %s!\n", wanrouter_modname, + skb->data[cnt+1], skb->data[cnt+2], + skb->data[cnt+3], dev->name); + return 0; + } + ethertype = *((unsigned short*)&skb->data[cnt+4]); + cnt += 6; + break; + + /* add other protocols, e.g. CLNP, ESIS, ISIS, if needed */ + + default: + printk(KERN_INFO + "%s: unsupported NLPID 0x%02X on interface %s!\n", + wanrouter_modname, skb->data[cnt], dev->name); + return 0; + } + skb->protocol = ethertype; + skb->pkt_type = PACKET_HOST; /* Physically point to point */ + skb_pull(skb, cnt); + skb->mac.raw = skb->data; + return ethertype; +} + + +/* + * WAN device IOCTL. + * o find WAN device associated with this node + * o execute requested action or pass command to the device driver + */ + +int wanrouter_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err = 0; + struct proc_dir_entry *dent; + struct wan_device *wandev; + void __user *data = (void __user *)arg; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if ((cmd >> 8) != ROUTER_IOCTL) + return -EINVAL; + + dent = PDE(inode); + if ((dent == NULL) || (dent->data == NULL)) + return -EINVAL; + + wandev = dent->data; + if (wandev->magic != ROUTER_MAGIC) + return -EINVAL; + + switch (cmd) { + case ROUTER_SETUP: + err = wanrouter_device_setup(wandev, data); + break; + + case ROUTER_DOWN: + err = wanrouter_device_shutdown(wandev); + break; + + case ROUTER_STAT: + err = wanrouter_device_stat(wandev, data); + break; + + case ROUTER_IFNEW: + err = wanrouter_device_new_if(wandev, data); + break; + + case ROUTER_IFDEL: + err = wanrouter_device_del_if(wandev, data); + break; + + case ROUTER_IFSTAT: + break; + + default: + if ((cmd >= ROUTER_USER) && + (cmd <= ROUTER_USER_MAX) && + wandev->ioctl) + err = wandev->ioctl(wandev, cmd, arg); + else err = -EINVAL; + } + return err; +} + +/* + * WAN Driver IOCTL Handlers + */ + +/* + * Setup WAN link device. 
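[Editorial sketch] wanrouter_encapsulate() above prepends a single NLPID byte for IP and a 7-byte pad + NLPID_SNAP + OUI + Ethertype header for IPX/ARP; wanrouter_type_trans() reverses the process. The userspace sketch below builds the same byte layout; the NLPID values (0xCC for IP, 0x80 for SNAP) come from ISO/IEC TR 9577 and are expected to match the wanrouter header, and memcpy is used instead of the kernel's unaligned pointer cast.

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>	/* htons() */

#define NLPID_IP   0xCC	/* ISO/IEC TR 9577 NLPID for IP */
#define NLPID_SNAP 0x80	/* NLPID announcing a SNAP header */

/* Build the header wanrouter_encapsulate() prepends.
 * Returns the header length, or -1 for an unsupported Ethertype. */
static int build_wan_header(unsigned char *buf, unsigned short ethertype)
{
	static const unsigned char oui_ether[3] = { 0x00, 0x00, 0x00 };
	unsigned short net_type;

	switch (ethertype) {
	case 0x0800:			/* ETH_P_IP: one-byte NLPID */
		buf[0] = NLPID_IP;
		return 1;
	case 0x8137:			/* ETH_P_IPX */
	case 0x0806:			/* ETH_P_ARP: pad + SNAP + OUI + type */
		buf[0] = 0;		/* pad byte */
		buf[1] = NLPID_SNAP;
		memcpy(&buf[2], oui_ether, sizeof(oui_ether));
		net_type = htons(ethertype);
		memcpy(&buf[5], &net_type, 2);
		return 7;
	default:
		return -1;
	}
}

int main(void)
{
	unsigned char hdr[8];
	int n = build_wan_header(hdr, 0x0806);

	printf("ARP header is %d bytes, starts 0x%02x 0x%02x\n", n, hdr[0], hdr[1]);
	return 0;
}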
+ * o verify user address space + * o allocate kernel memory and copy configuration data to kernel space + * o if configuration data includes extension, copy it to kernel space too + * o call driver's setup() entry point + */ + +static int wanrouter_device_setup(struct wan_device *wandev, + wandev_conf_t __user *u_conf) +{ + void *data = NULL; + wandev_conf_t *conf; + int err = -EINVAL; + + if (wandev->setup == NULL) { /* Nothing to do ? */ + printk(KERN_INFO "%s: ERROR, No setup script: wandev->setup()\n", + wandev->name); + return 0; + } + + conf = kmalloc(sizeof(wandev_conf_t), GFP_KERNEL); + if (conf == NULL){ + printk(KERN_INFO "%s: ERROR, Failed to allocate kernel memory !\n", + wandev->name); + return -ENOBUFS; + } + + if (copy_from_user(conf, u_conf, sizeof(wandev_conf_t))) { + printk(KERN_INFO "%s: Failed to copy user config data to kernel space!\n", + wandev->name); + kfree(conf); + return -EFAULT; + } + + if (conf->magic != ROUTER_MAGIC) { + kfree(conf); + printk(KERN_INFO "%s: ERROR, Invalid MAGIC Number\n", + wandev->name); + return -EINVAL; + } + + if (conf->data_size && conf->data) { + if (conf->data_size > 128000 || conf->data_size < 0) { + printk(KERN_INFO + "%s: ERROR, Invalid firmware data size %i !\n", + wandev->name, conf->data_size); + kfree(conf); + return -EINVAL; + } + + data = vmalloc(conf->data_size); + if (!data) { + printk(KERN_INFO + "%s: ERROR, Faild allocate kernel memory !\n", + wandev->name); + kfree(conf); + return -ENOBUFS; + } + if (!copy_from_user(data, conf->data, conf->data_size)) { + conf->data = data; + err = wandev->setup(wandev, conf); + } else { + printk(KERN_INFO + "%s: ERROR, Faild to copy from user data !\n", + wandev->name); + err = -EFAULT; + } + vfree(data); + } else { + printk(KERN_INFO + "%s: ERROR, No firmware found ! Firmware size = %i !\n", + wandev->name, conf->data_size); + } + + kfree(conf); + return err; +} + +/* + * Shutdown WAN device. + * o delete all not opened logical channels for this device + * o call driver's shutdown() entry point + */ + +static int wanrouter_device_shutdown(struct wan_device *wandev) +{ + struct net_device *dev; + int err=0; + + if (wandev->state == WAN_UNCONFIGURED) + return 0; + + printk(KERN_INFO "\n%s: Shutting Down!\n",wandev->name); + + for (dev = wandev->dev; dev;) { + err = wanrouter_delete_interface(wandev, dev->name); + if (err) + return err; + /* The above function deallocates the current dev + * structure. Therefore, we cannot use dev->priv + * as the next element: wandev->dev points to the + * next element */ + dev = wandev->dev; + } + + if (wandev->ndev) + return -EBUSY; /* there are opened interfaces */ + + if (wandev->shutdown) + err=wandev->shutdown(wandev); + + return err; +} + +/* + * Get WAN device status & statistics. + */ + +static int wanrouter_device_stat(struct wan_device *wandev, + wandev_stat_t __user *u_stat) +{ + wandev_stat_t stat; + + memset(&stat, 0, sizeof(stat)); + + /* Ask device driver to update device statistics */ + if ((wandev->state != WAN_UNCONFIGURED) && wandev->update) + wandev->update(wandev); + + /* Fill out structure */ + stat.ndev = wandev->ndev; + stat.state = wandev->state; + + if (copy_to_user(u_stat, &stat, sizeof(stat))) + return -EFAULT; + + return 0; +} + +/* + * Create new WAN interface. 
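[Editorial sketch] wanrouter_device_setup() above receives its wandev_conf_t (and the optional firmware image referenced by conf->data/data_size) through the ROUTER_SETUP ioctl issued on the device's /proc/net/router entry, which is where wanrouter_ioctl() dispatches from. A hypothetical userspace call site follows; it assumes <linux/wanrouter.h> exports wandev_conf_t, ROUTER_MAGIC and ROUTER_SETUP to userspace, the proc entry name "wanpipe1" is invented, and loading the firmware image into memory is elided.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/wanrouter.h>	/* assumed: wandev_conf_t, ROUTER_SETUP, ROUTER_MAGIC */

/* Hand a configuration block (plus optional firmware) to the router
 * through its /proc entry, reaching wanrouter_device_setup() above. */
static int setup_device(const char *procname, void *fw, int fw_len)
{
	wandev_conf_t conf;
	int fd, rc;

	memset(&conf, 0, sizeof(conf));
	conf.magic = ROUTER_MAGIC;	/* checked by wanrouter_device_setup() */
	conf.data = fw;			/* copied into kernel space via vmalloc() */
	conf.data_size = fw_len;	/* rejected above 128000 bytes */

	fd = open(procname, O_RDONLY);	/* e.g. "/proc/net/router/wanpipe1" (hypothetical) */
	if (fd < 0) {
		perror("open");
		return -1;
	}
	rc = ioctl(fd, ROUTER_SETUP, &conf);
	if (rc < 0)
		perror("ioctl(ROUTER_SETUP)");
	close(fd);
	return rc;
}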
+ * o verify user address space + * o copy configuration data to kernel address space + * o allocate network interface data space + * o call driver's new_if() entry point + * o make sure there is no interface name conflict + * o register network interface + */ + +static int wanrouter_device_new_if(struct wan_device *wandev, + wanif_conf_t __user *u_conf) +{ + wanif_conf_t *cnf; + struct net_device *dev = NULL; +#ifdef CONFIG_WANPIPE_MULTPPP + struct ppp_device *pppdev=NULL; +#endif + int err; + + if ((wandev->state == WAN_UNCONFIGURED) || (wandev->new_if == NULL)) + return -ENODEV; + + cnf = kmalloc(sizeof(wanif_conf_t), GFP_KERNEL); + if (!cnf) + return -ENOBUFS; + + err = -EFAULT; + if (copy_from_user(cnf, u_conf, sizeof(wanif_conf_t))) + goto out; + + err = -EINVAL; + if (cnf->magic != ROUTER_MAGIC) + goto out; + + if (cnf->config_id == WANCONFIG_MPPP) { +#ifdef CONFIG_WANPIPE_MULTPPP + pppdev = kmalloc(sizeof(struct ppp_device), GFP_KERNEL); + err = -ENOBUFS; + if (pppdev == NULL) + goto out; + memset(pppdev, 0, sizeof(struct ppp_device)); + pppdev->dev = kmalloc(sizeof(struct net_device), GFP_KERNEL); + if (pppdev->dev == NULL) { + kfree(pppdev); + err = -ENOBUFS; + goto out; + } + memset(pppdev->dev, 0, sizeof(struct net_device)); + err = wandev->new_if(wandev, (struct net_device *)pppdev, cnf); + dev = pppdev->dev; +#else + printk(KERN_INFO "%s: Wanpipe Mulit-Port PPP support has not been compiled in!\n", + wandev->name); + err = -EPROTONOSUPPORT; + goto out; +#endif + } else { + dev = kmalloc(sizeof(struct net_device), GFP_KERNEL); + err = -ENOBUFS; + if (dev == NULL) + goto out; + memset(dev, 0, sizeof(struct net_device)); + err = wandev->new_if(wandev, dev, cnf); + } + + if (!err) { + /* Register network interface. This will invoke init() + * function supplied by the driver. If device registered + * successfully, add it to the interface list. + */ + + if (dev->name == NULL) { + err = -EINVAL; + } else { + + #ifdef WANDEBUG + printk(KERN_INFO "%s: registering interface %s...\n", + wanrouter_modname, dev->name); + #endif + + err = register_netdev(dev); + if (!err) { + struct net_device *slave = NULL; + unsigned long smp_flags=0; + + lock_adapter_irq(&wandev->lock, &smp_flags); + + if (wandev->dev == NULL) { + wandev->dev = dev; + } else { + for (slave=wandev->dev; + *((struct net_device **)slave->priv); + slave = *((struct net_device **)slave->priv)); + + *((struct net_device **)slave->priv) = dev; + } + ++wandev->ndev; + + unlock_adapter_irq(&wandev->lock, &smp_flags); + err = 0; /* done !!! */ + goto out; + } + } + if (wandev->del_if) + wandev->del_if(wandev, dev); + } + + /* This code has moved from del_if() function */ + if (dev->priv) { + kfree(dev->priv); + dev->priv = NULL; + } + +#ifdef CONFIG_WANPIPE_MULTPPP + if (cnf->config_id == WANCONFIG_MPPP) + kfree(pppdev); + else + kfree(dev); +#else + /* Sync PPP is disabled */ + if (cnf->config_id != WANCONFIG_MPPP) + kfree(dev); +#endif + +out: + kfree(cnf); + return err; +} + + +/* + * Delete WAN logical channel. 
+ * o verify user address space + * o copy configuration data to kernel address space + */ + +static int wanrouter_device_del_if(struct wan_device *wandev, char __user *u_name) +{ + char name[WAN_IFNAME_SZ + 1]; + int err = 0; + + if (wandev->state == WAN_UNCONFIGURED) + return -ENODEV; + + memset(name, 0, sizeof(name)); + + if (copy_from_user(name, u_name, WAN_IFNAME_SZ)) + return -EFAULT; + + err = wanrouter_delete_interface(wandev, name); + if (err) + return err; + + /* If last interface being deleted, shutdown card + * This helps with administration at leaf nodes + * (You can tell if the person at the other end of the phone + * has an interface configured) and avoids DoS vulnerabilities + * in binary driver files - this fixes a problem with the current + * Sangoma driver going into strange states when all the network + * interfaces are deleted and the link irrecoverably disconnected. + */ + + if (!wandev->ndev && wandev->shutdown) + err = wandev->shutdown(wandev); + + return err; +} + +/* + * Miscellaneous Functions + */ + +/* + * Find WAN device by name. + * Return pointer to the WAN device data space or NULL if device not found. + */ + +static struct wan_device *wanrouter_find_device(char *name) +{ + struct wan_device *wandev; + + for (wandev = wanrouter_router_devlist; + wandev && strcmp(wandev->name, name); + wandev = wandev->next); + return wandev; +} + +/* + * Delete WAN logical channel identified by its name. + * o find logical channel by its name + * o call driver's del_if() entry point + * o unregister network interface + * o unlink channel data space from linked list of channels + * o release channel data space + * + * Return: 0 success + * -ENODEV channel not found. + * -EBUSY interface is open + * + * Note: If (force != 0), then device will be destroyed even if interface + * associated with it is open. It's caller's responsibility to make + * sure that opened interfaces are not removed! 
+ */ + +static int wanrouter_delete_interface(struct wan_device *wandev, char *name) +{ + struct net_device *dev = NULL, *prev = NULL; + unsigned long smp_flags=0; + + lock_adapter_irq(&wandev->lock, &smp_flags); + dev = wandev->dev; + prev = NULL; + while (dev && strcmp(name, dev->name)) { + struct net_device **slave = dev->priv; + prev = dev; + dev = *slave; + } + unlock_adapter_irq(&wandev->lock, &smp_flags); + + if (dev == NULL) + return -ENODEV; /* interface not found */ + + if (netif_running(dev)) + return -EBUSY; /* interface in use */ + + if (wandev->del_if) + wandev->del_if(wandev, dev); + + lock_adapter_irq(&wandev->lock, &smp_flags); + if (prev) { + struct net_device **prev_slave = prev->priv; + struct net_device **slave = dev->priv; + + *prev_slave = *slave; + } else { + struct net_device **slave = dev->priv; + wandev->dev = *slave; + } + --wandev->ndev; + unlock_adapter_irq(&wandev->lock, &smp_flags); + + printk(KERN_INFO "%s: unregistering '%s'\n", wandev->name, dev->name); + + /* Due to new interface linking method using dev->priv, + * this code has moved from del_if() function.*/ + if (dev->priv){ + kfree(dev->priv); + dev->priv=NULL; + } + + unregister_netdev(dev); + + free_netdev(dev); + + return 0; +} + +void lock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) +{ + spin_lock_irqsave(lock, *smp_flags); +} + + +void unlock_adapter_irq(spinlock_t *lock, unsigned long *smp_flags) +{ + spin_unlock_irqrestore(lock, *smp_flags); +} + +EXPORT_SYMBOL(register_wan_device); +EXPORT_SYMBOL(unregister_wan_device); +EXPORT_SYMBOL(wanrouter_encapsulate); +EXPORT_SYMBOL(wanrouter_type_trans); +EXPORT_SYMBOL(lock_adapter_irq); +EXPORT_SYMBOL(unlock_adapter_irq); + +MODULE_LICENSE("GPL"); + +/* + * End + */ diff --git a/net/wanrouter/wanproc.c b/net/wanrouter/wanproc.c new file mode 100644 index 000000000000..c28ba5a47209 --- /dev/null +++ b/net/wanrouter/wanproc.c @@ -0,0 +1,381 @@ +/***************************************************************************** +* wanproc.c WAN Router Module. /proc filesystem interface. +* +* This module is completely hardware-independent and provides +* access to the router using Linux /proc filesystem. +* +* Author: Gideon Hack +* +* Copyright: (c) 1995-1999 Sangoma Technologies Inc. +* +* This program is free software; you can redistribute it and/or +* modify it under the terms of the GNU General Public License +* as published by the Free Software Foundation; either version +* 2 of the License, or (at your option) any later version. +* ============================================================================ +* Jun 02, 1999 Gideon Hack Updates for Linux 2.2.X kernels. +* Jun 29, 1997 Alan Cox Merged with 1.0.3 vendor code +* Jan 29, 1997 Gene Kozin v1.0.1. Implemented /proc read routines +* Jan 30, 1997 Alan Cox Hacked around for 2.1 +* Dec 13, 1996 Gene Kozin Initial version (based on Sangoma's WANPIPE) +*****************************************************************************/ + +#include +#include /* __initfunc et al. */ +#include /* offsetof(), etc. */ +#include /* return codes */ +#include +#include +#include /* WAN router API definitions */ +#include +#include + +#include + +#define PROC_STATS_FORMAT "%30s: %12lu\n" + +/****** Defines and Macros **************************************************/ + +#define PROT_DECODE(prot) ((prot == WANCONFIG_FR) ? " FR" :\ + (prot == WANCONFIG_X25) ? " X25" : \ + (prot == WANCONFIG_PPP) ? " PPP" : \ + (prot == WANCONFIG_CHDLC) ? " CHDLC": \ + (prot == WANCONFIG_MPPP) ? 
" MPPP" : \ + " Unknown" ) + +/****** Function Prototypes *************************************************/ + +#ifdef CONFIG_PROC_FS + +/* Miscellaneous */ + +/* + * Structures for interfacing with the /proc filesystem. + * Router creates its own directory /proc/net/router with the folowing + * entries: + * config device configuration + * status global device statistics + * entry for each WAN device + */ + +/* + * Generic /proc/net/router/ file and inode operations + */ + +/* + * /proc/net/router + */ + +static struct proc_dir_entry *proc_router; + +/* Strings */ + +/* + * Interface functions + */ + +/****** Proc filesystem entry points ****************************************/ + +/* + * Iterator + */ +static void *r_start(struct seq_file *m, loff_t *pos) +{ + struct wan_device *wandev; + loff_t l = *pos; + + lock_kernel(); + if (!l--) + return SEQ_START_TOKEN; + for (wandev = wanrouter_router_devlist; l-- && wandev; + wandev = wandev->next) + ; + return wandev; +} + +static void *r_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct wan_device *wandev = v; + (*pos)++; + return (v == SEQ_START_TOKEN) ? wanrouter_router_devlist : wandev->next; +} + +static void r_stop(struct seq_file *m, void *v) +{ + unlock_kernel(); +} + +static int config_show(struct seq_file *m, void *v) +{ + struct wan_device *p = v; + if (v == SEQ_START_TOKEN) { + seq_puts(m, "Device name | port |IRQ|DMA| mem.addr |" + "mem.size|option1|option2|option3|option4\n"); + return 0; + } + if (!p->state) + return 0; + seq_printf(m, "%-15s|0x%-4X|%3u|%3u| 0x%-8lX |0x%-6X|%7u|%7u|%7u|%7u\n", + p->name, p->ioport, p->irq, p->dma, p->maddr, p->msize, + p->hw_opt[0], p->hw_opt[1], p->hw_opt[2], p->hw_opt[3]); + return 0; +} + +static int status_show(struct seq_file *m, void *v) +{ + struct wan_device *p = v; + if (v == SEQ_START_TOKEN) { + seq_puts(m, "Device name |protocol|station|interface|" + "clocking|baud rate| MTU |ndev|link state\n"); + return 0; + } + if (!p->state) + return 0; + seq_printf(m, "%-15s|%-8s| %-7s| %-9s|%-8s|%9u|%5u|%3u |", + p->name, + PROT_DECODE(p->config_id), + p->config_id == WANCONFIG_FR ? + (p->station ? "Node" : "CPE") : + (p->config_id == WANCONFIG_X25 ? + (p->station ? "DCE" : "DTE") : + ("N/A")), + p->interface ? "V.35" : "RS-232", + p->clocking ? 
"internal" : "external", + p->bps, + p->mtu, + p->ndev); + + switch (p->state) { + case WAN_UNCONFIGURED: + seq_printf(m, "%-12s\n", "unconfigured"); + break; + case WAN_DISCONNECTED: + seq_printf(m, "%-12s\n", "disconnected"); + break; + case WAN_CONNECTING: + seq_printf(m, "%-12s\n", "connecting"); + break; + case WAN_CONNECTED: + seq_printf(m, "%-12s\n", "connected"); + break; + default: + seq_printf(m, "%-12s\n", "invalid"); + break; + } + return 0; +} + +static struct seq_operations config_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = config_show, +}; + +static struct seq_operations status_op = { + .start = r_start, + .next = r_next, + .stop = r_stop, + .show = status_show, +}; + +static int config_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &config_op); +} + +static int status_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &status_op); +} + +static struct file_operations config_fops = { + .owner = THIS_MODULE, + .open = config_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations status_fops = { + .owner = THIS_MODULE, + .open = status_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int wandev_show(struct seq_file *m, void *v) +{ + struct wan_device *wandev = m->private; + + if (wandev->magic != ROUTER_MAGIC) + return 0; + + if (!wandev->state) { + seq_puts(m, "device is not configured!\n"); + return 0; + } + + /* Update device statistics */ + if (wandev->update) { + int err = wandev->update(wandev); + if (err == -EAGAIN) { + seq_puts(m, "Device is busy!\n"); + return 0; + } + if (err) { + seq_puts(m, "Device is not configured!\n"); + return 0; + } + } + + seq_printf(m, PROC_STATS_FORMAT, + "total packets received", wandev->stats.rx_packets); + seq_printf(m, PROC_STATS_FORMAT, + "total packets transmitted", wandev->stats.tx_packets); + seq_printf(m, PROC_STATS_FORMAT, + "total bytes received", wandev->stats.rx_bytes); + seq_printf(m, PROC_STATS_FORMAT, + "total bytes transmitted", wandev->stats.tx_bytes); + seq_printf(m, PROC_STATS_FORMAT, + "bad packets received", wandev->stats.rx_errors); + seq_printf(m, PROC_STATS_FORMAT, + "packet transmit problems", wandev->stats.tx_errors); + seq_printf(m, PROC_STATS_FORMAT, + "received frames dropped", wandev->stats.rx_dropped); + seq_printf(m, PROC_STATS_FORMAT, + "transmit frames dropped", wandev->stats.tx_dropped); + seq_printf(m, PROC_STATS_FORMAT, + "multicast packets received", wandev->stats.multicast); + seq_printf(m, PROC_STATS_FORMAT, + "transmit collisions", wandev->stats.collisions); + seq_printf(m, PROC_STATS_FORMAT, + "receive length errors", wandev->stats.rx_length_errors); + seq_printf(m, PROC_STATS_FORMAT, + "receiver overrun errors", wandev->stats.rx_over_errors); + seq_printf(m, PROC_STATS_FORMAT, + "CRC errors", wandev->stats.rx_crc_errors); + seq_printf(m, PROC_STATS_FORMAT, + "frame format errors (aborts)", wandev->stats.rx_frame_errors); + seq_printf(m, PROC_STATS_FORMAT, + "receiver fifo overrun", wandev->stats.rx_fifo_errors); + seq_printf(m, PROC_STATS_FORMAT, + "receiver missed packet", wandev->stats.rx_missed_errors); + seq_printf(m, PROC_STATS_FORMAT, + "aborted frames transmitted", wandev->stats.tx_aborted_errors); + return 0; +} + +static int wandev_open(struct inode *inode, struct file *file) +{ + return single_open(file, wandev_show, PDE(inode)->data); +} + +static struct file_operations wandev_fops = { + .owner = THIS_MODULE, + .open = 
wandev_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .ioctl = wanrouter_ioctl, +}; + +/* + * Initialize router proc interface. + */ + +int __init wanrouter_proc_init(void) +{ + struct proc_dir_entry *p; + proc_router = proc_mkdir(ROUTER_NAME, proc_net); + if (!proc_router) + goto fail; + + p = create_proc_entry("config", S_IRUGO, proc_router); + if (!p) + goto fail_config; + p->proc_fops = &config_fops; + p = create_proc_entry("status", S_IRUGO, proc_router); + if (!p) + goto fail_stat; + p->proc_fops = &status_fops; + return 0; +fail_stat: + remove_proc_entry("config", proc_router); +fail_config: + remove_proc_entry(ROUTER_NAME, proc_net); +fail: + return -ENOMEM; +} + +/* + * Clean up router proc interface. + */ + +void wanrouter_proc_cleanup(void) +{ + remove_proc_entry("config", proc_router); + remove_proc_entry("status", proc_router); + remove_proc_entry(ROUTER_NAME, proc_net); +} + +/* + * Add directory entry for WAN device. + */ + +int wanrouter_proc_add(struct wan_device* wandev) +{ + if (wandev->magic != ROUTER_MAGIC) + return -EINVAL; + + wandev->dent = create_proc_entry(wandev->name, S_IRUGO, proc_router); + if (!wandev->dent) + return -ENOMEM; + wandev->dent->proc_fops = &wandev_fops; + wandev->dent->data = wandev; + return 0; +} + +/* + * Delete directory entry for WAN device. + */ +int wanrouter_proc_delete(struct wan_device* wandev) +{ + if (wandev->magic != ROUTER_MAGIC) + return -EINVAL; + remove_proc_entry(wandev->name, proc_router); + return 0; +} + +#else + +/* + * No /proc - output stubs + */ + +int __init wanrouter_proc_init(void) +{ + return 0; +} + +void wanrouter_proc_cleanup(void) +{ +} + +int wanrouter_proc_add(struct wan_device *wandev) +{ + return 0; +} + +int wanrouter_proc_delete(struct wan_device *wandev) +{ + return 0; +} + +#endif + +/* + * End + */ + diff --git a/net/x25/Makefile b/net/x25/Makefile new file mode 100644 index 000000000000..587a71aa411d --- /dev/null +++ b/net/x25/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the Linux X.25 Packet layer. +# + +obj-$(CONFIG_X25) += x25.o + +x25-y := af_x25.o x25_dev.o x25_facilities.o x25_in.o \ + x25_link.o x25_out.o x25_route.o x25_subr.o \ + x25_timer.o x25_proc.o +x25-$(CONFIG_SYSCTL) += sysctl_net_x25.o diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c new file mode 100644 index 000000000000..2a24b243b841 --- /dev/null +++ b/net/x25/af_x25.c @@ -0,0 +1,1435 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor Centralised disconnect handling. + * New timer architecture. + * 2000-03-11 Henner Eisen MSG_EOR handling more POSIX compliant. + * 2000-03-22 Daniela Squassoni Allowed disabling/enabling of + * facilities negotiation and increased + * the throughput upper limit. + * 2000-08-27 Arnaldo C. Melo s/suser/capable/ + micro cleanups + * 2000-09-04 Henner Eisen Set sock->state in x25_accept(). + * Fixed x25_output() related skb leakage. + * 2000-10-02 Henner Eisen Made x25_kick() single threaded per socket. 
+ * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. + * 2000-11-14 Henner Eisen Closing datalink from NETDEV_GOING_DOWN + * 2002-10-06 Arnaldo C. Melo Get rid of cli/sti, move proc stuff to + * x25_proc.c, using seq_file + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* For TIOCINQ/OUTQ */ +#include +#include +#include + +int sysctl_x25_restart_request_timeout = X25_DEFAULT_T20; +int sysctl_x25_call_request_timeout = X25_DEFAULT_T21; +int sysctl_x25_reset_request_timeout = X25_DEFAULT_T22; +int sysctl_x25_clear_request_timeout = X25_DEFAULT_T23; +int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; + +HLIST_HEAD(x25_list); +DEFINE_RWLOCK(x25_list_lock); + +static struct proto_ops x25_proto_ops; + +static struct x25_address null_x25_address = {" "}; + +int x25_addr_ntoa(unsigned char *p, struct x25_address *called_addr, + struct x25_address *calling_addr) +{ + int called_len, calling_len; + char *called, *calling; + int i; + + called_len = (*p >> 0) & 0x0F; + calling_len = (*p >> 4) & 0x0F; + + called = called_addr->x25_addr; + calling = calling_addr->x25_addr; + p++; + + for (i = 0; i < (called_len + calling_len); i++) { + if (i < called_len) { + if (i % 2 != 0) { + *called++ = ((*p >> 0) & 0x0F) + '0'; + p++; + } else { + *called++ = ((*p >> 4) & 0x0F) + '0'; + } + } else { + if (i % 2 != 0) { + *calling++ = ((*p >> 0) & 0x0F) + '0'; + p++; + } else { + *calling++ = ((*p >> 4) & 0x0F) + '0'; + } + } + } + + *called = *calling = '\0'; + + return 1 + (called_len + calling_len + 1) / 2; +} + +int x25_addr_aton(unsigned char *p, struct x25_address *called_addr, + struct x25_address *calling_addr) +{ + unsigned int called_len, calling_len; + char *called, *calling; + int i; + + called = called_addr->x25_addr; + calling = calling_addr->x25_addr; + + called_len = strlen(called); + calling_len = strlen(calling); + + *p++ = (calling_len << 4) | (called_len << 0); + + for (i = 0; i < (called_len + calling_len); i++) { + if (i < called_len) { + if (i % 2 != 0) { + *p |= (*called++ - '0') << 0; + p++; + } else { + *p = 0x00; + *p |= (*called++ - '0') << 4; + } + } else { + if (i % 2 != 0) { + *p |= (*calling++ - '0') << 0; + p++; + } else { + *p = 0x00; + *p |= (*calling++ - '0') << 4; + } + } + } + + return 1 + (called_len + calling_len + 1) / 2; +} + +/* + * Socket removal during an interrupt is now safe. + */ +static void x25_remove_socket(struct sock *sk) +{ + write_lock_bh(&x25_list_lock); + sk_del_node_init(sk); + write_unlock_bh(&x25_list_lock); +} + +/* + * Kill all bound sockets on a dropped device. + */ +static void x25_kill_by_device(struct net_device *dev) +{ + struct sock *s; + struct hlist_node *node; + + write_lock_bh(&x25_list_lock); + + sk_for_each(s, node, &x25_list) + if (x25_sk(s)->neighbour && x25_sk(s)->neighbour->dev == dev) + x25_disconnect(s, ENETUNREACH, 0, 0); + + write_unlock_bh(&x25_list_lock); +} + +/* + * Handle device status changes. 
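[Editorial sketch] x25_addr_aton()/x25_addr_ntoa() above pack the called and calling X.121 addresses as 4-bit digits, two per byte with the high nibble first, after a length byte holding both digit counts. The userspace-runnable sketch below isolates just that nibble packing for a single digit string (the sample address "23421234" is arbitrary):

#include <stdio.h>
#include <string.h>

/* Pack a digit string two digits per byte, high nibble first, the way
 * x25_addr_aton() packs each address.  Returns the bytes written. */
static int pack_digits(unsigned char *out, const char *digits)
{
	int i, len = strlen(digits);

	for (i = 0; i < len; i++) {
		if (i % 2 == 0)
			out[i / 2] = (digits[i] - '0') << 4;
		else
			out[i / 2] |= (digits[i] - '0');
	}
	return (len + 1) / 2;
}

/* Reverse operation, as in x25_addr_ntoa(). */
static void unpack_digits(char *out, const unsigned char *in, int ndigits)
{
	int i;

	for (i = 0; i < ndigits; i++)
		out[i] = ((i % 2 == 0 ? in[i / 2] >> 4 : in[i / 2]) & 0x0F) + '0';
	out[ndigits] = '\0';
}

int main(void)
{
	unsigned char buf[8];
	char back[16];
	int n = pack_digits(buf, "23421234");

	unpack_digits(back, buf, 8);
	printf("%d packed bytes, round-trip %s\n", n, back);
	return 0;
}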
+ */ +static int x25_device_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct x25_neigh *nb; + + if (dev->type == ARPHRD_X25 +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + || dev->type == ARPHRD_ETHER +#endif + ) { + switch (event) { + case NETDEV_UP: + x25_link_device_up(dev); + break; + case NETDEV_GOING_DOWN: + nb = x25_get_neigh(dev); + if (nb) { + x25_terminate_link(nb); + x25_neigh_put(nb); + } + break; + case NETDEV_DOWN: + x25_kill_by_device(dev); + x25_route_device_down(dev); + x25_link_device_down(dev); + break; + } + } + + return NOTIFY_DONE; +} + +/* + * Add a socket to the bound sockets list. + */ +static void x25_insert_socket(struct sock *sk) +{ + write_lock_bh(&x25_list_lock); + sk_add_node(sk, &x25_list); + write_unlock_bh(&x25_list_lock); +} + +/* + * Find a socket that wants to accept the Call Request we just + * received. Check the full list for an address/cud match. + * If no cuds match return the next_best thing, an address match. + * Note: if a listening socket has cud set it must only get calls + * with matching cud. + */ +static struct sock *x25_find_listener(struct x25_address *addr, struct x25_calluserdata *calluserdata) +{ + struct sock *s; + struct sock *next_best; + struct hlist_node *node; + + read_lock_bh(&x25_list_lock); + next_best = NULL; + + sk_for_each(s, node, &x25_list) + if ((!strcmp(addr->x25_addr, + x25_sk(s)->source_addr.x25_addr) || + !strcmp(addr->x25_addr, + null_x25_address.x25_addr)) && + s->sk_state == TCP_LISTEN) { + + /* + * Found a listening socket, now check the incoming + * call user data vs this sockets call user data + */ + if (x25_check_calluserdata(&x25_sk(s)->calluserdata, calluserdata)) { + sock_hold(s); + goto found; + } + if (x25_sk(s)->calluserdata.cudlength == 0) { + next_best = s; + } + } + if (next_best) { + s = next_best; + sock_hold(s); + goto found; + } + s = NULL; +found: + read_unlock_bh(&x25_list_lock); + return s; +} + +/* + * Find a connected X.25 socket given my LCI and neighbour. + */ +static struct sock *__x25_find_socket(unsigned int lci, struct x25_neigh *nb) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &x25_list) + if (x25_sk(s)->lci == lci && x25_sk(s)->neighbour == nb) { + sock_hold(s); + goto found; + } + s = NULL; +found: + return s; +} + +struct sock *x25_find_socket(unsigned int lci, struct x25_neigh *nb) +{ + struct sock *s; + + read_lock_bh(&x25_list_lock); + s = __x25_find_socket(lci, nb); + read_unlock_bh(&x25_list_lock); + return s; +} + +/* + * Find a unique LCI for a given device. + */ +static unsigned int x25_new_lci(struct x25_neigh *nb) +{ + unsigned int lci = 1; + struct sock *sk; + + read_lock_bh(&x25_list_lock); + + while ((sk = __x25_find_socket(lci, nb)) != NULL) { + sock_put(sk); + if (++lci == 4096) { + lci = 0; + break; + } + } + + read_unlock_bh(&x25_list_lock); + return lci; +} + +/* + * Deferred destroy. + */ +void x25_destroy_socket(struct sock *); + +/* + * handler for deferred kills. + */ +static void x25_destroy_timer(unsigned long data) +{ + x25_destroy_socket((struct sock *)data); +} + +/* + * This is called from user mode and the timers. Thus it protects itself + * against interrupt users but doesn't worry about being called during + * work. Once it is removed from the queue no interrupt or bottom half + * will touch it and we are (fairly 8-) ) safe. 
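x25_new_lci() above hunts for a free logical channel by probing 1, 2, ... and giving up at 4096. A standalone sketch of the same loop, with lci_in_use() standing in for __x25_find_socket():

#include <stdbool.h>
#include <stdio.h>

static bool lci_in_use(unsigned int lci)
{
        return lci < 5;                 /* pretend channels 1-4 are taken */
}

static unsigned int new_lci(void)
{
        unsigned int lci = 1;

        while (lci_in_use(lci)) {
                if (++lci == 4096) {
                        lci = 0;        /* all 4095 channels busy */
                        break;
                }
        }
        return lci;
}

int main(void)
{
        printf("allocated LCI %u\n", new_lci());        /* prints 5 */
        return 0;
}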
+ * Not static as it's used by the timer + */ +void x25_destroy_socket(struct sock *sk) +{ + struct sk_buff *skb; + + sock_hold(sk); + lock_sock(sk); + x25_stop_heartbeat(sk); + x25_stop_timer(sk); + + x25_remove_socket(sk); + x25_clear_queues(sk); /* Flush the queues */ + + while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) { + if (skb->sk != sk) { /* A pending connection */ + /* + * Queue the unaccepted socket for death + */ + sock_set_flag(skb->sk, SOCK_DEAD); + x25_start_heartbeat(skb->sk); + x25_sk(skb->sk)->state = X25_STATE_0; + } + + kfree_skb(skb); + } + + if (atomic_read(&sk->sk_wmem_alloc) || + atomic_read(&sk->sk_rmem_alloc)) { + /* Defer: outstanding buffers */ + sk->sk_timer.expires = jiffies + 10 * HZ; + sk->sk_timer.function = x25_destroy_timer; + sk->sk_timer.data = (unsigned long)sk; + add_timer(&sk->sk_timer); + } else { + /* drop last reference so sock_put will free */ + __sock_put(sk); + } + + release_sock(sk); + sock_put(sk); +} + +/* + * Handling for system calls applied via the various interfaces to a + * X.25 socket object. + */ + +static int x25_setsockopt(struct socket *sock, int level, int optname, + char __user *optval, int optlen) +{ + int opt; + struct sock *sk = sock->sk; + int rc = -ENOPROTOOPT; + + if (level != SOL_X25 || optname != X25_QBITINCL) + goto out; + + rc = -EINVAL; + if (optlen < sizeof(int)) + goto out; + + rc = -EFAULT; + if (get_user(opt, (int __user *)optval)) + goto out; + + x25_sk(sk)->qbitincl = !!opt; + rc = 0; +out: + return rc; +} + +static int x25_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + struct sock *sk = sock->sk; + int val, len, rc = -ENOPROTOOPT; + + if (level != SOL_X25 || optname != X25_QBITINCL) + goto out; + + rc = -EFAULT; + if (get_user(len, optlen)) + goto out; + + len = min_t(unsigned int, len, sizeof(int)); + + rc = -EINVAL; + if (len < 0) + goto out; + + rc = -EFAULT; + if (put_user(len, optlen)) + goto out; + + val = x25_sk(sk)->qbitincl; + rc = copy_to_user(optval, &val, len) ? 
-EFAULT : 0; +out: + return rc; +} + +static int x25_listen(struct socket *sock, int backlog) +{ + struct sock *sk = sock->sk; + int rc = -EOPNOTSUPP; + + if (sk->sk_state != TCP_LISTEN) { + memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); + sk->sk_max_ack_backlog = backlog; + sk->sk_state = TCP_LISTEN; + rc = 0; + } + + return rc; +} + +static struct proto x25_proto = { + .name = "X25", + .owner = THIS_MODULE, + .obj_size = sizeof(struct x25_sock), +}; + +static struct sock *x25_alloc_socket(void) +{ + struct x25_sock *x25; + struct sock *sk = sk_alloc(AF_X25, GFP_ATOMIC, &x25_proto, 1); + + if (!sk) + goto out; + + sock_init_data(NULL, sk); + + x25 = x25_sk(sk); + skb_queue_head_init(&x25->ack_queue); + skb_queue_head_init(&x25->fragment_queue); + skb_queue_head_init(&x25->interrupt_in_queue); + skb_queue_head_init(&x25->interrupt_out_queue); +out: + return sk; +} + +void x25_init_timers(struct sock *sk); + +static int x25_create(struct socket *sock, int protocol) +{ + struct sock *sk; + struct x25_sock *x25; + int rc = -ESOCKTNOSUPPORT; + + if (sock->type != SOCK_SEQPACKET || protocol) + goto out; + + rc = -ENOMEM; + if ((sk = x25_alloc_socket()) == NULL) + goto out; + + x25 = x25_sk(sk); + + sock_init_data(sock, sk); + + x25_init_timers(sk); + + sock->ops = &x25_proto_ops; + sk->sk_protocol = protocol; + sk->sk_backlog_rcv = x25_backlog_rcv; + + x25->t21 = sysctl_x25_call_request_timeout; + x25->t22 = sysctl_x25_reset_request_timeout; + x25->t23 = sysctl_x25_clear_request_timeout; + x25->t2 = sysctl_x25_ack_holdback_timeout; + x25->state = X25_STATE_0; + + x25->facilities.winsize_in = X25_DEFAULT_WINDOW_SIZE; + x25->facilities.winsize_out = X25_DEFAULT_WINDOW_SIZE; + x25->facilities.pacsize_in = X25_DEFAULT_PACKET_SIZE; + x25->facilities.pacsize_out = X25_DEFAULT_PACKET_SIZE; + x25->facilities.throughput = X25_DEFAULT_THROUGHPUT; + x25->facilities.reverse = X25_DEFAULT_REVERSE; + rc = 0; +out: + return rc; +} + +static struct sock *x25_make_new(struct sock *osk) +{ + struct sock *sk = NULL; + struct x25_sock *x25, *ox25; + + if (osk->sk_type != SOCK_SEQPACKET) + goto out; + + if ((sk = x25_alloc_socket()) == NULL) + goto out; + + x25 = x25_sk(sk); + + sk->sk_type = osk->sk_type; + sk->sk_socket = osk->sk_socket; + sk->sk_priority = osk->sk_priority; + sk->sk_protocol = osk->sk_protocol; + sk->sk_rcvbuf = osk->sk_rcvbuf; + sk->sk_sndbuf = osk->sk_sndbuf; + sk->sk_state = TCP_ESTABLISHED; + sk->sk_sleep = osk->sk_sleep; + sk->sk_backlog_rcv = osk->sk_backlog_rcv; + + if (sock_flag(osk, SOCK_ZAPPED)) + sock_set_flag(sk, SOCK_ZAPPED); + + if (sock_flag(osk, SOCK_DBG)) + sock_set_flag(sk, SOCK_DBG); + + ox25 = x25_sk(osk); + x25->t21 = ox25->t21; + x25->t22 = ox25->t22; + x25->t23 = ox25->t23; + x25->t2 = ox25->t2; + x25->facilities = ox25->facilities; + x25->qbitincl = ox25->qbitincl; + + x25_init_timers(sk); +out: + return sk; +} + +static int x25_release(struct socket *sock) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25; + + if (!sk) + goto out; + + x25 = x25_sk(sk); + + switch (x25->state) { + + case X25_STATE_0: + case X25_STATE_2: + x25_disconnect(sk, 0, 0, 0); + x25_destroy_socket(sk); + goto out; + + case X25_STATE_1: + case X25_STATE_3: + case X25_STATE_4: + x25_clear_queues(sk); + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25_start_t23timer(sk); + x25->state = X25_STATE_2; + sk->sk_state = TCP_CLOSE; + sk->sk_shutdown |= SEND_SHUTDOWN; + sk->sk_state_change(sk); + sock_set_flag(sk, SOCK_DEAD); + sock_set_flag(sk, SOCK_DESTROY); + break; + } + + sock->sk = NULL; + 
sk->sk_socket = NULL; /* Not used, but we should do this */ +out: + return 0; +} + +static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) +{ + struct sock *sk = sock->sk; + struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; + + if (!sock_flag(sk, SOCK_ZAPPED) || + addr_len != sizeof(struct sockaddr_x25) || + addr->sx25_family != AF_X25) + return -EINVAL; + + x25_sk(sk)->source_addr = addr->sx25_addr; + x25_insert_socket(sk); + sock_reset_flag(sk, SOCK_ZAPPED); + SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); + + return 0; +} + +static int x25_wait_for_connection_establishment(struct sock *sk) +{ + DECLARE_WAITQUEUE(wait, current); + int rc; + + add_wait_queue_exclusive(sk->sk_sleep, &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = sock_error(sk); + if (rc) { + sk->sk_socket->state = SS_UNCONNECTED; + break; + } + rc = 0; + if (sk->sk_state != TCP_ESTABLISHED) { + release_sock(sk); + schedule(); + lock_sock(sk); + } else + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return rc; +} + +static int x25_connect(struct socket *sock, struct sockaddr *uaddr, + int addr_len, int flags) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; + struct x25_route *rt; + int rc = 0; + + lock_sock(sk); + if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { + sock->state = SS_CONNECTED; + goto out; /* Connect completed during a ERESTARTSYS event */ + } + + rc = -ECONNREFUSED; + if (sk->sk_state == TCP_CLOSE && sock->state == SS_CONNECTING) { + sock->state = SS_UNCONNECTED; + goto out; + } + + rc = -EISCONN; /* No reconnect on a seqpacket socket */ + if (sk->sk_state == TCP_ESTABLISHED) + goto out; + + sk->sk_state = TCP_CLOSE; + sock->state = SS_UNCONNECTED; + + rc = -EINVAL; + if (addr_len != sizeof(struct sockaddr_x25) || + addr->sx25_family != AF_X25) + goto out; + + rc = -ENETUNREACH; + rt = x25_get_route(&addr->sx25_addr); + if (!rt) + goto out; + + x25->neighbour = x25_get_neigh(rt->dev); + if (!x25->neighbour) + goto out_put_route; + + x25_limit_facilities(&x25->facilities, x25->neighbour); + + x25->lci = x25_new_lci(x25->neighbour); + if (!x25->lci) + goto out_put_neigh; + + rc = -EINVAL; + if (sock_flag(sk, SOCK_ZAPPED)) /* Must bind first - autobinding does not work */ + goto out_put_neigh; + + if (!strcmp(x25->source_addr.x25_addr, null_x25_address.x25_addr)) + memset(&x25->source_addr, '\0', X25_ADDR_LEN); + + x25->dest_addr = addr->sx25_addr; + + /* Move to connecting socket, start sending Connect Requests */ + sock->state = SS_CONNECTING; + sk->sk_state = TCP_SYN_SENT; + + x25->state = X25_STATE_1; + + x25_write_internal(sk, X25_CALL_REQUEST); + + x25_start_heartbeat(sk); + x25_start_t21timer(sk); + + /* Now the loop */ + rc = -EINPROGRESS; + if (sk->sk_state != TCP_ESTABLISHED && (flags & O_NONBLOCK)) + goto out_put_neigh; + + rc = x25_wait_for_connection_establishment(sk); + if (rc) + goto out_put_neigh; + + sock->state = SS_CONNECTED; + rc = 0; +out_put_neigh: + if (rc) + x25_neigh_put(x25->neighbour); +out_put_route: + x25_route_put(rt); +out: + release_sock(sk); + return rc; +} + +static int x25_wait_for_data(struct sock *sk, int timeout) +{ + DECLARE_WAITQUEUE(wait, current); + int rc = 0; + + add_wait_queue_exclusive(sk->sk_sleep, &wait); + for (;;) { + __set_current_state(TASK_INTERRUPTIBLE); + if (sk->sk_shutdown & RCV_SHUTDOWN) + 
break; + rc = -ERESTARTSYS; + if (signal_pending(current)) + break; + rc = -EAGAIN; + if (!timeout) + break; + rc = 0; + if (skb_queue_empty(&sk->sk_receive_queue)) { + release_sock(sk); + timeout = schedule_timeout(timeout); + lock_sock(sk); + } else + break; + } + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk->sk_sleep, &wait); + return rc; +} + +static int x25_accept(struct socket *sock, struct socket *newsock, int flags) +{ + struct sock *sk = sock->sk; + struct sock *newsk; + struct sk_buff *skb; + int rc = -EINVAL; + + if (!sk || sk->sk_state != TCP_LISTEN) + goto out; + + rc = -EOPNOTSUPP; + if (sk->sk_type != SOCK_SEQPACKET) + goto out; + + lock_sock(sk); + rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); + if (rc) + goto out2; + skb = skb_dequeue(&sk->sk_receive_queue); + rc = -EINVAL; + if (!skb->sk) + goto out2; + newsk = skb->sk; + newsk->sk_socket = newsock; + newsk->sk_sleep = &newsock->wait; + + /* Now attach up the new socket */ + skb->sk = NULL; + kfree_skb(skb); + sk->sk_ack_backlog--; + newsock->sk = newsk; + newsock->state = SS_CONNECTED; + rc = 0; +out2: + release_sock(sk); +out: + return rc; +} + +static int x25_getname(struct socket *sock, struct sockaddr *uaddr, + int *uaddr_len, int peer) +{ + struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)uaddr; + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + + if (peer) { + if (sk->sk_state != TCP_ESTABLISHED) + return -ENOTCONN; + sx25->sx25_addr = x25->dest_addr; + } else + sx25->sx25_addr = x25->source_addr; + + sx25->sx25_family = AF_X25; + *uaddr_len = sizeof(*sx25); + + return 0; +} + +int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, + unsigned int lci) +{ + struct sock *sk; + struct sock *make; + struct x25_sock *makex25; + struct x25_address source_addr, dest_addr; + struct x25_facilities facilities; + struct x25_calluserdata calluserdata; + int len, rc; + + /* + * Remove the LCI and frame type. + */ + skb_pull(skb, X25_STD_MIN_LEN); + + /* + * Extract the X.25 addresses and convert them to ASCII strings, + * and remove them. + */ + skb_pull(skb, x25_addr_ntoa(skb->data, &source_addr, &dest_addr)); + + /* + * Get the length of the facilities, skip past them for the moment + * get the call user data because this is needed to determine + * the correct listener + */ + len = skb->data[0] + 1; + skb_pull(skb,len); + + /* + * Incoming Call User Data. + */ + if (skb->len >= 0) { + memcpy(calluserdata.cuddata, skb->data, skb->len); + calluserdata.cudlength = skb->len; + } + + skb_push(skb,len); + + /* + * Find a listener for the particular address/cud pair. + */ + sk = x25_find_listener(&source_addr,&calluserdata); + + /* + * We can't accept the Call Request. + */ + if (sk == NULL || sk_acceptq_is_full(sk)) + goto out_clear_request; + + /* + * Try to reach a compromise on the requested facilities. + */ + if ((len = x25_negotiate_facilities(skb, sk, &facilities)) == -1) + goto out_sock_put; + + /* + * current neighbour/link might impose additional limits + * on certain facilties + */ + + x25_limit_facilities(&facilities, nb); + + /* + * Try to create a new socket. 
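x25_rx_call_request() above walks the incoming Call Request in a fixed order: 3-byte header, packed addresses, a facilities block prefixed by its length, then the remainder as call user data. A rough sketch of that walk over a flat buffer; the sample packet bytes are invented and the kernel code performs the same steps with skb_pull().

#include <stdio.h>

#define STD_MIN_LEN 3

int main(void)
{
        /* GFI/LCI, LCI, CALL REQUEST, addr lengths (calling=0, called=4),
         * called address "2342", facilities (2 bytes: reverse charging),
         * 3 bytes of call user data -- all values invented */
        unsigned char pkt[] = { 0x10, 0x01, 0x0B,
                                0x04, 0x23, 0x42,
                                0x02, 0x01, 0x01,
                                'c', 'u', 'd' };
        unsigned int off = STD_MIN_LEN;
        unsigned int called_len, calling_len, fac_len, cud_len;

        called_len  = pkt[off] & 0x0F;
        calling_len = pkt[off] >> 4;
        off += 1 + (called_len + calling_len + 1) / 2;  /* skip addresses */

        fac_len = pkt[off] + 1;                         /* length byte included */
        off += fac_len;                                 /* skip facilities */

        cud_len = (unsigned int)sizeof(pkt) - off;      /* rest is call user data */
        printf("called=%u calling=%u facilities=%u cud=%u\n",
               called_len, calling_len, fac_len, cud_len);
        return 0;
}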
+ */ + make = x25_make_new(sk); + if (!make) + goto out_sock_put; + + /* + * Remove the facilities + */ + skb_pull(skb, len); + + skb->sk = make; + make->sk_state = TCP_ESTABLISHED; + + makex25 = x25_sk(make); + makex25->lci = lci; + makex25->dest_addr = dest_addr; + makex25->source_addr = source_addr; + makex25->neighbour = nb; + makex25->facilities = facilities; + makex25->vc_facil_mask = x25_sk(sk)->vc_facil_mask; + makex25->calluserdata = calluserdata; + + x25_write_internal(make, X25_CALL_ACCEPTED); + + makex25->state = X25_STATE_3; + + sk->sk_ack_backlog++; + + x25_insert_socket(make); + + skb_queue_head(&sk->sk_receive_queue, skb); + + x25_start_heartbeat(make); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skb->len); + rc = 1; + sock_put(sk); +out: + return rc; +out_sock_put: + sock_put(sk); +out_clear_request: + rc = 0; + x25_transmit_clear_request(nb, lci, 0x01); + goto out; +} + +static int x25_sendmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t len) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + struct sockaddr_x25 *usx25 = (struct sockaddr_x25 *)msg->msg_name; + struct sockaddr_x25 sx25; + struct sk_buff *skb; + unsigned char *asmptr; + int noblock = msg->msg_flags & MSG_DONTWAIT; + size_t size; + int qbit = 0, rc = -EINVAL; + + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_OOB|MSG_EOR|MSG_CMSG_COMPAT)) + goto out; + + /* we currently don't support segmented records at the user interface */ + if (!(msg->msg_flags & (MSG_EOR|MSG_OOB))) + goto out; + + rc = -EADDRNOTAVAIL; + if (sock_flag(sk, SOCK_ZAPPED)) + goto out; + + rc = -EPIPE; + if (sk->sk_shutdown & SEND_SHUTDOWN) { + send_sig(SIGPIPE, current, 0); + goto out; + } + + rc = -ENETUNREACH; + if (!x25->neighbour) + goto out; + + if (usx25) { + rc = -EINVAL; + if (msg->msg_namelen < sizeof(sx25)) + goto out; + memcpy(&sx25, usx25, sizeof(sx25)); + rc = -EISCONN; + if (strcmp(x25->dest_addr.x25_addr, sx25.sx25_addr.x25_addr)) + goto out; + rc = -EINVAL; + if (sx25.sx25_family != AF_X25) + goto out; + } else { + /* + * FIXME 1003.1g - if the socket is like this because + * it has become closed (not started closed) we ought + * to SIGPIPE, EPIPE; + */ + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + sx25.sx25_family = AF_X25; + sx25.sx25_addr = x25->dest_addr; + } + + SOCK_DEBUG(sk, "x25_sendmsg: sendto: Addresses built.\n"); + + /* Build a packet */ + SOCK_DEBUG(sk, "x25_sendmsg: sendto: building packet.\n"); + + if ((msg->msg_flags & MSG_OOB) && len > 32) + len = 32; + + size = len + X25_MAX_L2_LEN + X25_EXT_MIN_LEN; + + skb = sock_alloc_send_skb(sk, size, noblock, &rc); + if (!skb) + goto out; + X25_SKB_CB(skb)->flags = msg->msg_flags; + + skb_reserve(skb, X25_MAX_L2_LEN + X25_EXT_MIN_LEN); + + /* + * Put the data on the end + */ + SOCK_DEBUG(sk, "x25_sendmsg: Copying user data\n"); + + asmptr = skb->h.raw = skb_put(skb, len); + + rc = memcpy_fromiovec(asmptr, msg->msg_iov, len); + if (rc) + goto out_kfree_skb; + + /* + * If the Q BIT Include socket option is in force, the first + * byte of the user data is the logical value of the Q Bit. 
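The comment here describes the X25_QBITINCL convention: with the option set, the first byte the application sends or receives is the logical value of the Q bit rather than payload. A tiny sketch of the send side; buffer layout and values are invented.

#include <stdio.h>
#include <string.h>

#define X25_Q_BIT 0x80

int main(void)
{
        unsigned char user[] = { 1, 'h', 'i' };         /* app buffer: Q=1, then data */
        unsigned char hdr[3] = { 0x10, 0x01, 0x00 };    /* GFI/LCI, LCI, DATA */
        unsigned char data[2];
        int qbit = user[0];                             /* first byte = Q bit */

        memcpy(data, user + 1, sizeof(data));           /* real payload follows */
        if (qbit)
                hdr[0] |= X25_Q_BIT;                    /* Q bit lives in the GFI octet */

        printf("header %02X %02X %02X, payload %c%c\n",
               hdr[0], hdr[1], hdr[2], data[0], data[1]);
        return 0;
}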
+ */ + if (x25->qbitincl) { + qbit = skb->data[0]; + skb_pull(skb, 1); + } + + /* + * Push down the X.25 header + */ + SOCK_DEBUG(sk, "x25_sendmsg: Building X.25 Header.\n"); + + if (msg->msg_flags & MSG_OOB) { + if (x25->neighbour->extended) { + asmptr = skb_push(skb, X25_STD_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_INTERRUPT; + } else { + asmptr = skb_push(skb, X25_STD_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_INTERRUPT; + } + } else { + if (x25->neighbour->extended) { + /* Build an Extended X.25 header */ + asmptr = skb_push(skb, X25_EXT_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_EXTSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_DATA; + *asmptr++ = X25_DATA; + } else { + /* Build an Standard X.25 header */ + asmptr = skb_push(skb, X25_STD_MIN_LEN); + *asmptr++ = ((x25->lci >> 8) & 0x0F) | X25_GFI_STDSEQ; + *asmptr++ = (x25->lci >> 0) & 0xFF; + *asmptr++ = X25_DATA; + } + + if (qbit) + skb->data[0] |= X25_Q_BIT; + } + + SOCK_DEBUG(sk, "x25_sendmsg: Built header.\n"); + SOCK_DEBUG(sk, "x25_sendmsg: Transmitting buffer\n"); + + rc = -ENOTCONN; + if (sk->sk_state != TCP_ESTABLISHED) + goto out_kfree_skb; + + if (msg->msg_flags & MSG_OOB) + skb_queue_tail(&x25->interrupt_out_queue, skb); + else { + len = x25_output(sk, skb); + if (len < 0) + kfree_skb(skb); + else if (x25->qbitincl) + len++; + } + + /* + * lock_sock() is currently only used to serialize this x25_kick() + * against input-driven x25_kick() calls. It currently only blocks + * incoming packets for this socket and does not protect against + * any other socket state changes and is not called from anywhere + * else. As x25_kick() cannot block and as long as all socket + * operations are BKL-wrapped, we don't need take to care about + * purging the backlog queue in x25_release(). + * + * Using lock_sock() to protect all socket operations entirely + * (and making the whole x25 stack SMP aware) unfortunately would + * require major changes to {send,recv}msg and skb allocation methods. + * -> 2.5 ;) + */ + lock_sock(sk); + x25_kick(sk); + release_sock(sk); + rc = len; +out: + return rc; +out_kfree_skb: + kfree_skb(skb); + goto out; +} + + +static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, + int flags) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + struct sockaddr_x25 *sx25 = (struct sockaddr_x25 *)msg->msg_name; + size_t copied; + int qbit; + struct sk_buff *skb; + unsigned char *asmptr; + int rc = -ENOTCONN; + + /* + * This works for seqpacket too. The receiver has ordered the queue for + * us! We do one quick check first though + */ + if (sk->sk_state != TCP_ESTABLISHED) + goto out; + + if (flags & MSG_OOB) { + rc = -EINVAL; + if (sock_flag(sk, SOCK_URGINLINE) || + !skb_peek(&x25->interrupt_in_queue)) + goto out; + + skb = skb_dequeue(&x25->interrupt_in_queue); + + skb_pull(skb, X25_STD_MIN_LEN); + + /* + * No Q bit information on Interrupt data. + */ + if (x25->qbitincl) { + asmptr = skb_push(skb, 1); + *asmptr = 0x00; + } + + msg->msg_flags |= MSG_OOB; + } else { + /* Now we can treat all alike */ + skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, + flags & MSG_DONTWAIT, &rc); + if (!skb) + goto out; + + qbit = (skb->data[0] & X25_Q_BIT) == X25_Q_BIT; + + skb_pull(skb, x25->neighbour->extended ? 
+ X25_EXT_MIN_LEN : X25_STD_MIN_LEN); + + if (x25->qbitincl) { + asmptr = skb_push(skb, 1); + *asmptr = qbit; + } + } + + skb->h.raw = skb->data; + + copied = skb->len; + + if (copied > size) { + copied = size; + msg->msg_flags |= MSG_TRUNC; + } + + /* Currently, each datagram always contains a complete record */ + msg->msg_flags |= MSG_EOR; + + rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + if (rc) + goto out_free_dgram; + + if (sx25) { + sx25->sx25_family = AF_X25; + sx25->sx25_addr = x25->dest_addr; + } + + msg->msg_namelen = sizeof(struct sockaddr_x25); + + lock_sock(sk); + x25_check_rbuf(sk); + release_sock(sk); + rc = copied; +out_free_dgram: + skb_free_datagram(sk, skb); +out: + return rc; +} + + +static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) +{ + struct sock *sk = sock->sk; + struct x25_sock *x25 = x25_sk(sk); + void __user *argp = (void __user *)arg; + int rc; + + switch (cmd) { + case TIOCOUTQ: { + int amount = sk->sk_sndbuf - + atomic_read(&sk->sk_wmem_alloc); + if (amount < 0) + amount = 0; + rc = put_user(amount, (unsigned int __user *)argp); + break; + } + + case TIOCINQ: { + struct sk_buff *skb; + int amount = 0; + /* + * These two are safe on a single CPU system as + * only user tasks fiddle here + */ + if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) + amount = skb->len; + rc = put_user(amount, (unsigned int __user *)argp); + break; + } + + case SIOCGSTAMP: + rc = -EINVAL; + if (sk) + rc = sock_get_timestamp(sk, + (struct timeval __user *)argp); + break; + case SIOCGIFADDR: + case SIOCSIFADDR: + case SIOCGIFDSTADDR: + case SIOCSIFDSTADDR: + case SIOCGIFBRDADDR: + case SIOCSIFBRDADDR: + case SIOCGIFNETMASK: + case SIOCSIFNETMASK: + case SIOCGIFMETRIC: + case SIOCSIFMETRIC: + rc = -EINVAL; + break; + case SIOCADDRT: + case SIOCDELRT: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = x25_route_ioctl(cmd, argp); + break; + case SIOCX25GSUBSCRIP: + rc = x25_subscr_ioctl(cmd, argp); + break; + case SIOCX25SSUBSCRIP: + rc = -EPERM; + if (!capable(CAP_NET_ADMIN)) + break; + rc = x25_subscr_ioctl(cmd, argp); + break; + case SIOCX25GFACILITIES: { + struct x25_facilities fac = x25->facilities; + rc = copy_to_user(argp, &fac, + sizeof(fac)) ? -EFAULT : 0; + break; + } + + case SIOCX25SFACILITIES: { + struct x25_facilities facilities; + rc = -EFAULT; + if (copy_from_user(&facilities, argp, + sizeof(facilities))) + break; + rc = -EINVAL; + if (sk->sk_state != TCP_LISTEN && + sk->sk_state != TCP_CLOSE) + break; + if (facilities.pacsize_in < X25_PS16 || + facilities.pacsize_in > X25_PS4096) + break; + if (facilities.pacsize_out < X25_PS16 || + facilities.pacsize_out > X25_PS4096) + break; + if (facilities.winsize_in < 1 || + facilities.winsize_in > 127) + break; + if (facilities.throughput < 0x03 || + facilities.throughput > 0xDD) + break; + if (facilities.reverse && facilities.reverse != 1) + break; + x25->facilities = facilities; + rc = 0; + break; + } + + case SIOCX25GCALLUSERDATA: { + struct x25_calluserdata cud = x25->calluserdata; + rc = copy_to_user(argp, &cud, + sizeof(cud)) ? 
-EFAULT : 0; + break; + } + + case SIOCX25SCALLUSERDATA: { + struct x25_calluserdata calluserdata; + + rc = -EFAULT; + if (copy_from_user(&calluserdata, argp, + sizeof(calluserdata))) + break; + rc = -EINVAL; + if (calluserdata.cudlength > X25_MAX_CUD_LEN) + break; + x25->calluserdata = calluserdata; + rc = 0; + break; + } + + case SIOCX25GCAUSEDIAG: { + struct x25_causediag causediag; + causediag = x25->causediag; + rc = copy_to_user(argp, &causediag, + sizeof(causediag)) ? -EFAULT : 0; + break; + } + + default: + rc = dev_ioctl(cmd, argp); + break; + } + + return rc; +} + +static struct net_proto_family x25_family_ops = { + .family = AF_X25, + .create = x25_create, + .owner = THIS_MODULE, +}; + +static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { + .family = AF_X25, + .owner = THIS_MODULE, + .release = x25_release, + .bind = x25_bind, + .connect = x25_connect, + .socketpair = sock_no_socketpair, + .accept = x25_accept, + .getname = x25_getname, + .poll = datagram_poll, + .ioctl = x25_ioctl, + .listen = x25_listen, + .shutdown = sock_no_shutdown, + .setsockopt = x25_setsockopt, + .getsockopt = x25_getsockopt, + .sendmsg = x25_sendmsg, + .recvmsg = x25_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +#include +SOCKOPS_WRAP(x25_proto, AF_X25); + +static struct packet_type x25_packet_type = { + .type = __constant_htons(ETH_P_X25), + .func = x25_lapb_receive_frame, +}; + +static struct notifier_block x25_dev_notifier = { + .notifier_call = x25_device_event, +}; + +void x25_kill_by_neigh(struct x25_neigh *nb) +{ + struct sock *s; + struct hlist_node *node; + + write_lock_bh(&x25_list_lock); + + sk_for_each(s, node, &x25_list) + if (x25_sk(s)->neighbour == nb) + x25_disconnect(s, ENETUNREACH, 0, 0); + + write_unlock_bh(&x25_list_lock); +} + +static int __init x25_init(void) +{ + int rc = proto_register(&x25_proto, 0); + + if (rc != 0) + goto out; + + sock_register(&x25_family_ops); + + dev_add_pack(&x25_packet_type); + + register_netdevice_notifier(&x25_dev_notifier); + + printk(KERN_INFO "X.25 for Linux. Version 0.2 for Linux 2.1.15\n"); + +#ifdef CONFIG_SYSCTL + x25_register_sysctl(); +#endif + x25_proc_init(); +out: + return rc; +} +module_init(x25_init); + +static void __exit x25_exit(void) +{ + x25_proc_exit(); + x25_link_free(); + x25_route_free(); + +#ifdef CONFIG_SYSCTL + x25_unregister_sysctl(); +#endif + + unregister_netdevice_notifier(&x25_dev_notifier); + + dev_remove_pack(&x25_packet_type); + + sock_unregister(AF_X25); + proto_unregister(&x25_proto); +} +module_exit(x25_exit); + +MODULE_AUTHOR("Jonathan Naylor "); +MODULE_DESCRIPTION("The X.25 Packet Layer network layer protocol"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_NETPROTO(PF_X25); diff --git a/net/x25/sysctl_net_x25.c b/net/x25/sysctl_net_x25.c new file mode 100644 index 000000000000..aabda59c824e --- /dev/null +++ b/net/x25/sysctl_net_x25.c @@ -0,0 +1,107 @@ +/* -*- linux-c -*- + * sysctl_net_x25.c: sysctl interface to net X.25 subsystem. + * + * Begun April 1, 1996, Mike Shaver. + * Added /proc/sys/net/x25 directory entry (empty =) ). 
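For completeness, a hedged userspace sketch of driving this socket family through the ops registered above: open an AF_X25 SOCK_SEQPACKET socket, bind an X.121 address and enable the Q-bit option handled by x25_setsockopt(). It assumes the sockaddr_x25 and X25_QBITINCL definitions from <linux/x25.h>; the address is made up and error handling is minimal.

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/x25.h>

int main(void)
{
        struct sockaddr_x25 addr;
        int one = 1;
        int s = socket(AF_X25, SOCK_SEQPACKET, 0);

        if (s < 0) {
                perror("socket(AF_X25)");       /* fails unless X.25 support is built */
                return 1;
        }

        memset(&addr, 0, sizeof(addr));
        addr.sx25_family = AF_X25;
        strncpy(addr.sx25_addr.x25_addr, "2342",        /* invented X.121 address */
                sizeof(addr.sx25_addr.x25_addr) - 1);

        if (bind(s, (struct sockaddr *)&addr, sizeof(addr)) < 0)
                perror("bind");

        if (setsockopt(s, SOL_X25, X25_QBITINCL, &one, sizeof(one)) < 0)
                perror("setsockopt(X25_QBITINCL)");

        close(s);
        return 0;
}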
[MS] + */ + +#include +#include +#include +#include +#include +#include + +static int min_timer[] = { 1 * HZ }; +static int max_timer[] = { 300 * HZ }; + +static struct ctl_table_header *x25_table_header; + +static struct ctl_table x25_table[] = { + { + .ctl_name = NET_X25_RESTART_REQUEST_TIMEOUT, + .procname = "restart_request_timeout", + .data = &sysctl_x25_restart_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .ctl_name = NET_X25_CALL_REQUEST_TIMEOUT, + .procname = "call_request_timeout", + .data = &sysctl_x25_call_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .ctl_name = NET_X25_RESET_REQUEST_TIMEOUT, + .procname = "reset_request_timeout", + .data = &sysctl_x25_reset_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .ctl_name = NET_X25_CLEAR_REQUEST_TIMEOUT, + .procname = "clear_request_timeout", + .data = &sysctl_x25_clear_request_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { + .ctl_name = NET_X25_ACK_HOLD_BACK_TIMEOUT, + .procname = "acknowledgement_hold_back_timeout", + .data = &sysctl_x25_ack_holdback_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &min_timer, + .extra2 = &max_timer, + }, + { 0, }, +}; + +static struct ctl_table x25_dir_table[] = { + { + .ctl_name = NET_X25, + .procname = "x25", + .mode = 0555, + .child = x25_table, + }, + { 0, }, +}; + +static struct ctl_table x25_root_table[] = { + { + .ctl_name = CTL_NET, + .procname = "net", + .mode = 0555, + .child = x25_dir_table, + }, + { 0, }, +}; + +void __init x25_register_sysctl(void) +{ + x25_table_header = register_sysctl_table(x25_root_table, 1); +} + +void x25_unregister_sysctl(void) +{ + unregister_sysctl_table(x25_table_header); +} diff --git a/net/x25/x25_dev.c b/net/x25/x25_dev.c new file mode 100644 index 000000000000..36fc3bf6d882 --- /dev/null +++ b/net/x25/x25_dev.c @@ -0,0 +1,207 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, randomly fail to work with new + * releases, misbehave and/or generally screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * 2000-09-04 Henner Eisen Prevent freeing a dangling skb. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int x25_receive_data(struct sk_buff *skb, struct x25_neigh *nb) +{ + struct sock *sk; + unsigned short frametype; + unsigned int lci; + + frametype = skb->data[2]; + lci = ((skb->data[0] << 8) & 0xF00) + ((skb->data[1] << 0) & 0x0FF); + + /* + * LCI of zero is always for us, and its always a link control + * frame. 
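x25_receive_data() above pulls the logical channel identifier out of the first two octets (low nibble of octet 0, all of octet 1) and the frame type out of octet 2. A standalone decode of the same three bytes, with invented sample values:

#include <stdio.h>

int main(void)
{
        unsigned char data[] = { 0x1A, 0x2B, 0x0B };    /* GFI/LCI, LCI, type */
        unsigned int lci, frametype;

        lci = ((data[0] << 8) & 0xF00) | (data[1] & 0x0FF);
        frametype = data[2];

        printf("lci=%u (0x%03X) frametype=0x%02X\n", lci, lci, frametype);
        /* prints: lci=2603 (0xA2B) frametype=0x0B */
        return 0;
}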
+ */ + if (lci == 0) { + x25_link_control(skb, nb, frametype); + return 0; + } + + /* + * Find an existing socket. + */ + if ((sk = x25_find_socket(lci, nb)) != NULL) { + int queued = 1; + + skb->h.raw = skb->data; + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) { + queued = x25_process_rx_frame(sk, skb); + } else { + sk_add_backlog(sk, skb); + } + bh_unlock_sock(sk); + return queued; + } + + /* + * Is is a Call Request ? if so process it. + */ + if (frametype == X25_CALL_REQUEST) + return x25_rx_call_request(skb, nb, lci); + + /* + * Its not a Call Request, nor is it a control frame. + * Let caller throw it away. + */ +/* + x25_transmit_clear_request(nb, lci, 0x0D); +*/ + + if (frametype != X25_CLEAR_CONFIRMATION) + printk(KERN_DEBUG "x25_receive_data(): unknown frame type %2x\n",frametype); + + return 0; +} + +int x25_lapb_receive_frame(struct sk_buff *skb, struct net_device *dev, + struct packet_type *ptype) +{ + struct sk_buff *nskb; + struct x25_neigh *nb; + + nskb = skb_copy(skb, GFP_ATOMIC); + if (!nskb) + goto drop; + kfree_skb(skb); + skb = nskb; + + /* + * Packet received from unrecognised device, throw it away. + */ + nb = x25_get_neigh(dev); + if (!nb) { + printk(KERN_DEBUG "X.25: unknown neighbour - %s\n", dev->name); + goto drop; + } + + switch (skb->data[0]) { + case 0x00: + skb_pull(skb, 1); + if (x25_receive_data(skb, nb)) { + x25_neigh_put(nb); + goto out; + } + break; + case 0x01: + x25_link_established(nb); + break; + case 0x02: + x25_link_terminated(nb); + break; + } + x25_neigh_put(nb); +drop: + kfree_skb(skb); +out: + return 0; +} + +void x25_establish_link(struct x25_neigh *nb) +{ + struct sk_buff *skb; + unsigned char *ptr; + + switch (nb->dev->type) { + case ARPHRD_X25: + if ((skb = alloc_skb(1, GFP_ATOMIC)) == NULL) { + printk(KERN_ERR "x25_dev: out of memory\n"); + return; + } + ptr = skb_put(skb, 1); + *ptr = 0x01; + break; + +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + case ARPHRD_ETHER: + return; +#endif + default: + return; + } + + skb->protocol = htons(ETH_P_X25); + skb->dev = nb->dev; + + dev_queue_xmit(skb); +} + +void x25_terminate_link(struct x25_neigh *nb) +{ + struct sk_buff *skb; + unsigned char *ptr; + +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + if (nb->dev->type == ARPHRD_ETHER) + return; +#endif + if (nb->dev->type != ARPHRD_X25) + return; + + skb = alloc_skb(1, GFP_ATOMIC); + if (!skb) { + printk(KERN_ERR "x25_dev: out of memory\n"); + return; + } + + ptr = skb_put(skb, 1); + *ptr = 0x02; + + skb->protocol = htons(ETH_P_X25); + skb->dev = nb->dev; + dev_queue_xmit(skb); +} + +void x25_send_frame(struct sk_buff *skb, struct x25_neigh *nb) +{ + unsigned char *dptr; + + skb->nh.raw = skb->data; + + switch (nb->dev->type) { + case ARPHRD_X25: + dptr = skb_push(skb, 1); + *dptr = 0x00; + break; + +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + case ARPHRD_ETHER: + kfree_skb(skb); + return; +#endif + default: + kfree_skb(skb); + return; + } + + skb->protocol = htons(ETH_P_X25); + skb->dev = nb->dev; + + dev_queue_xmit(skb); +} diff --git a/net/x25/x25_facilities.c b/net/x25/x25_facilities.c new file mode 100644 index 000000000000..a21bdb95f9a8 --- /dev/null +++ b/net/x25/x25_facilities.c @@ -0,0 +1,231 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. 
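x25_lapb_receive_frame() above dispatches on a one-byte pseudo header from the LAPB device: 0x00 means packet-layer data, while 0x01 and 0x02 signal link establishment and termination (the same bytes x25_establish_link() and x25_terminate_link() emit). A sketch of that dispatch with stub handlers:

#include <stdio.h>

static void deliver_data(void)     { printf("packet-layer data\n"); }
static void link_established(void) { printf("link established\n"); }
static void link_terminated(void)  { printf("link terminated\n"); }

static void lapb_indication(unsigned char first_byte)
{
        switch (first_byte) {
        case 0x00:
                deliver_data();         /* strip the byte, then decode LCI/type */
                break;
        case 0x01:
                link_established();
                break;
        case 0x02:
                link_terminated();
                break;
        default:
                printf("unknown indication 0x%02X\n", first_byte);
                break;
        }
}

int main(void)
{
        lapb_indication(0x01);
        lapb_indication(0x00);
        lapb_indication(0x02);
        return 0;
}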
+ * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Split from x25_subr.c + * mar/20/00 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + */ + +#include +#include +#include +#include +#include + +/* + * Parse a set of facilities into the facilities structure. Unrecognised + * facilities are written to the debug log file. + */ +int x25_parse_facilities(struct sk_buff *skb, + struct x25_facilities *facilities, + unsigned long *vc_fac_mask) +{ + unsigned char *p = skb->data; + unsigned int len = *p++; + + *vc_fac_mask = 0; + + while (len > 0) { + switch (*p & X25_FAC_CLASS_MASK) { + case X25_FAC_CLASS_A: + switch (*p) { + case X25_FAC_REVERSE: + facilities->reverse = p[1] & 0x01; + *vc_fac_mask |= X25_MASK_REVERSE; + break; + case X25_FAC_THROUGHPUT: + facilities->throughput = p[1]; + *vc_fac_mask |= X25_MASK_THROUGHPUT; + break; + default: + printk(KERN_DEBUG "X.25: unknown facility " + "%02X, value %02X\n", + p[0], p[1]); + break; + } + p += 2; + len -= 2; + break; + case X25_FAC_CLASS_B: + switch (*p) { + case X25_FAC_PACKET_SIZE: + facilities->pacsize_in = p[1]; + facilities->pacsize_out = p[2]; + *vc_fac_mask |= X25_MASK_PACKET_SIZE; + break; + case X25_FAC_WINDOW_SIZE: + facilities->winsize_in = p[1]; + facilities->winsize_out = p[2]; + *vc_fac_mask |= X25_MASK_WINDOW_SIZE; + break; + default: + printk(KERN_DEBUG "X.25: unknown facility " + "%02X, values %02X, %02X\n", + p[0], p[1], p[2]); + break; + } + p += 3; + len -= 3; + break; + case X25_FAC_CLASS_C: + printk(KERN_DEBUG "X.25: unknown facility %02X, " + "values %02X, %02X, %02X\n", + p[0], p[1], p[2], p[3]); + p += 4; + len -= 4; + break; + case X25_FAC_CLASS_D: + printk(KERN_DEBUG "X.25: unknown facility %02X, " + "length %d, values %02X, %02X, %02X, %02X\n", + p[0], p[1], p[2], p[3], p[4], p[5]); + len -= p[1] + 2; + p += p[1] + 2; + break; + } + } + + return p - skb->data; +} + +/* + * Create a set of facilities. + */ +int x25_create_facilities(unsigned char *buffer, + struct x25_facilities *facilities, + unsigned long facil_mask) +{ + unsigned char *p = buffer + 1; + int len; + + if (!facil_mask) { + /* + * Length of the facilities field in call_req or + * call_accept packets + */ + buffer[0] = 0; + len = 1; /* 1 byte for the length field */ + return len; + } + + if (facilities->reverse && (facil_mask & X25_MASK_REVERSE)) { + *p++ = X25_FAC_REVERSE; + *p++ = !!facilities->reverse; + } + + if (facilities->throughput && (facil_mask & X25_MASK_THROUGHPUT)) { + *p++ = X25_FAC_THROUGHPUT; + *p++ = facilities->throughput; + } + + if ((facilities->pacsize_in || facilities->pacsize_out) && + (facil_mask & X25_MASK_PACKET_SIZE)) { + *p++ = X25_FAC_PACKET_SIZE; + *p++ = facilities->pacsize_in ? : facilities->pacsize_out; + *p++ = facilities->pacsize_out ? : facilities->pacsize_in; + } + + if ((facilities->winsize_in || facilities->winsize_out) && + (facil_mask & X25_MASK_WINDOW_SIZE)) { + *p++ = X25_FAC_WINDOW_SIZE; + *p++ = facilities->winsize_in ? : facilities->winsize_out; + *p++ = facilities->winsize_out ? : facilities->winsize_in; + } + + len = p - buffer; + buffer[0] = len - 1; + + return len; +} + +/* + * Try to reach a compromise on a set of facilities. + * + * The only real problem is with reverse charging. 
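x25_parse_facilities() above relies on the facility class encoded in the top two bits of each facility code to know how many bytes to skip: class A carries one parameter byte, B two, C three, and D a length byte plus data. A sketch of that sizing rule; the constants mirror the values the kernel headers use, and the sample buffer is invented.

#include <stdio.h>

#define FAC_CLASS_MASK 0xC0
#define FAC_CLASS_A    0x00
#define FAC_CLASS_B    0x40
#define FAC_CLASS_C    0x80

static unsigned int facility_size(const unsigned char *p)
{
        switch (p[0] & FAC_CLASS_MASK) {
        case FAC_CLASS_A: return 2;             /* code + 1 value byte   */
        case FAC_CLASS_B: return 3;             /* code + 2 value bytes  */
        case FAC_CLASS_C: return 4;             /* code + 3 value bytes  */
        default:          return 2 + p[1];      /* class D: code, len, data */
        }
}

int main(void)
{
        /* reverse charging (class A), packet size (class B) -- sample data */
        unsigned char facs[] = { 0x01, 0x01, 0x42, 0x0A, 0x0A };
        unsigned int off = 0;

        while (off < sizeof(facs)) {
                printf("facility 0x%02X uses %u bytes\n",
                       facs[off], facility_size(facs + off));
                off += facility_size(facs + off);
        }
        return 0;
}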
+ */ +int x25_negotiate_facilities(struct sk_buff *skb, struct sock *sk, + struct x25_facilities *new) +{ + struct x25_sock *x25 = x25_sk(sk); + struct x25_facilities *ours = &x25->facilities; + struct x25_facilities theirs; + int len; + + memset(&theirs, 0, sizeof(theirs)); + memcpy(new, ours, sizeof(*new)); + + len = x25_parse_facilities(skb, &theirs, &x25->vc_facil_mask); + + /* + * They want reverse charging, we won't accept it. + */ + if (theirs.reverse && ours->reverse) { + SOCK_DEBUG(sk, "X.25: rejecting reverse charging request"); + return -1; + } + + new->reverse = theirs.reverse; + + if (theirs.throughput) { + if (theirs.throughput < ours->throughput) { + SOCK_DEBUG(sk, "X.25: throughput negotiated down"); + new->throughput = theirs.throughput; + } + } + + if (theirs.pacsize_in && theirs.pacsize_out) { + if (theirs.pacsize_in < ours->pacsize_in) { + SOCK_DEBUG(sk, "X.25: packet size inwards negotiated down"); + new->pacsize_in = theirs.pacsize_in; + } + if (theirs.pacsize_out < ours->pacsize_out) { + SOCK_DEBUG(sk, "X.25: packet size outwards negotiated down"); + new->pacsize_out = theirs.pacsize_out; + } + } + + if (theirs.winsize_in && theirs.winsize_out) { + if (theirs.winsize_in < ours->winsize_in) { + SOCK_DEBUG(sk, "X.25: window size inwards negotiated down"); + new->winsize_in = theirs.winsize_in; + } + if (theirs.winsize_out < ours->winsize_out) { + SOCK_DEBUG(sk, "X.25: window size outwards negotiated down"); + new->winsize_out = theirs.winsize_out; + } + } + + return len; +} + +/* + * Limit values of certain facilities according to the capability of the + * currently attached x25 link. + */ +void x25_limit_facilities(struct x25_facilities *facilities, + struct x25_neigh *nb) +{ + + if (!nb->extended) { + if (facilities->winsize_in > 7) { + printk(KERN_DEBUG "X.25: incoming winsize limited to 7\n"); + facilities->winsize_in = 7; + } + if (facilities->winsize_out > 7) { + facilities->winsize_out = 7; + printk( KERN_DEBUG "X.25: outgoing winsize limited to 7\n"); + } + } +} diff --git a/net/x25/x25_in.c b/net/x25/x25_in.c new file mode 100644 index 000000000000..b0197c70a9fc --- /dev/null +++ b/net/x25/x25_in.c @@ -0,0 +1,361 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor Centralised disconnection code. + * New timer architecture. + * 2000-03-20 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * 2000-11-10 Henner Eisen Check and reset for out-of-sequence + * i-frames. 
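x25_negotiate_facilities() above only ever negotiates values downwards: the peer's window and packet sizes are adopted when they are smaller than our own request. A trivial sketch of that rule with invented values:

#include <stdio.h>

static unsigned int negotiate_down(unsigned int ours, unsigned int theirs)
{
        return (theirs && theirs < ours) ? theirs : ours;
}

int main(void)
{
        unsigned int winsize_out = negotiate_down(7, 2);        /* peer smaller -> 2  */
        unsigned int pacsize_out = negotiate_down(10, 11);      /* peer larger  -> 10 */

        printf("winsize_out=%u pacsize_out=%u\n", winsize_out, pacsize_out);
        return 0;
}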
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static int x25_queue_rx_frame(struct sock *sk, struct sk_buff *skb, int more) +{ + struct sk_buff *skbo, *skbn = skb; + struct x25_sock *x25 = x25_sk(sk); + + if (more) { + x25->fraglen += skb->len; + skb_queue_tail(&x25->fragment_queue, skb); + skb_set_owner_r(skb, sk); + return 0; + } + + if (!more && x25->fraglen > 0) { /* End of fragment */ + int len = x25->fraglen + skb->len; + + if ((skbn = alloc_skb(len, GFP_ATOMIC)) == NULL){ + kfree_skb(skb); + return 1; + } + + skb_queue_tail(&x25->fragment_queue, skb); + + skbn->h.raw = skbn->data; + + skbo = skb_dequeue(&x25->fragment_queue); + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo); + + while ((skbo = + skb_dequeue(&x25->fragment_queue)) != NULL) { + skb_pull(skbo, (x25->neighbour->extended) ? + X25_EXT_MIN_LEN : X25_STD_MIN_LEN); + memcpy(skb_put(skbn, skbo->len), skbo->data, skbo->len); + kfree_skb(skbo); + } + + x25->fraglen = 0; + } + + skb_set_owner_r(skbn, sk); + skb_queue_tail(&sk->sk_receive_queue, skbn); + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_data_ready(sk, skbn->len); + + return 0; +} + +/* + * State machine for state 1, Awaiting Call Accepted State. + * The handling of the timer(s) is in file x25_timer.c. + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state1_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + struct x25_address source_addr, dest_addr; + + switch (frametype) { + case X25_CALL_ACCEPTED: { + struct x25_sock *x25 = x25_sk(sk); + + x25_stop_timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->va = 0; + x25->vr = 0; + x25->vl = 0; + x25->state = X25_STATE_3; + sk->sk_state = TCP_ESTABLISHED; + /* + * Parse the data in the frame. + */ + skb_pull(skb, X25_STD_MIN_LEN); + skb_pull(skb, x25_addr_ntoa(skb->data, &source_addr, &dest_addr)); + skb_pull(skb, + x25_parse_facilities(skb, &x25->facilities, + &x25->vc_facil_mask)); + /* + * Copy any Call User Data. + */ + if (skb->len >= 0) { + memcpy(x25->calluserdata.cuddata, skb->data, + skb->len); + x25->calluserdata.cudlength = skb->len; + } + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + break; + } + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, ECONNREFUSED, skb->data[3], skb->data[4]); + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 2, Awaiting Clear Confirmation State. + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state2_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + case X25_CLEAR_CONFIRMATION: + x25_disconnect(sk, 0, 0, 0); + break; + + default: + break; + } + + return 0; +} + +/* + * State machine for state 3, Connected State. + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. + */ +static int x25_state3_machine(struct sock *sk, struct sk_buff *skb, int frametype, int ns, int nr, int q, int d, int m) +{ + int queued = 0; + int modulus; + struct x25_sock *x25 = x25_sk(sk); + + modulus = (x25->neighbour->extended) ? 
X25_EMODULUS : X25_SMODULUS; + + switch (frametype) { + + case X25_RESET_REQUEST: + x25_write_internal(sk, X25_RESET_CONFIRMATION); + x25_stop_timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25_requeue_frames(sk); + break; + + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + case X25_RR: + case X25_RNR: + if (!x25_validate_nr(sk, nr)) { + x25_clear_queues(sk); + x25_write_internal(sk, X25_RESET_REQUEST); + x25_start_t22timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25->state = X25_STATE_4; + } else { + x25_frames_acked(sk, nr); + if (frametype == X25_RNR) { + x25->condition |= X25_COND_PEER_RX_BUSY; + } else { + x25->condition &= ~X25_COND_PEER_RX_BUSY; + } + } + break; + + case X25_DATA: /* XXX */ + x25->condition &= ~X25_COND_PEER_RX_BUSY; + if ((ns != x25->vr) || !x25_validate_nr(sk, nr)) { + x25_clear_queues(sk); + x25_write_internal(sk, X25_RESET_REQUEST); + x25_start_t22timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25->state = X25_STATE_4; + break; + } + x25_frames_acked(sk, nr); + if (ns == x25->vr) { + if (x25_queue_rx_frame(sk, skb, m) == 0) { + x25->vr = (x25->vr + 1) % modulus; + queued = 1; + } else { + /* Should never happen */ + x25_clear_queues(sk); + x25_write_internal(sk, X25_RESET_REQUEST); + x25_start_t22timer(sk); + x25->condition = 0x00; + x25->vs = 0; + x25->vr = 0; + x25->va = 0; + x25->vl = 0; + x25->state = X25_STATE_4; + break; + } + if (atomic_read(&sk->sk_rmem_alloc) > + (sk->sk_rcvbuf / 2)) + x25->condition |= X25_COND_OWN_RX_BUSY; + } + /* + * If the window is full Ack it immediately, else + * start the holdback timer. + */ + if (((x25->vl + x25->facilities.winsize_in) % modulus) == x25->vr) { + x25->condition &= ~X25_COND_ACK_PENDING; + x25_stop_timer(sk); + x25_enquiry_response(sk); + } else { + x25->condition |= X25_COND_ACK_PENDING; + x25_start_t2timer(sk); + } + break; + + case X25_INTERRUPT_CONFIRMATION: + x25->intflag = 0; + break; + + case X25_INTERRUPT: + if (sock_flag(sk, SOCK_URGINLINE)) + queued = !sock_queue_rcv_skb(sk, skb); + else { + skb_set_owner_r(skb, sk); + skb_queue_tail(&x25->interrupt_in_queue, skb); + queued = 1; + } + sk_send_sigurg(sk); + x25_write_internal(sk, X25_INTERRUPT_CONFIRMATION); + break; + + default: + printk(KERN_WARNING "x25: unknown %02X in state 3\n", frametype); + break; + } + + return queued; +} + +/* + * State machine for state 4, Awaiting Reset Confirmation State. + * The handling of the timer(s) is in file x25_timer.c + * Handling of state 0 and connection release is in af_x25.c. 
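The DATA handling in x25_state3_machine() above acknowledges immediately once the receive window closes, i.e. when (vl + winsize_in) mod modulus equals vr, and otherwise starts the T2 holdback timer. A sketch of that window test with invented numbers:

#include <stdbool.h>
#include <stdio.h>

#define SMODULUS 8      /* standard sequencing; 128 when extended */

static bool window_full(unsigned int vl, unsigned int vr,
                        unsigned int winsize, unsigned int modulus)
{
        return ((vl + winsize) % modulus) == vr;
}

int main(void)
{
        /* last acked P(R)=6, window 2, next expected P(S)=0: (6+2) mod 8 == 0 */
        printf("full=%d\n", window_full(6, 0, 2, SMODULUS));    /* full=1 */
        printf("full=%d\n", window_full(6, 7, 2, SMODULUS));    /* full=0 */
        return 0;
}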
+ */ +static int x25_state4_machine(struct sock *sk, struct sk_buff *skb, int frametype) +{ + switch (frametype) { + + case X25_RESET_REQUEST: + x25_write_internal(sk, X25_RESET_CONFIRMATION); + case X25_RESET_CONFIRMATION: { + struct x25_sock *x25 = x25_sk(sk); + + x25_stop_timer(sk); + x25->condition = 0x00; + x25->va = 0; + x25->vr = 0; + x25->vs = 0; + x25->vl = 0; + x25->state = X25_STATE_3; + x25_requeue_frames(sk); + break; + } + case X25_CLEAR_REQUEST: + x25_write_internal(sk, X25_CLEAR_CONFIRMATION); + x25_disconnect(sk, 0, skb->data[3], skb->data[4]); + break; + + default: + break; + } + + return 0; +} + +/* Higher level upcall for a LAPB frame */ +int x25_process_rx_frame(struct sock *sk, struct sk_buff *skb) +{ + struct x25_sock *x25 = x25_sk(sk); + int queued = 0, frametype, ns, nr, q, d, m; + + if (x25->state == X25_STATE_0) + return 0; + + frametype = x25_decode(sk, skb, &ns, &nr, &q, &d, &m); + + switch (x25->state) { + case X25_STATE_1: + queued = x25_state1_machine(sk, skb, frametype); + break; + case X25_STATE_2: + queued = x25_state2_machine(sk, skb, frametype); + break; + case X25_STATE_3: + queued = x25_state3_machine(sk, skb, frametype, ns, nr, q, d, m); + break; + case X25_STATE_4: + queued = x25_state4_machine(sk, skb, frametype); + break; + } + + x25_kick(sk); + + return queued; +} + +int x25_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int queued = x25_process_rx_frame(sk, skb); + + if (!queued) + kfree_skb(skb); + + return 0; +} diff --git a/net/x25/x25_link.c b/net/x25/x25_link.c new file mode 100644 index 000000000000..0a760fe66843 --- /dev/null +++ b/net/x25/x25_link.c @@ -0,0 +1,401 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor New timer architecture. + * mar/20/00 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * 2000-09-04 Henner Eisen dev_hold() / dev_put() for x25_neigh. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static struct list_head x25_neigh_list = LIST_HEAD_INIT(x25_neigh_list); +static DEFINE_RWLOCK(x25_neigh_list_lock); + +static void x25_t20timer_expiry(unsigned long); + +static void x25_transmit_restart_confirmation(struct x25_neigh *nb); +static void x25_transmit_restart_request(struct x25_neigh *nb); + +/* + * Linux set/reset timer routines + */ +static inline void x25_start_t20timer(struct x25_neigh *nb) +{ + mod_timer(&nb->t20timer, jiffies + nb->t20); +} + +static void x25_t20timer_expiry(unsigned long param) +{ + struct x25_neigh *nb = (struct x25_neigh *)param; + + x25_transmit_restart_request(nb); + + x25_start_t20timer(nb); +} + +static inline void x25_stop_t20timer(struct x25_neigh *nb) +{ + del_timer(&nb->t20timer); +} + +static inline int x25_t20timer_pending(struct x25_neigh *nb) +{ + return timer_pending(&nb->t20timer); +} + +/* + * This handles all restart and diagnostic frames. 
+ */ +void x25_link_control(struct sk_buff *skb, struct x25_neigh *nb, + unsigned short frametype) +{ + struct sk_buff *skbn; + int confirm; + + switch (frametype) { + case X25_RESTART_REQUEST: + confirm = !x25_t20timer_pending(nb); + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + if (confirm) + x25_transmit_restart_confirmation(nb); + break; + + case X25_RESTART_CONFIRMATION: + x25_stop_t20timer(nb); + nb->state = X25_LINK_STATE_3; + break; + + case X25_DIAGNOSTIC: + printk(KERN_WARNING "x25: diagnostic #%d - " + "%02X %02X %02X\n", + skb->data[3], skb->data[4], + skb->data[5], skb->data[6]); + break; + + default: + printk(KERN_WARNING "x25: received unknown %02X " + "with LCI 000\n", frametype); + break; + } + + if (nb->state == X25_LINK_STATE_3) + while ((skbn = skb_dequeue(&nb->queue)) != NULL) + x25_send_frame(skbn, nb); +} + +/* + * This routine is called when a Restart Request is needed + */ +static void x25_transmit_restart_request(struct x25_neigh *nb) +{ + unsigned char *dptr; + int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + skb_reserve(skb, X25_MAX_L2_LEN); + + dptr = skb_put(skb, X25_STD_MIN_LEN + 2); + + *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ; + *dptr++ = 0x00; + *dptr++ = X25_RESTART_REQUEST; + *dptr++ = 0x00; + *dptr++ = 0; + + skb->sk = NULL; + + x25_send_frame(skb, nb); +} + +/* + * This routine is called when a Restart Confirmation is needed + */ +static void x25_transmit_restart_confirmation(struct x25_neigh *nb) +{ + unsigned char *dptr; + int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + skb_reserve(skb, X25_MAX_L2_LEN); + + dptr = skb_put(skb, X25_STD_MIN_LEN); + + *dptr++ = nb->extended ? X25_GFI_EXTSEQ : X25_GFI_STDSEQ; + *dptr++ = 0x00; + *dptr++ = X25_RESTART_CONFIRMATION; + + skb->sk = NULL; + + x25_send_frame(skb, nb); +} + +/* + * This routine is called when a Clear Request is needed outside of the context + * of a connected socket. + */ +void x25_transmit_clear_request(struct x25_neigh *nb, unsigned int lci, + unsigned char cause) +{ + unsigned char *dptr; + int len = X25_MAX_L2_LEN + X25_STD_MIN_LEN + 2; + struct sk_buff *skb = alloc_skb(len, GFP_ATOMIC); + + if (!skb) + return; + + skb_reserve(skb, X25_MAX_L2_LEN); + + dptr = skb_put(skb, X25_STD_MIN_LEN + 2); + + *dptr++ = ((lci >> 8) & 0x0F) | (nb->extended ? + X25_GFI_EXTSEQ : + X25_GFI_STDSEQ); + *dptr++ = (lci >> 0) & 0xFF; + *dptr++ = X25_CLEAR_REQUEST; + *dptr++ = cause; + *dptr++ = 0x00; + + skb->sk = NULL; + + x25_send_frame(skb, nb); +} + +void x25_transmit_link(struct sk_buff *skb, struct x25_neigh *nb) +{ + switch (nb->state) { + case X25_LINK_STATE_0: + skb_queue_tail(&nb->queue, skb); + nb->state = X25_LINK_STATE_1; + x25_establish_link(nb); + break; + case X25_LINK_STATE_1: + case X25_LINK_STATE_2: + skb_queue_tail(&nb->queue, skb); + break; + case X25_LINK_STATE_3: + x25_send_frame(skb, nb); + break; + } +} + +/* + * Called when the link layer has become established. + */ +void x25_link_established(struct x25_neigh *nb) +{ + switch (nb->state) { + case X25_LINK_STATE_0: + nb->state = X25_LINK_STATE_2; + break; + case X25_LINK_STATE_1: + x25_transmit_restart_request(nb); + nb->state = X25_LINK_STATE_2; + x25_start_t20timer(nb); + break; + } +} + +/* + * Called when the link layer has terminated, or an establishment + * request has failed. 
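x25_transmit_clear_request() above builds a five-octet packet by hand: GFI plus the top nibble of the LCI, the low LCI byte, the CLEAR REQUEST type, a cause octet and a diagnostic octet. A sketch of the same layout; the LCI and cause values are invented.

#include <stdio.h>

#define GFI_STDSEQ    0x10
#define CLEAR_REQUEST 0x13

int main(void)
{
        unsigned int lci = 0x123;       /* invented logical channel */
        unsigned char cause = 0x01;     /* cause byte; value invented */
        unsigned char pkt[5];

        pkt[0] = ((lci >> 8) & 0x0F) | GFI_STDSEQ;
        pkt[1] = lci & 0xFF;
        pkt[2] = CLEAR_REQUEST;
        pkt[3] = cause;
        pkt[4] = 0x00;                  /* diagnostic */

        printf("%02X %02X %02X %02X %02X\n",
               pkt[0], pkt[1], pkt[2], pkt[3], pkt[4]);
        return 0;
}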
+ */ + +void x25_link_terminated(struct x25_neigh *nb) +{ + nb->state = X25_LINK_STATE_0; + /* Out of order: clear existing virtual calls (X.25 03/93 4.6.3) */ + x25_kill_by_neigh(nb); +} + +/* + * Add a new device. + */ +void x25_link_device_up(struct net_device *dev) +{ + struct x25_neigh *nb = kmalloc(sizeof(*nb), GFP_ATOMIC); + + if (!nb) + return; + + skb_queue_head_init(&nb->queue); + + init_timer(&nb->t20timer); + nb->t20timer.data = (unsigned long)nb; + nb->t20timer.function = &x25_t20timer_expiry; + + dev_hold(dev); + nb->dev = dev; + nb->state = X25_LINK_STATE_0; + nb->extended = 0; + /* + * Enables negotiation + */ + nb->global_facil_mask = X25_MASK_REVERSE | + X25_MASK_THROUGHPUT | + X25_MASK_PACKET_SIZE | + X25_MASK_WINDOW_SIZE; + nb->t20 = sysctl_x25_restart_request_timeout; + atomic_set(&nb->refcnt, 1); + + write_lock_bh(&x25_neigh_list_lock); + list_add(&nb->node, &x25_neigh_list); + write_unlock_bh(&x25_neigh_list_lock); +} + +/** + * __x25_remove_neigh - remove neighbour from x25_neigh_list + * @nb - neigh to remove + * + * Remove neighbour from x25_neigh_list. If it was there. + * Caller must hold x25_neigh_list_lock. + */ +static void __x25_remove_neigh(struct x25_neigh *nb) +{ + skb_queue_purge(&nb->queue); + x25_stop_t20timer(nb); + + if (nb->node.next) { + list_del(&nb->node); + x25_neigh_put(nb); + } +} + +/* + * A device has been removed, remove its links. + */ +void x25_link_device_down(struct net_device *dev) +{ + struct x25_neigh *nb; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_neigh_list_lock); + + list_for_each_safe(entry, tmp, &x25_neigh_list) { + nb = list_entry(entry, struct x25_neigh, node); + + if (nb->dev == dev) { + __x25_remove_neigh(nb); + dev_put(dev); + } + } + + write_unlock_bh(&x25_neigh_list_lock); +} + +/* + * Given a device, return the neighbour address. + */ +struct x25_neigh *x25_get_neigh(struct net_device *dev) +{ + struct x25_neigh *nb, *use = NULL; + struct list_head *entry; + + read_lock_bh(&x25_neigh_list_lock); + list_for_each(entry, &x25_neigh_list) { + nb = list_entry(entry, struct x25_neigh, node); + + if (nb->dev == dev) { + use = nb; + break; + } + } + + if (use) + x25_neigh_hold(use); + read_unlock_bh(&x25_neigh_list_lock); + return use; +} + +/* + * Handle the ioctls that control the subscription functions. + */ +int x25_subscr_ioctl(unsigned int cmd, void __user *arg) +{ + struct x25_subscrip_struct x25_subscr; + struct x25_neigh *nb; + struct net_device *dev; + int rc = -EINVAL; + + if (cmd != SIOCX25GSUBSCRIP && cmd != SIOCX25SSUBSCRIP) + goto out; + + rc = -EFAULT; + if (copy_from_user(&x25_subscr, arg, sizeof(x25_subscr))) + goto out; + + rc = -EINVAL; + if ((dev = x25_dev_get(x25_subscr.device)) == NULL) + goto out; + + if ((nb = x25_get_neigh(dev)) == NULL) + goto out_dev_put; + + dev_put(dev); + + if (cmd == SIOCX25GSUBSCRIP) { + x25_subscr.extended = nb->extended; + x25_subscr.global_facil_mask = nb->global_facil_mask; + rc = copy_to_user(arg, &x25_subscr, + sizeof(x25_subscr)) ? -EFAULT : 0; + } else { + rc = -EINVAL; + if (!(x25_subscr.extended && x25_subscr.extended != 1)) { + rc = 0; + nb->extended = x25_subscr.extended; + nb->global_facil_mask = x25_subscr.global_facil_mask; + } + } + x25_neigh_put(nb); +out: + return rc; +out_dev_put: + dev_put(dev); + goto out; +} + + +/* + * Release all memory associated with X.25 neighbour structures. 
+ */ +void __exit x25_link_free(void) +{ + struct x25_neigh *nb; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_neigh_list_lock); + + list_for_each_safe(entry, tmp, &x25_neigh_list) { + nb = list_entry(entry, struct x25_neigh, node); + __x25_remove_neigh(nb); + } + write_unlock_bh(&x25_neigh_list_lock); +} diff --git a/net/x25/x25_out.c b/net/x25/x25_out.c new file mode 100644 index 000000000000..a2e62cea819a --- /dev/null +++ b/net/x25/x25_out.c @@ -0,0 +1,226 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor New timer architecture. + * 2000-09-04 Henner Eisen Prevented x25_output() skb leakage. + * 2000-10-27 Henner Eisen MSG_DONTWAIT for fragment allocation. + * 2000-11-10 Henner Eisen x25_send_iframe(): re-queued frames + * needed cleaned seq-number fields. + */ + +#include +#include +#include +#include +#include +#include + +static int x25_pacsize_to_bytes(unsigned int pacsize) +{ + int bytes = 1; + + if (!pacsize) + return 128; + + while (pacsize-- > 0) + bytes *= 2; + + return bytes; +} + +/* + * This is where all X.25 information frames pass. + * + * Returns the amount of user data bytes sent on success + * or a negative error code on failure. + */ +int x25_output(struct sock *sk, struct sk_buff *skb) +{ + struct sk_buff *skbn; + unsigned char header[X25_EXT_MIN_LEN]; + int err, frontlen, len; + int sent=0, noblock = X25_SKB_CB(skb)->flags & MSG_DONTWAIT; + struct x25_sock *x25 = x25_sk(sk); + int header_len = x25->neighbour->extended ? X25_EXT_MIN_LEN : + X25_STD_MIN_LEN; + int max_len = x25_pacsize_to_bytes(x25->facilities.pacsize_out); + + if (skb->len - header_len > max_len) { + /* Save a copy of the Header */ + memcpy(header, skb->data, header_len); + skb_pull(skb, header_len); + + frontlen = skb_headroom(skb); + + while (skb->len > 0) { + if ((skbn = sock_alloc_send_skb(sk, frontlen + max_len, + noblock, &err)) == NULL){ + if (err == -EWOULDBLOCK && noblock){ + kfree_skb(skb); + return sent; + } + SOCK_DEBUG(sk, "x25_output: fragment alloc" + " failed, err=%d, %d bytes " + "sent\n", err, sent); + return err; + } + + skb_reserve(skbn, frontlen); + + len = max_len > skb->len ? skb->len : max_len; + + /* Copy the user data */ + memcpy(skb_put(skbn, len), skb->data, len); + skb_pull(skb, len); + + /* Duplicate the Header */ + skb_push(skbn, header_len); + memcpy(skbn->data, header, header_len); + + if (skb->len > 0) { + if (x25->neighbour->extended) + skbn->data[3] |= X25_EXT_M_BIT; + else + skbn->data[2] |= X25_STD_M_BIT; + } + + skb_queue_tail(&sk->sk_write_queue, skbn); + sent += len; + } + + kfree_skb(skb); + } else { + skb_queue_tail(&sk->sk_write_queue, skb); + sent = skb->len - header_len; + } + return sent; +} + +/* + * This procedure is passed a buffer descriptor for an iframe. It builds + * the rest of the control part of the frame and then writes it out. 
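+ *
+ * A worked example of the fragmentation done below (illustrative numbers):
+ * x25_pacsize_to_bytes() turns the negotiated packet-size code into
+ * 2^code bytes, treating 0 as the 128-byte default, so code 7 also gives
+ * 128 bytes and code 10 gives 1024.  With a 128-byte packet size, a
+ * 300-byte payload is queued by x25_output() as three data packets of
+ * 128, 128 and 44 bytes, the first two carrying the More (M) bit so the
+ * far end knows reassembly is still in progress.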
+ */ +static void x25_send_iframe(struct sock *sk, struct sk_buff *skb) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (!skb) + return; + + if (x25->neighbour->extended) { + skb->data[2] = (x25->vs << 1) & 0xFE; + skb->data[3] &= X25_EXT_M_BIT; + skb->data[3] |= (x25->vr << 1) & 0xFE; + } else { + skb->data[2] &= X25_STD_M_BIT; + skb->data[2] |= (x25->vs << 1) & 0x0E; + skb->data[2] |= (x25->vr << 5) & 0xE0; + } + + x25_transmit_link(skb, x25->neighbour); +} + +void x25_kick(struct sock *sk) +{ + struct sk_buff *skb, *skbn; + unsigned short start, end; + int modulus; + struct x25_sock *x25 = x25_sk(sk); + + if (x25->state != X25_STATE_3) + return; + + /* + * Transmit interrupt data. + */ + if (!x25->intflag && skb_peek(&x25->interrupt_out_queue) != NULL) { + x25->intflag = 1; + skb = skb_dequeue(&x25->interrupt_out_queue); + x25_transmit_link(skb, x25->neighbour); + } + + if (x25->condition & X25_COND_PEER_RX_BUSY) + return; + + if (!skb_peek(&sk->sk_write_queue)) + return; + + modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS; + + start = skb_peek(&x25->ack_queue) ? x25->vs : x25->va; + end = (x25->va + x25->facilities.winsize_out) % modulus; + + if (start == end) + return; + + x25->vs = start; + + /* + * Transmit data until either we're out of data to send or + * the window is full. + */ + + skb = skb_dequeue(&sk->sk_write_queue); + + do { + if ((skbn = skb_clone(skb, GFP_ATOMIC)) == NULL) { + skb_queue_head(&sk->sk_write_queue, skb); + break; + } + + skb_set_owner_w(skbn, sk); + + /* + * Transmit the frame copy. + */ + x25_send_iframe(sk, skbn); + + x25->vs = (x25->vs + 1) % modulus; + + /* + * Requeue the original data frame. + */ + skb_queue_tail(&x25->ack_queue, skb); + + } while (x25->vs != end && + (skb = skb_dequeue(&sk->sk_write_queue)) != NULL); + + x25->vl = x25->vr; + x25->condition &= ~X25_COND_ACK_PENDING; + + x25_stop_timer(sk); +} + +/* + * The following routines are taken from page 170 of the 7th ARRL Computer + * Networking Conference paper, as is the whole state machine. + */ + +void x25_enquiry_response(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (x25->condition & X25_COND_OWN_RX_BUSY) + x25_write_internal(sk, X25_RNR); + else + x25_write_internal(sk, X25_RR); + + x25->vl = x25->vr; + x25->condition &= ~X25_COND_ACK_PENDING; + + x25_stop_timer(sk); +} diff --git a/net/x25/x25_proc.c b/net/x25/x25_proc.c new file mode 100644 index 000000000000..dfb80116c59f --- /dev/null +++ b/net/x25/x25_proc.c @@ -0,0 +1,256 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.4 with seq_file support + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
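+ *
+ * The handlers below are a stock seq_file iterator: the start() callback
+ * takes the relevant list lock and maps *pos onto a list entry (returning
+ * SEQ_START_TOKEN for position 0 so show() can emit a header line),
+ * next() advances to the following entry and stop() drops the lock.
+ * Reading /proc/net/x25/route therefore yields the "Address Digits Device"
+ * header followed by one "%-15s %-6d %-5s" formatted line per route; a
+ * hypothetical route "2342" with 4 significant digits on eth0 would show
+ * up as one such line.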
+ * + * History + * 2002/10/06 Arnaldo Carvalho de Melo seq_file support + */ + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_PROC_FS +static __inline__ struct x25_route *x25_get_route_idx(loff_t pos) +{ + struct list_head *route_entry; + struct x25_route *rt = NULL; + + list_for_each(route_entry, &x25_route_list) { + rt = list_entry(route_entry, struct x25_route, node); + if (!pos--) + goto found; + } + rt = NULL; +found: + return rt; +} + +static void *x25_seq_route_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + read_lock_bh(&x25_route_list_lock); + return l ? x25_get_route_idx(--l) : SEQ_START_TOKEN; +} + +static void *x25_seq_route_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct x25_route *rt; + + ++*pos; + if (v == SEQ_START_TOKEN) { + rt = NULL; + if (!list_empty(&x25_route_list)) + rt = list_entry(x25_route_list.next, + struct x25_route, node); + goto out; + } + rt = v; + if (rt->node.next != &x25_route_list) + rt = list_entry(rt->node.next, struct x25_route, node); + else + rt = NULL; +out: + return rt; +} + +static void x25_seq_route_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&x25_route_list_lock); +} + +static int x25_seq_route_show(struct seq_file *seq, void *v) +{ + struct x25_route *rt; + + if (v == SEQ_START_TOKEN) { + seq_puts(seq, "Address Digits Device\n"); + goto out; + } + + rt = v; + seq_printf(seq, "%-15s %-6d %-5s\n", + rt->address.x25_addr, rt->sigdigits, + rt->dev ? rt->dev->name : "???"); +out: + return 0; +} + +static __inline__ struct sock *x25_get_socket_idx(loff_t pos) +{ + struct sock *s; + struct hlist_node *node; + + sk_for_each(s, node, &x25_list) + if (!pos--) + goto found; + s = NULL; +found: + return s; +} + +static void *x25_seq_socket_start(struct seq_file *seq, loff_t *pos) +{ + loff_t l = *pos; + + read_lock_bh(&x25_list_lock); + return l ? x25_get_socket_idx(--l) : SEQ_START_TOKEN; +} + +static void *x25_seq_socket_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock *s; + + ++*pos; + if (v == SEQ_START_TOKEN) { + s = sk_head(&x25_list); + goto out; + } + s = sk_next(v); +out: + return s; +} + +static void x25_seq_socket_stop(struct seq_file *seq, void *v) +{ + read_unlock_bh(&x25_list_lock); +} + +static int x25_seq_socket_show(struct seq_file *seq, void *v) +{ + struct sock *s; + struct x25_sock *x25; + struct net_device *dev; + const char *devname; + + if (v == SEQ_START_TOKEN) { + seq_printf(seq, "dest_addr src_addr dev lci st vs vr " + "va t t2 t21 t22 t23 Snd-Q Rcv-Q inode\n"); + goto out; + } + + s = v; + x25 = x25_sk(s); + + if (!x25->neighbour || (dev = x25->neighbour->dev) == NULL) + devname = "???"; + else + devname = x25->neighbour->dev->name; + + seq_printf(seq, "%-10s %-10s %-5s %3.3X %d %d %d %d %3lu %3lu " + "%3lu %3lu %3lu %5d %5d %ld\n", + !x25->dest_addr.x25_addr[0] ? "*" : x25->dest_addr.x25_addr, + !x25->source_addr.x25_addr[0] ? "*" : x25->source_addr.x25_addr, + devname, x25->lci & 0x0FFF, x25->state, x25->vs, x25->vr, + x25->va, x25_display_timer(s) / HZ, x25->t2 / HZ, + x25->t21 / HZ, x25->t22 / HZ, x25->t23 / HZ, + atomic_read(&s->sk_wmem_alloc), + atomic_read(&s->sk_rmem_alloc), + s->sk_socket ? 
SOCK_INODE(s->sk_socket)->i_ino : 0L); +out: + return 0; +} + +static struct seq_operations x25_seq_route_ops = { + .start = x25_seq_route_start, + .next = x25_seq_route_next, + .stop = x25_seq_route_stop, + .show = x25_seq_route_show, +}; + +static struct seq_operations x25_seq_socket_ops = { + .start = x25_seq_socket_start, + .next = x25_seq_socket_next, + .stop = x25_seq_socket_stop, + .show = x25_seq_socket_show, +}; + +static int x25_seq_socket_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &x25_seq_socket_ops); +} + +static int x25_seq_route_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &x25_seq_route_ops); +} + +static struct file_operations x25_seq_socket_fops = { + .owner = THIS_MODULE, + .open = x25_seq_socket_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct file_operations x25_seq_route_fops = { + .owner = THIS_MODULE, + .open = x25_seq_route_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static struct proc_dir_entry *x25_proc_dir; + +int __init x25_proc_init(void) +{ + struct proc_dir_entry *p; + int rc = -ENOMEM; + + x25_proc_dir = proc_mkdir("x25", proc_net); + if (!x25_proc_dir) + goto out; + + p = create_proc_entry("route", S_IRUGO, x25_proc_dir); + if (!p) + goto out_route; + p->proc_fops = &x25_seq_route_fops; + + p = create_proc_entry("socket", S_IRUGO, x25_proc_dir); + if (!p) + goto out_socket; + p->proc_fops = &x25_seq_socket_fops; + rc = 0; +out: + return rc; +out_socket: + remove_proc_entry("route", x25_proc_dir); +out_route: + remove_proc_entry("x25", proc_net); + goto out; +} + +void __exit x25_proc_exit(void) +{ + remove_proc_entry("route", x25_proc_dir); + remove_proc_entry("socket", x25_proc_dir); + remove_proc_entry("x25", proc_net); +} + +#else /* CONFIG_PROC_FS */ + +int __init x25_proc_init(void) +{ + return 0; +} + +void __exit x25_proc_exit(void) +{ +} +#endif /* CONFIG_PROC_FS */ diff --git a/net/x25/x25_route.c b/net/x25/x25_route.c new file mode 100644 index 000000000000..6c5d37517035 --- /dev/null +++ b/net/x25/x25_route.c @@ -0,0 +1,221 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + */ + +#include +#include +#include +#include + +struct list_head x25_route_list = LIST_HEAD_INIT(x25_route_list); +DEFINE_RWLOCK(x25_route_list_lock); + +/* + * Add a new route. 
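+ *
+ * An illustration of what the routine below stores (hypothetical values):
+ * the address is kept as a full 15-digit string padded with zeroes, of
+ * which only the first "sigdigits" are significant, so adding "2342" with
+ * sigdigits = 4 leaves
+ *
+ *	rt->address.x25_addr = "234200000000000"
+ *	rt->sigdigits        = 4
+ *
+ * and the duplicate check only compares those four digits together with
+ * the sigdigits count, which is why routes sharing a prefix may coexist
+ * as long as their sigdigits differ.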
+ */ +static int x25_add_route(struct x25_address *address, unsigned int sigdigits, + struct net_device *dev) +{ + struct x25_route *rt; + struct list_head *entry; + int rc = -EINVAL; + + write_lock_bh(&x25_route_list_lock); + + list_for_each(entry, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (!memcmp(&rt->address, address, sigdigits) && + rt->sigdigits == sigdigits) + goto out; + } + + rt = kmalloc(sizeof(*rt), GFP_ATOMIC); + rc = -ENOMEM; + if (!rt) + goto out; + + strcpy(rt->address.x25_addr, "000000000000000"); + memcpy(rt->address.x25_addr, address->x25_addr, sigdigits); + + rt->sigdigits = sigdigits; + rt->dev = dev; + atomic_set(&rt->refcnt, 1); + + list_add(&rt->node, &x25_route_list); + rc = 0; +out: + write_unlock_bh(&x25_route_list_lock); + return rc; +} + +/** + * __x25_remove_route - remove route from x25_route_list + * @rt - route to remove + * + * Remove route from x25_route_list. If it was there. + * Caller must hold x25_route_list_lock. + */ +static void __x25_remove_route(struct x25_route *rt) +{ + if (rt->node.next) { + list_del(&rt->node); + x25_route_put(rt); + } +} + +static int x25_del_route(struct x25_address *address, unsigned int sigdigits, + struct net_device *dev) +{ + struct x25_route *rt; + struct list_head *entry; + int rc = -EINVAL; + + write_lock_bh(&x25_route_list_lock); + + list_for_each(entry, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (!memcmp(&rt->address, address, sigdigits) && + rt->sigdigits == sigdigits && rt->dev == dev) { + __x25_remove_route(rt); + rc = 0; + break; + } + } + + write_unlock_bh(&x25_route_list_lock); + return rc; +} + +/* + * A device has been removed, remove its routes. + */ +void x25_route_device_down(struct net_device *dev) +{ + struct x25_route *rt; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_route_list_lock); + + list_for_each_safe(entry, tmp, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (rt->dev == dev) + __x25_remove_route(rt); + } + write_unlock_bh(&x25_route_list_lock); +} + +/* + * Check that the device given is a valid X.25 interface that is "up". + */ +struct net_device *x25_dev_get(char *devname) +{ + struct net_device *dev = dev_get_by_name(devname); + + if (dev && + (!(dev->flags & IFF_UP) || (dev->type != ARPHRD_X25 +#if defined(CONFIG_LLC) || defined(CONFIG_LLC_MODULE) + && dev->type != ARPHRD_ETHER +#endif + ))) + dev_put(dev); + + return dev; +} + +/** + * x25_get_route - Find a route given an X.25 address. + * @addr - address to find a route for + * + * Find a route given an X.25 address. + */ +struct x25_route *x25_get_route(struct x25_address *addr) +{ + struct x25_route *rt, *use = NULL; + struct list_head *entry; + + read_lock_bh(&x25_route_list_lock); + + list_for_each(entry, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + + if (!memcmp(&rt->address, addr, rt->sigdigits)) { + if (!use) + use = rt; + else if (rt->sigdigits > use->sigdigits) + use = rt; + } + } + + if (use) + x25_route_hold(use); + + read_unlock_bh(&x25_route_list_lock); + return use; +} + +/* + * Handle the ioctls that control the routing functions. 
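+ *
+ * A sketch of how user space reaches the handler below (the field names
+ * come from the code; the values and the descriptor used for the ioctl
+ * are illustrative):
+ *
+ *	struct x25_route_struct rt;
+ *
+ *	memset(&rt, 0, sizeof(rt));
+ *	strcpy(rt.address.x25_addr, "2342");	// hypothetical address prefix
+ *	rt.sigdigits = 4;
+ *	strcpy(rt.device, "eth0");		// hypothetical interface name
+ *	ioctl(x25_fd, SIOCADDRT, &rt);		// SIOCDELRT removes it again
+ *
+ * The handler rejects sigdigits outside 0..15 and any device that is not
+ * an X.25 capable interface that is up.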
+ */ +int x25_route_ioctl(unsigned int cmd, void __user *arg) +{ + struct x25_route_struct rt; + struct net_device *dev; + int rc = -EINVAL; + + if (cmd != SIOCADDRT && cmd != SIOCDELRT) + goto out; + + rc = -EFAULT; + if (copy_from_user(&rt, arg, sizeof(rt))) + goto out; + + rc = -EINVAL; + if (rt.sigdigits < 0 || rt.sigdigits > 15) + goto out; + + dev = x25_dev_get(rt.device); + if (!dev) + goto out; + + if (cmd == SIOCADDRT) + rc = x25_add_route(&rt.address, rt.sigdigits, dev); + else + rc = x25_del_route(&rt.address, rt.sigdigits, dev); + dev_put(dev); +out: + return rc; +} + +/* + * Release all memory associated with X.25 routing structures. + */ +void __exit x25_route_free(void) +{ + struct x25_route *rt; + struct list_head *entry, *tmp; + + write_lock_bh(&x25_route_list_lock); + list_for_each_safe(entry, tmp, &x25_route_list) { + rt = list_entry(entry, struct x25_route, node); + __x25_remove_route(rt); + } + write_unlock_bh(&x25_route_list_lock); +} diff --git a/net/x25/x25_subr.c b/net/x25/x25_subr.c new file mode 100644 index 000000000000..183fea3bba67 --- /dev/null +++ b/net/x25/x25_subr.c @@ -0,0 +1,374 @@ +/* + * X.25 Packet Layer release 002 + * + * This is ALPHA test software. This code may break your machine, + * randomly fail to work with new releases, misbehave and/or generally + * screw up. It might even work. + * + * This code REQUIRES 2.1.15 or higher + * + * This module: + * This module is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * History + * X.25 001 Jonathan Naylor Started coding. + * X.25 002 Jonathan Naylor Centralised disconnection processing. + * mar/20/00 Daniela Squassoni Disabling/enabling of facilities + * negotiation. + * jun/24/01 Arnaldo C. Melo use skb_queue_purge, cleanups + */ + +#include +#include +#include +#include +#include +#include + +/* + * This routine purges all of the queues of frames. + */ +void x25_clear_queues(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + skb_queue_purge(&sk->sk_write_queue); + skb_queue_purge(&x25->ack_queue); + skb_queue_purge(&x25->interrupt_in_queue); + skb_queue_purge(&x25->interrupt_out_queue); + skb_queue_purge(&x25->fragment_queue); +} + + +/* + * This routine purges the input queue of those frames that have been + * acknowledged. This replaces the boxes labelled "V(a) <- N(r)" on the + * SDL diagram. +*/ +void x25_frames_acked(struct sock *sk, unsigned short nr) +{ + struct sk_buff *skb; + struct x25_sock *x25 = x25_sk(sk); + int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS; + + /* + * Remove all the ack-ed frames from the ack queue. + */ + if (x25->va != nr) + while (skb_peek(&x25->ack_queue) && x25->va != nr) { + skb = skb_dequeue(&x25->ack_queue); + kfree_skb(skb); + x25->va = (x25->va + 1) % modulus; + } +} + +void x25_requeue_frames(struct sock *sk) +{ + struct sk_buff *skb, *skb_prev = NULL; + + /* + * Requeue all the un-ack-ed frames on the output queue to be picked + * up by x25_kick. This arrangement handles the possibility of an empty + * output queue. + */ + while ((skb = skb_dequeue(&x25_sk(sk)->ack_queue)) != NULL) { + if (!skb_prev) + skb_queue_head(&sk->sk_write_queue, skb); + else + skb_append(skb_prev, skb); + skb_prev = skb; + } +} + +/* + * Validate that the value of nr is between va and vs. Return true or + * false for testing. 
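+ *
+ * Worked example for the check below (illustrative window): with standard
+ * modulo-8 sequencing, va = 6 and vs = 1 describe a send window that has
+ * wrapped, so the loop accepts nr in {6, 7, 0, 1}:
+ *
+ *	vc: 6 -> 7 -> 0 -> 1 (== vs), stop
+ *
+ * in other words any N(R) between V(A) and V(S) inclusive, taken modulo
+ * the sequence number space, is treated as valid.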
+ */ +int x25_validate_nr(struct sock *sk, unsigned short nr) +{ + struct x25_sock *x25 = x25_sk(sk); + unsigned short vc = x25->va; + int modulus = x25->neighbour->extended ? X25_EMODULUS : X25_SMODULUS; + + while (vc != x25->vs) { + if (nr == vc) + return 1; + vc = (vc + 1) % modulus; + } + + return nr == x25->vs ? 1 : 0; +} + +/* + * This routine is called when the packet layer internally generates a + * control frame. + */ +void x25_write_internal(struct sock *sk, int frametype) +{ + struct x25_sock *x25 = x25_sk(sk); + struct sk_buff *skb; + unsigned char *dptr; + unsigned char facilities[X25_MAX_FAC_LEN]; + unsigned char addresses[1 + X25_ADDR_LEN]; + unsigned char lci1, lci2; + /* + * Default safe frame size. + */ + int len = X25_MAX_L2_LEN + X25_EXT_MIN_LEN; + + /* + * Adjust frame size. + */ + switch (frametype) { + case X25_CALL_REQUEST: + len += 1 + X25_ADDR_LEN + X25_MAX_FAC_LEN + + X25_MAX_CUD_LEN; + break; + case X25_CALL_ACCEPTED: + len += 1 + X25_MAX_FAC_LEN + X25_MAX_CUD_LEN; + break; + case X25_CLEAR_REQUEST: + case X25_RESET_REQUEST: + len += 2; + break; + case X25_RR: + case X25_RNR: + case X25_REJ: + case X25_CLEAR_CONFIRMATION: + case X25_INTERRUPT_CONFIRMATION: + case X25_RESET_CONFIRMATION: + break; + default: + printk(KERN_ERR "X.25: invalid frame type %02X\n", + frametype); + return; + } + + if ((skb = alloc_skb(len, GFP_ATOMIC)) == NULL) + return; + + /* + * Space for Ethernet and 802.2 LLC headers. + */ + skb_reserve(skb, X25_MAX_L2_LEN); + + /* + * Make space for the GFI and LCI, and fill them in. + */ + dptr = skb_put(skb, 2); + + lci1 = (x25->lci >> 8) & 0x0F; + lci2 = (x25->lci >> 0) & 0xFF; + + if (x25->neighbour->extended) { + *dptr++ = lci1 | X25_GFI_EXTSEQ; + *dptr++ = lci2; + } else { + *dptr++ = lci1 | X25_GFI_STDSEQ; + *dptr++ = lci2; + } + + /* + * Now fill in the frame type specific information. 
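+ *
+ * For orientation (illustrative layout, restating what the code below
+ * writes): the two GFI/LCI octets are already in place, and the switch
+ * appends the type-specific part.  A standard-modulus RR on LCI 1 with
+ * V(R) = 5 therefore ends up as
+ *
+ *	octet 0: X25_GFI_STDSEQ | 0x00    (GFI, high nibble of the LCI)
+ *	octet 1: 0x01                     (low byte of the LCI)
+ *	octet 2: X25_RR | (5 << 5)        (type with N(R) in bits 5-7)
+ *
+ * while extended (modulo-128) mode adds a further octet carrying N(R) in
+ * bits 1-7, which is why the RR/RNR/REJ branch reserves two bytes when
+ * x25->neighbour->extended is set.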
+ */ + switch (frametype) { + + case X25_CALL_REQUEST: + dptr = skb_put(skb, 1); + *dptr++ = X25_CALL_REQUEST; + len = x25_addr_aton(addresses, &x25->dest_addr, + &x25->source_addr); + dptr = skb_put(skb, len); + memcpy(dptr, addresses, len); + len = x25_create_facilities(facilities, + &x25->facilities, + x25->neighbour->global_facil_mask); + dptr = skb_put(skb, len); + memcpy(dptr, facilities, len); + dptr = skb_put(skb, x25->calluserdata.cudlength); + memcpy(dptr, x25->calluserdata.cuddata, + x25->calluserdata.cudlength); + x25->calluserdata.cudlength = 0; + break; + + case X25_CALL_ACCEPTED: + dptr = skb_put(skb, 2); + *dptr++ = X25_CALL_ACCEPTED; + *dptr++ = 0x00; /* Address lengths */ + len = x25_create_facilities(facilities, + &x25->facilities, + x25->vc_facil_mask); + dptr = skb_put(skb, len); + memcpy(dptr, facilities, len); + dptr = skb_put(skb, x25->calluserdata.cudlength); + memcpy(dptr, x25->calluserdata.cuddata, + x25->calluserdata.cudlength); + x25->calluserdata.cudlength = 0; + break; + + case X25_CLEAR_REQUEST: + case X25_RESET_REQUEST: + dptr = skb_put(skb, 3); + *dptr++ = frametype; + *dptr++ = 0x00; /* XXX */ + *dptr++ = 0x00; /* XXX */ + break; + + case X25_RR: + case X25_RNR: + case X25_REJ: + if (x25->neighbour->extended) { + dptr = skb_put(skb, 2); + *dptr++ = frametype; + *dptr++ = (x25->vr << 1) & 0xFE; + } else { + dptr = skb_put(skb, 1); + *dptr = frametype; + *dptr++ |= (x25->vr << 5) & 0xE0; + } + break; + + case X25_CLEAR_CONFIRMATION: + case X25_INTERRUPT_CONFIRMATION: + case X25_RESET_CONFIRMATION: + dptr = skb_put(skb, 1); + *dptr = frametype; + break; + } + + x25_transmit_link(skb, x25->neighbour); +} + +/* + * Unpick the contents of the passed X.25 Packet Layer frame. + */ +int x25_decode(struct sock *sk, struct sk_buff *skb, int *ns, int *nr, int *q, + int *d, int *m) +{ + struct x25_sock *x25 = x25_sk(sk); + unsigned char *frame = skb->data; + + *ns = *nr = *q = *d = *m = 0; + + switch (frame[2]) { + case X25_CALL_REQUEST: + case X25_CALL_ACCEPTED: + case X25_CLEAR_REQUEST: + case X25_CLEAR_CONFIRMATION: + case X25_INTERRUPT: + case X25_INTERRUPT_CONFIRMATION: + case X25_RESET_REQUEST: + case X25_RESET_CONFIRMATION: + case X25_RESTART_REQUEST: + case X25_RESTART_CONFIRMATION: + case X25_REGISTRATION_REQUEST: + case X25_REGISTRATION_CONFIRMATION: + case X25_DIAGNOSTIC: + return frame[2]; + } + + if (x25->neighbour->extended) { + if (frame[2] == X25_RR || + frame[2] == X25_RNR || + frame[2] == X25_REJ) { + *nr = (frame[3] >> 1) & 0x7F; + return frame[2]; + } + } else { + if ((frame[2] & 0x1F) == X25_RR || + (frame[2] & 0x1F) == X25_RNR || + (frame[2] & 0x1F) == X25_REJ) { + *nr = (frame[2] >> 5) & 0x07; + return frame[2] & 0x1F; + } + } + + if (x25->neighbour->extended) { + if ((frame[2] & 0x01) == X25_DATA) { + *q = (frame[0] & X25_Q_BIT) == X25_Q_BIT; + *d = (frame[0] & X25_D_BIT) == X25_D_BIT; + *m = (frame[3] & X25_EXT_M_BIT) == X25_EXT_M_BIT; + *nr = (frame[3] >> 1) & 0x7F; + *ns = (frame[2] >> 1) & 0x7F; + return X25_DATA; + } + } else { + if ((frame[2] & 0x01) == X25_DATA) { + *q = (frame[0] & X25_Q_BIT) == X25_Q_BIT; + *d = (frame[0] & X25_D_BIT) == X25_D_BIT; + *m = (frame[2] & X25_STD_M_BIT) == X25_STD_M_BIT; + *nr = (frame[2] >> 5) & 0x07; + *ns = (frame[2] >> 1) & 0x07; + return X25_DATA; + } + } + + printk(KERN_DEBUG "X.25: invalid PLP frame %02X %02X %02X\n", + frame[0], frame[1], frame[2]); + + return X25_ILLEGAL; +} + +void x25_disconnect(struct sock *sk, int reason, unsigned char cause, + unsigned char diagnostic) +{ + struct x25_sock 
*x25 = x25_sk(sk);
+
+ x25_clear_queues(sk);
+ x25_stop_timer(sk);
+
+ x25->lci = 0;
+ x25->state = X25_STATE_0;
+
+ x25->causediag.cause = cause;
+ x25->causediag.diagnostic = diagnostic;
+
+ sk->sk_state = TCP_CLOSE;
+ sk->sk_err = reason;
+ sk->sk_shutdown |= SEND_SHUTDOWN;
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_state_change(sk);
+ sock_set_flag(sk, SOCK_DEAD);
+ }
+}
+
+/*
+ * Clear an own-rx-busy condition and tell the peer about this, provided
+ * that there is a significant amount of free receive buffer space available.
+ */
+void x25_check_rbuf(struct sock *sk)
+{
+ struct x25_sock *x25 = x25_sk(sk);
+
+ if (atomic_read(&sk->sk_rmem_alloc) < (sk->sk_rcvbuf / 2) &&
+ (x25->condition & X25_COND_OWN_RX_BUSY)) {
+ x25->condition &= ~X25_COND_OWN_RX_BUSY;
+ x25->condition &= ~X25_COND_ACK_PENDING;
+ x25->vl = x25->vr;
+ x25_write_internal(sk, X25_RR);
+ x25_stop_timer(sk);
+ }
+}
+
+/*
+ * Compare 2 calluserdata structures, used to find correct listening sockets
+ * when call user data is used.
+ */
+int x25_check_calluserdata(struct x25_calluserdata *ours, struct x25_calluserdata *theirs)
+{
+ int i;
+ if (ours->cudlength != theirs->cudlength)
+ return 0;
+
+ for (i = 0; i < ours->cudlength; i++) {
+ if (ours->cuddata[i] != theirs->cuddata[i]) {
+ return 0;
+ }
+ }
+ return 1;
+}
+
diff --git a/net/x25/x25_timer.c b/net/x25/x25_timer.c
new file mode 100644
index 000000000000..d6a21a3ad80e
--- /dev/null
+++ b/net/x25/x25_timer.c
@@ -0,0 +1,176 @@
+/*
+ * X.25 Packet Layer release 002
+ *
+ * This is ALPHA test software. This code may break your machine,
+ * randomly fail to work with new releases, misbehave and/or generally
+ * screw up. It might even work.
+ *
+ * This code REQUIRES 2.1.15 or higher
+ *
+ * This module:
+ * This module is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * History
+ * X.25 001 Jonathan Naylor Started coding.
+ * X.25 002 Jonathan Naylor New timer architecture.
+ * Centralised disconnection processing.
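+ *
+ * Overview, for illustration: each socket owns two timers.  sk->sk_timer
+ * is re-armed every five seconds as a heartbeat that reaps sockets which
+ * can now be destroyed and re-checks the receive buffer in state 3, while
+ * x25->timer is a single timer multiplexed over T2/T21/T22/T23 according
+ * to the protocol state and armed with the usual pattern
+ *
+ *	mod_timer(&x25->timer, jiffies + x25->t21);
+ *
+ * x25_display_timer() reports the remaining time as (expires - jiffies)
+ * for the /proc output while the timer is pending.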
+ */ + +#include +#include +#include +#include +#include +#include + +static void x25_heartbeat_expiry(unsigned long); +static void x25_timer_expiry(unsigned long); + +void x25_init_timers(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + init_timer(&x25->timer); + x25->timer.data = (unsigned long)sk; + x25->timer.function = &x25_timer_expiry; + + /* initialized by sock_init_data */ + sk->sk_timer.data = (unsigned long)sk; + sk->sk_timer.function = &x25_heartbeat_expiry; +} + +void x25_start_heartbeat(struct sock *sk) +{ + mod_timer(&sk->sk_timer, jiffies + 5 * HZ); +} + +void x25_stop_heartbeat(struct sock *sk) +{ + del_timer(&sk->sk_timer); +} + +void x25_start_t2timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t2); +} + +void x25_start_t21timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t21); +} + +void x25_start_t22timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t22); +} + +void x25_start_t23timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + mod_timer(&x25->timer, jiffies + x25->t23); +} + +void x25_stop_timer(struct sock *sk) +{ + del_timer(&x25_sk(sk)->timer); +} + +unsigned long x25_display_timer(struct sock *sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + if (!timer_pending(&x25->timer)) + return 0; + + return x25->timer.expires - jiffies; +} + +static void x25_heartbeat_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) /* can currently only occur in state 3 */ + goto restart_heartbeat; + + switch (x25_sk(sk)->state) { + + case X25_STATE_0: + /* + * Magic here: If we listen() and a new link dies + * before it is accepted() it isn't 'dead' so doesn't + * get removed. + */ + if (sock_flag(sk, SOCK_DESTROY) || + (sk->sk_state == TCP_LISTEN && + sock_flag(sk, SOCK_DEAD))) { + x25_destroy_socket(sk); + goto unlock; + } + break; + + case X25_STATE_3: + /* + * Check for the state of the receive buffer. + */ + x25_check_rbuf(sk); + break; + } +restart_heartbeat: + x25_start_heartbeat(sk); +unlock: + bh_unlock_sock(sk); +} + +/* + * Timer has expired, it may have been T2, T21, T22, or T23. We can tell + * by the state machine state. 
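+ *
+ * Restating the mapping used below (labels illustrative):
+ *
+ *	state 3 + ACK pending -> T2 : send the delayed RR/RNR acknowledgement
+ *	state 1               -> T21: give up, send a Clear Request, arm T23
+ *	state 4               -> T22: likewise fall back to a Clear Request
+ *	state 2               -> T23: declare the call dead with ETIMEDOUT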
+ */ +static inline void x25_do_timer_expiry(struct sock * sk) +{ + struct x25_sock *x25 = x25_sk(sk); + + switch (x25->state) { + + case X25_STATE_3: /* T2 */ + if (x25->condition & X25_COND_ACK_PENDING) { + x25->condition &= ~X25_COND_ACK_PENDING; + x25_enquiry_response(sk); + } + break; + + case X25_STATE_1: /* T21 */ + case X25_STATE_4: /* T22 */ + x25_write_internal(sk, X25_CLEAR_REQUEST); + x25->state = X25_STATE_2; + x25_start_t23timer(sk); + break; + + case X25_STATE_2: /* T23 */ + x25_disconnect(sk, ETIMEDOUT, 0, 0); + break; + } +} + +static void x25_timer_expiry(unsigned long param) +{ + struct sock *sk = (struct sock *)param; + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) { /* can currently only occur in state 3 */ + if (x25_sk(sk)->state == X25_STATE_3) + x25_start_t2timer(sk); + } else + x25_do_timer_expiry(sk); + bh_unlock_sock(sk); +} diff --git a/net/xfrm/Kconfig b/net/xfrm/Kconfig new file mode 100644 index 000000000000..58ca6a972c48 --- /dev/null +++ b/net/xfrm/Kconfig @@ -0,0 +1,12 @@ +# +# XFRM configuration +# +config XFRM_USER + tristate "IPsec user configuration interface" + depends on INET && XFRM + ---help--- + Support for IPsec user configuration interface used + by native Linux tools. + + If unsure, say Y. + diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile new file mode 100644 index 000000000000..693aac1aa833 --- /dev/null +++ b/net/xfrm/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the XFRM subsystem. +# + +obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_input.o xfrm_algo.o +obj-$(CONFIG_XFRM_USER) += xfrm_user.o + diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c new file mode 100644 index 000000000000..080aae243ce0 --- /dev/null +++ b/net/xfrm/xfrm_algo.c @@ -0,0 +1,729 @@ +/* + * xfrm algorithm interface + * + * Copyright (c) 2002 James Morris + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + */ + +#include +#include +#include +#include +#include +#include +#if defined(CONFIG_INET_AH) || defined(CONFIG_INET_AH_MODULE) || defined(CONFIG_INET6_AH) || defined(CONFIG_INET6_AH_MODULE) +#include +#endif +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) +#include +#endif +#include + +/* + * Algorithms supported by IPsec. These entries contain properties which + * are used in key negotiation and xfrm processing, and are used to verify + * that instantiated crypto transforms have correct parameters for IPsec + * purposes. 
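+ *
+ * As a reading aid (illustration, not part of the original comment): each
+ * entry pairs the crypto layer name with its pfkey SADB identifier and
+ * the properties key negotiation needs, e.g. the "sha1" entry advertises
+ * a 160-bit HMAC truncated to a 96-bit ICV.  Entries are usually resolved
+ * through the lookup helpers defined later in this file, for instance
+ *
+ *	struct xfrm_algo_desc *d = xfrm_aalg_get_byname("sha1", 1);
+ *
+ * where probe = 1 asks crypto_alg_available() before the entry is
+ * reported as usable.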
+ */ +static struct xfrm_algo_desc aalg_list[] = { +{ + .name = "digest_null", + + .uinfo = { + .auth = { + .icv_truncbits = 0, + .icv_fullbits = 0, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_NULL, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 0, + .sadb_alg_maxbits = 0 + } +}, +{ + .name = "md5", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_AALG_MD5HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 128 + } +}, +{ + .name = "sha1", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 160, + } + }, + + .desc = { + .sadb_alg_id = SADB_AALG_SHA1HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 160 + } +}, +{ + .name = "sha256", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 256, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_SHA2_256HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 256, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "ripemd160", + + .uinfo = { + .auth = { + .icv_truncbits = 96, + .icv_fullbits = 160, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_AALG_RIPEMD160HMAC, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 160, + .sadb_alg_maxbits = 160 + } +}, +}; + +static struct xfrm_algo_desc ealg_list[] = { +{ + .name = "cipher_null", + + .uinfo = { + .encr = { + .blockbits = 8, + .defkeybits = 0, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_NULL, + .sadb_alg_ivlen = 0, + .sadb_alg_minbits = 0, + .sadb_alg_maxbits = 0 + } +}, +{ + .name = "des", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 64, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_DESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 64, + .sadb_alg_maxbits = 64 + } +}, +{ + .name = "des3_ede", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 192, + } + }, + + .desc = { + .sadb_alg_id = SADB_EALG_3DESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 192, + .sadb_alg_maxbits = 192 + } +}, +{ + .name = "cast128", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_CASTCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 40, + .sadb_alg_maxbits = 128 + } +}, +{ + .name = "blowfish", + + .uinfo = { + .encr = { + .blockbits = 64, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_BLOWFISHCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 40, + .sadb_alg_maxbits = 448 + } +}, +{ + .name = "aes", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_AESCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +{ + .name = "serpent", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_SERPENTCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256, + } +}, +{ + .name = "twofish", + + .uinfo = { + .encr = { + .blockbits = 128, + .defkeybits = 128, + } + }, + + .desc = { + .sadb_alg_id = SADB_X_EALG_TWOFISHCBC, + .sadb_alg_ivlen = 8, + .sadb_alg_minbits = 128, + .sadb_alg_maxbits = 256 + } +}, +}; + +static struct xfrm_algo_desc calg_list[] = { +{ + .name = "deflate", + .uinfo = { + .comp = { + .threshold = 90, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_DEFLATE } +}, +{ + .name = "lzs", + .uinfo = { + .comp = { + .threshold = 90, + } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_LZS } +}, +{ + .name = "lzjh", + .uinfo = { + .comp = { + .threshold = 50, 
+ } + }, + .desc = { .sadb_alg_id = SADB_X_CALG_LZJH } +}, +}; + +static inline int aalg_entries(void) +{ + return ARRAY_SIZE(aalg_list); +} + +static inline int ealg_entries(void) +{ + return ARRAY_SIZE(ealg_list); +} + +static inline int calg_entries(void) +{ + return ARRAY_SIZE(calg_list); +} + +/* Todo: generic iterators */ +struct xfrm_algo_desc *xfrm_aalg_get_byid(int alg_id) +{ + int i; + + for (i = 0; i < aalg_entries(); i++) { + if (aalg_list[i].desc.sadb_alg_id == alg_id) { + if (aalg_list[i].available) + return &aalg_list[i]; + else + break; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byid); + +struct xfrm_algo_desc *xfrm_ealg_get_byid(int alg_id) +{ + int i; + + for (i = 0; i < ealg_entries(); i++) { + if (ealg_list[i].desc.sadb_alg_id == alg_id) { + if (ealg_list[i].available) + return &ealg_list[i]; + else + break; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byid); + +struct xfrm_algo_desc *xfrm_calg_get_byid(int alg_id) +{ + int i; + + for (i = 0; i < calg_entries(); i++) { + if (calg_list[i].desc.sadb_alg_id == alg_id) { + if (calg_list[i].available) + return &calg_list[i]; + else + break; + } + } + return NULL; +} +EXPORT_SYMBOL_GPL(xfrm_calg_get_byid); + +static struct xfrm_algo_desc *xfrm_get_byname(struct xfrm_algo_desc *list, + int entries, char *name, + int probe) +{ + int i, status; + + if (!name) + return NULL; + + for (i = 0; i < entries; i++) { + if (strcmp(name, list[i].name)) + continue; + + if (list[i].available) + return &list[i]; + + if (!probe) + break; + + status = crypto_alg_available(name, 0); + if (!status) + break; + + list[i].available = status; + return &list[i]; + } + return NULL; +} + +struct xfrm_algo_desc *xfrm_aalg_get_byname(char *name, int probe) +{ + return xfrm_get_byname(aalg_list, aalg_entries(), name, probe); +} +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byname); + +struct xfrm_algo_desc *xfrm_ealg_get_byname(char *name, int probe) +{ + return xfrm_get_byname(ealg_list, ealg_entries(), name, probe); +} +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byname); + +struct xfrm_algo_desc *xfrm_calg_get_byname(char *name, int probe) +{ + return xfrm_get_byname(calg_list, calg_entries(), name, probe); +} +EXPORT_SYMBOL_GPL(xfrm_calg_get_byname); + +struct xfrm_algo_desc *xfrm_aalg_get_byidx(unsigned int idx) +{ + if (idx >= aalg_entries()) + return NULL; + + return &aalg_list[idx]; +} +EXPORT_SYMBOL_GPL(xfrm_aalg_get_byidx); + +struct xfrm_algo_desc *xfrm_ealg_get_byidx(unsigned int idx) +{ + if (idx >= ealg_entries()) + return NULL; + + return &ealg_list[idx]; +} +EXPORT_SYMBOL_GPL(xfrm_ealg_get_byidx); + +/* + * Probe for the availability of crypto algorithms, and set the available + * flag for any algorithms found on the system. This is typically called by + * pfkey during userspace SA add, update or register. 
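+ *
+ * Typical usage, as a sketch of a pfkey-style caller (not code from this
+ * file): refresh the availability flags and then report how many
+ * transforms the kernel can actually instantiate:
+ *
+ *	xfrm_probe_algs();
+ *	int n_auth = xfrm_count_auth_supported();
+ *	int n_enc  = xfrm_count_enc_supported();
+ *
+ * Only entries whose available flag is set are returned by the by-id and
+ * by-name lookups above, unless the by-name lookup is asked to probe.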
+ */ +void xfrm_probe_algs(void) +{ +#ifdef CONFIG_CRYPTO + int i, status; + + BUG_ON(in_softirq()); + + for (i = 0; i < aalg_entries(); i++) { + status = crypto_alg_available(aalg_list[i].name, 0); + if (aalg_list[i].available != status) + aalg_list[i].available = status; + } + + for (i = 0; i < ealg_entries(); i++) { + status = crypto_alg_available(ealg_list[i].name, 0); + if (ealg_list[i].available != status) + ealg_list[i].available = status; + } + + for (i = 0; i < calg_entries(); i++) { + status = crypto_alg_available(calg_list[i].name, 0); + if (calg_list[i].available != status) + calg_list[i].available = status; + } +#endif +} +EXPORT_SYMBOL_GPL(xfrm_probe_algs); + +int xfrm_count_auth_supported(void) +{ + int i, n; + + for (i = 0, n = 0; i < aalg_entries(); i++) + if (aalg_list[i].available) + n++; + return n; +} +EXPORT_SYMBOL_GPL(xfrm_count_auth_supported); + +int xfrm_count_enc_supported(void) +{ + int i, n; + + for (i = 0, n = 0; i < ealg_entries(); i++) + if (ealg_list[i].available) + n++; + return n; +} +EXPORT_SYMBOL_GPL(xfrm_count_enc_supported); + +/* Move to common area: it is shared with AH. */ + +void skb_icv_walk(const struct sk_buff *skb, struct crypto_tfm *tfm, + int offset, int len, icv_update_fn_t icv_update) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct scatterlist sg; + + /* Checksum header. */ + if (copy > 0) { + if (copy > len) + copy = len; + + sg.page = virt_to_page(skb->data + offset); + sg.offset = (unsigned long)(skb->data + offset) % PAGE_SIZE; + sg.length = copy; + + icv_update(tfm, &sg, 1); + + if ((len -= copy) == 0) + return; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + + sg.page = frag->page; + sg.offset = frag->page_offset + offset-start; + sg.length = copy; + + icv_update(tfm, &sg, 1); + + if (!(len -= copy)) + return; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + skb_icv_walk(list, tfm, offset-start, copy, icv_update); + if ((len -= copy) == 0) + return; + offset += copy; + } + start = end; + } + } + if (len) + BUG(); +} +EXPORT_SYMBOL_GPL(skb_icv_walk); + +#if defined(CONFIG_INET_ESP) || defined(CONFIG_INET_ESP_MODULE) || defined(CONFIG_INET6_ESP) || defined(CONFIG_INET6_ESP_MODULE) + +/* Looking generic it is not used in another places. 
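+ *
+ * For illustration: skb_to_sgvec() below flattens an skb into a
+ * scatterlist the crypto layer can consume - one element for the linear
+ * head, one per page fragment, then a recursive pass over the frag_list.
+ * A caller is expected to size the array from the element count returned
+ * by skb_cow_data(), roughly
+ *
+ *	int nfrags = skb_cow_data(skb, trailer_len, &trailer);
+ *	// then load nfrags scatterlist entries via skb_to_sgvec()
+ *
+ * (nfrags, trailer_len and trailer are illustrative names), with
+ * skb_cow_data() also guaranteeing the data is private and writable
+ * before it is transformed in place.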
*/ + +int +skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + int elt = 0; + + if (copy > 0) { + if (copy > len) + copy = len; + sg[elt].page = virt_to_page(skb->data + offset); + sg[elt].offset = (unsigned long)(skb->data + offset) % PAGE_SIZE; + sg[elt].length = copy; + elt++; + if ((len -= copy) == 0) + return elt; + offset += copy; + } + + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + if (copy > len) + copy = len; + sg[elt].page = frag->page; + sg[elt].offset = frag->page_offset+offset-start; + sg[elt].length = copy; + elt++; + if (!(len -= copy)) + return elt; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + elt += skb_to_sgvec(list, sg+elt, offset - start, copy); + if ((len -= copy) == 0) + return elt; + offset += copy; + } + start = end; + } + } + if (len) + BUG(); + return elt; +} +EXPORT_SYMBOL_GPL(skb_to_sgvec); + +/* Check that skb data bits are writable. If they are not, copy data + * to newly created private area. If "tailbits" is given, make sure that + * tailbits bytes beyond current end of skb are writable. + * + * Returns amount of elements of scatterlist to load for subsequent + * transformations and pointer to writable trailer skb. + */ + +int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) +{ + int copyflag; + int elt; + struct sk_buff *skb1, **skb_p; + + /* If skb is cloned or its head is paged, reallocate + * head pulling out all the pages (pages are considered not writable + * at the moment even if they are anonymous). + */ + if ((skb_cloned(skb) || skb_shinfo(skb)->nr_frags) && + __pskb_pull_tail(skb, skb_pagelen(skb)-skb_headlen(skb)) == NULL) + return -ENOMEM; + + /* Easy case. Most of packets will go this way. */ + if (!skb_shinfo(skb)->frag_list) { + /* A little of trouble, not enough of space for trailer. + * This should not happen, when stack is tuned to generate + * good frames. OK, on miss we reallocate and reserve even more + * space, 128 bytes is fair. */ + + if (skb_tailroom(skb) < tailbits && + pskb_expand_head(skb, 0, tailbits-skb_tailroom(skb)+128, GFP_ATOMIC)) + return -ENOMEM; + + /* Voila! */ + *trailer = skb; + return 1; + } + + /* Misery. We are in troubles, going to mincer fragments... */ + + elt = 1; + skb_p = &skb_shinfo(skb)->frag_list; + copyflag = 0; + + while ((skb1 = *skb_p) != NULL) { + int ntail = 0; + + /* The fragment is partially pulled by someone, + * this can happen on input. Copy it and everything + * after it. */ + + if (skb_shared(skb1)) + copyflag = 1; + + /* If the skb is the last, worry about trailer. */ + + if (skb1->next == NULL && tailbits) { + if (skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list || + skb_tailroom(skb1) < tailbits) + ntail = tailbits + 128; + } + + if (copyflag || + skb_cloned(skb1) || + ntail || + skb_shinfo(skb1)->nr_frags || + skb_shinfo(skb1)->frag_list) { + struct sk_buff *skb2; + + /* Fuck, we are miserable poor guys... 
*/ + if (ntail == 0) + skb2 = skb_copy(skb1, GFP_ATOMIC); + else + skb2 = skb_copy_expand(skb1, + skb_headroom(skb1), + ntail, + GFP_ATOMIC); + if (unlikely(skb2 == NULL)) + return -ENOMEM; + + if (skb1->sk) + skb_set_owner_w(skb, skb1->sk); + + /* Looking around. Are we still alive? + * OK, link new skb, drop old one */ + + skb2->next = skb1->next; + *skb_p = skb2; + kfree_skb(skb1); + skb1 = skb2; + } + elt++; + *trailer = skb1; + skb_p = &skb1->next; + } + + return elt; +} +EXPORT_SYMBOL_GPL(skb_cow_data); + +void *pskb_put(struct sk_buff *skb, struct sk_buff *tail, int len) +{ + if (tail != skb) { + skb->data_len += len; + skb->len += len; + } + return skb_put(tail, len); +} +EXPORT_SYMBOL_GPL(pskb_put); +#endif diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c new file mode 100644 index 000000000000..c58a6f05a0b6 --- /dev/null +++ b/net/xfrm/xfrm_input.c @@ -0,0 +1,89 @@ +/* + * xfrm_input.c + * + * Changes: + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific portion + * + */ + +#include +#include +#include +#include + +static kmem_cache_t *secpath_cachep; + +void __secpath_destroy(struct sec_path *sp) +{ + int i; + for (i = 0; i < sp->len; i++) + xfrm_state_put(sp->x[i].xvec); + kmem_cache_free(secpath_cachep, sp); +} +EXPORT_SYMBOL(__secpath_destroy); + +struct sec_path *secpath_dup(struct sec_path *src) +{ + struct sec_path *sp; + + sp = kmem_cache_alloc(secpath_cachep, SLAB_ATOMIC); + if (!sp) + return NULL; + + sp->len = 0; + if (src) { + int i; + + memcpy(sp, src, sizeof(*sp)); + for (i = 0; i < sp->len; i++) + xfrm_state_hold(sp->x[i].xvec); + } + atomic_set(&sp->refcnt, 1); + return sp; +} +EXPORT_SYMBOL(secpath_dup); + +/* Fetch spi and seq from ipsec header */ + +int xfrm_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) +{ + int offset, offset_seq; + + switch (nexthdr) { + case IPPROTO_AH: + offset = offsetof(struct ip_auth_hdr, spi); + offset_seq = offsetof(struct ip_auth_hdr, seq_no); + break; + case IPPROTO_ESP: + offset = offsetof(struct ip_esp_hdr, spi); + offset_seq = offsetof(struct ip_esp_hdr, seq_no); + break; + case IPPROTO_COMP: + if (!pskb_may_pull(skb, sizeof(struct ip_comp_hdr))) + return -EINVAL; + *spi = ntohl(ntohs(*(u16*)(skb->h.raw + 2))); + *seq = 0; + return 0; + default: + return 1; + } + + if (!pskb_may_pull(skb, 16)) + return -EINVAL; + + *spi = *(u32*)(skb->h.raw + offset); + *seq = *(u32*)(skb->h.raw + offset_seq); + return 0; +} +EXPORT_SYMBOL(xfrm_parse_spi); + +void __init xfrm_input_init(void) +{ + secpath_cachep = kmem_cache_create("secpath_cache", + sizeof(struct sec_path), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!secpath_cachep) + panic("XFRM: failed to allocate secpath_cache\n"); +} diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c new file mode 100644 index 000000000000..80828078733d --- /dev/null +++ b/net/xfrm/xfrm_policy.c @@ -0,0 +1,1367 @@ +/* + * xfrm_policy.c + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * Kazunori MIYAZAWA @USAGI + * YOSHIFUJI Hideaki + * Split up af-specific portion + * Derek Atkins Add the post_input processor + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DECLARE_MUTEX(xfrm_cfg_sem); +EXPORT_SYMBOL(xfrm_cfg_sem); + +static DEFINE_RWLOCK(xfrm_policy_lock); + +struct xfrm_policy *xfrm_policy_list[XFRM_POLICY_MAX*2]; +EXPORT_SYMBOL(xfrm_policy_list); + +static DEFINE_RWLOCK(xfrm_policy_afinfo_lock); +static struct 
xfrm_policy_afinfo *xfrm_policy_afinfo[NPROTO]; + +static kmem_cache_t *xfrm_dst_cache; + +static struct work_struct xfrm_policy_gc_work; +static struct list_head xfrm_policy_gc_list = + LIST_HEAD_INIT(xfrm_policy_gc_list); +static DEFINE_SPINLOCK(xfrm_policy_gc_lock); + +static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); +static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); + +int xfrm_register_type(struct xfrm_type *type, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + struct xfrm_type_map *typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_map; + + write_lock(&typemap->lock); + if (likely(typemap->map[type->proto] == NULL)) + typemap->map[type->proto] = type; + else + err = -EEXIST; + write_unlock(&typemap->lock); + xfrm_policy_put_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_type); + +int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + struct xfrm_type_map *typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_map; + + write_lock(&typemap->lock); + if (unlikely(typemap->map[type->proto] != type)) + err = -ENOENT; + else + typemap->map[type->proto] = NULL; + write_unlock(&typemap->lock); + xfrm_policy_put_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_type); + +struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_type_map *typemap; + struct xfrm_type *type; + int modload_attempted = 0; + +retry: + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + typemap = afinfo->type_map; + + read_lock(&typemap->lock); + type = typemap->map[proto]; + if (unlikely(type && !try_module_get(type->owner))) + type = NULL; + read_unlock(&typemap->lock); + if (!type && !modload_attempted) { + xfrm_policy_put_afinfo(afinfo); + request_module("xfrm-type-%d-%d", + (int) family, (int) proto); + modload_attempted = 1; + goto retry; + } + + xfrm_policy_put_afinfo(afinfo); + return type; +} +EXPORT_SYMBOL(xfrm_get_type); + +int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, + unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + if (likely(afinfo->dst_lookup != NULL)) + err = afinfo->dst_lookup(dst, fl); + else + err = -EINVAL; + xfrm_policy_put_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_dst_lookup); + +void xfrm_put_type(struct xfrm_type *type) +{ + module_put(type->owner); +} + +static inline unsigned long make_jiffies(long secs) +{ + if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) + return MAX_SCHEDULE_TIMEOUT-1; + else + return secs*HZ; +} + +static void xfrm_policy_timer(unsigned long data) +{ + struct xfrm_policy *xp = (struct xfrm_policy*)data; + unsigned long now = (unsigned long)xtime.tv_sec; + long next = LONG_MAX; + int warn = 0; + int dir; + + read_lock(&xp->lock); + + if (xp->dead) + goto out; + + dir = xp->index & 7; + + if (xp->lft.hard_add_expires_seconds) { + long tmo = xp->lft.hard_add_expires_seconds + + xp->curlft.add_time - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (xp->lft.hard_use_expires_seconds) { + long tmo = xp->lft.hard_use_expires_seconds + + (xp->curlft.use_time ? 
: xp->curlft.add_time) - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (xp->lft.soft_add_expires_seconds) { + long tmo = xp->lft.soft_add_expires_seconds + + xp->curlft.add_time - now; + if (tmo <= 0) { + warn = 1; + tmo = XFRM_KM_TIMEOUT; + } + if (tmo < next) + next = tmo; + } + if (xp->lft.soft_use_expires_seconds) { + long tmo = xp->lft.soft_use_expires_seconds + + (xp->curlft.use_time ? : xp->curlft.add_time) - now; + if (tmo <= 0) { + warn = 1; + tmo = XFRM_KM_TIMEOUT; + } + if (tmo < next) + next = tmo; + } + + if (warn) + km_policy_expired(xp, dir, 0); + if (next != LONG_MAX && + !mod_timer(&xp->timer, jiffies + make_jiffies(next))) + xfrm_pol_hold(xp); + +out: + read_unlock(&xp->lock); + xfrm_pol_put(xp); + return; + +expired: + read_unlock(&xp->lock); + km_policy_expired(xp, dir, 1); + xfrm_policy_delete(xp, dir); + xfrm_pol_put(xp); +} + + +/* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 + * SPD calls. + */ + +struct xfrm_policy *xfrm_policy_alloc(int gfp) +{ + struct xfrm_policy *policy; + + policy = kmalloc(sizeof(struct xfrm_policy), gfp); + + if (policy) { + memset(policy, 0, sizeof(struct xfrm_policy)); + atomic_set(&policy->refcnt, 1); + rwlock_init(&policy->lock); + init_timer(&policy->timer); + policy->timer.data = (unsigned long)policy; + policy->timer.function = xfrm_policy_timer; + } + return policy; +} +EXPORT_SYMBOL(xfrm_policy_alloc); + +/* Destroy xfrm_policy: descendant resources must be released to this moment. */ + +void __xfrm_policy_destroy(struct xfrm_policy *policy) +{ + if (!policy->dead) + BUG(); + + if (policy->bundles) + BUG(); + + if (del_timer(&policy->timer)) + BUG(); + + kfree(policy); +} +EXPORT_SYMBOL(__xfrm_policy_destroy); + +static void xfrm_policy_gc_kill(struct xfrm_policy *policy) +{ + struct dst_entry *dst; + + while ((dst = policy->bundles) != NULL) { + policy->bundles = dst->next; + dst_free(dst); + } + + if (del_timer(&policy->timer)) + atomic_dec(&policy->refcnt); + + if (atomic_read(&policy->refcnt) > 1) + flow_cache_flush(); + + xfrm_pol_put(policy); +} + +static void xfrm_policy_gc_task(void *data) +{ + struct xfrm_policy *policy; + struct list_head *entry, *tmp; + struct list_head gc_list = LIST_HEAD_INIT(gc_list); + + spin_lock_bh(&xfrm_policy_gc_lock); + list_splice_init(&xfrm_policy_gc_list, &gc_list); + spin_unlock_bh(&xfrm_policy_gc_lock); + + list_for_each_safe(entry, tmp, &gc_list) { + policy = list_entry(entry, struct xfrm_policy, list); + xfrm_policy_gc_kill(policy); + } +} + +/* Rule must be locked. Release descentant resources, announce + * entry dead. The rule must be unlinked from lists to the moment. + */ + +static void xfrm_policy_kill(struct xfrm_policy *policy) +{ + int dead; + + write_lock_bh(&policy->lock); + dead = policy->dead; + policy->dead = 1; + write_unlock_bh(&policy->lock); + + if (unlikely(dead)) { + WARN_ON(1); + return; + } + + spin_lock(&xfrm_policy_gc_lock); + list_add(&policy->list, &xfrm_policy_gc_list); + spin_unlock(&xfrm_policy_gc_lock); + + schedule_work(&xfrm_policy_gc_work); +} + +/* Generate new index... KAME seems to generate them ordered by cost + * of an absolute inpredictability of ordering of rules. This will not pass. 
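+ *
+ * Concretely (illustration): the generator below hands out indices in
+ * steps of eight with the direction packed into the low three bits, so
+ * for an output policy (dir 1) the sequence runs 1, 9, 17, ... and the
+ * direction can later be recovered as
+ *
+ *	dir = xp->index & 7;
+ *
+ * which is exactly what xfrm_policy_timer() and xfrm_policy_byid() do.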
*/ +static u32 xfrm_gen_index(int dir) +{ + u32 idx; + struct xfrm_policy *p; + static u32 idx_generator; + + for (;;) { + idx = (idx_generator | dir); + idx_generator += 8; + if (idx == 0) + idx = 8; + for (p = xfrm_policy_list[dir]; p; p = p->next) { + if (p->index == idx) + break; + } + if (!p) + return idx; + } +} + +int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) +{ + struct xfrm_policy *pol, **p; + struct xfrm_policy *delpol = NULL; + struct xfrm_policy **newpos = NULL; + + write_lock_bh(&xfrm_policy_lock); + for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { + if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { + if (excl) { + write_unlock_bh(&xfrm_policy_lock); + return -EEXIST; + } + *p = pol->next; + delpol = pol; + if (policy->priority > pol->priority) + continue; + } else if (policy->priority >= pol->priority) { + p = &pol->next; + continue; + } + if (!newpos) + newpos = p; + if (delpol) + break; + p = &pol->next; + } + if (newpos) + p = newpos; + xfrm_pol_hold(policy); + policy->next = *p; + *p = policy; + atomic_inc(&flow_cache_genid); + policy->index = delpol ? delpol->index : xfrm_gen_index(dir); + policy->curlft.add_time = (unsigned long)xtime.tv_sec; + policy->curlft.use_time = 0; + if (!mod_timer(&policy->timer, jiffies + HZ)) + xfrm_pol_hold(policy); + write_unlock_bh(&xfrm_policy_lock); + + if (delpol) { + xfrm_policy_kill(delpol); + } + return 0; +} +EXPORT_SYMBOL(xfrm_policy_insert); + +struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, + int delete) +{ + struct xfrm_policy *pol, **p; + + write_lock_bh(&xfrm_policy_lock); + for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { + if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { + xfrm_pol_hold(pol); + if (delete) + *p = pol->next; + break; + } + } + write_unlock_bh(&xfrm_policy_lock); + + if (pol && delete) { + atomic_inc(&flow_cache_genid); + xfrm_policy_kill(pol); + } + return pol; +} +EXPORT_SYMBOL(xfrm_policy_bysel); + +struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) +{ + struct xfrm_policy *pol, **p; + + write_lock_bh(&xfrm_policy_lock); + for (p = &xfrm_policy_list[id & 7]; (pol=*p)!=NULL; p = &pol->next) { + if (pol->index == id) { + xfrm_pol_hold(pol); + if (delete) + *p = pol->next; + break; + } + } + write_unlock_bh(&xfrm_policy_lock); + + if (pol && delete) { + atomic_inc(&flow_cache_genid); + xfrm_policy_kill(pol); + } + return pol; +} +EXPORT_SYMBOL(xfrm_policy_byid); + +void xfrm_policy_flush(void) +{ + struct xfrm_policy *xp; + int dir; + + write_lock_bh(&xfrm_policy_lock); + for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { + while ((xp = xfrm_policy_list[dir]) != NULL) { + xfrm_policy_list[dir] = xp->next; + write_unlock_bh(&xfrm_policy_lock); + + xfrm_policy_kill(xp); + + write_lock_bh(&xfrm_policy_lock); + } + } + atomic_inc(&flow_cache_genid); + write_unlock_bh(&xfrm_policy_lock); +} +EXPORT_SYMBOL(xfrm_policy_flush); + +int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), + void *data) +{ + struct xfrm_policy *xp; + int dir; + int count = 0; + int error = 0; + + read_lock_bh(&xfrm_policy_lock); + for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) { + for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) + count++; + } + + if (count == 0) { + error = -ENOENT; + goto out; + } + + for (dir = 0; dir < 2*XFRM_POLICY_MAX; dir++) { + for (xp = xfrm_policy_list[dir]; xp; xp = xp->next) { + error = func(xp, dir%XFRM_POLICY_MAX, --count, data); + if (error) + goto out; + } + } + 
+out: + read_unlock_bh(&xfrm_policy_lock); + return error; +} +EXPORT_SYMBOL(xfrm_policy_walk); + +/* Find policy to apply to this flow. */ + +static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, + void **objp, atomic_t **obj_refp) +{ + struct xfrm_policy *pol; + + read_lock_bh(&xfrm_policy_lock); + for (pol = xfrm_policy_list[dir]; pol; pol = pol->next) { + struct xfrm_selector *sel = &pol->selector; + int match; + + if (pol->family != family) + continue; + + match = xfrm_selector_match(sel, fl, family); + if (match) { + xfrm_pol_hold(pol); + break; + } + } + read_unlock_bh(&xfrm_policy_lock); + if ((*objp = (void *) pol) != NULL) + *obj_refp = &pol->refcnt; +} + +static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) +{ + struct xfrm_policy *pol; + + read_lock_bh(&xfrm_policy_lock); + if ((pol = sk->sk_policy[dir]) != NULL) { + int match = xfrm_selector_match(&pol->selector, fl, + sk->sk_family); + if (match) + xfrm_pol_hold(pol); + else + pol = NULL; + } + read_unlock_bh(&xfrm_policy_lock); + return pol; +} + +static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) +{ + pol->next = xfrm_policy_list[dir]; + xfrm_policy_list[dir] = pol; + xfrm_pol_hold(pol); +} + +static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, + int dir) +{ + struct xfrm_policy **polp; + + for (polp = &xfrm_policy_list[dir]; + *polp != NULL; polp = &(*polp)->next) { + if (*polp == pol) { + *polp = pol->next; + return pol; + } + } + return NULL; +} + +void xfrm_policy_delete(struct xfrm_policy *pol, int dir) +{ + write_lock_bh(&xfrm_policy_lock); + pol = __xfrm_policy_unlink(pol, dir); + write_unlock_bh(&xfrm_policy_lock); + if (pol) { + if (dir < XFRM_POLICY_MAX) + atomic_inc(&flow_cache_genid); + xfrm_policy_kill(pol); + } +} + +int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) +{ + struct xfrm_policy *old_pol; + + write_lock_bh(&xfrm_policy_lock); + old_pol = sk->sk_policy[dir]; + sk->sk_policy[dir] = pol; + if (pol) { + pol->curlft.add_time = (unsigned long)xtime.tv_sec; + pol->index = xfrm_gen_index(XFRM_POLICY_MAX+dir); + __xfrm_policy_link(pol, XFRM_POLICY_MAX+dir); + } + if (old_pol) + __xfrm_policy_unlink(old_pol, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + + if (old_pol) { + xfrm_policy_kill(old_pol); + } + return 0; +} + +static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir) +{ + struct xfrm_policy *newp = xfrm_policy_alloc(GFP_ATOMIC); + + if (newp) { + newp->selector = old->selector; + newp->lft = old->lft; + newp->curlft = old->curlft; + newp->action = old->action; + newp->flags = old->flags; + newp->xfrm_nr = old->xfrm_nr; + newp->index = old->index; + memcpy(newp->xfrm_vec, old->xfrm_vec, + newp->xfrm_nr*sizeof(struct xfrm_tmpl)); + write_lock_bh(&xfrm_policy_lock); + __xfrm_policy_link(newp, XFRM_POLICY_MAX+dir); + write_unlock_bh(&xfrm_policy_lock); + xfrm_pol_put(newp); + } + return newp; +} + +int __xfrm_sk_clone_policy(struct sock *sk) +{ + struct xfrm_policy *p0 = sk->sk_policy[0], + *p1 = sk->sk_policy[1]; + + sk->sk_policy[0] = sk->sk_policy[1] = NULL; + if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL) + return -ENOMEM; + if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL) + return -ENOMEM; + return 0; +} + +/* Resolve list of templates for the flow, given policy. 
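+ * Each template is resolved to a state with xfrm_state_find(); tunnel-mode templates switch the lookup endpoints to the template's own addresses, and templates marked optional may be left unresolved.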
*/ + +static int +xfrm_tmpl_resolve(struct xfrm_policy *policy, struct flowi *fl, + struct xfrm_state **xfrm, + unsigned short family) +{ + int nx; + int i, error; + xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); + xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); + + for (nx=0, i = 0; i < policy->xfrm_nr; i++) { + struct xfrm_state *x; + xfrm_address_t *remote = daddr; + xfrm_address_t *local = saddr; + struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; + + if (tmpl->mode) { + remote = &tmpl->id.daddr; + local = &tmpl->saddr; + } + + x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family); + + if (x && x->km.state == XFRM_STATE_VALID) { + xfrm[nx++] = x; + daddr = remote; + saddr = local; + continue; + } + if (x) { + error = (x->km.state == XFRM_STATE_ERROR ? + -EINVAL : -EAGAIN); + xfrm_state_put(x); + } + + if (!tmpl->optional) + goto fail; + } + return nx; + +fail: + for (nx--; nx>=0; nx--) + xfrm_state_put(xfrm[nx]); + return error; +} + +/* Check that the bundle accepts the flow and its components are + * still valid. + */ + +static struct dst_entry * +xfrm_find_bundle(struct flowi *fl, struct xfrm_policy *policy, unsigned short family) +{ + struct dst_entry *x; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return ERR_PTR(-EINVAL); + x = afinfo->find_bundle(fl, policy); + xfrm_policy_put_afinfo(afinfo); + return x; +} + +/* Allocate chain of dst_entry's, attach known xfrm's, calculate + * all the metrics... Shortly, bundle a bundle. + */ + +static int +xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, + struct flowi *fl, struct dst_entry **dst_p, + unsigned short family) +{ + int err; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EINVAL; + err = afinfo->bundle_create(policy, xfrm, nx, fl, dst_p); + xfrm_policy_put_afinfo(afinfo); + return err; +} + +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + }; +} + +static int stale_bundle(struct dst_entry *dst); + +/* Main function: finds/creates a bundle for given flow. + * + * At the moment we eat a raw IP route. Mostly to speed up lookups + * on interfaces with disabled IPsec. + */ +int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, + struct sock *sk, int flags) +{ + struct xfrm_policy *policy; + struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; + struct dst_entry *dst, *dst_orig = *dst_p; + int nx = 0; + int err; + u32 genid; + u16 family = dst_orig->ops->family; +restart: + genid = atomic_read(&flow_cache_genid); + policy = NULL; + if (sk && sk->sk_policy[1]) + policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + + if (!policy) { + /* To accelerate a bit... */ + if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) + return 0; + + policy = flow_cache_lookup(fl, family, + policy_to_flow_dir(XFRM_POLICY_OUT), + xfrm_policy_lookup); + } + + if (!policy) + return 0; + + policy->curlft.use_time = (unsigned long)xtime.tv_sec; + + switch (policy->action) { + case XFRM_POLICY_BLOCK: + /* Prohibit the flow */ + xfrm_pol_put(policy); + return -EPERM; + + case XFRM_POLICY_ALLOW: + if (policy->xfrm_nr == 0) { + /* Flow passes not transformed. 
*/ + xfrm_pol_put(policy); + return 0; + } + + /* Try to find matching bundle. + * + * LATER: help from flow cache. It is optional, this + * is required only for output policy. + */ + dst = xfrm_find_bundle(fl, policy, family); + if (IS_ERR(dst)) { + xfrm_pol_put(policy); + return PTR_ERR(dst); + } + + if (dst) + break; + + nx = xfrm_tmpl_resolve(policy, fl, xfrm, family); + + if (unlikely(nx<0)) { + err = nx; + if (err == -EAGAIN && flags) { + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(&km_waitq, &wait); + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + set_current_state(TASK_RUNNING); + remove_wait_queue(&km_waitq, &wait); + + nx = xfrm_tmpl_resolve(policy, fl, xfrm, family); + + if (nx == -EAGAIN && signal_pending(current)) { + err = -ERESTART; + goto error; + } + if (nx == -EAGAIN || + genid != atomic_read(&flow_cache_genid)) { + xfrm_pol_put(policy); + goto restart; + } + err = nx; + } + if (err < 0) + goto error; + } + if (nx == 0) { + /* Flow passes not transformed. */ + xfrm_pol_put(policy); + return 0; + } + + dst = dst_orig; + err = xfrm_bundle_create(policy, xfrm, nx, fl, &dst, family); + + if (unlikely(err)) { + int i; + for (i=0; i<nx; i++) + xfrm_state_put(xfrm[i]); + goto error; + } + + write_lock_bh(&policy->lock); + if (unlikely(policy->dead || stale_bundle(dst))) { + /* Wow! While we worked on resolving, this + * policy has gone. Retry. It is not paranoia, + * we just cannot enlist new bundle to dead object. + * We can't enlist stable bundles either. + */ + write_unlock_bh(&policy->lock); + + xfrm_pol_put(policy); + if (dst) + dst_free(dst); + goto restart; + } + dst->next = policy->bundles; + policy->bundles = dst; + dst_hold(dst); + write_unlock_bh(&policy->lock); + } + *dst_p = dst; + dst_release(dst_orig); + xfrm_pol_put(policy); + return 0; + +error: + dst_release(dst_orig); + xfrm_pol_put(policy); + *dst_p = NULL; + return err; +} +EXPORT_SYMBOL(xfrm_lookup); + +/* When skb is transformed back to its "native" form, we have to + * check policy restrictions. At the moment we make this in maximally + * stupid way. Shame on me. :-) Of course, connected sockets must + * have policy cached at them.
+ */ + +static inline int +xfrm_state_ok(struct xfrm_tmpl *tmpl, struct xfrm_state *x, + unsigned short family) +{ + if (xfrm_state_kern(x)) + return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, family); + return x->id.proto == tmpl->id.proto && + (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && + (x->props.reqid == tmpl->reqid || !tmpl->reqid) && + x->props.mode == tmpl->mode && + (tmpl->aalgos & (1<props.aalgo)) && + !(x->props.mode && xfrm_state_addr_cmp(tmpl, x, family)); +} + +static inline int +xfrm_policy_ok(struct xfrm_tmpl *tmpl, struct sec_path *sp, int start, + unsigned short family) +{ + int idx = start; + + if (tmpl->optional) { + if (!tmpl->mode) + return start; + } else + start = -1; + for (; idx < sp->len; idx++) { + if (xfrm_state_ok(tmpl, sp->x[idx].xvec, family)) + return ++idx; + if (sp->x[idx].xvec->props.mode) + break; + } + return start; +} + +static int +_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + afinfo->decode_session(skb, fl); + xfrm_policy_put_afinfo(afinfo); + return 0; +} + +static inline int secpath_has_tunnel(struct sec_path *sp, int k) +{ + for (; k < sp->len; k++) { + if (sp->x[k].xvec->props.mode) + return 1; + } + + return 0; +} + +int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, + unsigned short family) +{ + struct xfrm_policy *pol; + struct flowi fl; + + if (_decode_session(skb, &fl, family) < 0) + return 0; + + /* First, check used SA against their selectors. */ + if (skb->sp) { + int i; + + for (i=skb->sp->len-1; i>=0; i--) { + struct sec_decap_state *xvec = &(skb->sp->x[i]); + if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) + return 0; + + /* If there is a post_input processor, try running it */ + if (xvec->xvec->type->post_input && + (xvec->xvec->type->post_input)(xvec->xvec, + &(xvec->decap), + skb) != 0) + return 0; + } + } + + pol = NULL; + if (sk && sk->sk_policy[dir]) + pol = xfrm_sk_policy_lookup(sk, dir, &fl); + + if (!pol) + pol = flow_cache_lookup(&fl, family, + policy_to_flow_dir(dir), + xfrm_policy_lookup); + + if (!pol) + return !skb->sp || !secpath_has_tunnel(skb->sp, 0); + + pol->curlft.use_time = (unsigned long)xtime.tv_sec; + + if (pol->action == XFRM_POLICY_ALLOW) { + struct sec_path *sp; + static struct sec_path dummy; + int i, k; + + if ((sp = skb->sp) == NULL) + sp = &dummy; + + /* For each tunnel xfrm, find the first matching tmpl. + * For each tmpl before that, find corresponding xfrm. + * Order is _important_. Later we will implement + * some barriers, but at the moment barriers + * are implied between each two transformations. + */ + for (i = pol->xfrm_nr-1, k = 0; i >= 0; i--) { + k = xfrm_policy_ok(pol->xfrm_vec+i, sp, k, family); + if (k < 0) + goto reject; + } + + if (secpath_has_tunnel(sp, k)) + goto reject; + + xfrm_pol_put(pol); + return 1; + } + +reject: + xfrm_pol_put(pol); + return 0; +} +EXPORT_SYMBOL(__xfrm_policy_check); + +int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) +{ + struct flowi fl; + + if (_decode_session(skb, &fl, family) < 0) + return 0; + + return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0; +} +EXPORT_SYMBOL(__xfrm_route_forward); + +/* Optimize later using cookies and generation ids. 
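+ * Until then, xfrm_dst_check() below simply revalidates the whole bundle with stale_bundle().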
*/ + +static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) +{ + if (!stale_bundle(dst)) + return dst; + + return NULL; +} + +static int stale_bundle(struct dst_entry *dst) +{ + return !xfrm_bundle_ok((struct xfrm_dst *)dst, NULL, AF_UNSPEC); +} + +static void xfrm_dst_destroy(struct dst_entry *dst) +{ + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + dst_release(xdst->route); + + if (!dst->xfrm) + return; + xfrm_state_put(dst->xfrm); + dst->xfrm = NULL; +} + +static void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev, + int unregister) +{ + if (!unregister) + return; + + while ((dst = dst->child) && dst->xfrm && dst->dev == dev) { + dst->dev = &loopback_dev; + dev_hold(&loopback_dev); + dev_put(dev); + } +} + +static void xfrm_link_failure(struct sk_buff *skb) +{ + /* Impossible. Such dst must be popped before reaches point of failure. */ + return; +} + +static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) +{ + if (dst) { + if (dst->obsolete) { + dst_release(dst); + dst = NULL; + } + } + return dst; +} + +static void xfrm_prune_bundles(int (*func)(struct dst_entry *)) +{ + int i; + struct xfrm_policy *pol; + struct dst_entry *dst, **dstp, *gc_list = NULL; + + read_lock_bh(&xfrm_policy_lock); + for (i=0; i<2*XFRM_POLICY_MAX; i++) { + for (pol = xfrm_policy_list[i]; pol; pol = pol->next) { + write_lock(&pol->lock); + dstp = &pol->bundles; + while ((dst=*dstp) != NULL) { + if (func(dst)) { + *dstp = dst->next; + dst->next = gc_list; + gc_list = dst; + } else { + dstp = &dst->next; + } + } + write_unlock(&pol->lock); + } + } + read_unlock_bh(&xfrm_policy_lock); + + while (gc_list) { + dst = gc_list; + gc_list = dst->next; + dst_free(dst); + } +} + +static int unused_bundle(struct dst_entry *dst) +{ + return !atomic_read(&dst->__refcnt); +} + +static void __xfrm_garbage_collect(void) +{ + xfrm_prune_bundles(unused_bundle); +} + +int xfrm_flush_bundles(void) +{ + xfrm_prune_bundles(stale_bundle); + return 0; +} + +void xfrm_init_pmtu(struct dst_entry *dst) +{ + do { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + u32 pmtu, route_mtu_cached; + + pmtu = dst_mtu(dst->child); + xdst->child_mtu_cached = pmtu; + + pmtu = xfrm_state_mtu(dst->xfrm, pmtu); + + route_mtu_cached = dst_mtu(xdst->route); + xdst->route_mtu_cached = route_mtu_cached; + + if (pmtu > route_mtu_cached) + pmtu = route_mtu_cached; + + dst->metrics[RTAX_MTU-1] = pmtu; + } while ((dst = dst->next)); +} + +EXPORT_SYMBOL(xfrm_init_pmtu); + +/* Check that the bundle accepts the flow and its components are + * still valid. 
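+ * While walking the chain it also refreshes the cached child and route MTUs and, when one of them has changed, propagates the new minimum back up through the bundle.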
+ */ + +int xfrm_bundle_ok(struct xfrm_dst *first, struct flowi *fl, int family) +{ + struct dst_entry *dst = &first->u.dst; + struct xfrm_dst *last; + u32 mtu; + + if (!dst_check(dst->path, 0) || + (dst->dev && !netif_running(dst->dev))) + return 0; + + last = NULL; + + do { + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + + if (fl && !xfrm_selector_match(&dst->xfrm->sel, fl, family)) + return 0; + if (dst->xfrm->km.state != XFRM_STATE_VALID) + return 0; + + mtu = dst_mtu(dst->child); + if (xdst->child_mtu_cached != mtu) { + last = xdst; + xdst->child_mtu_cached = mtu; + } + + if (!dst_check(xdst->route, 0)) + return 0; + mtu = dst_mtu(xdst->route); + if (xdst->route_mtu_cached != mtu) { + last = xdst; + xdst->route_mtu_cached = mtu; + } + + dst = dst->child; + } while (dst->xfrm); + + if (likely(!last)) + return 1; + + mtu = last->child_mtu_cached; + for (;;) { + dst = &last->u.dst; + + mtu = xfrm_state_mtu(dst->xfrm, mtu); + if (mtu > last->route_mtu_cached) + mtu = last->route_mtu_cached; + dst->metrics[RTAX_MTU-1] = mtu; + + if (last == first) + break; + + last = last->u.next; + last->child_mtu_cached = mtu; + } + + return 1; +} + +EXPORT_SYMBOL(xfrm_bundle_ok); + +/* Well... that's _TASK_. We need to scan through transformation + * list and figure out what mss tcp should generate in order to + * final datagram fit to mtu. Mama mia... :-) + * + * Apparently, some easy way exists, but we used to choose the most + * bizarre ones. :-) So, raising Kalashnikov... tra-ta-ta. + * + * Consider this function as something like dark humour. :-) + */ +static int xfrm_get_mss(struct dst_entry *dst, u32 mtu) +{ + int res = mtu - dst->header_len; + + for (;;) { + struct dst_entry *d = dst; + int m = res; + + do { + struct xfrm_state *x = d->xfrm; + if (x) { + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_VALID && + x->type && x->type->get_max_size) + m = x->type->get_max_size(d->xfrm, m); + else + m += x->props.header_len; + spin_unlock_bh(&x->lock); + } + } while ((d = d->child) != NULL); + + if (m <= mtu) + break; + res -= (m - mtu); + if (res < 88) + return mtu; + } + + return res + dst->header_len; +} + +int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_policy_afinfo_lock); + if (unlikely(xfrm_policy_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else { + struct dst_ops *dst_ops = afinfo->dst_ops; + if (likely(dst_ops->kmem_cachep == NULL)) + dst_ops->kmem_cachep = xfrm_dst_cache; + if (likely(dst_ops->check == NULL)) + dst_ops->check = xfrm_dst_check; + if (likely(dst_ops->destroy == NULL)) + dst_ops->destroy = xfrm_dst_destroy; + if (likely(dst_ops->ifdown == NULL)) + dst_ops->ifdown = xfrm_dst_ifdown; + if (likely(dst_ops->negative_advice == NULL)) + dst_ops->negative_advice = xfrm_negative_advice; + if (likely(dst_ops->link_failure == NULL)) + dst_ops->link_failure = xfrm_link_failure; + if (likely(dst_ops->get_mss == NULL)) + dst_ops->get_mss = xfrm_get_mss; + if (likely(afinfo->garbage_collect == NULL)) + afinfo->garbage_collect = __xfrm_garbage_collect; + xfrm_policy_afinfo[afinfo->family] = afinfo; + } + write_unlock(&xfrm_policy_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_policy_register_afinfo); + +int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return 
-EAFNOSUPPORT; + write_lock(&xfrm_policy_afinfo_lock); + if (likely(xfrm_policy_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_policy_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else { + struct dst_ops *dst_ops = afinfo->dst_ops; + xfrm_policy_afinfo[afinfo->family] = NULL; + dst_ops->kmem_cachep = NULL; + dst_ops->check = NULL; + dst_ops->destroy = NULL; + dst_ops->ifdown = NULL; + dst_ops->negative_advice = NULL; + dst_ops->link_failure = NULL; + dst_ops->get_mss = NULL; + afinfo->garbage_collect = NULL; + } + } + write_unlock(&xfrm_policy_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); + +static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + read_lock(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (likely(afinfo != NULL)) + read_lock(&afinfo->lock); + read_unlock(&xfrm_policy_afinfo_lock); + return afinfo; +} + +static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + if (unlikely(afinfo == NULL)) + return; + read_unlock(&afinfo->lock); +} + +static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + switch (event) { + case NETDEV_DOWN: + xfrm_flush_bundles(); + } + return NOTIFY_DONE; +} + +static struct notifier_block xfrm_dev_notifier = { + xfrm_dev_event, + NULL, + 0 +}; + +static void __init xfrm_policy_init(void) +{ + xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", + sizeof(struct xfrm_dst), + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (!xfrm_dst_cache) + panic("XFRM: failed to allocate xfrm_dst_cache\n"); + + INIT_WORK(&xfrm_policy_gc_work, xfrm_policy_gc_task, NULL); + register_netdevice_notifier(&xfrm_dev_notifier); +} + +void __init xfrm_init(void) +{ + xfrm_state_init(); + xfrm_policy_init(); + xfrm_input_init(); +} + diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c new file mode 100644 index 000000000000..1db59f11f37d --- /dev/null +++ b/net/xfrm/xfrm_state.c @@ -0,0 +1,1037 @@ +/* + * xfrm_state.c + * + * Changes: + * Mitsuru KANDA @USAGI + * Kazunori MIYAZAWA @USAGI + * Kunihiro Ishiguro + * IPv6 support + * YOSHIFUJI Hideaki @USAGI + * Split up af-specific functions + * Derek Atkins + * Add UDP Encapsulation + * + */ + +#include +#include +#include +#include +#include +#include + +/* Each xfrm_state may be linked to two tables: + + 1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl) + 2. Hash table by daddr to find what SAs exist for given + destination/tunnel endpoint. (output) + */ + +static DEFINE_SPINLOCK(xfrm_state_lock); + +/* Hash table to find appropriate SA towards given target (endpoint + * of tunnel or destination of transport mode) allowed by selector. + * + * Main use is finding SA after policy selected tunnel or transport mode. + * Also, it can be used by ah/esp icmp error handler to find offending SA. 
+ */ +static struct list_head xfrm_state_bydst[XFRM_DST_HSIZE]; +static struct list_head xfrm_state_byspi[XFRM_DST_HSIZE]; + +DECLARE_WAIT_QUEUE_HEAD(km_waitq); +EXPORT_SYMBOL(km_waitq); + +static DEFINE_RWLOCK(xfrm_state_afinfo_lock); +static struct xfrm_state_afinfo *xfrm_state_afinfo[NPROTO]; + +static struct work_struct xfrm_state_gc_work; +static struct list_head xfrm_state_gc_list = LIST_HEAD_INIT(xfrm_state_gc_list); +static DEFINE_SPINLOCK(xfrm_state_gc_lock); + +static int xfrm_state_gc_flush_bundles; + +static void __xfrm_state_delete(struct xfrm_state *x); + +static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family); +static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo); + +static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol); +static void km_state_expired(struct xfrm_state *x, int hard); + +static void xfrm_state_gc_destroy(struct xfrm_state *x) +{ + if (del_timer(&x->timer)) + BUG(); + if (x->aalg) + kfree(x->aalg); + if (x->ealg) + kfree(x->ealg); + if (x->calg) + kfree(x->calg); + if (x->encap) + kfree(x->encap); + if (x->type) { + x->type->destructor(x); + xfrm_put_type(x->type); + } + kfree(x); +} + +static void xfrm_state_gc_task(void *data) +{ + struct xfrm_state *x; + struct list_head *entry, *tmp; + struct list_head gc_list = LIST_HEAD_INIT(gc_list); + + if (xfrm_state_gc_flush_bundles) { + xfrm_state_gc_flush_bundles = 0; + xfrm_flush_bundles(); + } + + spin_lock_bh(&xfrm_state_gc_lock); + list_splice_init(&xfrm_state_gc_list, &gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + + list_for_each_safe(entry, tmp, &gc_list) { + x = list_entry(entry, struct xfrm_state, bydst); + xfrm_state_gc_destroy(x); + } + wake_up(&km_waitq); +} + +static inline unsigned long make_jiffies(long secs) +{ + if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) + return MAX_SCHEDULE_TIMEOUT-1; + else + return secs*HZ; +} + +static void xfrm_timer_handler(unsigned long data) +{ + struct xfrm_state *x = (struct xfrm_state*)data; + unsigned long now = (unsigned long)xtime.tv_sec; + long next = LONG_MAX; + int warn = 0; + + spin_lock(&x->lock); + if (x->km.state == XFRM_STATE_DEAD) + goto out; + if (x->km.state == XFRM_STATE_EXPIRED) + goto expired; + if (x->lft.hard_add_expires_seconds) { + long tmo = x->lft.hard_add_expires_seconds + + x->curlft.add_time - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (x->lft.hard_use_expires_seconds) { + long tmo = x->lft.hard_use_expires_seconds + + (x->curlft.use_time ? : now) - now; + if (tmo <= 0) + goto expired; + if (tmo < next) + next = tmo; + } + if (x->km.dying) + goto resched; + if (x->lft.soft_add_expires_seconds) { + long tmo = x->lft.soft_add_expires_seconds + + x->curlft.add_time - now; + if (tmo <= 0) + warn = 1; + else if (tmo < next) + next = tmo; + } + if (x->lft.soft_use_expires_seconds) { + long tmo = x->lft.soft_use_expires_seconds + + (x->curlft.use_time ? 
: now) - now; + if (tmo <= 0) + warn = 1; + else if (tmo < next) + next = tmo; + } + + if (warn) + km_state_expired(x, 0); +resched: + if (next != LONG_MAX && + !mod_timer(&x->timer, jiffies + make_jiffies(next))) + xfrm_state_hold(x); + goto out; + +expired: + if (x->km.state == XFRM_STATE_ACQ && x->id.spi == 0) { + x->km.state = XFRM_STATE_EXPIRED; + wake_up(&km_waitq); + next = 2; + goto resched; + } + if (x->id.spi != 0) + km_state_expired(x, 1); + __xfrm_state_delete(x); + +out: + spin_unlock(&x->lock); + xfrm_state_put(x); +} + +struct xfrm_state *xfrm_state_alloc(void) +{ + struct xfrm_state *x; + + x = kmalloc(sizeof(struct xfrm_state), GFP_ATOMIC); + + if (x) { + memset(x, 0, sizeof(struct xfrm_state)); + atomic_set(&x->refcnt, 1); + atomic_set(&x->tunnel_users, 0); + INIT_LIST_HEAD(&x->bydst); + INIT_LIST_HEAD(&x->byspi); + init_timer(&x->timer); + x->timer.function = xfrm_timer_handler; + x->timer.data = (unsigned long)x; + x->curlft.add_time = (unsigned long)xtime.tv_sec; + x->lft.soft_byte_limit = XFRM_INF; + x->lft.soft_packet_limit = XFRM_INF; + x->lft.hard_byte_limit = XFRM_INF; + x->lft.hard_packet_limit = XFRM_INF; + spin_lock_init(&x->lock); + } + return x; +} +EXPORT_SYMBOL(xfrm_state_alloc); + +void __xfrm_state_destroy(struct xfrm_state *x) +{ + BUG_TRAP(x->km.state == XFRM_STATE_DEAD); + + spin_lock_bh(&xfrm_state_gc_lock); + list_add(&x->bydst, &xfrm_state_gc_list); + spin_unlock_bh(&xfrm_state_gc_lock); + schedule_work(&xfrm_state_gc_work); +} +EXPORT_SYMBOL(__xfrm_state_destroy); + +static void __xfrm_state_delete(struct xfrm_state *x) +{ + if (x->km.state != XFRM_STATE_DEAD) { + x->km.state = XFRM_STATE_DEAD; + spin_lock(&xfrm_state_lock); + list_del(&x->bydst); + atomic_dec(&x->refcnt); + if (x->id.spi) { + list_del(&x->byspi); + atomic_dec(&x->refcnt); + } + spin_unlock(&xfrm_state_lock); + if (del_timer(&x->timer)) + atomic_dec(&x->refcnt); + + /* The number two in this test is the reference + * mentioned in the comment below plus the reference + * our caller holds. A larger value means that + * there are DSTs attached to this xfrm_state. + */ + if (atomic_read(&x->refcnt) > 2) { + xfrm_state_gc_flush_bundles = 1; + schedule_work(&xfrm_state_gc_work); + } + + /* All xfrm_state objects are created by xfrm_state_alloc. + * The xfrm_state_alloc call gives a reference, and that + * is what we are dropping here. 
+ */ + atomic_dec(&x->refcnt); + } +} + +void xfrm_state_delete(struct xfrm_state *x) +{ + spin_lock_bh(&x->lock); + __xfrm_state_delete(x); + spin_unlock_bh(&x->lock); +} +EXPORT_SYMBOL(xfrm_state_delete); + +void xfrm_state_flush(u8 proto) +{ + int i; + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + for (i = 0; i < XFRM_DST_HSIZE; i++) { +restart: + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (!xfrm_state_kern(x) && + (proto == IPSEC_PROTO_ANY || x->id.proto == proto)) { + xfrm_state_hold(x); + spin_unlock_bh(&xfrm_state_lock); + + xfrm_state_delete(x); + xfrm_state_put(x); + + spin_lock_bh(&xfrm_state_lock); + goto restart; + } + } + } + spin_unlock_bh(&xfrm_state_lock); + wake_up(&km_waitq); +} +EXPORT_SYMBOL(xfrm_state_flush); + +static int +xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl, + struct xfrm_tmpl *tmpl, + xfrm_address_t *daddr, xfrm_address_t *saddr, + unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return -1; + afinfo->init_tempsel(x, fl, tmpl, daddr, saddr); + xfrm_state_put_afinfo(afinfo); + return 0; +} + +struct xfrm_state * +xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, + struct flowi *fl, struct xfrm_tmpl *tmpl, + struct xfrm_policy *pol, int *err, + unsigned short family) +{ + unsigned h = xfrm_dst_hash(daddr, family); + struct xfrm_state *x, *x0; + int acquire_in_progress = 0; + int error = 0; + struct xfrm_state *best = NULL; + struct xfrm_state_afinfo *afinfo; + + afinfo = xfrm_state_get_afinfo(family); + if (afinfo == NULL) { + *err = -EAFNOSUPPORT; + return NULL; + } + + spin_lock_bh(&xfrm_state_lock); + list_for_each_entry(x, xfrm_state_bydst+h, bydst) { + if (x->props.family == family && + x->props.reqid == tmpl->reqid && + xfrm_state_addr_check(x, daddr, saddr, family) && + tmpl->mode == x->props.mode && + tmpl->id.proto == x->id.proto && + (tmpl->id.spi == x->id.spi || !tmpl->id.spi)) { + /* Resolution logic: + 1. There is a valid state with matching selector. + Done. + 2. Valid state with inappropriate selector. Skip. + + Entering area of "sysdeps". + + 3. If state is not valid, selector is temporary, + it selects only session which triggered + previous resolution. Key manager will do + something to install a state with proper + selector. + */ + if (x->km.state == XFRM_STATE_VALID) { + if (!xfrm_selector_match(&x->sel, fl, family)) + continue; + if (!best || + best->km.dying > x->km.dying || + (best->km.dying == x->km.dying && + best->curlft.add_time < x->curlft.add_time)) + best = x; + } else if (x->km.state == XFRM_STATE_ACQ) { + acquire_in_progress = 1; + } else if (x->km.state == XFRM_STATE_ERROR || + x->km.state == XFRM_STATE_EXPIRED) { + if (xfrm_selector_match(&x->sel, fl, family)) + error = -ESRCH; + } + } + } + + x = best; + if (!x && !error && !acquire_in_progress) { + x0 = afinfo->state_lookup(&tmpl->id.daddr, tmpl->id.spi, tmpl->id.proto); + if (x0 != NULL) { + xfrm_state_put(x0); + error = -EEXIST; + goto out; + } + x = xfrm_state_alloc(); + if (x == NULL) { + error = -ENOMEM; + goto out; + } + /* Initialize temporary selector matching only + * to current session. 
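+ * The larval XFRM_STATE_ACQ entry created below is given a hard add-expiry of XFRM_ACQ_EXPIRES seconds, so it goes away by itself if no key manager installs a real SA in time.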
*/ + xfrm_init_tempsel(x, fl, tmpl, daddr, saddr, family); + + if (km_query(x, tmpl, pol) == 0) { + x->km.state = XFRM_STATE_ACQ; + list_add_tail(&x->bydst, xfrm_state_bydst+h); + xfrm_state_hold(x); + if (x->id.spi) { + h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family); + list_add(&x->byspi, xfrm_state_byspi+h); + xfrm_state_hold(x); + } + x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES; + xfrm_state_hold(x); + x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ; + add_timer(&x->timer); + } else { + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + x = NULL; + error = -ESRCH; + } + } +out: + if (x) + xfrm_state_hold(x); + else + *err = acquire_in_progress ? -EAGAIN : error; + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return x; +} + +static void __xfrm_state_insert(struct xfrm_state *x) +{ + unsigned h = xfrm_dst_hash(&x->id.daddr, x->props.family); + + list_add(&x->bydst, xfrm_state_bydst+h); + xfrm_state_hold(x); + + h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family); + + list_add(&x->byspi, xfrm_state_byspi+h); + xfrm_state_hold(x); + + if (!mod_timer(&x->timer, jiffies + HZ)) + xfrm_state_hold(x); + + wake_up(&km_waitq); +} + +void xfrm_state_insert(struct xfrm_state *x) +{ + spin_lock_bh(&xfrm_state_lock); + __xfrm_state_insert(x); + spin_unlock_bh(&xfrm_state_lock); +} +EXPORT_SYMBOL(xfrm_state_insert); + +static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq); + +int xfrm_state_add(struct xfrm_state *x) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_state *x1; + int family; + int err; + + family = x->props.family; + afinfo = xfrm_state_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + spin_lock_bh(&xfrm_state_lock); + + x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto); + if (x1) { + xfrm_state_put(x1); + x1 = NULL; + err = -EEXIST; + goto out; + } + + if (x->km.seq) { + x1 = __xfrm_find_acq_byseq(x->km.seq); + if (x1 && xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family)) { + xfrm_state_put(x1); + x1 = NULL; + } + } + + if (!x1) + x1 = afinfo->find_acq( + x->props.mode, x->props.reqid, x->id.proto, + &x->id.daddr, &x->props.saddr, 0); + + __xfrm_state_insert(x); + err = 0; + +out: + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + + if (x1) { + xfrm_state_delete(x1); + xfrm_state_put(x1); + } + + return err; +} +EXPORT_SYMBOL(xfrm_state_add); + +int xfrm_state_update(struct xfrm_state *x) +{ + struct xfrm_state_afinfo *afinfo; + struct xfrm_state *x1; + int err; + + afinfo = xfrm_state_get_afinfo(x->props.family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + spin_lock_bh(&xfrm_state_lock); + x1 = afinfo->state_lookup(&x->id.daddr, x->id.spi, x->id.proto); + + err = -ESRCH; + if (!x1) + goto out; + + if (xfrm_state_kern(x1)) { + xfrm_state_put(x1); + err = -EEXIST; + goto out; + } + + if (x1->km.state == XFRM_STATE_ACQ) { + __xfrm_state_insert(x); + x = NULL; + } + err = 0; + +out: + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + + if (err) + return err; + + if (!x) { + xfrm_state_delete(x1); + xfrm_state_put(x1); + return 0; + } + + err = -EINVAL; + spin_lock_bh(&x1->lock); + if (likely(x1->km.state == XFRM_STATE_VALID)) { + if (x->encap && x1->encap) + memcpy(x1->encap, x->encap, sizeof(*x1->encap)); + memcpy(&x1->lft, &x->lft, sizeof(x1->lft)); + x1->km.dying = 0; + + if (!mod_timer(&x1->timer, jiffies + HZ)) + xfrm_state_hold(x1); + if (x1->curlft.use_time) + xfrm_state_check_expire(x1); + + err = 0; + } + 
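/*
 * [Illustrative aside -- not from the original patch.]  A recurring idiom in
 * this file: a pending timer owns one reference on its object, so arming uses
 * "if (!mod_timer(...)) xfrm_state_hold(x)" (mod_timer() returns 0 when the
 * timer was not already pending) and cancelling uses
 * "if (del_timer(...)) atomic_dec(&refcnt)".  The toy model below shows that
 * invariant with invented stand-ins (struct obj, fake_mod_timer(),
 * fake_del_timer()); it is not kernel code.
 */
#include <assert.h>
#include <stdio.h>

struct obj { int refcnt; int timer_pending; };

static void hold(struct obj *o) { o->refcnt++; }
static void put(struct obj *o)  { o->refcnt--; }

/* Returns the old pending state, like mod_timer(): 0 means the timer was
 * inactive, so the newly pending timer now needs a reference of its own. */
static int fake_mod_timer(struct obj *o)
{
    int was_pending = o->timer_pending;
    o->timer_pending = 1;
    return was_pending;
}

/* Returns nonzero if a pending timer was cancelled, like del_timer(); only
 * then must the reference owned by that timer be dropped. */
static int fake_del_timer(struct obj *o)
{
    int was_pending = o->timer_pending;
    o->timer_pending = 0;
    return was_pending;
}

int main(void)
{
    struct obj o = { 1, 0 };        /* one reference held by the creator */

    if (!fake_mod_timer(&o))        /* arm: the timer takes a reference */
        hold(&o);
    if (!fake_mod_timer(&o))        /* re-arm: still one pending timer, no new ref */
        hold(&o);
    if (fake_del_timer(&o))         /* cancel: drop the timer's reference */
        put(&o);

    assert(o.refcnt == 1);          /* back to just the creator's reference */
    printf("refcnt = %d\n", o.refcnt);
    return 0;
}
/* [End of illustrative aside.] */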
spin_unlock_bh(&x1->lock); + + xfrm_state_put(x1); + + return err; +} +EXPORT_SYMBOL(xfrm_state_update); + +int xfrm_state_check_expire(struct xfrm_state *x) +{ + if (!x->curlft.use_time) + x->curlft.use_time = (unsigned long)xtime.tv_sec; + + if (x->km.state != XFRM_STATE_VALID) + return -EINVAL; + + if (x->curlft.bytes >= x->lft.hard_byte_limit || + x->curlft.packets >= x->lft.hard_packet_limit) { + km_state_expired(x, 1); + if (!mod_timer(&x->timer, jiffies + XFRM_ACQ_EXPIRES*HZ)) + xfrm_state_hold(x); + return -EINVAL; + } + + if (!x->km.dying && + (x->curlft.bytes >= x->lft.soft_byte_limit || + x->curlft.packets >= x->lft.soft_packet_limit)) + km_state_expired(x, 0); + return 0; +} +EXPORT_SYMBOL(xfrm_state_check_expire); + +static int xfrm_state_check_space(struct xfrm_state *x, struct sk_buff *skb) +{ + int nhead = x->props.header_len + LL_RESERVED_SPACE(skb->dst->dev) + - skb_headroom(skb); + + if (nhead > 0) + return pskb_expand_head(skb, nhead, 0, GFP_ATOMIC); + + /* Check tail too... */ + return 0; +} + +int xfrm_state_check(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = xfrm_state_check_expire(x); + if (err < 0) + goto err; + err = xfrm_state_check_space(x, skb); +err: + return err; +} +EXPORT_SYMBOL(xfrm_state_check); + +struct xfrm_state * +xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, + unsigned short family) +{ + struct xfrm_state *x; + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return NULL; + + spin_lock_bh(&xfrm_state_lock); + x = afinfo->state_lookup(daddr, spi, proto); + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return x; +} +EXPORT_SYMBOL(xfrm_state_lookup); + +struct xfrm_state * +xfrm_find_acq(u8 mode, u32 reqid, u8 proto, + xfrm_address_t *daddr, xfrm_address_t *saddr, + int create, unsigned short family) +{ + struct xfrm_state *x; + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + if (!afinfo) + return NULL; + + spin_lock_bh(&xfrm_state_lock); + x = afinfo->find_acq(mode, reqid, proto, daddr, saddr, create); + spin_unlock_bh(&xfrm_state_lock); + xfrm_state_put_afinfo(afinfo); + return x; +} +EXPORT_SYMBOL(xfrm_find_acq); + +/* Silly enough, but I'm lazy to build resolution list */ + +static struct xfrm_state *__xfrm_find_acq_byseq(u32 seq) +{ + int i; + struct xfrm_state *x; + + for (i = 0; i < XFRM_DST_HSIZE; i++) { + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (x->km.seq == seq && x->km.state == XFRM_STATE_ACQ) { + xfrm_state_hold(x); + return x; + } + } + } + return NULL; +} + +struct xfrm_state *xfrm_find_acq_byseq(u32 seq) +{ + struct xfrm_state *x; + + spin_lock_bh(&xfrm_state_lock); + x = __xfrm_find_acq_byseq(seq); + spin_unlock_bh(&xfrm_state_lock); + return x; +} +EXPORT_SYMBOL(xfrm_find_acq_byseq); + +u32 xfrm_get_acqseq(void) +{ + u32 res; + static u32 acqseq; + static DEFINE_SPINLOCK(acqseq_lock); + + spin_lock_bh(&acqseq_lock); + res = (++acqseq ? 
: ++acqseq); + spin_unlock_bh(&acqseq_lock); + return res; +} +EXPORT_SYMBOL(xfrm_get_acqseq); + +void +xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi) +{ + u32 h; + struct xfrm_state *x0; + + if (x->id.spi) + return; + + if (minspi == maxspi) { + x0 = xfrm_state_lookup(&x->id.daddr, minspi, x->id.proto, x->props.family); + if (x0) { + xfrm_state_put(x0); + return; + } + x->id.spi = minspi; + } else { + u32 spi = 0; + minspi = ntohl(minspi); + maxspi = ntohl(maxspi); + for (h=0; hid.daddr, htonl(spi), x->id.proto, x->props.family); + if (x0 == NULL) { + x->id.spi = htonl(spi); + break; + } + xfrm_state_put(x0); + } + } + if (x->id.spi) { + spin_lock_bh(&xfrm_state_lock); + h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family); + list_add(&x->byspi, xfrm_state_byspi+h); + xfrm_state_hold(x); + spin_unlock_bh(&xfrm_state_lock); + wake_up(&km_waitq); + } +} +EXPORT_SYMBOL(xfrm_alloc_spi); + +int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*), + void *data) +{ + int i; + struct xfrm_state *x; + int count = 0; + int err = 0; + + spin_lock_bh(&xfrm_state_lock); + for (i = 0; i < XFRM_DST_HSIZE; i++) { + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (proto == IPSEC_PROTO_ANY || x->id.proto == proto) + count++; + } + } + if (count == 0) { + err = -ENOENT; + goto out; + } + + for (i = 0; i < XFRM_DST_HSIZE; i++) { + list_for_each_entry(x, xfrm_state_bydst+i, bydst) { + if (proto != IPSEC_PROTO_ANY && x->id.proto != proto) + continue; + err = func(x, --count, data); + if (err) + goto out; + } + } +out: + spin_unlock_bh(&xfrm_state_lock); + return err; +} +EXPORT_SYMBOL(xfrm_state_walk); + +int xfrm_replay_check(struct xfrm_state *x, u32 seq) +{ + u32 diff; + + seq = ntohl(seq); + + if (unlikely(seq == 0)) + return -EINVAL; + + if (likely(seq > x->replay.seq)) + return 0; + + diff = x->replay.seq - seq; + if (diff >= x->props.replay_window) { + x->stats.replay_window++; + return -EINVAL; + } + + if (x->replay.bitmap & (1U << diff)) { + x->stats.replay++; + return -EINVAL; + } + return 0; +} +EXPORT_SYMBOL(xfrm_replay_check); + +void xfrm_replay_advance(struct xfrm_state *x, u32 seq) +{ + u32 diff; + + seq = ntohl(seq); + + if (seq > x->replay.seq) { + diff = seq - x->replay.seq; + if (diff < x->props.replay_window) + x->replay.bitmap = ((x->replay.bitmap) << diff) | 1; + else + x->replay.bitmap = 1; + x->replay.seq = seq; + } else { + diff = x->replay.seq - seq; + x->replay.bitmap |= (1U << diff); + } +} +EXPORT_SYMBOL(xfrm_replay_advance); + +static struct list_head xfrm_km_list = LIST_HEAD_INIT(xfrm_km_list); +static DEFINE_RWLOCK(xfrm_km_lock); + +static void km_state_expired(struct xfrm_state *x, int hard) +{ + struct xfrm_mgr *km; + + if (hard) + x->km.state = XFRM_STATE_EXPIRED; + else + x->km.dying = 1; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) + km->notify(x, hard); + read_unlock(&xfrm_km_lock); + + if (hard) + wake_up(&km_waitq); +} + +static int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol) +{ + int err = -EINVAL; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + err = km->acquire(x, t, pol, XFRM_POLICY_OUT); + if (!err) + break; + } + read_unlock(&xfrm_km_lock); + return err; +} + +int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport) +{ + int err = -EINVAL; + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + if (km->new_mapping) + 
err = km->new_mapping(x, ipaddr, sport); + if (!err) + break; + } + read_unlock(&xfrm_km_lock); + return err; +} +EXPORT_SYMBOL(km_new_mapping); + +void km_policy_expired(struct xfrm_policy *pol, int dir, int hard) +{ + struct xfrm_mgr *km; + + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) + if (km->notify_policy) + km->notify_policy(pol, dir, hard); + read_unlock(&xfrm_km_lock); + + if (hard) + wake_up(&km_waitq); +} + +int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen) +{ + int err; + u8 *data; + struct xfrm_mgr *km; + struct xfrm_policy *pol = NULL; + + if (optlen <= 0 || optlen > PAGE_SIZE) + return -EMSGSIZE; + + data = kmalloc(optlen, GFP_KERNEL); + if (!data) + return -ENOMEM; + + err = -EFAULT; + if (copy_from_user(data, optval, optlen)) + goto out; + + err = -EINVAL; + read_lock(&xfrm_km_lock); + list_for_each_entry(km, &xfrm_km_list, list) { + pol = km->compile_policy(sk->sk_family, optname, data, + optlen, &err); + if (err >= 0) + break; + } + read_unlock(&xfrm_km_lock); + + if (err >= 0) { + xfrm_sk_policy_insert(sk, err, pol); + xfrm_pol_put(pol); + err = 0; + } + +out: + kfree(data); + return err; +} +EXPORT_SYMBOL(xfrm_user_policy); + +int xfrm_register_km(struct xfrm_mgr *km) +{ + write_lock_bh(&xfrm_km_lock); + list_add_tail(&km->list, &xfrm_km_list); + write_unlock_bh(&xfrm_km_lock); + return 0; +} +EXPORT_SYMBOL(xfrm_register_km); + +int xfrm_unregister_km(struct xfrm_mgr *km) +{ + write_lock_bh(&xfrm_km_lock); + list_del(&km->list); + write_unlock_bh(&xfrm_km_lock); + return 0; +} +EXPORT_SYMBOL(xfrm_unregister_km); + +int xfrm_state_register_afinfo(struct xfrm_state_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_state_afinfo_lock); + if (unlikely(xfrm_state_afinfo[afinfo->family] != NULL)) + err = -ENOBUFS; + else { + afinfo->state_bydst = xfrm_state_bydst; + afinfo->state_byspi = xfrm_state_byspi; + xfrm_state_afinfo[afinfo->family] = afinfo; + } + write_unlock(&xfrm_state_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_state_register_afinfo); + +int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo) +{ + int err = 0; + if (unlikely(afinfo == NULL)) + return -EINVAL; + if (unlikely(afinfo->family >= NPROTO)) + return -EAFNOSUPPORT; + write_lock(&xfrm_state_afinfo_lock); + if (likely(xfrm_state_afinfo[afinfo->family] != NULL)) { + if (unlikely(xfrm_state_afinfo[afinfo->family] != afinfo)) + err = -EINVAL; + else { + xfrm_state_afinfo[afinfo->family] = NULL; + afinfo->state_byspi = NULL; + afinfo->state_bydst = NULL; + } + } + write_unlock(&xfrm_state_afinfo_lock); + return err; +} +EXPORT_SYMBOL(xfrm_state_unregister_afinfo); + +static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family) +{ + struct xfrm_state_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + read_lock(&xfrm_state_afinfo_lock); + afinfo = xfrm_state_afinfo[family]; + if (likely(afinfo != NULL)) + read_lock(&afinfo->lock); + read_unlock(&xfrm_state_afinfo_lock); + return afinfo; +} + +static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) +{ + if (unlikely(afinfo == NULL)) + return; + read_unlock(&afinfo->lock); +} + +/* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ +void xfrm_state_delete_tunnel(struct xfrm_state *x) +{ + if (x->tunnel) { + struct xfrm_state *t = x->tunnel; + + if (atomic_read(&t->tunnel_users) == 2) + xfrm_state_delete(t); + 
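/*
 * [Illustrative aside -- not from the original patch.]  The afinfo
 * registration above follows a per-family table discipline: the family index
 * must be in range, a slot may only be claimed while it is empty, and only
 * the registered owner may clear it again (the real code does this under
 * xfrm_state_afinfo_lock).  A compact userspace sketch of that discipline;
 * every name below is hypothetical.
 */
#include <stdio.h>

#define NFAMILIES 32                    /* stand-in for NPROTO */

struct family_ops { const char *name; int family; };

static struct family_ops *ops_table[NFAMILIES];

static int register_family_ops(struct family_ops *o)
{
    if (!o || o->family < 0 || o->family >= NFAMILIES)
        return -1;                      /* cf. -EINVAL / -EAFNOSUPPORT */
    if (ops_table[o->family])
        return -2;                      /* slot already taken, cf. -ENOBUFS */
    ops_table[o->family] = o;
    return 0;
}

static int unregister_family_ops(struct family_ops *o)
{
    if (!o || o->family < 0 || o->family >= NFAMILIES)
        return -1;
    if (ops_table[o->family] != o)
        return -2;                      /* someone else owns the slot, cf. -EINVAL */
    ops_table[o->family] = NULL;
    return 0;
}

int main(void)
{
    struct family_ops v4 = { "ipv4", 2 };

    printf("register:       %d\n", register_family_ops(&v4));
    printf("register again: %d\n", register_family_ops(&v4));
    printf("unregister:     %d\n", unregister_family_ops(&v4));
    return 0;
}
/* [End of illustrative aside.] */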
atomic_dec(&t->tunnel_users); + xfrm_state_put(t); + x->tunnel = NULL; + } +} +EXPORT_SYMBOL(xfrm_state_delete_tunnel); + +int xfrm_state_mtu(struct xfrm_state *x, int mtu) +{ + int res = mtu; + + res -= x->props.header_len; + + for (;;) { + int m = res; + + if (m < 68) + return 68; + + spin_lock_bh(&x->lock); + if (x->km.state == XFRM_STATE_VALID && + x->type && x->type->get_max_size) + m = x->type->get_max_size(x, m); + else + m += x->props.header_len; + spin_unlock_bh(&x->lock); + + if (m <= mtu) + break; + res -= (m - mtu); + } + + return res; +} + +EXPORT_SYMBOL(xfrm_state_mtu); + +void __init xfrm_state_init(void) +{ + int i; + + for (i=0; i + * IPv6 support + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct sock *xfrm_nl; + +static int verify_one_alg(struct rtattr **xfrma, enum xfrm_attr_type_t type) +{ + struct rtattr *rt = xfrma[type - 1]; + struct xfrm_algo *algp; + + if (!rt) + return 0; + + if ((rt->rta_len - sizeof(*rt)) < sizeof(*algp)) + return -EINVAL; + + algp = RTA_DATA(rt); + switch (type) { + case XFRMA_ALG_AUTH: + if (!algp->alg_key_len && + strcmp(algp->alg_name, "digest_null") != 0) + return -EINVAL; + break; + + case XFRMA_ALG_CRYPT: + if (!algp->alg_key_len && + strcmp(algp->alg_name, "cipher_null") != 0) + return -EINVAL; + break; + + case XFRMA_ALG_COMP: + /* Zero length keys are legal. */ + break; + + default: + return -EINVAL; + }; + + algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + return 0; +} + +static int verify_encap_tmpl(struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_ENCAP - 1]; + struct xfrm_encap_tmpl *encap; + + if (!rt) + return 0; + + if ((rt->rta_len - sizeof(*rt)) < sizeof(*encap)) + return -EINVAL; + + return 0; +} + +static int verify_newsa_info(struct xfrm_usersa_info *p, + struct rtattr **xfrma) +{ + int err; + + err = -EINVAL; + switch (p->family) { + case AF_INET: + break; + + case AF_INET6: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + break; +#else + err = -EAFNOSUPPORT; + goto out; +#endif + + default: + goto out; + }; + + err = -EINVAL; + switch (p->id.proto) { + case IPPROTO_AH: + if (!xfrma[XFRMA_ALG_AUTH-1] || + xfrma[XFRMA_ALG_CRYPT-1] || + xfrma[XFRMA_ALG_COMP-1]) + goto out; + break; + + case IPPROTO_ESP: + if ((!xfrma[XFRMA_ALG_AUTH-1] && + !xfrma[XFRMA_ALG_CRYPT-1]) || + xfrma[XFRMA_ALG_COMP-1]) + goto out; + break; + + case IPPROTO_COMP: + if (!xfrma[XFRMA_ALG_COMP-1] || + xfrma[XFRMA_ALG_AUTH-1] || + xfrma[XFRMA_ALG_CRYPT-1]) + goto out; + break; + + default: + goto out; + }; + + if ((err = verify_one_alg(xfrma, XFRMA_ALG_AUTH))) + goto out; + if ((err = verify_one_alg(xfrma, XFRMA_ALG_CRYPT))) + goto out; + if ((err = verify_one_alg(xfrma, XFRMA_ALG_COMP))) + goto out; + if ((err = verify_encap_tmpl(xfrma))) + goto out; + + err = -EINVAL; + switch (p->mode) { + case 0: + case 1: + break; + + default: + goto out; + }; + + err = 0; + +out: + return err; +} + +static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, + struct xfrm_algo_desc *(*get_byname)(char *, int), + struct rtattr *u_arg) +{ + struct rtattr *rta = u_arg; + struct xfrm_algo *p, *ualg; + struct xfrm_algo_desc *algo; + + if (!rta) + return 0; + + ualg = RTA_DATA(rta); + + algo = get_byname(ualg->alg_name, 1); + if (!algo) + return -ENOSYS; + *props = algo->desc.sadb_alg_id; + + p = kmalloc(sizeof(*ualg) + ualg->alg_key_len, GFP_KERNEL); + if (!p) + return -ENOMEM; + + memcpy(p, ualg, 
sizeof(*ualg) + ualg->alg_key_len); + *algpp = p; + return 0; +} + +static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_arg) +{ + struct rtattr *rta = u_arg; + struct xfrm_encap_tmpl *p, *uencap; + + if (!rta) + return 0; + + uencap = RTA_DATA(rta); + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + memcpy(p, uencap, sizeof(*p)); + *encapp = p; + return 0; +} + +static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) +{ + memcpy(&x->id, &p->id, sizeof(x->id)); + memcpy(&x->sel, &p->sel, sizeof(x->sel)); + memcpy(&x->lft, &p->lft, sizeof(x->lft)); + x->props.mode = p->mode; + x->props.replay_window = p->replay_window; + x->props.reqid = p->reqid; + x->props.family = p->family; + x->props.saddr = p->saddr; + x->props.flags = p->flags; +} + +static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p, + struct rtattr **xfrma, + int *errp) +{ + struct xfrm_state *x = xfrm_state_alloc(); + int err = -ENOMEM; + + if (!x) + goto error_no_put; + + copy_from_user_state(x, p); + + if ((err = attach_one_algo(&x->aalg, &x->props.aalgo, + xfrm_aalg_get_byname, + xfrma[XFRMA_ALG_AUTH-1]))) + goto error; + if ((err = attach_one_algo(&x->ealg, &x->props.ealgo, + xfrm_ealg_get_byname, + xfrma[XFRMA_ALG_CRYPT-1]))) + goto error; + if ((err = attach_one_algo(&x->calg, &x->props.calgo, + xfrm_calg_get_byname, + xfrma[XFRMA_ALG_COMP-1]))) + goto error; + if ((err = attach_encap_tmpl(&x->encap, xfrma[XFRMA_ENCAP-1]))) + goto error; + + err = -ENOENT; + x->type = xfrm_get_type(x->id.proto, x->props.family); + if (x->type == NULL) + goto error; + + err = x->type->init_state(x, NULL); + if (err) + goto error; + + x->curlft.add_time = (unsigned long) xtime.tv_sec; + x->km.state = XFRM_STATE_VALID; + x->km.seq = p->seq; + + return x; + +error: + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); +error_no_put: + *errp = err; + return NULL; +} + +static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_usersa_info *p = NLMSG_DATA(nlh); + struct xfrm_state *x; + int err; + + err = verify_newsa_info(p, (struct rtattr **) xfrma); + if (err) + return err; + + x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); + if (!x) + return err; + + if (nlh->nlmsg_type == XFRM_MSG_NEWSA) + err = xfrm_state_add(x); + else + err = xfrm_state_update(x); + + if (err < 0) { + x->km.state = XFRM_STATE_DEAD; + xfrm_state_put(x); + } + + return err; +} + +static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_state *x; + struct xfrm_usersa_id *p = NLMSG_DATA(nlh); + + x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family); + if (x == NULL) + return -ESRCH; + + if (xfrm_state_kern(x)) { + xfrm_state_put(x); + return -EPERM; + } + + xfrm_state_delete(x); + xfrm_state_put(x); + + return 0; +} + +static void copy_to_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) +{ + memcpy(&p->id, &x->id, sizeof(p->id)); + memcpy(&p->sel, &x->sel, sizeof(p->sel)); + memcpy(&p->lft, &x->lft, sizeof(p->lft)); + memcpy(&p->curlft, &x->curlft, sizeof(p->curlft)); + memcpy(&p->stats, &x->stats, sizeof(p->stats)); + p->saddr = x->props.saddr; + p->mode = x->props.mode; + p->replay_window = x->props.replay_window; + p->reqid = x->props.reqid; + p->family = x->props.family; + p->flags = x->props.flags; + p->seq = x->km.seq; +} + +struct xfrm_dump_info { + struct sk_buff *in_skb; + struct sk_buff *out_skb; + u32 nlmsg_seq; + u16 nlmsg_flags; + int start_idx; + int 
this_idx; +}; + +static int dump_one_state(struct xfrm_state *x, int count, void *ptr) +{ + struct xfrm_dump_info *sp = ptr; + struct sk_buff *in_skb = sp->in_skb; + struct sk_buff *skb = sp->out_skb; + struct xfrm_usersa_info *p; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + + if (sp->this_idx < sp->start_idx) + goto out; + + nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid, + sp->nlmsg_seq, + XFRM_MSG_NEWSA, sizeof(*p)); + nlh->nlmsg_flags = sp->nlmsg_flags; + + p = NLMSG_DATA(nlh); + copy_to_user_state(x, p); + + if (x->aalg) + RTA_PUT(skb, XFRMA_ALG_AUTH, + sizeof(*(x->aalg))+(x->aalg->alg_key_len+7)/8, x->aalg); + if (x->ealg) + RTA_PUT(skb, XFRMA_ALG_CRYPT, + sizeof(*(x->ealg))+(x->ealg->alg_key_len+7)/8, x->ealg); + if (x->calg) + RTA_PUT(skb, XFRMA_ALG_COMP, sizeof(*(x->calg)), x->calg); + + if (x->encap) + RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + + nlh->nlmsg_len = skb->tail - b; +out: + sp->this_idx++; + return 0; + +nlmsg_failure: +rtattr_failure: + skb_trim(skb, b - skb->data); + return -1; +} + +static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct xfrm_dump_info info; + + info.in_skb = cb->skb; + info.out_skb = skb; + info.nlmsg_seq = cb->nlh->nlmsg_seq; + info.nlmsg_flags = NLM_F_MULTI; + info.this_idx = 0; + info.start_idx = cb->args[0]; + (void) xfrm_state_walk(IPSEC_PROTO_ANY, dump_one_state, &info); + cb->args[0] = info.this_idx; + + return skb->len; +} + +static struct sk_buff *xfrm_state_netlink(struct sk_buff *in_skb, + struct xfrm_state *x, u32 seq) +{ + struct xfrm_dump_info info; + struct sk_buff *skb; + + skb = alloc_skb(NLMSG_GOODSIZE, GFP_ATOMIC); + if (!skb) + return ERR_PTR(-ENOMEM); + + NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; + info.in_skb = in_skb; + info.out_skb = skb; + info.nlmsg_seq = seq; + info.nlmsg_flags = 0; + info.this_idx = info.start_idx = 0; + + if (dump_one_state(x, 0, &info)) { + kfree_skb(skb); + return NULL; + } + + return skb; +} + +static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_usersa_id *p = NLMSG_DATA(nlh); + struct xfrm_state *x; + struct sk_buff *resp_skb; + int err; + + x = xfrm_state_lookup(&p->daddr, p->spi, p->proto, p->family); + err = -ESRCH; + if (x == NULL) + goto out_noput; + + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + } else { + err = netlink_unicast(xfrm_nl, resp_skb, + NETLINK_CB(skb).pid, MSG_DONTWAIT); + } + xfrm_state_put(x); +out_noput: + return err; +} + +static int verify_userspi_info(struct xfrm_userspi_info *p) +{ + switch (p->info.id.proto) { + case IPPROTO_AH: + case IPPROTO_ESP: + break; + + case IPPROTO_COMP: + /* IPCOMP spi is 16-bits. 
*/ + if (p->max >= 0x10000) + return -EINVAL; + break; + + default: + return -EINVAL; + }; + + if (p->min > p->max) + return -EINVAL; + + return 0; +} + +static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) +{ + struct xfrm_state *x; + struct xfrm_userspi_info *p; + struct sk_buff *resp_skb; + xfrm_address_t *daddr; + int family; + int err; + + p = NLMSG_DATA(nlh); + err = verify_userspi_info(p); + if (err) + goto out_noput; + + family = p->info.family; + daddr = &p->info.id.daddr; + + x = NULL; + if (p->info.seq) { + x = xfrm_find_acq_byseq(p->info.seq); + if (x && xfrm_addr_cmp(&x->id.daddr, daddr, family)) { + xfrm_state_put(x); + x = NULL; + } + } + + if (!x) + x = xfrm_find_acq(p->info.mode, p->info.reqid, + p->info.id.proto, daddr, + &p->info.saddr, 1, + family); + err = -ENOENT; + if (x == NULL) + goto out_noput; + + resp_skb = ERR_PTR(-ENOENT); + + spin_lock_bh(&x->lock); + if (x->km.state != XFRM_STATE_DEAD) { + xfrm_alloc_spi(x, htonl(p->min), htonl(p->max)); + if (x->id.spi) + resp_skb = xfrm_state_netlink(skb, x, nlh->nlmsg_seq); + } + spin_unlock_bh(&x->lock); + + if (IS_ERR(resp_skb)) { + err = PTR_ERR(resp_skb); + goto out; + } + + err = netlink_unicast(xfrm_nl, resp_skb, + NETLINK_CB(skb).pid, MSG_DONTWAIT); + +out: + xfrm_state_put(x); +out_noput: + return err; +} + +static int verify_policy_dir(__u8 dir) +{ + switch (dir) { + case XFRM_POLICY_IN: + case XFRM_POLICY_OUT: + case XFRM_POLICY_FWD: + break; + + default: + return -EINVAL; + }; + + return 0; +} + +static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) +{ + switch (p->share) { + case XFRM_SHARE_ANY: + case XFRM_SHARE_SESSION: + case XFRM_SHARE_USER: + case XFRM_SHARE_UNIQUE: + break; + + default: + return -EINVAL; + }; + + switch (p->action) { + case XFRM_POLICY_ALLOW: + case XFRM_POLICY_BLOCK: + break; + + default: + return -EINVAL; + }; + + switch (p->sel.family) { + case AF_INET: + break; + + case AF_INET6: +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + break; +#else + return -EAFNOSUPPORT; +#endif + + default: + return -EINVAL; + }; + + return verify_policy_dir(p->dir); +} + +static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, + int nr) +{ + int i; + + xp->xfrm_nr = nr; + for (i = 0; i < nr; i++, ut++) { + struct xfrm_tmpl *t = &xp->xfrm_vec[i]; + + memcpy(&t->id, &ut->id, sizeof(struct xfrm_id)); + memcpy(&t->saddr, &ut->saddr, + sizeof(xfrm_address_t)); + t->reqid = ut->reqid; + t->mode = ut->mode; + t->share = ut->share; + t->optional = ut->optional; + t->aalgos = ut->aalgos; + t->ealgos = ut->ealgos; + t->calgos = ut->calgos; + } +} + +static int copy_from_user_tmpl(struct xfrm_policy *pol, struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_TMPL-1]; + struct xfrm_user_tmpl *utmpl; + int nr; + + if (!rt) { + pol->xfrm_nr = 0; + } else { + nr = (rt->rta_len - sizeof(*rt)) / sizeof(*utmpl); + + if (nr > XFRM_MAX_DEPTH) + return -EINVAL; + + copy_templates(pol, RTA_DATA(rt), nr); + } + return 0; +} + +static void copy_from_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p) +{ + xp->priority = p->priority; + xp->index = p->index; + memcpy(&xp->selector, &p->sel, sizeof(xp->selector)); + memcpy(&xp->lft, &p->lft, sizeof(xp->lft)); + xp->action = p->action; + xp->flags = p->flags; + xp->family = p->sel.family; + /* XXX xp->share = p->share; */ +} + +static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_info *p, int dir) +{ + memcpy(&p->sel, &xp->selector, sizeof(p->sel)); + 
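/*
 * [Illustrative aside -- not from the original patch.]  dump_one_state()
 * above and dump_one_policy() below remember the buffer tail before emitting
 * a record and, if any NLMSG_PUT()/RTA_PUT() step fails, trim the buffer back
 * to that mark so no half-written record is left behind.  The sketch below
 * shows the same mark-and-rollback pattern with an invented struct buf in
 * place of the sk_buff.
 */
#include <stdio.h>
#include <string.h>

struct buf { char data[64]; size_t len; };

static int buf_put(struct buf *b, const char *s)
{
    size_t n = strlen(s);

    if (b->len + n > sizeof(b->data))
        return -1;                      /* does not fit */
    memcpy(b->data + b->len, s, n);
    b->len += n;
    return 0;
}

/* Emit a two-part record; on failure roll back to the remembered length,
 * just as the failure labels above call skb_trim(skb, b - skb->data). */
static int put_record(struct buf *b, const char *hdr, const char *attr)
{
    size_t mark = b->len;

    if (buf_put(b, hdr) < 0 || buf_put(b, attr) < 0) {
        b->len = mark;                  /* discard the partial record */
        return -1;
    }
    return 0;
}

int main(void)
{
    struct buf b = { "", 0 };

    put_record(&b, "[hdr1]", "[attrs1]");
    if (put_record(&b, "[hdr2]", "[one attribute far too large to fit in the remaining space]") < 0)
        printf("second record rolled back, buffer still holds %zu bytes\n", b.len);
    return 0;
}
/* [End of illustrative aside.] */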
+	memcpy(&p->lft, &xp->lft, sizeof(p->lft));
+	memcpy(&p->curlft, &xp->curlft, sizeof(p->curlft));
+	p->priority = xp->priority;
+	p->index = xp->index;
+	p->sel.family = xp->family;
+	p->dir = dir;
+	p->action = xp->action;
+	p->flags = xp->flags;
+	p->share = XFRM_SHARE_ANY; /* XXX xp->share */
+}
+
+static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, struct rtattr **xfrma, int *errp)
+{
+	struct xfrm_policy *xp = xfrm_policy_alloc(GFP_KERNEL);
+	int err;
+
+	if (!xp) {
+		*errp = -ENOMEM;
+		return NULL;
+	}
+
+	copy_from_user_policy(xp, p);
+	err = copy_from_user_tmpl(xp, xfrma);
+	if (err) {
+		*errp = err;
+		kfree(xp);
+		xp = NULL;
+	}
+
+	return xp;
+}
+
+static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
+{
+	struct xfrm_userpolicy_info *p = NLMSG_DATA(nlh);
+	struct xfrm_policy *xp;
+	int err;
+	int excl;
+
+	err = verify_newpolicy_info(p);
+	if (err)
+		return err;
+
+	xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err);
+	if (!xp)
+		return err;
+
+	excl = nlh->nlmsg_type == XFRM_MSG_NEWPOLICY;
+	err = xfrm_policy_insert(p->dir, xp, excl);
+	if (err) {
+		kfree(xp);
+		return err;
+	}
+
+	xfrm_pol_put(xp);
+
+	return 0;
+}
+
+static int copy_to_user_tmpl(struct xfrm_policy *xp, struct sk_buff *skb)
+{
+	struct xfrm_user_tmpl vec[XFRM_MAX_DEPTH];
+	int i;
+
+	if (xp->xfrm_nr == 0)
+		return 0;
+
+	for (i = 0; i < xp->xfrm_nr; i++) {
+		struct xfrm_user_tmpl *up = &vec[i];
+		struct xfrm_tmpl *kp = &xp->xfrm_vec[i];
+
+		memcpy(&up->id, &kp->id, sizeof(up->id));
+		up->family = xp->family;
+		memcpy(&up->saddr, &kp->saddr, sizeof(up->saddr));
+		up->reqid = kp->reqid;
+		up->mode = kp->mode;
+		up->share = kp->share;
+		up->optional = kp->optional;
+		up->aalgos = kp->aalgos;
+		up->ealgos = kp->ealgos;
+		up->calgos = kp->calgos;
+	}
+	RTA_PUT(skb, XFRMA_TMPL,
+		(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr),
+		vec);
+
+	return 0;
+
+rtattr_failure:
+	return -1;
+}
+
+static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr)
+{
+	struct xfrm_dump_info *sp = ptr;
+	struct xfrm_userpolicy_info *p;
+	struct sk_buff *in_skb = sp->in_skb;
+	struct sk_buff *skb = sp->out_skb;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+
+	if (sp->this_idx < sp->start_idx)
+		goto out;
+
+	nlh = NLMSG_PUT(skb, NETLINK_CB(in_skb).pid,
+			sp->nlmsg_seq,
+			XFRM_MSG_NEWPOLICY, sizeof(*p));
+	p = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = sp->nlmsg_flags;
+
+	copy_to_user_policy(xp, p, dir);
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+out:
+	sp->this_idx++;
+	return 0;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int xfrm_dump_policy(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct xfrm_dump_info info;
+
+	info.in_skb = cb->skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = cb->nlh->nlmsg_seq;
+	info.nlmsg_flags = NLM_F_MULTI;
+	info.this_idx = 0;
+	info.start_idx = cb->args[0];
+	(void) xfrm_policy_walk(dump_one_policy, &info);
+	cb->args[0] = info.this_idx;
+
+	return skb->len;
+}
+
+static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
+					   struct xfrm_policy *xp,
+					   int dir, u32 seq)
+{
+	struct xfrm_dump_info info;
+	struct sk_buff *skb;
+
+	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!skb)
+		return ERR_PTR(-ENOMEM);
+
+	NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid;
+	info.in_skb = in_skb;
+	info.out_skb = skb;
+	info.nlmsg_seq = seq;
+	info.nlmsg_flags = 0;
+	info.this_idx = info.start_idx = 0;
+	if (dump_one_policy(xp, dir, 0, &info) < 0) {
+		kfree_skb(skb);
+		return NULL;
+	}
+
+	return skb;
+}
+
+static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
+{
+	struct xfrm_policy *xp;
+	struct xfrm_userpolicy_id *p;
+	int err;
+	int delete;
+
+	p = NLMSG_DATA(nlh);
+	delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY;
+
+	err = verify_policy_dir(p->dir);
+	if (err)
+		return err;
+
+	if (p->index)
+		xp = xfrm_policy_byid(p->dir, p->index, delete);
+	else
+		xp = xfrm_policy_bysel(p->dir, &p->sel, delete);
+	if (xp == NULL)
+		return -ENOENT;
+
+	if (!delete) {
+		struct sk_buff *resp_skb;
+
+		resp_skb = xfrm_policy_netlink(skb, xp, p->dir, nlh->nlmsg_seq);
+		if (IS_ERR(resp_skb)) {
+			err = PTR_ERR(resp_skb);
+		} else {
+			err = netlink_unicast(xfrm_nl, resp_skb,
+					      NETLINK_CB(skb).pid,
+					      MSG_DONTWAIT);
+		}
+	}
+
+	xfrm_pol_put(xp);
+
+	return err;
+}
+
+static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
+{
+	struct xfrm_usersa_flush *p = NLMSG_DATA(nlh);
+
+	xfrm_state_flush(p->proto);
+	return 0;
+}
+
+static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma)
+{
+	xfrm_policy_flush();
+	return 0;
+}
+
+static const int xfrm_msg_min[(XFRM_MSG_MAX + 1 - XFRM_MSG_BASE)] = {
+	NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)),	/* NEW SA */
+	NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)),	/* DEL SA */
+	NLMSG_LENGTH(sizeof(struct xfrm_usersa_id)),	/* GET SA */
+	NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* NEW POLICY */
+	NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)),  /* DEL POLICY */
+	NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_id)),  /* GET POLICY */
+	NLMSG_LENGTH(sizeof(struct xfrm_userspi_info)),	/* ALLOC SPI */
+	NLMSG_LENGTH(sizeof(struct xfrm_user_acquire)),	/* ACQUIRE */
+	NLMSG_LENGTH(sizeof(struct xfrm_user_expire)),	/* EXPIRE */
+	NLMSG_LENGTH(sizeof(struct xfrm_userpolicy_info)),/* UPD POLICY */
+	NLMSG_LENGTH(sizeof(struct xfrm_usersa_info)),	/* UPD SA */
+	NLMSG_LENGTH(sizeof(struct xfrm_user_polexpire)), /* POLEXPIRE */
+	NLMSG_LENGTH(sizeof(struct xfrm_usersa_flush)),	/* FLUSH SA */
+	NLMSG_LENGTH(0),				/* FLUSH POLICY */
+};
+
+static struct xfrm_link {
+	int (*doit)(struct sk_buff *, struct nlmsghdr *, void **);
+	int (*dump)(struct sk_buff *, struct netlink_callback *);
+} xfrm_dispatch[] = {
+	{ .doit = xfrm_add_sa, },
+	{ .doit = xfrm_del_sa, },
+	{
+		.doit = xfrm_get_sa,
+		.dump = xfrm_dump_sa,
+	},
+	{ .doit = xfrm_add_policy },
+	{ .doit = xfrm_get_policy },
+	{
+		.doit = xfrm_get_policy,
+		.dump = xfrm_dump_policy,
+	},
+	{ .doit = xfrm_alloc_userspi },
+	{},
+	{},
+	{ .doit = xfrm_add_policy },
+	{ .doit = xfrm_add_sa, },
+	{},
+	{ .doit = xfrm_flush_sa },
+	{ .doit = xfrm_flush_policy },
+};
+
+static int xfrm_done(struct netlink_callback *cb)
+{
+	return 0;
+}
+
+static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, int *errp)
+{
+	struct rtattr *xfrma[XFRMA_MAX];
+	struct xfrm_link *link;
+	int type, min_len;
+
+	if (!(nlh->nlmsg_flags & NLM_F_REQUEST))
+		return 0;
+
+	type = nlh->nlmsg_type;
+
+	/* A control message: ignore them */
+	if (type < XFRM_MSG_BASE)
+		return 0;
+
+	/* Unknown message: reply with EINVAL */
+	if (type > XFRM_MSG_MAX)
+		goto err_einval;
+
+	type -= XFRM_MSG_BASE;
+	link = &xfrm_dispatch[type];
+
+	/* All operations require privileges, even GET */
+	if (security_netlink_recv(skb)) {
+		*errp = -EPERM;
+		return -1;
+	}
+
+	if ((type == 2 || type == 5) && (nlh->nlmsg_flags & NLM_F_DUMP)) {
+		u32 rlen;
+
+		if (link->dump == NULL)
+			goto err_einval;
+
+		if ((*errp = netlink_dump_start(xfrm_nl, skb, nlh,
+						link->dump,
+						xfrm_done)) != 0) {
+			return -1;
+		}
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		skb_pull(skb, rlen);
+		return -1;
+	}
+
+	memset(xfrma, 0, sizeof(xfrma));
+
+	if (nlh->nlmsg_len < (min_len = xfrm_msg_min[type]))
+		goto err_einval;
+
+	if (nlh->nlmsg_len > min_len) {
+		int attrlen = nlh->nlmsg_len - NLMSG_ALIGN(min_len);
+		struct rtattr *attr = (void *) nlh + NLMSG_ALIGN(min_len);
+
+		while (RTA_OK(attr, attrlen)) {
+			unsigned short flavor = attr->rta_type;
+			if (flavor) {
+				if (flavor > XFRMA_MAX)
+					goto err_einval;
+				xfrma[flavor - 1] = attr;
+			}
+			attr = RTA_NEXT(attr, attrlen);
+		}
+	}
+
+	if (link->doit == NULL)
+		goto err_einval;
+	*errp = link->doit(skb, nlh, (void **) &xfrma);
+
+	return *errp;
+
+err_einval:
+	*errp = -EINVAL;
+	return -1;
+}
+
+static int xfrm_user_rcv_skb(struct sk_buff *skb)
+{
+	int err;
+	struct nlmsghdr *nlh;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		u32 rlen;
+
+		nlh = (struct nlmsghdr *) skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) ||
+		    skb->len < nlh->nlmsg_len)
+			return 0;
+		rlen = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (rlen > skb->len)
+			rlen = skb->len;
+		if (xfrm_user_rcv_msg(skb, nlh, &err) < 0) {
+			if (err == 0)
+				return -1;
+			netlink_ack(skb, nlh, err);
+		} else if (nlh->nlmsg_flags & NLM_F_ACK)
+			netlink_ack(skb, nlh, 0);
+		skb_pull(skb, rlen);
+	}
+
+	return 0;
+}
+
+static void xfrm_netlink_rcv(struct sock *sk, int len)
+{
+	do {
+		struct sk_buff *skb;
+
+		down(&xfrm_cfg_sem);
+
+		while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+			if (xfrm_user_rcv_skb(skb)) {
+				if (skb->len)
+					skb_queue_head(&sk->sk_receive_queue,
+						       skb);
+				else
+					kfree_skb(skb);
+				break;
+			}
+			kfree_skb(skb);
+		}
+
+		up(&xfrm_cfg_sem);
+
+	} while (xfrm_nl && xfrm_nl->sk_receive_queue.qlen);
+}
+
+static int build_expire(struct sk_buff *skb, struct xfrm_state *x, int hard)
+{
+	struct xfrm_user_expire *ue;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_EXPIRE,
+			sizeof(*ue));
+	ue = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = 0;
+
+	copy_to_user_state(x, &ue->state);
+	ue->hard = (hard != 0) ? 1 : 0;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int xfrm_send_state_notify(struct xfrm_state *x, int hard)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(struct xfrm_user_expire) + 16, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (build_expire(skb, x, hard) < 0)
+		BUG();
+
+	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+}
+
+static int build_acquire(struct sk_buff *skb, struct xfrm_state *x,
+			 struct xfrm_tmpl *xt, struct xfrm_policy *xp,
+			 int dir)
+{
+	struct xfrm_user_acquire *ua;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+	__u32 seq = xfrm_get_acqseq();
+
+	nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_ACQUIRE,
+			sizeof(*ua));
+	ua = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = 0;
+
+	memcpy(&ua->id, &x->id, sizeof(ua->id));
+	memcpy(&ua->saddr, &x->props.saddr, sizeof(ua->saddr));
+	memcpy(&ua->sel, &x->sel, sizeof(ua->sel));
+	copy_to_user_policy(xp, &ua->policy, dir);
+	ua->aalgos = xt->aalgos;
+	ua->ealgos = xt->ealgos;
+	ua->calgos = xt->calgos;
+	ua->seq = x->km.seq = seq;
+
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt,
+			     struct xfrm_policy *xp, int dir)
+{
+	struct sk_buff *skb;
+	size_t len;
+
+	len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire));
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (build_acquire(skb, x, xt, xp, dir) < 0)
+		BUG();
+
+	NETLINK_CB(skb).dst_groups = XFRMGRP_ACQUIRE;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_ACQUIRE, GFP_ATOMIC);
+}
+
+/* User gives us xfrm_user_policy_info followed by an array of 0
+ * or more templates.
+ */
+static struct xfrm_policy *xfrm_compile_policy(u16 family, int opt,
+					       u8 *data, int len, int *dir)
+{
+	struct xfrm_userpolicy_info *p = (struct xfrm_userpolicy_info *)data;
+	struct xfrm_user_tmpl *ut = (struct xfrm_user_tmpl *) (p + 1);
+	struct xfrm_policy *xp;
+	int nr;
+
+	switch (family) {
+	case AF_INET:
+		if (opt != IP_XFRM_POLICY) {
+			*dir = -EOPNOTSUPP;
+			return NULL;
+		}
+		break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case AF_INET6:
+		if (opt != IPV6_XFRM_POLICY) {
+			*dir = -EOPNOTSUPP;
+			return NULL;
+		}
+		break;
+#endif
+	default:
+		*dir = -EINVAL;
+		return NULL;
+	}
+
+	*dir = -EINVAL;
+
+	if (len < sizeof(*p) ||
+	    verify_newpolicy_info(p))
+		return NULL;
+
+	nr = ((len - sizeof(*p)) / sizeof(*ut));
+	if (nr > XFRM_MAX_DEPTH)
+		return NULL;
+
+	xp = xfrm_policy_alloc(GFP_KERNEL);
+	if (xp == NULL) {
+		*dir = -ENOBUFS;
+		return NULL;
+	}
+
+	copy_from_user_policy(xp, p);
+	copy_templates(xp, ut, nr);
+
+	*dir = p->dir;
+
+	return xp;
+}
+
+static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp,
+			   int dir, int hard)
+{
+	struct xfrm_user_polexpire *upe;
+	struct nlmsghdr *nlh;
+	unsigned char *b = skb->tail;
+
+	nlh = NLMSG_PUT(skb, 0, 0, XFRM_MSG_POLEXPIRE, sizeof(*upe));
+	upe = NLMSG_DATA(nlh);
+	nlh->nlmsg_flags = 0;
+
+	copy_to_user_policy(xp, &upe->pol, dir);
+	if (copy_to_user_tmpl(xp, skb) < 0)
+		goto nlmsg_failure;
+	upe->hard = !!hard;
+
+	nlh->nlmsg_len = skb->tail - b;
+	return skb->len;
+
+nlmsg_failure:
+	skb_trim(skb, b - skb->data);
+	return -1;
+}
+
+static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, int hard)
+{
+	struct sk_buff *skb;
+	size_t len;
+
+	len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr);
+	len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire));
+	skb = alloc_skb(len, GFP_ATOMIC);
+	if (skb == NULL)
+		return -ENOMEM;
+
+	if (build_polexpire(skb, xp, dir, hard) < 0)
+		BUG();
+
+	NETLINK_CB(skb).dst_groups = XFRMGRP_EXPIRE;
+
+	return netlink_broadcast(xfrm_nl, skb, 0, XFRMGRP_EXPIRE, GFP_ATOMIC);
+}
+
+static struct xfrm_mgr netlink_mgr = {
+	.id = "netlink",
+	.notify = xfrm_send_state_notify,
+	.acquire = xfrm_send_acquire,
+	.compile_policy = xfrm_compile_policy,
+	.notify_policy = xfrm_send_policy_notify,
+};
+
+static int __init xfrm_user_init(void)
+{
+	printk(KERN_INFO "Initializing IPsec netlink socket\n");
+
+	xfrm_nl = netlink_kernel_create(NETLINK_XFRM, xfrm_netlink_rcv);
+	if (xfrm_nl == NULL)
+		return -ENOMEM;
+
+	xfrm_register_km(&netlink_mgr);
+
+	return 0;
+}
+
+static void __exit xfrm_user_exit(void)
+{
+	xfrm_unregister_km(&netlink_mgr);
+	sock_release(xfrm_nl->sk_socket);
+}
+
+module_init(xfrm_user_init);
+module_exit(xfrm_user_exit);
+MODULE_LICENSE("GPL");
--
cgit v1.2.3-59-g8ed1b
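
Note (not part of the patch): the NETLINK_XFRM socket created by xfrm_user_init() above is driven by ordinary netlink requests; XFRM_MSG_GETSA with NLM_F_DUMP, for example, is routed to xfrm_dump_sa() and answered with a stream of XFRM_MSG_NEWSA messages. The following is only a minimal userspace sketch of such a dump, assuming the uapi definitions in <linux/netlink.h> and <linux/xfrm.h> (NETLINK_XFRM, XFRM_MSG_GETSA, struct xfrm_usersa_id/info) and CAP_NET_ADMIN privileges (the receive path rejects unprivileged callers via security_netlink_recv()); error handling is trimmed for brevity.

/* sketch: request a dump of all IPsec SAs over NETLINK_XFRM */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <linux/netlink.h>
#include <linux/xfrm.h>

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct xfrm_usersa_id id;	/* body is not inspected on the dump path */
	} req;
	char buf[8192];
	int fd, len, done = 0;

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_XFRM);
	if (fd < 0)
		return 1;

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(req.id));
	req.nlh.nlmsg_type = XFRM_MSG_GETSA;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;	/* dispatched to xfrm_dump_sa() */
	req.nlh.nlmsg_seq = 1;

	if (send(fd, &req, req.nlh.nlmsg_len, 0) < 0)
		return 1;

	/* each SA comes back as an XFRM_MSG_NEWSA message; NLMSG_DONE ends the dump */
	while (!done && (len = recv(fd, buf, sizeof(buf), 0)) > 0) {
		struct nlmsghdr *h = (struct nlmsghdr *) buf;

		for (; NLMSG_OK(h, len); h = NLMSG_NEXT(h, len)) {
			if (h->nlmsg_type == NLMSG_DONE ||
			    h->nlmsg_type == NLMSG_ERROR) {
				done = 1;
				break;
			}
			if (h->nlmsg_type == XFRM_MSG_NEWSA) {
				struct xfrm_usersa_info *p = NLMSG_DATA(h);

				printf("SA proto %u spi 0x%x\n",
				       p->id.proto, ntohl(p->id.spi));
			}
		}
	}
	close(fd);
	return 0;
}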